@alis-build/harness-eval 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -10
- package/dist/adapters/claude-code/index.d.ts +2 -2
- package/dist/adapters/claude-code/index.js +2 -1
- package/dist/adapters/codex/index.d.ts +68 -0
- package/dist/adapters/codex/index.js +3 -0
- package/dist/{claude-code-ycT0JQZF.js → claude-code-C_7hxC8z.js} +37 -250
- package/dist/claude-code-C_7hxC8z.js.map +1 -0
- package/dist/cli/bin.js +204 -127
- package/dist/cli/bin.js.map +1 -1
- package/dist/codex-0cHO2te9.js +496 -0
- package/dist/codex-0cHO2te9.js.map +1 -0
- package/dist/config/loader.d.ts +2 -2
- package/dist/config/loader.js +2 -2
- package/dist/{index-6Z17eKZx.d.ts → index-DnvP1UBl.d.ts} +3 -2
- package/dist/index.d.ts +397 -153
- package/dist/index.js +125 -5
- package/dist/index.js.map +1 -0
- package/dist/loader-B1WmGGzf.d.ts +107 -0
- package/dist/{loader-BCnFJ8rm.js → loader-DnQ6Jt0i.js} +707 -157
- package/dist/loader-DnQ6Jt0i.js.map +1 -0
- package/dist/reporter-Biy-5-9M.js +2216 -0
- package/dist/reporter-Biy-5-9M.js.map +1 -0
- package/dist/runner/suite.d.ts +1 -1
- package/dist/runner/suite.js +1 -1
- package/dist/{suite-BoOvK_lq.d.ts → suite-BEShV0by.d.ts} +7 -2
- package/dist/{suite-chj0j22j.js → suite-BcP64nlb.js} +72 -4
- package/dist/suite-BcP64nlb.js.map +1 -0
- package/dist/{types-BQol062t.d.ts → types-0QkNVyp9.d.ts} +152 -11
- package/dist/types-Bac8_Ixb.js +246 -0
- package/dist/types-Bac8_Ixb.js.map +1 -0
- package/dist/types-Bu8uOZZN.d.ts +77 -0
- package/dist/{types-B9H4IZtA.d.ts → types-C0gBkl0-.d.ts} +3 -2
- package/package.json +7 -2
- package/schemas/eval-interchange-instances.schema.json +196 -0
- package/schemas/eval-interchange.schema.json +65 -52
- package/schemas/eval-run-envelope.schema.json +182 -425
- package/dist/build-DsVJ_UeU.js +0 -1396
- package/dist/build-DsVJ_UeU.js.map +0 -1
- package/dist/claude-code-ycT0JQZF.js.map +0 -1
- package/dist/loader-BCnFJ8rm.js.map +0 -1
- package/dist/loader-DTvoVfN0.d.ts +0 -33
- package/dist/suite-chj0j22j.js.map +0 -1
- package/schemas/eval-interchange-agent-trace.schema.json +0 -322
- package/schemas/eval-interchange-proto-instance.schema.json +0 -106
package/README.md
CHANGED
|
@@ -54,10 +54,11 @@ pnpm exec harness-eval --help
|
|
|
54
54
|
|
|
55
55
|
Suites are YAML files. Committed examples:
|
|
56
56
|
|
|
57
|
-
- [`examples/
|
|
57
|
+
- [`examples/pipeline/`](examples/pipeline/) — **recommended** unified layout with inline `judge:` + `pipeline:` orchestration
|
|
58
|
+
- [`examples/basic.yaml`](examples/basic.yaml) — minimal smoke test using the built-in `Read` tool on this repo's README
|
|
58
59
|
- [`examples/matrix.yaml`](examples/matrix.yaml) — same idea with a model matrix (sonnet vs opus)
|
|
59
60
|
- [`examples/multi-file/`](examples/multi-file/) — directory layout with `suite.yaml` plus cases under `cases/`
|
|
60
|
-
- [`examples/grading.yaml`](examples/grading.yaml) — standalone judge config
|
|
61
|
+
- [`examples/grading.yaml`](examples/grading.yaml) — standalone judge config (alternate to inline `judge:`)
|
|
61
62
|
|
|
62
63
|
```yaml
|
|
63
64
|
adapter: claude-code
|
|
@@ -96,9 +97,15 @@ cases:
|
|
|
96
97
|
|
|
97
98
|
Generic fields (`model`, `cwd`, `timeoutMs`, `env`) sit at the top level. Claude-specific options go under `claudeCode`.
|
|
98
99
|
|
|
100
|
+
**Full suite & grading YAML reference:** [docs/suite-config.md](docs/suite-config.md) — all case/matrix fields, inline `judge:` / `pipeline:`, multi-file layout, and standalone `grading.yaml`.
|
|
101
|
+
|
|
99
102
|
### 2. Run behavioral eval
|
|
100
103
|
|
|
101
104
|
```bash
|
|
105
|
+
# Unified pipeline (run + optional grade + envelope when pipeline: is defined)
|
|
106
|
+
npx @alis-build/harness-eval pipeline examples/pipeline/
|
|
107
|
+
|
|
108
|
+
# Or run harness only
|
|
102
109
|
npx @alis-build/harness-eval run examples/basic.yaml --output report.json --max-concurrent 1 --format console
|
|
103
110
|
```
|
|
104
111
|
|
|
@@ -110,7 +117,14 @@ Exit code `0` = all cells passed all assertion thresholds.
|
|
|
110
117
|
|
|
111
118
|
### 3. Grade outcomes (optional)
|
|
112
119
|
|
|
113
|
-
|
|
120
|
+
**Unified suite:** add a top-level `judge:` block in `suite.yaml` (see [`examples/pipeline/suite.yaml`](examples/pipeline/suite.yaml)), then:
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
npx @alis-build/harness-eval grade report.json --suite examples/pipeline/suite.yaml --output grading.json --max-concurrent 1 --format console
|
|
124
|
+
# or: npx @alis-build/harness-eval pipeline examples/pipeline/ --steps grade
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
**Standalone grading file:** judge config in a separate **`grading.yaml`** (still supported). See [`examples/grading.yaml`](examples/grading.yaml).
|
|
114
128
|
|
|
115
129
|
```bash
|
|
116
130
|
npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --output grading.json --max-concurrent 1 --format console
|
|
@@ -399,6 +413,7 @@ Both layers use statistical thresholds: a case runs `repetitions` times per matr
|
|
|
399
413
|
npx @alis-build/harness-eval run <suite.yaml> [options]
|
|
400
414
|
npx @alis-build/harness-eval grade <report.json> [options]
|
|
401
415
|
npx @alis-build/harness-eval envelope <report.json> [options]
|
|
416
|
+
npx @alis-build/harness-eval pipeline <suite.yaml|dir> [options]
|
|
402
417
|
npx @alis-build/harness-eval format <report.json> [options]
|
|
403
418
|
npx @alis-build/harness-eval --help
|
|
404
419
|
```
|
|
@@ -420,7 +435,9 @@ npx @alis-build/harness-eval --help
|
|
|
420
435
|
|
|
421
436
|
### `grade`
|
|
422
437
|
|
|
423
|
-
Uses
|
|
438
|
+
Uses **`grading.yaml`** or an inline **`judge:`** block in `suite.yaml` (`--suite`).
|
|
439
|
+
|
|
440
|
+
**Field reference:** [docs/suite-config.md — Grading config](docs/suite-config.md#grading-config-gradingyaml)
|
|
424
441
|
|
|
425
442
|
```yaml
|
|
426
443
|
# examples/grading.yaml
|
|
@@ -440,6 +457,7 @@ npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --
|
|
|
440
457
|
| Option | Description |
|
|
441
458
|
| -------------------------------------- | ----------------------------------------------------------------- |
|
|
442
459
|
| `--config <path>` | Grading YAML (`judge` block) — model, env, timeout, `claudeCode` |
|
|
460
|
+
| `--suite <path>` | Unified `suite.yaml` with inline `judge:` (alternative to `--config`) |
|
|
443
461
|
| `--output <path>` | Write grading JSON |
|
|
444
462
|
| `--expectations <path>` | Sidecar YAML/JSON if report lacks expectations |
|
|
445
463
|
| `--format console\|json` | Output format |
|
|
@@ -467,7 +485,7 @@ npx @alis-build/harness-eval envelope report.json --suite examples/basic.yaml --
|
|
|
467
485
|
# Interchange projections
|
|
468
486
|
npx @alis-build/harness-eval envelope report.json --projection trajectory --output trajectory.jsonl
|
|
469
487
|
npx @alis-build/harness-eval envelope report.json --projection instances --output instances.json
|
|
470
|
-
npx @alis-build/harness-eval envelope report.json --projection agent-trace --output …
|
|
488
|
+
npx @alis-build/harness-eval envelope report.json --projection instances --output instances.jsonl
|
|
471
489
|
```
|
|
472
490
|
|
|
473
491
|
| Option | Description |
|
|
@@ -475,12 +493,34 @@ npx @alis-build/harness-eval envelope report.json --projection agent-trace --out
|
|
|
475
493
|
| `--output <path>` | Write output (stdout if omitted) |
|
|
476
494
|
| `--grading <path>` | Merge `grading.json` outcome scores into the envelope |
|
|
477
495
|
| `--suite <path>` | Suite YAML for provenance (`uri`, `contentHash`) |
|
|
478
|
-
| `--projection envelope\|trajectory\|instances\|agent-trace` | Output shape (default: `envelope`) |
|
|
496
|
+
| `--projection envelope\|trajectory\|instances` | Output shape (default: `envelope`) |
|
|
479
497
|
| `--include-raw-stream-events` | Include adapter raw stream events in repetition artifacts |
|
|
480
498
|
| `--no-transcript` | Omit judge transcript artifacts |
|
|
481
499
|
|
|
482
500
|
Exit codes: `0` = envelope built and behavioral pass; `1` = built but behavioral failures; `2` = usage or file errors.
|
|
483
501
|
|
|
502
|
+
### `pipeline`
|
|
503
|
+
|
|
504
|
+
Orchestrate **run → grade → envelope** from a unified `suite.yaml` when a `pipeline:` block is present. See [docs/suite-config.md — Pipeline orchestration](docs/suite-config.md#pipeline-orchestration-pipeline).
|
|
505
|
+
|
|
506
|
+
```bash
|
|
507
|
+
npx @alis-build/harness-eval pipeline examples/pipeline/
|
|
508
|
+
npx @alis-build/harness-eval pipeline my-suite/ --steps run,grade
|
|
509
|
+
```
|
|
510
|
+
|
|
511
|
+
| Option | Description |
|
|
512
|
+
| ------ | ----------- |
|
|
513
|
+
| `--steps run,grade,envelope` | Subset of configured steps (default: all configured) |
|
|
514
|
+
| `--output <path>` | Override `pipeline.run.output` |
|
|
515
|
+
| `--report <path>` | Override report input for grade/envelope |
|
|
516
|
+
| `--grading <path>` | Override grading input for envelope |
|
|
517
|
+
| `--grading-output <path>` | Override `pipeline.grade.output` |
|
|
518
|
+
| `--envelope-output <path>` | Override `pipeline.envelope.output` |
|
|
519
|
+
| `--projection envelope\|trajectory\|instances` | Envelope projection |
|
|
520
|
+
| `--max-concurrent <n>` | Parallel harness/judge workers |
|
|
521
|
+
|
|
522
|
+
Exit codes match the first failing step (`run`, `grade`, or `envelope`). Returns `2` when no `pipeline:` block exists.
|
|
523
|
+
|
|
484
524
|
### `format`
|
|
485
525
|
|
|
486
526
|
Re-render an existing `report.json` without re-running the harness.
|
|
@@ -509,6 +549,8 @@ See [Data contracts & schemas](#data-contracts--schemas) for type details.
|
|
|
509
549
|
|
|
510
550
|
## Suite concepts
|
|
511
551
|
|
|
552
|
+
**Authoring reference:** [docs/suite-config.md](docs/suite-config.md) — complete field list for suite YAML, matrix cells, test cases, reference trajectories, and grading config.
|
|
553
|
+
|
|
512
554
|
### Test case
|
|
513
555
|
|
|
514
556
|
One prompt + assertions + optional expectations, run N times per matrix cell.
|
|
@@ -533,13 +575,17 @@ assertions:
|
|
|
533
575
|
|
|
534
576
|
Default threshold is `1.0` (every evaluated rep must pass). Reps where the harness crashes are excluded from the denominator and counted as `adapterErrors`.
|
|
535
577
|
|
|
578
|
+
### Reference trajectory (optional)
|
|
579
|
+
|
|
580
|
+
Define expected tool calls for Vertex trajectory metrics on the eval envelope. Use `tool_name_mode: bare` when reference steps use short tool names but the harness records MCP-prefixed names. See [docs/suite-config.md — Reference trajectory](docs/suite-config.md#reference-trajectory).
|
|
581
|
+
|
|
536
582
|
**Full reference:** [docs/assertions.md](docs/assertions.md) — all assertion kinds, predicates, statistical model, and how to add new assertion types or harness adapters.
|
|
537
583
|
|
|
538
584
|
---
|
|
539
585
|
|
|
540
586
|
## Adding harness adapters
|
|
541
587
|
|
|
542
|
-
Built-in adapters register at module load.
|
|
588
|
+
Built-in adapters register at module load. **`claude-code`** and **`codex`** ship today; additional harnesses (Gemini CLI, Antigravity CLI) plug in via the same pattern:
|
|
543
589
|
|
|
544
590
|
1. Implement `HarnessAdapter` under `src/adapters/<id>/` with a `run(config)` that returns a `TrajectoryView`.
|
|
545
591
|
2. Add a nested config key on `SuiteConfig` (e.g. `codex: { ... }`) for harness-specific options.
|
|
@@ -554,7 +600,7 @@ import {
|
|
|
554
600
|
} from "@alis-build/harness-eval";
|
|
555
601
|
|
|
556
602
|
registerAdapter("my-harness", myAdapter);
|
|
557
|
-
console.log(listAdapters()); // ["claude-code", "my-harness"]
|
|
603
|
+
console.log(listAdapters()); // ["claude-code", "codex", "my-harness"]
|
|
558
604
|
```
|
|
559
605
|
|
|
560
606
|
Duplicate registration throws so accidental overrides fail fast during startup or tests.
|
|
@@ -610,12 +656,55 @@ The adapter captures Claude’s stream-json output and builds a `TrajectoryView`
|
|
|
610
656
|
|
|
611
657
|
---
|
|
612
658
|
|
|
659
|
+
## Codex CLI adapter
|
|
660
|
+
|
|
661
|
+
Nested under `codex` in YAML (or flat in programmatic config). Maps to [Codex CLI reference](https://developers.openai.com/codex/cli/reference) (`codex exec` flags).
|
|
662
|
+
|
|
663
|
+
The harness adapter invokes:
|
|
664
|
+
|
|
665
|
+
```bash
|
|
666
|
+
codex --ask-for-approval never exec --json [exec flags…] "<prompt>"
|
|
667
|
+
```
|
|
668
|
+
|
|
669
|
+
`--ask-for-approval` is a **global** flag (before `exec`); other options attach to the `exec` subcommand.
|
|
670
|
+
|
|
671
|
+
| Field | CLI flag | Notes |
|
|
672
|
+
| ----- | -------- | ----- |
|
|
673
|
+
| `binary` | — | Default `codex` |
|
|
674
|
+
| `model` | `--model` | Also settable at top level |
|
|
675
|
+
| `profile` | `--profile` | Layer `$CODEX_HOME/<profile>.config.toml` |
|
|
676
|
+
| `sandbox` | `--sandbox` | `read-only`, `workspace-write`, `danger-full-access` |
|
|
677
|
+
| `addDirs` | `--add-dir` | Extra writable dirs (repeatable) |
|
|
678
|
+
| `configOverrides` | `-c key=value` | Inline TOML overrides (repeatable) |
|
|
679
|
+
| `askForApproval` | `--ask-for-approval` | Default `never` for non-interactive eval |
|
|
680
|
+
| `dangerouslyBypassApprovalsAndSandbox` | `--yolo` | Hardened CI only |
|
|
681
|
+
| `dangerouslyBypassHookTrust` | `--dangerously-bypass-hook-trust` | Automation with vetted hooks |
|
|
682
|
+
| `ephemeral` | `--ephemeral` | No session rollout files |
|
|
683
|
+
| `ignoreUserConfig` | `--ignore-user-config` | Skip `$CODEX_HOME/config.toml` |
|
|
684
|
+
| `skipGitRepoCheck` | `--skip-git-repo-check` | Allow runs outside git repos |
|
|
685
|
+
| `outputSchema` | `--output-schema` | JSON Schema for structured final output |
|
|
686
|
+
| `outputLastMessage` | `--output-last-message` | Write final assistant message to file (auto temp path when `captureLastMessage` is true) |
|
|
687
|
+
| `captureLastMessage` | — | Default `true`: auto `--output-last-message` and read into `finalResponse` if JSONL has no assistant text |
|
|
688
|
+
| `isolateConfig` | — | `false` (default) = inherit `~/.codex`; `true` = temp `$CODEX_HOME` per run |
|
|
689
|
+
|
|
690
|
+
Generic `cwd` sets the child process working directory (`--cd`). MCP tool calls in Codex `--json` output map to harness names `mcp__<server>__<tool>`; shell commands map to `Bash`.
|
|
691
|
+
|
|
692
|
+
The adapter maps Codex JSONL events into the shared `StreamEvent` shape and feeds `TrajectoryBuilder`. Fixture-driven tests use committed recordings under `tests/fixtures/codex/` — CI does not require `codex` on `PATH`.
|
|
693
|
+
|
|
694
|
+
**Example suite:** [examples/codex-basic.yaml](examples/codex-basic.yaml)
|
|
695
|
+
|
|
696
|
+
**Codex judge:** set `judge.adapter: codex` and nest options under `judge.codex` in grading YAML (see [docs/suite-config.md](docs/suite-config.md)).
|
|
697
|
+
|
|
698
|
+
---
|
|
699
|
+
|
|
613
700
|
## Library API
|
|
614
701
|
|
|
615
702
|
```typescript
|
|
616
703
|
import {
|
|
617
704
|
loadSuite,
|
|
705
|
+
loadSuiteDocument,
|
|
618
706
|
runSuite,
|
|
707
|
+
runPipeline,
|
|
619
708
|
gradeReport,
|
|
620
709
|
buildEvalRunEnvelope,
|
|
621
710
|
trajectoryToTranscript,
|
|
@@ -625,6 +714,11 @@ import {
|
|
|
625
714
|
} from "@alis-build/harness-eval";
|
|
626
715
|
import { loadGradingConfig } from "@alis-build/harness-eval/config";
|
|
627
716
|
|
|
717
|
+
// Unified pipeline
|
|
718
|
+
const doc = await loadSuiteDocument("./examples/pipeline/suite.yaml");
|
|
719
|
+
const { exitCode } = await runPipeline(doc, { maxConcurrent: 2 });
|
|
720
|
+
|
|
721
|
+
// Or step-by-step
|
|
628
722
|
const suite = await loadSuite("./examples/basic.yaml");
|
|
629
723
|
const report = await runSuite(suite, { maxConcurrent: 2 });
|
|
630
724
|
|
|
@@ -649,7 +743,7 @@ const envelope = buildEvalRunEnvelope(report, {
|
|
|
649
743
|
});
|
|
650
744
|
```
|
|
651
745
|
|
|
652
|
-
Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`.
|
|
746
|
+
Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`, `@alis-build/harness-eval/adapters/codex`.
|
|
653
747
|
|
|
654
748
|
---
|
|
655
749
|
|
|
@@ -690,7 +784,7 @@ pnpm run typecheck
|
|
|
690
784
|
pnpm run generate-schemas # Zod → schemas/*.schema.json only
|
|
691
785
|
```
|
|
692
786
|
|
|
693
|
-
**Docs:** [Assertion DSL & adapter extension](docs/assertions.md) · [Eval record contract (DB / CI)](docs/eval-record.md)
|
|
787
|
+
**Docs:** [Suite & grading YAML](docs/suite-config.md) · [Assertion DSL & adapter extension](docs/assertions.md) · [Eval record contract (DB / CI)](docs/eval-record.md)
|
|
694
788
|
|
|
695
789
|
---
|
|
696
790
|
|
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-B9H4IZtA.js";
|
|
2
|
-
import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-6Z17eKZx.js";
|
|
1
|
+
import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-C0gBkl0-.js";
|
|
2
|
+
import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-DnvP1UBl.js";
|
|
3
3
|
export { type AdapterDiagnostics, AdapterError, type AdapterResult, type ClaudeCodeAdapterConfig, type ClaudeCodeAdapterResult, type ClaudeCodeOptions, type ParseErrorRecord, type PermissionMode, claudeCodeAdapter, runClaudeCode };
|
|
@@ -1,2 +1,3 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { t as AdapterError } from "../../types-Bac8_Ixb.js";
|
|
2
|
+
import { r as runClaudeCode, t as claudeCodeAdapter } from "../../claude-code-C_7hxC8z.js";
|
|
2
3
|
export { AdapterError, claudeCodeAdapter, runClaudeCode };
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import { a as HarnessAdapter, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "../../types-C0gBkl0-.js";
|
|
2
|
+
import { i as CodexOptions, n as CodexAdapterResult, r as CodexJsonEvent, t as CodexAdapterConfig } from "../../types-Bu8uOZZN.js";
|
|
3
|
+
|
|
4
|
+
//#region src/adapters/codex/map-events.d.ts
|
|
5
|
+
/** Stateful mapper — tracks session id and pending tool calls across the stream. */
|
|
6
|
+
declare class CodexEventMapper {
|
|
7
|
+
private sessionId;
|
|
8
|
+
private sawInit;
|
|
9
|
+
private startedItems;
|
|
10
|
+
private turnCount;
|
|
11
|
+
/** Map one parsed Codex JSON object to zero or more stream events. */
|
|
12
|
+
map(event: CodexJsonEvent): StreamEvent[];
|
|
13
|
+
private buildInit;
|
|
14
|
+
private ensureInit;
|
|
15
|
+
private mapItemStarted;
|
|
16
|
+
private mapItemCompleted;
|
|
17
|
+
private toolUseEvent;
|
|
18
|
+
private commandUseEvent;
|
|
19
|
+
private toolResultEvent;
|
|
20
|
+
private buildResult;
|
|
21
|
+
}
|
|
22
|
+
/** Map an entire fixture or stream of Codex events through a fresh mapper. */
|
|
23
|
+
declare function mapCodexEvents(events: CodexJsonEvent[]): StreamEvent[];
|
|
24
|
+
/** Build harness-qualified MCP tool name from Codex server + tool fields. */
|
|
25
|
+
declare function mcpToolName(server: string, tool: string): string;
|
|
26
|
+
//#endregion
|
|
27
|
+
//#region src/adapters/codex/flags.d.ts
|
|
28
|
+
/** Prepend global flags that must appear before the `exec` subcommand. */
|
|
29
|
+
declare function appendGlobalCodexFlags(args: string[], config: CodexOptions): void;
|
|
30
|
+
/** Append `codex exec` subcommand flags (after `exec`, before prompt). */
|
|
31
|
+
declare function appendExecCodexFlags(args: string[], config: CodexOptions & {
|
|
32
|
+
model?: string;
|
|
33
|
+
cwd?: string;
|
|
34
|
+
}): void;
|
|
35
|
+
/** @deprecated Use appendGlobalCodexFlags + appendExecCodexFlags */
|
|
36
|
+
declare function appendCodexFlags(args: string[], config: CodexOptions & {
|
|
37
|
+
model?: string;
|
|
38
|
+
cwd?: string;
|
|
39
|
+
}): void;
|
|
40
|
+
/**
|
|
41
|
+
* Ensure harness runs pass `--output-last-message` when capture is enabled.
|
|
42
|
+
* Returns the auto-generated path (for cleanup), or null if unchanged.
|
|
43
|
+
*/
|
|
44
|
+
declare function ensureHarnessOutputLastMessage(config: CodexAdapterConfig): string | null;
|
|
45
|
+
/**
|
|
46
|
+
* Build argv for `codex --ask-for-approval never exec --json … "<prompt>"`.
|
|
47
|
+
*
|
|
48
|
+
* Expects `config.outputLastMessage` to already be set if capture is desired;
|
|
49
|
+
* call {@link ensureHarnessOutputLastMessage} before this if spawning outside
|
|
50
|
+
* of {@link spawnCodex}.
|
|
51
|
+
*/
|
|
52
|
+
declare function buildArgs(config: CodexAdapterConfig): string[];
|
|
53
|
+
/**
|
|
54
|
+
* Build argv for `codex --ask-for-approval never exec … "<prompt>"` (no `--json`).
|
|
55
|
+
*/
|
|
56
|
+
declare function buildJudgeArgs(prompt: string, config?: CodexOptions & {
|
|
57
|
+
model?: string;
|
|
58
|
+
cwd?: string;
|
|
59
|
+
}): string[];
|
|
60
|
+
//#endregion
|
|
61
|
+
//#region src/adapters/codex/index.d.ts
|
|
62
|
+
/** Run Codex in headless `exec --json` mode and return a trajectory. */
|
|
63
|
+
declare function runCodex(config: CodexAdapterConfig): Promise<CodexAdapterResult>;
|
|
64
|
+
/** Registered {@link HarnessAdapter} for Codex CLI headless runs. */
|
|
65
|
+
declare const codexAdapter: HarnessAdapter<CodexAdapterConfig>;
|
|
66
|
+
//#endregion
|
|
67
|
+
export { type AdapterDiagnostics, AdapterError, type AdapterResult, type CodexAdapterConfig, type CodexAdapterResult, CodexEventMapper, type CodexOptions, type ParseErrorRecord, appendCodexFlags, appendExecCodexFlags, appendGlobalCodexFlags, buildArgs, buildJudgeArgs, codexAdapter, ensureHarnessOutputLastMessage, mapCodexEvents, mcpToolName, runCodex };
|
|
68
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
import { t as AdapterError } from "../../types-Bac8_Ixb.js";
|
|
2
|
+
import { a as appendGlobalCodexFlags, c as ensureHarnessOutputLastMessage, d as mcpToolName, i as appendExecCodexFlags, l as CodexEventMapper, n as runCodex, o as buildArgs, r as appendCodexFlags, s as buildJudgeArgs, t as codexAdapter, u as mapCodexEvents } from "../../codex-0cHO2te9.js";
|
|
3
|
+
export { AdapterError, CodexEventMapper, appendCodexFlags, appendExecCodexFlags, appendGlobalCodexFlags, buildArgs, buildJudgeArgs, codexAdapter, ensureHarnessOutputLastMessage, mapCodexEvents, mcpToolName, runCodex };
|
|
@@ -1,235 +1,9 @@
|
|
|
1
1
|
import { t as __exportAll } from "./rolldown-runtime-D7D4PA-g.js";
|
|
2
|
+
import { n as TrajectoryBuilder, t as AdapterError } from "./types-Bac8_Ixb.js";
|
|
2
3
|
import { spawn } from "node:child_process";
|
|
3
4
|
import { mkdtemp, rm } from "node:fs/promises";
|
|
4
5
|
import { tmpdir } from "node:os";
|
|
5
6
|
import { join } from "node:path";
|
|
6
|
-
//#region src/types/stream.ts
|
|
7
|
-
/** Type guards. Prefer these over manual `e.type === "..."` checks at call sites. */
|
|
8
|
-
function isSystemInit(e) {
|
|
9
|
-
return e.type === "system" && e.subtype === "init";
|
|
10
|
-
}
|
|
11
|
-
function isSystemRetry(e) {
|
|
12
|
-
return e.type === "system" && e.subtype === "api_retry";
|
|
13
|
-
}
|
|
14
|
-
function isAssistantMessage(e) {
|
|
15
|
-
return e.type === "assistant";
|
|
16
|
-
}
|
|
17
|
-
function isUserMessage(e) {
|
|
18
|
-
return e.type === "user";
|
|
19
|
-
}
|
|
20
|
-
function isResult(e) {
|
|
21
|
-
return e.type === "result";
|
|
22
|
-
}
|
|
23
|
-
function isTextBlock(b) {
|
|
24
|
-
return b.type === "text";
|
|
25
|
-
}
|
|
26
|
-
function isToolUseBlock(b) {
|
|
27
|
-
return b.type === "tool_use";
|
|
28
|
-
}
|
|
29
|
-
function isToolResultBlock(b) {
|
|
30
|
-
return b.type === "tool_result";
|
|
31
|
-
}
|
|
32
|
-
//#endregion
|
|
33
|
-
//#region src/types/trajectory.ts
|
|
34
|
-
/**
|
|
35
|
-
* Extract the MCP namespace prefix from a tool name.
|
|
36
|
-
*
|
|
37
|
-
* Claude Code formats MCP tool names as `mcp__<server>__<tool>`. The namespace
|
|
38
|
-
* is the first two segments joined: `mcp__<server>`. Returns null for non-MCP
|
|
39
|
-
* tool names (built-ins like `Bash`, `Read`, `Edit`).
|
|
40
|
-
*
|
|
41
|
-
* @example
|
|
42
|
-
* namespaceOf("mcp__api__search_skills") // "mcp__api"
|
|
43
|
-
* namespaceOf("Bash") // null
|
|
44
|
-
*/
|
|
45
|
-
function namespaceOf(toolName) {
|
|
46
|
-
if (!toolName.startsWith("mcp__")) return null;
|
|
47
|
-
const parts = toolName.split("__");
|
|
48
|
-
if (parts.length < 3) return null;
|
|
49
|
-
return `${parts[0]}__${parts[1]}`;
|
|
50
|
-
}
|
|
51
|
-
//#endregion
|
|
52
|
-
//#region src/trajectory/builder.ts
|
|
53
|
-
/**
|
|
54
|
-
* TrajectoryBuilder — consumes a stream of {@link StreamEvent} values and
|
|
55
|
-
* produces a {@link TrajectoryView}.
|
|
56
|
-
*
|
|
57
|
-
* State machine: the builder is a small, tolerant state machine. Invariants:
|
|
58
|
-
*
|
|
59
|
-
* - Exactly one `system/init` event opens the session. The builder requires
|
|
60
|
-
* it to be present before `build()`.
|
|
61
|
-
* - Each `assistant` event begins a new turn. Text blocks accumulate into
|
|
62
|
-
* the turn's text; `tool_use` blocks become `ToolCall` records.
|
|
63
|
-
* - `user` events with `tool_result` blocks deliver tool results back. We
|
|
64
|
-
* match them to pending calls by `tool_use_id`.
|
|
65
|
-
* - One `result` event closes the session and carries aggregate usage.
|
|
66
|
-
*
|
|
67
|
-
* The builder is *tolerant of partial streams*: a process killed mid-run
|
|
68
|
-
* produces a coherent (but flagged) view. Tool calls without matching results
|
|
69
|
-
* keep `result: null`. The `success` flag reflects whether a successful result
|
|
70
|
-
* event was actually observed.
|
|
71
|
-
*
|
|
72
|
-
* Why a class (not a reducer)?
|
|
73
|
-
* The internal `pendingCalls` map is mutable by design — we modify ToolCall
|
|
74
|
-
* objects in place when results arrive, so other parts of the view (which
|
|
75
|
-
* hold references to the same objects) see the update for free. A reducer
|
|
76
|
-
* would force a deep copy per result event, which is wasteful and would
|
|
77
|
-
* complicate identity-based queries.
|
|
78
|
-
*/
|
|
79
|
-
var TrajectoryBuilder = class {
|
|
80
|
-
meta = null;
|
|
81
|
-
sessionStartTs = null;
|
|
82
|
-
turns = [];
|
|
83
|
-
allToolCalls = [];
|
|
84
|
-
/**
|
|
85
|
-
* tool_use_id → ToolCall, for matching results back to calls.
|
|
86
|
-
* Entries are removed once a result is observed.
|
|
87
|
-
*/
|
|
88
|
-
pendingCalls = /* @__PURE__ */ new Map();
|
|
89
|
-
retries = [];
|
|
90
|
-
finalUsage = null;
|
|
91
|
-
finalCostUsd = 0;
|
|
92
|
-
finalDurationMs = 0;
|
|
93
|
-
finalNumTurns = 0;
|
|
94
|
-
finalResultText = "";
|
|
95
|
-
sawResultEvent = false;
|
|
96
|
-
resultIsError = false;
|
|
97
|
-
/**
|
|
98
|
-
* Consume one event. Safe to call with events in stream order.
|
|
99
|
-
*
|
|
100
|
-
* Unknown event types are silently ignored — the schema evolves and we
|
|
101
|
-
* don't want CI to break on a new event type we haven't modelled.
|
|
102
|
-
*/
|
|
103
|
-
consume(event) {
|
|
104
|
-
if (isSystemInit(event)) {
|
|
105
|
-
this.meta = {
|
|
106
|
-
sessionId: event.session_id,
|
|
107
|
-
model: event.model,
|
|
108
|
-
cwd: event.cwd,
|
|
109
|
-
permissionMode: event.permissionMode,
|
|
110
|
-
availableTools: event.tools ?? [],
|
|
111
|
-
mcpServers: (event.mcp_servers ?? []).map((s) => ({
|
|
112
|
-
name: s.name,
|
|
113
|
-
status: s.status
|
|
114
|
-
}))
|
|
115
|
-
};
|
|
116
|
-
this.sessionStartTs = Date.now();
|
|
117
|
-
return;
|
|
118
|
-
}
|
|
119
|
-
if (event.type === "system" && event.subtype === "api_retry") {
|
|
120
|
-
this.retries.push({
|
|
121
|
-
offsetMs: this.sessionStartTs ? Date.now() - this.sessionStartTs : 0,
|
|
122
|
-
raw: event
|
|
123
|
-
});
|
|
124
|
-
return;
|
|
125
|
-
}
|
|
126
|
-
if (isAssistantMessage(event)) {
|
|
127
|
-
this.handleAssistantMessage(event);
|
|
128
|
-
return;
|
|
129
|
-
}
|
|
130
|
-
if (isUserMessage(event)) {
|
|
131
|
-
this.handleUserMessage(event);
|
|
132
|
-
return;
|
|
133
|
-
}
|
|
134
|
-
if (isResult(event)) {
|
|
135
|
-
this.sawResultEvent = true;
|
|
136
|
-
this.resultIsError = event.is_error;
|
|
137
|
-
this.finalUsage = event.usage ?? null;
|
|
138
|
-
this.finalCostUsd = event.total_cost_usd ?? 0;
|
|
139
|
-
this.finalDurationMs = event.duration_ms ?? 0;
|
|
140
|
-
this.finalNumTurns = event.num_turns ?? 0;
|
|
141
|
-
this.finalResultText = event.result ?? "";
|
|
142
|
-
return;
|
|
143
|
-
}
|
|
144
|
-
}
|
|
145
|
-
/**
|
|
146
|
-
* Finalize the view. Call after consuming the last event from the stream.
|
|
147
|
-
*
|
|
148
|
-
* Throws if no `system/init` was observed — at that point we have no model,
|
|
149
|
-
* no session id, and no available-tools list, which means assertions like
|
|
150
|
-
* "called any mcp__api__* tool" can't even be evaluated meaningfully.
|
|
151
|
-
*/
|
|
152
|
-
build() {
|
|
153
|
-
if (this.meta === null) throw new Error("TrajectoryBuilder.build() called before any system/init event was observed. The harness may have failed to start, or the stream was truncated before init.");
|
|
154
|
-
const lastTurn = this.turns[this.turns.length - 1];
|
|
155
|
-
const accumulatedText = this.turns.map((t) => t.text).filter((t) => t.length > 0).join("\n\n").trim();
|
|
156
|
-
return {
|
|
157
|
-
meta: this.meta,
|
|
158
|
-
toolCalls: this.allToolCalls,
|
|
159
|
-
turns: this.turns,
|
|
160
|
-
finalResponse: accumulatedText || this.finalResultText,
|
|
161
|
-
finalStopReason: lastTurn?.stopReason ?? null,
|
|
162
|
-
usage: {
|
|
163
|
-
inputTokens: this.finalUsage?.input_tokens ?? 0,
|
|
164
|
-
outputTokens: this.finalUsage?.output_tokens ?? 0,
|
|
165
|
-
totalCostUsd: this.finalCostUsd,
|
|
166
|
-
durationMs: this.finalDurationMs,
|
|
167
|
-
numTurns: this.finalNumTurns || this.turns.length
|
|
168
|
-
},
|
|
169
|
-
retries: this.retries,
|
|
170
|
-
success: this.sawResultEvent && !this.resultIsError
|
|
171
|
-
};
|
|
172
|
-
}
|
|
173
|
-
handleAssistantMessage(event) {
|
|
174
|
-
const turnIndex = this.turns.length;
|
|
175
|
-
const textChunks = [];
|
|
176
|
-
const toolCallsThisTurn = [];
|
|
177
|
-
for (const block of event.message.content) {
|
|
178
|
-
if (isTextBlock(block)) {
|
|
179
|
-
textChunks.push(block.text);
|
|
180
|
-
continue;
|
|
181
|
-
}
|
|
182
|
-
if (isToolUseBlock(block)) {
|
|
183
|
-
const call = {
|
|
184
|
-
name: block.name,
|
|
185
|
-
namespace: namespaceOf(block.name),
|
|
186
|
-
callId: block.id,
|
|
187
|
-
args: block.input,
|
|
188
|
-
result: null,
|
|
189
|
-
isError: false,
|
|
190
|
-
turnIndex,
|
|
191
|
-
callIndex: this.allToolCalls.length
|
|
192
|
-
};
|
|
193
|
-
this.allToolCalls.push(call);
|
|
194
|
-
this.pendingCalls.set(block.id, call);
|
|
195
|
-
toolCallsThisTurn.push(call);
|
|
196
|
-
continue;
|
|
197
|
-
}
|
|
198
|
-
}
|
|
199
|
-
this.turns.push({
|
|
200
|
-
turnIndex,
|
|
201
|
-
text: textChunks.join("").trim(),
|
|
202
|
-
toolCalls: toolCallsThisTurn,
|
|
203
|
-
stopReason: event.message.stop_reason ?? null
|
|
204
|
-
});
|
|
205
|
-
}
|
|
206
|
-
handleUserMessage(event) {
|
|
207
|
-
const content = event.message.content;
|
|
208
|
-
if (typeof content === "string") return;
|
|
209
|
-
for (const block of content) {
|
|
210
|
-
if (!isToolResultBlock(block)) continue;
|
|
211
|
-
const call = this.pendingCalls.get(block.tool_use_id);
|
|
212
|
-
if (!call) continue;
|
|
213
|
-
call.result = block.content;
|
|
214
|
-
call.isError = block.is_error ?? false;
|
|
215
|
-
this.pendingCalls.delete(block.tool_use_id);
|
|
216
|
-
}
|
|
217
|
-
}
|
|
218
|
-
};
|
|
219
|
-
/**
|
|
220
|
-
* Convenience: drain an async iterable of events through a fresh builder.
|
|
221
|
-
*
|
|
222
|
-
* Suitable when you have the full event stream and just want the view.
|
|
223
|
-
* For interactive/incremental scenarios (e.g. surfacing partial state in a UI)
|
|
224
|
-
* instantiate {@link TrajectoryBuilder} directly and call `consume()` /
|
|
225
|
-
* `build()` yourself.
|
|
226
|
-
*/
|
|
227
|
-
async function buildTrajectory(events) {
|
|
228
|
-
const builder = new TrajectoryBuilder();
|
|
229
|
-
for await (const event of events) builder.consume(event);
|
|
230
|
-
return builder.build();
|
|
231
|
-
}
|
|
232
|
-
//#endregion
|
|
233
7
|
//#region src/parsers/stream-json.ts
|
|
234
8
|
/**
|
|
235
9
|
* Parse a readable stream of NDJSON into a sequence of typed stream-json events.
|
|
@@ -281,27 +55,16 @@ function tryParseLine(line) {
|
|
|
281
55
|
}
|
|
282
56
|
}
|
|
283
57
|
//#endregion
|
|
284
|
-
//#region src/adapters/types.ts
|
|
285
|
-
/**
|
|
286
|
-
* Thrown when the harness fails to produce a usable trajectory.
|
|
287
|
-
*
|
|
288
|
-
* Most commonly this means the process failed before emitting a usable
|
|
289
|
-
* session init event. Inspect `diagnostics.stderr` for the cause.
|
|
290
|
-
*/
|
|
291
|
-
var AdapterError = class extends Error {
|
|
292
|
-
diagnostics;
|
|
293
|
-
constructor(message, diagnostics) {
|
|
294
|
-
super(message);
|
|
295
|
-
this.diagnostics = diagnostics;
|
|
296
|
-
this.name = "AdapterError";
|
|
297
|
-
}
|
|
298
|
-
};
|
|
299
|
-
//#endregion
|
|
300
58
|
//#region src/adapters/claude-code/flags.ts
|
|
59
|
+
/** Append repeated `--flag value` pairs for array config fields. */
|
|
301
60
|
function pushRepeatableFlag(args, flag, values) {
|
|
302
61
|
if (!values) return;
|
|
303
62
|
for (const value of values) args.push(flag, value);
|
|
304
63
|
}
|
|
64
|
+
/**
|
|
65
|
+
* Append an optional CLI flag. Boolean `true` emits the flag alone; other
|
|
66
|
+
* scalars emit `--flag value`.
|
|
67
|
+
*/
|
|
305
68
|
function pushOptionalFlag(args, flag, value) {
|
|
306
69
|
if (value === void 0) return;
|
|
307
70
|
if (typeof value === "boolean") {
|
|
@@ -360,7 +123,12 @@ function buildArgs(config) {
|
|
|
360
123
|
appendClaudeCodeFlags(args, config);
|
|
361
124
|
return args;
|
|
362
125
|
}
|
|
363
|
-
/**
|
|
126
|
+
/**
|
|
127
|
+
* Build args for an LLM judge subprocess (`--output-format json`).
|
|
128
|
+
*
|
|
129
|
+
* Defaults permission mode to `bypassPermissions` so the judge does not
|
|
130
|
+
* block on tool permission prompts during single-shot JSON grading.
|
|
131
|
+
*/
|
|
364
132
|
function buildJudgeArgs(prompt, config = {}) {
|
|
365
133
|
const args = [
|
|
366
134
|
"-p",
|
|
@@ -402,6 +170,14 @@ const KILL_GRACE_MS = 5e3;
|
|
|
402
170
|
/**
|
|
403
171
|
* Spawn `claude` in headless mode with isolated config and a process-group
|
|
404
172
|
* lifecycle. See {@link SpawnedClaude} for how to consume the result.
|
|
173
|
+
*
|
|
174
|
+
* **Kill sequence:** timeout and abort both follow the same two-step path:
|
|
175
|
+
* `SIGTERM` to the process group, then `SIGKILL` after {@link KILL_GRACE_MS}
|
|
176
|
+
* if the group is still alive. This avoids leaving MCP/tool subprocesses
|
|
177
|
+
* running while still giving claude a chance to flush stream-json output.
|
|
178
|
+
*
|
|
179
|
+
* @param config - Adapter options; `timeoutMs`, `signal`, and `isolateConfig`
|
|
180
|
+
* control lifecycle and config isolation.
|
|
405
181
|
*/
|
|
406
182
|
async function spawnClaude(config) {
|
|
407
183
|
const binary = config.binary ?? "claude";
|
|
@@ -425,6 +201,10 @@ async function spawnClaude(config) {
|
|
|
425
201
|
let timedOut = false;
|
|
426
202
|
let killEscalation = null;
|
|
427
203
|
const timeoutMs = config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
|
|
204
|
+
/**
|
|
205
|
+
* Arm (or re-arm) the SIGKILL fallback. Each SIGTERM attempt gets its own
|
|
206
|
+
* grace window so a slow shutdown doesn't leave orphaned MCP servers.
|
|
207
|
+
*/
|
|
428
208
|
const scheduleKillEscalation = () => {
|
|
429
209
|
if (killEscalation) clearTimeout(killEscalation);
|
|
430
210
|
killEscalation = setTimeout(() => killTree(child, "SIGKILL"), KILL_GRACE_MS);
|
|
@@ -487,10 +267,16 @@ async function spawnClaude(config) {
|
|
|
487
267
|
* group is already gone. This catches MCP server subprocesses and tool
|
|
488
268
|
* processes spawned by claude.
|
|
489
269
|
*
|
|
490
|
-
*
|
|
491
|
-
*
|
|
492
|
-
*
|
|
493
|
-
*
|
|
270
|
+
* **Signal escalation:** callers typically invoke this first with `SIGTERM`,
|
|
271
|
+
* then again with `SIGKILL` after {@link KILL_GRACE_MS}. The group kill is
|
|
272
|
+
* essential — a bare `child.kill()` would leave MCP servers running.
|
|
273
|
+
*
|
|
274
|
+
* **Platform edge case:** when the group leader exits first, `kill(-pid)`
|
|
275
|
+
* throws `ESRCH`. The single-PID fallback covers that without failing the
|
|
276
|
+
* adapter run.
|
|
277
|
+
*
|
|
278
|
+
* @param child - Spawned process handle from {@link spawn}.
|
|
279
|
+
* @param signal - POSIX signal to deliver (`SIGTERM` or `SIGKILL` in practice).
|
|
494
280
|
*/
|
|
495
281
|
function killTree(child, signal) {
|
|
496
282
|
if (child.pid === void 0) return;
|
|
@@ -553,11 +339,12 @@ async function runClaudeCode(config) {
|
|
|
553
339
|
await spawned.cleanup();
|
|
554
340
|
}
|
|
555
341
|
}
|
|
342
|
+
/** Registered {@link HarnessAdapter} for Claude Code headless runs. */
|
|
556
343
|
const claudeCodeAdapter = {
|
|
557
344
|
id: "claude-code",
|
|
558
345
|
run: runClaudeCode
|
|
559
346
|
};
|
|
560
347
|
//#endregion
|
|
561
|
-
export {
|
|
348
|
+
export { parseStreamJson as a, buildJudgeArgs as i, claude_code_exports as n, runClaudeCode as r, claudeCodeAdapter as t };
|
|
562
349
|
|
|
563
|
-
//# sourceMappingURL=claude-code-
|
|
350
|
+
//# sourceMappingURL=claude-code-C_7hxC8z.js.map
|