@alis-build/harness-eval 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/README.md +104 -10
  2. package/dist/adapters/claude-code/index.d.ts +2 -2
  3. package/dist/adapters/claude-code/index.js +2 -1
  4. package/dist/adapters/codex/index.d.ts +68 -0
  5. package/dist/adapters/codex/index.js +3 -0
  6. package/dist/{claude-code-ycT0JQZF.js → claude-code-C_7hxC8z.js} +37 -250
  7. package/dist/claude-code-C_7hxC8z.js.map +1 -0
  8. package/dist/cli/bin.js +204 -127
  9. package/dist/cli/bin.js.map +1 -1
  10. package/dist/codex-0cHO2te9.js +496 -0
  11. package/dist/codex-0cHO2te9.js.map +1 -0
  12. package/dist/config/loader.d.ts +2 -2
  13. package/dist/config/loader.js +2 -2
  14. package/dist/{index-6Z17eKZx.d.ts → index-DnvP1UBl.d.ts} +3 -2
  15. package/dist/index.d.ts +397 -153
  16. package/dist/index.js +125 -5
  17. package/dist/index.js.map +1 -0
  18. package/dist/loader-B1WmGGzf.d.ts +107 -0
  19. package/dist/{loader-BCnFJ8rm.js → loader-DnQ6Jt0i.js} +707 -157
  20. package/dist/loader-DnQ6Jt0i.js.map +1 -0
  21. package/dist/reporter-Biy-5-9M.js +2216 -0
  22. package/dist/reporter-Biy-5-9M.js.map +1 -0
  23. package/dist/runner/suite.d.ts +1 -1
  24. package/dist/runner/suite.js +1 -1
  25. package/dist/{suite-BoOvK_lq.d.ts → suite-BEShV0by.d.ts} +7 -2
  26. package/dist/{suite-chj0j22j.js → suite-BcP64nlb.js} +72 -4
  27. package/dist/suite-BcP64nlb.js.map +1 -0
  28. package/dist/{types-BQol062t.d.ts → types-0QkNVyp9.d.ts} +152 -11
  29. package/dist/types-Bac8_Ixb.js +246 -0
  30. package/dist/types-Bac8_Ixb.js.map +1 -0
  31. package/dist/types-Bu8uOZZN.d.ts +77 -0
  32. package/dist/{types-B9H4IZtA.d.ts → types-C0gBkl0-.d.ts} +3 -2
  33. package/package.json +7 -2
  34. package/schemas/eval-interchange-instances.schema.json +196 -0
  35. package/schemas/eval-interchange.schema.json +65 -52
  36. package/schemas/eval-run-envelope.schema.json +182 -425
  37. package/dist/build-DsVJ_UeU.js +0 -1396
  38. package/dist/build-DsVJ_UeU.js.map +0 -1
  39. package/dist/claude-code-ycT0JQZF.js.map +0 -1
  40. package/dist/loader-BCnFJ8rm.js.map +0 -1
  41. package/dist/loader-DTvoVfN0.d.ts +0 -33
  42. package/dist/suite-chj0j22j.js.map +0 -1
  43. package/schemas/eval-interchange-agent-trace.schema.json +0 -322
  44. package/schemas/eval-interchange-proto-instance.schema.json +0 -106
package/README.md CHANGED
@@ -54,10 +54,11 @@ pnpm exec harness-eval --help
54
54
 
55
55
  Suites are YAML files. Committed examples:
56
56
 
57
- - [`examples/basic.yaml`](examples/basic.yaml) — smoke test using the built-in `Read` tool on this repo's README
57
+ - [`examples/pipeline/`](examples/pipeline/) — **recommended** unified layout with inline `judge:` + `pipeline:` orchestration
58
+ - [`examples/basic.yaml`](examples/basic.yaml) — minimal smoke test using the built-in `Read` tool on this repo's README
58
59
  - [`examples/matrix.yaml`](examples/matrix.yaml) — same idea with a model matrix (sonnet vs opus)
59
60
  - [`examples/multi-file/`](examples/multi-file/) — directory layout with `suite.yaml` plus cases under `cases/`
60
- - [`examples/grading.yaml`](examples/grading.yaml) — standalone judge config for `harness-eval grade`
61
+ - [`examples/grading.yaml`](examples/grading.yaml) — standalone judge config (alternate to inline `judge:`)
61
62
 
62
63
  ```yaml
63
64
  adapter: claude-code
@@ -96,9 +97,15 @@ cases:
96
97
 
97
98
  Generic fields (`model`, `cwd`, `timeoutMs`, `env`) sit at the top level. Claude-specific options go under `claudeCode`.
98
99
 
100
+ **Full suite & grading YAML reference:** [docs/suite-config.md](docs/suite-config.md) — all case/matrix fields, inline `judge:` / `pipeline:`, multi-file layout, and standalone `grading.yaml`.
101
+
99
102
  ### 2. Run behavioral eval
100
103
 
101
104
  ```bash
105
+ # Unified pipeline (run + optional grade + envelope when pipeline: is defined)
106
+ npx @alis-build/harness-eval pipeline examples/pipeline/
107
+
108
+ # Or run harness only
102
109
  npx @alis-build/harness-eval run examples/basic.yaml --output report.json --max-concurrent 1 --format console
103
110
  ```
104
111
 
@@ -110,7 +117,14 @@ Exit code `0` = all cells passed all assertion thresholds.
110
117
 
111
118
  ### 3. Grade outcomes (optional)
112
119
 
113
- Judge model, timeout, env, and `claudeCode` flags live in a separate **`grading.yaml`** (not in the suite file). See [`examples/grading.yaml`](examples/grading.yaml).
120
+ **Unified suite:** add a top-level `judge:` block in `suite.yaml` (see [`examples/pipeline/suite.yaml`](examples/pipeline/suite.yaml)), then:
121
+
122
+ ```bash
123
+ npx @alis-build/harness-eval grade report.json --suite examples/pipeline/suite.yaml --output grading.json --max-concurrent 1 --format console
124
+ # or: npx @alis-build/harness-eval pipeline examples/pipeline/ --steps grade
125
+ ```
126
+
127
+ **Standalone grading file:** judge config in a separate **`grading.yaml`** (still supported). See [`examples/grading.yaml`](examples/grading.yaml).
114
128
 
115
129
  ```bash
116
130
  npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --output grading.json --max-concurrent 1 --format console
@@ -399,6 +413,7 @@ Both layers use statistical thresholds: a case runs `repetitions` times per matr
399
413
  npx @alis-build/harness-eval run <suite.yaml> [options]
400
414
  npx @alis-build/harness-eval grade <report.json> [options]
401
415
  npx @alis-build/harness-eval envelope <report.json> [options]
416
+ npx @alis-build/harness-eval pipeline <suite.yaml|dir> [options]
402
417
  npx @alis-build/harness-eval format <report.json> [options]
403
418
  npx @alis-build/harness-eval --help
404
419
  ```
@@ -420,7 +435,9 @@ npx @alis-build/harness-eval --help
420
435
 
421
436
  ### `grade`
422
437
 
423
- Uses a standalone **`grading.yaml`** for judge model, timeout, env, and `claudeCode` flags (Option B — separate from the suite file).
438
+ Uses **`grading.yaml`** or an inline **`judge:`** block in `suite.yaml` (`--suite`).
439
+
440
+ **Field reference:** [docs/suite-config.md — Grading config](docs/suite-config.md#grading-config-gradingyaml)
424
441
 
425
442
  ```yaml
426
443
  # examples/grading.yaml
@@ -440,6 +457,7 @@ npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --
440
457
  | Option | Description |
441
458
  | -------------------------------------- | ----------------------------------------------------------------- |
442
459
  | `--config <path>` | Grading YAML (`judge` block) — model, env, timeout, `claudeCode` |
460
+ | `--suite <path>` | Unified `suite.yaml` with inline `judge:` (alternative to `--config`) |
443
461
  | `--output <path>` | Write grading JSON |
444
462
  | `--expectations <path>` | Sidecar YAML/JSON if report lacks expectations |
445
463
  | `--format console\|json` | Output format |
@@ -467,7 +485,7 @@ npx @alis-build/harness-eval envelope report.json --suite examples/basic.yaml --
467
485
  # Interchange projections
468
486
  npx @alis-build/harness-eval envelope report.json --projection trajectory --output trajectory.jsonl
469
487
  npx @alis-build/harness-eval envelope report.json --projection instances --output instances.json
470
- npx @alis-build/harness-eval envelope report.json --projection agent-trace --output agent-traces.json
488
+ npx @alis-build/harness-eval envelope report.json --projection instances --output instances.jsonl
471
489
  ```
472
490
 
473
491
  | Option | Description |
@@ -475,12 +493,34 @@ npx @alis-build/harness-eval envelope report.json --projection agent-trace --out
475
493
  | `--output <path>` | Write output (stdout if omitted) |
476
494
  | `--grading <path>` | Merge `grading.json` outcome scores into the envelope |
477
495
  | `--suite <path>` | Suite YAML for provenance (`uri`, `contentHash`) |
478
- | `--projection envelope\|trajectory\|instances\|agent-trace` | Output shape (default: `envelope`) |
496
+ | `--projection envelope\|trajectory\|instances` | Output shape (default: `envelope`) |
479
497
  | `--include-raw-stream-events` | Include adapter raw stream events in repetition artifacts |
480
498
  | `--no-transcript` | Omit judge transcript artifacts |
481
499
 
482
500
  Exit codes: `0` = envelope built and behavioral pass; `1` = built but behavioral failures; `2` = usage or file errors.
483
501
 
502
+ ### `pipeline`
503
+
504
+ Orchestrate **run → grade → envelope** from a unified `suite.yaml` when a `pipeline:` block is present. See [docs/suite-config.md — Pipeline orchestration](docs/suite-config.md#pipeline-orchestration-pipeline).
505
+
506
+ ```bash
507
+ npx @alis-build/harness-eval pipeline examples/pipeline/
508
+ npx @alis-build/harness-eval pipeline my-suite/ --steps run,grade
509
+ ```
510
+
511
+ | Option | Description |
512
+ | ------ | ----------- |
513
+ | `--steps run,grade,envelope` | Subset of configured steps (default: all configured) |
514
+ | `--output <path>` | Override `pipeline.run.output` |
515
+ | `--report <path>` | Override report input for grade/envelope |
516
+ | `--grading <path>` | Override grading input for envelope |
517
+ | `--grading-output <path>` | Override `pipeline.grade.output` |
518
+ | `--envelope-output <path>` | Override `pipeline.envelope.output` |
519
+ | `--projection envelope\|trajectory\|instances` | Envelope projection |
520
+ | `--max-concurrent <n>` | Parallel harness/judge workers |
521
+
522
+ Exit codes match the first failing step (`run`, `grade`, or `envelope`). Returns `2` when no `pipeline:` block exists.
523
+
484
524
  ### `format`
485
525
 
486
526
  Re-render an existing `report.json` without re-running the harness.
@@ -509,6 +549,8 @@ See [Data contracts & schemas](#data-contracts--schemas) for type details.
509
549
 
510
550
  ## Suite concepts
511
551
 
552
+ **Authoring reference:** [docs/suite-config.md](docs/suite-config.md) — complete field list for suite YAML, matrix cells, test cases, reference trajectories, and grading config.
553
+
512
554
  ### Test case
513
555
 
514
556
  One prompt + assertions + optional expectations, run N times per matrix cell.
@@ -533,13 +575,17 @@ assertions:
533
575
 
534
576
  Default threshold is `1.0` (every evaluated rep must pass). Reps where the harness crashes are excluded from the denominator and counted as `adapterErrors`.
535
577
 
578
+ ### Reference trajectory (optional)
579
+
580
+ Define expected tool calls for Vertex trajectory metrics on the eval envelope. Use `tool_name_mode: bare` when reference steps use short tool names but the harness records MCP-prefixed names. See [docs/suite-config.md — Reference trajectory](docs/suite-config.md#reference-trajectory).
581
+
536
582
  **Full reference:** [docs/assertions.md](docs/assertions.md) — all assertion kinds, predicates, statistical model, and how to add new assertion types or harness adapters.
537
583
 
538
584
  ---
539
585
 
540
586
  ## Adding harness adapters
541
587
 
542
- Built-in adapters register at module load. Today only `claude-code` ships; additional harnesses (Codex, Gemini CLI, Antigravity CLI) plug in via the same pattern:
588
+ Built-in adapters register at module load. **`claude-code`** and **`codex`** ship today; additional harnesses (Gemini CLI, Antigravity CLI) plug in via the same pattern:
543
589
 
544
590
  1. Implement `HarnessAdapter` under `src/adapters/<id>/` with a `run(config)` that returns a `TrajectoryView`.
545
591
  2. Add a nested config key on `SuiteConfig` (e.g. `codex: { ... }`) for harness-specific options.
@@ -554,7 +600,7 @@ import {
554
600
  } from "@alis-build/harness-eval";
555
601
 
556
602
  registerAdapter("my-harness", myAdapter);
557
- console.log(listAdapters()); // ["claude-code", "my-harness"]
603
+ console.log(listAdapters()); // ["claude-code", "codex", "my-harness"]
558
604
  ```
559
605
 
560
606
  Duplicate registration throws so accidental overrides fail fast during startup or tests.
@@ -610,12 +656,55 @@ The adapter captures Claude’s stream-json output and builds a `TrajectoryView`
610
656
 
611
657
  ---
612
658
 
659
+ ## Codex CLI adapter
660
+
661
+ Nested under `codex` in YAML (or flat in programmatic config). Maps to [Codex CLI reference](https://developers.openai.com/codex/cli/reference) (`codex exec` flags).
662
+
663
+ The harness adapter invokes:
664
+
665
+ ```bash
666
+ codex --ask-for-approval never exec --json [exec flags…] "<prompt>"
667
+ ```
668
+
669
+ `--ask-for-approval` is a **global** flag (before `exec`); other options attach to the `exec` subcommand.
670
+
671
+ | Field | CLI flag | Notes |
672
+ | ----- | -------- | ----- |
673
+ | `binary` | — | Default `codex` |
674
+ | `model` | `--model` | Also settable at top level |
675
+ | `profile` | `--profile` | Layer `$CODEX_HOME/<profile>.config.toml` |
676
+ | `sandbox` | `--sandbox` | `read-only`, `workspace-write`, `danger-full-access` |
677
+ | `addDirs` | `--add-dir` | Extra writable dirs (repeatable) |
678
+ | `configOverrides` | `-c key=value` | Inline TOML overrides (repeatable) |
679
+ | `askForApproval` | `--ask-for-approval` | Default `never` for non-interactive eval |
680
+ | `dangerouslyBypassApprovalsAndSandbox` | `--yolo` | Hardened CI only |
681
+ | `dangerouslyBypassHookTrust` | `--dangerously-bypass-hook-trust` | Automation with vetted hooks |
682
+ | `ephemeral` | `--ephemeral` | No session rollout files |
683
+ | `ignoreUserConfig` | `--ignore-user-config` | Skip `$CODEX_HOME/config.toml` |
684
+ | `skipGitRepoCheck` | `--skip-git-repo-check` | Allow runs outside git repos |
685
+ | `outputSchema` | `--output-schema` | JSON Schema for structured final output |
686
+ | `outputLastMessage` | `--output-last-message` | Write final assistant message to file (auto temp path when `captureLastMessage` is true) |
687
+ | `captureLastMessage` | — | Default `true`: auto `--output-last-message` and read into `finalResponse` if JSONL has no assistant text |
688
+ | `isolateConfig` | — | `false` (default) = inherit `~/.codex`; `true` = temp `$CODEX_HOME` per run |
689
+
690
+ Generic `cwd` sets the child process working directory (`--cd`). MCP tool calls in Codex `--json` output map to harness names `mcp__<server>__<tool>`; shell commands map to `Bash`.
691
+
692
+ The adapter maps Codex JSONL events into the shared `StreamEvent` shape and feeds `TrajectoryBuilder`. Fixture-driven tests use committed recordings under `tests/fixtures/codex/` — CI does not require `codex` on `PATH`.
693
+
694
+ **Example suite:** [examples/codex-basic.yaml](examples/codex-basic.yaml)
695
+
696
+ **Codex judge:** set `judge.adapter: codex` and nest options under `judge.codex` in grading YAML (see [docs/suite-config.md](docs/suite-config.md)).
697
+
698
+ ---
699
+
613
700
  ## Library API
614
701
 
615
702
  ```typescript
616
703
  import {
617
704
  loadSuite,
705
+ loadSuiteDocument,
618
706
  runSuite,
707
+ runPipeline,
619
708
  gradeReport,
620
709
  buildEvalRunEnvelope,
621
710
  trajectoryToTranscript,
@@ -625,6 +714,11 @@ import {
625
714
  } from "@alis-build/harness-eval";
626
715
  import { loadGradingConfig } from "@alis-build/harness-eval/config";
627
716
 
717
+ // Unified pipeline
718
+ const doc = await loadSuiteDocument("./examples/pipeline/suite.yaml");
719
+ const { exitCode } = await runPipeline(doc, { maxConcurrent: 2 });
720
+
721
+ // Or step-by-step
628
722
  const suite = await loadSuite("./examples/basic.yaml");
629
723
  const report = await runSuite(suite, { maxConcurrent: 2 });
630
724
 
@@ -649,7 +743,7 @@ const envelope = buildEvalRunEnvelope(report, {
649
743
  });
650
744
  ```
651
745
 
652
- Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`.
746
+ Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`, `@alis-build/harness-eval/adapters/codex`.
653
747
 
654
748
  ---
655
749
 
@@ -690,7 +784,7 @@ pnpm run typecheck
690
784
  pnpm run generate-schemas # Zod → schemas/*.schema.json only
691
785
  ```
692
786
 
693
- **Docs:** [Assertion DSL & adapter extension](docs/assertions.md) · [Eval record contract (DB / CI)](docs/eval-record.md)
787
+ **Docs:** [Suite & grading YAML](docs/suite-config.md) · [Assertion DSL & adapter extension](docs/assertions.md) · [Eval record contract (DB / CI)](docs/eval-record.md)
694
788
 
695
789
  ---
696
790
 
@@ -1,3 +1,3 @@
1
- import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-B9H4IZtA.js";
2
- import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-6Z17eKZx.js";
1
+ import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-C0gBkl0-.js";
2
+ import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-DnvP1UBl.js";
3
3
  export { type AdapterDiagnostics, AdapterError, type AdapterResult, type ClaudeCodeAdapterConfig, type ClaudeCodeAdapterResult, type ClaudeCodeOptions, type ParseErrorRecord, type PermissionMode, claudeCodeAdapter, runClaudeCode };
@@ -1,2 +1,3 @@
1
- import { a as AdapterError, r as runClaudeCode, t as claudeCodeAdapter } from "../../claude-code-ycT0JQZF.js";
1
+ import { t as AdapterError } from "../../types-Bac8_Ixb.js";
2
+ import { r as runClaudeCode, t as claudeCodeAdapter } from "../../claude-code-C_7hxC8z.js";
2
3
  export { AdapterError, claudeCodeAdapter, runClaudeCode };
@@ -0,0 +1,68 @@
1
+ import { a as HarnessAdapter, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "../../types-C0gBkl0-.js";
2
+ import { i as CodexOptions, n as CodexAdapterResult, r as CodexJsonEvent, t as CodexAdapterConfig } from "../../types-Bu8uOZZN.js";
3
+
4
+ //#region src/adapters/codex/map-events.d.ts
5
+ /** Stateful mapper — tracks session id and pending tool calls across the stream. */
6
+ declare class CodexEventMapper {
7
+ private sessionId;
8
+ private sawInit;
9
+ private startedItems;
10
+ private turnCount;
11
+ /** Map one parsed Codex JSON object to zero or more stream events. */
12
+ map(event: CodexJsonEvent): StreamEvent[];
13
+ private buildInit;
14
+ private ensureInit;
15
+ private mapItemStarted;
16
+ private mapItemCompleted;
17
+ private toolUseEvent;
18
+ private commandUseEvent;
19
+ private toolResultEvent;
20
+ private buildResult;
21
+ }
22
+ /** Map an entire fixture or stream of Codex events through a fresh mapper. */
23
+ declare function mapCodexEvents(events: CodexJsonEvent[]): StreamEvent[];
24
+ /** Build harness-qualified MCP tool name from Codex server + tool fields. */
25
+ declare function mcpToolName(server: string, tool: string): string;
26
+ //#endregion
27
+ //#region src/adapters/codex/flags.d.ts
28
+ /** Prepend global flags that must appear before the `exec` subcommand. */
29
+ declare function appendGlobalCodexFlags(args: string[], config: CodexOptions): void;
30
+ /** Append `codex exec` subcommand flags (after `exec`, before prompt). */
31
+ declare function appendExecCodexFlags(args: string[], config: CodexOptions & {
32
+ model?: string;
33
+ cwd?: string;
34
+ }): void;
35
+ /** @deprecated Use appendGlobalCodexFlags + appendExecCodexFlags */
36
+ declare function appendCodexFlags(args: string[], config: CodexOptions & {
37
+ model?: string;
38
+ cwd?: string;
39
+ }): void;
40
+ /**
41
+ * Ensure harness runs pass `--output-last-message` when capture is enabled.
42
+ * Returns the auto-generated path (for cleanup), or null if unchanged.
43
+ */
44
+ declare function ensureHarnessOutputLastMessage(config: CodexAdapterConfig): string | null;
45
+ /**
46
+ * Build argv for `codex --ask-for-approval never exec --json … "<prompt>"`.
47
+ *
48
+ * Expects `config.outputLastMessage` to already be set if capture is desired;
49
+ * call {@link ensureHarnessOutputLastMessage} before this if spawning outside
50
+ * of {@link spawnCodex}.
51
+ */
52
+ declare function buildArgs(config: CodexAdapterConfig): string[];
53
+ /**
54
+ * Build argv for `codex --ask-for-approval never exec … "<prompt>"` (no `--json`).
55
+ */
56
+ declare function buildJudgeArgs(prompt: string, config?: CodexOptions & {
57
+ model?: string;
58
+ cwd?: string;
59
+ }): string[];
60
+ //#endregion
61
+ //#region src/adapters/codex/index.d.ts
62
+ /** Run Codex in headless `exec --json` mode and return a trajectory. */
63
+ declare function runCodex(config: CodexAdapterConfig): Promise<CodexAdapterResult>;
64
+ /** Registered {@link HarnessAdapter} for Codex CLI headless runs. */
65
+ declare const codexAdapter: HarnessAdapter<CodexAdapterConfig>;
66
+ //#endregion
67
+ export { type AdapterDiagnostics, AdapterError, type AdapterResult, type CodexAdapterConfig, type CodexAdapterResult, CodexEventMapper, type CodexOptions, type ParseErrorRecord, appendCodexFlags, appendExecCodexFlags, appendGlobalCodexFlags, buildArgs, buildJudgeArgs, codexAdapter, ensureHarnessOutputLastMessage, mapCodexEvents, mcpToolName, runCodex };
68
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1,3 @@
1
+ import { t as AdapterError } from "../../types-Bac8_Ixb.js";
2
+ import { a as appendGlobalCodexFlags, c as ensureHarnessOutputLastMessage, d as mcpToolName, i as appendExecCodexFlags, l as CodexEventMapper, n as runCodex, o as buildArgs, r as appendCodexFlags, s as buildJudgeArgs, t as codexAdapter, u as mapCodexEvents } from "../../codex-0cHO2te9.js";
3
+ export { AdapterError, CodexEventMapper, appendCodexFlags, appendExecCodexFlags, appendGlobalCodexFlags, buildArgs, buildJudgeArgs, codexAdapter, ensureHarnessOutputLastMessage, mapCodexEvents, mcpToolName, runCodex };
@@ -1,235 +1,9 @@
1
1
  import { t as __exportAll } from "./rolldown-runtime-D7D4PA-g.js";
2
+ import { n as TrajectoryBuilder, t as AdapterError } from "./types-Bac8_Ixb.js";
2
3
  import { spawn } from "node:child_process";
3
4
  import { mkdtemp, rm } from "node:fs/promises";
4
5
  import { tmpdir } from "node:os";
5
6
  import { join } from "node:path";
6
- //#region src/types/stream.ts
7
- /** Type guards. Prefer these over manual `e.type === "..."` checks at call sites. */
8
- function isSystemInit(e) {
9
- return e.type === "system" && e.subtype === "init";
10
- }
11
- function isSystemRetry(e) {
12
- return e.type === "system" && e.subtype === "api_retry";
13
- }
14
- function isAssistantMessage(e) {
15
- return e.type === "assistant";
16
- }
17
- function isUserMessage(e) {
18
- return e.type === "user";
19
- }
20
- function isResult(e) {
21
- return e.type === "result";
22
- }
23
- function isTextBlock(b) {
24
- return b.type === "text";
25
- }
26
- function isToolUseBlock(b) {
27
- return b.type === "tool_use";
28
- }
29
- function isToolResultBlock(b) {
30
- return b.type === "tool_result";
31
- }
32
- //#endregion
33
- //#region src/types/trajectory.ts
34
- /**
35
- * Extract the MCP namespace prefix from a tool name.
36
- *
37
- * Claude Code formats MCP tool names as `mcp__<server>__<tool>`. The namespace
38
- * is the first two segments joined: `mcp__<server>`. Returns null for non-MCP
39
- * tool names (built-ins like `Bash`, `Read`, `Edit`).
40
- *
41
- * @example
42
- * namespaceOf("mcp__api__search_skills") // "mcp__api"
43
- * namespaceOf("Bash") // null
44
- */
45
- function namespaceOf(toolName) {
46
- if (!toolName.startsWith("mcp__")) return null;
47
- const parts = toolName.split("__");
48
- if (parts.length < 3) return null;
49
- return `${parts[0]}__${parts[1]}`;
50
- }
51
- //#endregion
52
- //#region src/trajectory/builder.ts
53
- /**
54
- * TrajectoryBuilder — consumes a stream of {@link StreamEvent} values and
55
- * produces a {@link TrajectoryView}.
56
- *
57
- * State machine: the builder is a small, tolerant state machine. Invariants:
58
- *
59
- * - Exactly one `system/init` event opens the session. The builder requires
60
- * it to be present before `build()`.
61
- * - Each `assistant` event begins a new turn. Text blocks accumulate into
62
- * the turn's text; `tool_use` blocks become `ToolCall` records.
63
- * - `user` events with `tool_result` blocks deliver tool results back. We
64
- * match them to pending calls by `tool_use_id`.
65
- * - One `result` event closes the session and carries aggregate usage.
66
- *
67
- * The builder is *tolerant of partial streams*: a process killed mid-run
68
- * produces a coherent (but flagged) view. Tool calls without matching results
69
- * keep `result: null`. The `success` flag reflects whether a successful result
70
- * event was actually observed.
71
- *
72
- * Why a class (not a reducer)?
73
- * The internal `pendingCalls` map is mutable by design — we modify ToolCall
74
- * objects in place when results arrive, so other parts of the view (which
75
- * hold references to the same objects) see the update for free. A reducer
76
- * would force a deep copy per result event, which is wasteful and would
77
- * complicate identity-based queries.
78
- */
79
- var TrajectoryBuilder = class {
80
- meta = null;
81
- sessionStartTs = null;
82
- turns = [];
83
- allToolCalls = [];
84
- /**
85
- * tool_use_id → ToolCall, for matching results back to calls.
86
- * Entries are removed once a result is observed.
87
- */
88
- pendingCalls = /* @__PURE__ */ new Map();
89
- retries = [];
90
- finalUsage = null;
91
- finalCostUsd = 0;
92
- finalDurationMs = 0;
93
- finalNumTurns = 0;
94
- finalResultText = "";
95
- sawResultEvent = false;
96
- resultIsError = false;
97
- /**
98
- * Consume one event. Safe to call with events in stream order.
99
- *
100
- * Unknown event types are silently ignored — the schema evolves and we
101
- * don't want CI to break on a new event type we haven't modelled.
102
- */
103
- consume(event) {
104
- if (isSystemInit(event)) {
105
- this.meta = {
106
- sessionId: event.session_id,
107
- model: event.model,
108
- cwd: event.cwd,
109
- permissionMode: event.permissionMode,
110
- availableTools: event.tools ?? [],
111
- mcpServers: (event.mcp_servers ?? []).map((s) => ({
112
- name: s.name,
113
- status: s.status
114
- }))
115
- };
116
- this.sessionStartTs = Date.now();
117
- return;
118
- }
119
- if (event.type === "system" && event.subtype === "api_retry") {
120
- this.retries.push({
121
- offsetMs: this.sessionStartTs ? Date.now() - this.sessionStartTs : 0,
122
- raw: event
123
- });
124
- return;
125
- }
126
- if (isAssistantMessage(event)) {
127
- this.handleAssistantMessage(event);
128
- return;
129
- }
130
- if (isUserMessage(event)) {
131
- this.handleUserMessage(event);
132
- return;
133
- }
134
- if (isResult(event)) {
135
- this.sawResultEvent = true;
136
- this.resultIsError = event.is_error;
137
- this.finalUsage = event.usage ?? null;
138
- this.finalCostUsd = event.total_cost_usd ?? 0;
139
- this.finalDurationMs = event.duration_ms ?? 0;
140
- this.finalNumTurns = event.num_turns ?? 0;
141
- this.finalResultText = event.result ?? "";
142
- return;
143
- }
144
- }
145
- /**
146
- * Finalize the view. Call after consuming the last event from the stream.
147
- *
148
- * Throws if no `system/init` was observed — at that point we have no model,
149
- * no session id, and no available-tools list, which means assertions like
150
- * "called any mcp__api__* tool" can't even be evaluated meaningfully.
151
- */
152
- build() {
153
- if (this.meta === null) throw new Error("TrajectoryBuilder.build() called before any system/init event was observed. The harness may have failed to start, or the stream was truncated before init.");
154
- const lastTurn = this.turns[this.turns.length - 1];
155
- const accumulatedText = this.turns.map((t) => t.text).filter((t) => t.length > 0).join("\n\n").trim();
156
- return {
157
- meta: this.meta,
158
- toolCalls: this.allToolCalls,
159
- turns: this.turns,
160
- finalResponse: accumulatedText || this.finalResultText,
161
- finalStopReason: lastTurn?.stopReason ?? null,
162
- usage: {
163
- inputTokens: this.finalUsage?.input_tokens ?? 0,
164
- outputTokens: this.finalUsage?.output_tokens ?? 0,
165
- totalCostUsd: this.finalCostUsd,
166
- durationMs: this.finalDurationMs,
167
- numTurns: this.finalNumTurns || this.turns.length
168
- },
169
- retries: this.retries,
170
- success: this.sawResultEvent && !this.resultIsError
171
- };
172
- }
173
- handleAssistantMessage(event) {
174
- const turnIndex = this.turns.length;
175
- const textChunks = [];
176
- const toolCallsThisTurn = [];
177
- for (const block of event.message.content) {
178
- if (isTextBlock(block)) {
179
- textChunks.push(block.text);
180
- continue;
181
- }
182
- if (isToolUseBlock(block)) {
183
- const call = {
184
- name: block.name,
185
- namespace: namespaceOf(block.name),
186
- callId: block.id,
187
- args: block.input,
188
- result: null,
189
- isError: false,
190
- turnIndex,
191
- callIndex: this.allToolCalls.length
192
- };
193
- this.allToolCalls.push(call);
194
- this.pendingCalls.set(block.id, call);
195
- toolCallsThisTurn.push(call);
196
- continue;
197
- }
198
- }
199
- this.turns.push({
200
- turnIndex,
201
- text: textChunks.join("").trim(),
202
- toolCalls: toolCallsThisTurn,
203
- stopReason: event.message.stop_reason ?? null
204
- });
205
- }
206
- handleUserMessage(event) {
207
- const content = event.message.content;
208
- if (typeof content === "string") return;
209
- for (const block of content) {
210
- if (!isToolResultBlock(block)) continue;
211
- const call = this.pendingCalls.get(block.tool_use_id);
212
- if (!call) continue;
213
- call.result = block.content;
214
- call.isError = block.is_error ?? false;
215
- this.pendingCalls.delete(block.tool_use_id);
216
- }
217
- }
218
- };
219
- /**
220
- * Convenience: drain an async iterable of events through a fresh builder.
221
- *
222
- * Suitable when you have the full event stream and just want the view.
223
- * For interactive/incremental scenarios (e.g. surfacing partial state in a UI)
224
- * instantiate {@link TrajectoryBuilder} directly and call `consume()` /
225
- * `build()` yourself.
226
- */
227
- async function buildTrajectory(events) {
228
- const builder = new TrajectoryBuilder();
229
- for await (const event of events) builder.consume(event);
230
- return builder.build();
231
- }
232
- //#endregion
233
7
  //#region src/parsers/stream-json.ts
234
8
  /**
235
9
  * Parse a readable stream of NDJSON into a sequence of typed stream-json events.
@@ -281,27 +55,16 @@ function tryParseLine(line) {
281
55
  }
282
56
  }
283
57
  //#endregion
284
- //#region src/adapters/types.ts
285
- /**
286
- * Thrown when the harness fails to produce a usable trajectory.
287
- *
288
- * Most commonly this means the process failed before emitting a usable
289
- * session init event. Inspect `diagnostics.stderr` for the cause.
290
- */
291
- var AdapterError = class extends Error {
292
- diagnostics;
293
- constructor(message, diagnostics) {
294
- super(message);
295
- this.diagnostics = diagnostics;
296
- this.name = "AdapterError";
297
- }
298
- };
299
- //#endregion
300
58
  //#region src/adapters/claude-code/flags.ts
59
+ /** Append repeated `--flag value` pairs for array config fields. */
301
60
  function pushRepeatableFlag(args, flag, values) {
302
61
  if (!values) return;
303
62
  for (const value of values) args.push(flag, value);
304
63
  }
64
+ /**
65
+ * Append an optional CLI flag. Boolean `true` emits the flag alone; other
66
+ * scalars emit `--flag value`.
67
+ */
305
68
  function pushOptionalFlag(args, flag, value) {
306
69
  if (value === void 0) return;
307
70
  if (typeof value === "boolean") {
@@ -360,7 +123,12 @@ function buildArgs(config) {
360
123
  appendClaudeCodeFlags(args, config);
361
124
  return args;
362
125
  }
363
- /** Build args for an LLM judge subprocess (`--output-format json`). */
126
+ /**
127
+ * Build args for an LLM judge subprocess (`--output-format json`).
128
+ *
129
+ * Defaults permission mode to `bypassPermissions` so the judge does not
130
+ * block on tool permission prompts during single-shot JSON grading.
131
+ */
364
132
  function buildJudgeArgs(prompt, config = {}) {
365
133
  const args = [
366
134
  "-p",
@@ -402,6 +170,14 @@ const KILL_GRACE_MS = 5e3;
402
170
  /**
403
171
  * Spawn `claude` in headless mode with isolated config and a process-group
404
172
  * lifecycle. See {@link SpawnedClaude} for how to consume the result.
173
+ *
174
+ * **Kill sequence:** timeout and abort both follow the same two-step path:
175
+ * `SIGTERM` to the process group, then `SIGKILL` after {@link KILL_GRACE_MS}
176
+ * if the group is still alive. This avoids leaving MCP/tool subprocesses
177
+ * running while still giving claude a chance to flush stream-json output.
178
+ *
179
+ * @param config - Adapter options; `timeoutMs`, `signal`, and `isolateConfig`
180
+ * control lifecycle and config isolation.
405
181
  */
406
182
  async function spawnClaude(config) {
407
183
  const binary = config.binary ?? "claude";
@@ -425,6 +201,10 @@ async function spawnClaude(config) {
425
201
  let timedOut = false;
426
202
  let killEscalation = null;
427
203
  const timeoutMs = config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
204
+ /**
205
+ * Arm (or re-arm) the SIGKILL fallback. Each SIGTERM attempt gets its own
206
+ * grace window so a slow shutdown doesn't leave orphaned MCP servers.
207
+ */
428
208
  const scheduleKillEscalation = () => {
429
209
  if (killEscalation) clearTimeout(killEscalation);
430
210
  killEscalation = setTimeout(() => killTree(child, "SIGKILL"), KILL_GRACE_MS);
@@ -487,10 +267,16 @@ async function spawnClaude(config) {
487
267
  * group is already gone. This catches MCP server subprocesses and tool
488
268
  * processes spawned by claude.
489
269
  *
490
- * Why both? On some platforms the process group dies before we get here
491
- * (the child itself already cleaned up), in which case `kill(-pid)` throws
492
- * ESRCH. The fallback handles that edge case without leaking zombies in
493
- * the common case.
270
+ * **Signal escalation:** callers typically invoke this first with `SIGTERM`,
271
+ * then again with `SIGKILL` after {@link KILL_GRACE_MS}. The group kill is
272
+ * essential because a bare `child.kill()` would leave MCP servers running.
273
+ *
274
+ * **Platform edge case:** when the group leader exits first, `kill(-pid)`
275
+ * throws `ESRCH`. The single-PID fallback covers that without failing the
276
+ * adapter run.
277
+ *
278
+ * @param child - Spawned process handle from {@link spawn}.
279
+ * @param signal - POSIX signal to deliver (`SIGTERM` or `SIGKILL` in practice).
494
280
  */
495
281
  function killTree(child, signal) {
496
282
  if (child.pid === void 0) return;
@@ -553,11 +339,12 @@ async function runClaudeCode(config) {
553
339
  await spawned.cleanup();
554
340
  }
555
341
  }
342
+ /** Registered {@link HarnessAdapter} for Claude Code headless runs. */
556
343
  const claudeCodeAdapter = {
557
344
  id: "claude-code",
558
345
  run: runClaudeCode
559
346
  };
560
347
  //#endregion
561
- export { isUserMessage as _, AdapterError as a, buildTrajectory as c, isResult as d, isSystemInit as f, isToolUseBlock as g, isToolResultBlock as h, buildJudgeArgs as i, namespaceOf as l, isTextBlock as m, claude_code_exports as n, parseStreamJson as o, isSystemRetry as p, runClaudeCode as r, TrajectoryBuilder as s, claudeCodeAdapter as t, isAssistantMessage as u };
348
+ export { parseStreamJson as a, buildJudgeArgs as i, claude_code_exports as n, runClaudeCode as r, claudeCodeAdapter as t };
562
349
 
563
- //# sourceMappingURL=claude-code-ycT0JQZF.js.map
350
+ //# sourceMappingURL=claude-code-C_7hxC8z.js.map