@alis-build/harness-eval 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/README.md +92 -8
  2. package/dist/adapters/claude-code/index.d.ts +2 -2
  3. package/dist/adapters/claude-code/index.js +2 -1
  4. package/dist/adapters/codex/index.d.ts +68 -0
  5. package/dist/adapters/codex/index.js +3 -0
  6. package/dist/{claude-code-DZ4Vkgp6.js → claude-code-C_7hxC8z.js} +3 -245
  7. package/dist/claude-code-C_7hxC8z.js.map +1 -0
  8. package/dist/cli/bin.js +131 -151
  9. package/dist/cli/bin.js.map +1 -1
  10. package/dist/codex-0cHO2te9.js +496 -0
  11. package/dist/codex-0cHO2te9.js.map +1 -0
  12. package/dist/config/loader.d.ts +2 -2
  13. package/dist/config/loader.js +2 -2
  14. package/dist/{index-V22PrR0p.d.ts → index-DnvP1UBl.d.ts} +2 -2
  15. package/dist/index.d.ts +132 -6
  16. package/dist/index.js +6 -5
  17. package/dist/index.js.map +1 -1
  18. package/dist/loader-B1WmGGzf.d.ts +107 -0
  19. package/dist/{loader-DcI0KfRX.js → loader-DnQ6Jt0i.js} +472 -209
  20. package/dist/loader-DnQ6Jt0i.js.map +1 -0
  21. package/dist/{projections-BcX7w-f6.js → reporter-Biy-5-9M.js} +1335 -758
  22. package/dist/reporter-Biy-5-9M.js.map +1 -0
  23. package/dist/runner/suite.d.ts +1 -1
  24. package/dist/runner/suite.js +1 -1
  25. package/dist/{suite-DPJMIEbu.d.ts → suite-BEShV0by.d.ts} +2 -2
  26. package/dist/{suite-Dlzl-HI0.js → suite-BcP64nlb.js} +16 -2
  27. package/dist/{suite-Dlzl-HI0.js.map → suite-BcP64nlb.js.map} +1 -1
  28. package/dist/{types-CD3TwOtZ.d.ts → types-0QkNVyp9.d.ts} +2 -2
  29. package/dist/types-Bac8_Ixb.js +246 -0
  30. package/dist/types-Bac8_Ixb.js.map +1 -0
  31. package/dist/types-Bu8uOZZN.d.ts +77 -0
  32. package/dist/{types-B9H4IZtA.d.ts → types-C0gBkl0-.d.ts} +3 -2
  33. package/package.json +6 -2
  34. package/dist/claude-code-DZ4Vkgp6.js.map +0 -1
  35. package/dist/loader-C9yQHUPC.d.ts +0 -50
  36. package/dist/loader-DcI0KfRX.js.map +0 -1
  37. package/dist/projections-BcX7w-f6.js.map +0 -1
@@ -3,80 +3,6 @@ import { readFile, readdir, stat } from "node:fs/promises";
3
3
  import { isAbsolute, join, relative, resolve } from "node:path";
4
4
  import { parse } from "yaml";
5
5
  import { z } from "zod";
6
- //#region src/config/paths.ts
7
- /**
8
- * Resolve relative paths in suite config against the suite file directory.
9
- *
10
- * YAML authors write paths relative to the suite file; this module absolutizes
11
- * them at load time so the runner and adapters receive filesystem-ready values.
12
- * Tilde-prefixed paths and inline JSON blobs (settings starting with `{`) are
13
- * left unchanged.
14
- */
15
- /** Resolve a single path relative to `suiteDir` unless already absolute or `~/`. */
16
- function resolvePath(value, suiteDir) {
17
- if (isAbsolute(value) || value.startsWith("~/")) return value;
18
- return join(suiteDir, value);
19
- }
20
- /** Resolve Claude Code-specific path fields within a config block. */
21
- function resolveClaudeCodePaths(block, suiteDir) {
22
- const resolved = { ...block };
23
- if (typeof resolved.mcpConfig === "string") resolved.mcpConfig = resolvePath(resolved.mcpConfig, suiteDir);
24
- if (Array.isArray(resolved.pluginDirs)) resolved.pluginDirs = resolved.pluginDirs.map((p) => typeof p === "string" ? resolvePath(p, suiteDir) : p);
25
- if (Array.isArray(resolved.addDirs)) resolved.addDirs = resolved.addDirs.map((p) => typeof p === "string" ? resolvePath(p, suiteDir) : p);
26
- for (const field of [
27
- "systemPromptFile",
28
- "appendSystemPromptFile",
29
- "debugFile"
30
- ]) {
31
- const value = resolved[field];
32
- if (typeof value === "string" && !value.trim().startsWith("{")) resolved[field] = resolvePath(value, suiteDir);
33
- }
34
- if (typeof resolved.settings === "string" && !resolved.settings.trim().startsWith("{")) resolved.settings = resolvePath(resolved.settings, suiteDir);
35
- return resolved;
36
- }
37
- /** Resolve relative paths in a config layer relative to `suiteDir`. */
38
- function resolveConfigPaths(config, suiteDir) {
39
- if (!config) return void 0;
40
- const resolved = { ...config };
41
- if (typeof resolved.cwd === "string") resolved.cwd = resolvePath(resolved.cwd, suiteDir);
42
- if (resolved.claudeCode && typeof resolved.claudeCode === "object" && !Array.isArray(resolved.claudeCode)) resolved.claudeCode = resolveClaudeCodePaths(resolved.claudeCode, suiteDir);
43
- return resolved;
44
- }
45
- /** Resolve paths on an entire suite after load. */
46
- function resolveSuitePaths(suite, suiteFilePath) {
47
- const suiteDir = configFileDir(suiteFilePath);
48
- suite.defaultConfig = resolveConfigPaths(suite.defaultConfig, suiteDir);
49
- for (const cell of suite.matrix) cell.config = resolveConfigPaths(cell.config, suiteDir) ?? cell.config;
50
- for (const testCase of suite.cases) testCase.config = resolveConfigPaths(testCase.config, suiteDir);
51
- }
52
- /** Parent directory of a suite or grading config file path. */
53
- function configFileDir(filePath) {
54
- return filePath.includes("/") || filePath.includes("\\") ? filePath.replace(/[/\\][^/\\]+$/, "") : ".";
55
- }
56
- /**
57
- * Heuristically resolve env var values that look like relative file paths.
58
- *
59
- * Used for grading config where credential or config paths may be expressed
60
- * relative to the grading YAML location.
61
- */
62
- function resolveEnvPaths(env, baseDir) {
63
- const resolved = {};
64
- for (const [key, value] of Object.entries(env)) if (value.startsWith("./") || value.startsWith("../") || value.includes("/") && !value.startsWith("http")) resolved[key] = resolvePath(value, baseDir);
65
- else resolved[key] = value;
66
- return resolved;
67
- }
68
- /** Resolve relative paths in a standalone grading config file. */
69
- function resolveGradingConfigPaths(config, configFilePath) {
70
- const baseDir = configFileDir(configFilePath);
71
- const { adapter, maxConcurrent, ...rest } = config.judge;
72
- config.judge = {
73
- ...resolveConfigPaths(rest, baseDir) ?? rest,
74
- adapter,
75
- maxConcurrent
76
- };
77
- if (config.judge.env) config.judge.env = resolveEnvPaths(config.judge.env, baseDir);
78
- }
79
- //#endregion
80
6
  //#region src/config/schema.ts
81
7
  /**
82
8
  * zod schemas for the YAML on-disk shape.
@@ -132,13 +58,40 @@ const ClaudeCodeConfigSchema = z.object({
132
58
  maxTurns: z.number().int().positive(),
133
59
  isolateConfig: z.boolean()
134
60
  }).partial();
61
+ /** Codex CLI adapter-specific options (nested under `codex`). */
62
+ const CodexConfigSchema = z.object({
63
+ binary: z.string(),
64
+ profile: z.string(),
65
+ sandbox: z.enum([
66
+ "read-only",
67
+ "workspace-write",
68
+ "danger-full-access"
69
+ ]),
70
+ addDirs: z.array(z.string()),
71
+ configOverrides: z.array(z.string()),
72
+ askForApproval: z.enum([
73
+ "untrusted",
74
+ "on-request",
75
+ "never"
76
+ ]),
77
+ dangerouslyBypassApprovalsAndSandbox: z.boolean(),
78
+ dangerouslyBypassHookTrust: z.boolean(),
79
+ ephemeral: z.boolean(),
80
+ ignoreUserConfig: z.boolean(),
81
+ skipGitRepoCheck: z.boolean(),
82
+ outputSchema: z.string(),
83
+ outputLastMessage: z.string(),
84
+ captureLastMessage: z.boolean(),
85
+ isolateConfig: z.boolean()
86
+ }).partial();
135
87
  /** Generic + nested adapter config for one layer (defaultConfig, case, cell). */
136
88
  const ConfigPartialSchema = z.object({
137
89
  model: z.string(),
138
90
  cwd: z.string(),
139
91
  timeoutMs: z.number().int().positive(),
140
92
  env: z.record(z.string(), z.string()),
141
- claudeCode: ClaudeCodeConfigSchema
93
+ claudeCode: ClaudeCodeConfigSchema,
94
+ codex: CodexConfigSchema
142
95
  }).partial();
143
96
  /** A matrix cell — one point in the configuration matrix. */
144
97
  const MatrixCellSchema = z.object({
@@ -763,84 +716,260 @@ function typeOf(x) {
763
716
  return typeof x;
764
717
  }
765
718
  //#endregion
766
- //#region src/config/grading-schema.ts
719
+ //#region src/config/loader-internals.ts
767
720
  /**
768
- * Zod schema for standalone grading YAML (`grading.yaml`).
769
- *
770
- * The top-level `judge` block reuses {@link ConfigPartialSchema} fields plus
771
- * grader-specific concurrency and system-instruction overrides.
721
+ * Shared suite loader helpers (case file collection and parsing).
772
722
  */
773
- /** Top-level `judge` block mirrors harness config fields plus grader concurrency. */
774
- const JudgeConfigSchema = ConfigPartialSchema.extend({
775
- adapter: z.string().optional(),
776
- maxConcurrent: z.number().int().positive().optional(),
777
- /** Optional judge prompt prefix (maps to upstream system_instruction). */
778
- system_instruction: z.string().optional()
779
- });
780
- const GradingConfigSchema = z.object({ judge: JudgeConfigSchema });
781
- //#endregion
782
- //#region src/config/grading-loader.ts
783
- /**
784
- * Load standalone grading YAML for `harness-eval grade`.
785
- *
786
- * Grading config defines the judge subprocess (model, concurrency, Claude Code
787
- * flags) separately from the suite under test.
788
- */
789
- /** Load grading YAML from disk and resolve relative paths. */
790
- async function loadGradingConfig(filePath) {
791
- const absolutePath = resolve(filePath);
792
- let content;
793
- try {
794
- content = await readFile(absolutePath, "utf8");
795
- } catch (err) {
796
- throw new ConfigError(`failed to read grading config: ${err instanceof Error ? err.message : String(err)}`, filePath);
797
- }
798
- return parseGradingConfig(content, absolutePath);
799
- }
800
- /**
801
- * Parse grading YAML from a string.
802
- *
803
- * @param sourcePath Optional path for error messages and path resolution.
804
- */
805
- function parseGradingConfig(yamlContent, sourcePath) {
723
+ /** Parse one case file: single case, array, or `{ cases: [...] }`. */
724
+ function parseCasesFile(yamlContent, sourcePath) {
806
725
  let raw;
807
726
  try {
808
727
  raw = parse(yamlContent);
809
728
  } catch (err) {
810
729
  throw new ConfigError(`YAML parse error: ${err instanceof Error ? err.message : String(err)}`, sourcePath);
811
730
  }
812
- const validated = GradingConfigSchema.safeParse(raw);
813
- if (!validated.success) throw new ConfigError(`validation failed:\n${formatZodError$1(validated.error, sourcePath)}`, sourcePath);
814
- const config = { judge: { ...validated.data.judge } };
815
- if (sourcePath) resolveGradingConfigPaths(config, sourcePath);
816
- return config;
731
+ return transformTestCases(extractRawCases(raw, sourcePath), sourcePath ?? "cases");
817
732
  }
818
- /** Format a zod validation error with optional source file prefix. */
819
- function formatZodError$1(err, sourcePath) {
733
+ function extractRawCases(raw, sourcePath) {
734
+ if (Array.isArray(raw)) return raw.map((item, index) => validateRawCase(item, sourcePath, index));
735
+ if (raw && typeof raw === "object") {
736
+ const obj = raw;
737
+ if (Array.isArray(obj.cases)) return obj.cases.map((item, index) => validateRawCase(item, sourcePath, index));
738
+ if ("id" in obj && "prompt" in obj && "assertions" in obj) return [validateRawCase(raw, sourcePath, 0)];
739
+ }
740
+ throw new ConfigError("expected a case object, array of cases, or { cases: [...] }", sourcePath);
741
+ }
742
+ function validateRawCase(raw, sourcePath, index) {
743
+ const validated = TestCaseSchema.safeParse(raw);
744
+ if (!validated.success) throw new ConfigError(`validation failed:\n${formatZodError$3(validated.error, sourcePath)}`, sourcePath);
745
+ return validated.data;
746
+ }
747
+ /** Recursively collect `.yaml` / `.yml` files under `casesDir`. */
748
+ async function collectCaseYamlFiles(casesDir) {
749
+ const files = [];
750
+ async function walk(dir) {
751
+ let entries;
752
+ try {
753
+ entries = await readdir(dir, { withFileTypes: true });
754
+ } catch (err) {
755
+ if (err instanceof Error && "code" in err && err.code === "ENOENT") return;
756
+ throw err;
757
+ }
758
+ for (const entry of entries) {
759
+ const fullPath = join(dir, entry.name);
760
+ if (entry.isDirectory()) await walk(fullPath);
761
+ else if (entry.isFile() && (entry.name.endsWith(".yaml") || entry.name.endsWith(".yml"))) files.push(fullPath);
762
+ }
763
+ }
764
+ await walk(casesDir);
765
+ return files.sort();
766
+ }
767
+ function formatZodError$3(err, sourcePath) {
820
768
  return err.issues.map((issue) => {
821
769
  const path = issue.path.length > 0 ? issue.path.join(".") : "(root)";
822
770
  return ` ${sourcePath ? `${sourcePath} → ${path}` : path}: ${issue.message}`;
823
771
  }).join("\n");
824
772
  }
825
773
  //#endregion
826
- //#region src/config/loader.ts
774
+ //#region src/config/pipeline-schema.ts
827
775
  /**
828
- * Load a `TestSuite` from a YAML file, directory, or string.
776
+ * Zod schemas for optional `pipeline:` block in suite.yaml.
829
777
  *
830
- * Supports two on-disk layouts:
831
- * - Single file: `suite.yaml` with inline `cases`.
832
- * - Directory: `suite.yaml` plus optional `cases/*.yaml` fragments merged
833
- * in lexicographic path order.
778
+ * Step presence under `pipeline` enables orchestration via `harness-eval pipeline`.
779
+ */
780
+ /** `pipeline.run` step harness eval run. */
781
+ const PipelineRunStepSchema = z.object({
782
+ output: z.string().min(1).optional(),
783
+ maxConcurrent: z.number().int().positive().optional()
784
+ }).optional();
785
+ /** `pipeline.grade` step — LLM outcome grading. */
786
+ const PipelineGradeStepSchema = z.object({
787
+ input: z.string().min(1).optional(),
788
+ output: z.string().min(1).optional(),
789
+ maxConcurrent: z.number().int().positive().optional()
790
+ }).optional();
791
+ /** `pipeline.envelope` step — EvalRunEnvelope export. */
792
+ const PipelineEnvelopeStepSchema = z.object({
793
+ report: z.string().min(1).optional(),
794
+ grading: z.string().min(1).optional(),
795
+ output: z.string().min(1).optional(),
796
+ projection: z.enum([
797
+ "envelope",
798
+ "trajectory",
799
+ "instances"
800
+ ]).optional(),
801
+ includeRawStreamEvents: z.boolean().optional(),
802
+ noTranscript: z.boolean().optional()
803
+ }).optional();
804
+ /** Top-level optional pipeline block in suite.yaml. */
805
+ const PipelineConfigSchema = z.object({
806
+ run: PipelineRunStepSchema,
807
+ grade: PipelineGradeStepSchema,
808
+ envelope: PipelineEnvelopeStepSchema
809
+ }).partial();
810
+ /** Default artifact filenames relative to the suite.yaml directory. */
811
+ const DEFAULT_PIPELINE_OUTPUTS = {
812
+ run: "report.json",
813
+ grade: "grading.json",
814
+ envelope: "envelope.json"
815
+ };
816
+ //#endregion
817
+ //#region src/config/paths.ts
818
+ /**
819
+ * Resolve relative paths in suite config against the suite file directory.
834
820
  *
835
- * Relative paths in config (MCP config, plugin dirs, etc.) are resolved
836
- * against the suite file directory after load.
821
+ * YAML authors write paths relative to the suite file; this module absolutizes
822
+ * them at load time so the runner and adapters receive filesystem-ready values.
823
+ * Tilde-prefixed paths and inline JSON blobs (settings starting with `{`) are
824
+ * left unchanged.
837
825
  */
826
+ /** Resolve a single path relative to `suiteDir` unless already absolute or `~/`. */
827
+ function resolvePath(value, suiteDir) {
828
+ if (isAbsolute(value) || value.startsWith("~/")) return value;
829
+ return join(suiteDir, value);
830
+ }
831
+ /** Resolve Claude Code-specific path fields within a config block. */
832
+ function resolveClaudeCodePaths(block, suiteDir) {
833
+ const resolved = { ...block };
834
+ if (typeof resolved.mcpConfig === "string") resolved.mcpConfig = resolvePath(resolved.mcpConfig, suiteDir);
835
+ if (Array.isArray(resolved.pluginDirs)) resolved.pluginDirs = resolved.pluginDirs.map((p) => typeof p === "string" ? resolvePath(p, suiteDir) : p);
836
+ if (Array.isArray(resolved.addDirs)) resolved.addDirs = resolved.addDirs.map((p) => typeof p === "string" ? resolvePath(p, suiteDir) : p);
837
+ for (const field of [
838
+ "systemPromptFile",
839
+ "appendSystemPromptFile",
840
+ "debugFile"
841
+ ]) {
842
+ const value = resolved[field];
843
+ if (typeof value === "string" && !value.trim().startsWith("{")) resolved[field] = resolvePath(value, suiteDir);
844
+ }
845
+ if (typeof resolved.settings === "string" && !resolved.settings.trim().startsWith("{")) resolved.settings = resolvePath(resolved.settings, suiteDir);
846
+ return resolved;
847
+ }
848
+ /** Resolve Codex-specific path fields within a config block. */
849
+ function resolveCodexPaths(block, suiteDir) {
850
+ const resolved = { ...block };
851
+ if (Array.isArray(resolved.addDirs)) resolved.addDirs = resolved.addDirs.map((p) => typeof p === "string" ? resolvePath(p, suiteDir) : p);
852
+ for (const field of ["outputSchema", "outputLastMessage"]) {
853
+ const value = resolved[field];
854
+ if (typeof value === "string") resolved[field] = resolvePath(value, suiteDir);
855
+ }
856
+ return resolved;
857
+ }
858
+ /** Resolve relative paths in a config layer relative to `suiteDir`. */
859
+ function resolveConfigPaths(config, suiteDir) {
860
+ if (!config) return void 0;
861
+ const resolved = { ...config };
862
+ if (typeof resolved.cwd === "string") resolved.cwd = resolvePath(resolved.cwd, suiteDir);
863
+ if (resolved.claudeCode && typeof resolved.claudeCode === "object" && !Array.isArray(resolved.claudeCode)) resolved.claudeCode = resolveClaudeCodePaths(resolved.claudeCode, suiteDir);
864
+ if (resolved.codex && typeof resolved.codex === "object" && !Array.isArray(resolved.codex)) resolved.codex = resolveCodexPaths(resolved.codex, suiteDir);
865
+ return resolved;
866
+ }
867
+ /** Resolve paths on an entire suite after load. */
868
+ function resolveSuitePaths(suite, suiteFilePath) {
869
+ const suiteDir = configFileDir(suiteFilePath);
870
+ suite.defaultConfig = resolveConfigPaths(suite.defaultConfig, suiteDir);
871
+ for (const cell of suite.matrix) cell.config = resolveConfigPaths(cell.config, suiteDir) ?? cell.config;
872
+ for (const testCase of suite.cases) testCase.config = resolveConfigPaths(testCase.config, suiteDir);
873
+ }
874
+ /** Parent directory of a suite or grading config file path. */
875
+ function configFileDir(filePath) {
876
+ return filePath.includes("/") || filePath.includes("\\") ? filePath.replace(/[/\\][^/\\]+$/, "") : ".";
877
+ }
838
878
  /**
839
- * Load a suite from a file path or directory path.
879
+ * Heuristically resolve env var values that look like relative file paths.
840
880
  *
841
- * @throws {@link ConfigError} when the path is unreadable or validation fails.
881
+ * Used for grading config where credential or config paths may be expressed
882
+ * relative to the grading YAML location.
842
883
  */
843
- async function loadSuite(filePath) {
884
+ function resolveEnvPaths(env, baseDir) {
885
+ const resolved = {};
886
+ for (const [key, value] of Object.entries(env)) if (value.startsWith("./") || value.startsWith("../")) resolved[key] = resolvePath(value, baseDir);
887
+ else resolved[key] = value;
888
+ return resolved;
889
+ }
890
+ /** Resolve relative paths in a standalone grading config file. */
891
+ function resolveGradingConfigPaths(config, configFilePath) {
892
+ const baseDir = configFileDir(configFilePath);
893
+ const { adapter, maxConcurrent, ...rest } = config.judge;
894
+ config.judge = {
895
+ ...resolveConfigPaths(rest, baseDir) ?? rest,
896
+ adapter,
897
+ maxConcurrent
898
+ };
899
+ if (config.judge.env) config.judge.env = resolveEnvPaths(config.judge.env, baseDir);
900
+ }
901
+ /** Resolve a pipeline artifact path relative to the suite.yaml directory. */
902
+ function resolvePipelinePath(value, defaultRelative, suiteDir) {
903
+ return resolvePath(value ?? defaultRelative, suiteDir);
904
+ }
905
+ /** Resolve relative paths in a parsed pipeline config. */
906
+ function resolvePipelineConfigPaths(pipeline, suiteFilePath) {
907
+ const suiteDir = configFileDir(suiteFilePath);
908
+ const resolved = {};
909
+ if (pipeline.run) resolved.run = resolvePipelineRunStep(pipeline.run, suiteDir);
910
+ if (pipeline.grade) resolved.grade = resolvePipelineGradeStep(pipeline.grade, suiteDir);
911
+ if (pipeline.envelope) resolved.envelope = resolvePipelineEnvelopeStep(pipeline.envelope, suiteDir);
912
+ return resolved;
913
+ }
914
+ /** Resolve one pipeline step's run output path. */
915
+ function resolvePipelineRunStep(step, suiteDir) {
916
+ return {
917
+ ...step,
918
+ output: resolvePipelinePath(step.output, DEFAULT_PIPELINE_OUTPUTS.run, suiteDir)
919
+ };
920
+ }
921
+ /** Resolve grade step input (optional) and output paths. */
922
+ function resolvePipelineGradeStep(step, suiteDir) {
923
+ return {
924
+ ...step,
925
+ input: step.input ? resolvePipelinePath(step.input, DEFAULT_PIPELINE_OUTPUTS.run, suiteDir) : void 0,
926
+ output: resolvePipelinePath(step.output, DEFAULT_PIPELINE_OUTPUTS.grade, suiteDir)
927
+ };
928
+ }
929
+ /** Resolve envelope step report, grading, and output paths. */
930
+ function resolvePipelineEnvelopeStep(step, suiteDir) {
931
+ return {
932
+ ...step,
933
+ report: step.report ? resolvePipelinePath(step.report, DEFAULT_PIPELINE_OUTPUTS.run, suiteDir) : void 0,
934
+ grading: step.grading ? resolvePipelinePath(step.grading, DEFAULT_PIPELINE_OUTPUTS.grade, suiteDir) : void 0,
935
+ output: resolvePipelinePath(step.output, DEFAULT_PIPELINE_OUTPUTS.envelope, suiteDir)
936
+ };
937
+ }
938
+ //#endregion
939
+ //#region src/config/grading-schema.ts
940
+ /**
941
+ * Zod schema for standalone grading YAML (`grading.yaml`).
942
+ *
943
+ * The top-level `judge` block reuses {@link ConfigPartialSchema} fields plus
944
+ * grader-specific concurrency and system-instruction overrides.
945
+ */
946
+ /** Top-level `judge` block — mirrors harness config fields plus grader concurrency. */
947
+ const JudgeConfigSchema = ConfigPartialSchema.extend({
948
+ adapter: z.string().optional(),
949
+ maxConcurrent: z.number().int().positive().optional(),
950
+ /** Optional judge prompt prefix (maps to upstream system_instruction). */
951
+ system_instruction: z.string().optional()
952
+ });
953
+ const GradingConfigSchema = z.object({ judge: JudgeConfigSchema });
954
+ //#endregion
955
+ //#region src/config/suite-file-schema.ts
956
+ /** Single-file suite with optional inline judge and pipeline orchestration. */
957
+ const SuiteFileSingleSchema = TestSuiteSchema.extend({
958
+ judge: JudgeConfigSchema.optional(),
959
+ pipeline: PipelineConfigSchema.optional()
960
+ });
961
+ /** Directory suite root with optional inline judge and pipeline orchestration. */
962
+ const SuiteFileDirectorySchema = SuiteDirectorySchema.extend({
963
+ judge: JudgeConfigSchema.optional(),
964
+ pipeline: PipelineConfigSchema.optional()
965
+ });
966
+ //#endregion
967
+ //#region src/config/suite-document-loader.ts
968
+ /**
969
+ * Load a unified suite.yaml document (suite + optional judge + pipeline).
970
+ */
971
+ /** Load suite.yaml (or directory) including optional judge and pipeline blocks. */
972
+ async function loadSuiteDocument(filePath, options = {}) {
844
973
  const absolutePath = resolve(filePath);
845
974
  let info;
846
975
  try {
@@ -848,26 +977,12 @@ async function loadSuite(filePath) {
848
977
  } catch (err) {
849
978
  throw new ConfigError(`failed to read suite path: ${err instanceof Error ? err.message : String(err)}`, filePath);
850
979
  }
851
- if (info.isDirectory()) return loadSuiteDirectory(absolutePath);
852
- return loadSuiteFile(absolutePath);
980
+ const strict = options.validateOrchestration !== false;
981
+ if (info.isDirectory()) return loadSuiteDocumentDirectory(absolutePath, strict);
982
+ return loadSuiteDocumentFile(absolutePath, strict);
853
983
  }
854
- /** Load and parse a single-file suite (not a directory layout). */
855
- async function loadSuiteFile(absolutePath) {
856
- let content;
857
- try {
858
- content = await readFile(absolutePath, "utf8");
859
- } catch (err) {
860
- throw new ConfigError(`failed to read suite file: ${err instanceof Error ? err.message : String(err)}`, absolutePath);
861
- }
862
- return parseSuite(content, absolutePath);
863
- }
864
- /**
865
- * Load a directory suite: `suite.yaml` plus optional `cases/` YAML files.
866
- *
867
- * Cases from `suite.yaml` sort before external case files; within each file,
868
- * array order is preserved.
869
- */
870
- async function loadSuiteDirectory(dir) {
984
+ /** Load suite.yaml from a directory layout (cases under `cases/`). */
985
+ async function loadSuiteDocumentDirectory(dir, strict) {
871
986
  const suiteYamlPath = join(dir, "suite.yaml");
872
987
  let content;
873
988
  try {
@@ -875,7 +990,7 @@ async function loadSuiteDirectory(dir) {
875
990
  } catch (err) {
876
991
  throw new ConfigError(`missing suite.yaml in suite directory: ${err instanceof Error ? err.message : String(err)}`, dir);
877
992
  }
878
- const base = parseSuiteDirectory(content, suiteYamlPath);
993
+ const { judge, pipeline, suite: base } = parseSuiteFileRoot(content, suiteYamlPath, "directory", strict);
879
994
  const casesDir = join(dir, "cases");
880
995
  const caseFiles = await collectCaseYamlFiles(casesDir);
881
996
  const tagged = base.cases.map((testCase, index) => ({
@@ -904,94 +1019,242 @@ async function loadSuiteDirectory(dir) {
904
1019
  cases
905
1020
  };
906
1021
  resolveSuitePaths(suite, suiteYamlPath);
907
- return suite;
1022
+ return buildSuiteDocument(suiteYamlPath, suite, judge, pipeline);
1023
+ }
1024
+ /** Load a single suite.yaml file (inline cases). */
1025
+ async function loadSuiteDocumentFile(absolutePath, strict) {
1026
+ let content;
1027
+ try {
1028
+ content = await readFile(absolutePath, "utf8");
1029
+ } catch (err) {
1030
+ throw new ConfigError(`failed to read suite file: ${err instanceof Error ? err.message : String(err)}`, absolutePath);
1031
+ }
1032
+ const { judge, pipeline, suite } = parseSuiteFileRoot(content, absolutePath, "single", strict);
1033
+ resolveSuitePaths(suite, absolutePath);
1034
+ return buildSuiteDocument(absolutePath, suite, judge, pipeline);
908
1035
  }
909
1036
  /**
910
- * Parse suite YAML from a string (single-file layout with inline cases).
1037
+ * Parse suite.yaml root and validate against the appropriate schema.
911
1038
  *
912
- * @param sourcePath Optional path for error messages and relative path resolution.
1039
+ * When `strict` is true, uses extended schemas that validate `judge:` and
1040
+ * `pipeline:` blocks (for `loadSuiteDocument`). When false, uses base schemas
1041
+ * that silently strip unknown keys (for `loadSuite`).
913
1042
  */
914
- function parseSuite(yamlContent, sourcePath) {
1043
+ function parseSuiteFileRoot(yamlContent, sourcePath, layout, strict) {
915
1044
  let raw;
916
1045
  try {
917
1046
  raw = parse(yamlContent);
918
1047
  } catch (err) {
919
1048
  throw new ConfigError(`YAML parse error: ${err instanceof Error ? err.message : String(err)}`, sourcePath);
920
1049
  }
921
- const validated = TestSuiteSchema.safeParse(raw);
922
- if (!validated.success) throw new ConfigError(`validation failed:\n${formatZodError(validated.error, sourcePath)}`, sourcePath);
923
- const suite = transformSuite(validated.data);
924
- if (sourcePath) resolveSuitePaths(suite, resolve(sourcePath));
925
- return suite;
1050
+ if (!strict) {
1051
+ const validated = (layout === "directory" ? SuiteDirectorySchema : TestSuiteSchema).safeParse(raw);
1052
+ if (!validated.success) throw new ConfigError(`validation failed:\n${formatZodError$2(validated.error, sourcePath)}`, sourcePath);
1053
+ return { suite: (layout === "directory" ? transformSuiteDirectory : transformSuite)(validated.data) };
1054
+ }
1055
+ if (layout === "directory") {
1056
+ const validated = SuiteFileDirectorySchema.safeParse(raw);
1057
+ if (!validated.success) throw new ConfigError(`validation failed:\n${formatZodError$2(validated.error, sourcePath)}`, sourcePath);
1058
+ return extractSuiteFileParts(validated.data, sourcePath, transformSuiteDirectory);
1059
+ }
1060
+ const validated = SuiteFileSingleSchema.safeParse(raw);
1061
+ if (!validated.success) throw new ConfigError(`validation failed:\n${formatZodError$2(validated.error, sourcePath)}`, sourcePath);
1062
+ return extractSuiteFileParts(validated.data, sourcePath, transformSuite);
926
1063
  }
927
- /** Parse `suite.yaml` for directory layout (cases may be omitted). */
928
- function parseSuiteDirectory(yamlContent, sourcePath) {
1064
+ /** Split validated YAML into suite, judge, and pipeline with path resolution. */
1065
+ function extractSuiteFileParts(data, sourcePath, transform) {
1066
+ const { judge: rawJudge, pipeline: rawPipeline, ...suiteRaw } = data;
1067
+ const suite = transform(suiteRaw);
1068
+ let judge;
1069
+ if (rawJudge) {
1070
+ judge = { ...rawJudge };
1071
+ resolveGradingConfigPaths({ judge }, sourcePath);
1072
+ }
1073
+ let pipeline;
1074
+ if (rawPipeline) {
1075
+ pipeline = transformPipelineConfig(rawPipeline);
1076
+ pipeline = resolvePipelineConfigPaths(pipeline, sourcePath);
1077
+ }
1078
+ return {
1079
+ suite,
1080
+ judge,
1081
+ pipeline
1082
+ };
1083
+ }
1084
+ /** Apply default artifact filenames when a pipeline step key is present but paths are omitted. */
1085
+ function transformPipelineConfig(raw) {
1086
+ const pipeline = {};
1087
+ if (raw.run !== void 0) pipeline.run = {
1088
+ output: raw.run?.output ?? DEFAULT_PIPELINE_OUTPUTS.run,
1089
+ maxConcurrent: raw.run?.maxConcurrent
1090
+ };
1091
+ if (raw.grade !== void 0) pipeline.grade = {
1092
+ input: raw.grade?.input,
1093
+ output: raw.grade?.output ?? DEFAULT_PIPELINE_OUTPUTS.grade,
1094
+ maxConcurrent: raw.grade?.maxConcurrent
1095
+ };
1096
+ if (raw.envelope !== void 0) pipeline.envelope = {
1097
+ report: raw.envelope?.report,
1098
+ grading: raw.envelope?.grading,
1099
+ output: raw.envelope?.output ?? DEFAULT_PIPELINE_OUTPUTS.envelope,
1100
+ projection: raw.envelope?.projection ?? "envelope",
1101
+ includeRawStreamEvents: raw.envelope?.includeRawStreamEvents,
1102
+ noTranscript: raw.envelope?.noTranscript
1103
+ };
1104
+ return pipeline;
1105
+ }
1106
+ /** Assemble the runtime {@link SuiteDocument} from parsed parts. */
1107
+ function buildSuiteDocument(suitePath, suite, judge, pipeline) {
1108
+ return {
1109
+ suitePath: resolve(suitePath),
1110
+ suite,
1111
+ judge,
1112
+ pipeline
1113
+ };
1114
+ }
1115
+ function formatZodError$2(err, sourcePath) {
1116
+ return err.issues.map((issue) => {
1117
+ const path = issue.path.length > 0 ? issue.path.join(".") : "(root)";
1118
+ return ` ${sourcePath ? `${sourcePath} → ${path}` : path}: ${issue.message}`;
1119
+ }).join("\n");
1120
+ }
1121
+ //#endregion
1122
+ //#region src/config/grading-loader.ts
1123
+ /**
1124
+ * Load standalone grading YAML for `harness-eval grade`.
1125
+ *
1126
+ * Also accepts unified suite.yaml files with an inline `judge:` block.
1127
+ */
/**
 * Load grading YAML from disk and resolve relative paths.
 *
 * `filePath` may name a standalone grading file, a unified suite file, or a
 * directory; for a directory the `suite.yaml` inside it is loaded instead.
 *
 * @param filePath Path to the grading file, suite file, or suite directory.
 * @returns The grading config produced by the delegated parser.
 * @throws {ConfigError} When the path cannot be stat'ed or read, or when the
 *   delegated parser rejects the content (including YAML syntax errors).
 */
async function loadGradingConfig(filePath) {
	const absolutePath = resolve(filePath);
	let info;
	try {
		info = await stat(absolutePath);
	} catch (err) {
		throw new ConfigError(`failed to read grading config: ${err instanceof Error ? err.message : String(err)}`, filePath);
	}
	if (info.isDirectory()) return loadGradingFromSuiteYaml(join(absolutePath, "suite.yaml"));
	let content;
	try {
		content = await readFile(absolutePath, "utf8");
	} catch (err) {
		throw new ConfigError(`failed to read grading config: ${err instanceof Error ? err.message : String(err)}`, filePath);
	}
	// parseGradingConfig parses the text once, wraps YAML syntax errors in
	// ConfigError, and routes unified suite files to the suite parser itself.
	// Parsing here as well would repeat that work and let a raw parser error
	// escape unwrapped.
	return parseGradingConfig(content, absolutePath);
}
/**
 * Parse grading YAML from a string.
 *
 * Content recognised as a unified suite file is handed to the suite-based
 * grading parser; anything else is validated against the grading schema.
 *
 * @param yamlContent YAML text to parse.
 * @param sourcePath Optional path for error messages and path resolution.
 * @returns Config object holding a copy of the validated `judge` block.
 * @throws {ConfigError} On YAML syntax errors or schema validation failure.
 */
function parseGradingConfig(yamlContent, sourcePath) {
	let document;
	try {
		document = parse(yamlContent);
	} catch (err) {
		const reason = err instanceof Error ? err.message : String(err);
		throw new ConfigError(`YAML parse error: ${reason}`, sourcePath);
	}
	if (isSuiteRoot(document)) {
		// Without a known source path the suite parser is given a placeholder name.
		return parseGradingFromSuiteRaw(document, sourcePath ?? "suite.yaml");
	}
	const outcome = GradingConfigSchema.safeParse(document);
	if (!outcome.success) {
		const details = formatZodError$1(outcome.error, sourcePath);
		throw new ConfigError(`validation failed:\n${details}`, sourcePath);
	}
	const gradingConfig = { judge: { ...outcome.data.judge } };
	if (sourcePath) {
		resolveGradingConfigPaths(gradingConfig, sourcePath);
	}
	return gradingConfig;
}
939
- /** Parse one case file: single case, array, or `{ cases: [...] }`. */
940
- function parseCasesFile(yamlContent, sourcePath) {
/**
 * Tell a unified suite.yaml apart from standalone grading YAML by its keys.
 *
 * @param raw Parsed YAML value of any type.
 * @returns `true` when `raw` is an object that has `cases`, or has both
 *   `matrix` and `adapter`; `false` otherwise.
 */
function isSuiteRoot(raw) {
	const isObjectLike = raw !== null && typeof raw === "object";
	if (!isObjectLike) return false;
	if ("cases" in raw) return true;
	return "matrix" in raw && "adapter" in raw;
}
/**
 * Read a suite YAML file and derive the grading config from it.
 *
 * @param suiteYamlPath Path of the suite file to read.
 * @returns The grading config built by the suite-based grading parser.
 * @throws {ConfigError} When the file cannot be read or is not valid YAML.
 */
async function loadGradingFromSuiteYaml(suiteYamlPath) {
	let yamlText;
	try {
		yamlText = await readFile(suiteYamlPath, "utf8");
	} catch (err) {
		const reason = err instanceof Error ? err.message : String(err);
		throw new ConfigError(`failed to read suite file: ${reason}`, suiteYamlPath);
	}
	let document;
	try {
		document = parse(yamlText);
	} catch (err) {
		const reason = err instanceof Error ? err.message : String(err);
		throw new ConfigError(`YAML parse error: ${reason}`, suiteYamlPath);
	}
	return parseGradingFromSuiteRaw(document, suiteYamlPath);
}
/**
 * Extract the grading config from an already-parsed unified suite document.
 *
 * The single-file schema is tried first and the directory schema second; the
 * first schema that accepts the document decides the outcome.
 *
 * @param raw Parsed suite YAML.
 * @param sourcePath Path used in error messages and for path resolution.
 * @returns Config object holding a copy of the suite's `judge` block.
 * @throws {ConfigError} When the accepted document has no `judge` block, or
 *   when neither schema accepts the document.
 */
function parseGradingFromSuiteRaw(raw, sourcePath) {
	// Shared tail of both success paths: require a judge block, copy it, resolve paths.
	const toGradingConfig = (data) => {
		if (!data.judge) throw new ConfigError("suite file has no judge block", sourcePath);
		const gradingConfig = { judge: { ...data.judge } };
		resolveGradingConfigPaths(gradingConfig, sourcePath);
		return gradingConfig;
	};
	const asSingle = SuiteFileSingleSchema.safeParse(raw);
	if (asSingle.success) return toGradingConfig(asSingle.data);
	const asDirectory = SuiteFileDirectorySchema.safeParse(raw);
	if (asDirectory.success) return toGradingConfig(asDirectory.data);
	// Report the directory schema's error, falling back to the single-file one.
	const zodError = asDirectory.error ?? asSingle.error;
	throw new ConfigError(`validation failed:\n${formatZodError$1(zodError, sourcePath)}`, sourcePath);
}
/**
 * Format a zod validation error with optional source file prefix.
 *
 * @param err Error object exposing an `issues` array of `{ path, message }`.
 * @param sourcePath Optional file path prepended to each issue location.
 * @returns One line per issue, joined with newlines.
 */
function formatZodError$1(err, sourcePath) {
	const lines = [];
	for (const issue of err.issues) {
		// Issues on the document root have an empty path.
		const location = issue.path.length > 0 ? issue.path.join(".") : "(root)";
		const label = sourcePath ? `${sourcePath} → ${location}` : location;
		lines.push(` ${label}: ${issue.message}`);
	}
	return lines.join("\n");
}
1210
+ //#endregion
1211
+ //#region src/config/loader.ts
949
1212
  /**
950
- * Normalize raw YAML into a list of {@link RawTestCase} objects.
1213
+ * Load a `TestSuite` from a YAML file, directory, or string.
951
1214
  *
952
- * Accepts a single case, an array, or `{ cases: [...] }`.
1215
+ * For unified suite.yaml with optional `judge:` and `pipeline:` blocks,
1216
+ * use {@link loadSuiteDocument}.
953
1217
  */
954
- function extractRawCases(raw, sourcePath) {
955
- if (Array.isArray(raw)) return raw.map((item, index) => validateRawCase(item, sourcePath, index));
956
- if (raw && typeof raw === "object") {
957
- const obj = raw;
958
- if (Array.isArray(obj.cases)) return obj.cases.map((item, index) => validateRawCase(item, sourcePath, index));
959
- if ("id" in obj && "prompt" in obj && "assertions" in obj) return [validateRawCase(raw, sourcePath, 0)];
960
- }
961
- throw new ConfigError("expected a case object, array of cases, or { cases: [...] }", sourcePath);
962
- }
963
- /** Validate one raw case object against {@link TestCaseSchema}. */
964
- function validateRawCase(raw, sourcePath, index) {
965
- const validated = TestCaseSchema.safeParse(raw);
966
- if (!validated.success) throw new ConfigError(`validation failed:\n${formatZodError(validated.error, sourcePath)}`, sourcePath);
967
- return validated.data;
/**
 * Load only the suite portion from a file path or directory path.
 *
 * The full document is loaded with orchestration validation switched off, so
 * `judge:` / `pipeline:` blocks are not validated on this path and callers
 * that only need the `TestSuite` are not affected by them. Use
 * {@link loadSuiteDocument} when validated orchestration metadata is needed.
 *
 * @param filePath Suite file or suite directory to load.
 * @returns The `suite` member of the loaded suite document.
 */
async function loadSuite(filePath) {
	const loadOptions = { validateOrchestration: false };
	const suiteDocument = await loadSuiteDocument(filePath, loadOptions);
	return suiteDocument.suite;
}
969
1228
  /**
970
- * Recursively collect `.yaml` / `.yml` files under `casesDir`.
1229
+ * Parse suite YAML from a string (single-file layout with inline cases).
971
1230
  *
972
- * Returns an empty list when the directory does not exist — external cases
973
- * are optional in directory layout.
1231
+ * Unknown top-level keys such as `judge` and `pipeline` are stripped.
974
1232
  */
975
- async function collectCaseYamlFiles(casesDir) {
976
- const files = [];
977
- async function walk(dir) {
978
- let entries;
979
- try {
980
- entries = await readdir(dir, { withFileTypes: true });
981
- } catch (err) {
982
- if (err instanceof Error && "code" in err && err.code === "ENOENT") return;
983
- throw err;
984
- }
985
- for (const entry of entries) {
986
- const fullPath = join(dir, entry.name);
987
- if (entry.isDirectory()) await walk(fullPath);
988
- else if (entry.isFile() && (entry.name.endsWith(".yaml") || entry.name.endsWith(".yml"))) files.push(fullPath);
989
- }
/**
 * Parse and validate suite YAML text into a suite object.
 *
 * @param yamlContent YAML text of the suite.
 * @param sourcePath Optional path used in error messages; when given, the
 *   suite's paths are resolved against its absolute form.
 * @returns The transformed suite.
 * @throws {ConfigError} On YAML syntax errors or schema validation failure.
 */
function parseSuite(yamlContent, sourcePath) {
	let document;
	try {
		document = parse(yamlContent);
	} catch (err) {
		const reason = err instanceof Error ? err.message : String(err);
		throw new ConfigError(`YAML parse error: ${reason}`, sourcePath);
	}
	const outcome = TestSuiteSchema.safeParse(document);
	if (!outcome.success) {
		const details = formatZodError(outcome.error, sourcePath);
		throw new ConfigError(`validation failed:\n${details}`, sourcePath);
	}
	const suite = transformSuite(outcome.data);
	if (sourcePath) {
		resolveSuitePaths(suite, resolve(sourcePath));
	}
	return suite;
}
/**
 * Parse the `suite.yaml` of a directory-layout suite (cases may be omitted).
 *
 * @param yamlContent YAML text of the directory's `suite.yaml`.
 * @param sourcePath Optional path used in error messages.
 * @returns The transformed directory-suite data.
 * @throws {ConfigError} On YAML syntax errors or schema validation failure.
 * @internal
 */
function parseSuiteDirectory(yamlContent, sourcePath) {
	let document;
	try {
		document = parse(yamlContent);
	} catch (err) {
		const reason = err instanceof Error ? err.message : String(err);
		throw new ConfigError(`YAML parse error: ${reason}`, sourcePath);
	}
	const outcome = SuiteDirectorySchema.safeParse(document);
	if (outcome.success) {
		return transformSuiteDirectory(outcome.data);
	}
	const details = formatZodError(outcome.error, sourcePath);
	throw new ConfigError(`validation failed:\n${details}`, sourcePath);
}
994
- /** Format a zod validation error with optional source file prefix. */
995
1258
  function formatZodError(err, sourcePath) {
996
1259
  return err.issues.map((issue) => {
997
1260
  const path = issue.path.length > 0 ? issue.path.join(".") : "(root)";
@@ -999,6 +1262,6 @@ function formatZodError(err, sourcePath) {
999
1262
  }).join("\n");
1000
1263
  }
1001
1264
  //#endregion
1002
- export { parseGradingConfig as a, loadGradingConfig as i, parseCasesFile as n, ConfigError as o, parseSuite as r, loadSuite as t };
1265
+ export { parseGradingConfig as a, parseCasesFile as c, loadGradingConfig as i, ConfigError as l, parseSuite as n, loadSuiteDocument as o, parseSuiteDirectory as r, DEFAULT_PIPELINE_OUTPUTS as s, loadSuite as t };
1003
1266
 
1004
- //# sourceMappingURL=loader-DcI0KfRX.js.map
1267
+ //# sourceMappingURL=loader-DnQ6Jt0i.js.map