@alis-build/harness-eval 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/README.md +104 -10
  2. package/dist/adapters/claude-code/index.d.ts +2 -2
  3. package/dist/adapters/claude-code/index.js +2 -1
  4. package/dist/adapters/codex/index.d.ts +68 -0
  5. package/dist/adapters/codex/index.js +3 -0
  6. package/dist/{claude-code-ycT0JQZF.js → claude-code-C_7hxC8z.js} +37 -250
  7. package/dist/claude-code-C_7hxC8z.js.map +1 -0
  8. package/dist/cli/bin.js +204 -127
  9. package/dist/cli/bin.js.map +1 -1
  10. package/dist/codex-0cHO2te9.js +496 -0
  11. package/dist/codex-0cHO2te9.js.map +1 -0
  12. package/dist/config/loader.d.ts +2 -2
  13. package/dist/config/loader.js +2 -2
  14. package/dist/{index-6Z17eKZx.d.ts → index-DnvP1UBl.d.ts} +3 -2
  15. package/dist/index.d.ts +397 -153
  16. package/dist/index.js +125 -5
  17. package/dist/index.js.map +1 -0
  18. package/dist/loader-B1WmGGzf.d.ts +107 -0
  19. package/dist/{loader-BCnFJ8rm.js → loader-DnQ6Jt0i.js} +707 -157
  20. package/dist/loader-DnQ6Jt0i.js.map +1 -0
  21. package/dist/reporter-Biy-5-9M.js +2216 -0
  22. package/dist/reporter-Biy-5-9M.js.map +1 -0
  23. package/dist/runner/suite.d.ts +1 -1
  24. package/dist/runner/suite.js +1 -1
  25. package/dist/{suite-BoOvK_lq.d.ts → suite-BEShV0by.d.ts} +7 -2
  26. package/dist/{suite-chj0j22j.js → suite-BcP64nlb.js} +72 -4
  27. package/dist/suite-BcP64nlb.js.map +1 -0
  28. package/dist/{types-BQol062t.d.ts → types-0QkNVyp9.d.ts} +152 -11
  29. package/dist/types-Bac8_Ixb.js +246 -0
  30. package/dist/types-Bac8_Ixb.js.map +1 -0
  31. package/dist/types-Bu8uOZZN.d.ts +77 -0
  32. package/dist/{types-B9H4IZtA.d.ts → types-C0gBkl0-.d.ts} +3 -2
  33. package/package.json +7 -2
  34. package/schemas/eval-interchange-instances.schema.json +196 -0
  35. package/schemas/eval-interchange.schema.json +65 -52
  36. package/schemas/eval-run-envelope.schema.json +182 -425
  37. package/dist/build-DsVJ_UeU.js +0 -1396
  38. package/dist/build-DsVJ_UeU.js.map +0 -1
  39. package/dist/claude-code-ycT0JQZF.js.map +0 -1
  40. package/dist/loader-BCnFJ8rm.js.map +0 -1
  41. package/dist/loader-DTvoVfN0.d.ts +0 -33
  42. package/dist/suite-chj0j22j.js.map +0 -1
  43. package/schemas/eval-interchange-agent-trace.schema.json +0 -322
  44. package/schemas/eval-interchange-proto-instance.schema.json +0 -106
@@ -3,72 +3,13 @@ import { readFile, readdir, stat } from "node:fs/promises";
3
3
  import { isAbsolute, join, relative, resolve } from "node:path";
4
4
  import { parse } from "yaml";
5
5
  import { z } from "zod";
6
- //#region src/config/paths.ts
7
- /**
8
- * Resolve relative paths in suite config against the suite file directory.
9
- */
10
- function resolvePath(value, suiteDir) {
11
- if (isAbsolute(value) || value.startsWith("~/")) return value;
12
- return join(suiteDir, value);
13
- }
14
- function resolveClaudeCodePaths(block, suiteDir) {
15
- const resolved = { ...block };
16
- if (typeof resolved.mcpConfig === "string") resolved.mcpConfig = resolvePath(resolved.mcpConfig, suiteDir);
17
- if (Array.isArray(resolved.pluginDirs)) resolved.pluginDirs = resolved.pluginDirs.map((p) => typeof p === "string" ? resolvePath(p, suiteDir) : p);
18
- if (Array.isArray(resolved.addDirs)) resolved.addDirs = resolved.addDirs.map((p) => typeof p === "string" ? resolvePath(p, suiteDir) : p);
19
- for (const field of [
20
- "systemPromptFile",
21
- "appendSystemPromptFile",
22
- "debugFile"
23
- ]) {
24
- const value = resolved[field];
25
- if (typeof value === "string" && !value.trim().startsWith("{")) resolved[field] = resolvePath(value, suiteDir);
26
- }
27
- if (typeof resolved.settings === "string" && !resolved.settings.trim().startsWith("{")) resolved.settings = resolvePath(resolved.settings, suiteDir);
28
- return resolved;
29
- }
30
- /** Resolve relative paths in a config layer relative to `suiteDir`. */
31
- function resolveConfigPaths(config, suiteDir) {
32
- if (!config) return void 0;
33
- const resolved = { ...config };
34
- if (typeof resolved.cwd === "string") resolved.cwd = resolvePath(resolved.cwd, suiteDir);
35
- if (resolved.claudeCode && typeof resolved.claudeCode === "object" && !Array.isArray(resolved.claudeCode)) resolved.claudeCode = resolveClaudeCodePaths(resolved.claudeCode, suiteDir);
36
- return resolved;
37
- }
38
- /** Resolve paths on an entire suite after load. */
39
- function resolveSuitePaths(suite, suiteFilePath) {
40
- const suiteDir = configFileDir(suiteFilePath);
41
- suite.defaultConfig = resolveConfigPaths(suite.defaultConfig, suiteDir);
42
- for (const cell of suite.matrix) cell.config = resolveConfigPaths(cell.config, suiteDir) ?? cell.config;
43
- for (const testCase of suite.cases) testCase.config = resolveConfigPaths(testCase.config, suiteDir);
44
- }
45
- function configFileDir(filePath) {
46
- return filePath.includes("/") || filePath.includes("\\") ? filePath.replace(/[/\\][^/\\]+$/, "") : ".";
47
- }
48
- function resolveEnvPaths(env, baseDir) {
49
- const resolved = {};
50
- for (const [key, value] of Object.entries(env)) if (value.startsWith("./") || value.startsWith("../") || value.includes("/") && !value.startsWith("http")) resolved[key] = resolvePath(value, baseDir);
51
- else resolved[key] = value;
52
- return resolved;
53
- }
54
- /** Resolve relative paths in a standalone grading config file. */
55
- function resolveGradingConfigPaths(config, configFilePath) {
56
- const baseDir = configFileDir(configFilePath);
57
- const { adapter, maxConcurrent, ...rest } = config.judge;
58
- config.judge = {
59
- ...resolveConfigPaths(rest, baseDir) ?? rest,
60
- adapter,
61
- maxConcurrent
62
- };
63
- if (config.judge.env) config.judge.env = resolveEnvPaths(config.judge.env, baseDir);
64
- }
65
- //#endregion
66
6
  //#region src/config/schema.ts
67
7
  /**
68
8
  * zod schemas for the YAML on-disk shape.
69
9
  *
70
10
  * Config uses a nested layout: generic harness fields at the top level,
71
- * adapter-specific options under a named key (e.g. `claudeCode`).
11
+ * adapter-specific options under a named key (e.g. `claudeCode`). Validated
12
+ * raw shapes are transformed into runtime types by `src/config/transform.ts`.
72
13
  */
73
14
  /** Claude Code adapter-specific options (nested under `claudeCode`). */
74
15
  const ClaudeCodeConfigSchema = z.object({
@@ -117,13 +58,40 @@ const ClaudeCodeConfigSchema = z.object({
117
58
  maxTurns: z.number().int().positive(),
118
59
  isolateConfig: z.boolean()
119
60
  }).partial();
61
+ /** Codex CLI adapter-specific options (nested under `codex`). */
62
+ const CodexConfigSchema = z.object({
63
+ binary: z.string(),
64
+ profile: z.string(),
65
+ sandbox: z.enum([
66
+ "read-only",
67
+ "workspace-write",
68
+ "danger-full-access"
69
+ ]),
70
+ addDirs: z.array(z.string()),
71
+ configOverrides: z.array(z.string()),
72
+ askForApproval: z.enum([
73
+ "untrusted",
74
+ "on-request",
75
+ "never"
76
+ ]),
77
+ dangerouslyBypassApprovalsAndSandbox: z.boolean(),
78
+ dangerouslyBypassHookTrust: z.boolean(),
79
+ ephemeral: z.boolean(),
80
+ ignoreUserConfig: z.boolean(),
81
+ skipGitRepoCheck: z.boolean(),
82
+ outputSchema: z.string(),
83
+ outputLastMessage: z.string(),
84
+ captureLastMessage: z.boolean(),
85
+ isolateConfig: z.boolean()
86
+ }).partial();
120
87
  /** Generic + nested adapter config for one layer (defaultConfig, case, cell). */
121
88
  const ConfigPartialSchema = z.object({
122
89
  model: z.string(),
123
90
  cwd: z.string(),
124
91
  timeoutMs: z.number().int().positive(),
125
92
  env: z.record(z.string(), z.string()),
126
- claudeCode: ClaudeCodeConfigSchema
93
+ claudeCode: ClaudeCodeConfigSchema,
94
+ codex: CodexConfigSchema
127
95
  }).partial();
128
96
  /** A matrix cell — one point in the configuration matrix. */
129
97
  const MatrixCellSchema = z.object({
@@ -136,6 +104,11 @@ const ReferenceToolCallSchema = z.object({
136
104
  tool_name: z.string().min(1),
137
105
  tool_input: z.unknown()
138
106
  });
107
+ /** Reference trajectory in suite YAML — array of steps or object with mode + steps. */
108
+ const ReferenceTrajectorySchema = z.union([z.array(ReferenceToolCallSchema), z.object({
109
+ tool_name_mode: z.enum(["harness", "bare"]).optional(),
110
+ steps: z.array(ReferenceToolCallSchema).min(1)
111
+ })]);
139
112
  /** A test case. */
140
113
  const TestCaseSchema = z.object({
141
114
  id: z.string().min(1),
@@ -143,7 +116,7 @@ const TestCaseSchema = z.object({
143
116
  category: z.string().optional(),
144
117
  notes: z.string().optional(),
145
118
  expectations: z.array(z.string().min(1)).optional(),
146
- reference_trajectory: z.array(ReferenceToolCallSchema).optional(),
119
+ reference_trajectory: ReferenceTrajectorySchema.optional(),
147
120
  human_ratings: z.record(z.string(), z.number()).optional(),
148
121
  assertions: z.array(z.unknown()).min(1),
149
122
  repetitions: z.number().int().positive().optional(),
@@ -192,6 +165,7 @@ function transformSuiteDirectory(raw) {
192
165
  function transformTestCases(raw, pathPrefix) {
193
166
  return raw.map((c, i) => transformTestCase(c, `${pathPrefix}[${i}]`));
194
167
  }
168
+ /** Merge suite-level parts shared by single-file and directory transforms. */
195
169
  function transformSuiteParts(raw) {
196
170
  return {
197
171
  adapter: raw.adapter,
@@ -200,6 +174,21 @@ function transformSuiteParts(raw) {
200
174
  cases: raw.cases.map((c, i) => transformTestCase(c, `cases[${i}]`))
201
175
  };
202
176
  }
177
+ /**
178
+ * Normalize reference trajectory YAML into {@link ReferenceTrajectoryConfig}.
179
+ *
180
+ * Accepts a bare step array or `{ tool_name_mode?, steps }` object form.
181
+ */
182
+ function normalizeReferenceTrajectory(raw, path) {
183
+ if (raw === void 0) return void 0;
184
+ if (Array.isArray(raw)) return { steps: raw };
185
+ if (!isPlainObject(raw) || !Array.isArray(raw.steps)) throw new ConfigError("reference_trajectory must be an array of tool calls or { tool_name_mode?, steps: [...] }", path);
186
+ return {
187
+ tool_name_mode: raw.tool_name_mode,
188
+ steps: raw.steps
189
+ };
190
+ }
191
+ /** Map raw matrix cell YAML to runtime {@link MatrixCell}. */
203
192
  function transformMatrixCell(raw) {
204
193
  return {
205
194
  label: raw.label,
@@ -207,6 +196,7 @@ function transformMatrixCell(raw) {
207
196
  axes: raw.axes
208
197
  };
209
198
  }
199
+ /** Map one raw test case to runtime {@link TestCase}, transforming assertions. */
210
200
  function transformTestCase(raw, path) {
211
201
  return {
212
202
  id: raw.id,
@@ -214,7 +204,7 @@ function transformTestCase(raw, path) {
214
204
  category: raw.category,
215
205
  notes: raw.notes,
216
206
  expectations: raw.expectations,
217
- reference_trajectory: raw.reference_trajectory,
207
+ reference_trajectory: normalizeReferenceTrajectory(raw.reference_trajectory, `${path}.reference_trajectory`),
218
208
  human_ratings: raw.human_ratings,
219
209
  repetitions: raw.repetitions,
220
210
  config: raw.config,
@@ -223,6 +213,17 @@ function transformTestCase(raw, path) {
223
213
  }
224
214
  /** Keys that may appear alongside an assertion-type key. Not assertion types themselves. */
225
215
  const SIBLING_KEYS = /* @__PURE__ */ new Set(["threshold"]);
216
+ /**
217
+ * Parse optional `threshold` sibling and delegate the assertion body to
218
+ * {@link transformAssertion}.
219
+ *
220
+ * @throws {ConfigError} When the wrapper is not an object, threshold is out of
221
+ * `[0, 1]`, or the nested assertion fails validation.
222
+ *
223
+ * @example
224
+ * transformThresholdedAssertion({ called: "Read", threshold: 0.9 }, "path")
225
+ * // → { assertion: { type: "called", tool: "Read" }, threshold: 0.9 }
226
+ */
226
227
  function transformThresholdedAssertion(raw, path) {
227
228
  if (!isPlainObject(raw)) throw new ConfigError(`expected object, got ${typeOf(raw)}`, path);
228
229
  const threshold = raw.threshold;
@@ -240,6 +241,19 @@ function transformThresholdedAssertion(raw, path) {
240
241
  * Finds the single non-sibling key, dispatches to the per-type transformer.
241
242
  * Per-type transformers handle both verbose-object and shortcut-scalar input
242
243
  * shapes where applicable.
244
+ *
245
+ * @param raw - Single assertion object from parsed YAML (may include `threshold` sibling).
246
+ * @param path - JSON-path-like location for error messages (e.g. `cases[0].assertions[1]`).
247
+ * @returns Runtime {@link Assertion} tagged union.
248
+ * @throws {ConfigError} When the object has no assertion key, multiple type keys, or an unknown type.
249
+ *
250
+ * @example
251
+ * transformAssertion({ called: "Read" }, "cases[0].assertions[0]")
252
+ * // → { type: "called", tool: "Read" }
253
+ *
254
+ * @example
255
+ * transformAssertion({ called: { tool: "Read", times: ">= 2" } }, "path")
256
+ * // → { type: "called", tool: "Read", times: ">= 2" }
243
257
  */
244
258
  function transformAssertion(raw, path) {
245
259
  if (!isPlainObject(raw)) throw new ConfigError(`expected object, got ${typeOf(raw)}`, path);
@@ -271,6 +285,22 @@ function transformAssertion(raw, path) {
271
285
  default: throw new ConfigError(`unknown assertion type: ${typeKey}`, path);
272
286
  }
273
287
  }
288
+ /**
289
+ * Transform `called` YAML (scalar or `{tool, times?}`) to runtime assertion.
290
+ *
291
+ * @throws {ConfigError} When value is neither string nor object, tool is invalid,
292
+ * or `times` is not a valid cardinality string.
293
+ *
294
+ * @example
295
+ * // Scalar shortcut
296
+ * transformCalled("mcp__api__search_skills", "path")
297
+ * // → { type: "called", tool: "mcp__api__search_skills" }
298
+ *
299
+ * @example
300
+ * // Verbose form with cardinality
301
+ * transformCalled({ tool: "Read", times: ">= 1" }, "path")
302
+ * // → { type: "called", tool: "Read", times: ">= 1" }
303
+ */
274
304
  function transformCalled(value, path) {
275
305
  if (typeof value === "string") return {
276
306
  type: "called",
@@ -293,6 +323,14 @@ function transformCalled(value, path) {
293
323
  times
294
324
  };
295
325
  }
326
+ /**
327
+ * Transform `not_called` YAML (scalar or `{tool}`).
328
+ *
329
+ * @throws {ConfigError} When value is neither string nor object with a valid `tool`.
330
+ *
331
+ * @example
332
+ * transformNotCalled("Bash", "path") // → { type: "not_called", tool: "Bash" }
333
+ */
296
334
  function transformNotCalled(value, path) {
297
335
  if (typeof value === "string") return {
298
336
  type: "not_called",
@@ -304,18 +342,45 @@ function transformNotCalled(value, path) {
304
342
  tool: requireToolPattern(value.tool, `${path}.tool`)
305
343
  };
306
344
  }
345
+ /**
346
+ * Transform `called_any_of` — bare tool list or `{tools: [...]}`.
347
+ *
348
+ * @throws {ConfigError} When the value is not an array or `{tools: [...]}` object.
349
+ *
350
+ * @example
351
+ * transformCalledAnyOf(["Read", "Glob"], "path")
352
+ * // → { type: "called_any_of", tools: ["Read", "Glob"] }
353
+ */
307
354
  function transformCalledAnyOf(value, path) {
308
355
  return {
309
356
  type: "called_any_of",
310
357
  tools: requireToolPatternList(value, path)
311
358
  };
312
359
  }
360
+ /**
361
+ * Transform `called_all_of` — bare tool list or `{tools: [...]}`.
362
+ *
363
+ * @throws {ConfigError} When the value is not an array or `{tools: [...]}` object.
364
+ *
365
+ * @example
366
+ * transformCalledAllOf({ tools: ["Read", "Grep"] }, "path")
367
+ * // → { type: "called_all_of", tools: ["Read", "Grep"] }
368
+ */
313
369
  function transformCalledAllOf(value, path) {
314
370
  return {
315
371
  type: "called_all_of",
316
372
  tools: requireToolPatternList(value, path)
317
373
  };
318
374
  }
375
+ /**
376
+ * Transform `called_before: {first, then}` ordering assertion.
377
+ *
378
+ * @throws {ConfigError} When value is not an object or `first`/`then` are invalid patterns.
379
+ *
380
+ * @example
381
+ * transformCalledBefore({ first: "SearchSkills", then: "LoadSkill" }, "path")
382
+ * // → { type: "called_before", first: "SearchSkills", then: "LoadSkill" }
383
+ */
319
384
  function transformCalledBefore(value, path) {
320
385
  if (!isPlainObject(value)) throw new ConfigError(`expected object with {first, then}, got ${typeOf(value)}`, path);
321
386
  return {
@@ -324,6 +389,19 @@ function transformCalledBefore(value, path) {
324
389
  then: requireToolPattern(value.then, `${path}.then`)
325
390
  };
326
391
  }
392
+ /**
393
+ * Transform `sequence` — tool list with optional `strict` flag.
394
+ *
395
+ * @throws {ConfigError} When value is neither a pattern array nor `{tools, strict?}` object.
396
+ *
397
+ * @example
398
+ * // Bare array (non-strict by default)
399
+ * transformSequence(["Read", "Edit"], "path")
400
+ *
401
+ * @example
402
+ * // Explicit strict ordering
403
+ * transformSequence({ tools: ["Read", "Edit"], strict: true }, "path")
404
+ */
327
405
  function transformSequence(value, path) {
328
406
  if (Array.isArray(value)) return {
329
407
  type: "sequence",
@@ -336,6 +414,19 @@ function transformSequence(value, path) {
336
414
  strict: value.strict === void 0 ? void 0 : requireBool(value.strict, `${path}.strict`)
337
415
  };
338
416
  }
417
+ /**
418
+ * Transform `called_with: {tool, args}` with predicate validation on args.
419
+ *
420
+ * @throws {ConfigError} When `tool` or `args` is missing/invalid, or `args` fails
421
+ * {@link validatePredicate}.
422
+ *
423
+ * @example
424
+ * transformCalledWith(
425
+ * { tool: "Read", args: { path: { contains: "README" } } },
426
+ * "path",
427
+ * )
428
+ * // → { type: "called_with", tool: "Read", args: { path: { contains: "README" } } }
429
+ */
339
430
  function transformCalledWith(value, path) {
340
431
  if (!isPlainObject(value)) throw new ConfigError(`expected object with {tool, args}, got ${typeOf(value)}`, path);
341
432
  const tool = requireToolPattern(value.tool, `${path}.tool`);
@@ -347,10 +438,32 @@ function transformCalledWith(value, path) {
347
438
  args: value.args
348
439
  };
349
440
  }
441
+ /**
442
+ * Transform `responded_without_tool_calls` — accepts true or empty object.
443
+ *
444
+ * @throws {ConfigError} When value is neither `true`, null, nor an empty object.
445
+ *
446
+ * @example
447
+ * transformRespondedWithoutToolCalls(true, "path")
448
+ * // → { type: "responded_without_tool_calls" }
449
+ */
350
450
  function transformRespondedWithoutToolCalls(value, path) {
351
451
  if (value === true || value === null || isPlainObject(value) && Object.keys(value).length === 0) return { type: "responded_without_tool_calls" };
352
452
  throw new ConfigError(`expected true or empty object, got ${JSON.stringify(value)}`, path);
353
453
  }
454
+ /**
455
+ * Transform budget assertions (`iterations_within`, `cost_within_usd`, `duration_within_ms`).
456
+ *
457
+ * @throws {ConfigError} When `max` is missing, non-positive, or not a number.
458
+ *
459
+ * @example
460
+ * transformScalarMax(5, "path", "iterations_within")
461
+ * // → { type: "iterations_within", max: 5 }
462
+ *
463
+ * @example
464
+ * transformScalarMax({ max: 2.5 }, "path", "cost_within_usd")
465
+ * // → { type: "cost_within_usd", max: 2.5 }
466
+ */
354
467
  function transformScalarMax(value, path, type) {
355
468
  let max;
356
469
  if (typeof value === "number") max = value;
@@ -362,6 +475,15 @@ function transformScalarMax(value, path, type) {
362
475
  max
363
476
  };
364
477
  }
478
+ /**
479
+ * Transform `finished_with` — stop reason string, list, or `{reasons}`.
480
+ *
481
+ * @throws {ConfigError} When value is not a string, string array, or `{reasons}` object.
482
+ *
483
+ * @example
484
+ * transformFinishedWith("end_turn", "path")
485
+ * // → { type: "finished_with", reasons: "end_turn" }
486
+ */
365
487
  function transformFinishedWith(value, path) {
366
488
  if (typeof value === "string") return {
367
489
  type: "finished_with",
@@ -384,6 +506,15 @@ function transformFinishedWith(value, path) {
384
506
  }
385
507
  throw new ConfigError(`expected string, string[], or {reasons: ...}, got ${JSON.stringify(value)}`, path);
386
508
  }
509
+ /**
510
+ * Transform `response_contains` / `response_not_contains` scalar or `{text}`.
511
+ *
512
+ * @throws {ConfigError} When value is neither a string nor `{text: string}`.
513
+ *
514
+ * @example
515
+ * transformResponseText("done", "path", "response_contains")
516
+ * // → { type: "response_contains", text: "done" }
517
+ */
387
518
  function transformResponseText(value, path, type) {
388
519
  if (typeof value === "string") return {
389
520
  type,
@@ -395,6 +526,15 @@ function transformResponseText(value, path, type) {
395
526
  };
396
527
  throw new ConfigError(`expected string or {text: string}, got ${JSON.stringify(value)}`, path);
397
528
  }
529
+ /**
530
+ * Transform `response_matches: {pattern, flags?}`.
531
+ *
532
+ * @throws {ConfigError} When `pattern` is missing or not a string.
533
+ *
534
+ * @example
535
+ * transformResponseMatches({ pattern: "error\\d+", flags: "i" }, "path")
536
+ * // → { type: "response_matches", pattern: "error\\d+", flags: "i" }
537
+ */
398
538
  function transformResponseMatches(value, path) {
399
539
  if (!isPlainObject(value)) throw new ConfigError(`expected object with {pattern, flags?}, got ${typeOf(value)}`, path);
400
540
  return {
@@ -403,24 +543,57 @@ function transformResponseMatches(value, path) {
403
543
  flags: value.flags === void 0 ? void 0 : requireString(value.flags, `${path}.flags`)
404
544
  };
405
545
  }
546
+ /**
547
+ * Transform compound `all_of` assertion list.
548
+ *
549
+ * @throws {ConfigError} When value is not an array or `{assertions: [...]}`.
550
+ *
551
+ * @example
552
+ * transformAllOf([{ called: "Read" }, { not_called: "Bash" }], "path")
553
+ */
406
554
  function transformAllOf(value, path) {
407
555
  return {
408
556
  type: "all_of",
409
557
  assertions: transformCompoundList(value, path)
410
558
  };
411
559
  }
560
+ /**
561
+ * Transform compound `any_of` assertion list.
562
+ *
563
+ * @throws {ConfigError} When value is not an array or `{assertions: [...]}`.
564
+ *
565
+ * @example
566
+ * transformAnyOf({ assertions: [{ called: "Read" }, { called: "Glob" }] }, "path")
567
+ */
412
568
  function transformAnyOf(value, path) {
413
569
  return {
414
570
  type: "any_of",
415
571
  assertions: transformCompoundList(value, path)
416
572
  };
417
573
  }
574
+ /**
575
+ * Transform compound `not` — single nested assertion, no threshold.
576
+ *
577
+ * The inner assertion uses the same single-key YAML shape as top-level
578
+ * assertions; thresholds apply only at the outer {@link transformThresholdedAssertion} level.
579
+ *
580
+ * @throws {ConfigError} Propagates from nested {@link transformAssertion}.
581
+ *
582
+ * @example
583
+ * transformNot({ called: "Bash" }, "path")
584
+ * // → { type: "not", assertion: { type: "called", tool: "Bash" } }
585
+ */
418
586
  function transformNot(value, path) {
419
587
  return {
420
588
  type: "not",
421
589
  assertion: transformAssertion(value, path)
422
590
  };
423
591
  }
592
+ /**
593
+ * Parse compound assertion list from array or `{assertions: [...]}`.
594
+ *
595
+ * @throws {ConfigError} When value is neither form.
596
+ */
424
597
  function transformCompoundList(value, path) {
425
598
  const list = Array.isArray(value) ? value : isPlainObject(value) && Array.isArray(value.assertions) ? value.assertions : null;
426
599
  if (list === null) throw new ConfigError(`expected array or {assertions: [...]}, got ${JSON.stringify(value)}`, path);
@@ -452,6 +625,9 @@ const COMPOUND_OPS = /* @__PURE__ */ new Set([
452
625
  * - single-key object whose key is a leaf op (e.g. `{contains: "x"}`)
453
626
  * - single-key compound (`{any_of: [...]}`, `{all_of: [...]}`, `{not: ...}`)
454
627
  * - multi-key object (descend into fields; each value is a sub-predicate)
628
+ *
629
+ * @throws {ConfigError} When a compound op has a non-array value or a leaf op
630
+ * has the wrong value type (e.g. non-string `contains`).
455
631
  */
456
632
  function validatePredicate(raw, path) {
457
633
  if (!isPlainObject(raw)) return;
@@ -474,6 +650,12 @@ function validatePredicate(raw, path) {
474
650
  }
475
651
  for (const [field, sub] of Object.entries(raw)) validatePredicate(sub, `${path}.${field}`);
476
652
  }
653
+ /**
654
+ * Validate a leaf predicate operator's value shape at config load time.
655
+ *
656
+ * @throws {ConfigError} When the operator's value has the wrong type or `regex`
657
+ * is not a valid JavaScript regular expression.
658
+ */
477
659
  function validateLeafOperator(op, value, path) {
478
660
  switch (op) {
479
661
  case "equals": return;
@@ -501,85 +683,293 @@ function validateLeafOperator(op, value, path) {
501
683
  default: return;
502
684
  }
503
685
  }
686
+ /** Require a tool pattern string or `{ pattern }` object. */
504
687
  function requireToolPattern(value, path) {
505
688
  if (typeof value === "string") return value;
506
689
  if (isPlainObject(value) && typeof value.pattern === "string") return { pattern: value.pattern };
507
690
  throw new ConfigError(`expected string or {pattern: string}, got ${JSON.stringify(value)}`, path);
508
691
  }
692
+ /** Require a bare tool pattern array or `{ tools: [...] }` wrapper. */
509
693
  function requireToolPatternList(value, path) {
510
694
  const list = Array.isArray(value) ? value : isPlainObject(value) && Array.isArray(value.tools) ? value.tools : null;
511
695
  if (list === null) throw new ConfigError(`expected array of tool patterns or {tools: [...]}, got ${JSON.stringify(value)}`, path);
512
696
  return list.map((v, i) => requireToolPattern(v, `${path}[${i}]`));
513
697
  }
698
+ /** Require a string value at `path` or throw {@link ConfigError}. */
514
699
  function requireString(value, path) {
515
700
  if (typeof value === "string") return value;
516
701
  throw new ConfigError(`expected string, got ${typeOf(value)}`, path);
517
702
  }
703
+ /** Require a boolean value at `path` or throw {@link ConfigError}. */
518
704
  function requireBool(value, path) {
519
705
  if (typeof value === "boolean") return value;
520
706
  throw new ConfigError(`expected boolean, got ${typeOf(value)}`, path);
521
707
  }
708
+ /** True for non-null, non-array objects (YAML mapping nodes). */
522
709
  function isPlainObject(x) {
523
710
  return typeof x === "object" && x !== null && !Array.isArray(x);
524
711
  }
712
+ /** Human-readable type name for config error messages. */
525
713
  function typeOf(x) {
526
714
  if (x === null) return "null";
527
715
  if (Array.isArray(x)) return "array";
528
716
  return typeof x;
529
717
  }
530
718
  //#endregion
531
- //#region src/config/grading-schema.ts
532
- /**
533
- * Zod schema for standalone grading YAML (`grading.yaml`).
534
- */
535
- /** Top-level `judge` block — mirrors harness config fields plus grader concurrency. */
536
- const JudgeConfigSchema = ConfigPartialSchema.extend({
537
- adapter: z.string().optional(),
538
- maxConcurrent: z.number().int().positive().optional(),
539
- /** Optional judge prompt prefix (maps to upstream system_instruction). */
540
- system_instruction: z.string().optional()
541
- });
542
- const GradingConfigSchema = z.object({ judge: JudgeConfigSchema });
543
- //#endregion
544
- //#region src/config/grading-loader.ts
719
+ //#region src/config/loader-internals.ts
545
720
  /**
546
- * Load standalone grading YAML for `harness-eval grade`.
721
+ * Shared suite loader helpers (case file collection and parsing).
547
722
  */
548
- async function loadGradingConfig(filePath) {
549
- const absolutePath = resolve(filePath);
550
- let content;
551
- try {
552
- content = await readFile(absolutePath, "utf8");
553
- } catch (err) {
554
- throw new ConfigError(`failed to read grading config: ${err instanceof Error ? err.message : String(err)}`, filePath);
555
- }
556
- return parseGradingConfig(content, absolutePath);
557
- }
558
- function parseGradingConfig(yamlContent, sourcePath) {
723
+ /** Parse one case file: single case, array, or `{ cases: [...] }`. */
724
+ function parseCasesFile(yamlContent, sourcePath) {
559
725
  let raw;
560
726
  try {
561
727
  raw = parse(yamlContent);
562
728
  } catch (err) {
563
729
  throw new ConfigError(`YAML parse error: ${err instanceof Error ? err.message : String(err)}`, sourcePath);
564
730
  }
565
- const validated = GradingConfigSchema.safeParse(raw);
566
- if (!validated.success) throw new ConfigError(`validation failed:\n${formatZodError$1(validated.error, sourcePath)}`, sourcePath);
567
- const config = { judge: { ...validated.data.judge } };
568
- if (sourcePath) resolveGradingConfigPaths(config, sourcePath);
569
- return config;
731
+ return transformTestCases(extractRawCases(raw, sourcePath), sourcePath ?? "cases");
570
732
  }
571
- function formatZodError$1(err, sourcePath) {
733
+ function extractRawCases(raw, sourcePath) {
734
+ if (Array.isArray(raw)) return raw.map((item, index) => validateRawCase(item, sourcePath, index));
735
+ if (raw && typeof raw === "object") {
736
+ const obj = raw;
737
+ if (Array.isArray(obj.cases)) return obj.cases.map((item, index) => validateRawCase(item, sourcePath, index));
738
+ if ("id" in obj && "prompt" in obj && "assertions" in obj) return [validateRawCase(raw, sourcePath, 0)];
739
+ }
740
+ throw new ConfigError("expected a case object, array of cases, or { cases: [...] }", sourcePath);
741
+ }
742
+ function validateRawCase(raw, sourcePath, index) {
743
+ const validated = TestCaseSchema.safeParse(raw);
744
+ if (!validated.success) throw new ConfigError(`validation failed:\n${formatZodError$3(validated.error, sourcePath)}`, sourcePath);
745
+ return validated.data;
746
+ }
747
+ /** Recursively collect `.yaml` / `.yml` files under `casesDir`. */
748
+ async function collectCaseYamlFiles(casesDir) {
749
+ const files = [];
750
+ async function walk(dir) {
751
+ let entries;
752
+ try {
753
+ entries = await readdir(dir, { withFileTypes: true });
754
+ } catch (err) {
755
+ if (err instanceof Error && "code" in err && err.code === "ENOENT") return;
756
+ throw err;
757
+ }
758
+ for (const entry of entries) {
759
+ const fullPath = join(dir, entry.name);
760
+ if (entry.isDirectory()) await walk(fullPath);
761
+ else if (entry.isFile() && (entry.name.endsWith(".yaml") || entry.name.endsWith(".yml"))) files.push(fullPath);
762
+ }
763
+ }
764
+ await walk(casesDir);
765
+ return files.sort();
766
+ }
767
+ function formatZodError$3(err, sourcePath) {
572
768
  return err.issues.map((issue) => {
573
769
  const path = issue.path.length > 0 ? issue.path.join(".") : "(root)";
574
770
  return ` ${sourcePath ? `${sourcePath} → ${path}` : path}: ${issue.message}`;
575
771
  }).join("\n");
576
772
  }
577
773
  //#endregion
578
- //#region src/config/loader.ts
774
+ //#region src/config/pipeline-schema.ts
579
775
  /**
580
- * Load a `TestSuite` from a YAML file, directory, or string.
776
+ * Zod schemas for optional `pipeline:` block in suite.yaml.
777
+ *
778
+ * Step presence under `pipeline` enables orchestration via `harness-eval pipeline`.
581
779
  */
582
- async function loadSuite(filePath) {
780
/** Non-empty artifact path that a step may leave out. */
const OptionalArtifactPathSchema = z.string().min(1).optional();
/** Positive integer concurrency limit that a step may leave out. */
const OptionalConcurrencySchema = z.number().int().positive().optional();
/** `pipeline.run` step — harness eval run. */
const PipelineRunStepSchema = z
  .object({
    output: OptionalArtifactPathSchema,
    maxConcurrent: OptionalConcurrencySchema,
  })
  .optional();
/** `pipeline.grade` step — LLM outcome grading. */
const PipelineGradeStepSchema = z
  .object({
    input: OptionalArtifactPathSchema,
    output: OptionalArtifactPathSchema,
    maxConcurrent: OptionalConcurrencySchema,
  })
  .optional();
/** `pipeline.envelope` step — EvalRunEnvelope export. */
const PipelineEnvelopeStepSchema = z
  .object({
    report: OptionalArtifactPathSchema,
    grading: OptionalArtifactPathSchema,
    output: OptionalArtifactPathSchema,
    projection: z.enum(["envelope", "trajectory", "instances"]).optional(),
    includeRawStreamEvents: z.boolean().optional(),
    noTranscript: z.boolean().optional(),
  })
  .optional();
/** Optional top-level `pipeline:` block of suite.yaml; every step may be omitted. */
const PipelineConfigSchema = z
  .object({
    run: PipelineRunStepSchema,
    grade: PipelineGradeStepSchema,
    envelope: PipelineEnvelopeStepSchema,
  })
  .partial();
/** Default artifact filenames, interpreted relative to the suite.yaml directory. */
const DEFAULT_PIPELINE_OUTPUTS = {
  run: "report.json",
  grade: "grading.json",
  envelope: "envelope.json",
};
816
+ //#endregion
817
+ //#region src/config/paths.ts
818
+ /**
819
+ * Resolve relative paths in suite config against the suite file directory.
820
+ *
821
+ * YAML authors write paths relative to the suite file; this module absolutizes
822
+ * them at load time so the runner and adapters receive filesystem-ready values.
823
+ * Tilde-prefixed paths and inline JSON blobs (settings starting with `{`) are
824
+ * left unchanged.
825
+ */
826
/**
 * Anchor `value` at `suiteDir`, leaving absolute and `~/`-prefixed paths as written.
 *
 * @param {string} value Path as written in the config.
 * @param {string} suiteDir Directory that relative paths are joined onto.
 * @returns {string} `value` unchanged, or `suiteDir` joined with `value`.
 */
function resolvePath(value, suiteDir) {
  const keepAsWritten = isAbsolute(value) || value.startsWith("~/");
  return keepAsWritten ? value : join(suiteDir, value);
}
831
/**
 * Return a shallow copy of a Claude Code config block with its path fields
 * resolved against `suiteDir`.
 *
 * Handles `mcpConfig`, string entries of `pluginDirs` / `addDirs`, and the
 * file fields `systemPromptFile`, `appendSystemPromptFile`, `debugFile` and
 * `settings`. File-field values whose trimmed text starts with "{" are left
 * untouched, as are non-string values.
 *
 * @param {object} block Claude Code config block (not mutated).
 * @param {string} suiteDir Base directory for relative paths.
 * @returns {object} Copy of `block` with resolved paths.
 */
function resolveClaudeCodePaths(block, suiteDir) {
  const anchor = (entry) => (typeof entry === "string" ? resolvePath(entry, suiteDir) : entry);
  const startsLikeJson = (text) => text.trim().startsWith("{");
  const out = { ...block };
  if (typeof out.mcpConfig === "string") out.mcpConfig = resolvePath(out.mcpConfig, suiteDir);
  for (const listField of ["pluginDirs", "addDirs"]) {
    const list = out[listField];
    if (Array.isArray(list)) out[listField] = list.map((entry) => anchor(entry));
  }
  for (const fileField of ["systemPromptFile", "appendSystemPromptFile", "debugFile", "settings"]) {
    const current = out[fileField];
    if (typeof current !== "string" || startsLikeJson(current)) continue;
    out[fileField] = resolvePath(current, suiteDir);
  }
  return out;
}
848
/**
 * Return a shallow copy of a Codex config block with its path fields resolved
 * against `suiteDir`: string entries of `addDirs`, plus `outputSchema` and
 * `outputLastMessage` when they are strings.
 *
 * @param {object} block Codex config block (not mutated).
 * @param {string} suiteDir Base directory for relative paths.
 * @returns {object} Copy of `block` with resolved paths.
 */
function resolveCodexPaths(block, suiteDir) {
  const out = { ...block };
  if (Array.isArray(out.addDirs)) {
    out.addDirs = out.addDirs.map((entry) => {
      if (typeof entry !== "string") return entry;
      return resolvePath(entry, suiteDir);
    });
  }
  ["outputSchema", "outputLastMessage"].forEach((fileField) => {
    const current = out[fileField];
    if (typeof current === "string") out[fileField] = resolvePath(current, suiteDir);
  });
  return out;
}
858
/**
 * Resolve the path-bearing fields of one config layer against `suiteDir`.
 *
 * Resolves `cwd` when it is a string and delegates the `claudeCode` / `codex`
 * sub-blocks to their dedicated resolvers when they are non-array objects.
 *
 * @param {object|undefined} config Config layer (not mutated).
 * @param {string} suiteDir Base directory for relative paths.
 * @returns {object|undefined} Resolved shallow copy, or `undefined` when `config` is falsy.
 */
function resolveConfigPaths(config, suiteDir) {
  if (!config) return void 0;
  const isObjectBlock = (candidate) => candidate && typeof candidate === "object" && !Array.isArray(candidate);
  const out = { ...config };
  if (typeof out.cwd === "string") {
    out.cwd = resolvePath(out.cwd, suiteDir);
  }
  if (isObjectBlock(out.claudeCode)) {
    out.claudeCode = resolveClaudeCodePaths(out.claudeCode, suiteDir);
  }
  if (isObjectBlock(out.codex)) {
    out.codex = resolveCodexPaths(out.codex, suiteDir);
  }
  return out;
}
867
/**
 * Resolve, in place, every config layer of a loaded suite against the
 * directory that contains the suite file.
 *
 * Covers `suite.defaultConfig`, each matrix cell's `config` (kept as-is when
 * the resolver returns nothing) and each case's `config`.
 *
 * @param {object} suite Loaded suite; mutated.
 * @param {string} suiteFilePath Path of the suite file.
 * @returns {void}
 */
function resolveSuitePaths(suite, suiteFilePath) {
  const baseDir = configFileDir(suiteFilePath);
  const anchored = (config) => resolveConfigPaths(config, baseDir);
  suite.defaultConfig = anchored(suite.defaultConfig);
  for (const cell of suite.matrix) {
    cell.config = anchored(cell.config) ?? cell.config;
  }
  for (const testCase of suite.cases) {
    testCase.config = anchored(testCase.config);
  }
}
874
/**
 * Parent directory of a suite or grading config file path.
 *
 * Both "/" and "\\" are treated as separators. A path without any separator
 * yields ".". A file that sits directly under the root (e.g. "/suite.yaml")
 * yields that root separator rather than an empty string, so paths later
 * joined onto the result stay anchored at the root instead of the process
 * working directory.
 *
 * @param {string} filePath Path of the config file.
 * @returns {string} Directory portion of `filePath`.
 */
function configFileDir(filePath) {
  if (!filePath.includes("/") && !filePath.includes("\\")) return ".";
  const parent = filePath.replace(/[/\\][^/\\]+$/, "");
  // An empty result means the only separator was the leading root separator.
  return parent === "" ? filePath[0] : parent;
}
878
/**
 * Resolve env var values that look like relative file paths.
 *
 * Only values beginning with "./" or "../" are resolved against `baseDir`;
 * every other value is copied through unchanged.
 *
 * @param {Record<string, string>} env Environment variable map (not mutated).
 * @param {string} baseDir Base directory for relative values.
 * @returns {Record<string, string>} New map with relative-looking values resolved.
 */
function resolveEnvPaths(env, baseDir) {
  const looksRelative = (text) => text.startsWith("./") || text.startsWith("../");
  const out = {};
  for (const name of Object.keys(env)) {
    const text = env[name];
    out[name] = looksRelative(text) ? resolvePath(text, baseDir) : text;
  }
  return out;
}
890
/**
 * Resolve relative paths inside a grading config, replacing `config.judge`
 * with a new object.
 *
 * `adapter` and `maxConcurrent` are set aside, the remaining judge fields are
 * resolved as a regular config layer, and a truthy `env` map is then passed
 * through the env-path heuristic.
 *
 * @param {{ judge: object }} config Grading config; its `judge` property is reassigned.
 * @param {string} configFilePath Path of the file the config came from.
 * @returns {void}
 */
function resolveGradingConfigPaths(config, configFilePath) {
  const baseDir = configFileDir(configFilePath);
  const { adapter, maxConcurrent, ...harnessFields } = config.judge;
  const resolvedFields = resolveConfigPaths(harnessFields, baseDir) ?? harnessFields;
  const judge = { ...resolvedFields, adapter, maxConcurrent };
  if (judge.env) judge.env = resolveEnvPaths(judge.env, baseDir);
  config.judge = judge;
}
901
/**
 * Resolve a pipeline artifact path against the suite.yaml directory, using
 * `defaultRelative` when `value` is null or undefined.
 *
 * @param {string|null|undefined} value Path configured for the artifact.
 * @param {string} defaultRelative Fallback relative path.
 * @param {string} suiteDir Directory containing suite.yaml.
 * @returns {string} Resolved artifact path.
 */
function resolvePipelinePath(value, defaultRelative, suiteDir) {
  const chosen = value ?? defaultRelative;
  return resolvePath(chosen, suiteDir);
}
905
/**
 * Resolve the artifact paths of each step present in a parsed pipeline config.
 *
 * @param {{ run?: object, grade?: object, envelope?: object }} pipeline Parsed pipeline config.
 * @param {string} suiteFilePath Path of the suite file the pipeline came from.
 * @returns {object} New pipeline object holding only the steps that were truthy on input.
 */
function resolvePipelineConfigPaths(pipeline, suiteFilePath) {
  const suiteDir = configFileDir(suiteFilePath);
  const { run, grade, envelope } = pipeline;
  const out = {};
  if (run) {
    out.run = resolvePipelineRunStep(run, suiteDir);
  }
  if (grade) {
    out.grade = resolvePipelineGradeStep(grade, suiteDir);
  }
  if (envelope) {
    out.envelope = resolvePipelineEnvelopeStep(envelope, suiteDir);
  }
  return out;
}
914
/**
 * Resolve the `output` path of the pipeline `run` step, falling back to the
 * default run artifact name.
 *
 * @param {object} step Run step settings (not mutated).
 * @param {string} suiteDir Directory containing suite.yaml.
 * @returns {object} Copy of `step` with a resolved `output`.
 */
function resolvePipelineRunStep(step, suiteDir) {
  const output = resolvePipelinePath(step.output, DEFAULT_PIPELINE_OUTPUTS.run, suiteDir);
  return { ...step, output };
}
921
/**
 * Resolve the paths of the pipeline `grade` step.
 *
 * `input` is resolved only when truthy (otherwise it becomes `undefined`);
 * `output` falls back to the default grading artifact name.
 *
 * @param {object} step Grade step settings (not mutated).
 * @param {string} suiteDir Directory containing suite.yaml.
 * @returns {object} Copy of `step` with resolved `input` and `output`.
 */
function resolvePipelineGradeStep(step, suiteDir) {
  let input;
  if (step.input) {
    input = resolvePipelinePath(step.input, DEFAULT_PIPELINE_OUTPUTS.run, suiteDir);
  }
  const output = resolvePipelinePath(step.output, DEFAULT_PIPELINE_OUTPUTS.grade, suiteDir);
  return { ...step, input, output };
}
929
/**
 * Resolve the paths of the pipeline `envelope` step.
 *
 * `report` and `grading` are resolved only when truthy (otherwise they become
 * `undefined`); `output` falls back to the default envelope artifact name.
 *
 * @param {object} step Envelope step settings (not mutated).
 * @param {string} suiteDir Directory containing suite.yaml.
 * @returns {object} Copy of `step` with resolved `report`, `grading` and `output`.
 */
function resolvePipelineEnvelopeStep(step, suiteDir) {
  const whenSet = (value, fallback) => (value ? resolvePipelinePath(value, fallback, suiteDir) : void 0);
  const report = whenSet(step.report, DEFAULT_PIPELINE_OUTPUTS.run);
  const grading = whenSet(step.grading, DEFAULT_PIPELINE_OUTPUTS.grade);
  const output = resolvePipelinePath(step.output, DEFAULT_PIPELINE_OUTPUTS.envelope, suiteDir);
  return { ...step, report, grading, output };
}
938
+ //#endregion
939
+ //#region src/config/grading-schema.ts
940
+ /**
941
+ * Zod schema for standalone grading YAML (`grading.yaml`).
942
+ *
943
+ * The top-level `judge` block reuses {@link ConfigPartialSchema} fields plus
944
+ * grader-specific concurrency and system-instruction overrides.
945
+ */
946
/** Grader-only fields layered on top of the shared harness config fields. */
const judgeOnlyFields = {
  adapter: z.string().optional(),
  maxConcurrent: z.number().int().positive().optional(),
  /** Optional judge prompt prefix (maps to upstream system_instruction). */
  system_instruction: z.string().optional(),
};
/** Top-level `judge` block: `ConfigPartialSchema` fields plus the grader-only fields. */
const JudgeConfigSchema = ConfigPartialSchema.extend(judgeOnlyFields);
/** Root of a standalone grading file: a single required `judge` block. */
const GradingConfigSchema = z.object({ judge: JudgeConfigSchema });
954
+ //#endregion
955
+ //#region src/config/suite-file-schema.ts
956
/** Optional orchestration blocks accepted at the root of either suite layout. */
const orchestrationBlocks = {
  judge: JudgeConfigSchema.optional(),
  pipeline: PipelineConfigSchema.optional(),
};
/** Single-file suite root, extended with the optional `judge` and `pipeline` blocks. */
const SuiteFileSingleSchema = TestSuiteSchema.extend(orchestrationBlocks);
/** Directory-layout suite root, extended with the optional `judge` and `pipeline` blocks. */
const SuiteFileDirectorySchema = SuiteDirectorySchema.extend(orchestrationBlocks);
966
+ //#endregion
967
+ //#region src/config/suite-document-loader.ts
968
+ /**
969
+ * Load a unified suite.yaml document (suite + optional judge + pipeline).
970
+ */
971
+ /** Load suite.yaml (or directory) including optional judge and pipeline blocks. */
972
+ async function loadSuiteDocument(filePath, options = {}) {
583
973
  const absolutePath = resolve(filePath);
584
974
  let info;
585
975
  try {
@@ -587,19 +977,12 @@ async function loadSuite(filePath) {
587
977
  } catch (err) {
588
978
  throw new ConfigError(`failed to read suite path: ${err instanceof Error ? err.message : String(err)}`, filePath);
589
979
  }
590
- if (info.isDirectory()) return loadSuiteDirectory(absolutePath);
591
- return loadSuiteFile(absolutePath);
592
- }
593
- async function loadSuiteFile(absolutePath) {
594
- let content;
595
- try {
596
- content = await readFile(absolutePath, "utf8");
597
- } catch (err) {
598
- throw new ConfigError(`failed to read suite file: ${err instanceof Error ? err.message : String(err)}`, absolutePath);
599
- }
600
- return parseSuite(content, absolutePath);
980
+ const strict = options.validateOrchestration !== false;
981
+ if (info.isDirectory()) return loadSuiteDocumentDirectory(absolutePath, strict);
982
+ return loadSuiteDocumentFile(absolutePath, strict);
601
983
  }
602
- async function loadSuiteDirectory(dir) {
984
+ /** Load suite.yaml from a directory layout (cases under `cases/`). */
985
+ async function loadSuiteDocumentDirectory(dir, strict) {
603
986
  const suiteYamlPath = join(dir, "suite.yaml");
604
987
  let content;
605
988
  try {
@@ -607,7 +990,7 @@ async function loadSuiteDirectory(dir) {
607
990
  } catch (err) {
608
991
  throw new ConfigError(`missing suite.yaml in suite directory: ${err instanceof Error ? err.message : String(err)}`, dir);
609
992
  }
610
- const base = parseSuiteDirectory(content, suiteYamlPath);
993
+ const { judge, pipeline, suite: base } = parseSuiteFileRoot(content, suiteYamlPath, "directory", strict);
611
994
  const casesDir = join(dir, "cases");
612
995
  const caseFiles = await collectCaseYamlFiles(casesDir);
613
996
  const tagged = base.cases.map((testCase, index) => ({
@@ -636,74 +1019,241 @@ async function loadSuiteDirectory(dir) {
636
1019
  cases
637
1020
  };
638
1021
  resolveSuitePaths(suite, suiteYamlPath);
639
- return suite;
1022
+ return buildSuiteDocument(suiteYamlPath, suite, judge, pipeline);
640
1023
  }
641
- function parseSuite(yamlContent, sourcePath) {
1024
/**
 * Load a single suite.yaml file (inline cases) into a suite document.
 *
 * @param {string} absolutePath Path of the suite file.
 * @param {boolean} strict Whether the `judge:` / `pipeline:` blocks are validated.
 * @returns {Promise<object>} Suite document built from the parsed file.
 * @throws {ConfigError} When the file cannot be read, or parsing/validation fails.
 */
async function loadSuiteDocumentFile(absolutePath, strict) {
  let yamlText;
  try {
    yamlText = await readFile(absolutePath, "utf8");
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new ConfigError(`failed to read suite file: ${reason}`, absolutePath);
  }
  const parts = parseSuiteFileRoot(yamlText, absolutePath, "single", strict);
  resolveSuitePaths(parts.suite, absolutePath);
  return buildSuiteDocument(absolutePath, parts.suite, parts.judge, parts.pipeline);
}
1036
/**
 * Parse the suite.yaml root and validate it against the schema matching the
 * requested layout and strictness.
 *
 * With `strict` set, the extended schemas are used so `judge:` and
 * `pipeline:` blocks are validated and returned alongside the suite. Without
 * it, the base schemas are used and only `{ suite }` is returned.
 *
 * @param {string} yamlContent Raw YAML text.
 * @param {string} sourcePath Path used in error messages and path resolution.
 * @param {"directory"|"single"} layout Suite layout.
 * @param {boolean} strict Whether orchestration blocks are validated.
 * @returns {{ suite: object, judge?: object, pipeline?: object }} Parsed parts.
 * @throws {ConfigError} On YAML syntax errors or schema validation failure.
 */
function parseSuiteFileRoot(yamlContent, sourcePath, layout, strict) {
  let document;
  try {
    document = parse(yamlContent);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new ConfigError(`YAML parse error: ${reason}`, sourcePath);
  }
  const isDirectoryLayout = layout === "directory";
  const transform = isDirectoryLayout ? transformSuiteDirectory : transformSuite;
  let schema;
  if (strict) {
    schema = isDirectoryLayout ? SuiteFileDirectorySchema : SuiteFileSingleSchema;
  } else {
    schema = isDirectoryLayout ? SuiteDirectorySchema : TestSuiteSchema;
  }
  const result = schema.safeParse(document);
  if (!result.success) {
    throw new ConfigError(`validation failed:\n${formatZodError$2(result.error, sourcePath)}`, sourcePath);
  }
  if (!strict) return { suite: transform(result.data) };
  return extractSuiteFileParts(result.data, sourcePath, transform);
}
654
- function parseSuiteDirectory(yamlContent, sourcePath) {
1064
/**
 * Split validated suite-file data into suite, judge, and pipeline parts,
 * resolving the judge and pipeline paths against `sourcePath`.
 *
 * @param {object} data Validated root object (suite fields plus optional `judge` / `pipeline`).
 * @param {string} sourcePath Path of the suite file, used as the base for path resolution.
 * @param {(suiteRaw: object) => object} transform Converts the raw suite fields into the runtime suite.
 * @returns {{ suite: object, judge: object|undefined, pipeline: object|undefined }} Split parts.
 */
function extractSuiteFileParts(data, sourcePath, transform) {
  const { judge: rawJudge, pipeline: rawPipeline, ...suiteRaw } = data;
  const suite = transform(suiteRaw);
  let judge;
  if (rawJudge) {
    // resolveGradingConfigPaths REPLACES `config.judge` with a new object, so
    // the resolved judge has to be read back from the wrapper. Keeping a
    // reference to the copy passed in would return unresolved paths.
    const gradingConfig = { judge: { ...rawJudge } };
    resolveGradingConfigPaths(gradingConfig, sourcePath);
    judge = gradingConfig.judge;
  }
  let pipeline;
  if (rawPipeline) {
    // Fill in default artifact names first, then absolutize every path.
    pipeline = resolvePipelineConfigPaths(transformPipelineConfig(rawPipeline), sourcePath);
  }
  return {
    suite,
    judge,
    pipeline,
  };
}
1084
/**
 * Build the runtime pipeline config, filling in default artifact filenames
 * for every step whose key is present (not `undefined`) in `raw`.
 *
 * The envelope step's `projection` additionally defaults to "envelope".
 *
 * @param {{ run?: object, grade?: object, envelope?: object }} raw Validated `pipeline:` block.
 * @returns {object} Pipeline config containing only the steps present in `raw`.
 */
function transformPipelineConfig(raw) {
  const { run, grade, envelope } = raw;
  const out = {};
  if (run !== undefined) {
    out.run = {
      output: run?.output ?? DEFAULT_PIPELINE_OUTPUTS.run,
      maxConcurrent: run?.maxConcurrent,
    };
  }
  if (grade !== undefined) {
    out.grade = {
      input: grade?.input,
      output: grade?.output ?? DEFAULT_PIPELINE_OUTPUTS.grade,
      maxConcurrent: grade?.maxConcurrent,
    };
  }
  if (envelope !== undefined) {
    out.envelope = {
      report: envelope?.report,
      grading: envelope?.grading,
      output: envelope?.output ?? DEFAULT_PIPELINE_OUTPUTS.envelope,
      projection: envelope?.projection ?? "envelope",
      includeRawStreamEvents: envelope?.includeRawStreamEvents,
      noTranscript: envelope?.noTranscript,
    };
  }
  return out;
}
1106
/**
 * Assemble the runtime suite document from its parsed parts.
 *
 * @param {string} suitePath Path of the suite file; stored after `resolve()`.
 * @param {object} suite Parsed suite.
 * @param {object|undefined} judge Parsed judge block, if any.
 * @param {object|undefined} pipeline Parsed pipeline block, if any.
 * @returns {{ suitePath: string, suite: object, judge: object|undefined, pipeline: object|undefined }}
 */
function buildSuiteDocument(suitePath, suite, judge, pipeline) {
  const absoluteSuitePath = resolve(suitePath);
  return { suitePath: absoluteSuitePath, suite, judge, pipeline };
}
1115
/**
 * Format validation issues as newline-separated lines.
 *
 * Each line shows the issue path (dot-joined, or "(root)" when empty),
 * prefixed with `sourcePath` when provided, followed by the issue message.
 *
 * @param {{ issues: Array<{ path: Array<string|number>, message: string }> }} err Validation error.
 * @param {string} [sourcePath] File the issues belong to.
 * @returns {string} Formatted issue lines.
 */
function formatZodError$2(err, sourcePath) {
  const describe = ({ path: segments, message }) => {
    const where = segments.length > 0 ? segments.join(".") : "(root)";
    const prefix = sourcePath ? `${sourcePath} → ${where}` : where;
    return ` ${prefix}: ${message}`;
  };
  return err.issues.map((issue) => describe(issue)).join("\n");
}
1121
+ //#endregion
1122
+ //#region src/config/grading-loader.ts
1123
+ /**
1124
+ * Load standalone grading YAML for `harness-eval grade`.
1125
+ *
1126
+ * Also accepts unified suite.yaml files with an inline `judge:` block.
1127
+ */
1128
/**
 * Load grading configuration from a path on disk.
 *
 * A directory path is read through the `suite.yaml` inside it. A file path is
 * read as UTF-8 and handed to `parseGradingConfig`, which parses the YAML
 * once, reports syntax errors as `ConfigError`, and routes unified suite
 * files to their inline `judge:` block.
 *
 * @param {string} filePath Grading YAML file, suite file, or suite directory.
 * @returns {Promise<{ judge: object }>} Grading config with relative paths resolved.
 * @throws {ConfigError} When the path cannot be stat'ed or read, or the content is invalid.
 */
async function loadGradingConfig(filePath) {
  const absolutePath = resolve(filePath);
  let info;
  try {
    info = await stat(absolutePath);
  } catch (err) {
    throw new ConfigError(`failed to read grading config: ${err instanceof Error ? err.message : String(err)}`, filePath);
  }
  if (info.isDirectory()) return loadGradingFromSuiteYaml(join(absolutePath, "suite.yaml"));
  let content;
  try {
    content = await readFile(absolutePath, "utf8");
  } catch (err) {
    throw new ConfigError(`failed to read grading config: ${err instanceof Error ? err.message : String(err)}`, filePath);
  }
  // Parsing here would repeat the work and let YAML syntax errors escape
  // unwrapped; parseGradingConfig already does the suite-root detection.
  return parseGradingConfig(content, absolutePath);
}
1147
/**
 * Parse grading YAML from a string.
 *
 * Content recognised as a unified suite root is routed to its inline
 * `judge:` block; anything else must match the standalone grading schema.
 * Relative paths are resolved only when `sourcePath` is given.
 *
 * @param {string} yamlContent Raw YAML text.
 * @param {string} [sourcePath] Optional path for error messages and path resolution.
 * @returns {{ judge: object }} Grading config.
 * @throws {ConfigError} On YAML syntax errors or schema validation failure.
 */
function parseGradingConfig(yamlContent, sourcePath) {
  let document;
  try {
    document = parse(yamlContent);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new ConfigError(`YAML parse error: ${reason}`, sourcePath);
  }
  if (isSuiteRoot(document)) {
    return parseGradingFromSuiteRaw(document, sourcePath ?? "suite.yaml");
  }
  const result = GradingConfigSchema.safeParse(document);
  if (!result.success) {
    throw new ConfigError(`validation failed:\n${formatZodError$1(result.error, sourcePath)}`, sourcePath);
  }
  const gradingConfig = { judge: { ...result.data.judge } };
  if (sourcePath) resolveGradingConfigPaths(gradingConfig, sourcePath);
  return gradingConfig;
}
665
- /** Parse one case file: single case, array, or `{ cases: [...] }`. */
666
- function parseCasesFile(yamlContent, sourcePath) {
1166
/**
 * Tell a unified suite.yaml root apart from standalone grading YAML.
 *
 * A non-null object counts as a suite root when it has a `cases` key, or has
 * both `matrix` and `adapter` keys.
 *
 * @param {unknown} raw Parsed YAML root.
 * @returns {boolean} Whether `raw` looks like a suite root.
 */
function isSuiteRoot(raw) {
  const isObject = raw !== null && typeof raw === "object";
  if (!isObject) return false;
  if ("cases" in raw) return true;
  return "matrix" in raw && "adapter" in raw;
}
1171
/**
 * Read a suite.yaml file and extract its grading config from the inline
 * `judge:` block.
 *
 * @param {string} suiteYamlPath Path of the suite.yaml file.
 * @returns {Promise<{ judge: object }>} Grading config taken from the suite file.
 * @throws {ConfigError} When the file cannot be read, is not valid YAML, or fails validation.
 */
async function loadGradingFromSuiteYaml(suiteYamlPath) {
  const describe = (err) => (err instanceof Error ? err.message : String(err));
  let yamlText;
  try {
    yamlText = await readFile(suiteYamlPath, "utf8");
  } catch (err) {
    throw new ConfigError(`failed to read suite file: ${describe(err)}`, suiteYamlPath);
  }
  let document;
  try {
    document = parse(yamlText);
  } catch (err) {
    throw new ConfigError(`YAML parse error: ${describe(err)}`, suiteYamlPath);
  }
  return parseGradingFromSuiteRaw(document, suiteYamlPath);
}
675
- function extractRawCases(raw, sourcePath) {
676
- if (Array.isArray(raw)) return raw.map((item, index) => validateRawCase(item, sourcePath, index));
677
- if (raw && typeof raw === "object") {
678
- const obj = raw;
679
- if (Array.isArray(obj.cases)) return obj.cases.map((item, index) => validateRawCase(item, sourcePath, index));
680
- if ("id" in obj && "prompt" in obj && "assertions" in obj) return [validateRawCase(raw, sourcePath, 0)];
1186
/**
 * Extract the grading config from an already-parsed suite root.
 *
 * The single-file suite schema is tried first, then the directory-layout
 * schema. The first schema that validates decides the outcome: its `judge`
 * block is copied and path-resolved, or an error is thrown when it has none.
 *
 * @param {unknown} raw Parsed YAML root of a suite file.
 * @param {string} sourcePath Path used for error messages and path resolution.
 * @returns {{ judge: object }} Grading config.
 * @throws {ConfigError} When neither schema validates or no `judge` block exists.
 */
function parseGradingFromSuiteRaw(raw, sourcePath) {
  let reportedError;
  for (const schema of [SuiteFileSingleSchema, SuiteFileDirectorySchema]) {
    const result = schema.safeParse(raw);
    if (!result.success) {
      // The later (directory) error wins when both schemas reject the input.
      reportedError = result.error ?? reportedError;
      continue;
    }
    if (!result.data.judge) throw new ConfigError("suite file has no judge block", sourcePath);
    const gradingConfig = { judge: { ...result.data.judge } };
    resolveGradingConfigPaths(gradingConfig, sourcePath);
    return gradingConfig;
  }
  throw new ConfigError(`validation failed:\n${formatZodError$1(reportedError, sourcePath)}`, sourcePath);
}
684
- function validateRawCase(raw, sourcePath, index) {
685
- const validated = TestCaseSchema.safeParse(raw);
1203
/**
 * Format a zod validation error, one issue per line, with an optional source
 * file prefix in front of each issue path.
 *
 * @param {{ issues: Array<{ path: Array<string|number>, message: string }> }} err Validation error.
 * @param {string} [sourcePath] File the issues belong to.
 * @returns {string} Newline-joined issue lines; an empty path is shown as "(root)".
 */
function formatZodError$1(err, sourcePath) {
  return err.issues
    .map(({ path: segments, message }) => {
      const dotted = segments.length > 0 ? segments.join(".") : "(root)";
      const heading = sourcePath ? `${sourcePath} → ${dotted}` : dotted;
      return ` ${heading}: ${message}`;
    })
    .join("\n");
}
1210
+ //#endregion
1211
+ //#region src/config/loader.ts
1212
+ /**
1213
+ * Load a `TestSuite` from a YAML file, directory, or string.
1214
+ *
1215
+ * For unified suite.yaml with optional `judge:` and `pipeline:` blocks,
1216
+ * use {@link loadSuiteDocument}.
1217
+ */
1218
/**
 * Load only the suite portion from a suite file or suite directory.
 *
 * The document is loaded with orchestration validation turned off, so
 * `judge:` / `pipeline:` blocks are not validated here. Use
 * {@link loadSuiteDocument} when validated orchestration metadata is needed.
 *
 * @param {string} filePath Suite file or suite directory.
 * @returns {Promise<object>} The loaded suite.
 */
async function loadSuite(filePath) {
  const { suite } = await loadSuiteDocument(filePath, { validateOrchestration: false });
  return suite;
}
1228
/**
 * Parse suite YAML from a string (single-file layout with inline cases).
 *
 * Validation uses the base suite schema, so top-level keys such as `judge`
 * and `pipeline` are not part of the result. Relative paths are resolved
 * only when `sourcePath` is given.
 *
 * @param {string} yamlContent Raw YAML text.
 * @param {string} [sourcePath] Path used for error messages and path resolution.
 * @returns {object} The parsed suite.
 * @throws {ConfigError} On YAML syntax errors or schema validation failure.
 */
function parseSuite(yamlContent, sourcePath) {
  let document;
  try {
    document = parse(yamlContent);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new ConfigError(`YAML parse error: ${reason}`, sourcePath);
  }
  const result = TestSuiteSchema.safeParse(document);
  if (!result.success) {
    throw new ConfigError(`validation failed:\n${formatZodError(result.error, sourcePath)}`, sourcePath);
  }
  const suite = transformSuite(result.data);
  if (sourcePath) {
    resolveSuitePaths(suite, resolve(sourcePath));
  }
  return suite;
}
689
- async function collectCaseYamlFiles(casesDir) {
690
- const files = [];
691
- async function walk(dir) {
692
- let entries;
693
- try {
694
- entries = await readdir(dir, { withFileTypes: true });
695
- } catch (err) {
696
- if (err instanceof Error && "code" in err && err.code === "ENOENT") return;
697
- throw err;
698
- }
699
- for (const entry of entries) {
700
- const fullPath = join(dir, entry.name);
701
- if (entry.isDirectory()) await walk(fullPath);
702
- else if (entry.isFile() && (entry.name.endsWith(".yaml") || entry.name.endsWith(".yml"))) files.push(fullPath);
703
- }
1246
/**
 * Parse the `suite.yaml` of a directory-layout suite (cases may be omitted).
 *
 * @internal
 * @param {string} yamlContent Raw YAML text.
 * @param {string} [sourcePath] Path used in error messages.
 * @returns {object} The transformed directory-suite root.
 * @throws {ConfigError} On YAML syntax errors or schema validation failure.
 */
function parseSuiteDirectory(yamlContent, sourcePath) {
  let document;
  try {
    document = parse(yamlContent);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new ConfigError(`YAML parse error: ${reason}`, sourcePath);
  }
  const result = SuiteDirectorySchema.safeParse(document);
  if (result.success) return transformSuiteDirectory(result.data);
  throw new ConfigError(`validation failed:\n${formatZodError(result.error, sourcePath)}`, sourcePath);
}
708
1258
  function formatZodError(err, sourcePath) {
709
1259
  return err.issues.map((issue) => {
@@ -712,6 +1262,6 @@ function formatZodError(err, sourcePath) {
712
1262
  }).join("\n");
713
1263
  }
714
1264
  //#endregion
715
- export { parseGradingConfig as a, loadGradingConfig as i, parseCasesFile as n, ConfigError as o, parseSuite as r, loadSuite as t };
1265
+ export { parseGradingConfig as a, parseCasesFile as c, loadGradingConfig as i, ConfigError as l, parseSuite as n, loadSuiteDocument as o, parseSuiteDirectory as r, DEFAULT_PIPELINE_OUTPUTS as s, loadSuite as t };
716
1266
 
717
- //# sourceMappingURL=loader-BCnFJ8rm.js.map
1267
+ //# sourceMappingURL=loader-DnQ6Jt0i.js.map