@alis-build/harness-eval 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/README.md +17 -4
  2. package/dist/adapters/claude-code/index.d.ts +1 -1
  3. package/dist/adapters/claude-code/index.js +1 -1
  4. package/dist/{claude-code-ycT0JQZF.js → claude-code-DZ4Vkgp6.js} +35 -6
  5. package/dist/{claude-code-ycT0JQZF.js.map → claude-code-DZ4Vkgp6.js.map} +1 -1
  6. package/dist/cli/bin.js +109 -12
  7. package/dist/cli/bin.js.map +1 -1
  8. package/dist/config/loader.d.ts +1 -1
  9. package/dist/config/loader.js +1 -1
  10. package/dist/{index-6Z17eKZx.d.ts → index-V22PrR0p.d.ts} +2 -1
  11. package/dist/index.d.ts +270 -152
  12. package/dist/index.js +124 -5
  13. package/dist/index.js.map +1 -0
  14. package/dist/{loader-DTvoVfN0.d.ts → loader-C9yQHUPC.d.ts} +19 -2
  15. package/dist/{loader-BCnFJ8rm.js → loader-DcI0KfRX.js} +291 -4
  16. package/dist/loader-DcI0KfRX.js.map +1 -0
  17. package/dist/{build-DsVJ_UeU.js → projections-BcX7w-f6.js} +486 -243
  18. package/dist/projections-BcX7w-f6.js.map +1 -0
  19. package/dist/runner/suite.d.ts +1 -1
  20. package/dist/runner/suite.js +1 -1
  21. package/dist/{suite-BoOvK_lq.d.ts → suite-DPJMIEbu.d.ts} +7 -2
  22. package/dist/{suite-chj0j22j.js → suite-Dlzl-HI0.js} +58 -4
  23. package/dist/suite-Dlzl-HI0.js.map +1 -0
  24. package/dist/{types-BQol062t.d.ts → types-CD3TwOtZ.d.ts} +151 -10
  25. package/package.json +4 -2
  26. package/schemas/eval-interchange-instances.schema.json +196 -0
  27. package/schemas/eval-interchange.schema.json +65 -52
  28. package/schemas/eval-run-envelope.schema.json +182 -425
  29. package/dist/build-DsVJ_UeU.js.map +0 -1
  30. package/dist/loader-BCnFJ8rm.js.map +0 -1
  31. package/dist/suite-chj0j22j.js.map +0 -1
  32. package/schemas/eval-interchange-agent-trace.schema.json +0 -322
  33. package/schemas/eval-interchange-proto-instance.schema.json +0 -106
@@ -6,11 +6,18 @@ import { z } from "zod";
6
6
  //#region src/config/paths.ts
7
7
  /**
8
8
  * Resolve relative paths in suite config against the suite file directory.
9
+ *
10
+ * YAML authors write paths relative to the suite file; this module absolutizes
11
+ * them at load time so the runner and adapters receive filesystem-ready values.
12
+ * Tilde-prefixed paths and inline JSON blobs (settings starting with `{`) are
13
+ * left unchanged.
9
14
  */
15
+ /** Resolve a single path relative to `suiteDir` unless already absolute or `~/`. */
10
16
  function resolvePath(value, suiteDir) {
11
17
  if (isAbsolute(value) || value.startsWith("~/")) return value;
12
18
  return join(suiteDir, value);
13
19
  }
20
+ /** Resolve Claude Code-specific path fields within a config block. */
14
21
  function resolveClaudeCodePaths(block, suiteDir) {
15
22
  const resolved = { ...block };
16
23
  if (typeof resolved.mcpConfig === "string") resolved.mcpConfig = resolvePath(resolved.mcpConfig, suiteDir);
@@ -42,9 +49,16 @@ function resolveSuitePaths(suite, suiteFilePath) {
42
49
  for (const cell of suite.matrix) cell.config = resolveConfigPaths(cell.config, suiteDir) ?? cell.config;
43
50
  for (const testCase of suite.cases) testCase.config = resolveConfigPaths(testCase.config, suiteDir);
44
51
  }
52
+ /** Parent directory of a suite or grading config file path. */
45
53
  function configFileDir(filePath) {
46
54
  return filePath.includes("/") || filePath.includes("\\") ? filePath.replace(/[/\\][^/\\]+$/, "") : ".";
47
55
  }
56
+ /**
57
+ * Heuristically resolve env var values that look like relative file paths.
58
+ *
59
+ * Used for grading config where credential or config paths may be expressed
60
+ * relative to the grading YAML location.
61
+ */
48
62
  function resolveEnvPaths(env, baseDir) {
49
63
  const resolved = {};
50
64
  for (const [key, value] of Object.entries(env)) if (value.startsWith("./") || value.startsWith("../") || value.includes("/") && !value.startsWith("http")) resolved[key] = resolvePath(value, baseDir);
@@ -68,7 +82,8 @@ function resolveGradingConfigPaths(config, configFilePath) {
68
82
  * zod schemas for the YAML on-disk shape.
69
83
  *
70
84
  * Config uses a nested layout: generic harness fields at the top level,
71
- * adapter-specific options under a named key (e.g. `claudeCode`).
85
+ * adapter-specific options under a named key (e.g. `claudeCode`). Validated
86
+ * raw shapes are transformed into runtime types by `src/config/transform.ts`.
72
87
  */
73
88
  /** Claude Code adapter-specific options (nested under `claudeCode`). */
74
89
  const ClaudeCodeConfigSchema = z.object({
@@ -136,6 +151,11 @@ const ReferenceToolCallSchema = z.object({
136
151
  tool_name: z.string().min(1),
137
152
  tool_input: z.unknown()
138
153
  });
154
+ /** Reference trajectory in suite YAML — array of steps or object with mode + steps. */
155
+ const ReferenceTrajectorySchema = z.union([z.array(ReferenceToolCallSchema), z.object({
156
+ tool_name_mode: z.enum(["harness", "bare"]).optional(),
157
+ steps: z.array(ReferenceToolCallSchema).min(1)
158
+ })]);
139
159
  /** A test case. */
140
160
  const TestCaseSchema = z.object({
141
161
  id: z.string().min(1),
@@ -143,7 +163,7 @@ const TestCaseSchema = z.object({
143
163
  category: z.string().optional(),
144
164
  notes: z.string().optional(),
145
165
  expectations: z.array(z.string().min(1)).optional(),
146
- reference_trajectory: z.array(ReferenceToolCallSchema).optional(),
166
+ reference_trajectory: ReferenceTrajectorySchema.optional(),
147
167
  human_ratings: z.record(z.string(), z.number()).optional(),
148
168
  assertions: z.array(z.unknown()).min(1),
149
169
  repetitions: z.number().int().positive().optional(),
@@ -192,6 +212,7 @@ function transformSuiteDirectory(raw) {
192
212
  function transformTestCases(raw, pathPrefix) {
193
213
  return raw.map((c, i) => transformTestCase(c, `${pathPrefix}[${i}]`));
194
214
  }
215
+ /** Merge suite-level parts shared by single-file and directory transforms. */
195
216
  function transformSuiteParts(raw) {
196
217
  return {
197
218
  adapter: raw.adapter,
@@ -200,6 +221,21 @@ function transformSuiteParts(raw) {
200
221
  cases: raw.cases.map((c, i) => transformTestCase(c, `cases[${i}]`))
201
222
  };
202
223
  }
224
+ /**
225
+ * Normalize reference trajectory YAML into {@link ReferenceTrajectoryConfig}.
226
+ *
227
+ * Accepts a bare step array or `{ tool_name_mode?, steps }` object form.
228
+ */
229
+ function normalizeReferenceTrajectory(raw, path) {
230
+ if (raw === void 0) return void 0;
231
+ if (Array.isArray(raw)) return { steps: raw };
232
+ if (!isPlainObject(raw) || !Array.isArray(raw.steps)) throw new ConfigError("reference_trajectory must be an array of tool calls or { tool_name_mode?, steps: [...] }", path);
233
+ return {
234
+ tool_name_mode: raw.tool_name_mode,
235
+ steps: raw.steps
236
+ };
237
+ }
238
+ /** Map raw matrix cell YAML to runtime {@link MatrixCell}. */
203
239
  function transformMatrixCell(raw) {
204
240
  return {
205
241
  label: raw.label,
@@ -207,6 +243,7 @@ function transformMatrixCell(raw) {
207
243
  axes: raw.axes
208
244
  };
209
245
  }
246
+ /** Map one raw test case to runtime {@link TestCase}, transforming assertions. */
210
247
  function transformTestCase(raw, path) {
211
248
  return {
212
249
  id: raw.id,
@@ -214,7 +251,7 @@ function transformTestCase(raw, path) {
214
251
  category: raw.category,
215
252
  notes: raw.notes,
216
253
  expectations: raw.expectations,
217
- reference_trajectory: raw.reference_trajectory,
254
+ reference_trajectory: normalizeReferenceTrajectory(raw.reference_trajectory, `${path}.reference_trajectory`),
218
255
  human_ratings: raw.human_ratings,
219
256
  repetitions: raw.repetitions,
220
257
  config: raw.config,
@@ -223,6 +260,17 @@ function transformTestCase(raw, path) {
223
260
  }
224
261
  /** Keys that may appear alongside an assertion-type key. Not assertion types themselves. */
225
262
  const SIBLING_KEYS = /* @__PURE__ */ new Set(["threshold"]);
263
+ /**
264
+ * Parse optional `threshold` sibling and delegate the assertion body to
265
+ * {@link transformAssertion}.
266
+ *
267
+ * @throws {ConfigError} When the wrapper is not an object, threshold is out of
268
+ * `[0, 1]`, or the nested assertion fails validation.
269
+ *
270
+ * @example
271
+ * transformThresholdedAssertion({ called: "Read", threshold: 0.9 }, "path")
272
+ * // → { assertion: { type: "called", tool: "Read" }, threshold: 0.9 }
273
+ */
226
274
  function transformThresholdedAssertion(raw, path) {
227
275
  if (!isPlainObject(raw)) throw new ConfigError(`expected object, got ${typeOf(raw)}`, path);
228
276
  const threshold = raw.threshold;
@@ -240,6 +288,19 @@ function transformThresholdedAssertion(raw, path) {
240
288
  * Finds the single non-sibling key, dispatches to the per-type transformer.
241
289
  * Per-type transformers handle both verbose-object and shortcut-scalar input
242
290
  * shapes where applicable.
291
+ *
292
+ * @param raw - Single assertion object from parsed YAML (may include `threshold` sibling).
293
+ * @param path - JSON-path-like location for error messages (e.g. `cases[0].assertions[1]`).
294
+ * @returns Runtime {@link Assertion} tagged union.
295
+ * @throws {ConfigError} When the object has no assertion key, multiple type keys, or an unknown type.
296
+ *
297
+ * @example
298
+ * transformAssertion({ called: "Read" }, "cases[0].assertions[0]")
299
+ * // → { type: "called", tool: "Read" }
300
+ *
301
+ * @example
302
+ * transformAssertion({ called: { tool: "Read", times: ">= 2" } }, "path")
303
+ * // → { type: "called", tool: "Read", times: ">= 2" }
243
304
  */
244
305
  function transformAssertion(raw, path) {
245
306
  if (!isPlainObject(raw)) throw new ConfigError(`expected object, got ${typeOf(raw)}`, path);
@@ -271,6 +332,22 @@ function transformAssertion(raw, path) {
271
332
  default: throw new ConfigError(`unknown assertion type: ${typeKey}`, path);
272
333
  }
273
334
  }
335
+ /**
336
+ * Transform `called` YAML (scalar or `{tool, times?}`) to runtime assertion.
337
+ *
338
+ * @throws {ConfigError} When value is neither string nor object, tool is invalid,
339
+ * or `times` is not a valid cardinality string.
340
+ *
341
+ * @example
342
+ * // Scalar shortcut
343
+ * transformCalled("mcp__api__search_skills", "path")
344
+ * // → { type: "called", tool: "mcp__api__search_skills" }
345
+ *
346
+ * @example
347
+ * // Verbose form with cardinality
348
+ * transformCalled({ tool: "Read", times: ">= 1" }, "path")
349
+ * // → { type: "called", tool: "Read", times: ">= 1" }
350
+ */
274
351
  function transformCalled(value, path) {
275
352
  if (typeof value === "string") return {
276
353
  type: "called",
@@ -293,6 +370,14 @@ function transformCalled(value, path) {
293
370
  times
294
371
  };
295
372
  }
373
+ /**
374
+ * Transform `not_called` YAML (scalar or `{tool}`).
375
+ *
376
+ * @throws {ConfigError} When value is neither string nor object with a valid `tool`.
377
+ *
378
+ * @example
379
+ * transformNotCalled("Bash", "path") // → { type: "not_called", tool: "Bash" }
380
+ */
296
381
  function transformNotCalled(value, path) {
297
382
  if (typeof value === "string") return {
298
383
  type: "not_called",
@@ -304,18 +389,45 @@ function transformNotCalled(value, path) {
304
389
  tool: requireToolPattern(value.tool, `${path}.tool`)
305
390
  };
306
391
  }
392
+ /**
393
+ * Transform `called_any_of` — bare tool list or `{tools: [...]}`.
394
+ *
395
+ * @throws {ConfigError} When the value is not an array or `{tools: [...]}` object.
396
+ *
397
+ * @example
398
+ * transformCalledAnyOf(["Read", "Glob"], "path")
399
+ * // → { type: "called_any_of", tools: ["Read", "Glob"] }
400
+ */
307
401
  function transformCalledAnyOf(value, path) {
308
402
  return {
309
403
  type: "called_any_of",
310
404
  tools: requireToolPatternList(value, path)
311
405
  };
312
406
  }
407
+ /**
408
+ * Transform `called_all_of` — bare tool list or `{tools: [...]}`.
409
+ *
410
+ * @throws {ConfigError} When the value is not an array or `{tools: [...]}` object.
411
+ *
412
+ * @example
413
+ * transformCalledAllOf({ tools: ["Read", "Grep"] }, "path")
414
+ * // → { type: "called_all_of", tools: ["Read", "Grep"] }
415
+ */
313
416
  function transformCalledAllOf(value, path) {
314
417
  return {
315
418
  type: "called_all_of",
316
419
  tools: requireToolPatternList(value, path)
317
420
  };
318
421
  }
422
+ /**
423
+ * Transform `called_before: {first, then}` ordering assertion.
424
+ *
425
+ * @throws {ConfigError} When value is not an object or `first`/`then` are invalid patterns.
426
+ *
427
+ * @example
428
+ * transformCalledBefore({ first: "SearchSkills", then: "LoadSkill" }, "path")
429
+ * // → { type: "called_before", first: "SearchSkills", then: "LoadSkill" }
430
+ */
319
431
  function transformCalledBefore(value, path) {
320
432
  if (!isPlainObject(value)) throw new ConfigError(`expected object with {first, then}, got ${typeOf(value)}`, path);
321
433
  return {
@@ -324,6 +436,19 @@ function transformCalledBefore(value, path) {
324
436
  then: requireToolPattern(value.then, `${path}.then`)
325
437
  };
326
438
  }
439
+ /**
440
+ * Transform `sequence` — tool list with optional `strict` flag.
441
+ *
442
+ * @throws {ConfigError} When value is neither a pattern array nor `{tools, strict?}` object.
443
+ *
444
+ * @example
445
+ * // Bare array (non-strict by default)
446
+ * transformSequence(["Read", "Edit"], "path")
447
+ *
448
+ * @example
449
+ * // Explicit strict ordering
450
+ * transformSequence({ tools: ["Read", "Edit"], strict: true }, "path")
451
+ */
327
452
  function transformSequence(value, path) {
328
453
  if (Array.isArray(value)) return {
329
454
  type: "sequence",
@@ -336,6 +461,19 @@ function transformSequence(value, path) {
336
461
  strict: value.strict === void 0 ? void 0 : requireBool(value.strict, `${path}.strict`)
337
462
  };
338
463
  }
464
+ /**
465
+ * Transform `called_with: {tool, args}` with predicate validation on args.
466
+ *
467
+ * @throws {ConfigError} When `tool` or `args` is missing/invalid, or `args` fails
468
+ * {@link validatePredicate}.
469
+ *
470
+ * @example
471
+ * transformCalledWith(
472
+ * { tool: "Read", args: { path: { contains: "README" } } },
473
+ * "path",
474
+ * )
475
+ * // → { type: "called_with", tool: "Read", args: { path: { contains: "README" } } }
476
+ */
339
477
  function transformCalledWith(value, path) {
340
478
  if (!isPlainObject(value)) throw new ConfigError(`expected object with {tool, args}, got ${typeOf(value)}`, path);
341
479
  const tool = requireToolPattern(value.tool, `${path}.tool`);
@@ -347,10 +485,32 @@ function transformCalledWith(value, path) {
347
485
  args: value.args
348
486
  };
349
487
  }
488
+ /**
489
+ * Transform `responded_without_tool_calls` — accepts true or empty object.
490
+ *
491
+ * @throws {ConfigError} When value is neither `true`, null, nor an empty object.
492
+ *
493
+ * @example
494
+ * transformRespondedWithoutToolCalls(true, "path")
495
+ * // → { type: "responded_without_tool_calls" }
496
+ */
350
497
  function transformRespondedWithoutToolCalls(value, path) {
351
498
  if (value === true || value === null || isPlainObject(value) && Object.keys(value).length === 0) return { type: "responded_without_tool_calls" };
352
499
  throw new ConfigError(`expected true or empty object, got ${JSON.stringify(value)}`, path);
353
500
  }
501
+ /**
502
+ * Transform budget assertions (`iterations_within`, `cost_within_usd`, `duration_within_ms`).
503
+ *
504
+ * @throws {ConfigError} When `max` is missing, non-positive, or not a number.
505
+ *
506
+ * @example
507
+ * transformScalarMax(5, "path", "iterations_within")
508
+ * // → { type: "iterations_within", max: 5 }
509
+ *
510
+ * @example
511
+ * transformScalarMax({ max: 2.5 }, "path", "cost_within_usd")
512
+ * // → { type: "cost_within_usd", max: 2.5 }
513
+ */
354
514
  function transformScalarMax(value, path, type) {
355
515
  let max;
356
516
  if (typeof value === "number") max = value;
@@ -362,6 +522,15 @@ function transformScalarMax(value, path, type) {
362
522
  max
363
523
  };
364
524
  }
525
+ /**
526
+ * Transform `finished_with` — stop reason string, list, or `{reasons}`.
527
+ *
528
+ * @throws {ConfigError} When value is not a string, string array, or `{reasons}` object.
529
+ *
530
+ * @example
531
+ * transformFinishedWith("end_turn", "path")
532
+ * // → { type: "finished_with", reasons: "end_turn" }
533
+ */
365
534
  function transformFinishedWith(value, path) {
366
535
  if (typeof value === "string") return {
367
536
  type: "finished_with",
@@ -384,6 +553,15 @@ function transformFinishedWith(value, path) {
384
553
  }
385
554
  throw new ConfigError(`expected string, string[], or {reasons: ...}, got ${JSON.stringify(value)}`, path);
386
555
  }
556
+ /**
557
+ * Transform `response_contains` / `response_not_contains` scalar or `{text}`.
558
+ *
559
+ * @throws {ConfigError} When value is neither a string nor `{text: string}`.
560
+ *
561
+ * @example
562
+ * transformResponseText("done", "path", "response_contains")
563
+ * // → { type: "response_contains", text: "done" }
564
+ */
387
565
  function transformResponseText(value, path, type) {
388
566
  if (typeof value === "string") return {
389
567
  type,
@@ -395,6 +573,15 @@ function transformResponseText(value, path, type) {
395
573
  };
396
574
  throw new ConfigError(`expected string or {text: string}, got ${JSON.stringify(value)}`, path);
397
575
  }
576
+ /**
577
+ * Transform `response_matches: {pattern, flags?}`.
578
+ *
579
+ * @throws {ConfigError} When `pattern` is missing or not a string.
580
+ *
581
+ * @example
582
+ * transformResponseMatches({ pattern: "error\\d+", flags: "i" }, "path")
583
+ * // → { type: "response_matches", pattern: "error\\d+", flags: "i" }
584
+ */
398
585
  function transformResponseMatches(value, path) {
399
586
  if (!isPlainObject(value)) throw new ConfigError(`expected object with {pattern, flags?}, got ${typeOf(value)}`, path);
400
587
  return {
@@ -403,24 +590,57 @@ function transformResponseMatches(value, path) {
403
590
  flags: value.flags === void 0 ? void 0 : requireString(value.flags, `${path}.flags`)
404
591
  };
405
592
  }
593
+ /**
594
+ * Transform compound `all_of` assertion list.
595
+ *
596
+ * @throws {ConfigError} When value is not an array or `{assertions: [...]}`.
597
+ *
598
+ * @example
599
+ * transformAllOf([{ called: "Read" }, { not_called: "Bash" }], "path")
600
+ */
406
601
  function transformAllOf(value, path) {
407
602
  return {
408
603
  type: "all_of",
409
604
  assertions: transformCompoundList(value, path)
410
605
  };
411
606
  }
607
+ /**
608
+ * Transform compound `any_of` assertion list.
609
+ *
610
+ * @throws {ConfigError} When value is not an array or `{assertions: [...]}`.
611
+ *
612
+ * @example
613
+ * transformAnyOf({ assertions: [{ called: "Read" }, { called: "Glob" }] }, "path")
614
+ */
412
615
  function transformAnyOf(value, path) {
413
616
  return {
414
617
  type: "any_of",
415
618
  assertions: transformCompoundList(value, path)
416
619
  };
417
620
  }
621
+ /**
622
+ * Transform compound `not` — single nested assertion, no threshold.
623
+ *
624
+ * The inner assertion uses the same single-key YAML shape as top-level
625
+ * assertions; thresholds apply only at the outer {@link transformThresholdedAssertion} level.
626
+ *
627
+ * @throws {ConfigError} Propagates from nested {@link transformAssertion}.
628
+ *
629
+ * @example
630
+ * transformNot({ called: "Bash" }, "path")
631
+ * // → { type: "not", assertion: { type: "called", tool: "Bash" } }
632
+ */
418
633
  function transformNot(value, path) {
419
634
  return {
420
635
  type: "not",
421
636
  assertion: transformAssertion(value, path)
422
637
  };
423
638
  }
639
+ /**
640
+ * Parse compound assertion list from array or `{assertions: [...]}`.
641
+ *
642
+ * @throws {ConfigError} When value is neither form.
643
+ */
424
644
  function transformCompoundList(value, path) {
425
645
  const list = Array.isArray(value) ? value : isPlainObject(value) && Array.isArray(value.assertions) ? value.assertions : null;
426
646
  if (list === null) throw new ConfigError(`expected array or {assertions: [...]}, got ${JSON.stringify(value)}`, path);
@@ -452,6 +672,9 @@ const COMPOUND_OPS = /* @__PURE__ */ new Set([
452
672
  * - single-key object whose key is a leaf op (e.g. `{contains: "x"}`)
453
673
  * - single-key compound (`{any_of: [...]}`, `{all_of: [...]}`, `{not: ...}`)
454
674
  * - multi-key object (descend into fields; each value is a sub-predicate)
675
+ *
676
+ * @throws {ConfigError} When a compound op has a non-array value or a leaf op
677
+ * has the wrong value type (e.g. non-string `contains`).
455
678
  */
456
679
  function validatePredicate(raw, path) {
457
680
  if (!isPlainObject(raw)) return;
@@ -474,6 +697,12 @@ function validatePredicate(raw, path) {
474
697
  }
475
698
  for (const [field, sub] of Object.entries(raw)) validatePredicate(sub, `${path}.${field}`);
476
699
  }
700
+ /**
701
+ * Validate a leaf predicate operator's value shape at config load time.
702
+ *
703
+ * @throws {ConfigError} When the operator's value has the wrong type or `regex`
704
+ * is not a valid JavaScript regular expression.
705
+ */
477
706
  function validateLeafOperator(op, value, path) {
478
707
  switch (op) {
479
708
  case "equals": return;
@@ -501,27 +730,33 @@ function validateLeafOperator(op, value, path) {
501
730
  default: return;
502
731
  }
503
732
  }
733
+ /** Require a tool pattern string or `{ pattern }` object. */
504
734
  function requireToolPattern(value, path) {
505
735
  if (typeof value === "string") return value;
506
736
  if (isPlainObject(value) && typeof value.pattern === "string") return { pattern: value.pattern };
507
737
  throw new ConfigError(`expected string or {pattern: string}, got ${JSON.stringify(value)}`, path);
508
738
  }
739
+ /** Require a bare tool pattern array or `{ tools: [...] }` wrapper. */
509
740
  function requireToolPatternList(value, path) {
510
741
  const list = Array.isArray(value) ? value : isPlainObject(value) && Array.isArray(value.tools) ? value.tools : null;
511
742
  if (list === null) throw new ConfigError(`expected array of tool patterns or {tools: [...]}, got ${JSON.stringify(value)}`, path);
512
743
  return list.map((v, i) => requireToolPattern(v, `${path}[${i}]`));
513
744
  }
745
+ /** Require a string value at `path` or throw {@link ConfigError}. */
514
746
  function requireString(value, path) {
515
747
  if (typeof value === "string") return value;
516
748
  throw new ConfigError(`expected string, got ${typeOf(value)}`, path);
517
749
  }
750
+ /** Require a boolean value at `path` or throw {@link ConfigError}. */
518
751
  function requireBool(value, path) {
519
752
  if (typeof value === "boolean") return value;
520
753
  throw new ConfigError(`expected boolean, got ${typeOf(value)}`, path);
521
754
  }
755
+ /** True for non-null, non-array objects (YAML mapping nodes). */
522
756
  function isPlainObject(x) {
523
757
  return typeof x === "object" && x !== null && !Array.isArray(x);
524
758
  }
759
+ /** Human-readable type name for config error messages. */
525
760
  function typeOf(x) {
526
761
  if (x === null) return "null";
527
762
  if (Array.isArray(x)) return "array";
@@ -531,6 +766,9 @@ function typeOf(x) {
531
766
  //#region src/config/grading-schema.ts
532
767
  /**
533
768
  * Zod schema for standalone grading YAML (`grading.yaml`).
769
+ *
770
+ * The top-level `judge` block reuses {@link ConfigPartialSchema} fields plus
771
+ * grader-specific concurrency and system-instruction overrides.
534
772
  */
535
773
  /** Top-level `judge` block — mirrors harness config fields plus grader concurrency. */
536
774
  const JudgeConfigSchema = ConfigPartialSchema.extend({
@@ -544,7 +782,11 @@ const GradingConfigSchema = z.object({ judge: JudgeConfigSchema });
544
782
  //#region src/config/grading-loader.ts
545
783
  /**
546
784
  * Load standalone grading YAML for `harness-eval grade`.
785
+ *
786
+ * Grading config defines the judge subprocess (model, concurrency, Claude Code
787
+ * flags) separately from the suite under test.
547
788
  */
789
+ /** Load grading YAML from disk and resolve relative paths. */
548
790
  async function loadGradingConfig(filePath) {
549
791
  const absolutePath = resolve(filePath);
550
792
  let content;
@@ -555,6 +797,11 @@ async function loadGradingConfig(filePath) {
555
797
  }
556
798
  return parseGradingConfig(content, absolutePath);
557
799
  }
800
+ /**
801
+ * Parse grading YAML from a string.
802
+ *
803
+ * @param sourcePath Optional path for error messages and path resolution.
804
+ */
558
805
  function parseGradingConfig(yamlContent, sourcePath) {
559
806
  let raw;
560
807
  try {
@@ -568,6 +815,7 @@ function parseGradingConfig(yamlContent, sourcePath) {
568
815
  if (sourcePath) resolveGradingConfigPaths(config, sourcePath);
569
816
  return config;
570
817
  }
818
+ /** Format a zod validation error with optional source file prefix. */
571
819
  function formatZodError$1(err, sourcePath) {
572
820
  return err.issues.map((issue) => {
573
821
  const path = issue.path.length > 0 ? issue.path.join(".") : "(root)";
@@ -578,6 +826,19 @@ function formatZodError$1(err, sourcePath) {
578
826
  //#region src/config/loader.ts
579
827
  /**
580
828
  * Load a `TestSuite` from a YAML file, directory, or string.
829
+ *
830
+ * Supports two on-disk layouts:
831
+ * - Single file: `suite.yaml` with inline `cases`.
832
+ * - Directory: `suite.yaml` plus optional `cases/*.yaml` fragments merged
833
+ * in lexicographic path order.
834
+ *
835
+ * Relative paths in config (MCP config, plugin dirs, etc.) are resolved
836
+ * against the suite file directory after load.
837
+ */
838
+ /**
839
+ * Load a suite from a file path or directory path.
840
+ *
841
+ * @throws {@link ConfigError} when the path is unreadable or validation fails.
581
842
  */
582
843
  async function loadSuite(filePath) {
583
844
  const absolutePath = resolve(filePath);
@@ -590,6 +851,7 @@ async function loadSuite(filePath) {
590
851
  if (info.isDirectory()) return loadSuiteDirectory(absolutePath);
591
852
  return loadSuiteFile(absolutePath);
592
853
  }
854
+ /** Load and parse a single-file suite (not a directory layout). */
593
855
  async function loadSuiteFile(absolutePath) {
594
856
  let content;
595
857
  try {
@@ -599,6 +861,12 @@ async function loadSuiteFile(absolutePath) {
599
861
  }
600
862
  return parseSuite(content, absolutePath);
601
863
  }
864
+ /**
865
+ * Load a directory suite: `suite.yaml` plus optional `cases/` YAML files.
866
+ *
867
+ * Cases from `suite.yaml` sort before external case files; within each file,
868
+ * array order is preserved.
869
+ */
602
870
  async function loadSuiteDirectory(dir) {
603
871
  const suiteYamlPath = join(dir, "suite.yaml");
604
872
  let content;
@@ -638,6 +906,11 @@ async function loadSuiteDirectory(dir) {
638
906
  resolveSuitePaths(suite, suiteYamlPath);
639
907
  return suite;
640
908
  }
909
+ /**
910
+ * Parse suite YAML from a string (single-file layout with inline cases).
911
+ *
912
+ * @param sourcePath Optional path for error messages and relative path resolution.
913
+ */
641
914
  function parseSuite(yamlContent, sourcePath) {
642
915
  let raw;
643
916
  try {
@@ -651,6 +924,7 @@ function parseSuite(yamlContent, sourcePath) {
651
924
  if (sourcePath) resolveSuitePaths(suite, resolve(sourcePath));
652
925
  return suite;
653
926
  }
927
+ /** Parse `suite.yaml` for directory layout (cases may be omitted). */
654
928
  function parseSuiteDirectory(yamlContent, sourcePath) {
655
929
  let raw;
656
930
  try {
@@ -672,6 +946,11 @@ function parseCasesFile(yamlContent, sourcePath) {
672
946
  }
673
947
  return transformTestCases(extractRawCases(raw, sourcePath), sourcePath ?? "cases");
674
948
  }
949
+ /**
950
+ * Normalize raw YAML into a list of {@link RawTestCase} objects.
951
+ *
952
+ * Accepts a single case, an array, or `{ cases: [...] }`.
953
+ */
675
954
  function extractRawCases(raw, sourcePath) {
676
955
  if (Array.isArray(raw)) return raw.map((item, index) => validateRawCase(item, sourcePath, index));
677
956
  if (raw && typeof raw === "object") {
@@ -681,11 +960,18 @@ function extractRawCases(raw, sourcePath) {
681
960
  }
682
961
  throw new ConfigError("expected a case object, array of cases, or { cases: [...] }", sourcePath);
683
962
  }
963
+ /** Validate one raw case object against {@link TestCaseSchema}. */
684
964
  function validateRawCase(raw, sourcePath, index) {
685
965
  const validated = TestCaseSchema.safeParse(raw);
686
966
  if (!validated.success) throw new ConfigError(`validation failed:\n${formatZodError(validated.error, sourcePath)}`, sourcePath);
687
967
  return validated.data;
688
968
  }
969
+ /**
970
+ * Recursively collect `.yaml` / `.yml` files under `casesDir`.
971
+ *
972
+ * Returns an empty list when the directory does not exist — external cases
973
+ * are optional in directory layout.
974
+ */
689
975
  async function collectCaseYamlFiles(casesDir) {
690
976
  const files = [];
691
977
  async function walk(dir) {
@@ -705,6 +991,7 @@ async function collectCaseYamlFiles(casesDir) {
705
991
  await walk(casesDir);
706
992
  return files.sort();
707
993
  }
994
+ /** Format a zod validation error with optional source file prefix. */
708
995
  function formatZodError(err, sourcePath) {
709
996
  return err.issues.map((issue) => {
710
997
  const path = issue.path.length > 0 ? issue.path.join(".") : "(root)";
@@ -714,4 +1001,4 @@ function formatZodError(err, sourcePath) {
714
1001
  //#endregion
715
1002
  export { parseGradingConfig as a, loadGradingConfig as i, parseCasesFile as n, ConfigError as o, parseSuite as r, loadSuite as t };
716
1003
 
717
- //# sourceMappingURL=loader-BCnFJ8rm.js.map
1004
+ //# sourceMappingURL=loader-DcI0KfRX.js.map