@alis-build/harness-eval 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -4
- package/dist/adapters/claude-code/index.d.ts +1 -1
- package/dist/adapters/claude-code/index.js +1 -1
- package/dist/{claude-code-ycT0JQZF.js → claude-code-DZ4Vkgp6.js} +35 -6
- package/dist/{claude-code-ycT0JQZF.js.map → claude-code-DZ4Vkgp6.js.map} +1 -1
- package/dist/cli/bin.js +109 -12
- package/dist/cli/bin.js.map +1 -1
- package/dist/config/loader.d.ts +1 -1
- package/dist/config/loader.js +1 -1
- package/dist/{index-6Z17eKZx.d.ts → index-V22PrR0p.d.ts} +2 -1
- package/dist/index.d.ts +270 -152
- package/dist/index.js +124 -5
- package/dist/index.js.map +1 -0
- package/dist/{loader-DTvoVfN0.d.ts → loader-C9yQHUPC.d.ts} +19 -2
- package/dist/{loader-BCnFJ8rm.js → loader-DcI0KfRX.js} +291 -4
- package/dist/loader-DcI0KfRX.js.map +1 -0
- package/dist/{build-DsVJ_UeU.js → projections-BcX7w-f6.js} +486 -243
- package/dist/projections-BcX7w-f6.js.map +1 -0
- package/dist/runner/suite.d.ts +1 -1
- package/dist/runner/suite.js +1 -1
- package/dist/{suite-BoOvK_lq.d.ts → suite-DPJMIEbu.d.ts} +7 -2
- package/dist/{suite-chj0j22j.js → suite-Dlzl-HI0.js} +58 -4
- package/dist/suite-Dlzl-HI0.js.map +1 -0
- package/dist/{types-BQol062t.d.ts → types-CD3TwOtZ.d.ts} +151 -10
- package/package.json +4 -2
- package/schemas/eval-interchange-instances.schema.json +196 -0
- package/schemas/eval-interchange.schema.json +65 -52
- package/schemas/eval-run-envelope.schema.json +182 -425
- package/dist/build-DsVJ_UeU.js.map +0 -1
- package/dist/loader-BCnFJ8rm.js.map +0 -1
- package/dist/suite-chj0j22j.js.map +0 -1
- package/schemas/eval-interchange-agent-trace.schema.json +0 -322
- package/schemas/eval-interchange-proto-instance.schema.json +0 -106
|
@@ -6,11 +6,18 @@ import { z } from "zod";
|
|
|
6
6
|
//#region src/config/paths.ts
|
|
7
7
|
/**
|
|
8
8
|
* Resolve relative paths in suite config against the suite file directory.
|
|
9
|
+
*
|
|
10
|
+
* YAML authors write paths relative to the suite file; this module absolutizes
|
|
11
|
+
* them at load time so the runner and adapters receive filesystem-ready values.
|
|
12
|
+
* Tilde-prefixed paths and inline JSON blobs (settings starting with `{`) are
|
|
13
|
+
* left unchanged.
|
|
9
14
|
*/
|
|
15
|
+
/**
 * Resolve a single config path against the suite directory.
 *
 * Absolute paths and tilde-prefixed paths (`~`, `~/...`, `~\...`) are returned
 * unchanged; every other value is joined onto `suiteDir`.
 *
 * @param value - Path string as written in the config file.
 * @param suiteDir - Directory that relative paths are resolved against.
 * @returns `value` unchanged, or `join(suiteDir, value)` for relative paths.
 *
 * @example
 * resolvePath("mcp.json", "/work/suite") // → "/work/suite/mcp.json"
 * resolvePath("~/mcp.json", "/work/suite") // → "~/mcp.json"
 */
function resolvePath(value, suiteDir) {
	// A bare "~" is as home-relative as "~/x"; joining it onto suiteDir would
	// point at a literal "~" entry inside the suite directory.
	const isHomeRelative = value === "~" || value.startsWith("~/") || value.startsWith("~\\");
	if (isAbsolute(value) || isHomeRelative) return value;
	return join(suiteDir, value);
}
|
|
20
|
+
/** Resolve Claude Code-specific path fields within a config block. */
|
|
14
21
|
function resolveClaudeCodePaths(block, suiteDir) {
|
|
15
22
|
const resolved = { ...block };
|
|
16
23
|
if (typeof resolved.mcpConfig === "string") resolved.mcpConfig = resolvePath(resolved.mcpConfig, suiteDir);
|
|
@@ -42,9 +49,16 @@ function resolveSuitePaths(suite, suiteFilePath) {
|
|
|
42
49
|
for (const cell of suite.matrix) cell.config = resolveConfigPaths(cell.config, suiteDir) ?? cell.config;
|
|
43
50
|
for (const testCase of suite.cases) testCase.config = resolveConfigPaths(testCase.config, suiteDir);
|
|
44
51
|
}
|
|
52
|
+
/**
 * Parent directory of a suite or grading config file path.
 *
 * Works on both `/` and `\` separators without touching the filesystem.
 *
 * @param filePath - Path to a config file.
 * @returns The path with its final segment removed, or `"."` when `filePath`
 * contains no separator. A file directly under a root keeps the root
 * (`"/suite.yaml"` → `"/"`, `"C:\\suite.yaml"` → `"C:\\"`).
 */
function configFileDir(filePath) {
	if (!filePath.includes("/") && !filePath.includes("\\")) return ".";
	const dir = filePath.replace(/[/\\][^/\\]+$/, "");
	// Nothing was stripped (e.g. the path ends with a separator): keep it as-is.
	if (dir === filePath) return dir;
	// Stripping the last segment of a root-level file leaves "" (POSIX) or a
	// bare drive like "C:"; both would make later joins resolve against the
	// wrong directory, so put the stripped separator back.
	if (dir === "" || /^[A-Za-z]:$/.test(dir)) return `${dir}${filePath[dir.length]}`;
	return dir;
}
|
|
56
|
+
/**
|
|
57
|
+
* Heuristically resolve env var values that look like relative file paths.
|
|
58
|
+
*
|
|
59
|
+
* Used for grading config where credential or config paths may be expressed
|
|
60
|
+
* relative to the grading YAML location.
|
|
61
|
+
*/
|
|
48
62
|
function resolveEnvPaths(env, baseDir) {
|
|
49
63
|
const resolved = {};
|
|
50
64
|
for (const [key, value] of Object.entries(env)) if (value.startsWith("./") || value.startsWith("../") || value.includes("/") && !value.startsWith("http")) resolved[key] = resolvePath(value, baseDir);
|
|
@@ -68,7 +82,8 @@ function resolveGradingConfigPaths(config, configFilePath) {
|
|
|
68
82
|
* zod schemas for the YAML on-disk shape.
|
|
69
83
|
*
|
|
70
84
|
* Config uses a nested layout: generic harness fields at the top level,
|
|
71
|
-
* adapter-specific options under a named key (e.g. `claudeCode`).
|
|
85
|
+
* adapter-specific options under a named key (e.g. `claudeCode`). Validated
|
|
86
|
+
* raw shapes are transformed into runtime types by `src/config/transform.ts`.
|
|
72
87
|
*/
|
|
73
88
|
/** Claude Code adapter-specific options (nested under `claudeCode`). */
|
|
74
89
|
const ClaudeCodeConfigSchema = z.object({
|
|
@@ -136,6 +151,11 @@ const ReferenceToolCallSchema = z.object({
|
|
|
136
151
|
tool_name: z.string().min(1),
|
|
137
152
|
tool_input: z.unknown()
|
|
138
153
|
});
|
|
154
|
+
/**
 * Reference trajectory as written in suite YAML.
 *
 * Accepts either of two shapes:
 * - a bare array of {@link ReferenceToolCallSchema} steps (no minimum length
 *   is enforced on this form), or
 * - an object with an optional `tool_name_mode` (`"harness"` or `"bare"`) and
 *   a `steps` array that must contain at least one step.
 */
|
|
155
|
+
const ReferenceTrajectorySchema = z.union([z.array(ReferenceToolCallSchema), z.object({
|
|
156
|
+
tool_name_mode: z.enum(["harness", "bare"]).optional(),
|
|
157
|
+
steps: z.array(ReferenceToolCallSchema).min(1)
|
|
158
|
+
})]);
|
|
139
159
|
/** A test case. */
|
|
140
160
|
const TestCaseSchema = z.object({
|
|
141
161
|
id: z.string().min(1),
|
|
@@ -143,7 +163,7 @@ const TestCaseSchema = z.object({
|
|
|
143
163
|
category: z.string().optional(),
|
|
144
164
|
notes: z.string().optional(),
|
|
145
165
|
expectations: z.array(z.string().min(1)).optional(),
|
|
146
|
-
reference_trajectory:
|
|
166
|
+
reference_trajectory: ReferenceTrajectorySchema.optional(),
|
|
147
167
|
human_ratings: z.record(z.string(), z.number()).optional(),
|
|
148
168
|
assertions: z.array(z.unknown()).min(1),
|
|
149
169
|
repetitions: z.number().int().positive().optional(),
|
|
@@ -192,6 +212,7 @@ function transformSuiteDirectory(raw) {
|
|
|
192
212
|
/**
 * Transform a list of raw test cases into runtime test cases.
 *
 * @param raw - Validated raw cases, in file order.
 * @param pathPrefix - Prefix for error paths; each case is labelled `${pathPrefix}[i]`.
 * @returns One transformed case per raw case, in the same order.
 */
function transformTestCases(raw, pathPrefix) {
	const toRuntimeCase = (rawCase, position) => {
		const casePath = `${pathPrefix}[${position}]`;
		return transformTestCase(rawCase, casePath);
	};
	return raw.map(toRuntimeCase);
}
|
|
215
|
+
/** Merge suite-level parts shared by single-file and directory transforms. */
|
|
195
216
|
function transformSuiteParts(raw) {
|
|
196
217
|
return {
|
|
197
218
|
adapter: raw.adapter,
|
|
@@ -200,6 +221,21 @@ function transformSuiteParts(raw) {
|
|
|
200
221
|
cases: raw.cases.map((c, i) => transformTestCase(c, `cases[${i}]`))
|
|
201
222
|
};
|
|
202
223
|
}
|
|
224
|
+
/**
 * Bring a `reference_trajectory` value from YAML into one canonical object shape.
 *
 * A bare step array becomes `{ steps }`; the object form is copied down to its
 * `tool_name_mode` and `steps` fields; an absent value stays `undefined`.
 *
 * @param raw - The `reference_trajectory` value of a raw test case, if any.
 * @param path - Location used in the error when the shape is not recognised.
 * @returns `undefined`, `{ steps }`, or `{ tool_name_mode, steps }`.
 * @throws {ConfigError} When `raw` is neither an array nor an object with a `steps` array.
 */
function normalizeReferenceTrajectory(raw, path) {
	if (raw === undefined) return undefined;
	if (Array.isArray(raw)) return { steps: raw };
	const hasStepList = isPlainObject(raw) && Array.isArray(raw.steps);
	if (hasStepList) {
		const { tool_name_mode, steps } = raw;
		return { tool_name_mode, steps };
	}
	throw new ConfigError("reference_trajectory must be an array of tool calls or { tool_name_mode?, steps: [...] }", path);
}
|
|
238
|
+
/** Map raw matrix cell YAML to runtime {@link MatrixCell}. */
|
|
203
239
|
function transformMatrixCell(raw) {
|
|
204
240
|
return {
|
|
205
241
|
label: raw.label,
|
|
@@ -207,6 +243,7 @@ function transformMatrixCell(raw) {
|
|
|
207
243
|
axes: raw.axes
|
|
208
244
|
};
|
|
209
245
|
}
|
|
246
|
+
/** Map one raw test case to runtime {@link TestCase}, transforming assertions. */
|
|
210
247
|
function transformTestCase(raw, path) {
|
|
211
248
|
return {
|
|
212
249
|
id: raw.id,
|
|
@@ -214,7 +251,7 @@ function transformTestCase(raw, path) {
|
|
|
214
251
|
category: raw.category,
|
|
215
252
|
notes: raw.notes,
|
|
216
253
|
expectations: raw.expectations,
|
|
217
|
-
reference_trajectory: raw.reference_trajectory,
|
|
254
|
+
reference_trajectory: normalizeReferenceTrajectory(raw.reference_trajectory, `${path}.reference_trajectory`),
|
|
218
255
|
human_ratings: raw.human_ratings,
|
|
219
256
|
repetitions: raw.repetitions,
|
|
220
257
|
config: raw.config,
|
|
@@ -223,6 +260,17 @@ function transformTestCase(raw, path) {
|
|
|
223
260
|
}
|
|
224
261
|
/** Keys that may appear alongside an assertion-type key. Not assertion types themselves. */
|
|
225
262
|
const SIBLING_KEYS = /* @__PURE__ */ new Set(["threshold"]);
|
|
263
|
+
/**
|
|
264
|
+
* Parse optional `threshold` sibling and delegate the assertion body to
|
|
265
|
+
* {@link transformAssertion}.
|
|
266
|
+
*
|
|
267
|
+
* @throws {ConfigError} When the wrapper is not an object, threshold is out of
|
|
268
|
+
* `[0, 1]`, or the nested assertion fails validation.
|
|
269
|
+
*
|
|
270
|
+
* @example
|
|
271
|
+
* transformThresholdedAssertion({ called: "Read", threshold: 0.9 }, "path")
|
|
272
|
+
* // → { assertion: { type: "called", tool: "Read" }, threshold: 0.9 }
|
|
273
|
+
*/
|
|
226
274
|
function transformThresholdedAssertion(raw, path) {
|
|
227
275
|
if (!isPlainObject(raw)) throw new ConfigError(`expected object, got ${typeOf(raw)}`, path);
|
|
228
276
|
const threshold = raw.threshold;
|
|
@@ -240,6 +288,19 @@ function transformThresholdedAssertion(raw, path) {
|
|
|
240
288
|
* Finds the single non-sibling key, dispatches to the per-type transformer.
|
|
241
289
|
* Per-type transformers handle both verbose-object and shortcut-scalar input
|
|
242
290
|
* shapes where applicable.
|
|
291
|
+
*
|
|
292
|
+
* @param raw - Single assertion object from parsed YAML (may include `threshold` sibling).
|
|
293
|
+
* @param path - JSON-path-like location for error messages (e.g. `cases[0].assertions[1]`).
|
|
294
|
+
* @returns Runtime {@link Assertion} tagged union.
|
|
295
|
+
* @throws {ConfigError} When the object has no assertion key, multiple type keys, or an unknown type.
|
|
296
|
+
*
|
|
297
|
+
* @example
|
|
298
|
+
* transformAssertion({ called: "Read" }, "cases[0].assertions[0]")
|
|
299
|
+
* // → { type: "called", tool: "Read" }
|
|
300
|
+
*
|
|
301
|
+
* @example
|
|
302
|
+
* transformAssertion({ called: { tool: "Read", times: ">= 2" } }, "path")
|
|
303
|
+
* // → { type: "called", tool: "Read", times: ">= 2" }
|
|
243
304
|
*/
|
|
244
305
|
function transformAssertion(raw, path) {
|
|
245
306
|
if (!isPlainObject(raw)) throw new ConfigError(`expected object, got ${typeOf(raw)}`, path);
|
|
@@ -271,6 +332,22 @@ function transformAssertion(raw, path) {
|
|
|
271
332
|
default: throw new ConfigError(`unknown assertion type: ${typeKey}`, path);
|
|
272
333
|
}
|
|
273
334
|
}
|
|
335
|
+
/**
|
|
336
|
+
* Transform `called` YAML (scalar or `{tool, times?}`) to runtime assertion.
|
|
337
|
+
*
|
|
338
|
+
* @throws {ConfigError} When value is neither string nor object, tool is invalid,
|
|
339
|
+
* or `times` is not a valid cardinality string.
|
|
340
|
+
*
|
|
341
|
+
* @example
|
|
342
|
+
* // Scalar shortcut
|
|
343
|
+
* transformCalled("mcp__api__search_skills", "path")
|
|
344
|
+
* // → { type: "called", tool: "mcp__api__search_skills" }
|
|
345
|
+
*
|
|
346
|
+
* @example
|
|
347
|
+
* // Verbose form with cardinality
|
|
348
|
+
* transformCalled({ tool: "Read", times: ">= 1" }, "path")
|
|
349
|
+
* // → { type: "called", tool: "Read", times: ">= 1" }
|
|
350
|
+
*/
|
|
274
351
|
function transformCalled(value, path) {
|
|
275
352
|
if (typeof value === "string") return {
|
|
276
353
|
type: "called",
|
|
@@ -293,6 +370,14 @@ function transformCalled(value, path) {
|
|
|
293
370
|
times
|
|
294
371
|
};
|
|
295
372
|
}
|
|
373
|
+
/**
|
|
374
|
+
* Transform `not_called` YAML (scalar or `{tool}`).
|
|
375
|
+
*
|
|
376
|
+
* @throws {ConfigError} When value is neither string nor object with a valid `tool`.
|
|
377
|
+
*
|
|
378
|
+
* @example
|
|
379
|
+
* transformNotCalled("Bash", "path") // → { type: "not_called", tool: "Bash" }
|
|
380
|
+
*/
|
|
296
381
|
function transformNotCalled(value, path) {
|
|
297
382
|
if (typeof value === "string") return {
|
|
298
383
|
type: "not_called",
|
|
@@ -304,18 +389,45 @@ function transformNotCalled(value, path) {
|
|
|
304
389
|
tool: requireToolPattern(value.tool, `${path}.tool`)
|
|
305
390
|
};
|
|
306
391
|
}
|
|
392
|
+
/**
 * Build a `called_any_of` assertion from its YAML value.
 *
 * The value is handed to {@link requireToolPatternList}, so both a bare list
 * of tool patterns and a `{tools: [...]}` wrapper are accepted.
 *
 * @throws {ConfigError} Propagated from {@link requireToolPatternList} when the
 * value is neither form.
 *
 * @example
 * transformCalledAnyOf(["Read", "Glob"], "path")
 * // → { type: "called_any_of", tools: ["Read", "Glob"] }
 */
function transformCalledAnyOf(value, path) {
	const tools = requireToolPatternList(value, path);
	return { type: "called_any_of", tools };
}
|
|
407
|
+
/**
 * Build a `called_all_of` assertion from its YAML value.
 *
 * The value is handed to {@link requireToolPatternList}, so both a bare list
 * of tool patterns and a `{tools: [...]}` wrapper are accepted.
 *
 * @throws {ConfigError} Propagated from {@link requireToolPatternList} when the
 * value is neither form.
 *
 * @example
 * transformCalledAllOf({ tools: ["Read", "Grep"] }, "path")
 * // → { type: "called_all_of", tools: ["Read", "Grep"] }
 */
function transformCalledAllOf(value, path) {
	const tools = requireToolPatternList(value, path);
	return { type: "called_all_of", tools };
}
|
|
422
|
+
/**
|
|
423
|
+
* Transform `called_before: {first, then}` ordering assertion.
|
|
424
|
+
*
|
|
425
|
+
* @throws {ConfigError} When value is not an object or `first`/`then` are invalid patterns.
|
|
426
|
+
*
|
|
427
|
+
* @example
|
|
428
|
+
* transformCalledBefore({ first: "SearchSkills", then: "LoadSkill" }, "path")
|
|
429
|
+
* // → { type: "called_before", first: "SearchSkills", then: "LoadSkill" }
|
|
430
|
+
*/
|
|
319
431
|
function transformCalledBefore(value, path) {
|
|
320
432
|
if (!isPlainObject(value)) throw new ConfigError(`expected object with {first, then}, got ${typeOf(value)}`, path);
|
|
321
433
|
return {
|
|
@@ -324,6 +436,19 @@ function transformCalledBefore(value, path) {
|
|
|
324
436
|
then: requireToolPattern(value.then, `${path}.then`)
|
|
325
437
|
};
|
|
326
438
|
}
|
|
439
|
+
/**
|
|
440
|
+
* Transform `sequence` — tool list with optional `strict` flag.
|
|
441
|
+
*
|
|
442
|
+
* @throws {ConfigError} When value is neither a pattern array nor `{tools, strict?}` object.
|
|
443
|
+
*
|
|
444
|
+
* @example
|
|
445
|
+
* // Bare array (non-strict by default)
|
|
446
|
+
* transformSequence(["Read", "Edit"], "path")
|
|
447
|
+
*
|
|
448
|
+
* @example
|
|
449
|
+
* // Explicit strict ordering
|
|
450
|
+
* transformSequence({ tools: ["Read", "Edit"], strict: true }, "path")
|
|
451
|
+
*/
|
|
327
452
|
function transformSequence(value, path) {
|
|
328
453
|
if (Array.isArray(value)) return {
|
|
329
454
|
type: "sequence",
|
|
@@ -336,6 +461,19 @@ function transformSequence(value, path) {
|
|
|
336
461
|
strict: value.strict === void 0 ? void 0 : requireBool(value.strict, `${path}.strict`)
|
|
337
462
|
};
|
|
338
463
|
}
|
|
464
|
+
/**
|
|
465
|
+
* Transform `called_with: {tool, args}` with predicate validation on args.
|
|
466
|
+
*
|
|
467
|
+
* @throws {ConfigError} When `tool` or `args` is missing/invalid, or `args` fails
|
|
468
|
+
* {@link validatePredicate}.
|
|
469
|
+
*
|
|
470
|
+
* @example
|
|
471
|
+
* transformCalledWith(
|
|
472
|
+
* { tool: "Read", args: { path: { contains: "README" } } },
|
|
473
|
+
* "path",
|
|
474
|
+
* )
|
|
475
|
+
* // → { type: "called_with", tool: "Read", args: { path: { contains: "README" } } }
|
|
476
|
+
*/
|
|
339
477
|
function transformCalledWith(value, path) {
|
|
340
478
|
if (!isPlainObject(value)) throw new ConfigError(`expected object with {tool, args}, got ${typeOf(value)}`, path);
|
|
341
479
|
const tool = requireToolPattern(value.tool, `${path}.tool`);
|
|
@@ -347,10 +485,32 @@ function transformCalledWith(value, path) {
|
|
|
347
485
|
args: value.args
|
|
348
486
|
};
|
|
349
487
|
}
|
|
488
|
+
/**
 * Build a `responded_without_tool_calls` assertion.
 *
 * The assertion carries no options, so the YAML value only has to be one of
 * the accepted placeholders: `true`, `null`, or an empty mapping.
 *
 * @throws {ConfigError} For any other value.
 *
 * @example
 * transformRespondedWithoutToolCalls(true, "path")
 * // → { type: "responded_without_tool_calls" }
 */
function transformRespondedWithoutToolCalls(value, path) {
	if (value === true || value === null) return { type: "responded_without_tool_calls" };
	// Same test as isPlainObject, minus the null case handled above.
	const isMapping = typeof value === "object" && !Array.isArray(value);
	if (isMapping && Object.keys(value).length === 0) return { type: "responded_without_tool_calls" };
	throw new ConfigError(`expected true or empty object, got ${JSON.stringify(value)}`, path);
}
|
|
501
|
+
/**
|
|
502
|
+
* Transform budget assertions (`iterations_within`, `cost_within_usd`, `duration_within_ms`).
|
|
503
|
+
*
|
|
504
|
+
* @throws {ConfigError} When `max` is missing, non-positive, or not a number.
|
|
505
|
+
*
|
|
506
|
+
* @example
|
|
507
|
+
* transformScalarMax(5, "path", "iterations_within")
|
|
508
|
+
* // → { type: "iterations_within", max: 5 }
|
|
509
|
+
*
|
|
510
|
+
* @example
|
|
511
|
+
* transformScalarMax({ max: 2.5 }, "path", "cost_within_usd")
|
|
512
|
+
* // → { type: "cost_within_usd", max: 2.5 }
|
|
513
|
+
*/
|
|
354
514
|
function transformScalarMax(value, path, type) {
|
|
355
515
|
let max;
|
|
356
516
|
if (typeof value === "number") max = value;
|
|
@@ -362,6 +522,15 @@ function transformScalarMax(value, path, type) {
|
|
|
362
522
|
max
|
|
363
523
|
};
|
|
364
524
|
}
|
|
525
|
+
/**
|
|
526
|
+
* Transform `finished_with` — stop reason string, list, or `{reasons}`.
|
|
527
|
+
*
|
|
528
|
+
* @throws {ConfigError} When value is not a string, string array, or `{reasons}` object.
|
|
529
|
+
*
|
|
530
|
+
* @example
|
|
531
|
+
* transformFinishedWith("end_turn", "path")
|
|
532
|
+
* // → { type: "finished_with", reasons: "end_turn" }
|
|
533
|
+
*/
|
|
365
534
|
function transformFinishedWith(value, path) {
|
|
366
535
|
if (typeof value === "string") return {
|
|
367
536
|
type: "finished_with",
|
|
@@ -384,6 +553,15 @@ function transformFinishedWith(value, path) {
|
|
|
384
553
|
}
|
|
385
554
|
throw new ConfigError(`expected string, string[], or {reasons: ...}, got ${JSON.stringify(value)}`, path);
|
|
386
555
|
}
|
|
556
|
+
/**
|
|
557
|
+
* Transform `response_contains` / `response_not_contains` scalar or `{text}`.
|
|
558
|
+
*
|
|
559
|
+
* @throws {ConfigError} When value is neither a string nor `{text: string}`.
|
|
560
|
+
*
|
|
561
|
+
* @example
|
|
562
|
+
* transformResponseText("done", "path", "response_contains")
|
|
563
|
+
* // → { type: "response_contains", text: "done" }
|
|
564
|
+
*/
|
|
387
565
|
function transformResponseText(value, path, type) {
|
|
388
566
|
if (typeof value === "string") return {
|
|
389
567
|
type,
|
|
@@ -395,6 +573,15 @@ function transformResponseText(value, path, type) {
|
|
|
395
573
|
};
|
|
396
574
|
throw new ConfigError(`expected string or {text: string}, got ${JSON.stringify(value)}`, path);
|
|
397
575
|
}
|
|
576
|
+
/**
|
|
577
|
+
* Transform `response_matches: {pattern, flags?}`.
|
|
578
|
+
*
|
|
579
|
+
* @throws {ConfigError} When `pattern` is missing or not a string.
|
|
580
|
+
*
|
|
581
|
+
* @example
|
|
582
|
+
* transformResponseMatches({ pattern: "error\\d+", flags: "i" }, "path")
|
|
583
|
+
* // → { type: "response_matches", pattern: "error\\d+", flags: "i" }
|
|
584
|
+
*/
|
|
398
585
|
function transformResponseMatches(value, path) {
|
|
399
586
|
if (!isPlainObject(value)) throw new ConfigError(`expected object with {pattern, flags?}, got ${typeOf(value)}`, path);
|
|
400
587
|
return {
|
|
@@ -403,24 +590,57 @@ function transformResponseMatches(value, path) {
|
|
|
403
590
|
flags: value.flags === void 0 ? void 0 : requireString(value.flags, `${path}.flags`)
|
|
404
591
|
};
|
|
405
592
|
}
|
|
593
|
+
/**
 * Build a compound `all_of` assertion.
 *
 * The nested assertion list is parsed by {@link transformCompoundList}.
 *
 * @throws {ConfigError} Propagated from {@link transformCompoundList}.
 *
 * @example
 * transformAllOf([{ called: "Read" }, { not_called: "Bash" }], "path")
 */
function transformAllOf(value, path) {
	const assertions = transformCompoundList(value, path);
	return { type: "all_of", assertions };
}
|
|
607
|
+
/**
 * Build a compound `any_of` assertion.
 *
 * The nested assertion list is parsed by {@link transformCompoundList}.
 *
 * @throws {ConfigError} Propagated from {@link transformCompoundList}.
 *
 * @example
 * transformAnyOf({ assertions: [{ called: "Read" }, { called: "Glob" }] }, "path")
 */
function transformAnyOf(value, path) {
	const assertions = transformCompoundList(value, path);
	return { type: "any_of", assertions };
}
|
|
621
|
+
/**
 * Build a compound `not` assertion wrapping exactly one nested assertion.
 *
 * The nested value goes straight to {@link transformAssertion}, so it is
 * parsed as a plain assertion rather than through
 * {@link transformThresholdedAssertion}.
 *
 * @throws {ConfigError} Propagated from {@link transformAssertion}.
 *
 * @example
 * transformNot({ called: "Bash" }, "path")
 * // → { type: "not", assertion: { type: "called", tool: "Bash" } }
 */
function transformNot(value, path) {
	const assertion = transformAssertion(value, path);
	return { type: "not", assertion };
}
|
|
639
|
+
/**
|
|
640
|
+
* Parse compound assertion list from array or `{assertions: [...]}`.
|
|
641
|
+
*
|
|
642
|
+
* @throws {ConfigError} When value is neither form.
|
|
643
|
+
*/
|
|
424
644
|
function transformCompoundList(value, path) {
|
|
425
645
|
const list = Array.isArray(value) ? value : isPlainObject(value) && Array.isArray(value.assertions) ? value.assertions : null;
|
|
426
646
|
if (list === null) throw new ConfigError(`expected array or {assertions: [...]}, got ${JSON.stringify(value)}`, path);
|
|
@@ -452,6 +672,9 @@ const COMPOUND_OPS = /* @__PURE__ */ new Set([
|
|
|
452
672
|
* - single-key object whose key is a leaf op (e.g. `{contains: "x"}`)
|
|
453
673
|
* - single-key compound (`{any_of: [...]}`, `{all_of: [...]}`, `{not: ...}`)
|
|
454
674
|
* - multi-key object (descend into fields; each value is a sub-predicate)
|
|
675
|
+
*
|
|
676
|
+
* @throws {ConfigError} When a compound op has a non-array value or a leaf op
|
|
677
|
+
* has the wrong value type (e.g. non-string `contains`).
|
|
455
678
|
*/
|
|
456
679
|
function validatePredicate(raw, path) {
|
|
457
680
|
if (!isPlainObject(raw)) return;
|
|
@@ -474,6 +697,12 @@ function validatePredicate(raw, path) {
|
|
|
474
697
|
}
|
|
475
698
|
for (const [field, sub] of Object.entries(raw)) validatePredicate(sub, `${path}.${field}`);
|
|
476
699
|
}
|
|
700
|
+
/**
|
|
701
|
+
* Validate a leaf predicate operator's value shape at config load time.
|
|
702
|
+
*
|
|
703
|
+
* @throws {ConfigError} When the operator's value has the wrong type or `regex`
|
|
704
|
+
* is not a valid JavaScript regular expression.
|
|
705
|
+
*/
|
|
477
706
|
function validateLeafOperator(op, value, path) {
|
|
478
707
|
switch (op) {
|
|
479
708
|
case "equals": return;
|
|
@@ -501,27 +730,33 @@ function validateLeafOperator(op, value, path) {
|
|
|
501
730
|
default: return;
|
|
502
731
|
}
|
|
503
732
|
}
|
|
733
|
+
/**
 * Accept a tool pattern written as a string or as a `{ pattern }` object.
 *
 * @param value - Candidate tool pattern from parsed YAML.
 * @param path - Location used in the error message.
 * @returns The string itself, or a fresh `{ pattern }` object holding only the pattern.
 * @throws {ConfigError} When `value` is neither form.
 */
function requireToolPattern(value, path) {
	switch (typeof value) {
		case "string":
			return value;
		case "object":
			// Mappings only: null and arrays are rejected below.
			if (value !== null && !Array.isArray(value) && typeof value.pattern === "string") return { pattern: value.pattern };
			break;
	}
	throw new ConfigError(`expected string or {pattern: string}, got ${JSON.stringify(value)}`, path);
}
|
|
739
|
+
/**
 * Accept a list of tool patterns, given bare or wrapped as `{ tools: [...] }`.
 *
 * @param value - Candidate list from parsed YAML.
 * @param path - Location used in error messages; entries are reported as `${path}[i]`.
 * @returns Each entry passed through {@link requireToolPattern}, in order.
 * @throws {ConfigError} When `value` is neither form, or an entry is not a valid pattern.
 */
function requireToolPatternList(value, path) {
	let patterns = null;
	if (Array.isArray(value)) patterns = value;
	else if (isPlainObject(value) && Array.isArray(value.tools)) patterns = value.tools;
	if (patterns === null) throw new ConfigError(`expected array of tool patterns or {tools: [...]}, got ${JSON.stringify(value)}`, path);
	return patterns.map((entry, position) => requireToolPattern(entry, `${path}[${position}]`));
}
|
|
745
|
+
/**
 * Return `value` when it is a string.
 *
 * @param value - Candidate value from parsed YAML.
 * @param path - Location used in the error message.
 * @throws {ConfigError} When `value` is not a string.
 */
function requireString(value, path) {
	if (typeof value !== "string") throw new ConfigError(`expected string, got ${typeOf(value)}`, path);
	return value;
}
|
|
750
|
+
/**
 * Return `value` when it is a boolean.
 *
 * @param value - Candidate value from parsed YAML.
 * @param path - Location used in the error message.
 * @throws {ConfigError} When `value` is not a boolean.
 */
function requireBool(value, path) {
	if (typeof value !== "boolean") throw new ConfigError(`expected boolean, got ${typeOf(value)}`, path);
	return value;
}
|
|
755
|
+
/**
 * Report whether `x` is an object that is neither `null` nor an array.
 *
 * The prototype is not inspected, so class instances also count.
 */
function isPlainObject(x) {
	if (x === null || Array.isArray(x)) return false;
	return typeof x === "object";
}
|
|
759
|
+
/** Human-readable type name for config error messages. */
|
|
525
760
|
function typeOf(x) {
|
|
526
761
|
if (x === null) return "null";
|
|
527
762
|
if (Array.isArray(x)) return "array";
|
|
@@ -531,6 +766,9 @@ function typeOf(x) {
|
|
|
531
766
|
//#region src/config/grading-schema.ts
|
|
532
767
|
/**
|
|
533
768
|
* Zod schema for standalone grading YAML (`grading.yaml`).
|
|
769
|
+
*
|
|
770
|
+
* The top-level `judge` block reuses {@link ConfigPartialSchema} fields plus
|
|
771
|
+
* grader-specific concurrency and system-instruction overrides.
|
|
534
772
|
*/
|
|
535
773
|
/** Top-level `judge` block — mirrors harness config fields plus grader concurrency. */
|
|
536
774
|
const JudgeConfigSchema = ConfigPartialSchema.extend({
|
|
@@ -544,7 +782,11 @@ const GradingConfigSchema = z.object({ judge: JudgeConfigSchema });
|
|
|
544
782
|
//#region src/config/grading-loader.ts
|
|
545
783
|
/**
|
|
546
784
|
* Load standalone grading YAML for `harness-eval grade`.
|
|
785
|
+
*
|
|
786
|
+
* Grading config defines the judge subprocess (model, concurrency, Claude Code
|
|
787
|
+
* flags) separately from the suite under test.
|
|
547
788
|
*/
|
|
789
|
+
/** Load grading YAML from disk and resolve relative paths. */
|
|
548
790
|
async function loadGradingConfig(filePath) {
|
|
549
791
|
const absolutePath = resolve(filePath);
|
|
550
792
|
let content;
|
|
@@ -555,6 +797,11 @@ async function loadGradingConfig(filePath) {
|
|
|
555
797
|
}
|
|
556
798
|
return parseGradingConfig(content, absolutePath);
|
|
557
799
|
}
|
|
800
|
+
/**
|
|
801
|
+
* Parse grading YAML from a string.
|
|
802
|
+
*
|
|
803
|
+
* @param sourcePath Optional path for error messages and path resolution.
|
|
804
|
+
*/
|
|
558
805
|
function parseGradingConfig(yamlContent, sourcePath) {
|
|
559
806
|
let raw;
|
|
560
807
|
try {
|
|
@@ -568,6 +815,7 @@ function parseGradingConfig(yamlContent, sourcePath) {
|
|
|
568
815
|
if (sourcePath) resolveGradingConfigPaths(config, sourcePath);
|
|
569
816
|
return config;
|
|
570
817
|
}
|
|
818
|
+
/** Format a zod validation error with optional source file prefix. */
|
|
571
819
|
function formatZodError$1(err, sourcePath) {
|
|
572
820
|
return err.issues.map((issue) => {
|
|
573
821
|
const path = issue.path.length > 0 ? issue.path.join(".") : "(root)";
|
|
@@ -578,6 +826,19 @@ function formatZodError$1(err, sourcePath) {
|
|
|
578
826
|
//#region src/config/loader.ts
|
|
579
827
|
/**
|
|
580
828
|
* Load a `TestSuite` from a YAML file, directory, or string.
|
|
829
|
+
*
|
|
830
|
+
* Supports two on-disk layouts:
|
|
831
|
+
* - Single file: `suite.yaml` with inline `cases`.
|
|
832
|
+
* - Directory: `suite.yaml` plus optional `cases/*.yaml` fragments merged
|
|
833
|
+
* in lexicographic path order.
|
|
834
|
+
*
|
|
835
|
+
* Relative paths in config (MCP config, plugin dirs, etc.) are resolved
|
|
836
|
+
* against the suite file directory after load.
|
|
837
|
+
*/
|
|
838
|
+
/**
|
|
839
|
+
* Load a suite from a file path or directory path.
|
|
840
|
+
*
|
|
841
|
+
* @throws {@link ConfigError} when the path is unreadable or validation fails.
|
|
581
842
|
*/
|
|
582
843
|
async function loadSuite(filePath) {
|
|
583
844
|
const absolutePath = resolve(filePath);
|
|
@@ -590,6 +851,7 @@ async function loadSuite(filePath) {
|
|
|
590
851
|
if (info.isDirectory()) return loadSuiteDirectory(absolutePath);
|
|
591
852
|
return loadSuiteFile(absolutePath);
|
|
592
853
|
}
|
|
854
|
+
/** Load and parse a single-file suite (not a directory layout). */
|
|
593
855
|
async function loadSuiteFile(absolutePath) {
|
|
594
856
|
let content;
|
|
595
857
|
try {
|
|
@@ -599,6 +861,12 @@ async function loadSuiteFile(absolutePath) {
|
|
|
599
861
|
}
|
|
600
862
|
return parseSuite(content, absolutePath);
|
|
601
863
|
}
|
|
864
|
+
/**
|
|
865
|
+
* Load a directory suite: `suite.yaml` plus optional `cases/` YAML files.
|
|
866
|
+
*
|
|
867
|
+
* Cases from `suite.yaml` sort before external case files; within each file,
|
|
868
|
+
* array order is preserved.
|
|
869
|
+
*/
|
|
602
870
|
async function loadSuiteDirectory(dir) {
|
|
603
871
|
const suiteYamlPath = join(dir, "suite.yaml");
|
|
604
872
|
let content;
|
|
@@ -638,6 +906,11 @@ async function loadSuiteDirectory(dir) {
|
|
|
638
906
|
resolveSuitePaths(suite, suiteYamlPath);
|
|
639
907
|
return suite;
|
|
640
908
|
}
|
|
909
|
+
/**
|
|
910
|
+
* Parse suite YAML from a string (single-file layout with inline cases).
|
|
911
|
+
*
|
|
912
|
+
* @param sourcePath Optional path for error messages and relative path resolution.
|
|
913
|
+
*/
|
|
641
914
|
function parseSuite(yamlContent, sourcePath) {
|
|
642
915
|
let raw;
|
|
643
916
|
try {
|
|
@@ -651,6 +924,7 @@ function parseSuite(yamlContent, sourcePath) {
|
|
|
651
924
|
if (sourcePath) resolveSuitePaths(suite, resolve(sourcePath));
|
|
652
925
|
return suite;
|
|
653
926
|
}
|
|
927
|
+
/** Parse `suite.yaml` for directory layout (cases may be omitted). */
|
|
654
928
|
function parseSuiteDirectory(yamlContent, sourcePath) {
|
|
655
929
|
let raw;
|
|
656
930
|
try {
|
|
@@ -672,6 +946,11 @@ function parseCasesFile(yamlContent, sourcePath) {
|
|
|
672
946
|
}
|
|
673
947
|
return transformTestCases(extractRawCases(raw, sourcePath), sourcePath ?? "cases");
|
|
674
948
|
}
|
|
949
|
+
/**
|
|
950
|
+
* Normalize raw YAML into a list of {@link RawTestCase} objects.
|
|
951
|
+
*
|
|
952
|
+
* Accepts a single case, an array, or `{ cases: [...] }`.
|
|
953
|
+
*/
|
|
675
954
|
function extractRawCases(raw, sourcePath) {
|
|
676
955
|
if (Array.isArray(raw)) return raw.map((item, index) => validateRawCase(item, sourcePath, index));
|
|
677
956
|
if (raw && typeof raw === "object") {
|
|
@@ -681,11 +960,18 @@ function extractRawCases(raw, sourcePath) {
|
|
|
681
960
|
}
|
|
682
961
|
throw new ConfigError("expected a case object, array of cases, or { cases: [...] }", sourcePath);
|
|
683
962
|
}
|
|
963
|
+
/**
 * Validate one raw case object against {@link TestCaseSchema}.
 *
 * @param raw - Parsed YAML value expected to be a single test case.
 * @param sourcePath - File the case came from; used in error messages.
 * @param index - Position of the case within its list, when it came from one.
 * @returns The validated case data.
 * @throws {ConfigError} When validation fails. If `index` was given, the
 * message names it so the failing case can be found in a multi-case file.
 */
function validateRawCase(raw, sourcePath, index) {
	const validated = TestCaseSchema.safeParse(raw);
	if (validated.success) return validated.data;
	// Zod issue paths are relative to the case itself, so without the index
	// nothing in the message says which entry of the list is invalid.
	const location = index === undefined ? "" : ` (case index ${index})`;
	throw new ConfigError(`validation failed${location}:\n${formatZodError(validated.error, sourcePath)}`, sourcePath);
}
|
|
969
|
+
/**
|
|
970
|
+
* Recursively collect `.yaml` / `.yml` files under `casesDir`.
|
|
971
|
+
*
|
|
972
|
+
* Returns an empty list when the directory does not exist — external cases
|
|
973
|
+
* are optional in directory layout.
|
|
974
|
+
*/
|
|
689
975
|
async function collectCaseYamlFiles(casesDir) {
|
|
690
976
|
const files = [];
|
|
691
977
|
async function walk(dir) {
|
|
@@ -705,6 +991,7 @@ async function collectCaseYamlFiles(casesDir) {
|
|
|
705
991
|
await walk(casesDir);
|
|
706
992
|
return files.sort();
|
|
707
993
|
}
|
|
994
|
+
/** Format a zod validation error with optional source file prefix. */
|
|
708
995
|
function formatZodError(err, sourcePath) {
|
|
709
996
|
return err.issues.map((issue) => {
|
|
710
997
|
const path = issue.path.length > 0 ? issue.path.join(".") : "(root)";
|
|
@@ -714,4 +1001,4 @@ function formatZodError(err, sourcePath) {
|
|
|
714
1001
|
//#endregion
|
|
715
1002
|
export { parseGradingConfig as a, loadGradingConfig as i, parseCasesFile as n, ConfigError as o, parseSuite as r, loadSuite as t };
|
|
716
1003
|
|
|
717
|
-
//# sourceMappingURL=loader-
|
|
1004
|
+
//# sourceMappingURL=loader-DcI0KfRX.js.map
|