@alis-build/harness-eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +700 -0
- package/dist/adapters/claude-code/index.d.ts +3 -0
- package/dist/adapters/claude-code/index.js +2 -0
- package/dist/build-DsVJ_UeU.js +1396 -0
- package/dist/build-DsVJ_UeU.js.map +1 -0
- package/dist/cardinality-DlE44e-4.js +31 -0
- package/dist/cardinality-DlE44e-4.js.map +1 -0
- package/dist/claude-code-ycT0JQZF.js +563 -0
- package/dist/claude-code-ycT0JQZF.js.map +1 -0
- package/dist/cli/bin.d.ts +1 -0
- package/dist/cli/bin.js +623 -0
- package/dist/cli/bin.js.map +1 -0
- package/dist/config/loader.d.ts +2 -0
- package/dist/config/loader.js +2 -0
- package/dist/index-6Z17eKZx.d.ts +72 -0
- package/dist/index.d.ts +725 -0
- package/dist/index.js +5 -0
- package/dist/loader-BCnFJ8rm.js +717 -0
- package/dist/loader-BCnFJ8rm.js.map +1 -0
- package/dist/loader-DTvoVfN0.d.ts +33 -0
- package/dist/rolldown-runtime-D7D4PA-g.js +13 -0
- package/dist/runner/suite.d.ts +2 -0
- package/dist/runner/suite.js +2 -0
- package/dist/suite-BoOvK_lq.d.ts +7 -0
- package/dist/suite-chj0j22j.js +684 -0
- package/dist/suite-chj0j22j.js.map +1 -0
- package/dist/types-B9H4IZtA.d.ts +305 -0
- package/dist/types-BQol062t.d.ts +292 -0
- package/package.json +74 -0
- package/schemas/eval-interchange-agent-trace.schema.json +322 -0
- package/schemas/eval-interchange-proto-instance.schema.json +106 -0
- package/schemas/eval-interchange.schema.json +140 -0
- package/schemas/eval-run-envelope.schema.json +2195 -0
- package/schemas/trajectory-view.schema.json +441 -0
|
@@ -0,0 +1,717 @@
|
|
|
1
|
+
import { n as parseCardinality } from "./cardinality-DlE44e-4.js";
|
|
2
|
+
import { readFile, readdir, stat } from "node:fs/promises";
|
|
3
|
+
import { isAbsolute, join, relative, resolve } from "node:path";
|
|
4
|
+
import { parse } from "yaml";
|
|
5
|
+
import { z } from "zod";
|
|
6
|
+
//#region src/config/paths.ts
|
|
7
|
+
/**
|
|
8
|
+
* Resolve relative paths in suite config against the suite file directory.
|
|
9
|
+
*/
|
|
10
|
+
/**
 * Anchor a single config path to the suite directory.
 *
 * Absolute paths and paths starting with `~/` are returned untouched;
 * every other value is joined onto `suiteDir`.
 *
 * @param {string} value - Path as written in the config.
 * @param {string} suiteDir - Directory the path is relative to.
 * @returns {string} The anchored (or unchanged) path.
 */
function resolvePath(value, suiteDir) {
  const alreadyAnchored = isAbsolute(value) || value.startsWith("~/");
  return alreadyAnchored ? value : join(suiteDir, value);
}
|
|
14
|
+
/**
 * Return a shallow copy of a `claudeCode` block whose path-like fields are
 * resolved against `suiteDir`.
 *
 * - `mcpConfig` is resolved when it is a string.
 * - `pluginDirs` / `addDirs` have each string entry resolved; non-string
 *   entries are passed through unchanged.
 * - `systemPromptFile`, `appendSystemPromptFile`, `debugFile` and `settings`
 *   are resolved only when they are strings that do not start with `{`
 *   after trimming.
 *
 * The input block is not mutated.
 */
function resolveClaudeCodePaths(block, suiteDir) {
  const out = { ...block };
  const anchor = (entry) => (typeof entry === "string" ? resolvePath(entry, suiteDir) : entry);

  if (typeof out.mcpConfig === "string") {
    out.mcpConfig = resolvePath(out.mcpConfig, suiteDir);
  }

  for (const listField of ["pluginDirs", "addDirs"]) {
    if (Array.isArray(out[listField])) {
      out[listField] = out[listField].map((entry) => anchor(entry));
    }
  }

  for (const field of ["systemPromptFile", "appendSystemPromptFile", "debugFile", "settings"]) {
    const current = out[field];
    if (typeof current !== "string") continue;
    // Values that begin with "{" are left as-is rather than treated as paths.
    if (current.trim().startsWith("{")) continue;
    out[field] = resolvePath(current, suiteDir);
  }

  return out;
}
|
|
30
|
+
/**
 * Resolve relative paths in one config layer against `suiteDir`.
 *
 * Returns `undefined` for a falsy layer. Otherwise returns a shallow copy in
 * which a string `cwd` is resolved and a non-array object `claudeCode` block
 * is replaced by its path-resolved copy.
 */
function resolveConfigPaths(config, suiteDir) {
  if (!config) return void 0;

  const layer = { ...config };
  if (typeof layer.cwd === "string") {
    layer.cwd = resolvePath(layer.cwd, suiteDir);
  }

  const nested = layer.claudeCode;
  const isAdapterBlock = Boolean(nested) && typeof nested === "object" && !Array.isArray(nested);
  if (isAdapterBlock) {
    layer.claudeCode = resolveClaudeCodePaths(nested, suiteDir);
  }
  return layer;
}
|
|
38
|
+
/**
 * Resolve config paths across a whole loaded suite, in place.
 *
 * The base directory is derived from `suiteFilePath`. `defaultConfig`, every
 * matrix cell config and every test-case config are replaced by their
 * resolved copies; a matrix cell keeps its existing config when resolution
 * yields nothing.
 */
function resolveSuitePaths(suite, suiteFilePath) {
  const baseDir = configFileDir(suiteFilePath);
  const anchor = (layer) => resolveConfigPaths(layer, baseDir);

  suite.defaultConfig = anchor(suite.defaultConfig);
  for (const cell of suite.matrix) {
    cell.config = anchor(cell.config) ?? cell.config;
  }
  for (const testCase of suite.cases) {
    testCase.config = anchor(testCase.config);
  }
}
|
|
45
|
+
/**
 * Return the directory portion of a config file path.
 *
 * Both `/` and `\` are treated as separators. A path without any separator
 * maps to `"."`. A path whose only separator is the leading one (a file
 * directly under the root, e.g. `/suite.yaml`) maps to that root separator
 * instead of an empty string, so that later joins stay anchored at the root
 * rather than silently becoming relative to the process working directory.
 *
 * @param {string} filePath - Path to a config file.
 * @returns {string} Directory containing the file.
 */
function configFileDir(filePath) {
  if (!filePath.includes("/") && !filePath.includes("\\")) return ".";
  const dir = filePath.replace(/[/\\][^/\\]+$/, "");
  // An empty result means the stripped segment started at index 0: the file
  // lives directly under the root, so the root separator is its directory.
  return dir === "" ? filePath[0] : dir;
}
|
|
48
|
+
/**
 * Build a new env map in which path-looking values are resolved against
 * `baseDir`.
 *
 * A value is treated as a path when it starts with `./` or `../`, or when it
 * contains `/` and does not start with `http`. All other values are copied
 * unchanged. The input map is not mutated.
 */
function resolveEnvPaths(env, baseDir) {
  const looksLikePath = (text) =>
    text.startsWith("./") ||
    text.startsWith("../") ||
    (text.includes("/") && !text.startsWith("http"));

  const out = {};
  for (const [name, text] of Object.entries(env)) {
    out[name] = looksLikePath(text) ? resolvePath(text, baseDir) : text;
  }
  return out;
}
|
|
54
|
+
/**
 * Resolve relative paths inside a standalone grading config, in place.
 *
 * `adapter` and `maxConcurrent` are split off the judge block, the remaining
 * fields go through the regular config-layer resolution, and the judge block
 * is then rebuilt. When the rebuilt judge has an `env` map, its path-looking
 * values are resolved as well. Paths are relative to the directory of
 * `configFilePath`.
 */
function resolveGradingConfigPaths(config, configFilePath) {
  const baseDir = configFileDir(configFilePath);
  const { adapter, maxConcurrent, ...harnessFields } = config.judge;
  const anchored = resolveConfigPaths(harnessFields, baseDir) ?? harnessFields;

  config.judge = { ...anchored, adapter, maxConcurrent };

  const { judge } = config;
  if (judge.env) {
    judge.env = resolveEnvPaths(judge.env, baseDir);
  }
}
|
|
65
|
+
//#endregion
|
|
66
|
+
//#region src/config/schema.ts
|
|
67
|
+
/**
|
|
68
|
+
* zod schemas for the YAML on-disk shape.
|
|
69
|
+
*
|
|
70
|
+
* Config uses a nested layout: generic harness fields at the top level,
|
|
71
|
+
* adapter-specific options under a named key (e.g. `claudeCode`).
|
|
72
|
+
*/
|
|
73
|
+
/**
 * Claude Code adapter-specific options (nested under `claudeCode`).
 *
 * The object is wrapped in `.partial()`, so every key listed here is
 * optional; a key that is present must still match its declared type.
 */
const ClaudeCodeConfigSchema = z.object({
  binary: z.string(),
  pluginDirs: z.array(z.string()),
  mcpConfig: z.string(),
  // Closed set of accepted permission-mode names.
  permissionMode: z.enum([
    "default",
    "acceptEdits",
    "plan",
    "auto",
    "dontAsk",
    "bypassPermissions"
  ]),
  // Closed set of accepted effort levels.
  effort: z.enum([
    "low",
    "medium",
    "high",
    "xhigh",
    "max"
  ]),
  pluginUrls: z.array(z.string()),
  addDirs: z.array(z.string()),
  strictMcpConfig: z.boolean(),
  agent: z.string(),
  fallbackModel: z.string(),
  tools: z.string(),
  // Must be strictly greater than zero.
  maxBudgetUsd: z.number().positive(),
  settings: z.string(),
  settingSources: z.string(),
  systemPrompt: z.string(),
  systemPromptFile: z.string(),
  appendSystemPrompt: z.string(),
  appendSystemPromptFile: z.string(),
  // Accepts either a string or a boolean.
  debug: z.union([z.string(), z.boolean()]),
  debugFile: z.string(),
  includeHookEvents: z.boolean(),
  noSessionPersistence: z.boolean(),
  disableSlashCommands: z.boolean(),
  bare: z.boolean(),
  safeMode: z.boolean(),
  allowDangerouslySkipPermissions: z.boolean(),
  dangerouslySkipPermissions: z.boolean(),
  allowedTools: z.array(z.string()),
  disallowedTools: z.array(z.string()),
  // Must be a positive integer.
  maxTurns: z.number().int().positive(),
  isolateConfig: z.boolean()
}).partial();
|
|
120
|
+
/**
 * Generic + nested adapter config for one layer (defaultConfig, case, cell).
 *
 * Wrapped in `.partial()`, so every key is optional.
 */
const ConfigPartialSchema = z.object({
  model: z.string(),
  cwd: z.string(),
  // Must be a positive integer.
  timeoutMs: z.number().int().positive(),
  // String-to-string map.
  env: z.record(z.string(), z.string()),
  // Adapter-specific options, validated by ClaudeCodeConfigSchema.
  claudeCode: ClaudeCodeConfigSchema
}).partial();
|
|
128
|
+
/**
 * A matrix cell — one point in the configuration matrix.
 *
 * `label` must be a non-empty string and `config` is required (though all of
 * its own keys are optional); `axes` is an optional string-to-string map.
 */
const MatrixCellSchema = z.object({
  label: z.string().min(1),
  config: ConfigPartialSchema,
  axes: z.record(z.string(), z.string()).optional()
});
|
|
134
|
+
/**
 * Reference tool call in suite YAML.
 *
 * `tool_name` must be a non-empty string; `tool_input` is accepted as-is
 * without any shape validation.
 */
const ReferenceToolCallSchema = z.object({
  tool_name: z.string().min(1),
  tool_input: z.unknown()
});
|
|
139
|
+
/**
 * A test case.
 *
 * `id`, `prompt` and a non-empty `assertions` list are required; everything
 * else is optional.
 */
const TestCaseSchema = z.object({
  id: z.string().min(1),
  prompt: z.string().min(1),
  category: z.string().optional(),
  notes: z.string().optional(),
  // Each expectation must be a non-empty string.
  expectations: z.array(z.string().min(1)).optional(),
  reference_trajectory: z.array(ReferenceToolCallSchema).optional(),
  // String-to-number map.
  human_ratings: z.record(z.string(), z.number()).optional(),
  // Individual assertions are not shape-checked here (`z.unknown()`);
  // only the presence of at least one entry is enforced.
  assertions: z.array(z.unknown()).min(1),
  // Must be a positive integer when present.
  repetitions: z.number().int().positive().optional(),
  config: ConfigPartialSchema.optional()
});
|
|
152
|
+
/**
 * Top-level suite shape.
 *
 * Requires at least one matrix cell and at least one test case; `adapter`
 * and `defaultConfig` are optional.
 */
const TestSuiteSchema = z.object({
  adapter: z.string().optional(),
  defaultConfig: ConfigPartialSchema.optional(),
  matrix: z.array(MatrixCellSchema).min(1),
  cases: z.array(TestCaseSchema).min(1)
});
|
|
159
|
+
/**
 * Directory suite root (suite.yaml) — cases may live under cases/ as separate YAML files.
 *
 * Same keys as TestSuiteSchema, except that `cases` is optional here and has
 * no minimum length.
 */
const SuiteDirectorySchema = z.object({
  adapter: z.string().optional(),
  defaultConfig: ConfigPartialSchema.optional(),
  matrix: z.array(MatrixCellSchema).min(1),
  cases: z.array(TestCaseSchema).optional()
});
|
|
166
|
+
//#endregion
|
|
167
|
+
//#region src/config/transform.ts
|
|
168
|
+
/**
 * Error raised when a YAML suite cannot be validated or transformed.
 *
 * When a `path` is supplied, the message is prefixed with `[path] ` and the
 * path is also kept on the instance so callers can point at the offending
 * node in the config.
 */
var ConfigError = class extends Error {
  path;
  constructor(message, path) {
    const text = path ? `[${path}] ${message}` : message;
    super(text);
    this.name = "ConfigError";
    this.path = path;
  }
};
|
|
180
|
+
/**
 * Convert a zod-validated raw suite into the runtime `TestSuite` shape.
 * Delegates entirely to `transformSuiteParts`.
 */
function transformSuite(raw) {
  const suite = transformSuiteParts(raw);
  return suite;
}
|
|
184
|
+
/**
 * Convert a directory `suite.yaml` into the runtime shape.
 * A missing (null/undefined) `cases` list is treated as empty.
 */
function transformSuiteDirectory(raw) {
  const cases = raw.cases ?? [];
  return transformSuiteParts({ ...raw, cases });
}
|
|
191
|
+
/**
 * Convert a list of raw cases into runtime test cases.
 * Each case is given the error path `<pathPrefix>[<index>]`.
 */
function transformTestCases(raw, pathPrefix) {
  const toRuntime = (rawCase, position) => {
    const casePath = `${pathPrefix}[${position}]`;
    return transformTestCase(rawCase, casePath);
  };
  return raw.map(toRuntime);
}
|
|
195
|
+
/**
 * Shared worker for the suite transforms.
 *
 * `adapter` and `defaultConfig` are carried over as-is, every matrix cell
 * goes through `transformMatrixCell`, and every case goes through
 * `transformTestCase` with the error path `cases[<index>]`.
 */
function transformSuiteParts(raw) {
  const { adapter, defaultConfig } = raw;
  const matrix = raw.matrix.map((cell) => transformMatrixCell(cell));
  const cases = raw.cases.map((rawCase, position) => transformTestCase(rawCase, `cases[${position}]`));
  return { adapter, defaultConfig, matrix, cases };
}
|
|
203
|
+
/**
 * Copy the `label`, `config` and `axes` of a raw matrix cell into a fresh
 * object; any other keys on the input are dropped.
 */
function transformMatrixCell(raw) {
  const { label, config, axes } = raw;
  return { label, config, axes };
}
|
|
210
|
+
/**
 * Convert one raw test case into its runtime shape.
 *
 * Scalar and pass-through fields are copied unchanged; each entry of
 * `assertions` is converted by `transformThresholdedAssertion` using the
 * error path `<path>.assertions[<index>]`.
 */
function transformTestCase(raw, path) {
  const {
    id,
    prompt,
    category,
    notes,
    expectations,
    reference_trajectory,
    human_ratings,
    repetitions,
    config,
  } = raw;
  const assertions = raw.assertions.map((entry, position) =>
    transformThresholdedAssertion(entry, `${path}.assertions[${position}]`));
  return {
    id,
    prompt,
    category,
    notes,
    expectations,
    reference_trajectory,
    human_ratings,
    repetitions,
    config,
    assertions,
  };
}
|
|
224
|
+
/**
 * Keys that may appear alongside an assertion-type key. Not assertion types themselves.
 *
 * `transformAssertion` filters these out before looking for the single
 * assertion-type key of an assertion object.
 */
const SIBLING_KEYS = /* @__PURE__ */ new Set(["threshold"]);
|
|
226
|
+
/**
 * Convert one top-level assertion entry (assertion + optional `threshold`).
 *
 * @param {unknown} raw - Assertion object from the suite YAML.
 * @param {string} path - Error path of this assertion.
 * @returns {{assertion: object, threshold: number | undefined}}
 * @throws {ConfigError} When `raw` is not a plain object, when `threshold`
 *   is present but is not a number within [0, 1] (NaN included), or when the
 *   assertion itself is invalid.
 */
function transformThresholdedAssertion(raw, path) {
  if (!isPlainObject(raw)) throw new ConfigError(`expected object, got ${typeOf(raw)}`, path);
  const { threshold } = raw;
  if (threshold !== void 0) {
    // Written as a positive range test so that NaN (YAML `.nan`) is rejected:
    // NaN fails every comparison, so a `< 0 || > 1` check lets it through.
    const inRange = typeof threshold === "number" && threshold >= 0 && threshold <= 1;
    if (!inRange) {
      // JSON.stringify(NaN) is "null", which would be misleading in the message.
      const shown = Number.isNaN(threshold) ? "NaN" : JSON.stringify(threshold);
      throw new ConfigError(`threshold must be a number in [0, 1], got ${shown}`, `${path}.threshold`);
    }
  }
  return {
    assertion: transformAssertion(raw, path),
    threshold
  };
}
|
|
237
|
+
/**
 * Convert one assertion from its YAML shape to its runtime shape.
 *
 * The object must contain exactly one key that is not a sibling key (such as
 * `threshold`); that key names the assertion type and its value is handed to
 * the matching per-type transformer together with the path
 * `<path>.<typeKey>`. Zero type keys, more than one type key, or an unknown
 * type key raise a ConfigError.
 */
function transformAssertion(raw, path) {
  if (!isPlainObject(raw)) throw new ConfigError(`expected object, got ${typeOf(raw)}`, path);

  const allKeys = Object.keys(raw);
  const typeKeys = allKeys.filter((key) => !SIBLING_KEYS.has(key));
  if (typeKeys.length === 0) {
    throw new ConfigError(`no assertion type key found (got only sibling keys: ${allKeys.join(", ")})`, path);
  }
  if (typeKeys.length > 1) {
    throw new ConfigError(`multiple assertion type keys; pick one: ${typeKeys.join(", ")}`, path);
  }

  const [typeKey] = typeKeys;
  const value = raw[typeKey];
  const valuePath = `${path}.${typeKey}`;

  switch (typeKey) {
    case "called":
      return transformCalled(value, valuePath);
    case "not_called":
      return transformNotCalled(value, valuePath);
    case "called_any_of":
      return transformCalledAnyOf(value, valuePath);
    case "called_all_of":
      return transformCalledAllOf(value, valuePath);
    case "called_before":
      return transformCalledBefore(value, valuePath);
    case "sequence":
      return transformSequence(value, valuePath);
    case "called_with":
      return transformCalledWith(value, valuePath);
    case "responded_without_tool_calls":
      return transformRespondedWithoutToolCalls(value, valuePath);
    // The three budget-style assertions share one transformer, tagged with
    // their own type name.
    case "iterations_within":
    case "cost_within_usd":
    case "duration_within_ms":
      return transformScalarMax(value, valuePath, typeKey);
    case "finished_with":
      return transformFinishedWith(value, valuePath);
    // Both text-containment assertions share one transformer as well.
    case "response_contains":
    case "response_not_contains":
      return transformResponseText(value, valuePath, typeKey);
    case "response_matches":
      return transformResponseMatches(value, valuePath);
    case "all_of":
      return transformAllOf(value, valuePath);
    case "any_of":
      return transformAnyOf(value, valuePath);
    case "not":
      return transformNot(value, valuePath);
    default:
      throw new ConfigError(`unknown assertion type: ${typeKey}`, path);
  }
}
|
|
274
|
+
/**
 * Convert a `called` assertion.
 *
 * Accepts either a bare tool name (string shortcut) or an object with a
 * `tool` pattern and an optional `times` cardinality string. The cardinality
 * is parsed only to validate it; the original string is what gets stored.
 */
function transformCalled(value, path) {
  if (typeof value === "string") {
    return { type: "called", tool: value };
  }
  if (!isPlainObject(value)) {
    throw new ConfigError(`expected string or object, got ${typeOf(value)}`, path);
  }

  const tool = requireToolPattern(value.tool, `${path}.tool`);
  if (value.times === void 0) {
    return { type: "called", tool, times: void 0 };
  }

  const timesPath = `${path}.times`;
  const times = requireString(value.times, timesPath);
  try {
    parseCardinality(times);
  } catch (err) {
    const reason = err instanceof Error ? err.message : `invalid cardinality: ${times}`;
    throw new ConfigError(reason, timesPath);
  }
  return { type: "called", tool, times };
}
|
|
296
|
+
/**
 * Convert a `not_called` assertion.
 * Accepts a bare tool name or an object carrying a `tool` pattern.
 */
function transformNotCalled(value, path) {
  let tool;
  if (typeof value === "string") {
    tool = value;
  } else if (isPlainObject(value)) {
    tool = requireToolPattern(value.tool, `${path}.tool`);
  } else {
    throw new ConfigError(`expected string or object, got ${typeOf(value)}`, path);
  }
  return { type: "not_called", tool };
}
|
|
307
|
+
/**
 * Convert a `called_any_of` assertion; the value must be a list of tool
 * patterns in one of the forms accepted by `requireToolPatternList`.
 */
function transformCalledAnyOf(value, path) {
  const tools = requireToolPatternList(value, path);
  return { type: "called_any_of", tools };
}
|
|
313
|
+
/**
 * Convert a `called_all_of` assertion; the value must be a list of tool
 * patterns in one of the forms accepted by `requireToolPatternList`.
 */
function transformCalledAllOf(value, path) {
  const tools = requireToolPatternList(value, path);
  return { type: "called_all_of", tools };
}
|
|
319
|
+
/**
 * Convert a `called_before` assertion.
 * The value must be an object whose `first` and `then` are tool patterns.
 */
function transformCalledBefore(value, path) {
  if (!isPlainObject(value)) {
    throw new ConfigError(`expected object with {first, then}, got ${typeOf(value)}`, path);
  }
  const first = requireToolPattern(value.first, `${path}.first`);
  const then = requireToolPattern(value.then, `${path}.then`);
  return { type: "called_before", first, then };
}
|
|
327
|
+
/**
 * Convert a `sequence` assertion.
 *
 * Array shortcut: every element must be a tool pattern; the result carries
 * no `strict` key. Object form: `tools` is a tool-pattern list and `strict`,
 * when given, must be a boolean.
 */
function transformSequence(value, path) {
  if (Array.isArray(value)) {
    const tools = value.map((entry, position) => requireToolPattern(entry, `${path}[${position}]`));
    return { type: "sequence", tools };
  }
  if (!isPlainObject(value)) {
    throw new ConfigError(`expected array or object, got ${typeOf(value)}`, path);
  }

  const tools = requireToolPatternList(value.tools, `${path}.tools`);
  let strict;
  if (value.strict !== void 0) {
    strict = requireBool(value.strict, `${path}.strict`);
  }
  return { type: "sequence", tools, strict };
}
|
|
339
|
+
/**
 * Convert a `called_with` assertion.
 *
 * The value must be an object with a `tool` pattern and a required `args`
 * predicate. The predicate is validated and then stored unchanged.
 */
function transformCalledWith(value, path) {
  if (!isPlainObject(value)) {
    throw new ConfigError(`expected object with {tool, args}, got ${typeOf(value)}`, path);
  }
  const tool = requireToolPattern(value.tool, `${path}.tool`);

  const argsPath = `${path}.args`;
  const { args } = value;
  if (args === void 0) {
    throw new ConfigError(`missing required field 'args'`, argsPath);
  }
  validatePredicate(args, argsPath);

  return { type: "called_with", tool, args };
}
|
|
350
|
+
/**
 * Convert a `responded_without_tool_calls` assertion.
 * Accepts `true`, `null`, or an empty object; anything else is rejected.
 */
function transformRespondedWithoutToolCalls(value, path) {
  const isEmptyObject = () => isPlainObject(value) && Object.keys(value).length === 0;
  const accepted = value === true || value === null || isEmptyObject();
  if (!accepted) {
    throw new ConfigError(`expected true or empty object, got ${JSON.stringify(value)}`, path);
  }
  return { type: "responded_without_tool_calls" };
}
|
|
354
|
+
/**
 * Convert an upper-bound assertion (`iterations_within`, `cost_within_usd`,
 * `duration_within_ms`).
 *
 * @param {unknown} value - Either a number or an object `{max: number}`.
 * @param {string} path - Error path of the assertion value.
 * @param {string} type - Assertion type name to tag the result with.
 * @returns {{type: string, max: number}}
 * @throws {ConfigError} When the value has neither accepted shape, or when
 *   the limit is not strictly positive (NaN included).
 */
function transformScalarMax(value, path, type) {
  let max;
  if (typeof value === "number") max = value;
  else if (isPlainObject(value) && typeof value.max === "number") max = value.max;
  else throw new ConfigError(`expected number or {max: number}, got ${JSON.stringify(value)}`, path);
  // `!(max > 0)` instead of `max <= 0` so that NaN (YAML `.nan`) is rejected
  // too; NaN fails every comparison and would otherwise be stored as a limit
  // that no measurement can satisfy.
  if (!(max > 0)) throw new ConfigError(`max must be positive, got ${max}`, path);
  return {
    type,
    max
  };
}
|
|
365
|
+
/**
 * Convert a `finished_with` assertion.
 *
 * Accepted shapes: a single string, an array of strings, or an object whose
 * `reasons` is one of those two. A single string is stored as a string (not
 * wrapped in an array). Array elements are checked one by one.
 */
function transformFinishedWith(value, path) {
  const wrapped = isPlainObject(value);
  const reasons = wrapped ? value.reasons : value;
  const reasonsPath = wrapped ? `${path}.reasons` : path;

  if (typeof reasons === "string") {
    return { type: "finished_with", reasons };
  }
  if (Array.isArray(reasons)) {
    const checked = reasons.map((entry, position) => requireString(entry, `${reasonsPath}[${position}]`));
    return { type: "finished_with", reasons: checked };
  }
  throw new ConfigError(`expected string, string[], or {reasons: ...}, got ${JSON.stringify(value)}`, path);
}
|
|
387
|
+
/**
 * Convert a text-containment assertion (`response_contains` /
 * `response_not_contains`).
 * Accepts a bare string or an object with a string `text`.
 */
function transformResponseText(value, path, type) {
  let text;
  if (typeof value === "string") {
    text = value;
  } else if (isPlainObject(value) && typeof value.text === "string") {
    text = value.text;
  } else {
    throw new ConfigError(`expected string or {text: string}, got ${JSON.stringify(value)}`, path);
  }
  return { type, text };
}
|
|
398
|
+
/**
 * Convert a `response_matches` assertion.
 *
 * @param {unknown} value - Object with a string `pattern` and optional
 *   string `flags`.
 * @param {string} path - Error path of the assertion value.
 * @returns {{type: "response_matches", pattern: string, flags: string | undefined}}
 * @throws {ConfigError} When the value is not an object, when `pattern` or
 *   `flags` is not a string, or when the pattern/flags combination does not
 *   compile as a regular expression.
 */
function transformResponseMatches(value, path) {
  if (!isPlainObject(value)) throw new ConfigError(`expected object with {pattern, flags?}, got ${typeOf(value)}`, path);
  const pattern = requireString(value.pattern, `${path}.pattern`);
  const flags = value.flags === void 0 ? void 0 : requireString(value.flags, `${path}.flags`);
  // Compile once at load time, as the `regex` predicate operator does, so a
  // typo in the pattern or flags is reported against the config node instead
  // of surfacing later.
  // NOTE(review): assumes the runtime evaluates this via `new RegExp(pattern, flags)` — confirm.
  try {
    new RegExp(pattern, flags);
  } catch (err) {
    throw new ConfigError(`invalid regex: ${err instanceof Error ? err.message : String(err)}`, path);
  }
  return {
    type: "response_matches",
    pattern,
    flags
  };
}
|
|
406
|
+
/**
 * Convert an `all_of` compound assertion; child assertions are converted by
 * `transformCompoundList`.
 */
function transformAllOf(value, path) {
  const assertions = transformCompoundList(value, path);
  return { type: "all_of", assertions };
}
|
|
412
|
+
/**
 * Convert an `any_of` compound assertion; child assertions are converted by
 * `transformCompoundList`.
 */
function transformAnyOf(value, path) {
  const assertions = transformCompoundList(value, path);
  return { type: "any_of", assertions };
}
|
|
418
|
+
/**
 * Convert a `not` assertion by converting its single child assertion under
 * the same path.
 */
function transformNot(value, path) {
  const assertion = transformAssertion(value, path);
  return { type: "not", assertion };
}
|
|
424
|
+
/**
 * Convert the children of a compound assertion (`all_of` / `any_of`).
 *
 * @param {unknown} value - Either an array of assertions or an object
 *   `{assertions: [...]}`.
 * @param {string} path - Error path of the compound assertion value.
 * @returns {object[]} The converted child assertions, in input order.
 * @throws {ConfigError} When the value has neither accepted shape, or when a
 *   child assertion is invalid.
 */
function transformCompoundList(value, path) {
  let list = null;
  let listPath = path;
  if (Array.isArray(value)) {
    list = value;
  } else if (isPlainObject(value) && Array.isArray(value.assertions)) {
    list = value.assertions;
    // Keep the error trail accurate for the wrapped form: children live
    // under `.assertions`, not directly under the compound key.
    listPath = `${path}.assertions`;
  }
  if (list === null) throw new ConfigError(`expected array or {assertions: [...]}, got ${JSON.stringify(value)}`, path);
  return list.map((a, i) => transformAssertion(a, `${listPath}[${i}]`));
}
|
|
429
|
+
/**
 * Operator names that terminate a predicate. `validatePredicate` treats a
 * single-key object whose key is in this set as a leaf and hands its operand
 * to `validateLeafOperator`.
 */
const LEAF_OPS = /* @__PURE__ */ new Set([
  "equals",
  "contains",
  "not_contains",
  "regex",
  "gte",
  "lte",
  "gt",
  "lt",
  "one_of"
]);
|
|
440
|
+
/**
 * Operator names that combine sub-predicates. In `validatePredicate`,
 * `any_of` and `all_of` must hold an array of sub-predicates, while `not`
 * holds a single sub-predicate.
 */
const COMPOUND_OPS = /* @__PURE__ */ new Set([
  "any_of",
  "all_of",
  "not"
]);
|
|
445
|
+
/**
 * Check that a predicate is well-formed, throwing ConfigError otherwise.
 *
 * The runtime engine tolerates malformed predicates (it just returns false),
 * but the loader is deliberately strict because a malformed predicate is
 * almost always a typo.
 *
 * Accepted shapes:
 *   - anything that is not a plain object (scalars are treated as
 *     `{equals: scalar}` at runtime) — accepted without further checks
 *   - a single-key object whose key is a leaf operator, e.g. `{contains: "x"}`
 *   - a single-key compound: `{any_of: [...]}`, `{all_of: [...]}`, `{not: ...}`
 *   - any other object: every field value is validated as a sub-predicate
 */
function validatePredicate(raw, path) {
  if (!isPlainObject(raw)) return;

  const entries = Object.entries(raw);
  if (entries.length === 1) {
    const [[op, operand]] = entries;
    const opPath = `${path}.${op}`;

    if (LEAF_OPS.has(op)) {
      validateLeafOperator(op, operand, opPath);
      return;
    }

    if (COMPOUND_OPS.has(op)) {
      if (op === "not") {
        validatePredicate(operand, opPath);
        return;
      }
      if (!Array.isArray(operand)) {
        throw new ConfigError(`${op} must be an array, got ${typeOf(operand)}`, opPath);
      }
      for (const [position, sub] of operand.entries()) {
        validatePredicate(sub, `${opPath}[${position}]`);
      }
      return;
    }
  }

  // Not an operator object: treat each field as a nested predicate.
  for (const [field, sub] of entries) {
    validatePredicate(sub, `${path}.${field}`);
  }
}
|
|
477
|
+
/**
 * Check the operand of one leaf operator, throwing ConfigError on mismatch.
 *
 *   - `contains` / `not_contains`: operand must be a string
 *   - `regex`: operand must be a string that compiles as a RegExp
 *   - `gte` / `lte` / `gt` / `lt`: operand must be a number
 *   - `one_of`: operand must be an array
 *   - `equals` and any unrecognised operator: no constraint
 */
function validateLeafOperator(op, value, path) {
  if (op === "contains" || op === "not_contains") {
    if (typeof value !== "string") throw new ConfigError(`${op} requires a string`, path);
    return;
  }

  if (op === "regex") {
    if (typeof value !== "string") throw new ConfigError("regex requires a string", path);
    try {
      new RegExp(value);
    } catch {
      throw new ConfigError(`invalid regex: ${value}`, path);
    }
    return;
  }

  if (op === "gte" || op === "lte" || op === "gt" || op === "lt") {
    if (typeof value !== "number") throw new ConfigError(`${op} requires a number`, path);
    return;
  }

  if (op === "one_of" && !Array.isArray(value)) {
    throw new ConfigError("one_of requires an array", path);
  }
}
|
|
504
|
+
/**
 * Accept a tool pattern in either of its two forms.
 *
 * A string is returned as-is. An object with a string `pattern` is reduced
 * to a fresh `{pattern}` object (other keys are dropped). Anything else
 * raises a ConfigError.
 */
function requireToolPattern(value, path) {
  if (typeof value === "string") return value;

  const hasPattern = isPlainObject(value) && typeof value.pattern === "string";
  if (!hasPattern) {
    throw new ConfigError(`expected string or {pattern: string}, got ${JSON.stringify(value)}`, path);
  }
  const { pattern } = value;
  return { pattern };
}
|
|
509
|
+
/**
 * Accept a list of tool patterns.
 *
 * @param {unknown} value - Either an array of tool patterns or an object
 *   `{tools: [...]}`.
 * @param {string} path - Error path of the list value.
 * @returns {Array<string | {pattern: string}>} The checked patterns, in
 *   input order.
 * @throws {ConfigError} When the value has neither accepted shape, or when
 *   an element is not a valid tool pattern.
 */
function requireToolPatternList(value, path) {
  let list = null;
  let listPath = path;
  if (Array.isArray(value)) {
    list = value;
  } else if (isPlainObject(value) && Array.isArray(value.tools)) {
    list = value.tools;
    // Keep the error trail accurate for the wrapped form: elements live
    // under `.tools`, not directly under the parent key.
    listPath = `${path}.tools`;
  }
  if (list === null) throw new ConfigError(`expected array of tool patterns or {tools: [...]}, got ${JSON.stringify(value)}`, path);
  return list.map((v, i) => requireToolPattern(v, `${listPath}[${i}]`));
}
|
|
514
|
+
/**
 * Return `value` when it is a string; otherwise raise a ConfigError naming
 * the actual type.
 */
function requireString(value, path) {
  if (typeof value !== "string") {
    throw new ConfigError(`expected string, got ${typeOf(value)}`, path);
  }
  return value;
}
|
|
518
|
+
/**
 * Return `value` when it is a boolean; otherwise raise a ConfigError naming
 * the actual type.
 */
function requireBool(value, path) {
  if (typeof value !== "boolean") {
    throw new ConfigError(`expected boolean, got ${typeOf(value)}`, path);
  }
  return value;
}
|
|
522
|
+
/**
 * True for any non-null, non-array value whose `typeof` is `"object"`.
 */
function isPlainObject(x) {
  if (x === null || Array.isArray(x)) return false;
  return typeof x === "object";
}
|
|
525
|
+
/**
 * `typeof` with two refinements for error messages: `null` reports as
 * `"null"` and arrays report as `"array"`.
 */
function typeOf(x) {
  return x === null ? "null" : Array.isArray(x) ? "array" : typeof x;
}
|
|
530
|
+
//#endregion
|
|
531
|
+
//#region src/config/grading-schema.ts
|
|
532
|
+
/**
|
|
533
|
+
* Zod schema for standalone grading YAML (`grading.yaml`).
|
|
534
|
+
*/
|
|
535
|
+
/**
 * Top-level `judge` block — mirrors harness config fields plus grader concurrency.
 *
 * Built by extending ConfigPartialSchema, so all generic config keys remain
 * available; the three keys added here are optional as well.
 */
const JudgeConfigSchema = ConfigPartialSchema.extend({
  adapter: z.string().optional(),
  // Must be a positive integer when present.
  maxConcurrent: z.number().int().positive().optional(),
  /** Optional judge prompt prefix (maps to upstream system_instruction). */
  system_instruction: z.string().optional()
});
|
|
542
|
+
const GradingConfigSchema = z.object({ judge: JudgeConfigSchema });
|
|
543
|
+
//#endregion
|
|
544
|
+
//#region src/config/grading-loader.ts
|
|
545
|
+
/**
 * Load standalone grading YAML for `harness-eval grade`.
 *
 * The file is read as UTF-8 from the resolved absolute path and handed to
 * `parseGradingConfig`. A read failure is reported as a ConfigError that
 * carries the path exactly as the caller passed it.
 */
async function loadGradingConfig(filePath) {
  const absolutePath = resolve(filePath);
  const yamlText = await readFile(absolutePath, "utf8").catch((err) => {
    const reason = err instanceof Error ? err.message : String(err);
    throw new ConfigError(`failed to read grading config: ${reason}`, filePath);
  });
  return parseGradingConfig(yamlText, absolutePath);
}
|
|
558
|
+
/**
 * Parse and validate grading YAML text.
 *
 * YAML syntax errors and schema violations are both reported as ConfigError.
 * The validated judge block is shallow-copied into the returned config, and
 * when a `sourcePath` is given its relative paths are resolved against that
 * file's directory.
 */
function parseGradingConfig(yamlContent, sourcePath) {
  let document;
  try {
    document = parse(yamlContent);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new ConfigError(`YAML parse error: ${reason}`, sourcePath);
  }

  const result = GradingConfigSchema.safeParse(document);
  if (!result.success) {
    const details = formatZodError$1(result.error, sourcePath);
    throw new ConfigError(`validation failed:\n${details}`, sourcePath);
  }

  const config = { judge: { ...result.data.judge } };
  if (sourcePath) {
    resolveGradingConfigPaths(config, sourcePath);
  }
  return config;
}
|
|
571
|
+
/**
 * Render zod issues as one line per issue, joined with newlines.
 *
 * Each line shows the dotted issue path (or `(root)` when the path is
 * empty), prefixed with `<sourcePath> → ` when a source path is given, and
 * followed by the issue message.
 */
function formatZodError$1(err, sourcePath) {
  const lines = [];
  for (const issue of err.issues) {
    const path = issue.path.length > 0 ? issue.path.join(".") : "(root)";
    const location = sourcePath ? `${sourcePath} → ${path}` : path;
    lines.push(` ${location}: ${issue.message}`);
  }
  return lines.join("\n");
}
|
|
577
|
+
//#endregion
|
|
578
|
+
//#region src/config/loader.ts
|
|
579
|
+
/**
 * Load a `TestSuite` from a path that is either a YAML file or a suite
 * directory.
 *
 * The path is resolved to an absolute path and stat'ed; directories go
 * through `loadSuiteDirectory`, everything else through `loadSuiteFile`. A
 * failing stat is reported as a ConfigError carrying the path as passed in.
 */
async function loadSuite(filePath) {
  const absolutePath = resolve(filePath);
  const info = await stat(absolutePath).catch((err) => {
    const reason = err instanceof Error ? err.message : String(err);
    throw new ConfigError(`failed to read suite path: ${reason}`, filePath);
  });
  const loader = info.isDirectory() ? loadSuiteDirectory : loadSuiteFile;
  return loader(absolutePath);
}
|
|
593
|
+
/**
 * Read a single-file suite as UTF-8 and parse it with `parseSuite`.
 * A read failure is reported as a ConfigError carrying the file path.
 */
async function loadSuiteFile(absolutePath) {
  const yamlText = await readFile(absolutePath, "utf8").catch((err) => {
    const reason = err instanceof Error ? err.message : String(err);
    throw new ConfigError(`failed to read suite file: ${reason}`, absolutePath);
  });
  return parseSuite(yamlText, absolutePath);
}
|
|
602
|
+
/**
 * Load a suite laid out as a directory: `suite.yaml` at the root plus any
 * case files discovered under `cases/`.
 *
 * Cases declared inline in `suite.yaml` and cases from the case files are
 * merged, then ordered by their source path (`suite.yaml` for inline cases,
 * the path relative to `cases/` otherwise) and by their position within that
 * source. Config paths are resolved against the directory of `suite.yaml`.
 *
 * @param {string} dir - Suite directory.
 * @returns {Promise<object>} The runtime suite.
 * @throws {ConfigError} When `suite.yaml` or a case file cannot be read or
 *   is invalid, or when the directory ends up with no test cases.
 */
async function loadSuiteDirectory(dir) {
  const suiteYamlPath = join(dir, "suite.yaml");
  let content;
  try {
    content = await readFile(suiteYamlPath, "utf8");
  } catch (err) {
    throw new ConfigError(`missing suite.yaml in suite directory: ${err instanceof Error ? err.message : String(err)}`, dir);
  }
  const base = parseSuiteDirectory(content, suiteYamlPath);
  const casesDir = join(dir, "cases");
  const caseFiles = await collectCaseYamlFiles(casesDir);
  // Tag every case with where it came from so the merged list can be put in
  // a stable order below.
  const tagged = base.cases.map((testCase, index) => ({
    relPath: "suite.yaml",
    index,
    testCase
  }));
  for (const filePath of caseFiles) {
    let caseYaml;
    try {
      caseYaml = await readFile(filePath, "utf8");
    } catch (err) {
      // Report unreadable case files the same way as every other config
      // read in this module, instead of leaking a raw filesystem error.
      throw new ConfigError(`failed to read case file: ${err instanceof Error ? err.message : String(err)}`, filePath);
    }
    const cases = parseCasesFile(caseYaml, filePath);
    const relPath = relative(casesDir, filePath);
    for (const [index, testCase] of cases.entries()) tagged.push({
      relPath,
      index,
      testCase
    });
  }
  tagged.sort((a, b) => {
    const pathCmp = a.relPath.localeCompare(b.relPath);
    if (pathCmp !== 0) return pathCmp;
    return a.index - b.index;
  });
  const cases = tagged.map((entry) => entry.testCase);
  if (cases.length === 0) throw new ConfigError("suite directory has no test cases", dir);
  const suite = {
    ...base,
    cases
  };
  resolveSuitePaths(suite, suiteYamlPath);
  return suite;
}
|
|
641
|
+
/**
 * Parse and validate the YAML text of a single-file suite.
 *
 * @param {string} yamlContent - Raw YAML text.
 * @param {string} [sourcePath] - Origin of the text. Used in error messages
 *   and, when truthy, resolved and passed to `resolveSuitePaths`.
 * @returns {*} The result of `transformSuite` on the validated data.
 * @throws {ConfigError} On YAML syntax errors or when `TestSuiteSchema`
 *   rejects the parsed document.
 */
function parseSuite(yamlContent, sourcePath) {
  let document;
  try {
    document = parse(yamlContent);
  } catch (cause) {
    const reason = cause instanceof Error ? cause.message : String(cause);
    throw new ConfigError(`YAML parse error: ${reason}`, sourcePath);
  }

  const result = TestSuiteSchema.safeParse(document);
  if (result.success) {
    const suite = transformSuite(result.data);
    // Without a source path, resolveSuitePaths is skipped entirely.
    if (sourcePath) resolveSuitePaths(suite, resolve(sourcePath));
    return suite;
  }

  const details = formatZodError(result.error, sourcePath);
  throw new ConfigError(`validation failed:\n${details}`, sourcePath);
}
|
|
654
|
+
/**
 * Parse and validate the YAML text of a directory-style `suite.yaml`.
 *
 * Unlike `parseSuite`, validation uses `SuiteDirectorySchema` and this
 * function does not call `resolveSuitePaths`.
 *
 * @param {string} yamlContent - Raw YAML text.
 * @param {string} [sourcePath] - Origin of the text, used in error messages.
 * @returns {*} The result of `transformSuiteDirectory` on the validated data.
 * @throws {ConfigError} On YAML syntax errors or schema validation failure.
 */
function parseSuiteDirectory(yamlContent, sourcePath) {
  let document;
  try {
    document = parse(yamlContent);
  } catch (cause) {
    const reason = cause instanceof Error ? cause.message : String(cause);
    throw new ConfigError(`YAML parse error: ${reason}`, sourcePath);
  }

  const result = SuiteDirectorySchema.safeParse(document);
  if (result.success) return transformSuiteDirectory(result.data);

  const details = formatZodError(result.error, sourcePath);
  throw new ConfigError(`validation failed:\n${details}`, sourcePath);
}
|
|
665
|
+
/**
 * Parse the YAML text of one case file.
 *
 * Accepted document shapes (as enforced by `extractRawCases`): a single case
 * object, an array of cases, or `{ cases: [...] }`.
 *
 * @param {string} yamlContent - Raw YAML text.
 * @param {string} [sourcePath] - Origin of the text, used in error messages.
 *   When null/undefined, the literal "cases" is passed to `transformTestCases`.
 * @returns {*} The result of `transformTestCases` on the validated cases.
 * @throws {ConfigError} On YAML syntax errors, or from `extractRawCases`.
 */
function parseCasesFile(yamlContent, sourcePath) {
  let document;
  try {
    document = parse(yamlContent);
  } catch (cause) {
    const reason = cause instanceof Error ? cause.message : String(cause);
    throw new ConfigError(`YAML parse error: ${reason}`, sourcePath);
  }

  const rawCases = extractRawCases(document, sourcePath);
  const label = sourcePath ?? "cases";
  return transformTestCases(rawCases, label);
}
|
|
675
|
+
/**
 * Normalize a parsed case document into an array of validated raw cases.
 *
 * Recognized shapes, checked in this order:
 *   1. an array: every element is validated as a case;
 *   2. an object whose `cases` property is an array: every element is validated;
 *   3. an object that has `id`, `prompt` and `assertions` keys: validated as
 *      one case and returned as a one-element array.
 *
 * @param {*} raw - The parsed YAML document.
 * @param {string} [sourcePath] - Origin of the document, used in error messages.
 * @returns {Array<*>} Validated raw cases.
 * @throws {ConfigError} When the document matches none of the shapes above,
 *   or when `validateRawCase` rejects an entry.
 */
function extractRawCases(raw, sourcePath) {
  const validateEach = (items) => items.map((item, index) => validateRawCase(item, sourcePath, index));

  if (Array.isArray(raw)) return validateEach(raw);

  const isObject = Boolean(raw) && typeof raw === "object";
  if (isObject) {
    if (Array.isArray(raw.cases)) return validateEach(raw.cases);

    const looksLikeSingleCase = ["id", "prompt", "assertions"].every((key) => key in raw);
    if (looksLikeSingleCase) return [validateRawCase(raw, sourcePath, 0)];
  }

  throw new ConfigError("expected a case object, array of cases, or { cases: [...] }", sourcePath);
}
|
|
684
|
+
/**
 * Validate one raw case against `TestCaseSchema`.
 *
 * @param {*} raw - Candidate case value.
 * @param {string} [sourcePath] - Origin of the case, used in error messages.
 * @param {number} index - Position of the case within its file. Accepted for
 *   callers but not currently read by this function.
 * @returns {*} The schema's parsed data for the case.
 * @throws {ConfigError} When the schema rejects the value.
 */
function validateRawCase(raw, sourcePath, index) {
  const outcome = TestCaseSchema.safeParse(raw);
  if (outcome.success) return outcome.data;

  const details = formatZodError(outcome.error, sourcePath);
  throw new ConfigError(`validation failed:\n${details}`, sourcePath);
}
|
|
689
|
+
/**
 * Recursively gather every `.yaml` / `.yml` file beneath a directory.
 *
 * A directory that does not exist (ENOENT) contributes no files instead of
 * failing; any other readdir error is rethrown unchanged.
 *
 * @param {string} casesDir - Root directory to scan.
 * @returns {Promise<string[]>} Full paths of matching files, sorted with the
 *   default `Array.prototype.sort` ordering.
 */
async function collectCaseYamlFiles(casesDir) {
  const found = [];
  const hasYamlExtension = (name) => name.endsWith(".yaml") || name.endsWith(".yml");

  const visit = async (current) => {
    let listing;
    try {
      listing = await readdir(current, { withFileTypes: true });
    } catch (cause) {
      const isMissing = cause instanceof Error && "code" in cause && cause.code === "ENOENT";
      if (!isMissing) throw cause;
      return;
    }

    for (const item of listing) {
      const itemPath = join(current, item.name);
      if (item.isDirectory()) {
        await visit(itemPath);
        continue;
      }
      if (item.isFile() && hasYamlExtension(item.name)) found.push(itemPath);
    }
  };

  await visit(casesDir);
  found.sort();
  return found;
}
|
|
708
|
+
/**
 * Render the issues of a validation error as one line per issue.
 *
 * Each line has the form ` <location>: <message>`, where the location is the
 * issue path joined with "." (or "(root)" for an empty path), prefixed with
 * `<sourcePath> → ` when a source path is given.
 *
 * @param {{ issues: Array<{ path: Array<string|number>, message: string }> }} err
 *   Error object exposing an `issues` array.
 * @param {string} [sourcePath] - Origin of the validated document.
 * @returns {string} The issue lines joined with "\n" (empty string when
 *   there are no issues).
 */
function formatZodError(err, sourcePath) {
  const lines = [];
  for (const issue of err.issues) {
    const location = issue.path.length > 0 ? issue.path.join(".") : "(root)";
    const where = sourcePath ? `${sourcePath} → ${location}` : location;
    lines.push(` ${where}: ${issue.message}`);
  }
  return lines.join("\n");
}
|
|
714
|
+
//#endregion
|
|
715
|
+
export { parseGradingConfig as a, loadGradingConfig as i, parseCasesFile as n, ConfigError as o, parseSuite as r, loadSuite as t };
|
|
716
|
+
|
|
717
|
+
//# sourceMappingURL=loader-BCnFJ8rm.js.map
|