@alis-build/harness-eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +700 -0
- package/dist/adapters/claude-code/index.d.ts +3 -0
- package/dist/adapters/claude-code/index.js +2 -0
- package/dist/build-DsVJ_UeU.js +1396 -0
- package/dist/build-DsVJ_UeU.js.map +1 -0
- package/dist/cardinality-DlE44e-4.js +31 -0
- package/dist/cardinality-DlE44e-4.js.map +1 -0
- package/dist/claude-code-ycT0JQZF.js +563 -0
- package/dist/claude-code-ycT0JQZF.js.map +1 -0
- package/dist/cli/bin.d.ts +1 -0
- package/dist/cli/bin.js +623 -0
- package/dist/cli/bin.js.map +1 -0
- package/dist/config/loader.d.ts +2 -0
- package/dist/config/loader.js +2 -0
- package/dist/index-6Z17eKZx.d.ts +72 -0
- package/dist/index.d.ts +725 -0
- package/dist/index.js +5 -0
- package/dist/loader-BCnFJ8rm.js +717 -0
- package/dist/loader-BCnFJ8rm.js.map +1 -0
- package/dist/loader-DTvoVfN0.d.ts +33 -0
- package/dist/rolldown-runtime-D7D4PA-g.js +13 -0
- package/dist/runner/suite.d.ts +2 -0
- package/dist/runner/suite.js +2 -0
- package/dist/suite-BoOvK_lq.d.ts +7 -0
- package/dist/suite-chj0j22j.js +684 -0
- package/dist/suite-chj0j22j.js.map +1 -0
- package/dist/types-B9H4IZtA.d.ts +305 -0
- package/dist/types-BQol062t.d.ts +292 -0
- package/package.json +74 -0
- package/schemas/eval-interchange-agent-trace.schema.json +322 -0
- package/schemas/eval-interchange-proto-instance.schema.json +106 -0
- package/schemas/eval-interchange.schema.json +140 -0
- package/schemas/eval-run-envelope.schema.json +2195 -0
- package/schemas/trajectory-view.schema.json +441 -0
|
@@ -0,0 +1,684 @@
|
|
|
1
|
+
import { t as claudeCodeAdapter } from "./claude-code-ycT0JQZF.js";
|
|
2
|
+
import { n as parseCardinality, t as describeCardinality } from "./cardinality-DlE44e-4.js";
|
|
3
|
+
//#region src/assertions/patterns.ts
|
|
4
|
+
/**
 * Decide whether a fully-qualified tool name satisfies a pattern.
 *
 * A pattern without `*` is compared by strict string equality. A pattern
 * containing `*` is compiled to an anchored regex via `globToRegex` on every
 * call; nothing is cached here.
 *
 * @param {string} toolName - Name of the tool that was called.
 * @param {string | {pattern: string}} pattern - Literal or glob pattern.
 * @returns {boolean} True when the name matches the pattern.
 */
function toolMatches(toolName, pattern) {
  const text = patternString(pattern);
  const isGlob = text.includes("*");
  if (isGlob) {
    return globToRegex(text).test(toolName);
  }
  return toolName === text;
}
|
|
16
|
+
/**
 * Return the raw pattern text, whether the pattern was given as a bare
 * string or as an object carrying a `pattern` property.
 */
function patternString(pattern) {
  if (typeof pattern === "string") {
    return pattern;
  }
  return pattern.pattern;
}
|
|
20
|
+
/**
 * Render a pattern for diagnostic messages. Currently this is just the
 * underlying pattern text.
 */
function describePattern(pattern) {
  const label = patternString(pattern);
  return label;
}
|
|
24
|
+
/**
 * Build an anchored regular expression from a glob whose only wildcard is `*`.
 *
 * Every regex metacharacter other than `*` is backslash-escaped first, then
 * each `*` becomes `.*`, and the result is wrapped in `^…$`.
 *
 * @param {string} glob - Glob text.
 * @returns {RegExp} Regex that must match the whole input.
 */
function globToRegex(glob) {
  const literalSafe = glob.replace(/[.+?^${}()|[\]\\]/g, "\\$&");
  const body = literalSafe.replace(/\*/g, ".*");
  return new RegExp("^" + body + "$");
}
|
|
32
|
+
//#endregion
|
|
33
|
+
//#region src/assertions/predicates.ts
|
|
34
|
+
/** Operator keys that `matches` hands to `matchesLeaf` (value vs. target). */
const LEAF_OPS = /* @__PURE__ */ new Set(["equals", "contains", "not_contains", "regex", "gte", "lte", "gt", "lt", "one_of"]);
/** Operator keys that `matches` treats as combinators over nested predicates. */
const COMPOUND_OPS = /* @__PURE__ */ new Set(["any_of", "all_of", "not"]);
|
|
50
|
+
/**
 * Test a value against a predicate.
 *
 * Dispatch, in order:
 *  - a predicate that is not a non-array object is a literal expectation and
 *    is compared with `deepEquals`;
 *  - an object with exactly one key that names a compound operator
 *    (`any_of`, `all_of`, `not`) recurses into its sub-predicate(s);
 *  - an object with exactly one key that names a leaf operator is delegated
 *    to `matchesLeaf`;
 *  - any other object is a field map: the value must itself be a non-array
 *    object and every listed field must match its sub-predicate.
 *
 * The predicate is deliberately untyped; this runtime dispatch is the only
 * validation it receives.
 *
 * @param {unknown} value - Value under test.
 * @param {unknown} predicate - Literal, operator object, or field map.
 * @returns {boolean} True when the value satisfies the predicate.
 */
function matches(value, predicate) {
  if (!isPlainObject(predicate)) {
    return deepEquals(value, predicate);
  }
  const fieldNames = Object.keys(predicate);
  if (fieldNames.length === 1) {
    const [op] = fieldNames;
    if (COMPOUND_OPS.has(op)) {
      if (op === "any_of") return predicate.any_of.some((sub) => matches(value, sub));
      if (op === "all_of") return predicate.all_of.every((sub) => matches(value, sub));
      if (op === "not") return !matches(value, predicate.not);
    }
    if (LEAF_OPS.has(op)) {
      return matchesLeaf(value, op, predicate[op]);
    }
  }
  // Field map: only object values can carry fields.
  if (!isPlainObject(value)) {
    return false;
  }
  return Object.entries(predicate).every(([field, subPred]) => matches(value[field], subPred));
}
|
|
75
|
+
/**
 * Evaluate one leaf operator against a value.
 *
 * String operators (`contains`, `not_contains`, `regex`) return false for
 * non-string values, and numeric comparisons return false for non-number
 * values. A `regex` target that fails to compile yields false rather than
 * throwing.
 *
 * @param {unknown} value - Value under test.
 * @param {string} op - Operator name; the caller is expected to pass one of LEAF_OPS.
 * @param {unknown} target - Operand from the predicate.
 * @returns {boolean} Result of the comparison.
 * @throws {Error} When `op` is not a recognized leaf operator.
 */
function matchesLeaf(value, op, target) {
  const isString = typeof value === "string";
  const isNumber = typeof value === "number";
  if (op === "equals") return deepEquals(value, target);
  if (op === "contains") return isString && value.includes(target);
  if (op === "not_contains") return isString && !value.includes(target);
  if (op === "regex") {
    if (!isString || typeof target !== "string") return false;
    try {
      return new RegExp(target).test(value);
    } catch {
      return false;
    }
  }
  if (op === "gte") return isNumber && value >= target;
  if (op === "lte") return isNumber && value <= target;
  if (op === "gt") return isNumber && value > target;
  if (op === "lt") return isNumber && value < target;
  if (op === "one_of") return target.some((candidate) => deepEquals(value, candidate));
  throw new Error(`unknown leaf operator: ${op}`);
}
|
|
96
|
+
/**
 * True for any non-null, non-array value whose `typeof` is "object".
 * Note that this does not inspect the prototype, so class instances also
 * qualify.
 */
function isPlainObject(x) {
  if (x === null || Array.isArray(x)) {
    return false;
  }
  return typeof x === "object";
}
|
|
99
|
+
/**
 * Structural equality for unknown values. Used by `equals` and `one_of`.
 *
 * Strict: no coercions and no NaN-equals-NaN special case (primitives are
 * compared with `===`). Arrays must have equal length and pairwise-equal
 * elements. Objects must have the same set of own enumerable keys with
 * pairwise-equal values; a key that exists only on one side makes the
 * objects unequal even when the values involved are `undefined`.
 *
 * @param {unknown} a
 * @param {unknown} b
 * @returns {boolean} True when the two values are structurally equal.
 */
function deepEquals(a, b) {
  if (a === b) return true;
  if (a === null || b === null) return false;
  if (typeof a !== "object" || typeof b !== "object") return false;
  const aIsArray = Array.isArray(a);
  if (aIsArray !== Array.isArray(b)) return false;
  if (aIsArray) {
    return a.length === b.length && a.every((item, i) => deepEquals(item, b[i]));
  }
  const aKeys = Object.keys(a);
  if (aKeys.length !== Object.keys(b).length) return false;
  // Equal key counts are not enough: each key of `a` must actually exist on
  // `b`, otherwise { x: undefined } would equal { y: undefined }.
  return aKeys.every((key) => Object.prototype.hasOwnProperty.call(b, key) && deepEquals(a[key], b[key]));
}
|
|
120
|
+
//#endregion
|
|
121
|
+
//#region src/assertions/tool-calls.ts
|
|
122
|
+
/**
 * Assert that the number of tool calls matching `assertion.tool` satisfies
 * the cardinality given in `assertion.times`.
 *
 * @returns {{passed: boolean, description: string, details: string, matches: object[]}}
 */
function evaluateCalled(view, assertion) {
  const { tool, times } = assertion;
  const matching = view.toolCalls.filter((call) => toolMatches(call.name, tool));
  const count = matching.length;
  const passed = parseCardinality(times)(count);
  const description = `called(${describePattern(tool)}, ${describeCardinality(times)})`;
  let details;
  if (passed) {
    details = `found ${count} matching call(s)`;
  } else {
    details = `found ${count} call(s), expected ${describeCardinality(times)}`;
  }
  return { passed, description, details, matches: matching };
}
|
|
132
|
+
/**
 * Assert that no tool call matches `assertion.tool`. Any offending calls are
 * returned in `matches`.
 */
function evaluateNotCalled(view, assertion) {
  const forbidden = view.toolCalls.filter((call) => toolMatches(call.name, assertion.tool));
  const description = `not_called(${describePattern(assertion.tool)})`;
  if (forbidden.length === 0) {
    return { passed: true, description, details: "no matching calls", matches: forbidden };
  }
  return {
    passed: false,
    description,
    details: `found ${forbidden.length} forbidden call(s)`,
    matches: forbidden,
  };
}
|
|
142
|
+
/**
 * Assert that at least one tool call matches at least one of
 * `assertion.tools`.
 *
 * Each call appears at most once in `matches`, even when it satisfies several
 * patterns, so the count reported in `details` is the number of distinct
 * calls. Calls are listed in pattern order, then call order, with a call
 * attributed to the first pattern it matches.
 *
 * @returns {{passed: boolean, description: string, details: string, matches: object[]}}
 */
function evaluateCalledAnyOf(view, assertion) {
  // Track positions rather than object identity so duplicates in the call
  // list itself are still reported individually.
  const seenIndexes = new Set();
  const allMatches = [];
  for (const pattern of assertion.tools) {
    view.toolCalls.forEach((call, index) => {
      if (seenIndexes.has(index) || !toolMatches(call.name, pattern)) return;
      seenIndexes.add(index);
      allMatches.push(call);
    });
  }
  const passed = allMatches.length > 0;
  return {
    passed,
    description: `called_any_of(${assertion.tools.map(describePattern).join(", ")})`,
    details: passed ? `${allMatches.length} matching call(s)` : "no calls matched any pattern",
    matches: allMatches,
  };
}
|
|
153
|
+
/**
 * Assert that every pattern in `assertion.tools` is matched by at least one
 * tool call. `matches` concatenates the per-pattern match lists in pattern
 * order.
 */
function evaluateCalledAllOf(view, assertion) {
  const { tools } = assertion;
  const hitsPerPattern = tools.map((pattern) => view.toolCalls.filter((call) => toolMatches(call.name, pattern)));
  const unmatched = tools.filter((_, i) => hitsPerPattern[i].length === 0);
  const passed = unmatched.length === 0;
  let details = "all patterns matched";
  if (!passed) {
    details = `missing: ${unmatched.map((pattern) => describePattern(pattern)).join(", ")}`;
  }
  return {
    passed,
    description: `called_all_of(${tools.map(describePattern).join(", ")})`,
    details,
    matches: hitsPerPattern.flat(),
  };
}
|
|
167
|
+
/**
 * Assert that the earliest call matching `assertion.first` happens in a
 * strictly earlier turn than the earliest call matching `assertion.then`.
 *
 * Ordering is decided by each call's `turnIndex`. The assertion fails
 * (without a `matches` list) when either pattern has no calls at all.
 */
function evaluateCalledBefore(view, assertion) {
  const { first, then } = assertion;
  const callsMatching = (pattern) => view.toolCalls.filter((call) => toolMatches(call.name, pattern));
  const earliestTurn = (calls) => Math.min(...calls.map((call) => call.turnIndex));

  const firsts = callsMatching(first);
  const thens = callsMatching(then);
  const description = `called_before(${describePattern(first)} → ${describePattern(then)})`;

  if (firsts.length === 0) {
    return { passed: false, description, details: `no calls matching first` };
  }
  if (thens.length === 0) {
    return { passed: false, description, details: `no calls matching then` };
  }

  const earliestFirst = earliestTurn(firsts);
  const earliestThen = earliestTurn(thens);
  const passed = earliestFirst < earliestThen;
  const suffix = passed ? "" : " (not before)";
  return {
    passed,
    description,
    details: `first @ turn ${earliestFirst}, then @ turn ${earliestThen}${suffix}`,
    matches: [...firsts, ...thens],
  };
}
|
|
191
|
+
/**
 * Assert that the tool calls contain the patterns in `assertion.tools` in
 * order.
 *
 * - Non-strict (default): the patterns must appear as a subsequence; other
 *   calls may be interleaved. Matching is greedy from the start of the call
 *   list.
 * - Strict: the patterns must match a contiguous run of calls; the first such
 *   run is reported.
 *
 * An empty pattern list passes trivially.
 */
function evaluateSequence(view, assertion) {
  const { tools, strict = false } = assertion;
  const description = `sequence([${tools.map(describePattern).join(" → ")}]${strict ? ", strict" : ""})`;
  const calls = view.toolCalls;
  const expectedCount = tools.length;

  if (expectedCount === 0) {
    return { passed: true, description, details: "empty sequence trivially matches" };
  }

  if (!strict) {
    // Greedy subsequence scan: the number of calls collected so far is also
    // the index of the next pattern to look for.
    const matched = [];
    for (const call of calls) {
      if (matched.length === expectedCount) break;
      if (toolMatches(call.name, tools[matched.length])) matched.push(call);
    }
    const passed = matched.length === expectedCount;
    return {
      passed,
      description,
      details: passed ? "matched in order" : `matched ${matched.length}/${expectedCount}`,
      matches: matched,
    };
  }

  if (calls.length < expectedCount) {
    return { passed: false, description, details: "not enough tool calls" };
  }
  const lastStart = calls.length - expectedCount;
  for (let start = 0; start <= lastStart; start += 1) {
    const run = calls.slice(start, start + expectedCount);
    const aligned = run.every((call, offset) => toolMatches(call.name, tools[offset]));
    if (aligned) {
      return {
        passed: true,
        description,
        details: `matched at positions ${start}..${start + expectedCount - 1}`,
        matches: run,
      };
    }
  }
  return { passed: false, description, details: "no contiguous match" };
}
|
|
238
|
+
/**
 * Assert that at least one call to a tool matching `assertion.tool` was made
 * with arguments satisfying the `assertion.args` predicate.
 *
 * The failure details distinguish "tool never called" from "tool called, but
 * never with matching arguments".
 */
function evaluateCalledWith(view, assertion) {
  const toolLabel = describePattern(assertion.tool);
  const candidates = view.toolCalls.filter((call) => toolMatches(call.name, assertion.tool));
  const matching = candidates.filter((call) => matches(call.args, assertion.args));
  const passed = matching.length > 0;

  const explain = () => {
    if (passed) return `${matching.length} call(s) with matching args`;
    if (candidates.length === 0) return `no calls to ${toolLabel} at all`;
    return `${candidates.length} call(s) but none with matching args`;
  };

  return {
    passed,
    description: `called_with(${toolLabel}, args matching predicate)`,
    details: explain(),
    matches: matching,
  };
}
|
|
253
|
+
//#endregion
|
|
254
|
+
//#region src/assertions/behavior.ts
|
|
255
|
+
/**
 * Detect a response that was produced without using any tool.
 *
 * Passes only when there are zero tool calls AND the final response is
 * non-empty. The second condition keeps "answered without tools" distinct
 * from "session ended before producing anything".
 *
 * @param {object} view - Trajectory view (`toolCalls`, `finalResponse`).
 * @param {object} _assertion - Unused; the assertion has no parameters.
 */
function evaluateRespondedWithoutToolCalls(view, _assertion) {
  const description = "responded_without_tool_calls";
  const callCount = view.toolCalls.length;
  if (callCount === 0 && view.finalResponse.length > 0) {
    return { passed: true, description, details: "no tools called, response non-empty" };
  }
  if (callCount > 0) {
    return { passed: false, description, details: `${callCount} tool call(s) made` };
  }
  return { passed: false, description, details: "response was empty (session probably aborted)" };
}
|
|
273
|
+
/** Assert that the run used at most `assertion.max` turns (`view.usage.numTurns`). */
function evaluateIterationsWithin(view, assertion) {
  const { max } = assertion;
  const turns = view.usage.numTurns;
  const passed = turns <= max;
  return {
    passed,
    description: `iterations_within(${max})`,
    details: `used ${turns} turn(s)`,
  };
}
|
|
281
|
+
/**
 * Assert that the run cost (`view.usage.totalCostUsd`) does not exceed
 * `assertion.max`. Both amounts are rendered with four decimal places.
 */
function evaluateCostWithinUsd(view, assertion) {
  const { max } = assertion;
  const spent = view.usage.totalCostUsd;
  const passed = spent <= max;
  return {
    passed,
    description: `cost_within_usd(${max.toFixed(4)})`,
    details: `used $${spent.toFixed(4)}`,
  };
}
|
|
289
|
+
/** Assert that the run duration (`view.usage.durationMs`) does not exceed `assertion.max` milliseconds. */
function evaluateDurationWithinMs(view, assertion) {
  const { max } = assertion;
  const elapsedMs = view.usage.durationMs;
  const passed = elapsedMs <= max;
  return {
    passed,
    description: `duration_within_ms(${max})`,
    details: `took ${elapsedMs}ms`,
  };
}
|
|
297
|
+
/**
 * Assert that the final stop reason is one of the allowed reasons.
 * `assertion.reasons` may be a single reason or an array of reasons. A `null`
 * stop reason never passes.
 */
function evaluateFinishedWith(view, assertion) {
  const { reasons } = assertion;
  const allowed = Array.isArray(reasons) ? reasons : [reasons];
  const actual = view.finalStopReason;
  const passed = actual !== null && allowed.includes(actual);
  return {
    passed,
    description: `finished_with(${allowed.join("|")})`,
    details: `actual: ${actual ?? "(none)"}`,
  };
}
|
|
306
|
+
/** Assert that the final response contains `assertion.text` as a substring (case-sensitive). */
function evaluateResponseContains(view, assertion) {
  const { text } = assertion;
  const found = view.finalResponse.includes(text);
  return {
    passed: found,
    description: `response_contains(${JSON.stringify(text)})`,
    details: found ? "text found" : "text not in response",
  };
}
|
|
314
|
+
/** Assert that the final response does NOT contain `assertion.text` as a substring (case-sensitive). */
function evaluateResponseNotContains(view, assertion) {
  const { text } = assertion;
  const absent = !view.finalResponse.includes(text);
  return {
    passed: absent,
    description: `response_not_contains(${JSON.stringify(text)})`,
    details: absent ? "text absent" : "forbidden text found",
  };
}
|
|
322
|
+
/**
 * Assert that the final response matches the regular expression built from
 * `assertion.pattern` and the optional `assertion.flags`.
 *
 * A pattern or flag string that cannot be compiled fails the assertion and
 * puts the compiler's message in `details` instead of throwing.
 */
function evaluateResponseMatches(view, assertion) {
  const outcome = { passed: false, details: undefined };
  try {
    const regex = new RegExp(assertion.pattern, assertion.flags);
    outcome.passed = regex.test(view.finalResponse);
    outcome.details = outcome.passed ? "pattern matched" : "pattern did not match";
  } catch (err) {
    outcome.passed = false;
    outcome.details = `invalid regex: ${err instanceof Error ? err.message : String(err)}`;
  }
  return {
    passed: outcome.passed,
    description: `response_matches(/${assertion.pattern}/${assertion.flags ?? ""})`,
    details: outcome.details,
  };
}
|
|
338
|
+
/**
 * Run an arbitrary user-supplied predicate against the view.
 *
 * Only available from programmatic test definitions (a YAML loader cannot
 * produce functions). The predicate must be synchronous:
 *  - its return value is coerced to a boolean, so `passed` is always a real
 *    boolean in the report;
 *  - a returned promise/thenable is reported as a failure, because a promise
 *    is always truthy and would otherwise count as a pass regardless of what
 *    it resolves to;
 *  - a thrown error is caught and reported as a failure so one bad predicate
 *    does not take down a whole eval run.
 *
 * @param {object} view - Trajectory view handed to the predicate.
 * @param {{fn: (view: object) => unknown, description?: string}} assertion
 * @returns {{passed: boolean, description: string, details: string}}
 */
function evaluatePredicate(view, assertion) {
  let passed = false;
  let details;
  try {
    const outcome = assertion.fn(view);
    const isThenable =
      outcome !== null &&
      (typeof outcome === "object" || typeof outcome === "function") &&
      typeof outcome.then === "function";
    if (isThenable) {
      // Attach a no-op rejection handler so an eventual rejection of the
      // unsupported async predicate does not surface as an unhandled rejection.
      outcome.then(undefined, () => {});
      details = "predicate returned a promise (async predicates are not supported)";
    } else {
      passed = Boolean(outcome);
      details = passed ? "predicate returned true" : "predicate returned false";
    }
  } catch (err) {
    passed = false;
    details = `predicate threw: ${err instanceof Error ? err.message : String(err)}`;
  }
  return {
    passed,
    description: assertion.description ?? "predicate(...)",
    details,
  };
}
|
|
360
|
+
//#endregion
|
|
361
|
+
//#region src/assertions/compound.ts
|
|
362
|
+
/**
 * Compound assertion: passes when every child assertion passes. All children
 * are evaluated (no short-circuit) and returned in `children`.
 *
 * @param {object} view - Trajectory view.
 * @param {{assertions: object[]}} assertion
 * @param {(view: object, assertion: object) => object} evaluate - Evaluator used for the children.
 */
function evaluateAllOf(view, assertion, evaluate) {
  const children = assertion.assertions.map((child) => evaluate(view, child));
  const total = children.length;
  const failedCount = children.reduce((count, child) => (child.passed ? count : count + 1), 0);
  const passed = children.every((child) => child.passed);
  const noun = total === 1 ? "child" : "children";
  return {
    passed,
    description: `all_of (${total} ${noun})`,
    details: passed ? "all passed" : `${failedCount} of ${total} failed`,
    children,
  };
}
|
|
373
|
+
/**
 * Compound assertion: passes when at least one child assertion passes. All
 * children are evaluated (no short-circuit) and returned in `children`.
 *
 * @param {object} view - Trajectory view.
 * @param {{assertions: object[]}} assertion
 * @param {(view: object, assertion: object) => object} evaluate - Evaluator used for the children.
 */
function evaluateAnyOf(view, assertion, evaluate) {
  const children = assertion.assertions.map((child) => evaluate(view, child));
  const total = children.length;
  const passedCount = children.reduce((count, child) => (child.passed ? count + 1 : count), 0);
  const passed = passedCount > 0;
  const noun = total === 1 ? "child" : "children";
  return {
    passed,
    description: `any_of (${total} ${noun})`,
    details: passed ? `${passedCount} passed` : "all failed",
    children,
  };
}
|
|
384
|
+
/**
 * Compound assertion: inverts the result of a single child assertion. The
 * child's own result is kept in `children` for reporting.
 *
 * @param {object} view - Trajectory view.
 * @param {{assertion: object}} assertion
 * @param {(view: object, assertion: object) => object} evaluate - Evaluator used for the child.
 */
function evaluateNot(view, assertion, evaluate) {
  const inner = evaluate(view, assertion.assertion);
  const description = `not(${inner.description})`;
  if (inner.passed) {
    return { passed: false, description, details: "inner passed (so outer fails)", children: [inner] };
  }
  return { passed: true, description, details: "inner failed (so outer passes)", children: [inner] };
}
|
|
393
|
+
//#endregion
|
|
394
|
+
//#region src/assertions/evaluator.ts
|
|
395
|
+
/**
 * Dispatch table from assertion `type` to its evaluator. A Map is used so a
 * `type` such as "constructor" cannot resolve to an inherited property.
 */
const ASSERTION_EVALUATORS = new Map([
  ["called", evaluateCalled],
  ["not_called", evaluateNotCalled],
  ["called_any_of", evaluateCalledAnyOf],
  ["called_all_of", evaluateCalledAllOf],
  ["called_before", evaluateCalledBefore],
  ["sequence", evaluateSequence],
  ["called_with", evaluateCalledWith],
  ["responded_without_tool_calls", evaluateRespondedWithoutToolCalls],
  ["iterations_within", evaluateIterationsWithin],
  ["cost_within_usd", evaluateCostWithinUsd],
  ["duration_within_ms", evaluateDurationWithinMs],
  ["finished_with", evaluateFinishedWith],
  ["response_contains", evaluateResponseContains],
  ["response_not_contains", evaluateResponseNotContains],
  ["response_matches", evaluateResponseMatches],
  ["all_of", evaluateAllOf],
  ["any_of", evaluateAnyOf],
  ["not", evaluateNot],
  ["predicate", evaluatePredicate],
]);
/**
 * Evaluate one assertion against a trajectory view.
 *
 * The handler is chosen by `assertion.type`. Compound handlers (`all_of`,
 * `any_of`, `not`) use the third argument to recurse; the other handlers
 * ignore it.
 *
 * @param {object} view - Trajectory view.
 * @param {{type: string}} assertion - Assertion to evaluate.
 * @returns {object} The assertion result produced by the matching handler.
 * @throws {Error} When `assertion.type` is not a known assertion type.
 */
function evaluate(view, assertion) {
  const handler = ASSERTION_EVALUATORS.get(assertion.type);
  if (handler === undefined) {
    throw new Error(`unknown assertion: ${JSON.stringify(assertion)}`);
  }
  return handler(view, assertion, evaluate);
}
|
|
426
|
+
/**
 * Evaluate a list of assertions independently of one another and return their
 * results in the same order. Used at the test-case level, where each
 * top-level assertion is reported (and thresholded by the runner) separately.
 *
 * @param {object} view - Trajectory view.
 * @param {object[]} assertions - Assertions to evaluate.
 * @returns {object[]} One result per assertion.
 */
function evaluateAll(view, assertions) {
  const results = [];
  for (const assertion of assertions) {
    results.push(evaluate(view, assertion));
  }
  return results;
}
|
|
434
|
+
//#endregion
|
|
435
|
+
//#region src/adapters/registry.ts
|
|
436
|
+
/**
 * Registry of harness adapters keyed by adapter id.
 *
 * Created without a prototype so that ids such as "toString" or
 * "constructor" never resolve to members inherited from Object.prototype
 * when the registry is read with `ADAPTERS[id]`.
 */
const ADAPTERS = /* @__PURE__ */ Object.create(null);
/** Install a built-in adapter under `id`, replacing any existing entry. */
function registerBuiltIn(id, adapter) {
  ADAPTERS[id] = adapter;
}
registerBuiltIn("claude-code", claudeCodeAdapter);
|
|
441
|
+
/**
 * Register a harness adapter by id.
 *
 * Duplicate ids throw: registration is explicit so accidental overrides
 * surface immediately during startup or test setup. Only ids actually stored
 * in the registry count as duplicates; names that merely exist on
 * Object.prototype (e.g. "constructor") can be registered.
 *
 * @param {string} id - Adapter id.
 * @param {object} adapter - Adapter implementation.
 * @throws {Error} When an adapter is already registered under `id`.
 */
function registerAdapter(id, adapter) {
  if (Object.prototype.hasOwnProperty.call(ADAPTERS, id)) {
    throw new Error(`adapter "${id}" is already registered`);
  }
  ADAPTERS[id] = adapter;
}
|
|
451
|
+
/**
 * List the ids of every adapter currently in the registry, built-in and
 * runtime-registered alike, in registration order.
 *
 * @returns {string[]} Adapter ids.
 */
function listAdapters() {
  const ids = Object.keys(ADAPTERS);
  return ids;
}
|
|
455
|
+
/**
 * Resolve an adapter by id.
 *
 * Only adapters stored in the registry itself are returned; an id that
 * happens to name an inherited object member (e.g. "toString") is treated as
 * unknown.
 *
 * @param {string} id - Adapter id.
 * @returns {object} The registered adapter.
 * @throws {Error} When no adapter is registered under `id`; the message lists the available ids.
 */
function getAdapter(id) {
  const isRegistered = Object.prototype.hasOwnProperty.call(ADAPTERS, id);
  const adapter = isRegistered ? ADAPTERS[id] : undefined;
  if (!adapter) {
    throw new Error(`unknown adapter "${id}". Available: ${listAdapters().join(", ")}`);
  }
  return adapter;
}
|
|
461
|
+
/** Adapter id used when a suite does not name an `adapter`. */
const DEFAULT_ADAPTER_ID = "claude-code";
/**
 * Resolve the adapter registered under `DEFAULT_ADAPTER_ID`.
 *
 * @returns {object} The default adapter.
 */
function getDefaultAdapter() {
  const fallback = getAdapter(DEFAULT_ADAPTER_ID);
  return fallback;
}
|
|
466
|
+
//#endregion
|
|
467
|
+
//#region src/config/resolve-config.ts
|
|
468
|
+
/**
|
|
469
|
+
* Flatten nested suite config into harness-specific adapter config.
|
|
470
|
+
*/
|
|
471
|
+
/**
 * Merge suite config layers into one flat Claude Code adapter config.
 *
 * Layers are applied in order, later layers winning. Within a layer, the
 * generic keys are applied first and the keys nested under `claudeCode` (when
 * it is an object) are applied on top; the `claudeCode` key itself is not
 * copied. `prompt` is set last and overrides any `prompt` found in the layers.
 *
 * @param {object[]} layers - Config layers, lowest precedence first.
 * @param {string} prompt - Prompt for the run.
 * @returns {object} Flat adapter config.
 */
function toClaudeCodeConfig(layers, prompt) {
  const flat = {};
  for (const { claudeCode, ...generic } of layers) {
    Object.assign(flat, generic);
    const hasHarnessSection = Boolean(claudeCode) && typeof claudeCode === "object";
    if (hasHarnessSection) {
      Object.assign(flat, claudeCode);
    }
  }
  flat.prompt = prompt;
  return flat;
}
|
|
482
|
+
/**
 * Resolve merged suite layers into the flat config shape expected by the
 * selected harness adapter.
 *
 * The "claude-code" adapter gets its harness-specific flattening via
 * `toClaudeCodeConfig`. Any other adapter receives a plain shallow merge of
 * the layers (later layers win). In both cases `prompt` is set last.
 *
 * @param {string} adapterId - Id of the adapter the config is for.
 * @param {object[]} layers - Config layers, lowest precedence first.
 * @param {string} prompt - Prompt for the run.
 * @returns {object} Flat config to pass to the adapter.
 */
function resolveRunConfig(adapterId, layers, prompt) {
  if (adapterId === "claude-code") {
    return toClaudeCodeConfig(layers, prompt);
  }
  const merged = Object.assign({}, ...layers);
  merged.prompt = prompt;
  return merged;
}
|
|
493
|
+
//#endregion
|
|
494
|
+
//#region src/runner/case.ts
|
|
495
|
+
/** Number of repetitions run for a case that does not set `repetitions`. */
const DEFAULT_REPETITIONS = 5;
/** Pass rate an assertion must reach when it does not set `threshold` (1 = every evaluated repetition). */
const DEFAULT_THRESHOLD = 1;
|
|
499
|
+
/**
 * Build the effective adapter config for one (suite, case, cell).
 *
 * Merge order (later wins): suite.defaultConfig < case.config < cell.config.
 * Every layer falls back to an empty object when absent, so a matrix cell
 * without a `config` is treated the same as one with an empty config. The
 * adapter id comes from `suite.adapter`, or from the default adapter when the
 * suite does not name one.
 *
 * @param {object} suite - Suite definition (`adapter`, `defaultConfig`).
 * @param {object} testCase - Case definition (`config`, `prompt`).
 * @param {object} cell - Matrix cell (`config`).
 * @returns {object} Flat config for the adapter.
 */
function mergeConfig(suite, testCase, cell) {
  const adapterId = suite.adapter ?? getDefaultAdapter().id;
  const layers = [
    suite.defaultConfig ?? {},
    testCase.config ?? {},
    cell.config ?? {},
  ];
  return resolveRunConfig(adapterId, layers, testCase.prompt);
}
|
|
511
|
+
/**
 * Number of repetitions to run for a case: `testCase.repetitions` when set,
 * otherwise `DEFAULT_REPETITIONS`. An explicit `0` is respected.
 *
 * @param {{repetitions?: number}} testCase
 * @returns {number} Repetition count.
 */
function getRepetitions(testCase) {
  return testCase.repetitions ?? DEFAULT_REPETITIONS;
}
|
|
514
|
+
/**
 * Execute one repetition of a test case and evaluate its assertions.
 *
 * The adapter is invoked with `config` plus a `signal` (the explicit `signal`
 * argument when given, otherwise `config.signal`). Anything thrown while
 * running the adapter or evaluating the assertions is captured in `error`
 * rather than propagated, and the repetition then carries no adapter result
 * and no assertion results.
 *
 * @param {object} testCase - Case whose `assertions[].assertion` are evaluated.
 * @param {object} _cell - Unused.
 * @param {object} config - Flat adapter config.
 * @param {number} repetitionIndex - Index of this repetition.
 * @param {(config: object) => Promise<object>|object} run - Adapter runner.
 * @param {AbortSignal} [signal] - Optional signal overriding `config.signal`.
 * @returns {Promise<object>} Repetition record; never rejects for adapter or assertion errors.
 */
async function runRepetition(testCase, _cell, config, repetitionIndex, run, signal) {
  const startTs = Date.now();
  const elapsedMs = () => Date.now() - startTs;
  try {
    const effectiveConfig = { ...config, signal: signal ?? config.signal };
    const adapterResult = await run(effectiveConfig);
    const assertions = testCase.assertions.map((entry) => entry.assertion);
    const assertionResults = evaluateAll(adapterResult.view, assertions);
    return {
      repetitionIndex,
      adapterResult,
      error: null,
      assertionResults,
      durationMs: elapsedMs(),
    };
  } catch (err) {
    return {
      repetitionIndex,
      adapterResult: null,
      error: extractError(err),
      assertionResults: [],
      durationMs: elapsedMs(),
    };
  }
}
|
|
538
|
+
/**
 * Normalize a thrown value into `{ message, diagnostics }`.
 *
 * `message` is the Error's message, or the stringified value for non-Error
 * throws. `diagnostics` is the thrown object's `diagnostics` property when
 * that property exists and holds a non-null object; otherwise it is `{}`.
 *
 * @param {unknown} err - The thrown value.
 * @returns {{message: string, diagnostics: object}}
 */
function extractError(err) {
  const message = err instanceof Error ? err.message : String(err);
  const carriesDiagnostics = err !== null && typeof err === "object" && "diagnostics" in err;
  const candidate = carriesDiagnostics ? err.diagnostics : null;
  const diagnostics = candidate !== null && typeof candidate === "object" ? candidate : {};
  return { message, diagnostics };
}
|
|
550
|
+
/**
 * Aggregate the repetitions of one (case, cell) pair into a cell report.
 *
 * Repetitions whose `error` is non-null are counted in `adapterErrors` and
 * excluded from pass-rate computation. For each assertion, the pass rate is
 * the share of evaluated repetitions in which it passed, compared against the
 * assertion's `threshold` (falling back to `DEFAULT_THRESHOLD`). An assertion
 * with zero evaluated repetitions never meets its threshold. The cell passes
 * when every assertion meets its threshold.
 *
 * @param {object} testCase - Case definition; its metadata is copied into the report.
 * @param {object} cell - Matrix cell the repetitions ran in.
 * @param {object[]} repetitions - Repetition records for this case and cell.
 * @returns {object} Cell report with per-assertion statistics and overall `passed`.
 */
function aggregateCell(testCase, cell, repetitions) {
  const evaluatedReps = repetitions.filter((rep) => rep.error === null);
  const adapterErrors = repetitions.length - evaluatedReps.length;
  const evaluatedCount = evaluatedReps.length;

  const assertionStats = testCase.assertions.map((thresholded, i) => {
    const threshold = thresholded.threshold ?? DEFAULT_THRESHOLD;
    const passedCount = evaluatedReps.filter((rep) => rep.assertionResults[i]?.passed).length;
    const passRate = evaluatedCount === 0 ? 0 : passedCount / evaluatedCount;
    // Prefer the description computed during evaluation; fall back to the
    // assertion type when nothing was evaluated.
    const description = evaluatedReps[0]?.assertionResults[i]?.description ?? `(${thresholded.assertion.type})`;
    return {
      description,
      threshold,
      passedCount,
      evaluatedCount,
      passRate,
      meetsThreshold: evaluatedCount > 0 && passRate >= threshold,
    };
  });

  return {
    caseId: testCase.id,
    category: testCase.category,
    notes: testCase.notes,
    prompt: testCase.prompt,
    expectations: testCase.expectations,
    reference_trajectory: testCase.reference_trajectory,
    human_ratings: testCase.human_ratings,
    cell,
    repetitions,
    assertionStats,
    adapterErrors,
    passed: assertionStats.every((stat) => stat.meetsThreshold),
  };
}
|
|
583
|
+
//#endregion
|
|
584
|
+
//#region src/runner/limit.ts
|
|
585
|
+
/**
 * Create a concurrency limiter.
 *
 * The returned function runs `fn` once fewer than `max` limited tasks are in
 * flight, and resolves or rejects with `fn`'s outcome. Callers that find the
 * limiter full park a wake-up callback in a FIFO queue; each finishing task
 * wakes the callback at the head of that queue. A woken caller re-checks
 * capacity before proceeding and parks again if the slot is already taken.
 *
 * @param {number} max - Maximum number of concurrently running tasks.
 * @returns {(fn: () => unknown) => Promise<unknown>} Limiter function.
 * @throws {Error} When `max` is not a positive integer.
 */
function createLimit(max) {
  const isValid = Number.isInteger(max) && max >= 1;
  if (!isValid) {
    throw new Error(`createLimit: max must be a positive integer, got ${max}`);
  }
  let active = 0;
  const parked = [];
  const waitForSlot = () =>
    new Promise((wake) => {
      parked.push(wake);
    });

  return async (fn) => {
    while (active >= max) {
      await waitForSlot();
    }
    active += 1;
    try {
      return await fn();
    } finally {
      active -= 1;
      // Hand the freed slot to the longest-waiting caller, if any.
      parked.shift()?.();
    }
  };
}
|
|
605
|
+
//#endregion
|
|
606
|
+
//#region src/runner/suite.ts
|
|
607
|
+
/** Concurrency cap used when `options.maxConcurrent` is not given. */
const DEFAULT_MAX_CONCURRENT = 4;
/**
 * Suite-level runner.
 *
 * Expands the suite into one task per (case, matrix cell, repetition), runs
 * the tasks through a concurrency limiter, groups the repetition records by
 * (case id, cell label), and aggregates each group into a cell report.
 *
 * Progress events are sent to `options.onProgress` when provided:
 * `suite-start`, `rep-start`, `rep-complete`, `cell-complete`,
 * `suite-complete`. A task that starts after `options.signal` has been
 * aborted is skipped and contributes no repetition record.
 *
 * @param {object} suite - Suite definition (`cases`, `matrix`, optional `adapter`, `defaultConfig`).
 * @param {object} [options] - `adapter`, `maxConcurrent`, `onProgress`, `signal`.
 * @returns {Promise<{startedAt: string, durationMs: number, cells: object[]}>} Suite report.
 * @throws {Error} When the suite has no matrix cells or no cases.
 */
async function runSuite(suite, options = {}) {
  if (suite.matrix.length === 0) {
    throw new Error("runSuite: suite.matrix must contain at least one cell");
  }
  if (suite.cases.length === 0) {
    throw new Error("runSuite: suite.cases must contain at least one case");
  }

  const adapter = options.adapter ?? getAdapter(suite.adapter ?? getDefaultAdapter().id);
  const run = (config) => adapter.run(config);
  const limit = createLimit(options.maxConcurrent ?? DEFAULT_MAX_CONCURRENT);
  const onProgress = options.onProgress;
  const startTs = Date.now();
  const startedAt = new Date(startTs).toISOString();

  // One task per (case, cell, repetition).
  const tasks = [];
  for (const testCase of suite.cases) {
    const repetitionCount = getRepetitions(testCase);
    for (const cell of suite.matrix) {
      for (let repetitionIndex = 0; repetitionIndex < repetitionCount; repetitionIndex++) {
        tasks.push({ testCase, cell, repetitionIndex });
      }
    }
  }
  onProgress?.({ kind: "suite-start", totalReps: tasks.length });

  // Repetition records grouped per (case, cell); every pair gets a bucket up
  // front so pairs whose tasks were all skipped still produce a report.
  const keyFor = (caseId, cellLabel) => `${caseId}::${cellLabel}`;
  const buckets = /* @__PURE__ */ new Map();
  for (const testCase of suite.cases) {
    for (const cell of suite.matrix) {
      buckets.set(keyFor(testCase.id, cell.label), []);
    }
  }

  const runTask = async ({ testCase, cell, repetitionIndex }) => {
    if (options.signal?.aborted) return;
    onProgress?.({
      kind: "rep-start",
      caseId: testCase.id,
      cellLabel: cell.label,
      repIndex: repetitionIndex,
    });
    const config = mergeConfig(suite, testCase, cell);
    const result = await runRepetition(testCase, cell, config, repetitionIndex, run, options.signal);
    buckets.get(keyFor(testCase.id, cell.label)).push(result);
    onProgress?.({
      kind: "rep-complete",
      caseId: testCase.id,
      cellLabel: cell.label,
      repIndex: repetitionIndex,
      ok: result.error === null,
      durationMs: result.durationMs,
      toolCallCount: result.adapterResult?.view.toolCalls.length,
      assertionResults: result.assertionResults,
      errorMessage: result.error?.message,
    });
  };
  await Promise.all(tasks.map((task) => limit(() => runTask(task))));

  const cells = [];
  for (const testCase of suite.cases) {
    for (const cell of suite.matrix) {
      const reps = buckets.get(keyFor(testCase.id, cell.label)) ?? [];
      // Tasks finish out of order; restore repetition order before aggregating.
      reps.sort((a, b) => a.repetitionIndex - b.repetitionIndex);
      const cellReport = aggregateCell(testCase, cell, reps);
      cells.push(cellReport);
      onProgress?.({ kind: "cell-complete", report: cellReport });
    }
  }

  const report = {
    startedAt,
    durationMs: Date.now() - startTs,
    cells,
  };
  onProgress?.({ kind: "suite-complete", report });
  return report;
}
|
|
681
|
+
//#endregion
|
|
682
|
+
export { aggregateCell as a, runRepetition as c, getDefaultAdapter as d, listAdapters as f, evaluateAll as h, DEFAULT_THRESHOLD as i, DEFAULT_ADAPTER_ID as l, evaluate as m, createLimit as n, getRepetitions as o, registerAdapter as p, DEFAULT_REPETITIONS as r, mergeConfig as s, runSuite as t, getAdapter as u };
|
|
683
|
+
|
|
684
|
+
//# sourceMappingURL=suite-chj0j22j.js.map
|