@alis-build/harness-eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,684 @@
1
+ import { t as claudeCodeAdapter } from "./claude-code-ycT0JQZF.js";
2
+ import { n as parseCardinality, t as describeCardinality } from "./cardinality-DlE44e-4.js";
3
+ //#region src/assertions/patterns.ts
4
/**
 * Decide whether a fully-qualified tool name satisfies a pattern.
 *
 * A pattern without `*` is compared by strict string equality. A pattern
 * containing `*` is compiled to an anchored regex on every call, which is
 * acceptable at the current scale; memoize if this ever shows up in profiles.
 *
 * @param {string} toolName - Name of the tool that was called.
 * @param {string | { pattern: string }} pattern - Literal or glob pattern.
 * @returns {boolean} True when the name matches the pattern.
 */
function toolMatches(toolName, pattern) {
  const source = patternString(pattern);
  const isGlob = source.includes("*");
  return isGlob ? globToRegex(source).test(toolName) : toolName === source;
}
16
/**
 * Return the raw pattern text, whether the pattern was given as a bare
 * string or as an object carrying a `pattern` property.
 */
function patternString(pattern) {
  if (typeof pattern === "string") return pattern;
  return pattern.pattern;
}
20
/** Render a pattern for diagnostic messages (currently just its raw text). */
function describePattern(pattern) {
  const text = patternString(pattern);
  return text;
}
24
/**
 * Build an anchored regex from a glob whose only wildcard is `*`.
 *
 * The glob is split on `*`; each literal piece has its regex metacharacters
 * escaped, and the pieces are rejoined with `.*`.
 *
 * @param {string} glob - Glob text, e.g. `mcp__server__*`.
 * @returns {RegExp} Regex matching the whole string.
 */
function globToRegex(glob) {
  const escapeLiteral = (piece) => piece.replace(/[.+?^${}()|[\]\\]/g, "\\$&");
  const body = glob.split("*").map(escapeLiteral).join(".*");
  return new RegExp(`^${body}$`);
}
32
+ //#endregion
33
+ //#region src/assertions/predicates.ts
34
/** Operators that compare the value against a single target. */
const LEAF_OPS = /* @__PURE__ */ new Set(["equals", "contains", "not_contains", "regex", "gte", "lte", "gt", "lt", "one_of"]);
/** Operators that combine or negate nested predicates. */
const COMPOUND_OPS = /* @__PURE__ */ new Set(["any_of", "all_of", "not"]);
50
/**
 * Test a value against a predicate.
 *
 * Dispatch rules, in order:
 *  1. A predicate that is not a plain object (scalar, array, null) is a
 *     literal and is compared with `deepEquals`.
 *  2. A single-key object whose key is a compound operator (`any_of`,
 *     `all_of`, `not`) recurses into its sub-predicates.
 *  3. A single-key object whose key is a leaf operator delegates to
 *     `matchesLeaf`.
 *  4. Anything else is a structural predicate: the value must be a plain
 *     object and every listed field must match its sub-predicate.
 *
 * The predicate is untyped because it comes from deserialized YAML; this
 * runtime dispatch is the only validation.
 *
 * @param {unknown} value - Value under test.
 * @param {unknown} predicate - Predicate description.
 * @returns {boolean} True when the value satisfies the predicate.
 */
function matches(value, predicate) {
  if (!isPlainObject(predicate)) return deepEquals(value, predicate);
  const fields = Object.keys(predicate);
  if (fields.length === 1) {
    const [op] = fields;
    if (COMPOUND_OPS.has(op)) {
      if (op === "any_of") return predicate.any_of.some((sub) => matches(value, sub));
      if (op === "all_of") return predicate.all_of.every((sub) => matches(value, sub));
      if (op === "not") return !matches(value, predicate.not);
    }
    if (LEAF_OPS.has(op)) return matchesLeaf(value, op, predicate[op]);
  }
  // Structural match: only plain objects have fields to descend into.
  if (!isPlainObject(value)) return false;
  return Object.entries(predicate).every(([field, subPredicate]) => matches(value[field], subPredicate));
}
75
/**
 * Apply a single leaf operator. Caller guarantees `op` is in LEAF_OPS.
 *
 * Targets come from deserialized YAML and are therefore unvalidated:
 *  - numeric comparisons (`gte`, `lte`, `gt`, `lt`) require BOTH the value
 *    and the target to be numbers, so a missing or quoted target (e.g.
 *    `gte: null`, where `5 >= null` would be true) can no longer match
 *    through implicit coercion;
 *  - `one_of` with a non-array target yields false instead of throwing.
 *
 * @param {unknown} value - Value under test.
 * @param {string} op - Leaf operator name.
 * @param {unknown} target - Operator operand from the predicate.
 * @returns {boolean} True when the value satisfies the operator.
 * @throws {Error} If `op` is not a known leaf operator.
 */
function matchesLeaf(value, op, target) {
  // Shared guard for the four ordering operators: no coercion on either side.
  const bothNumbers = typeof value === "number" && typeof target === "number";
  switch (op) {
    case "equals": return deepEquals(value, target);
    case "contains": return typeof value === "string" && value.includes(target);
    case "not_contains": return typeof value === "string" && !value.includes(target);
    case "regex":
      if (typeof value !== "string" || typeof target !== "string") return false;
      try {
        return new RegExp(target).test(value);
      } catch {
        // An invalid pattern cannot match anything.
        return false;
      }
    case "gte": return bothNumbers && value >= target;
    case "lte": return bothNumbers && value <= target;
    case "gt": return bothNumbers && value > target;
    case "lt": return bothNumbers && value < target;
    case "one_of": return Array.isArray(target) && target.some((candidate) => deepEquals(value, candidate));
    default: throw new Error(`unknown leaf operator: ${op}`);
  }
}
96
/** True for any non-null, non-array value whose `typeof` is "object". */
function isPlainObject(x) {
  if (x === null || typeof x !== "object") return false;
  return !Array.isArray(x);
}
99
/**
 * Structural equality for unknown values. Used by `equals` and `one_of`.
 * Strict — no coercions, no NaN-equals-NaN special case (matches `===`).
 *
 * Arrays compare element-wise in order. Objects compare by their own
 * enumerable keys: both sides must have the same number of keys, every key
 * of `a` must also be an own key of `b`, and the values must be deeply
 * equal. The own-key check matters when a value is `undefined`: without it
 * `{a: undefined}` and `{b: undefined}` would compare equal, because a
 * missing key also reads as `undefined`.
 *
 * @param {unknown} a - Left operand.
 * @param {unknown} b - Right operand.
 * @returns {boolean} True when the two values are structurally equal.
 */
function deepEquals(a, b) {
  if (a === b) return true;
  // Only two non-null objects can still be equal past this point.
  if (typeof a !== "object" || typeof b !== "object") return false;
  if (a === null || b === null) return false;
  const aIsArray = Array.isArray(a);
  if (aIsArray !== Array.isArray(b)) return false;
  if (aIsArray) {
    return a.length === b.length && a.every((item, index) => deepEquals(item, b[index]));
  }
  const aKeys = Object.keys(a);
  if (aKeys.length !== Object.keys(b).length) return false;
  return aKeys.every((key) => Object.prototype.hasOwnProperty.call(b, key) && deepEquals(a[key], b[key]));
}
120
+ //#endregion
121
+ //#region src/assertions/tool-calls.ts
122
/**
 * Assert that the number of calls matching `assertion.tool` satisfies the
 * cardinality in `assertion.times`.
 *
 * @returns {{passed: boolean, description: string, details: string, matches: object[]}}
 */
function evaluateCalled(view, assertion) {
  const { tool, times } = assertion;
  const matching = view.toolCalls.filter((call) => toolMatches(call.name, tool));
  const count = matching.length;
  const passed = parseCardinality(times)(count);
  const description = `called(${describePattern(tool)}, ${describeCardinality(times)})`;
  let details;
  if (passed) details = `found ${count} matching call(s)`;
  else details = `found ${count} call(s), expected ${describeCardinality(times)}`;
  return { passed, description, details, matches: matching };
}
132
/**
 * Assert that no tool call matches `assertion.tool`. Offending calls, if
 * any, are returned in `matches`.
 */
function evaluateNotCalled(view, assertion) {
  const offenders = view.toolCalls.filter((call) => toolMatches(call.name, assertion.tool));
  const passed = offenders.length === 0;
  let details = "no matching calls";
  if (!passed) details = `found ${offenders.length} forbidden call(s)`;
  return {
    passed,
    description: `not_called(${describePattern(assertion.tool)})`,
    details,
    matches: offenders,
  };
}
142
/**
 * Assert that at least one tool call matches at least one of the patterns
 * in `assertion.tools`.
 *
 * Each call is tested once against the whole pattern list, so a call that
 * matches several overlapping patterns (e.g. `mcp__*` and `mcp__srv__get`)
 * is counted and reported once, and `matches` stays in call order.
 *
 * @param {{toolCalls: Array<{name: string}>}} view - Trajectory view.
 * @param {{tools: Array<string | {pattern: string}>}} assertion - Patterns to try.
 * @returns {{passed: boolean, description: string, details: string, matches: object[]}}
 */
function evaluateCalledAnyOf(view, assertion) {
  const { tools } = assertion;
  const matching = view.toolCalls.filter((call) => tools.some((pattern) => toolMatches(call.name, pattern)));
  const passed = matching.length > 0;
  return {
    passed,
    description: `called_any_of(${tools.map(describePattern).join(", ")})`,
    details: passed ? `${matching.length} matching call(s)` : "no calls matched any pattern",
    matches: matching,
  };
}
153
/**
 * Assert that every pattern in `assertion.tools` is matched by at least one
 * tool call. `matches` lists the matching calls grouped by pattern, in
 * pattern order; `details` names the patterns that had no match.
 */
function evaluateCalledAllOf(view, assertion) {
  const found = [];
  const missing = [];
  for (const pattern of assertion.tools) {
    const hits = view.toolCalls.filter((call) => toolMatches(call.name, pattern));
    if (hits.length === 0) {
      missing.push(pattern);
      continue;
    }
    for (const hit of hits) found.push(hit);
  }
  const passed = missing.length === 0;
  const description = `called_all_of(${assertion.tools.map(describePattern).join(", ")})`;
  let details = "all patterns matched";
  if (!passed) details = `missing: ${missing.map((pattern) => describePattern(pattern)).join(", ")}`;
  return { passed, description, details, matches: found };
}
167
/**
 * Assert that the earliest call matching `assertion.first` happens in a
 * strictly earlier turn than the earliest call matching `assertion.then`.
 *
 * Fails (without a `matches` field) when either pattern has no calls.
 */
function evaluateCalledBefore(view, assertion) {
  const callsFor = (pattern) => view.toolCalls.filter((call) => toolMatches(call.name, pattern));
  const earliestTurn = (calls) => Math.min(...calls.map((call) => call.turnIndex));
  const firsts = callsFor(assertion.first);
  const thens = callsFor(assertion.then);
  const description = `called_before(${describePattern(assertion.first)} → ${describePattern(assertion.then)})`;
  if (firsts.length === 0) {
    return { passed: false, description, details: `no calls matching first` };
  }
  if (thens.length === 0) {
    return { passed: false, description, details: `no calls matching then` };
  }
  const firstTurn = earliestTurn(firsts);
  const thenTurn = earliestTurn(thens);
  const passed = firstTurn < thenTurn;
  const where = `first @ turn ${firstTurn}, then @ turn ${thenTurn}`;
  return {
    passed,
    description,
    details: passed ? where : `${where} (not before)`,
    matches: [...firsts, ...thens],
  };
}
191
/**
 * Assert that the tool calls contain the patterns in `assertion.tools` in
 * order.
 *
 * - Non-strict (default): the patterns must appear as a subsequence; other
 *   calls may be interleaved. Matching is greedy from the first call.
 * - Strict: the patterns must match a contiguous run of calls; the first
 *   such window wins.
 *
 * An empty pattern list passes trivially.
 */
function evaluateSequence(view, assertion) {
  const { tools, strict = false } = assertion;
  const description = `sequence([${tools.map(describePattern).join(" → ")}]${strict ? ", strict" : ""})`;
  if (tools.length === 0) {
    return { passed: true, description, details: "empty sequence trivially matches" };
  }
  const calls = view.toolCalls;

  if (!strict) {
    // Subsequence scan: advance the cursor each time the next pattern is seen.
    const matched = [];
    let cursor = 0;
    for (const call of calls) {
      if (cursor < tools.length && toolMatches(call.name, tools[cursor])) {
        matched.push(call);
        cursor += 1;
      }
    }
    const passed = cursor === tools.length;
    return {
      passed,
      description,
      details: passed ? "matched in order" : `matched ${cursor}/${tools.length}`,
      matches: matched,
    };
  }

  const lastStart = calls.length - tools.length;
  if (lastStart < 0) {
    return { passed: false, description, details: "not enough tool calls" };
  }
  for (let start = 0; start <= lastStart; start++) {
    const windowMatches = tools.every((pattern, offset) => toolMatches(calls[start + offset].name, pattern));
    if (windowMatches) {
      const end = start + tools.length;
      return {
        passed: true,
        description,
        details: `matched at positions ${start}..${end - 1}`,
        matches: calls.slice(start, end),
      };
    }
  }
  return { passed: false, description, details: "no contiguous match" };
}
238
/**
 * Assert that at least one call to `assertion.tool` has arguments
 * satisfying the `assertion.args` predicate. The failure detail says
 * whether the tool was never called or was called with the wrong args.
 */
function evaluateCalledWith(view, assertion) {
  const { tool, args } = assertion;
  const candidates = view.toolCalls.filter((call) => toolMatches(call.name, tool));
  const matching = candidates.filter((call) => matches(call.args, args));
  const passed = matching.length > 0;
  const explain = () => {
    if (passed) return `${matching.length} call(s) with matching args`;
    if (candidates.length === 0) return `no calls to ${describePattern(tool)} at all`;
    return `${candidates.length} call(s) but none with matching args`;
  };
  const details = explain();
  return {
    passed,
    description: `called_with(${describePattern(tool)}, args matching predicate)`,
    details,
    matches: matching,
  };
}
253
+ //#endregion
254
+ //#region src/assertions/behavior.ts
255
/**
 * Did the run answer without calling any tool?
 *
 * Passes only when there are zero tool calls AND the final response is
 * non-empty, so that "answered blind" is not confused with "session ended
 * before producing anything". The response length is inspected only when
 * there were no tool calls.
 */
function evaluateRespondedWithoutToolCalls(view, _assertion) {
  const callCount = view.toolCalls.length;
  const passed = callCount === 0 && view.finalResponse.length > 0;
  let details;
  if (passed) details = "no tools called, response non-empty";
  else if (callCount > 0) details = `${callCount} tool call(s) made`;
  else details = "response was empty (session probably aborted)";
  return { passed, description: "responded_without_tool_calls", details };
}
273
/** Assert that the run used at most `assertion.max` turns (`view.usage.numTurns`). */
function evaluateIterationsWithin(view, assertion) {
  const { max } = assertion;
  const turns = view.usage.numTurns;
  const passed = turns <= max;
  return {
    passed,
    description: `iterations_within(${max})`,
    details: `used ${turns} turn(s)`,
  };
}
281
/**
 * Assert that the run cost (`view.usage.totalCostUsd`) is at most
 * `assertion.max` US dollars. Amounts are rendered with four decimals.
 */
function evaluateCostWithinUsd(view, assertion) {
  const { max } = assertion;
  const spent = view.usage.totalCostUsd;
  const passed = spent <= max;
  return {
    passed,
    description: `cost_within_usd(${max.toFixed(4)})`,
    details: `used $${spent.toFixed(4)}`,
  };
}
289
/** Assert that the run took at most `assertion.max` milliseconds (`view.usage.durationMs`). */
function evaluateDurationWithinMs(view, assertion) {
  const { max } = assertion;
  const elapsedMs = view.usage.durationMs;
  const passed = elapsedMs <= max;
  return {
    passed,
    description: `duration_within_ms(${max})`,
    details: `took ${elapsedMs}ms`,
  };
}
297
/**
 * Assert that the final stop reason is one of the allowed reasons.
 * `assertion.reasons` may be a single reason or an array of reasons; a
 * `null` stop reason never passes.
 */
function evaluateFinishedWith(view, assertion) {
  const { reasons } = assertion;
  const allowed = Array.isArray(reasons) ? reasons : [reasons];
  const actual = view.finalStopReason;
  let passed = false;
  if (actual !== null) passed = allowed.includes(actual);
  return {
    passed,
    description: `finished_with(${allowed.join("|")})`,
    details: `actual: ${actual ?? "(none)"}`,
  };
}
306
/** Assert that the final response contains `assertion.text` as a substring. */
function evaluateResponseContains(view, assertion) {
  const { text } = assertion;
  const found = view.finalResponse.includes(text);
  return {
    passed: found,
    description: `response_contains(${JSON.stringify(text)})`,
    details: found ? "text found" : "text not in response",
  };
}
314
/** Assert that the final response does NOT contain `assertion.text`. */
function evaluateResponseNotContains(view, assertion) {
  const { text } = assertion;
  const absent = !view.finalResponse.includes(text);
  return {
    passed: absent,
    description: `response_not_contains(${JSON.stringify(text)})`,
    details: absent ? "text absent" : "forbidden text found",
  };
}
322
/**
 * Assert that the final response matches the regex built from
 * `assertion.pattern` and optional `assertion.flags`. An invalid regex is
 * reported as a failed assertion rather than thrown.
 */
function evaluateResponseMatches(view, assertion) {
  const outcome = { passed: false, details: undefined };
  try {
    const regex = new RegExp(assertion.pattern, assertion.flags);
    outcome.passed = regex.test(view.finalResponse);
    outcome.details = outcome.passed ? "pattern matched" : "pattern did not match";
  } catch (err) {
    outcome.passed = false;
    outcome.details = `invalid regex: ${err instanceof Error ? err.message : String(err)}`;
  }
  return {
    passed: outcome.passed,
    description: `response_matches(/${assertion.pattern}/${assertion.flags ?? ""})`,
    details: outcome.details,
  };
}
338
/**
 * Run an arbitrary user-supplied predicate against the view.
 *
 * Only available from programmatic test definition (the YAML loader cannot
 * produce functions). Thrown errors are caught and reported as failures so
 * one bad predicate doesn't take down a whole eval run.
 *
 * The return value is normalized:
 *  - it is coerced to a boolean, so `passed` is always `true`/`false` in the
 *    report even if the predicate returns some other truthy/falsy value;
 *  - a Promise (or any thenable) is treated as a failure with an explanatory
 *    detail. Evaluation is synchronous, and a pending Promise is always
 *    truthy, so an `async` predicate would otherwise silently pass.
 *
 * @param {unknown} view - Trajectory view handed to the predicate.
 * @param {{fn: (view: unknown) => unknown, description?: string}} assertion
 * @returns {{passed: boolean, description: string, details: string}}
 */
function evaluatePredicate(view, assertion) {
  let passed = false;
  let details;
  try {
    const outcome = assertion.fn(view);
    const isThenable = outcome !== null
      && (typeof outcome === "object" || typeof outcome === "function")
      && typeof outcome.then === "function";
    if (isThenable) {
      // The async result cannot be awaited here. Attach a no-op handler so a
      // later rejection is not an unhandled rejection that kills the run;
      // the failure is already reported through `details`.
      Promise.resolve(outcome).catch(() => {});
      details = "predicate returned a Promise (async predicates are not supported)";
    } else {
      passed = Boolean(outcome);
      details = passed ? "predicate returned true" : "predicate returned false";
    }
  } catch (err) {
    details = `predicate threw: ${err instanceof Error ? err.message : String(err)}`;
  }
  return {
    passed,
    description: assertion.description ?? "predicate(...)",
    details,
  };
}
360
+ //#endregion
361
+ //#region src/assertions/compound.ts
362
/**
 * Compound assertion: passes when every child assertion passes. All
 * children are evaluated (no short-circuit) so each one is reported.
 *
 * @param evaluate - The recursive evaluator used for the children.
 */
function evaluateAllOf(view, assertion, evaluate) {
  const children = assertion.assertions.map((child) => evaluate(view, child));
  const total = children.length;
  const failedCount = children.filter((child) => !child.passed).length;
  const passed = failedCount === 0;
  const noun = total === 1 ? "child" : "children";
  return {
    passed,
    description: `all_of (${total} ${noun})`,
    details: passed ? "all passed" : `${failedCount} of ${total} failed`,
    children,
  };
}
373
/**
 * Compound assertion: passes when at least one child assertion passes. All
 * children are evaluated (no short-circuit) so each one is reported.
 *
 * @param evaluate - The recursive evaluator used for the children.
 */
function evaluateAnyOf(view, assertion, evaluate) {
  const children = assertion.assertions.map((child) => evaluate(view, child));
  const total = children.length;
  let passedCount = 0;
  for (const child of children) {
    if (child.passed) passedCount += 1;
  }
  const passed = passedCount > 0;
  const noun = total === 1 ? "child" : "children";
  return {
    passed,
    description: `any_of (${total} ${noun})`,
    details: passed ? `${passedCount} passed` : "all failed",
    children,
  };
}
384
/**
 * Compound assertion: inverts the result of the single child assertion in
 * `assertion.assertion`. The child result is kept under `children`.
 *
 * @param evaluate - The recursive evaluator used for the child.
 */
function evaluateNot(view, assertion, evaluate) {
  const inner = evaluate(view, assertion.assertion);
  const details = inner.passed
    ? "inner passed (so outer fails)"
    : "inner failed (so outer passes)";
  return {
    passed: !inner.passed,
    description: `not(${inner.description})`,
    details,
    children: [inner],
  };
}
393
+ //#endregion
394
+ //#region src/assertions/evaluator.ts
395
/**
 * Evaluate a single assertion against a trajectory view by dispatching on
 * `assertion.type`.
 *
 * Compound assertions (`all_of`, `any_of`, `not`) receive this function so
 * they can recurse into their children.
 *
 * @throws {Error} When `assertion.type` is not a known assertion type.
 */
function evaluate(view, assertion) {
  switch (assertion.type) {
    // --- tool-call assertions ---
    case "called":
      return evaluateCalled(view, assertion);
    case "not_called":
      return evaluateNotCalled(view, assertion);
    case "called_any_of":
      return evaluateCalledAnyOf(view, assertion);
    case "called_all_of":
      return evaluateCalledAllOf(view, assertion);
    case "called_before":
      return evaluateCalledBefore(view, assertion);
    case "sequence":
      return evaluateSequence(view, assertion);
    case "called_with":
      return evaluateCalledWith(view, assertion);
    // --- behavior / budget / response assertions ---
    case "responded_without_tool_calls":
      return evaluateRespondedWithoutToolCalls(view, assertion);
    case "iterations_within":
      return evaluateIterationsWithin(view, assertion);
    case "cost_within_usd":
      return evaluateCostWithinUsd(view, assertion);
    case "duration_within_ms":
      return evaluateDurationWithinMs(view, assertion);
    case "finished_with":
      return evaluateFinishedWith(view, assertion);
    case "response_contains":
      return evaluateResponseContains(view, assertion);
    case "response_not_contains":
      return evaluateResponseNotContains(view, assertion);
    case "response_matches":
      return evaluateResponseMatches(view, assertion);
    // --- compound assertions (recurse through this function) ---
    case "all_of":
      return evaluateAllOf(view, assertion, evaluate);
    case "any_of":
      return evaluateAnyOf(view, assertion, evaluate);
    case "not":
      return evaluateNot(view, assertion, evaluate);
    // --- programmatic escape hatch ---
    case "predicate":
      return evaluatePredicate(view, assertion);
    default:
      throw new Error(`unknown assertion: ${JSON.stringify(assertion)}`);
  }
}
426
/**
 * Evaluate each assertion in the list on its own and return the results in
 * the same order. Used at the test-case level, where every top-level
 * assertion is reported (and thresholded by the runner) separately.
 */
function evaluateAll(view, assertions) {
  const results = [];
  for (const assertion of assertions) {
    results.push(evaluate(view, assertion));
  }
  return results;
}
434
+ //#endregion
435
+ //#region src/adapters/registry.ts
436
/**
 * Registry of harness adapters keyed by adapter id.
 *
 * Created without a prototype so that lookups by arbitrary id (ids can come
 * from suite YAML) never resolve to inherited `Object.prototype` members
 * such as `constructor` or `toString`.
 */
const ADAPTERS = Object.create(null);
/** Install an adapter shipped with the package. Overwrites without a duplicate check. */
function registerBuiltIn(id, adapter) {
  ADAPTERS[id] = adapter;
}
registerBuiltIn("claude-code", claudeCodeAdapter);
441
/**
 * Register a harness adapter by id.
 *
 * Duplicate ids throw — registration is explicit so accidental overrides
 * surface immediately during startup or test setup.
 *
 * The duplicate check looks at own properties only, so ids that collide
 * with inherited object members (e.g. "constructor") are not reported as
 * already registered. The entry is defined rather than assigned so that an
 * id such as "__proto__" becomes an ordinary entry instead of changing the
 * registry's prototype.
 *
 * @param {string} id - Adapter id.
 * @param {object} adapter - Adapter implementation.
 * @throws {Error} If an adapter with this id is already registered.
 */
function registerAdapter(id, adapter) {
  if (Object.prototype.hasOwnProperty.call(ADAPTERS, id)) {
    throw new Error(`adapter "${id}" is already registered`);
  }
  Object.defineProperty(ADAPTERS, id, {
    value: adapter,
    writable: true,
    enumerable: true,
    configurable: true,
  });
}
451
/** List the ids of every registered adapter, built-in and runtime-registered alike. */
function listAdapters() {
  const ids = Object.keys(ADAPTERS);
  return ids;
}
455
/**
 * Resolve an adapter by id.
 *
 * Only own entries of the registry count: an id that happens to name an
 * inherited object member (e.g. "toString", "constructor") is reported as
 * unknown instead of returning that member.
 *
 * @param {string} id - Adapter id.
 * @returns {object} The registered adapter.
 * @throws {Error} If no adapter is registered under `id`; the message lists
 *   the available ids.
 */
function getAdapter(id) {
  const isRegistered = Object.prototype.hasOwnProperty.call(ADAPTERS, id);
  const adapter = isRegistered ? ADAPTERS[id] : undefined;
  if (!adapter) throw new Error(`unknown adapter "${id}". Available: ${listAdapters().join(", ")}`);
  return adapter;
}
461
/** Adapter id used when the suite YAML does not specify `adapter`. */
const DEFAULT_ADAPTER_ID = "claude-code";
/** Resolve the adapter registered under {@link DEFAULT_ADAPTER_ID}. */
function getDefaultAdapter() {
  const adapter = getAdapter(DEFAULT_ADAPTER_ID);
  return adapter;
}
466
+ //#endregion
467
+ //#region src/config/resolve-config.ts
468
+ /**
469
+ * Flatten nested suite config into harness-specific adapter config.
470
+ */
471
/**
 * Merge generic suite config layers into a flat {@link ClaudeCodeAdapterConfig}.
 *
 * Layers are applied in order, later layers winning. Within one layer the
 * generic keys are applied first and then the keys nested under
 * `claudeCode`, so the harness-specific value wins inside that layer. The
 * `claudeCode` key itself is not copied. `prompt` is always set last.
 *
 * Null or undefined layers are skipped, matching how the generic
 * `Object.assign`-based merge treats them.
 *
 * @param {Array<object | null | undefined>} layers - Config layers, lowest priority first.
 * @param {string} prompt - Prompt for this run; overrides any `prompt` in the layers.
 * @returns {object} Flat adapter config.
 */
function toClaudeCodeConfig(layers, prompt) {
  const merged = {};
  for (const layer of layers) {
    if (layer == null) continue;
    const { claudeCode, ...generic } = layer;
    Object.assign(merged, generic);
    if (claudeCode && typeof claudeCode === "object") Object.assign(merged, claudeCode);
  }
  merged.prompt = prompt;
  return merged;
}
482
/**
 * Resolve merged suite layers into the flat config shape expected by the
 * selected harness adapter.
 *
 * The "claude-code" adapter gets its `claudeCode` sub-config flattened via
 * `toClaudeCodeConfig`; every other adapter gets a plain shallow merge of
 * the layers (later layers win). In both cases `prompt` is set last.
 *
 * @param {string} adapterId - Id of the adapter the config is for.
 * @param {object[]} layers - Config layers, lowest priority first.
 * @param {string} prompt - Prompt for this run.
 * @returns {object} Flat adapter config.
 */
function resolveRunConfig(adapterId, layers, prompt) {
  if (adapterId === "claude-code") return toClaudeCodeConfig(layers, prompt);
  const merged = Object.assign({}, ...layers);
  merged.prompt = prompt;
  return merged;
}
493
+ //#endregion
494
+ //#region src/runner/case.ts
495
/** Number of repetitions run for a case that does not set `repetitions`. */
const DEFAULT_REPETITIONS = 5;
/** Pass-rate an assertion must reach when it does not set `threshold` (1 = every repetition). */
const DEFAULT_THRESHOLD = 1;
499
/**
 * Build the effective adapter config for one (suite, case, cell).
 *
 * Merge order (later wins): defaultConfig < case.config < cell.config.
 * Each layer is optional and falls back to an empty object, so a matrix
 * cell without a `config` merges cleanly instead of handing `undefined` to
 * the layer merge.
 *
 * @param {object} suite - Suite definition (`adapter`, `defaultConfig`).
 * @param {object} testCase - Test case (`config`, `prompt`).
 * @param {object} cell - Matrix cell (`config`).
 * @returns {object} Flat config for the suite's adapter.
 */
function mergeConfig(suite, testCase, cell) {
  const adapterId = suite.adapter ?? getDefaultAdapter().id;
  const layers = [
    suite.defaultConfig ?? {},
    testCase.config ?? {},
    cell.config ?? {},
  ];
  return resolveRunConfig(adapterId, layers, testCase.prompt);
}
511
/** Number of repetitions to run for a case: its own `repetitions`, or 5 when unset. */
function getRepetitions(testCase) {
  const { repetitions } = testCase;
  return repetitions ?? 5;
}
514
/**
 * Run one repetition of a test case and evaluate its assertions.
 *
 * Never rejects: any error thrown by the adapter run or by assertion
 * evaluation is captured in the `error` field, with `adapterResult` set to
 * null and no assertion results.
 *
 * @param testCase - Case whose `assertions[].assertion` are evaluated.
 * @param _cell - Matrix cell (unused here).
 * @param config - Adapter config; its `signal` is used unless `signal` is given.
 * @param repetitionIndex - Index echoed back in the result.
 * @param run - Function that executes the adapter with a config.
 * @param signal - Optional abort signal that takes precedence over `config.signal`.
 */
async function runRepetition(testCase, _cell, config, repetitionIndex, run, signal) {
  const begunAt = Date.now();
  const elapsed = () => Date.now() - begunAt;
  try {
    const effectiveConfig = { ...config, signal: signal ?? config.signal };
    const adapterResult = await run(effectiveConfig);
    const assertions = testCase.assertions.map((entry) => entry.assertion);
    const assertionResults = evaluateAll(adapterResult.view, assertions);
    return {
      repetitionIndex,
      adapterResult,
      error: null,
      assertionResults,
      durationMs: elapsed(),
    };
  } catch (err) {
    const error = extractError(err);
    return {
      repetitionIndex,
      adapterResult: null,
      error,
      assertionResults: [],
      durationMs: elapsed(),
    };
  }
}
538
/**
 * Normalize a thrown value into `{ message, diagnostics }`.
 *
 * `message` is the Error's message, or the stringified value for
 * non-Errors. `diagnostics` is the thrown value's `diagnostics` property
 * when that is a non-null object, otherwise an empty object.
 */
function extractError(err) {
  const message = err instanceof Error ? err.message : String(err);
  const carriesDiagnostics = err !== null && typeof err === "object" && "diagnostics" in err;
  const candidate = carriesDiagnostics ? err.diagnostics : null;
  const diagnostics = candidate !== null && typeof candidate === "object" ? candidate : {};
  return { message, diagnostics };
}
550
/**
 * Aggregate the repetitions of one (case, cell) pair into a cell report.
 *
 * Repetitions with a non-null `error` are counted in `adapterErrors` and
 * excluded from the assertion statistics. For each assertion the pass rate
 * is computed over the remaining repetitions and compared with the
 * assertion's threshold (1 when unset). An assertion with zero evaluated
 * repetitions never meets its threshold. The cell passes when every
 * assertion meets its threshold.
 */
function aggregateCell(testCase, cell, repetitions) {
  const evaluatedReps = [];
  let adapterErrors = 0;
  for (const rep of repetitions) {
    if (rep.error === null) evaluatedReps.push(rep);
    else adapterErrors += 1;
  }
  const evaluatedCount = evaluatedReps.length;

  const assertionStats = testCase.assertions.map((thresholded, index) => {
    const threshold = thresholded.threshold ?? 1;
    let passedCount = 0;
    for (const rep of evaluatedReps) {
      if (rep.assertionResults[index]?.passed) passedCount += 1;
    }
    const passRate = evaluatedCount === 0 ? 0 : passedCount / evaluatedCount;
    // Prefer the description produced by the first evaluated repetition;
    // fall back to the assertion type when nothing was evaluated.
    const description = evaluatedReps[0]?.assertionResults[index]?.description ?? `(${thresholded.assertion.type})`;
    return {
      description,
      threshold,
      passedCount,
      evaluatedCount,
      passRate,
      meetsThreshold: evaluatedCount > 0 && passRate >= threshold,
    };
  });

  const { id, category, notes, prompt, expectations, reference_trajectory, human_ratings } = testCase;
  return {
    caseId: id,
    category,
    notes,
    prompt,
    expectations,
    reference_trajectory,
    human_ratings,
    cell,
    repetitions,
    assertionStats,
    adapterErrors,
    passed: assertionStats.every((stat) => stat.meetsThreshold),
  };
}
583
+ //#endregion
584
+ //#region src/runner/limit.ts
585
/**
 * Create a concurrency limiter.
 *
 * The returned function runs `fn` once fewer than `max` tasks are active
 * and resolves or rejects with `fn`'s outcome. Callers that arrive while
 * all slots are busy are parked in a first-in-first-out queue; each
 * finishing task wakes the oldest parked caller, which re-checks for a free
 * slot before starting.
 *
 * @param {number} max - Maximum number of tasks running at once.
 * @returns {(fn: () => any) => Promise<any>} The limiter.
 * @throws {Error} If `max` is not a positive integer.
 */
function createLimit(max) {
  const isValid = Number.isInteger(max) && max >= 1;
  if (!isValid) throw new Error(`createLimit: max must be a positive integer, got ${max}`);
  let active = 0;
  // Wake-up callbacks of callers waiting for a slot, oldest first.
  const parked = [];
  const waitForWake = () => new Promise((wake) => {
    parked.push(wake);
  });
  const release = () => {
    active--;
    const wake = parked.shift();
    if (wake) wake();
  };
  return async (fn) => {
    while (active >= max) await waitForWake();
    active++;
    try {
      return await fn();
    } finally {
      release();
    }
  };
}
605
+ //#endregion
606
+ //#region src/runner/suite.ts
607
+ /**
608
+ * Suite-level runner.
609
+ */
610
/** Number of repetitions run at once when `options.maxConcurrent` is not given. */
const DEFAULT_MAX_CONCURRENT = 4;
/**
 * Run every (case × matrix cell × repetition) of a suite and build a report.
 *
 * Repetitions are scheduled through a concurrency limiter. Each one merges
 * its config, runs the adapter, and stores its result in a bucket keyed by
 * case id and cell label. Once all have finished, each bucket is sorted by
 * repetition index and aggregated into a cell report, in case-then-cell
 * order.
 *
 * A repetition that starts after `options.signal` was aborted is skipped and
 * contributes no result.
 *
 * Progress events, in order: `suite-start`, then `rep-start`/`rep-complete`
 * per repetition, then `cell-complete` per cell, then `suite-complete`.
 *
 * @param suite - Suite with non-empty `matrix` and `cases`.
 * @param options - Optional `adapter`, `maxConcurrent`, `onProgress`, `signal`.
 * @returns Report with `startedAt`, `durationMs` and `cells`.
 * @throws {Error} If the suite has no matrix cells or no cases.
 */
async function runSuite(suite, options = {}) {
  if (suite.matrix.length === 0) throw new Error("runSuite: suite.matrix must contain at least one cell");
  if (suite.cases.length === 0) throw new Error("runSuite: suite.cases must contain at least one case");

  const adapter = options.adapter ?? getAdapter(suite.adapter ?? getDefaultAdapter().id);
  const run = (config) => adapter.run(config);
  const limit = createLimit(options.maxConcurrent ?? DEFAULT_MAX_CONCURRENT);
  const { onProgress } = options;
  const startTs = Date.now();
  const startedAt = new Date(startTs).toISOString();

  // Expand the suite into one task per (case, cell, repetition).
  const tasks = [];
  for (const testCase of suite.cases) {
    const repetitionCount = getRepetitions(testCase);
    for (const cell of suite.matrix) {
      for (let repetitionIndex = 0; repetitionIndex < repetitionCount; repetitionIndex++) {
        tasks.push({ testCase, cell, repetitionIndex });
      }
    }
  }
  onProgress?.({
    kind: "suite-start",
    totalReps: tasks.length,
  });

  // One result bucket per (case, cell), created up front.
  const bucketKey = (caseId, cellLabel) => `${caseId}::${cellLabel}`;
  const buckets = /* @__PURE__ */ new Map();
  for (const testCase of suite.cases) {
    for (const cell of suite.matrix) {
      buckets.set(bucketKey(testCase.id, cell.label), []);
    }
  }

  const executeTask = async ({ testCase, cell, repetitionIndex }) => {
    if (options.signal?.aborted) return;
    onProgress?.({
      kind: "rep-start",
      caseId: testCase.id,
      cellLabel: cell.label,
      repIndex: repetitionIndex,
    });
    const config = mergeConfig(suite, testCase, cell);
    const result = await runRepetition(testCase, cell, config, repetitionIndex, run, options.signal);
    buckets.get(bucketKey(testCase.id, cell.label)).push(result);
    onProgress?.({
      kind: "rep-complete",
      caseId: testCase.id,
      cellLabel: cell.label,
      repIndex: repetitionIndex,
      ok: result.error === null,
      durationMs: result.durationMs,
      toolCallCount: result.adapterResult?.view.toolCalls.length,
      assertionResults: result.assertionResults,
      errorMessage: result.error?.message,
    });
  };
  await Promise.all(tasks.map((task) => limit(() => executeTask(task))));

  // Aggregate in case-then-cell order.
  const cells = [];
  for (const testCase of suite.cases) {
    for (const cell of suite.matrix) {
      const reps = buckets.get(bucketKey(testCase.id, cell.label)) ?? [];
      reps.sort((left, right) => left.repetitionIndex - right.repetitionIndex);
      const cellReport = aggregateCell(testCase, cell, reps);
      cells.push(cellReport);
      onProgress?.({
        kind: "cell-complete",
        report: cellReport,
      });
    }
  }

  const report = {
    startedAt,
    durationMs: Date.now() - startTs,
    cells,
  };
  onProgress?.({
    kind: "suite-complete",
    report,
  });
  return report;
}
681
+ //#endregion
682
+ export { aggregateCell as a, runRepetition as c, getDefaultAdapter as d, listAdapters as f, evaluateAll as h, DEFAULT_THRESHOLD as i, DEFAULT_ADAPTER_ID as l, evaluate as m, createLimit as n, getRepetitions as o, registerAdapter as p, DEFAULT_REPETITIONS as r, mergeConfig as s, runSuite as t, getAdapter as u };
683
+
684
+ //# sourceMappingURL=suite-chj0j22j.js.map