evalkit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,817 @@
1
+ // src/checks/tool-selection.ts
2
/**
 * Compare the set of tools an agent used against the set that was expected.
 *
 * Both lists are de-duplicated before comparison, so order and repetition
 * do not matter.
 *
 * @param {{ expected: Iterable<string>, actual: Iterable<string> }} input
 * @returns {{ key: string, passed: boolean, details: string, expected: *, actual: string[] }}
 *   `actual` is the de-duplicated list of tools; `expected` is returned as given.
 */
function toolSelection(input) {
  const wanted = new Set(input.expected);
  const seen = new Set(input.actual);

  const missing = [];
  for (const tool of wanted) {
    if (!seen.has(tool)) missing.push(tool);
  }
  const extra = [];
  for (const tool of seen) {
    if (!wanted.has(tool)) extra.push(tool);
  }

  // Each non-empty category contributes one sentence to the failure text.
  const notes = [];
  if (missing.length > 0) notes.push(`Missing: ${missing.join(", ")}`);
  if (extra.length > 0) notes.push(`Unexpected: ${extra.join(", ")}`);

  const passed = notes.length === 0;
  return {
    key: "tool_selection",
    passed,
    details: passed ? "All expected tools matched" : notes.join(". "),
    expected: input.expected,
    actual: Array.from(seen)
  };
}
23
/**
 * Build an evaluator that checks `input.actual` against a fixed expected tool list.
 *
 * @param {{ expected: string[] }} config
 * @returns {(input: { actual: string[] }) => object} evaluator returning a toolSelection result
 */
function createToolSelectionEvaluator(config) {
  return (input) => {
    const request = { expected: config.expected, actual: input.actual };
    return toolSelection(request);
  };
}
26
+
27
+ // src/checks/content-match.ts
28
/**
 * Check that every required substring occurs in the response (case-insensitive).
 *
 * @param {{ responseText: string, mustContain?: string[] }} input
 * @returns {{ key: string, passed: boolean, details: string, missing: string[] }}
 *   `missing` lists the required strings (original casing) that were not found.
 */
function contentMatch(input) {
  const required = input.mustContain;
  // Nothing to look for: the check passes trivially.
  if (!required || required.length === 0) {
    return {
      key: "content_match",
      passed: true,
      details: "No content requirements",
      missing: []
    };
  }

  const haystack = input.responseText.toLowerCase();
  const missing = required.filter((needle) => haystack.indexOf(needle.toLowerCase()) === -1);
  if (missing.length === 0) {
    return { key: "content_match", passed: true, details: "All required content found", missing };
  }
  return {
    key: "content_match",
    passed: false,
    details: `Missing: ${missing.join(", ")}`,
    missing
  };
}
47
/**
 * Build an evaluator that runs contentMatch with a fixed `mustContain` list.
 *
 * @param {{ mustContain?: string[] }} config
 * @returns {(input: { responseText: string }) => object}
 */
function createContentMatchEvaluator(config) {
  return (input) => {
    const request = { responseText: input.responseText, mustContain: config.mustContain };
    return contentMatch(request);
  };
}
50
+
51
+ // src/checks/negative-match.ts
52
/**
 * Check that none of the forbidden substrings occur in the response (case-insensitive).
 *
 * @param {{ responseText: string, mustNotContain?: string[] }} input
 * @returns {{ key: string, passed: boolean, details: string, found: string[] }}
 *   `found` lists the forbidden strings (original casing) that were present.
 */
function negativeMatch(input) {
  const forbidden = input.mustNotContain;
  // No forbidden list means nothing can be violated.
  if (!forbidden || forbidden.length === 0) {
    return {
      key: "negative_match",
      passed: true,
      details: "No negative requirements",
      found: []
    };
  }

  const haystack = input.responseText.toLowerCase();
  const found = forbidden.filter((needle) => haystack.indexOf(needle.toLowerCase()) !== -1);
  if (found.length === 0) {
    return { key: "negative_match", passed: true, details: "No forbidden content found", found };
  }
  return {
    key: "negative_match",
    passed: false,
    details: `Found: ${found.join(", ")}`,
    found
  };
}
71
/**
 * Build an evaluator that runs negativeMatch with a fixed `mustNotContain` list.
 *
 * @param {{ mustNotContain?: string[] }} config
 * @returns {(input: { responseText: string }) => object}
 */
function createNegativeMatchEvaluator(config) {
  return (input) => {
    const request = { responseText: input.responseText, mustNotContain: config.mustNotContain };
    return negativeMatch(request);
  };
}
74
+
75
+ // src/checks/latency.ts
76
/**
 * Check a measured latency against a threshold.
 *
 * @param {{ latencyMs: number, thresholdMs?: number }} input
 *   `thresholdMs` falls back to 20000 when null or undefined.
 * @returns {{ key: string, passed: boolean, details: string, ms: number, threshold: number }}
 *   Passes when `latencyMs` is less than or equal to the threshold.
 */
function latency(input) {
  const { latencyMs, thresholdMs } = input;
  const threshold = thresholdMs ?? 2e4;
  const passed = latencyMs <= threshold;
  const verdict = passed ? "within" : "exceeded";
  return {
    key: "latency",
    passed,
    details: `${latencyMs}ms ${verdict} ${threshold}ms threshold`,
    ms: latencyMs,
    threshold
  };
}
87
/**
 * Build an evaluator that runs the latency check with a fixed threshold.
 *
 * @param {{ thresholdMs?: number }} config
 * @returns {(input: { latencyMs: number }) => object}
 */
function createLatencyEvaluator(config) {
  return (input) => {
    const request = { latencyMs: input.latencyMs, thresholdMs: config.thresholdMs };
    return latency(request);
  };
}
90
+
91
+ // src/checks/json-valid.ts
92
/**
 * Check that a string parses as JSON, optionally requiring an object or array.
 *
 * @param {{ text: string, requireObject?: boolean }} input
 * @returns {{ key: string, passed: boolean, details: string }}
 */
function jsonValid(input) {
  const failure = (details) => ({ key: "json_valid", passed: false, details });

  let value;
  try {
    value = JSON.parse(input.text);
  } catch {
    return failure("Invalid JSON");
  }

  if (input.requireObject) {
    // typeof null is "object", so null needs its own rejection.
    if (value === null) {
      return failure("Expected object or array, got null");
    }
    if (typeof value !== "object") {
      return failure(`Expected object or array, got ${typeof value}`);
    }
  }

  return { key: "json_valid", passed: true, details: "Valid JSON" };
}
116
+ function createJsonValidEvaluator(config) {
117
+ return (input) => jsonValid({ text: input.text, requireObject: config?.requireObject });
118
+ }
119
+
120
+ // src/checks/schema-match.ts
121
/**
 * Check that a parsed value has the required keys and that selected keys
 * have the expected runtime type.
 *
 * Only own properties count as present, so inherited names such as
 * `toString` are not mistaken for data keys. A value that is not a non-null
 * object (e.g. a JSON number, string or null) is treated as an object with
 * no keys instead of raising a TypeError.
 *
 * @param {{ data: *, requiredKeys?: string[], typeChecks?: Record<string, string> }} input
 *   `typeChecks` maps a key to the expected `typeof` result, or "array" for arrays.
 *   Keys that are absent from the data are not type-checked.
 * @returns {{ key: string, passed: boolean, details: string, missingKeys: string[], typeErrors: string[] }}
 */
function schemaMatch(input) {
  const data = input.data !== null && typeof input.data === "object" ? input.data : {};
  const hasKey = (key) => Object.prototype.hasOwnProperty.call(data, key);

  const missingKeys = (input.requiredKeys ?? []).filter((k) => !hasKey(k));

  const typeErrors = [];
  if (input.typeChecks) {
    for (const [key, expectedType] of Object.entries(input.typeChecks)) {
      if (!hasKey(key)) continue;
      // typeof reports "object" for arrays, so they get their own label.
      const actualType = Array.isArray(data[key]) ? "array" : typeof data[key];
      if (actualType !== expectedType) {
        typeErrors.push(`${key}: expected ${expectedType}, got ${actualType}`);
      }
    }
  }

  const problems = [];
  if (missingKeys.length > 0) problems.push(`Missing keys: ${missingKeys.join(", ")}`);
  if (typeErrors.length > 0) problems.push(`Type errors: ${typeErrors.join("; ")}`);

  const passed = problems.length === 0;
  return {
    key: "schema_match",
    passed,
    details: passed ? "Schema valid" : problems.join(". "),
    missingKeys,
    typeErrors
  };
}
146
/**
 * Build an evaluator that runs schemaMatch with fixed key and type requirements.
 *
 * @param {{ requiredKeys: string[], typeChecks?: Record<string, string> }} config
 * @returns {(input: { data: object }) => object}
 */
function createSchemaMatchEvaluator(config) {
  return (input) => {
    const request = {
      data: input.data,
      requiredKeys: config.requiredKeys,
      typeChecks: config.typeChecks
    };
    return schemaMatch(request);
  };
}
153
+
154
+ // src/checks/non-empty.ts
155
// Responses that consist solely of one of these phrases count as non-answers.
const DEFAULT_COP_OUT_PHRASES = [
  "i don't know",
  "n/a",
  "no information",
  "i'm not sure",
  "i cannot",
  "i can't",
  "no data available"
];

/**
 * Check that a response is neither blank nor exactly a "cop-out" phrase.
 *
 * The comparison is against the whole trimmed response, case-insensitively;
 * a response that merely contains such a phrase still passes.
 *
 * @param {{ responseText: string, copOutPhrases?: string[] }} input
 *   `copOutPhrases` replaces the default phrase list when provided.
 * @returns {{ key: string, passed: boolean, details: string }}
 */
function nonEmpty(input) {
  const verdict = (passed, details) => ({ key: "non_empty", passed, details });

  const text = input.responseText.trim();
  if (text.length === 0) {
    return verdict(false, "Response is empty");
  }

  const candidates = input.copOutPhrases ?? DEFAULT_COP_OUT_PHRASES;
  const normalized = text.toLowerCase();
  const hit = candidates.find((phrase) => phrase.toLowerCase() === normalized);
  if (hit) {
    return verdict(false, `Response is a cop-out phrase: "${hit}"`);
  }

  return verdict(true, "Response is non-empty");
}
189
/**
 * Build an evaluator that runs nonEmpty, optionally with a custom phrase list.
 *
 * @param {{ copOutPhrases?: string[] }} [config]
 * @returns {(input: { responseText: string }) => object}
 */
function createNonEmptyEvaluator(config) {
  return (input) => {
    const request = { responseText: input.responseText, copOutPhrases: config?.copOutPhrases };
    return nonEmpty(request);
  };
}
192
+
193
+ // src/checks/length-bounds.ts
194
/**
 * Check that the response length (in UTF-16 code units) lies within optional bounds.
 *
 * @param {{ responseText: string, min?: number, max?: number }} input
 *   A bound that is undefined is not enforced.
 * @returns {{ key: string, passed: boolean, details: string, length: number, min: *, max: * }}
 */
function lengthBounds(input) {
  const { min, max } = input;
  const length = input.responseText.length;

  const problems = [];
  if (min !== undefined && length < min) {
    problems.push(`${length} chars below minimum ${min}`);
  }
  if (max !== undefined && length > max) {
    problems.push(`${length} chars above maximum ${max}`);
  }

  const passed = problems.length === 0;
  return {
    key: "length_bounds",
    passed,
    details: passed ? `${length} chars within bounds` : problems.join(". "),
    length,
    min,
    max
  };
}
215
/**
 * Build an evaluator that runs lengthBounds with fixed bounds.
 *
 * @param {{ min?: number, max?: number }} config
 * @returns {(input: { responseText: string }) => object}
 */
function createLengthBoundsEvaluator(config) {
  return (input) => {
    const request = { responseText: input.responseText, min: config.min, max: config.max };
    return lengthBounds(request);
  };
}
218
+
219
+ // src/checks/regex-match.ts
220
/**
 * Test the response against a list of patterns.
 *
 * @param {{ responseText: string, patterns: Array<string|RegExp>, mode?: "all"|"any" }} input
 *   Mode "all" (the default) requires every pattern to match; any other mode
 *   value passes when at least one pattern matches.
 * @returns {{ key: string, passed: boolean, details: string, failedPatterns: string[] }}
 *   `failedPatterns` holds `String(pattern)` for each pattern that did not match.
 */
function regexMatch(input) {
  const mode = input.mode ?? "all";
  const failedPatterns = [];

  for (const pattern of input.patterns) {
    let regex;
    if (typeof pattern === "string") {
      regex = new RegExp(pattern);
    } else if (pattern instanceof RegExp) {
      // Work on a private copy: a caller's /g or /y regex carries lastIndex
      // between test() calls, which made repeated evaluations alternate
      // between match and no-match (and mutated the caller's object).
      regex = new RegExp(pattern.source, pattern.flags);
    } else {
      regex = pattern;
    }
    if (!regex.test(input.responseText)) {
      failedPatterns.push(String(pattern));
    }
  }

  const passed = mode === "all"
    ? failedPatterns.length === 0
    : failedPatterns.length < input.patterns.length;

  let details;
  if (passed) {
    details = mode === "all" ? "All patterns matched" : "At least one pattern matched";
  } else {
    details = mode === "all" ? `Failed patterns: ${failedPatterns.join(", ")}` : "No patterns matched";
  }

  return {
    key: "regex_match",
    passed,
    details,
    failedPatterns
  };
}
248
/**
 * Build an evaluator that runs regexMatch with fixed patterns and mode.
 *
 * @param {{ patterns: Array<string|RegExp>, mode?: string }} config
 * @returns {(input: { responseText: string }) => object}
 */
function createRegexMatchEvaluator(config) {
  return (input) => {
    const request = { responseText: input.responseText, patterns: config.patterns, mode: config.mode };
    return regexMatch(request);
  };
}
251
+
252
+ // src/checks/tool-call-count.ts
253
/**
 * Check that the number of tool calls lies within optional bounds.
 *
 * @param {{ count: number, min?: number, max?: number }} input
 *   A bound that is undefined is not enforced.
 * @returns {{ key: string, passed: boolean, details: string, count: number, min: *, max: * }}
 */
function toolCallCount(input) {
  const { count, min, max } = input;

  const problems = [];
  if (min !== undefined && count < min) {
    problems.push(`${count} calls below minimum ${min}`);
  }
  if (max !== undefined && count > max) {
    problems.push(`${count} calls above maximum ${max}`);
  }

  const passed = problems.length === 0;
  return {
    key: "tool_call_count",
    passed,
    details: passed ? `${count} tool calls within bounds` : problems.join(". "),
    count,
    min,
    max
  };
}
273
/**
 * Build an evaluator that runs toolCallCount with fixed bounds.
 *
 * @param {{ min?: number, max?: number }} config
 * @returns {(input: { count: number }) => object}
 */
function createToolCallCountEvaluator(config) {
  return (input) => {
    const request = { count: input.count, min: config.min, max: config.max };
    return toolCallCount(request);
  };
}
276
+
277
+ // src/checks/cost-budget.ts
278
/**
 * Check that an actual cost does not exceed a budget.
 *
 * @param {{ actual: number, budget: number }} input
 * @returns {{ key: string, passed: boolean, details: string, actual: number, budget: number }}
 *   Passes when `actual` is less than or equal to `budget`.
 */
function costBudget(input) {
  const { actual, budget } = input;
  const passed = actual <= budget;
  const verdict = passed ? "within budget of" : "exceeded budget of";
  return {
    key: "cost_budget",
    passed,
    details: `${actual} ${verdict} ${budget}`,
    actual,
    budget
  };
}
288
/**
 * Build an evaluator that runs costBudget with a fixed budget.
 *
 * @param {{ budget: number }} config
 * @returns {(input: { actual: number }) => object}
 */
function createCostBudgetEvaluator(config) {
  return (input) => {
    const request = { actual: input.actual, budget: config.budget };
    return costBudget(request);
  };
}
291
+
292
+ // src/checks/run-checks.ts
293
/**
 * Run every check whose inputs are present and aggregate the outcomes.
 *
 * A check is included only when the fields it needs are not undefined;
 * checks run in a fixed order and each contributes one entry to `results`.
 *
 * @param {object} input  Flat bag of optional check inputs (expectedTools,
 *   actualTools, responseText, mustContain, mustNotContain, latencyMs,
 *   thresholdMs, json, schema, copOutPhrases, lengthMin, lengthMax,
 *   regexPatterns, regexMode, toolCallCountValue, toolCallMin, toolCallMax,
 *   costActual, costBudget).
 * @returns {{ passed: boolean, results: object[], summary: string }}
 *   `passed` is true only when at least one check ran and all of them passed.
 */
function runChecks(input) {
  const given = (value) => value !== undefined;
  const { responseText } = input;
  const hasText = given(responseText);
  const results = [];

  if (given(input.expectedTools) && given(input.actualTools)) {
    results.push(toolSelection({ expected: input.expectedTools, actual: input.actualTools }));
  }
  if (given(input.mustContain) && hasText) {
    results.push(contentMatch({ responseText, mustContain: input.mustContain }));
  }
  if (given(input.mustNotContain) && hasText) {
    results.push(negativeMatch({ responseText, mustNotContain: input.mustNotContain }));
  }
  if (given(input.latencyMs)) {
    results.push(latency({ latencyMs: input.latencyMs, thresholdMs: input.thresholdMs }));
  }
  if (given(input.json)) {
    results.push(jsonValid(input.json));
  }
  if (given(input.schema)) {
    results.push(schemaMatch(input.schema));
  }
  if (given(input.copOutPhrases) && hasText) {
    results.push(nonEmpty({ responseText, copOutPhrases: input.copOutPhrases }));
  }
  if ((given(input.lengthMin) || given(input.lengthMax)) && hasText) {
    results.push(lengthBounds({ responseText, min: input.lengthMin, max: input.lengthMax }));
  }
  if (given(input.regexPatterns) && hasText) {
    results.push(regexMatch({ responseText, patterns: input.regexPatterns, mode: input.regexMode }));
  }
  if (given(input.toolCallCountValue)) {
    results.push(toolCallCount({ count: input.toolCallCountValue, min: input.toolCallMin, max: input.toolCallMax }));
  }
  if (given(input.costActual) && given(input.costBudget)) {
    results.push(costBudget({ actual: input.costActual, budget: input.costBudget }));
  }

  let passedCount = 0;
  for (const outcome of results) {
    if (outcome.passed) passedCount += 1;
  }

  return {
    // An empty run is not a pass at this level.
    passed: results.length > 0 && passedCount === results.length,
    results,
    summary: `${passedCount}/${results.length} checks passed`
  };
}
336
+
337
+ // src/runner/loader.ts
338
+ import fs from "fs";
339
+ import path from "path";
340
/**
 * Remove a trailing YAML comment from a single line.
 *
 * A `#` starts a comment only when it is outside quotes and is either the
 * first character of the line or preceded by whitespace. A `#` embedded in a
 * token (for example a URL fragment such as `http://host/#top`) is part of
 * the value and is kept.
 *
 * @param {string} line  One line of YAML text.
 * @returns {string} The line without the comment, with trailing whitespace
 *   trimmed when a comment was removed; otherwise the line unchanged.
 */
function stripComment(line) {
  let inSingle = false;
  let inDouble = false;
  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (ch === "'" && !inDouble) {
      inSingle = !inSingle;
    } else if (ch === '"' && !inSingle) {
      inDouble = !inDouble;
    } else if (ch === "#" && !inSingle && !inDouble) {
      const startsComment = i === 0 || /\s/.test(line[i - 1]);
      if (startsComment) {
        return line.slice(0, i).trimEnd();
      }
    }
  }
  return line;
}
353
/**
 * Split YAML text into significant lines with their indentation.
 *
 * Comments are stripped, and lines that are empty or whitespace-only
 * afterwards are dropped. Both LF and CRLF line endings are accepted; with
 * the previous LF-only split, a CRLF file left a trailing "\r" on every
 * line, so a bare `key:` line was no longer recognised as a key.
 *
 * @param {string} content  Raw YAML text.
 * @returns {Array<{ indent: number, raw: string, content: string }>}
 *   `indent` is the index of the first non-whitespace character, `raw` the
 *   comment-stripped line and `content` that line without leading whitespace.
 */
function tokenize(content) {
  const lines = [];
  for (const raw of content.split(/\r?\n/)) {
    const stripped = stripComment(raw);
    // search() returns -1 exactly when the line is empty or all whitespace.
    const indent = stripped.search(/\S/);
    if (indent === -1) continue;
    lines.push({ indent, raw: stripped, content: stripped.slice(indent) });
  }
  return lines;
}
364
/**
 * Convert a raw YAML scalar token to a JavaScript value.
 *
 * Recognises `true`/`false`, `null`/`~`, quoted strings (the surrounding
 * quotes are removed, no escape processing) and plain decimal numbers with
 * an optional leading minus and optional fraction. Anything else is
 * returned unchanged as a string.
 *
 * @param {string} value  Trimmed scalar text.
 * @returns {boolean|null|number|string}
 */
function parseScalar(value) {
  switch (value) {
    case "true":
      return true;
    case "false":
      return false;
    case "null":
    case "~":
      return null;
    default:
      break;
  }

  const first = value[0];
  const last = value[value.length - 1];
  const doubleQuoted = first === '"' && last === '"';
  const singleQuoted = first === "'" && last === "'";
  if (doubleQuoted || singleQuoted) {
    return value.slice(1, -1);
  }

  return /^-?\d+(\.\d+)?$/.test(value) ? Number(value) : value;
}
376
/**
 * Parse a single-line YAML flow sequence such as `[a, "b, c", [1, 2]]`.
 *
 * Items are split on commas that are outside quotes and outside nested
 * brackets. Nested flow sequences are parsed recursively into arrays;
 * previously the bracket depth was tracked while splitting but the nested
 * item was then returned as its raw text. All other items go through
 * parseScalar.
 *
 * @param {string} raw  Text that starts with `[` and ends with `]`.
 * @returns {Array} Parsed items; an empty array for `[]`.
 */
function parseFlowArray(raw) {
  const inner = raw.slice(1, -1).trim();
  if (inner === "") return [];

  const items = [];
  let current = "";
  let depth = 0;
  let inSingle = false;
  let inDouble = false;
  for (let i = 0; i < inner.length; i++) {
    const ch = inner[i];
    const quoted = inSingle || inDouble;
    if (ch === "'" && !inDouble) {
      inSingle = !inSingle;
      current += ch;
    } else if (ch === '"' && !inSingle) {
      inDouble = !inDouble;
      current += ch;
    } else if (ch === "," && depth === 0 && !quoted) {
      // Top-level comma: the accumulated text is one complete item.
      items.push(current.trim());
      current = "";
    } else {
      if (!quoted) {
        if (ch === "[") depth++;
        else if (ch === "]") depth--;
      }
      current += ch;
    }
  }
  if (current.trim() !== "") items.push(current.trim());

  return items.map((item) =>
    item.startsWith("[") && item.endsWith("]") ? parseFlowArray(item) : parseScalar(item)
  );
}
408
/**
 * Parse consecutive `key: value` lines at one indentation level into an object.
 *
 * Parsing stops at the first line whose indent differs from `baseIndent` or
 * that starts a sequence item (`- `). A key with no inline value takes the
 * more-indented lines that follow as a nested sequence or mapping, or null
 * when there are none.
 *
 * @param {Array<{ indent: number, content: string }>} lines  Tokenized lines.
 * @param {number} start  Index of the first line to consume.
 * @param {number} baseIndent  Indentation of this mapping's keys.
 * @returns {[object, number]} The mapping and the index of the first unconsumed line.
 * @throws {Error} When a line at this level is not a key-value pair.
 */
function parseBlock(lines, start, baseIndent) {
  const mapping = {};
  let cursor = start;

  while (cursor < lines.length) {
    const { indent, content } = lines[cursor];
    // Any other indentation, or a sequence item, ends this mapping.
    if (indent !== baseIndent || content.startsWith("- ")) break;

    const colonIdx = findColon(content);
    if (colonIdx === -1) {
      throw new Error(`YAML parse error: expected key-value pair, got "${content}"`);
    }
    const key = content.slice(0, colonIdx).trim();
    const valueRaw = content.slice(colonIdx + 1).trim();

    if (valueRaw !== "") {
      // Inline value: either a flow array or a scalar.
      const isFlowArray = valueRaw.startsWith("[") && valueRaw.endsWith("]");
      mapping[key] = isFlowArray ? parseFlowArray(valueRaw) : parseScalar(valueRaw);
      cursor += 1;
      continue;
    }

    const next = lines[cursor + 1];
    if (next === undefined || next.indent <= baseIndent) {
      // Nothing nested under this key.
      mapping[key] = null;
      cursor += 1;
      continue;
    }

    const parseChild = next.content.startsWith("- ") ? parseSequence : parseBlock;
    const [child, resumeAt] = parseChild(lines, cursor + 1, next.indent);
    mapping[key] = child;
    cursor = resumeAt;
  }

  return [mapping, cursor];
}
452
/**
 * Parse the value of a `key: value` entry that appears inside a sequence item.
 *
 * @param {Array<{ indent: number, content: string }>} lines  Tokenized lines.
 * @param {number} index  Index of the line holding the key.
 * @param {string} valueRaw  Trimmed text after the colon.
 * @param {number} ownerIndent  Indent of the keys of the enclosing item;
 *   only lines indented deeper than this are taken as the nested value.
 * @returns {[*, number]} The value and the index of the first unconsumed line.
 */
function parseSequenceEntryValue(lines, index, valueRaw, ownerIndent) {
  if (valueRaw === "") {
    const next = lines[index + 1];
    if (next !== undefined && next.indent > ownerIndent) {
      return next.content.startsWith("- ")
        ? parseSequence(lines, index + 1, next.indent)
        : parseBlock(lines, index + 1, next.indent);
    }
    return [null, index + 1];
  }
  if (valueRaw.startsWith("[") && valueRaw.endsWith("]")) {
    return [parseFlowArray(valueRaw), index + 1];
  }
  return [parseScalar(valueRaw), index + 1];
}

/**
 * Parse consecutive `- item` lines at one indentation level into an array.
 *
 * Supported item forms: an empty dash followed by a more-indented mapping,
 * an inline mapping (`- key: value` with further keys aligned two columns
 * right of the dash), a flow array, or a scalar. Parsing stops at the first
 * line whose indent differs from `baseIndent` or that is not a `- ` item.
 *
 * The value handling for the first key and for sibling keys of an inline
 * mapping was previously written out twice; it now lives in
 * parseSequenceEntryValue. Behaviour is unchanged.
 *
 * @param {Array<{ indent: number, content: string }>} lines  Tokenized lines.
 * @param {number} start  Index of the first line to consume.
 * @param {number} baseIndent  Indentation of this sequence's dashes.
 * @returns {[Array, number]} The items and the index of the first unconsumed line.
 */
function parseSequence(lines, start, baseIndent) {
  const result = [];
  let i = start;

  while (i < lines.length && lines[i].indent === baseIndent) {
    const line = lines[i];
    const content = line.content;
    if (!content.startsWith("- ")) break;
    const afterDash = content.slice(2).trim();

    if (afterDash === "") {
      // Bare dash: the item is the more-indented mapping that follows, if any.
      if (i + 1 < lines.length && lines[i + 1].indent > baseIndent) {
        const [nested, newI] = parseBlock(lines, i + 1, lines[i + 1].indent);
        result.push(nested);
        i = newI;
      } else {
        result.push(null);
        i++;
      }
    } else if (findColon(afterDash) !== -1 && !afterDash.startsWith('"') && !afterDash.startsWith("'")) {
      // Inline mapping: sibling keys line up with the first key, two columns
      // to the right of the dash.
      const inlineIndent = line.indent + 2;
      const obj = {};

      const colonIdx = findColon(afterDash);
      const firstKey = afterDash.slice(0, colonIdx).trim();
      const firstValRaw = afterDash.slice(colonIdx + 1).trim();
      const [firstValue, afterFirst] = parseSequenceEntryValue(lines, i, firstValRaw, inlineIndent);
      obj[firstKey] = firstValue;
      i = afterFirst;

      while (i < lines.length && lines[i].indent === inlineIndent) {
        const sibContent = lines[i].content;
        if (sibContent.startsWith("- ")) break;
        const sibColon = findColon(sibContent);
        if (sibColon === -1) break;
        const sibKey = sibContent.slice(0, sibColon).trim();
        const sibValRaw = sibContent.slice(sibColon + 1).trim();
        const [sibValue, afterSib] = parseSequenceEntryValue(lines, i, sibValRaw, inlineIndent);
        obj[sibKey] = sibValue;
        i = afterSib;
      }
      result.push(obj);
    } else if (afterDash.startsWith("[") && afterDash.endsWith("]")) {
      result.push(parseFlowArray(afterDash));
      i++;
    } else {
      result.push(parseScalar(afterDash));
      i++;
    }
  }

  return [result, i];
}
545
/**
 * Find the colon that separates a YAML key from its value.
 *
 * A colon qualifies only when it is outside single and double quotes and is
 * either the last character or followed by a space.
 *
 * @param {string} str  Line content without leading indentation.
 * @returns {number} Index of the separating colon, or -1 when there is none.
 */
function findColon(str) {
  // The quote character currently open, or null when outside quotes.
  let openQuote = null;
  const lastIndex = str.length - 1;

  for (let pos = 0; pos <= lastIndex; pos++) {
    const ch = str[pos];
    if (ch === "'" || ch === '"') {
      if (openQuote === null) {
        openQuote = ch;
      } else if (openQuote === ch) {
        openQuote = null;
      }
      continue;
    }
    if (ch !== ":" || openQuote !== null) continue;
    if (pos === lastIndex || str[pos + 1] === " ") {
      return pos;
    }
  }
  return -1;
}
560
/**
 * Parse a YAML suite definition (subset parser) and validate it.
 *
 * The document must be a top-level mapping. If the block parser stops before
 * the last line (for example at an unexpectedly indented line, a top-level
 * sequence item, or the continuation lines of an unsupported block scalar),
 * an error is raised rather than silently discarding the rest of the file.
 *
 * @param {string} content  YAML text.
 * @returns {{ name: (string|undefined), test_cases: object[] }} Validated suite config.
 * @throws {Error} When the text is empty, cannot be fully parsed, or fails validation.
 */
function parseYaml(content) {
  const lines = tokenize(content);
  if (lines.length === 0) {
    throw new Error("YAML is empty");
  }
  const [parsed, consumed] = parseBlock(lines, 0, lines[0].indent);
  if (consumed < lines.length) {
    throw new Error(`YAML parse error: unexpected content "${lines[consumed].content}"`);
  }
  return validate(parsed);
}
568
/**
 * Parse a JSON suite definition and validate it.
 *
 * @param {string} content  JSON text.
 * @returns {{ name: (string|undefined), test_cases: object[] }} Validated suite config.
 * @throws {SyntaxError|Error} When the text is not valid JSON or fails validation.
 */
function parseJson(content) {
  return validate(JSON.parse(content));
}
572
/**
 * Read a suite definition from disk, choosing the parser by file extension.
 *
 * @param {string} filePath  Path to a `.json`, `.yaml` or `.yml` file
 *   (extension compared case-insensitively).
 * @returns {{ name: (string|undefined), test_cases: object[] }} Validated suite config.
 * @throws {Error} For any other extension, or when reading or parsing fails.
 */
function loadFile(filePath) {
  const ext = path.extname(filePath).toLowerCase();
  const isJson = ext === ".json";
  const isYaml = ext === ".yaml" || ext === ".yml";
  // Reject unsupported extensions before touching the file system.
  if (!isJson && !isYaml) {
    throw new Error(`Unsupported file extension: ${ext}. Use .json, .yaml, or .yml`);
  }
  const content = fs.readFileSync(filePath, "utf-8");
  return isJson ? parseJson(content) : parseYaml(content);
}
583
/**
 * Resolve a suite definition from either a file path or an in-memory object.
 *
 * @param {string|object} source  A string is treated as a file path and
 *   loaded; anything else is validated directly.
 * @returns {{ name: (string|undefined), test_cases: object[] }} Validated suite config.
 */
function loadCases(source) {
  return typeof source === "string" ? loadFile(source) : validate(source);
}
589
/**
 * Validate the shape of a suite config and return its normalised form.
 *
 * Requires an object with a `test_cases` array whose entries are objects
 * with string `id` and `query`. Only `name` and `test_cases` are carried
 * into the result; a non-string `name` becomes undefined.
 *
 * @param {*} data  Parsed suite definition.
 * @returns {{ name: (string|undefined), test_cases: object[] }}
 * @throws {Error} When any of the requirements above is not met.
 */
function validate(data) {
  if (data === null || typeof data !== "object") {
    throw new Error("Invalid suite config: expected an object");
  }

  const testCases = data.test_cases;
  if (!Array.isArray(testCases)) {
    throw new Error("Invalid suite config: missing test_cases array");
  }

  for (const entry of testCases) {
    if (entry === null || typeof entry !== "object") {
      throw new Error("Invalid test case: expected an object");
    }
    if (typeof entry.id !== "string") {
      throw new Error("Invalid test case: missing or non-string id");
    }
    if (typeof entry.query !== "string") {
      throw new Error("Invalid test case: missing or non-string query");
    }
  }

  const name = typeof data.name === "string" ? data.name : undefined;
  return { name, test_cases: testCases };
}
614
+
615
+ // src/runner/run-suite.ts
616
/**
 * Run every test case of a suite against an agent and summarise the outcome.
 *
 * @param {object} options
 * @param {string|object} options.cases  File path or in-memory suite config.
 * @param {Function} options.agent  Called with each case's query; see runCase.
 * @param {string} [options.name]  Overrides the suite's own name.
 * @param {number} [options.concurrency=1]  Maximum number of cases in flight;
 *   values of 1 or less run the cases strictly one after another.
 * @param {Function} [options.onCaseComplete]  Invoked with each case result as
 *   soon as that case finishes (completion order when running concurrently).
 * @returns {Promise<{ name: string, passed: number, failed: number, total: number, cases: object[], duration: number }>}
 *   `cases` is always in the same order as the suite's `test_cases`, whatever
 *   the concurrency; `duration` is wall-clock milliseconds.
 */
async function runSuite(options) {
  const config = loadCases(options.cases);
  const suiteName = options.name ?? config.name ?? "unnamed";
  const concurrency = options.concurrency ?? 1;
  const startTime = Date.now();
  const cases = config.test_cases;

  // Results are stored by case index so that concurrent completion order
  // cannot reorder the report.
  const results = new Array(cases.length);
  const runAt = async (index) => {
    const result = await runCase(cases[index], options.agent);
    results[index] = result;
    options.onCaseComplete?.(result);
  };

  if (concurrency <= 1) {
    for (let index = 0; index < cases.length; index++) {
      await runAt(index);
    }
  } else {
    let nextIndex = 0;
    // Each worker repeatedly claims the next unclaimed index. The claim is
    // synchronous, so two workers can never take the same case.
    const worker = async () => {
      while (nextIndex < cases.length) {
        const claimed = nextIndex++;
        await runAt(claimed);
      }
    };
    const workers = Array.from(
      { length: Math.min(concurrency, cases.length) },
      () => worker()
    );
    await Promise.all(workers);
  }

  const passedCount = results.filter((r) => r.passed).length;
  const duration = Date.now() - startTime;
  return {
    name: suiteName,
    passed: passedCount,
    failed: results.length - passedCount,
    total: results.length,
    cases: results,
    duration
  };
}
656
/**
 * Run one test case: call the agent, translate the case's `checks` config
 * plus the agent's output into check inputs, and evaluate them.
 *
 * @param {{ id: string, query: string, checks?: object, metadata?: * }} tc
 * @param {Function} agent  Called with `tc.query`; expected to resolve to an
 *   object with `responseText` and optionally `actualTools`, `latencyMs`,
 *   `toolCallCount` and `cost`.
 * @returns {Promise<{ id: string, query: string, passed: boolean, checks: object, metadata: *, agentResult: object }>}
 *   If the agent throws, the case fails with a single `agent_error` check.
 *   A case for which no check ran counts as passed.
 */
async function runCase(tc, agent) {
  let agentResult;
  try {
    agentResult = await agent(tc.query);
  } catch (err) {
    return {
      id: tc.id,
      query: tc.query,
      passed: false,
      checks: {
        passed: false,
        results: [{
          key: "agent_error",
          passed: false,
          details: `Agent threw: ${err instanceof Error ? err.message : String(err)}`
        }],
        summary: "0/1 checks passed"
      },
      metadata: tc.metadata,
      agentResult: { responseText: "" }
    };
  }

  const checks = tc.checks ?? {};
  const input = { responseText: agentResult.responseText };

  if (checks.expectedTools !== undefined) {
    input.expectedTools = checks.expectedTools;
    // An agent that reports no tool list is treated as having called no
    // tools. Leaving this undefined made runChecks skip the tool check, so a
    // case with expected tools could pass without being verified.
    input.actualTools = agentResult.actualTools ?? [];
  }

  // Options that are forwarded unchanged when the case configures them.
  const passthrough = [
    "mustContain",
    "mustNotContain",
    "thresholdMs",
    "copOutPhrases",
    "lengthMin",
    "lengthMax",
    "regexMode",
    "toolCallMin",
    "toolCallMax",
    "costBudget"
  ];
  for (const option of passthrough) {
    if (checks[option] !== undefined) {
      input[option] = checks[option];
    }
  }

  if (agentResult.latencyMs !== undefined) {
    input.latencyMs = agentResult.latencyMs;
  }

  if (checks.json !== undefined) {
    input.json = {
      text: agentResult.responseText,
      requireObject: checks.json.requireObject
    };
  }

  if (checks.schema !== undefined) {
    let data = {};
    try {
      const parsed = JSON.parse(agentResult.responseText);
      // Only objects and arrays can carry keys. Scalars and null are handled
      // like unparseable text so the schema check reports missing keys
      // instead of receiving a value it cannot inspect.
      if (parsed !== null && typeof parsed === "object") {
        data = parsed;
      }
    } catch {
      // Not JSON: fall through with the empty object.
    }
    input.schema = {
      data,
      requiredKeys: checks.schema.requiredKeys,
      typeChecks: checks.schema.typeChecks
    };
  }

  if (checks.regexPatterns !== undefined) {
    input.regexPatterns = checks.regexPatterns.map((p) => new RegExp(p));
  }

  // Prefer the agent's explicit count; otherwise infer it from the tool list.
  const toolCallCountValue = agentResult.toolCallCount ?? agentResult.actualTools?.length;
  if (toolCallCountValue !== undefined) {
    input.toolCallCountValue = toolCallCountValue;
  }

  if (agentResult.cost !== undefined) {
    input.costActual = agentResult.cost;
  }

  const suiteResult = runChecks(input);
  // runChecks reports an empty run as not passed; a case with nothing to
  // check is considered passing here.
  const passed = suiteResult.results.length === 0 ? true : suiteResult.passed;
  return {
    id: tc.id,
    query: tc.query,
    passed,
    checks: { ...suiteResult, passed },
    metadata: tc.metadata,
    agentResult
  };
}
764
+
765
+ // src/runner/reporter.ts
766
/**
 * Print a human-readable report of a suite run to the console.
 *
 * One line per case (id, query truncated to 50 characters, PASS/FAIL and the
 * latency in seconds when the agent reported one), followed for failing
 * cases by each failed check, and a final totals line.
 *
 * @param {{ name: string, cases: object[], passed: number, total: number, duration: number }} result
 *   A result object as returned by runSuite.
 * @returns {void}
 */
function printSuiteResult(result) {
  const toSeconds = (ms) => (ms / 1e3).toFixed(1);

  console.log(`\nSuite: ${result.name}`);
  console.log("=".repeat(60));

  for (const c of result.cases) {
    const status = c.passed ? "PASS" : "FAIL";
    let query = c.query;
    if (query.length > 50) {
      query = query.slice(0, 47) + "...";
    }
    const reportedMs = c.agentResult.latencyMs;
    const latency2 = reportedMs !== undefined ? `${toSeconds(reportedMs)}s` : "";
    console.log(` ${c.id} ${query.padEnd(50)} ${status} ${latency2}`);

    if (c.passed) continue;
    for (const check of c.checks.results) {
      if (check.passed) continue;
      console.log(` ${check.key}: ${check.details}`);
    }
  }

  console.log(`\n${result.passed}/${result.total} passed (${toSeconds(result.duration)}s)`);
}
786
+ export {
787
+ contentMatch,
788
+ costBudget,
789
+ createContentMatchEvaluator,
790
+ createCostBudgetEvaluator,
791
+ createJsonValidEvaluator,
792
+ createLatencyEvaluator,
793
+ createLengthBoundsEvaluator,
794
+ createNegativeMatchEvaluator,
795
+ createNonEmptyEvaluator,
796
+ createRegexMatchEvaluator,
797
+ createSchemaMatchEvaluator,
798
+ createToolCallCountEvaluator,
799
+ createToolSelectionEvaluator,
800
+ jsonValid,
801
+ latency,
802
+ lengthBounds,
803
+ loadCases,
804
+ loadFile,
805
+ negativeMatch,
806
+ nonEmpty,
807
+ parseJson,
808
+ parseYaml,
809
+ printSuiteResult,
810
+ regexMatch,
811
+ runChecks,
812
+ runSuite,
813
+ schemaMatch,
814
+ toolCallCount,
815
+ toolSelection
816
+ };
817
+ //# sourceMappingURL=index.js.map