@m4trix/evals 0.25.1 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -4,10 +4,10 @@ var effect = require('effect');
4
4
  var diff = require('diff');
5
5
  var stringify = require('fast-json-stable-stringify');
6
6
  var crypto = require('crypto');
7
- var fs = require('fs');
7
+ var promises = require('fs/promises');
8
8
  var path = require('path');
9
+ var fs = require('fs');
9
10
  var jitiModule = require('jiti');
10
- var promises = require('fs/promises');
11
11
  var url = require('url');
12
12
 
13
13
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
@@ -34,6 +34,249 @@ function _interopNamespace(e) {
34
34
  var stringify__default = /*#__PURE__*/_interopDefault(stringify);
35
35
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
36
36
 
37
+ // src/index.ts
38
// Canonical entity ids: one or more ASCII letters, digits, underscores or hyphens.
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;

/**
 * Builds the branded string schema used for a canonical entity id.
 *
 * The schema pipes `Schema.String` through effect's `trimmed`, `minLength(1)`
 * and `pattern(ENTITY_ID_PATTERN)` checks and finally brands the result.
 *
 * @param brand Brand name passed to `Schema.brand`.
 * @param label Human-readable entity label used at the start of error messages.
 * @returns The composed effect schema.
 */
function makeEntityIdSchema(brand, label) {
  const nonEmpty = effect.Schema.minLength(1, {
    message: () => `${label} must be non-empty.`
  });
  const idCharactersOnly = effect.Schema.pattern(ENTITY_ID_PATTERN, {
    message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
  });
  return effect.Schema.String.pipe(
    effect.Schema.trimmed(),
    nonEmpty,
    idCharactersOnly,
    effect.Schema.brand(brand)
  );
}

// One schema per entity kind; they differ only in brand and error-message label.
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
var DatasetNameSchema = makeEntityIdSchema("DatasetName", "Dataset name");
55
/**
 * Decodes `raw` with `schema` and returns the decoded value.
 *
 * String input is trimmed before decoding. Any other input (for example a
 * missing `name`) is handed to the decoder unchanged, so the failure is
 * reported through the schema with the `context` prefix instead of surfacing
 * as a bare TypeError from calling `.trim()` on a non-string.
 *
 * @param schema Effect schema to decode with.
 * @param raw Candidate value, normally a string.
 * @param context Prefix for the error message (e.g. "Dataset.define").
 * @returns The decoded value (`result.right`).
 * @throws {Error} `${context}: <formatted parse error>` when decoding fails.
 */
function validateWithSchema(schema, raw, context) {
  const candidate = typeof raw === "string" ? raw.trim() : raw;
  const result = effect.Schema.decodeUnknownEither(schema)(candidate);
  if (effect.Either.isLeft(result)) {
    throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
  }
  return result.right;
}
66
/**
 * Validates a RunConfig id via `validateWithSchema` and `RunConfigNameSchema`.
 * @param raw Candidate name.
 * @param context Prefix for the error message on failure.
 */
function validateRunConfigName(raw, context) {
  const schema = RunConfigNameSchema;
  return validateWithSchema(schema, raw, context);
}

/**
 * Validates an Evaluator id via `validateWithSchema` and `EvaluatorNameSchema`.
 * @param raw Candidate name.
 * @param context Prefix for the error message on failure.
 */
function validateEvaluatorName(raw, context) {
  const schema = EvaluatorNameSchema;
  return validateWithSchema(schema, raw, context);
}

/**
 * Validates a TestCase id via `validateWithSchema` and `TestCaseNameSchema`.
 * @param raw Candidate name.
 * @param context Prefix for the error message on failure.
 */
function validateTestCaseName(raw, context) {
  const schema = TestCaseNameSchema;
  return validateWithSchema(schema, raw, context);
}

/**
 * Validates a Dataset id via `validateWithSchema` and `DatasetNameSchema`.
 * @param raw Candidate name.
 * @param context Prefix for the error message on failure.
 */
function validateDatasetName(raw, context) {
  const schema = DatasetNameSchema;
  return validateWithSchema(schema, raw, context);
}
78
/**
 * Normalizes an optional display name.
 *
 * `undefined` and `null` both mean "no display name"; `null` previously
 * crashed on `.trim()`. Strings are trimmed, and a string that is empty after
 * trimming is also treated as absent.
 *
 * @param raw Display name as supplied by the caller.
 * @returns The trimmed name, or `undefined` when none was provided.
 */
function normalizeOptionalDisplayName(raw) {
  if (raw == null) {
    return void 0;
  }
  const trimmed = raw.trim();
  return trimmed.length > 0 ? trimmed : void 0;
}
85
+
86
+ // src/evals/dataset.ts
87
/**
 * Tells whether `value` satisfies at least one matcher.
 *
 * String matchers require exact equality; any other matcher is used as a
 * RegExp via `.test(value)`.
 *
 * @param value String to check (e.g. a tag).
 * @param matchers Array of strings and/or RegExp objects.
 * @returns `true` as soon as one matcher accepts `value`, otherwise `false`.
 */
function matchesAny(value, matchers) {
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return value === matcher;
    }
    // A /g or /y regex remembers `lastIndex` between calls; without a reset the
    // same value would alternately match and fail on repeated checks.
    matcher.lastIndex = 0;
    return matcher.test(value);
  });
}
92
/**
 * Tells whether `filePath` satisfies at least one path matcher.
 *
 * String matchers are treated as glob patterns and delegated to
 * `simpleGlobMatch`; any other matcher is used as a RegExp via `.test`.
 *
 * @param filePath Path to check.
 * @param matchers Array of glob strings and/or RegExp objects.
 * @returns `true` as soon as one matcher accepts the path, otherwise `false`.
 */
function matchesAnyPath(filePath, matchers) {
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return simpleGlobMatch(matcher, filePath);
    }
    // A /g or /y regex remembers `lastIndex` between calls; reset it so that
    // checking several files with the same matcher gives stable results.
    matcher.lastIndex = 0;
    return matcher.test(filePath);
  });
}
100
/**
 * Matches `value` against a small glob dialect.
 *
 * Supported tokens:
 *   - a double star followed by a slash: zero or more whole directory segments
 *   - a double star: any characters, including slashes
 *   - a single star: any run of characters within one segment (no slash)
 *   - a question mark: exactly one non-slash character
 * Every other regex metacharacter in `pattern` is matched literally.
 *
 * The pattern is translated in a single pass. Chaining one `.replace` per
 * token is wrong here: the last single-star replacement also rewrote the star
 * inside the regex fragments already emitted for double stars, so a double
 * star could never span more than one directory.
 *
 * @param pattern Glob pattern.
 * @param value Path to test; the whole string must match.
 * @returns Whether `value` matches `pattern`.
 */
function simpleGlobMatch(pattern, value) {
  const source = pattern.replace(/\*\*\/|\*\*|\*|\?|[.+^${}()|[\]\\]/g, (token) => {
    switch (token) {
      case "**/":
        return "(?:.*/)?";
      case "**":
        return ".*";
      case "*":
        return "[^/]*";
      case "?":
        return "[^/]";
      default:
        // Regex metacharacter that has no glob meaning: match it literally.
        return `\\${token}`;
    }
  });
  return new RegExp(`^${source}$`).test(value);
}
104
/**
 * A named selection of test cases, described by tag and path filters.
 */
var Dataset = class _Dataset {
  /** Stores the already-normalized configuration object as-is. */
  constructor(config) {
    this._config = config;
  }

  /**
   * Validates `config.name`, normalizes `config.displayName` and fills every
   * omitted filter list with an empty array.
   */
  static define(config) {
    const name = validateDatasetName(config.name, "Dataset.define");
    const displayName = normalizeOptionalDisplayName(config.displayName);
    const includedTags = config.includedTags ?? [];
    const excludedTags = config.excludedTags ?? [];
    const includedPaths = config.includedPaths ?? [];
    const excludedPaths = config.excludedPaths ?? [];
    return new _Dataset({
      name,
      displayName,
      includedTags,
      excludedTags,
      includedPaths,
      excludedPaths
    });
  }

  /** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
  getName() {
    const { name } = this._config;
    return name;
  }

  /** Optional display name; `undefined` when none is stored. */
  getDisplayName() {
    const { displayName } = this._config;
    return displayName;
  }

  /** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
  getDisplayLabel() {
    const { displayName, name } = this._config;
    return displayName ?? name;
  }

  getIncludedTags() {
    const { includedTags } = this._config;
    return includedTags;
  }

  getExcludedTags() {
    const { excludedTags } = this._config;
    return excludedTags;
  }

  getIncludedPaths() {
    const { includedPaths } = this._config;
    return includedPaths;
  }

  getExcludedPaths() {
    const { excludedPaths } = this._config;
    return excludedPaths;
  }

  /**
   * Decides whether a test case belongs to this dataset.
   *
   * Exclusions are checked first (tags, then path). After that the case must
   * satisfy the included-tag filter and the included-path filter; an empty
   * include list accepts everything.
   *
   * @param testCase Object exposing `getTags()`.
   * @param filePath Path of the file the test case was found in.
   */
  matchesTestCase(testCase, filePath) {
    const tags = testCase.getTags();
    const { includedTags, excludedTags, includedPaths, excludedPaths } = this._config;
    const anyTagIn = (matchers) => tags.some((tag) => matchesAny(tag, matchers));

    if (excludedTags.length > 0 && anyTagIn(excludedTags)) {
      return false;
    }
    if (excludedPaths.length > 0 && matchesAnyPath(filePath, excludedPaths)) {
      return false;
    }

    const tagMatch = includedTags.length === 0 || anyTagIn(includedTags);
    const pathMatch = includedPaths.length === 0 || matchesAnyPath(filePath, includedPaths);
    return tagMatch && pathMatch;
  }
};
160
/**
 * Returns a display label for a dataset-like object.
 *
 * Uses `getDisplayLabel()` when the object has it, falls back to `getName()`
 * and finally to an empty string when neither method exists.
 */
function getDatasetDisplayLabel(dataset) {
  if (typeof dataset.getDisplayLabel !== "function") {
    if (typeof dataset.getName !== "function") {
      return "";
    }
    return dataset.getName();
  }
  return dataset.getDisplayLabel();
}
166
+
167
+ // src/evals/evaluator.ts
168
/**
 * Immutable builder for evaluators: every chaining method returns a new
 * instance and leaves the receiver untouched.
 */
var Evaluator = class _Evaluator {
  /** Stores the configuration object as-is. */
  constructor(config) {
    this._config = config;
  }

  /** Snapshot of the configuration fields as a fresh plain object. */
  getState() {
    const {
      name,
      displayName,
      tags,
      inputSchema,
      outputSchema,
      scoreSchema,
      middlewares,
      evaluateFn,
      passThreshold,
      passCriterion
    } = this._config;
    return {
      name,
      displayName,
      tags,
      inputSchema,
      outputSchema,
      scoreSchema,
      middlewares,
      evaluateFn,
      passThreshold,
      passCriterion
    };
  }

  /** Starts a chain holding a single middleware and no tags. */
  static use(middleware) {
    const middlewares = [middleware];
    return new _Evaluator({ middlewares, tags: [] });
  }

  /** Returns a copy of this evaluator with `middleware` appended. */
  use(middleware) {
    const state = this.getState();
    const middlewares = [...state.middlewares, middleware];
    return new _Evaluator({ ...state, middlewares });
  }

  /**
   * Returns a new evaluator carrying the validated name, normalized display
   * name, a copy of `config.tags`, the schemas and pass settings from
   * `config`, and the middlewares collected so far.
   */
  define(config) {
    const { middlewares } = this.getState();
    const name = validateEvaluatorName(config.name, "Evaluator.define");
    const displayName = normalizeOptionalDisplayName(config.displayName);
    const tags = config.tags === void 0 ? [] : [...config.tags];
    const { inputSchema, outputSchema, scoreSchema, passThreshold, passCriterion } = config;
    return new _Evaluator({
      name,
      displayName,
      tags,
      inputSchema,
      outputSchema,
      scoreSchema,
      middlewares,
      passThreshold,
      passCriterion
    });
  }

  /** Returns a copy of this evaluator with `fn` as its evaluate function. */
  evaluate(fn) {
    const state = this.getState();
    return new _Evaluator({ ...state, evaluateFn: fn });
  }

  /** Canonical evaluator id when defined; otherwise undefined (middleware-only chain). */
  getName() {
    const { name } = this._config;
    return name;
  }

  getDisplayName() {
    const { displayName } = this._config;
    return displayName;
  }

  /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. Undefined if not yet defined. */
  getDisplayLabel() {
    const { name, displayName } = this._config;
    return name === void 0 ? void 0 : displayName ?? name;
  }

  /** Tags from `Evaluator.define({ tags })`; empty until defined. Returns a copy. */
  getTags() {
    const { tags } = this._config;
    return [...tags];
  }

  getInputSchema() {
    const { inputSchema } = this._config;
    return inputSchema;
  }

  getOutputSchema() {
    const { outputSchema } = this._config;
    return outputSchema;
  }

  getScoreSchema() {
    const { scoreSchema } = this._config;
    return scoreSchema;
  }

  getMiddlewares() {
    const { middlewares } = this._config;
    return middlewares;
  }

  getEvaluateFn() {
    const { evaluateFn } = this._config;
    return evaluateFn;
  }

  getPassThreshold() {
    const { passThreshold } = this._config;
    return passThreshold;
  }

  getPassCriterion() {
    const { passCriterion } = this._config;
    return passCriterion;
  }

  /**
   * Calls `resolve()` on every middleware, awaits them together and merges the
   * results into one object; later middlewares win on key clashes.
   */
  async resolveContext() {
    const pending = this._config.middlewares.map((mw) => mw.resolve());
    const parts = await Promise.all(pending);
    return Object.assign({}, ...parts);
  }
};
267
/**
 * Returns a display label for an evaluator-like object.
 *
 * Prefers a defined result of `getDisplayLabel()`; otherwise falls back to
 * `getName()`; yields `undefined` when neither provides a value.
 */
function getEvaluatorDisplayLabel(evaluator) {
  const label = typeof evaluator.getDisplayLabel === "function" ? evaluator.getDisplayLabel() : void 0;
  if (label !== void 0) {
    return label;
  }
  if (typeof evaluator.getName !== "function") {
    return void 0;
  }
  return evaluator.getName();
}
276
/**
 * Returns a fresh array with the evaluator's tags, or an empty array when the
 * object has no `getTags` method.
 */
function getEvaluatorTagList(evaluator) {
  if (typeof evaluator.getTags !== "function") {
    return [];
  }
  return [...evaluator.getTags()];
}
279
+
37
280
  // src/cli/data.mock.json
38
281
  var data_mock_default = {
39
282
  datasets: [
@@ -280,7 +523,7 @@ function toEvalDataset(item, snapshots) {
280
523
  const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
281
524
  return {
282
525
  id: item.id,
283
- name: item.dataset.getName(),
526
+ name: getDatasetDisplayLabel(item.dataset),
284
527
  overview: `Discovered from ${item.filePath}`,
285
528
  runs
286
529
  };
@@ -288,7 +531,7 @@ function toEvalDataset(item, snapshots) {
288
531
  function toEvaluatorOption(item) {
289
532
  return {
290
533
  id: item.id,
291
- name: item.evaluator.getName() ?? toSlug(item.id),
534
+ name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
292
535
  configPreview: `Source: ${item.filePath}`
293
536
  };
294
537
  }
@@ -333,196 +576,149 @@ function parseStartupArgs(argv) {
333
576
  }
334
577
  return args;
335
578
  }
336
-
337
- // src/evals/test-case.ts
338
- function resolve(value) {
339
- return typeof value === "function" ? value() : value;
340
- }
341
- var TestCase = class _TestCase {
342
- constructor(config) {
343
- this._config = config;
579
+ function preprocessForDiff(value, options) {
580
+ if (options?.sort && Array.isArray(value)) {
581
+ return [...value].sort((a, b) => {
582
+ const aStr = stringify__default.default(preprocessForDiff(a, options));
583
+ const bStr = stringify__default.default(preprocessForDiff(b, options));
584
+ return aStr.localeCompare(bStr);
585
+ }).map((item) => preprocessForDiff(item, options));
344
586
  }
345
- static describe(config) {
346
- const reruns = config.reruns ?? 1;
347
- if (reruns < 1 || !Number.isInteger(reruns)) {
348
- throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
587
+ if (value !== null && typeof value === "object" && !Array.isArray(value) && options?.excludeKeys) {
588
+ const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
589
+ const filtered = {};
590
+ for (const [k, v] of Object.entries(value)) {
591
+ if (!keys.includes(k)) {
592
+ filtered[k] = preprocessForDiff(v, options);
593
+ }
349
594
  }
350
- return new _TestCase({
351
- name: config.name,
352
- tags: config.tags,
353
- reruns,
354
- inputSchema: config.inputSchema,
355
- input: config.input,
356
- outputSchema: config.outputSchema,
357
- output: config.output
358
- });
595
+ return filtered;
359
596
  }
360
- getReruns() {
361
- return this._config.reruns;
597
+ if (value !== null && typeof value === "object" && !Array.isArray(value)) {
598
+ const result = {};
599
+ for (const [k, v] of Object.entries(value)) {
600
+ result[k] = preprocessForDiff(v, options);
601
+ }
602
+ return result;
362
603
  }
363
- getName() {
364
- return this._config.name;
604
+ if (typeof value === "number" && options?.precision !== void 0) {
605
+ return Number(value.toFixed(options.precision));
365
606
  }
366
- getTags() {
367
- return this._config.tags;
607
+ return value;
608
+ }
609
+ function toPrettyJson(value) {
610
+ const str = stringify__default.default(value);
611
+ try {
612
+ const parsed = JSON.parse(str);
613
+ return JSON.stringify(parsed, null, 2);
614
+ } catch {
615
+ return str;
368
616
  }
369
- getInputSchema() {
370
- return this._config.inputSchema;
617
+ }
618
+ function formatDiffParts(parts) {
619
+ const lines = [];
620
+ for (const part of parts) {
621
+ const prefix = part.added ? "+ " : part.removed ? "- " : "";
622
+ const partLines = part.value.split("\n");
623
+ for (let i = 0; i < partLines.length; i++) {
624
+ const line = partLines[i];
625
+ if (i === partLines.length - 1 && line === "")
626
+ continue;
627
+ lines.push(prefix + line);
628
+ }
371
629
  }
372
- getInput() {
373
- return resolve(this._config.input);
630
+ return lines.join("\n");
631
+ }
632
+ function createDiffString(expected, actual, diffOptions) {
633
+ const expectedProcessed = preprocessForDiff(expected, diffOptions);
634
+ const actualProcessed = preprocessForDiff(actual, diffOptions);
635
+ if (diffOptions?.keysOnly) {
636
+ const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
637
+ const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
638
+ const parts2 = diff.diffLines(expectedKeys, actualKeys);
639
+ return formatDiffParts(parts2);
374
640
  }
375
- getOutputSchema() {
376
- return this._config.outputSchema;
641
+ const expectedStr = toPrettyJson(expectedProcessed);
642
+ const actualStr = toPrettyJson(actualProcessed);
643
+ if (expectedStr === actualStr) {
644
+ return "";
377
645
  }
378
- getOutput() {
379
- if (this._config.output === void 0) {
380
- return void 0;
381
- }
382
- return resolve(this._config.output);
383
- }
384
- };
385
-
386
- // src/evals/evaluator.ts
387
- var Evaluator = class _Evaluator {
388
- constructor(config) {
389
- this._config = config;
390
- }
391
- getState() {
392
- return {
393
- name: this._config.name,
394
- inputSchema: this._config.inputSchema,
395
- outputSchema: this._config.outputSchema,
396
- scoreSchema: this._config.scoreSchema,
397
- middlewares: this._config.middlewares,
398
- evaluateFn: this._config.evaluateFn,
399
- passThreshold: this._config.passThreshold,
400
- passCriterion: this._config.passCriterion
401
- };
402
- }
403
- static use(middleware) {
404
- return new _Evaluator({
405
- middlewares: [middleware]
406
- });
407
- }
408
- use(middleware) {
409
- const state = this.getState();
410
- return new _Evaluator({
411
- ...state,
412
- middlewares: [...state.middlewares, middleware]
413
- });
414
- }
415
- define(config) {
416
- const { middlewares } = this.getState();
417
- return new _Evaluator({
418
- name: config.name,
419
- inputSchema: config.inputSchema,
420
- outputSchema: config.outputSchema,
421
- scoreSchema: config.scoreSchema,
422
- middlewares,
423
- passThreshold: config.passThreshold,
424
- passCriterion: config.passCriterion
425
- });
426
- }
427
- evaluate(fn) {
428
- return new _Evaluator({
429
- ...this.getState(),
430
- evaluateFn: fn
431
- });
432
- }
433
- getName() {
434
- return this._config.name;
435
- }
436
- getInputSchema() {
437
- return this._config.inputSchema;
438
- }
439
- getOutputSchema() {
440
- return this._config.outputSchema;
441
- }
442
- getScoreSchema() {
443
- return this._config.scoreSchema;
444
- }
445
- getMiddlewares() {
446
- return this._config.middlewares;
447
- }
448
- getEvaluateFn() {
449
- return this._config.evaluateFn;
646
+ const parts = diff.diffLines(expectedStr, actualStr);
647
+ if (diffOptions?.outputNewOnly) {
648
+ const filtered = parts.filter((p) => p.added === true);
649
+ return formatDiffParts(filtered);
450
650
  }
451
- getPassThreshold() {
452
- return this._config.passThreshold;
651
+ return formatDiffParts(parts);
652
+ }
653
+ function extractKeys(value) {
654
+ if (value === null || typeof value !== "object") {
655
+ return "\xB7";
453
656
  }
454
- getPassCriterion() {
455
- return this._config.passCriterion;
657
+ if (Array.isArray(value)) {
658
+ return value.map(extractKeys);
456
659
  }
457
- async resolveContext() {
458
- const parts = await Promise.all(this._config.middlewares.map((mw) => mw.resolve()));
459
- return Object.assign({}, ...parts);
660
+ const result = {};
661
+ for (const [k, v] of Object.entries(value)) {
662
+ result[k] = extractKeys(v);
460
663
  }
461
- };
462
-
463
- // src/evals/dataset.ts
464
- function matchesAny(value, matchers) {
465
- return matchers.some(
466
- (matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
467
- );
664
+ return result;
468
665
  }
469
- function matchesAnyPath(filePath, matchers) {
470
- return matchers.some((matcher) => {
471
- if (typeof matcher === "string") {
472
- return simpleGlobMatch(matcher, filePath);
666
+ function formatLogMessage(msg) {
667
+ if (typeof msg === "string")
668
+ return msg;
669
+ if (msg instanceof Error)
670
+ return msg.stack ?? msg.message;
671
+ try {
672
+ if (msg !== null && typeof msg === "object") {
673
+ return JSON.stringify(msg, null, 2);
473
674
  }
474
- return matcher.test(filePath);
475
- });
675
+ return String(msg);
676
+ } catch {
677
+ return String(msg);
678
+ }
476
679
  }
477
- function simpleGlobMatch(pattern, value) {
478
- const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
479
- return new RegExp(`^${escaped}$`).test(value);
680
/**
 * Builds a `"log"` entry whose message is `message` run through
 * `formatLogMessage`.
 *
 * @param message Value to log.
 * @param options Optional settings; only `label` is read.
 */
function createLogEntry(message, options) {
  const label = options?.label;
  const text = formatLogMessage(message);
  return { type: "log", label, message: text };
}
481
- var Dataset = class _Dataset {
482
- constructor(config) {
483
- this._config = config;
484
- }
485
- static define(config) {
486
- return new _Dataset({
487
- name: config.name,
488
- includedTags: config.includedTags ?? [],
489
- excludedTags: config.excludedTags ?? [],
490
- includedPaths: config.includedPaths ?? [],
491
- excludedPaths: config.excludedPaths ?? []
492
- });
493
- }
494
- getName() {
495
- return this._config.name;
496
- }
497
- getIncludedTags() {
498
- return this._config.includedTags;
499
- }
500
- getExcludedTags() {
501
- return this._config.excludedTags;
502
- }
503
- getIncludedPaths() {
504
- return this._config.includedPaths;
505
- }
506
- getExcludedPaths() {
507
- return this._config.excludedPaths;
508
- }
509
- matchesTestCase(testCase, filePath) {
510
- const tags = testCase.getTags();
511
- if (this._config.excludedTags.length > 0) {
512
- if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
513
- return false;
687
/** Splits a log entry's message into its individual lines on "\n". */
function getLogLines(entry) {
  const { message } = entry;
  return message.split("\n");
}
690
/**
 * Builds a `"diff"` entry comparing `expected` with `actual`.
 *
 * `label` is taken out of `options`; the remaining options are forwarded to
 * `createDiffString`. An empty diff text is replaced by "(no differences)".
 *
 * @param expected Expected value, stored unchanged on the entry.
 * @param actual Actual value, stored unchanged on the entry.
 * @param options Optional `label` plus diff options.
 */
function createDiffLogEntry(expected, actual, options) {
  const settings = options ?? {};
  const { label, ...diffOpts } = settings;
  const diffText = createDiffString(expected, actual, diffOpts);
  return {
    type: "diff",
    label,
    expected,
    actual,
    diff: diffText || "(no differences)"
  };
}
701
+ function printJsonDiff(expected, actual, options = {}) {
702
+ const { color = true, ...diffOpts } = options;
703
+ const diff = createDiffString(expected, actual, diffOpts);
704
+ if (color) {
705
+ const lines = diff.split("\n").map((line) => {
706
+ const trimmed = line.trimStart();
707
+ if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
708
+ return `\x1B[31m${line}\x1B[0m`;
514
709
  }
515
- }
516
- if (this._config.excludedPaths.length > 0) {
517
- if (matchesAnyPath(filePath, this._config.excludedPaths)) {
518
- return false;
710
+ if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
711
+ return `\x1B[32m${line}\x1B[0m`;
519
712
  }
520
- }
521
- const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
522
- const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
523
- return tagMatch && pathMatch;
713
+ return line;
714
+ });
715
+ const colored = lines.join("\n");
716
+ console.log(colored || "(no differences)");
717
+ return colored;
524
718
  }
525
- };
719
+ console.log(diff || "(no differences)");
720
+ return diff;
721
+ }
526
722
 
527
723
  // src/evals/metric.ts
528
724
  var registry = /* @__PURE__ */ new Map();
@@ -547,6 +743,113 @@ function getMetricById(id) {
547
743
  return registry.get(id);
548
744
  }
549
745
 
746
+ // src/evals/aggregators.ts
747
/**
 * Sums token counts field by field.
 *
 * A field that is `null` or `undefined` on an entry counts as 0.
 *
 * @param values Array of objects with optional `input`, `output`,
 *   `inputCached` and `outputCached` numbers.
 * @returns Totals for the four fields; all zeros for an empty array.
 */
function aggregateTokenCountSum(values) {
  const totals = { input: 0, output: 0, inputCached: 0, outputCached: 0 };
  values.forEach((entry) => {
    totals.input += entry.input ?? 0;
    totals.output += entry.output ?? 0;
    totals.inputCached += entry.inputCached ?? 0;
    totals.outputCached += entry.outputCached ?? 0;
  });
  return totals;
}
764
/**
 * Averages the `ms` field of the given latency samples.
 *
 * @param values Array of objects with a numeric `ms`.
 * @returns `{ ms }` holding the arithmetic mean, or `{ ms: 0 }` for no samples.
 */
function aggregateLatencyAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { ms: 0 };
  }
  let total = 0;
  values.forEach(({ ms }) => {
    total += ms;
  });
  return { ms: total / count };
}
771
+
772
+ // src/evals/metrics/standard.ts
773
/**
 * Standard token-count metric.
 *
 * Aggregates with `aggregateTokenCountSum`. Formats as
 * `in:<input> out:<output> cached:<inputCached + outputCached>`, prefixed with
 * "Total: " when `options.isAggregated` is truthy; nullish fields print as 0.
 */
var tokenCountMetric = Metric.of({
  id: "token-count",
  name: "Tokens",
  aggregate: aggregateTokenCountSum,
  format: (data, options) => {
    const cached = (data.inputCached ?? 0) + (data.outputCached ?? 0);
    const summary = `in:${data.input ?? 0} out:${data.output ?? 0} cached:${cached}`;
    if (options?.isAggregated) {
      return `Total: ${summary}`;
    }
    return summary;
  }
});
787
/**
 * Standard latency metric.
 *
 * Aggregates with `aggregateLatencyAverage`. Formats as `<ms>ms`, prefixed
 * with "Avg: " when `options.isAggregated` is truthy.
 */
var latencyMetric = Metric.of({
  id: "latency",
  name: "Latency",
  aggregate: aggregateLatencyAverage,
  format: (data, options) => {
    const reading = `${data.ms}ms`;
    return options?.isAggregated ? `Avg: ${reading}` : reading;
  }
});
793
+
794
+ // src/evals/run-config.ts
795
/**
 * Validates one entry of `RunConfig.define({ runs })`.
 *
 * A row must select evaluators in exactly one way: a non-empty `evaluators`
 * array, or a non-blank `evaluatorPattern` string. `repetitions`, when given,
 * must be a positive integer (it defaults to 1).
 *
 * `evaluators: null` is treated the same as an omitted `evaluators`. The
 * original check compared against `undefined` twice, so a `null` value counted
 * as "set" and led to a misleading "must not set both" error or a TypeError on
 * `.length`.
 *
 * @param row Run row to validate.
 * @param index Position of the row, used in error messages.
 * @throws {Error} When the row violates any of the rules above.
 */
function validateRow(row, index) {
  const hasEvaluators = "evaluators" in row && row.evaluators != null;
  const hasPattern = "evaluatorPattern" in row && typeof row.evaluatorPattern === "string" && row.evaluatorPattern.trim().length > 0;
  if (hasEvaluators && hasPattern) {
    throw new Error(`RunConfig run[${index}] must not set both evaluators and evaluatorPattern`);
  }
  if (!hasEvaluators && !hasPattern) {
    throw new Error(`RunConfig run[${index}] must set either evaluators or evaluatorPattern`);
  }
  if (hasEvaluators && row.evaluators.length === 0) {
    throw new Error(`RunConfig run[${index}]: evaluators must be non-empty`);
  }
  const rawRep = "repetitions" in row ? row.repetitions : void 0;
  const repetitions = rawRep ?? 1;
  if (!Number.isInteger(repetitions) || repetitions < 1) {
    throw new Error(
      `RunConfig run[${index}]: repetitions must be a positive integer, got ${String(rawRep)}`
    );
  }
}
815
/**
 * A named, validated list of runs.
 */
var RunConfig = class _RunConfig {
  /** Stores the four values as-is; validation happens in {@link define}. */
  constructor(name, displayName, tags, runs) {
    this._name = name;
    this._displayName = displayName;
    this._tags = tags;
    this._runs = runs;
  }

  /**
   * Validates the runs (non-empty, each row checked by `validateRow`) and the
   * name, normalizes the display name and copies the tags.
   *
   * @throws {Error} When `runs` is empty or any validation step fails.
   */
  static define(config) {
    const { runs } = config;
    if (runs.length === 0) {
      throw new Error("RunConfig runs must be non-empty");
    }
    runs.forEach(validateRow);
    const name = validateRunConfigName(config.name, "RunConfig.define");
    const displayName = normalizeOptionalDisplayName(config.displayName);
    const tags = config.tags === void 0 ? [] : [...config.tags];
    return new _RunConfig(name, displayName, tags, runs);
  }

  /** Canonical id (branded {@link RunConfigName} at runtime; typed as `string` for ergonomics). */
  getName() {
    const { _name: name } = this;
    return name;
  }

  /** Optional unrestricted display label. */
  getDisplayName() {
    const { _displayName: displayName } = this;
    return displayName;
  }

  /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
  getDisplayLabel() {
    const { _displayName: displayName, _name: name } = this;
    return displayName ?? name;
  }

  /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. Returns a copy. */
  getTags() {
    const { _tags: tags } = this;
    return [...tags];
  }

  getRuns() {
    const { _runs: runs } = this;
    return runs;
  }
};
852
+
550
853
  // src/evals/score.ts
551
854
  var registry2 = /* @__PURE__ */ new Map();
552
855
  function formatScoreData(def, data, options) {
@@ -637,71 +940,23 @@ var Score = {
637
940
  aggregateValues: config.aggregateValues,
638
941
  make: (data, options) => {
639
942
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
640
- return {
641
- id: config.id,
642
- data,
643
- ...passed !== void 0 && { passed },
644
- ...options?.name !== void 0 && { name: options.name },
645
- def
646
- // Attach def so rendering/aggregation works without registry lookup
647
- };
648
- }
649
- };
650
- registry2.set(config.id, def);
651
- return def;
652
- }
653
- };
654
- function getScoreById(id) {
655
- return registry2.get(id);
656
- }
657
-
658
- // src/evals/aggregators.ts
659
- function aggregateTokenCountSum(values) {
660
- const initial = {
661
- input: 0,
662
- output: 0,
663
- inputCached: 0,
664
- outputCached: 0
665
- };
666
- return values.reduce(
667
- (acc, v) => ({
668
- input: acc.input + (v.input ?? 0),
669
- output: acc.output + (v.output ?? 0),
670
- inputCached: acc.inputCached + (v.inputCached ?? 0),
671
- outputCached: acc.outputCached + (v.outputCached ?? 0)
672
- }),
673
- initial
674
- );
675
- }
676
- function aggregateLatencyAverage(values) {
677
- if (values.length === 0) {
678
- return { ms: 0 };
679
- }
680
- const sum = values.reduce((s, v) => s + v.ms, 0);
681
- return { ms: sum / values.length };
682
- }
683
-
684
- // src/evals/metrics/standard.ts
685
- var tokenCountMetric = Metric.of({
686
- id: "token-count",
687
- name: "Tokens",
688
- aggregate: aggregateTokenCountSum,
689
- format: (data, options) => {
690
- const input = data.input ?? 0;
691
- const output = data.output ?? 0;
692
- const inputCached = data.inputCached ?? 0;
693
- const outputCached = data.outputCached ?? 0;
694
- const cached = inputCached + outputCached;
695
- const base = `in:${input} out:${output} cached:${cached}`;
696
- return options?.isAggregated ? `Total: ${base}` : base;
943
+ return {
944
+ id: config.id,
945
+ data,
946
+ ...passed !== void 0 && { passed },
947
+ ...options?.name !== void 0 && { name: options.name },
948
+ def
949
+ // Attach def so rendering/aggregation works without registry lookup
950
+ };
951
+ }
952
+ };
953
+ registry2.set(config.id, def);
954
+ return def;
697
955
  }
698
- });
699
- var latencyMetric = Metric.of({
700
- id: "latency",
701
- name: "Latency",
702
- aggregate: aggregateLatencyAverage,
703
- format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
704
- });
956
+ };
957
/**
 * Looks up a score definition in the score registry.
 * @param id Key to look up.
 * @returns The value stored under `id`, or `undefined` when there is none.
 */
function getScoreById(id) {
  const definition = registry2.get(id);
  return definition;
}
705
960
 
706
961
  // src/evals/scores/standard.ts
707
962
  var percentScore = Score.of({
@@ -734,148 +989,197 @@ var binaryScore = Score.of({
734
989
  },
735
990
  aggregateValues: Score.aggregate.all
736
991
  });
737
- function preprocessForDiff(value, options) {
738
- if (options?.sort && Array.isArray(value)) {
739
- return [...value].sort((a, b) => {
740
- const aStr = stringify__default.default(preprocessForDiff(a, options));
741
- const bStr = stringify__default.default(preprocessForDiff(b, options));
742
- return aStr.localeCompare(bStr);
743
- }).map((item) => preprocessForDiff(item, options));
744
- }
745
- if (value !== null && typeof value === "object" && !Array.isArray(value) && options?.excludeKeys) {
746
- const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
747
- const filtered = {};
748
- for (const [k, v] of Object.entries(value)) {
749
- if (!keys.includes(k)) {
750
- filtered[k] = preprocessForDiff(v, options);
751
- }
992
+
993
+ // src/evals/tag-set.ts
994
+ var TagSet = class {
995
+ constructor() {
996
+ }
997
+ static define(tags) {
998
+ const out = {};
999
+ for (const tag of tags) {
1000
+ out[tag] = tag;
752
1001
  }
753
- return filtered;
1002
+ return out;
754
1003
  }
755
- if (value !== null && typeof value === "object" && !Array.isArray(value)) {
756
- const result = {};
757
- for (const [k, v] of Object.entries(value)) {
758
- result[k] = preprocessForDiff(v, options);
759
- }
760
- return result;
1004
+ };
1005
+
1006
+ // src/evals/test-case.ts
1007
+ function resolve(value) {
1008
+ return typeof value === "function" ? value() : value;
1009
+ }
1010
+ var TestCase = class _TestCase {
1011
+ constructor(config) {
1012
+ this._config = config;
761
1013
  }
762
- if (typeof value === "number" && options?.precision !== void 0) {
763
- return Number(value.toFixed(options.precision));
1014
+ static describe(config) {
1015
+ const name = validateTestCaseName(config.name, "TestCase.describe");
1016
+ const displayName = normalizeOptionalDisplayName(config.displayName);
1017
+ return new _TestCase({
1018
+ name,
1019
+ displayName,
1020
+ tags: config.tags,
1021
+ inputSchema: config.inputSchema,
1022
+ input: config.input,
1023
+ outputSchema: config.outputSchema,
1024
+ output: config.output
1025
+ });
764
1026
  }
765
- return value;
766
- }
767
- function toPrettyJson(value) {
768
- const str = stringify__default.default(value);
769
- try {
770
- const parsed = JSON.parse(str);
771
- return JSON.stringify(parsed, null, 2);
772
- } catch {
773
- return str;
1027
+ getName() {
1028
+ return this._config.name;
774
1029
  }
775
- }
776
- function formatDiffParts(parts) {
777
- const lines = [];
778
- for (const part of parts) {
779
- const prefix = part.added ? "+ " : part.removed ? "- " : "";
780
- const partLines = part.value.split("\n");
781
- for (let i = 0; i < partLines.length; i++) {
782
- const line = partLines[i];
783
- if (i === partLines.length - 1 && line === "")
784
- continue;
785
- lines.push(prefix + line);
786
- }
1030
+ getDisplayName() {
1031
+ return this._config.displayName;
787
1032
  }
788
- return lines.join("\n");
789
- }
790
- function createDiffString(expected, actual, diffOptions) {
791
- const expectedProcessed = preprocessForDiff(expected, diffOptions);
792
- const actualProcessed = preprocessForDiff(actual, diffOptions);
793
- if (diffOptions?.keysOnly) {
794
- const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
795
- const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
796
- const parts2 = diff.diffLines(expectedKeys, actualKeys);
797
- return formatDiffParts(parts2);
1033
+ getDisplayLabel() {
1034
+ return this._config.displayName ?? this._config.name;
798
1035
  }
799
- const expectedStr = toPrettyJson(expectedProcessed);
800
- const actualStr = toPrettyJson(actualProcessed);
801
- if (expectedStr === actualStr) {
802
- return "";
1036
+ getTags() {
1037
+ return this._config.tags;
803
1038
  }
804
- const parts = diff.diffLines(expectedStr, actualStr);
805
- if (diffOptions?.outputNewOnly) {
806
- const filtered = parts.filter((p) => p.added === true);
807
- return formatDiffParts(filtered);
1039
+ getInputSchema() {
1040
+ return this._config.inputSchema;
808
1041
  }
809
- return formatDiffParts(parts);
810
- }
811
- function extractKeys(value) {
812
- if (value === null || typeof value !== "object") {
813
- return "\xB7";
1042
+ getInput() {
1043
+ return resolve(this._config.input);
814
1044
  }
815
- if (Array.isArray(value)) {
816
- return value.map(extractKeys);
1045
+ getOutputSchema() {
1046
+ return this._config.outputSchema;
817
1047
  }
818
- const result = {};
819
- for (const [k, v] of Object.entries(value)) {
820
- result[k] = extractKeys(v);
1048
+ getOutput() {
1049
+ if (this._config.output === void 0) {
1050
+ return void 0;
1051
+ }
1052
+ return resolve(this._config.output);
821
1053
  }
822
- return result;
1054
+ };
1055
+ function getTestCaseDisplayLabel(testCase) {
1056
+ if (typeof testCase.getDisplayLabel === "function") {
1057
+ return testCase.getDisplayLabel();
1058
+ }
1059
+ return typeof testCase.getName === "function" ? testCase.getName() : "";
823
1060
  }
824
- function formatLogMessage(msg) {
825
- if (typeof msg === "string")
826
- return msg;
827
- if (msg instanceof Error)
828
- return msg.stack ?? msg.message;
1061
+ function getTestCaseTagList(testCase) {
1062
+ return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
1063
+ }
1064
+ async function loadRunSnapshotsFromArtifacts(config) {
1065
+ const baseDir = path.resolve(config.artifactDirectory);
1066
+ let entries;
829
1067
  try {
830
- if (msg !== null && typeof msg === "object") {
831
- return JSON.stringify(msg, null, 2);
832
- }
833
- return String(msg);
1068
+ entries = await promises.readdir(baseDir);
834
1069
  } catch {
835
- return String(msg);
1070
+ return [];
836
1071
  }
1072
+ const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1073
+ const snapshots = [];
1074
+ for (const fileName of jsonlFiles) {
1075
+ const filePath = path.join(baseDir, fileName);
1076
+ try {
1077
+ const snapshot = await parseArtifactToSnapshot(filePath, config);
1078
+ if (snapshot) {
1079
+ snapshots.push(snapshot);
1080
+ }
1081
+ } catch {
1082
+ }
1083
+ }
1084
+ return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
837
1085
  }
838
- function createLogEntry(message, options) {
839
- return {
840
- type: "log",
841
- label: options?.label,
842
- message: formatLogMessage(message)
843
- };
844
- }
845
- function getLogLines(entry) {
846
- return entry.message.split("\n");
847
- }
848
- function createDiffLogEntry(expected, actual, options) {
849
- const { label, ...diffOpts } = options ?? {};
850
- const diff = createDiffString(expected, actual, diffOpts);
851
- return {
852
- type: "diff",
853
- label,
854
- expected,
855
- actual,
856
- diff: diff || "(no differences)"
857
- };
858
- }
859
- function printJsonDiff(expected, actual, options = {}) {
860
- const { color = true, ...diffOpts } = options;
861
- const diff = createDiffString(expected, actual, diffOpts);
862
- if (color) {
863
- const lines = diff.split("\n").map((line) => {
864
- const trimmed = line.trimStart();
865
- if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
866
- return `\x1B[31m${line}\x1B[0m`;
1086
+ async function parseArtifactToSnapshot(filePath, _config) {
1087
+ const content = await promises.readFile(filePath, "utf8");
1088
+ const lines = content.split("\n").filter((line) => line.trim().length > 0);
1089
+ if (lines.length === 0) {
1090
+ return null;
1091
+ }
1092
+ let runQueued = null;
1093
+ let runCompleted = null;
1094
+ let runFailed = null;
1095
+ let runStarted = null;
1096
+ for (const line of lines) {
1097
+ try {
1098
+ const event = JSON.parse(line);
1099
+ const type = event.type;
1100
+ if (type === "RunQueued") {
1101
+ runQueued = {
1102
+ runId: event.runId,
1103
+ datasetId: event.datasetId,
1104
+ datasetName: event.datasetName,
1105
+ evaluatorIds: event.evaluatorIds,
1106
+ totalTestCases: event.totalTestCases ?? 0,
1107
+ artifactPath: event.artifactPath ?? filePath,
1108
+ ts: event.ts
1109
+ };
867
1110
  }
868
- if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
869
- return `\x1B[32m${line}\x1B[0m`;
1111
+ if (type === "RunStarted") {
1112
+ runStarted = { startedAt: event.startedAt };
1113
+ }
1114
+ if (type === "RunCompleted") {
1115
+ runCompleted = {
1116
+ passedTestCases: event.passedTestCases,
1117
+ failedTestCases: event.failedTestCases,
1118
+ totalTestCases: event.totalTestCases,
1119
+ finishedAt: event.finishedAt
1120
+ };
1121
+ }
1122
+ if (type === "RunFailed") {
1123
+ runFailed = {
1124
+ finishedAt: event.finishedAt,
1125
+ errorMessage: event.errorMessage
1126
+ };
1127
+ }
1128
+ } catch {
1129
+ }
1130
+ }
1131
+ if (!runQueued) {
1132
+ return null;
1133
+ }
1134
+ const artifactPath = filePath;
1135
+ const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1136
+ const progress = aggregateTestCaseProgress(lines);
1137
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1138
+ const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1139
+ const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1140
+ return {
1141
+ runId: runQueued.runId,
1142
+ datasetId: runQueued.datasetId,
1143
+ datasetName: runQueued.datasetName,
1144
+ evaluatorIds: runQueued.evaluatorIds,
1145
+ queuedAt: runQueued.ts ?? 0,
1146
+ startedAt: runStarted?.startedAt,
1147
+ finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
1148
+ totalTestCases: runQueued.totalTestCases,
1149
+ completedTestCases,
1150
+ passedTestCases,
1151
+ failedTestCases,
1152
+ status,
1153
+ artifactPath,
1154
+ errorMessage: runFailed?.errorMessage
1155
+ };
1156
+ }
1157
+ function aggregateTestCaseProgress(lines) {
1158
+ let completedTestCases = 0;
1159
+ const testCasePassedBy = /* @__PURE__ */ new Map();
1160
+ for (const line of lines) {
1161
+ try {
1162
+ const event = JSON.parse(line);
1163
+ if (event.type === "TestCaseProgress") {
1164
+ const ev = event;
1165
+ completedTestCases = ev.completedTestCases ?? completedTestCases;
1166
+ const id = ev.testCaseId;
1167
+ const current = testCasePassedBy.get(id);
1168
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
870
1169
  }
871
- return line;
872
- });
873
- const colored = lines.join("\n");
874
- console.log(colored || "(no differences)");
875
- return colored;
1170
+ } catch {
1171
+ }
876
1172
  }
877
- console.log(diff || "(no differences)");
878
- return diff;
1173
+ let passedTestCases = 0;
1174
+ let failedTestCases = 0;
1175
+ for (const passed of testCasePassedBy.values()) {
1176
+ if (passed) {
1177
+ passedTestCases += 1;
1178
+ } else {
1179
+ failedTestCases += 1;
1180
+ }
1181
+ }
1182
+ return { completedTestCases, passedTestCases, failedTestCases };
879
1183
  }
880
1184
 
881
1185
  // src/runner/config.ts
@@ -887,6 +1191,7 @@ var defaultRunnerConfig = {
887
1191
  rootDir: process.cwd(),
888
1192
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
889
1193
  evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
1194
+ runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
890
1195
  testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
891
1196
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
892
1197
  },
@@ -912,6 +1217,11 @@ function toRunnerConfigOverrides(config) {
912
1217
  } else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
913
1218
  discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
914
1219
  }
1220
+ if (rawDiscovery?.runConfigFilePatterns !== void 0) {
1221
+ discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
1222
+ } else if (rawDiscovery?.runConfigSuffixes !== void 0) {
1223
+ discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
1224
+ }
915
1225
  if (rawDiscovery?.testCaseFilePatterns !== void 0) {
916
1226
  discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
917
1227
  } else if (rawDiscovery?.testCaseSuffixes !== void 0) {
@@ -1010,6 +1320,9 @@ function isDatasetLike(value) {
1010
1320
  function isEvaluatorLike(value) {
1011
1321
  return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
1012
1322
  }
1323
+ function isRunConfigLike(value) {
1324
+ return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
1325
+ }
1013
1326
  function isTestCaseLike(value) {
1014
1327
  return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
1015
1328
  }
@@ -1098,6 +1411,23 @@ async function collectEvaluatorsFromFiles(config) {
1098
1411
  );
1099
1412
  return found.flat();
1100
1413
  }
1414
+ async function collectRunConfigsFromFiles(config) {
1415
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1416
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
1417
+ const found = await Promise.all(
1418
+ matched.map(async (absolutePath) => {
1419
+ const exports = await loadModuleExports(absolutePath);
1420
+ const runConfigs = exports.filter(isRunConfigLike);
1421
+ const relPath = path.relative(config.rootDir, absolutePath);
1422
+ return runConfigs.map((runConfig) => ({
1423
+ id: runConfig.getName(),
1424
+ filePath: relPath,
1425
+ runConfig
1426
+ }));
1427
+ })
1428
+ );
1429
+ return found.flat();
1430
+ }
1101
1431
  async function collectTestCasesFromFiles(config) {
1102
1432
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1103
1433
  const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
@@ -1190,15 +1520,17 @@ function readOutput(testCase) {
1190
1520
  }
1191
1521
  return candidate.getOutput();
1192
1522
  }
1193
- function buildEvaluationUnits(testCases) {
1523
+ function buildEvaluationUnits(testCases, repetitionCount) {
1524
+ const count = Math.max(1, repetitionCount);
1194
1525
  const units = [];
1195
1526
  for (const testCaseItem of testCases) {
1196
- const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1197
- for (let r = 0; r < rerunTotal; r++) {
1527
+ const repetitionId = `rep-${crypto.randomUUID()}`;
1528
+ for (let r = 0; r < count; r++) {
1198
1529
  units.push({
1199
1530
  testCaseItem,
1200
- rerunIndex: r + 1,
1201
- rerunTotal
1531
+ repetitionId,
1532
+ repetitionIndex: r + 1,
1533
+ repetitionCount: count
1202
1534
  });
1203
1535
  }
1204
1536
  }
@@ -1211,7 +1543,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1211
1543
  return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
1212
1544
  }
1213
1545
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1214
- const { testCaseItem, rerunIndex, rerunTotal } = unit;
1546
+ const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
1215
1547
  return effect.Effect.gen(function* () {
1216
1548
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
1217
1549
  const started = Date.now();
@@ -1220,11 +1552,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1220
1552
  type: "TestCaseStarted",
1221
1553
  runId: task.runId,
1222
1554
  testCaseId: testCaseItem.id,
1223
- testCaseName: testCaseItem.testCase.getName(),
1555
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1224
1556
  startedTestCases: startedEvaluations,
1225
1557
  totalTestCases: totalEvaluations,
1226
- rerunIndex,
1227
- rerunTotal
1558
+ repetitionId,
1559
+ repetitionIndex,
1560
+ repetitionCount
1228
1561
  });
1229
1562
  const evaluatorScores = [];
1230
1563
  let testCaseError;
@@ -1258,8 +1591,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1258
1591
  meta: {
1259
1592
  triggerId: task.triggerId,
1260
1593
  runId: evaluatorRunId,
1261
- datasetId: task.datasetId
1594
+ datasetName: task.dataset.getDisplayLabel(),
1595
+ repetitionId,
1596
+ repetitionIndex,
1597
+ repetitionCount,
1598
+ runConfigName: task.runConfigName
1262
1599
  },
1600
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1601
+ runConfigTags: task.runConfigTags,
1602
+ evaluatorTags: getEvaluatorTagList(evaluator),
1263
1603
  logDiff,
1264
1604
  log,
1265
1605
  createError
@@ -1302,18 +1642,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1302
1642
  });
1303
1643
  }
1304
1644
  }
1305
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1645
+ const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
1306
1646
  const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
1307
1647
  const progressEvent = {
1308
1648
  type: "TestCaseProgress",
1309
1649
  runId: task.runId,
1310
1650
  testCaseId: testCaseItem.id,
1311
- testCaseName: testCaseItem.testCase.getName(),
1651
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1312
1652
  completedTestCases: completedEvaluations,
1313
1653
  totalTestCases: totalEvaluations,
1314
- rerunIndex,
1315
- rerunTotal,
1316
- passed: rerunPassedThis,
1654
+ repetitionId,
1655
+ repetitionIndex,
1656
+ repetitionCount,
1657
+ passed: repetitionPassedThis,
1317
1658
  durationMs: Date.now() - started,
1318
1659
  evaluatorScores,
1319
1660
  output,
@@ -1334,9 +1675,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1334
1675
  (map) => {
1335
1676
  const key = testCaseItem.id;
1336
1677
  const existing = map.get(key) ?? { completedCount: 0, results: [] };
1337
- const newResults = [...existing.results, rerunPassedThis];
1678
+ const newResults = [...existing.results, repetitionPassedThis];
1338
1679
  const newCompletedCount = existing.completedCount + 1;
1339
- const isLast = newCompletedCount === rerunTotal;
1680
+ const isLast = newCompletedCount === repetitionCount;
1340
1681
  const newMap = new Map(map);
1341
1682
  newMap.set(key, {
1342
1683
  completedCount: newCompletedCount,
@@ -1373,10 +1714,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1373
1714
  runId: task.runId,
1374
1715
  startedAt
1375
1716
  });
1376
- const totalEvaluations = task.testCases.reduce(
1377
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1378
- 0
1379
- );
1717
+ const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
1380
1718
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1381
1719
  const completedRef = yield* effect.Ref.make(0);
1382
1720
  const startedRef = yield* effect.Ref.make(0);
@@ -1385,7 +1723,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1385
1723
  const testCaseResultsRef = yield* effect.Ref.make(
1386
1724
  /* @__PURE__ */ new Map()
1387
1725
  );
1388
- const evaluationUnits = buildEvaluationUnits(task.testCases);
1726
+ const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
1389
1727
  const processEvaluation = (unit) => processOneEvaluation(
1390
1728
  task,
1391
1729
  unit,
@@ -1399,11 +1737,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1399
1737
  failedRef,
1400
1738
  testCaseResultsRef
1401
1739
  );
1402
- yield* effect.Effect.forEach(
1403
- evaluationUnits,
1404
- processEvaluation,
1405
- maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1406
- );
1740
+ const globalSem = task.globalEvaluationSemaphore;
1741
+ if (globalSem !== void 0) {
1742
+ yield* effect.Effect.forEach(
1743
+ evaluationUnits,
1744
+ (unit) => globalSem.withPermits(1)(processEvaluation(unit)),
1745
+ { concurrency: "unbounded", discard: true }
1746
+ );
1747
+ } else {
1748
+ yield* effect.Effect.forEach(
1749
+ evaluationUnits,
1750
+ processEvaluation,
1751
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1752
+ );
1753
+ }
1407
1754
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
1408
1755
  effect.Ref.get(completedRef),
1409
1756
  effect.Ref.get(passedRef),
@@ -1439,125 +1786,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1439
1786
  artifactPath: task.snapshot.artifactPath
1440
1787
  });
1441
1788
  });
1442
- async function loadRunSnapshotsFromArtifacts(config) {
1443
- const baseDir = path.resolve(config.artifactDirectory);
1444
- let entries;
1445
- try {
1446
- entries = await promises.readdir(baseDir);
1447
- } catch {
1448
- return [];
1449
- }
1450
- const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1451
- const snapshots = [];
1452
- for (const fileName of jsonlFiles) {
1453
- const filePath = path.join(baseDir, fileName);
1454
- try {
1455
- const snapshot = await parseArtifactToSnapshot(filePath, config);
1456
- if (snapshot) {
1457
- snapshots.push(snapshot);
1458
- }
1459
- } catch {
1460
- }
1461
- }
1462
- return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
1463
- }
1464
- async function parseArtifactToSnapshot(filePath, _config) {
1465
- const content = await promises.readFile(filePath, "utf8");
1466
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1467
- if (lines.length === 0) {
1468
- return null;
1469
- }
1470
- let runQueued = null;
1471
- let runCompleted = null;
1472
- let runFailed = null;
1473
- let runStarted = null;
1474
- for (const line of lines) {
1475
- try {
1476
- const event = JSON.parse(line);
1477
- const type = event.type;
1478
- if (type === "RunQueued") {
1479
- runQueued = {
1480
- runId: event.runId,
1481
- datasetId: event.datasetId,
1482
- datasetName: event.datasetName,
1483
- evaluatorIds: event.evaluatorIds,
1484
- totalTestCases: event.totalTestCases ?? 0,
1485
- artifactPath: event.artifactPath ?? filePath,
1486
- ts: event.ts
1487
- };
1488
- }
1489
- if (type === "RunStarted") {
1490
- runStarted = { startedAt: event.startedAt };
1491
- }
1492
- if (type === "RunCompleted") {
1493
- runCompleted = {
1494
- passedTestCases: event.passedTestCases,
1495
- failedTestCases: event.failedTestCases,
1496
- totalTestCases: event.totalTestCases,
1497
- finishedAt: event.finishedAt
1498
- };
1499
- }
1500
- if (type === "RunFailed") {
1501
- runFailed = {
1502
- finishedAt: event.finishedAt,
1503
- errorMessage: event.errorMessage
1504
- };
1505
- }
1506
- } catch {
1507
- }
1789
+
1790
+ // src/runner/name-pattern.ts
1791
+ function parseRegexLiteral(pattern) {
1792
+ if (!pattern.startsWith("/")) {
1793
+ return void 0;
1508
1794
  }
1509
- if (!runQueued) {
1510
- return null;
1795
+ const lastSlash = pattern.lastIndexOf("/");
1796
+ if (lastSlash <= 0) {
1797
+ return void 0;
1511
1798
  }
1512
- const artifactPath = filePath;
1513
- const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1514
- const progress = aggregateTestCaseProgress(lines);
1515
- const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1516
- const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1517
- const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1518
1799
  return {
1519
- runId: runQueued.runId,
1520
- datasetId: runQueued.datasetId,
1521
- datasetName: runQueued.datasetName,
1522
- evaluatorIds: runQueued.evaluatorIds,
1523
- queuedAt: runQueued.ts ?? 0,
1524
- startedAt: runStarted?.startedAt,
1525
- finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
1526
- totalTestCases: runQueued.totalTestCases,
1527
- completedTestCases,
1528
- passedTestCases,
1529
- failedTestCases,
1530
- status,
1531
- artifactPath,
1532
- errorMessage: runFailed?.errorMessage
1800
+ source: pattern.slice(1, lastSlash),
1801
+ flags: pattern.slice(lastSlash + 1)
1533
1802
  };
1534
1803
  }
1535
- function aggregateTestCaseProgress(lines) {
1536
- let completedTestCases = 0;
1537
- const testCasePassedBy = /* @__PURE__ */ new Map();
1538
- for (const line of lines) {
1539
- try {
1540
- const event = JSON.parse(line);
1541
- if (event.type === "TestCaseProgress") {
1542
- const ev = event;
1543
- completedTestCases = ev.completedTestCases ?? completedTestCases;
1544
- const id = ev.testCaseId;
1545
- const current = testCasePassedBy.get(id);
1546
- testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1547
- }
1548
- } catch {
1549
- }
1804
+ function createNameMatcher(pattern) {
1805
+ const normalizedPattern = pattern.trim();
1806
+ const regexLiteral = parseRegexLiteral(normalizedPattern);
1807
+ if (regexLiteral) {
1808
+ const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1809
+ return (value) => regex.test(value);
1550
1810
  }
1551
- let passedTestCases = 0;
1552
- let failedTestCases = 0;
1553
- for (const passed of testCasePassedBy.values()) {
1554
- if (passed) {
1555
- passedTestCases += 1;
1556
- } else {
1557
- failedTestCases += 1;
1558
- }
1811
+ if (normalizedPattern.includes("*")) {
1812
+ const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1813
+ const regex = new RegExp(`^${escaped}$`, "i");
1814
+ return (value) => regex.test(value);
1559
1815
  }
1560
- return { completedTestCases, passedTestCases, failedTestCases };
1816
+ return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1561
1817
  }
1562
1818
  async function appendJsonLine(artifactPath, payload) {
1563
1819
  await promises.mkdir(path.dirname(artifactPath), { recursive: true });
@@ -1616,32 +1872,12 @@ function searchCollectedTestCases(all, query) {
1616
1872
  }
1617
1873
 
1618
1874
  // src/runner/api.ts
1619
- function parseRegexLiteral(pattern) {
1620
- if (!pattern.startsWith("/")) {
1621
- return void 0;
1622
- }
1623
- const lastSlash = pattern.lastIndexOf("/");
1624
- if (lastSlash <= 0) {
1625
- return void 0;
1626
- }
1627
- return {
1628
- source: pattern.slice(1, lastSlash),
1629
- flags: pattern.slice(lastSlash + 1)
1630
- };
1631
- }
1632
- function createNameMatcher(pattern) {
1633
- const normalizedPattern = pattern.trim();
1634
- const regexLiteral = parseRegexLiteral(normalizedPattern);
1635
- if (regexLiteral) {
1636
- const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1637
- return (value) => regex.test(value);
1638
- }
1639
- if (normalizedPattern.includes("*")) {
1640
- const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1641
- const regex = new RegExp(`^${escaped}$`, "i");
1642
- return (value) => regex.test(value);
1875
+ function normalizeRunRepetitions(value) {
1876
+ const n = value ?? 1;
1877
+ if (!Number.isInteger(n) || n < 1) {
1878
+ throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
1643
1879
  }
1644
- return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1880
+ return n;
1645
1881
  }
1646
1882
  function mergeRunnerOverrides(base, next) {
1647
1883
  if (!base) {
@@ -1676,6 +1912,7 @@ var EffectRunner = class {
1676
1912
  this.listeners = /* @__PURE__ */ new Set();
1677
1913
  this.datasetsById = /* @__PURE__ */ new Map();
1678
1914
  this.evaluatorsById = /* @__PURE__ */ new Map();
1915
+ this.runConfigsById = /* @__PURE__ */ new Map();
1679
1916
  this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
1680
1917
  this.persistenceFiber = effect.Effect.runFork(
1681
1918
  createPersistenceWorker(this.persistenceQueue)
@@ -1716,6 +1953,137 @@ var EffectRunner = class {
1716
1953
  (item) => matcher(item.evaluator.getName() ?? "")
1717
1954
  );
1718
1955
  }
1956
+ async collectRunConfigs() {
1957
+ const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
1958
+ this.runConfigsById.clear();
1959
+ const byNameLower = /* @__PURE__ */ new Map();
1960
+ for (const item of runConfigs) {
1961
+ const id = item.runConfig.getName();
1962
+ const lower = id.toLowerCase();
1963
+ const prev = byNameLower.get(lower);
1964
+ if (prev !== void 0 && prev.filePath !== item.filePath) {
1965
+ throw new Error(
1966
+ `Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
1967
+ );
1968
+ }
1969
+ byNameLower.set(lower, item);
1970
+ this.runConfigsById.set(id, item);
1971
+ }
1972
+ return runConfigs;
1973
+ }
1974
+ async resolveRunConfigByName(name) {
1975
+ if (this.runConfigsById.size === 0) {
1976
+ await this.collectRunConfigs();
1977
+ }
1978
+ const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
1979
+ const keyLower = key.toLowerCase();
1980
+ const matches = Array.from(this.runConfigsById.values()).filter(
1981
+ (item) => item.runConfig.getName().toLowerCase() === keyLower
1982
+ );
1983
+ if (matches.length === 0) {
1984
+ return void 0;
1985
+ }
1986
+ if (matches.length > 1) {
1987
+ throw new Error(
1988
+ `Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
1989
+ );
1990
+ }
1991
+ return matches[0];
1992
+ }
1993
+ async expandRunConfigToJobs(collected) {
1994
+ if (this.datasetsById.size === 0) {
1995
+ await this.collectDatasets();
1996
+ }
1997
+ if (this.evaluatorsById.size === 0) {
1998
+ await this.collectEvaluators();
1999
+ }
2000
+ const rcName = collected.runConfig.getName();
2001
+ const jobs = [];
2002
+ const runs = collected.runConfig.getRuns();
2003
+ for (const [i, row] of runs.entries()) {
2004
+ const dsCollected = Array.from(this.datasetsById.values()).find(
2005
+ (d) => d.dataset === row.dataset
2006
+ );
2007
+ if (!dsCollected) {
2008
+ throw new Error(
2009
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
2010
+ );
2011
+ }
2012
+ let evaluatorIds;
2013
+ if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
2014
+ const matcher = createNameMatcher(row.evaluatorPattern);
2015
+ const matched = Array.from(this.evaluatorsById.values()).filter(
2016
+ (item) => matcher(item.evaluator.getName() ?? "")
2017
+ );
2018
+ if (matched.length === 0) {
2019
+ throw new Error(
2020
+ `RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
2021
+ );
2022
+ }
2023
+ evaluatorIds = matched.map((item) => item.id);
2024
+ } else {
2025
+ const evaluators = row.evaluators;
2026
+ evaluatorIds = [];
2027
+ for (const ev of evaluators) {
2028
+ const found = Array.from(this.evaluatorsById.values()).find(
2029
+ (item) => item.evaluator === ev
2030
+ );
2031
+ if (!found) {
2032
+ throw new Error(
2033
+ `RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
2034
+ );
2035
+ }
2036
+ evaluatorIds.push(found.id);
2037
+ }
2038
+ }
2039
+ const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
2040
+ jobs.push({
2041
+ datasetId: dsCollected.id,
2042
+ evaluatorIds,
2043
+ runConfigName: rcName,
2044
+ runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
2045
+ runConfigTags: collected.runConfig.getTags(),
2046
+ repetitions
2047
+ });
2048
+ }
2049
+ return jobs;
2050
+ }
2051
+ async expandRunConfigNamesToJobs(names) {
2052
+ const jobs = [];
2053
+ for (const name of names) {
2054
+ const collected = await this.resolveRunConfigByName(name);
2055
+ if (!collected) {
2056
+ const known = await this.collectRunConfigs();
2057
+ const available = known.map((r) => r.runConfig.getName()).sort();
2058
+ throw new Error(
2059
+ available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
2060
+ );
2061
+ }
2062
+ jobs.push(...await this.expandRunConfigToJobs(collected));
2063
+ }
2064
+ return jobs;
2065
+ }
2066
+ async runDatasetJobsWithSharedConcurrency(request) {
2067
+ const globalConcurrency = Math.max(1, request.globalConcurrency);
2068
+ const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
2069
+ const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
2070
+ const snapshots = [];
2071
+ for (const job of request.jobs) {
2072
+ snapshots.push(
2073
+ await this.startDatasetRun({
2074
+ datasetId: job.datasetId,
2075
+ evaluatorIds: job.evaluatorIds,
2076
+ triggerId,
2077
+ maxConcurrency: this.config.maxConcurrency ?? 1,
2078
+ globalEvaluationSemaphore: sem,
2079
+ runConfigName: job.runConfigName,
2080
+ runConfigTags: job.runConfigTags,
2081
+ repetitions: job.repetitions
2082
+ })
2083
+ );
2084
+ }
2085
+ return snapshots;
2086
+ }
1719
2087
  async searchTestCases(query) {
1720
2088
  const testCases = await collectTestCasesFromFiles(this.config.discovery);
1721
2089
  return searchCollectedTestCases(testCases, query);
@@ -1734,36 +2102,46 @@ var EffectRunner = class {
1734
2102
  );
1735
2103
  }
1736
2104
  async runDatasetWith(request) {
2105
+ const runConfigName = validateRunConfigName(
2106
+ request.runConfigName,
2107
+ "runDatasetWith.runConfigName"
2108
+ );
2109
+ return this.startDatasetRun({
2110
+ datasetId: request.datasetId,
2111
+ evaluatorIds: request.evaluatorIds,
2112
+ triggerId: request.triggerId,
2113
+ maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2114
+ repetitions: request.repetitions,
2115
+ runConfigName,
2116
+ runConfigTags: request.runConfigTags
2117
+ });
2118
+ }
2119
+ async startDatasetRun(params) {
1737
2120
  if (this.datasetsById.size === 0) {
1738
2121
  await this.collectDatasets();
1739
2122
  }
1740
2123
  if (this.evaluatorsById.size === 0) {
1741
2124
  await this.collectEvaluators();
1742
2125
  }
1743
- const dataset = this.datasetsById.get(request.datasetId);
2126
+ const dataset = this.datasetsById.get(params.datasetId);
1744
2127
  if (!dataset) {
1745
- throw new Error(`Unknown dataset: ${request.datasetId}`);
2128
+ throw new Error(`Unknown dataset: ${params.datasetId}`);
1746
2129
  }
1747
- const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
2130
+ const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
1748
2131
  if (selectedEvaluators.length === 0) {
1749
2132
  throw new Error("No evaluators selected for run");
1750
2133
  }
1751
- const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1752
- const totalEvaluations = selectedTestCases.reduce(
1753
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1754
- 0
1755
- );
1756
- const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
2134
+ const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
2135
+ const repetitions = normalizeRunRepetitions(params.repetitions);
2136
+ const totalEvaluations = selectedTestCases.length * repetitions;
2137
+ const runConfigTags = [...params.runConfigTags ?? []];
2138
+ const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
1757
2139
  const runId = `run-${crypto.randomUUID()}`;
1758
- const artifactPath = createArtifactPath(
1759
- this.config.artifactDirectory,
1760
- request.datasetId,
1761
- runId
1762
- );
2140
+ const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
1763
2141
  const snapshot = {
1764
2142
  runId,
1765
- datasetId: request.datasetId,
1766
- datasetName: dataset.dataset.getName(),
2143
+ datasetId: params.datasetId,
2144
+ datasetName: dataset.dataset.getDisplayLabel(),
1767
2145
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1768
2146
  queuedAt: Date.now(),
1769
2147
  totalTestCases: totalEvaluations,
@@ -1783,8 +2161,8 @@ var EffectRunner = class {
1783
2161
  const queuedEvent = {
1784
2162
  type: "RunQueued",
1785
2163
  runId,
1786
- datasetId: request.datasetId,
1787
- datasetName: dataset.dataset.getName(),
2164
+ datasetId: params.datasetId,
2165
+ datasetName: dataset.dataset.getDisplayLabel(),
1788
2166
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1789
2167
  totalTestCases: totalEvaluations,
1790
2168
  artifactPath
@@ -1797,17 +2175,20 @@ var EffectRunner = class {
1797
2175
  payload: queuedEvent
1798
2176
  })
1799
2177
  );
1800
- const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
1801
2178
  await effect.Effect.runPromise(
1802
2179
  effect.Queue.offer(this.runQueue, {
1803
2180
  runId,
1804
2181
  triggerId,
1805
- datasetId: request.datasetId,
2182
+ datasetId: params.datasetId,
1806
2183
  dataset: dataset.dataset,
1807
2184
  evaluators: selectedEvaluators,
1808
2185
  testCases: selectedTestCases,
1809
2186
  snapshot,
1810
- maxConcurrency
2187
+ maxConcurrency: params.maxConcurrency,
2188
+ globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2189
+ runConfigName: params.runConfigName,
2190
+ runConfigTags,
2191
+ repetitions
1811
2192
  })
1812
2193
  );
1813
2194
  return snapshot;
@@ -1879,15 +2260,27 @@ var EffectRunner = class {
1879
2260
  }
1880
2261
  };
1881
2262
 
2263
+ // src/runner/events.ts
2264
+ var PROGRAMMATIC_RUN_CONFIG = {
2265
+ runConfigName: "programmatic"
2266
+ };
2267
+
1882
2268
  Object.defineProperty(exports, 'S', {
1883
2269
  enumerable: true,
1884
2270
  get: function () { return effect.Schema; }
1885
2271
  });
1886
2272
  exports.Dataset = Dataset;
2273
+ exports.DatasetNameSchema = DatasetNameSchema;
1887
2274
  exports.Evaluator = Evaluator;
2275
+ exports.EvaluatorNameSchema = EvaluatorNameSchema;
1888
2276
  exports.Metric = Metric;
2277
+ exports.PROGRAMMATIC_RUN_CONFIG = PROGRAMMATIC_RUN_CONFIG;
2278
+ exports.RunConfig = RunConfig;
2279
+ exports.RunConfigNameSchema = RunConfigNameSchema;
1889
2280
  exports.Score = Score;
2281
+ exports.TagSet = TagSet;
1890
2282
  exports.TestCase = TestCase;
2283
+ exports.TestCaseNameSchema = TestCaseNameSchema;
1891
2284
  exports.binaryScore = binaryScore;
1892
2285
  exports.createLogEntry = createLogEntry;
1893
2286
  exports.createRunner = createRunner;
@@ -1895,16 +2288,26 @@ exports.defaultRunnerConfig = defaultRunnerConfig;
1895
2288
  exports.defineConfig = defineConfig;
1896
2289
  exports.deltaScore = deltaScore;
1897
2290
  exports.formatScoreData = formatScoreData;
2291
+ exports.getDatasetDisplayLabel = getDatasetDisplayLabel;
2292
+ exports.getEvaluatorDisplayLabel = getEvaluatorDisplayLabel;
2293
+ exports.getEvaluatorTagList = getEvaluatorTagList;
1898
2294
  exports.getLogLines = getLogLines;
1899
2295
  exports.getMetricById = getMetricById;
1900
2296
  exports.getScoreById = getScoreById;
2297
+ exports.getTestCaseDisplayLabel = getTestCaseDisplayLabel;
2298
+ exports.getTestCaseTagList = getTestCaseTagList;
1901
2299
  exports.latencyMetric = latencyMetric;
1902
2300
  exports.loadMockData = loadMockData;
1903
2301
  exports.loadRunnerData = loadRunnerData;
2302
+ exports.normalizeOptionalDisplayName = normalizeOptionalDisplayName;
1904
2303
  exports.parseStartupArgs = parseStartupArgs;
1905
2304
  exports.percentScore = percentScore;
1906
2305
  exports.printJsonDiff = printJsonDiff;
1907
2306
  exports.tokenCountMetric = tokenCountMetric;
2307
+ exports.validateDatasetName = validateDatasetName;
2308
+ exports.validateEvaluatorName = validateEvaluatorName;
2309
+ exports.validateRunConfigName = validateRunConfigName;
2310
+ exports.validateTestCaseName = validateTestCaseName;
1908
2311
  exports.withRunnerConfig = withRunnerConfig;
1909
2312
  //# sourceMappingURL=out.js.map
1910
2313
  //# sourceMappingURL=index.cjs.map