@m4trix/evals 0.25.1 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,14 +1,257 @@
1
- import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
1
+ import { Schema, Effect, PubSub, Queue, Ref, Fiber, Either, ParseResult } from 'effect';
2
2
  export { Schema as S } from 'effect';
3
3
  import { diffLines } from 'diff';
4
4
  import stringify from 'fast-json-stable-stringify';
5
5
  import { randomUUID } from 'crypto';
6
+ import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
7
+ import { resolve as resolve$1, join, relative, dirname } from 'path';
6
8
  import { existsSync } from 'fs';
7
- import { resolve as resolve$1, relative, join, dirname } from 'path';
8
9
  import * as jitiModule from 'jiti';
9
- import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
10
10
  import { pathToFileURL } from 'url';
11
11
 
12
+ // src/index.ts
13
/** Characters allowed in a canonical entity id: ASCII letters, digits, `_` and `-`. */
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;

/**
 * Builds the branded string schema used for one kind of entity id.
 *
 * The schema requires a trimmed, non-empty string matching
 * {@link ENTITY_ID_PATTERN}, and brands the result with `brand`.
 *
 * @param {string} brand - Brand passed to `Schema.brand`.
 * @param {string} label - Human-readable entity label used in error messages.
 * @returns The composed schema.
 */
function makeEntityIdSchema(brand, label) {
  const emptyMessage = () => `${label} must be non-empty.`;
  const patternMessage = () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`;

  const trimmedCheck = Schema.trimmed();
  const nonEmptyCheck = Schema.minLength(1, { message: emptyMessage });
  const patternCheck = Schema.pattern(ENTITY_ID_PATTERN, { message: patternMessage });
  const brandStep = Schema.brand(brand);

  return Schema.String.pipe(trimmedCheck, nonEmptyCheck, patternCheck, brandStep);
}

// One id schema per entity kind; they differ only in brand and error label.
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
var DatasetNameSchema = makeEntityIdSchema("DatasetName", "Dataset name");
30
/**
 * Decodes `raw` with `schema` and returns the decoded value.
 *
 * String input is trimmed before decoding. Any other input (for example a
 * missing `name`) is handed to the decoder unchanged so that the failure is
 * reported through the schema's own error, prefixed with `context`, rather
 * than as a bare `TypeError` from calling `.trim()` on a non-string.
 *
 * @param schema - Schema to decode with.
 * @param {unknown} raw - Candidate value, normally a string.
 * @param {string} context - Prefix for the error message (e.g. "Dataset.define").
 * @returns The decoded value.
 * @throws {Error} When decoding fails; the message is `${context}: <formatted parse error>`.
 */
function validateWithSchema(schema, raw, context) {
  const candidate = typeof raw === "string" ? raw.trim() : raw;
  const result = Schema.decodeUnknownEither(schema)(candidate);
  if (Either.isLeft(result)) {
    throw new Error(`${context}: ${ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
  }
  return result.right;
}
41
/**
 * Validates a RunConfig id against `RunConfigNameSchema`.
 *
 * @param {string} raw - Candidate name.
 * @param {string} context - Prefix used in the error message on failure.
 * @returns The value produced by `validateWithSchema`.
 */
function validateRunConfigName(raw, context) {
  const runConfigName = validateWithSchema(RunConfigNameSchema, raw, context);
  return runConfigName;
}
44
/**
 * Validates an evaluator id against `EvaluatorNameSchema`.
 *
 * @param {string} raw - Candidate name.
 * @param {string} context - Prefix used in the error message on failure.
 * @returns The value produced by `validateWithSchema`.
 */
function validateEvaluatorName(raw, context) {
  const evaluatorName = validateWithSchema(EvaluatorNameSchema, raw, context);
  return evaluatorName;
}
47
/**
 * Validates a test case id against `TestCaseNameSchema`.
 *
 * @param {string} raw - Candidate name.
 * @param {string} context - Prefix used in the error message on failure.
 * @returns The value produced by `validateWithSchema`.
 */
function validateTestCaseName(raw, context) {
  const testCaseName = validateWithSchema(TestCaseNameSchema, raw, context);
  return testCaseName;
}
50
/**
 * Validates a dataset id against `DatasetNameSchema`.
 *
 * @param {string} raw - Candidate name.
 * @param {string} context - Prefix used in the error message on failure.
 * @returns The value produced by `validateWithSchema`.
 */
function validateDatasetName(raw, context) {
  const datasetName = validateWithSchema(DatasetNameSchema, raw, context);
  return datasetName;
}
53
/**
 * Normalizes an optional display name.
 *
 * @param {string | null | undefined} raw - Display name as supplied by the caller.
 * @returns {string | undefined} The trimmed name, or `undefined` when the
 *   input is absent (`undefined` or `null`) or contains only whitespace.
 */
function normalizeOptionalDisplayName(raw) {
  // `== null` covers both undefined and null, so a null display name is
  // treated as "not provided" instead of crashing on `.trim()`.
  if (raw == null) {
    return undefined;
  }
  const trimmed = raw.trim();
  return trimmed === "" ? undefined : trimmed;
}
60
+
61
+ // src/evals/dataset.ts
62
/**
 * Reports whether `value` satisfies at least one matcher.
 *
 * String matchers must equal `value` exactly; every other matcher is used
 * through its `test` method (regular expressions).
 *
 * @param {string} value - Value to check.
 * @param {Array<string | RegExp>} matchers - Matchers to try in order.
 * @returns {boolean} `true` as soon as one matcher accepts `value`.
 */
function matchesAny(value, matchers) {
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return value === matcher;
    }
    // Global/sticky regexes resume from `lastIndex`, which would make the
    // outcome depend on earlier calls; always start from the beginning.
    if (matcher.global || matcher.sticky) {
      matcher.lastIndex = 0;
    }
    return matcher.test(value);
  });
}
67
/**
 * Reports whether `filePath` satisfies at least one path matcher.
 *
 * String matchers are treated as glob patterns and delegated to
 * `simpleGlobMatch`; every other matcher is used through its `test` method.
 *
 * @param {string} filePath - Path to check.
 * @param {Array<string | RegExp>} matchers - Glob strings or regular expressions.
 * @returns {boolean} `true` as soon as one matcher accepts `filePath`.
 */
function matchesAnyPath(filePath, matchers) {
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return simpleGlobMatch(matcher, filePath);
    }
    // Global/sticky regexes resume from `lastIndex`, which would make the
    // outcome depend on earlier calls; always start from the beginning.
    if (matcher.global || matcher.sticky) {
      matcher.lastIndex = 0;
    }
    return matcher.test(filePath);
  });
}
75
/**
 * Tests `value` against a minimal glob `pattern`.
 *
 * Supported tokens:
 * - `**` followed by `/` matches zero or more leading directories,
 * - `**` matches any characters, including `/`,
 * - `*` matches any run of characters except `/`,
 * - `?` matches a single character except `/`.
 * Every other regex metacharacter is matched literally.
 *
 * The pattern is translated in a single pass so that the regex fragments
 * emitted for one token are never re-interpreted as glob tokens. (A chain of
 * successive `replace` calls rewrites the `*` inside an already-emitted `.*`,
 * which stops `**` from spanning more than one directory.)
 *
 * @param {string} pattern - Glob pattern.
 * @param {string} value - Path to test; the whole string must match.
 * @returns {boolean} Whether `value` matches `pattern`.
 */
function simpleGlobMatch(pattern, value) {
  const source = pattern.replace(/\*\*\/|\*\*|\*|\?|[.+^${}()|[\]\\]/g, (token) => {
    switch (token) {
      case "**/":
        return "(?:.*/)?";
      case "**":
        return ".*";
      case "*":
        return "[^/]*";
      case "?":
        return "[^/]";
      default:
        // Regex metacharacter that must be matched literally.
        return `\\${token}`;
    }
  });
  return new RegExp(`^${source}$`).test(value);
}
79
/**
 * A named selection of test cases, expressed as include/exclude rules over
 * test-case tags and file paths.
 */
var Dataset = class _Dataset {
  constructor(config) {
    this._config = config;
  }

  /**
   * Creates a dataset from user configuration. The name is validated with
   * `validateDatasetName`, the display name is normalized, and each omitted
   * matcher list defaults to an empty array.
   */
  static define(config) {
    const name = validateDatasetName(config.name, "Dataset.define");
    const displayName = normalizeOptionalDisplayName(config.displayName);
    const normalized = {
      name,
      displayName,
      includedTags: config.includedTags ?? [],
      excludedTags: config.excludedTags ?? [],
      includedPaths: config.includedPaths ?? [],
      excludedPaths: config.excludedPaths ?? []
    };
    return new _Dataset(normalized);
  }

  /** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
  getName() {
    const { name } = this._config;
    return name;
  }

  /** Optional display name as stored on the config. */
  getDisplayName() {
    const { displayName } = this._config;
    return displayName;
  }

  /** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
  getDisplayLabel() {
    const { displayName, name } = this._config;
    return displayName ?? name;
  }

  getIncludedTags() {
    const { includedTags } = this._config;
    return includedTags;
  }

  getExcludedTags() {
    const { excludedTags } = this._config;
    return excludedTags;
  }

  getIncludedPaths() {
    const { includedPaths } = this._config;
    return includedPaths;
  }

  getExcludedPaths() {
    const { excludedPaths } = this._config;
    return excludedPaths;
  }

  /**
   * Decides whether a test case belongs to this dataset.
   *
   * Exclusions win: a matching excluded tag or excluded path rejects the
   * test case. Otherwise it is accepted when it satisfies the included tags
   * and the included paths, where an empty include list accepts everything.
   */
  matchesTestCase(testCase, filePath) {
    const tags = testCase.getTags();
    const { excludedTags, excludedPaths, includedTags, includedPaths } = this._config;
    const hasTagIn = (matchers) => tags.some((tag) => matchesAny(tag, matchers));

    if (excludedTags.length > 0 && hasTagIn(excludedTags)) {
      return false;
    }
    if (excludedPaths.length > 0 && matchesAnyPath(filePath, excludedPaths)) {
      return false;
    }

    const tagMatch = includedTags.length === 0 || hasTagIn(includedTags);
    const pathMatch = includedPaths.length === 0 || matchesAnyPath(filePath, includedPaths);
    return tagMatch && pathMatch;
  }
};
135
/**
 * Returns the label to show for a dataset-like object.
 *
 * Uses `getDisplayLabel()` when the object has it, falls back to
 * `getName()`, and finally to the empty string when neither method exists.
 *
 * @param {object} dataset - Dataset or dataset-like object.
 * @returns {string} The display label.
 */
function getDatasetDisplayLabel(dataset) {
  const hasMethod = (method) => typeof dataset[method] === "function";
  if (hasMethod("getDisplayLabel")) {
    return dataset.getDisplayLabel();
  }
  if (hasMethod("getName")) {
    return dataset.getName();
  }
  return "";
}
141
+
142
+ // src/evals/evaluator.ts
143
/**
 * Immutable builder for an evaluator: every chaining method returns a new
 * instance instead of modifying the receiver.
 */
var Evaluator = class _Evaluator {
  constructor(config) {
    this._config = config;
  }

  /** Snapshot of the configuration fields that are carried across chaining calls. */
  getState() {
    const {
      name,
      displayName,
      tags,
      inputSchema,
      outputSchema,
      scoreSchema,
      middlewares,
      evaluateFn,
      passThreshold,
      passCriterion
    } = this._config;
    return {
      name,
      displayName,
      tags,
      inputSchema,
      outputSchema,
      scoreSchema,
      middlewares,
      evaluateFn,
      passThreshold,
      passCriterion
    };
  }

  /** Starts a chain with a single middleware and no tags. */
  static use(middleware) {
    return new _Evaluator({ middlewares: [middleware], tags: [] });
  }

  /** Returns a copy of this evaluator with `middleware` appended. */
  use(middleware) {
    const current = this.getState();
    const middlewares = [...current.middlewares];
    middlewares.push(middleware);
    return new _Evaluator({ ...current, middlewares });
  }

  /**
   * Returns a new evaluator carrying this chain's middlewares plus the
   * validated name, normalized display name, copied tags, schemas and pass
   * settings from `config`. No evaluate function is carried over.
   */
  define(config) {
    const { middlewares } = this.getState();
    const name = validateEvaluatorName(config.name, "Evaluator.define");
    const displayName = normalizeOptionalDisplayName(config.displayName);
    const tags = config.tags === undefined ? [] : [...config.tags];
    const { inputSchema, outputSchema, scoreSchema, passThreshold, passCriterion } = config;
    return new _Evaluator({
      name,
      displayName,
      tags,
      inputSchema,
      outputSchema,
      scoreSchema,
      middlewares,
      passThreshold,
      passCriterion
    });
  }

  /** Returns a copy of this evaluator with `fn` as its evaluate function. */
  evaluate(fn) {
    const current = this.getState();
    return new _Evaluator({ ...current, evaluateFn: fn });
  }

  /** Canonical evaluator id when defined; otherwise undefined (middleware-only chain). */
  getName() {
    const { name } = this._config;
    return name;
  }

  getDisplayName() {
    const { displayName } = this._config;
    return displayName;
  }

  /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. Undefined if not yet defined. */
  getDisplayLabel() {
    const { name, displayName } = this._config;
    return name === undefined ? undefined : displayName ?? name;
  }

  /** Tags from `Evaluator.define({ tags })`; empty until defined. Returns a fresh copy. */
  getTags() {
    const { tags } = this._config;
    return [...tags];
  }

  getInputSchema() {
    const { inputSchema } = this._config;
    return inputSchema;
  }

  getOutputSchema() {
    const { outputSchema } = this._config;
    return outputSchema;
  }

  getScoreSchema() {
    const { scoreSchema } = this._config;
    return scoreSchema;
  }

  getMiddlewares() {
    const { middlewares } = this._config;
    return middlewares;
  }

  getEvaluateFn() {
    const { evaluateFn } = this._config;
    return evaluateFn;
  }

  getPassThreshold() {
    const { passThreshold } = this._config;
    return passThreshold;
  }

  getPassCriterion() {
    const { passCriterion } = this._config;
    return passCriterion;
  }

  /**
   * Calls `resolve()` on every middleware, awaits them together and merges
   * the results into one object; later middlewares override earlier keys.
   */
  async resolveContext() {
    const pending = this._config.middlewares.map((mw) => mw.resolve());
    const parts = await Promise.all(pending);
    return Object.assign({}, ...parts);
  }
};
242
/**
 * Returns the label to show for an evaluator-like object.
 *
 * Prefers a defined result from `getDisplayLabel()`, then falls back to
 * `getName()`. Yields `undefined` when neither provides a value.
 *
 * @param {object} evaluator - Evaluator or evaluator-like object.
 * @returns {string | undefined} The display label, if any.
 */
function getEvaluatorDisplayLabel(evaluator) {
  const label = typeof evaluator.getDisplayLabel === "function" ? evaluator.getDisplayLabel() : undefined;
  if (label !== undefined) {
    return label;
  }
  if (typeof evaluator.getName !== "function") {
    return undefined;
  }
  return evaluator.getName();
}
251
/**
 * Returns a fresh array of the evaluator's tags, or an empty array when the
 * object has no `getTags` method.
 *
 * @param {object} evaluator - Evaluator or evaluator-like object.
 * @returns {string[]} Copy of the tag list.
 */
function getEvaluatorTagList(evaluator) {
  if (typeof evaluator.getTags !== "function") {
    return [];
  }
  return [...evaluator.getTags()];
}
254
+
12
255
  // src/cli/data.mock.json
13
256
  var data_mock_default = {
14
257
  datasets: [
@@ -255,7 +498,7 @@ function toEvalDataset(item, snapshots) {
255
498
  const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
256
499
  return {
257
500
  id: item.id,
258
- name: item.dataset.getName(),
501
+ name: getDatasetDisplayLabel(item.dataset),
259
502
  overview: `Discovered from ${item.filePath}`,
260
503
  runs
261
504
  };
@@ -263,7 +506,7 @@ function toEvalDataset(item, snapshots) {
263
506
/**
 * Maps a discovered evaluator item to the option shape `{ id, name, configPreview }`.
 *
 * The name is the evaluator's display label, or a slug derived from the item
 * id when no label is available.
 */
function toEvaluatorOption(item) {
  const label = getEvaluatorDisplayLabel(item.evaluator);
  return {
    id: item.id,
    name: label ?? toSlug(item.id),
    configPreview: `Source: ${item.filePath}`
  };
}
@@ -308,196 +551,149 @@ function parseStartupArgs(argv) {
308
551
  }
309
552
  return args;
310
553
  }
311
-
312
- // src/evals/test-case.ts
313
- function resolve(value) {
314
- return typeof value === "function" ? value() : value;
315
- }
316
- var TestCase = class _TestCase {
317
- constructor(config) {
318
- this._config = config;
554
/**
 * Recursively normalizes a value before it is serialized for diffing.
 *
 * - Arrays: every element is preprocessed; when `options.sort` is truthy the
 *   processed elements are ordered by their stable JSON serialization.
 * - Plain objects: every property is preprocessed, except keys listed in
 *   `options.excludeKeys` (an array, or a comma-separated string).
 * - Numbers: rounded to `options.precision` decimals when it is provided.
 * - Anything else is returned unchanged.
 *
 * Array elements are always traversed, so `excludeKeys` and `precision`
 * apply inside arrays whether or not `sort` is requested.
 *
 * @param {unknown} value - Value to normalize.
 * @param {{ sort?: boolean, excludeKeys?: string[] | string, precision?: number }} [options]
 * @returns {unknown} A normalized copy (primitives are returned as-is).
 */
function preprocessForDiff(value, options) {
  if (Array.isArray(value)) {
    const processed = value.map((item) => preprocessForDiff(item, options));
    if (!options?.sort) {
      return processed;
    }
    // Serialize each element once, instead of on every comparison.
    return processed
      .map((item) => ({ item, key: stringify(item) }))
      .sort((a, b) => a.key.localeCompare(b.key))
      .map((entry) => entry.item);
  }
  if (value !== null && typeof value === "object") {
    const excludeKeys = options?.excludeKeys;
    let skipped = [];
    if (excludeKeys) {
      skipped = Array.isArray(excludeKeys) ? excludeKeys : excludeKeys.split(",").map((k) => k.trim());
    }
    const result = {};
    for (const [key, child] of Object.entries(value)) {
      if (!skipped.includes(key)) {
        result[key] = preprocessForDiff(child, options);
      }
    }
    return result;
  }
  if (typeof value === "number" && options?.precision !== undefined) {
    return Number(value.toFixed(options.precision));
  }
  return value;
}
584
/**
 * Serializes `value` as two-space-indented JSON.
 *
 * The value is first passed through the stable `stringify` and then
 * re-parsed and pretty-printed.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string} Pretty JSON; the compact form when it cannot be
 *   re-parsed; or `String(value)` when the serializer produced no output.
 */
function toPrettyJson(value) {
  const canonical = stringify(value);
  // NOTE(review): the stable serializer is expected to return `undefined`
  // for values with no JSON form (e.g. a top-level `undefined`). Always hand
  // back a string so callers can feed the result to the line differ.
  if (canonical === undefined) {
    return String(value);
  }
  try {
    return JSON.stringify(JSON.parse(canonical), null, 2);
  } catch {
    return canonical;
  }
}
593
/**
 * Renders diff parts as text, one output line per source line.
 *
 * Lines from added parts are prefixed with "+ ", lines from removed parts
 * with "- ", and unchanged lines get no prefix. The empty segment produced
 * by a part's trailing newline is dropped.
 *
 * @param {Array<{ value: string, added?: boolean, removed?: boolean }>} parts
 * @returns {string} The formatted lines joined with "\n".
 */
function formatDiffParts(parts) {
  return parts
    .flatMap((part) => {
      let marker = "";
      if (part.added) {
        marker = "+ ";
      } else if (part.removed) {
        marker = "- ";
      }
      const segments = part.value.split("\n");
      if (segments[segments.length - 1] === "") {
        segments.pop();
      }
      return segments.map((segment) => marker + segment);
    })
    .join("\n");
}
607
/**
 * Produces a line-based textual diff between `expected` and `actual`.
 *
 * Both values are normalized with `preprocessForDiff` and then rendered,
 * either as pretty JSON or, with `diffOptions.keysOnly`, as the pretty JSON
 * of their key skeleton (`extractKeys`). The rendered texts are compared
 * line by line and formatted with `formatDiffParts`.
 *
 * Both modes behave the same way: identical renderings yield the empty
 * string, and `diffOptions.outputNewOnly` keeps only added lines.
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Value under test.
 * @param {object} [diffOptions] - `keysOnly`, `outputNewOnly`, plus the
 *   options understood by `preprocessForDiff`.
 * @returns {string} The formatted diff, or "" when there is no difference.
 */
function createDiffString(expected, actual, diffOptions) {
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
  const actualProcessed = preprocessForDiff(actual, diffOptions);

  const render = diffOptions?.keysOnly
    ? (processed) => JSON.stringify(extractKeys(processed), null, 2)
    : toPrettyJson;
  const expectedStr = render(expectedProcessed);
  const actualStr = render(actualProcessed);

  // Without this shortcut the differ would return one unchanged part and the
  // whole document would be reported as the "diff".
  if (expectedStr === actualStr) {
    return "";
  }

  const parts = diffLines(expectedStr, actualStr);
  const visible = diffOptions?.outputNewOnly ? parts.filter((p) => p.added === true) : parts;
  return formatDiffParts(visible);
}
628
/**
 * Reduces a value to its key structure: objects keep their keys, arrays keep
 * their length, and every leaf (primitive or `null`) becomes the placeholder
 * "\xB7".
 *
 * @param {unknown} value - Value to reduce.
 * @returns {unknown} The key skeleton.
 */
function extractKeys(value) {
  const isObject = value !== null && typeof value === "object";
  if (!isObject) {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map((item) => extractKeys(item));
  }
  const skeleton = {};
  Object.entries(value).forEach(([key, child]) => {
    skeleton[key] = extractKeys(child);
  });
  return skeleton;
}
444
- function matchesAnyPath(filePath, matchers) {
445
- return matchers.some((matcher) => {
446
- if (typeof matcher === "string") {
447
- return simpleGlobMatch(matcher, filePath);
641
/**
 * Converts an arbitrary log payload to text.
 *
 * Strings pass through; errors become their stack (or message when no stack
 * is available); other objects are pretty-printed as JSON, falling back to
 * `String(msg)` if serialization throws; everything else is stringified.
 *
 * @param {unknown} msg - Payload to format.
 * @returns {string} Text representation of `msg`.
 */
function formatLogMessage(msg) {
  if (typeof msg === "string") {
    return msg;
  }
  if (msg instanceof Error) {
    return msg.stack ?? msg.message;
  }
  const isObject = msg !== null && typeof msg === "object";
  if (!isObject) {
    return String(msg);
  }
  try {
    return JSON.stringify(msg, null, 2);
  } catch {
    // e.g. circular structures or BigInt members
    return String(msg);
  }
}
452
- function simpleGlobMatch(pattern, value) {
453
- const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
454
- return new RegExp(`^${escaped}$`).test(value);
655
/**
 * Builds a `"log"` entry whose message is `message` formatted as text.
 *
 * @param {unknown} message - Payload passed to `formatLogMessage`.
 * @param {{ label?: string }} [options] - Optional label for the entry.
 * @returns {{ type: "log", label: string | undefined, message: string }}
 */
function createLogEntry(message, options) {
  const label = options?.label;
  const text = formatLogMessage(message);
  return { type: "log", label, message: text };
}
456
- var Dataset = class _Dataset {
457
- constructor(config) {
458
- this._config = config;
459
- }
460
- static define(config) {
461
- return new _Dataset({
462
- name: config.name,
463
- includedTags: config.includedTags ?? [],
464
- excludedTags: config.excludedTags ?? [],
465
- includedPaths: config.includedPaths ?? [],
466
- excludedPaths: config.excludedPaths ?? []
467
- });
468
- }
469
- getName() {
470
- return this._config.name;
471
- }
472
- getIncludedTags() {
473
- return this._config.includedTags;
474
- }
475
- getExcludedTags() {
476
- return this._config.excludedTags;
477
- }
478
- getIncludedPaths() {
479
- return this._config.includedPaths;
480
- }
481
- getExcludedPaths() {
482
- return this._config.excludedPaths;
483
- }
484
- matchesTestCase(testCase, filePath) {
485
- const tags = testCase.getTags();
486
- if (this._config.excludedTags.length > 0) {
487
- if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
488
- return false;
662
/**
 * Splits a log entry's message into its individual lines.
 *
 * @param {{ message: string }} entry - Log entry.
 * @returns {string[]} The message split on "\n".
 */
function getLogLines(entry) {
  const { message } = entry;
  return message.split("\n");
}
665
/**
 * Builds a `"diff"` entry comparing `expected` with `actual`.
 *
 * `options.label` becomes the entry label; all remaining options are passed
 * to `createDiffString`. An empty diff is reported as "(no differences)".
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Value under test.
 * @param {object} [options] - `label` plus diff options.
 * @returns {{ type: "diff", label: string | undefined, expected: unknown, actual: unknown, diff: string }}
 */
function createDiffLogEntry(expected, actual, options) {
  const { label, ...diffOpts } = options ?? {};
  const rendered = createDiffString(expected, actual, diffOpts);
  const entry = {
    type: "diff",
    label,
    expected,
    actual,
    diff: rendered || "(no differences)"
  };
  return entry;
}
676
/**
 * Prints the diff between `expected` and `actual` to the console and returns
 * the printed text (the empty string when there are no differences, in which
 * case "(no differences)" is printed instead).
 *
 * With `options.color` (default `true`) removed lines are wrapped in ANSI
 * red and added lines in ANSI green. All remaining options are passed to
 * `createDiffString`.
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Value under test.
 * @param {object} [options] - `color` plus diff options.
 * @returns {string} The (possibly colored) diff text.
 */
function printJsonDiff(expected, actual, options = {}) {
  const { color = true, ...diffOpts } = options;
  const diff = createDiffString(expected, actual, diffOpts);

  // The diff markers are always "- " / "+ " at column 0. Checking the raw
  // line (not a trimmed copy) keeps unchanged, indented JSON content such as
  // `  -5` from being colored as a removal.
  const colorize = (line) => {
    if (line.startsWith("- ")) {
      return `\x1B[31m${line}\x1B[0m`;
    }
    if (line.startsWith("+ ")) {
      return `\x1B[32m${line}\x1B[0m`;
    }
    return line;
  };

  const output = color ? diff.split("\n").map(colorize).join("\n") : diff;
  console.log(output || "(no differences)");
  return output;
}
501
697
 
502
698
  // src/evals/metric.ts
503
699
  var registry = /* @__PURE__ */ new Map();
@@ -522,6 +718,113 @@ function getMetricById(id) {
522
718
  return registry.get(id);
523
719
  }
524
720
 
721
+ // src/evals/aggregators.ts
722
/**
 * Sums token counts field by field.
 *
 * @param {Array<{ input?: number, output?: number, inputCached?: number, outputCached?: number }>} values
 *   Token-count records; missing fields count as 0.
 * @returns {{ input: number, output: number, inputCached: number, outputCached: number }}
 *   Totals across all records (all zeros for an empty list).
 */
function aggregateTokenCountSum(values) {
  let totals = { input: 0, output: 0, inputCached: 0, outputCached: 0 };
  values.forEach((entry) => {
    totals = {
      input: totals.input + (entry.input ?? 0),
      output: totals.output + (entry.output ?? 0),
      inputCached: totals.inputCached + (entry.inputCached ?? 0),
      outputCached: totals.outputCached + (entry.outputCached ?? 0)
    };
  });
  return totals;
}
739
/**
 * Averages latency records.
 *
 * @param {Array<{ ms: number }>} values - Latency records.
 * @returns {{ ms: number }} Arithmetic mean of `ms`, or `{ ms: 0 }` for an empty list.
 */
function aggregateLatencyAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { ms: 0 };
  }
  let total = 0;
  values.forEach((entry) => {
    total = total + entry.ms;
  });
  return { ms: total / count };
}
746
+
747
+ // src/evals/metrics/standard.ts
748
/**
 * Token-usage metric, aggregated by summing. Formats as
 * `in:<input> out:<output> cached:<inputCached + outputCached>`, prefixed
 * with "Total: " for aggregated values.
 */
var tokenCountMetric = Metric.of({
  id: "token-count",
  name: "Tokens",
  aggregate: aggregateTokenCountSum,
  format: (data, options) => {
    // Missing (null/undefined) counters are shown as 0.
    const count = (field) => data[field] ?? 0;
    const cached = count("inputCached") + count("outputCached");
    const summary = `in:${count("input")} out:${count("output")} cached:${cached}`;
    if (options?.isAggregated) {
      return `Total: ${summary}`;
    }
    return summary;
  }
});
762
/**
 * Latency metric, aggregated by averaging. Formats as `<ms>ms`, prefixed
 * with "Avg: " for aggregated values.
 */
var latencyMetric = Metric.of({
  id: "latency",
  name: "Latency",
  aggregate: aggregateLatencyAverage,
  format: (data, options) => {
    if (options?.isAggregated) {
      return `Avg: ${data.ms}ms`;
    }
    return `${data.ms}ms`;
  }
});
768
+
769
+ // src/evals/run-config.ts
770
/**
 * Validates one `runs` row of a RunConfig.
 *
 * A row must set exactly one of `evaluators` (a non-empty list) or
 * `evaluatorPattern` (a non-blank string), and `repetitions`, when given,
 * must be a positive integer.
 *
 * @param {object} row - Row to validate.
 * @param {number} index - Position of the row, used in error messages.
 * @throws {Error} When the row violates any of the rules above.
 */
function validateRow(row, index) {
  // `!= null` treats both undefined and null as "not set". Checking only
  // against undefined let `evaluators: null` through to the `.length` access
  // below, which failed with an unrelated TypeError.
  const hasEvaluators = row.evaluators != null;
  const hasPattern = typeof row.evaluatorPattern === "string" && row.evaluatorPattern.trim().length > 0;
  if (hasEvaluators && hasPattern) {
    throw new Error(`RunConfig run[${index}] must not set both evaluators and evaluatorPattern`);
  }
  if (!hasEvaluators && !hasPattern) {
    throw new Error(`RunConfig run[${index}] must set either evaluators or evaluatorPattern`);
  }
  if (hasEvaluators && row.evaluators.length === 0) {
    throw new Error(`RunConfig run[${index}]: evaluators must be non-empty`);
  }
  const rawRep = row.repetitions;
  const repetitions = rawRep ?? 1;
  if (!Number.isInteger(repetitions) || repetitions < 1) {
    throw new Error(
      `RunConfig run[${index}]: repetitions must be a positive integer, got ${String(rawRep)}`
    );
  }
}
790
/** A named, validated set of run rows with an optional display name and tags. */
var RunConfig = class _RunConfig {
  constructor(name, displayName, tags, runs) {
    this._name = name;
    this._displayName = displayName;
    this._tags = tags;
    this._runs = runs;
  }

  /**
   * Validates `config` and builds a RunConfig.
   *
   * Rejects an empty `runs` list, validates every row with `validateRow`,
   * validates the name, normalizes the display name and copies the tags.
   *
   * @throws {Error} When `runs` is empty, a row is invalid, or the name is invalid.
   */
  static define(config) {
    const { runs } = config;
    if (runs.length === 0) {
      throw new Error("RunConfig runs must be non-empty");
    }
    runs.forEach(validateRow);
    const name = validateRunConfigName(config.name, "RunConfig.define");
    const displayName = normalizeOptionalDisplayName(config.displayName);
    const tags = config.tags === undefined ? [] : [...config.tags];
    return new _RunConfig(name, displayName, tags, runs);
  }

  /** Canonical id (branded {@link RunConfigName} at runtime; typed as `string` for ergonomics). */
  getName() {
    const name = this._name;
    return name;
  }

  /** Optional unrestricted display label. */
  getDisplayName() {
    const displayName = this._displayName;
    return displayName;
  }

  /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
  getDisplayLabel() {
    const displayName = this._displayName;
    return displayName ?? this._name;
  }

  /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. Returns a fresh copy. */
  getTags() {
    const tags = this._tags;
    return [...tags];
  }

  /** The run rows exactly as stored (not copied). */
  getRuns() {
    const runs = this._runs;
    return runs;
  }
};
827
+
525
828
  // src/evals/score.ts
526
829
  var registry2 = /* @__PURE__ */ new Map();
527
830
  function formatScoreData(def, data, options) {
@@ -612,71 +915,23 @@ var Score = {
612
915
  aggregateValues: config.aggregateValues,
613
916
  make: (data, options) => {
614
917
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
615
- return {
616
- id: config.id,
617
- data,
618
- ...passed !== void 0 && { passed },
619
- ...options?.name !== void 0 && { name: options.name },
620
- def
621
- // Attach def so rendering/aggregation works without registry lookup
622
- };
623
- }
624
- };
625
- registry2.set(config.id, def);
626
- return def;
627
- }
628
- };
629
- function getScoreById(id) {
630
- return registry2.get(id);
631
- }
632
-
633
- // src/evals/aggregators.ts
634
- function aggregateTokenCountSum(values) {
635
- const initial = {
636
- input: 0,
637
- output: 0,
638
- inputCached: 0,
639
- outputCached: 0
640
- };
641
- return values.reduce(
642
- (acc, v) => ({
643
- input: acc.input + (v.input ?? 0),
644
- output: acc.output + (v.output ?? 0),
645
- inputCached: acc.inputCached + (v.inputCached ?? 0),
646
- outputCached: acc.outputCached + (v.outputCached ?? 0)
647
- }),
648
- initial
649
- );
650
- }
651
- function aggregateLatencyAverage(values) {
652
- if (values.length === 0) {
653
- return { ms: 0 };
654
- }
655
- const sum = values.reduce((s, v) => s + v.ms, 0);
656
- return { ms: sum / values.length };
657
- }
658
-
659
- // src/evals/metrics/standard.ts
660
- var tokenCountMetric = Metric.of({
661
- id: "token-count",
662
- name: "Tokens",
663
- aggregate: aggregateTokenCountSum,
664
- format: (data, options) => {
665
- const input = data.input ?? 0;
666
- const output = data.output ?? 0;
667
- const inputCached = data.inputCached ?? 0;
668
- const outputCached = data.outputCached ?? 0;
669
- const cached = inputCached + outputCached;
670
- const base = `in:${input} out:${output} cached:${cached}`;
671
- return options?.isAggregated ? `Total: ${base}` : base;
918
+ return {
919
+ id: config.id,
920
+ data,
921
+ ...passed !== void 0 && { passed },
922
+ ...options?.name !== void 0 && { name: options.name },
923
+ def
924
+ // Attach def so rendering/aggregation works without registry lookup
925
+ };
926
+ }
927
+ };
928
+ registry2.set(config.id, def);
929
+ return def;
672
930
  }
673
- });
674
- var latencyMetric = Metric.of({
675
- id: "latency",
676
- name: "Latency",
677
- aggregate: aggregateLatencyAverage,
678
- format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
679
- });
931
+ };
932
/**
 * Looks up a score definition in the score registry.
 *
 * @param {string} id - Score id.
 * @returns The registered definition, or `undefined` when the id is unknown.
 */
function getScoreById(id) {
  const definition = registry2.get(id);
  return definition;
}
680
935
 
681
936
  // src/evals/scores/standard.ts
682
937
  var percentScore = Score.of({
@@ -709,148 +964,197 @@ var binaryScore = Score.of({
709
964
  },
710
965
  aggregateValues: Score.aggregate.all
711
966
  });
712
- function preprocessForDiff(value, options) {
713
- if (options?.sort && Array.isArray(value)) {
714
- return [...value].sort((a, b) => {
715
- const aStr = stringify(preprocessForDiff(a, options));
716
- const bStr = stringify(preprocessForDiff(b, options));
717
- return aStr.localeCompare(bStr);
718
- }).map((item) => preprocessForDiff(item, options));
719
- }
720
- if (value !== null && typeof value === "object" && !Array.isArray(value) && options?.excludeKeys) {
721
- const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
722
- const filtered = {};
723
- for (const [k, v] of Object.entries(value)) {
724
- if (!keys.includes(k)) {
725
- filtered[k] = preprocessForDiff(v, options);
726
- }
967
+
968
+ // src/evals/tag-set.ts
969
+ var TagSet = class {
970
+ constructor() {
971
+ }
972
+ static define(tags) {
973
+ const out = {};
974
+ for (const tag of tags) {
975
+ out[tag] = tag;
727
976
  }
728
- return filtered;
977
+ return out;
729
978
  }
730
- if (value !== null && typeof value === "object" && !Array.isArray(value)) {
731
- const result = {};
732
- for (const [k, v] of Object.entries(value)) {
733
- result[k] = preprocessForDiff(v, options);
734
- }
735
- return result;
979
+ };
980
+
981
+ // src/evals/test-case.ts
982
+ function resolve(value) {
983
+ return typeof value === "function" ? value() : value;
984
+ }
985
+ var TestCase = class _TestCase {
986
+ constructor(config) {
987
+ this._config = config;
736
988
  }
737
- if (typeof value === "number" && options?.precision !== void 0) {
738
- return Number(value.toFixed(options.precision));
989
+ static describe(config) {
990
+ const name = validateTestCaseName(config.name, "TestCase.describe");
991
+ const displayName = normalizeOptionalDisplayName(config.displayName);
992
+ return new _TestCase({
993
+ name,
994
+ displayName,
995
+ tags: config.tags,
996
+ inputSchema: config.inputSchema,
997
+ input: config.input,
998
+ outputSchema: config.outputSchema,
999
+ output: config.output
1000
+ });
739
1001
  }
740
- return value;
741
- }
742
- function toPrettyJson(value) {
743
- const str = stringify(value);
744
- try {
745
- const parsed = JSON.parse(str);
746
- return JSON.stringify(parsed, null, 2);
747
- } catch {
748
- return str;
1002
+ getName() {
1003
+ return this._config.name;
749
1004
  }
750
- }
751
- function formatDiffParts(parts) {
752
- const lines = [];
753
- for (const part of parts) {
754
- const prefix = part.added ? "+ " : part.removed ? "- " : "";
755
- const partLines = part.value.split("\n");
756
- for (let i = 0; i < partLines.length; i++) {
757
- const line = partLines[i];
758
- if (i === partLines.length - 1 && line === "")
759
- continue;
760
- lines.push(prefix + line);
761
- }
1005
+ getDisplayName() {
1006
+ return this._config.displayName;
762
1007
  }
763
- return lines.join("\n");
764
- }
765
- function createDiffString(expected, actual, diffOptions) {
766
- const expectedProcessed = preprocessForDiff(expected, diffOptions);
767
- const actualProcessed = preprocessForDiff(actual, diffOptions);
768
- if (diffOptions?.keysOnly) {
769
- const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
770
- const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
771
- const parts2 = diffLines(expectedKeys, actualKeys);
772
- return formatDiffParts(parts2);
1008
+ getDisplayLabel() {
1009
+ return this._config.displayName ?? this._config.name;
773
1010
  }
774
- const expectedStr = toPrettyJson(expectedProcessed);
775
- const actualStr = toPrettyJson(actualProcessed);
776
- if (expectedStr === actualStr) {
777
- return "";
1011
+ getTags() {
1012
+ return this._config.tags;
778
1013
  }
779
- const parts = diffLines(expectedStr, actualStr);
780
- if (diffOptions?.outputNewOnly) {
781
- const filtered = parts.filter((p) => p.added === true);
782
- return formatDiffParts(filtered);
1014
+ getInputSchema() {
1015
+ return this._config.inputSchema;
783
1016
  }
784
- return formatDiffParts(parts);
785
- }
786
- function extractKeys(value) {
787
- if (value === null || typeof value !== "object") {
788
- return "\xB7";
1017
+ getInput() {
1018
+ return resolve(this._config.input);
789
1019
  }
790
- if (Array.isArray(value)) {
791
- return value.map(extractKeys);
1020
+ getOutputSchema() {
1021
+ return this._config.outputSchema;
792
1022
  }
793
- const result = {};
794
- for (const [k, v] of Object.entries(value)) {
795
- result[k] = extractKeys(v);
1023
+ getOutput() {
1024
+ if (this._config.output === void 0) {
1025
+ return void 0;
1026
+ }
1027
+ return resolve(this._config.output);
796
1028
  }
797
- return result;
1029
+ };
1030
+ function getTestCaseDisplayLabel(testCase) {
1031
+ if (typeof testCase.getDisplayLabel === "function") {
1032
+ return testCase.getDisplayLabel();
1033
+ }
1034
+ return typeof testCase.getName === "function" ? testCase.getName() : "";
798
1035
  }
799
- function formatLogMessage(msg) {
800
- if (typeof msg === "string")
801
- return msg;
802
- if (msg instanceof Error)
803
- return msg.stack ?? msg.message;
1036
+ function getTestCaseTagList(testCase) {
1037
+ return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
1038
+ }
1039
+ async function loadRunSnapshotsFromArtifacts(config) {
1040
+ const baseDir = resolve$1(config.artifactDirectory);
1041
+ let entries;
804
1042
  try {
805
- if (msg !== null && typeof msg === "object") {
806
- return JSON.stringify(msg, null, 2);
807
- }
808
- return String(msg);
1043
+ entries = await readdir(baseDir);
809
1044
  } catch {
810
- return String(msg);
1045
+ return [];
811
1046
  }
1047
+ const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1048
+ const snapshots = [];
1049
+ for (const fileName of jsonlFiles) {
1050
+ const filePath = join(baseDir, fileName);
1051
+ try {
1052
+ const snapshot = await parseArtifactToSnapshot(filePath, config);
1053
+ if (snapshot) {
1054
+ snapshots.push(snapshot);
1055
+ }
1056
+ } catch {
1057
+ }
1058
+ }
1059
+ return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
812
1060
  }
813
- function createLogEntry(message, options) {
814
- return {
815
- type: "log",
816
- label: options?.label,
817
- message: formatLogMessage(message)
818
- };
819
- }
820
- function getLogLines(entry) {
821
- return entry.message.split("\n");
822
- }
823
- function createDiffLogEntry(expected, actual, options) {
824
- const { label, ...diffOpts } = options ?? {};
825
- const diff = createDiffString(expected, actual, diffOpts);
826
- return {
827
- type: "diff",
828
- label,
829
- expected,
830
- actual,
831
- diff: diff || "(no differences)"
832
- };
833
- }
834
- function printJsonDiff(expected, actual, options = {}) {
835
- const { color = true, ...diffOpts } = options;
836
- const diff = createDiffString(expected, actual, diffOpts);
837
- if (color) {
838
- const lines = diff.split("\n").map((line) => {
839
- const trimmed = line.trimStart();
840
- if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
841
- return `\x1B[31m${line}\x1B[0m`;
1061
+ async function parseArtifactToSnapshot(filePath, _config) {
1062
+ const content = await readFile(filePath, "utf8");
1063
+ const lines = content.split("\n").filter((line) => line.trim().length > 0);
1064
+ if (lines.length === 0) {
1065
+ return null;
1066
+ }
1067
+ let runQueued = null;
1068
+ let runCompleted = null;
1069
+ let runFailed = null;
1070
+ let runStarted = null;
1071
+ for (const line of lines) {
1072
+ try {
1073
+ const event = JSON.parse(line);
1074
+ const type = event.type;
1075
+ if (type === "RunQueued") {
1076
+ runQueued = {
1077
+ runId: event.runId,
1078
+ datasetId: event.datasetId,
1079
+ datasetName: event.datasetName,
1080
+ evaluatorIds: event.evaluatorIds,
1081
+ totalTestCases: event.totalTestCases ?? 0,
1082
+ artifactPath: event.artifactPath ?? filePath,
1083
+ ts: event.ts
1084
+ };
842
1085
  }
843
- if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
844
- return `\x1B[32m${line}\x1B[0m`;
1086
+ if (type === "RunStarted") {
1087
+ runStarted = { startedAt: event.startedAt };
1088
+ }
1089
+ if (type === "RunCompleted") {
1090
+ runCompleted = {
1091
+ passedTestCases: event.passedTestCases,
1092
+ failedTestCases: event.failedTestCases,
1093
+ totalTestCases: event.totalTestCases,
1094
+ finishedAt: event.finishedAt
1095
+ };
1096
+ }
1097
+ if (type === "RunFailed") {
1098
+ runFailed = {
1099
+ finishedAt: event.finishedAt,
1100
+ errorMessage: event.errorMessage
1101
+ };
1102
+ }
1103
+ } catch {
1104
+ }
1105
+ }
1106
+ if (!runQueued) {
1107
+ return null;
1108
+ }
1109
+ const artifactPath = filePath;
1110
+ const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1111
+ const progress = aggregateTestCaseProgress(lines);
1112
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1113
+ const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1114
+ const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1115
+ return {
1116
+ runId: runQueued.runId,
1117
+ datasetId: runQueued.datasetId,
1118
+ datasetName: runQueued.datasetName,
1119
+ evaluatorIds: runQueued.evaluatorIds,
1120
+ queuedAt: runQueued.ts ?? 0,
1121
+ startedAt: runStarted?.startedAt,
1122
+ finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
1123
+ totalTestCases: runQueued.totalTestCases,
1124
+ completedTestCases,
1125
+ passedTestCases,
1126
+ failedTestCases,
1127
+ status,
1128
+ artifactPath,
1129
+ errorMessage: runFailed?.errorMessage
1130
+ };
1131
+ }
1132
+ function aggregateTestCaseProgress(lines) {
1133
+ let completedTestCases = 0;
1134
+ const testCasePassedBy = /* @__PURE__ */ new Map();
1135
+ for (const line of lines) {
1136
+ try {
1137
+ const event = JSON.parse(line);
1138
+ if (event.type === "TestCaseProgress") {
1139
+ const ev = event;
1140
+ completedTestCases = ev.completedTestCases ?? completedTestCases;
1141
+ const id = ev.testCaseId;
1142
+ const current = testCasePassedBy.get(id);
1143
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
845
1144
  }
846
- return line;
847
- });
848
- const colored = lines.join("\n");
849
- console.log(colored || "(no differences)");
850
- return colored;
1145
+ } catch {
1146
+ }
851
1147
  }
852
- console.log(diff || "(no differences)");
853
- return diff;
1148
+ let passedTestCases = 0;
1149
+ let failedTestCases = 0;
1150
+ for (const passed of testCasePassedBy.values()) {
1151
+ if (passed) {
1152
+ passedTestCases += 1;
1153
+ } else {
1154
+ failedTestCases += 1;
1155
+ }
1156
+ }
1157
+ return { completedTestCases, passedTestCases, failedTestCases };
854
1158
  }
855
1159
 
856
1160
  // src/runner/config.ts
@@ -862,6 +1166,7 @@ var defaultRunnerConfig = {
862
1166
  rootDir: process.cwd(),
863
1167
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
864
1168
  evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
1169
+ runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
865
1170
  testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
866
1171
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
867
1172
  },
@@ -887,6 +1192,11 @@ function toRunnerConfigOverrides(config) {
887
1192
  } else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
888
1193
  discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
889
1194
  }
1195
+ if (rawDiscovery?.runConfigFilePatterns !== void 0) {
1196
+ discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
1197
+ } else if (rawDiscovery?.runConfigSuffixes !== void 0) {
1198
+ discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
1199
+ }
890
1200
  if (rawDiscovery?.testCaseFilePatterns !== void 0) {
891
1201
  discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
892
1202
  } else if (rawDiscovery?.testCaseSuffixes !== void 0) {
@@ -985,6 +1295,9 @@ function isDatasetLike(value) {
985
1295
  function isEvaluatorLike(value) {
986
1296
  return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
987
1297
  }
1298
+ function isRunConfigLike(value) {
1299
+ return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
1300
+ }
988
1301
  function isTestCaseLike(value) {
989
1302
  return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
990
1303
  }
@@ -1073,6 +1386,23 @@ async function collectEvaluatorsFromFiles(config) {
1073
1386
  );
1074
1387
  return found.flat();
1075
1388
  }
1389
+ async function collectRunConfigsFromFiles(config) {
1390
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1391
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
1392
+ const found = await Promise.all(
1393
+ matched.map(async (absolutePath) => {
1394
+ const exports = await loadModuleExports(absolutePath);
1395
+ const runConfigs = exports.filter(isRunConfigLike);
1396
+ const relPath = relative(config.rootDir, absolutePath);
1397
+ return runConfigs.map((runConfig) => ({
1398
+ id: runConfig.getName(),
1399
+ filePath: relPath,
1400
+ runConfig
1401
+ }));
1402
+ })
1403
+ );
1404
+ return found.flat();
1405
+ }
1076
1406
  async function collectTestCasesFromFiles(config) {
1077
1407
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1078
1408
  const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
@@ -1165,15 +1495,17 @@ function readOutput(testCase) {
1165
1495
  }
1166
1496
  return candidate.getOutput();
1167
1497
  }
1168
- function buildEvaluationUnits(testCases) {
1498
+ function buildEvaluationUnits(testCases, repetitionCount) {
1499
+ const count = Math.max(1, repetitionCount);
1169
1500
  const units = [];
1170
1501
  for (const testCaseItem of testCases) {
1171
- const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1172
- for (let r = 0; r < rerunTotal; r++) {
1502
+ const repetitionId = `rep-${randomUUID()}`;
1503
+ for (let r = 0; r < count; r++) {
1173
1504
  units.push({
1174
1505
  testCaseItem,
1175
- rerunIndex: r + 1,
1176
- rerunTotal
1506
+ repetitionId,
1507
+ repetitionIndex: r + 1,
1508
+ repetitionCount: count
1177
1509
  });
1178
1510
  }
1179
1511
  }
@@ -1186,7 +1518,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1186
1518
  return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
1187
1519
  }
1188
1520
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1189
- const { testCaseItem, rerunIndex, rerunTotal } = unit;
1521
+ const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
1190
1522
  return Effect.gen(function* () {
1191
1523
  const evaluatorRunId = `run-${randomUUID()}`;
1192
1524
  const started = Date.now();
@@ -1195,11 +1527,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1195
1527
  type: "TestCaseStarted",
1196
1528
  runId: task.runId,
1197
1529
  testCaseId: testCaseItem.id,
1198
- testCaseName: testCaseItem.testCase.getName(),
1530
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1199
1531
  startedTestCases: startedEvaluations,
1200
1532
  totalTestCases: totalEvaluations,
1201
- rerunIndex,
1202
- rerunTotal
1533
+ repetitionId,
1534
+ repetitionIndex,
1535
+ repetitionCount
1203
1536
  });
1204
1537
  const evaluatorScores = [];
1205
1538
  let testCaseError;
@@ -1233,8 +1566,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1233
1566
  meta: {
1234
1567
  triggerId: task.triggerId,
1235
1568
  runId: evaluatorRunId,
1236
- datasetId: task.datasetId
1569
+ datasetName: task.dataset.getDisplayLabel(),
1570
+ repetitionId,
1571
+ repetitionIndex,
1572
+ repetitionCount,
1573
+ runConfigName: task.runConfigName
1237
1574
  },
1575
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1576
+ runConfigTags: task.runConfigTags,
1577
+ evaluatorTags: getEvaluatorTagList(evaluator),
1238
1578
  logDiff,
1239
1579
  log,
1240
1580
  createError
@@ -1277,18 +1617,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1277
1617
  });
1278
1618
  }
1279
1619
  }
1280
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1620
+ const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
1281
1621
  const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
1282
1622
  const progressEvent = {
1283
1623
  type: "TestCaseProgress",
1284
1624
  runId: task.runId,
1285
1625
  testCaseId: testCaseItem.id,
1286
- testCaseName: testCaseItem.testCase.getName(),
1626
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1287
1627
  completedTestCases: completedEvaluations,
1288
1628
  totalTestCases: totalEvaluations,
1289
- rerunIndex,
1290
- rerunTotal,
1291
- passed: rerunPassedThis,
1629
+ repetitionId,
1630
+ repetitionIndex,
1631
+ repetitionCount,
1632
+ passed: repetitionPassedThis,
1292
1633
  durationMs: Date.now() - started,
1293
1634
  evaluatorScores,
1294
1635
  output,
@@ -1309,9 +1650,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1309
1650
  (map) => {
1310
1651
  const key = testCaseItem.id;
1311
1652
  const existing = map.get(key) ?? { completedCount: 0, results: [] };
1312
- const newResults = [...existing.results, rerunPassedThis];
1653
+ const newResults = [...existing.results, repetitionPassedThis];
1313
1654
  const newCompletedCount = existing.completedCount + 1;
1314
- const isLast = newCompletedCount === rerunTotal;
1655
+ const isLast = newCompletedCount === repetitionCount;
1315
1656
  const newMap = new Map(map);
1316
1657
  newMap.set(key, {
1317
1658
  completedCount: newCompletedCount,
@@ -1348,10 +1689,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1348
1689
  runId: task.runId,
1349
1690
  startedAt
1350
1691
  });
1351
- const totalEvaluations = task.testCases.reduce(
1352
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1353
- 0
1354
- );
1692
+ const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
1355
1693
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1356
1694
  const completedRef = yield* Ref.make(0);
1357
1695
  const startedRef = yield* Ref.make(0);
@@ -1360,7 +1698,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1360
1698
  const testCaseResultsRef = yield* Ref.make(
1361
1699
  /* @__PURE__ */ new Map()
1362
1700
  );
1363
- const evaluationUnits = buildEvaluationUnits(task.testCases);
1701
+ const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
1364
1702
  const processEvaluation = (unit) => processOneEvaluation(
1365
1703
  task,
1366
1704
  unit,
@@ -1374,11 +1712,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1374
1712
  failedRef,
1375
1713
  testCaseResultsRef
1376
1714
  );
1377
- yield* Effect.forEach(
1378
- evaluationUnits,
1379
- processEvaluation,
1380
- maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1381
- );
1715
+ const globalSem = task.globalEvaluationSemaphore;
1716
+ if (globalSem !== void 0) {
1717
+ yield* Effect.forEach(
1718
+ evaluationUnits,
1719
+ (unit) => globalSem.withPermits(1)(processEvaluation(unit)),
1720
+ { concurrency: "unbounded", discard: true }
1721
+ );
1722
+ } else {
1723
+ yield* Effect.forEach(
1724
+ evaluationUnits,
1725
+ processEvaluation,
1726
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1727
+ );
1728
+ }
1382
1729
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
1383
1730
  Ref.get(completedRef),
1384
1731
  Ref.get(passedRef),
@@ -1414,125 +1761,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1414
1761
  artifactPath: task.snapshot.artifactPath
1415
1762
  });
1416
1763
  });
1417
- async function loadRunSnapshotsFromArtifacts(config) {
1418
- const baseDir = resolve$1(config.artifactDirectory);
1419
- let entries;
1420
- try {
1421
- entries = await readdir(baseDir);
1422
- } catch {
1423
- return [];
1424
- }
1425
- const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1426
- const snapshots = [];
1427
- for (const fileName of jsonlFiles) {
1428
- const filePath = join(baseDir, fileName);
1429
- try {
1430
- const snapshot = await parseArtifactToSnapshot(filePath, config);
1431
- if (snapshot) {
1432
- snapshots.push(snapshot);
1433
- }
1434
- } catch {
1435
- }
1436
- }
1437
- return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
1438
- }
1439
- async function parseArtifactToSnapshot(filePath, _config) {
1440
- const content = await readFile(filePath, "utf8");
1441
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1442
- if (lines.length === 0) {
1443
- return null;
1444
- }
1445
- let runQueued = null;
1446
- let runCompleted = null;
1447
- let runFailed = null;
1448
- let runStarted = null;
1449
- for (const line of lines) {
1450
- try {
1451
- const event = JSON.parse(line);
1452
- const type = event.type;
1453
- if (type === "RunQueued") {
1454
- runQueued = {
1455
- runId: event.runId,
1456
- datasetId: event.datasetId,
1457
- datasetName: event.datasetName,
1458
- evaluatorIds: event.evaluatorIds,
1459
- totalTestCases: event.totalTestCases ?? 0,
1460
- artifactPath: event.artifactPath ?? filePath,
1461
- ts: event.ts
1462
- };
1463
- }
1464
- if (type === "RunStarted") {
1465
- runStarted = { startedAt: event.startedAt };
1466
- }
1467
- if (type === "RunCompleted") {
1468
- runCompleted = {
1469
- passedTestCases: event.passedTestCases,
1470
- failedTestCases: event.failedTestCases,
1471
- totalTestCases: event.totalTestCases,
1472
- finishedAt: event.finishedAt
1473
- };
1474
- }
1475
- if (type === "RunFailed") {
1476
- runFailed = {
1477
- finishedAt: event.finishedAt,
1478
- errorMessage: event.errorMessage
1479
- };
1480
- }
1481
- } catch {
1482
- }
1764
+
1765
+ // src/runner/name-pattern.ts
1766
+ function parseRegexLiteral(pattern) {
1767
+ if (!pattern.startsWith("/")) {
1768
+ return void 0;
1483
1769
  }
1484
- if (!runQueued) {
1485
- return null;
1770
+ const lastSlash = pattern.lastIndexOf("/");
1771
+ if (lastSlash <= 0) {
1772
+ return void 0;
1486
1773
  }
1487
- const artifactPath = filePath;
1488
- const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1489
- const progress = aggregateTestCaseProgress(lines);
1490
- const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1491
- const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1492
- const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1493
1774
  return {
1494
- runId: runQueued.runId,
1495
- datasetId: runQueued.datasetId,
1496
- datasetName: runQueued.datasetName,
1497
- evaluatorIds: runQueued.evaluatorIds,
1498
- queuedAt: runQueued.ts ?? 0,
1499
- startedAt: runStarted?.startedAt,
1500
- finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
1501
- totalTestCases: runQueued.totalTestCases,
1502
- completedTestCases,
1503
- passedTestCases,
1504
- failedTestCases,
1505
- status,
1506
- artifactPath,
1507
- errorMessage: runFailed?.errorMessage
1775
+ source: pattern.slice(1, lastSlash),
1776
+ flags: pattern.slice(lastSlash + 1)
1508
1777
  };
1509
1778
  }
1510
- function aggregateTestCaseProgress(lines) {
1511
- let completedTestCases = 0;
1512
- const testCasePassedBy = /* @__PURE__ */ new Map();
1513
- for (const line of lines) {
1514
- try {
1515
- const event = JSON.parse(line);
1516
- if (event.type === "TestCaseProgress") {
1517
- const ev = event;
1518
- completedTestCases = ev.completedTestCases ?? completedTestCases;
1519
- const id = ev.testCaseId;
1520
- const current = testCasePassedBy.get(id);
1521
- testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1522
- }
1523
- } catch {
1524
- }
1779
+ function createNameMatcher(pattern) {
1780
+ const normalizedPattern = pattern.trim();
1781
+ const regexLiteral = parseRegexLiteral(normalizedPattern);
1782
+ if (regexLiteral) {
1783
+ const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1784
+ return (value) => regex.test(value);
1525
1785
  }
1526
- let passedTestCases = 0;
1527
- let failedTestCases = 0;
1528
- for (const passed of testCasePassedBy.values()) {
1529
- if (passed) {
1530
- passedTestCases += 1;
1531
- } else {
1532
- failedTestCases += 1;
1533
- }
1786
+ if (normalizedPattern.includes("*")) {
1787
+ const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1788
+ const regex = new RegExp(`^${escaped}$`, "i");
1789
+ return (value) => regex.test(value);
1534
1790
  }
1535
- return { completedTestCases, passedTestCases, failedTestCases };
1791
+ return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1536
1792
  }
1537
1793
  async function appendJsonLine(artifactPath, payload) {
1538
1794
  await mkdir(dirname(artifactPath), { recursive: true });
@@ -1591,32 +1847,12 @@ function searchCollectedTestCases(all, query) {
1591
1847
  }
1592
1848
 
1593
1849
  // src/runner/api.ts
1594
- function parseRegexLiteral(pattern) {
1595
- if (!pattern.startsWith("/")) {
1596
- return void 0;
1597
- }
1598
- const lastSlash = pattern.lastIndexOf("/");
1599
- if (lastSlash <= 0) {
1600
- return void 0;
1601
- }
1602
- return {
1603
- source: pattern.slice(1, lastSlash),
1604
- flags: pattern.slice(lastSlash + 1)
1605
- };
1606
- }
1607
- function createNameMatcher(pattern) {
1608
- const normalizedPattern = pattern.trim();
1609
- const regexLiteral = parseRegexLiteral(normalizedPattern);
1610
- if (regexLiteral) {
1611
- const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1612
- return (value) => regex.test(value);
1613
- }
1614
- if (normalizedPattern.includes("*")) {
1615
- const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1616
- const regex = new RegExp(`^${escaped}$`, "i");
1617
- return (value) => regex.test(value);
1850
+ function normalizeRunRepetitions(value) {
1851
+ const n = value ?? 1;
1852
+ if (!Number.isInteger(n) || n < 1) {
1853
+ throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
1618
1854
  }
1619
- return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1855
+ return n;
1620
1856
  }
1621
1857
  function mergeRunnerOverrides(base, next) {
1622
1858
  if (!base) {
@@ -1651,6 +1887,7 @@ var EffectRunner = class {
1651
1887
  this.listeners = /* @__PURE__ */ new Set();
1652
1888
  this.datasetsById = /* @__PURE__ */ new Map();
1653
1889
  this.evaluatorsById = /* @__PURE__ */ new Map();
1890
+ this.runConfigsById = /* @__PURE__ */ new Map();
1654
1891
  this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
1655
1892
  this.persistenceFiber = Effect.runFork(
1656
1893
  createPersistenceWorker(this.persistenceQueue)
@@ -1691,6 +1928,137 @@ var EffectRunner = class {
1691
1928
  (item) => matcher(item.evaluator.getName() ?? "")
1692
1929
  );
1693
1930
  }
1931
+ async collectRunConfigs() {
1932
+ const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
1933
+ this.runConfigsById.clear();
1934
+ const byNameLower = /* @__PURE__ */ new Map();
1935
+ for (const item of runConfigs) {
1936
+ const id = item.runConfig.getName();
1937
+ const lower = id.toLowerCase();
1938
+ const prev = byNameLower.get(lower);
1939
+ if (prev !== void 0 && prev.filePath !== item.filePath) {
1940
+ throw new Error(
1941
+ `Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
1942
+ );
1943
+ }
1944
+ byNameLower.set(lower, item);
1945
+ this.runConfigsById.set(id, item);
1946
+ }
1947
+ return runConfigs;
1948
+ }
1949
+ async resolveRunConfigByName(name) {
1950
+ if (this.runConfigsById.size === 0) {
1951
+ await this.collectRunConfigs();
1952
+ }
1953
+ const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
1954
+ const keyLower = key.toLowerCase();
1955
+ const matches = Array.from(this.runConfigsById.values()).filter(
1956
+ (item) => item.runConfig.getName().toLowerCase() === keyLower
1957
+ );
1958
+ if (matches.length === 0) {
1959
+ return void 0;
1960
+ }
1961
+ if (matches.length > 1) {
1962
+ throw new Error(
1963
+ `Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
1964
+ );
1965
+ }
1966
+ return matches[0];
1967
+ }
1968
+ async expandRunConfigToJobs(collected) {
1969
+ if (this.datasetsById.size === 0) {
1970
+ await this.collectDatasets();
1971
+ }
1972
+ if (this.evaluatorsById.size === 0) {
1973
+ await this.collectEvaluators();
1974
+ }
1975
+ const rcName = collected.runConfig.getName();
1976
+ const jobs = [];
1977
+ const runs = collected.runConfig.getRuns();
1978
+ for (const [i, row] of runs.entries()) {
1979
+ const dsCollected = Array.from(this.datasetsById.values()).find(
1980
+ (d) => d.dataset === row.dataset
1981
+ );
1982
+ if (!dsCollected) {
1983
+ throw new Error(
1984
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1985
+ );
1986
+ }
1987
+ let evaluatorIds;
1988
+ if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
1989
+ const matcher = createNameMatcher(row.evaluatorPattern);
1990
+ const matched = Array.from(this.evaluatorsById.values()).filter(
1991
+ (item) => matcher(item.evaluator.getName() ?? "")
1992
+ );
1993
+ if (matched.length === 0) {
1994
+ throw new Error(
1995
+ `RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
1996
+ );
1997
+ }
1998
+ evaluatorIds = matched.map((item) => item.id);
1999
+ } else {
2000
+ const evaluators = row.evaluators;
2001
+ evaluatorIds = [];
2002
+ for (const ev of evaluators) {
2003
+ const found = Array.from(this.evaluatorsById.values()).find(
2004
+ (item) => item.evaluator === ev
2005
+ );
2006
+ if (!found) {
2007
+ throw new Error(
2008
+ `RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
2009
+ );
2010
+ }
2011
+ evaluatorIds.push(found.id);
2012
+ }
2013
+ }
2014
+ const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
2015
+ jobs.push({
2016
+ datasetId: dsCollected.id,
2017
+ evaluatorIds,
2018
+ runConfigName: rcName,
2019
+ runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
2020
+ runConfigTags: collected.runConfig.getTags(),
2021
+ repetitions
2022
+ });
2023
+ }
2024
+ return jobs;
2025
+ }
2026
+ async expandRunConfigNamesToJobs(names) {
2027
+ const jobs = [];
2028
+ for (const name of names) {
2029
+ const collected = await this.resolveRunConfigByName(name);
2030
+ if (!collected) {
2031
+ const known = await this.collectRunConfigs();
2032
+ const available = known.map((r) => r.runConfig.getName()).sort();
2033
+ throw new Error(
2034
+ available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
2035
+ );
2036
+ }
2037
+ jobs.push(...await this.expandRunConfigToJobs(collected));
2038
+ }
2039
+ return jobs;
2040
+ }
2041
+ async runDatasetJobsWithSharedConcurrency(request) {
2042
+ const globalConcurrency = Math.max(1, request.globalConcurrency);
2043
+ const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
2044
+ const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
2045
+ const snapshots = [];
2046
+ for (const job of request.jobs) {
2047
+ snapshots.push(
2048
+ await this.startDatasetRun({
2049
+ datasetId: job.datasetId,
2050
+ evaluatorIds: job.evaluatorIds,
2051
+ triggerId,
2052
+ maxConcurrency: this.config.maxConcurrency ?? 1,
2053
+ globalEvaluationSemaphore: sem,
2054
+ runConfigName: job.runConfigName,
2055
+ runConfigTags: job.runConfigTags,
2056
+ repetitions: job.repetitions
2057
+ })
2058
+ );
2059
+ }
2060
+ return snapshots;
2061
+ }
1694
2062
  async searchTestCases(query) {
1695
2063
  const testCases = await collectTestCasesFromFiles(this.config.discovery);
1696
2064
  return searchCollectedTestCases(testCases, query);
@@ -1709,36 +2077,46 @@ var EffectRunner = class {
1709
2077
  );
1710
2078
  }
// runDatasetWith(request): entry point for a single dataset run. In this version it only checks
// request.runConfigName and then delegates to this.startDatasetRun.
// - request: datasetId, evaluatorIds, runConfigName, plus optional triggerId, concurrency,
//   repetitions and runConfigTags, all forwarded as shown below.
// - Resolves to: whatever this.startDatasetRun resolves to.
// - validateRunConfigName is defined outside this view; presumably it throws on an invalid
//   name - TODO confirm.
1711
2079
  async runDatasetWith(request) {
// The second argument is a label identifying which input is being validated.
2080
+ const runConfigName = validateRunConfigName(
2081
+ request.runConfigName,
2082
+ "runDatasetWith.runConfigName"
2083
+ );
// No globalEvaluationSemaphore is passed on this path, unlike runDatasetJobsWithSharedConcurrency.
2084
+ return this.startDatasetRun({
2085
+ datasetId: request.datasetId,
2086
+ evaluatorIds: request.evaluatorIds,
2087
+ triggerId: request.triggerId,
// Concurrency precedence: explicit request value, then runner config, then 1.
2088
+ maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2089
+ repetitions: request.repetitions,
// The value returned by validateRunConfigName is forwarded, not the raw request field.
2090
+ runConfigName,
2091
+ runConfigTags: request.runConfigTags
2092
+ });
2093
+ }
2094
+ async startDatasetRun(params) {
1712
2095
  if (this.datasetsById.size === 0) {
1713
2096
  await this.collectDatasets();
1714
2097
  }
1715
2098
  if (this.evaluatorsById.size === 0) {
1716
2099
  await this.collectEvaluators();
1717
2100
  }
1718
- const dataset = this.datasetsById.get(request.datasetId);
2101
+ const dataset = this.datasetsById.get(params.datasetId);
1719
2102
  if (!dataset) {
1720
- throw new Error(`Unknown dataset: ${request.datasetId}`);
2103
+ throw new Error(`Unknown dataset: ${params.datasetId}`);
1721
2104
  }
1722
- const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
2105
+ const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
1723
2106
  if (selectedEvaluators.length === 0) {
1724
2107
  throw new Error("No evaluators selected for run");
1725
2108
  }
1726
- const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1727
- const totalEvaluations = selectedTestCases.reduce(
1728
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1729
- 0
1730
- );
1731
- const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
2109
+ const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
2110
+ const repetitions = normalizeRunRepetitions(params.repetitions);
2111
+ const totalEvaluations = selectedTestCases.length * repetitions;
2112
+ const runConfigTags = [...params.runConfigTags ?? []];
2113
+ const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
1732
2114
  const runId = `run-${randomUUID()}`;
1733
- const artifactPath = createArtifactPath(
1734
- this.config.artifactDirectory,
1735
- request.datasetId,
1736
- runId
1737
- );
2115
+ const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
1738
2116
  const snapshot = {
1739
2117
  runId,
1740
- datasetId: request.datasetId,
1741
- datasetName: dataset.dataset.getName(),
2118
+ datasetId: params.datasetId,
2119
+ datasetName: dataset.dataset.getDisplayLabel(),
1742
2120
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1743
2121
  queuedAt: Date.now(),
1744
2122
  totalTestCases: totalEvaluations,
@@ -1758,8 +2136,8 @@ var EffectRunner = class {
1758
2136
  const queuedEvent = {
1759
2137
  type: "RunQueued",
1760
2138
  runId,
1761
- datasetId: request.datasetId,
1762
- datasetName: dataset.dataset.getName(),
2139
+ datasetId: params.datasetId,
2140
+ datasetName: dataset.dataset.getDisplayLabel(),
1763
2141
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1764
2142
  totalTestCases: totalEvaluations,
1765
2143
  artifactPath
@@ -1772,17 +2150,20 @@ var EffectRunner = class {
1772
2150
  payload: queuedEvent
1773
2151
  })
1774
2152
  );
1775
- const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
1776
2153
  await Effect.runPromise(
1777
2154
  Queue.offer(this.runQueue, {
1778
2155
  runId,
1779
2156
  triggerId,
1780
- datasetId: request.datasetId,
2157
+ datasetId: params.datasetId,
1781
2158
  dataset: dataset.dataset,
1782
2159
  evaluators: selectedEvaluators,
1783
2160
  testCases: selectedTestCases,
1784
2161
  snapshot,
1785
- maxConcurrency
2162
+ maxConcurrency: params.maxConcurrency,
2163
+ globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2164
+ runConfigName: params.runConfigName,
2165
+ runConfigTags,
2166
+ repetitions
1786
2167
  })
1787
2168
  );
1788
2169
  return snapshot;
@@ -1854,6 +2235,11 @@ var EffectRunner = class {
1854
2235
  }
1855
2236
  };
1856
2237
 
1857
- export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
2238
+ // src/runner/events.ts
// PROGRAMMATIC_RUN_CONFIG: exported constant object with a single field, runConfigName = "programmatic".
// NOTE(review): presumably the run-config identity callers pass for runs that are started from code
// rather than from a discovered RunConfig - confirm against the callers of runDatasetWith.
2239
+ var PROGRAMMATIC_RUN_CONFIG = {
2240
+ runConfigName: "programmatic"
2241
+ };
2242
+
2243
+ export { Dataset, DatasetNameSchema, Evaluator, EvaluatorNameSchema, Metric, PROGRAMMATIC_RUN_CONFIG, RunConfig, RunConfigNameSchema, Score, TagSet, TestCase, TestCaseNameSchema, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getDatasetDisplayLabel, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateDatasetName, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
1858
2244
  //# sourceMappingURL=out.js.map
1859
2245
  //# sourceMappingURL=index.js.map