@m4trix/evals 0.25.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -4,10 +4,10 @@ var effect = require('effect');
4
4
  var diff = require('diff');
5
5
  var stringify = require('fast-json-stable-stringify');
6
6
  var crypto = require('crypto');
7
- var fs = require('fs');
7
+ var promises = require('fs/promises');
8
8
  var path = require('path');
9
+ var fs = require('fs');
9
10
  var jitiModule = require('jiti');
10
- var promises = require('fs/promises');
11
11
  var url = require('url');
12
12
 
13
13
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
@@ -34,6 +34,164 @@ function _interopNamespace(e) {
34
34
  var stringify__default = /*#__PURE__*/_interopDefault(stringify);
35
35
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
36
36
 
37
+ // src/index.ts
38
+ var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
39
+ function makeEntityIdSchema(brand, label) {
40
+ return effect.Schema.String.pipe(
41
+ effect.Schema.trimmed(),
42
+ effect.Schema.minLength(1, {
43
+ message: () => `${label} must be non-empty.`
44
+ }),
45
+ effect.Schema.pattern(ENTITY_ID_PATTERN, {
46
+ message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
47
+ }),
48
+ effect.Schema.brand(brand)
49
+ );
50
+ }
51
+ var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
52
+ var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
53
+ var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
54
+ function validateWithSchema(schema, raw, context) {
55
+ const trimmed = raw.trim();
56
+ const decode = effect.Schema.decodeUnknownEither(
57
+ schema
58
+ );
59
+ const result = decode(trimmed);
60
+ if (effect.Either.isLeft(result)) {
61
+ throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
62
+ }
63
+ return result.right;
64
+ }
65
+ function validateRunConfigName(raw, context) {
66
+ return validateWithSchema(RunConfigNameSchema, raw, context);
67
+ }
68
+ function validateEvaluatorName(raw, context) {
69
+ return validateWithSchema(EvaluatorNameSchema, raw, context);
70
+ }
71
+ function validateTestCaseName(raw, context) {
72
+ return validateWithSchema(TestCaseNameSchema, raw, context);
73
+ }
74
+ function normalizeOptionalDisplayName(raw) {
75
+ if (raw === void 0) {
76
+ return void 0;
77
+ }
78
+ const t = raw.trim();
79
+ return t.length === 0 ? void 0 : t;
80
+ }
81
+
82
+ // src/evals/evaluator.ts
83
+ var Evaluator = class _Evaluator {
84
+ constructor(config) {
85
+ this._config = config;
86
+ }
87
+ getState() {
88
+ return {
89
+ name: this._config.name,
90
+ displayName: this._config.displayName,
91
+ tags: this._config.tags,
92
+ inputSchema: this._config.inputSchema,
93
+ outputSchema: this._config.outputSchema,
94
+ scoreSchema: this._config.scoreSchema,
95
+ middlewares: this._config.middlewares,
96
+ evaluateFn: this._config.evaluateFn,
97
+ passThreshold: this._config.passThreshold,
98
+ passCriterion: this._config.passCriterion
99
+ };
100
+ }
101
+ static use(middleware) {
102
+ return new _Evaluator({
103
+ middlewares: [middleware],
104
+ tags: []
105
+ });
106
+ }
107
+ use(middleware) {
108
+ const state = this.getState();
109
+ return new _Evaluator({
110
+ ...state,
111
+ middlewares: [...state.middlewares, middleware]
112
+ });
113
+ }
114
+ define(config) {
115
+ const { middlewares } = this.getState();
116
+ const name = validateEvaluatorName(config.name, "Evaluator.define");
117
+ const displayName = normalizeOptionalDisplayName(config.displayName);
118
+ const tags = config.tags !== void 0 ? [...config.tags] : [];
119
+ return new _Evaluator({
120
+ name,
121
+ displayName,
122
+ tags,
123
+ inputSchema: config.inputSchema,
124
+ outputSchema: config.outputSchema,
125
+ scoreSchema: config.scoreSchema,
126
+ middlewares,
127
+ passThreshold: config.passThreshold,
128
+ passCriterion: config.passCriterion
129
+ });
130
+ }
131
+ evaluate(fn) {
132
+ return new _Evaluator({
133
+ ...this.getState(),
134
+ evaluateFn: fn
135
+ });
136
+ }
137
+ /** Canonical evaluator id when defined; otherwise undefined (middleware-only chain). */
138
+ getName() {
139
+ return this._config.name;
140
+ }
141
+ getDisplayName() {
142
+ return this._config.displayName;
143
+ }
144
+ /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. Undefined if not yet defined. */
145
+ getDisplayLabel() {
146
+ const id = this._config.name;
147
+ if (id === void 0) {
148
+ return void 0;
149
+ }
150
+ return this._config.displayName ?? id;
151
+ }
152
+ /** Tags from `Evaluator.define({ tags })`; empty until defined. */
153
+ getTags() {
154
+ return [...this._config.tags];
155
+ }
156
+ getInputSchema() {
157
+ return this._config.inputSchema;
158
+ }
159
+ getOutputSchema() {
160
+ return this._config.outputSchema;
161
+ }
162
+ getScoreSchema() {
163
+ return this._config.scoreSchema;
164
+ }
165
+ getMiddlewares() {
166
+ return this._config.middlewares;
167
+ }
168
+ getEvaluateFn() {
169
+ return this._config.evaluateFn;
170
+ }
171
+ getPassThreshold() {
172
+ return this._config.passThreshold;
173
+ }
174
+ getPassCriterion() {
175
+ return this._config.passCriterion;
176
+ }
177
+ async resolveContext() {
178
+ const parts = await Promise.all(this._config.middlewares.map((mw) => mw.resolve()));
179
+ return Object.assign({}, ...parts);
180
+ }
181
+ };
182
+ function getEvaluatorDisplayLabel(evaluator) {
183
+ if (typeof evaluator.getDisplayLabel === "function") {
184
+ const label = evaluator.getDisplayLabel();
185
+ if (label !== void 0) {
186
+ return label;
187
+ }
188
+ }
189
+ return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
190
+ }
191
+ function getEvaluatorTagList(evaluator) {
192
+ return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
193
+ }
194
+
37
195
  // src/cli/data.mock.json
38
196
  var data_mock_default = {
39
197
  datasets: [
@@ -184,9 +342,7 @@ var data_mock_default = {
184
342
  { name: "contract_match", score: 100 },
185
343
  { name: "arg_validity", score: 100 }
186
344
  ],
187
- checks: [
188
- { name: "tool_calls", passed: true, detail: "0 unexpected" }
189
- ],
345
+ checks: [{ name: "tool_calls", passed: true, detail: "0 unexpected" }],
190
346
  failures: [],
191
347
  meta: {
192
348
  model: "gpt-4o-mini",
@@ -209,9 +365,21 @@ var data_mock_default = {
209
365
  }
210
366
  ],
211
367
  evaluators: [
212
- { id: "json-schema-validator", name: "JSON Schema Validator", configPreview: "strict=true" },
213
- { id: "tool-call-contract-checker", name: "Tool-call Contract Checker", configPreview: "unexpectedCalls=error" },
214
- { id: "rubric-judge", name: "Rubric Judge (LLM)", configPreview: "model=gpt-4o-mini; scale=0-100" },
368
+ {
369
+ id: "json-schema-validator",
370
+ name: "JSON Schema Validator",
371
+ configPreview: "strict=true"
372
+ },
373
+ {
374
+ id: "tool-call-contract-checker",
375
+ name: "Tool-call Contract Checker",
376
+ configPreview: "unexpectedCalls=error"
377
+ },
378
+ {
379
+ id: "rubric-judge",
380
+ name: "Rubric Judge (LLM)",
381
+ configPreview: "model=gpt-4o-mini; scale=0-100"
382
+ },
215
383
  { id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
216
384
  ]
217
385
  };
@@ -278,7 +446,7 @@ function toEvalDataset(item, snapshots) {
278
446
  function toEvaluatorOption(item) {
279
447
  return {
280
448
  id: item.id,
281
- name: item.evaluator.getName() ?? toSlug(item.id),
449
+ name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
282
450
  configPreview: `Source: ${item.filePath}`
283
451
  };
284
452
  }
@@ -291,9 +459,7 @@ async function loadRunnerData(runner) {
291
459
  const memSnapshots = runner.getAllRunSnapshots();
292
460
  const seen = new Set(memSnapshots.map((s) => s.runId));
293
461
  const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
294
- const snapshots = [...memSnapshots, ...fromDisk].sort(
295
- (a, b) => b.queuedAt - a.queuedAt
296
- );
462
+ const snapshots = [...memSnapshots, ...fromDisk].sort((a, b) => b.queuedAt - a.queuedAt);
297
463
  if (datasets.length === 0 && evaluators.length === 0) {
298
464
  return loadMockData();
299
465
  }
@@ -326,134 +492,6 @@ function parseStartupArgs(argv) {
326
492
  return args;
327
493
  }
328
494
 
329
- // src/evals/test-case.ts
330
- function resolve(value) {
331
- return typeof value === "function" ? value() : value;
332
- }
333
- var TestCase = class _TestCase {
334
- constructor(config) {
335
- this._config = config;
336
- }
337
- static describe(config) {
338
- const reruns = config.reruns ?? 1;
339
- if (reruns < 1 || !Number.isInteger(reruns)) {
340
- throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
341
- }
342
- return new _TestCase({
343
- name: config.name,
344
- tags: config.tags,
345
- reruns,
346
- inputSchema: config.inputSchema,
347
- input: config.input,
348
- outputSchema: config.outputSchema,
349
- output: config.output
350
- });
351
- }
352
- getReruns() {
353
- return this._config.reruns;
354
- }
355
- getName() {
356
- return this._config.name;
357
- }
358
- getTags() {
359
- return this._config.tags;
360
- }
361
- getInputSchema() {
362
- return this._config.inputSchema;
363
- }
364
- getInput() {
365
- return resolve(this._config.input);
366
- }
367
- getOutputSchema() {
368
- return this._config.outputSchema;
369
- }
370
- getOutput() {
371
- if (this._config.output === void 0) {
372
- return void 0;
373
- }
374
- return resolve(this._config.output);
375
- }
376
- };
377
-
378
- // src/evals/evaluator.ts
379
- var Evaluator = class _Evaluator {
380
- constructor(config) {
381
- this._config = config;
382
- }
383
- getState() {
384
- return {
385
- name: this._config.name,
386
- inputSchema: this._config.inputSchema,
387
- outputSchema: this._config.outputSchema,
388
- scoreSchema: this._config.scoreSchema,
389
- middlewares: this._config.middlewares,
390
- evaluateFn: this._config.evaluateFn,
391
- passThreshold: this._config.passThreshold,
392
- passCriterion: this._config.passCriterion
393
- };
394
- }
395
- static use(middleware) {
396
- return new _Evaluator({
397
- middlewares: [middleware]
398
- });
399
- }
400
- use(middleware) {
401
- const state = this.getState();
402
- return new _Evaluator({
403
- ...state,
404
- middlewares: [...state.middlewares, middleware]
405
- });
406
- }
407
- define(config) {
408
- const { middlewares } = this.getState();
409
- return new _Evaluator({
410
- name: config.name,
411
- inputSchema: config.inputSchema,
412
- outputSchema: config.outputSchema,
413
- scoreSchema: config.scoreSchema,
414
- middlewares,
415
- passThreshold: config.passThreshold,
416
- passCriterion: config.passCriterion
417
- });
418
- }
419
- evaluate(fn) {
420
- return new _Evaluator({
421
- ...this.getState(),
422
- evaluateFn: fn
423
- });
424
- }
425
- getName() {
426
- return this._config.name;
427
- }
428
- getInputSchema() {
429
- return this._config.inputSchema;
430
- }
431
- getOutputSchema() {
432
- return this._config.outputSchema;
433
- }
434
- getScoreSchema() {
435
- return this._config.scoreSchema;
436
- }
437
- getMiddlewares() {
438
- return this._config.middlewares;
439
- }
440
- getEvaluateFn() {
441
- return this._config.evaluateFn;
442
- }
443
- getPassThreshold() {
444
- return this._config.passThreshold;
445
- }
446
- getPassCriterion() {
447
- return this._config.passCriterion;
448
- }
449
- async resolveContext() {
450
- const parts = await Promise.all(
451
- this._config.middlewares.map((mw) => mw.resolve())
452
- );
453
- return Object.assign({}, ...parts);
454
- }
455
- };
456
-
457
495
  // src/evals/dataset.ts
458
496
  function matchesAny(value, matchers) {
459
497
  return matchers.some(
@@ -517,230 +555,13 @@ var Dataset = class _Dataset {
517
555
  return tagMatch && pathMatch;
518
556
  }
519
557
  };
520
-
521
- // src/evals/metric.ts
522
- var registry = /* @__PURE__ */ new Map();
523
- var Metric = {
524
- of(config) {
525
- const def = {
526
- id: config.id,
527
- name: config.name,
528
- aggregate: config.aggregate,
529
- format: config.format,
530
- make: (data, options) => ({
531
- id: config.id,
532
- data,
533
- ...options?.name !== void 0 && { name: options.name }
534
- })
535
- };
536
- registry.set(config.id, def);
537
- return def;
538
- }
539
- };
540
- function getMetricById(id) {
541
- return registry.get(id);
542
- }
543
-
544
- // src/evals/score.ts
545
- var registry2 = /* @__PURE__ */ new Map();
546
- function formatScoreData(def, data, options) {
547
- return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
548
- }
549
- var ScoreAggregate = {
550
- /** Average numeric fields. Use for scores like { value, delta }. */
551
- averageFields(fields) {
552
- return (values) => {
553
- const count = values.length || 1;
554
- const result = {};
555
- for (const field of fields) {
556
- result[field] = values.reduce(
557
- (s, v) => s + (v[field] ?? 0),
558
- 0
559
- ) / count;
560
- }
561
- return result;
562
- };
563
- },
564
- /** Average selected numeric fields, with sample std dev tracked for `value`. */
565
- averageWithVariance(fields) {
566
- return (values) => {
567
- const count = values.length;
568
- const result = {};
569
- for (const field of fields) {
570
- result[field] = count === 0 ? 0 : values.reduce(
571
- (sum, item) => sum + (item[field] ?? 0),
572
- 0
573
- ) / count;
574
- }
575
- const valueField = "value";
576
- const hasValueField = fields.includes(valueField);
577
- if (count === 0) {
578
- if (hasValueField) {
579
- result[valueField] = 0;
580
- }
581
- return {
582
- ...result,
583
- stdDev: void 0,
584
- count: 0
585
- };
586
- }
587
- let stdDev;
588
- if (hasValueField && count >= 2) {
589
- const sum = values.reduce(
590
- (s, v) => s + (v[valueField] ?? 0),
591
- 0
592
- );
593
- const sumSq = values.reduce(
594
- (s, v) => {
595
- const value = v[valueField] ?? 0;
596
- return s + value * value;
597
- },
598
- 0
599
- );
600
- const mean = sum / count;
601
- const variance = (sumSq - count * mean * mean) / (count - 1);
602
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
603
- }
604
- return {
605
- ...values[0],
606
- ...result,
607
- stdDev,
608
- count
609
- };
610
- };
611
- },
612
- /** All runs must pass. Use for binary scores. */
613
- all(values) {
614
- const total = values.length;
615
- const passedCount = values.filter((v) => v.passed).length;
616
- return {
617
- ...values[0],
618
- passed: total > 0 && values.every((v) => v.passed),
619
- passedCount,
620
- totalCount: total
621
- };
622
- },
623
- /** Take last value (no aggregation). Use when aggregation is not meaningful. */
624
- last(values) {
625
- return values[values.length - 1] ?? {};
626
- }
627
- };
628
- var Score = {
629
- aggregate: ScoreAggregate,
630
- of(config) {
631
- const def = {
632
- id: config.id,
633
- name: config.name,
634
- displayStrategy: config.displayStrategy,
635
- formatValue: config.formatValue,
636
- formatAggregate: config.formatAggregate,
637
- aggregateValues: config.aggregateValues,
638
- make: (data, options) => {
639
- const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
640
- return {
641
- id: config.id,
642
- data,
643
- ...passed !== void 0 && { passed },
644
- ...options?.name !== void 0 && { name: options.name },
645
- def
646
- // Attach def so rendering/aggregation works without registry lookup
647
- };
648
- }
649
- };
650
- registry2.set(config.id, def);
651
- return def;
652
- }
653
- };
654
- function getScoreById(id) {
655
- return registry2.get(id);
656
- }
657
-
658
- // src/evals/aggregators.ts
659
- function aggregateTokenCountSum(values) {
660
- const initial = {
661
- input: 0,
662
- output: 0,
663
- inputCached: 0,
664
- outputCached: 0
665
- };
666
- return values.reduce(
667
- (acc, v) => ({
668
- input: acc.input + (v.input ?? 0),
669
- output: acc.output + (v.output ?? 0),
670
- inputCached: acc.inputCached + (v.inputCached ?? 0),
671
- outputCached: acc.outputCached + (v.outputCached ?? 0)
672
- }),
673
- initial
674
- );
675
- }
676
- function aggregateLatencyAverage(values) {
677
- if (values.length === 0) {
678
- return { ms: 0 };
679
- }
680
- const sum = values.reduce((s, v) => s + v.ms, 0);
681
- return { ms: sum / values.length };
682
- }
683
-
684
- // src/evals/metrics/standard.ts
685
- var tokenCountMetric = Metric.of({
686
- id: "token-count",
687
- name: "Tokens",
688
- aggregate: aggregateTokenCountSum,
689
- format: (data, options) => {
690
- const input = data.input ?? 0;
691
- const output = data.output ?? 0;
692
- const inputCached = data.inputCached ?? 0;
693
- const outputCached = data.outputCached ?? 0;
694
- const cached = inputCached + outputCached;
695
- const base = `in:${input} out:${output} cached:${cached}`;
696
- return options?.isAggregated ? `Total: ${base}` : base;
697
- }
698
- });
699
- var latencyMetric = Metric.of({
700
- id: "latency",
701
- name: "Latency",
702
- aggregate: aggregateLatencyAverage,
703
- format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
704
- });
705
-
706
- // src/evals/scores/standard.ts
707
- var percentScore = Score.of({
708
- id: "percent",
709
- name: "Score",
710
- displayStrategy: "bar",
711
- formatValue: (data) => data.value.toFixed(2),
712
- formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
713
- aggregateValues: Score.aggregate.averageWithVariance(["value"])
714
- });
715
- var deltaScore = Score.of({
716
- id: "delta",
717
- name: "Delta",
718
- displayStrategy: "number",
719
- formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
720
- formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
721
- aggregateValues: Score.aggregate.averageFields(["value", "delta"])
722
- });
723
- var binaryScore = Score.of({
724
- id: "binary",
725
- name: "Result",
726
- displayStrategy: "passFail",
727
- formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
728
- formatAggregate: (data) => {
729
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
730
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
731
- return `${base} (${data.passedCount}/${data.totalCount})`;
732
- }
733
- return base;
734
- },
735
- aggregateValues: Score.aggregate.all
736
- });
737
- function preprocessForDiff(value, options) {
738
- if (options?.sort && Array.isArray(value)) {
739
- return [...value].sort((a, b) => {
740
- const aStr = stringify__default.default(preprocessForDiff(a, options));
741
- const bStr = stringify__default.default(preprocessForDiff(b, options));
742
- return aStr.localeCompare(bStr);
743
- }).map((item) => preprocessForDiff(item, options));
558
+ function preprocessForDiff(value, options) {
559
+ if (options?.sort && Array.isArray(value)) {
560
+ return [...value].sort((a, b) => {
561
+ const aStr = stringify__default.default(preprocessForDiff(a, options));
562
+ const bStr = stringify__default.default(preprocessForDiff(b, options));
563
+ return aStr.localeCompare(bStr);
564
+ }).map((item) => preprocessForDiff(item, options));
744
565
  }
745
566
  if (value !== null && typeof value === "object" && !Array.isArray(value) && options?.excludeKeys) {
746
567
  const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
@@ -791,16 +612,8 @@ function createDiffString(expected, actual, diffOptions) {
791
612
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
792
613
  const actualProcessed = preprocessForDiff(actual, diffOptions);
793
614
  if (diffOptions?.keysOnly) {
794
- const expectedKeys = JSON.stringify(
795
- extractKeys(expectedProcessed),
796
- null,
797
- 2
798
- );
799
- const actualKeys = JSON.stringify(
800
- extractKeys(actualProcessed),
801
- null,
802
- 2
803
- );
615
+ const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
616
+ const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
804
617
  const parts2 = diff.diffLines(expectedKeys, actualKeys);
805
618
  return formatDiffParts(parts2);
806
619
  }
@@ -811,9 +624,7 @@ function createDiffString(expected, actual, diffOptions) {
811
624
  }
812
625
  const parts = diff.diffLines(expectedStr, actualStr);
813
626
  if (diffOptions?.outputNewOnly) {
814
- const filtered = parts.filter(
815
- (p) => p.added === true
816
- );
627
+ const filtered = parts.filter((p) => p.added === true);
817
628
  return formatDiffParts(filtered);
818
629
  }
819
630
  return formatDiffParts(parts);
@@ -878,14 +689,476 @@ function printJsonDiff(expected, actual, options = {}) {
878
689
  if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
879
690
  return `\x1B[32m${line}\x1B[0m`;
880
691
  }
881
- return line;
882
- });
883
- const colored = lines.join("\n");
884
- console.log(colored || "(no differences)");
885
- return colored;
692
+ return line;
693
+ });
694
+ const colored = lines.join("\n");
695
+ console.log(colored || "(no differences)");
696
+ return colored;
697
+ }
698
+ console.log(diff || "(no differences)");
699
+ return diff;
700
+ }
701
+
702
+ // src/evals/metric.ts
703
+ var registry = /* @__PURE__ */ new Map();
704
+ var Metric = {
705
+ of(config) {
706
+ const def = {
707
+ id: config.id,
708
+ name: config.name,
709
+ aggregate: config.aggregate,
710
+ format: config.format,
711
+ make: (data, options) => ({
712
+ id: config.id,
713
+ data,
714
+ ...options?.name !== void 0 && { name: options.name }
715
+ })
716
+ };
717
+ registry.set(config.id, def);
718
+ return def;
719
+ }
720
+ };
721
+ function getMetricById(id) {
722
+ return registry.get(id);
723
+ }
724
+
725
+ // src/evals/aggregators.ts
726
+ function aggregateTokenCountSum(values) {
727
+ const initial = {
728
+ input: 0,
729
+ output: 0,
730
+ inputCached: 0,
731
+ outputCached: 0
732
+ };
733
+ return values.reduce(
734
+ (acc, v) => ({
735
+ input: acc.input + (v.input ?? 0),
736
+ output: acc.output + (v.output ?? 0),
737
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
738
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
739
+ }),
740
+ initial
741
+ );
742
+ }
743
+ function aggregateLatencyAverage(values) {
744
+ if (values.length === 0) {
745
+ return { ms: 0 };
746
+ }
747
+ const sum = values.reduce((s, v) => s + v.ms, 0);
748
+ return { ms: sum / values.length };
749
+ }
750
+
751
+ // src/evals/metrics/standard.ts
752
+ var tokenCountMetric = Metric.of({
753
+ id: "token-count",
754
+ name: "Tokens",
755
+ aggregate: aggregateTokenCountSum,
756
+ format: (data, options) => {
757
+ const input = data.input ?? 0;
758
+ const output = data.output ?? 0;
759
+ const inputCached = data.inputCached ?? 0;
760
+ const outputCached = data.outputCached ?? 0;
761
+ const cached = inputCached + outputCached;
762
+ const base = `in:${input} out:${output} cached:${cached}`;
763
+ return options?.isAggregated ? `Total: ${base}` : base;
764
+ }
765
+ });
766
+ var latencyMetric = Metric.of({
767
+ id: "latency",
768
+ name: "Latency",
769
+ aggregate: aggregateLatencyAverage,
770
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
771
+ });
772
+
773
+ // src/evals/run-config.ts
774
+ function validateRow(row, index) {
775
+ const hasEvaluators = "evaluators" in row && row.evaluators !== void 0 && row.evaluators !== void 0;
776
+ const hasPattern = "evaluatorPattern" in row && typeof row.evaluatorPattern === "string" && row.evaluatorPattern.trim().length > 0;
777
+ if (hasEvaluators && hasPattern) {
778
+ throw new Error(`RunConfig run[${index}] must not set both evaluators and evaluatorPattern`);
779
+ }
780
+ if (!hasEvaluators && !hasPattern) {
781
+ throw new Error(`RunConfig run[${index}] must set either evaluators or evaluatorPattern`);
782
+ }
783
+ if (hasEvaluators && row.evaluators.length === 0) {
784
+ throw new Error(`RunConfig run[${index}]: evaluators must be non-empty`);
785
+ }
786
+ const rawRep = "repetitions" in row ? row.repetitions : void 0;
787
+ const repetitions = rawRep ?? 1;
788
+ if (!Number.isInteger(repetitions) || repetitions < 1) {
789
+ throw new Error(
790
+ `RunConfig run[${index}]: repetitions must be a positive integer, got ${String(rawRep)}`
791
+ );
792
+ }
793
+ }
794
+ var RunConfig = class _RunConfig {
795
+ constructor(name, displayName, tags, runs) {
796
+ this._name = name;
797
+ this._displayName = displayName;
798
+ this._tags = tags;
799
+ this._runs = runs;
800
+ }
801
+ static define(config) {
802
+ if (config.runs.length === 0) {
803
+ throw new Error("RunConfig runs must be non-empty");
804
+ }
805
+ config.runs.forEach(validateRow);
806
+ const name = validateRunConfigName(config.name, "RunConfig.define");
807
+ const displayName = normalizeOptionalDisplayName(config.displayName);
808
+ const tags = config.tags !== void 0 ? [...config.tags] : [];
809
+ return new _RunConfig(name, displayName, tags, config.runs);
810
+ }
811
+ /** Canonical id (branded {@link RunConfigName} at runtime; typed as `string` for ergonomics). */
812
+ getName() {
813
+ return this._name;
814
+ }
815
+ /** Optional unrestricted display label. */
816
+ getDisplayName() {
817
+ return this._displayName;
818
+ }
819
+ /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
820
+ getDisplayLabel() {
821
+ return this._displayName ?? this._name;
822
+ }
823
+ /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
824
+ getTags() {
825
+ return [...this._tags];
826
+ }
827
+ getRuns() {
828
+ return this._runs;
829
+ }
830
+ };
831
+
832
+ // src/evals/score.ts
833
+ var registry2 = /* @__PURE__ */ new Map();
834
+ function formatScoreData(def, data, options) {
835
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
836
+ }
837
+ var ScoreAggregate = {
838
+ /** Average numeric fields. Use for scores like { value, delta }. */
839
+ averageFields(fields) {
840
+ return (values) => {
841
+ const count = values.length || 1;
842
+ const result = {};
843
+ for (const field of fields) {
844
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
845
+ }
846
+ return result;
847
+ };
848
+ },
849
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
850
+ averageWithVariance(fields) {
851
+ return (values) => {
852
+ const count = values.length;
853
+ const result = {};
854
+ for (const field of fields) {
855
+ result[field] = count === 0 ? 0 : values.reduce(
856
+ (sum, item) => sum + (item[field] ?? 0),
857
+ 0
858
+ ) / count;
859
+ }
860
+ const valueField = "value";
861
+ const hasValueField = fields.includes(valueField);
862
+ if (count === 0) {
863
+ if (hasValueField) {
864
+ result[valueField] = 0;
865
+ }
866
+ return {
867
+ ...result,
868
+ stdDev: void 0,
869
+ count: 0
870
+ };
871
+ }
872
+ let stdDev;
873
+ if (hasValueField && count >= 2) {
874
+ const sum = values.reduce(
875
+ (s, v) => s + (v[valueField] ?? 0),
876
+ 0
877
+ );
878
+ const sumSq = values.reduce((s, v) => {
879
+ const value = v[valueField] ?? 0;
880
+ return s + value * value;
881
+ }, 0);
882
+ const mean = sum / count;
883
+ const variance = (sumSq - count * mean * mean) / (count - 1);
884
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
885
+ }
886
+ return {
887
+ ...values[0],
888
+ ...result,
889
+ stdDev,
890
+ count
891
+ };
892
+ };
893
+ },
894
+ /** All runs must pass. Use for binary scores. */
895
+ all(values) {
896
+ const total = values.length;
897
+ const passedCount = values.filter((v) => v.passed).length;
898
+ return {
899
+ ...values[0],
900
+ passed: total > 0 && values.every((v) => v.passed),
901
+ passedCount,
902
+ totalCount: total
903
+ };
904
+ },
905
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
906
+ last(values) {
907
+ return values[values.length - 1] ?? {};
908
+ }
909
+ };
910
+ var Score = {
911
+ aggregate: ScoreAggregate,
912
+ of(config) {
913
+ const def = {
914
+ id: config.id,
915
+ name: config.name,
916
+ displayStrategy: config.displayStrategy,
917
+ formatValue: config.formatValue,
918
+ formatAggregate: config.formatAggregate,
919
+ aggregateValues: config.aggregateValues,
920
+ make: (data, options) => {
921
+ const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
922
+ return {
923
+ id: config.id,
924
+ data,
925
+ ...passed !== void 0 && { passed },
926
+ ...options?.name !== void 0 && { name: options.name },
927
+ def
928
+ // Attach def so rendering/aggregation works without registry lookup
929
+ };
930
+ }
931
+ };
932
+ registry2.set(config.id, def);
933
+ return def;
934
+ }
935
+ };
936
+ function getScoreById(id) {
937
+ return registry2.get(id);
938
+ }
939
+
940
+ // src/evals/scores/standard.ts
941
+ var percentScore = Score.of({
942
+ id: "percent",
943
+ name: "Score",
944
+ displayStrategy: "bar",
945
+ formatValue: (data) => data.value.toFixed(2),
946
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
947
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
948
+ });
949
+ var deltaScore = Score.of({
950
+ id: "delta",
951
+ name: "Delta",
952
+ displayStrategy: "number",
953
+ formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
954
+ formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
955
+ aggregateValues: Score.aggregate.averageFields(["value", "delta"])
956
+ });
957
+ var binaryScore = Score.of({
958
+ id: "binary",
959
+ name: "Result",
960
+ displayStrategy: "passFail",
961
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
962
+ formatAggregate: (data) => {
963
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
964
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
965
+ return `${base} (${data.passedCount}/${data.totalCount})`;
966
+ }
967
+ return base;
968
+ },
969
+ aggregateValues: Score.aggregate.all
970
+ });
971
+
972
+ // src/evals/tag-set.ts
973
+ var TagSet = class {
974
+ constructor() {
975
+ }
976
+ static define(tags) {
977
+ const out = {};
978
+ for (const tag of tags) {
979
+ out[tag] = tag;
980
+ }
981
+ return out;
982
+ }
983
+ };
984
+
985
+ // src/evals/test-case.ts
986
+ function resolve(value) {
987
+ return typeof value === "function" ? value() : value;
988
+ }
989
/**
 * Immutable-style wrapper around a test case configuration
 * (name, optional display name, tags, input/output and their schemas).
 */
var TestCase = class _TestCase {
  constructor(config) {
    this._config = config;
  }
  /**
   * Creates a test case after validating `config.name` and normalizing
   * `config.displayName`; all other fields are forwarded untouched.
   */
  static describe(config) {
    const validatedName = validateTestCaseName(config.name, "TestCase.describe");
    const normalizedDisplayName = normalizeOptionalDisplayName(config.displayName);
    const { tags, inputSchema, input, outputSchema, output } = config;
    return new _TestCase({
      name: validatedName,
      displayName: normalizedDisplayName,
      tags,
      inputSchema,
      input,
      outputSchema,
      output
    });
  }
  getName() {
    const { name } = this._config;
    return name;
  }
  getDisplayName() {
    const { displayName } = this._config;
    return displayName;
  }
  /** Display name when set, otherwise the (validated) name. */
  getDisplayLabel() {
    const { displayName, name } = this._config;
    return displayName ?? name;
  }
  getTags() {
    const { tags } = this._config;
    return tags;
  }
  getInputSchema() {
    const { inputSchema } = this._config;
    return inputSchema;
  }
  /** Input value; a function-valued input is invoked on every call. */
  getInput() {
    return resolve(this._config.input);
  }
  getOutputSchema() {
    const { outputSchema } = this._config;
    return outputSchema;
  }
  /** Expected output, or undefined when none was configured. */
  getOutput() {
    const { output } = this._config;
    return output === void 0 ? void 0 : resolve(output);
  }
};
1034
/**
 * Returns a human-readable label for a test-case-like object: the result
 * of `getDisplayLabel()` when that method exists, else `getName()`, else
 * the empty string.
 */
function getTestCaseDisplayLabel(testCase) {
  for (const accessor of ["getDisplayLabel", "getName"]) {
    if (typeof testCase[accessor] === "function") {
      return testCase[accessor]();
    }
  }
  return "";
}
1040
/**
 * Returns a fresh array with the tags of a test-case-like object.
 *
 * Yields an empty array when the object has no `getTags` method or when
 * `getTags()` returns null/undefined (a test case described without
 * `tags` stores `undefined`, which would otherwise make the spread throw).
 *
 * @param testCase Object that may expose `getTags()`.
 * @returns A new array containing the tags.
 */
function getTestCaseTagList(testCase) {
  if (typeof testCase.getTags !== "function") {
    return [];
  }
  const tags = testCase.getTags();
  return tags == null ? [] : [...tags];
}
1043
/**
 * Reads every `*.jsonl` artifact in `config.artifactDirectory` and turns
 * each into a run snapshot, newest (`queuedAt` descending) first.
 *
 * Best effort: an unreadable directory yields an empty list, and files
 * that fail to parse or produce no snapshot are skipped.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const artifactRoot = path.resolve(config.artifactDirectory);
  let directoryEntries;
  try {
    directoryEntries = await promises.readdir(artifactRoot);
  } catch {
    return [];
  }
  const collected = [];
  for (const entryName of directoryEntries) {
    if (!entryName.endsWith(".jsonl")) {
      continue;
    }
    const artifactFile = path.join(artifactRoot, entryName);
    try {
      const parsed = await parseArtifactToSnapshot(artifactFile, config);
      if (parsed) {
        collected.push(parsed);
      }
    } catch {
      // Deliberately ignored: one broken artifact must not hide the others.
    }
  }
  collected.sort((left, right) => right.queuedAt - left.queuedAt);
  return collected;
}
1065
/**
 * Parses one JSONL artifact file into a run snapshot.
 *
 * Each non-blank line is parsed as a JSON event; lines that fail to parse
 * are ignored. The last RunQueued / RunStarted / RunCompleted / RunFailed
 * event of each kind wins. Returns null when the file has no non-blank
 * lines or contains no RunQueued event.
 *
 * @param filePath Path of the artifact; also reported as `artifactPath`.
 * @param _config Unused.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const raw = await promises.readFile(filePath, "utf8");
  const lines = raw.split("\n").filter((candidate) => candidate.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }
  let runQueued = null;
  let runStarted = null;
  let runCompleted = null;
  let runFailed = null;
  for (const line of lines) {
    try {
      const event = JSON.parse(line);
      switch (event.type) {
        case "RunQueued":
          runQueued = {
            runId: event.runId,
            datasetId: event.datasetId,
            datasetName: event.datasetName,
            evaluatorIds: event.evaluatorIds,
            totalTestCases: event.totalTestCases ?? 0,
            artifactPath: event.artifactPath ?? filePath,
            ts: event.ts
          };
          break;
        case "RunStarted":
          runStarted = { startedAt: event.startedAt };
          break;
        case "RunCompleted":
          runCompleted = {
            passedTestCases: event.passedTestCases,
            failedTestCases: event.failedTestCases,
            totalTestCases: event.totalTestCases,
            finishedAt: event.finishedAt
          };
          break;
        case "RunFailed":
          runFailed = {
            finishedAt: event.finishedAt,
            errorMessage: event.errorMessage
          };
          break;
        default:
          break;
      }
    } catch {
      // Malformed line: skip it and keep reading.
    }
  }
  if (!runQueued) {
    return null;
  }
  // Failure takes precedence over completion, which takes precedence over
  // having merely started.
  let status = "queued";
  if (runFailed) {
    status = "failed";
  } else if (runCompleted) {
    status = "completed";
  } else if (runStarted) {
    status = "running";
  }
  const progress = aggregateTestCaseProgress(lines);
  return {
    runId: runQueued.runId,
    datasetId: runQueued.datasetId,
    datasetName: runQueued.datasetName,
    evaluatorIds: runQueued.evaluatorIds,
    queuedAt: runQueued.ts ?? 0,
    startedAt: runStarted?.startedAt,
    finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
    totalTestCases: runQueued.totalTestCases,
    completedTestCases: runCompleted ? runQueued.totalTestCases : progress.completedTestCases,
    passedTestCases: runCompleted?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: runCompleted?.failedTestCases ?? progress.failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: runFailed?.errorMessage
  };
}
1136
/**
 * Derives progress counters from the raw JSONL lines of an artifact.
 *
 * Only `TestCaseProgress` events are considered; lines that are not valid
 * JSON are skipped.
 *
 * - `completedTestCases` is the highest numeric `completedTestCases`
 *   value seen. The maximum is used rather than the last line so that
 *   progress events written out of order cannot under-report.
 * - A test case id counts as passed only if every progress event recorded
 *   for it is truthy in `passed`; otherwise it counts as failed.
 *
 * @param lines Non-blank lines of the artifact file.
 * @returns `{ completedTestCases, passedTestCases, failedTestCases }`.
 */
function aggregateTestCaseProgress(lines) {
  let completedTestCases = 0;
  const passedByTestCase = /* @__PURE__ */ new Map();
  for (const line of lines) {
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      continue;
    }
    if (event?.type !== "TestCaseProgress") {
      continue;
    }
    if (Number.isFinite(event.completedTestCases)) {
      completedTestCases = Math.max(completedTestCases, event.completedTestCases);
    }
    // Coerce to boolean so that a missing `passed` is remembered as a
    // failure instead of being treated as "no verdict yet".
    const passedNow = Boolean(event.passed);
    const previous = passedByTestCase.get(event.testCaseId);
    passedByTestCase.set(event.testCaseId, previous === void 0 ? passedNow : previous && passedNow);
  }
  let passedTestCases = 0;
  let failedTestCases = 0;
  for (const passed of passedByTestCase.values()) {
    if (passed) {
      passedTestCases += 1;
    } else {
      failedTestCases += 1;
    }
  }
  return { completedTestCases, passedTestCases, failedTestCases };
}
890
1163
 
891
1164
  // src/runner/config.ts
@@ -896,18 +1169,9 @@ var defaultRunnerConfig = {
896
1169
  discovery: {
897
1170
  rootDir: process.cwd(),
898
1171
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
899
- evaluatorSuffixes: [
900
- ".evaluator.ts",
901
- ".evaluator.tsx",
902
- ".evaluator.js",
903
- ".evaluator.mjs"
904
- ],
905
- testCaseSuffixes: [
906
- ".test-case.ts",
907
- ".test-case.tsx",
908
- ".test-case.js",
909
- ".test-case.mjs"
910
- ],
1172
+ evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
1173
+ runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
1174
+ testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
911
1175
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
912
1176
  },
913
1177
  artifactDirectory: ".eval-results",
@@ -932,6 +1196,11 @@ function toRunnerConfigOverrides(config) {
932
1196
  } else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
933
1197
  discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
934
1198
  }
1199
+ if (rawDiscovery?.runConfigFilePatterns !== void 0) {
1200
+ discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
1201
+ } else if (rawDiscovery?.runConfigSuffixes !== void 0) {
1202
+ discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
1203
+ }
935
1204
  if (rawDiscovery?.testCaseFilePatterns !== void 0) {
936
1205
  discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
937
1206
  } else if (rawDiscovery?.testCaseSuffixes !== void 0) {
@@ -974,14 +1243,15 @@ function getJitiLoader() {
974
1243
  }
975
1244
  const createJiti2 = jitiModule__namespace.createJiti ?? jitiModule__namespace.default;
976
1245
  if (typeof createJiti2 !== "function") {
977
- throw new Error(
978
- "Failed to initialize jiti for m4trix eval config loading."
979
- );
1246
+ throw new Error("Failed to initialize jiti for m4trix eval config loading.");
980
1247
  }
981
- cachedLoader = createJiti2((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)), {
982
- interopDefault: true,
983
- moduleCache: true
984
- });
1248
+ cachedLoader = createJiti2(
1249
+ (typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)),
1250
+ {
1251
+ interopDefault: true,
1252
+ moduleCache: true
1253
+ }
1254
+ );
985
1255
  return cachedLoader;
986
1256
  }
987
1257
  function resolveConfigModuleExport(loadedModule) {
@@ -1029,6 +1299,9 @@ function isDatasetLike(value) {
1029
1299
  function isEvaluatorLike(value) {
1030
1300
  return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
1031
1301
  }
1302
/**
 * Duck-type check for RunConfig exports: the value must pass `hasMethod`
 * for both "getName" and "getRuns", and `getRuns` must be a function.
 */
function isRunConfigLike(value) {
  const requiredMethods = ["getName", "getRuns"];
  if (!requiredMethods.every((methodName) => hasMethod(value, methodName))) {
    return false;
  }
  return typeof value.getRuns === "function";
}
1032
1305
  function isTestCaseLike(value) {
1033
1306
  return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
1034
1307
  }
@@ -1085,9 +1358,7 @@ async function loadModuleExports(filePath) {
1085
1358
  }
1086
1359
  async function collectDatasetsFromFiles(config) {
1087
1360
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1088
- const matched = files.filter(
1089
- (filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
1090
- );
1361
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
1091
1362
  const found = await Promise.all(
1092
1363
  matched.map(async (absolutePath) => {
1093
1364
  const exports = await loadModuleExports(absolutePath);
@@ -1104,9 +1375,7 @@ async function collectDatasetsFromFiles(config) {
1104
1375
  }
1105
1376
  async function collectEvaluatorsFromFiles(config) {
1106
1377
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1107
- const matched = files.filter(
1108
- (filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
1109
- );
1378
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
1110
1379
  const found = await Promise.all(
1111
1380
  matched.map(async (absolutePath) => {
1112
1381
  const exports = await loadModuleExports(absolutePath);
@@ -1121,11 +1390,26 @@ async function collectEvaluatorsFromFiles(config) {
1121
1390
  );
1122
1391
  return found.flat();
1123
1392
  }
1124
- async function collectTestCasesFromFiles(config) {
1393
/**
 * Walks `config.rootDir` (skipping `config.excludeDirectories`), loads
 * every file whose name ends in one of `config.runConfigSuffixes`, and
 * returns one `{ id, filePath, runConfig }` record per RunConfig-like
 * export. `filePath` is relative to `config.rootDir`.
 */
async function collectRunConfigsFromFiles(config) {
  const allFiles = await walkDirectory(config.rootDir, config.excludeDirectories);
  const candidateFiles = allFiles.filter((candidate) => hasOneSuffix(candidate, config.runConfigSuffixes));
  const loadOne = async (absolutePath) => {
    const moduleExports = await loadModuleExports(absolutePath);
    const discovered = moduleExports.filter(isRunConfigLike);
    const filePath = path.relative(config.rootDir, absolutePath);
    return discovered.map((runConfig) => ({
      id: runConfig.getName(),
      filePath,
      runConfig
    }));
  };
  const perFile = await Promise.all(candidateFiles.map((absolutePath) => loadOne(absolutePath)));
  return perFile.flat();
}
1410
+ async function collectTestCasesFromFiles(config) {
1411
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1412
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
1129
1413
  const found = await Promise.all(
1130
1414
  matched.map(async (absolutePath) => {
1131
1415
  const exports = await loadModuleExports(absolutePath);
@@ -1215,15 +1499,17 @@ function readOutput(testCase) {
1215
1499
  }
1216
1500
  return candidate.getOutput();
1217
1501
  }
1218
- function buildEvaluationUnits(testCases) {
1502
+ function buildEvaluationUnits(testCases, repetitionCount) {
1503
+ const count = Math.max(1, repetitionCount);
1219
1504
  const units = [];
1220
1505
  for (const testCaseItem of testCases) {
1221
- const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1222
- for (let r = 0; r < rerunTotal; r++) {
1506
+ const repetitionId = `rep-${crypto.randomUUID()}`;
1507
+ for (let r = 0; r < count; r++) {
1223
1508
  units.push({
1224
1509
  testCaseItem,
1225
- rerunIndex: r + 1,
1226
- rerunTotal
1510
+ repetitionId,
1511
+ repetitionIndex: r + 1,
1512
+ repetitionCount: count
1227
1513
  });
1228
1514
  }
1229
1515
  }
@@ -1233,29 +1519,24 @@ function nowIsoForFile() {
1233
1519
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1234
1520
  }
1235
1521
  function createArtifactPath(artifactDirectory, datasetId, runId) {
1236
- return path.join(
1237
- artifactDirectory,
1238
- `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1239
- );
1522
+ return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
1240
1523
  }
1241
1524
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1242
- const { testCaseItem, rerunIndex, rerunTotal } = unit;
1525
+ const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
1243
1526
  return effect.Effect.gen(function* () {
1244
1527
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
1245
1528
  const started = Date.now();
1246
- const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
1247
- n + 1,
1248
- n + 1
1249
- ]);
1529
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [n + 1, n + 1]);
1250
1530
  yield* publishEvent({
1251
1531
  type: "TestCaseStarted",
1252
1532
  runId: task.runId,
1253
1533
  testCaseId: testCaseItem.id,
1254
- testCaseName: testCaseItem.testCase.getName(),
1534
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1255
1535
  startedTestCases: startedEvaluations,
1256
1536
  totalTestCases: totalEvaluations,
1257
- rerunIndex,
1258
- rerunTotal
1537
+ repetitionId,
1538
+ repetitionIndex,
1539
+ repetitionCount
1259
1540
  });
1260
1541
  const evaluatorScores = [];
1261
1542
  let testCaseError;
@@ -1279,9 +1560,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1279
1560
  return error;
1280
1561
  };
1281
1562
  try {
1282
- const ctx = yield* effect.Effect.promise(
1283
- () => Promise.resolve(evaluator.resolveContext())
1284
- );
1563
+ const ctx = yield* effect.Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
1285
1564
  const result = yield* effect.Effect.promise(
1286
1565
  () => Promise.resolve().then(
1287
1566
  () => evaluateFn({
@@ -1291,8 +1570,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1291
1570
  meta: {
1292
1571
  triggerId: task.triggerId,
1293
1572
  runId: evaluatorRunId,
1294
- datasetId: task.datasetId
1573
+ datasetId: task.datasetId,
1574
+ repetitionId,
1575
+ repetitionIndex,
1576
+ repetitionCount,
1577
+ runConfigName: task.runConfigName
1295
1578
  },
1579
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1580
+ runConfigTags: task.runConfigTags,
1581
+ evaluatorTags: getEvaluatorTagList(evaluator),
1296
1582
  logDiff,
1297
1583
  log,
1298
1584
  createError
@@ -1335,21 +1621,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1335
1621
  });
1336
1622
  }
1337
1623
  }
1338
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1339
- const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
1340
- n + 1,
1341
- n + 1
1342
- ]);
1624
+ const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
1625
+ const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
1343
1626
  const progressEvent = {
1344
1627
  type: "TestCaseProgress",
1345
1628
  runId: task.runId,
1346
1629
  testCaseId: testCaseItem.id,
1347
- testCaseName: testCaseItem.testCase.getName(),
1630
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1348
1631
  completedTestCases: completedEvaluations,
1349
1632
  totalTestCases: totalEvaluations,
1350
- rerunIndex,
1351
- rerunTotal,
1352
- passed: rerunPassedThis,
1633
+ repetitionId,
1634
+ repetitionIndex,
1635
+ repetitionCount,
1636
+ passed: repetitionPassedThis,
1353
1637
  durationMs: Date.now() - started,
1354
1638
  evaluatorScores,
1355
1639
  output,
@@ -1370,9 +1654,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1370
1654
  (map) => {
1371
1655
  const key = testCaseItem.id;
1372
1656
  const existing = map.get(key) ?? { completedCount: 0, results: [] };
1373
- const newResults = [...existing.results, rerunPassedThis];
1657
+ const newResults = [...existing.results, repetitionPassedThis];
1374
1658
  const newCompletedCount = existing.completedCount + 1;
1375
- const isLast = newCompletedCount === rerunTotal;
1659
+ const isLast = newCompletedCount === repetitionCount;
1376
1660
  const newMap = new Map(map);
1377
1661
  newMap.set(key, {
1378
1662
  completedCount: newCompletedCount,
@@ -1388,10 +1672,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1388
1672
  } else {
1389
1673
  yield* effect.Ref.update(failedRef, (n) => n + 1);
1390
1674
  }
1391
- const [passed, failed] = yield* effect.Effect.all([
1392
- effect.Ref.get(passedRef),
1393
- effect.Ref.get(failedRef)
1394
- ]);
1675
+ const [passed, failed] = yield* effect.Effect.all([effect.Ref.get(passedRef), effect.Ref.get(failedRef)]);
1395
1676
  yield* updateSnapshot(task.runId, (snapshot) => ({
1396
1677
  ...snapshot,
1397
1678
  passedTestCases: passed,
@@ -1412,10 +1693,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1412
1693
  runId: task.runId,
1413
1694
  startedAt
1414
1695
  });
1415
- const totalEvaluations = task.testCases.reduce(
1416
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1417
- 0
1418
- );
1696
+ const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
1419
1697
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1420
1698
  const completedRef = yield* effect.Ref.make(0);
1421
1699
  const startedRef = yield* effect.Ref.make(0);
@@ -1424,7 +1702,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1424
1702
  const testCaseResultsRef = yield* effect.Ref.make(
1425
1703
  /* @__PURE__ */ new Map()
1426
1704
  );
1427
- const evaluationUnits = buildEvaluationUnits(task.testCases);
1705
+ const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
1428
1706
  const processEvaluation = (unit) => processOneEvaluation(
1429
1707
  task,
1430
1708
  unit,
@@ -1438,11 +1716,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1438
1716
  failedRef,
1439
1717
  testCaseResultsRef
1440
1718
  );
1441
- yield* effect.Effect.forEach(
1442
- evaluationUnits,
1443
- processEvaluation,
1444
- maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1445
- );
1719
+ const globalSem = task.globalEvaluationSemaphore;
1720
+ if (globalSem !== void 0) {
1721
+ yield* effect.Effect.forEach(
1722
+ evaluationUnits,
1723
+ (unit) => globalSem.withPermits(1)(processEvaluation(unit)),
1724
+ { concurrency: "unbounded", discard: true }
1725
+ );
1726
+ } else {
1727
+ yield* effect.Effect.forEach(
1728
+ evaluationUnits,
1729
+ processEvaluation,
1730
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1731
+ );
1732
+ }
1446
1733
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
1447
1734
  effect.Ref.get(completedRef),
1448
1735
  effect.Ref.get(passedRef),
@@ -1478,125 +1765,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1478
1765
  artifactPath: task.snapshot.artifactPath
1479
1766
  });
1480
1767
  });
1481
- async function loadRunSnapshotsFromArtifacts(config) {
1482
- const baseDir = path.resolve(config.artifactDirectory);
1483
- let entries;
1484
- try {
1485
- entries = await promises.readdir(baseDir);
1486
- } catch {
1487
- return [];
1488
- }
1489
- const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1490
- const snapshots = [];
1491
- for (const fileName of jsonlFiles) {
1492
- const filePath = path.join(baseDir, fileName);
1493
- try {
1494
- const snapshot = await parseArtifactToSnapshot(filePath, config);
1495
- if (snapshot) {
1496
- snapshots.push(snapshot);
1497
- }
1498
- } catch {
1499
- }
1500
- }
1501
- return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
1502
- }
1503
- async function parseArtifactToSnapshot(filePath, _config) {
1504
- const content = await promises.readFile(filePath, "utf8");
1505
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1506
- if (lines.length === 0) {
1507
- return null;
1508
- }
1509
- let runQueued = null;
1510
- let runCompleted = null;
1511
- let runFailed = null;
1512
- let runStarted = null;
1513
- for (const line of lines) {
1514
- try {
1515
- const event = JSON.parse(line);
1516
- const type = event.type;
1517
- if (type === "RunQueued") {
1518
- runQueued = {
1519
- runId: event.runId,
1520
- datasetId: event.datasetId,
1521
- datasetName: event.datasetName,
1522
- evaluatorIds: event.evaluatorIds,
1523
- totalTestCases: event.totalTestCases ?? 0,
1524
- artifactPath: event.artifactPath ?? filePath,
1525
- ts: event.ts
1526
- };
1527
- }
1528
- if (type === "RunStarted") {
1529
- runStarted = { startedAt: event.startedAt };
1530
- }
1531
- if (type === "RunCompleted") {
1532
- runCompleted = {
1533
- passedTestCases: event.passedTestCases,
1534
- failedTestCases: event.failedTestCases,
1535
- totalTestCases: event.totalTestCases,
1536
- finishedAt: event.finishedAt
1537
- };
1538
- }
1539
- if (type === "RunFailed") {
1540
- runFailed = {
1541
- finishedAt: event.finishedAt,
1542
- errorMessage: event.errorMessage
1543
- };
1544
- }
1545
- } catch {
1546
- }
1768
+
1769
+ // src/runner/name-pattern.ts
1770
/**
 * Splits a `/source/flags` string into its parts.
 *
 * @param pattern Candidate pattern text.
 * @returns `{ source, flags }`, or undefined when the text does not start
 *   with "/" or has no second "/".
 */
function parseRegexLiteral(pattern) {
  if (!pattern.startsWith("/")) {
    return void 0;
  }
  const lastSlash = pattern.lastIndexOf("/");
  if (lastSlash <= 0) {
    return void 0;
  }
  return {
    source: pattern.slice(1, lastSlash),
    flags: pattern.slice(lastSlash + 1)
  };
}
/**
 * Builds a predicate that tests names against a user-supplied pattern.
 *
 * The trimmed pattern is interpreted as, in order:
 * 1. a regex literal (`/source/flags`), used with exactly those flags;
 * 2. a glob when it contains `*` (matches any run of characters),
 *    anchored to the whole name and case-insensitive;
 * 3. otherwise a case-insensitive exact match.
 *
 * @param pattern Pattern text.
 * @returns `(value) => boolean`.
 * @throws SyntaxError from `RegExp` when a regex literal is invalid.
 */
function createNameMatcher(pattern) {
  const normalizedPattern = pattern.trim();
  const regexLiteral = parseRegexLiteral(normalizedPattern);
  if (regexLiteral) {
    const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
    return (value) => {
      // With the g or y flag, test() resumes from lastIndex, so a reused
      // matcher would alternate between true and false for the same name.
      regex.lastIndex = 0;
      return regex.test(value);
    };
  }
  if (normalizedPattern.includes("*")) {
    // Escape every regex metacharacter except "*" (including "?", which
    // would otherwise act as a quantifier), then expand "*" to ".*".
    const escaped = normalizedPattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
    const regex = new RegExp(`^${escaped}$`, "i");
    return (value) => regex.test(value);
  }
  const loweredPattern = normalizedPattern.toLowerCase();
  return (value) => value.toLowerCase() === loweredPattern;
}
1601
1797
  async function appendJsonLine(artifactPath, payload) {
1602
1798
  await promises.mkdir(path.dirname(artifactPath), { recursive: true });
@@ -1655,32 +1851,12 @@ function searchCollectedTestCases(all, query) {
1655
1851
  }
1656
1852
 
1657
1853
  // src/runner/api.ts
1658
- function parseRegexLiteral(pattern) {
1659
- if (!pattern.startsWith("/")) {
1660
- return void 0;
1661
- }
1662
- const lastSlash = pattern.lastIndexOf("/");
1663
- if (lastSlash <= 0) {
1664
- return void 0;
1665
- }
1666
- return {
1667
- source: pattern.slice(1, lastSlash),
1668
- flags: pattern.slice(lastSlash + 1)
1669
- };
1670
- }
1671
- function createNameMatcher(pattern) {
1672
- const normalizedPattern = pattern.trim();
1673
- const regexLiteral = parseRegexLiteral(normalizedPattern);
1674
- if (regexLiteral) {
1675
- const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1676
- return (value) => regex.test(value);
1677
- }
1678
- if (normalizedPattern.includes("*")) {
1679
- const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1680
- const regex = new RegExp(`^${escaped}$`, "i");
1681
- return (value) => regex.test(value);
1854
/**
 * Validates a repetition count, defaulting null/undefined to 1.
 *
 * @returns The count when it is an integer >= 1.
 * @throws Error when the value is not a positive integer.
 */
function normalizeRunRepetitions(value) {
  const repetitions = value ?? 1;
  const isPositiveInteger = Number.isInteger(repetitions) && repetitions >= 1;
  if (isPositiveInteger) {
    return repetitions;
  }
  throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
}
1685
1861
  function mergeRunnerOverrides(base, next) {
1686
1862
  if (!base) {
@@ -1711,15 +1887,12 @@ var EffectRunner = class {
1711
1887
  this.persistenceQueue = effect.Effect.runSync(
1712
1888
  effect.Queue.unbounded()
1713
1889
  );
1714
- this.snapshotsRef = effect.Effect.runSync(
1715
- effect.Ref.make(/* @__PURE__ */ new Map())
1716
- );
1890
+ this.snapshotsRef = effect.Effect.runSync(effect.Ref.make(/* @__PURE__ */ new Map()));
1717
1891
  this.listeners = /* @__PURE__ */ new Set();
1718
1892
  this.datasetsById = /* @__PURE__ */ new Map();
1719
1893
  this.evaluatorsById = /* @__PURE__ */ new Map();
1720
- this.schedulerFiber = effect.Effect.runFork(
1721
- this.createSchedulerEffect()
1722
- );
1894
+ this.runConfigsById = /* @__PURE__ */ new Map();
1895
+ this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
1723
1896
  this.persistenceFiber = effect.Effect.runFork(
1724
1897
  createPersistenceWorker(this.persistenceQueue)
1725
1898
  );
@@ -1759,6 +1932,137 @@ var EffectRunner = class {
1759
1932
  (item) => matcher(item.evaluator.getName() ?? "")
1760
1933
  );
1761
1934
  }
1935
+ async collectRunConfigs() {
1936
+ const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
1937
+ this.runConfigsById.clear();
1938
+ const byNameLower = /* @__PURE__ */ new Map();
1939
+ for (const item of runConfigs) {
1940
+ const id = item.runConfig.getName();
1941
+ const lower = id.toLowerCase();
1942
+ const prev = byNameLower.get(lower);
1943
+ if (prev !== void 0 && prev.filePath !== item.filePath) {
1944
+ throw new Error(
1945
+ `Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
1946
+ );
1947
+ }
1948
+ byNameLower.set(lower, item);
1949
+ this.runConfigsById.set(id, item);
1950
+ }
1951
+ return runConfigs;
1952
+ }
1953
+ async resolveRunConfigByName(name) {
1954
+ if (this.runConfigsById.size === 0) {
1955
+ await this.collectRunConfigs();
1956
+ }
1957
+ const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
1958
+ const keyLower = key.toLowerCase();
1959
+ const matches = Array.from(this.runConfigsById.values()).filter(
1960
+ (item) => item.runConfig.getName().toLowerCase() === keyLower
1961
+ );
1962
+ if (matches.length === 0) {
1963
+ return void 0;
1964
+ }
1965
+ if (matches.length > 1) {
1966
+ throw new Error(
1967
+ `Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
1968
+ );
1969
+ }
1970
+ return matches[0];
1971
+ }
1972
+ async expandRunConfigToJobs(collected) {
1973
+ if (this.datasetsById.size === 0) {
1974
+ await this.collectDatasets();
1975
+ }
1976
+ if (this.evaluatorsById.size === 0) {
1977
+ await this.collectEvaluators();
1978
+ }
1979
+ const rcName = collected.runConfig.getName();
1980
+ const jobs = [];
1981
+ const runs = collected.runConfig.getRuns();
1982
+ for (const [i, row] of runs.entries()) {
1983
+ const dsCollected = Array.from(this.datasetsById.values()).find(
1984
+ (d) => d.dataset === row.dataset
1985
+ );
1986
+ if (!dsCollected) {
1987
+ throw new Error(
1988
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1989
+ );
1990
+ }
1991
+ let evaluatorIds;
1992
+ if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
1993
+ const matcher = createNameMatcher(row.evaluatorPattern);
1994
+ const matched = Array.from(this.evaluatorsById.values()).filter(
1995
+ (item) => matcher(item.evaluator.getName() ?? "")
1996
+ );
1997
+ if (matched.length === 0) {
1998
+ throw new Error(
1999
+ `RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
2000
+ );
2001
+ }
2002
+ evaluatorIds = matched.map((item) => item.id);
2003
+ } else {
2004
+ const evaluators = row.evaluators;
2005
+ evaluatorIds = [];
2006
+ for (const ev of evaluators) {
2007
+ const found = Array.from(this.evaluatorsById.values()).find(
2008
+ (item) => item.evaluator === ev
2009
+ );
2010
+ if (!found) {
2011
+ throw new Error(
2012
+ `RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
2013
+ );
2014
+ }
2015
+ evaluatorIds.push(found.id);
2016
+ }
2017
+ }
2018
+ const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
2019
+ jobs.push({
2020
+ datasetId: dsCollected.id,
2021
+ evaluatorIds,
2022
+ runConfigName: rcName,
2023
+ runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
2024
+ runConfigTags: collected.runConfig.getTags(),
2025
+ repetitions
2026
+ });
2027
+ }
2028
+ return jobs;
2029
+ }
2030
+ async expandRunConfigNamesToJobs(names) {
2031
+ const jobs = [];
2032
+ for (const name of names) {
2033
+ const collected = await this.resolveRunConfigByName(name);
2034
+ if (!collected) {
2035
+ const known = await this.collectRunConfigs();
2036
+ const available = known.map((r) => r.runConfig.getName()).sort();
2037
+ throw new Error(
2038
+ available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
2039
+ );
2040
+ }
2041
+ jobs.push(...await this.expandRunConfigToJobs(collected));
2042
+ }
2043
+ return jobs;
2044
+ }
2045
+ async runDatasetJobsWithSharedConcurrency(request) {
2046
+ const globalConcurrency = Math.max(1, request.globalConcurrency);
2047
+ const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
2048
+ const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
2049
+ const snapshots = [];
2050
+ for (const job of request.jobs) {
2051
+ snapshots.push(
2052
+ await this.startDatasetRun({
2053
+ datasetId: job.datasetId,
2054
+ evaluatorIds: job.evaluatorIds,
2055
+ triggerId,
2056
+ maxConcurrency: this.config.maxConcurrency ?? 1,
2057
+ globalEvaluationSemaphore: sem,
2058
+ runConfigName: job.runConfigName,
2059
+ runConfigTags: job.runConfigTags,
2060
+ repetitions: job.repetitions
2061
+ })
2062
+ );
2063
+ }
2064
+ return snapshots;
2065
+ }
1762
2066
  async searchTestCases(query) {
1763
2067
  const testCases = await collectTestCasesFromFiles(this.config.discovery);
1764
2068
  return searchCollectedTestCases(testCases, query);
@@ -1777,35 +2081,45 @@ var EffectRunner = class {
1777
2081
  );
1778
2082
  }
1779
2083
  async runDatasetWith(request) {
2084
+ const runConfigName = validateRunConfigName(
2085
+ request.runConfigName,
2086
+ "runDatasetWith.runConfigName"
2087
+ );
2088
+ return this.startDatasetRun({
2089
+ datasetId: request.datasetId,
2090
+ evaluatorIds: request.evaluatorIds,
2091
+ triggerId: request.triggerId,
2092
+ maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2093
+ repetitions: request.repetitions,
2094
+ runConfigName,
2095
+ runConfigTags: request.runConfigTags
2096
+ });
2097
+ }
2098
+ async startDatasetRun(params) {
1780
2099
  if (this.datasetsById.size === 0) {
1781
2100
  await this.collectDatasets();
1782
2101
  }
1783
2102
  if (this.evaluatorsById.size === 0) {
1784
2103
  await this.collectEvaluators();
1785
2104
  }
1786
- const dataset = this.datasetsById.get(request.datasetId);
2105
+ const dataset = this.datasetsById.get(params.datasetId);
1787
2106
  if (!dataset) {
1788
- throw new Error(`Unknown dataset: ${request.datasetId}`);
2107
+ throw new Error(`Unknown dataset: ${params.datasetId}`);
1789
2108
  }
1790
- const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
2109
+ const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
1791
2110
  if (selectedEvaluators.length === 0) {
1792
2111
  throw new Error("No evaluators selected for run");
1793
2112
  }
1794
- const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1795
- const totalEvaluations = selectedTestCases.reduce(
1796
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1797
- 0
1798
- );
1799
- const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
2113
+ const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
2114
+ const repetitions = normalizeRunRepetitions(params.repetitions);
2115
+ const totalEvaluations = selectedTestCases.length * repetitions;
2116
+ const runConfigTags = [...params.runConfigTags ?? []];
2117
+ const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
1800
2118
  const runId = `run-${crypto.randomUUID()}`;
1801
- const artifactPath = createArtifactPath(
1802
- this.config.artifactDirectory,
1803
- request.datasetId,
1804
- runId
1805
- );
2119
+ const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
1806
2120
  const snapshot = {
1807
2121
  runId,
1808
- datasetId: request.datasetId,
2122
+ datasetId: params.datasetId,
1809
2123
  datasetName: dataset.dataset.getName(),
1810
2124
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1811
2125
  queuedAt: Date.now(),
@@ -1826,7 +2140,7 @@ var EffectRunner = class {
1826
2140
  const queuedEvent = {
1827
2141
  type: "RunQueued",
1828
2142
  runId,
1829
- datasetId: request.datasetId,
2143
+ datasetId: params.datasetId,
1830
2144
  datasetName: dataset.dataset.getName(),
1831
2145
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1832
2146
  totalTestCases: totalEvaluations,
@@ -1840,17 +2154,20 @@ var EffectRunner = class {
1840
2154
  payload: queuedEvent
1841
2155
  })
1842
2156
  );
1843
- const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
1844
2157
  await effect.Effect.runPromise(
1845
2158
  effect.Queue.offer(this.runQueue, {
1846
2159
  runId,
1847
2160
  triggerId,
1848
- datasetId: request.datasetId,
2161
+ datasetId: params.datasetId,
1849
2162
  dataset: dataset.dataset,
1850
2163
  evaluators: selectedEvaluators,
1851
2164
  testCases: selectedTestCases,
1852
2165
  snapshot,
1853
- maxConcurrency
2166
+ maxConcurrency: params.maxConcurrency,
2167
+ globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2168
+ runConfigName: params.runConfigName,
2169
+ runConfigTags,
2170
+ repetitions
1854
2171
  })
1855
2172
  );
1856
2173
  return snapshot;
@@ -1866,9 +2183,9 @@ var EffectRunner = class {
1866
2183
  return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
1867
2184
  }
1868
2185
  getAllRunSnapshots() {
1869
- return Array.from(
1870
- effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
1871
- ).sort((a, b) => b.queuedAt - a.queuedAt);
2186
+ return Array.from(effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()).sort(
2187
+ (a, b) => b.queuedAt - a.queuedAt
2188
+ );
1872
2189
  }
1873
2190
  async loadRunSnapshotsFromArtifacts() {
1874
2191
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1922,15 +2239,26 @@ var EffectRunner = class {
1922
2239
  }
1923
2240
  };
1924
2241
 
2242
+ // src/runner/events.ts
2243
// Exported constant carrying the run-config name "programmatic".
// NOTE(review): presumably the identity used for runs started from code rather
// than from a named RunConfig file; confirm against callers.
var PROGRAMMATIC_RUN_CONFIG = { runConfigName: "programmatic" };
2246
+
1925
2247
  Object.defineProperty(exports, 'S', {
1926
2248
  enumerable: true,
1927
2249
  get: function () { return effect.Schema; }
1928
2250
  });
1929
2251
  exports.Dataset = Dataset;
1930
2252
  exports.Evaluator = Evaluator;
2253
+ exports.EvaluatorNameSchema = EvaluatorNameSchema;
1931
2254
  exports.Metric = Metric;
2255
+ exports.PROGRAMMATIC_RUN_CONFIG = PROGRAMMATIC_RUN_CONFIG;
2256
+ exports.RunConfig = RunConfig;
2257
+ exports.RunConfigNameSchema = RunConfigNameSchema;
1932
2258
  exports.Score = Score;
2259
+ exports.TagSet = TagSet;
1933
2260
  exports.TestCase = TestCase;
2261
+ exports.TestCaseNameSchema = TestCaseNameSchema;
1934
2262
  exports.binaryScore = binaryScore;
1935
2263
  exports.createLogEntry = createLogEntry;
1936
2264
  exports.createRunner = createRunner;
@@ -1938,16 +2266,24 @@ exports.defaultRunnerConfig = defaultRunnerConfig;
1938
2266
  exports.defineConfig = defineConfig;
1939
2267
  exports.deltaScore = deltaScore;
1940
2268
  exports.formatScoreData = formatScoreData;
2269
+ exports.getEvaluatorDisplayLabel = getEvaluatorDisplayLabel;
2270
+ exports.getEvaluatorTagList = getEvaluatorTagList;
1941
2271
  exports.getLogLines = getLogLines;
1942
2272
  exports.getMetricById = getMetricById;
1943
2273
  exports.getScoreById = getScoreById;
2274
+ exports.getTestCaseDisplayLabel = getTestCaseDisplayLabel;
2275
+ exports.getTestCaseTagList = getTestCaseTagList;
1944
2276
  exports.latencyMetric = latencyMetric;
1945
2277
  exports.loadMockData = loadMockData;
1946
2278
  exports.loadRunnerData = loadRunnerData;
2279
+ exports.normalizeOptionalDisplayName = normalizeOptionalDisplayName;
1947
2280
  exports.parseStartupArgs = parseStartupArgs;
1948
2281
  exports.percentScore = percentScore;
1949
2282
  exports.printJsonDiff = printJsonDiff;
1950
2283
  exports.tokenCountMetric = tokenCountMetric;
2284
+ exports.validateEvaluatorName = validateEvaluatorName;
2285
+ exports.validateRunConfigName = validateRunConfigName;
2286
+ exports.validateTestCaseName = validateTestCaseName;
1951
2287
  exports.withRunnerConfig = withRunnerConfig;
1952
2288
  //# sourceMappingURL=out.js.map
1953
2289
  //# sourceMappingURL=index.cjs.map