@m4trix/evals 0.25.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,14 +1,172 @@
1
- import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
1
+ import { Schema, Effect, PubSub, Queue, Ref, Fiber, Either, ParseResult } from 'effect';
2
2
  export { Schema as S } from 'effect';
3
3
  import { diffLines } from 'diff';
4
4
  import stringify from 'fast-json-stable-stringify';
5
5
  import { randomUUID } from 'crypto';
6
+ import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
7
+ import { resolve as resolve$1, join, relative, dirname } from 'path';
6
8
  import { existsSync } from 'fs';
7
- import { resolve as resolve$1, relative, join, dirname } from 'path';
8
9
  import * as jitiModule from 'jiti';
9
- import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
10
10
  import { pathToFileURL } from 'url';
11
11
 
12
+ // src/index.ts
13
+ var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
14
+ function makeEntityIdSchema(brand, label) {
15
+ return Schema.String.pipe(
16
+ Schema.trimmed(),
17
+ Schema.minLength(1, {
18
+ message: () => `${label} must be non-empty.`
19
+ }),
20
+ Schema.pattern(ENTITY_ID_PATTERN, {
21
+ message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
22
+ }),
23
+ Schema.brand(brand)
24
+ );
25
+ }
26
+ var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
27
+ var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
28
+ var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
29
+ function validateWithSchema(schema, raw, context) {
30
+ const trimmed = raw.trim();
31
+ const decode = Schema.decodeUnknownEither(
32
+ schema
33
+ );
34
+ const result = decode(trimmed);
35
+ if (Either.isLeft(result)) {
36
+ throw new Error(`${context}: ${ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
37
+ }
38
+ return result.right;
39
+ }
40
+ function validateRunConfigName(raw, context) {
41
+ return validateWithSchema(RunConfigNameSchema, raw, context);
42
+ }
43
+ function validateEvaluatorName(raw, context) {
44
+ return validateWithSchema(EvaluatorNameSchema, raw, context);
45
+ }
46
+ function validateTestCaseName(raw, context) {
47
+ return validateWithSchema(TestCaseNameSchema, raw, context);
48
+ }
49
+ function normalizeOptionalDisplayName(raw) {
50
+ if (raw === void 0) {
51
+ return void 0;
52
+ }
53
+ const t = raw.trim();
54
+ return t.length === 0 ? void 0 : t;
55
+ }
56
+
57
+ // src/evals/evaluator.ts
58
+ var Evaluator = class _Evaluator {
59
+ constructor(config) {
60
+ this._config = config;
61
+ }
62
+ getState() {
63
+ return {
64
+ name: this._config.name,
65
+ displayName: this._config.displayName,
66
+ tags: this._config.tags,
67
+ inputSchema: this._config.inputSchema,
68
+ outputSchema: this._config.outputSchema,
69
+ scoreSchema: this._config.scoreSchema,
70
+ middlewares: this._config.middlewares,
71
+ evaluateFn: this._config.evaluateFn,
72
+ passThreshold: this._config.passThreshold,
73
+ passCriterion: this._config.passCriterion
74
+ };
75
+ }
76
+ static use(middleware) {
77
+ return new _Evaluator({
78
+ middlewares: [middleware],
79
+ tags: []
80
+ });
81
+ }
82
+ use(middleware) {
83
+ const state = this.getState();
84
+ return new _Evaluator({
85
+ ...state,
86
+ middlewares: [...state.middlewares, middleware]
87
+ });
88
+ }
89
+ define(config) {
90
+ const { middlewares } = this.getState();
91
+ const name = validateEvaluatorName(config.name, "Evaluator.define");
92
+ const displayName = normalizeOptionalDisplayName(config.displayName);
93
+ const tags = config.tags !== void 0 ? [...config.tags] : [];
94
+ return new _Evaluator({
95
+ name,
96
+ displayName,
97
+ tags,
98
+ inputSchema: config.inputSchema,
99
+ outputSchema: config.outputSchema,
100
+ scoreSchema: config.scoreSchema,
101
+ middlewares,
102
+ passThreshold: config.passThreshold,
103
+ passCriterion: config.passCriterion
104
+ });
105
+ }
106
+ evaluate(fn) {
107
+ return new _Evaluator({
108
+ ...this.getState(),
109
+ evaluateFn: fn
110
+ });
111
+ }
112
+ /** Canonical evaluator id when defined; otherwise undefined (middleware-only chain). */
113
+ getName() {
114
+ return this._config.name;
115
+ }
116
+ getDisplayName() {
117
+ return this._config.displayName;
118
+ }
119
+ /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. Undefined if not yet defined. */
120
+ getDisplayLabel() {
121
+ const id = this._config.name;
122
+ if (id === void 0) {
123
+ return void 0;
124
+ }
125
+ return this._config.displayName ?? id;
126
+ }
127
+ /** Tags from `Evaluator.define({ tags })`; empty until defined. */
128
+ getTags() {
129
+ return [...this._config.tags];
130
+ }
131
+ getInputSchema() {
132
+ return this._config.inputSchema;
133
+ }
134
+ getOutputSchema() {
135
+ return this._config.outputSchema;
136
+ }
137
+ getScoreSchema() {
138
+ return this._config.scoreSchema;
139
+ }
140
+ getMiddlewares() {
141
+ return this._config.middlewares;
142
+ }
143
+ getEvaluateFn() {
144
+ return this._config.evaluateFn;
145
+ }
146
+ getPassThreshold() {
147
+ return this._config.passThreshold;
148
+ }
149
+ getPassCriterion() {
150
+ return this._config.passCriterion;
151
+ }
152
+ async resolveContext() {
153
+ const parts = await Promise.all(this._config.middlewares.map((mw) => mw.resolve()));
154
+ return Object.assign({}, ...parts);
155
+ }
156
+ };
157
+ function getEvaluatorDisplayLabel(evaluator) {
158
+ if (typeof evaluator.getDisplayLabel === "function") {
159
+ const label = evaluator.getDisplayLabel();
160
+ if (label !== void 0) {
161
+ return label;
162
+ }
163
+ }
164
+ return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
165
+ }
166
+ function getEvaluatorTagList(evaluator) {
167
+ return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
168
+ }
169
+
12
170
  // src/cli/data.mock.json
13
171
  var data_mock_default = {
14
172
  datasets: [
@@ -159,9 +317,7 @@ var data_mock_default = {
159
317
  { name: "contract_match", score: 100 },
160
318
  { name: "arg_validity", score: 100 }
161
319
  ],
162
- checks: [
163
- { name: "tool_calls", passed: true, detail: "0 unexpected" }
164
- ],
320
+ checks: [{ name: "tool_calls", passed: true, detail: "0 unexpected" }],
165
321
  failures: [],
166
322
  meta: {
167
323
  model: "gpt-4o-mini",
@@ -184,9 +340,21 @@ var data_mock_default = {
184
340
  }
185
341
  ],
186
342
  evaluators: [
187
- { id: "json-schema-validator", name: "JSON Schema Validator", configPreview: "strict=true" },
188
- { id: "tool-call-contract-checker", name: "Tool-call Contract Checker", configPreview: "unexpectedCalls=error" },
189
- { id: "rubric-judge", name: "Rubric Judge (LLM)", configPreview: "model=gpt-4o-mini; scale=0-100" },
343
+ {
344
+ id: "json-schema-validator",
345
+ name: "JSON Schema Validator",
346
+ configPreview: "strict=true"
347
+ },
348
+ {
349
+ id: "tool-call-contract-checker",
350
+ name: "Tool-call Contract Checker",
351
+ configPreview: "unexpectedCalls=error"
352
+ },
353
+ {
354
+ id: "rubric-judge",
355
+ name: "Rubric Judge (LLM)",
356
+ configPreview: "model=gpt-4o-mini; scale=0-100"
357
+ },
190
358
  { id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
191
359
  ]
192
360
  };
@@ -253,7 +421,7 @@ function toEvalDataset(item, snapshots) {
253
421
  function toEvaluatorOption(item) {
254
422
  return {
255
423
  id: item.id,
256
- name: item.evaluator.getName() ?? toSlug(item.id),
424
+ name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
257
425
  configPreview: `Source: ${item.filePath}`
258
426
  };
259
427
  }
@@ -266,9 +434,7 @@ async function loadRunnerData(runner) {
266
434
  const memSnapshots = runner.getAllRunSnapshots();
267
435
  const seen = new Set(memSnapshots.map((s) => s.runId));
268
436
  const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
269
- const snapshots = [...memSnapshots, ...fromDisk].sort(
270
- (a, b) => b.queuedAt - a.queuedAt
271
- );
437
+ const snapshots = [...memSnapshots, ...fromDisk].sort((a, b) => b.queuedAt - a.queuedAt);
272
438
  if (datasets.length === 0 && evaluators.length === 0) {
273
439
  return loadMockData();
274
440
  }
@@ -301,134 +467,6 @@ function parseStartupArgs(argv) {
301
467
  return args;
302
468
  }
303
469
 
304
- // src/evals/test-case.ts
305
- function resolve(value) {
306
- return typeof value === "function" ? value() : value;
307
- }
308
- var TestCase = class _TestCase {
309
- constructor(config) {
310
- this._config = config;
311
- }
312
- static describe(config) {
313
- const reruns = config.reruns ?? 1;
314
- if (reruns < 1 || !Number.isInteger(reruns)) {
315
- throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
316
- }
317
- return new _TestCase({
318
- name: config.name,
319
- tags: config.tags,
320
- reruns,
321
- inputSchema: config.inputSchema,
322
- input: config.input,
323
- outputSchema: config.outputSchema,
324
- output: config.output
325
- });
326
- }
327
- getReruns() {
328
- return this._config.reruns;
329
- }
330
- getName() {
331
- return this._config.name;
332
- }
333
- getTags() {
334
- return this._config.tags;
335
- }
336
- getInputSchema() {
337
- return this._config.inputSchema;
338
- }
339
- getInput() {
340
- return resolve(this._config.input);
341
- }
342
- getOutputSchema() {
343
- return this._config.outputSchema;
344
- }
345
- getOutput() {
346
- if (this._config.output === void 0) {
347
- return void 0;
348
- }
349
- return resolve(this._config.output);
350
- }
351
- };
352
-
353
- // src/evals/evaluator.ts
354
- var Evaluator = class _Evaluator {
355
- constructor(config) {
356
- this._config = config;
357
- }
358
- getState() {
359
- return {
360
- name: this._config.name,
361
- inputSchema: this._config.inputSchema,
362
- outputSchema: this._config.outputSchema,
363
- scoreSchema: this._config.scoreSchema,
364
- middlewares: this._config.middlewares,
365
- evaluateFn: this._config.evaluateFn,
366
- passThreshold: this._config.passThreshold,
367
- passCriterion: this._config.passCriterion
368
- };
369
- }
370
- static use(middleware) {
371
- return new _Evaluator({
372
- middlewares: [middleware]
373
- });
374
- }
375
- use(middleware) {
376
- const state = this.getState();
377
- return new _Evaluator({
378
- ...state,
379
- middlewares: [...state.middlewares, middleware]
380
- });
381
- }
382
- define(config) {
383
- const { middlewares } = this.getState();
384
- return new _Evaluator({
385
- name: config.name,
386
- inputSchema: config.inputSchema,
387
- outputSchema: config.outputSchema,
388
- scoreSchema: config.scoreSchema,
389
- middlewares,
390
- passThreshold: config.passThreshold,
391
- passCriterion: config.passCriterion
392
- });
393
- }
394
- evaluate(fn) {
395
- return new _Evaluator({
396
- ...this.getState(),
397
- evaluateFn: fn
398
- });
399
- }
400
- getName() {
401
- return this._config.name;
402
- }
403
- getInputSchema() {
404
- return this._config.inputSchema;
405
- }
406
- getOutputSchema() {
407
- return this._config.outputSchema;
408
- }
409
- getScoreSchema() {
410
- return this._config.scoreSchema;
411
- }
412
- getMiddlewares() {
413
- return this._config.middlewares;
414
- }
415
- getEvaluateFn() {
416
- return this._config.evaluateFn;
417
- }
418
- getPassThreshold() {
419
- return this._config.passThreshold;
420
- }
421
- getPassCriterion() {
422
- return this._config.passCriterion;
423
- }
424
- async resolveContext() {
425
- const parts = await Promise.all(
426
- this._config.middlewares.map((mw) => mw.resolve())
427
- );
428
- return Object.assign({}, ...parts);
429
- }
430
- };
431
-
432
470
  // src/evals/dataset.ts
433
471
  function matchesAny(value, matchers) {
434
472
  return matchers.some(
@@ -492,230 +530,13 @@ var Dataset = class _Dataset {
492
530
  return tagMatch && pathMatch;
493
531
  }
494
532
  };
495
-
496
- // src/evals/metric.ts
497
- var registry = /* @__PURE__ */ new Map();
498
- var Metric = {
499
- of(config) {
500
- const def = {
501
- id: config.id,
502
- name: config.name,
503
- aggregate: config.aggregate,
504
- format: config.format,
505
- make: (data, options) => ({
506
- id: config.id,
507
- data,
508
- ...options?.name !== void 0 && { name: options.name }
509
- })
510
- };
511
- registry.set(config.id, def);
512
- return def;
513
- }
514
- };
515
- function getMetricById(id) {
516
- return registry.get(id);
517
- }
518
-
519
- // src/evals/score.ts
520
- var registry2 = /* @__PURE__ */ new Map();
521
- function formatScoreData(def, data, options) {
522
- return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
523
- }
524
- var ScoreAggregate = {
525
- /** Average numeric fields. Use for scores like { value, delta }. */
526
- averageFields(fields) {
527
- return (values) => {
528
- const count = values.length || 1;
529
- const result = {};
530
- for (const field of fields) {
531
- result[field] = values.reduce(
532
- (s, v) => s + (v[field] ?? 0),
533
- 0
534
- ) / count;
535
- }
536
- return result;
537
- };
538
- },
539
- /** Average selected numeric fields, with sample std dev tracked for `value`. */
540
- averageWithVariance(fields) {
541
- return (values) => {
542
- const count = values.length;
543
- const result = {};
544
- for (const field of fields) {
545
- result[field] = count === 0 ? 0 : values.reduce(
546
- (sum, item) => sum + (item[field] ?? 0),
547
- 0
548
- ) / count;
549
- }
550
- const valueField = "value";
551
- const hasValueField = fields.includes(valueField);
552
- if (count === 0) {
553
- if (hasValueField) {
554
- result[valueField] = 0;
555
- }
556
- return {
557
- ...result,
558
- stdDev: void 0,
559
- count: 0
560
- };
561
- }
562
- let stdDev;
563
- if (hasValueField && count >= 2) {
564
- const sum = values.reduce(
565
- (s, v) => s + (v[valueField] ?? 0),
566
- 0
567
- );
568
- const sumSq = values.reduce(
569
- (s, v) => {
570
- const value = v[valueField] ?? 0;
571
- return s + value * value;
572
- },
573
- 0
574
- );
575
- const mean = sum / count;
576
- const variance = (sumSq - count * mean * mean) / (count - 1);
577
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
578
- }
579
- return {
580
- ...values[0],
581
- ...result,
582
- stdDev,
583
- count
584
- };
585
- };
586
- },
587
- /** All runs must pass. Use for binary scores. */
588
- all(values) {
589
- const total = values.length;
590
- const passedCount = values.filter((v) => v.passed).length;
591
- return {
592
- ...values[0],
593
- passed: total > 0 && values.every((v) => v.passed),
594
- passedCount,
595
- totalCount: total
596
- };
597
- },
598
- /** Take last value (no aggregation). Use when aggregation is not meaningful. */
599
- last(values) {
600
- return values[values.length - 1] ?? {};
601
- }
602
- };
603
- var Score = {
604
- aggregate: ScoreAggregate,
605
- of(config) {
606
- const def = {
607
- id: config.id,
608
- name: config.name,
609
- displayStrategy: config.displayStrategy,
610
- formatValue: config.formatValue,
611
- formatAggregate: config.formatAggregate,
612
- aggregateValues: config.aggregateValues,
613
- make: (data, options) => {
614
- const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
615
- return {
616
- id: config.id,
617
- data,
618
- ...passed !== void 0 && { passed },
619
- ...options?.name !== void 0 && { name: options.name },
620
- def
621
- // Attach def so rendering/aggregation works without registry lookup
622
- };
623
- }
624
- };
625
- registry2.set(config.id, def);
626
- return def;
627
- }
628
- };
629
- function getScoreById(id) {
630
- return registry2.get(id);
631
- }
632
-
633
- // src/evals/aggregators.ts
634
- function aggregateTokenCountSum(values) {
635
- const initial = {
636
- input: 0,
637
- output: 0,
638
- inputCached: 0,
639
- outputCached: 0
640
- };
641
- return values.reduce(
642
- (acc, v) => ({
643
- input: acc.input + (v.input ?? 0),
644
- output: acc.output + (v.output ?? 0),
645
- inputCached: acc.inputCached + (v.inputCached ?? 0),
646
- outputCached: acc.outputCached + (v.outputCached ?? 0)
647
- }),
648
- initial
649
- );
650
- }
651
- function aggregateLatencyAverage(values) {
652
- if (values.length === 0) {
653
- return { ms: 0 };
654
- }
655
- const sum = values.reduce((s, v) => s + v.ms, 0);
656
- return { ms: sum / values.length };
657
- }
658
-
659
- // src/evals/metrics/standard.ts
660
- var tokenCountMetric = Metric.of({
661
- id: "token-count",
662
- name: "Tokens",
663
- aggregate: aggregateTokenCountSum,
664
- format: (data, options) => {
665
- const input = data.input ?? 0;
666
- const output = data.output ?? 0;
667
- const inputCached = data.inputCached ?? 0;
668
- const outputCached = data.outputCached ?? 0;
669
- const cached = inputCached + outputCached;
670
- const base = `in:${input} out:${output} cached:${cached}`;
671
- return options?.isAggregated ? `Total: ${base}` : base;
672
- }
673
- });
674
- var latencyMetric = Metric.of({
675
- id: "latency",
676
- name: "Latency",
677
- aggregate: aggregateLatencyAverage,
678
- format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
679
- });
680
-
681
- // src/evals/scores/standard.ts
682
- var percentScore = Score.of({
683
- id: "percent",
684
- name: "Score",
685
- displayStrategy: "bar",
686
- formatValue: (data) => data.value.toFixed(2),
687
- formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
688
- aggregateValues: Score.aggregate.averageWithVariance(["value"])
689
- });
690
- var deltaScore = Score.of({
691
- id: "delta",
692
- name: "Delta",
693
- displayStrategy: "number",
694
- formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
695
- formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
696
- aggregateValues: Score.aggregate.averageFields(["value", "delta"])
697
- });
698
- var binaryScore = Score.of({
699
- id: "binary",
700
- name: "Result",
701
- displayStrategy: "passFail",
702
- formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
703
- formatAggregate: (data) => {
704
- const base = data.passed ? "All: PASSED" : "Some: FAILED";
705
- if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
706
- return `${base} (${data.passedCount}/${data.totalCount})`;
707
- }
708
- return base;
709
- },
710
- aggregateValues: Score.aggregate.all
711
- });
712
- function preprocessForDiff(value, options) {
713
- if (options?.sort && Array.isArray(value)) {
714
- return [...value].sort((a, b) => {
715
- const aStr = stringify(preprocessForDiff(a, options));
716
- const bStr = stringify(preprocessForDiff(b, options));
717
- return aStr.localeCompare(bStr);
718
- }).map((item) => preprocessForDiff(item, options));
533
+ function preprocessForDiff(value, options) {
534
+ if (options?.sort && Array.isArray(value)) {
535
+ return [...value].sort((a, b) => {
536
+ const aStr = stringify(preprocessForDiff(a, options));
537
+ const bStr = stringify(preprocessForDiff(b, options));
538
+ return aStr.localeCompare(bStr);
539
+ }).map((item) => preprocessForDiff(item, options));
719
540
  }
720
541
  if (value !== null && typeof value === "object" && !Array.isArray(value) && options?.excludeKeys) {
721
542
  const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
@@ -766,16 +587,8 @@ function createDiffString(expected, actual, diffOptions) {
766
587
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
767
588
  const actualProcessed = preprocessForDiff(actual, diffOptions);
768
589
  if (diffOptions?.keysOnly) {
769
- const expectedKeys = JSON.stringify(
770
- extractKeys(expectedProcessed),
771
- null,
772
- 2
773
- );
774
- const actualKeys = JSON.stringify(
775
- extractKeys(actualProcessed),
776
- null,
777
- 2
778
- );
590
+ const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
591
+ const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
779
592
  const parts2 = diffLines(expectedKeys, actualKeys);
780
593
  return formatDiffParts(parts2);
781
594
  }
@@ -786,9 +599,7 @@ function createDiffString(expected, actual, diffOptions) {
786
599
  }
787
600
  const parts = diffLines(expectedStr, actualStr);
788
601
  if (diffOptions?.outputNewOnly) {
789
- const filtered = parts.filter(
790
- (p) => p.added === true
791
- );
602
+ const filtered = parts.filter((p) => p.added === true);
792
603
  return formatDiffParts(filtered);
793
604
  }
794
605
  return formatDiffParts(parts);
@@ -853,14 +664,476 @@ function printJsonDiff(expected, actual, options = {}) {
853
664
  if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
854
665
  return `\x1B[32m${line}\x1B[0m`;
855
666
  }
856
- return line;
857
- });
858
- const colored = lines.join("\n");
859
- console.log(colored || "(no differences)");
860
- return colored;
667
+ return line;
668
+ });
669
+ const colored = lines.join("\n");
670
+ console.log(colored || "(no differences)");
671
+ return colored;
672
+ }
673
+ console.log(diff || "(no differences)");
674
+ return diff;
675
+ }
676
+
677
+ // src/evals/metric.ts
678
+ var registry = /* @__PURE__ */ new Map();
679
+ var Metric = {
680
+ of(config) {
681
+ const def = {
682
+ id: config.id,
683
+ name: config.name,
684
+ aggregate: config.aggregate,
685
+ format: config.format,
686
+ make: (data, options) => ({
687
+ id: config.id,
688
+ data,
689
+ ...options?.name !== void 0 && { name: options.name }
690
+ })
691
+ };
692
+ registry.set(config.id, def);
693
+ return def;
694
+ }
695
+ };
696
+ function getMetricById(id) {
697
+ return registry.get(id);
698
+ }
699
+
700
+ // src/evals/aggregators.ts
701
+ function aggregateTokenCountSum(values) {
702
+ const initial = {
703
+ input: 0,
704
+ output: 0,
705
+ inputCached: 0,
706
+ outputCached: 0
707
+ };
708
+ return values.reduce(
709
+ (acc, v) => ({
710
+ input: acc.input + (v.input ?? 0),
711
+ output: acc.output + (v.output ?? 0),
712
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
713
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
714
+ }),
715
+ initial
716
+ );
717
+ }
718
+ function aggregateLatencyAverage(values) {
719
+ if (values.length === 0) {
720
+ return { ms: 0 };
721
+ }
722
+ const sum = values.reduce((s, v) => s + v.ms, 0);
723
+ return { ms: sum / values.length };
724
+ }
725
+
726
+ // src/evals/metrics/standard.ts
727
+ var tokenCountMetric = Metric.of({
728
+ id: "token-count",
729
+ name: "Tokens",
730
+ aggregate: aggregateTokenCountSum,
731
+ format: (data, options) => {
732
+ const input = data.input ?? 0;
733
+ const output = data.output ?? 0;
734
+ const inputCached = data.inputCached ?? 0;
735
+ const outputCached = data.outputCached ?? 0;
736
+ const cached = inputCached + outputCached;
737
+ const base = `in:${input} out:${output} cached:${cached}`;
738
+ return options?.isAggregated ? `Total: ${base}` : base;
739
+ }
740
+ });
741
+ var latencyMetric = Metric.of({
742
+ id: "latency",
743
+ name: "Latency",
744
+ aggregate: aggregateLatencyAverage,
745
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
746
+ });
747
+
748
+ // src/evals/run-config.ts
749
+ function validateRow(row, index) {
750
+ const hasEvaluators = "evaluators" in row && row.evaluators !== void 0 && row.evaluators !== void 0;
751
+ const hasPattern = "evaluatorPattern" in row && typeof row.evaluatorPattern === "string" && row.evaluatorPattern.trim().length > 0;
752
+ if (hasEvaluators && hasPattern) {
753
+ throw new Error(`RunConfig run[${index}] must not set both evaluators and evaluatorPattern`);
754
+ }
755
+ if (!hasEvaluators && !hasPattern) {
756
+ throw new Error(`RunConfig run[${index}] must set either evaluators or evaluatorPattern`);
757
+ }
758
+ if (hasEvaluators && row.evaluators.length === 0) {
759
+ throw new Error(`RunConfig run[${index}]: evaluators must be non-empty`);
760
+ }
761
+ const rawRep = "repetitions" in row ? row.repetitions : void 0;
762
+ const repetitions = rawRep ?? 1;
763
+ if (!Number.isInteger(repetitions) || repetitions < 1) {
764
+ throw new Error(
765
+ `RunConfig run[${index}]: repetitions must be a positive integer, got ${String(rawRep)}`
766
+ );
767
+ }
768
+ }
769
+ var RunConfig = class _RunConfig {
770
+ constructor(name, displayName, tags, runs) {
771
+ this._name = name;
772
+ this._displayName = displayName;
773
+ this._tags = tags;
774
+ this._runs = runs;
775
+ }
776
+ static define(config) {
777
+ if (config.runs.length === 0) {
778
+ throw new Error("RunConfig runs must be non-empty");
779
+ }
780
+ config.runs.forEach(validateRow);
781
+ const name = validateRunConfigName(config.name, "RunConfig.define");
782
+ const displayName = normalizeOptionalDisplayName(config.displayName);
783
+ const tags = config.tags !== void 0 ? [...config.tags] : [];
784
+ return new _RunConfig(name, displayName, tags, config.runs);
785
+ }
786
+ /** Canonical id (branded {@link RunConfigName} at runtime; typed as `string` for ergonomics). */
787
+ getName() {
788
+ return this._name;
789
+ }
790
+ /** Optional unrestricted display label. */
791
+ getDisplayName() {
792
+ return this._displayName;
793
+ }
794
+ /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
795
+ getDisplayLabel() {
796
+ return this._displayName ?? this._name;
797
+ }
798
+ /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
799
+ getTags() {
800
+ return [...this._tags];
801
+ }
802
+ getRuns() {
803
+ return this._runs;
804
+ }
805
+ };
806
+
807
+ // src/evals/score.ts
808
+ var registry2 = /* @__PURE__ */ new Map();
809
+ function formatScoreData(def, data, options) {
810
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
811
+ }
812
+ var ScoreAggregate = {
813
+ /** Average numeric fields. Use for scores like { value, delta }. */
814
+ averageFields(fields) {
815
+ return (values) => {
816
+ const count = values.length || 1;
817
+ const result = {};
818
+ for (const field of fields) {
819
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
820
+ }
821
+ return result;
822
+ };
823
+ },
824
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
825
+ averageWithVariance(fields) {
826
+ return (values) => {
827
+ const count = values.length;
828
+ const result = {};
829
+ for (const field of fields) {
830
+ result[field] = count === 0 ? 0 : values.reduce(
831
+ (sum, item) => sum + (item[field] ?? 0),
832
+ 0
833
+ ) / count;
834
+ }
835
+ const valueField = "value";
836
+ const hasValueField = fields.includes(valueField);
837
+ if (count === 0) {
838
+ if (hasValueField) {
839
+ result[valueField] = 0;
840
+ }
841
+ return {
842
+ ...result,
843
+ stdDev: void 0,
844
+ count: 0
845
+ };
846
+ }
847
+ let stdDev;
848
+ if (hasValueField && count >= 2) {
849
+ const sum = values.reduce(
850
+ (s, v) => s + (v[valueField] ?? 0),
851
+ 0
852
+ );
853
+ const sumSq = values.reduce((s, v) => {
854
+ const value = v[valueField] ?? 0;
855
+ return s + value * value;
856
+ }, 0);
857
+ const mean = sum / count;
858
+ const variance = (sumSq - count * mean * mean) / (count - 1);
859
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
860
+ }
861
+ return {
862
+ ...values[0],
863
+ ...result,
864
+ stdDev,
865
+ count
866
+ };
867
+ };
868
+ },
869
+ /** All runs must pass. Use for binary scores. */
870
+ all(values) {
871
+ const total = values.length;
872
+ const passedCount = values.filter((v) => v.passed).length;
873
+ return {
874
+ ...values[0],
875
+ passed: total > 0 && values.every((v) => v.passed),
876
+ passedCount,
877
+ totalCount: total
878
+ };
879
+ },
880
+ /** Take last value (no aggregation). Use when aggregation is not meaningful. */
881
+ last(values) {
882
+ return values[values.length - 1] ?? {};
883
+ }
884
+ };
885
+ var Score = {
886
+ aggregate: ScoreAggregate,
887
+ of(config) {
888
+ const def = {
889
+ id: config.id,
890
+ name: config.name,
891
+ displayStrategy: config.displayStrategy,
892
+ formatValue: config.formatValue,
893
+ formatAggregate: config.formatAggregate,
894
+ aggregateValues: config.aggregateValues,
895
+ make: (data, options) => {
896
+ const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
897
+ return {
898
+ id: config.id,
899
+ data,
900
+ ...passed !== void 0 && { passed },
901
+ ...options?.name !== void 0 && { name: options.name },
902
+ def
903
+ // Attach def so rendering/aggregation works without registry lookup
904
+ };
905
+ }
906
+ };
907
+ registry2.set(config.id, def);
908
+ return def;
909
+ }
910
+ };
911
+ function getScoreById(id) {
912
+ return registry2.get(id);
913
+ }
914
+
915
+ // src/evals/scores/standard.ts
916
+ var percentScore = Score.of({
917
+ id: "percent",
918
+ name: "Score",
919
+ displayStrategy: "bar",
920
+ formatValue: (data) => data.value.toFixed(2),
921
+ formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
922
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
923
+ });
924
+ var deltaScore = Score.of({
925
+ id: "delta",
926
+ name: "Delta",
927
+ displayStrategy: "number",
928
+ formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
929
+ formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
930
+ aggregateValues: Score.aggregate.averageFields(["value", "delta"])
931
+ });
932
+ var binaryScore = Score.of({
933
+ id: "binary",
934
+ name: "Result",
935
+ displayStrategy: "passFail",
936
+ formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
937
+ formatAggregate: (data) => {
938
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
939
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
940
+ return `${base} (${data.passedCount}/${data.totalCount})`;
941
+ }
942
+ return base;
943
+ },
944
+ aggregateValues: Score.aggregate.all
945
+ });
946
+
947
+ // src/evals/tag-set.ts
948
+ var TagSet = class {
949
+ constructor() {
950
+ }
951
+ static define(tags) {
952
+ const out = {};
953
+ for (const tag of tags) {
954
+ out[tag] = tag;
955
+ }
956
+ return out;
957
+ }
958
+ };
959
+
960
+ // src/evals/test-case.ts
961
+ function resolve(value) {
962
+ return typeof value === "function" ? value() : value;
963
+ }
964
/**
 * A test case backed by the configuration object it was constructed with.
 * `TestCase.describe` is the validating factory; the accessors below read
 * straight from the stored configuration.
 */
var TestCase = class _TestCase {
  constructor(config) {
    this._config = config;
  }
  /**
   * Validates `config.name`, normalizes `config.displayName` and builds a test
   * case from the recognised configuration fields (unknown fields are dropped).
   */
  static describe(config) {
    const validatedName = validateTestCaseName(config.name, "TestCase.describe");
    const normalizedDisplayName = normalizeOptionalDisplayName(config.displayName);
    const describedConfig = {
      name: validatedName,
      displayName: normalizedDisplayName,
      tags: config.tags,
      inputSchema: config.inputSchema,
      input: config.input,
      outputSchema: config.outputSchema,
      output: config.output
    };
    return new _TestCase(describedConfig);
  }
  getName() {
    const { name } = this._config;
    return name;
  }
  getDisplayName() {
    const { displayName } = this._config;
    return displayName;
  }
  /** Display name when one is set, otherwise the name. */
  getDisplayLabel() {
    const { displayName, name } = this._config;
    return displayName ?? name;
  }
  getTags() {
    const { tags } = this._config;
    return tags;
  }
  getInputSchema() {
    const { inputSchema } = this._config;
    return inputSchema;
  }
  /** Input value; a function-valued `input` is invoked on every call. */
  getInput() {
    return resolve(this._config.input);
  }
  getOutputSchema() {
    const { outputSchema } = this._config;
    return outputSchema;
  }
  /** Expected output, resolved like the input; `undefined` when none was configured. */
  getOutput() {
    const { output } = this._config;
    return output === undefined ? undefined : resolve(output);
  }
};
1009
/**
 * Best-effort label for a test-case-like object: `getDisplayLabel()` when the
 * object has that method, else `getName()`, else the empty string.
 */
function getTestCaseDisplayLabel(testCase) {
  const hasDisplayLabel = typeof testCase.getDisplayLabel === "function";
  if (hasDisplayLabel) {
    return testCase.getDisplayLabel();
  }
  if (typeof testCase.getName === "function") {
    return testCase.getName();
  }
  return "";
}
1015
/**
 * Returns a fresh array with the tags of a test-case-like object.
 *
 * Objects without a `getTags` method yield an empty list. A `getTags()` that
 * returns `null`/`undefined` (possible because `TestCase.describe` stores
 * `config.tags` without validating it) also yields an empty list instead of
 * throwing "not iterable" from the spread.
 *
 * @param testCase object that may expose `getTags()`
 * @returns a new array containing the tags (never the original array)
 */
function getTestCaseTagList(testCase) {
  if (typeof testCase.getTags !== "function") {
    return [];
  }
  const tags = testCase.getTags();
  return tags == null ? [] : [...tags];
}
1018
/**
 * Rebuilds run snapshots from the `.jsonl` artifacts in `config.artifactDirectory`.
 *
 * A directory that cannot be listed yields an empty array, and artifacts that
 * fail to load are skipped, so one bad file does not hide the others.
 *
 * @param config object providing `artifactDirectory`
 * @returns snapshots ordered by `queuedAt`, newest first
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const artifactRoot = resolve$1(config.artifactDirectory);
  let directoryEntries;
  try {
    directoryEntries = await readdir(artifactRoot);
  } catch {
    return [];
  }
  const artifactNames = directoryEntries.filter((name) => name.endsWith(".jsonl"));
  const loaded = [];
  // Artifacts are read one at a time, in directory-listing order.
  for (const artifactName of artifactNames) {
    const artifactFile = join(artifactRoot, artifactName);
    let snapshot = null;
    try {
      snapshot = await parseArtifactToSnapshot(artifactFile, config);
    } catch {
      // Best effort: an artifact that cannot be read or parsed is left out.
    }
    if (snapshot) {
      loaded.push(snapshot);
    }
  }
  loaded.sort((left, right) => right.queuedAt - left.queuedAt);
  return loaded;
}
1040
/**
 * Reads one JSON-lines artifact and condenses its run events into a snapshot.
 *
 * The last `RunQueued`, `RunStarted`, `RunCompleted` and `RunFailed` events
 * found in the file are used; lines that are not valid JSON event objects are
 * ignored. Status precedence is failed > completed > running > queued.
 *
 * @param filePath path of the artifact; also reported as `artifactPath`
 * @param _config unused
 * @returns the snapshot, or `null` when the file has no non-blank lines or no
 *          `RunQueued` event
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const fileText = await readFile(filePath, "utf8");
  const lines = fileText.split("\n").filter((line) => line.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }
  let runQueued = null;
  let runStarted = null;
  let runCompleted = null;
  let runFailed = null;
  for (const line of lines) {
    try {
      const event = JSON.parse(line);
      switch (event.type) {
        case "RunQueued":
          runQueued = {
            runId: event.runId,
            datasetId: event.datasetId,
            datasetName: event.datasetName,
            evaluatorIds: event.evaluatorIds,
            totalTestCases: event.totalTestCases ?? 0,
            artifactPath: event.artifactPath ?? filePath,
            ts: event.ts
          };
          break;
        case "RunStarted":
          runStarted = { startedAt: event.startedAt };
          break;
        case "RunCompleted":
          runCompleted = {
            passedTestCases: event.passedTestCases,
            failedTestCases: event.failedTestCases,
            totalTestCases: event.totalTestCases,
            finishedAt: event.finishedAt
          };
          break;
        case "RunFailed":
          runFailed = {
            finishedAt: event.finishedAt,
            errorMessage: event.errorMessage
          };
          break;
        default:
          break;
      }
    } catch {
      // Unparseable lines (and JSON values without a readable `type`) are skipped.
    }
  }
  if (!runQueued) {
    return null;
  }
  let status = "queued";
  if (runFailed) {
    status = "failed";
  } else if (runCompleted) {
    status = "completed";
  } else if (runStarted) {
    status = "running";
  }
  const progress = aggregateTestCaseProgress(lines);
  return {
    runId: runQueued.runId,
    datasetId: runQueued.datasetId,
    datasetName: runQueued.datasetName,
    evaluatorIds: runQueued.evaluatorIds,
    queuedAt: runQueued.ts ?? 0,
    startedAt: runStarted?.startedAt,
    finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
    totalTestCases: runQueued.totalTestCases,
    // A completed run reports every queued test case as completed; otherwise
    // the counters come from the progress events seen so far.
    completedTestCases: runCompleted ? runQueued.totalTestCases : progress.completedTestCases,
    passedTestCases: runCompleted?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: runCompleted?.failedTestCases ?? progress.failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: runFailed?.errorMessage
  };
}
1111
/**
 * Derives progress counters from the `TestCaseProgress` events of an artifact.
 *
 * A test case counts as passed only when every progress record seen for its
 * `testCaseId` is truthy in `passed`. A record without `passed` counts as a
 * failure for that test case. It used to be stored as `undefined`, which the
 * next record could not tell apart from "not seen yet", so a later passing
 * record silently erased it.
 *
 * @param lines JSON-lines strings; lines that are not valid JSON event objects
 *              are ignored
 * @returns `{ completedTestCases, passedTestCases, failedTestCases }` where
 *          `completedTestCases` is the last `completedTestCases` value reported
 */
function aggregateTestCaseProgress(lines) {
  let completedTestCases = 0;
  const passedByTestCase = /* @__PURE__ */ new Map();
  for (const line of lines) {
    try {
      const event = JSON.parse(line);
      if (event.type !== "TestCaseProgress") {
        continue;
      }
      completedTestCases = event.completedTestCases ?? completedTestCases;
      const id = event.testCaseId;
      // `has` (not a value check) decides whether this id was seen before.
      const passedSoFar = passedByTestCase.has(id) ? passedByTestCase.get(id) : true;
      passedByTestCase.set(id, passedSoFar && Boolean(event.passed));
    } catch {
      // Unparseable lines are skipped.
    }
  }
  let passedTestCases = 0;
  let failedTestCases = 0;
  for (const passed of passedByTestCase.values()) {
    if (passed) {
      passedTestCases += 1;
    } else {
      failedTestCases += 1;
    }
  }
  return { completedTestCases, passedTestCases, failedTestCases };
}
865
1138
 
866
1139
  // src/runner/config.ts
@@ -871,18 +1144,9 @@ var defaultRunnerConfig = {
871
1144
  discovery: {
872
1145
  rootDir: process.cwd(),
873
1146
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
874
- evaluatorSuffixes: [
875
- ".evaluator.ts",
876
- ".evaluator.tsx",
877
- ".evaluator.js",
878
- ".evaluator.mjs"
879
- ],
880
- testCaseSuffixes: [
881
- ".test-case.ts",
882
- ".test-case.tsx",
883
- ".test-case.js",
884
- ".test-case.mjs"
885
- ],
1147
+ evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
1148
+ runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
1149
+ testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
886
1150
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
887
1151
  },
888
1152
  artifactDirectory: ".eval-results",
@@ -907,6 +1171,11 @@ function toRunnerConfigOverrides(config) {
907
1171
  } else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
908
1172
  discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
909
1173
  }
1174
+ if (rawDiscovery?.runConfigFilePatterns !== void 0) {
1175
+ discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
1176
+ } else if (rawDiscovery?.runConfigSuffixes !== void 0) {
1177
+ discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
1178
+ }
910
1179
  if (rawDiscovery?.testCaseFilePatterns !== void 0) {
911
1180
  discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
912
1181
  } else if (rawDiscovery?.testCaseSuffixes !== void 0) {
@@ -949,14 +1218,15 @@ function getJitiLoader() {
949
1218
  }
950
1219
  const createJiti2 = jitiModule.createJiti ?? jitiModule.default;
951
1220
  if (typeof createJiti2 !== "function") {
952
- throw new Error(
953
- "Failed to initialize jiti for m4trix eval config loading."
954
- );
1221
+ throw new Error("Failed to initialize jiti for m4trix eval config loading.");
955
1222
  }
956
- cachedLoader = createJiti2(import.meta.url, {
957
- interopDefault: true,
958
- moduleCache: true
959
- });
1223
+ cachedLoader = createJiti2(
1224
+ import.meta.url,
1225
+ {
1226
+ interopDefault: true,
1227
+ moduleCache: true
1228
+ }
1229
+ );
960
1230
  return cachedLoader;
961
1231
  }
962
1232
  function resolveConfigModuleExport(loadedModule) {
@@ -1004,6 +1274,9 @@ function isDatasetLike(value) {
1004
1274
  function isEvaluatorLike(value) {
1005
1275
  return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
1006
1276
  }
1277
// Duck-type check: a RunConfig must expose getName and a callable getRuns.
function isRunConfigLike(value) {
  if (!hasMethod(value, "getName") || !hasMethod(value, "getRuns")) {
    return false;
  }
  return typeof value.getRuns === "function";
}
1007
1280
  function isTestCaseLike(value) {
1008
1281
  return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
1009
1282
  }
@@ -1060,9 +1333,7 @@ async function loadModuleExports(filePath) {
1060
1333
  }
1061
1334
  async function collectDatasetsFromFiles(config) {
1062
1335
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1063
- const matched = files.filter(
1064
- (filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
1065
- );
1336
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
1066
1337
  const found = await Promise.all(
1067
1338
  matched.map(async (absolutePath) => {
1068
1339
  const exports = await loadModuleExports(absolutePath);
@@ -1079,9 +1350,7 @@ async function collectDatasetsFromFiles(config) {
1079
1350
  }
1080
1351
  async function collectEvaluatorsFromFiles(config) {
1081
1352
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1082
- const matched = files.filter(
1083
- (filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
1084
- );
1353
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
1085
1354
  const found = await Promise.all(
1086
1355
  matched.map(async (absolutePath) => {
1087
1356
  const exports = await loadModuleExports(absolutePath);
@@ -1096,11 +1365,26 @@ async function collectEvaluatorsFromFiles(config) {
1096
1365
  );
1097
1366
  return found.flat();
1098
1367
  }
1099
- async function collectTestCasesFromFiles(config) {
1368
/**
 * Walks `config.rootDir`, loads every file whose name ends in one of
 * `config.runConfigSuffixes`, and collects the RunConfig-like exports.
 *
 * @param config discovery settings (`rootDir`, `excludeDirectories`, `runConfigSuffixes`)
 * @returns flat list of `{ id, filePath, runConfig }`, where `id` is the
 *          RunConfig's name and `filePath` is relative to `rootDir`
 */
async function collectRunConfigsFromFiles(config) {
  const allFiles = await walkDirectory(config.rootDir, config.excludeDirectories);
  const runConfigFiles = allFiles.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
  const loadFromFile = async (absolutePath) => {
    const moduleExports = await loadModuleExports(absolutePath);
    const runConfigs = moduleExports.filter(isRunConfigLike);
    const filePath = relative(config.rootDir, absolutePath);
    return runConfigs.map((runConfig) => ({ id: runConfig.getName(), filePath, runConfig }));
  };
  // All matching files are loaded concurrently; results keep file order.
  const perFile = await Promise.all(runConfigFiles.map((absolutePath) => loadFromFile(absolutePath)));
  return perFile.flat();
}
1385
+ async function collectTestCasesFromFiles(config) {
1386
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1387
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
1104
1388
  const found = await Promise.all(
1105
1389
  matched.map(async (absolutePath) => {
1106
1390
  const exports = await loadModuleExports(absolutePath);
@@ -1190,15 +1474,17 @@ function readOutput(testCase) {
1190
1474
  }
1191
1475
  return candidate.getOutput();
1192
1476
  }
1193
- function buildEvaluationUnits(testCases) {
1477
+ function buildEvaluationUnits(testCases, repetitionCount) {
1478
+ const count = Math.max(1, repetitionCount);
1194
1479
  const units = [];
1195
1480
  for (const testCaseItem of testCases) {
1196
- const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1197
- for (let r = 0; r < rerunTotal; r++) {
1481
+ const repetitionId = `rep-${randomUUID()}`;
1482
+ for (let r = 0; r < count; r++) {
1198
1483
  units.push({
1199
1484
  testCaseItem,
1200
- rerunIndex: r + 1,
1201
- rerunTotal
1485
+ repetitionId,
1486
+ repetitionIndex: r + 1,
1487
+ repetitionCount: count
1202
1488
  });
1203
1489
  }
1204
1490
  }
@@ -1208,29 +1494,24 @@ function nowIsoForFile() {
1208
1494
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1209
1495
  }
1210
1496
  function createArtifactPath(artifactDirectory, datasetId, runId) {
1211
- return join(
1212
- artifactDirectory,
1213
- `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1214
- );
1497
+ return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
1215
1498
  }
1216
1499
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1217
- const { testCaseItem, rerunIndex, rerunTotal } = unit;
1500
+ const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
1218
1501
  return Effect.gen(function* () {
1219
1502
  const evaluatorRunId = `run-${randomUUID()}`;
1220
1503
  const started = Date.now();
1221
- const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
1222
- n + 1,
1223
- n + 1
1224
- ]);
1504
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [n + 1, n + 1]);
1225
1505
  yield* publishEvent({
1226
1506
  type: "TestCaseStarted",
1227
1507
  runId: task.runId,
1228
1508
  testCaseId: testCaseItem.id,
1229
- testCaseName: testCaseItem.testCase.getName(),
1509
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1230
1510
  startedTestCases: startedEvaluations,
1231
1511
  totalTestCases: totalEvaluations,
1232
- rerunIndex,
1233
- rerunTotal
1512
+ repetitionId,
1513
+ repetitionIndex,
1514
+ repetitionCount
1234
1515
  });
1235
1516
  const evaluatorScores = [];
1236
1517
  let testCaseError;
@@ -1254,9 +1535,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1254
1535
  return error;
1255
1536
  };
1256
1537
  try {
1257
- const ctx = yield* Effect.promise(
1258
- () => Promise.resolve(evaluator.resolveContext())
1259
- );
1538
+ const ctx = yield* Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
1260
1539
  const result = yield* Effect.promise(
1261
1540
  () => Promise.resolve().then(
1262
1541
  () => evaluateFn({
@@ -1266,8 +1545,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1266
1545
  meta: {
1267
1546
  triggerId: task.triggerId,
1268
1547
  runId: evaluatorRunId,
1269
- datasetId: task.datasetId
1548
+ datasetId: task.datasetId,
1549
+ repetitionId,
1550
+ repetitionIndex,
1551
+ repetitionCount,
1552
+ runConfigName: task.runConfigName
1270
1553
  },
1554
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1555
+ runConfigTags: task.runConfigTags,
1556
+ evaluatorTags: getEvaluatorTagList(evaluator),
1271
1557
  logDiff,
1272
1558
  log,
1273
1559
  createError
@@ -1310,21 +1596,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1310
1596
  });
1311
1597
  }
1312
1598
  }
1313
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1314
- const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
1315
- n + 1,
1316
- n + 1
1317
- ]);
1599
+ const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
1600
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
1318
1601
  const progressEvent = {
1319
1602
  type: "TestCaseProgress",
1320
1603
  runId: task.runId,
1321
1604
  testCaseId: testCaseItem.id,
1322
- testCaseName: testCaseItem.testCase.getName(),
1605
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1323
1606
  completedTestCases: completedEvaluations,
1324
1607
  totalTestCases: totalEvaluations,
1325
- rerunIndex,
1326
- rerunTotal,
1327
- passed: rerunPassedThis,
1608
+ repetitionId,
1609
+ repetitionIndex,
1610
+ repetitionCount,
1611
+ passed: repetitionPassedThis,
1328
1612
  durationMs: Date.now() - started,
1329
1613
  evaluatorScores,
1330
1614
  output,
@@ -1345,9 +1629,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1345
1629
  (map) => {
1346
1630
  const key = testCaseItem.id;
1347
1631
  const existing = map.get(key) ?? { completedCount: 0, results: [] };
1348
- const newResults = [...existing.results, rerunPassedThis];
1632
+ const newResults = [...existing.results, repetitionPassedThis];
1349
1633
  const newCompletedCount = existing.completedCount + 1;
1350
- const isLast = newCompletedCount === rerunTotal;
1634
+ const isLast = newCompletedCount === repetitionCount;
1351
1635
  const newMap = new Map(map);
1352
1636
  newMap.set(key, {
1353
1637
  completedCount: newCompletedCount,
@@ -1363,10 +1647,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1363
1647
  } else {
1364
1648
  yield* Ref.update(failedRef, (n) => n + 1);
1365
1649
  }
1366
- const [passed, failed] = yield* Effect.all([
1367
- Ref.get(passedRef),
1368
- Ref.get(failedRef)
1369
- ]);
1650
+ const [passed, failed] = yield* Effect.all([Ref.get(passedRef), Ref.get(failedRef)]);
1370
1651
  yield* updateSnapshot(task.runId, (snapshot) => ({
1371
1652
  ...snapshot,
1372
1653
  passedTestCases: passed,
@@ -1387,10 +1668,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1387
1668
  runId: task.runId,
1388
1669
  startedAt
1389
1670
  });
1390
- const totalEvaluations = task.testCases.reduce(
1391
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1392
- 0
1393
- );
1671
+ const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
1394
1672
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1395
1673
  const completedRef = yield* Ref.make(0);
1396
1674
  const startedRef = yield* Ref.make(0);
@@ -1399,7 +1677,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1399
1677
  const testCaseResultsRef = yield* Ref.make(
1400
1678
  /* @__PURE__ */ new Map()
1401
1679
  );
1402
- const evaluationUnits = buildEvaluationUnits(task.testCases);
1680
+ const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
1403
1681
  const processEvaluation = (unit) => processOneEvaluation(
1404
1682
  task,
1405
1683
  unit,
@@ -1413,11 +1691,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1413
1691
  failedRef,
1414
1692
  testCaseResultsRef
1415
1693
  );
1416
- yield* Effect.forEach(
1417
- evaluationUnits,
1418
- processEvaluation,
1419
- maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1420
- );
1694
+ const globalSem = task.globalEvaluationSemaphore;
1695
+ if (globalSem !== void 0) {
1696
+ yield* Effect.forEach(
1697
+ evaluationUnits,
1698
+ (unit) => globalSem.withPermits(1)(processEvaluation(unit)),
1699
+ { concurrency: "unbounded", discard: true }
1700
+ );
1701
+ } else {
1702
+ yield* Effect.forEach(
1703
+ evaluationUnits,
1704
+ processEvaluation,
1705
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1706
+ );
1707
+ }
1421
1708
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
1422
1709
  Ref.get(completedRef),
1423
1710
  Ref.get(passedRef),
@@ -1453,125 +1740,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1453
1740
  artifactPath: task.snapshot.artifactPath
1454
1741
  });
1455
1742
  });
1456
- async function loadRunSnapshotsFromArtifacts(config) {
1457
- const baseDir = resolve$1(config.artifactDirectory);
1458
- let entries;
1459
- try {
1460
- entries = await readdir(baseDir);
1461
- } catch {
1462
- return [];
1463
- }
1464
- const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1465
- const snapshots = [];
1466
- for (const fileName of jsonlFiles) {
1467
- const filePath = join(baseDir, fileName);
1468
- try {
1469
- const snapshot = await parseArtifactToSnapshot(filePath, config);
1470
- if (snapshot) {
1471
- snapshots.push(snapshot);
1472
- }
1473
- } catch {
1474
- }
1475
- }
1476
- return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
1477
- }
1478
- async function parseArtifactToSnapshot(filePath, _config) {
1479
- const content = await readFile(filePath, "utf8");
1480
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1481
- if (lines.length === 0) {
1482
- return null;
1483
- }
1484
- let runQueued = null;
1485
- let runCompleted = null;
1486
- let runFailed = null;
1487
- let runStarted = null;
1488
- for (const line of lines) {
1489
- try {
1490
- const event = JSON.parse(line);
1491
- const type = event.type;
1492
- if (type === "RunQueued") {
1493
- runQueued = {
1494
- runId: event.runId,
1495
- datasetId: event.datasetId,
1496
- datasetName: event.datasetName,
1497
- evaluatorIds: event.evaluatorIds,
1498
- totalTestCases: event.totalTestCases ?? 0,
1499
- artifactPath: event.artifactPath ?? filePath,
1500
- ts: event.ts
1501
- };
1502
- }
1503
- if (type === "RunStarted") {
1504
- runStarted = { startedAt: event.startedAt };
1505
- }
1506
- if (type === "RunCompleted") {
1507
- runCompleted = {
1508
- passedTestCases: event.passedTestCases,
1509
- failedTestCases: event.failedTestCases,
1510
- totalTestCases: event.totalTestCases,
1511
- finishedAt: event.finishedAt
1512
- };
1513
- }
1514
- if (type === "RunFailed") {
1515
- runFailed = {
1516
- finishedAt: event.finishedAt,
1517
- errorMessage: event.errorMessage
1518
- };
1519
- }
1520
- } catch {
1521
- }
1743
+
1744
+ // src/runner/name-pattern.ts
1745
/**
 * Splits a `/source/flags` string into its parts.
 *
 * @param pattern candidate regex literal
 * @returns `{ source, flags }`, or `undefined` when the pattern does not start
 *          with "/" or has no second "/" closing the source
 */
function parseRegexLiteral(pattern) {
  if (!pattern.startsWith("/")) {
    return void 0;
  }
  const lastSlash = pattern.lastIndexOf("/");
  if (lastSlash <= 0) {
    return void 0;
  }
  return {
    source: pattern.slice(1, lastSlash),
    flags: pattern.slice(lastSlash + 1)
  };
}
/**
 * Builds a predicate that tests names against a user-supplied pattern.
 *
 * The trimmed pattern is interpreted as, in order:
 * 1. a regex literal (`/source/flags`), compiled as written;
 * 2. a wildcard pattern when it contains `*`: `*` matches any run of
 *    characters, everything else is literal, and the match is anchored and
 *    case-insensitive;
 * 3. otherwise a case-insensitive exact match.
 *
 * @param pattern pattern string
 * @returns `(value) => boolean`
 * @throws {SyntaxError} when a regex literal has an invalid source or flags
 */
function createNameMatcher(pattern) {
  const normalizedPattern = pattern.trim();
  const regexLiteral = parseRegexLiteral(normalizedPattern);
  if (regexLiteral) {
    const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
    return (value) => {
      // With the `g` or `y` flag, `test` resumes from `lastIndex` left by the
      // previous call. Reset it so every name is tested from the start.
      regex.lastIndex = 0;
      return regex.test(value);
    };
  }
  if (normalizedPattern.includes("*")) {
    // Escape every regex metacharacter except `*` (including `?`, which would
    // otherwise act as a quantifier or make the pattern invalid).
    const escaped = normalizedPattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
    const regex = new RegExp(`^${escaped}$`, "i");
    return (value) => regex.test(value);
  }
  const loweredPattern = normalizedPattern.toLowerCase();
  return (value) => value.toLowerCase() === loweredPattern;
}
1576
1772
  async function appendJsonLine(artifactPath, payload) {
1577
1773
  await mkdir(dirname(artifactPath), { recursive: true });
@@ -1630,32 +1826,12 @@ function searchCollectedTestCases(all, query) {
1630
1826
  }
1631
1827
 
1632
1828
  // src/runner/api.ts
1633
- function parseRegexLiteral(pattern) {
1634
- if (!pattern.startsWith("/")) {
1635
- return void 0;
1636
- }
1637
- const lastSlash = pattern.lastIndexOf("/");
1638
- if (lastSlash <= 0) {
1639
- return void 0;
1640
- }
1641
- return {
1642
- source: pattern.slice(1, lastSlash),
1643
- flags: pattern.slice(lastSlash + 1)
1644
- };
1645
- }
1646
- function createNameMatcher(pattern) {
1647
- const normalizedPattern = pattern.trim();
1648
- const regexLiteral = parseRegexLiteral(normalizedPattern);
1649
- if (regexLiteral) {
1650
- const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1651
- return (value) => regex.test(value);
1652
- }
1653
- if (normalizedPattern.includes("*")) {
1654
- const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1655
- const regex = new RegExp(`^${escaped}$`, "i");
1656
- return (value) => regex.test(value);
1829
/**
 * Validates a repetition count, defaulting a missing one to 1.
 *
 * @param value requested repetitions; `null`/`undefined` mean "use 1"
 * @returns the count as given (or 1)
 * @throws {Error} when the value is not an integer greater than or equal to 1
 */
function normalizeRunRepetitions(value) {
  const repetitions = value ?? 1;
  const isPositiveInteger = Number.isInteger(repetitions) && repetitions >= 1;
  if (isPositiveInteger) {
    return repetitions;
  }
  throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
}
1660
1836
  function mergeRunnerOverrides(base, next) {
1661
1837
  if (!base) {
@@ -1686,15 +1862,12 @@ var EffectRunner = class {
1686
1862
  this.persistenceQueue = Effect.runSync(
1687
1863
  Queue.unbounded()
1688
1864
  );
1689
- this.snapshotsRef = Effect.runSync(
1690
- Ref.make(/* @__PURE__ */ new Map())
1691
- );
1865
+ this.snapshotsRef = Effect.runSync(Ref.make(/* @__PURE__ */ new Map()));
1692
1866
  this.listeners = /* @__PURE__ */ new Set();
1693
1867
  this.datasetsById = /* @__PURE__ */ new Map();
1694
1868
  this.evaluatorsById = /* @__PURE__ */ new Map();
1695
- this.schedulerFiber = Effect.runFork(
1696
- this.createSchedulerEffect()
1697
- );
1869
+ this.runConfigsById = /* @__PURE__ */ new Map();
1870
+ this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
1698
1871
  this.persistenceFiber = Effect.runFork(
1699
1872
  createPersistenceWorker(this.persistenceQueue)
1700
1873
  );
@@ -1734,6 +1907,137 @@ var EffectRunner = class {
1734
1907
  (item) => matcher(item.evaluator.getName() ?? "")
1735
1908
  );
1736
1909
  }
1910
+ async collectRunConfigs() {
1911
+ const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
1912
+ this.runConfigsById.clear();
1913
+ const byNameLower = /* @__PURE__ */ new Map();
1914
+ for (const item of runConfigs) {
1915
+ const id = item.runConfig.getName();
1916
+ const lower = id.toLowerCase();
1917
+ const prev = byNameLower.get(lower);
1918
+ if (prev !== void 0 && prev.filePath !== item.filePath) {
1919
+ throw new Error(
1920
+ `Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
1921
+ );
1922
+ }
1923
+ byNameLower.set(lower, item);
1924
+ this.runConfigsById.set(id, item);
1925
+ }
1926
+ return runConfigs;
1927
+ }
1928
+ async resolveRunConfigByName(name) {
1929
+ if (this.runConfigsById.size === 0) {
1930
+ await this.collectRunConfigs();
1931
+ }
1932
+ const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
1933
+ const keyLower = key.toLowerCase();
1934
+ const matches = Array.from(this.runConfigsById.values()).filter(
1935
+ (item) => item.runConfig.getName().toLowerCase() === keyLower
1936
+ );
1937
+ if (matches.length === 0) {
1938
+ return void 0;
1939
+ }
1940
+ if (matches.length > 1) {
1941
+ throw new Error(
1942
+ `Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
1943
+ );
1944
+ }
1945
+ return matches[0];
1946
+ }
1947
+ async expandRunConfigToJobs(collected) {
1948
+ if (this.datasetsById.size === 0) {
1949
+ await this.collectDatasets();
1950
+ }
1951
+ if (this.evaluatorsById.size === 0) {
1952
+ await this.collectEvaluators();
1953
+ }
1954
+ const rcName = collected.runConfig.getName();
1955
+ const jobs = [];
1956
+ const runs = collected.runConfig.getRuns();
1957
+ for (const [i, row] of runs.entries()) {
1958
+ const dsCollected = Array.from(this.datasetsById.values()).find(
1959
+ (d) => d.dataset === row.dataset
1960
+ );
1961
+ if (!dsCollected) {
1962
+ throw new Error(
1963
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1964
+ );
1965
+ }
1966
+ let evaluatorIds;
1967
+ if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
1968
+ const matcher = createNameMatcher(row.evaluatorPattern);
1969
+ const matched = Array.from(this.evaluatorsById.values()).filter(
1970
+ (item) => matcher(item.evaluator.getName() ?? "")
1971
+ );
1972
+ if (matched.length === 0) {
1973
+ throw new Error(
1974
+ `RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
1975
+ );
1976
+ }
1977
+ evaluatorIds = matched.map((item) => item.id);
1978
+ } else {
1979
+ const evaluators = row.evaluators;
1980
+ evaluatorIds = [];
1981
+ for (const ev of evaluators) {
1982
+ const found = Array.from(this.evaluatorsById.values()).find(
1983
+ (item) => item.evaluator === ev
1984
+ );
1985
+ if (!found) {
1986
+ throw new Error(
1987
+ `RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
1988
+ );
1989
+ }
1990
+ evaluatorIds.push(found.id);
1991
+ }
1992
+ }
1993
+ const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
1994
+ jobs.push({
1995
+ datasetId: dsCollected.id,
1996
+ evaluatorIds,
1997
+ runConfigName: rcName,
1998
+ runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
1999
+ runConfigTags: collected.runConfig.getTags(),
2000
+ repetitions
2001
+ });
2002
+ }
2003
+ return jobs;
2004
+ }
2005
+ async expandRunConfigNamesToJobs(names) {
2006
+ const jobs = [];
2007
+ for (const name of names) {
2008
+ const collected = await this.resolveRunConfigByName(name);
2009
+ if (!collected) {
2010
+ const known = await this.collectRunConfigs();
2011
+ const available = known.map((r) => r.runConfig.getName()).sort();
2012
+ throw new Error(
2013
+ available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
2014
+ );
2015
+ }
2016
+ jobs.push(...await this.expandRunConfigToJobs(collected));
2017
+ }
2018
+ return jobs;
2019
+ }
2020
+ async runDatasetJobsWithSharedConcurrency(request) {
2021
+ const globalConcurrency = Math.max(1, request.globalConcurrency);
2022
+ const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
2023
+ const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
2024
+ const snapshots = [];
2025
+ for (const job of request.jobs) {
2026
+ snapshots.push(
2027
+ await this.startDatasetRun({
2028
+ datasetId: job.datasetId,
2029
+ evaluatorIds: job.evaluatorIds,
2030
+ triggerId,
2031
+ maxConcurrency: this.config.maxConcurrency ?? 1,
2032
+ globalEvaluationSemaphore: sem,
2033
+ runConfigName: job.runConfigName,
2034
+ runConfigTags: job.runConfigTags,
2035
+ repetitions: job.repetitions
2036
+ })
2037
+ );
2038
+ }
2039
+ return snapshots;
2040
+ }
1737
2041
  async searchTestCases(query) {
1738
2042
  const testCases = await collectTestCasesFromFiles(this.config.discovery);
1739
2043
  return searchCollectedTestCases(testCases, query);
@@ -1752,35 +2056,45 @@ var EffectRunner = class {
1752
2056
  );
1753
2057
  }
1754
2058
  async runDatasetWith(request) {
2059
+ const runConfigName = validateRunConfigName(
2060
+ request.runConfigName,
2061
+ "runDatasetWith.runConfigName"
2062
+ );
2063
+ return this.startDatasetRun({
2064
+ datasetId: request.datasetId,
2065
+ evaluatorIds: request.evaluatorIds,
2066
+ triggerId: request.triggerId,
2067
+ maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2068
+ repetitions: request.repetitions,
2069
+ runConfigName,
2070
+ runConfigTags: request.runConfigTags
2071
+ });
2072
+ }
2073
+ async startDatasetRun(params) {
1755
2074
  if (this.datasetsById.size === 0) {
1756
2075
  await this.collectDatasets();
1757
2076
  }
1758
2077
  if (this.evaluatorsById.size === 0) {
1759
2078
  await this.collectEvaluators();
1760
2079
  }
1761
- const dataset = this.datasetsById.get(request.datasetId);
2080
+ const dataset = this.datasetsById.get(params.datasetId);
1762
2081
  if (!dataset) {
1763
- throw new Error(`Unknown dataset: ${request.datasetId}`);
2082
+ throw new Error(`Unknown dataset: ${params.datasetId}`);
1764
2083
  }
1765
- const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
2084
+ const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
1766
2085
  if (selectedEvaluators.length === 0) {
1767
2086
  throw new Error("No evaluators selected for run");
1768
2087
  }
1769
- const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1770
- const totalEvaluations = selectedTestCases.reduce(
1771
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1772
- 0
1773
- );
1774
- const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
2088
+ const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
2089
+ const repetitions = normalizeRunRepetitions(params.repetitions);
2090
+ const totalEvaluations = selectedTestCases.length * repetitions;
2091
+ const runConfigTags = [...params.runConfigTags ?? []];
2092
+ const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
1775
2093
  const runId = `run-${randomUUID()}`;
1776
- const artifactPath = createArtifactPath(
1777
- this.config.artifactDirectory,
1778
- request.datasetId,
1779
- runId
1780
- );
2094
+ const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
1781
2095
  const snapshot = {
1782
2096
  runId,
1783
- datasetId: request.datasetId,
2097
+ datasetId: params.datasetId,
1784
2098
  datasetName: dataset.dataset.getName(),
1785
2099
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1786
2100
  queuedAt: Date.now(),
@@ -1801,7 +2115,7 @@ var EffectRunner = class {
1801
2115
  const queuedEvent = {
1802
2116
  type: "RunQueued",
1803
2117
  runId,
1804
- datasetId: request.datasetId,
2118
+ datasetId: params.datasetId,
1805
2119
  datasetName: dataset.dataset.getName(),
1806
2120
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1807
2121
  totalTestCases: totalEvaluations,
@@ -1815,17 +2129,20 @@ var EffectRunner = class {
1815
2129
  payload: queuedEvent
1816
2130
  })
1817
2131
  );
1818
- const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
1819
2132
  await Effect.runPromise(
1820
2133
  Queue.offer(this.runQueue, {
1821
2134
  runId,
1822
2135
  triggerId,
1823
- datasetId: request.datasetId,
2136
+ datasetId: params.datasetId,
1824
2137
  dataset: dataset.dataset,
1825
2138
  evaluators: selectedEvaluators,
1826
2139
  testCases: selectedTestCases,
1827
2140
  snapshot,
1828
- maxConcurrency
2141
+ maxConcurrency: params.maxConcurrency,
2142
+ globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2143
+ runConfigName: params.runConfigName,
2144
+ runConfigTags,
2145
+ repetitions
1829
2146
  })
1830
2147
  );
1831
2148
  return snapshot;
@@ -1841,9 +2158,9 @@ var EffectRunner = class {
1841
2158
  return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
1842
2159
  }
1843
2160
  getAllRunSnapshots() {
1844
- return Array.from(
1845
- Effect.runSync(Ref.get(this.snapshotsRef)).values()
1846
- ).sort((a, b) => b.queuedAt - a.queuedAt);
2161
+ return Array.from(Effect.runSync(Ref.get(this.snapshotsRef)).values()).sort(
2162
+ (a, b) => b.queuedAt - a.queuedAt
2163
+ );
1847
2164
  }
1848
2165
  async loadRunSnapshotsFromArtifacts() {
1849
2166
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1897,6 +2214,11 @@ var EffectRunner = class {
1897
2214
  }
1898
2215
  };
1899
2216
 
1900
- export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
2217
+ // src/runner/events.ts
2218
+ var PROGRAMMATIC_RUN_CONFIG = {
2219
+ runConfigName: "programmatic"
2220
+ };
2221
+
2222
+ export { Dataset, Evaluator, EvaluatorNameSchema, Metric, PROGRAMMATIC_RUN_CONFIG, RunConfig, RunConfigNameSchema, Score, TagSet, TestCase, TestCaseNameSchema, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
1901
2223
  //# sourceMappingURL=out.js.map
1902
2224
  //# sourceMappingURL=index.js.map