@m4trix/evals 0.25.1 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -4,10 +4,10 @@ var effect = require('effect');
4
4
  var diff = require('diff');
5
5
  var stringify = require('fast-json-stable-stringify');
6
6
  var crypto = require('crypto');
7
- var fs = require('fs');
7
+ var promises = require('fs/promises');
8
8
  var path = require('path');
9
+ var fs = require('fs');
9
10
  var jitiModule = require('jiti');
10
- var promises = require('fs/promises');
11
11
  var url = require('url');
12
12
 
13
13
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
@@ -34,6 +34,164 @@ function _interopNamespace(e) {
34
34
  var stringify__default = /*#__PURE__*/_interopDefault(stringify);
35
35
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
36
36
 
37
+ // src/index.ts
38
+ var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
39
+ function makeEntityIdSchema(brand, label) {
40
+ return effect.Schema.String.pipe(
41
+ effect.Schema.trimmed(),
42
+ effect.Schema.minLength(1, {
43
+ message: () => `${label} must be non-empty.`
44
+ }),
45
+ effect.Schema.pattern(ENTITY_ID_PATTERN, {
46
+ message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
47
+ }),
48
+ effect.Schema.brand(brand)
49
+ );
50
+ }
51
+ var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
52
+ var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
53
+ var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
54
+ function validateWithSchema(schema, raw, context) {
55
+ const trimmed = raw.trim();
56
+ const decode = effect.Schema.decodeUnknownEither(
57
+ schema
58
+ );
59
+ const result = decode(trimmed);
60
+ if (effect.Either.isLeft(result)) {
61
+ throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
62
+ }
63
+ return result.right;
64
+ }
65
+ function validateRunConfigName(raw, context) {
66
+ return validateWithSchema(RunConfigNameSchema, raw, context);
67
+ }
68
+ function validateEvaluatorName(raw, context) {
69
+ return validateWithSchema(EvaluatorNameSchema, raw, context);
70
+ }
71
+ function validateTestCaseName(raw, context) {
72
+ return validateWithSchema(TestCaseNameSchema, raw, context);
73
+ }
74
+ function normalizeOptionalDisplayName(raw) {
75
+ if (raw === void 0) {
76
+ return void 0;
77
+ }
78
+ const t = raw.trim();
79
+ return t.length === 0 ? void 0 : t;
80
+ }
81
+
82
+ // src/evals/evaluator.ts
83
+ var Evaluator = class _Evaluator {
84
+ constructor(config) {
85
+ this._config = config;
86
+ }
87
+ getState() {
88
+ return {
89
+ name: this._config.name,
90
+ displayName: this._config.displayName,
91
+ tags: this._config.tags,
92
+ inputSchema: this._config.inputSchema,
93
+ outputSchema: this._config.outputSchema,
94
+ scoreSchema: this._config.scoreSchema,
95
+ middlewares: this._config.middlewares,
96
+ evaluateFn: this._config.evaluateFn,
97
+ passThreshold: this._config.passThreshold,
98
+ passCriterion: this._config.passCriterion
99
+ };
100
+ }
101
+ static use(middleware) {
102
+ return new _Evaluator({
103
+ middlewares: [middleware],
104
+ tags: []
105
+ });
106
+ }
107
+ use(middleware) {
108
+ const state = this.getState();
109
+ return new _Evaluator({
110
+ ...state,
111
+ middlewares: [...state.middlewares, middleware]
112
+ });
113
+ }
114
+ define(config) {
115
+ const { middlewares } = this.getState();
116
+ const name = validateEvaluatorName(config.name, "Evaluator.define");
117
+ const displayName = normalizeOptionalDisplayName(config.displayName);
118
+ const tags = config.tags !== void 0 ? [...config.tags] : [];
119
+ return new _Evaluator({
120
+ name,
121
+ displayName,
122
+ tags,
123
+ inputSchema: config.inputSchema,
124
+ outputSchema: config.outputSchema,
125
+ scoreSchema: config.scoreSchema,
126
+ middlewares,
127
+ passThreshold: config.passThreshold,
128
+ passCriterion: config.passCriterion
129
+ });
130
+ }
131
+ evaluate(fn) {
132
+ return new _Evaluator({
133
+ ...this.getState(),
134
+ evaluateFn: fn
135
+ });
136
+ }
137
+ /** Canonical evaluator id when defined; otherwise undefined (middleware-only chain). */
138
+ getName() {
139
+ return this._config.name;
140
+ }
141
+ getDisplayName() {
142
+ return this._config.displayName;
143
+ }
144
+ /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. Undefined if not yet defined. */
145
+ getDisplayLabel() {
146
+ const id = this._config.name;
147
+ if (id === void 0) {
148
+ return void 0;
149
+ }
150
+ return this._config.displayName ?? id;
151
+ }
152
+ /** Tags from `Evaluator.define({ tags })`; empty until defined. */
153
+ getTags() {
154
+ return [...this._config.tags];
155
+ }
156
+ getInputSchema() {
157
+ return this._config.inputSchema;
158
+ }
159
+ getOutputSchema() {
160
+ return this._config.outputSchema;
161
+ }
162
+ getScoreSchema() {
163
+ return this._config.scoreSchema;
164
+ }
165
+ getMiddlewares() {
166
+ return this._config.middlewares;
167
+ }
168
+ getEvaluateFn() {
169
+ return this._config.evaluateFn;
170
+ }
171
+ getPassThreshold() {
172
+ return this._config.passThreshold;
173
+ }
174
+ getPassCriterion() {
175
+ return this._config.passCriterion;
176
+ }
177
+ async resolveContext() {
178
+ const parts = await Promise.all(this._config.middlewares.map((mw) => mw.resolve()));
179
+ return Object.assign({}, ...parts);
180
+ }
181
+ };
182
+ function getEvaluatorDisplayLabel(evaluator) {
183
+ if (typeof evaluator.getDisplayLabel === "function") {
184
+ const label = evaluator.getDisplayLabel();
185
+ if (label !== void 0) {
186
+ return label;
187
+ }
188
+ }
189
+ return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
190
+ }
191
+ function getEvaluatorTagList(evaluator) {
192
+ return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
193
+ }
194
+
37
195
  // src/cli/data.mock.json
38
196
  var data_mock_default = {
39
197
  datasets: [
@@ -288,7 +446,7 @@ function toEvalDataset(item, snapshots) {
288
446
  function toEvaluatorOption(item) {
289
447
  return {
290
448
  id: item.id,
291
- name: item.evaluator.getName() ?? toSlug(item.id),
449
+ name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
292
450
  configPreview: `Source: ${item.filePath}`
293
451
  };
294
452
  }
@@ -334,132 +492,6 @@ function parseStartupArgs(argv) {
334
492
  return args;
335
493
  }
336
494
 
337
- // src/evals/test-case.ts
338
- function resolve(value) {
339
- return typeof value === "function" ? value() : value;
340
- }
341
- var TestCase = class _TestCase {
342
- constructor(config) {
343
- this._config = config;
344
- }
345
- static describe(config) {
346
- const reruns = config.reruns ?? 1;
347
- if (reruns < 1 || !Number.isInteger(reruns)) {
348
- throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
349
- }
350
- return new _TestCase({
351
- name: config.name,
352
- tags: config.tags,
353
- reruns,
354
- inputSchema: config.inputSchema,
355
- input: config.input,
356
- outputSchema: config.outputSchema,
357
- output: config.output
358
- });
359
- }
360
- getReruns() {
361
- return this._config.reruns;
362
- }
363
- getName() {
364
- return this._config.name;
365
- }
366
- getTags() {
367
- return this._config.tags;
368
- }
369
- getInputSchema() {
370
- return this._config.inputSchema;
371
- }
372
- getInput() {
373
- return resolve(this._config.input);
374
- }
375
- getOutputSchema() {
376
- return this._config.outputSchema;
377
- }
378
- getOutput() {
379
- if (this._config.output === void 0) {
380
- return void 0;
381
- }
382
- return resolve(this._config.output);
383
- }
384
- };
385
-
386
- // src/evals/evaluator.ts
387
- var Evaluator = class _Evaluator {
388
- constructor(config) {
389
- this._config = config;
390
- }
391
- getState() {
392
- return {
393
- name: this._config.name,
394
- inputSchema: this._config.inputSchema,
395
- outputSchema: this._config.outputSchema,
396
- scoreSchema: this._config.scoreSchema,
397
- middlewares: this._config.middlewares,
398
- evaluateFn: this._config.evaluateFn,
399
- passThreshold: this._config.passThreshold,
400
- passCriterion: this._config.passCriterion
401
- };
402
- }
403
- static use(middleware) {
404
- return new _Evaluator({
405
- middlewares: [middleware]
406
- });
407
- }
408
- use(middleware) {
409
- const state = this.getState();
410
- return new _Evaluator({
411
- ...state,
412
- middlewares: [...state.middlewares, middleware]
413
- });
414
- }
415
- define(config) {
416
- const { middlewares } = this.getState();
417
- return new _Evaluator({
418
- name: config.name,
419
- inputSchema: config.inputSchema,
420
- outputSchema: config.outputSchema,
421
- scoreSchema: config.scoreSchema,
422
- middlewares,
423
- passThreshold: config.passThreshold,
424
- passCriterion: config.passCriterion
425
- });
426
- }
427
- evaluate(fn) {
428
- return new _Evaluator({
429
- ...this.getState(),
430
- evaluateFn: fn
431
- });
432
- }
433
- getName() {
434
- return this._config.name;
435
- }
436
- getInputSchema() {
437
- return this._config.inputSchema;
438
- }
439
- getOutputSchema() {
440
- return this._config.outputSchema;
441
- }
442
- getScoreSchema() {
443
- return this._config.scoreSchema;
444
- }
445
- getMiddlewares() {
446
- return this._config.middlewares;
447
- }
448
- getEvaluateFn() {
449
- return this._config.evaluateFn;
450
- }
451
- getPassThreshold() {
452
- return this._config.passThreshold;
453
- }
454
- getPassCriterion() {
455
- return this._config.passCriterion;
456
- }
457
- async resolveContext() {
458
- const parts = await Promise.all(this._config.middlewares.map((mw) => mw.resolve()));
459
- return Object.assign({}, ...parts);
460
- }
461
- };
462
-
463
495
  // src/evals/dataset.ts
464
496
  function matchesAny(value, matchers) {
465
497
  return matchers.some(
@@ -523,34 +555,284 @@ var Dataset = class _Dataset {
523
555
  return tagMatch && pathMatch;
524
556
  }
525
557
  };
526
-
527
- // src/evals/metric.ts
528
- var registry = /* @__PURE__ */ new Map();
529
- var Metric = {
530
- of(config) {
531
- const def = {
532
- id: config.id,
533
- name: config.name,
534
- aggregate: config.aggregate,
535
- format: config.format,
536
- make: (data, options) => ({
537
- id: config.id,
538
- data,
539
- ...options?.name !== void 0 && { name: options.name }
540
- })
541
- };
542
- registry.set(config.id, def);
543
- return def;
558
+ function preprocessForDiff(value, options) {
559
+ if (options?.sort && Array.isArray(value)) {
560
+ return [...value].sort((a, b) => {
561
+ const aStr = stringify__default.default(preprocessForDiff(a, options));
562
+ const bStr = stringify__default.default(preprocessForDiff(b, options));
563
+ return aStr.localeCompare(bStr);
564
+ }).map((item) => preprocessForDiff(item, options));
544
565
  }
545
- };
546
- function getMetricById(id) {
547
- return registry.get(id);
548
- }
549
-
550
- // src/evals/score.ts
551
- var registry2 = /* @__PURE__ */ new Map();
552
- function formatScoreData(def, data, options) {
553
- return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
566
+ if (value !== null && typeof value === "object" && !Array.isArray(value) && options?.excludeKeys) {
567
+ const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
568
+ const filtered = {};
569
+ for (const [k, v] of Object.entries(value)) {
570
+ if (!keys.includes(k)) {
571
+ filtered[k] = preprocessForDiff(v, options);
572
+ }
573
+ }
574
+ return filtered;
575
+ }
576
+ if (value !== null && typeof value === "object" && !Array.isArray(value)) {
577
+ const result = {};
578
+ for (const [k, v] of Object.entries(value)) {
579
+ result[k] = preprocessForDiff(v, options);
580
+ }
581
+ return result;
582
+ }
583
+ if (typeof value === "number" && options?.precision !== void 0) {
584
+ return Number(value.toFixed(options.precision));
585
+ }
586
+ return value;
587
+ }
588
+ function toPrettyJson(value) {
589
+ const str = stringify__default.default(value);
590
+ try {
591
+ const parsed = JSON.parse(str);
592
+ return JSON.stringify(parsed, null, 2);
593
+ } catch {
594
+ return str;
595
+ }
596
+ }
597
+ function formatDiffParts(parts) {
598
+ const lines = [];
599
+ for (const part of parts) {
600
+ const prefix = part.added ? "+ " : part.removed ? "- " : "";
601
+ const partLines = part.value.split("\n");
602
+ for (let i = 0; i < partLines.length; i++) {
603
+ const line = partLines[i];
604
+ if (i === partLines.length - 1 && line === "")
605
+ continue;
606
+ lines.push(prefix + line);
607
+ }
608
+ }
609
+ return lines.join("\n");
610
+ }
611
+ function createDiffString(expected, actual, diffOptions) {
612
+ const expectedProcessed = preprocessForDiff(expected, diffOptions);
613
+ const actualProcessed = preprocessForDiff(actual, diffOptions);
614
+ if (diffOptions?.keysOnly) {
615
+ const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
616
+ const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
617
+ const parts2 = diff.diffLines(expectedKeys, actualKeys);
618
+ return formatDiffParts(parts2);
619
+ }
620
+ const expectedStr = toPrettyJson(expectedProcessed);
621
+ const actualStr = toPrettyJson(actualProcessed);
622
+ if (expectedStr === actualStr) {
623
+ return "";
624
+ }
625
+ const parts = diff.diffLines(expectedStr, actualStr);
626
+ if (diffOptions?.outputNewOnly) {
627
+ const filtered = parts.filter((p) => p.added === true);
628
+ return formatDiffParts(filtered);
629
+ }
630
+ return formatDiffParts(parts);
631
+ }
632
+ function extractKeys(value) {
633
+ if (value === null || typeof value !== "object") {
634
+ return "\xB7";
635
+ }
636
+ if (Array.isArray(value)) {
637
+ return value.map(extractKeys);
638
+ }
639
+ const result = {};
640
+ for (const [k, v] of Object.entries(value)) {
641
+ result[k] = extractKeys(v);
642
+ }
643
+ return result;
644
+ }
645
+ function formatLogMessage(msg) {
646
+ if (typeof msg === "string")
647
+ return msg;
648
+ if (msg instanceof Error)
649
+ return msg.stack ?? msg.message;
650
+ try {
651
+ if (msg !== null && typeof msg === "object") {
652
+ return JSON.stringify(msg, null, 2);
653
+ }
654
+ return String(msg);
655
+ } catch {
656
+ return String(msg);
657
+ }
658
+ }
659
+ function createLogEntry(message, options) {
660
+ return {
661
+ type: "log",
662
+ label: options?.label,
663
+ message: formatLogMessage(message)
664
+ };
665
+ }
666
+ function getLogLines(entry) {
667
+ return entry.message.split("\n");
668
+ }
669
+ function createDiffLogEntry(expected, actual, options) {
670
+ const { label, ...diffOpts } = options ?? {};
671
+ const diff = createDiffString(expected, actual, diffOpts);
672
+ return {
673
+ type: "diff",
674
+ label,
675
+ expected,
676
+ actual,
677
+ diff: diff || "(no differences)"
678
+ };
679
+ }
680
+ function printJsonDiff(expected, actual, options = {}) {
681
+ const { color = true, ...diffOpts } = options;
682
+ const diff = createDiffString(expected, actual, diffOpts);
683
+ if (color) {
684
+ const lines = diff.split("\n").map((line) => {
685
+ const trimmed = line.trimStart();
686
+ if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
687
+ return `\x1B[31m${line}\x1B[0m`;
688
+ }
689
+ if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
690
+ return `\x1B[32m${line}\x1B[0m`;
691
+ }
692
+ return line;
693
+ });
694
+ const colored = lines.join("\n");
695
+ console.log(colored || "(no differences)");
696
+ return colored;
697
+ }
698
+ console.log(diff || "(no differences)");
699
+ return diff;
700
+ }
701
+
702
+ // src/evals/metric.ts
703
+ var registry = /* @__PURE__ */ new Map();
704
+ var Metric = {
705
+ of(config) {
706
+ const def = {
707
+ id: config.id,
708
+ name: config.name,
709
+ aggregate: config.aggregate,
710
+ format: config.format,
711
+ make: (data, options) => ({
712
+ id: config.id,
713
+ data,
714
+ ...options?.name !== void 0 && { name: options.name }
715
+ })
716
+ };
717
+ registry.set(config.id, def);
718
+ return def;
719
+ }
720
+ };
721
+ function getMetricById(id) {
722
+ return registry.get(id);
723
+ }
724
+
725
+ // src/evals/aggregators.ts
726
+ function aggregateTokenCountSum(values) {
727
+ const initial = {
728
+ input: 0,
729
+ output: 0,
730
+ inputCached: 0,
731
+ outputCached: 0
732
+ };
733
+ return values.reduce(
734
+ (acc, v) => ({
735
+ input: acc.input + (v.input ?? 0),
736
+ output: acc.output + (v.output ?? 0),
737
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
738
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
739
+ }),
740
+ initial
741
+ );
742
+ }
743
+ function aggregateLatencyAverage(values) {
744
+ if (values.length === 0) {
745
+ return { ms: 0 };
746
+ }
747
+ const sum = values.reduce((s, v) => s + v.ms, 0);
748
+ return { ms: sum / values.length };
749
+ }
750
+
751
+ // src/evals/metrics/standard.ts
752
+ var tokenCountMetric = Metric.of({
753
+ id: "token-count",
754
+ name: "Tokens",
755
+ aggregate: aggregateTokenCountSum,
756
+ format: (data, options) => {
757
+ const input = data.input ?? 0;
758
+ const output = data.output ?? 0;
759
+ const inputCached = data.inputCached ?? 0;
760
+ const outputCached = data.outputCached ?? 0;
761
+ const cached = inputCached + outputCached;
762
+ const base = `in:${input} out:${output} cached:${cached}`;
763
+ return options?.isAggregated ? `Total: ${base}` : base;
764
+ }
765
+ });
766
+ var latencyMetric = Metric.of({
767
+ id: "latency",
768
+ name: "Latency",
769
+ aggregate: aggregateLatencyAverage,
770
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
771
+ });
772
+
773
+ // src/evals/run-config.ts
774
+ function validateRow(row, index) {
775
+ const hasEvaluators = "evaluators" in row && row.evaluators !== void 0 && row.evaluators !== void 0;
776
+ const hasPattern = "evaluatorPattern" in row && typeof row.evaluatorPattern === "string" && row.evaluatorPattern.trim().length > 0;
777
+ if (hasEvaluators && hasPattern) {
778
+ throw new Error(`RunConfig run[${index}] must not set both evaluators and evaluatorPattern`);
779
+ }
780
+ if (!hasEvaluators && !hasPattern) {
781
+ throw new Error(`RunConfig run[${index}] must set either evaluators or evaluatorPattern`);
782
+ }
783
+ if (hasEvaluators && row.evaluators.length === 0) {
784
+ throw new Error(`RunConfig run[${index}]: evaluators must be non-empty`);
785
+ }
786
+ const rawRep = "repetitions" in row ? row.repetitions : void 0;
787
+ const repetitions = rawRep ?? 1;
788
+ if (!Number.isInteger(repetitions) || repetitions < 1) {
789
+ throw new Error(
790
+ `RunConfig run[${index}]: repetitions must be a positive integer, got ${String(rawRep)}`
791
+ );
792
+ }
793
+ }
794
+ var RunConfig = class _RunConfig {
795
+ constructor(name, displayName, tags, runs) {
796
+ this._name = name;
797
+ this._displayName = displayName;
798
+ this._tags = tags;
799
+ this._runs = runs;
800
+ }
801
+ static define(config) {
802
+ if (config.runs.length === 0) {
803
+ throw new Error("RunConfig runs must be non-empty");
804
+ }
805
+ config.runs.forEach(validateRow);
806
+ const name = validateRunConfigName(config.name, "RunConfig.define");
807
+ const displayName = normalizeOptionalDisplayName(config.displayName);
808
+ const tags = config.tags !== void 0 ? [...config.tags] : [];
809
+ return new _RunConfig(name, displayName, tags, config.runs);
810
+ }
811
+ /** Canonical id (branded {@link RunConfigName} at runtime; typed as `string` for ergonomics). */
812
+ getName() {
813
+ return this._name;
814
+ }
815
+ /** Optional unrestricted display label. */
816
+ getDisplayName() {
817
+ return this._displayName;
818
+ }
819
+ /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
820
+ getDisplayLabel() {
821
+ return this._displayName ?? this._name;
822
+ }
823
+ /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
824
+ getTags() {
825
+ return [...this._tags];
826
+ }
827
+ getRuns() {
828
+ return this._runs;
829
+ }
830
+ };
831
+
832
+ // src/evals/score.ts
833
+ var registry2 = /* @__PURE__ */ new Map();
834
+ function formatScoreData(def, data, options) {
835
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
554
836
  }
555
837
  var ScoreAggregate = {
556
838
  /** Average numeric fields. Use for scores like { value, delta }. */
@@ -655,54 +937,6 @@ function getScoreById(id) {
655
937
  return registry2.get(id);
656
938
  }
657
939
 
658
- // src/evals/aggregators.ts
659
- function aggregateTokenCountSum(values) {
660
- const initial = {
661
- input: 0,
662
- output: 0,
663
- inputCached: 0,
664
- outputCached: 0
665
- };
666
- return values.reduce(
667
- (acc, v) => ({
668
- input: acc.input + (v.input ?? 0),
669
- output: acc.output + (v.output ?? 0),
670
- inputCached: acc.inputCached + (v.inputCached ?? 0),
671
- outputCached: acc.outputCached + (v.outputCached ?? 0)
672
- }),
673
- initial
674
- );
675
- }
676
- function aggregateLatencyAverage(values) {
677
- if (values.length === 0) {
678
- return { ms: 0 };
679
- }
680
- const sum = values.reduce((s, v) => s + v.ms, 0);
681
- return { ms: sum / values.length };
682
- }
683
-
684
- // src/evals/metrics/standard.ts
685
- var tokenCountMetric = Metric.of({
686
- id: "token-count",
687
- name: "Tokens",
688
- aggregate: aggregateTokenCountSum,
689
- format: (data, options) => {
690
- const input = data.input ?? 0;
691
- const output = data.output ?? 0;
692
- const inputCached = data.inputCached ?? 0;
693
- const outputCached = data.outputCached ?? 0;
694
- const cached = inputCached + outputCached;
695
- const base = `in:${input} out:${output} cached:${cached}`;
696
- return options?.isAggregated ? `Total: ${base}` : base;
697
- }
698
- });
699
- var latencyMetric = Metric.of({
700
- id: "latency",
701
- name: "Latency",
702
- aggregate: aggregateLatencyAverage,
703
- format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
704
- });
705
-
706
940
  // src/evals/scores/standard.ts
707
941
  var percentScore = Score.of({
708
942
  id: "percent",
@@ -734,148 +968,197 @@ var binaryScore = Score.of({
734
968
  },
735
969
  aggregateValues: Score.aggregate.all
736
970
  });
737
- function preprocessForDiff(value, options) {
738
- if (options?.sort && Array.isArray(value)) {
739
- return [...value].sort((a, b) => {
740
- const aStr = stringify__default.default(preprocessForDiff(a, options));
741
- const bStr = stringify__default.default(preprocessForDiff(b, options));
742
- return aStr.localeCompare(bStr);
743
- }).map((item) => preprocessForDiff(item, options));
744
- }
745
- if (value !== null && typeof value === "object" && !Array.isArray(value) && options?.excludeKeys) {
746
- const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
747
- const filtered = {};
748
- for (const [k, v] of Object.entries(value)) {
749
- if (!keys.includes(k)) {
750
- filtered[k] = preprocessForDiff(v, options);
751
- }
971
+
972
+ // src/evals/tag-set.ts
973
+ var TagSet = class {
974
+ constructor() {
975
+ }
976
+ static define(tags) {
977
+ const out = {};
978
+ for (const tag of tags) {
979
+ out[tag] = tag;
752
980
  }
753
- return filtered;
981
+ return out;
754
982
  }
755
- if (value !== null && typeof value === "object" && !Array.isArray(value)) {
756
- const result = {};
757
- for (const [k, v] of Object.entries(value)) {
758
- result[k] = preprocessForDiff(v, options);
759
- }
760
- return result;
983
+ };
984
+
985
+ // src/evals/test-case.ts
986
+ function resolve(value) {
987
+ return typeof value === "function" ? value() : value;
988
+ }
989
+ var TestCase = class _TestCase {
990
+ constructor(config) {
991
+ this._config = config;
761
992
  }
762
- if (typeof value === "number" && options?.precision !== void 0) {
763
- return Number(value.toFixed(options.precision));
993
+ static describe(config) {
994
+ const name = validateTestCaseName(config.name, "TestCase.describe");
995
+ const displayName = normalizeOptionalDisplayName(config.displayName);
996
+ return new _TestCase({
997
+ name,
998
+ displayName,
999
+ tags: config.tags,
1000
+ inputSchema: config.inputSchema,
1001
+ input: config.input,
1002
+ outputSchema: config.outputSchema,
1003
+ output: config.output
1004
+ });
764
1005
  }
765
- return value;
766
- }
767
- function toPrettyJson(value) {
768
- const str = stringify__default.default(value);
769
- try {
770
- const parsed = JSON.parse(str);
771
- return JSON.stringify(parsed, null, 2);
772
- } catch {
773
- return str;
1006
+ getName() {
1007
+ return this._config.name;
774
1008
  }
775
- }
776
- function formatDiffParts(parts) {
777
- const lines = [];
778
- for (const part of parts) {
779
- const prefix = part.added ? "+ " : part.removed ? "- " : "";
780
- const partLines = part.value.split("\n");
781
- for (let i = 0; i < partLines.length; i++) {
782
- const line = partLines[i];
783
- if (i === partLines.length - 1 && line === "")
784
- continue;
785
- lines.push(prefix + line);
786
- }
1009
+ getDisplayName() {
1010
+ return this._config.displayName;
787
1011
  }
788
- return lines.join("\n");
789
- }
790
- function createDiffString(expected, actual, diffOptions) {
791
- const expectedProcessed = preprocessForDiff(expected, diffOptions);
792
- const actualProcessed = preprocessForDiff(actual, diffOptions);
793
- if (diffOptions?.keysOnly) {
794
- const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
795
- const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
796
- const parts2 = diff.diffLines(expectedKeys, actualKeys);
797
- return formatDiffParts(parts2);
1012
+ getDisplayLabel() {
1013
+ return this._config.displayName ?? this._config.name;
798
1014
  }
799
- const expectedStr = toPrettyJson(expectedProcessed);
800
- const actualStr = toPrettyJson(actualProcessed);
801
- if (expectedStr === actualStr) {
802
- return "";
1015
+ getTags() {
1016
+ return this._config.tags;
803
1017
  }
804
- const parts = diff.diffLines(expectedStr, actualStr);
805
- if (diffOptions?.outputNewOnly) {
806
- const filtered = parts.filter((p) => p.added === true);
807
- return formatDiffParts(filtered);
1018
+ getInputSchema() {
1019
+ return this._config.inputSchema;
808
1020
  }
809
- return formatDiffParts(parts);
810
- }
811
- function extractKeys(value) {
812
- if (value === null || typeof value !== "object") {
813
- return "\xB7";
1021
+ getInput() {
1022
+ return resolve(this._config.input);
814
1023
  }
815
- if (Array.isArray(value)) {
816
- return value.map(extractKeys);
1024
+ getOutputSchema() {
1025
+ return this._config.outputSchema;
817
1026
  }
818
- const result = {};
819
- for (const [k, v] of Object.entries(value)) {
820
- result[k] = extractKeys(v);
1027
+ getOutput() {
1028
+ if (this._config.output === void 0) {
1029
+ return void 0;
1030
+ }
1031
+ return resolve(this._config.output);
821
1032
  }
822
- return result;
1033
+ };
1034
+ function getTestCaseDisplayLabel(testCase) {
1035
+ if (typeof testCase.getDisplayLabel === "function") {
1036
+ return testCase.getDisplayLabel();
1037
+ }
1038
+ return typeof testCase.getName === "function" ? testCase.getName() : "";
823
1039
  }
824
- function formatLogMessage(msg) {
825
- if (typeof msg === "string")
826
- return msg;
827
- if (msg instanceof Error)
828
- return msg.stack ?? msg.message;
1040
+ function getTestCaseTagList(testCase) {
1041
+ return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
1042
+ }
1043
+ async function loadRunSnapshotsFromArtifacts(config) {
1044
+ const baseDir = path.resolve(config.artifactDirectory);
1045
+ let entries;
829
1046
  try {
830
- if (msg !== null && typeof msg === "object") {
831
- return JSON.stringify(msg, null, 2);
832
- }
833
- return String(msg);
1047
+ entries = await promises.readdir(baseDir);
834
1048
  } catch {
835
- return String(msg);
1049
+ return [];
836
1050
  }
1051
+ const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1052
+ const snapshots = [];
1053
+ for (const fileName of jsonlFiles) {
1054
+ const filePath = path.join(baseDir, fileName);
1055
+ try {
1056
+ const snapshot = await parseArtifactToSnapshot(filePath, config);
1057
+ if (snapshot) {
1058
+ snapshots.push(snapshot);
1059
+ }
1060
+ } catch {
1061
+ }
1062
+ }
1063
+ return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
837
1064
  }
838
- function createLogEntry(message, options) {
839
- return {
840
- type: "log",
841
- label: options?.label,
842
- message: formatLogMessage(message)
843
- };
844
- }
845
- function getLogLines(entry) {
846
- return entry.message.split("\n");
847
- }
848
- function createDiffLogEntry(expected, actual, options) {
849
- const { label, ...diffOpts } = options ?? {};
850
- const diff = createDiffString(expected, actual, diffOpts);
851
- return {
852
- type: "diff",
853
- label,
854
- expected,
855
- actual,
856
- diff: diff || "(no differences)"
857
- };
858
- }
859
- function printJsonDiff(expected, actual, options = {}) {
860
- const { color = true, ...diffOpts } = options;
861
- const diff = createDiffString(expected, actual, diffOpts);
862
- if (color) {
863
- const lines = diff.split("\n").map((line) => {
864
- const trimmed = line.trimStart();
865
- if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
866
- return `\x1B[31m${line}\x1B[0m`;
1065
+ async function parseArtifactToSnapshot(filePath, _config) {
1066
+ const content = await promises.readFile(filePath, "utf8");
1067
+ const lines = content.split("\n").filter((line) => line.trim().length > 0);
1068
+ if (lines.length === 0) {
1069
+ return null;
1070
+ }
1071
+ let runQueued = null;
1072
+ let runCompleted = null;
1073
+ let runFailed = null;
1074
+ let runStarted = null;
1075
+ for (const line of lines) {
1076
+ try {
1077
+ const event = JSON.parse(line);
1078
+ const type = event.type;
1079
+ if (type === "RunQueued") {
1080
+ runQueued = {
1081
+ runId: event.runId,
1082
+ datasetId: event.datasetId,
1083
+ datasetName: event.datasetName,
1084
+ evaluatorIds: event.evaluatorIds,
1085
+ totalTestCases: event.totalTestCases ?? 0,
1086
+ artifactPath: event.artifactPath ?? filePath,
1087
+ ts: event.ts
1088
+ };
867
1089
  }
868
- if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
869
- return `\x1B[32m${line}\x1B[0m`;
1090
+ if (type === "RunStarted") {
1091
+ runStarted = { startedAt: event.startedAt };
1092
+ }
1093
+ if (type === "RunCompleted") {
1094
+ runCompleted = {
1095
+ passedTestCases: event.passedTestCases,
1096
+ failedTestCases: event.failedTestCases,
1097
+ totalTestCases: event.totalTestCases,
1098
+ finishedAt: event.finishedAt
1099
+ };
1100
+ }
1101
+ if (type === "RunFailed") {
1102
+ runFailed = {
1103
+ finishedAt: event.finishedAt,
1104
+ errorMessage: event.errorMessage
1105
+ };
1106
+ }
1107
+ } catch {
1108
+ }
1109
+ }
1110
+ if (!runQueued) {
1111
+ return null;
1112
+ }
1113
+ const artifactPath = filePath;
1114
+ const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1115
+ const progress = aggregateTestCaseProgress(lines);
1116
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1117
+ const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1118
+ const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1119
+ return {
1120
+ runId: runQueued.runId,
1121
+ datasetId: runQueued.datasetId,
1122
+ datasetName: runQueued.datasetName,
1123
+ evaluatorIds: runQueued.evaluatorIds,
1124
+ queuedAt: runQueued.ts ?? 0,
1125
+ startedAt: runStarted?.startedAt,
1126
+ finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
1127
+ totalTestCases: runQueued.totalTestCases,
1128
+ completedTestCases,
1129
+ passedTestCases,
1130
+ failedTestCases,
1131
+ status,
1132
+ artifactPath,
1133
+ errorMessage: runFailed?.errorMessage
1134
+ };
1135
+ }
1136
+ function aggregateTestCaseProgress(lines) {
1137
+ let completedTestCases = 0;
1138
+ const testCasePassedBy = /* @__PURE__ */ new Map();
1139
+ for (const line of lines) {
1140
+ try {
1141
+ const event = JSON.parse(line);
1142
+ if (event.type === "TestCaseProgress") {
1143
+ const ev = event;
1144
+ completedTestCases = ev.completedTestCases ?? completedTestCases;
1145
+ const id = ev.testCaseId;
1146
+ const current = testCasePassedBy.get(id);
1147
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
870
1148
  }
871
- return line;
872
- });
873
- const colored = lines.join("\n");
874
- console.log(colored || "(no differences)");
875
- return colored;
1149
+ } catch {
1150
+ }
876
1151
  }
877
- console.log(diff || "(no differences)");
878
- return diff;
1152
+ let passedTestCases = 0;
1153
+ let failedTestCases = 0;
1154
+ for (const passed of testCasePassedBy.values()) {
1155
+ if (passed) {
1156
+ passedTestCases += 1;
1157
+ } else {
1158
+ failedTestCases += 1;
1159
+ }
1160
+ }
1161
+ return { completedTestCases, passedTestCases, failedTestCases };
879
1162
  }
880
1163
 
881
1164
  // src/runner/config.ts
@@ -887,6 +1170,7 @@ var defaultRunnerConfig = {
887
1170
  rootDir: process.cwd(),
888
1171
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
889
1172
  evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
1173
+ runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
890
1174
  testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
891
1175
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
892
1176
  },
@@ -912,6 +1196,11 @@ function toRunnerConfigOverrides(config) {
912
1196
  } else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
913
1197
  discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
914
1198
  }
1199
+ if (rawDiscovery?.runConfigFilePatterns !== void 0) {
1200
+ discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
1201
+ } else if (rawDiscovery?.runConfigSuffixes !== void 0) {
1202
+ discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
1203
+ }
915
1204
  if (rawDiscovery?.testCaseFilePatterns !== void 0) {
916
1205
  discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
917
1206
  } else if (rawDiscovery?.testCaseSuffixes !== void 0) {
@@ -1010,6 +1299,9 @@ function isDatasetLike(value) {
1010
1299
  function isEvaluatorLike(value) {
1011
1300
  return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
1012
1301
  }
1302
+ function isRunConfigLike(value) {
1303
+ return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
1304
+ }
1013
1305
  function isTestCaseLike(value) {
1014
1306
  return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
1015
1307
  }
@@ -1098,6 +1390,23 @@ async function collectEvaluatorsFromFiles(config) {
1098
1390
  );
1099
1391
  return found.flat();
1100
1392
  }
1393
+ async function collectRunConfigsFromFiles(config) {
1394
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1395
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
1396
+ const found = await Promise.all(
1397
+ matched.map(async (absolutePath) => {
1398
+ const exports = await loadModuleExports(absolutePath);
1399
+ const runConfigs = exports.filter(isRunConfigLike);
1400
+ const relPath = path.relative(config.rootDir, absolutePath);
1401
+ return runConfigs.map((runConfig) => ({
1402
+ id: runConfig.getName(),
1403
+ filePath: relPath,
1404
+ runConfig
1405
+ }));
1406
+ })
1407
+ );
1408
+ return found.flat();
1409
+ }
1101
1410
  async function collectTestCasesFromFiles(config) {
1102
1411
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1103
1412
  const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
@@ -1190,15 +1499,17 @@ function readOutput(testCase) {
1190
1499
  }
1191
1500
  return candidate.getOutput();
1192
1501
  }
1193
- function buildEvaluationUnits(testCases) {
1502
+ function buildEvaluationUnits(testCases, repetitionCount) {
1503
+ const count = Math.max(1, repetitionCount);
1194
1504
  const units = [];
1195
1505
  for (const testCaseItem of testCases) {
1196
- const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1197
- for (let r = 0; r < rerunTotal; r++) {
1506
+ const repetitionId = `rep-${crypto.randomUUID()}`;
1507
+ for (let r = 0; r < count; r++) {
1198
1508
  units.push({
1199
1509
  testCaseItem,
1200
- rerunIndex: r + 1,
1201
- rerunTotal
1510
+ repetitionId,
1511
+ repetitionIndex: r + 1,
1512
+ repetitionCount: count
1202
1513
  });
1203
1514
  }
1204
1515
  }
@@ -1211,7 +1522,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1211
1522
  return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
1212
1523
  }
1213
1524
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1214
- const { testCaseItem, rerunIndex, rerunTotal } = unit;
1525
+ const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
1215
1526
  return effect.Effect.gen(function* () {
1216
1527
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
1217
1528
  const started = Date.now();
@@ -1220,11 +1531,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1220
1531
  type: "TestCaseStarted",
1221
1532
  runId: task.runId,
1222
1533
  testCaseId: testCaseItem.id,
1223
- testCaseName: testCaseItem.testCase.getName(),
1534
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1224
1535
  startedTestCases: startedEvaluations,
1225
1536
  totalTestCases: totalEvaluations,
1226
- rerunIndex,
1227
- rerunTotal
1537
+ repetitionId,
1538
+ repetitionIndex,
1539
+ repetitionCount
1228
1540
  });
1229
1541
  const evaluatorScores = [];
1230
1542
  let testCaseError;
@@ -1258,8 +1570,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1258
1570
  meta: {
1259
1571
  triggerId: task.triggerId,
1260
1572
  runId: evaluatorRunId,
1261
- datasetId: task.datasetId
1573
+ datasetId: task.datasetId,
1574
+ repetitionId,
1575
+ repetitionIndex,
1576
+ repetitionCount,
1577
+ runConfigName: task.runConfigName
1262
1578
  },
1579
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1580
+ runConfigTags: task.runConfigTags,
1581
+ evaluatorTags: getEvaluatorTagList(evaluator),
1263
1582
  logDiff,
1264
1583
  log,
1265
1584
  createError
@@ -1302,18 +1621,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1302
1621
  });
1303
1622
  }
1304
1623
  }
1305
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1624
+ const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
1306
1625
  const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
1307
1626
  const progressEvent = {
1308
1627
  type: "TestCaseProgress",
1309
1628
  runId: task.runId,
1310
1629
  testCaseId: testCaseItem.id,
1311
- testCaseName: testCaseItem.testCase.getName(),
1630
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1312
1631
  completedTestCases: completedEvaluations,
1313
1632
  totalTestCases: totalEvaluations,
1314
- rerunIndex,
1315
- rerunTotal,
1316
- passed: rerunPassedThis,
1633
+ repetitionId,
1634
+ repetitionIndex,
1635
+ repetitionCount,
1636
+ passed: repetitionPassedThis,
1317
1637
  durationMs: Date.now() - started,
1318
1638
  evaluatorScores,
1319
1639
  output,
@@ -1334,9 +1654,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1334
1654
  (map) => {
1335
1655
  const key = testCaseItem.id;
1336
1656
  const existing = map.get(key) ?? { completedCount: 0, results: [] };
1337
- const newResults = [...existing.results, rerunPassedThis];
1657
+ const newResults = [...existing.results, repetitionPassedThis];
1338
1658
  const newCompletedCount = existing.completedCount + 1;
1339
- const isLast = newCompletedCount === rerunTotal;
1659
+ const isLast = newCompletedCount === repetitionCount;
1340
1660
  const newMap = new Map(map);
1341
1661
  newMap.set(key, {
1342
1662
  completedCount: newCompletedCount,
@@ -1373,10 +1693,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1373
1693
  runId: task.runId,
1374
1694
  startedAt
1375
1695
  });
1376
- const totalEvaluations = task.testCases.reduce(
1377
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1378
- 0
1379
- );
1696
+ const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
1380
1697
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1381
1698
  const completedRef = yield* effect.Ref.make(0);
1382
1699
  const startedRef = yield* effect.Ref.make(0);
@@ -1385,7 +1702,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1385
1702
  const testCaseResultsRef = yield* effect.Ref.make(
1386
1703
  /* @__PURE__ */ new Map()
1387
1704
  );
1388
- const evaluationUnits = buildEvaluationUnits(task.testCases);
1705
+ const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
1389
1706
  const processEvaluation = (unit) => processOneEvaluation(
1390
1707
  task,
1391
1708
  unit,
@@ -1399,11 +1716,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1399
1716
  failedRef,
1400
1717
  testCaseResultsRef
1401
1718
  );
1402
- yield* effect.Effect.forEach(
1403
- evaluationUnits,
1404
- processEvaluation,
1405
- maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1406
- );
1719
+ const globalSem = task.globalEvaluationSemaphore;
1720
+ if (globalSem !== void 0) {
1721
+ yield* effect.Effect.forEach(
1722
+ evaluationUnits,
1723
+ (unit) => globalSem.withPermits(1)(processEvaluation(unit)),
1724
+ { concurrency: "unbounded", discard: true }
1725
+ );
1726
+ } else {
1727
+ yield* effect.Effect.forEach(
1728
+ evaluationUnits,
1729
+ processEvaluation,
1730
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1731
+ );
1732
+ }
1407
1733
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
1408
1734
  effect.Ref.get(completedRef),
1409
1735
  effect.Ref.get(passedRef),
@@ -1439,125 +1765,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1439
1765
  artifactPath: task.snapshot.artifactPath
1440
1766
  });
1441
1767
  });
1442
- async function loadRunSnapshotsFromArtifacts(config) {
1443
- const baseDir = path.resolve(config.artifactDirectory);
1444
- let entries;
1445
- try {
1446
- entries = await promises.readdir(baseDir);
1447
- } catch {
1448
- return [];
1449
- }
1450
- const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1451
- const snapshots = [];
1452
- for (const fileName of jsonlFiles) {
1453
- const filePath = path.join(baseDir, fileName);
1454
- try {
1455
- const snapshot = await parseArtifactToSnapshot(filePath, config);
1456
- if (snapshot) {
1457
- snapshots.push(snapshot);
1458
- }
1459
- } catch {
1460
- }
1461
- }
1462
- return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
1463
- }
1464
- async function parseArtifactToSnapshot(filePath, _config) {
1465
- const content = await promises.readFile(filePath, "utf8");
1466
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1467
- if (lines.length === 0) {
1468
- return null;
1469
- }
1470
- let runQueued = null;
1471
- let runCompleted = null;
1472
- let runFailed = null;
1473
- let runStarted = null;
1474
- for (const line of lines) {
1475
- try {
1476
- const event = JSON.parse(line);
1477
- const type = event.type;
1478
- if (type === "RunQueued") {
1479
- runQueued = {
1480
- runId: event.runId,
1481
- datasetId: event.datasetId,
1482
- datasetName: event.datasetName,
1483
- evaluatorIds: event.evaluatorIds,
1484
- totalTestCases: event.totalTestCases ?? 0,
1485
- artifactPath: event.artifactPath ?? filePath,
1486
- ts: event.ts
1487
- };
1488
- }
1489
- if (type === "RunStarted") {
1490
- runStarted = { startedAt: event.startedAt };
1491
- }
1492
- if (type === "RunCompleted") {
1493
- runCompleted = {
1494
- passedTestCases: event.passedTestCases,
1495
- failedTestCases: event.failedTestCases,
1496
- totalTestCases: event.totalTestCases,
1497
- finishedAt: event.finishedAt
1498
- };
1499
- }
1500
- if (type === "RunFailed") {
1501
- runFailed = {
1502
- finishedAt: event.finishedAt,
1503
- errorMessage: event.errorMessage
1504
- };
1505
- }
1506
- } catch {
1507
- }
1768
+
1769
+ // src/runner/name-pattern.ts
1770
+ function parseRegexLiteral(pattern) {
1771
+ if (!pattern.startsWith("/")) {
1772
+ return void 0;
1508
1773
  }
1509
- if (!runQueued) {
1510
- return null;
1774
+ const lastSlash = pattern.lastIndexOf("/");
1775
+ if (lastSlash <= 0) {
1776
+ return void 0;
1511
1777
  }
1512
- const artifactPath = filePath;
1513
- const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1514
- const progress = aggregateTestCaseProgress(lines);
1515
- const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1516
- const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1517
- const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1518
1778
  return {
1519
- runId: runQueued.runId,
1520
- datasetId: runQueued.datasetId,
1521
- datasetName: runQueued.datasetName,
1522
- evaluatorIds: runQueued.evaluatorIds,
1523
- queuedAt: runQueued.ts ?? 0,
1524
- startedAt: runStarted?.startedAt,
1525
- finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
1526
- totalTestCases: runQueued.totalTestCases,
1527
- completedTestCases,
1528
- passedTestCases,
1529
- failedTestCases,
1530
- status,
1531
- artifactPath,
1532
- errorMessage: runFailed?.errorMessage
1779
+ source: pattern.slice(1, lastSlash),
1780
+ flags: pattern.slice(lastSlash + 1)
1533
1781
  };
1534
1782
  }
1535
- function aggregateTestCaseProgress(lines) {
1536
- let completedTestCases = 0;
1537
- const testCasePassedBy = /* @__PURE__ */ new Map();
1538
- for (const line of lines) {
1539
- try {
1540
- const event = JSON.parse(line);
1541
- if (event.type === "TestCaseProgress") {
1542
- const ev = event;
1543
- completedTestCases = ev.completedTestCases ?? completedTestCases;
1544
- const id = ev.testCaseId;
1545
- const current = testCasePassedBy.get(id);
1546
- testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1547
- }
1548
- } catch {
1549
- }
1783
+ function createNameMatcher(pattern) {
1784
+ const normalizedPattern = pattern.trim();
1785
+ const regexLiteral = parseRegexLiteral(normalizedPattern);
1786
+ if (regexLiteral) {
1787
+ const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1788
+ return (value) => regex.test(value);
1550
1789
  }
1551
- let passedTestCases = 0;
1552
- let failedTestCases = 0;
1553
- for (const passed of testCasePassedBy.values()) {
1554
- if (passed) {
1555
- passedTestCases += 1;
1556
- } else {
1557
- failedTestCases += 1;
1558
- }
1790
+ if (normalizedPattern.includes("*")) {
1791
+ const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1792
+ const regex = new RegExp(`^${escaped}$`, "i");
1793
+ return (value) => regex.test(value);
1559
1794
  }
1560
- return { completedTestCases, passedTestCases, failedTestCases };
1795
+ return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1561
1796
  }
1562
1797
  async function appendJsonLine(artifactPath, payload) {
1563
1798
  await promises.mkdir(path.dirname(artifactPath), { recursive: true });
@@ -1616,32 +1851,12 @@ function searchCollectedTestCases(all, query) {
1616
1851
  }
1617
1852
 
1618
1853
  // src/runner/api.ts
1619
- function parseRegexLiteral(pattern) {
1620
- if (!pattern.startsWith("/")) {
1621
- return void 0;
1622
- }
1623
- const lastSlash = pattern.lastIndexOf("/");
1624
- if (lastSlash <= 0) {
1625
- return void 0;
1626
- }
1627
- return {
1628
- source: pattern.slice(1, lastSlash),
1629
- flags: pattern.slice(lastSlash + 1)
1630
- };
1631
- }
1632
- function createNameMatcher(pattern) {
1633
- const normalizedPattern = pattern.trim();
1634
- const regexLiteral = parseRegexLiteral(normalizedPattern);
1635
- if (regexLiteral) {
1636
- const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1637
- return (value) => regex.test(value);
1638
- }
1639
- if (normalizedPattern.includes("*")) {
1640
- const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1641
- const regex = new RegExp(`^${escaped}$`, "i");
1642
- return (value) => regex.test(value);
1854
+ function normalizeRunRepetitions(value) {
1855
+ const n = value ?? 1;
1856
+ if (!Number.isInteger(n) || n < 1) {
1857
+ throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
1643
1858
  }
1644
- return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1859
+ return n;
1645
1860
  }
1646
1861
  function mergeRunnerOverrides(base, next) {
1647
1862
  if (!base) {
@@ -1676,6 +1891,7 @@ var EffectRunner = class {
1676
1891
  this.listeners = /* @__PURE__ */ new Set();
1677
1892
  this.datasetsById = /* @__PURE__ */ new Map();
1678
1893
  this.evaluatorsById = /* @__PURE__ */ new Map();
1894
+ this.runConfigsById = /* @__PURE__ */ new Map();
1679
1895
  this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
1680
1896
  this.persistenceFiber = effect.Effect.runFork(
1681
1897
  createPersistenceWorker(this.persistenceQueue)
@@ -1716,6 +1932,137 @@ var EffectRunner = class {
1716
1932
  (item) => matcher(item.evaluator.getName() ?? "")
1717
1933
  );
1718
1934
  }
1935
+ async collectRunConfigs() {
1936
+ const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
1937
+ this.runConfigsById.clear();
1938
+ const byNameLower = /* @__PURE__ */ new Map();
1939
+ for (const item of runConfigs) {
1940
+ const id = item.runConfig.getName();
1941
+ const lower = id.toLowerCase();
1942
+ const prev = byNameLower.get(lower);
1943
+ if (prev !== void 0 && prev.filePath !== item.filePath) {
1944
+ throw new Error(
1945
+ `Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
1946
+ );
1947
+ }
1948
+ byNameLower.set(lower, item);
1949
+ this.runConfigsById.set(id, item);
1950
+ }
1951
+ return runConfigs;
1952
+ }
1953
+ async resolveRunConfigByName(name) {
1954
+ if (this.runConfigsById.size === 0) {
1955
+ await this.collectRunConfigs();
1956
+ }
1957
+ const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
1958
+ const keyLower = key.toLowerCase();
1959
+ const matches = Array.from(this.runConfigsById.values()).filter(
1960
+ (item) => item.runConfig.getName().toLowerCase() === keyLower
1961
+ );
1962
+ if (matches.length === 0) {
1963
+ return void 0;
1964
+ }
1965
+ if (matches.length > 1) {
1966
+ throw new Error(
1967
+ `Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
1968
+ );
1969
+ }
1970
+ return matches[0];
1971
+ }
1972
+ async expandRunConfigToJobs(collected) {
1973
+ if (this.datasetsById.size === 0) {
1974
+ await this.collectDatasets();
1975
+ }
1976
+ if (this.evaluatorsById.size === 0) {
1977
+ await this.collectEvaluators();
1978
+ }
1979
+ const rcName = collected.runConfig.getName();
1980
+ const jobs = [];
1981
+ const runs = collected.runConfig.getRuns();
1982
+ for (const [i, row] of runs.entries()) {
1983
+ const dsCollected = Array.from(this.datasetsById.values()).find(
1984
+ (d) => d.dataset === row.dataset
1985
+ );
1986
+ if (!dsCollected) {
1987
+ throw new Error(
1988
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1989
+ );
1990
+ }
1991
+ let evaluatorIds;
1992
+ if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
1993
+ const matcher = createNameMatcher(row.evaluatorPattern);
1994
+ const matched = Array.from(this.evaluatorsById.values()).filter(
1995
+ (item) => matcher(item.evaluator.getName() ?? "")
1996
+ );
1997
+ if (matched.length === 0) {
1998
+ throw new Error(
1999
+ `RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
2000
+ );
2001
+ }
2002
+ evaluatorIds = matched.map((item) => item.id);
2003
+ } else {
2004
+ const evaluators = row.evaluators;
2005
+ evaluatorIds = [];
2006
+ for (const ev of evaluators) {
2007
+ const found = Array.from(this.evaluatorsById.values()).find(
2008
+ (item) => item.evaluator === ev
2009
+ );
2010
+ if (!found) {
2011
+ throw new Error(
2012
+ `RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
2013
+ );
2014
+ }
2015
+ evaluatorIds.push(found.id);
2016
+ }
2017
+ }
2018
+ const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
2019
+ jobs.push({
2020
+ datasetId: dsCollected.id,
2021
+ evaluatorIds,
2022
+ runConfigName: rcName,
2023
+ runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
2024
+ runConfigTags: collected.runConfig.getTags(),
2025
+ repetitions
2026
+ });
2027
+ }
2028
+ return jobs;
2029
+ }
2030
+ async expandRunConfigNamesToJobs(names) {
2031
+ const jobs = [];
2032
+ for (const name of names) {
2033
+ const collected = await this.resolveRunConfigByName(name);
2034
+ if (!collected) {
2035
+ const known = await this.collectRunConfigs();
2036
+ const available = known.map((r) => r.runConfig.getName()).sort();
2037
+ throw new Error(
2038
+ available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
2039
+ );
2040
+ }
2041
+ jobs.push(...await this.expandRunConfigToJobs(collected));
2042
+ }
2043
+ return jobs;
2044
+ }
2045
+ async runDatasetJobsWithSharedConcurrency(request) {
2046
+ const globalConcurrency = Math.max(1, request.globalConcurrency);
2047
+ const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
2048
+ const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
2049
+ const snapshots = [];
2050
+ for (const job of request.jobs) {
2051
+ snapshots.push(
2052
+ await this.startDatasetRun({
2053
+ datasetId: job.datasetId,
2054
+ evaluatorIds: job.evaluatorIds,
2055
+ triggerId,
2056
+ maxConcurrency: this.config.maxConcurrency ?? 1,
2057
+ globalEvaluationSemaphore: sem,
2058
+ runConfigName: job.runConfigName,
2059
+ runConfigTags: job.runConfigTags,
2060
+ repetitions: job.repetitions
2061
+ })
2062
+ );
2063
+ }
2064
+ return snapshots;
2065
+ }
1719
2066
  async searchTestCases(query) {
1720
2067
  const testCases = await collectTestCasesFromFiles(this.config.discovery);
1721
2068
  return searchCollectedTestCases(testCases, query);
@@ -1734,35 +2081,45 @@ var EffectRunner = class {
1734
2081
  );
1735
2082
  }
1736
2083
  async runDatasetWith(request) {
2084
+ const runConfigName = validateRunConfigName(
2085
+ request.runConfigName,
2086
+ "runDatasetWith.runConfigName"
2087
+ );
2088
+ return this.startDatasetRun({
2089
+ datasetId: request.datasetId,
2090
+ evaluatorIds: request.evaluatorIds,
2091
+ triggerId: request.triggerId,
2092
+ maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2093
+ repetitions: request.repetitions,
2094
+ runConfigName,
2095
+ runConfigTags: request.runConfigTags
2096
+ });
2097
+ }
2098
+ async startDatasetRun(params) {
1737
2099
  if (this.datasetsById.size === 0) {
1738
2100
  await this.collectDatasets();
1739
2101
  }
1740
2102
  if (this.evaluatorsById.size === 0) {
1741
2103
  await this.collectEvaluators();
1742
2104
  }
1743
- const dataset = this.datasetsById.get(request.datasetId);
2105
+ const dataset = this.datasetsById.get(params.datasetId);
1744
2106
  if (!dataset) {
1745
- throw new Error(`Unknown dataset: ${request.datasetId}`);
2107
+ throw new Error(`Unknown dataset: ${params.datasetId}`);
1746
2108
  }
1747
- const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
2109
+ const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
1748
2110
  if (selectedEvaluators.length === 0) {
1749
2111
  throw new Error("No evaluators selected for run");
1750
2112
  }
1751
- const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1752
- const totalEvaluations = selectedTestCases.reduce(
1753
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1754
- 0
1755
- );
1756
- const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
2113
+ const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
2114
+ const repetitions = normalizeRunRepetitions(params.repetitions);
2115
+ const totalEvaluations = selectedTestCases.length * repetitions;
2116
+ const runConfigTags = [...params.runConfigTags ?? []];
2117
+ const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
1757
2118
  const runId = `run-${crypto.randomUUID()}`;
1758
- const artifactPath = createArtifactPath(
1759
- this.config.artifactDirectory,
1760
- request.datasetId,
1761
- runId
1762
- );
2119
+ const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
1763
2120
  const snapshot = {
1764
2121
  runId,
1765
- datasetId: request.datasetId,
2122
+ datasetId: params.datasetId,
1766
2123
  datasetName: dataset.dataset.getName(),
1767
2124
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1768
2125
  queuedAt: Date.now(),
@@ -1783,7 +2140,7 @@ var EffectRunner = class {
1783
2140
  const queuedEvent = {
1784
2141
  type: "RunQueued",
1785
2142
  runId,
1786
- datasetId: request.datasetId,
2143
+ datasetId: params.datasetId,
1787
2144
  datasetName: dataset.dataset.getName(),
1788
2145
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1789
2146
  totalTestCases: totalEvaluations,
@@ -1797,17 +2154,20 @@ var EffectRunner = class {
1797
2154
  payload: queuedEvent
1798
2155
  })
1799
2156
  );
1800
- const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
1801
2157
  await effect.Effect.runPromise(
1802
2158
  effect.Queue.offer(this.runQueue, {
1803
2159
  runId,
1804
2160
  triggerId,
1805
- datasetId: request.datasetId,
2161
+ datasetId: params.datasetId,
1806
2162
  dataset: dataset.dataset,
1807
2163
  evaluators: selectedEvaluators,
1808
2164
  testCases: selectedTestCases,
1809
2165
  snapshot,
1810
- maxConcurrency
2166
+ maxConcurrency: params.maxConcurrency,
2167
+ globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2168
+ runConfigName: params.runConfigName,
2169
+ runConfigTags,
2170
+ repetitions
1811
2171
  })
1812
2172
  );
1813
2173
  return snapshot;
@@ -1879,15 +2239,26 @@ var EffectRunner = class {
1879
2239
  }
1880
2240
  };
1881
2241
 
2242
+ // src/runner/events.ts
2243
+ var PROGRAMMATIC_RUN_CONFIG = {
2244
+ runConfigName: "programmatic"
2245
+ };
2246
+
1882
2247
  Object.defineProperty(exports, 'S', {
1883
2248
  enumerable: true,
1884
2249
  get: function () { return effect.Schema; }
1885
2250
  });
1886
2251
  exports.Dataset = Dataset;
1887
2252
  exports.Evaluator = Evaluator;
2253
+ exports.EvaluatorNameSchema = EvaluatorNameSchema;
1888
2254
  exports.Metric = Metric;
2255
+ exports.PROGRAMMATIC_RUN_CONFIG = PROGRAMMATIC_RUN_CONFIG;
2256
+ exports.RunConfig = RunConfig;
2257
+ exports.RunConfigNameSchema = RunConfigNameSchema;
1889
2258
  exports.Score = Score;
2259
+ exports.TagSet = TagSet;
1890
2260
  exports.TestCase = TestCase;
2261
+ exports.TestCaseNameSchema = TestCaseNameSchema;
1891
2262
  exports.binaryScore = binaryScore;
1892
2263
  exports.createLogEntry = createLogEntry;
1893
2264
  exports.createRunner = createRunner;
@@ -1895,16 +2266,24 @@ exports.defaultRunnerConfig = defaultRunnerConfig;
1895
2266
  exports.defineConfig = defineConfig;
1896
2267
  exports.deltaScore = deltaScore;
1897
2268
  exports.formatScoreData = formatScoreData;
2269
+ exports.getEvaluatorDisplayLabel = getEvaluatorDisplayLabel;
2270
+ exports.getEvaluatorTagList = getEvaluatorTagList;
1898
2271
  exports.getLogLines = getLogLines;
1899
2272
  exports.getMetricById = getMetricById;
1900
2273
  exports.getScoreById = getScoreById;
2274
+ exports.getTestCaseDisplayLabel = getTestCaseDisplayLabel;
2275
+ exports.getTestCaseTagList = getTestCaseTagList;
1901
2276
  exports.latencyMetric = latencyMetric;
1902
2277
  exports.loadMockData = loadMockData;
1903
2278
  exports.loadRunnerData = loadRunnerData;
2279
+ exports.normalizeOptionalDisplayName = normalizeOptionalDisplayName;
1904
2280
  exports.parseStartupArgs = parseStartupArgs;
1905
2281
  exports.percentScore = percentScore;
1906
2282
  exports.printJsonDiff = printJsonDiff;
1907
2283
  exports.tokenCountMetric = tokenCountMetric;
2284
+ exports.validateEvaluatorName = validateEvaluatorName;
2285
+ exports.validateRunConfigName = validateRunConfigName;
2286
+ exports.validateTestCaseName = validateTestCaseName;
1908
2287
  exports.withRunnerConfig = withRunnerConfig;
1909
2288
  //# sourceMappingURL=out.js.map
1910
2289
  //# sourceMappingURL=index.cjs.map