@m4trix/evals 0.12.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +706 -231
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +707 -232
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +710 -390
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +702 -382
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +289 -108
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +28 -5
- package/dist/index.js +290 -109
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/index.d.ts
CHANGED
|
@@ -74,6 +74,7 @@ interface CliState {
|
|
|
74
74
|
datasetMenuIndex: number;
|
|
75
75
|
runMenuIndex: number;
|
|
76
76
|
detailsScrollOffset: number;
|
|
77
|
+
overviewScrollOffset: number;
|
|
77
78
|
selectedEvaluatorIds: string[];
|
|
78
79
|
evaluatorMenuIndex: number;
|
|
79
80
|
searchQuery: string;
|
|
@@ -91,6 +92,8 @@ interface RunnerDiscoveryConfig {
|
|
|
91
92
|
interface RunnerConfig {
|
|
92
93
|
discovery: RunnerDiscoveryConfig;
|
|
93
94
|
artifactDirectory: string;
|
|
95
|
+
/** Max concurrent test cases per run. Default: 1 (sequential). */
|
|
96
|
+
maxConcurrency: number;
|
|
94
97
|
}
|
|
95
98
|
type RunnerConfigOverrides = Omit<Partial<RunnerConfig>, 'discovery'> & {
|
|
96
99
|
discovery?: Partial<RunnerDiscoveryConfig>;
|
|
@@ -108,6 +111,8 @@ interface M4trixEvalConfigDiscovery {
|
|
|
108
111
|
interface M4trixEvalConfig {
|
|
109
112
|
discovery?: M4trixEvalConfigDiscovery;
|
|
110
113
|
artifactDirectory?: string;
|
|
114
|
+
/** Max concurrent test cases per run. Default: 1 (sequential). */
|
|
115
|
+
maxConcurrency?: number;
|
|
111
116
|
}
|
|
112
117
|
type ConfigType = M4trixEvalConfig;
|
|
113
118
|
type M4trixEvalConfigFactory<TConfig extends ConfigType = ConfigType> = () => TConfig;
|
|
@@ -124,6 +129,7 @@ type InputOrBuilder<T> = T | (() => T);
|
|
|
124
129
|
interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
|
|
125
130
|
name: string;
|
|
126
131
|
tags: string[];
|
|
132
|
+
reruns?: number;
|
|
127
133
|
inputSchema: TI;
|
|
128
134
|
input: InputOrBuilder<Schema.Schema.Type<TI>>;
|
|
129
135
|
outputSchema?: TO;
|
|
@@ -133,6 +139,7 @@ declare class TestCase<TInput = unknown, TOutput = unknown> {
|
|
|
133
139
|
private readonly _config;
|
|
134
140
|
private constructor();
|
|
135
141
|
static describe<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>>(config: TestCaseDescribeConfig<TI, TO>): TestCase<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>>;
|
|
142
|
+
getReruns(): number;
|
|
136
143
|
getName(): string;
|
|
137
144
|
getTags(): string[];
|
|
138
145
|
getInputSchema(): Schema.Schema.Any;
|
|
@@ -225,17 +232,22 @@ interface MetricItem<TData = unknown> {
|
|
|
225
232
|
readonly id: string;
|
|
226
233
|
readonly data: TData;
|
|
227
234
|
}
|
|
235
|
+
interface FormatMetricOptions {
|
|
236
|
+
isAggregated?: boolean;
|
|
237
|
+
}
|
|
228
238
|
interface MetricDef<TData = unknown> {
|
|
229
239
|
readonly id: string;
|
|
230
240
|
readonly name?: string;
|
|
231
|
-
|
|
241
|
+
readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
|
|
242
|
+
format(data: TData, options?: FormatMetricOptions): string;
|
|
232
243
|
make(data: TData): MetricItem<TData>;
|
|
233
244
|
}
|
|
234
245
|
declare const Metric: {
|
|
235
246
|
of<TData>(config: {
|
|
236
247
|
id: string;
|
|
237
248
|
name?: string | undefined;
|
|
238
|
-
format: (data: TData) => string;
|
|
249
|
+
format: (data: TData, options?: FormatMetricOptions) => string;
|
|
250
|
+
aggregate?: ((values: readonly TData[]) => TData) | undefined;
|
|
239
251
|
}): MetricDef<TData>;
|
|
240
252
|
};
|
|
241
253
|
declare function getMetricById(id: string): MetricDef<unknown> | undefined;
|
|
@@ -246,11 +258,15 @@ interface ScoreItem<TData = unknown> {
|
|
|
246
258
|
readonly data: TData;
|
|
247
259
|
readonly passed?: boolean;
|
|
248
260
|
}
|
|
261
|
+
interface FormatScoreOptions {
|
|
262
|
+
isAggregated?: boolean;
|
|
263
|
+
}
|
|
249
264
|
interface ScoreDef<TData = unknown> {
|
|
250
265
|
readonly id: string;
|
|
251
266
|
readonly name?: string;
|
|
252
267
|
readonly displayStrategy: ScoreDisplayStrategy;
|
|
253
|
-
|
|
268
|
+
readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
|
|
269
|
+
format(data: TData, options?: FormatScoreOptions): string;
|
|
254
270
|
make(data: TData, options?: {
|
|
255
271
|
definePassed?: (data: TData) => boolean;
|
|
256
272
|
}): ScoreItem<TData>;
|
|
@@ -260,7 +276,8 @@ declare const Score: {
|
|
|
260
276
|
id: string;
|
|
261
277
|
name?: string | undefined;
|
|
262
278
|
displayStrategy: ScoreDisplayStrategy;
|
|
263
|
-
format: (data: TData) => string;
|
|
279
|
+
format: (data: TData, options?: FormatScoreOptions) => string;
|
|
280
|
+
aggregate?: ((values: readonly TData[]) => TData) | undefined;
|
|
264
281
|
}): ScoreDef<TData>;
|
|
265
282
|
};
|
|
266
283
|
declare function getScoreById(id: string): ScoreDef<unknown> | undefined;
|
|
@@ -326,6 +343,8 @@ type RunnerEvent = {
|
|
|
326
343
|
testCaseName: string;
|
|
327
344
|
completedTestCases: number;
|
|
328
345
|
totalTestCases: number;
|
|
346
|
+
rerunIndex: number;
|
|
347
|
+
rerunTotal: number;
|
|
329
348
|
passed: boolean;
|
|
330
349
|
durationMs: number;
|
|
331
350
|
evaluatorScores: ReadonlyArray<{
|
|
@@ -394,11 +413,15 @@ declare const latencyMetric: MetricDef<LatencyData>;
|
|
|
394
413
|
|
|
395
414
|
interface PercentScoreData {
|
|
396
415
|
value: number;
|
|
416
|
+
stdDev?: number;
|
|
417
|
+
count?: number;
|
|
397
418
|
}
|
|
398
419
|
declare const percentScore: ScoreDef<PercentScoreData>;
|
|
399
420
|
interface BinaryScoreData {
|
|
400
421
|
passed: boolean;
|
|
422
|
+
passedCount?: number;
|
|
423
|
+
totalCount?: number;
|
|
401
424
|
}
|
|
402
425
|
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
403
426
|
|
|
404
|
-
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
427
|
+
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
package/dist/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { Effect, PubSub, Queue, Fiber } from 'effect';
|
|
1
|
+
import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
|
|
2
2
|
export { Schema as S } from 'effect';
|
|
3
|
-
import {
|
|
3
|
+
import { diffLines } from 'diff';
|
|
4
4
|
import { randomUUID } from 'crypto';
|
|
5
5
|
import { existsSync } from 'fs';
|
|
6
6
|
import { resolve as resolve$1, relative, join, dirname } from 'path';
|
|
@@ -309,15 +309,23 @@ var TestCase = class _TestCase {
|
|
|
309
309
|
this._config = config;
|
|
310
310
|
}
|
|
311
311
|
static describe(config) {
|
|
312
|
+
const reruns = config.reruns ?? 1;
|
|
313
|
+
if (reruns < 1 || !Number.isInteger(reruns)) {
|
|
314
|
+
throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
|
|
315
|
+
}
|
|
312
316
|
return new _TestCase({
|
|
313
317
|
name: config.name,
|
|
314
318
|
tags: config.tags,
|
|
319
|
+
reruns,
|
|
315
320
|
inputSchema: config.inputSchema,
|
|
316
321
|
input: config.input,
|
|
317
322
|
outputSchema: config.outputSchema,
|
|
318
323
|
output: config.output
|
|
319
324
|
});
|
|
320
325
|
}
|
|
326
|
+
getReruns() {
|
|
327
|
+
return this._config.reruns;
|
|
328
|
+
}
|
|
321
329
|
getName() {
|
|
322
330
|
return this._config.name;
|
|
323
331
|
}
|
|
@@ -491,6 +499,7 @@ var Metric = {
|
|
|
491
499
|
const def = {
|
|
492
500
|
id: config.id,
|
|
493
501
|
name: config.name,
|
|
502
|
+
aggregate: config.aggregate,
|
|
494
503
|
format: config.format,
|
|
495
504
|
make: (data) => ({ id: config.id, data })
|
|
496
505
|
};
|
|
@@ -510,6 +519,7 @@ var Score = {
|
|
|
510
519
|
id: config.id,
|
|
511
520
|
name: config.name,
|
|
512
521
|
displayStrategy: config.displayStrategy,
|
|
522
|
+
aggregate: config.aggregate,
|
|
513
523
|
format: config.format,
|
|
514
524
|
make: (data, options) => {
|
|
515
525
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
@@ -528,23 +538,75 @@ function getScoreById(id) {
|
|
|
528
538
|
return registry2.get(id);
|
|
529
539
|
}
|
|
530
540
|
|
|
541
|
+
// src/evals/aggregators.ts
|
|
542
|
+
function aggregateAverageWithVariance(values) {
|
|
543
|
+
if (values.length === 0) {
|
|
544
|
+
return { value: 0, count: 0 };
|
|
545
|
+
}
|
|
546
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
547
|
+
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
548
|
+
const mean = sum / values.length;
|
|
549
|
+
let stdDev;
|
|
550
|
+
if (values.length >= 2) {
|
|
551
|
+
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
552
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
553
|
+
}
|
|
554
|
+
return { value: mean, stdDev, count: values.length };
|
|
555
|
+
}
|
|
556
|
+
/**
 * Combines binary results into a single verdict.
 *
 * @param {ReadonlyArray<{ passed: boolean }>} values Individual results.
 * @returns {{ passed: boolean, passedCount: number, totalCount: number }}
 *   `passed` is true only when the list is non-empty and no entry failed;
 *   `passedCount` is the number of truthy `passed` flags and `totalCount`
 *   is the length of the list.
 */
function aggregateAll(values) {
  const totalCount = values.length;
  const passedCount = values.reduce(
    (tally, entry) => (entry.passed ? tally + 1 : tally),
    0
  );
  const anyFailed = values.some((entry) => !entry.passed);
  const passed = totalCount > 0 && !anyFailed;
  return { passed, passedCount, totalCount };
}
|
|
565
|
+
/**
 * Adds up token usage across several samples.
 *
 * @param {ReadonlyArray<{ input?: number, output?: number, inputCached?: number, outputCached?: number }>} values
 *   Usage records; a missing (null/undefined) counter contributes 0.
 * @returns {{ input: number, output: number, inputCached: number, outputCached: number }}
 *   Field-wise totals; all zeros for an empty list.
 */
function aggregateTokenCountSum(values) {
  let input = 0;
  let output = 0;
  let inputCached = 0;
  let outputCached = 0;
  values.forEach((usage) => {
    input += usage.input ?? 0;
    output += usage.output ?? 0;
    inputCached += usage.inputCached ?? 0;
    outputCached += usage.outputCached ?? 0;
  });
  return { input, output, inputCached, outputCached };
}
|
|
582
|
+
/**
 * Averages latency samples.
 *
 * @param {ReadonlyArray<{ ms: number }>} values Latency samples in milliseconds.
 * @returns {{ ms: number }} The arithmetic mean of `ms`, or `{ ms: 0 }` when
 *   there are no samples.
 */
function aggregateLatencyAverage(values) {
  const sampleCount = values.length;
  if (sampleCount === 0) {
    return { ms: 0 };
  }
  let totalMs = 0;
  values.forEach((sample) => {
    totalMs += sample.ms;
  });
  return { ms: totalMs / sampleCount };
}
|
|
589
|
+
|
|
531
590
|
// src/evals/metrics/standard.ts
|
|
532
591
|
var tokenCountMetric = Metric.of({
|
|
533
592
|
id: "token-count",
|
|
534
593
|
name: "Tokens",
|
|
535
|
-
|
|
594
|
+
aggregate: aggregateTokenCountSum,
|
|
595
|
+
format: (data, options) => {
|
|
536
596
|
const input = data.input ?? 0;
|
|
537
597
|
const output = data.output ?? 0;
|
|
538
598
|
const inputCached = data.inputCached ?? 0;
|
|
539
599
|
const outputCached = data.outputCached ?? 0;
|
|
540
600
|
const cached = inputCached + outputCached;
|
|
541
|
-
|
|
601
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
602
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
542
603
|
}
|
|
543
604
|
});
|
|
544
605
|
var latencyMetric = Metric.of({
|
|
545
606
|
id: "latency",
|
|
546
607
|
name: "Latency",
|
|
547
|
-
|
|
608
|
+
aggregate: aggregateLatencyAverage,
|
|
609
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
548
610
|
});
|
|
549
611
|
|
|
550
612
|
// src/evals/scores/standard.ts
|
|
@@ -552,16 +614,59 @@ var percentScore = Score.of({
|
|
|
552
614
|
id: "percent",
|
|
553
615
|
name: "Score",
|
|
554
616
|
displayStrategy: "bar",
|
|
555
|
-
format: (data) =>
|
|
617
|
+
format: (data, options) => {
|
|
618
|
+
if (options?.isAggregated) {
|
|
619
|
+
return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
|
|
620
|
+
}
|
|
621
|
+
return data.value.toFixed(2);
|
|
622
|
+
},
|
|
623
|
+
aggregate: aggregateAverageWithVariance
|
|
556
624
|
});
|
|
557
625
|
var binaryScore = Score.of({
|
|
558
626
|
id: "binary",
|
|
559
627
|
name: "Result",
|
|
560
628
|
displayStrategy: "passFail",
|
|
561
|
-
format: (data) =>
|
|
629
|
+
format: (data, options) => {
|
|
630
|
+
if (options?.isAggregated) {
|
|
631
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
632
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
633
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
634
|
+
}
|
|
635
|
+
return base;
|
|
636
|
+
}
|
|
637
|
+
return data.passed ? "PASSED" : "NOT PASSED";
|
|
638
|
+
},
|
|
639
|
+
aggregate: aggregateAll
|
|
562
640
|
});
|
|
641
|
+
function toJsonLines(value) {
|
|
642
|
+
try {
|
|
643
|
+
return JSON.stringify(value, null, 2);
|
|
644
|
+
} catch {
|
|
645
|
+
return String(value);
|
|
646
|
+
}
|
|
647
|
+
}
|
|
648
|
+
/**
 * Turns a list of line-diff chunks into a printable text diff.
 *
 * Each chunk's `value` is split on "\n" (one trailing empty piece, if any, is
 * dropped) and every resulting line is emitted as `<marker> <line>`, where the
 * marker is "+" for added chunks, "-" for removed chunks and " " otherwise.
 *
 * @param {Iterable<{ value: string, added?: boolean, removed?: boolean }>} changes
 * @returns {string} The marked lines joined with "\n".
 */
function formatDiffString(changes) {
  const markerFor = (change) => {
    if (change.added) {
      return "+";
    }
    if (change.removed) {
      return "-";
    }
    return " ";
  };
  const output = [];
  for (const change of changes) {
    const marker = markerFor(change);
    const pieces = change.value.split("\n");
    // A chunk ending in "\n" leaves one empty trailing piece; skip it.
    const endsEmpty = pieces[pieces.length - 1] === "";
    const keep = endsEmpty ? pieces.length - 1 : pieces.length;
    for (let index = 0; index < keep; index += 1) {
      output.push(marker + " " + pieces[index]);
    }
  }
  return output.join("\n");
}
|
|
662
|
+
/**
 * Builds a line-oriented text diff between two values.
 *
 * Both values are serialized with `toJsonLines`, compared with `diffLines`,
 * and the resulting chunks are rendered by `formatDiffString`.
 *
 * @param {unknown} expected The reference value.
 * @param {unknown} actual The value compared against the reference.
 * @returns {string} The rendered diff text.
 */
function createDiffString(expected, actual) {
  return formatDiffString(
    diffLines(toJsonLines(expected), toJsonLines(actual))
  );
}
|
|
563
668
|
function createDiffLogEntry(expected, actual, options) {
|
|
564
|
-
const diff =
|
|
669
|
+
const diff = createDiffString(expected, actual);
|
|
565
670
|
return {
|
|
566
671
|
type: "diff",
|
|
567
672
|
label: options?.label,
|
|
@@ -571,8 +676,22 @@ function createDiffLogEntry(expected, actual, options) {
|
|
|
571
676
|
};
|
|
572
677
|
}
|
|
573
678
|
/**
 * Prints a line diff of two values to the console and returns the printed text.
 *
 * With `options.color` set, removed lines are wrapped in ANSI red and added
 * lines in ANSI green. The decision is made from the diff marker in the first
 * column of each line. The line's content is never inspected, so an unchanged
 * line whose JSON text begins with "-" or "+" (for example a negative number
 * inside an array) is left uncoloured.
 *
 * @param {unknown} expected The reference value.
 * @param {unknown} actual The value compared against the reference.
 * @param {{ color?: boolean }} [options] Output options; `color` enables ANSI colouring.
 * @returns {string} The diff text (including ANSI escapes when colouring is enabled).
 */
function printJsonDiff(expected, actual, options = {}) {
  const diff = createDiffString(expected, actual);
  if (!options.color) {
    console.log(diff || "(no differences)");
    return diff;
  }
  const colorize = (line) => {
    // Column 0 holds the marker written by the diff formatter: "-", "+" or " ".
    if (line.startsWith("-")) {
      return `\x1B[31m${line}\x1B[0m`;
    }
    if (line.startsWith("+")) {
      return `\x1B[32m${line}\x1B[0m`;
    }
    return line;
  };
  const colored = diff.split("\n").map(colorize).join("\n");
  console.log(colored || "(no differences)");
  return colored;
}
|
|
@@ -599,7 +718,8 @@ var defaultRunnerConfig = {
|
|
|
599
718
|
],
|
|
600
719
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
601
720
|
},
|
|
602
|
-
artifactDirectory: ".eval-results"
|
|
721
|
+
artifactDirectory: ".eval-results",
|
|
722
|
+
maxConcurrency: 1
|
|
603
723
|
};
|
|
604
724
|
function toRunnerConfigOverrides(config) {
|
|
605
725
|
if (!config) {
|
|
@@ -632,6 +752,9 @@ function toRunnerConfigOverrides(config) {
|
|
|
632
752
|
if (config.artifactDirectory !== void 0) {
|
|
633
753
|
overrides.artifactDirectory = config.artifactDirectory;
|
|
634
754
|
}
|
|
755
|
+
if (config.maxConcurrency !== void 0) {
|
|
756
|
+
overrides.maxConcurrency = config.maxConcurrency;
|
|
757
|
+
}
|
|
635
758
|
if (Object.keys(discovery).length > 0) {
|
|
636
759
|
overrides.discovery = discovery;
|
|
637
760
|
}
|
|
@@ -905,6 +1028,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
905
1028
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
906
1029
|
);
|
|
907
1030
|
}
|
|
1031
|
+
/**
 * Runs every evaluator of a run against one test case, once per configured
 * rerun, and reports progress after each rerun.
 *
 * For each rerun it:
 *  - evaluates the test case with every evaluator that exposes an evaluate function,
 *  - bumps the shared completed-evaluations counter,
 *  - publishes a "TestCaseProgress" event and queues the same event for persistence,
 *  - updates the run snapshot's `completedTestCases`.
 * After all reruns, the test case counts as passed only if every rerun passed;
 * the shared passed/failed counters and the snapshot are updated accordingly.
 *
 * Evaluator failures (a rejected promise, or an exception thrown while
 * resolving context, evaluating, or normalizing the result) are handled
 * inside a single async thunk with an ordinary try/catch. A try/catch placed
 * around `yield* Effect.promise(...)` in the generator does not intercept a
 * rejected promise, which would otherwise abort the whole run instead of
 * recording a failed evaluator.
 *
 * @param task Run task; `runId`, `evaluators` and `snapshot.artifactPath` are read.
 * @param testCaseItem Collected test case (`id` and `testCase`).
 * @param totalEvaluations Total evaluations in the run, reported as `totalTestCases`.
 * @param publishEvent Function returning an effect that publishes a runner event.
 * @param persistenceQueue Queue receiving `{ runId, artifactPath, payload }` items.
 * @param updateSnapshot Function `(runId, updater)` applying an update to the run snapshot.
 * @param completedRef Ref counting completed evaluations (one per rerun).
 * @param passedRef Ref counting test cases whose reruns all passed.
 * @param failedRef Ref counting test cases with at least one failed rerun.
 * @returns An Effect that performs the work described above.
 */
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
  return Effect.gen(function* () {
    // Test cases without a getReruns method are run once.
    const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
    const rerunPassed = [];
    for (let r = 0; r < reruns; r++) {
      const started = Date.now();
      const evaluatorScores = [];
      let testCaseError;
      const output = readOutput(testCaseItem.testCase);
      for (const { id: evaluatorId, evaluator } of task.evaluators) {
        const evaluateFn = evaluator.getEvaluateFn();
        if (!evaluateFn) {
          continue;
        }
        // The thunk never rejects: every failure is converted into a failed
        // evaluator entry plus an error message.
        const outcome = yield* Effect.promise(async () => {
          const logs = [];
          const logDiff = (expected, actual, options) => {
            logs.push(createDiffLogEntry(expected, actual, options));
          };
          try {
            const ctx = await evaluator.resolveContext();
            const result = await evaluateFn({
              input: testCaseItem.testCase.getInput(),
              ctx,
              output,
              logDiff
            });
            const { scores, metrics } = normalizeResult(result);
            const passed2 = computeEvaluatorPassed(evaluator, result, scores);
            return {
              entry: {
                evaluatorId,
                scores,
                passed: passed2,
                metrics,
                logs: logs.length > 0 ? logs : void 0
              },
              errorMessage: void 0
            };
          } catch (error) {
            return {
              entry: {
                evaluatorId,
                scores: [],
                passed: false
              },
              errorMessage: error instanceof Error ? error.message : "Evaluator execution failed"
            };
          }
        });
        evaluatorScores.push(outcome.entry);
        if (outcome.errorMessage !== void 0) {
          // The most recent evaluator error is the one reported for this rerun.
          testCaseError = outcome.errorMessage;
        }
      }
      const rerunPassedThis = evaluatorScores.every((s) => s.passed);
      rerunPassed.push(rerunPassedThis);
      // Increment and read the shared counter in one step.
      const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
        n + 1,
        n + 1
      ]);
      const progressEvent = {
        type: "TestCaseProgress",
        runId: task.runId,
        testCaseId: testCaseItem.id,
        testCaseName: testCaseItem.testCase.getName(),
        completedTestCases: completedEvaluations,
        totalTestCases: totalEvaluations,
        rerunIndex: r + 1,
        rerunTotal: reruns,
        passed: rerunPassedThis,
        durationMs: Date.now() - started,
        evaluatorScores,
        output,
        errorMessage: testCaseError
      };
      updateSnapshot(task.runId, (snapshot) => ({
        ...snapshot,
        completedTestCases: completedEvaluations
      }));
      yield* publishEvent(progressEvent);
      yield* Queue.offer(persistenceQueue, {
        runId: task.runId,
        artifactPath: task.snapshot.artifactPath,
        payload: progressEvent
      });
    }
    // A test case passes only when every one of its reruns passed.
    const testCasePassed = rerunPassed.every(Boolean);
    if (testCasePassed) {
      yield* Ref.update(passedRef, (n) => n + 1);
    } else {
      yield* Ref.update(failedRef, (n) => n + 1);
    }
    const [passed, failed] = yield* Effect.all([
      Ref.get(passedRef),
      Ref.get(failedRef)
    ]);
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      passedTestCases: passed,
      failedTestCases: failed
    }));
  });
}
|
|
908
1130
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
|
|
909
1131
|
const startedAt = Date.now();
|
|
910
1132
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
@@ -917,104 +1139,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
917
1139
|
runId: task.runId,
|
|
918
1140
|
startedAt
|
|
919
1141
|
});
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
)
|
|
950
|
-
);
|
|
951
|
-
const { scores, metrics } = normalizeResult(result);
|
|
952
|
-
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
953
|
-
evaluatorScores.push({
|
|
954
|
-
evaluatorId,
|
|
955
|
-
scores,
|
|
956
|
-
passed,
|
|
957
|
-
metrics,
|
|
958
|
-
logs: logs.length > 0 ? logs : void 0
|
|
959
|
-
});
|
|
960
|
-
} catch (error) {
|
|
961
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
962
|
-
evaluatorScores.push({
|
|
963
|
-
evaluatorId,
|
|
964
|
-
scores: [],
|
|
965
|
-
passed: false
|
|
966
|
-
});
|
|
967
|
-
}
|
|
968
|
-
}
|
|
969
|
-
const testCasePassed = evaluatorScores.every((s) => s.passed);
|
|
970
|
-
completedTestCases += 1;
|
|
971
|
-
if (testCasePassed) {
|
|
972
|
-
passedTestCases += 1;
|
|
973
|
-
} else {
|
|
974
|
-
failedTestCases += 1;
|
|
975
|
-
}
|
|
976
|
-
const progressEvent = {
|
|
977
|
-
type: "TestCaseProgress",
|
|
978
|
-
runId: task.runId,
|
|
979
|
-
testCaseId: testCaseItem.id,
|
|
980
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
981
|
-
completedTestCases,
|
|
982
|
-
totalTestCases: task.testCases.length,
|
|
983
|
-
passed: testCasePassed,
|
|
984
|
-
durationMs: Date.now() - started,
|
|
985
|
-
evaluatorScores,
|
|
986
|
-
output,
|
|
987
|
-
errorMessage: testCaseError
|
|
988
|
-
};
|
|
989
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
990
|
-
...snapshot,
|
|
991
|
-
completedTestCases,
|
|
992
|
-
passedTestCases,
|
|
993
|
-
failedTestCases
|
|
994
|
-
}));
|
|
995
|
-
yield* publishEvent(progressEvent);
|
|
996
|
-
yield* Queue.offer(persistenceQueue, {
|
|
997
|
-
runId: task.runId,
|
|
998
|
-
artifactPath: task.snapshot.artifactPath,
|
|
999
|
-
payload: progressEvent
|
|
1000
|
-
});
|
|
1001
|
-
}
|
|
1142
|
+
const totalEvaluations = task.testCases.reduce(
|
|
1143
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1144
|
+
0
|
|
1145
|
+
);
|
|
1146
|
+
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1147
|
+
const completedRef = yield* Ref.make(0);
|
|
1148
|
+
const passedRef = yield* Ref.make(0);
|
|
1149
|
+
const failedRef = yield* Ref.make(0);
|
|
1150
|
+
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
1151
|
+
task,
|
|
1152
|
+
testCaseItem,
|
|
1153
|
+
totalEvaluations,
|
|
1154
|
+
publishEvent,
|
|
1155
|
+
persistenceQueue,
|
|
1156
|
+
updateSnapshot,
|
|
1157
|
+
completedRef,
|
|
1158
|
+
passedRef,
|
|
1159
|
+
failedRef
|
|
1160
|
+
);
|
|
1161
|
+
yield* Effect.forEach(
|
|
1162
|
+
task.testCases,
|
|
1163
|
+
processTestCase,
|
|
1164
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1165
|
+
);
|
|
1166
|
+
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
1167
|
+
Ref.get(completedRef),
|
|
1168
|
+
Ref.get(passedRef),
|
|
1169
|
+
Ref.get(failedRef)
|
|
1170
|
+
]);
|
|
1002
1171
|
const finishedAt = Date.now();
|
|
1003
1172
|
const completedEvent = {
|
|
1004
1173
|
type: "RunCompleted",
|
|
1005
1174
|
runId: task.runId,
|
|
1006
1175
|
finishedAt,
|
|
1007
|
-
passedTestCases,
|
|
1008
|
-
failedTestCases,
|
|
1176
|
+
passedTestCases: passedUniqueTestCases,
|
|
1177
|
+
failedTestCases: failedUniqueTestCases,
|
|
1009
1178
|
totalTestCases: task.testCases.length,
|
|
1010
1179
|
artifactPath: task.snapshot.artifactPath
|
|
1011
1180
|
};
|
|
1012
1181
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
1013
1182
|
...snapshot,
|
|
1014
1183
|
status: "completed",
|
|
1015
|
-
completedTestCases,
|
|
1016
|
-
passedTestCases,
|
|
1017
|
-
failedTestCases,
|
|
1184
|
+
completedTestCases: completedEvaluations,
|
|
1185
|
+
passedTestCases: passedUniqueTestCases,
|
|
1186
|
+
failedTestCases: failedUniqueTestCases,
|
|
1018
1187
|
finishedAt
|
|
1019
1188
|
}));
|
|
1020
1189
|
yield* publishEvent(completedEvent);
|
|
@@ -1102,7 +1271,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
1102
1271
|
const artifactPath = filePath;
|
|
1103
1272
|
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1104
1273
|
const progress = aggregateTestCaseProgress(lines);
|
|
1105
|
-
const completedTestCases = runCompleted
|
|
1274
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1106
1275
|
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1107
1276
|
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1108
1277
|
return {
|
|
@@ -1124,23 +1293,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
1124
1293
|
}
|
|
1125
1294
|
/**
 * Derives run progress counters from the JSON lines of a run artifact.
 *
 * Only events whose `type` is "TestCaseProgress" are considered; lines that
 * are not valid JSON (or do not parse to an object with a `type`) are ignored.
 * `completedTestCases` is taken from the last progress event that carries the
 * field. Progress events sharing a `testCaseId` are folded together: that
 * test case counts as passed only if all of its events passed.
 *
 * @param {Iterable<string>} lines Raw artifact lines, one JSON document each.
 * @returns {{ completedTestCases: number, passedTestCases: number, failedTestCases: number }}
 */
function aggregateTestCaseProgress(lines) {
  const outcomeById = /* @__PURE__ */ new Map();
  let completedTestCases = 0;
  for (const rawLine of lines) {
    let parsed;
    try {
      parsed = JSON.parse(rawLine);
    } catch {
      // Best effort: unparseable lines are skipped.
      continue;
    }
    if (parsed === null || parsed === void 0 || parsed.type !== "TestCaseProgress") {
      continue;
    }
    if (parsed.completedTestCases !== null && parsed.completedTestCases !== void 0) {
      completedTestCases = parsed.completedTestCases;
    }
    const previous = outcomeById.get(parsed.testCaseId);
    const merged = previous === void 0 ? parsed.passed : previous && parsed.passed;
    outcomeById.set(parsed.testCaseId, merged);
  }
  const outcomes = [...outcomeById.values()];
  const passedTestCases = outcomes.filter((outcome) => Boolean(outcome)).length;
  const failedTestCases = outcomes.length - passedTestCases;
  return { completedTestCases, passedTestCases, failedTestCases };
}
|
|
1146
1321
|
async function appendJsonLine(artifactPath, payload) {
|
|
@@ -1335,6 +1510,10 @@ var EffectRunner = class {
|
|
|
1335
1510
|
throw new Error("No evaluators selected for run");
|
|
1336
1511
|
}
|
|
1337
1512
|
const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
|
|
1513
|
+
const totalEvaluations = selectedTestCases.reduce(
|
|
1514
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1515
|
+
0
|
|
1516
|
+
);
|
|
1338
1517
|
const runId = `run-${randomUUID()}`;
|
|
1339
1518
|
const artifactPath = createArtifactPath(
|
|
1340
1519
|
this.config.artifactDirectory,
|
|
@@ -1347,7 +1526,7 @@ var EffectRunner = class {
|
|
|
1347
1526
|
datasetName: dataset.dataset.getName(),
|
|
1348
1527
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1349
1528
|
queuedAt: Date.now(),
|
|
1350
|
-
totalTestCases:
|
|
1529
|
+
totalTestCases: totalEvaluations,
|
|
1351
1530
|
completedTestCases: 0,
|
|
1352
1531
|
passedTestCases: 0,
|
|
1353
1532
|
failedTestCases: 0,
|
|
@@ -1361,7 +1540,7 @@ var EffectRunner = class {
|
|
|
1361
1540
|
datasetId: request.datasetId,
|
|
1362
1541
|
datasetName: dataset.dataset.getName(),
|
|
1363
1542
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1364
|
-
totalTestCases:
|
|
1543
|
+
totalTestCases: totalEvaluations,
|
|
1365
1544
|
artifactPath
|
|
1366
1545
|
};
|
|
1367
1546
|
await Effect.runPromise(this.publishEvent(queuedEvent));
|
|
@@ -1372,6 +1551,7 @@ var EffectRunner = class {
|
|
|
1372
1551
|
payload: queuedEvent
|
|
1373
1552
|
})
|
|
1374
1553
|
);
|
|
1554
|
+
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1375
1555
|
await Effect.runPromise(
|
|
1376
1556
|
Queue.offer(this.runQueue, {
|
|
1377
1557
|
runId,
|
|
@@ -1379,7 +1559,8 @@ var EffectRunner = class {
|
|
|
1379
1559
|
dataset: dataset.dataset,
|
|
1380
1560
|
evaluators: selectedEvaluators,
|
|
1381
1561
|
testCases: selectedTestCases,
|
|
1382
|
-
snapshot
|
|
1562
|
+
snapshot,
|
|
1563
|
+
maxConcurrency
|
|
1383
1564
|
})
|
|
1384
1565
|
);
|
|
1385
1566
|
return snapshot;
|