@m4trix/evals 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +599 -224
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +600 -225
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +214 -105
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +215 -106
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +217 -104
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +23 -5
- package/dist/index.js +218 -105
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -91,6 +91,8 @@ interface RunnerDiscoveryConfig {
|
|
|
91
91
|
interface RunnerConfig {
|
|
92
92
|
discovery: RunnerDiscoveryConfig;
|
|
93
93
|
artifactDirectory: string;
|
|
94
|
+
/** Max concurrent test cases per run. Default: 1 (sequential). */
|
|
95
|
+
maxConcurrency: number;
|
|
94
96
|
}
|
|
95
97
|
type RunnerConfigOverrides = Omit<Partial<RunnerConfig>, 'discovery'> & {
|
|
96
98
|
discovery?: Partial<RunnerDiscoveryConfig>;
|
|
@@ -108,6 +110,8 @@ interface M4trixEvalConfigDiscovery {
|
|
|
108
110
|
interface M4trixEvalConfig {
|
|
109
111
|
discovery?: M4trixEvalConfigDiscovery;
|
|
110
112
|
artifactDirectory?: string;
|
|
113
|
+
/** Max concurrent test cases per run. Default: 1 (sequential). */
|
|
114
|
+
maxConcurrency?: number;
|
|
111
115
|
}
|
|
112
116
|
type ConfigType = M4trixEvalConfig;
|
|
113
117
|
type M4trixEvalConfigFactory<TConfig extends ConfigType = ConfigType> = () => TConfig;
|
|
@@ -124,6 +128,7 @@ type InputOrBuilder<T> = T | (() => T);
|
|
|
124
128
|
interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
|
|
125
129
|
name: string;
|
|
126
130
|
tags: string[];
|
|
131
|
+
reruns?: number;
|
|
127
132
|
inputSchema: TI;
|
|
128
133
|
input: InputOrBuilder<Schema.Schema.Type<TI>>;
|
|
129
134
|
outputSchema?: TO;
|
|
@@ -133,6 +138,7 @@ declare class TestCase<TInput = unknown, TOutput = unknown> {
|
|
|
133
138
|
private readonly _config;
|
|
134
139
|
private constructor();
|
|
135
140
|
static describe<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>>(config: TestCaseDescribeConfig<TI, TO>): TestCase<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>>;
|
|
141
|
+
getReruns(): number;
|
|
136
142
|
getName(): string;
|
|
137
143
|
getTags(): string[];
|
|
138
144
|
getInputSchema(): Schema.Schema.Any;
|
|
@@ -225,17 +231,22 @@ interface MetricItem<TData = unknown> {
|
|
|
225
231
|
readonly id: string;
|
|
226
232
|
readonly data: TData;
|
|
227
233
|
}
|
|
234
|
+
interface FormatMetricOptions {
|
|
235
|
+
isAggregated?: boolean;
|
|
236
|
+
}
|
|
228
237
|
interface MetricDef<TData = unknown> {
|
|
229
238
|
readonly id: string;
|
|
230
239
|
readonly name?: string;
|
|
231
|
-
|
|
240
|
+
readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
|
|
241
|
+
format(data: TData, options?: FormatMetricOptions): string;
|
|
232
242
|
make(data: TData): MetricItem<TData>;
|
|
233
243
|
}
|
|
234
244
|
declare const Metric: {
|
|
235
245
|
of<TData>(config: {
|
|
236
246
|
id: string;
|
|
237
247
|
name?: string | undefined;
|
|
238
|
-
format: (data: TData) => string;
|
|
248
|
+
format: (data: TData, options?: FormatMetricOptions) => string;
|
|
249
|
+
aggregate?: ((values: readonly TData[]) => TData) | undefined;
|
|
239
250
|
}): MetricDef<TData>;
|
|
240
251
|
};
|
|
241
252
|
declare function getMetricById(id: string): MetricDef<unknown> | undefined;
|
|
@@ -246,11 +257,15 @@ interface ScoreItem<TData = unknown> {
|
|
|
246
257
|
readonly data: TData;
|
|
247
258
|
readonly passed?: boolean;
|
|
248
259
|
}
|
|
260
|
+
interface FormatScoreOptions {
|
|
261
|
+
isAggregated?: boolean;
|
|
262
|
+
}
|
|
249
263
|
interface ScoreDef<TData = unknown> {
|
|
250
264
|
readonly id: string;
|
|
251
265
|
readonly name?: string;
|
|
252
266
|
readonly displayStrategy: ScoreDisplayStrategy;
|
|
253
|
-
|
|
267
|
+
readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
|
|
268
|
+
format(data: TData, options?: FormatScoreOptions): string;
|
|
254
269
|
make(data: TData, options?: {
|
|
255
270
|
definePassed?: (data: TData) => boolean;
|
|
256
271
|
}): ScoreItem<TData>;
|
|
@@ -260,7 +275,8 @@ declare const Score: {
|
|
|
260
275
|
id: string;
|
|
261
276
|
name?: string | undefined;
|
|
262
277
|
displayStrategy: ScoreDisplayStrategy;
|
|
263
|
-
format: (data: TData) => string;
|
|
278
|
+
format: (data: TData, options?: FormatScoreOptions) => string;
|
|
279
|
+
aggregate?: ((values: readonly TData[]) => TData) | undefined;
|
|
264
280
|
}): ScoreDef<TData>;
|
|
265
281
|
};
|
|
266
282
|
declare function getScoreById(id: string): ScoreDef<unknown> | undefined;
|
|
@@ -326,6 +342,8 @@ type RunnerEvent = {
|
|
|
326
342
|
testCaseName: string;
|
|
327
343
|
completedTestCases: number;
|
|
328
344
|
totalTestCases: number;
|
|
345
|
+
rerunIndex: number;
|
|
346
|
+
rerunTotal: number;
|
|
329
347
|
passed: boolean;
|
|
330
348
|
durationMs: number;
|
|
331
349
|
evaluatorScores: ReadonlyArray<{
|
|
@@ -401,4 +419,4 @@ interface BinaryScoreData {
|
|
|
401
419
|
}
|
|
402
420
|
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
403
421
|
|
|
404
|
-
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
422
|
+
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
package/dist/index.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Effect, PubSub, Queue, Fiber } from 'effect';
|
|
1
|
+
import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
|
|
2
2
|
export { Schema as S } from 'effect';
|
|
3
3
|
import { diffString } from 'json-diff';
|
|
4
4
|
import { randomUUID } from 'crypto';
|
|
@@ -309,15 +309,23 @@ var TestCase = class _TestCase {
|
|
|
309
309
|
this._config = config;
|
|
310
310
|
}
|
|
311
311
|
static describe(config) {
|
|
312
|
+
const reruns = config.reruns ?? 1;
|
|
313
|
+
if (reruns < 1 || !Number.isInteger(reruns)) {
|
|
314
|
+
throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
|
|
315
|
+
}
|
|
312
316
|
return new _TestCase({
|
|
313
317
|
name: config.name,
|
|
314
318
|
tags: config.tags,
|
|
319
|
+
reruns,
|
|
315
320
|
inputSchema: config.inputSchema,
|
|
316
321
|
input: config.input,
|
|
317
322
|
outputSchema: config.outputSchema,
|
|
318
323
|
output: config.output
|
|
319
324
|
});
|
|
320
325
|
}
|
|
326
|
+
getReruns() {
|
|
327
|
+
return this._config.reruns;
|
|
328
|
+
}
|
|
321
329
|
getName() {
|
|
322
330
|
return this._config.name;
|
|
323
331
|
}
|
|
@@ -491,6 +499,7 @@ var Metric = {
|
|
|
491
499
|
const def = {
|
|
492
500
|
id: config.id,
|
|
493
501
|
name: config.name,
|
|
502
|
+
aggregate: config.aggregate,
|
|
494
503
|
format: config.format,
|
|
495
504
|
make: (data) => ({ id: config.id, data })
|
|
496
505
|
};
|
|
@@ -510,6 +519,7 @@ var Score = {
|
|
|
510
519
|
id: config.id,
|
|
511
520
|
name: config.name,
|
|
512
521
|
displayStrategy: config.displayStrategy,
|
|
522
|
+
aggregate: config.aggregate,
|
|
513
523
|
format: config.format,
|
|
514
524
|
make: (data, options) => {
|
|
515
525
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
@@ -528,23 +538,62 @@ function getScoreById(id) {
|
|
|
528
538
|
return registry2.get(id);
|
|
529
539
|
}
|
|
530
540
|
|
|
541
|
+
// src/evals/aggregators.ts
|
|
542
|
+
function aggregateAverage(values) {
|
|
543
|
+
if (values.length === 0) {
|
|
544
|
+
return { value: 0 };
|
|
545
|
+
}
|
|
546
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
547
|
+
return { value: sum / values.length };
|
|
548
|
+
}
|
|
549
|
+
function aggregateAll(values) {
|
|
550
|
+
return { passed: values.length > 0 && values.every((v) => v.passed) };
|
|
551
|
+
}
|
|
552
|
+
function aggregateTokenCountSum(values) {
|
|
553
|
+
const initial = {
|
|
554
|
+
input: 0,
|
|
555
|
+
output: 0,
|
|
556
|
+
inputCached: 0,
|
|
557
|
+
outputCached: 0
|
|
558
|
+
};
|
|
559
|
+
return values.reduce(
|
|
560
|
+
(acc, v) => ({
|
|
561
|
+
input: acc.input + (v.input ?? 0),
|
|
562
|
+
output: acc.output + (v.output ?? 0),
|
|
563
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
564
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
565
|
+
}),
|
|
566
|
+
initial
|
|
567
|
+
);
|
|
568
|
+
}
|
|
569
|
+
function aggregateLatencyAverage(values) {
|
|
570
|
+
if (values.length === 0) {
|
|
571
|
+
return { ms: 0 };
|
|
572
|
+
}
|
|
573
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
574
|
+
return { ms: sum / values.length };
|
|
575
|
+
}
|
|
576
|
+
|
|
531
577
|
// src/evals/metrics/standard.ts
|
|
532
578
|
var tokenCountMetric = Metric.of({
|
|
533
579
|
id: "token-count",
|
|
534
580
|
name: "Tokens",
|
|
535
|
-
|
|
581
|
+
aggregate: aggregateTokenCountSum,
|
|
582
|
+
format: (data, options) => {
|
|
536
583
|
const input = data.input ?? 0;
|
|
537
584
|
const output = data.output ?? 0;
|
|
538
585
|
const inputCached = data.inputCached ?? 0;
|
|
539
586
|
const outputCached = data.outputCached ?? 0;
|
|
540
587
|
const cached = inputCached + outputCached;
|
|
541
|
-
|
|
588
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
589
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
542
590
|
}
|
|
543
591
|
});
|
|
544
592
|
var latencyMetric = Metric.of({
|
|
545
593
|
id: "latency",
|
|
546
594
|
name: "Latency",
|
|
547
|
-
|
|
595
|
+
aggregate: aggregateLatencyAverage,
|
|
596
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
548
597
|
});
|
|
549
598
|
|
|
550
599
|
// src/evals/scores/standard.ts
|
|
@@ -552,13 +601,15 @@ var percentScore = Score.of({
|
|
|
552
601
|
id: "percent",
|
|
553
602
|
name: "Score",
|
|
554
603
|
displayStrategy: "bar",
|
|
555
|
-
format: (data) => data.value.toFixed(2)
|
|
604
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
|
|
605
|
+
aggregate: aggregateAverage
|
|
556
606
|
});
|
|
557
607
|
var binaryScore = Score.of({
|
|
558
608
|
id: "binary",
|
|
559
609
|
name: "Result",
|
|
560
610
|
displayStrategy: "passFail",
|
|
561
|
-
format: (data) => data.passed ? "PASSED" : "NOT PASSED"
|
|
611
|
+
format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
|
|
612
|
+
aggregate: aggregateAll
|
|
562
613
|
});
|
|
563
614
|
function createDiffLogEntry(expected, actual, options) {
|
|
564
615
|
const diff = diffString(expected, actual, { color: false });
|
|
@@ -599,7 +650,8 @@ var defaultRunnerConfig = {
|
|
|
599
650
|
],
|
|
600
651
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
601
652
|
},
|
|
602
|
-
artifactDirectory: ".eval-results"
|
|
653
|
+
artifactDirectory: ".eval-results",
|
|
654
|
+
maxConcurrency: 1
|
|
603
655
|
};
|
|
604
656
|
function toRunnerConfigOverrides(config) {
|
|
605
657
|
if (!config) {
|
|
@@ -632,6 +684,9 @@ function toRunnerConfigOverrides(config) {
|
|
|
632
684
|
if (config.artifactDirectory !== void 0) {
|
|
633
685
|
overrides.artifactDirectory = config.artifactDirectory;
|
|
634
686
|
}
|
|
687
|
+
if (config.maxConcurrency !== void 0) {
|
|
688
|
+
overrides.maxConcurrency = config.maxConcurrency;
|
|
689
|
+
}
|
|
635
690
|
if (Object.keys(discovery).length > 0) {
|
|
636
691
|
overrides.discovery = discovery;
|
|
637
692
|
}
|
|
@@ -905,6 +960,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
905
960
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
906
961
|
);
|
|
907
962
|
}
|
|
963
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
964
|
+
return Effect.gen(function* () {
|
|
965
|
+
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
966
|
+
const rerunPassed = [];
|
|
967
|
+
for (let r = 0; r < reruns; r++) {
|
|
968
|
+
const started = Date.now();
|
|
969
|
+
const evaluatorScores = [];
|
|
970
|
+
let testCaseError;
|
|
971
|
+
const output = readOutput(testCaseItem.testCase);
|
|
972
|
+
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
973
|
+
const evaluateFn = evaluator.getEvaluateFn();
|
|
974
|
+
if (!evaluateFn) {
|
|
975
|
+
continue;
|
|
976
|
+
}
|
|
977
|
+
try {
|
|
978
|
+
const logs = [];
|
|
979
|
+
const logDiff = (expected, actual, options) => {
|
|
980
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
981
|
+
};
|
|
982
|
+
const ctx = yield* Effect.promise(
|
|
983
|
+
() => Promise.resolve(evaluator.resolveContext())
|
|
984
|
+
);
|
|
985
|
+
const result = yield* Effect.promise(
|
|
986
|
+
() => Promise.resolve(
|
|
987
|
+
evaluateFn({
|
|
988
|
+
input: testCaseItem.testCase.getInput(),
|
|
989
|
+
ctx,
|
|
990
|
+
output,
|
|
991
|
+
logDiff
|
|
992
|
+
})
|
|
993
|
+
)
|
|
994
|
+
);
|
|
995
|
+
const { scores, metrics } = normalizeResult(result);
|
|
996
|
+
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
997
|
+
evaluatorScores.push({
|
|
998
|
+
evaluatorId,
|
|
999
|
+
scores,
|
|
1000
|
+
passed: passed2,
|
|
1001
|
+
metrics,
|
|
1002
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1003
|
+
});
|
|
1004
|
+
} catch (error) {
|
|
1005
|
+
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1006
|
+
evaluatorScores.push({
|
|
1007
|
+
evaluatorId,
|
|
1008
|
+
scores: [],
|
|
1009
|
+
passed: false
|
|
1010
|
+
});
|
|
1011
|
+
}
|
|
1012
|
+
}
|
|
1013
|
+
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1014
|
+
rerunPassed.push(rerunPassedThis);
|
|
1015
|
+
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
1016
|
+
n + 1,
|
|
1017
|
+
n + 1
|
|
1018
|
+
]);
|
|
1019
|
+
const progressEvent = {
|
|
1020
|
+
type: "TestCaseProgress",
|
|
1021
|
+
runId: task.runId,
|
|
1022
|
+
testCaseId: testCaseItem.id,
|
|
1023
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1024
|
+
completedTestCases: completedEvaluations,
|
|
1025
|
+
totalTestCases: totalEvaluations,
|
|
1026
|
+
rerunIndex: r + 1,
|
|
1027
|
+
rerunTotal: reruns,
|
|
1028
|
+
passed: rerunPassedThis,
|
|
1029
|
+
durationMs: Date.now() - started,
|
|
1030
|
+
evaluatorScores,
|
|
1031
|
+
output,
|
|
1032
|
+
errorMessage: testCaseError
|
|
1033
|
+
};
|
|
1034
|
+
updateSnapshot(task.runId, (snapshot) => ({
|
|
1035
|
+
...snapshot,
|
|
1036
|
+
completedTestCases: completedEvaluations
|
|
1037
|
+
}));
|
|
1038
|
+
yield* publishEvent(progressEvent);
|
|
1039
|
+
yield* Queue.offer(persistenceQueue, {
|
|
1040
|
+
runId: task.runId,
|
|
1041
|
+
artifactPath: task.snapshot.artifactPath,
|
|
1042
|
+
payload: progressEvent
|
|
1043
|
+
});
|
|
1044
|
+
}
|
|
1045
|
+
const testCasePassed = rerunPassed.every(Boolean);
|
|
1046
|
+
if (testCasePassed) {
|
|
1047
|
+
yield* Ref.update(passedRef, (n) => n + 1);
|
|
1048
|
+
} else {
|
|
1049
|
+
yield* Ref.update(failedRef, (n) => n + 1);
|
|
1050
|
+
}
|
|
1051
|
+
const [passed, failed] = yield* Effect.all([
|
|
1052
|
+
Ref.get(passedRef),
|
|
1053
|
+
Ref.get(failedRef)
|
|
1054
|
+
]);
|
|
1055
|
+
updateSnapshot(task.runId, (snapshot) => ({
|
|
1056
|
+
...snapshot,
|
|
1057
|
+
passedTestCases: passed,
|
|
1058
|
+
failedTestCases: failed
|
|
1059
|
+
}));
|
|
1060
|
+
});
|
|
1061
|
+
}
|
|
908
1062
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
|
|
909
1063
|
const startedAt = Date.now();
|
|
910
1064
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
@@ -917,104 +1071,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
917
1071
|
runId: task.runId,
|
|
918
1072
|
startedAt
|
|
919
1073
|
});
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
)
|
|
950
|
-
);
|
|
951
|
-
const { scores, metrics } = normalizeResult(result);
|
|
952
|
-
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
953
|
-
evaluatorScores.push({
|
|
954
|
-
evaluatorId,
|
|
955
|
-
scores,
|
|
956
|
-
passed,
|
|
957
|
-
metrics,
|
|
958
|
-
logs: logs.length > 0 ? logs : void 0
|
|
959
|
-
});
|
|
960
|
-
} catch (error) {
|
|
961
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
962
|
-
evaluatorScores.push({
|
|
963
|
-
evaluatorId,
|
|
964
|
-
scores: [],
|
|
965
|
-
passed: false
|
|
966
|
-
});
|
|
967
|
-
}
|
|
968
|
-
}
|
|
969
|
-
const testCasePassed = evaluatorScores.every((s) => s.passed);
|
|
970
|
-
completedTestCases += 1;
|
|
971
|
-
if (testCasePassed) {
|
|
972
|
-
passedTestCases += 1;
|
|
973
|
-
} else {
|
|
974
|
-
failedTestCases += 1;
|
|
975
|
-
}
|
|
976
|
-
const progressEvent = {
|
|
977
|
-
type: "TestCaseProgress",
|
|
978
|
-
runId: task.runId,
|
|
979
|
-
testCaseId: testCaseItem.id,
|
|
980
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
981
|
-
completedTestCases,
|
|
982
|
-
totalTestCases: task.testCases.length,
|
|
983
|
-
passed: testCasePassed,
|
|
984
|
-
durationMs: Date.now() - started,
|
|
985
|
-
evaluatorScores,
|
|
986
|
-
output,
|
|
987
|
-
errorMessage: testCaseError
|
|
988
|
-
};
|
|
989
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
990
|
-
...snapshot,
|
|
991
|
-
completedTestCases,
|
|
992
|
-
passedTestCases,
|
|
993
|
-
failedTestCases
|
|
994
|
-
}));
|
|
995
|
-
yield* publishEvent(progressEvent);
|
|
996
|
-
yield* Queue.offer(persistenceQueue, {
|
|
997
|
-
runId: task.runId,
|
|
998
|
-
artifactPath: task.snapshot.artifactPath,
|
|
999
|
-
payload: progressEvent
|
|
1000
|
-
});
|
|
1001
|
-
}
|
|
1074
|
+
const totalEvaluations = task.testCases.reduce(
|
|
1075
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1076
|
+
0
|
|
1077
|
+
);
|
|
1078
|
+
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1079
|
+
const completedRef = yield* Ref.make(0);
|
|
1080
|
+
const passedRef = yield* Ref.make(0);
|
|
1081
|
+
const failedRef = yield* Ref.make(0);
|
|
1082
|
+
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
1083
|
+
task,
|
|
1084
|
+
testCaseItem,
|
|
1085
|
+
totalEvaluations,
|
|
1086
|
+
publishEvent,
|
|
1087
|
+
persistenceQueue,
|
|
1088
|
+
updateSnapshot,
|
|
1089
|
+
completedRef,
|
|
1090
|
+
passedRef,
|
|
1091
|
+
failedRef
|
|
1092
|
+
);
|
|
1093
|
+
yield* Effect.forEach(
|
|
1094
|
+
task.testCases,
|
|
1095
|
+
processTestCase,
|
|
1096
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1097
|
+
);
|
|
1098
|
+
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
1099
|
+
Ref.get(completedRef),
|
|
1100
|
+
Ref.get(passedRef),
|
|
1101
|
+
Ref.get(failedRef)
|
|
1102
|
+
]);
|
|
1002
1103
|
const finishedAt = Date.now();
|
|
1003
1104
|
const completedEvent = {
|
|
1004
1105
|
type: "RunCompleted",
|
|
1005
1106
|
runId: task.runId,
|
|
1006
1107
|
finishedAt,
|
|
1007
|
-
passedTestCases,
|
|
1008
|
-
failedTestCases,
|
|
1108
|
+
passedTestCases: passedUniqueTestCases,
|
|
1109
|
+
failedTestCases: failedUniqueTestCases,
|
|
1009
1110
|
totalTestCases: task.testCases.length,
|
|
1010
1111
|
artifactPath: task.snapshot.artifactPath
|
|
1011
1112
|
};
|
|
1012
1113
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
1013
1114
|
...snapshot,
|
|
1014
1115
|
status: "completed",
|
|
1015
|
-
completedTestCases,
|
|
1016
|
-
passedTestCases,
|
|
1017
|
-
failedTestCases,
|
|
1116
|
+
completedTestCases: completedEvaluations,
|
|
1117
|
+
passedTestCases: passedUniqueTestCases,
|
|
1118
|
+
failedTestCases: failedUniqueTestCases,
|
|
1018
1119
|
finishedAt
|
|
1019
1120
|
}));
|
|
1020
1121
|
yield* publishEvent(completedEvent);
|
|
@@ -1102,7 +1203,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
1102
1203
|
const artifactPath = filePath;
|
|
1103
1204
|
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1104
1205
|
const progress = aggregateTestCaseProgress(lines);
|
|
1105
|
-
const completedTestCases = runCompleted
|
|
1206
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1106
1207
|
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1107
1208
|
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1108
1209
|
return {
|
|
@@ -1124,23 +1225,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
1124
1225
|
}
|
|
1125
1226
|
function aggregateTestCaseProgress(lines) {
|
|
1126
1227
|
let completedTestCases = 0;
|
|
1127
|
-
|
|
1128
|
-
let failedTestCases = 0;
|
|
1228
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
1129
1229
|
for (const line of lines) {
|
|
1130
1230
|
try {
|
|
1131
1231
|
const event = JSON.parse(line);
|
|
1132
1232
|
if (event.type === "TestCaseProgress") {
|
|
1133
1233
|
const ev = event;
|
|
1134
1234
|
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
failedTestCases += 1;
|
|
1139
|
-
}
|
|
1235
|
+
const id = ev.testCaseId;
|
|
1236
|
+
const current = testCasePassedBy.get(id);
|
|
1237
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1140
1238
|
}
|
|
1141
1239
|
} catch {
|
|
1142
1240
|
}
|
|
1143
1241
|
}
|
|
1242
|
+
let passedTestCases = 0;
|
|
1243
|
+
let failedTestCases = 0;
|
|
1244
|
+
for (const passed of testCasePassedBy.values()) {
|
|
1245
|
+
if (passed) {
|
|
1246
|
+
passedTestCases += 1;
|
|
1247
|
+
} else {
|
|
1248
|
+
failedTestCases += 1;
|
|
1249
|
+
}
|
|
1250
|
+
}
|
|
1144
1251
|
return { completedTestCases, passedTestCases, failedTestCases };
|
|
1145
1252
|
}
|
|
1146
1253
|
async function appendJsonLine(artifactPath, payload) {
|
|
@@ -1335,6 +1442,10 @@ var EffectRunner = class {
|
|
|
1335
1442
|
throw new Error("No evaluators selected for run");
|
|
1336
1443
|
}
|
|
1337
1444
|
const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
|
|
1445
|
+
const totalEvaluations = selectedTestCases.reduce(
|
|
1446
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1447
|
+
0
|
|
1448
|
+
);
|
|
1338
1449
|
const runId = `run-${randomUUID()}`;
|
|
1339
1450
|
const artifactPath = createArtifactPath(
|
|
1340
1451
|
this.config.artifactDirectory,
|
|
@@ -1347,7 +1458,7 @@ var EffectRunner = class {
|
|
|
1347
1458
|
datasetName: dataset.dataset.getName(),
|
|
1348
1459
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1349
1460
|
queuedAt: Date.now(),
|
|
1350
|
-
totalTestCases:
|
|
1461
|
+
totalTestCases: totalEvaluations,
|
|
1351
1462
|
completedTestCases: 0,
|
|
1352
1463
|
passedTestCases: 0,
|
|
1353
1464
|
failedTestCases: 0,
|
|
@@ -1361,7 +1472,7 @@ var EffectRunner = class {
|
|
|
1361
1472
|
datasetId: request.datasetId,
|
|
1362
1473
|
datasetName: dataset.dataset.getName(),
|
|
1363
1474
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1364
|
-
totalTestCases:
|
|
1475
|
+
totalTestCases: totalEvaluations,
|
|
1365
1476
|
artifactPath
|
|
1366
1477
|
};
|
|
1367
1478
|
await Effect.runPromise(this.publishEvent(queuedEvent));
|
|
@@ -1372,6 +1483,7 @@ var EffectRunner = class {
|
|
|
1372
1483
|
payload: queuedEvent
|
|
1373
1484
|
})
|
|
1374
1485
|
);
|
|
1486
|
+
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1375
1487
|
await Effect.runPromise(
|
|
1376
1488
|
Queue.offer(this.runQueue, {
|
|
1377
1489
|
runId,
|
|
@@ -1379,7 +1491,8 @@ var EffectRunner = class {
|
|
|
1379
1491
|
dataset: dataset.dataset,
|
|
1380
1492
|
evaluators: selectedEvaluators,
|
|
1381
1493
|
testCases: selectedTestCases,
|
|
1382
|
-
snapshot
|
|
1494
|
+
snapshot,
|
|
1495
|
+
maxConcurrency
|
|
1383
1496
|
})
|
|
1384
1497
|
);
|
|
1385
1498
|
return snapshot;
|