@m4trix/evals 0.12.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +706 -231
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +707 -232
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +710 -390
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +702 -382
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +289 -108
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +28 -5
- package/dist/index.js +290 -109
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli-simple.js
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { randomUUID } from 'crypto';
|
|
3
|
-
import { Effect, PubSub, Queue, Fiber } from 'effect';
|
|
3
|
+
import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
|
|
4
4
|
import { existsSync } from 'fs';
|
|
5
5
|
import { resolve, relative, join, parse, dirname } from 'path';
|
|
6
6
|
import * as jitiModule from 'jiti';
|
|
7
7
|
import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
|
-
import {
|
|
9
|
+
import { diffLines } from 'diff';
|
|
10
10
|
import React2, { useState, useEffect, useCallback } from 'react';
|
|
11
11
|
import { render, Box, Text } from 'ink';
|
|
12
12
|
import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
|
|
@@ -30,7 +30,8 @@ var defaultRunnerConfig = {
|
|
|
30
30
|
],
|
|
31
31
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
32
32
|
},
|
|
33
|
-
artifactDirectory: ".eval-results"
|
|
33
|
+
artifactDirectory: ".eval-results",
|
|
34
|
+
maxConcurrency: 1
|
|
34
35
|
};
|
|
35
36
|
function toRunnerConfigOverrides(config) {
|
|
36
37
|
if (!config) {
|
|
@@ -63,6 +64,9 @@ function toRunnerConfigOverrides(config) {
|
|
|
63
64
|
if (config.artifactDirectory !== void 0) {
|
|
64
65
|
overrides.artifactDirectory = config.artifactDirectory;
|
|
65
66
|
}
|
|
67
|
+
if (config.maxConcurrency !== void 0) {
|
|
68
|
+
overrides.maxConcurrency = config.maxConcurrency;
|
|
69
|
+
}
|
|
66
70
|
if (Object.keys(discovery).length > 0) {
|
|
67
71
|
overrides.discovery = discovery;
|
|
68
72
|
}
|
|
@@ -256,8 +260,35 @@ async function collectTestCasesFromFiles(config) {
|
|
|
256
260
|
);
|
|
257
261
|
return found.flat();
|
|
258
262
|
}
|
|
263
|
+
/**
 * Serializes a value to pretty-printed JSON (2-space indent) so it can be
 * compared line by line.
 *
 * Always returns a string: falls back to `String(value)` when
 * `JSON.stringify` throws (e.g. circular references, BigInt) or when it
 * yields `undefined` (e.g. for `undefined`, functions and symbols).
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string} JSON text, or the `String(value)` fallback.
 */
function toJsonLines(value) {
  try {
    const json = JSON.stringify(value, null, 2);
    // JSON.stringify returns undefined (not a string) for non-serializable
    // top-level values; never hand that to a text differ.
    return json ?? String(value);
  } catch {
    return String(value);
  }
}
|
|
270
|
+
/**
 * Renders diff parts as text, one output line per line of each part.
 *
 * Each line is prefixed with "+" (added), "-" (removed) or " " (unchanged),
 * followed by a single space.
 *
 * @param {Array<{value: string, added?: boolean, removed?: boolean}>} changes
 * @returns {string} The prefixed lines joined with "\n".
 */
function formatDiffString(changes) {
  const markerFor = (chunk) => {
    if (chunk.added) {
      return "+";
    }
    if (chunk.removed) {
      return "-";
    }
    return " ";
  };
  return changes
    .flatMap((chunk) => {
      const marker = markerFor(chunk);
      const rows = chunk.value.split("\n");
      // A value ending in "\n" splits into a trailing empty string; drop it.
      const kept = rows[rows.length - 1] === "" ? rows.slice(0, -1) : rows;
      return kept.map((row) => `${marker} ${row}`);
    })
    .join("\n");
}
|
|
284
|
+
/**
 * Produces a line diff between two values as prefixed text.
 *
 * Both values are serialized with `toJsonLines`, compared with `diffLines`,
 * and the resulting parts are rendered by `formatDiffString`.
 *
 * @param {unknown} expected
 * @param {unknown} actual
 * @returns {string}
 */
function createDiffString(expected, actual) {
  const [expectedText, actualText] = [expected, actual].map((side) => toJsonLines(side));
  return formatDiffString(diffLines(expectedText, actualText));
}
|
|
259
290
|
function createDiffLogEntry(expected, actual, options) {
|
|
260
|
-
const diff =
|
|
291
|
+
const diff = createDiffString(expected, actual);
|
|
261
292
|
return {
|
|
262
293
|
type: "diff",
|
|
263
294
|
label: options?.label,
|
|
@@ -267,7 +298,7 @@ function createDiffLogEntry(expected, actual, options) {
|
|
|
267
298
|
};
|
|
268
299
|
}
|
|
269
300
|
function getDiffLines(entry) {
|
|
270
|
-
const raw =
|
|
301
|
+
const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
|
|
271
302
|
return raw.split("\n").map((line) => {
|
|
272
303
|
const trimmed = line.trimStart();
|
|
273
304
|
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
@@ -287,6 +318,7 @@ var Metric = {
|
|
|
287
318
|
const def = {
|
|
288
319
|
id: config.id,
|
|
289
320
|
name: config.name,
|
|
321
|
+
aggregate: config.aggregate,
|
|
290
322
|
format: config.format,
|
|
291
323
|
make: (data) => ({ id: config.id, data })
|
|
292
324
|
};
|
|
@@ -306,6 +338,7 @@ var Score = {
|
|
|
306
338
|
id: config.id,
|
|
307
339
|
name: config.name,
|
|
308
340
|
displayStrategy: config.displayStrategy,
|
|
341
|
+
aggregate: config.aggregate,
|
|
309
342
|
format: config.format,
|
|
310
343
|
make: (data, options) => {
|
|
311
344
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
@@ -324,23 +357,75 @@ function getScoreById(id) {
|
|
|
324
357
|
return registry2.get(id);
|
|
325
358
|
}
|
|
326
359
|
|
|
360
|
+
// src/evals/aggregators.ts
|
|
361
|
+
/**
 * Aggregates numeric score data into a mean and sample standard deviation.
 *
 * The standard deviation uses the n-1 denominator and is computed from squared
 * deviations around the mean. The single-pass `sumSq - n * mean^2` formula
 * cancels catastrophically when values are large relative to their spread.
 *
 * @param {Array<{value: number}>} values - One entry per run.
 * @returns {{value: number, stdDev?: number, count: number}}
 *   `{ value: 0, count: 0 }` for empty input; `stdDev` is `undefined` when
 *   fewer than two values are given.
 */
function aggregateAverageWithVariance(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0, count: 0 };
  }
  const mean = values.reduce((acc, v) => acc + v.value, 0) / count;
  let stdDev;
  if (count >= 2) {
    const squaredDeviations = values.reduce((acc, v) => {
      const delta = v.value - mean;
      return acc + delta * delta;
    }, 0);
    const variance = squaredDeviations / (count - 1);
    // Non-positive or NaN variance reports 0, as before.
    stdDev = variance > 0 ? Math.sqrt(variance) : 0;
  }
  return { value: mean, stdDev, count };
}
|
|
375
|
+
/**
 * Aggregates pass/fail data: the aggregate passes only when there is at
 * least one value and every value passed.
 *
 * @param {Array<{passed: boolean}>} values
 * @returns {{passed: boolean, passedCount: number, totalCount: number}}
 */
function aggregateAll(values) {
  let passedCount = 0;
  for (const entry of values) {
    if (entry.passed) {
      passedCount += 1;
    }
  }
  const totalCount = values.length;
  return {
    passed: totalCount > 0 && passedCount === totalCount,
    passedCount,
    totalCount
  };
}
|
|
384
|
+
/**
 * Sums token usage across runs. Fields that are null or undefined on an
 * entry count as 0.
 *
 * @param {Array<{input?: number, output?: number, inputCached?: number, outputCached?: number}>} values
 * @returns {{input: number, output: number, inputCached: number, outputCached: number}}
 */
function aggregateTokenCountSum(values) {
  let input = 0;
  let output = 0;
  let inputCached = 0;
  let outputCached = 0;
  for (const usage of values) {
    input += usage.input ?? 0;
    output += usage.output ?? 0;
    inputCached += usage.inputCached ?? 0;
    outputCached += usage.outputCached ?? 0;
  }
  return { input, output, inputCached, outputCached };
}
|
|
401
|
+
/**
 * Averages latency samples.
 *
 * @param {Array<{ms: number}>} values
 * @returns {{ms: number}} Arithmetic mean of `ms`, or `{ ms: 0 }` for empty input.
 */
function aggregateLatencyAverage(values) {
  const sampleCount = values.length;
  if (sampleCount === 0) {
    return { ms: 0 };
  }
  let totalMs = 0;
  for (const sample of values) {
    totalMs += sample.ms;
  }
  return { ms: totalMs / sampleCount };
}
|
|
408
|
+
|
|
327
409
|
// src/evals/metrics/standard.ts
|
|
328
410
|
// Registers the "token-count" metric: summed across runs, rendered as
// "in:<n> out:<n> cached:<n>" (prefixed with "Total: " when aggregated).
Metric.of({
  id: "token-count",
  name: "Tokens",
  aggregate: aggregateTokenCountSum,
  format: (data, options) => {
    // Missing counters are shown as 0; cached input and output are combined.
    const cachedTokens = (data.inputCached ?? 0) + (data.outputCached ?? 0);
    const summary = `in:${data.input ?? 0} out:${data.output ?? 0} cached:${cachedTokens}`;
    if (options?.isAggregated) {
      return `Total: ${summary}`;
    }
    return summary;
  }
});
|
|
340
424
|
// Registers the "latency" metric: averaged across runs, rendered as "<ms>ms"
// (prefixed with "Avg: " when aggregated).
Metric.of({
  id: "latency",
  name: "Latency",
  aggregate: aggregateLatencyAverage,
  format: (data, options) => {
    const reading = `${data.ms}ms`;
    return options?.isAggregated ? `Avg: ${reading}` : reading;
  }
});
|
|
345
430
|
|
|
346
431
|
// src/evals/scores/standard.ts
|
|
@@ -348,16 +433,50 @@ Score.of({
|
|
|
348
433
|
id: "percent",
|
|
349
434
|
name: "Score",
|
|
350
435
|
displayStrategy: "bar",
|
|
351
|
-
format: (data) =>
|
|
436
|
+
format: (data, options) => {
|
|
437
|
+
if (options?.isAggregated) {
|
|
438
|
+
return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
|
|
439
|
+
}
|
|
440
|
+
return data.value.toFixed(2);
|
|
441
|
+
},
|
|
442
|
+
aggregate: aggregateAverageWithVariance
|
|
352
443
|
});
|
|
353
444
|
// Registers the "binary" pass/fail score. A single result renders as
// "PASSED" / "NOT PASSED"; an aggregated result renders as "All: PASSED" /
// "Some: FAILED", with a "(passed/total)" suffix when more than one run was counted.
Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  format: (data, options) => {
    if (!options?.isAggregated) {
      return data.passed ? "PASSED" : "NOT PASSED";
    }
    const verdict = data.passed ? "All: PASSED" : "Some: FAILED";
    const hasCounts = data.passedCount != null && data.totalCount != null;
    if (hasCounts && data.totalCount > 1) {
      return `${verdict} (${data.passedCount}/${data.totalCount})`;
    }
    return verdict;
  },
  aggregate: aggregateAll
});
|
|
359
460
|
|
|
360
461
|
// src/runner/score-utils.ts
|
|
462
|
+
/**
 * Collapses the score items produced by several runs of the same score into
 * a single item.
 *
 * The score definition is looked up by the first item's id. When no
 * definition or no `aggregate` function exists, the last item is returned
 * unchanged. Otherwise the result is the first item with `data` replaced by
 * the aggregated data.
 *
 * The aggregated `passed` flag combines every run that reported a verdict,
 * so it is true only if all of them passed. Copying the first run's flag
 * would misreport a later failing run.
 *
 * @param {Array<{id: string, data: unknown, passed?: boolean}>} items
 * @returns {{id: string, data: unknown, passed?: boolean} | undefined}
 *   `undefined` for empty input.
 */
function aggregateScoreItems(items) {
  if (items.length === 0) {
    return void 0;
  }
  const [first] = items;
  const def = getScoreById(first.id);
  if (!def?.aggregate) {
    return items[items.length - 1];
  }
  const aggregated = {
    ...first,
    data: def.aggregate(items.map((item) => item.data))
  };
  const verdicts = items.map((item) => item.passed).filter((verdict) => verdict !== void 0);
  if (verdicts.length > 0) {
    aggregated.passed = verdicts.every(Boolean);
  }
  return aggregated;
}
|
|
471
|
+
/**
 * Collapses the metric items produced by several runs of the same metric
 * into a single item.
 *
 * The metric definition is looked up by the first item's id. Without a
 * definition or an `aggregate` function the last item is returned as-is;
 * otherwise the first item is copied with `data` replaced by the aggregate.
 *
 * @param {Array<{id: string, data: unknown}>} items
 * @returns {{id: string, data: unknown} | undefined} `undefined` for empty input.
 */
function aggregateMetricItems(items) {
  const [head] = items;
  if (items.length === 0) {
    return void 0;
  }
  const definition = getMetricById(head.id);
  if (!definition?.aggregate) {
    return items[items.length - 1];
  }
  const payloads = items.map((entry) => entry.data);
  return { ...head, data: definition.aggregate(payloads) };
}
|
|
361
480
|
function toNumericScoreFromScores(scores) {
|
|
362
481
|
for (const item of scores) {
|
|
363
482
|
const def = getScoreById(item.id);
|
|
@@ -436,6 +555,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
436
555
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
437
556
|
);
|
|
438
557
|
}
|
|
558
|
+
/**
 * Builds an Effect that runs every evaluator of `task` against one test
 * case, once per rerun, and reports progress after each rerun.
 *
 * For each rerun it:
 *  - runs all evaluators that expose an evaluate function,
 *  - increments `completedRef` and publishes a "TestCaseProgress" event,
 *  - writes the new completed count to the run snapshot,
 *  - offers the event to `persistenceQueue`.
 * After the last rerun, the test case counts as passed only if every rerun
 * passed; `passedRef` / `failedRef` and the snapshot are updated accordingly.
 *
 * An evaluator that throws or rejects is recorded as a failed evaluation
 * (empty scores, `passed: false`) and its message is put on the progress
 * event. The failure is caught inside the promise, because `try/catch`
 * around `yield* Effect.promise(...)` cannot observe a rejection: the
 * rejection becomes a defect that short-circuits the generator.
 *
 * @param task - Run task; reads `runId`, `evaluators` and `snapshot.artifactPath`.
 * @param testCaseItem - `{ id, testCase }` entry to evaluate.
 * @param totalEvaluations - Total evaluation count reported in progress events.
 * @param publishEvent - Function returning an Effect that publishes an event.
 * @param persistenceQueue - Queue receiving `{ runId, artifactPath, payload }`.
 * @param updateSnapshot - `(runId, updater)` snapshot mutator.
 * @param completedRef - Ref counting completed evaluations (reruns).
 * @param passedRef - Ref counting test cases whose reruns all passed.
 * @param failedRef - Ref counting test cases with at least one failed rerun.
 */
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
  return Effect.gen(function* () {
    const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
    const rerunPassed = [];
    for (let r = 0; r < reruns; r++) {
      const started = Date.now();
      const evaluatorScores = [];
      let testCaseError;
      const output = readOutput(testCaseItem.testCase);
      for (const { id: evaluatorId, evaluator } of task.evaluators) {
        const evaluateFn = evaluator.getEvaluateFn();
        if (!evaluateFn) {
          continue;
        }
        const logs = [];
        const logDiff = (expected, actual, options) => {
          logs.push(createDiffLogEntry(expected, actual, options));
        };
        // The promise below never rejects: every failure is turned into a value.
        const outcome = yield* Effect.promise(async () => {
          try {
            const ctx = await evaluator.resolveContext();
            const result = await evaluateFn({
              input: testCaseItem.testCase.getInput(),
              ctx,
              output,
              logDiff
            });
            const { scores, metrics } = normalizeResult(result);
            const passed2 = computeEvaluatorPassed(evaluator, result, scores);
            return { ok: true, scores, metrics, passed: passed2 };
          } catch (error) {
            return { ok: false, error };
          }
        });
        if (outcome.ok) {
          evaluatorScores.push({
            evaluatorId,
            scores: outcome.scores,
            passed: outcome.passed,
            metrics: outcome.metrics,
            logs: logs.length > 0 ? logs : void 0
          });
        } else {
          testCaseError = outcome.error instanceof Error ? outcome.error.message : "Evaluator execution failed";
          evaluatorScores.push({
            evaluatorId,
            scores: [],
            passed: false
          });
        }
      }
      const rerunPassedThis = evaluatorScores.every((s) => s.passed);
      rerunPassed.push(rerunPassedThis);
      // Increment and read the shared counter in one atomic step.
      const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
        n + 1,
        n + 1
      ]);
      const progressEvent = {
        type: "TestCaseProgress",
        runId: task.runId,
        testCaseId: testCaseItem.id,
        testCaseName: testCaseItem.testCase.getName(),
        completedTestCases: completedEvaluations,
        totalTestCases: totalEvaluations,
        rerunIndex: r + 1,
        rerunTotal: reruns,
        passed: rerunPassedThis,
        durationMs: Date.now() - started,
        evaluatorScores,
        output,
        errorMessage: testCaseError
      };
      updateSnapshot(task.runId, (snapshot) => ({
        ...snapshot,
        completedTestCases: completedEvaluations
      }));
      yield* publishEvent(progressEvent);
      yield* Queue.offer(persistenceQueue, {
        runId: task.runId,
        artifactPath: task.snapshot.artifactPath,
        payload: progressEvent
      });
    }
    const testCasePassed = rerunPassed.every(Boolean);
    if (testCasePassed) {
      yield* Ref.update(passedRef, (n) => n + 1);
    } else {
      yield* Ref.update(failedRef, (n) => n + 1);
    }
    const [passed, failed] = yield* Effect.all([
      Ref.get(passedRef),
      Ref.get(failedRef)
    ]);
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      passedTestCases: passed,
      failedTestCases: failed
    }));
  });
}
|
|
439
657
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
|
|
440
658
|
const startedAt = Date.now();
|
|
441
659
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
@@ -448,104 +666,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
448
666
|
runId: task.runId,
|
|
449
667
|
startedAt
|
|
450
668
|
});
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
)
|
|
481
|
-
);
|
|
482
|
-
const { scores, metrics } = normalizeResult(result);
|
|
483
|
-
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
484
|
-
evaluatorScores.push({
|
|
485
|
-
evaluatorId,
|
|
486
|
-
scores,
|
|
487
|
-
passed,
|
|
488
|
-
metrics,
|
|
489
|
-
logs: logs.length > 0 ? logs : void 0
|
|
490
|
-
});
|
|
491
|
-
} catch (error) {
|
|
492
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
493
|
-
evaluatorScores.push({
|
|
494
|
-
evaluatorId,
|
|
495
|
-
scores: [],
|
|
496
|
-
passed: false
|
|
497
|
-
});
|
|
498
|
-
}
|
|
499
|
-
}
|
|
500
|
-
const testCasePassed = evaluatorScores.every((s) => s.passed);
|
|
501
|
-
completedTestCases += 1;
|
|
502
|
-
if (testCasePassed) {
|
|
503
|
-
passedTestCases += 1;
|
|
504
|
-
} else {
|
|
505
|
-
failedTestCases += 1;
|
|
506
|
-
}
|
|
507
|
-
const progressEvent = {
|
|
508
|
-
type: "TestCaseProgress",
|
|
509
|
-
runId: task.runId,
|
|
510
|
-
testCaseId: testCaseItem.id,
|
|
511
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
512
|
-
completedTestCases,
|
|
513
|
-
totalTestCases: task.testCases.length,
|
|
514
|
-
passed: testCasePassed,
|
|
515
|
-
durationMs: Date.now() - started,
|
|
516
|
-
evaluatorScores,
|
|
517
|
-
output,
|
|
518
|
-
errorMessage: testCaseError
|
|
519
|
-
};
|
|
520
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
521
|
-
...snapshot,
|
|
522
|
-
completedTestCases,
|
|
523
|
-
passedTestCases,
|
|
524
|
-
failedTestCases
|
|
525
|
-
}));
|
|
526
|
-
yield* publishEvent(progressEvent);
|
|
527
|
-
yield* Queue.offer(persistenceQueue, {
|
|
528
|
-
runId: task.runId,
|
|
529
|
-
artifactPath: task.snapshot.artifactPath,
|
|
530
|
-
payload: progressEvent
|
|
531
|
-
});
|
|
532
|
-
}
|
|
669
|
+
const totalEvaluations = task.testCases.reduce(
|
|
670
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
671
|
+
0
|
|
672
|
+
);
|
|
673
|
+
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
674
|
+
const completedRef = yield* Ref.make(0);
|
|
675
|
+
const passedRef = yield* Ref.make(0);
|
|
676
|
+
const failedRef = yield* Ref.make(0);
|
|
677
|
+
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
678
|
+
task,
|
|
679
|
+
testCaseItem,
|
|
680
|
+
totalEvaluations,
|
|
681
|
+
publishEvent,
|
|
682
|
+
persistenceQueue,
|
|
683
|
+
updateSnapshot,
|
|
684
|
+
completedRef,
|
|
685
|
+
passedRef,
|
|
686
|
+
failedRef
|
|
687
|
+
);
|
|
688
|
+
yield* Effect.forEach(
|
|
689
|
+
task.testCases,
|
|
690
|
+
processTestCase,
|
|
691
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
692
|
+
);
|
|
693
|
+
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
694
|
+
Ref.get(completedRef),
|
|
695
|
+
Ref.get(passedRef),
|
|
696
|
+
Ref.get(failedRef)
|
|
697
|
+
]);
|
|
533
698
|
const finishedAt = Date.now();
|
|
534
699
|
const completedEvent = {
|
|
535
700
|
type: "RunCompleted",
|
|
536
701
|
runId: task.runId,
|
|
537
702
|
finishedAt,
|
|
538
|
-
passedTestCases,
|
|
539
|
-
failedTestCases,
|
|
703
|
+
passedTestCases: passedUniqueTestCases,
|
|
704
|
+
failedTestCases: failedUniqueTestCases,
|
|
540
705
|
totalTestCases: task.testCases.length,
|
|
541
706
|
artifactPath: task.snapshot.artifactPath
|
|
542
707
|
};
|
|
543
708
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
544
709
|
...snapshot,
|
|
545
710
|
status: "completed",
|
|
546
|
-
completedTestCases,
|
|
547
|
-
passedTestCases,
|
|
548
|
-
failedTestCases,
|
|
711
|
+
completedTestCases: completedEvaluations,
|
|
712
|
+
passedTestCases: passedUniqueTestCases,
|
|
713
|
+
failedTestCases: failedUniqueTestCases,
|
|
549
714
|
finishedAt
|
|
550
715
|
}));
|
|
551
716
|
yield* publishEvent(completedEvent);
|
|
@@ -633,7 +798,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
633
798
|
const artifactPath = filePath;
|
|
634
799
|
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
635
800
|
const progress = aggregateTestCaseProgress(lines);
|
|
636
|
-
const completedTestCases = runCompleted
|
|
801
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
637
802
|
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
638
803
|
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
639
804
|
return {
|
|
@@ -655,23 +820,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
655
820
|
}
|
|
656
821
|
/**
 * Derives progress counters from the JSON lines of a run artifact.
 *
 * Only "TestCaseProgress" events are considered. `completedTestCases` is the
 * value carried by the last such event that has one. Pass/fail is tracked
 * per `testCaseId`: a test case passes only if every progress event seen for
 * it passed. Lines that cannot be parsed or inspected are ignored.
 *
 * @param {string[]} lines - Raw JSONL lines.
 * @returns {{completedTestCases: number, passedTestCases: number, failedTestCases: number}}
 */
function aggregateTestCaseProgress(lines) {
  const verdictById = /* @__PURE__ */ new Map();
  let completedTestCases = 0;
  for (const rawLine of lines) {
    let progress;
    try {
      const parsed = JSON.parse(rawLine);
      if (parsed.type !== "TestCaseProgress") {
        continue;
      }
      progress = parsed;
    } catch {
      // Best effort: malformed lines (and a literal `null`) are skipped.
      continue;
    }
    completedTestCases = progress.completedTestCases ?? completedTestCases;
    const previous = verdictById.get(progress.testCaseId);
    const combined = previous === void 0 ? progress.passed : previous && progress.passed;
    verdictById.set(progress.testCaseId, combined);
  }
  const verdicts = [...verdictById.values()];
  const passedTestCases = verdicts.filter(Boolean).length;
  const failedTestCases = verdicts.length - passedTestCases;
  return { completedTestCases, passedTestCases, failedTestCases };
}
|
|
677
848
|
async function appendJsonLine(artifactPath, payload) {
|
|
@@ -866,6 +1037,10 @@ var EffectRunner = class {
|
|
|
866
1037
|
throw new Error("No evaluators selected for run");
|
|
867
1038
|
}
|
|
868
1039
|
const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
|
|
1040
|
+
const totalEvaluations = selectedTestCases.reduce(
|
|
1041
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1042
|
+
0
|
|
1043
|
+
);
|
|
869
1044
|
const runId = `run-${randomUUID()}`;
|
|
870
1045
|
const artifactPath = createArtifactPath(
|
|
871
1046
|
this.config.artifactDirectory,
|
|
@@ -878,7 +1053,7 @@ var EffectRunner = class {
|
|
|
878
1053
|
datasetName: dataset.dataset.getName(),
|
|
879
1054
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
880
1055
|
queuedAt: Date.now(),
|
|
881
|
-
totalTestCases:
|
|
1056
|
+
totalTestCases: totalEvaluations,
|
|
882
1057
|
completedTestCases: 0,
|
|
883
1058
|
passedTestCases: 0,
|
|
884
1059
|
failedTestCases: 0,
|
|
@@ -892,7 +1067,7 @@ var EffectRunner = class {
|
|
|
892
1067
|
datasetId: request.datasetId,
|
|
893
1068
|
datasetName: dataset.dataset.getName(),
|
|
894
1069
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
895
|
-
totalTestCases:
|
|
1070
|
+
totalTestCases: totalEvaluations,
|
|
896
1071
|
artifactPath
|
|
897
1072
|
};
|
|
898
1073
|
await Effect.runPromise(this.publishEvent(queuedEvent));
|
|
@@ -903,6 +1078,7 @@ var EffectRunner = class {
|
|
|
903
1078
|
payload: queuedEvent
|
|
904
1079
|
})
|
|
905
1080
|
);
|
|
1081
|
+
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
906
1082
|
await Effect.runPromise(
|
|
907
1083
|
Queue.offer(this.runQueue, {
|
|
908
1084
|
runId,
|
|
@@ -910,7 +1086,8 @@ var EffectRunner = class {
|
|
|
910
1086
|
dataset: dataset.dataset,
|
|
911
1087
|
evaluators: selectedEvaluators,
|
|
912
1088
|
testCases: selectedTestCases,
|
|
913
|
-
snapshot
|
|
1089
|
+
snapshot,
|
|
1090
|
+
maxConcurrency
|
|
914
1091
|
})
|
|
915
1092
|
);
|
|
916
1093
|
return snapshot;
|
|
@@ -1216,6 +1393,13 @@ function Spinner({ label = "Running" }) {
|
|
|
1216
1393
|
label
|
|
1217
1394
|
] });
|
|
1218
1395
|
}
|
|
1396
|
+
/**
 * Sample standard deviation (n-1 denominator) from running totals.
 *
 * @param {number} sum - Sum of the samples.
 * @param {number} sumSq - Sum of the squared samples.
 * @param {number} n - Number of samples.
 * @returns {number | undefined} `undefined` when fewer than two samples;
 *   0 when the computed variance is not positive.
 */
function sampleStdDev(sum, sumSq, n) {
  if (n < 2) {
    return void 0;
  }
  const mean = sum / n;
  const variance = (sumSq - n * mean * mean) / (n - 1);
  if (!(variance > 0)) {
    // Covers rounding that dips below zero as well as NaN.
    return 0;
  }
  return Math.sqrt(variance);
}
|
|
1219
1403
|
function scoreColor(score) {
|
|
1220
1404
|
if (score >= 80)
|
|
1221
1405
|
return "green";
|
|
@@ -1228,13 +1412,62 @@ function createBar(value, max = 100, width = 20) {
|
|
|
1228
1412
|
const filled = Math.round(safe / max * width);
|
|
1229
1413
|
return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
|
|
1230
1414
|
}
|
|
1231
|
-
function
|
|
1415
|
+
/**
 * Merges the per-rerun evaluator results of one test case into a single
 * entry per evaluator.
 *
 * For every evaluator id seen in `events` (in first-seen order):
 *  - scores and metrics are grouped by id across events and collapsed with
 *    `aggregateScoreItems` / `aggregateMetricItems`,
 *  - `passed` is true only if the evaluator is present and passed in every event,
 *  - `logs` are taken from the last event.
 *
 * @param {Array<{evaluatorScores: Array<object>}>} events - One entry per rerun.
 * @param {Map<string, string>} nameById - Evaluator display names; the id is used when absent.
 * @returns {Array<object>} One aggregated entry per evaluator; `[]` for no events.
 */
function aggregateEvaluatorScores(events, nameById) {
  if (events.length === 0) {
    return [];
  }
  const addToGroup = (groups, item) => {
    const bucket = groups.get(item.id);
    if (bucket) {
      bucket.push(item);
    } else {
      groups.set(item.id, [item]);
    }
  };
  const evaluatorIds = /* @__PURE__ */ new Set();
  for (const ev of events) {
    for (const entry of ev.evaluatorScores) {
      evaluatorIds.add(entry.evaluatorId);
    }
  }
  return [...evaluatorIds].map((evaluatorId) => {
    // This evaluator's entry in each event (undefined where it is absent).
    const perEvent = events.map(
      (ev) => ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId)
    );
    const scoresById = /* @__PURE__ */ new Map();
    const metricsById = /* @__PURE__ */ new Map();
    for (const entry of perEvent) {
      for (const score of entry?.scores ?? []) {
        addToGroup(scoresById, score);
      }
      for (const metric of entry?.metrics ?? []) {
        addToGroup(metricsById, metric);
      }
    }
    const scores = [];
    for (const group of scoresById.values()) {
      const collapsed = aggregateScoreItems(group);
      if (collapsed) {
        scores.push(collapsed);
      }
    }
    const metrics = [];
    for (const group of metricsById.values()) {
      const collapsed = aggregateMetricItems(group);
      if (collapsed !== void 0) {
        metrics.push(collapsed);
      }
    }
    return {
      evaluatorId,
      evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
      scores,
      passed: perEvent.every((entry) => entry?.passed ?? false),
      metrics: metrics.length > 0 ? metrics : void 0,
      logs: perEvent[perEvent.length - 1]?.logs
    };
  });
}
|
|
1464
|
+
function formatScorePart(item, scoreToColor2, options) {
|
|
1232
1465
|
const def = getScoreById(item.id);
|
|
1233
1466
|
if (!def) {
|
|
1234
1467
|
const numeric = toNumericScore(item.data);
|
|
1235
1468
|
return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
|
|
1236
1469
|
}
|
|
1237
|
-
const formatted = def.format(item.data);
|
|
1470
|
+
const formatted = def.format(item.data, options);
|
|
1238
1471
|
if (def.displayStrategy === "bar") {
|
|
1239
1472
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
1240
1473
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
@@ -1254,6 +1487,7 @@ function RunView({
|
|
|
1254
1487
|
);
|
|
1255
1488
|
const [runInfo, setRunInfo] = useState(null);
|
|
1256
1489
|
const [testCases, setTestCases] = useState([]);
|
|
1490
|
+
const [completedEvaluations, setCompletedEvaluations] = useState(0);
|
|
1257
1491
|
const [summary, setSummary] = useState(null);
|
|
1258
1492
|
const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
|
|
1259
1493
|
const runEval = useCallback(async () => {
|
|
@@ -1280,48 +1514,44 @@ function RunView({
|
|
|
1280
1514
|
return;
|
|
1281
1515
|
}
|
|
1282
1516
|
const nameById = new Map(
|
|
1283
|
-
evaluators.map((item) => [
|
|
1284
|
-
item.id,
|
|
1285
|
-
item.evaluator.getName() ?? item.id
|
|
1286
|
-
])
|
|
1517
|
+
evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
|
|
1287
1518
|
);
|
|
1288
1519
|
setEvaluatorNameById(nameById);
|
|
1289
1520
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1290
1521
|
let overallScoreTotal = 0;
|
|
1522
|
+
let overallScoreSumSq = 0;
|
|
1291
1523
|
let overallScoreCount = 0;
|
|
1292
1524
|
const done = new Promise((resolve5) => {
|
|
1293
1525
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1294
1526
|
if (event.type === "TestCaseProgress") {
|
|
1295
1527
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
1296
|
-
|
|
1528
|
+
numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
|
|
1297
1529
|
for (const item of event.evaluatorScores) {
|
|
1298
1530
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1299
1531
|
if (numeric !== void 0) {
|
|
1300
1532
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
1301
1533
|
total: 0,
|
|
1534
|
+
sumSq: 0,
|
|
1302
1535
|
count: 0,
|
|
1303
1536
|
passed: 0,
|
|
1304
1537
|
failed: 0
|
|
1305
1538
|
};
|
|
1306
1539
|
aggregates.set(item.evaluatorId, {
|
|
1307
1540
|
total: current.total + numeric,
|
|
1541
|
+
sumSq: current.sumSq + numeric * numeric,
|
|
1308
1542
|
count: current.count + 1,
|
|
1309
1543
|
passed: current.passed + (item.passed ? 1 : 0),
|
|
1310
1544
|
failed: current.failed + (item.passed ? 0 : 1)
|
|
1311
1545
|
});
|
|
1312
1546
|
overallScoreTotal += numeric;
|
|
1547
|
+
overallScoreSumSq += numeric * numeric;
|
|
1313
1548
|
overallScoreCount += 1;
|
|
1314
1549
|
}
|
|
1315
1550
|
}
|
|
1316
|
-
setTestCases((prev) =>
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
completedTestCases: event.completedTestCases,
|
|
1321
|
-
totalTestCases: event.totalTestCases,
|
|
1322
|
-
durationMs: event.durationMs,
|
|
1323
|
-
passed: event.passed,
|
|
1324
|
-
averageScore,
|
|
1551
|
+
setTestCases((prev) => {
|
|
1552
|
+
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
1553
|
+
const existing = byId.get(event.testCaseId);
|
|
1554
|
+
const newEvent = {
|
|
1325
1555
|
evaluatorScores: event.evaluatorScores.map((item) => ({
|
|
1326
1556
|
evaluatorId: item.evaluatorId,
|
|
1327
1557
|
evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
|
|
@@ -1329,9 +1559,33 @@ function RunView({
|
|
|
1329
1559
|
passed: item.passed,
|
|
1330
1560
|
metrics: item.metrics,
|
|
1331
1561
|
logs: item.logs
|
|
1332
|
-
}))
|
|
1333
|
-
|
|
1334
|
-
|
|
1562
|
+
})),
|
|
1563
|
+
passed: event.passed,
|
|
1564
|
+
durationMs: event.durationMs
|
|
1565
|
+
};
|
|
1566
|
+
const events = existing ? [...existing.events, newEvent] : [newEvent];
|
|
1567
|
+
const isAggregated = events.length > 1;
|
|
1568
|
+
const aggregatedEvaluatorScores = aggregateEvaluatorScores(
|
|
1569
|
+
events,
|
|
1570
|
+
nameById
|
|
1571
|
+
);
|
|
1572
|
+
const merged = {
|
|
1573
|
+
name: event.testCaseName,
|
|
1574
|
+
testCaseId: event.testCaseId,
|
|
1575
|
+
completedTestCases: event.completedTestCases,
|
|
1576
|
+
totalTestCases: event.totalTestCases,
|
|
1577
|
+
rerunIndex: event.rerunIndex,
|
|
1578
|
+
rerunTotal: event.rerunTotal,
|
|
1579
|
+
durationMs: events.reduce((s, e) => s + e.durationMs, 0),
|
|
1580
|
+
passed: events.every((e) => e.passed),
|
|
1581
|
+
events,
|
|
1582
|
+
aggregatedEvaluatorScores,
|
|
1583
|
+
isAggregated
|
|
1584
|
+
};
|
|
1585
|
+
byId.set(event.testCaseId, merged);
|
|
1586
|
+
setCompletedEvaluations(event.completedTestCases);
|
|
1587
|
+
return Array.from(byId.values());
|
|
1588
|
+
});
|
|
1335
1589
|
}
|
|
1336
1590
|
if (event.type === "RunCompleted" || event.type === "RunFailed") {
|
|
1337
1591
|
unsubscribe();
|
|
@@ -1346,9 +1600,7 @@ function RunView({
|
|
|
1346
1600
|
setRunInfo({
|
|
1347
1601
|
runId: snapshot.runId,
|
|
1348
1602
|
datasetName: snapshot.datasetName,
|
|
1349
|
-
evaluatorNames: evaluators.map(
|
|
1350
|
-
(e) => e.evaluator.getName() ?? e.id
|
|
1351
|
-
),
|
|
1603
|
+
evaluatorNames: evaluators.map((e) => e.evaluator.getName() ?? e.id),
|
|
1352
1604
|
totalTestCases: snapshot.totalTestCases
|
|
1353
1605
|
});
|
|
1354
1606
|
setPhase("running");
|
|
@@ -1362,6 +1614,7 @@ function RunView({
|
|
|
1362
1614
|
failedTestCases: finalEvent.failedTestCases,
|
|
1363
1615
|
totalTestCases: finalEvent.totalTestCases,
|
|
1364
1616
|
overallScoreTotal,
|
|
1617
|
+
overallScoreSumSq,
|
|
1365
1618
|
overallScoreCount,
|
|
1366
1619
|
aggregates: new Map(aggregates),
|
|
1367
1620
|
artifactPath: finalEvent.artifactPath
|
|
@@ -1376,29 +1629,41 @@ function RunView({
|
|
|
1376
1629
|
/* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
|
|
1377
1630
|
runInfo && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
|
|
1378
1631
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1379
|
-
/* @__PURE__ */
|
|
1632
|
+
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1633
|
+
"Run",
|
|
1634
|
+
" "
|
|
1635
|
+
] }),
|
|
1380
1636
|
/* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.runId })
|
|
1381
1637
|
] }),
|
|
1382
1638
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1383
|
-
/* @__PURE__ */
|
|
1639
|
+
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1640
|
+
"Dataset",
|
|
1641
|
+
" "
|
|
1642
|
+
] }),
|
|
1384
1643
|
runInfo.datasetName
|
|
1385
1644
|
] }),
|
|
1386
1645
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1387
|
-
/* @__PURE__ */
|
|
1646
|
+
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1647
|
+
"Evaluators",
|
|
1648
|
+
" "
|
|
1649
|
+
] }),
|
|
1388
1650
|
runInfo.evaluatorNames.join(", ")
|
|
1389
1651
|
] }),
|
|
1390
1652
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1391
|
-
/* @__PURE__ */
|
|
1653
|
+
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1654
|
+
"Test cases",
|
|
1655
|
+
" "
|
|
1656
|
+
] }),
|
|
1392
1657
|
runInfo.totalTestCases
|
|
1393
1658
|
] })
|
|
1394
1659
|
] }),
|
|
1395
1660
|
phase === "running" && /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(
|
|
1396
1661
|
Spinner,
|
|
1397
1662
|
{
|
|
1398
|
-
label: `Evaluations ${
|
|
1663
|
+
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
|
|
1399
1664
|
}
|
|
1400
1665
|
) }),
|
|
1401
|
-
testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc
|
|
1666
|
+
testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1402
1667
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1403
1668
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
|
|
1404
1669
|
"[",
|
|
@@ -1409,49 +1674,78 @@ function RunView({
|
|
|
1409
1674
|
] }),
|
|
1410
1675
|
" ",
|
|
1411
1676
|
tc.name,
|
|
1677
|
+
" ",
|
|
1678
|
+
/* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
|
|
1679
|
+
"(",
|
|
1680
|
+
tc.rerunIndex,
|
|
1681
|
+
"/",
|
|
1682
|
+
tc.rerunTotal,
|
|
1683
|
+
")"
|
|
1684
|
+
] }),
|
|
1412
1685
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1413
1686
|
" (",
|
|
1414
1687
|
tc.durationMs,
|
|
1415
1688
|
"ms)"
|
|
1416
1689
|
] })
|
|
1417
1690
|
] }),
|
|
1418
|
-
tc.
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
|
|
1430
|
-
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1691
|
+
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(
|
|
1692
|
+
Box,
|
|
1693
|
+
{
|
|
1694
|
+
flexDirection: "column",
|
|
1695
|
+
marginLeft: 2,
|
|
1696
|
+
children: [
|
|
1697
|
+
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1698
|
+
item.evaluatorName,
|
|
1699
|
+
":",
|
|
1700
|
+
" ",
|
|
1701
|
+
/* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
1702
|
+
" ",
|
|
1703
|
+
item.scores.map((s) => /* @__PURE__ */ jsxs(
|
|
1704
|
+
Text,
|
|
1705
|
+
{
|
|
1706
|
+
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
1707
|
+
children: [
|
|
1708
|
+
formatScorePart(s, scoreColor, {
|
|
1709
|
+
isAggregated: tc.isAggregated
|
|
1710
|
+
}),
|
|
1711
|
+
" "
|
|
1712
|
+
]
|
|
1713
|
+
},
|
|
1714
|
+
s.id
|
|
1715
|
+
)),
|
|
1716
|
+
item.metrics?.map((m) => {
|
|
1717
|
+
const def = getMetricById(m.id);
|
|
1718
|
+
if (!def)
|
|
1719
|
+
return null;
|
|
1720
|
+
const formatted = def.format(m.data, {
|
|
1721
|
+
isAggregated: tc.isAggregated
|
|
1722
|
+
});
|
|
1723
|
+
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1724
|
+
"[",
|
|
1725
|
+
def.name ? `${def.name}: ` : "",
|
|
1726
|
+
formatted,
|
|
1727
|
+
"]",
|
|
1728
|
+
" "
|
|
1729
|
+
] }, m.id);
|
|
1730
|
+
})
|
|
1731
|
+
] }),
|
|
1732
|
+
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1733
|
+
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1734
|
+
({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
|
|
1735
|
+
Text,
|
|
1736
|
+
{
|
|
1737
|
+
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
1738
|
+
children: line
|
|
1739
|
+
},
|
|
1740
|
+
lineIdx
|
|
1741
|
+
)
|
|
1742
|
+
) }, logIdx) : null
|
|
1743
|
+
) })
|
|
1744
|
+
]
|
|
1745
|
+
},
|
|
1746
|
+
item.evaluatorId
|
|
1747
|
+
))
|
|
1748
|
+
] }, tc.testCaseId)) }),
|
|
1455
1749
|
phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
|
|
1456
1750
|
/* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
|
|
1457
1751
|
/* @__PURE__ */ jsxs(Box, { marginTop: 1, children: [
|
|
@@ -1478,7 +1772,14 @@ function RunView({
|
|
|
1478
1772
|
label: "overall avg",
|
|
1479
1773
|
value: summary.overallScoreTotal / summary.overallScoreCount,
|
|
1480
1774
|
barWidth: 20,
|
|
1481
|
-
format: (v) =>
|
|
1775
|
+
format: (v) => {
|
|
1776
|
+
const sd = sampleStdDev(
|
|
1777
|
+
summary.overallScoreTotal,
|
|
1778
|
+
summary.overallScoreSumSq,
|
|
1779
|
+
summary.overallScoreCount
|
|
1780
|
+
);
|
|
1781
|
+
return sd !== void 0 ? `${v.toFixed(2)} \xB1 ${sd.toFixed(2)}` : v.toFixed(2);
|
|
1782
|
+
}
|
|
1482
1783
|
}
|
|
1483
1784
|
) }),
|
|
1484
1785
|
/* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
|
|
@@ -1493,12 +1794,15 @@ function RunView({
|
|
|
1493
1794
|
] }, id);
|
|
1494
1795
|
}
|
|
1495
1796
|
const mean = agg.total / agg.count;
|
|
1797
|
+
const sd = sampleStdDev(agg.total, agg.sumSq, agg.count);
|
|
1798
|
+
const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
|
|
1496
1799
|
return /* @__PURE__ */ jsxs(Text, { children: [
|
|
1497
1800
|
"- ",
|
|
1498
1801
|
name.padEnd(28),
|
|
1499
1802
|
" avg=",
|
|
1500
|
-
/* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children:
|
|
1501
|
-
"
|
|
1803
|
+
/* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: meanStr }),
|
|
1804
|
+
" ",
|
|
1805
|
+
"passed=",
|
|
1502
1806
|
agg.passed,
|
|
1503
1807
|
" failed=",
|
|
1504
1808
|
agg.failed
|
|
@@ -1507,28 +1811,41 @@ function RunView({
|
|
|
1507
1811
|
] }),
|
|
1508
1812
|
/* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
|
|
1509
1813
|
/* @__PURE__ */ jsx(Text, { color: "magenta", children: "test case scores" }),
|
|
1510
|
-
testCases.map((tc
|
|
1511
|
-
|
|
1512
|
-
|
|
1513
|
-
|
|
1514
|
-
|
|
1515
|
-
|
|
1516
|
-
|
|
1517
|
-
|
|
1518
|
-
|
|
1519
|
-
|
|
1814
|
+
testCases.map((tc) => {
|
|
1815
|
+
const allScores = tc.events.flatMap(
|
|
1816
|
+
(ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
|
|
1817
|
+
);
|
|
1818
|
+
const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : void 0;
|
|
1819
|
+
const sumSq = allScores.length > 0 ? allScores.reduce((s, v) => s + v * v, 0) : 0;
|
|
1820
|
+
const total = allScores.reduce((a, b) => a + b, 0);
|
|
1821
|
+
const tcStdDev = sampleStdDev(total, sumSq, allScores.length);
|
|
1822
|
+
const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
|
|
1823
|
+
const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
|
|
1824
|
+
isAggregated: true
|
|
1825
|
+
}) : averageScore !== void 0 ? tcStdDev !== void 0 && tc.isAggregated ? `${averageScore.toFixed(2)} \xB1 ${tcStdDev.toFixed(2)}` : averageScore.toFixed(2) : "n/a";
|
|
1826
|
+
return /* @__PURE__ */ jsxs(Box, { children: [
|
|
1827
|
+
/* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
|
|
1828
|
+
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1829
|
+
" ",
|
|
1830
|
+
tc.name.padEnd(24)
|
|
1520
1831
|
] }),
|
|
1832
|
+
averageScore !== void 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
1833
|
+
/* @__PURE__ */ jsxs(Text, { color: scoreColor(averageScore), children: [
|
|
1834
|
+
"score=",
|
|
1835
|
+
scoreLabel
|
|
1836
|
+
] }),
|
|
1837
|
+
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1838
|
+
" ",
|
|
1839
|
+
createBar(averageScore, 100, 14)
|
|
1840
|
+
] })
|
|
1841
|
+
] }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "score=n/a" }),
|
|
1521
1842
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1522
|
-
" ",
|
|
1523
|
-
|
|
1843
|
+
" (",
|
|
1844
|
+
tc.durationMs,
|
|
1845
|
+
"ms)"
|
|
1524
1846
|
] })
|
|
1525
|
-
] }
|
|
1526
|
-
|
|
1527
|
-
" (",
|
|
1528
|
-
tc.durationMs,
|
|
1529
|
-
"ms)"
|
|
1530
|
-
] })
|
|
1531
|
-
] }, i))
|
|
1847
|
+
] }, tc.testCaseId);
|
|
1848
|
+
})
|
|
1532
1849
|
] }),
|
|
1533
1850
|
/* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1534
1851
|
"artifact: ",
|
|
@@ -1539,6 +1856,61 @@ function RunView({
|
|
|
1539
1856
|
}
|
|
1540
1857
|
|
|
1541
1858
|
// src/cli-simple/run.ts
|
|
1859
|
+
/**
 * Sample standard deviation (n - 1 denominator) computed from running totals.
 *
 * @param {number} sum - Sum of the observed values.
 * @param {number} sumSq - Sum of the squares of the observed values.
 * @param {number} n - Number of observed values.
 * @returns {number | undefined} `undefined` when fewer than two values were
 *   observed, otherwise the standard deviation.
 */
function sampleStdDev2(sum, sumSq, n) {
  if (n < 2) {
    return void 0;
  }
  const mean = sum / n;
  const variance = (sumSq - n * mean * mean) / (n - 1);
  // A variance that is not positive (floating-point cancellation can produce
  // one) is reported as 0 instead of letting Math.sqrt return NaN.
  return variance > 0 ? Math.sqrt(variance) : 0;
}
|
|
1866
|
+
/**
 * Builds one summary row per test case from the events recorded for it.
 *
 * @param {Map<string, {name: string, events: Array<object>}>} byId - Recorded
 *   test cases. Each event is read for `passed`, `durationMs` and
 *   `evaluatorScores` (entries carrying `evaluatorId` and `scores`).
 * @returns {Array<object>} One object per map entry, in map iteration order,
 *   with `name`, `averageScore`, `stdDev`, `aggregatedScoreItem`,
 *   `isAggregated`, `durationMs` and `passed`.
 */
function buildTestCaseSummaries(byId) {
  const summaries = [];
  for (const { name, events } of byId.values()) {
    // One numeric score per evaluator per event; evaluators for which
    // toNumericScoreFromScores yields undefined are left out.
    const allScores = events.flatMap(
      (ev) => ev.evaluatorScores.map((es) => toNumericScoreFromScores(es.scores)).filter((n) => n !== void 0)
    );
    const total = allScores.reduce((acc, value) => acc + value, 0);
    const sumSq = allScores.reduce((acc, value) => acc + value * value, 0);
    const averageScore = allScores.length > 0 ? total / allScores.length : void 0;
    const stdDev = sampleStdDev2(total, sumSq, allScores.length);

    // Walk the evaluators in the order of the first event and keep the first
    // truthy result of aggregateScoreItems over that evaluator's scores,
    // grouped by score id across all events.
    let firstAggregatedScore;
    for (const firstEventScores of events[0]?.evaluatorScores ?? []) {
      const scoreIdToItems = /* @__PURE__ */ new Map();
      for (const ev of events) {
        const es = ev.evaluatorScores.find(
          (x) => x.evaluatorId === firstEventScores.evaluatorId
        );
        for (const s of es?.scores ?? []) {
          const list = scoreIdToItems.get(s.id) ?? [];
          list.push(s);
          scoreIdToItems.set(s.id, list);
        }
      }
      for (const items of scoreIdToItems.values()) {
        const agg = aggregateScoreItems(items);
        if (agg) {
          firstAggregatedScore = agg;
          break;
        }
      }
      if (firstAggregatedScore !== void 0) {
        break;
      }
    }

    summaries.push({
      name,
      averageScore,
      stdDev,
      aggregatedScoreItem: firstAggregatedScore,
      isAggregated: events.length > 1,
      durationMs: events.reduce((sum, e) => sum + e.durationMs, 0),
      passed: events.every((e) => e.passed)
    });
  }
  return summaries;
}
|
|
1542
1914
|
var ansi2 = {
|
|
1543
1915
|
reset: "\x1B[0m",
|
|
1544
1916
|
bold: "\x1B[1m",
|
|
@@ -1566,14 +1938,59 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
|
|
|
1566
1938
|
return `- ${evaluatorName.padEnd(28)} no numeric scores`;
|
|
1567
1939
|
}
|
|
1568
1940
|
const mean = aggregate.total / aggregate.count;
|
|
1569
|
-
|
|
1941
|
+
const sd = sampleStdDev2(aggregate.total, aggregate.sumSq, aggregate.count);
|
|
1942
|
+
const meanStr = sd !== void 0 ? `${mean.toFixed(2)} \xB1 ${sd.toFixed(2)}` : mean.toFixed(2);
|
|
1943
|
+
return `- ${evaluatorName.padEnd(28)} avg=${colorize(meanStr, scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
|
|
1570
1944
|
}
|
|
1571
1945
|
/**
 * Renders a fixed-width text progress bar.
 *
 * @param {number} value - Value to plot; clamped into `[0, max]`.
 * @param {number} [max=100] - Value that corresponds to a completely filled bar.
 * @param {number} [width=20] - Total number of cells in the bar.
 * @returns {string} `width` characters: filled cells (U+2588) followed by
 *   empty cells (U+2591).
 */
function createBar2(value, max = 100, width = 20) {
  const safe = Math.max(0, Math.min(max, value));
  const ratio = safe / max;
  // A NaN value or max === 0 makes the ratio non-finite; draw an empty bar of
  // the requested width instead of collapsing to an empty string.
  const filled = Number.isFinite(ratio) ? Math.round(ratio * width) : 0;
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
}
|
|
1576
|
-
function
|
|
1950
|
+
/**
 * Combines the evaluator results of several events into one entry per
 * evaluator.
 *
 * @param {Array<object>} events - Events to combine; each is read for
 *   `evaluatorScores` (entries with `evaluatorId`, `passed`, `scores` and
 *   optional `metrics`).
 * @param {Map<string, string>} [evaluatorNameById] - Not read by this
 *   function; kept so existing call sites keep working.
 * @returns {Array<object>} One `{ evaluatorId, scores, passed, metrics }`
 *   entry per distinct evaluator id, in first-seen order. `passed` is true
 *   only when every event has a passing entry for that evaluator; `metrics`
 *   is `undefined` when no aggregated metric was produced.
 */
function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
  if (events.length === 0) {
    return [];
  }
  const evaluatorIds = new Set(
    events.flatMap((ev) => ev.evaluatorScores.map((es) => es.evaluatorId))
  );
  // Appends `item` to the list stored under `key`, creating it when missing.
  const collect = (map, key, item) => {
    const list = map.get(key) ?? [];
    list.push(item);
    map.set(key, list);
  };
  const result = [];
  for (const evaluatorId of evaluatorIds) {
    const scoreIdToItems = /* @__PURE__ */ new Map();
    const metricIdToItems = /* @__PURE__ */ new Map();
    let passed = true;
    // Single pass per evaluator: group scores and metrics by id and track
    // whether every event passed.
    for (const ev of events) {
      const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
      if (!es?.passed) {
        // An event with no entry for this evaluator counts as a failure.
        passed = false;
      }
      for (const s of es?.scores ?? []) {
        collect(scoreIdToItems, s.id, s);
      }
      for (const m of es?.metrics ?? []) {
        collect(metricIdToItems, m.id, m);
      }
    }
    const aggregatedScores = [];
    for (const items of scoreIdToItems.values()) {
      const agg = aggregateScoreItems(items);
      if (agg) {
        aggregatedScores.push(agg);
      }
    }
    const aggregatedMetrics = [];
    for (const items of metricIdToItems.values()) {
      const agg = aggregateMetricItems(items);
      if (agg !== void 0) {
        aggregatedMetrics.push(agg);
      }
    }
    result.push({
      evaluatorId,
      scores: aggregatedScores,
      passed,
      metrics: aggregatedMetrics.length > 0 ? aggregatedMetrics : void 0
    });
  }
  return result;
}
|
|
1993
|
+
function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
1577
1994
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
1578
1995
|
const scoreParts = [];
|
|
1579
1996
|
for (const item of scores) {
|
|
@@ -1585,7 +2002,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
1585
2002
|
);
|
|
1586
2003
|
continue;
|
|
1587
2004
|
}
|
|
1588
|
-
const formatted = def.format(item.data);
|
|
2005
|
+
const formatted = def.format(item.data, options);
|
|
1589
2006
|
switch (def.displayStrategy) {
|
|
1590
2007
|
case "bar": {
|
|
1591
2008
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
@@ -1618,7 +2035,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
1618
2035
|
for (const { id, data } of metrics) {
|
|
1619
2036
|
const def = getMetricById(id);
|
|
1620
2037
|
if (def) {
|
|
1621
|
-
const formatted = def.format(data);
|
|
2038
|
+
const formatted = def.format(data, options);
|
|
1622
2039
|
metricParts.push(
|
|
1623
2040
|
def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
|
|
1624
2041
|
);
|
|
@@ -1651,8 +2068,9 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1651
2068
|
evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
|
|
1652
2069
|
);
|
|
1653
2070
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1654
|
-
const
|
|
2071
|
+
const testCaseByTestId = /* @__PURE__ */ new Map();
|
|
1655
2072
|
let overallScoreTotal = 0;
|
|
2073
|
+
let overallScoreSumSq = 0;
|
|
1656
2074
|
let overallScoreCount = 0;
|
|
1657
2075
|
let completedCount = 0;
|
|
1658
2076
|
let totalCount = 0;
|
|
@@ -1665,6 +2083,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1665
2083
|
}
|
|
1666
2084
|
process.stdout.write("\r\x1B[2K");
|
|
1667
2085
|
}
|
|
2086
|
+
/**
 * Moves the terminal cursor up by `n` lines using the `ESC [ n A` sequence.
 * Does nothing when stdout is not a TTY or when `n` is not a positive,
 * finite number, so a malformed escape sequence is never written.
 *
 * @param {number} n - Number of lines to move up.
 */
function cursorUp(n) {
  if (!process.stdout.isTTY || !Number.isFinite(n) || n <= 0) {
    return;
  }
  process.stdout.write(`\x1B[${n}A`);
}
|
|
1668
2091
|
function drawSpinner() {
|
|
1669
2092
|
if (!process.stdout.isTTY || runFinished) {
|
|
1670
2093
|
return;
|
|
@@ -1678,6 +2101,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1678
2101
|
)} ${colorize("(live)", ansi2.dim)}`
|
|
1679
2102
|
);
|
|
1680
2103
|
}
|
|
2104
|
+
let lastPrintedTestCaseId = null;
|
|
2105
|
+
let lastPrintedLineCount = 0;
|
|
1681
2106
|
let spinnerTimer;
|
|
1682
2107
|
const done = new Promise((resolve5) => {
|
|
1683
2108
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
@@ -1685,55 +2110,94 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1685
2110
|
completedCount = event.completedTestCases;
|
|
1686
2111
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
1687
2112
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
2113
|
+
const testCaseId = event.testCaseId;
|
|
2114
|
+
const existing = testCaseByTestId.get(testCaseId) ?? {
|
|
2115
|
+
name: event.testCaseName,
|
|
2116
|
+
events: []
|
|
2117
|
+
};
|
|
2118
|
+
existing.events.push({
|
|
2119
|
+
averageScore,
|
|
2120
|
+
passed: event.passed,
|
|
2121
|
+
durationMs: event.durationMs,
|
|
2122
|
+
evaluatorScores: event.evaluatorScores
|
|
2123
|
+
});
|
|
2124
|
+
testCaseByTestId.set(testCaseId, existing);
|
|
1692
2125
|
for (const item of event.evaluatorScores) {
|
|
1693
|
-
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
1694
|
-
console.log(
|
|
1695
|
-
formatEvaluatorScoreLine(
|
|
1696
|
-
name,
|
|
1697
|
-
item.scores,
|
|
1698
|
-
item.passed,
|
|
1699
|
-
item.metrics
|
|
1700
|
-
)
|
|
1701
|
-
);
|
|
1702
|
-
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
1703
|
-
for (const log of item.logs) {
|
|
1704
|
-
if (log.type === "diff") {
|
|
1705
|
-
const useColor = process.stdout.isTTY;
|
|
1706
|
-
for (const { type, line } of getDiffLines(log)) {
|
|
1707
|
-
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
1708
|
-
console.log(colored);
|
|
1709
|
-
}
|
|
1710
|
-
}
|
|
1711
|
-
}
|
|
1712
|
-
}
|
|
1713
2126
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1714
2127
|
if (numeric !== void 0) {
|
|
1715
2128
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
1716
2129
|
total: 0,
|
|
2130
|
+
sumSq: 0,
|
|
1717
2131
|
count: 0,
|
|
1718
2132
|
passed: 0,
|
|
1719
2133
|
failed: 0
|
|
1720
2134
|
};
|
|
1721
2135
|
aggregates.set(item.evaluatorId, {
|
|
1722
2136
|
total: current.total + numeric,
|
|
2137
|
+
sumSq: current.sumSq + numeric * numeric,
|
|
1723
2138
|
count: current.count + 1,
|
|
1724
2139
|
passed: current.passed + (item.passed ? 1 : 0),
|
|
1725
2140
|
failed: current.failed + (item.passed ? 0 : 1)
|
|
1726
2141
|
});
|
|
1727
2142
|
overallScoreTotal += numeric;
|
|
2143
|
+
overallScoreSumSq += numeric * numeric;
|
|
1728
2144
|
overallScoreCount += 1;
|
|
1729
2145
|
}
|
|
1730
2146
|
}
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
|
|
2147
|
+
const isSameTestCase = lastPrintedTestCaseId === testCaseId;
|
|
2148
|
+
const isLastRerun = event.rerunIndex >= event.rerunTotal;
|
|
2149
|
+
const isNonTty = !process.stdout.isTTY;
|
|
2150
|
+
const skipPrintNonTty = isNonTty && event.rerunTotal > 1 && !isLastRerun;
|
|
2151
|
+
if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
|
|
2152
|
+
cursorUp(lastPrintedLineCount);
|
|
2153
|
+
}
|
|
2154
|
+
const aggregatedScores = aggregateEvaluatorScoresFromEvents(
|
|
2155
|
+
existing.events);
|
|
2156
|
+
const isAggregated = existing.events.length > 1;
|
|
2157
|
+
const durationMs = existing.events.reduce(
|
|
2158
|
+
(s, e) => s + e.durationMs,
|
|
2159
|
+
0
|
|
2160
|
+
);
|
|
2161
|
+
existing.events.every((e) => e.passed);
|
|
2162
|
+
const lines = [];
|
|
2163
|
+
lines.push(
|
|
2164
|
+
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
|
|
2165
|
+
);
|
|
2166
|
+
for (const item of aggregatedScores) {
|
|
2167
|
+
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2168
|
+
lines.push(
|
|
2169
|
+
formatEvaluatorScoreLine(
|
|
2170
|
+
name,
|
|
2171
|
+
item.scores,
|
|
2172
|
+
item.passed,
|
|
2173
|
+
item.metrics,
|
|
2174
|
+
{ isAggregated }
|
|
2175
|
+
)
|
|
2176
|
+
);
|
|
2177
|
+
const lastEvent = existing.events[existing.events.length - 1];
|
|
2178
|
+
const lastEs = lastEvent?.evaluatorScores.find(
|
|
2179
|
+
(x) => x.evaluatorId === item.evaluatorId
|
|
2180
|
+
);
|
|
2181
|
+
if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
|
|
2182
|
+
for (const log of lastEs.logs) {
|
|
2183
|
+
if (log.type === "diff") {
|
|
2184
|
+
const useColor = process.stdout.isTTY;
|
|
2185
|
+
for (const { type, line } of getDiffLines(log)) {
|
|
2186
|
+
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
2187
|
+
lines.push(colored);
|
|
2188
|
+
}
|
|
2189
|
+
}
|
|
2190
|
+
}
|
|
2191
|
+
}
|
|
2192
|
+
}
|
|
2193
|
+
if (!skipPrintNonTty) {
|
|
2194
|
+
for (let i = 0; i < lines.length; i++) {
|
|
2195
|
+
process.stdout.write(`\r\x1B[2K${lines[i]}
|
|
2196
|
+
`);
|
|
2197
|
+
}
|
|
2198
|
+
lastPrintedTestCaseId = testCaseId;
|
|
2199
|
+
lastPrintedLineCount = lines.length;
|
|
2200
|
+
}
|
|
1737
2201
|
drawSpinner();
|
|
1738
2202
|
}
|
|
1739
2203
|
if (event.type === "RunCompleted" || event.type === "RunFailed") {
|
|
@@ -1784,9 +2248,15 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1784
2248
|
);
|
|
1785
2249
|
if (overallScoreCount > 0) {
|
|
1786
2250
|
const overallAverage = overallScoreTotal / overallScoreCount;
|
|
2251
|
+
const overallSd = sampleStdDev2(
|
|
2252
|
+
overallScoreTotal,
|
|
2253
|
+
overallScoreSumSq,
|
|
2254
|
+
overallScoreCount
|
|
2255
|
+
);
|
|
2256
|
+
const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
|
|
1787
2257
|
console.log(
|
|
1788
2258
|
`- overall avg score: ${colorize(
|
|
1789
|
-
|
|
2259
|
+
avgStr,
|
|
1790
2260
|
scoreToColor(overallAverage)
|
|
1791
2261
|
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
1792
2262
|
);
|
|
@@ -1797,6 +2267,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1797
2267
|
getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
|
|
1798
2268
|
);
|
|
1799
2269
|
}
|
|
2270
|
+
const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
|
|
1800
2271
|
if (testCaseSummaries.length > 0) {
|
|
1801
2272
|
console.log(colorize("- test case scores:", ansi2.magenta));
|
|
1802
2273
|
for (const summary of testCaseSummaries) {
|
|
@@ -1807,9 +2278,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1807
2278
|
);
|
|
1808
2279
|
continue;
|
|
1809
2280
|
}
|
|
2281
|
+
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
|
|
2282
|
+
summary.aggregatedScoreItem.data,
|
|
2283
|
+
{ isAggregated: true }
|
|
2284
|
+
) ?? summary.averageScore.toFixed(2) : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
|
|
1810
2285
|
console.log(
|
|
1811
2286
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
1812
|
-
|
|
2287
|
+
scoreLabel,
|
|
1813
2288
|
scoreToColor(summary.averageScore)
|
|
1814
2289
|
)} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
|
|
1815
2290
|
);
|