@m4trix/evals 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +599 -224
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +600 -225
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +214 -105
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +215 -106
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +217 -104
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +23 -5
- package/dist/index.js +218 -105
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { randomUUID } from 'crypto';
|
|
3
|
-
import { Effect, PubSub, Queue, Fiber } from 'effect';
|
|
3
|
+
import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
|
|
4
4
|
import { existsSync } from 'fs';
|
|
5
5
|
import { resolve, relative, join, parse, dirname } from 'path';
|
|
6
6
|
import * as jitiModule from 'jiti';
|
|
@@ -30,7 +30,8 @@ var defaultRunnerConfig = {
|
|
|
30
30
|
],
|
|
31
31
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
32
32
|
},
|
|
33
|
-
artifactDirectory: ".eval-results"
|
|
33
|
+
artifactDirectory: ".eval-results",
|
|
34
|
+
maxConcurrency: 1
|
|
34
35
|
};
|
|
35
36
|
function toRunnerConfigOverrides(config) {
|
|
36
37
|
if (!config) {
|
|
@@ -63,6 +64,9 @@ function toRunnerConfigOverrides(config) {
|
|
|
63
64
|
if (config.artifactDirectory !== void 0) {
|
|
64
65
|
overrides.artifactDirectory = config.artifactDirectory;
|
|
65
66
|
}
|
|
67
|
+
if (config.maxConcurrency !== void 0) {
|
|
68
|
+
overrides.maxConcurrency = config.maxConcurrency;
|
|
69
|
+
}
|
|
66
70
|
if (Object.keys(discovery).length > 0) {
|
|
67
71
|
overrides.discovery = discovery;
|
|
68
72
|
}
|
|
@@ -287,6 +291,7 @@ var Metric = {
|
|
|
287
291
|
const def = {
|
|
288
292
|
id: config.id,
|
|
289
293
|
name: config.name,
|
|
294
|
+
aggregate: config.aggregate,
|
|
290
295
|
format: config.format,
|
|
291
296
|
make: (data) => ({ id: config.id, data })
|
|
292
297
|
};
|
|
@@ -306,6 +311,7 @@ var Score = {
|
|
|
306
311
|
id: config.id,
|
|
307
312
|
name: config.name,
|
|
308
313
|
displayStrategy: config.displayStrategy,
|
|
314
|
+
aggregate: config.aggregate,
|
|
309
315
|
format: config.format,
|
|
310
316
|
make: (data, options) => {
|
|
311
317
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
@@ -324,23 +330,62 @@ function getScoreById(id) {
|
|
|
324
330
|
return registry2.get(id);
|
|
325
331
|
}
|
|
326
332
|
|
|
333
|
+
// src/evals/aggregators.ts
|
|
334
|
+
/**
 * Computes the arithmetic mean of the `value` field across score payloads.
 *
 * @param {Array<{ value: number }>} values - Score payloads to average.
 * @returns {{ value: number }} The mean, or `{ value: 0 }` for an empty list
 *   (avoids a 0/0 division yielding NaN).
 */
function aggregateAverage(values) {
  if (values.length === 0) {
    return { value: 0 };
  }
  const sum = values.reduce((total, entry) => total + entry.value, 0);
  return { value: sum / values.length };
}
|
|
341
|
+
/**
 * Collapses several pass/fail payloads into one.
 *
 * @param {Array<{ passed: boolean }>} values - Payloads to combine.
 * @returns {{ passed: boolean }} `passed` is true only when the list is
 *   non-empty and every entry passed; an empty list counts as not passed.
 */
function aggregateAll(values) {
  const everyEntryPassed = values.length > 0 && values.every((entry) => entry.passed);
  return { passed: everyEntryPassed };
}
|
|
344
|
+
/**
 * Sums token-count payloads field by field.
 *
 * @param {Array<{ input?: number, output?: number, inputCached?: number, outputCached?: number }>} values
 *   Token-count payloads; a missing (null/undefined) field counts as 0.
 * @returns {{ input: number, output: number, inputCached: number, outputCached: number }}
 *   The per-field totals; all zeros for an empty list.
 */
function aggregateTokenCountSum(values) {
  const totals = {
    input: 0,
    output: 0,
    inputCached: 0,
    outputCached: 0
  };
  return values.reduce(
    (acc, entry) => ({
      input: acc.input + (entry.input ?? 0),
      output: acc.output + (entry.output ?? 0),
      inputCached: acc.inputCached + (entry.inputCached ?? 0),
      outputCached: acc.outputCached + (entry.outputCached ?? 0)
    }),
    totals
  );
}
|
|
361
|
+
/**
 * Computes the mean of the `ms` field across latency payloads.
 *
 * @param {Array<{ ms: number }>} values - Latency payloads to average.
 * @returns {{ ms: number }} The mean latency, or `{ ms: 0 }` for an empty
 *   list (avoids a 0/0 division yielding NaN).
 */
function aggregateLatencyAverage(values) {
  if (values.length === 0) {
    return { ms: 0 };
  }
  const totalMs = values.reduce((total, entry) => total + entry.ms, 0);
  return { ms: totalMs / values.length };
}
|
|
368
|
+
|
|
327
369
|
// src/evals/metrics/standard.ts
|
|
328
370
|
Metric.of({
|
|
329
371
|
id: "token-count",
|
|
330
372
|
name: "Tokens",
|
|
331
|
-
|
|
373
|
+
aggregate: aggregateTokenCountSum,
|
|
374
|
+
format: (data, options) => {
|
|
332
375
|
const input = data.input ?? 0;
|
|
333
376
|
const output = data.output ?? 0;
|
|
334
377
|
const inputCached = data.inputCached ?? 0;
|
|
335
378
|
const outputCached = data.outputCached ?? 0;
|
|
336
379
|
const cached = inputCached + outputCached;
|
|
337
|
-
|
|
380
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
381
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
338
382
|
}
|
|
339
383
|
});
|
|
340
384
|
// Registers the built-in "latency" metric. Reruns are combined with
// aggregateLatencyAverage, so the aggregated value is a mean and is usually
// fractional; it is rounded to two decimals for display so the label does not
// show a long float tail. Whole-number means and single-run values print
// exactly as before.
Metric.of({
  id: "latency",
  name: "Latency",
  aggregate: aggregateLatencyAverage,
  format: (data, options) => options?.isAggregated ? `Avg: ${Math.round(data.ms * 100) / 100}ms` : `${data.ms}ms`
});
|
|
345
390
|
|
|
346
391
|
// src/evals/scores/standard.ts
|
|
@@ -348,16 +393,36 @@ Score.of({
|
|
|
348
393
|
id: "percent",
|
|
349
394
|
name: "Score",
|
|
350
395
|
displayStrategy: "bar",
|
|
351
|
-
format: (data) => data.value.toFixed(2)
|
|
396
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
|
|
397
|
+
aggregate: aggregateAverage
|
|
352
398
|
});
|
|
353
399
|
Score.of({
|
|
354
400
|
id: "binary",
|
|
355
401
|
name: "Result",
|
|
356
402
|
displayStrategy: "passFail",
|
|
357
|
-
format: (data) => data.passed ? "PASSED" : "NOT PASSED"
|
|
403
|
+
format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
|
|
404
|
+
aggregate: aggregateAll
|
|
358
405
|
});
|
|
359
406
|
|
|
360
407
|
// src/runner/score-utils.ts
|
|
408
|
+
/**
 * Combines score items that share one score id into a single item.
 *
 * The score definition is looked up from the first item's id. When that
 * definition has no `aggregate` function (or is not registered), the last
 * item is returned unchanged. Otherwise the result is the first item with
 * its `data` replaced by the aggregate of every item's `data`.
 *
 * NOTE(review): every field other than `data` is copied from `items[0]`, so
 * any per-run field carried on the item is not recomputed for the aggregate —
 * confirm no consumer relies on such a field.
 *
 * @param {Array<{ id: string, data: unknown }>} items - Score items; assumed
 *   to share the same id (only the first id is inspected).
 * @returns {object | undefined} The combined item, or `undefined` when
 *   `items` is empty.
 */
function aggregateScoreItems(items) {
  if (items.length === 0) {
    return void 0;
  }
  const [first] = items;
  const def = getScoreById(first.id);
  if (!def?.aggregate) {
    return items[items.length - 1];
  }
  const aggregated = def.aggregate(items.map((item) => item.data));
  return { ...first, data: aggregated };
}
|
|
417
|
+
/**
 * Combines metric items that share one metric id into a single item.
 *
 * The metric definition is looked up from the first item's id. When that
 * definition has no `aggregate` function (or is not registered), the last
 * item is returned unchanged. Otherwise the result is the first item with
 * its `data` replaced by the aggregate of every item's `data`.
 *
 * @param {Array<{ id: string, data: unknown }>} items - Metric items; assumed
 *   to share the same id (only the first id is inspected).
 * @returns {object | undefined} The combined item, or `undefined` when
 *   `items` is empty.
 */
function aggregateMetricItems(items) {
  if (items.length === 0) {
    return void 0;
  }
  const [first] = items;
  const def = getMetricById(first.id);
  if (!def?.aggregate) {
    return items[items.length - 1];
  }
  const aggregated = def.aggregate(items.map((item) => item.data));
  return { ...first, data: aggregated };
}
|
|
361
426
|
function toNumericScoreFromScores(scores) {
|
|
362
427
|
for (const item of scores) {
|
|
363
428
|
const def = getScoreById(item.id);
|
|
@@ -436,6 +501,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
436
501
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
437
502
|
);
|
|
438
503
|
}
|
|
504
|
+
/**
 * Builds an Effect that evaluates one test case with every evaluator of the
 * run, once per rerun, and reports each rerun as a "TestCaseProgress" event.
 *
 * For every rerun it: runs each evaluator that exposes an evaluate function,
 * increments the shared completed counter, stores that counter in the run
 * snapshot, publishes the progress event, and queues the same event for
 * persistence. After all reruns, the test case counts as passed only if
 * every rerun passed; the shared passed/failed counters and the snapshot are
 * updated accordingly.
 *
 * Evaluator work runs through `Effect.tryPromise` so that a thrown error or
 * rejected promise becomes a failed evaluator entry (with `errorMessage` set
 * on the event). A `try/catch` around `yield* Effect.promise(...)` would not
 * see those rejections, because `Effect.promise` turns them into defects
 * that bypass the generator body.
 *
 * @param task - Run task; reads `runId`, `evaluators` and `snapshot.artifactPath`.
 * @param testCaseItem - Entry with `id` and `testCase`; an optional
 *   `testCase.getReruns()` gives the rerun count, defaulting to 1.
 * @param totalEvaluations - Value reported as `totalTestCases` on each event.
 * @param publishEvent - Function returning an Effect that publishes an event.
 * @param persistenceQueue - Queue receiving `{ runId, artifactPath, payload }`.
 * @param updateSnapshot - `(runId, updater)` callback for the run snapshot.
 * @param completedRef - Ref counting finished evaluations (one per rerun).
 * @param passedRef - Ref counting test cases whose reruns all passed.
 * @param failedRef - Ref counting test cases with at least one failed rerun.
 * @returns An Effect that completes once all reruns have been reported.
 */
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
  return Effect.gen(function* () {
    const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
    const rerunPassed = [];
    for (let r = 0; r < reruns; r++) {
      const started = Date.now();
      const evaluatorScores = [];
      let testCaseError;
      const output = readOutput(testCaseItem.testCase);
      for (const { id: evaluatorId, evaluator } of task.evaluators) {
        const evaluateFn = evaluator.getEvaluateFn();
        if (!evaluateFn) {
          continue;
        }
        const logs = [];
        const logDiff = (expected, actual, options) => {
          logs.push(createDiffLogEntry(expected, actual, options));
        };
        // Fold success and failure into a plain value so the generator can
        // branch on it; failures never escape as defects.
        const outcome = yield* Effect.match(
          Effect.tryPromise({
            try: async () => {
              const ctx = await evaluator.resolveContext();
              const result = await evaluateFn({
                input: testCaseItem.testCase.getInput(),
                ctx,
                output,
                logDiff
              });
              const { scores, metrics } = normalizeResult(result);
              return {
                scores,
                metrics,
                evaluatorPassed: computeEvaluatorPassed(evaluator, result, scores)
              };
            },
            catch: (error) => error
          }),
          {
            onFailure: (error) => ({ ok: false, error }),
            onSuccess: (value) => ({ ok: true, value })
          }
        );
        if (outcome.ok) {
          const { scores, metrics, evaluatorPassed } = outcome.value;
          evaluatorScores.push({
            evaluatorId,
            scores,
            passed: evaluatorPassed,
            metrics,
            logs: logs.length > 0 ? logs : void 0
          });
        } else {
          testCaseError = outcome.error instanceof Error ? outcome.error.message : "Evaluator execution failed";
          evaluatorScores.push({
            evaluatorId,
            scores: [],
            passed: false
          });
        }
      }
      const rerunPassedThis = evaluatorScores.every((s) => s.passed);
      rerunPassed.push(rerunPassedThis);
      // Increment and read in one atomic step so concurrent test cases never
      // report the same counter value.
      const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
        n + 1,
        n + 1
      ]);
      const progressEvent = {
        type: "TestCaseProgress",
        runId: task.runId,
        testCaseId: testCaseItem.id,
        testCaseName: testCaseItem.testCase.getName(),
        completedTestCases: completedEvaluations,
        totalTestCases: totalEvaluations,
        rerunIndex: r + 1,
        rerunTotal: reruns,
        passed: rerunPassedThis,
        durationMs: Date.now() - started,
        evaluatorScores,
        output,
        errorMessage: testCaseError
      };
      updateSnapshot(task.runId, (snapshot) => ({
        ...snapshot,
        completedTestCases: completedEvaluations
      }));
      yield* publishEvent(progressEvent);
      yield* Queue.offer(persistenceQueue, {
        runId: task.runId,
        artifactPath: task.snapshot.artifactPath,
        payload: progressEvent
      });
    }
    const testCasePassed = rerunPassed.every(Boolean);
    if (testCasePassed) {
      yield* Ref.update(passedRef, (n) => n + 1);
    } else {
      yield* Ref.update(failedRef, (n) => n + 1);
    }
    const [passed, failed] = yield* Effect.all([
      Ref.get(passedRef),
      Ref.get(failedRef)
    ]);
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      passedTestCases: passed,
      failedTestCases: failed
    }));
  });
}
|
|
439
603
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
|
|
440
604
|
const startedAt = Date.now();
|
|
441
605
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
@@ -448,104 +612,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
448
612
|
runId: task.runId,
|
|
449
613
|
startedAt
|
|
450
614
|
});
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
)
|
|
481
|
-
);
|
|
482
|
-
const { scores, metrics } = normalizeResult(result);
|
|
483
|
-
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
484
|
-
evaluatorScores.push({
|
|
485
|
-
evaluatorId,
|
|
486
|
-
scores,
|
|
487
|
-
passed,
|
|
488
|
-
metrics,
|
|
489
|
-
logs: logs.length > 0 ? logs : void 0
|
|
490
|
-
});
|
|
491
|
-
} catch (error) {
|
|
492
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
493
|
-
evaluatorScores.push({
|
|
494
|
-
evaluatorId,
|
|
495
|
-
scores: [],
|
|
496
|
-
passed: false
|
|
497
|
-
});
|
|
498
|
-
}
|
|
499
|
-
}
|
|
500
|
-
const testCasePassed = evaluatorScores.every((s) => s.passed);
|
|
501
|
-
completedTestCases += 1;
|
|
502
|
-
if (testCasePassed) {
|
|
503
|
-
passedTestCases += 1;
|
|
504
|
-
} else {
|
|
505
|
-
failedTestCases += 1;
|
|
506
|
-
}
|
|
507
|
-
const progressEvent = {
|
|
508
|
-
type: "TestCaseProgress",
|
|
509
|
-
runId: task.runId,
|
|
510
|
-
testCaseId: testCaseItem.id,
|
|
511
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
512
|
-
completedTestCases,
|
|
513
|
-
totalTestCases: task.testCases.length,
|
|
514
|
-
passed: testCasePassed,
|
|
515
|
-
durationMs: Date.now() - started,
|
|
516
|
-
evaluatorScores,
|
|
517
|
-
output,
|
|
518
|
-
errorMessage: testCaseError
|
|
519
|
-
};
|
|
520
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
521
|
-
...snapshot,
|
|
522
|
-
completedTestCases,
|
|
523
|
-
passedTestCases,
|
|
524
|
-
failedTestCases
|
|
525
|
-
}));
|
|
526
|
-
yield* publishEvent(progressEvent);
|
|
527
|
-
yield* Queue.offer(persistenceQueue, {
|
|
528
|
-
runId: task.runId,
|
|
529
|
-
artifactPath: task.snapshot.artifactPath,
|
|
530
|
-
payload: progressEvent
|
|
531
|
-
});
|
|
532
|
-
}
|
|
615
|
+
const totalEvaluations = task.testCases.reduce(
|
|
616
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
617
|
+
0
|
|
618
|
+
);
|
|
619
|
+
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
620
|
+
const completedRef = yield* Ref.make(0);
|
|
621
|
+
const passedRef = yield* Ref.make(0);
|
|
622
|
+
const failedRef = yield* Ref.make(0);
|
|
623
|
+
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
624
|
+
task,
|
|
625
|
+
testCaseItem,
|
|
626
|
+
totalEvaluations,
|
|
627
|
+
publishEvent,
|
|
628
|
+
persistenceQueue,
|
|
629
|
+
updateSnapshot,
|
|
630
|
+
completedRef,
|
|
631
|
+
passedRef,
|
|
632
|
+
failedRef
|
|
633
|
+
);
|
|
634
|
+
yield* Effect.forEach(
|
|
635
|
+
task.testCases,
|
|
636
|
+
processTestCase,
|
|
637
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
638
|
+
);
|
|
639
|
+
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
640
|
+
Ref.get(completedRef),
|
|
641
|
+
Ref.get(passedRef),
|
|
642
|
+
Ref.get(failedRef)
|
|
643
|
+
]);
|
|
533
644
|
const finishedAt = Date.now();
|
|
534
645
|
const completedEvent = {
|
|
535
646
|
type: "RunCompleted",
|
|
536
647
|
runId: task.runId,
|
|
537
648
|
finishedAt,
|
|
538
|
-
passedTestCases,
|
|
539
|
-
failedTestCases,
|
|
649
|
+
passedTestCases: passedUniqueTestCases,
|
|
650
|
+
failedTestCases: failedUniqueTestCases,
|
|
540
651
|
totalTestCases: task.testCases.length,
|
|
541
652
|
artifactPath: task.snapshot.artifactPath
|
|
542
653
|
};
|
|
543
654
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
544
655
|
...snapshot,
|
|
545
656
|
status: "completed",
|
|
546
|
-
completedTestCases,
|
|
547
|
-
passedTestCases,
|
|
548
|
-
failedTestCases,
|
|
657
|
+
completedTestCases: completedEvaluations,
|
|
658
|
+
passedTestCases: passedUniqueTestCases,
|
|
659
|
+
failedTestCases: failedUniqueTestCases,
|
|
549
660
|
finishedAt
|
|
550
661
|
}));
|
|
551
662
|
yield* publishEvent(completedEvent);
|
|
@@ -633,7 +744,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
633
744
|
const artifactPath = filePath;
|
|
634
745
|
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
635
746
|
const progress = aggregateTestCaseProgress(lines);
|
|
636
|
-
const completedTestCases = runCompleted
|
|
747
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
637
748
|
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
638
749
|
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
639
750
|
return {
|
|
@@ -655,23 +766,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
655
766
|
}
|
|
656
767
|
/**
 * Derives progress counters from the JSONL lines of a run artifact.
 *
 * Only events with `type === "TestCaseProgress"` are considered. A test case
 * (keyed by `testCaseId`) counts as passed only if every one of its progress
 * events has `passed === true`, so one failed rerun fails the test case.
 *
 * `completedTestCases` is the largest counter seen, not the last one: when
 * test cases run concurrently, progress lines can be written out of counter
 * order, and the last line may carry a smaller value than an earlier one.
 *
 * @param {string[]} lines - Raw artifact lines, one JSON document each.
 * @returns {{ completedTestCases: number, passedTestCases: number, failedTestCases: number }}
 */
function aggregateTestCaseProgress(lines) {
  let completedTestCases = 0;
  const testCasePassedBy = /* @__PURE__ */ new Map();
  for (const line of lines) {
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      // Best-effort parsing: a malformed or truncated line is skipped.
      continue;
    }
    if (event?.type !== "TestCaseProgress") {
      continue;
    }
    if (typeof event.completedTestCases === "number") {
      completedTestCases = Math.max(completedTestCases, event.completedTestCases);
    }
    // Once any rerun fails, the test case stays failed.
    const passedSoFar = testCasePassedBy.get(event.testCaseId) ?? true;
    testCasePassedBy.set(event.testCaseId, passedSoFar && event.passed === true);
  }
  let passedTestCases = 0;
  let failedTestCases = 0;
  for (const passed of testCasePassedBy.values()) {
    if (passed) {
      passedTestCases += 1;
    } else {
      failedTestCases += 1;
    }
  }
  return { completedTestCases, passedTestCases, failedTestCases };
}
|
|
677
794
|
async function appendJsonLine(artifactPath, payload) {
|
|
@@ -866,6 +983,10 @@ var EffectRunner = class {
|
|
|
866
983
|
throw new Error("No evaluators selected for run");
|
|
867
984
|
}
|
|
868
985
|
const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
|
|
986
|
+
const totalEvaluations = selectedTestCases.reduce(
|
|
987
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
988
|
+
0
|
|
989
|
+
);
|
|
869
990
|
const runId = `run-${randomUUID()}`;
|
|
870
991
|
const artifactPath = createArtifactPath(
|
|
871
992
|
this.config.artifactDirectory,
|
|
@@ -878,7 +999,7 @@ var EffectRunner = class {
|
|
|
878
999
|
datasetName: dataset.dataset.getName(),
|
|
879
1000
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
880
1001
|
queuedAt: Date.now(),
|
|
881
|
-
totalTestCases:
|
|
1002
|
+
totalTestCases: totalEvaluations,
|
|
882
1003
|
completedTestCases: 0,
|
|
883
1004
|
passedTestCases: 0,
|
|
884
1005
|
failedTestCases: 0,
|
|
@@ -892,7 +1013,7 @@ var EffectRunner = class {
|
|
|
892
1013
|
datasetId: request.datasetId,
|
|
893
1014
|
datasetName: dataset.dataset.getName(),
|
|
894
1015
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
895
|
-
totalTestCases:
|
|
1016
|
+
totalTestCases: totalEvaluations,
|
|
896
1017
|
artifactPath
|
|
897
1018
|
};
|
|
898
1019
|
await Effect.runPromise(this.publishEvent(queuedEvent));
|
|
@@ -903,6 +1024,7 @@ var EffectRunner = class {
|
|
|
903
1024
|
payload: queuedEvent
|
|
904
1025
|
})
|
|
905
1026
|
);
|
|
1027
|
+
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
906
1028
|
await Effect.runPromise(
|
|
907
1029
|
Queue.offer(this.runQueue, {
|
|
908
1030
|
runId,
|
|
@@ -910,7 +1032,8 @@ var EffectRunner = class {
|
|
|
910
1032
|
dataset: dataset.dataset,
|
|
911
1033
|
evaluators: selectedEvaluators,
|
|
912
1034
|
testCases: selectedTestCases,
|
|
913
|
-
snapshot
|
|
1035
|
+
snapshot,
|
|
1036
|
+
maxConcurrency
|
|
914
1037
|
})
|
|
915
1038
|
);
|
|
916
1039
|
return snapshot;
|
|
@@ -1228,13 +1351,62 @@ function createBar(value, max = 100, width = 20) {
|
|
|
1228
1351
|
const filled = Math.round(safe / max * width);
|
|
1229
1352
|
return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
|
|
1230
1353
|
}
|
|
1231
|
-
function
|
|
1354
|
+
/**
 * Merges the per-rerun evaluator results of one test case into a single
 * entry per evaluator.
 *
 * For each evaluator id found in any event:
 *  - scores and metrics are grouped by their id across all events and each
 *    group is combined with aggregateScoreItems / aggregateMetricItems;
 *  - `passed` is true only if the evaluator is present and passed in every
 *    event (an event missing the evaluator counts as not passed);
 *  - `logs` are taken from the last event only.
 *
 * @param {Array<{ evaluatorScores: Array<object> }>} events - Rerun results
 *   in arrival order.
 * @param {Map<string, string>} nameById - Evaluator id to display name; the
 *   id is used when no name is registered.
 * @returns {Array<object>} One aggregated entry per evaluator, in order of
 *   first appearance; empty when `events` is empty.
 */
function aggregateEvaluatorScores(events, nameById) {
  if (events.length === 0) {
    return [];
  }
  // Groups items by `id`, keeping first-seen order of ids and item order.
  const groupById = (items) => {
    const groups = /* @__PURE__ */ new Map();
    for (const item of items) {
      const group = groups.get(item.id);
      if (group) {
        group.push(item);
      } else {
        groups.set(item.id, [item]);
      }
    }
    return [...groups.values()];
  };
  const evaluatorIds = new Set(
    events.flatMap((ev) => ev.evaluatorScores.map((entry) => entry.evaluatorId))
  );
  const result = [];
  for (const evaluatorId of evaluatorIds) {
    // One slot per event; undefined where the evaluator has no entry.
    const perEvent = events.map(
      (ev) => ev.evaluatorScores.find((entry) => entry.evaluatorId === evaluatorId)
    );
    const aggregatedScores = groupById(perEvent.flatMap((entry) => entry?.scores ?? []))
      .map((items) => aggregateScoreItems(items))
      .filter((item) => item !== void 0);
    const aggregatedMetrics = groupById(perEvent.flatMap((entry) => entry?.metrics ?? []))
      .map((items) => aggregateMetricItems(items))
      .filter((item) => item !== void 0);
    const lastEntry = perEvent[perEvent.length - 1];
    result.push({
      evaluatorId,
      evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
      scores: aggregatedScores,
      passed: perEvent.every((entry) => entry?.passed ?? false),
      metrics: aggregatedMetrics.length > 0 ? aggregatedMetrics : void 0,
      logs: lastEntry?.logs
    });
  }
  return result;
}
|
|
1403
|
+
function formatScorePart(item, scoreToColor2, options) {
|
|
1232
1404
|
const def = getScoreById(item.id);
|
|
1233
1405
|
if (!def) {
|
|
1234
1406
|
const numeric = toNumericScore(item.data);
|
|
1235
1407
|
return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
|
|
1236
1408
|
}
|
|
1237
|
-
const formatted = def.format(item.data);
|
|
1409
|
+
const formatted = def.format(item.data, options);
|
|
1238
1410
|
if (def.displayStrategy === "bar") {
|
|
1239
1411
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
1240
1412
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
@@ -1254,6 +1426,7 @@ function RunView({
|
|
|
1254
1426
|
);
|
|
1255
1427
|
const [runInfo, setRunInfo] = useState(null);
|
|
1256
1428
|
const [testCases, setTestCases] = useState([]);
|
|
1429
|
+
const [completedEvaluations, setCompletedEvaluations] = useState(0);
|
|
1257
1430
|
const [summary, setSummary] = useState(null);
|
|
1258
1431
|
const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
|
|
1259
1432
|
const runEval = useCallback(async () => {
|
|
@@ -1280,10 +1453,7 @@ function RunView({
|
|
|
1280
1453
|
return;
|
|
1281
1454
|
}
|
|
1282
1455
|
const nameById = new Map(
|
|
1283
|
-
evaluators.map((item) => [
|
|
1284
|
-
item.id,
|
|
1285
|
-
item.evaluator.getName() ?? item.id
|
|
1286
|
-
])
|
|
1456
|
+
evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
|
|
1287
1457
|
);
|
|
1288
1458
|
setEvaluatorNameById(nameById);
|
|
1289
1459
|
const aggregates = /* @__PURE__ */ new Map();
|
|
@@ -1293,7 +1463,7 @@ function RunView({
|
|
|
1293
1463
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1294
1464
|
if (event.type === "TestCaseProgress") {
|
|
1295
1465
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
1296
|
-
|
|
1466
|
+
numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
|
|
1297
1467
|
for (const item of event.evaluatorScores) {
|
|
1298
1468
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1299
1469
|
if (numeric !== void 0) {
|
|
@@ -1313,15 +1483,10 @@ function RunView({
|
|
|
1313
1483
|
overallScoreCount += 1;
|
|
1314
1484
|
}
|
|
1315
1485
|
}
|
|
1316
|
-
setTestCases((prev) =>
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
completedTestCases: event.completedTestCases,
|
|
1321
|
-
totalTestCases: event.totalTestCases,
|
|
1322
|
-
durationMs: event.durationMs,
|
|
1323
|
-
passed: event.passed,
|
|
1324
|
-
averageScore,
|
|
1486
|
+
setTestCases((prev) => {
|
|
1487
|
+
const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
|
|
1488
|
+
const existing = byId.get(event.testCaseId);
|
|
1489
|
+
const newEvent = {
|
|
1325
1490
|
evaluatorScores: event.evaluatorScores.map((item) => ({
|
|
1326
1491
|
evaluatorId: item.evaluatorId,
|
|
1327
1492
|
evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
|
|
@@ -1329,9 +1494,33 @@ function RunView({
|
|
|
1329
1494
|
passed: item.passed,
|
|
1330
1495
|
metrics: item.metrics,
|
|
1331
1496
|
logs: item.logs
|
|
1332
|
-
}))
|
|
1333
|
-
|
|
1334
|
-
|
|
1497
|
+
})),
|
|
1498
|
+
passed: event.passed,
|
|
1499
|
+
durationMs: event.durationMs
|
|
1500
|
+
};
|
|
1501
|
+
const events = existing ? [...existing.events, newEvent] : [newEvent];
|
|
1502
|
+
const isAggregated = events.length > 1;
|
|
1503
|
+
const aggregatedEvaluatorScores = aggregateEvaluatorScores(
|
|
1504
|
+
events,
|
|
1505
|
+
nameById
|
|
1506
|
+
);
|
|
1507
|
+
const merged = {
|
|
1508
|
+
name: event.testCaseName,
|
|
1509
|
+
testCaseId: event.testCaseId,
|
|
1510
|
+
completedTestCases: event.completedTestCases,
|
|
1511
|
+
totalTestCases: event.totalTestCases,
|
|
1512
|
+
rerunIndex: event.rerunIndex,
|
|
1513
|
+
rerunTotal: event.rerunTotal,
|
|
1514
|
+
durationMs: events.reduce((s, e) => s + e.durationMs, 0),
|
|
1515
|
+
passed: events.every((e) => e.passed),
|
|
1516
|
+
events,
|
|
1517
|
+
aggregatedEvaluatorScores,
|
|
1518
|
+
isAggregated
|
|
1519
|
+
};
|
|
1520
|
+
byId.set(event.testCaseId, merged);
|
|
1521
|
+
setCompletedEvaluations(event.completedTestCases);
|
|
1522
|
+
return Array.from(byId.values());
|
|
1523
|
+
});
|
|
1335
1524
|
}
|
|
1336
1525
|
if (event.type === "RunCompleted" || event.type === "RunFailed") {
|
|
1337
1526
|
unsubscribe();
|
|
@@ -1346,9 +1535,7 @@ function RunView({
|
|
|
1346
1535
|
setRunInfo({
|
|
1347
1536
|
runId: snapshot.runId,
|
|
1348
1537
|
datasetName: snapshot.datasetName,
|
|
1349
|
-
evaluatorNames: evaluators.map(
|
|
1350
|
-
(e) => e.evaluator.getName() ?? e.id
|
|
1351
|
-
),
|
|
1538
|
+
evaluatorNames: evaluators.map((e) => e.evaluator.getName() ?? e.id),
|
|
1352
1539
|
totalTestCases: snapshot.totalTestCases
|
|
1353
1540
|
});
|
|
1354
1541
|
setPhase("running");
|
|
@@ -1376,29 +1563,41 @@ function RunView({
|
|
|
1376
1563
|
/* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
|
|
1377
1564
|
runInfo && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
|
|
1378
1565
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1379
|
-
/* @__PURE__ */
|
|
1566
|
+
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1567
|
+
"Run",
|
|
1568
|
+
" "
|
|
1569
|
+
] }),
|
|
1380
1570
|
/* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.runId })
|
|
1381
1571
|
] }),
|
|
1382
1572
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1383
|
-
/* @__PURE__ */
|
|
1573
|
+
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1574
|
+
"Dataset",
|
|
1575
|
+
" "
|
|
1576
|
+
] }),
|
|
1384
1577
|
runInfo.datasetName
|
|
1385
1578
|
] }),
|
|
1386
1579
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1387
|
-
/* @__PURE__ */
|
|
1580
|
+
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1581
|
+
"Evaluators",
|
|
1582
|
+
" "
|
|
1583
|
+
] }),
|
|
1388
1584
|
runInfo.evaluatorNames.join(", ")
|
|
1389
1585
|
] }),
|
|
1390
1586
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1391
|
-
/* @__PURE__ */
|
|
1587
|
+
/* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
|
|
1588
|
+
"Test cases",
|
|
1589
|
+
" "
|
|
1590
|
+
] }),
|
|
1392
1591
|
runInfo.totalTestCases
|
|
1393
1592
|
] })
|
|
1394
1593
|
] }),
|
|
1395
1594
|
phase === "running" && /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(
|
|
1396
1595
|
Spinner,
|
|
1397
1596
|
{
|
|
1398
|
-
label: `Evaluations ${
|
|
1597
|
+
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
|
|
1399
1598
|
}
|
|
1400
1599
|
) }),
|
|
1401
|
-
testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc
|
|
1600
|
+
testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1402
1601
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1403
1602
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
|
|
1404
1603
|
"[",
|
|
@@ -1409,49 +1608,78 @@ function RunView({
|
|
|
1409
1608
|
] }),
|
|
1410
1609
|
" ",
|
|
1411
1610
|
tc.name,
|
|
1611
|
+
" ",
|
|
1612
|
+
/* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
|
|
1613
|
+
"(",
|
|
1614
|
+
tc.rerunIndex,
|
|
1615
|
+
"/",
|
|
1616
|
+
tc.rerunTotal,
|
|
1617
|
+
")"
|
|
1618
|
+
] }),
|
|
1412
1619
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1413
1620
|
" (",
|
|
1414
1621
|
tc.durationMs,
|
|
1415
1622
|
"ms)"
|
|
1416
1623
|
] })
|
|
1417
1624
|
] }),
|
|
1418
|
-
tc.
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
|
|
1430
|
-
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1625
|
+
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(
|
|
1626
|
+
Box,
|
|
1627
|
+
{
|
|
1628
|
+
flexDirection: "column",
|
|
1629
|
+
marginLeft: 2,
|
|
1630
|
+
children: [
|
|
1631
|
+
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1632
|
+
item.evaluatorName,
|
|
1633
|
+
":",
|
|
1634
|
+
" ",
|
|
1635
|
+
/* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
1636
|
+
" ",
|
|
1637
|
+
item.scores.map((s) => /* @__PURE__ */ jsxs(
|
|
1638
|
+
Text,
|
|
1639
|
+
{
|
|
1640
|
+
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
1641
|
+
children: [
|
|
1642
|
+
formatScorePart(s, scoreColor, {
|
|
1643
|
+
isAggregated: tc.isAggregated
|
|
1644
|
+
}),
|
|
1645
|
+
" "
|
|
1646
|
+
]
|
|
1647
|
+
},
|
|
1648
|
+
s.id
|
|
1649
|
+
)),
|
|
1650
|
+
item.metrics?.map((m) => {
|
|
1651
|
+
const def = getMetricById(m.id);
|
|
1652
|
+
if (!def)
|
|
1653
|
+
return null;
|
|
1654
|
+
const formatted = def.format(m.data, {
|
|
1655
|
+
isAggregated: tc.isAggregated
|
|
1656
|
+
});
|
|
1657
|
+
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1658
|
+
"[",
|
|
1659
|
+
def.name ? `${def.name}: ` : "",
|
|
1660
|
+
formatted,
|
|
1661
|
+
"]",
|
|
1662
|
+
" "
|
|
1663
|
+
] }, m.id);
|
|
1664
|
+
})
|
|
1665
|
+
] }),
|
|
1666
|
+
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1667
|
+
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1668
|
+
({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
|
|
1669
|
+
Text,
|
|
1670
|
+
{
|
|
1671
|
+
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
1672
|
+
children: line
|
|
1673
|
+
},
|
|
1674
|
+
lineIdx
|
|
1675
|
+
)
|
|
1676
|
+
) }, logIdx) : null
|
|
1677
|
+
) })
|
|
1678
|
+
]
|
|
1679
|
+
},
|
|
1680
|
+
item.evaluatorId
|
|
1681
|
+
))
|
|
1682
|
+
] }, tc.testCaseId)) }),
|
|
1455
1683
|
phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
|
|
1456
1684
|
/* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
|
|
1457
1685
|
/* @__PURE__ */ jsxs(Box, { marginTop: 1, children: [
|
|
@@ -1498,7 +1726,8 @@ function RunView({
|
|
|
1498
1726
|
name.padEnd(28),
|
|
1499
1727
|
" avg=",
|
|
1500
1728
|
/* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
|
|
1501
|
-
"
|
|
1729
|
+
" ",
|
|
1730
|
+
"passed=",
|
|
1502
1731
|
agg.passed,
|
|
1503
1732
|
" failed=",
|
|
1504
1733
|
agg.failed
|
|
@@ -1507,28 +1736,38 @@ function RunView({
|
|
|
1507
1736
|
] }),
|
|
1508
1737
|
/* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
|
|
1509
1738
|
/* @__PURE__ */ jsx(Text, { color: "magenta", children: "test case scores" }),
|
|
1510
|
-
testCases.map((tc
|
|
1511
|
-
|
|
1512
|
-
|
|
1513
|
-
|
|
1514
|
-
|
|
1515
|
-
]
|
|
1516
|
-
|
|
1517
|
-
|
|
1518
|
-
|
|
1519
|
-
|
|
1739
|
+
testCases.map((tc) => {
|
|
1740
|
+
const numericScores = tc.aggregatedEvaluatorScores.flatMap(
|
|
1741
|
+
(item) => item.scores.map((s) => toNumericScoreFromScores([s])).filter((n) => n !== void 0)
|
|
1742
|
+
);
|
|
1743
|
+
const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
|
|
1744
|
+
const firstScore = tc.aggregatedEvaluatorScores[0]?.scores[0];
|
|
1745
|
+
const scoreLabel = firstScore && tc.isAggregated ? formatScorePart(firstScore, scoreColor, {
|
|
1746
|
+
isAggregated: true
|
|
1747
|
+
}) : averageScore !== void 0 ? averageScore.toFixed(2) : "n/a";
|
|
1748
|
+
return /* @__PURE__ */ jsxs(Box, { children: [
|
|
1749
|
+
/* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
|
|
1750
|
+
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1751
|
+
" ",
|
|
1752
|
+
tc.name.padEnd(24)
|
|
1520
1753
|
] }),
|
|
1754
|
+
averageScore !== void 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
1755
|
+
/* @__PURE__ */ jsxs(Text, { color: scoreColor(averageScore), children: [
|
|
1756
|
+
"score=",
|
|
1757
|
+
scoreLabel
|
|
1758
|
+
] }),
|
|
1759
|
+
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1760
|
+
" ",
|
|
1761
|
+
createBar(averageScore, 100, 14)
|
|
1762
|
+
] })
|
|
1763
|
+
] }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "score=n/a" }),
|
|
1521
1764
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1522
|
-
" ",
|
|
1523
|
-
|
|
1765
|
+
" (",
|
|
1766
|
+
tc.durationMs,
|
|
1767
|
+
"ms)"
|
|
1524
1768
|
] })
|
|
1525
|
-
] }
|
|
1526
|
-
|
|
1527
|
-
" (",
|
|
1528
|
-
tc.durationMs,
|
|
1529
|
-
"ms)"
|
|
1530
|
-
] })
|
|
1531
|
-
] }, i))
|
|
1769
|
+
] }, tc.testCaseId);
|
|
1770
|
+
})
|
|
1532
1771
|
] }),
|
|
1533
1772
|
/* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1534
1773
|
"artifact: ",
|
|
@@ -1539,6 +1778,51 @@ function RunView({
|
|
|
1539
1778
|
}
|
|
1540
1779
|
|
|
1541
1780
|
// src/cli-simple/run.ts
|
|
1781
|
+
function buildTestCaseSummaries(byId) {
|
|
1782
|
+
const summaries = [];
|
|
1783
|
+
for (const { name, events } of byId.values()) {
|
|
1784
|
+
const passed = events.every((e) => e.passed);
|
|
1785
|
+
const durationMs = events.reduce((sum, e) => sum + e.durationMs, 0);
|
|
1786
|
+
const isAggregated = events.length > 1;
|
|
1787
|
+
const numericScores = [];
|
|
1788
|
+
let firstAggregatedScore;
|
|
1789
|
+
for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
|
|
1790
|
+
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
1791
|
+
for (const ev of events) {
|
|
1792
|
+
const es = ev.evaluatorScores.find(
|
|
1793
|
+
(x) => x.evaluatorId === evaluatorScores.evaluatorId
|
|
1794
|
+
);
|
|
1795
|
+
for (const s of es?.scores ?? []) {
|
|
1796
|
+
const list = scoreIdToItems.get(s.id) ?? [];
|
|
1797
|
+
list.push(s);
|
|
1798
|
+
scoreIdToItems.set(s.id, list);
|
|
1799
|
+
}
|
|
1800
|
+
}
|
|
1801
|
+
for (const items of scoreIdToItems.values()) {
|
|
1802
|
+
const agg = aggregateScoreItems(items);
|
|
1803
|
+
if (agg) {
|
|
1804
|
+
const n = toNumericScoreFromScores([agg]);
|
|
1805
|
+
if (n !== void 0) {
|
|
1806
|
+
numericScores.push(n);
|
|
1807
|
+
if (firstAggregatedScore === void 0) {
|
|
1808
|
+
firstAggregatedScore = agg;
|
|
1809
|
+
}
|
|
1810
|
+
}
|
|
1811
|
+
}
|
|
1812
|
+
}
|
|
1813
|
+
}
|
|
1814
|
+
const averageScore = numericScores.length > 0 ? numericScores.reduce((a, b) => a + b, 0) / numericScores.length : void 0;
|
|
1815
|
+
summaries.push({
|
|
1816
|
+
name,
|
|
1817
|
+
averageScore,
|
|
1818
|
+
aggregatedScoreItem: firstAggregatedScore,
|
|
1819
|
+
isAggregated,
|
|
1820
|
+
durationMs,
|
|
1821
|
+
passed
|
|
1822
|
+
});
|
|
1823
|
+
}
|
|
1824
|
+
return summaries;
|
|
1825
|
+
}
|
|
1542
1826
|
var ansi2 = {
|
|
1543
1827
|
reset: "\x1B[0m",
|
|
1544
1828
|
bold: "\x1B[1m",
|
|
@@ -1573,7 +1857,50 @@ function createBar2(value, max = 100, width = 20) {
|
|
|
1573
1857
|
const filled = Math.round(safe / max * width);
|
|
1574
1858
|
return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
|
|
1575
1859
|
}
|
|
1576
|
-
function
|
|
1860
|
+
function aggregateEvaluatorScoresFromEvents(events, evaluatorNameById) {
|
|
1861
|
+
if (events.length === 0)
|
|
1862
|
+
return [];
|
|
1863
|
+
const evaluatorIds = new Set(
|
|
1864
|
+
events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
|
|
1865
|
+
);
|
|
1866
|
+
const result = [];
|
|
1867
|
+
for (const evaluatorId of evaluatorIds) {
|
|
1868
|
+
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
1869
|
+
const metricIdToItems = /* @__PURE__ */ new Map();
|
|
1870
|
+
for (const ev of events) {
|
|
1871
|
+
const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
|
|
1872
|
+
for (const s of es?.scores ?? []) {
|
|
1873
|
+
const list = scoreIdToItems.get(s.id) ?? [];
|
|
1874
|
+
list.push(s);
|
|
1875
|
+
scoreIdToItems.set(s.id, list);
|
|
1876
|
+
}
|
|
1877
|
+
for (const m of es?.metrics ?? []) {
|
|
1878
|
+
const list = metricIdToItems.get(m.id) ?? [];
|
|
1879
|
+
list.push(m);
|
|
1880
|
+
metricIdToItems.set(m.id, list);
|
|
1881
|
+
}
|
|
1882
|
+
}
|
|
1883
|
+
const aggregatedScores = [];
|
|
1884
|
+
for (const items of scoreIdToItems.values()) {
|
|
1885
|
+
const agg = aggregateScoreItems(items);
|
|
1886
|
+
if (agg)
|
|
1887
|
+
aggregatedScores.push(agg);
|
|
1888
|
+
}
|
|
1889
|
+
const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
|
|
1890
|
+
const passed = events.every((ev) => {
|
|
1891
|
+
const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
|
|
1892
|
+
return es?.passed ?? false;
|
|
1893
|
+
});
|
|
1894
|
+
result.push({
|
|
1895
|
+
evaluatorId,
|
|
1896
|
+
scores: aggregatedScores,
|
|
1897
|
+
passed,
|
|
1898
|
+
metrics: aggregatedMetrics.length > 0 ? aggregatedMetrics : void 0
|
|
1899
|
+
});
|
|
1900
|
+
}
|
|
1901
|
+
return result;
|
|
1902
|
+
}
|
|
1903
|
+
function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
1577
1904
|
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
1578
1905
|
const scoreParts = [];
|
|
1579
1906
|
for (const item of scores) {
|
|
@@ -1585,7 +1912,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
1585
1912
|
);
|
|
1586
1913
|
continue;
|
|
1587
1914
|
}
|
|
1588
|
-
const formatted = def.format(item.data);
|
|
1915
|
+
const formatted = def.format(item.data, options);
|
|
1589
1916
|
switch (def.displayStrategy) {
|
|
1590
1917
|
case "bar": {
|
|
1591
1918
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
@@ -1618,7 +1945,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
1618
1945
|
for (const { id, data } of metrics) {
|
|
1619
1946
|
const def = getMetricById(id);
|
|
1620
1947
|
if (def) {
|
|
1621
|
-
const formatted = def.format(data);
|
|
1948
|
+
const formatted = def.format(data, options);
|
|
1622
1949
|
metricParts.push(
|
|
1623
1950
|
def.name ? `[${def.name}: ${formatted}]` : `[${formatted}]`
|
|
1624
1951
|
);
|
|
@@ -1651,7 +1978,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1651
1978
|
evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
|
|
1652
1979
|
);
|
|
1653
1980
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1654
|
-
const
|
|
1981
|
+
const testCaseByTestId = /* @__PURE__ */ new Map();
|
|
1655
1982
|
let overallScoreTotal = 0;
|
|
1656
1983
|
let overallScoreCount = 0;
|
|
1657
1984
|
let completedCount = 0;
|
|
@@ -1665,6 +1992,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1665
1992
|
}
|
|
1666
1993
|
process.stdout.write("\r\x1B[2K");
|
|
1667
1994
|
}
|
|
1995
|
+
function cursorUp(n) {
|
|
1996
|
+
if (!process.stdout.isTTY || n <= 0)
|
|
1997
|
+
return;
|
|
1998
|
+
process.stdout.write(`\x1B[${n}A`);
|
|
1999
|
+
}
|
|
1668
2000
|
function drawSpinner() {
|
|
1669
2001
|
if (!process.stdout.isTTY || runFinished) {
|
|
1670
2002
|
return;
|
|
@@ -1678,6 +2010,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1678
2010
|
)} ${colorize("(live)", ansi2.dim)}`
|
|
1679
2011
|
);
|
|
1680
2012
|
}
|
|
2013
|
+
let lastPrintedTestCaseId = null;
|
|
2014
|
+
let lastPrintedLineCount = 0;
|
|
1681
2015
|
let spinnerTimer;
|
|
1682
2016
|
const done = new Promise((resolve5) => {
|
|
1683
2017
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
@@ -1685,31 +2019,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1685
2019
|
completedCount = event.completedTestCases;
|
|
1686
2020
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
1687
2021
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
2022
|
+
const testCaseId = event.testCaseId;
|
|
2023
|
+
const existing = testCaseByTestId.get(testCaseId) ?? {
|
|
2024
|
+
name: event.testCaseName,
|
|
2025
|
+
events: []
|
|
2026
|
+
};
|
|
2027
|
+
existing.events.push({
|
|
2028
|
+
averageScore,
|
|
2029
|
+
passed: event.passed,
|
|
2030
|
+
durationMs: event.durationMs,
|
|
2031
|
+
evaluatorScores: event.evaluatorScores
|
|
2032
|
+
});
|
|
2033
|
+
testCaseByTestId.set(testCaseId, existing);
|
|
1692
2034
|
for (const item of event.evaluatorScores) {
|
|
1693
|
-
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
1694
|
-
console.log(
|
|
1695
|
-
formatEvaluatorScoreLine(
|
|
1696
|
-
name,
|
|
1697
|
-
item.scores,
|
|
1698
|
-
item.passed,
|
|
1699
|
-
item.metrics
|
|
1700
|
-
)
|
|
1701
|
-
);
|
|
1702
|
-
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
1703
|
-
for (const log of item.logs) {
|
|
1704
|
-
if (log.type === "diff") {
|
|
1705
|
-
const useColor = process.stdout.isTTY;
|
|
1706
|
-
for (const { type, line } of getDiffLines(log)) {
|
|
1707
|
-
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
1708
|
-
console.log(colored);
|
|
1709
|
-
}
|
|
1710
|
-
}
|
|
1711
|
-
}
|
|
1712
|
-
}
|
|
1713
2035
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1714
2036
|
if (numeric !== void 0) {
|
|
1715
2037
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
@@ -1728,12 +2050,60 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1728
2050
|
overallScoreCount += 1;
|
|
1729
2051
|
}
|
|
1730
2052
|
}
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
|
|
2053
|
+
const isSameTestCase = lastPrintedTestCaseId === testCaseId;
|
|
2054
|
+
const isLastRerun = event.rerunIndex >= event.rerunTotal;
|
|
2055
|
+
const isNonTty = !process.stdout.isTTY;
|
|
2056
|
+
const skipPrintNonTty = isNonTty && event.rerunTotal > 1 && !isLastRerun;
|
|
2057
|
+
if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
|
|
2058
|
+
cursorUp(lastPrintedLineCount);
|
|
2059
|
+
}
|
|
2060
|
+
const aggregatedScores = aggregateEvaluatorScoresFromEvents(
|
|
2061
|
+
existing.events);
|
|
2062
|
+
const isAggregated = existing.events.length > 1;
|
|
2063
|
+
const durationMs = existing.events.reduce(
|
|
2064
|
+
(s, e) => s + e.durationMs,
|
|
2065
|
+
0
|
|
2066
|
+
);
|
|
2067
|
+
existing.events.every((e) => e.passed);
|
|
2068
|
+
const lines = [];
|
|
2069
|
+
lines.push(
|
|
2070
|
+
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
|
|
2071
|
+
);
|
|
2072
|
+
for (const item of aggregatedScores) {
|
|
2073
|
+
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2074
|
+
lines.push(
|
|
2075
|
+
formatEvaluatorScoreLine(
|
|
2076
|
+
name,
|
|
2077
|
+
item.scores,
|
|
2078
|
+
item.passed,
|
|
2079
|
+
item.metrics,
|
|
2080
|
+
{ isAggregated }
|
|
2081
|
+
)
|
|
2082
|
+
);
|
|
2083
|
+
const lastEvent = existing.events[existing.events.length - 1];
|
|
2084
|
+
const lastEs = lastEvent?.evaluatorScores.find(
|
|
2085
|
+
(x) => x.evaluatorId === item.evaluatorId
|
|
2086
|
+
);
|
|
2087
|
+
if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
|
|
2088
|
+
for (const log of lastEs.logs) {
|
|
2089
|
+
if (log.type === "diff") {
|
|
2090
|
+
const useColor = process.stdout.isTTY;
|
|
2091
|
+
for (const { type, line } of getDiffLines(log)) {
|
|
2092
|
+
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
2093
|
+
lines.push(colored);
|
|
2094
|
+
}
|
|
2095
|
+
}
|
|
2096
|
+
}
|
|
2097
|
+
}
|
|
2098
|
+
}
|
|
2099
|
+
if (!skipPrintNonTty) {
|
|
2100
|
+
for (let i = 0; i < lines.length; i++) {
|
|
2101
|
+
process.stdout.write(`\r\x1B[2K${lines[i]}
|
|
2102
|
+
`);
|
|
2103
|
+
}
|
|
2104
|
+
lastPrintedTestCaseId = testCaseId;
|
|
2105
|
+
lastPrintedLineCount = lines.length;
|
|
2106
|
+
}
|
|
1737
2107
|
drawSpinner();
|
|
1738
2108
|
}
|
|
1739
2109
|
if (event.type === "RunCompleted" || event.type === "RunFailed") {
|
|
@@ -1797,6 +2167,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1797
2167
|
getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
|
|
1798
2168
|
);
|
|
1799
2169
|
}
|
|
2170
|
+
const testCaseSummaries = buildTestCaseSummaries(testCaseByTestId);
|
|
1800
2171
|
if (testCaseSummaries.length > 0) {
|
|
1801
2172
|
console.log(colorize("- test case scores:", ansi2.magenta));
|
|
1802
2173
|
for (const summary of testCaseSummaries) {
|
|
@@ -1807,9 +2178,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1807
2178
|
);
|
|
1808
2179
|
continue;
|
|
1809
2180
|
}
|
|
2181
|
+
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? getScoreById(summary.aggregatedScoreItem.id)?.format(
|
|
2182
|
+
summary.aggregatedScoreItem.data,
|
|
2183
|
+
{ isAggregated: true }
|
|
2184
|
+
) ?? summary.averageScore.toFixed(2) : summary.averageScore.toFixed(2);
|
|
1810
2185
|
console.log(
|
|
1811
2186
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
1812
|
-
|
|
2187
|
+
scoreLabel,
|
|
1813
2188
|
scoreToColor(summary.averageScore)
|
|
1814
2189
|
)} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
|
|
1815
2190
|
);
|