@m4trix/evals 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +113 -76
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +113 -76
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +87 -47
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +87 -47
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +86 -44
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +54 -5
- package/dist/index.js +85 -45
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -359,20 +359,70 @@ function getMetricById(id) {
|
|
|
359
359
|
|
|
360
360
|
// src/evals/score.ts
|
|
361
361
|
var registry2 = /* @__PURE__ */ new Map();
|
|
362
|
+
function formatScoreData(def, data, options) {
|
|
363
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
364
|
+
}
|
|
365
|
+
var ScoreAggregate = {
|
|
366
|
+
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
367
|
+
averageFields(fields) {
|
|
368
|
+
return (values) => {
|
|
369
|
+
const count = values.length || 1;
|
|
370
|
+
const result = {};
|
|
371
|
+
for (const field of fields) {
|
|
372
|
+
result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
|
|
373
|
+
}
|
|
374
|
+
return result;
|
|
375
|
+
};
|
|
376
|
+
},
|
|
377
|
+
/** Average `value` with sample std dev. Use for percent-style scores. */
|
|
378
|
+
averageWithVariance(values) {
|
|
379
|
+
if (values.length === 0) {
|
|
380
|
+
return { value: 0, stdDev: void 0, count: 0 };
|
|
381
|
+
}
|
|
382
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
383
|
+
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
384
|
+
const mean = sum / values.length;
|
|
385
|
+
let stdDev;
|
|
386
|
+
if (values.length >= 2) {
|
|
387
|
+
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
388
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
389
|
+
}
|
|
390
|
+
return { ...values[0], value: mean, stdDev, count: values.length };
|
|
391
|
+
},
|
|
392
|
+
/** All runs must pass. Use for binary scores. */
|
|
393
|
+
all(values) {
|
|
394
|
+
const total = values.length;
|
|
395
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
396
|
+
return {
|
|
397
|
+
...values[0],
|
|
398
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
399
|
+
passedCount,
|
|
400
|
+
totalCount: total
|
|
401
|
+
};
|
|
402
|
+
},
|
|
403
|
+
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
404
|
+
last(values) {
|
|
405
|
+
return values[values.length - 1] ?? {};
|
|
406
|
+
}
|
|
407
|
+
};
|
|
362
408
|
var Score = {
|
|
409
|
+
aggregate: ScoreAggregate,
|
|
363
410
|
of(config) {
|
|
364
411
|
const def = {
|
|
365
412
|
id: config.id,
|
|
366
413
|
name: config.name,
|
|
367
414
|
displayStrategy: config.displayStrategy,
|
|
368
|
-
|
|
369
|
-
|
|
415
|
+
formatValue: config.formatValue,
|
|
416
|
+
formatAggregate: config.formatAggregate,
|
|
417
|
+
aggregateValues: config.aggregateValues,
|
|
370
418
|
make: (data, options) => {
|
|
371
419
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
372
420
|
return {
|
|
373
421
|
id: config.id,
|
|
374
422
|
data,
|
|
375
|
-
...passed !== void 0 && { passed }
|
|
423
|
+
...passed !== void 0 && { passed },
|
|
424
|
+
def
|
|
425
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
376
426
|
};
|
|
377
427
|
}
|
|
378
428
|
};
|
|
@@ -385,29 +435,6 @@ function getScoreById(id) {
|
|
|
385
435
|
}
|
|
386
436
|
|
|
387
437
|
// src/evals/aggregators.ts
|
|
388
|
-
function aggregateAverageWithVariance(values) {
|
|
389
|
-
if (values.length === 0) {
|
|
390
|
-
return { value: 0, count: 0 };
|
|
391
|
-
}
|
|
392
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
393
|
-
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
394
|
-
const mean = sum / values.length;
|
|
395
|
-
let stdDev;
|
|
396
|
-
if (values.length >= 2) {
|
|
397
|
-
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
398
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
399
|
-
}
|
|
400
|
-
return { value: mean, stdDev, count: values.length };
|
|
401
|
-
}
|
|
402
|
-
function aggregateAll(values) {
|
|
403
|
-
const total = values.length;
|
|
404
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
405
|
-
return {
|
|
406
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
407
|
-
passedCount,
|
|
408
|
-
totalCount: total
|
|
409
|
-
};
|
|
410
|
-
}
|
|
411
438
|
function aggregateTokenCountSum(values) {
|
|
412
439
|
const initial = {
|
|
413
440
|
input: 0,
|
|
@@ -460,40 +487,45 @@ Score.of({
|
|
|
460
487
|
id: "percent",
|
|
461
488
|
name: "Score",
|
|
462
489
|
displayStrategy: "bar",
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
490
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
491
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
492
|
+
aggregateValues: Score.aggregate.averageWithVariance
|
|
493
|
+
});
|
|
494
|
+
Score.of({
|
|
495
|
+
id: "delta",
|
|
496
|
+
name: "Delta",
|
|
497
|
+
displayStrategy: "number",
|
|
498
|
+
formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
|
|
499
|
+
formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
|
|
500
|
+
aggregateValues: Score.aggregate.averageFields(["value", "delta"])
|
|
470
501
|
});
|
|
471
502
|
Score.of({
|
|
472
503
|
id: "binary",
|
|
473
504
|
name: "Result",
|
|
474
505
|
displayStrategy: "passFail",
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
}
|
|
481
|
-
return base;
|
|
506
|
+
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
507
|
+
formatAggregate: (data) => {
|
|
508
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
509
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
510
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
482
511
|
}
|
|
483
|
-
return
|
|
512
|
+
return base;
|
|
484
513
|
},
|
|
485
|
-
|
|
514
|
+
aggregateValues: Score.aggregate.all
|
|
486
515
|
});
|
|
487
516
|
|
|
488
517
|
// src/runner/score-utils.ts
|
|
518
|
+
function getScoreDef(item) {
|
|
519
|
+
return item.def ?? getScoreById(item.id);
|
|
520
|
+
}
|
|
489
521
|
function aggregateScoreItems(items) {
|
|
490
522
|
if (items.length === 0)
|
|
491
523
|
return void 0;
|
|
492
|
-
const def =
|
|
493
|
-
if (!def?.
|
|
524
|
+
const def = getScoreDef(items[0]);
|
|
525
|
+
if (!def?.aggregateValues)
|
|
494
526
|
return items[items.length - 1];
|
|
495
|
-
const aggregated = def.
|
|
496
|
-
return { ...items[0], data: aggregated };
|
|
527
|
+
const aggregated = def.aggregateValues(items.map((i) => i.data));
|
|
528
|
+
return { ...items[0], data: aggregated, def };
|
|
497
529
|
}
|
|
498
530
|
function aggregateMetricItems(items) {
|
|
499
531
|
if (items.length === 0)
|
|
@@ -506,7 +538,7 @@ function aggregateMetricItems(items) {
|
|
|
506
538
|
}
|
|
507
539
|
function toNumericScoreFromScores(scores) {
|
|
508
540
|
for (const item of scores) {
|
|
509
|
-
const def =
|
|
541
|
+
const def = getScoreDef(item);
|
|
510
542
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
511
543
|
const value = item.data.value;
|
|
512
544
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -587,6 +619,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
587
619
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
588
620
|
const rerunPassed = [];
|
|
589
621
|
for (let r = 0; r < reruns; r++) {
|
|
622
|
+
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
590
623
|
const started = Date.now();
|
|
591
624
|
const evaluatorScores = [];
|
|
592
625
|
let testCaseError;
|
|
@@ -613,6 +646,11 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
613
646
|
input: testCaseItem.testCase.getInput(),
|
|
614
647
|
ctx,
|
|
615
648
|
output,
|
|
649
|
+
meta: {
|
|
650
|
+
triggerId: task.triggerId,
|
|
651
|
+
runId: evaluatorRunId,
|
|
652
|
+
datasetId: task.datasetId
|
|
653
|
+
},
|
|
616
654
|
logDiff,
|
|
617
655
|
log
|
|
618
656
|
})
|
|
@@ -888,7 +926,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
|
|
|
888
926
|
() => appendJsonLine(message.artifactPath, {
|
|
889
927
|
runId: message.runId,
|
|
890
928
|
ts: Date.now(),
|
|
891
|
-
...message.payload
|
|
929
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
892
930
|
})
|
|
893
931
|
);
|
|
894
932
|
})
|
|
@@ -1072,6 +1110,7 @@ var EffectRunner = class {
|
|
|
1072
1110
|
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1073
1111
|
0
|
|
1074
1112
|
);
|
|
1113
|
+
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1075
1114
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1076
1115
|
const artifactPath = createArtifactPath(
|
|
1077
1116
|
this.config.artifactDirectory,
|
|
@@ -1113,6 +1152,7 @@ var EffectRunner = class {
|
|
|
1113
1152
|
await effect.Effect.runPromise(
|
|
1114
1153
|
effect.Queue.offer(this.runQueue, {
|
|
1115
1154
|
runId,
|
|
1155
|
+
triggerId,
|
|
1116
1156
|
datasetId: request.datasetId,
|
|
1117
1157
|
dataset: dataset.dataset,
|
|
1118
1158
|
evaluators: selectedEvaluators,
|
|
@@ -1472,7 +1512,7 @@ function aggregateEvaluatorScores(events, nameById) {
|
|
|
1472
1512
|
if (agg)
|
|
1473
1513
|
aggregatedScores.push(agg);
|
|
1474
1514
|
}
|
|
1475
|
-
const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([
|
|
1515
|
+
const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
|
|
1476
1516
|
const passed = events.every((ev) => {
|
|
1477
1517
|
const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
|
|
1478
1518
|
return es?.passed ?? false;
|
|
@@ -1492,13 +1532,13 @@ function aggregateEvaluatorScores(events, nameById) {
|
|
|
1492
1532
|
}
|
|
1493
1533
|
return result;
|
|
1494
1534
|
}
|
|
1495
|
-
function formatScorePart(item,
|
|
1496
|
-
const def = getScoreById(item.id);
|
|
1535
|
+
function formatScorePart(item, _scoreToColor, options) {
|
|
1536
|
+
const def = item.def ?? getScoreById(item.id);
|
|
1497
1537
|
if (!def) {
|
|
1498
1538
|
const numeric = toNumericScore(item.data);
|
|
1499
1539
|
return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
|
|
1500
1540
|
}
|
|
1501
|
-
const formatted = def
|
|
1541
|
+
const formatted = formatScoreData(def, item.data, options);
|
|
1502
1542
|
if (def.displayStrategy === "bar") {
|
|
1503
1543
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
1504
1544
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
@@ -1556,8 +1596,6 @@ function RunView({
|
|
|
1556
1596
|
const done = new Promise((resolve5) => {
|
|
1557
1597
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1558
1598
|
if (event.type === "TestCaseProgress") {
|
|
1559
|
-
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
1560
|
-
numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
|
|
1561
1599
|
for (const item of event.evaluatorScores) {
|
|
1562
1600
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1563
1601
|
if (numeric !== void 0) {
|
|
@@ -1647,16 +1685,17 @@ function RunView({
|
|
|
1647
1685
|
onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
|
|
1648
1686
|
return;
|
|
1649
1687
|
}
|
|
1688
|
+
const completed = finalEvent;
|
|
1650
1689
|
setSummary({
|
|
1651
|
-
passedTestCases:
|
|
1652
|
-
failedTestCases:
|
|
1653
|
-
totalTestCases:
|
|
1690
|
+
passedTestCases: completed.passedTestCases,
|
|
1691
|
+
failedTestCases: completed.failedTestCases,
|
|
1692
|
+
totalTestCases: completed.totalTestCases,
|
|
1654
1693
|
overallScoreTotal,
|
|
1655
1694
|
overallScoreSumSq,
|
|
1656
1695
|
overallScoreCount,
|
|
1657
1696
|
aggregates: new Map(aggregates),
|
|
1658
1697
|
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1659
|
-
artifactPath:
|
|
1698
|
+
artifactPath: completed.artifactPath
|
|
1660
1699
|
});
|
|
1661
1700
|
setPhase("completed");
|
|
1662
1701
|
setTimeout(() => onComplete(), 200);
|
|
@@ -1862,11 +1901,9 @@ function RunView({
|
|
|
1862
1901
|
const aggregated = aggregateScoreItems(items);
|
|
1863
1902
|
if (!aggregated)
|
|
1864
1903
|
return null;
|
|
1865
|
-
const def = getScoreById(aggregated.id);
|
|
1904
|
+
const def = aggregated.def ?? getScoreById(aggregated.id);
|
|
1866
1905
|
const label = def ? def.name ?? def.id : aggregated.id;
|
|
1867
|
-
const formatted = def
|
|
1868
|
-
isAggregated: true
|
|
1869
|
-
}) ?? "n/a";
|
|
1906
|
+
const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
|
|
1870
1907
|
const numeric = toNumericScore(aggregated.data);
|
|
1871
1908
|
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1872
1909
|
ink.Text,
|
|
@@ -2025,9 +2062,9 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
|
|
|
2025
2062
|
const agg = aggregateScoreItems(items);
|
|
2026
2063
|
if (!agg)
|
|
2027
2064
|
continue;
|
|
2028
|
-
const def = getScoreById(agg.id);
|
|
2065
|
+
const def = agg.def ?? getScoreById(agg.id);
|
|
2029
2066
|
const label = def ? def.name ?? def.id : agg.id;
|
|
2030
|
-
const formatted = def
|
|
2067
|
+
const formatted = def ? def.formatAggregate(agg.data) : "n/a";
|
|
2031
2068
|
const numeric = toNumericScore(agg.data);
|
|
2032
2069
|
const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
|
|
2033
2070
|
scoreLines.push(` ${label}: ${colored}`);
|
|
@@ -2045,7 +2082,7 @@ function createBar2(value, max = 100, width = 20) {
|
|
|
2045
2082
|
const filled = Math.round(safe / max * width);
|
|
2046
2083
|
return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
|
|
2047
2084
|
}
|
|
2048
|
-
function aggregateEvaluatorScoresFromEvents(events,
|
|
2085
|
+
function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
|
|
2049
2086
|
if (events.length === 0)
|
|
2050
2087
|
return [];
|
|
2051
2088
|
const evaluatorIds = new Set(
|
|
@@ -2104,14 +2141,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2104
2141
|
}
|
|
2105
2142
|
const scoreLines = [];
|
|
2106
2143
|
for (const item of scores) {
|
|
2107
|
-
const def = getScoreById(item.id);
|
|
2144
|
+
const def = item.def ?? getScoreById(item.id);
|
|
2108
2145
|
const scoreLabel = def ? def.name ?? def.id : item.id;
|
|
2109
2146
|
let formatted;
|
|
2110
2147
|
if (!def) {
|
|
2111
2148
|
const numeric = toNumericScore(item.data);
|
|
2112
2149
|
formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
|
|
2113
2150
|
} else {
|
|
2114
|
-
const raw = def
|
|
2151
|
+
const raw = formatScoreData(def, item.data, options);
|
|
2115
2152
|
switch (def.displayStrategy) {
|
|
2116
2153
|
case "bar": {
|
|
2117
2154
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
@@ -2263,7 +2300,6 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2263
2300
|
(s, e) => s + e.durationMs,
|
|
2264
2301
|
0
|
|
2265
2302
|
);
|
|
2266
|
-
existing.events.every((e) => e.passed);
|
|
2267
2303
|
const lines = [];
|
|
2268
2304
|
lines.push(
|
|
2269
2305
|
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
|
|
@@ -2341,18 +2377,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2341
2377
|
if (finalEvent.type === "RunFailed") {
|
|
2342
2378
|
throw new Error(`Run failed: ${finalEvent.errorMessage}`);
|
|
2343
2379
|
}
|
|
2380
|
+
const completed = finalEvent;
|
|
2344
2381
|
console.log("");
|
|
2345
2382
|
console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2346
2383
|
console.log(
|
|
2347
2384
|
`- passed: ${colorize(
|
|
2348
|
-
`${
|
|
2385
|
+
`${completed.passedTestCases}/${completed.totalTestCases}`,
|
|
2349
2386
|
ansi2.green
|
|
2350
2387
|
)}`
|
|
2351
2388
|
);
|
|
2352
2389
|
console.log(
|
|
2353
2390
|
`- failed: ${colorize(
|
|
2354
|
-
`${
|
|
2355
|
-
|
|
2391
|
+
`${completed.failedTestCases}/${completed.totalTestCases}`,
|
|
2392
|
+
completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
|
|
2356
2393
|
)}`
|
|
2357
2394
|
);
|
|
2358
2395
|
if (overallScoreCount > 0) {
|
|
@@ -2393,10 +2430,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2393
2430
|
);
|
|
2394
2431
|
continue;
|
|
2395
2432
|
}
|
|
2396
|
-
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ?
|
|
2397
|
-
summary.aggregatedScoreItem.
|
|
2398
|
-
|
|
2399
|
-
)
|
|
2433
|
+
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? (() => {
|
|
2434
|
+
const def = summary.aggregatedScoreItem.def ?? getScoreById(summary.aggregatedScoreItem.id);
|
|
2435
|
+
return def ? def.formatAggregate(summary.aggregatedScoreItem.data) : summary.averageScore.toFixed(2);
|
|
2436
|
+
})() : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
|
|
2400
2437
|
console.log(
|
|
2401
2438
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
2402
2439
|
scoreLabel,
|
|
@@ -2405,7 +2442,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2405
2442
|
);
|
|
2406
2443
|
}
|
|
2407
2444
|
}
|
|
2408
|
-
console.log(`- artifact: ${colorize(
|
|
2445
|
+
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2409
2446
|
}
|
|
2410
2447
|
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
2411
2448
|
return new Promise((resolve5, reject) => {
|