@m4trix/evals 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +113 -76
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +113 -76
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +87 -47
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +87 -47
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +86 -44
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +54 -5
- package/dist/index.js +85 -45
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -333,20 +333,70 @@ function getMetricById(id) {
|
|
|
333
333
|
|
|
334
334
|
// src/evals/score.ts
|
|
335
335
|
var registry2 = /* @__PURE__ */ new Map();
|
|
336
|
+
function formatScoreData(def, data, options) {
|
|
337
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
338
|
+
}
|
|
339
|
+
var ScoreAggregate = {
|
|
340
|
+
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
341
|
+
averageFields(fields) {
|
|
342
|
+
return (values) => {
|
|
343
|
+
const count = values.length || 1;
|
|
344
|
+
const result = {};
|
|
345
|
+
for (const field of fields) {
|
|
346
|
+
result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
|
|
347
|
+
}
|
|
348
|
+
return result;
|
|
349
|
+
};
|
|
350
|
+
},
|
|
351
|
+
/** Average `value` with sample std dev. Use for percent-style scores. */
|
|
352
|
+
averageWithVariance(values) {
|
|
353
|
+
if (values.length === 0) {
|
|
354
|
+
return { value: 0, stdDev: void 0, count: 0 };
|
|
355
|
+
}
|
|
356
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
357
|
+
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
358
|
+
const mean = sum / values.length;
|
|
359
|
+
let stdDev;
|
|
360
|
+
if (values.length >= 2) {
|
|
361
|
+
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
362
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
363
|
+
}
|
|
364
|
+
return { ...values[0], value: mean, stdDev, count: values.length };
|
|
365
|
+
},
|
|
366
|
+
/** All runs must pass. Use for binary scores. */
|
|
367
|
+
all(values) {
|
|
368
|
+
const total = values.length;
|
|
369
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
370
|
+
return {
|
|
371
|
+
...values[0],
|
|
372
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
373
|
+
passedCount,
|
|
374
|
+
totalCount: total
|
|
375
|
+
};
|
|
376
|
+
},
|
|
377
|
+
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
378
|
+
last(values) {
|
|
379
|
+
return values[values.length - 1] ?? {};
|
|
380
|
+
}
|
|
381
|
+
};
|
|
336
382
|
var Score = {
|
|
383
|
+
aggregate: ScoreAggregate,
|
|
337
384
|
of(config) {
|
|
338
385
|
const def = {
|
|
339
386
|
id: config.id,
|
|
340
387
|
name: config.name,
|
|
341
388
|
displayStrategy: config.displayStrategy,
|
|
342
|
-
|
|
343
|
-
|
|
389
|
+
formatValue: config.formatValue,
|
|
390
|
+
formatAggregate: config.formatAggregate,
|
|
391
|
+
aggregateValues: config.aggregateValues,
|
|
344
392
|
make: (data, options) => {
|
|
345
393
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
346
394
|
return {
|
|
347
395
|
id: config.id,
|
|
348
396
|
data,
|
|
349
|
-
...passed !== void 0 && { passed }
|
|
397
|
+
...passed !== void 0 && { passed },
|
|
398
|
+
def
|
|
399
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
350
400
|
};
|
|
351
401
|
}
|
|
352
402
|
};
|
|
@@ -359,29 +409,6 @@ function getScoreById(id) {
|
|
|
359
409
|
}
|
|
360
410
|
|
|
361
411
|
// src/evals/aggregators.ts
|
|
362
|
-
function aggregateAverageWithVariance(values) {
|
|
363
|
-
if (values.length === 0) {
|
|
364
|
-
return { value: 0, count: 0 };
|
|
365
|
-
}
|
|
366
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
367
|
-
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
368
|
-
const mean = sum / values.length;
|
|
369
|
-
let stdDev;
|
|
370
|
-
if (values.length >= 2) {
|
|
371
|
-
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
372
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
373
|
-
}
|
|
374
|
-
return { value: mean, stdDev, count: values.length };
|
|
375
|
-
}
|
|
376
|
-
function aggregateAll(values) {
|
|
377
|
-
const total = values.length;
|
|
378
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
379
|
-
return {
|
|
380
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
381
|
-
passedCount,
|
|
382
|
-
totalCount: total
|
|
383
|
-
};
|
|
384
|
-
}
|
|
385
412
|
function aggregateTokenCountSum(values) {
|
|
386
413
|
const initial = {
|
|
387
414
|
input: 0,
|
|
@@ -434,40 +461,45 @@ Score.of({
|
|
|
434
461
|
id: "percent",
|
|
435
462
|
name: "Score",
|
|
436
463
|
displayStrategy: "bar",
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
464
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
465
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
466
|
+
aggregateValues: Score.aggregate.averageWithVariance
|
|
467
|
+
});
|
|
468
|
+
Score.of({
|
|
469
|
+
id: "delta",
|
|
470
|
+
name: "Delta",
|
|
471
|
+
displayStrategy: "number",
|
|
472
|
+
formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
|
|
473
|
+
formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
|
|
474
|
+
aggregateValues: Score.aggregate.averageFields(["value", "delta"])
|
|
444
475
|
});
|
|
445
476
|
Score.of({
|
|
446
477
|
id: "binary",
|
|
447
478
|
name: "Result",
|
|
448
479
|
displayStrategy: "passFail",
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
}
|
|
455
|
-
return base;
|
|
480
|
+
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
481
|
+
formatAggregate: (data) => {
|
|
482
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
483
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
484
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
456
485
|
}
|
|
457
|
-
return
|
|
486
|
+
return base;
|
|
458
487
|
},
|
|
459
|
-
|
|
488
|
+
aggregateValues: Score.aggregate.all
|
|
460
489
|
});
|
|
461
490
|
|
|
462
491
|
// src/runner/score-utils.ts
|
|
492
|
+
function getScoreDef(item) {
|
|
493
|
+
return item.def ?? getScoreById(item.id);
|
|
494
|
+
}
|
|
463
495
|
function aggregateScoreItems(items) {
|
|
464
496
|
if (items.length === 0)
|
|
465
497
|
return void 0;
|
|
466
|
-
const def =
|
|
467
|
-
if (!def?.
|
|
498
|
+
const def = getScoreDef(items[0]);
|
|
499
|
+
if (!def?.aggregateValues)
|
|
468
500
|
return items[items.length - 1];
|
|
469
|
-
const aggregated = def.
|
|
470
|
-
return { ...items[0], data: aggregated };
|
|
501
|
+
const aggregated = def.aggregateValues(items.map((i) => i.data));
|
|
502
|
+
return { ...items[0], data: aggregated, def };
|
|
471
503
|
}
|
|
472
504
|
function aggregateMetricItems(items) {
|
|
473
505
|
if (items.length === 0)
|
|
@@ -480,7 +512,7 @@ function aggregateMetricItems(items) {
|
|
|
480
512
|
}
|
|
481
513
|
function toNumericScoreFromScores(scores) {
|
|
482
514
|
for (const item of scores) {
|
|
483
|
-
const def =
|
|
515
|
+
const def = getScoreDef(item);
|
|
484
516
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
485
517
|
const value = item.data.value;
|
|
486
518
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -561,6 +593,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
561
593
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
562
594
|
const rerunPassed = [];
|
|
563
595
|
for (let r = 0; r < reruns; r++) {
|
|
596
|
+
const evaluatorRunId = `run-${randomUUID()}`;
|
|
564
597
|
const started = Date.now();
|
|
565
598
|
const evaluatorScores = [];
|
|
566
599
|
let testCaseError;
|
|
@@ -587,6 +620,11 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
587
620
|
input: testCaseItem.testCase.getInput(),
|
|
588
621
|
ctx,
|
|
589
622
|
output,
|
|
623
|
+
meta: {
|
|
624
|
+
triggerId: task.triggerId,
|
|
625
|
+
runId: evaluatorRunId,
|
|
626
|
+
datasetId: task.datasetId
|
|
627
|
+
},
|
|
590
628
|
logDiff,
|
|
591
629
|
log
|
|
592
630
|
})
|
|
@@ -862,7 +900,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
|
|
|
862
900
|
() => appendJsonLine(message.artifactPath, {
|
|
863
901
|
runId: message.runId,
|
|
864
902
|
ts: Date.now(),
|
|
865
|
-
...message.payload
|
|
903
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
866
904
|
})
|
|
867
905
|
);
|
|
868
906
|
})
|
|
@@ -1046,6 +1084,7 @@ var EffectRunner = class {
|
|
|
1046
1084
|
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1047
1085
|
0
|
|
1048
1086
|
);
|
|
1087
|
+
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
1049
1088
|
const runId = `run-${randomUUID()}`;
|
|
1050
1089
|
const artifactPath = createArtifactPath(
|
|
1051
1090
|
this.config.artifactDirectory,
|
|
@@ -1087,6 +1126,7 @@ var EffectRunner = class {
|
|
|
1087
1126
|
await Effect.runPromise(
|
|
1088
1127
|
Queue.offer(this.runQueue, {
|
|
1089
1128
|
runId,
|
|
1129
|
+
triggerId,
|
|
1090
1130
|
datasetId: request.datasetId,
|
|
1091
1131
|
dataset: dataset.dataset,
|
|
1092
1132
|
evaluators: selectedEvaluators,
|
|
@@ -1446,7 +1486,7 @@ function aggregateEvaluatorScores(events, nameById) {
|
|
|
1446
1486
|
if (agg)
|
|
1447
1487
|
aggregatedScores.push(agg);
|
|
1448
1488
|
}
|
|
1449
|
-
const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([
|
|
1489
|
+
const aggregatedMetrics = Array.from(metricIdToItems.entries()).map(([, items]) => aggregateMetricItems(items)).filter((m) => m !== void 0);
|
|
1450
1490
|
const passed = events.every((ev) => {
|
|
1451
1491
|
const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
|
|
1452
1492
|
return es?.passed ?? false;
|
|
@@ -1466,13 +1506,13 @@ function aggregateEvaluatorScores(events, nameById) {
|
|
|
1466
1506
|
}
|
|
1467
1507
|
return result;
|
|
1468
1508
|
}
|
|
1469
|
-
function formatScorePart(item,
|
|
1470
|
-
const def = getScoreById(item.id);
|
|
1509
|
+
function formatScorePart(item, _scoreToColor, options) {
|
|
1510
|
+
const def = item.def ?? getScoreById(item.id);
|
|
1471
1511
|
if (!def) {
|
|
1472
1512
|
const numeric = toNumericScore(item.data);
|
|
1473
1513
|
return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
|
|
1474
1514
|
}
|
|
1475
|
-
const formatted = def
|
|
1515
|
+
const formatted = formatScoreData(def, item.data, options);
|
|
1476
1516
|
if (def.displayStrategy === "bar") {
|
|
1477
1517
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
1478
1518
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
@@ -1530,8 +1570,6 @@ function RunView({
|
|
|
1530
1570
|
const done = new Promise((resolve5) => {
|
|
1531
1571
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1532
1572
|
if (event.type === "TestCaseProgress") {
|
|
1533
|
-
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
1534
|
-
numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
|
|
1535
1573
|
for (const item of event.evaluatorScores) {
|
|
1536
1574
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1537
1575
|
if (numeric !== void 0) {
|
|
@@ -1621,16 +1659,17 @@ function RunView({
|
|
|
1621
1659
|
onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
|
|
1622
1660
|
return;
|
|
1623
1661
|
}
|
|
1662
|
+
const completed = finalEvent;
|
|
1624
1663
|
setSummary({
|
|
1625
|
-
passedTestCases:
|
|
1626
|
-
failedTestCases:
|
|
1627
|
-
totalTestCases:
|
|
1664
|
+
passedTestCases: completed.passedTestCases,
|
|
1665
|
+
failedTestCases: completed.failedTestCases,
|
|
1666
|
+
totalTestCases: completed.totalTestCases,
|
|
1628
1667
|
overallScoreTotal,
|
|
1629
1668
|
overallScoreSumSq,
|
|
1630
1669
|
overallScoreCount,
|
|
1631
1670
|
aggregates: new Map(aggregates),
|
|
1632
1671
|
scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
|
|
1633
|
-
artifactPath:
|
|
1672
|
+
artifactPath: completed.artifactPath
|
|
1634
1673
|
});
|
|
1635
1674
|
setPhase("completed");
|
|
1636
1675
|
setTimeout(() => onComplete(), 200);
|
|
@@ -1836,11 +1875,9 @@ function RunView({
|
|
|
1836
1875
|
const aggregated = aggregateScoreItems(items);
|
|
1837
1876
|
if (!aggregated)
|
|
1838
1877
|
return null;
|
|
1839
|
-
const def = getScoreById(aggregated.id);
|
|
1878
|
+
const def = aggregated.def ?? getScoreById(aggregated.id);
|
|
1840
1879
|
const label = def ? def.name ?? def.id : aggregated.id;
|
|
1841
|
-
const formatted = def
|
|
1842
|
-
isAggregated: true
|
|
1843
|
-
}) ?? "n/a";
|
|
1880
|
+
const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
|
|
1844
1881
|
const numeric = toNumericScore(aggregated.data);
|
|
1845
1882
|
return /* @__PURE__ */ jsxs(
|
|
1846
1883
|
Text,
|
|
@@ -1999,9 +2036,9 @@ function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreIt
|
|
|
1999
2036
|
const agg = aggregateScoreItems(items);
|
|
2000
2037
|
if (!agg)
|
|
2001
2038
|
continue;
|
|
2002
|
-
const def = getScoreById(agg.id);
|
|
2039
|
+
const def = agg.def ?? getScoreById(agg.id);
|
|
2003
2040
|
const label = def ? def.name ?? def.id : agg.id;
|
|
2004
|
-
const formatted = def
|
|
2041
|
+
const formatted = def ? def.formatAggregate(agg.data) : "n/a";
|
|
2005
2042
|
const numeric = toNumericScore(agg.data);
|
|
2006
2043
|
const colored = numeric !== void 0 ? colorize(formatted, scoreToColor(numeric)) : formatted;
|
|
2007
2044
|
scoreLines.push(` ${label}: ${colored}`);
|
|
@@ -2019,7 +2056,7 @@ function createBar2(value, max = 100, width = 20) {
|
|
|
2019
2056
|
const filled = Math.round(safe / max * width);
|
|
2020
2057
|
return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
|
|
2021
2058
|
}
|
|
2022
|
-
function aggregateEvaluatorScoresFromEvents(events,
|
|
2059
|
+
function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
|
|
2023
2060
|
if (events.length === 0)
|
|
2024
2061
|
return [];
|
|
2025
2062
|
const evaluatorIds = new Set(
|
|
@@ -2078,14 +2115,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2078
2115
|
}
|
|
2079
2116
|
const scoreLines = [];
|
|
2080
2117
|
for (const item of scores) {
|
|
2081
|
-
const def = getScoreById(item.id);
|
|
2118
|
+
const def = item.def ?? getScoreById(item.id);
|
|
2082
2119
|
const scoreLabel = def ? def.name ?? def.id : item.id;
|
|
2083
2120
|
let formatted;
|
|
2084
2121
|
if (!def) {
|
|
2085
2122
|
const numeric = toNumericScore(item.data);
|
|
2086
2123
|
formatted = numeric !== void 0 ? colorize(numeric.toFixed(2), scoreToColor(numeric)) : "n/a";
|
|
2087
2124
|
} else {
|
|
2088
|
-
const raw = def
|
|
2125
|
+
const raw = formatScoreData(def, item.data, options);
|
|
2089
2126
|
switch (def.displayStrategy) {
|
|
2090
2127
|
case "bar": {
|
|
2091
2128
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
@@ -2237,7 +2274,6 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2237
2274
|
(s, e) => s + e.durationMs,
|
|
2238
2275
|
0
|
|
2239
2276
|
);
|
|
2240
|
-
existing.events.every((e) => e.passed);
|
|
2241
2277
|
const lines = [];
|
|
2242
2278
|
lines.push(
|
|
2243
2279
|
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
|
|
@@ -2315,18 +2351,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2315
2351
|
if (finalEvent.type === "RunFailed") {
|
|
2316
2352
|
throw new Error(`Run failed: ${finalEvent.errorMessage}`);
|
|
2317
2353
|
}
|
|
2354
|
+
const completed = finalEvent;
|
|
2318
2355
|
console.log("");
|
|
2319
2356
|
console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2320
2357
|
console.log(
|
|
2321
2358
|
`- passed: ${colorize(
|
|
2322
|
-
`${
|
|
2359
|
+
`${completed.passedTestCases}/${completed.totalTestCases}`,
|
|
2323
2360
|
ansi2.green
|
|
2324
2361
|
)}`
|
|
2325
2362
|
);
|
|
2326
2363
|
console.log(
|
|
2327
2364
|
`- failed: ${colorize(
|
|
2328
|
-
`${
|
|
2329
|
-
|
|
2365
|
+
`${completed.failedTestCases}/${completed.totalTestCases}`,
|
|
2366
|
+
completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
|
|
2330
2367
|
)}`
|
|
2331
2368
|
);
|
|
2332
2369
|
if (overallScoreCount > 0) {
|
|
@@ -2367,10 +2404,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2367
2404
|
);
|
|
2368
2405
|
continue;
|
|
2369
2406
|
}
|
|
2370
|
-
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ?
|
|
2371
|
-
summary.aggregatedScoreItem.
|
|
2372
|
-
|
|
2373
|
-
)
|
|
2407
|
+
const scoreLabel = summary.isAggregated && summary.aggregatedScoreItem ? (() => {
|
|
2408
|
+
const def = summary.aggregatedScoreItem.def ?? getScoreById(summary.aggregatedScoreItem.id);
|
|
2409
|
+
return def ? def.formatAggregate(summary.aggregatedScoreItem.data) : summary.averageScore.toFixed(2);
|
|
2410
|
+
})() : summary.stdDev !== void 0 && summary.isAggregated ? `${summary.averageScore.toFixed(2)} \xB1 ${summary.stdDev.toFixed(2)}` : summary.averageScore.toFixed(2);
|
|
2374
2411
|
console.log(
|
|
2375
2412
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
2376
2413
|
scoreLabel,
|
|
@@ -2379,7 +2416,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2379
2416
|
);
|
|
2380
2417
|
}
|
|
2381
2418
|
}
|
|
2382
|
-
console.log(`- artifact: ${colorize(
|
|
2419
|
+
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2383
2420
|
}
|
|
2384
2421
|
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
2385
2422
|
return new Promise((resolve5, reject) => {
|