@m4trix/evals 0.28.0 → 0.30.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/cli-simple.cjs +20 -6
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +20 -6
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +10 -1
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +10 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +8 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +17 -0
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -986,8 +986,11 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
986
986
|
output,
|
|
987
987
|
meta: {
|
|
988
988
|
triggerId: task.triggerId,
|
|
989
|
+
triggerTimestamp: task.triggerTimestamp,
|
|
989
990
|
runId: evaluatorRunId,
|
|
990
991
|
datasetName: task.dataset.getDisplayLabel(),
|
|
992
|
+
testCaseId: testCaseItem.id,
|
|
993
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
991
994
|
repetitionId,
|
|
992
995
|
repetitionIndex,
|
|
993
996
|
repetitionCount,
|
|
@@ -1464,6 +1467,7 @@ var EffectRunner = class {
|
|
|
1464
1467
|
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
1465
1468
|
const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
1466
1469
|
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
1470
|
+
const triggerTimestamp = request.triggerTimestamp ?? Date.now();
|
|
1467
1471
|
const snapshots = [];
|
|
1468
1472
|
for (const job of request.jobs) {
|
|
1469
1473
|
snapshots.push(
|
|
@@ -1471,6 +1475,7 @@ var EffectRunner = class {
|
|
|
1471
1475
|
datasetId: job.datasetId,
|
|
1472
1476
|
evaluatorIds: job.evaluatorIds,
|
|
1473
1477
|
triggerId,
|
|
1478
|
+
triggerTimestamp,
|
|
1474
1479
|
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
1475
1480
|
globalEvaluationSemaphore: sem,
|
|
1476
1481
|
runConfigName: job.runConfigName,
|
|
@@ -1508,6 +1513,7 @@ var EffectRunner = class {
|
|
|
1508
1513
|
datasetId: request.datasetId,
|
|
1509
1514
|
evaluatorIds: request.evaluatorIds,
|
|
1510
1515
|
triggerId: request.triggerId,
|
|
1516
|
+
triggerTimestamp: request.triggerTimestamp ?? Date.now(),
|
|
1511
1517
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
1512
1518
|
repetitions: request.repetitions,
|
|
1513
1519
|
runConfigName,
|
|
@@ -1535,6 +1541,7 @@ var EffectRunner = class {
|
|
|
1535
1541
|
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
1536
1542
|
const runConfigTags = [...params.runConfigTags ?? []];
|
|
1537
1543
|
const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
|
|
1544
|
+
const triggerTimestamp = params.triggerTimestamp ?? Date.now();
|
|
1538
1545
|
const runId = `run-${randomUUID()}`;
|
|
1539
1546
|
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1540
1547
|
const snapshot = {
|
|
@@ -1578,6 +1585,7 @@ var EffectRunner = class {
|
|
|
1578
1585
|
Queue.offer(this.runQueue, {
|
|
1579
1586
|
runId,
|
|
1580
1587
|
triggerId,
|
|
1588
|
+
triggerTimestamp,
|
|
1581
1589
|
datasetId: params.datasetId,
|
|
1582
1590
|
dataset: dataset.dataset,
|
|
1583
1591
|
evaluators: selectedEvaluators,
|
|
@@ -2010,6 +2018,7 @@ function RunView({
|
|
|
2010
2018
|
runConfigNames,
|
|
2011
2019
|
concurrency,
|
|
2012
2020
|
experimentName,
|
|
2021
|
+
triggerTimestamp,
|
|
2013
2022
|
onComplete
|
|
2014
2023
|
}) {
|
|
2015
2024
|
const [phase, setPhase] = useState("loading");
|
|
@@ -2180,7 +2189,8 @@ function RunView({
|
|
|
2180
2189
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2181
2190
|
jobs,
|
|
2182
2191
|
globalConcurrency: concurrency,
|
|
2183
|
-
experimentName
|
|
2192
|
+
experimentName,
|
|
2193
|
+
triggerTimestamp
|
|
2184
2194
|
});
|
|
2185
2195
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2186
2196
|
const snap = snapshots[i];
|
|
@@ -2237,7 +2247,7 @@ function RunView({
|
|
|
2237
2247
|
setPhase("completed");
|
|
2238
2248
|
const exitCode = failedTestCases > 0 ? 1 : 0;
|
|
2239
2249
|
setTimeout(() => onComplete(void 0, exitCode), 200);
|
|
2240
|
-
}, [runner, runConfigNames, concurrency, experimentName, onComplete]);
|
|
2250
|
+
}, [runner, runConfigNames, concurrency, experimentName, triggerTimestamp, onComplete]);
|
|
2241
2251
|
useEffect(() => {
|
|
2242
2252
|
void runEval();
|
|
2243
2253
|
}, [runEval]);
|
|
@@ -2725,7 +2735,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2725
2735
|
}
|
|
2726
2736
|
return lines;
|
|
2727
2737
|
}
|
|
2728
|
-
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
|
|
2738
|
+
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName, triggerTimestamp) {
|
|
2729
2739
|
const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
|
|
2730
2740
|
if (jobs.length === 0) {
|
|
2731
2741
|
throw new Error("No jobs expanded from RunConfigs.");
|
|
@@ -2935,7 +2945,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency,
|
|
|
2935
2945
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2936
2946
|
jobs,
|
|
2937
2947
|
globalConcurrency: concurrency,
|
|
2938
|
-
experimentName
|
|
2948
|
+
experimentName,
|
|
2949
|
+
triggerTimestamp
|
|
2939
2950
|
});
|
|
2940
2951
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2941
2952
|
const snap = snapshots[i];
|
|
@@ -3035,7 +3046,7 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency,
|
|
|
3035
3046
|
}
|
|
3036
3047
|
return failedTestCasesTotal > 0 ? 1 : 0;
|
|
3037
3048
|
}
|
|
3038
|
-
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
|
|
3049
|
+
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName, triggerTimestamp) {
|
|
3039
3050
|
return new Promise((resolve5, reject) => {
|
|
3040
3051
|
const app = render(
|
|
3041
3052
|
React.createElement(RunView, {
|
|
@@ -3043,6 +3054,7 @@ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, e
|
|
|
3043
3054
|
runConfigNames,
|
|
3044
3055
|
concurrency,
|
|
3045
3056
|
experimentName,
|
|
3057
|
+
triggerTimestamp,
|
|
3046
3058
|
onComplete: (err, exitCode) => {
|
|
3047
3059
|
app.unmount();
|
|
3048
3060
|
if (err) {
|
|
@@ -3100,11 +3112,13 @@ async function main() {
|
|
|
3100
3112
|
try {
|
|
3101
3113
|
if (args.command === "run") {
|
|
3102
3114
|
const concurrency = args.concurrency ?? getDefaultConcurrency();
|
|
3115
|
+
const triggerTimestamp = Date.now();
|
|
3103
3116
|
const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
|
|
3104
3117
|
runner,
|
|
3105
3118
|
args.runConfigNames,
|
|
3106
3119
|
concurrency,
|
|
3107
|
-
args.experimentName
|
|
3120
|
+
args.experimentName,
|
|
3121
|
+
triggerTimestamp
|
|
3108
3122
|
);
|
|
3109
3123
|
if (args.ci && exitCode !== 0) {
|
|
3110
3124
|
process.exit(1);
|