@m4trix/evals 0.26.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -4
- package/dist/cli-simple.cjs +53 -23
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +53 -23
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +25 -12
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +25 -12
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +108 -79
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +65 -24
- package/dist/index.js +106 -80
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.cjs
CHANGED
|
@@ -280,6 +280,7 @@ function makeEntityIdSchema(brand, label) {
|
|
|
280
280
|
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
281
281
|
makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
282
282
|
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
283
|
+
makeEntityIdSchema("DatasetName", "Dataset name");
|
|
283
284
|
function validateWithSchema(schema, raw, context) {
|
|
284
285
|
const trimmed = raw.trim();
|
|
285
286
|
const decode = effect.Schema.decodeUnknownEither(
|
|
@@ -295,6 +296,14 @@ function validateRunConfigName(raw, context) {
|
|
|
295
296
|
return validateWithSchema(RunConfigNameSchema, raw, context);
|
|
296
297
|
}
|
|
297
298
|
|
|
299
|
+
// src/evals/dataset.ts
|
|
300
|
+
function getDatasetDisplayLabel(dataset) {
|
|
301
|
+
if (typeof dataset.getDisplayLabel === "function") {
|
|
302
|
+
return dataset.getDisplayLabel();
|
|
303
|
+
}
|
|
304
|
+
return typeof dataset.getName === "function" ? dataset.getName() : "";
|
|
305
|
+
}
|
|
306
|
+
|
|
298
307
|
// src/evals/evaluator.ts
|
|
299
308
|
function getEvaluatorDisplayLabel(evaluator) {
|
|
300
309
|
if (typeof evaluator.getDisplayLabel === "function") {
|
|
@@ -555,7 +564,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
555
564
|
const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
|
|
556
565
|
return {
|
|
557
566
|
id: item.id,
|
|
558
|
-
name: item.dataset
|
|
567
|
+
name: getDatasetDisplayLabel(item.dataset),
|
|
559
568
|
overview: `Discovered from ${item.filePath}`,
|
|
560
569
|
runs
|
|
561
570
|
};
|
|
@@ -1723,15 +1732,16 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1723
1732
|
meta: {
|
|
1724
1733
|
triggerId: task.triggerId,
|
|
1725
1734
|
runId: evaluatorRunId,
|
|
1726
|
-
|
|
1735
|
+
datasetName: task.dataset.getDisplayLabel(),
|
|
1727
1736
|
repetitionId,
|
|
1728
1737
|
repetitionIndex,
|
|
1729
1738
|
repetitionCount,
|
|
1730
|
-
runConfigName: task.runConfigName
|
|
1739
|
+
runConfigName: task.runConfigName,
|
|
1740
|
+
...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
|
|
1741
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1742
|
+
runConfigTags: task.runConfigTags,
|
|
1743
|
+
evaluatorTags: getEvaluatorTagList(evaluator)
|
|
1731
1744
|
},
|
|
1732
|
-
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1733
|
-
runConfigTags: task.runConfigTags,
|
|
1734
|
-
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1735
1745
|
logDiff,
|
|
1736
1746
|
log,
|
|
1737
1747
|
createError
|
|
@@ -2138,7 +2148,7 @@ var EffectRunner = class {
|
|
|
2138
2148
|
);
|
|
2139
2149
|
if (!dsCollected) {
|
|
2140
2150
|
throw new Error(
|
|
2141
|
-
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.
|
|
2151
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
2142
2152
|
);
|
|
2143
2153
|
}
|
|
2144
2154
|
let evaluatorIds;
|
|
@@ -2210,7 +2220,8 @@ var EffectRunner = class {
|
|
|
2210
2220
|
globalEvaluationSemaphore: sem,
|
|
2211
2221
|
runConfigName: job.runConfigName,
|
|
2212
2222
|
runConfigTags: job.runConfigTags,
|
|
2213
|
-
repetitions: job.repetitions
|
|
2223
|
+
repetitions: job.repetitions,
|
|
2224
|
+
experimentName: request.experimentName
|
|
2214
2225
|
})
|
|
2215
2226
|
);
|
|
2216
2227
|
}
|
|
@@ -2245,7 +2256,8 @@ var EffectRunner = class {
|
|
|
2245
2256
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2246
2257
|
repetitions: request.repetitions,
|
|
2247
2258
|
runConfigName,
|
|
2248
|
-
runConfigTags: request.runConfigTags
|
|
2259
|
+
runConfigTags: request.runConfigTags,
|
|
2260
|
+
experimentName: request.experimentName
|
|
2249
2261
|
});
|
|
2250
2262
|
}
|
|
2251
2263
|
async startDatasetRun(params) {
|
|
@@ -2273,7 +2285,7 @@ var EffectRunner = class {
|
|
|
2273
2285
|
const snapshot = {
|
|
2274
2286
|
runId,
|
|
2275
2287
|
datasetId: params.datasetId,
|
|
2276
|
-
datasetName: dataset.dataset.
|
|
2288
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
2277
2289
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2278
2290
|
queuedAt: Date.now(),
|
|
2279
2291
|
totalTestCases: totalEvaluations,
|
|
@@ -2294,7 +2306,7 @@ var EffectRunner = class {
|
|
|
2294
2306
|
type: "RunQueued",
|
|
2295
2307
|
runId,
|
|
2296
2308
|
datasetId: params.datasetId,
|
|
2297
|
-
datasetName: dataset.dataset.
|
|
2309
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
2298
2310
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2299
2311
|
totalTestCases: totalEvaluations,
|
|
2300
2312
|
artifactPath
|
|
@@ -2320,7 +2332,8 @@ var EffectRunner = class {
|
|
|
2320
2332
|
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2321
2333
|
runConfigName: params.runConfigName,
|
|
2322
2334
|
runConfigTags,
|
|
2323
|
-
repetitions
|
|
2335
|
+
repetitions,
|
|
2336
|
+
experimentName: params.experimentName
|
|
2324
2337
|
})
|
|
2325
2338
|
);
|
|
2326
2339
|
return snapshot;
|