@m4trix/evals 0.26.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -4
- package/dist/cli-simple.cjs +53 -23
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +53 -23
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +25 -12
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +25 -12
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +108 -79
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +65 -24
- package/dist/index.js +106 -80
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -253,6 +253,7 @@ function makeEntityIdSchema(brand, label) {
|
|
|
253
253
|
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
254
254
|
makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
255
255
|
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
256
|
+
makeEntityIdSchema("DatasetName", "Dataset name");
|
|
256
257
|
function validateWithSchema(schema, raw, context) {
|
|
257
258
|
const trimmed = raw.trim();
|
|
258
259
|
const decode = Schema.decodeUnknownEither(
|
|
@@ -268,6 +269,14 @@ function validateRunConfigName(raw, context) {
|
|
|
268
269
|
return validateWithSchema(RunConfigNameSchema, raw, context);
|
|
269
270
|
}
|
|
270
271
|
|
|
272
|
+
// src/evals/dataset.ts
|
|
273
|
+
function getDatasetDisplayLabel(dataset) {
|
|
274
|
+
if (typeof dataset.getDisplayLabel === "function") {
|
|
275
|
+
return dataset.getDisplayLabel();
|
|
276
|
+
}
|
|
277
|
+
return typeof dataset.getName === "function" ? dataset.getName() : "";
|
|
278
|
+
}
|
|
279
|
+
|
|
271
280
|
// src/evals/evaluator.ts
|
|
272
281
|
function getEvaluatorDisplayLabel(evaluator) {
|
|
273
282
|
if (typeof evaluator.getDisplayLabel === "function") {
|
|
@@ -528,7 +537,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
528
537
|
const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
|
|
529
538
|
return {
|
|
530
539
|
id: item.id,
|
|
531
|
-
name: item.dataset
|
|
540
|
+
name: getDatasetDisplayLabel(item.dataset),
|
|
532
541
|
overview: `Discovered from ${item.filePath}`,
|
|
533
542
|
runs
|
|
534
543
|
};
|
|
@@ -1696,15 +1705,16 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1696
1705
|
meta: {
|
|
1697
1706
|
triggerId: task.triggerId,
|
|
1698
1707
|
runId: evaluatorRunId,
|
|
1699
|
-
|
|
1708
|
+
datasetName: task.dataset.getDisplayLabel(),
|
|
1700
1709
|
repetitionId,
|
|
1701
1710
|
repetitionIndex,
|
|
1702
1711
|
repetitionCount,
|
|
1703
|
-
runConfigName: task.runConfigName
|
|
1712
|
+
runConfigName: task.runConfigName,
|
|
1713
|
+
...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
|
|
1714
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1715
|
+
runConfigTags: task.runConfigTags,
|
|
1716
|
+
evaluatorTags: getEvaluatorTagList(evaluator)
|
|
1704
1717
|
},
|
|
1705
|
-
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1706
|
-
runConfigTags: task.runConfigTags,
|
|
1707
|
-
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1708
1718
|
logDiff,
|
|
1709
1719
|
log,
|
|
1710
1720
|
createError
|
|
@@ -2111,7 +2121,7 @@ var EffectRunner = class {
|
|
|
2111
2121
|
);
|
|
2112
2122
|
if (!dsCollected) {
|
|
2113
2123
|
throw new Error(
|
|
2114
|
-
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.
|
|
2124
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
2115
2125
|
);
|
|
2116
2126
|
}
|
|
2117
2127
|
let evaluatorIds;
|
|
@@ -2183,7 +2193,8 @@ var EffectRunner = class {
|
|
|
2183
2193
|
globalEvaluationSemaphore: sem,
|
|
2184
2194
|
runConfigName: job.runConfigName,
|
|
2185
2195
|
runConfigTags: job.runConfigTags,
|
|
2186
|
-
repetitions: job.repetitions
|
|
2196
|
+
repetitions: job.repetitions,
|
|
2197
|
+
experimentName: request.experimentName
|
|
2187
2198
|
})
|
|
2188
2199
|
);
|
|
2189
2200
|
}
|
|
@@ -2218,7 +2229,8 @@ var EffectRunner = class {
|
|
|
2218
2229
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2219
2230
|
repetitions: request.repetitions,
|
|
2220
2231
|
runConfigName,
|
|
2221
|
-
runConfigTags: request.runConfigTags
|
|
2232
|
+
runConfigTags: request.runConfigTags,
|
|
2233
|
+
experimentName: request.experimentName
|
|
2222
2234
|
});
|
|
2223
2235
|
}
|
|
2224
2236
|
async startDatasetRun(params) {
|
|
@@ -2246,7 +2258,7 @@ var EffectRunner = class {
|
|
|
2246
2258
|
const snapshot = {
|
|
2247
2259
|
runId,
|
|
2248
2260
|
datasetId: params.datasetId,
|
|
2249
|
-
datasetName: dataset.dataset.
|
|
2261
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
2250
2262
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2251
2263
|
queuedAt: Date.now(),
|
|
2252
2264
|
totalTestCases: totalEvaluations,
|
|
@@ -2267,7 +2279,7 @@ var EffectRunner = class {
|
|
|
2267
2279
|
type: "RunQueued",
|
|
2268
2280
|
runId,
|
|
2269
2281
|
datasetId: params.datasetId,
|
|
2270
|
-
datasetName: dataset.dataset.
|
|
2282
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
2271
2283
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2272
2284
|
totalTestCases: totalEvaluations,
|
|
2273
2285
|
artifactPath
|
|
@@ -2293,7 +2305,8 @@ var EffectRunner = class {
|
|
|
2293
2305
|
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2294
2306
|
runConfigName: params.runConfigName,
|
|
2295
2307
|
runConfigTags,
|
|
2296
|
-
repetitions
|
|
2308
|
+
repetitions,
|
|
2309
|
+
experimentName: params.experimentName
|
|
2297
2310
|
})
|
|
2298
2311
|
);
|
|
2299
2312
|
return snapshot;
|