@m4trix/evals 0.26.0 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -280,6 +280,7 @@ function makeEntityIdSchema(brand, label) {
280
280
  var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
281
281
  makeEntityIdSchema("EvaluatorName", "Evaluator name");
282
282
  makeEntityIdSchema("TestCaseName", "Test case name");
283
+ makeEntityIdSchema("DatasetName", "Dataset name");
283
284
  function validateWithSchema(schema, raw, context) {
284
285
  const trimmed = raw.trim();
285
286
  const decode = effect.Schema.decodeUnknownEither(
@@ -295,6 +296,14 @@ function validateRunConfigName(raw, context) {
295
296
  return validateWithSchema(RunConfigNameSchema, raw, context);
296
297
  }
297
298
 
299
+ // src/evals/dataset.ts
300
+ function getDatasetDisplayLabel(dataset) {
301
+ if (typeof dataset.getDisplayLabel === "function") {
302
+ return dataset.getDisplayLabel();
303
+ }
304
+ return typeof dataset.getName === "function" ? dataset.getName() : "";
305
+ }
306
+
298
307
  // src/evals/evaluator.ts
299
308
  function getEvaluatorDisplayLabel(evaluator) {
300
309
  if (typeof evaluator.getDisplayLabel === "function") {
@@ -555,7 +564,7 @@ function toEvalDataset(item, snapshots) {
555
564
  const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
556
565
  return {
557
566
  id: item.id,
558
- name: item.dataset.getName(),
567
+ name: getDatasetDisplayLabel(item.dataset),
559
568
  overview: `Discovered from ${item.filePath}`,
560
569
  runs
561
570
  };
@@ -1723,15 +1732,16 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1723
1732
  meta: {
1724
1733
  triggerId: task.triggerId,
1725
1734
  runId: evaluatorRunId,
1726
- datasetId: task.datasetId,
1735
+ datasetName: task.dataset.getDisplayLabel(),
1727
1736
  repetitionId,
1728
1737
  repetitionIndex,
1729
1738
  repetitionCount,
1730
- runConfigName: task.runConfigName
1739
+ runConfigName: task.runConfigName,
1740
+ ...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
1741
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1742
+ runConfigTags: task.runConfigTags,
1743
+ evaluatorTags: getEvaluatorTagList(evaluator)
1731
1744
  },
1732
- testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1733
- runConfigTags: task.runConfigTags,
1734
- evaluatorTags: getEvaluatorTagList(evaluator),
1735
1745
  logDiff,
1736
1746
  log,
1737
1747
  createError
@@ -2138,7 +2148,7 @@ var EffectRunner = class {
2138
2148
  );
2139
2149
  if (!dsCollected) {
2140
2150
  throw new Error(
2141
- `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
2151
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
2142
2152
  );
2143
2153
  }
2144
2154
  let evaluatorIds;
@@ -2210,7 +2220,8 @@ var EffectRunner = class {
2210
2220
  globalEvaluationSemaphore: sem,
2211
2221
  runConfigName: job.runConfigName,
2212
2222
  runConfigTags: job.runConfigTags,
2213
- repetitions: job.repetitions
2223
+ repetitions: job.repetitions,
2224
+ experimentName: request.experimentName
2214
2225
  })
2215
2226
  );
2216
2227
  }
@@ -2245,7 +2256,8 @@ var EffectRunner = class {
2245
2256
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2246
2257
  repetitions: request.repetitions,
2247
2258
  runConfigName,
2248
- runConfigTags: request.runConfigTags
2259
+ runConfigTags: request.runConfigTags,
2260
+ experimentName: request.experimentName
2249
2261
  });
2250
2262
  }
2251
2263
  async startDatasetRun(params) {
@@ -2273,7 +2285,7 @@ var EffectRunner = class {
2273
2285
  const snapshot = {
2274
2286
  runId,
2275
2287
  datasetId: params.datasetId,
2276
- datasetName: dataset.dataset.getName(),
2288
+ datasetName: dataset.dataset.getDisplayLabel(),
2277
2289
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2278
2290
  queuedAt: Date.now(),
2279
2291
  totalTestCases: totalEvaluations,
@@ -2294,7 +2306,7 @@ var EffectRunner = class {
2294
2306
  type: "RunQueued",
2295
2307
  runId,
2296
2308
  datasetId: params.datasetId,
2297
- datasetName: dataset.dataset.getName(),
2309
+ datasetName: dataset.dataset.getDisplayLabel(),
2298
2310
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2299
2311
  totalTestCases: totalEvaluations,
2300
2312
  artifactPath
@@ -2320,7 +2332,8 @@ var EffectRunner = class {
2320
2332
  globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2321
2333
  runConfigName: params.runConfigName,
2322
2334
  runConfigTags,
2323
- repetitions
2335
+ repetitions,
2336
+ experimentName: params.experimentName
2324
2337
  })
2325
2338
  );
2326
2339
  return snapshot;