@m4trix/evals 0.26.0 → 0.28.0

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -253,6 +253,7 @@ function makeEntityIdSchema(brand, label) {
253
253
  var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
254
254
  makeEntityIdSchema("EvaluatorName", "Evaluator name");
255
255
  makeEntityIdSchema("TestCaseName", "Test case name");
256
+ makeEntityIdSchema("DatasetName", "Dataset name");
256
257
  function validateWithSchema(schema, raw, context) {
257
258
  const trimmed = raw.trim();
258
259
  const decode = Schema.decodeUnknownEither(
@@ -268,6 +269,14 @@ function validateRunConfigName(raw, context) {
268
269
  return validateWithSchema(RunConfigNameSchema, raw, context);
269
270
  }
270
271
 
272
+ // src/evals/dataset.ts
273
+ function getDatasetDisplayLabel(dataset) {
274
+ if (typeof dataset.getDisplayLabel === "function") {
275
+ return dataset.getDisplayLabel();
276
+ }
277
+ return typeof dataset.getName === "function" ? dataset.getName() : "";
278
+ }
279
+
271
280
  // src/evals/evaluator.ts
272
281
  function getEvaluatorDisplayLabel(evaluator) {
273
282
  if (typeof evaluator.getDisplayLabel === "function") {
@@ -528,7 +537,7 @@ function toEvalDataset(item, snapshots) {
528
537
  const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
529
538
  return {
530
539
  id: item.id,
531
- name: item.dataset.getName(),
540
+ name: getDatasetDisplayLabel(item.dataset),
532
541
  overview: `Discovered from ${item.filePath}`,
533
542
  runs
534
543
  };
@@ -1696,15 +1705,16 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1696
1705
  meta: {
1697
1706
  triggerId: task.triggerId,
1698
1707
  runId: evaluatorRunId,
1699
- datasetId: task.datasetId,
1708
+ datasetName: task.dataset.getDisplayLabel(),
1700
1709
  repetitionId,
1701
1710
  repetitionIndex,
1702
1711
  repetitionCount,
1703
- runConfigName: task.runConfigName
1712
+ runConfigName: task.runConfigName,
1713
+ ...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
1714
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1715
+ runConfigTags: task.runConfigTags,
1716
+ evaluatorTags: getEvaluatorTagList(evaluator)
1704
1717
  },
1705
- testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1706
- runConfigTags: task.runConfigTags,
1707
- evaluatorTags: getEvaluatorTagList(evaluator),
1708
1718
  logDiff,
1709
1719
  log,
1710
1720
  createError
@@ -2111,7 +2121,7 @@ var EffectRunner = class {
2111
2121
  );
2112
2122
  if (!dsCollected) {
2113
2123
  throw new Error(
2114
- `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
2124
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
2115
2125
  );
2116
2126
  }
2117
2127
  let evaluatorIds;
@@ -2183,7 +2193,8 @@ var EffectRunner = class {
2183
2193
  globalEvaluationSemaphore: sem,
2184
2194
  runConfigName: job.runConfigName,
2185
2195
  runConfigTags: job.runConfigTags,
2186
- repetitions: job.repetitions
2196
+ repetitions: job.repetitions,
2197
+ experimentName: request.experimentName
2187
2198
  })
2188
2199
  );
2189
2200
  }
@@ -2218,7 +2229,8 @@ var EffectRunner = class {
2218
2229
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2219
2230
  repetitions: request.repetitions,
2220
2231
  runConfigName,
2221
- runConfigTags: request.runConfigTags
2232
+ runConfigTags: request.runConfigTags,
2233
+ experimentName: request.experimentName
2222
2234
  });
2223
2235
  }
2224
2236
  async startDatasetRun(params) {
@@ -2246,7 +2258,7 @@ var EffectRunner = class {
2246
2258
  const snapshot = {
2247
2259
  runId,
2248
2260
  datasetId: params.datasetId,
2249
- datasetName: dataset.dataset.getName(),
2261
+ datasetName: dataset.dataset.getDisplayLabel(),
2250
2262
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2251
2263
  queuedAt: Date.now(),
2252
2264
  totalTestCases: totalEvaluations,
@@ -2267,7 +2279,7 @@ var EffectRunner = class {
2267
2279
  type: "RunQueued",
2268
2280
  runId,
2269
2281
  datasetId: params.datasetId,
2270
- datasetName: dataset.dataset.getName(),
2282
+ datasetName: dataset.dataset.getDisplayLabel(),
2271
2283
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2272
2284
  totalTestCases: totalEvaluations,
2273
2285
  artifactPath
@@ -2293,7 +2305,8 @@ var EffectRunner = class {
2293
2305
  globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2294
2306
  runConfigName: params.runConfigName,
2295
2307
  runConfigTags,
2296
- repetitions
2308
+ repetitions,
2309
+ experimentName: params.experimentName
2297
2310
  })
2298
2311
  );
2299
2312
  return snapshot;