@m4trix/evals 0.26.0 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,6 +29,7 @@ function makeEntityIdSchema(brand, label) {
29
29
  var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
30
30
  makeEntityIdSchema("EvaluatorName", "Evaluator name");
31
31
  makeEntityIdSchema("TestCaseName", "Test case name");
32
+ makeEntityIdSchema("DatasetName", "Dataset name");
32
33
  function validateWithSchema(schema, raw, context) {
33
34
  const trimmed = raw.trim();
34
35
  const decode = Schema.decodeUnknownEither(
@@ -585,6 +586,14 @@ function getTestCaseTagList(testCase) {
585
586
  return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
586
587
  }
587
588
 
589
+ // src/evals/dataset.ts
590
+ function getDatasetDisplayLabel(dataset) { // Returns a label for the given dataset object, tolerating objects that lack getDisplayLabel().
591
+ if (typeof dataset.getDisplayLabel === "function") { // Preferred source: the dataset's own getDisplayLabel() method, when it is callable.
592
+ return dataset.getDisplayLabel();
593
+ }
594
+ return typeof dataset.getName === "function" ? dataset.getName() : ""; // Fallback: getName() when callable, otherwise an empty string.
595
+ }
596
+
588
597
  // src/evals/metric.ts
589
598
  var registry = /* @__PURE__ */ new Map();
590
599
  var Metric = {
@@ -978,15 +987,16 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
978
987
  meta: {
979
988
  triggerId: task.triggerId,
980
989
  runId: evaluatorRunId,
981
- datasetId: task.datasetId,
990
+ datasetName: task.dataset.getDisplayLabel(),
982
991
  repetitionId,
983
992
  repetitionIndex,
984
993
  repetitionCount,
985
- runConfigName: task.runConfigName
994
+ runConfigName: task.runConfigName,
995
+ ...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
996
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
997
+ runConfigTags: task.runConfigTags,
998
+ evaluatorTags: getEvaluatorTagList(evaluator)
986
999
  },
987
- testCaseTags: getTestCaseTagList(testCaseItem.testCase),
988
- runConfigTags: task.runConfigTags,
989
- evaluatorTags: getEvaluatorTagList(evaluator),
990
1000
  logDiff,
991
1001
  log,
992
1002
  createError
@@ -1393,7 +1403,7 @@ var EffectRunner = class {
1393
1403
  );
1394
1404
  if (!dsCollected) {
1395
1405
  throw new Error(
1396
- `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1406
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1397
1407
  );
1398
1408
  }
1399
1409
  let evaluatorIds;
@@ -1465,7 +1475,8 @@ var EffectRunner = class {
1465
1475
  globalEvaluationSemaphore: sem,
1466
1476
  runConfigName: job.runConfigName,
1467
1477
  runConfigTags: job.runConfigTags,
1468
- repetitions: job.repetitions
1478
+ repetitions: job.repetitions,
1479
+ experimentName: request.experimentName
1469
1480
  })
1470
1481
  );
1471
1482
  }
@@ -1500,7 +1511,8 @@ var EffectRunner = class {
1500
1511
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
1501
1512
  repetitions: request.repetitions,
1502
1513
  runConfigName,
1503
- runConfigTags: request.runConfigTags
1514
+ runConfigTags: request.runConfigTags,
1515
+ experimentName: request.experimentName
1504
1516
  });
1505
1517
  }
1506
1518
  async startDatasetRun(params) {
@@ -1528,7 +1540,7 @@ var EffectRunner = class {
1528
1540
  const snapshot = {
1529
1541
  runId,
1530
1542
  datasetId: params.datasetId,
1531
- datasetName: dataset.dataset.getName(),
1543
+ datasetName: dataset.dataset.getDisplayLabel(),
1532
1544
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1533
1545
  queuedAt: Date.now(),
1534
1546
  totalTestCases: totalEvaluations,
@@ -1549,7 +1561,7 @@ var EffectRunner = class {
1549
1561
  type: "RunQueued",
1550
1562
  runId,
1551
1563
  datasetId: params.datasetId,
1552
- datasetName: dataset.dataset.getName(),
1564
+ datasetName: dataset.dataset.getDisplayLabel(),
1553
1565
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1554
1566
  totalTestCases: totalEvaluations,
1555
1567
  artifactPath
@@ -1575,7 +1587,8 @@ var EffectRunner = class {
1575
1587
  globalEvaluationSemaphore: params.globalEvaluationSemaphore,
1576
1588
  runConfigName: params.runConfigName,
1577
1589
  runConfigTags,
1578
- repetitions
1590
+ repetitions,
1591
+ experimentName: params.experimentName
1579
1592
  })
1580
1593
  );
1581
1594
  return snapshot;
@@ -1695,6 +1708,17 @@ function parseSimpleCliArgs(argv) {
1695
1708
  index += 1;
1696
1709
  continue;
1697
1710
  }
1711
+ if (token === "--experiment" && argv[index + 1]) {
1712
+ const raw = argv[index + 1];
1713
+ if (typeof raw === "string") {
1714
+ const trimmed = raw.trim();
1715
+ if (trimmed.length > 0) {
1716
+ args.experimentName = trimmed;
1717
+ }
1718
+ }
1719
+ index += 1;
1720
+ continue;
1721
+ }
1698
1722
  args.unknownArgs.push(token);
1699
1723
  }
1700
1724
  return args;
@@ -1702,12 +1726,13 @@ function parseSimpleCliArgs(argv) {
1702
1726
  function getSimpleCliUsage() {
1703
1727
  return [
1704
1728
  "Usage:",
1705
- " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
1706
- " eval-agents-simple generate --dataset <datasetName>",
1729
+ " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--experiment <name>] [--ci]",
1730
+ " eval-agents-simple generate --dataset <datasetId>",
1707
1731
  "",
1708
1732
  "Options:",
1709
1733
  " --ci With run: exit with code 1 if any test case fails.",
1710
- " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
1734
+ " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
1735
+ " --experiment <name> With run: set evaluator meta.experimentName for this invocation."
1711
1736
  ].join("\n");
1712
1737
  }
1713
1738
 
@@ -1771,7 +1796,7 @@ function GenerateView({
1771
1796
  if (!cancelled) {
1772
1797
  setResult({
1773
1798
  count: payload.length,
1774
- datasetName: dataset.dataset.getName(),
1799
+ datasetName: getDatasetDisplayLabel(dataset.dataset),
1775
1800
  outputPath
1776
1801
  });
1777
1802
  setTimeout(() => onComplete(), 200);
@@ -1832,7 +1857,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1832
1857
  const outputPath = createOutputPath(absoluteDatasetPath);
1833
1858
  await writeFile(outputPath, `${JSON.stringify(payload, null, 2)}
1834
1859
  `, "utf8");
1835
- console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
1860
+ console.log(`Generated ${payload.length} test cases for dataset "${getDatasetDisplayLabel(dataset.dataset)}".`);
1836
1861
  console.log(`Wrote ${outputPath}`);
1837
1862
  }
1838
1863
  async function generateDatasetJsonCommandInk(runner, datasetName) {
@@ -1984,6 +2009,7 @@ function RunView({
1984
2009
  runner,
1985
2010
  runConfigNames,
1986
2011
  concurrency,
2012
+ experimentName,
1987
2013
  onComplete
1988
2014
  }) {
1989
2015
  const [phase, setPhase] = useState("loading");
@@ -2153,7 +2179,8 @@ function RunView({
2153
2179
  });
2154
2180
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2155
2181
  jobs,
2156
- globalConcurrency: concurrency
2182
+ globalConcurrency: concurrency,
2183
+ experimentName
2157
2184
  });
2158
2185
  for (let i = 0; i < snapshots.length; i += 1) {
2159
2186
  const snap = snapshots[i];
@@ -2210,7 +2237,7 @@ function RunView({
2210
2237
  setPhase("completed");
2211
2238
  const exitCode = failedTestCases > 0 ? 1 : 0;
2212
2239
  setTimeout(() => onComplete(void 0, exitCode), 200);
2213
- }, [runner, runConfigNames, concurrency, onComplete]);
2240
+ }, [runner, runConfigNames, concurrency, experimentName, onComplete]);
2214
2241
  useEffect(() => {
2215
2242
  void runEval();
2216
2243
  }, [runEval]);
@@ -2698,7 +2725,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2698
2725
  }
2699
2726
  return lines;
2700
2727
  }
2701
- async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
2728
+ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
2702
2729
  const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
2703
2730
  if (jobs.length === 0) {
2704
2731
  throw new Error("No jobs expanded from RunConfigs.");
@@ -2907,7 +2934,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
2907
2934
  console.log("");
2908
2935
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2909
2936
  jobs,
2910
- globalConcurrency: concurrency
2937
+ globalConcurrency: concurrency,
2938
+ experimentName
2911
2939
  });
2912
2940
  for (let i = 0; i < snapshots.length; i += 1) {
2913
2941
  const snap = snapshots[i];
@@ -3007,13 +3035,14 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
3007
3035
  }
3008
3036
  return failedTestCasesTotal > 0 ? 1 : 0;
3009
3037
  }
3010
- async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
3038
+ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
3011
3039
  return new Promise((resolve5, reject) => {
3012
3040
  const app = render(
3013
3041
  React.createElement(RunView, {
3014
3042
  runner,
3015
3043
  runConfigNames,
3016
3044
  concurrency,
3045
+ experimentName,
3017
3046
  onComplete: (err, exitCode) => {
3018
3047
  app.unmount();
3019
3048
  if (err) {
@@ -3074,7 +3103,8 @@ async function main() {
3074
3103
  const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
3075
3104
  runner,
3076
3105
  args.runConfigNames,
3077
- concurrency
3106
+ concurrency,
3107
+ args.experimentName
3078
3108
  );
3079
3109
  if (args.ci && exitCode !== 0) {
3080
3110
  process.exit(1);
@@ -3083,7 +3113,7 @@ async function main() {
3083
3113
  }
3084
3114
  const genDataset = args.datasetName;
3085
3115
  if (!genDataset) {
3086
- console.error("Missing required --dataset <datasetName> argument.");
3116
+ console.error("Missing required --dataset <datasetId> argument.");
3087
3117
  printUsageAndExit(1);
3088
3118
  }
3089
3119
  await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(