@m4trix/evals 0.27.0 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -988,14 +988,17 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
988
988
  triggerId: task.triggerId,
989
989
  runId: evaluatorRunId,
990
990
  datasetName: task.dataset.getDisplayLabel(),
991
+ testCaseId: testCaseItem.id,
992
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
991
993
  repetitionId,
992
994
  repetitionIndex,
993
995
  repetitionCount,
994
- runConfigName: task.runConfigName
996
+ runConfigName: task.runConfigName,
997
+ ...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
998
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
999
+ runConfigTags: task.runConfigTags,
1000
+ evaluatorTags: getEvaluatorTagList(evaluator)
995
1001
  },
996
- testCaseTags: getTestCaseTagList(testCaseItem.testCase),
997
- runConfigTags: task.runConfigTags,
998
- evaluatorTags: getEvaluatorTagList(evaluator),
999
1002
  logDiff,
1000
1003
  log,
1001
1004
  createError
@@ -1474,7 +1477,8 @@ var EffectRunner = class {
1474
1477
  globalEvaluationSemaphore: sem,
1475
1478
  runConfigName: job.runConfigName,
1476
1479
  runConfigTags: job.runConfigTags,
1477
- repetitions: job.repetitions
1480
+ repetitions: job.repetitions,
1481
+ experimentName: request.experimentName
1478
1482
  })
1479
1483
  );
1480
1484
  }
@@ -1509,7 +1513,8 @@ var EffectRunner = class {
1509
1513
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
1510
1514
  repetitions: request.repetitions,
1511
1515
  runConfigName,
1512
- runConfigTags: request.runConfigTags
1516
+ runConfigTags: request.runConfigTags,
1517
+ experimentName: request.experimentName
1513
1518
  });
1514
1519
  }
1515
1520
  async startDatasetRun(params) {
@@ -1584,7 +1589,8 @@ var EffectRunner = class {
1584
1589
  globalEvaluationSemaphore: params.globalEvaluationSemaphore,
1585
1590
  runConfigName: params.runConfigName,
1586
1591
  runConfigTags,
1587
- repetitions
1592
+ repetitions,
1593
+ experimentName: params.experimentName
1588
1594
  })
1589
1595
  );
1590
1596
  return snapshot;
@@ -1704,6 +1710,17 @@ function parseSimpleCliArgs(argv) {
1704
1710
  index += 1;
1705
1711
  continue;
1706
1712
  }
1713
+ if (token === "--experiment" && argv[index + 1]) {
1714
+ const raw = argv[index + 1];
1715
+ if (typeof raw === "string") {
1716
+ const trimmed = raw.trim();
1717
+ if (trimmed.length > 0) {
1718
+ args.experimentName = trimmed;
1719
+ }
1720
+ }
1721
+ index += 1;
1722
+ continue;
1723
+ }
1707
1724
  args.unknownArgs.push(token);
1708
1725
  }
1709
1726
  return args;
@@ -1711,12 +1728,13 @@ function parseSimpleCliArgs(argv) {
1711
1728
  function getSimpleCliUsage() {
1712
1729
  return [
1713
1730
  "Usage:",
1714
- " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
1731
+ " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--experiment <name>] [--ci]",
1715
1732
  " eval-agents-simple generate --dataset <datasetId>",
1716
1733
  "",
1717
1734
  "Options:",
1718
1735
  " --ci With run: exit with code 1 if any test case fails.",
1719
- " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
1736
+ " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
1737
+ " --experiment <name> With run: set evaluator meta.experimentName for this invocation."
1720
1738
  ].join("\n");
1721
1739
  }
1722
1740
 
@@ -1993,6 +2011,7 @@ function RunView({
1993
2011
  runner,
1994
2012
  runConfigNames,
1995
2013
  concurrency,
2014
+ experimentName,
1996
2015
  onComplete
1997
2016
  }) {
1998
2017
  const [phase, setPhase] = useState("loading");
@@ -2162,7 +2181,8 @@ function RunView({
2162
2181
  });
2163
2182
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2164
2183
  jobs,
2165
- globalConcurrency: concurrency
2184
+ globalConcurrency: concurrency,
2185
+ experimentName
2166
2186
  });
2167
2187
  for (let i = 0; i < snapshots.length; i += 1) {
2168
2188
  const snap = snapshots[i];
@@ -2219,7 +2239,7 @@ function RunView({
2219
2239
  setPhase("completed");
2220
2240
  const exitCode = failedTestCases > 0 ? 1 : 0;
2221
2241
  setTimeout(() => onComplete(void 0, exitCode), 200);
2222
- }, [runner, runConfigNames, concurrency, onComplete]);
2242
+ }, [runner, runConfigNames, concurrency, experimentName, onComplete]);
2223
2243
  useEffect(() => {
2224
2244
  void runEval();
2225
2245
  }, [runEval]);
@@ -2707,7 +2727,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2707
2727
  }
2708
2728
  return lines;
2709
2729
  }
2710
- async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
2730
+ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
2711
2731
  const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
2712
2732
  if (jobs.length === 0) {
2713
2733
  throw new Error("No jobs expanded from RunConfigs.");
@@ -2916,7 +2936,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
2916
2936
  console.log("");
2917
2937
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2918
2938
  jobs,
2919
- globalConcurrency: concurrency
2939
+ globalConcurrency: concurrency,
2940
+ experimentName
2920
2941
  });
2921
2942
  for (let i = 0; i < snapshots.length; i += 1) {
2922
2943
  const snap = snapshots[i];
@@ -3016,13 +3037,14 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
3016
3037
  }
3017
3038
  return failedTestCasesTotal > 0 ? 1 : 0;
3018
3039
  }
3019
- async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
3040
+ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
3020
3041
  return new Promise((resolve5, reject) => {
3021
3042
  const app = render(
3022
3043
  React.createElement(RunView, {
3023
3044
  runner,
3024
3045
  runConfigNames,
3025
3046
  concurrency,
3047
+ experimentName,
3026
3048
  onComplete: (err, exitCode) => {
3027
3049
  app.unmount();
3028
3050
  if (err) {
@@ -3083,7 +3105,8 @@ async function main() {
3083
3105
  const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
3084
3106
  runner,
3085
3107
  args.runConfigNames,
3086
- concurrency
3108
+ concurrency,
3109
+ args.experimentName
3087
3110
  );
3088
3111
  if (args.ci && exitCode !== 0) {
3089
3112
  process.exit(1);