@m4trix/evals 0.27.0 → 0.28.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -991,11 +991,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
991
991
  repetitionId,
992
992
  repetitionIndex,
993
993
  repetitionCount,
994
- runConfigName: task.runConfigName
994
+ runConfigName: task.runConfigName,
995
+ ...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
996
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
997
+ runConfigTags: task.runConfigTags,
998
+ evaluatorTags: getEvaluatorTagList(evaluator)
995
999
  },
996
- testCaseTags: getTestCaseTagList(testCaseItem.testCase),
997
- runConfigTags: task.runConfigTags,
998
- evaluatorTags: getEvaluatorTagList(evaluator),
999
1000
  logDiff,
1000
1001
  log,
1001
1002
  createError
@@ -1474,7 +1475,8 @@ var EffectRunner = class {
1474
1475
  globalEvaluationSemaphore: sem,
1475
1476
  runConfigName: job.runConfigName,
1476
1477
  runConfigTags: job.runConfigTags,
1477
- repetitions: job.repetitions
1478
+ repetitions: job.repetitions,
1479
+ experimentName: request.experimentName
1478
1480
  })
1479
1481
  );
1480
1482
  }
@@ -1509,7 +1511,8 @@ var EffectRunner = class {
1509
1511
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
1510
1512
  repetitions: request.repetitions,
1511
1513
  runConfigName,
1512
- runConfigTags: request.runConfigTags
1514
+ runConfigTags: request.runConfigTags,
1515
+ experimentName: request.experimentName
1513
1516
  });
1514
1517
  }
1515
1518
  async startDatasetRun(params) {
@@ -1584,7 +1587,8 @@ var EffectRunner = class {
1584
1587
  globalEvaluationSemaphore: params.globalEvaluationSemaphore,
1585
1588
  runConfigName: params.runConfigName,
1586
1589
  runConfigTags,
1587
- repetitions
1590
+ repetitions,
1591
+ experimentName: params.experimentName
1588
1592
  })
1589
1593
  );
1590
1594
  return snapshot;
@@ -1704,6 +1708,17 @@ function parseSimpleCliArgs(argv) {
1704
1708
  index += 1;
1705
1709
  continue;
1706
1710
  }
1711
+ if (token === "--experiment" && argv[index + 1]) {
1712
+ const raw = argv[index + 1];
1713
+ if (typeof raw === "string") {
1714
+ const trimmed = raw.trim();
1715
+ if (trimmed.length > 0) {
1716
+ args.experimentName = trimmed;
1717
+ }
1718
+ }
1719
+ index += 1;
1720
+ continue;
1721
+ }
1707
1722
  args.unknownArgs.push(token);
1708
1723
  }
1709
1724
  return args;
@@ -1711,12 +1726,13 @@ function parseSimpleCliArgs(argv) {
1711
1726
  function getSimpleCliUsage() {
1712
1727
  return [
1713
1728
  "Usage:",
1714
- " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
1729
+ " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--experiment <name>] [--ci]",
1715
1730
  " eval-agents-simple generate --dataset <datasetId>",
1716
1731
  "",
1717
1732
  "Options:",
1718
1733
  " --ci With run: exit with code 1 if any test case fails.",
1719
- " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
1734
+ " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
1735
+ " --experiment <name> With run: set evaluator meta.experimentName for this invocation."
1720
1736
  ].join("\n");
1721
1737
  }
1722
1738
 
@@ -1993,6 +2009,7 @@ function RunView({
1993
2009
  runner,
1994
2010
  runConfigNames,
1995
2011
  concurrency,
2012
+ experimentName,
1996
2013
  onComplete
1997
2014
  }) {
1998
2015
  const [phase, setPhase] = useState("loading");
@@ -2162,7 +2179,8 @@ function RunView({
2162
2179
  });
2163
2180
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2164
2181
  jobs,
2165
- globalConcurrency: concurrency
2182
+ globalConcurrency: concurrency,
2183
+ experimentName
2166
2184
  });
2167
2185
  for (let i = 0; i < snapshots.length; i += 1) {
2168
2186
  const snap = snapshots[i];
@@ -2219,7 +2237,7 @@ function RunView({
2219
2237
  setPhase("completed");
2220
2238
  const exitCode = failedTestCases > 0 ? 1 : 0;
2221
2239
  setTimeout(() => onComplete(void 0, exitCode), 200);
2222
- }, [runner, runConfigNames, concurrency, onComplete]);
2240
+ }, [runner, runConfigNames, concurrency, experimentName, onComplete]);
2223
2241
  useEffect(() => {
2224
2242
  void runEval();
2225
2243
  }, [runEval]);
@@ -2707,7 +2725,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2707
2725
  }
2708
2726
  return lines;
2709
2727
  }
2710
- async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
2728
+ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
2711
2729
  const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
2712
2730
  if (jobs.length === 0) {
2713
2731
  throw new Error("No jobs expanded from RunConfigs.");
@@ -2916,7 +2934,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
2916
2934
  console.log("");
2917
2935
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2918
2936
  jobs,
2919
- globalConcurrency: concurrency
2937
+ globalConcurrency: concurrency,
2938
+ experimentName
2920
2939
  });
2921
2940
  for (let i = 0; i < snapshots.length; i += 1) {
2922
2941
  const snap = snapshots[i];
@@ -3016,13 +3035,14 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
3016
3035
  }
3017
3036
  return failedTestCasesTotal > 0 ? 1 : 0;
3018
3037
  }
3019
- async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
3038
+ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
3020
3039
  return new Promise((resolve5, reject) => {
3021
3040
  const app = render(
3022
3041
  React.createElement(RunView, {
3023
3042
  runner,
3024
3043
  runConfigNames,
3025
3044
  concurrency,
3045
+ experimentName,
3026
3046
  onComplete: (err, exitCode) => {
3027
3047
  app.unmount();
3028
3048
  if (err) {
@@ -3083,7 +3103,8 @@ async function main() {
3083
3103
  const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
3084
3104
  runner,
3085
3105
  args.runConfigNames,
3086
- concurrency
3106
+ concurrency,
3107
+ args.experimentName
3087
3108
  );
3088
3109
  if (args.ci && exitCode !== 0) {
3089
3110
  process.exit(1);