@m4trix/evals 0.28.0 → 0.30.0

This diff shows the differences between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -986,8 +986,11 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
986
986
  output,
987
987
  meta: {
988
988
  triggerId: task.triggerId,
989
+ triggerTimestamp: task.triggerTimestamp,
989
990
  runId: evaluatorRunId,
990
991
  datasetName: task.dataset.getDisplayLabel(),
992
+ testCaseId: testCaseItem.id,
993
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
991
994
  repetitionId,
992
995
  repetitionIndex,
993
996
  repetitionCount,
@@ -1464,6 +1467,7 @@ var EffectRunner = class {
1464
1467
  const globalConcurrency = Math.max(1, request.globalConcurrency);
1465
1468
  const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
1466
1469
  const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
1470
+ const triggerTimestamp = request.triggerTimestamp ?? Date.now();
1467
1471
  const snapshots = [];
1468
1472
  for (const job of request.jobs) {
1469
1473
  snapshots.push(
@@ -1471,6 +1475,7 @@ var EffectRunner = class {
1471
1475
  datasetId: job.datasetId,
1472
1476
  evaluatorIds: job.evaluatorIds,
1473
1477
  triggerId,
1478
+ triggerTimestamp,
1474
1479
  maxConcurrency: this.config.maxConcurrency ?? 1,
1475
1480
  globalEvaluationSemaphore: sem,
1476
1481
  runConfigName: job.runConfigName,
@@ -1508,6 +1513,7 @@ var EffectRunner = class {
1508
1513
  datasetId: request.datasetId,
1509
1514
  evaluatorIds: request.evaluatorIds,
1510
1515
  triggerId: request.triggerId,
1516
+ triggerTimestamp: request.triggerTimestamp ?? Date.now(),
1511
1517
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
1512
1518
  repetitions: request.repetitions,
1513
1519
  runConfigName,
@@ -1535,6 +1541,7 @@ var EffectRunner = class {
1535
1541
  const totalEvaluations = selectedTestCases.length * repetitions;
1536
1542
  const runConfigTags = [...params.runConfigTags ?? []];
1537
1543
  const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
1544
+ const triggerTimestamp = params.triggerTimestamp ?? Date.now();
1538
1545
  const runId = `run-${randomUUID()}`;
1539
1546
  const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
1540
1547
  const snapshot = {
@@ -1578,6 +1585,7 @@ var EffectRunner = class {
1578
1585
  Queue.offer(this.runQueue, {
1579
1586
  runId,
1580
1587
  triggerId,
1588
+ triggerTimestamp,
1581
1589
  datasetId: params.datasetId,
1582
1590
  dataset: dataset.dataset,
1583
1591
  evaluators: selectedEvaluators,
@@ -2010,6 +2018,7 @@ function RunView({
2010
2018
  runConfigNames,
2011
2019
  concurrency,
2012
2020
  experimentName,
2021
+ triggerTimestamp,
2013
2022
  onComplete
2014
2023
  }) {
2015
2024
  const [phase, setPhase] = useState("loading");
@@ -2180,7 +2189,8 @@ function RunView({
2180
2189
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2181
2190
  jobs,
2182
2191
  globalConcurrency: concurrency,
2183
- experimentName
2192
+ experimentName,
2193
+ triggerTimestamp
2184
2194
  });
2185
2195
  for (let i = 0; i < snapshots.length; i += 1) {
2186
2196
  const snap = snapshots[i];
@@ -2237,7 +2247,7 @@ function RunView({
2237
2247
  setPhase("completed");
2238
2248
  const exitCode = failedTestCases > 0 ? 1 : 0;
2239
2249
  setTimeout(() => onComplete(void 0, exitCode), 200);
2240
- }, [runner, runConfigNames, concurrency, experimentName, onComplete]);
2250
+ }, [runner, runConfigNames, concurrency, experimentName, triggerTimestamp, onComplete]);
2241
2251
  useEffect(() => {
2242
2252
  void runEval();
2243
2253
  }, [runEval]);
@@ -2725,7 +2735,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2725
2735
  }
2726
2736
  return lines;
2727
2737
  }
2728
- async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
2738
+ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName, triggerTimestamp) {
2729
2739
  const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
2730
2740
  if (jobs.length === 0) {
2731
2741
  throw new Error("No jobs expanded from RunConfigs.");
@@ -2935,7 +2945,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency,
2935
2945
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2936
2946
  jobs,
2937
2947
  globalConcurrency: concurrency,
2938
- experimentName
2948
+ experimentName,
2949
+ triggerTimestamp
2939
2950
  });
2940
2951
  for (let i = 0; i < snapshots.length; i += 1) {
2941
2952
  const snap = snapshots[i];
@@ -3035,7 +3046,7 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency,
3035
3046
  }
3036
3047
  return failedTestCasesTotal > 0 ? 1 : 0;
3037
3048
  }
3038
- async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
3049
+ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName, triggerTimestamp) {
3039
3050
  return new Promise((resolve5, reject) => {
3040
3051
  const app = render(
3041
3052
  React.createElement(RunView, {
@@ -3043,6 +3054,7 @@ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, e
3043
3054
  runConfigNames,
3044
3055
  concurrency,
3045
3056
  experimentName,
3057
+ triggerTimestamp,
3046
3058
  onComplete: (err, exitCode) => {
3047
3059
  app.unmount();
3048
3060
  if (err) {
@@ -3100,11 +3112,13 @@ async function main() {
3100
3112
  try {
3101
3113
  if (args.command === "run") {
3102
3114
  const concurrency = args.concurrency ?? getDefaultConcurrency();
3115
+ const triggerTimestamp = Date.now();
3103
3116
  const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
3104
3117
  runner,
3105
3118
  args.runConfigNames,
3106
3119
  concurrency,
3107
- args.experimentName
3120
+ args.experimentName,
3121
+ triggerTimestamp
3108
3122
  );
3109
3123
  if (args.ci && exitCode !== 0) {
3110
3124
  process.exit(1);