@m4trix/evals 0.16.0 → 0.17.0

This diff shows the content changes between publicly released versions of this package as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -234,10 +234,23 @@ interface EvalMiddleware<TCtx> {
234
234
  name: string;
235
235
  resolve: () => TCtx | Promise<TCtx>;
236
236
  }
237
+ interface EvaluateMeta {
238
+ /** Identifier of the trigger that started the run (for example, a CLI invocation). */
239
+ triggerId: string;
240
+ /**
241
+ * Identifier of the current test-case execution shared across all evaluators
242
+ * for this specific test-case run.
243
+ */
244
+ runId: string;
245
+ /** Identifier of the dataset currently being evaluated. */
246
+ datasetId: string;
247
+ }
237
248
  interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
238
249
  input: TInput;
239
250
  ctx: TCtx;
240
251
  output?: TOutput;
252
+ /** Metadata about the current evaluator invocation. */
253
+ meta: EvaluateMeta;
241
254
  /** Records a diff for this test case; stored in run artifact and shown by CLI */
242
255
  logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
243
256
  /** Logs a message or object for this test case; stored in run artifact and shown by CLI */
@@ -349,6 +362,11 @@ interface SearchTestCasesQuery {
349
362
  excludedPaths?: ReadonlyArray<string | RegExp>;
350
363
  }
351
364
  interface RunDatasetRequest {
365
+ /**
366
+ * Identifier for what triggered the run request (for example, a CLI command).
367
+ * When omitted, the runner generates one in the format `trg-[uuid]`.
368
+ */
369
+ triggerId?: string;
352
370
  datasetId: string;
353
371
  evaluatorIds: ReadonlyArray<string>;
354
372
  concurrency?: number;
@@ -469,4 +487,4 @@ interface BinaryScoreData {
469
487
  }
470
488
  declare const binaryScore: ScoreDef<BinaryScoreData>;
471
489
 
472
- export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
490
+ export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
package/dist/index.js CHANGED
@@ -1035,6 +1035,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1035
1035
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1036
1036
  const rerunPassed = [];
1037
1037
  for (let r = 0; r < reruns; r++) {
1038
+ const evaluatorRunId = `run-${randomUUID()}`;
1038
1039
  const started = Date.now();
1039
1040
  const evaluatorScores = [];
1040
1041
  let testCaseError;
@@ -1061,6 +1062,11 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1061
1062
  input: testCaseItem.testCase.getInput(),
1062
1063
  ctx,
1063
1064
  output,
1065
+ meta: {
1066
+ triggerId: task.triggerId,
1067
+ runId: evaluatorRunId,
1068
+ datasetId: task.datasetId
1069
+ },
1064
1070
  logDiff,
1065
1071
  log
1066
1072
  })
@@ -1520,6 +1526,7 @@ var EffectRunner = class {
1520
1526
  (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1521
1527
  0
1522
1528
  );
1529
+ const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
1523
1530
  const runId = `run-${randomUUID()}`;
1524
1531
  const artifactPath = createArtifactPath(
1525
1532
  this.config.artifactDirectory,
@@ -1561,6 +1568,7 @@ var EffectRunner = class {
1561
1568
  await Effect.runPromise(
1562
1569
  Queue.offer(this.runQueue, {
1563
1570
  runId,
1571
+ triggerId,
1564
1572
  datasetId: request.datasetId,
1565
1573
  dataset: dataset.dataset,
1566
1574
  evaluators: selectedEvaluators,