@m4trix/evals 0.27.0 → 0.29.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in the public registry.
package/dist/index.d.ts CHANGED
@@ -132,7 +132,12 @@ interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema
132
132
  name: string;
133
133
  /** Optional human-readable label for CLI/TUI and evaluator args (any characters). */
134
134
  displayName?: string;
135
- tags: string[];
135
+ /**
136
+ * Declared tags on this test case (not `Dataset` filter options). Use `Dataset` `includedTags` /
137
+ * `excludedTags` to select which cases belong to a dataset; evaluators read the resolved tags as
138
+ * `meta.testCaseTags`.
139
+ */
140
+ tags?: ReadonlyArray<string>;
136
141
  inputSchema: TI;
137
142
  input: InputOrBuilder<Schema.Schema.Type<TI>>;
138
143
  outputSchema?: TO;
@@ -156,7 +161,7 @@ declare function getTestCaseDisplayLabel(testCase: {
156
161
  getDisplayLabel?: () => string;
157
162
  getName?: () => string;
158
163
  }): string;
159
- /** Tags for evaluator `args.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
164
+ /** Tags for evaluator `meta.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
160
165
  declare function getTestCaseTagList(testCase: {
161
166
  getTags?: () => ReadonlyArray<string>;
162
167
  }): string[];
@@ -276,8 +281,16 @@ interface EvaluateMeta {
276
281
  runId: string;
277
282
  /** Display label for the dataset (`Dataset.getDisplayLabel()`, i.e. `displayName ?? name`). */
278
283
  datasetName: string;
284
+ /** Discovery id for the current test case (same as runner events’ `testCaseId`). */
285
+ testCaseId: string;
286
+ /** Display label for the test case (`TestCase.getDisplayLabel()`, i.e. `displayName ?? name`). */
287
+ testCaseName: string;
279
288
  /** Canonical `RunConfig` name (or `programmatic` for API/TUI-only runs). */
280
289
  runConfigName: string;
290
+ /**
291
+ * Optional label for this invocation (e.g. CLI `--experiment`); omitted when not set.
292
+ */
293
+ experimentName?: string;
281
294
  /**
282
295
  * Stable id shared by every execution of the same logical test case when `repetitionCount > 1`
283
296
  * (and present with count 1 for consistency).
@@ -287,6 +300,15 @@ interface EvaluateMeta {
287
300
  repetitionIndex: number;
288
301
  /** Total scheduled executions for this logical test case in the current run. */
289
302
  repetitionCount: number;
303
+ /** Declared tags on the current test case (`TestCase.describe({ tags })`); empty when none. */
304
+ testCaseTags: string[];
305
+ /**
306
+ * Declared tags on the current run config or programmatic request (`RunConfig.define({ tags })`,
307
+ * `RunDatasetRequest.runConfigTags`); empty when none.
308
+ */
309
+ runConfigTags: string[];
310
+ /** Declared tags on this evaluator (`Evaluator.define({ tags })`); empty when none. */
311
+ evaluatorTags: string[];
290
312
  }
291
313
  interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
292
314
  input: TInput;
@@ -294,12 +316,6 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
294
316
  output?: TOutput;
295
317
  /** Metadata about the current evaluator invocation. */
296
318
  meta: EvaluateMeta;
297
- /** Tags from `TestCase.describe({ tags })` for the current test case. */
298
- testCaseTags: string[];
299
- /** Tags from `RunConfig.define({ tags })` for this job; empty for programmatic runs unless set on the request. */
300
- runConfigTags: string[];
301
- /** Tags from `Evaluator.define({ tags })` for this evaluator. */
302
- evaluatorTags: string[];
303
319
  /** Records a diff for this test case; stored in run artifact and shown by CLI */
304
320
  logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
305
321
  /** Logs a message or object for this test case; stored in run artifact and shown by CLI */
@@ -328,7 +344,10 @@ interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.
328
344
  scoreSchema: TS;
329
345
  passThreshold?: number;
330
346
  passCriterion?: (score: unknown) => boolean;
331
- /** Optional tags for this evaluator; surfaced on every `evaluate` invocation. */
347
+ /**
348
+ * Declared tags for this evaluator (not dataset filter rules); echoed on every `evaluate` call as
349
+ * `meta.evaluatorTags`.
350
+ */
332
351
  tags?: ReadonlyArray<string>;
333
352
  }
334
353
  declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, TCtx = Record<string, never>> {
@@ -360,7 +379,7 @@ declare function getEvaluatorDisplayLabel(evaluator: {
360
379
  getDisplayLabel?: () => string | undefined;
361
380
  getName?: () => string | undefined;
362
381
  }): string | undefined;
363
- /** Tags for evaluator `args.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
382
+ /** Tags for evaluator `meta.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
364
383
  declare function getEvaluatorTagList(evaluator: {
365
384
  getTags?: () => ReadonlyArray<string>;
366
385
  }): string[];
@@ -441,7 +460,7 @@ interface RunConfigDefineConfig {
441
460
  name: string;
442
461
  /** Optional human-readable label for CLI/TUI (any characters). */
443
462
  displayName?: string;
444
- /** Optional tags; copied to every evaluation as `runConfigTags` on the evaluator callback. */
463
+ /** Optional declared tags for this run config; copied to every evaluation as `meta.runConfigTags`. */
445
464
  tags?: ReadonlyArray<string>;
446
465
  runs: ReadonlyArray<RunConfigRow>;
447
466
  }
@@ -458,7 +477,7 @@ declare class RunConfig {
458
477
  getDisplayName(): string | undefined;
459
478
  /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
460
479
  getDisplayLabel(): string;
461
- /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
480
+ /** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
462
481
  getTags(): string[];
463
482
  getRuns(): ReadonlyArray<RunConfigRow>;
464
483
  }
@@ -547,7 +566,7 @@ interface RunDatasetJob {
547
566
  */
548
567
  runConfigDisplayLabel?: string;
549
568
  /**
550
- * Tags from `RunConfig.define({ tags })` for this job; forwarded as `runConfigTags` on evaluator callbacks.
569
+ * Tags from `RunConfig.define({ tags })` for this job; forwarded as `meta.runConfigTags` on evaluator callbacks.
551
570
  */
552
571
  runConfigTags?: ReadonlyArray<string>;
553
572
  /** Evaluates each matching test case this many times (default 1). */
@@ -584,9 +603,13 @@ interface RunDatasetRequest {
584
603
  */
585
604
  repetitions?: number;
586
605
  /**
587
- * Optional tags for this run; forwarded as `runConfigTags` on evaluator callbacks (e.g. suite labels).
606
+ * Optional tags for this run; forwarded as `meta.runConfigTags` on evaluator callbacks (e.g. suite labels).
588
607
  */
589
608
  runConfigTags?: ReadonlyArray<string>;
609
+ /**
610
+ * Optional label for this run; forwarded as `experimentName` on evaluator `meta`.
611
+ */
612
+ experimentName?: string;
590
613
  }
591
614
  interface RunSnapshot {
592
615
  runId: string;
@@ -674,6 +697,8 @@ interface RunDatasetJobsWithSharedConcurrencyRequest {
674
697
  jobs: ReadonlyArray<RunDatasetJob>;
675
698
  globalConcurrency: number;
676
699
  triggerId?: string;
700
+ /** Applied to every job in this batch (e.g. CLI `--experiment`). */
701
+ experimentName?: string;
677
702
  }
678
703
  interface RunnerApi {
679
704
  collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
package/dist/index.js CHANGED
@@ -816,7 +816,7 @@ var RunConfig = class _RunConfig {
816
816
  getDisplayLabel() {
817
817
  return this._displayName ?? this._name;
818
818
  }
819
- /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
819
+ /** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
820
820
  getTags() {
821
821
  return [...this._tags];
822
822
  }
@@ -989,10 +989,11 @@ var TestCase = class _TestCase {
989
989
  static describe(config) {
990
990
  const name = validateTestCaseName(config.name, "TestCase.describe");
991
991
  const displayName = normalizeOptionalDisplayName(config.displayName);
992
+ const tags = config.tags !== void 0 ? [...config.tags] : [];
992
993
  return new _TestCase({
993
994
  name,
994
995
  displayName,
995
- tags: config.tags,
996
+ tags,
996
997
  inputSchema: config.inputSchema,
997
998
  input: config.input,
998
999
  outputSchema: config.outputSchema,
@@ -1009,7 +1010,7 @@ var TestCase = class _TestCase {
1009
1010
  return this._config.displayName ?? this._config.name;
1010
1011
  }
1011
1012
  getTags() {
1012
- return this._config.tags;
1013
+ return [...this._config.tags];
1013
1014
  }
1014
1015
  getInputSchema() {
1015
1016
  return this._config.inputSchema;
@@ -1567,14 +1568,17 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1567
1568
  triggerId: task.triggerId,
1568
1569
  runId: evaluatorRunId,
1569
1570
  datasetName: task.dataset.getDisplayLabel(),
1571
+ testCaseId: testCaseItem.id,
1572
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1570
1573
  repetitionId,
1571
1574
  repetitionIndex,
1572
1575
  repetitionCount,
1573
- runConfigName: task.runConfigName
1576
+ runConfigName: task.runConfigName,
1577
+ ...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
1578
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1579
+ runConfigTags: task.runConfigTags,
1580
+ evaluatorTags: getEvaluatorTagList(evaluator)
1574
1581
  },
1575
- testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1576
- runConfigTags: task.runConfigTags,
1577
- evaluatorTags: getEvaluatorTagList(evaluator),
1578
1582
  logDiff,
1579
1583
  log,
1580
1584
  createError
@@ -2053,7 +2057,8 @@ var EffectRunner = class {
2053
2057
  globalEvaluationSemaphore: sem,
2054
2058
  runConfigName: job.runConfigName,
2055
2059
  runConfigTags: job.runConfigTags,
2056
- repetitions: job.repetitions
2060
+ repetitions: job.repetitions,
2061
+ experimentName: request.experimentName
2057
2062
  })
2058
2063
  );
2059
2064
  }
@@ -2088,7 +2093,8 @@ var EffectRunner = class {
2088
2093
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2089
2094
  repetitions: request.repetitions,
2090
2095
  runConfigName,
2091
- runConfigTags: request.runConfigTags
2096
+ runConfigTags: request.runConfigTags,
2097
+ experimentName: request.experimentName
2092
2098
  });
2093
2099
  }
2094
2100
  async startDatasetRun(params) {
@@ -2163,7 +2169,8 @@ var EffectRunner = class {
2163
2169
  globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2164
2170
  runConfigName: params.runConfigName,
2165
2171
  runConfigTags,
2166
- repetitions
2172
+ repetitions,
2173
+ experimentName: params.experimentName
2167
2174
  })
2168
2175
  );
2169
2176
  return snapshot;