@m4trix/evals 0.27.0 → 0.28.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -132,7 +132,12 @@ interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema
132
132
  name: string;
133
133
  /** Optional human-readable label for CLI/TUI and evaluator args (any characters). */
134
134
  displayName?: string;
135
- tags: string[];
135
+ /**
136
+ * Declared tags on this test case (not `Dataset` filter options). Use `Dataset` `includedTags` /
137
+ * `excludedTags` to select which cases belong to a dataset; evaluators read the resolved tags as
138
+ * `meta.testCaseTags`.
139
+ */
140
+ tags?: ReadonlyArray<string>;
136
141
  inputSchema: TI;
137
142
  input: InputOrBuilder<Schema.Schema.Type<TI>>;
138
143
  outputSchema?: TO;
@@ -156,7 +161,7 @@ declare function getTestCaseDisplayLabel(testCase: {
156
161
  getDisplayLabel?: () => string;
157
162
  getName?: () => string;
158
163
  }): string;
159
- /** Tags for evaluator `args.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
164
+ /** Tags for evaluator `meta.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
160
165
  declare function getTestCaseTagList(testCase: {
161
166
  getTags?: () => ReadonlyArray<string>;
162
167
  }): string[];
@@ -278,6 +283,10 @@ interface EvaluateMeta {
278
283
  datasetName: string;
279
284
  /** Canonical `RunConfig` name (or `programmatic` for API/TUI-only runs). */
280
285
  runConfigName: string;
286
+ /**
287
+ * Optional label for this invocation (e.g. CLI `--experiment`); omitted when not set.
288
+ */
289
+ experimentName?: string;
281
290
  /**
282
291
  * Stable id shared by every execution of the same logical test case when `repetitionCount > 1`
283
292
  * (and present with count 1 for consistency).
@@ -287,6 +296,15 @@ interface EvaluateMeta {
287
296
  repetitionIndex: number;
288
297
  /** Total scheduled executions for this logical test case in the current run. */
289
298
  repetitionCount: number;
299
+ /** Declared tags on the current test case (`TestCase.describe({ tags })`); empty when none. */
300
+ testCaseTags: string[];
301
+ /**
302
+ * Declared tags on the current run config or programmatic request (`RunConfig.define({ tags })`,
303
+ * `RunDatasetRequest.runConfigTags`); empty when none.
304
+ */
305
+ runConfigTags: string[];
306
+ /** Declared tags on this evaluator (`Evaluator.define({ tags })`); empty when none. */
307
+ evaluatorTags: string[];
290
308
  }
291
309
  interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
292
310
  input: TInput;
@@ -294,12 +312,6 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
294
312
  output?: TOutput;
295
313
  /** Metadata about the current evaluator invocation. */
296
314
  meta: EvaluateMeta;
297
- /** Tags from `TestCase.describe({ tags })` for the current test case. */
298
- testCaseTags: string[];
299
- /** Tags from `RunConfig.define({ tags })` for this job; empty for programmatic runs unless set on the request. */
300
- runConfigTags: string[];
301
- /** Tags from `Evaluator.define({ tags })` for this evaluator. */
302
- evaluatorTags: string[];
303
315
  /** Records a diff for this test case; stored in run artifact and shown by CLI */
304
316
  logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
305
317
  /** Logs a message or object for this test case; stored in run artifact and shown by CLI */
@@ -328,7 +340,10 @@ interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.
328
340
  scoreSchema: TS;
329
341
  passThreshold?: number;
330
342
  passCriterion?: (score: unknown) => boolean;
331
- /** Optional tags for this evaluator; surfaced on every `evaluate` invocation. */
343
+ /**
344
+ * Declared tags for this evaluator (not dataset filter rules); echoed on every `evaluate` call as
345
+ * `meta.evaluatorTags`.
346
+ */
332
347
  tags?: ReadonlyArray<string>;
333
348
  }
334
349
  declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, TCtx = Record<string, never>> {
@@ -360,7 +375,7 @@ declare function getEvaluatorDisplayLabel(evaluator: {
360
375
  getDisplayLabel?: () => string | undefined;
361
376
  getName?: () => string | undefined;
362
377
  }): string | undefined;
363
- /** Tags for evaluator `args.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
378
+ /** Tags for evaluator `meta.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
364
379
  declare function getEvaluatorTagList(evaluator: {
365
380
  getTags?: () => ReadonlyArray<string>;
366
381
  }): string[];
@@ -441,7 +456,7 @@ interface RunConfigDefineConfig {
441
456
  name: string;
442
457
  /** Optional human-readable label for CLI/TUI (any characters). */
443
458
  displayName?: string;
444
- /** Optional tags; copied to every evaluation as `runConfigTags` on the evaluator callback. */
459
+ /** Optional declared tags for this run config; copied to every evaluation as `meta.runConfigTags`. */
445
460
  tags?: ReadonlyArray<string>;
446
461
  runs: ReadonlyArray<RunConfigRow>;
447
462
  }
@@ -458,7 +473,7 @@ declare class RunConfig {
458
473
  getDisplayName(): string | undefined;
459
474
  /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
460
475
  getDisplayLabel(): string;
461
- /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
476
+ /** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
462
477
  getTags(): string[];
463
478
  getRuns(): ReadonlyArray<RunConfigRow>;
464
479
  }
@@ -547,7 +562,7 @@ interface RunDatasetJob {
547
562
  */
548
563
  runConfigDisplayLabel?: string;
549
564
  /**
550
- * Tags from `RunConfig.define({ tags })` for this job; forwarded as `runConfigTags` on evaluator callbacks.
565
+ * Tags from `RunConfig.define({ tags })` for this job; forwarded as `meta.runConfigTags` on evaluator callbacks.
551
566
  */
552
567
  runConfigTags?: ReadonlyArray<string>;
553
568
  /** Evaluates each matching test case this many times (default 1). */
@@ -584,9 +599,13 @@ interface RunDatasetRequest {
584
599
  */
585
600
  repetitions?: number;
586
601
  /**
587
- * Optional tags for this run; forwarded as `runConfigTags` on evaluator callbacks (e.g. suite labels).
602
+ * Optional tags for this run; forwarded as `meta.runConfigTags` on evaluator callbacks (e.g. suite labels).
588
603
  */
589
604
  runConfigTags?: ReadonlyArray<string>;
605
+ /**
606
+ * Optional label for this run; forwarded as `experimentName` on evaluator `meta`.
607
+ */
608
+ experimentName?: string;
590
609
  }
591
610
  interface RunSnapshot {
592
611
  runId: string;
@@ -674,6 +693,8 @@ interface RunDatasetJobsWithSharedConcurrencyRequest {
674
693
  jobs: ReadonlyArray<RunDatasetJob>;
675
694
  globalConcurrency: number;
676
695
  triggerId?: string;
696
+ /** Applied to every job in this batch (e.g. CLI `--experiment`). */
697
+ experimentName?: string;
677
698
  }
678
699
  interface RunnerApi {
679
700
  collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
package/dist/index.js CHANGED
@@ -816,7 +816,7 @@ var RunConfig = class _RunConfig {
816
816
  getDisplayLabel() {
817
817
  return this._displayName ?? this._name;
818
818
  }
819
- /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
819
+ /** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
820
820
  getTags() {
821
821
  return [...this._tags];
822
822
  }
@@ -989,10 +989,11 @@ var TestCase = class _TestCase {
989
989
  static describe(config) {
990
990
  const name = validateTestCaseName(config.name, "TestCase.describe");
991
991
  const displayName = normalizeOptionalDisplayName(config.displayName);
992
+ const tags = config.tags !== void 0 ? [...config.tags] : [];
992
993
  return new _TestCase({
993
994
  name,
994
995
  displayName,
995
- tags: config.tags,
996
+ tags,
996
997
  inputSchema: config.inputSchema,
997
998
  input: config.input,
998
999
  outputSchema: config.outputSchema,
@@ -1009,7 +1010,7 @@ var TestCase = class _TestCase {
1009
1010
  return this._config.displayName ?? this._config.name;
1010
1011
  }
1011
1012
  getTags() {
1012
- return this._config.tags;
1013
+ return [...this._config.tags];
1013
1014
  }
1014
1015
  getInputSchema() {
1015
1016
  return this._config.inputSchema;
@@ -1570,11 +1571,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1570
1571
  repetitionId,
1571
1572
  repetitionIndex,
1572
1573
  repetitionCount,
1573
- runConfigName: task.runConfigName
1574
+ runConfigName: task.runConfigName,
1575
+ ...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
1576
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1577
+ runConfigTags: task.runConfigTags,
1578
+ evaluatorTags: getEvaluatorTagList(evaluator)
1574
1579
  },
1575
- testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1576
- runConfigTags: task.runConfigTags,
1577
- evaluatorTags: getEvaluatorTagList(evaluator),
1578
1580
  logDiff,
1579
1581
  log,
1580
1582
  createError
@@ -2053,7 +2055,8 @@ var EffectRunner = class {
2053
2055
  globalEvaluationSemaphore: sem,
2054
2056
  runConfigName: job.runConfigName,
2055
2057
  runConfigTags: job.runConfigTags,
2056
- repetitions: job.repetitions
2058
+ repetitions: job.repetitions,
2059
+ experimentName: request.experimentName
2057
2060
  })
2058
2061
  );
2059
2062
  }
@@ -2088,7 +2091,8 @@ var EffectRunner = class {
2088
2091
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2089
2092
  repetitions: request.repetitions,
2090
2093
  runConfigName,
2091
- runConfigTags: request.runConfigTags
2094
+ runConfigTags: request.runConfigTags,
2095
+ experimentName: request.experimentName
2092
2096
  });
2093
2097
  }
2094
2098
  async startDatasetRun(params) {
@@ -2163,7 +2167,8 @@ var EffectRunner = class {
2163
2167
  globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2164
2168
  runConfigName: params.runConfigName,
2165
2169
  runConfigTags,
2166
- repetitions
2170
+ repetitions,
2171
+ experimentName: params.experimentName
2167
2172
  })
2168
2173
  );
2169
2174
  return snapshot;