@m4trix/evals 0.26.0 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -123,21 +123,21 @@ declare function defineConfig<TConfig extends ConfigType>(factory: M4trixEvalCon
123
123
  declare const defaultRunnerConfig: RunnerConfig;
124
124
  declare function withRunnerConfig(overrides?: RunnerConfigOverrides): RunnerConfig;
125
125
 
126
- /** Matches a tag by exact string equality or regex test */
127
- type TagMatcher = string | RegExp;
128
- /** Matches a file path by glob string or regex test */
129
- type PathMatcher = string | RegExp;
130
-
131
126
  type InputOrBuilder<T> = T | (() => T);
132
127
  interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
133
128
  /**
134
- * Stable id (letters, digits, `_`, `-`).
129
+ * Stable id (letters, digits, `_`, `-`); used in discovery and matching.
135
130
  * For an unrestricted UI label, set {@link displayName}.
136
131
  */
137
132
  name: string;
138
- /** Optional human-readable label for CLI/TUI (any characters). */
133
+ /** Optional human-readable label for CLI/TUI and evaluator args (any characters). */
139
134
  displayName?: string;
140
- tags: string[];
135
+ /**
136
+ * Declared tags on this test case (not `Dataset` filter options). Use `Dataset` `includedTags` /
137
+ * `excludedTags` to select which cases belong to a dataset; evaluators read the resolved tags as
138
+ * `meta.testCaseTags`.
139
+ */
140
+ tags?: ReadonlyArray<string>;
141
141
  inputSchema: TI;
142
142
  input: InputOrBuilder<Schema.Schema.Type<TI>>;
143
143
  outputSchema?: TO;
@@ -161,13 +161,24 @@ declare function getTestCaseDisplayLabel(testCase: {
161
161
  getDisplayLabel?: () => string;
162
162
  getName?: () => string;
163
163
  }): string;
164
- /** Tags for evaluator `args.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
164
+ /** Tags for evaluator `meta.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
165
165
  declare function getTestCaseTagList(testCase: {
166
166
  getTags?: () => ReadonlyArray<string>;
167
167
  }): string[];
168
168
 
169
+ /** Matches a tag by exact string equality or regex test */
170
+ type TagMatcher = string | RegExp;
171
+ /** Matches a file path by glob string or regex test */
172
+ type PathMatcher = string | RegExp;
173
+
169
174
  interface DatasetDefineConfig {
175
+ /**
176
+ * Stable id (letters, digits, `_`, `-`); used for discovery ids and `resolveDatasetByName`.
177
+ * For an unrestricted UI label, set {@link displayName}.
178
+ */
170
179
  name: string;
180
+ /** Optional human-readable label for CLI/TUI (any characters). */
181
+ displayName?: string;
171
182
  includedTags?: TagMatcher[];
172
183
  excludedTags?: TagMatcher[];
173
184
  includedPaths?: PathMatcher[];
@@ -177,13 +188,22 @@ declare class Dataset {
177
188
  private readonly _config;
178
189
  private constructor();
179
190
  static define(config: DatasetDefineConfig): Dataset;
191
+ /** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
180
192
  getName(): string;
193
+ getDisplayName(): string | undefined;
194
+ /** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
195
+ getDisplayLabel(): string;
181
196
  getIncludedTags(): ReadonlyArray<TagMatcher>;
182
197
  getExcludedTags(): ReadonlyArray<TagMatcher>;
183
198
  getIncludedPaths(): ReadonlyArray<PathMatcher>;
184
199
  getExcludedPaths(): ReadonlyArray<PathMatcher>;
185
200
  matchesTestCase(testCase: TestCase<unknown>, filePath: string): boolean;
186
201
  }
202
+ /** CLI / runner: display label for a dataset-shaped object (supports discovery duck-types). */
203
+ declare function getDatasetDisplayLabel(dataset: {
204
+ getDisplayLabel?: () => string;
205
+ getName?: () => string;
206
+ }): string;
187
207
 
188
208
  /**
189
209
  * Options for customizing JSON diff output. Passed to logDiff, createDiffLogEntry, and printJsonDiff.
@@ -259,10 +279,14 @@ interface EvaluateMeta {
259
279
  * for this specific test-case run.
260
280
  */
261
281
  runId: string;
262
- /** Identifier of the dataset currently being evaluated. */
263
- datasetId: string;
282
+ /** Display label for the dataset (`Dataset.getDisplayLabel()`, i.e. `displayName ?? name`). */
283
+ datasetName: string;
264
284
  /** Canonical `RunConfig` name (or `programmatic` for API/TUI-only runs). */
265
285
  runConfigName: string;
286
+ /**
287
+ * Optional label for this invocation (e.g. CLI `--experiment`); omitted when not set.
288
+ */
289
+ experimentName?: string;
266
290
  /**
267
291
  * Stable id shared by every execution of the same logical test case when `repetitionCount > 1`
268
292
  * (and present with count 1 for consistency).
@@ -272,6 +296,15 @@ interface EvaluateMeta {
272
296
  repetitionIndex: number;
273
297
  /** Total scheduled executions for this logical test case in the current run. */
274
298
  repetitionCount: number;
299
+ /** Declared tags on the current test case (`TestCase.describe({ tags })`); empty when none. */
300
+ testCaseTags: string[];
301
+ /**
302
+ * Declared tags on the current run config or programmatic request (`RunConfig.define({ tags })`,
303
+ * `RunDatasetRequest.runConfigTags`); empty when none.
304
+ */
305
+ runConfigTags: string[];
306
+ /** Declared tags on this evaluator (`Evaluator.define({ tags })`); empty when none. */
307
+ evaluatorTags: string[];
275
308
  }
276
309
  interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
277
310
  input: TInput;
@@ -279,12 +312,6 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
279
312
  output?: TOutput;
280
313
  /** Metadata about the current evaluator invocation. */
281
314
  meta: EvaluateMeta;
282
- /** Tags from `TestCase.describe({ tags })` for the current test case. */
283
- testCaseTags: string[];
284
- /** Tags from `RunConfig.define({ tags })` for this job; empty for programmatic runs unless set on the request. */
285
- runConfigTags: string[];
286
- /** Tags from `Evaluator.define({ tags })` for this evaluator. */
287
- evaluatorTags: string[];
288
315
  /** Records a diff for this test case; stored in run artifact and shown by CLI */
289
316
  logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
290
317
  /** Logs a message or object for this test case; stored in run artifact and shown by CLI */
@@ -313,7 +340,10 @@ interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.
313
340
  scoreSchema: TS;
314
341
  passThreshold?: number;
315
342
  passCriterion?: (score: unknown) => boolean;
316
- /** Optional tags for this evaluator; surfaced on every `evaluate` invocation. */
343
+ /**
344
+ * Declared tags for this evaluator (not dataset filter rules); echoed on every `evaluate` call as
345
+ * `meta.evaluatorTags`.
346
+ */
317
347
  tags?: ReadonlyArray<string>;
318
348
  }
319
349
  declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, TCtx = Record<string, never>> {
@@ -345,7 +375,7 @@ declare function getEvaluatorDisplayLabel(evaluator: {
345
375
  getDisplayLabel?: () => string | undefined;
346
376
  getName?: () => string | undefined;
347
377
  }): string | undefined;
348
- /** Tags for evaluator `args.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
378
+ /** Tags for evaluator `meta.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
349
379
  declare function getEvaluatorTagList(evaluator: {
350
380
  getTags?: () => ReadonlyArray<string>;
351
381
  }): string[];
@@ -384,12 +414,16 @@ declare const RunConfigNameSchema: Schema.brand<Schema.filter<Schema.filter<Sche
384
414
  declare const EvaluatorNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "EvaluatorName">;
385
415
  /** Branded id for `TestCase.describe({ name })` (decode with {@link TestCaseNameSchema}). */
386
416
  declare const TestCaseNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "TestCaseName">;
417
+ /** Branded id for `Dataset.define({ name })` (decode with {@link DatasetNameSchema}). */
418
+ declare const DatasetNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "DatasetName">;
387
419
  type RunConfigName = Schema.Schema.Type<typeof RunConfigNameSchema>;
388
420
  type EvaluatorName = Schema.Schema.Type<typeof EvaluatorNameSchema>;
389
421
  type TestCaseName = Schema.Schema.Type<typeof TestCaseNameSchema>;
422
+ type DatasetName = Schema.Schema.Type<typeof DatasetNameSchema>;
390
423
  declare function validateRunConfigName(raw: string, context: string): RunConfigName;
391
424
  declare function validateEvaluatorName(raw: string, context: string): EvaluatorName;
392
425
  declare function validateTestCaseName(raw: string, context: string): TestCaseName;
426
+ declare function validateDatasetName(raw: string, context: string): DatasetName;
393
427
  /** Optional UI label: trim; empty after trim becomes undefined. */
394
428
  declare function normalizeOptionalDisplayName(raw: string | undefined): string | undefined;
395
429
 
@@ -422,7 +456,7 @@ interface RunConfigDefineConfig {
422
456
  name: string;
423
457
  /** Optional human-readable label for CLI/TUI (any characters). */
424
458
  displayName?: string;
425
- /** Optional tags; copied to every evaluation as `runConfigTags` on the evaluator callback. */
459
+ /** Optional declared tags for this run config; copied to every evaluation as `meta.runConfigTags`. */
426
460
  tags?: ReadonlyArray<string>;
427
461
  runs: ReadonlyArray<RunConfigRow>;
428
462
  }
@@ -439,7 +473,7 @@ declare class RunConfig {
439
473
  getDisplayName(): string | undefined;
440
474
  /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
441
475
  getDisplayLabel(): string;
442
- /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
476
+ /** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
443
477
  getTags(): string[];
444
478
  getRuns(): ReadonlyArray<RunConfigRow>;
445
479
  }
@@ -528,7 +562,7 @@ interface RunDatasetJob {
528
562
  */
529
563
  runConfigDisplayLabel?: string;
530
564
  /**
531
- * Tags from `RunConfig.define({ tags })` for this job; forwarded as `runConfigTags` on evaluator callbacks.
565
+ * Tags from `RunConfig.define({ tags })` for this job; forwarded as `meta.runConfigTags` on evaluator callbacks.
532
566
  */
533
567
  runConfigTags?: ReadonlyArray<string>;
534
568
  /** Evaluates each matching test case this many times (default 1). */
@@ -565,9 +599,13 @@ interface RunDatasetRequest {
565
599
  */
566
600
  repetitions?: number;
567
601
  /**
568
- * Optional tags for this run; forwarded as `runConfigTags` on evaluator callbacks (e.g. suite labels).
602
+ * Optional tags for this run; forwarded as `meta.runConfigTags` on evaluator callbacks (e.g. suite labels).
569
603
  */
570
604
  runConfigTags?: ReadonlyArray<string>;
605
+ /**
606
+ * Optional label for this run; forwarded as `experimentName` on evaluator `meta`.
607
+ */
608
+ experimentName?: string;
571
609
  }
572
610
  interface RunSnapshot {
573
611
  runId: string;
@@ -655,11 +693,14 @@ interface RunDatasetJobsWithSharedConcurrencyRequest {
655
693
  jobs: ReadonlyArray<RunDatasetJob>;
656
694
  globalConcurrency: number;
657
695
  triggerId?: string;
696
+ /** Applied to every job in this batch (e.g. CLI `--experiment`). */
697
+ experimentName?: string;
658
698
  }
659
699
  interface RunnerApi {
660
700
  collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
661
701
  collectEvaluators(): Promise<ReadonlyArray<CollectedEvaluator>>;
662
702
  collectRunConfigs(): Promise<ReadonlyArray<CollectedRunConfig>>;
703
+ /** Resolves a dataset by canonical **`Dataset` `name`** (id), case-insensitive. */
663
704
  resolveDatasetByName(name: string): Promise<CollectedDataset | undefined>;
664
705
  resolveEvaluatorsByNamePattern(pattern: string): Promise<ReadonlyArray<CollectedEvaluator>>;
665
706
  /**
@@ -732,4 +773,4 @@ declare class TagSet {
732
773
  static define<const T extends readonly string[]>(tags: T): TagSetMembers<T>;
733
774
  }
734
775
 
735
- export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedRunConfig, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorName, EvaluatorNameSchema, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PROGRAMMATIC_RUN_CONFIG, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunConfig, RunConfigDefineConfig, RunConfigName, RunConfigNameSchema, RunConfigRow, RunConfigRowEvaluators, RunConfigRowPattern, RunDatasetJob, RunDatasetJobsWithSharedConcurrencyRequest, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TagSet, TagSetMembers, TestCase, TestCaseName, TestCaseNameSchema, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
776
+ export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedRunConfig, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DatasetDefineConfig, DatasetName, DatasetNameSchema, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorName, EvaluatorNameSchema, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PROGRAMMATIC_RUN_CONFIG, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunConfig, RunConfigDefineConfig, RunConfigName, RunConfigNameSchema, RunConfigRow, RunConfigRowEvaluators, RunConfigRowPattern, RunDatasetJob, RunDatasetJobsWithSharedConcurrencyRequest, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TagSet, TagSetMembers, TestCase, TestCaseName, TestCaseNameSchema, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getDatasetDisplayLabel, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateDatasetName, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
package/dist/index.js CHANGED
@@ -26,6 +26,7 @@ function makeEntityIdSchema(brand, label) {
26
26
  var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
27
27
  var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
28
28
  var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
29
+ var DatasetNameSchema = makeEntityIdSchema("DatasetName", "Dataset name");
29
30
  function validateWithSchema(schema, raw, context) {
30
31
  const trimmed = raw.trim();
31
32
  const decode = Schema.decodeUnknownEither(
@@ -46,6 +47,9 @@ function validateEvaluatorName(raw, context) {
46
47
  function validateTestCaseName(raw, context) {
47
48
  return validateWithSchema(TestCaseNameSchema, raw, context);
48
49
  }
50
+ function validateDatasetName(raw, context) {
51
+ return validateWithSchema(DatasetNameSchema, raw, context);
52
+ }
49
53
  function normalizeOptionalDisplayName(raw) {
50
54
  if (raw === void 0) {
51
55
  return void 0;
@@ -54,6 +58,87 @@ function normalizeOptionalDisplayName(raw) {
54
58
  return t.length === 0 ? void 0 : t;
55
59
  }
56
60
 
61
+ // src/evals/dataset.ts
62
+ function matchesAny(value, matchers) {
63
+ return matchers.some(
64
+ (matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
65
+ );
66
+ }
67
+ function matchesAnyPath(filePath, matchers) {
68
+ return matchers.some((matcher) => {
69
+ if (typeof matcher === "string") {
70
+ return simpleGlobMatch(matcher, filePath);
71
+ }
72
+ return matcher.test(filePath);
73
+ });
74
+ }
75
+ function simpleGlobMatch(pattern, value) {
76
+ const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
77
+ return new RegExp(`^${escaped}$`).test(value);
78
+ }
79
+ var Dataset = class _Dataset {
80
+ constructor(config) {
81
+ this._config = config;
82
+ }
83
+ static define(config) {
84
+ const name = validateDatasetName(config.name, "Dataset.define");
85
+ const displayName = normalizeOptionalDisplayName(config.displayName);
86
+ return new _Dataset({
87
+ name,
88
+ displayName,
89
+ includedTags: config.includedTags ?? [],
90
+ excludedTags: config.excludedTags ?? [],
91
+ includedPaths: config.includedPaths ?? [],
92
+ excludedPaths: config.excludedPaths ?? []
93
+ });
94
+ }
95
+ /** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
96
+ getName() {
97
+ return this._config.name;
98
+ }
99
+ getDisplayName() {
100
+ return this._config.displayName;
101
+ }
102
+ /** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
103
+ getDisplayLabel() {
104
+ return this._config.displayName ?? this._config.name;
105
+ }
106
+ getIncludedTags() {
107
+ return this._config.includedTags;
108
+ }
109
+ getExcludedTags() {
110
+ return this._config.excludedTags;
111
+ }
112
+ getIncludedPaths() {
113
+ return this._config.includedPaths;
114
+ }
115
+ getExcludedPaths() {
116
+ return this._config.excludedPaths;
117
+ }
118
+ matchesTestCase(testCase, filePath) {
119
+ const tags = testCase.getTags();
120
+ if (this._config.excludedTags.length > 0) {
121
+ if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
122
+ return false;
123
+ }
124
+ }
125
+ if (this._config.excludedPaths.length > 0) {
126
+ if (matchesAnyPath(filePath, this._config.excludedPaths)) {
127
+ return false;
128
+ }
129
+ }
130
+ const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
131
+ const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
132
+ return tagMatch && pathMatch;
133
+ }
134
+ };
135
+ function getDatasetDisplayLabel(dataset) {
136
+ if (typeof dataset.getDisplayLabel === "function") {
137
+ return dataset.getDisplayLabel();
138
+ }
139
+ return typeof dataset.getName === "function" ? dataset.getName() : "";
140
+ }
141
+
57
142
  // src/evals/evaluator.ts
58
143
  var Evaluator = class _Evaluator {
59
144
  constructor(config) {
@@ -413,7 +498,7 @@ function toEvalDataset(item, snapshots) {
413
498
  const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
414
499
  return {
415
500
  id: item.id,
416
- name: item.dataset.getName(),
501
+ name: getDatasetDisplayLabel(item.dataset),
417
502
  overview: `Discovered from ${item.filePath}`,
418
503
  runs
419
504
  };
@@ -466,70 +551,6 @@ function parseStartupArgs(argv) {
466
551
  }
467
552
  return args;
468
553
  }
469
-
470
- // src/evals/dataset.ts
471
- function matchesAny(value, matchers) {
472
- return matchers.some(
473
- (matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
474
- );
475
- }
476
- function matchesAnyPath(filePath, matchers) {
477
- return matchers.some((matcher) => {
478
- if (typeof matcher === "string") {
479
- return simpleGlobMatch(matcher, filePath);
480
- }
481
- return matcher.test(filePath);
482
- });
483
- }
484
- function simpleGlobMatch(pattern, value) {
485
- const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
486
- return new RegExp(`^${escaped}$`).test(value);
487
- }
488
- var Dataset = class _Dataset {
489
- constructor(config) {
490
- this._config = config;
491
- }
492
- static define(config) {
493
- return new _Dataset({
494
- name: config.name,
495
- includedTags: config.includedTags ?? [],
496
- excludedTags: config.excludedTags ?? [],
497
- includedPaths: config.includedPaths ?? [],
498
- excludedPaths: config.excludedPaths ?? []
499
- });
500
- }
501
- getName() {
502
- return this._config.name;
503
- }
504
- getIncludedTags() {
505
- return this._config.includedTags;
506
- }
507
- getExcludedTags() {
508
- return this._config.excludedTags;
509
- }
510
- getIncludedPaths() {
511
- return this._config.includedPaths;
512
- }
513
- getExcludedPaths() {
514
- return this._config.excludedPaths;
515
- }
516
- matchesTestCase(testCase, filePath) {
517
- const tags = testCase.getTags();
518
- if (this._config.excludedTags.length > 0) {
519
- if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
520
- return false;
521
- }
522
- }
523
- if (this._config.excludedPaths.length > 0) {
524
- if (matchesAnyPath(filePath, this._config.excludedPaths)) {
525
- return false;
526
- }
527
- }
528
- const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
529
- const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
530
- return tagMatch && pathMatch;
531
- }
532
- };
533
554
  function preprocessForDiff(value, options) {
534
555
  if (options?.sort && Array.isArray(value)) {
535
556
  return [...value].sort((a, b) => {
@@ -795,7 +816,7 @@ var RunConfig = class _RunConfig {
795
816
  getDisplayLabel() {
796
817
  return this._displayName ?? this._name;
797
818
  }
798
- /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
819
+ /** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
799
820
  getTags() {
800
821
  return [...this._tags];
801
822
  }
@@ -968,10 +989,11 @@ var TestCase = class _TestCase {
968
989
  static describe(config) {
969
990
  const name = validateTestCaseName(config.name, "TestCase.describe");
970
991
  const displayName = normalizeOptionalDisplayName(config.displayName);
992
+ const tags = config.tags !== void 0 ? [...config.tags] : [];
971
993
  return new _TestCase({
972
994
  name,
973
995
  displayName,
974
- tags: config.tags,
996
+ tags,
975
997
  inputSchema: config.inputSchema,
976
998
  input: config.input,
977
999
  outputSchema: config.outputSchema,
@@ -988,7 +1010,7 @@ var TestCase = class _TestCase {
988
1010
  return this._config.displayName ?? this._config.name;
989
1011
  }
990
1012
  getTags() {
991
- return this._config.tags;
1013
+ return [...this._config.tags];
992
1014
  }
993
1015
  getInputSchema() {
994
1016
  return this._config.inputSchema;
@@ -1545,15 +1567,16 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1545
1567
  meta: {
1546
1568
  triggerId: task.triggerId,
1547
1569
  runId: evaluatorRunId,
1548
- datasetId: task.datasetId,
1570
+ datasetName: task.dataset.getDisplayLabel(),
1549
1571
  repetitionId,
1550
1572
  repetitionIndex,
1551
1573
  repetitionCount,
1552
- runConfigName: task.runConfigName
1574
+ runConfigName: task.runConfigName,
1575
+ ...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
1576
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1577
+ runConfigTags: task.runConfigTags,
1578
+ evaluatorTags: getEvaluatorTagList(evaluator)
1553
1579
  },
1554
- testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1555
- runConfigTags: task.runConfigTags,
1556
- evaluatorTags: getEvaluatorTagList(evaluator),
1557
1580
  logDiff,
1558
1581
  log,
1559
1582
  createError
@@ -1960,7 +1983,7 @@ var EffectRunner = class {
1960
1983
  );
1961
1984
  if (!dsCollected) {
1962
1985
  throw new Error(
1963
- `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1986
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1964
1987
  );
1965
1988
  }
1966
1989
  let evaluatorIds;
@@ -2032,7 +2055,8 @@ var EffectRunner = class {
2032
2055
  globalEvaluationSemaphore: sem,
2033
2056
  runConfigName: job.runConfigName,
2034
2057
  runConfigTags: job.runConfigTags,
2035
- repetitions: job.repetitions
2058
+ repetitions: job.repetitions,
2059
+ experimentName: request.experimentName
2036
2060
  })
2037
2061
  );
2038
2062
  }
@@ -2067,7 +2091,8 @@ var EffectRunner = class {
2067
2091
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2068
2092
  repetitions: request.repetitions,
2069
2093
  runConfigName,
2070
- runConfigTags: request.runConfigTags
2094
+ runConfigTags: request.runConfigTags,
2095
+ experimentName: request.experimentName
2071
2096
  });
2072
2097
  }
2073
2098
  async startDatasetRun(params) {
@@ -2095,7 +2120,7 @@ var EffectRunner = class {
2095
2120
  const snapshot = {
2096
2121
  runId,
2097
2122
  datasetId: params.datasetId,
2098
- datasetName: dataset.dataset.getName(),
2123
+ datasetName: dataset.dataset.getDisplayLabel(),
2099
2124
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2100
2125
  queuedAt: Date.now(),
2101
2126
  totalTestCases: totalEvaluations,
@@ -2116,7 +2141,7 @@ var EffectRunner = class {
2116
2141
  type: "RunQueued",
2117
2142
  runId,
2118
2143
  datasetId: params.datasetId,
2119
- datasetName: dataset.dataset.getName(),
2144
+ datasetName: dataset.dataset.getDisplayLabel(),
2120
2145
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2121
2146
  totalTestCases: totalEvaluations,
2122
2147
  artifactPath
@@ -2142,7 +2167,8 @@ var EffectRunner = class {
2142
2167
  globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2143
2168
  runConfigName: params.runConfigName,
2144
2169
  runConfigTags,
2145
- repetitions
2170
+ repetitions,
2171
+ experimentName: params.experimentName
2146
2172
  })
2147
2173
  );
2148
2174
  return snapshot;
@@ -2219,6 +2245,6 @@ var PROGRAMMATIC_RUN_CONFIG = {
2219
2245
  runConfigName: "programmatic"
2220
2246
  };
2221
2247
 
2222
- export { Dataset, Evaluator, EvaluatorNameSchema, Metric, PROGRAMMATIC_RUN_CONFIG, RunConfig, RunConfigNameSchema, Score, TagSet, TestCase, TestCaseNameSchema, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
2248
+ export { Dataset, DatasetNameSchema, Evaluator, EvaluatorNameSchema, Metric, PROGRAMMATIC_RUN_CONFIG, RunConfig, RunConfigNameSchema, Score, TagSet, TestCase, TestCaseNameSchema, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getDatasetDisplayLabel, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateDatasetName, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
2223
2249
  //# sourceMappingURL=out.js.map
2224
2250
  //# sourceMappingURL=index.js.map