npm - @m4trix/evals - Versions diffs - 0.26.0 → 0.28.0 - Mend

@m4trix/evals 0.26.0 → 0.28.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (15)

package/dist/index.d.ts CHANGED Viewed

@@ -123,21 +123,21 @@ declare function defineConfig<TConfig extends ConfigType>(factory: M4trixEvalCon
 declare const defaultRunnerConfig: RunnerConfig;
 declare function withRunnerConfig(overrides?: RunnerConfigOverrides): RunnerConfig;
-/** Matches a tag by exact string equality or regex test */
-type TagMatcher = string | RegExp;
-/** Matches a file path by glob string or regex test */
-type PathMatcher = string | RegExp;
 type InputOrBuilder<T> = T | (() => T);
 interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
     /**
-     * Stable id (letters, digits, `_`, `-`).
+     * Stable id (letters, digits, `_`, `-`); used in discovery and matching.
      * For an unrestricted UI label, set {@link displayName}.
      */
     name: string;
-    /** Optional human-readable label for CLI/TUI (any characters). */
+    /** Optional human-readable label for CLI/TUI and evaluator args (any characters). */
     displayName?: string;
-    tags: string[];
+    /**
+     * Declared tags on this test case (not `Dataset` filter options). Use `Dataset` `includedTags` /
+     * `excludedTags` to select which cases belong to a dataset; evaluators read the resolved tags as
+     * `meta.testCaseTags`.
+     */
+    tags?: ReadonlyArray<string>;
     inputSchema: TI;
     input: InputOrBuilder<Schema.Schema.Type<TI>>;
     outputSchema?: TO;
@@ -161,13 +161,24 @@ declare function getTestCaseDisplayLabel(testCase: {
     getDisplayLabel?: () => string;
     getName?: () => string;
 }): string;
-/** Tags for evaluator `args.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
+/** Tags for evaluator `meta.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
 declare function getTestCaseTagList(testCase: {
     getTags?: () => ReadonlyArray<string>;
 }): string[];
+/** Matches a tag by exact string equality or regex test */
+type TagMatcher = string | RegExp;
+/** Matches a file path by glob string or regex test */
+type PathMatcher = string | RegExp;
 interface DatasetDefineConfig {
+    /**
+     * Stable id (letters, digits, `_`, `-`); used for discovery ids and `resolveDatasetByName`.
+     * For an unrestricted UI label, set {@link displayName}.
+     */
     name: string;
+    /** Optional human-readable label for CLI/TUI (any characters). */
+    displayName?: string;
     includedTags?: TagMatcher[];
     excludedTags?: TagMatcher[];
     includedPaths?: PathMatcher[];
@@ -177,13 +188,22 @@ declare class Dataset {
     private readonly _config;
     private constructor();
     static define(config: DatasetDefineConfig): Dataset;
+    /** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
     getName(): string;
+    getDisplayName(): string | undefined;
+    /** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
+    getDisplayLabel(): string;
     getIncludedTags(): ReadonlyArray<TagMatcher>;
     getExcludedTags(): ReadonlyArray<TagMatcher>;
     getIncludedPaths(): ReadonlyArray<PathMatcher>;
     getExcludedPaths(): ReadonlyArray<PathMatcher>;
     matchesTestCase(testCase: TestCase<unknown>, filePath: string): boolean;
 }
+/** CLI / runner: display label for a dataset-shaped object (supports discovery duck-types). */
+declare function getDatasetDisplayLabel(dataset: {
+    getDisplayLabel?: () => string;
+    getName?: () => string;
+}): string;
 /**
  * Options for customizing JSON diff output. Passed to logDiff, createDiffLogEntry, and printJsonDiff.
@@ -259,10 +279,14 @@ interface EvaluateMeta {
      * for this specific test-case run.
      */
     runId: string;
-    /** Identifier of the dataset currently being evaluated. */
-    datasetId: string;
+    /** Display label for the dataset (`Dataset.getDisplayLabel()`, i.e. `displayName ?? name`). */
+    datasetName: string;
     /** Canonical `RunConfig` name (or `programmatic` for API/TUI-only runs). */
     runConfigName: string;
+    /**
+     * Optional label for this invocation (e.g. CLI `--experiment`); omitted when not set.
+     */
+    experimentName?: string;
     /**
      * Stable id shared by every execution of the same logical test case when `repetitionCount > 1`
      * (and present with count 1 for consistency).
@@ -272,6 +296,15 @@ interface EvaluateMeta {
     repetitionIndex: number;
     /** Total scheduled executions for this logical test case in the current run. */
     repetitionCount: number;
+    /** Declared tags on the current test case (`TestCase.describe({ tags })`); empty when none. */
+    testCaseTags: string[];
+    /**
+     * Declared tags on the current run config or programmatic request (`RunConfig.define({ tags })`,
+     * `RunDatasetRequest.runConfigTags`); empty when none.
+     */
+    runConfigTags: string[];
+    /** Declared tags on this evaluator (`Evaluator.define({ tags })`); empty when none. */
+    evaluatorTags: string[];
 }
 interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
     input: TInput;
@@ -279,12 +312,6 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
     output?: TOutput;
     /** Metadata about the current evaluator invocation. */
     meta: EvaluateMeta;
-    /** Tags from `TestCase.describe({ tags })` for the current test case. */
-    testCaseTags: string[];
-    /** Tags from `RunConfig.define({ tags })` for this job; empty for programmatic runs unless set on the request. */
-    runConfigTags: string[];
-    /** Tags from `Evaluator.define({ tags })` for this evaluator. */
-    evaluatorTags: string[];
     /** Records a diff for this test case; stored in run artifact and shown by CLI */
     logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
     /** Logs a message or object for this test case; stored in run artifact and shown by CLI */
@@ -313,7 +340,10 @@ interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.
     scoreSchema: TS;
     passThreshold?: number;
     passCriterion?: (score: unknown) => boolean;
-    /** Optional tags for this evaluator; surfaced on every `evaluate` invocation. */
+    /**
+     * Declared tags for this evaluator (not dataset filter rules); echoed on every `evaluate` call as
+     * `meta.evaluatorTags`.
+     */
     tags?: ReadonlyArray<string>;
 }
 declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, TCtx = Record<string, never>> {
@@ -345,7 +375,7 @@ declare function getEvaluatorDisplayLabel(evaluator: {
     getDisplayLabel?: () => string | undefined;
     getName?: () => string | undefined;
 }): string | undefined;
-/** Tags for evaluator `args.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
+/** Tags for evaluator `meta.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
 declare function getEvaluatorTagList(evaluator: {
     getTags?: () => ReadonlyArray<string>;
 }): string[];
@@ -384,12 +414,16 @@ declare const RunConfigNameSchema: Schema.brand<Schema.filter<Schema.filter<Sche
 declare const EvaluatorNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "EvaluatorName">;
 /** Branded id for `TestCase.describe({ name })` (decode with {@link TestCaseNameSchema}). */
 declare const TestCaseNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "TestCaseName">;
+/** Branded id for `Dataset.define({ name })` (decode with {@link DatasetNameSchema}). */
+declare const DatasetNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "DatasetName">;
 type RunConfigName = Schema.Schema.Type<typeof RunConfigNameSchema>;
 type EvaluatorName = Schema.Schema.Type<typeof EvaluatorNameSchema>;
 type TestCaseName = Schema.Schema.Type<typeof TestCaseNameSchema>;
+type DatasetName = Schema.Schema.Type<typeof DatasetNameSchema>;
 declare function validateRunConfigName(raw: string, context: string): RunConfigName;
 declare function validateEvaluatorName(raw: string, context: string): EvaluatorName;
 declare function validateTestCaseName(raw: string, context: string): TestCaseName;
+declare function validateDatasetName(raw: string, context: string): DatasetName;
 /** Optional UI label: trim; empty after trim becomes undefined. */
 declare function normalizeOptionalDisplayName(raw: string | undefined): string | undefined;
@@ -422,7 +456,7 @@ interface RunConfigDefineConfig {
     name: string;
     /** Optional human-readable label for CLI/TUI (any characters). */
     displayName?: string;
-    /** Optional tags; copied to every evaluation as `runConfigTags` on the evaluator callback. */
+    /** Optional declared tags for this run config; copied to every evaluation as `meta.runConfigTags`. */
     tags?: ReadonlyArray<string>;
     runs: ReadonlyArray<RunConfigRow>;
 }
@@ -439,7 +473,7 @@ declare class RunConfig {
     getDisplayName(): string | undefined;
     /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
     getDisplayLabel(): string;
-    /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
+    /** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
     getTags(): string[];
     getRuns(): ReadonlyArray<RunConfigRow>;
 }
@@ -528,7 +562,7 @@ interface RunDatasetJob {
      */
     runConfigDisplayLabel?: string;
     /**
-     * Tags from `RunConfig.define({ tags })` for this job; forwarded as `runConfigTags` on evaluator callbacks.
+     * Tags from `RunConfig.define({ tags })` for this job; forwarded as `meta.runConfigTags` on evaluator callbacks.
      */
     runConfigTags?: ReadonlyArray<string>;
     /** Evaluates each matching test case this many times (default 1). */
@@ -565,9 +599,13 @@ interface RunDatasetRequest {
      */
     repetitions?: number;
     /**
-     * Optional tags for this run; forwarded as `runConfigTags` on evaluator callbacks (e.g. suite labels).
+     * Optional tags for this run; forwarded as `meta.runConfigTags` on evaluator callbacks (e.g. suite labels).
      */
     runConfigTags?: ReadonlyArray<string>;
+    /**
+     * Optional label for this run; forwarded as `experimentName` on evaluator `meta`.
+     */
+    experimentName?: string;
 }
 interface RunSnapshot {
     runId: string;
@@ -655,11 +693,14 @@ interface RunDatasetJobsWithSharedConcurrencyRequest {
     jobs: ReadonlyArray<RunDatasetJob>;
     globalConcurrency: number;
     triggerId?: string;
+    /** Applied to every job in this batch (e.g. CLI `--experiment`). */
+    experimentName?: string;
 }
 interface RunnerApi {
     collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
     collectEvaluators(): Promise<ReadonlyArray<CollectedEvaluator>>;
     collectRunConfigs(): Promise<ReadonlyArray<CollectedRunConfig>>;
+    /** Resolves a dataset by canonical **`Dataset` `name`** (id), case-insensitive. */
     resolveDatasetByName(name: string): Promise<CollectedDataset | undefined>;
     resolveEvaluatorsByNamePattern(pattern: string): Promise<ReadonlyArray<CollectedEvaluator>>;
     /**
@@ -732,4 +773,4 @@ declare class TagSet {
     static define<const T extends readonly string[]>(tags: T): TagSetMembers<T>;
 }
-export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedRunConfig, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorName, EvaluatorNameSchema, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PROGRAMMATIC_RUN_CONFIG, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunConfig, RunConfigDefineConfig, RunConfigName, RunConfigNameSchema, RunConfigRow, RunConfigRowEvaluators, RunConfigRowPattern, RunDatasetJob, RunDatasetJobsWithSharedConcurrencyRequest, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TagSet, TagSetMembers, TestCase, TestCaseName, TestCaseNameSchema, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
+export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedRunConfig, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DatasetDefineConfig, DatasetName, DatasetNameSchema, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorName, EvaluatorNameSchema, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PROGRAMMATIC_RUN_CONFIG, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunConfig, RunConfigDefineConfig, RunConfigName, RunConfigNameSchema, RunConfigRow, RunConfigRowEvaluators, RunConfigRowPattern, RunDatasetJob, RunDatasetJobsWithSharedConcurrencyRequest, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TagSet, TagSetMembers, TestCase, TestCaseName, TestCaseNameSchema, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getDatasetDisplayLabel, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateDatasetName, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };

package/dist/index.js CHANGED Viewed

@@ -26,6 +26,7 @@ function makeEntityIdSchema(brand, label) {
 var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
 var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
 var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
+var DatasetNameSchema = makeEntityIdSchema("DatasetName", "Dataset name");
 function validateWithSchema(schema, raw, context) {
   const trimmed = raw.trim();
   const decode = Schema.decodeUnknownEither(
@@ -46,6 +47,9 @@ function validateEvaluatorName(raw, context) {
 function validateTestCaseName(raw, context) {
   return validateWithSchema(TestCaseNameSchema, raw, context);
 }
+function validateDatasetName(raw, context) {
+  return validateWithSchema(DatasetNameSchema, raw, context);
+}
 function normalizeOptionalDisplayName(raw) {
   if (raw === void 0) {
     return void 0;
@@ -54,6 +58,87 @@ function normalizeOptionalDisplayName(raw) {
   return t.length === 0 ? void 0 : t;
 }
+// src/evals/dataset.ts
+function matchesAny(value, matchers) {
+  return matchers.some(
+    (matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
+  );
+}
+function matchesAnyPath(filePath, matchers) {
+  return matchers.some((matcher) => {
+    if (typeof matcher === "string") {
+      return simpleGlobMatch(matcher, filePath);
+    }
+    return matcher.test(filePath);
+  });
+}
+function simpleGlobMatch(pattern, value) {
+  const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
+  return new RegExp(`^${escaped}$`).test(value);
+}
+var Dataset = class _Dataset {
+  constructor(config) {
+    this._config = config;
+  }
+  static define(config) {
+    const name = validateDatasetName(config.name, "Dataset.define");
+    const displayName = normalizeOptionalDisplayName(config.displayName);
+    return new _Dataset({
+      name,
+      displayName,
+      includedTags: config.includedTags ?? [],
+      excludedTags: config.excludedTags ?? [],
+      includedPaths: config.includedPaths ?? [],
+      excludedPaths: config.excludedPaths ?? []
+    });
+  }
+  /** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
+  getName() {
+    return this._config.name;
+  }
+  getDisplayName() {
+    return this._config.displayName;
+  }
+  /** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
+  getDisplayLabel() {
+    return this._config.displayName ?? this._config.name;
+  }
+  getIncludedTags() {
+    return this._config.includedTags;
+  }
+  getExcludedTags() {
+    return this._config.excludedTags;
+  }
+  getIncludedPaths() {
+    return this._config.includedPaths;
+  }
+  getExcludedPaths() {
+    return this._config.excludedPaths;
+  }
+  matchesTestCase(testCase, filePath) {
+    const tags = testCase.getTags();
+    if (this._config.excludedTags.length > 0) {
+      if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
+        return false;
+      }
+    }
+    if (this._config.excludedPaths.length > 0) {
+      if (matchesAnyPath(filePath, this._config.excludedPaths)) {
+        return false;
+      }
+    }
+    const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
+    const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
+    return tagMatch && pathMatch;
+  }
+};
+function getDatasetDisplayLabel(dataset) {
+  if (typeof dataset.getDisplayLabel === "function") {
+    return dataset.getDisplayLabel();
+  }
+  return typeof dataset.getName === "function" ? dataset.getName() : "";
+}
 // src/evals/evaluator.ts
 var Evaluator = class _Evaluator {
   constructor(config) {
@@ -413,7 +498,7 @@ function toEvalDataset(item, snapshots) {
   const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
   return {
     id: item.id,
-    name: item.dataset.getName(),
+    name: getDatasetDisplayLabel(item.dataset),
     overview: `Discovered from ${item.filePath}`,
     runs
   };
@@ -466,70 +551,6 @@ function parseStartupArgs(argv) {
   }
   return args;
 }
-// src/evals/dataset.ts
-function matchesAny(value, matchers) {
-  return matchers.some(
-    (matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
-  );
-}
-function matchesAnyPath(filePath, matchers) {
-  return matchers.some((matcher) => {
-    if (typeof matcher === "string") {
-      return simpleGlobMatch(matcher, filePath);
-    }
-    return matcher.test(filePath);
-  });
-}
-function simpleGlobMatch(pattern, value) {
-  const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
-  return new RegExp(`^${escaped}$`).test(value);
-}
-var Dataset = class _Dataset {
-  constructor(config) {
-    this._config = config;
-  }
-  static define(config) {
-    return new _Dataset({
-      name: config.name,
-      includedTags: config.includedTags ?? [],
-      excludedTags: config.excludedTags ?? [],
-      includedPaths: config.includedPaths ?? [],
-      excludedPaths: config.excludedPaths ?? []
-    });
-  }
-  getName() {
-    return this._config.name;
-  }
-  getIncludedTags() {
-    return this._config.includedTags;
-  }
-  getExcludedTags() {
-    return this._config.excludedTags;
-  }
-  getIncludedPaths() {
-    return this._config.includedPaths;
-  }
-  getExcludedPaths() {
-    return this._config.excludedPaths;
-  }
-  matchesTestCase(testCase, filePath) {
-    const tags = testCase.getTags();
-    if (this._config.excludedTags.length > 0) {
-      if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
-        return false;
-      }
-    }
-    if (this._config.excludedPaths.length > 0) {
-      if (matchesAnyPath(filePath, this._config.excludedPaths)) {
-        return false;
-      }
-    }
-    const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
-    const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
-    return tagMatch && pathMatch;
-  }
-};
 function preprocessForDiff(value, options) {
   if (options?.sort && Array.isArray(value)) {
     return [...value].sort((a, b) => {
@@ -795,7 +816,7 @@ var RunConfig = class _RunConfig {
   getDisplayLabel() {
     return this._displayName ?? this._name;
   }
-  /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
+  /** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
   getTags() {
     return [...this._tags];
   }
@@ -968,10 +989,11 @@ var TestCase = class _TestCase {
   static describe(config) {
     const name = validateTestCaseName(config.name, "TestCase.describe");
     const displayName = normalizeOptionalDisplayName(config.displayName);
+    const tags = config.tags !== void 0 ? [...config.tags] : [];
     return new _TestCase({
       name,
       displayName,
-      tags: config.tags,
+      tags,
       inputSchema: config.inputSchema,
       input: config.input,
       outputSchema: config.outputSchema,
@@ -988,7 +1010,7 @@ var TestCase = class _TestCase {
     return this._config.displayName ?? this._config.name;
   }
   getTags() {
-    return this._config.tags;
+    return [...this._config.tags];
   }
   getInputSchema() {
     return this._config.inputSchema;
@@ -1545,15 +1567,16 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
               meta: {
                 triggerId: task.triggerId,
                 runId: evaluatorRunId,
-                datasetId: task.datasetId,
+                datasetName: task.dataset.getDisplayLabel(),
                 repetitionId,
                 repetitionIndex,
                 repetitionCount,
-                runConfigName: task.runConfigName
+                runConfigName: task.runConfigName,
+                ...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
+                testCaseTags: getTestCaseTagList(testCaseItem.testCase),
+                runConfigTags: task.runConfigTags,
+                evaluatorTags: getEvaluatorTagList(evaluator)
               },
-              testCaseTags: getTestCaseTagList(testCaseItem.testCase),
-              runConfigTags: task.runConfigTags,
-              evaluatorTags: getEvaluatorTagList(evaluator),
               logDiff,
               log,
               createError
@@ -1960,7 +1983,7 @@ var EffectRunner = class {
       );
       if (!dsCollected) {
         throw new Error(
-          `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
+          `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
         );
       }
       let evaluatorIds;
@@ -2032,7 +2055,8 @@ var EffectRunner = class {
           globalEvaluationSemaphore: sem,
           runConfigName: job.runConfigName,
           runConfigTags: job.runConfigTags,
-          repetitions: job.repetitions
+          repetitions: job.repetitions,
+          experimentName: request.experimentName
         })
       );
     }
@@ -2067,7 +2091,8 @@ var EffectRunner = class {
       maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
       repetitions: request.repetitions,
       runConfigName,
-      runConfigTags: request.runConfigTags
+      runConfigTags: request.runConfigTags,
+      experimentName: request.experimentName
     });
   }
   async startDatasetRun(params) {
@@ -2095,7 +2120,7 @@ var EffectRunner = class {
     const snapshot = {
       runId,
       datasetId: params.datasetId,
-      datasetName: dataset.dataset.getName(),
+      datasetName: dataset.dataset.getDisplayLabel(),
       evaluatorIds: selectedEvaluators.map((item) => item.id),
       queuedAt: Date.now(),
       totalTestCases: totalEvaluations,
@@ -2116,7 +2141,7 @@ var EffectRunner = class {
       type: "RunQueued",
       runId,
       datasetId: params.datasetId,
-      datasetName: dataset.dataset.getName(),
+      datasetName: dataset.dataset.getDisplayLabel(),
       evaluatorIds: selectedEvaluators.map((item) => item.id),
       totalTestCases: totalEvaluations,
       artifactPath
@@ -2142,7 +2167,8 @@ var EffectRunner = class {
         globalEvaluationSemaphore: params.globalEvaluationSemaphore,
         runConfigName: params.runConfigName,
         runConfigTags,
-        repetitions
+        repetitions,
+        experimentName: params.experimentName
       })
     );
     return snapshot;
@@ -2219,6 +2245,6 @@ var PROGRAMMATIC_RUN_CONFIG = {
   runConfigName: "programmatic"
 };
-export { Dataset, Evaluator, EvaluatorNameSchema, Metric, PROGRAMMATIC_RUN_CONFIG, RunConfig, RunConfigNameSchema, Score, TagSet, TestCase, TestCaseNameSchema, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
+export { Dataset, DatasetNameSchema, Evaluator, EvaluatorNameSchema, Metric, PROGRAMMATIC_RUN_CONFIG, RunConfig, RunConfigNameSchema, Score, TagSet, TestCase, TestCaseNameSchema, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getDatasetDisplayLabel, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateDatasetName, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
 //# sourceMappingURL=out.js.map
 //# sourceMappingURL=index.js.map