npm - @m4trix/evals - Version diff - 0.27.0 → 0.29.0 - Mend

@m4trix/evals 0.27.0 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

package/dist/index.d.ts (changed)

@@ -132,7 +132,12 @@ interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema
     name: string;
     /** Optional human-readable label for CLI/TUI and evaluator args (any characters). */
     displayName?: string;
-    tags: string[];
+    /**
+     * Declared tags on this test case (not `Dataset` filter options). Use `Dataset` `includedTags` /
+     * `excludedTags` to select which cases belong to a dataset; evaluators read the resolved tags as
+     * `meta.testCaseTags`.
+     */
+    tags?: ReadonlyArray<string>;
     inputSchema: TI;
     input: InputOrBuilder<Schema.Schema.Type<TI>>;
     outputSchema?: TO;
@@ -156,7 +161,7 @@ declare function getTestCaseDisplayLabel(testCase: {
     getDisplayLabel?: () => string;
     getName?: () => string;
 }): string;
-/** Tags for evaluator `args.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
+/** Tags for evaluator `meta.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
 declare function getTestCaseTagList(testCase: {
     getTags?: () => ReadonlyArray<string>;
 }): string[];
@@ -276,8 +281,16 @@ interface EvaluateMeta {
     runId: string;
     /** Display label for the dataset (`Dataset.getDisplayLabel()`, i.e. `displayName ?? name`). */
     datasetName: string;
+    /** Discovery id for the current test case (same as runner events’ `testCaseId`). */
+    testCaseId: string;
+    /** Display label for the test case (`TestCase.getDisplayLabel()`, i.e. `displayName ?? name`). */
+    testCaseName: string;
     /** Canonical `RunConfig` name (or `programmatic` for API/TUI-only runs). */
     runConfigName: string;
+    /**
+     * Optional label for this invocation (e.g. CLI `--experiment`); omitted when not set.
+     */
+    experimentName?: string;
     /**
      * Stable id shared by every execution of the same logical test case when `repetitionCount > 1`
      * (and present with count 1 for consistency).
@@ -287,6 +300,15 @@ interface EvaluateMeta {
     repetitionIndex: number;
     /** Total scheduled executions for this logical test case in the current run. */
     repetitionCount: number;
+    /** Declared tags on the current test case (`TestCase.describe({ tags })`); empty when none. */
+    testCaseTags: string[];
+    /**
+     * Declared tags on the current run config or programmatic request (`RunConfig.define({ tags })`,
+     * `RunDatasetRequest.runConfigTags`); empty when none.
+     */
+    runConfigTags: string[];
+    /** Declared tags on this evaluator (`Evaluator.define({ tags })`); empty when none. */
+    evaluatorTags: string[];
 }
 interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
     input: TInput;
@@ -294,12 +316,6 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
     output?: TOutput;
     /** Metadata about the current evaluator invocation. */
     meta: EvaluateMeta;
-    /** Tags from `TestCase.describe({ tags })` for the current test case. */
-    testCaseTags: string[];
-    /** Tags from `RunConfig.define({ tags })` for this job; empty for programmatic runs unless set on the request. */
-    runConfigTags: string[];
-    /** Tags from `Evaluator.define({ tags })` for this evaluator. */
-    evaluatorTags: string[];
     /** Records a diff for this test case; stored in run artifact and shown by CLI */
     logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
     /** Logs a message or object for this test case; stored in run artifact and shown by CLI */
@@ -328,7 +344,10 @@ interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.
     scoreSchema: TS;
     passThreshold?: number;
     passCriterion?: (score: unknown) => boolean;
-    /** Optional tags for this evaluator; surfaced on every `evaluate` invocation. */
+    /**
+     * Declared tags for this evaluator (not dataset filter rules); echoed on every `evaluate` call as
+     * `meta.evaluatorTags`.
+     */
     tags?: ReadonlyArray<string>;
 }
 declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, TCtx = Record<string, never>> {
@@ -360,7 +379,7 @@ declare function getEvaluatorDisplayLabel(evaluator: {
     getDisplayLabel?: () => string | undefined;
     getName?: () => string | undefined;
 }): string | undefined;
-/** Tags for evaluator `args.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
+/** Tags for evaluator `meta.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
 declare function getEvaluatorTagList(evaluator: {
     getTags?: () => ReadonlyArray<string>;
 }): string[];
@@ -441,7 +460,7 @@ interface RunConfigDefineConfig {
     name: string;
     /** Optional human-readable label for CLI/TUI (any characters). */
     displayName?: string;
-    /** Optional tags; copied to every evaluation as `runConfigTags` on the evaluator callback. */
+    /** Optional declared tags for this run config; copied to every evaluation as `meta.runConfigTags`. */
     tags?: ReadonlyArray<string>;
     runs: ReadonlyArray<RunConfigRow>;
 }
@@ -458,7 +477,7 @@ declare class RunConfig {
     getDisplayName(): string | undefined;
     /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
     getDisplayLabel(): string;
-    /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
+    /** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
     getTags(): string[];
     getRuns(): ReadonlyArray<RunConfigRow>;
 }
@@ -547,7 +566,7 @@ interface RunDatasetJob {
      */
     runConfigDisplayLabel?: string;
     /**
-     * Tags from `RunConfig.define({ tags })` for this job; forwarded as `runConfigTags` on evaluator callbacks.
+     * Tags from `RunConfig.define({ tags })` for this job; forwarded as `meta.runConfigTags` on evaluator callbacks.
      */
     runConfigTags?: ReadonlyArray<string>;
     /** Evaluates each matching test case this many times (default 1). */
@@ -584,9 +603,13 @@ interface RunDatasetRequest {
      */
     repetitions?: number;
     /**
-     * Optional tags for this run; forwarded as `runConfigTags` on evaluator callbacks (e.g. suite labels).
+     * Optional tags for this run; forwarded as `meta.runConfigTags` on evaluator callbacks (e.g. suite labels).
      */
     runConfigTags?: ReadonlyArray<string>;
+    /**
+     * Optional label for this run; forwarded as `experimentName` on evaluator `meta`.
+     */
+    experimentName?: string;
 }
 interface RunSnapshot {
     runId: string;
@@ -674,6 +697,8 @@ interface RunDatasetJobsWithSharedConcurrencyRequest {
     jobs: ReadonlyArray<RunDatasetJob>;
     globalConcurrency: number;
     triggerId?: string;
+    /** Applied to every job in this batch (e.g. CLI `--experiment`). */
+    experimentName?: string;
 }
 interface RunnerApi {
     collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;

package/dist/index.js (changed)

@@ -816,7 +816,7 @@ var RunConfig = class _RunConfig {
   getDisplayLabel() {
     return this._displayName ?? this._name;
   }
-  /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
+  /** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
   getTags() {
     return [...this._tags];
   }
@@ -989,10 +989,11 @@ var TestCase = class _TestCase {
   static describe(config) {
     const name = validateTestCaseName(config.name, "TestCase.describe");
     const displayName = normalizeOptionalDisplayName(config.displayName);
+    const tags = config.tags !== void 0 ? [...config.tags] : [];
     return new _TestCase({
       name,
       displayName,
-      tags: config.tags,
+      tags,
       inputSchema: config.inputSchema,
       input: config.input,
       outputSchema: config.outputSchema,
@@ -1009,7 +1010,7 @@ var TestCase = class _TestCase {
     return this._config.displayName ?? this._config.name;
   }
   getTags() {
-    return this._config.tags;
+    return [...this._config.tags];
   }
   getInputSchema() {
     return this._config.inputSchema;
@@ -1567,14 +1568,17 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
                 triggerId: task.triggerId,
                 runId: evaluatorRunId,
                 datasetName: task.dataset.getDisplayLabel(),
+                testCaseId: testCaseItem.id,
+                testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
                 repetitionId,
                 repetitionIndex,
                 repetitionCount,
-                runConfigName: task.runConfigName
+                runConfigName: task.runConfigName,
+                ...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
+                testCaseTags: getTestCaseTagList(testCaseItem.testCase),
+                runConfigTags: task.runConfigTags,
+                evaluatorTags: getEvaluatorTagList(evaluator)
               },
-              testCaseTags: getTestCaseTagList(testCaseItem.testCase),
-              runConfigTags: task.runConfigTags,
-              evaluatorTags: getEvaluatorTagList(evaluator),
               logDiff,
               log,
               createError
@@ -2053,7 +2057,8 @@ var EffectRunner = class {
           globalEvaluationSemaphore: sem,
           runConfigName: job.runConfigName,
           runConfigTags: job.runConfigTags,
-          repetitions: job.repetitions
+          repetitions: job.repetitions,
+          experimentName: request.experimentName
         })
       );
     }
@@ -2088,7 +2093,8 @@ var EffectRunner = class {
       maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
       repetitions: request.repetitions,
       runConfigName,
-      runConfigTags: request.runConfigTags
+      runConfigTags: request.runConfigTags,
+      experimentName: request.experimentName
     });
   }
   async startDatasetRun(params) {
@@ -2163,7 +2169,8 @@ var EffectRunner = class {
         globalEvaluationSemaphore: params.globalEvaluationSemaphore,
         runConfigName: params.runConfigName,
         runConfigTags,
-        repetitions
+        repetitions,
+        experimentName: params.experimentName
       })
     );
     return snapshot;