npm - @m4trix/evals - Versions diffs - 0.25.1 → 0.26.0 - Mend

@m4trix/evals 0.25.1 → 0.26.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (15)

package/dist/index.d.ts CHANGED

@@ -86,6 +86,7 @@ interface RunnerDiscoveryConfig {
     rootDir: string;
     datasetSuffixes: ReadonlyArray<string>;
     evaluatorSuffixes: ReadonlyArray<string>;
+    runConfigSuffixes: ReadonlyArray<string>;
     testCaseSuffixes: ReadonlyArray<string>;
     excludeDirectories: ReadonlyArray<string>;
 }
@@ -102,9 +103,11 @@ interface M4trixEvalConfigDiscovery {
     rootDir?: string;
     datasetFilePatterns?: ReadonlyArray<string>;
     evaluatorFilePatterns?: ReadonlyArray<string>;
+    runConfigFilePatterns?: ReadonlyArray<string>;
     testCaseFilePatterns?: ReadonlyArray<string>;
     datasetSuffixes?: ReadonlyArray<string>;
     evaluatorSuffixes?: ReadonlyArray<string>;
+    runConfigSuffixes?: ReadonlyArray<string>;
     testCaseSuffixes?: ReadonlyArray<string>;
     excludeDirectories?: ReadonlyArray<string>;
 }
@@ -127,9 +130,14 @@ type PathMatcher = string | RegExp;
 type InputOrBuilder<T> = T | (() => T);
 interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
+    /**
+     * Stable id (letters, digits, `_`, `-`).
+     * For an unrestricted UI label, set {@link displayName}.
+     */
     name: string;
+    /** Optional human-readable label for CLI/TUI (any characters). */
+    displayName?: string;
     tags: string[];
-    reruns?: number;
     inputSchema: TI;
     input: InputOrBuilder<Schema.Schema.Type<TI>>;
     outputSchema?: TO;
@@ -139,14 +147,24 @@ declare class TestCase<TInput = unknown, TOutput = unknown> {
     private readonly _config;
     private constructor();
     static describe<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>>(config: TestCaseDescribeConfig<TI, TO>): TestCase<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>>;
-    getReruns(): number;
     getName(): string;
+    getDisplayName(): string | undefined;
+    getDisplayLabel(): string;
     getTags(): string[];
     getInputSchema(): Schema.Schema.Any;
     getInput(): TInput;
     getOutputSchema(): Schema.Schema.Any | undefined;
     getOutput(): TOutput | undefined;
 }
+/** CLI-friendly label: {@link TestCase.getDisplayLabel} when present, else {@link TestCase.getName} (supports plain test-case-shaped objects). */
+declare function getTestCaseDisplayLabel(testCase: {
+    getDisplayLabel?: () => string;
+    getName?: () => string;
+}): string;
+/** Tags for evaluator `args.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
+declare function getTestCaseTagList(testCase: {
+    getTags?: () => ReadonlyArray<string>;
+}): string[];
 interface DatasetDefineConfig {
     name: string;
@@ -243,6 +261,17 @@ interface EvaluateMeta {
     runId: string;
     /** Identifier of the dataset currently being evaluated. */
     datasetId: string;
+    /** Canonical `RunConfig` name (or `programmatic` for API/TUI-only runs). */
+    runConfigName: string;
+    /**
+     * Stable id shared by every execution of the same logical test case when `repetitionCount > 1`
+     * (and present with count 1 for consistency).
+     */
+    repetitionId: string;
+    /** 1-based index of this execution within the repetition group. */
+    repetitionIndex: number;
+    /** Total scheduled executions for this logical test case in the current run. */
+    repetitionCount: number;
 }
 interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
     input: TInput;
@@ -250,6 +279,12 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
     output?: TOutput;
     /** Metadata about the current evaluator invocation. */
     meta: EvaluateMeta;
+    /** Tags from `TestCase.describe({ tags })` for the current test case. */
+    testCaseTags: string[];
+    /** Tags from `RunConfig.define({ tags })` for this job; empty for programmatic runs unless set on the request. */
+    runConfigTags: string[];
+    /** Tags from `Evaluator.define({ tags })` for this evaluator. */
+    evaluatorTags: string[];
     /** Records a diff for this test case; stored in run artifact and shown by CLI */
     logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
     /** Logs a message or object for this test case; stored in run artifact and shown by CLI */
@@ -266,12 +301,20 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
 }
 type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Error | Promise<TScore | Error>;
 interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
+    /**
+     * Stable id (letters, digits, `_`, `-`); used for discovery, name patterns, and `meta`.
+     * For an unrestricted UI label, set {@link displayName}.
+     */
     name: string;
+    /** Optional human-readable label for CLI/TUI (any characters). */
+    displayName?: string;
     inputSchema: TI;
     outputSchema: TO;
     scoreSchema: TS;
     passThreshold?: number;
     passCriterion?: (score: unknown) => boolean;
+    /** Optional tags for this evaluator; surfaced on every `evaluate` invocation. */
+    tags?: ReadonlyArray<string>;
 }
 declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, TCtx = Record<string, never>> {
     private readonly _config;
@@ -281,7 +324,13 @@ declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, T
     use<TNew>(middleware: EvalMiddleware<TNew>): Evaluator<TInput, TOutput, TScore, TCtx & TNew>;
     define<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any>(config: EvaluatorDefineConfig<TI, TO, TS>): Evaluator<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>, Schema.Schema.Type<TS>, TCtx>;
     evaluate(fn: EvaluateFn<TInput, TOutput, TScore, TCtx>): Evaluator<TInput, TOutput, TScore, TCtx>;
+    /** Canonical evaluator id when defined; otherwise undefined (middleware-only chain). */
     getName(): string | undefined;
+    getDisplayName(): string | undefined;
+    /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. Undefined if not yet defined. */
+    getDisplayLabel(): string | undefined;
+    /** Tags from `Evaluator.define({ tags })`; empty until defined. */
+    getTags(): string[];
     getInputSchema(): Schema.Schema.Any | undefined;
     getOutputSchema(): Schema.Schema.Any | undefined;
     getScoreSchema(): Schema.Schema.Any | undefined;
@@ -291,6 +340,15 @@ declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, T
     getPassCriterion(): ((score: unknown) => boolean) | undefined;
     resolveContext(): Promise<TCtx>;
 }
+/** CLI-friendly label: {@link Evaluator.getDisplayLabel} when present, else {@link Evaluator.getName} (supports plain evaluator-shaped objects from discovery). */
+declare function getEvaluatorDisplayLabel(evaluator: {
+    getDisplayLabel?: () => string | undefined;
+    getName?: () => string | undefined;
+}): string | undefined;
+/** Tags for evaluator `args.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
+declare function getEvaluatorTagList(evaluator: {
+    getTags?: () => ReadonlyArray<string>;
+}): string[];
 interface MetricItem<TData = unknown> {
     readonly id: string;
@@ -320,6 +378,72 @@ declare const Metric: {
 };
 declare function getMetricById(id: string): MetricDef<unknown> | undefined;
+/** Branded id for `RunConfig` `name` (decode with {@link RunConfigNameSchema}). */
+declare const RunConfigNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "RunConfigName">;
+/** Branded id for `Evaluator.define({ name })` (decode with {@link EvaluatorNameSchema}). */
+declare const EvaluatorNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "EvaluatorName">;
+/** Branded id for `TestCase.describe({ name })` (decode with {@link TestCaseNameSchema}). */
+declare const TestCaseNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "TestCaseName">;
+type RunConfigName = Schema.Schema.Type<typeof RunConfigNameSchema>;
+type EvaluatorName = Schema.Schema.Type<typeof EvaluatorNameSchema>;
+type TestCaseName = Schema.Schema.Type<typeof TestCaseNameSchema>;
+declare function validateRunConfigName(raw: string, context: string): RunConfigName;
+declare function validateEvaluatorName(raw: string, context: string): EvaluatorName;
+declare function validateTestCaseName(raw: string, context: string): TestCaseName;
+/** Optional UI label: trim; empty after trim becomes undefined. */
+declare function normalizeOptionalDisplayName(raw: string | undefined): string | undefined;
+/** Heterogeneous evaluator rows; `unknown` breaks assignability from concrete `EvaluateFn` (contravariance on `input`). */
+type RunConfigEvaluatorRef = Evaluator<any, any, any, any>;
+/** Select evaluators by concrete instances (same module exports as discovery). */
+interface RunConfigRowEvaluators {
+    readonly dataset: Dataset;
+    readonly evaluators: ReadonlyArray<RunConfigEvaluatorRef>;
+    readonly evaluatorPattern?: undefined;
+    /**
+     * How many times each test case in this dataset is evaluated for this row (default: 1).
+     * All executions of the same logical test case share one `repetitionId` in evaluator `meta`.
+     */
+    readonly repetitions?: number;
+}
+/** Select evaluators using the same wildcard / regex rules as the runner's `resolveEvaluatorsByNamePattern`. */
+interface RunConfigRowPattern {
+    readonly dataset: Dataset;
+    readonly evaluatorPattern: string;
+    readonly evaluators?: undefined;
+    readonly repetitions?: number;
+}
+type RunConfigRow = RunConfigRowEvaluators | RunConfigRowPattern;
+interface RunConfigDefineConfig {
+    /**
+     * Stable id (letters, digits, `_`, `-`); surfaced in discovery, CLI `--run-config`, and evaluator `meta`.
+     * For an unrestricted UI label, set {@link displayName}.
+     */
+    name: string;
+    /** Optional human-readable label for CLI/TUI (any characters). */
+    displayName?: string;
+    /** Optional tags; copied to every evaluation as `runConfigTags` on the evaluator callback. */
+    tags?: ReadonlyArray<string>;
+    runs: ReadonlyArray<RunConfigRow>;
+}
+declare class RunConfig {
+    private readonly _name;
+    private readonly _displayName;
+    private readonly _tags;
+    private readonly _runs;
+    private constructor();
+    static define(config: RunConfigDefineConfig): RunConfig;
+    /** Canonical id (branded {@link RunConfigName} at runtime; typed as `string` for ergonomics). */
+    getName(): string;
+    /** Optional unrestricted display label. */
+    getDisplayName(): string | undefined;
+    /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
+    getDisplayLabel(): string;
+    /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
+    getTags(): string[];
+    getRuns(): ReadonlyArray<RunConfigRow>;
+}
 type ScoreDisplayStrategy = 'bar' | 'number' | 'passFail';
 interface ScoreItem<TData = unknown> {
     readonly id: string;
@@ -387,6 +511,29 @@ interface CollectedEvaluator {
     filePath: string;
     evaluator: Evaluator<unknown, unknown, unknown, unknown>;
 }
+interface CollectedRunConfig {
+    id: string;
+    filePath: string;
+    runConfig: RunConfig;
+}
+/** One dataset + evaluator set queued as part of a RunConfig or batch run. */
+interface RunDatasetJob {
+    datasetId: string;
+    evaluatorIds: ReadonlyArray<string>;
+    /** RunConfig name (same as `RunConfig.getName()`). */
+    runConfigName: string;
+    /**
+     * Human-facing RunConfig label (`displayName ?? name`) when the job was expanded from `RunConfig.define`.
+     * Omitted for ad-hoc jobs; UI should fall back to {@link runConfigName}.
+     */
+    runConfigDisplayLabel?: string;
+    /**
+     * Tags from `RunConfig.define({ tags })` for this job; forwarded as `runConfigTags` on evaluator callbacks.
+     */
+    runConfigTags?: ReadonlyArray<string>;
+    /** Evaluates each matching test case this many times (default 1). */
+    repetitions: number;
+}
 interface CollectedTestCase {
     id: string;
     filePath: string;
@@ -398,6 +545,10 @@ interface SearchTestCasesQuery {
     includedPaths?: ReadonlyArray<string | RegExp>;
     excludedPaths?: ReadonlyArray<string | RegExp>;
 }
+/** Use with `RunDatasetRequest` for API / TUI runs that are not backed by a `RunConfig` file. */
+declare const PROGRAMMATIC_RUN_CONFIG: {
+    readonly runConfigName: "programmatic";
+};
 interface RunDatasetRequest {
     /**
      * Identifier for what triggered the run request (for example, a CLI command).
@@ -406,7 +557,17 @@ interface RunDatasetRequest {
     triggerId?: string;
     datasetId: string;
     evaluatorIds: ReadonlyArray<string>;
+    /** RunConfig name surfaced on evaluator `meta` (from the job or `PROGRAMMATIC_RUN_CONFIG`). */
+    runConfigName: string;
     concurrency?: number;
+    /**
+     * How many times each test case is executed (default: 1). For RunConfig-backed runs, set per row on the config.
+     */
+    repetitions?: number;
+    /**
+     * Optional tags for this run; forwarded as `runConfigTags` on evaluator callbacks (e.g. suite labels).
+     */
+    runConfigTags?: ReadonlyArray<string>;
 }
 interface RunSnapshot {
     runId: string;
@@ -443,8 +604,9 @@ type RunnerEvent = {
     testCaseName: string;
     startedTestCases: number;
     totalTestCases: number;
-    rerunIndex: number;
-    rerunTotal: number;
+    repetitionId: string;
+    repetitionIndex: number;
+    repetitionCount: number;
 } | {
     type: 'TestCaseProgress';
     runId: string;
@@ -452,8 +614,9 @@ type RunnerEvent = {
     testCaseName: string;
     completedTestCases: number;
     totalTestCases: number;
-    rerunIndex: number;
-    rerunTotal: number;
+    repetitionId: string;
+    repetitionIndex: number;
+    repetitionCount: number;
     passed: boolean;
     durationMs: number;
     evaluatorScores: ReadonlyArray<{
@@ -488,11 +651,26 @@ type RunnerEvent = {
 interface SubscribeOptions {
     runId?: string;
 }
+interface RunDatasetJobsWithSharedConcurrencyRequest {
+    jobs: ReadonlyArray<RunDatasetJob>;
+    globalConcurrency: number;
+    triggerId?: string;
+}
 interface RunnerApi {
     collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
     collectEvaluators(): Promise<ReadonlyArray<CollectedEvaluator>>;
+    collectRunConfigs(): Promise<ReadonlyArray<CollectedRunConfig>>;
     resolveDatasetByName(name: string): Promise<CollectedDataset | undefined>;
     resolveEvaluatorsByNamePattern(pattern: string): Promise<ReadonlyArray<CollectedEvaluator>>;
+    /**
+     * Resolves a RunConfig by display name (case-insensitive).
+     * @throws If more than one discovered RunConfig uses the same name (list file paths in the error).
+     */
+    resolveRunConfigByName(name: string): Promise<CollectedRunConfig | undefined>;
+    expandRunConfigToJobs(collected: CollectedRunConfig): Promise<ReadonlyArray<RunDatasetJob>>;
+    /** Resolves each name in order and concatenates expanded jobs (the same name may appear more than once). */
+    expandRunConfigNamesToJobs(names: ReadonlyArray<string>): Promise<ReadonlyArray<RunDatasetJob>>;
+    runDatasetJobsWithSharedConcurrency(request: RunDatasetJobsWithSharedConcurrencyRequest): Promise<ReadonlyArray<RunSnapshot>>;
     searchTestCases(query?: SearchTestCasesQuery): Promise<ReadonlyArray<CollectedTestCase>>;
     collectDatasetTestCases(datasetId: string): Promise<ReadonlyArray<CollectedTestCase>>;
     runDatasetWith(request: RunDatasetRequest): Promise<RunSnapshot>;
@@ -538,4 +716,20 @@ interface BinaryScoreData {
 }
 declare const binaryScore: ScoreDef<BinaryScoreData>;
-export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
+/**
+ * Map from each tag literal to a `string` value (the same string at runtime).
+ * Lets you reference `set['my-tag']` with autocomplete and errors on unknown keys.
+ */
+type TagSetMembers<T extends readonly string[]> = {
+    readonly [K in T[number]]: string;
+};
+/**
+ * Closed set of tag strings for type-safe references (`set['alpha']` is valid; `set['nope']` is a type error).
+ * Values are plain `string`, so they assign to `string[]`, dataset matchers, etc.
+ */
+declare class TagSet {
+    private constructor();
+    static define<const T extends readonly string[]>(tags: T): TagSetMembers<T>;
+}
+export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedRunConfig, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorName, EvaluatorNameSchema, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PROGRAMMATIC_RUN_CONFIG, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunConfig, RunConfigDefineConfig, RunConfigName, RunConfigNameSchema, RunConfigRow, RunConfigRowEvaluators, RunConfigRowPattern, RunDatasetJob, RunDatasetJobsWithSharedConcurrencyRequest, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TagSet, TagSetMembers, TestCase, TestCaseName, TestCaseNameSchema, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };