npm - @m4trix/evals - Versions diffs - 0.26.0 → 0.27.0 - Mend

@m4trix/evals 0.26.0 → 0.27.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (15)

package/dist/index.d.ts CHANGED Viewed

@@ -123,19 +123,14 @@ declare function defineConfig<TConfig extends ConfigType>(factory: M4trixEvalCon
 declare const defaultRunnerConfig: RunnerConfig;
 declare function withRunnerConfig(overrides?: RunnerConfigOverrides): RunnerConfig;
-/** Matches a tag by exact string equality or regex test */
-type TagMatcher = string | RegExp;
-/** Matches a file path by glob string or regex test */
-type PathMatcher = string | RegExp;
 type InputOrBuilder<T> = T | (() => T);
 interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
     /**
-     * Stable id (letters, digits, `_`, `-`).
+     * Stable id (letters, digits, `_`, `-`); used in discovery and matching.
      * For an unrestricted UI label, set {@link displayName}.
      */
     name: string;
-    /** Optional human-readable label for CLI/TUI (any characters). */
+    /** Optional human-readable label for CLI/TUI and evaluator args (any characters). */
     displayName?: string;
     tags: string[];
     inputSchema: TI;
@@ -166,8 +161,19 @@ declare function getTestCaseTagList(testCase: {
     getTags?: () => ReadonlyArray<string>;
 }): string[];
+/** Matches a tag by exact string equality or regex test */
+type TagMatcher = string | RegExp;
+/** Matches a file path by glob string or regex test */
+type PathMatcher = string | RegExp;
 interface DatasetDefineConfig {
+    /**
+     * Stable id (letters, digits, `_`, `-`); used for discovery ids and `resolveDatasetByName`.
+     * For an unrestricted UI label, set {@link displayName}.
+     */
     name: string;
+    /** Optional human-readable label for CLI/TUI (any characters). */
+    displayName?: string;
     includedTags?: TagMatcher[];
     excludedTags?: TagMatcher[];
     includedPaths?: PathMatcher[];
@@ -177,13 +183,22 @@ declare class Dataset {
     private readonly _config;
     private constructor();
     static define(config: DatasetDefineConfig): Dataset;
+    /** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
     getName(): string;
+    getDisplayName(): string | undefined;
+    /** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
+    getDisplayLabel(): string;
     getIncludedTags(): ReadonlyArray<TagMatcher>;
     getExcludedTags(): ReadonlyArray<TagMatcher>;
     getIncludedPaths(): ReadonlyArray<PathMatcher>;
     getExcludedPaths(): ReadonlyArray<PathMatcher>;
     matchesTestCase(testCase: TestCase<unknown>, filePath: string): boolean;
 }
+/** CLI / runner: display label for a dataset-shaped object (supports discovery duck-types). */
+declare function getDatasetDisplayLabel(dataset: {
+    getDisplayLabel?: () => string;
+    getName?: () => string;
+}): string;
 /**
  * Options for customizing JSON diff output. Passed to logDiff, createDiffLogEntry, and printJsonDiff.
@@ -259,8 +274,8 @@ interface EvaluateMeta {
      * for this specific test-case run.
      */
     runId: string;
-    /** Identifier of the dataset currently being evaluated. */
-    datasetId: string;
+    /** Display label for the dataset (`Dataset.getDisplayLabel()`, i.e. `displayName ?? name`). */
+    datasetName: string;
     /** Canonical `RunConfig` name (or `programmatic` for API/TUI-only runs). */
     runConfigName: string;
     /**
@@ -384,12 +399,16 @@ declare const RunConfigNameSchema: Schema.brand<Schema.filter<Schema.filter<Sche
 declare const EvaluatorNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "EvaluatorName">;
 /** Branded id for `TestCase.describe({ name })` (decode with {@link TestCaseNameSchema}). */
 declare const TestCaseNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "TestCaseName">;
+/** Branded id for `Dataset.define({ name })` (decode with {@link DatasetNameSchema}). */
+declare const DatasetNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "DatasetName">;
 type RunConfigName = Schema.Schema.Type<typeof RunConfigNameSchema>;
 type EvaluatorName = Schema.Schema.Type<typeof EvaluatorNameSchema>;
 type TestCaseName = Schema.Schema.Type<typeof TestCaseNameSchema>;
+type DatasetName = Schema.Schema.Type<typeof DatasetNameSchema>;
 declare function validateRunConfigName(raw: string, context: string): RunConfigName;
 declare function validateEvaluatorName(raw: string, context: string): EvaluatorName;
 declare function validateTestCaseName(raw: string, context: string): TestCaseName;
+declare function validateDatasetName(raw: string, context: string): DatasetName;
 /** Optional UI label: trim; empty after trim becomes undefined. */
 declare function normalizeOptionalDisplayName(raw: string | undefined): string | undefined;
@@ -660,6 +679,7 @@ interface RunnerApi {
     collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
     collectEvaluators(): Promise<ReadonlyArray<CollectedEvaluator>>;
     collectRunConfigs(): Promise<ReadonlyArray<CollectedRunConfig>>;
+    /** Resolves a dataset by canonical **`Dataset` `name`** (id), case-insensitive. */
     resolveDatasetByName(name: string): Promise<CollectedDataset | undefined>;
     resolveEvaluatorsByNamePattern(pattern: string): Promise<ReadonlyArray<CollectedEvaluator>>;
     /**
@@ -732,4 +752,4 @@ declare class TagSet {
     static define<const T extends readonly string[]>(tags: T): TagSetMembers<T>;
 }
-export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedRunConfig, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorName, EvaluatorNameSchema, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PROGRAMMATIC_RUN_CONFIG, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunConfig, RunConfigDefineConfig, RunConfigName, RunConfigNameSchema, RunConfigRow, RunConfigRowEvaluators, RunConfigRowPattern, RunDatasetJob, RunDatasetJobsWithSharedConcurrencyRequest, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TagSet, TagSetMembers, TestCase, TestCaseName, TestCaseNameSchema, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
+export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedRunConfig, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DatasetDefineConfig, DatasetName, DatasetNameSchema, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorName, EvaluatorNameSchema, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PROGRAMMATIC_RUN_CONFIG, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunConfig, RunConfigDefineConfig, RunConfigName, RunConfigNameSchema, RunConfigRow, RunConfigRowEvaluators, RunConfigRowPattern, RunDatasetJob, RunDatasetJobsWithSharedConcurrencyRequest, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TagSet, TagSetMembers, TestCase, TestCaseName, TestCaseNameSchema, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getDatasetDisplayLabel, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateDatasetName, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };

package/dist/index.js CHANGED Viewed

@@ -26,6 +26,7 @@ function makeEntityIdSchema(brand, label) {
 var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
 var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
 var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
+var DatasetNameSchema = makeEntityIdSchema("DatasetName", "Dataset name");
 function validateWithSchema(schema, raw, context) {
   const trimmed = raw.trim();
   const decode = Schema.decodeUnknownEither(
@@ -46,6 +47,9 @@ function validateEvaluatorName(raw, context) {
 function validateTestCaseName(raw, context) {
   return validateWithSchema(TestCaseNameSchema, raw, context);
 }
+function validateDatasetName(raw, context) {
+  return validateWithSchema(DatasetNameSchema, raw, context);
+}
 function normalizeOptionalDisplayName(raw) {
   if (raw === void 0) {
     return void 0;
@@ -54,6 +58,87 @@ function normalizeOptionalDisplayName(raw) {
   return t.length === 0 ? void 0 : t;
 }
+// src/evals/dataset.ts
+function matchesAny(value, matchers) {
+  return matchers.some(
+    (matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
+  );
+}
+function matchesAnyPath(filePath, matchers) {
+  return matchers.some((matcher) => {
+    if (typeof matcher === "string") {
+      return simpleGlobMatch(matcher, filePath);
+    }
+    return matcher.test(filePath);
+  });
+}
+function simpleGlobMatch(pattern, value) {
+  const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
+  return new RegExp(`^${escaped}$`).test(value);
+}
+var Dataset = class _Dataset {
+  constructor(config) {
+    this._config = config;
+  }
+  static define(config) {
+    const name = validateDatasetName(config.name, "Dataset.define");
+    const displayName = normalizeOptionalDisplayName(config.displayName);
+    return new _Dataset({
+      name,
+      displayName,
+      includedTags: config.includedTags ?? [],
+      excludedTags: config.excludedTags ?? [],
+      includedPaths: config.includedPaths ?? [],
+      excludedPaths: config.excludedPaths ?? []
+    });
+  }
+  /** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
+  getName() {
+    return this._config.name;
+  }
+  getDisplayName() {
+    return this._config.displayName;
+  }
+  /** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
+  getDisplayLabel() {
+    return this._config.displayName ?? this._config.name;
+  }
+  getIncludedTags() {
+    return this._config.includedTags;
+  }
+  getExcludedTags() {
+    return this._config.excludedTags;
+  }
+  getIncludedPaths() {
+    return this._config.includedPaths;
+  }
+  getExcludedPaths() {
+    return this._config.excludedPaths;
+  }
+  matchesTestCase(testCase, filePath) {
+    const tags = testCase.getTags();
+    if (this._config.excludedTags.length > 0) {
+      if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
+        return false;
+      }
+    }
+    if (this._config.excludedPaths.length > 0) {
+      if (matchesAnyPath(filePath, this._config.excludedPaths)) {
+        return false;
+      }
+    }
+    const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
+    const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
+    return tagMatch && pathMatch;
+  }
+};
+function getDatasetDisplayLabel(dataset) {
+  if (typeof dataset.getDisplayLabel === "function") {
+    return dataset.getDisplayLabel();
+  }
+  return typeof dataset.getName === "function" ? dataset.getName() : "";
+}
 // src/evals/evaluator.ts
 var Evaluator = class _Evaluator {
   constructor(config) {
@@ -413,7 +498,7 @@ function toEvalDataset(item, snapshots) {
   const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
   return {
     id: item.id,
-    name: item.dataset.getName(),
+    name: getDatasetDisplayLabel(item.dataset),
     overview: `Discovered from ${item.filePath}`,
     runs
   };
@@ -466,70 +551,6 @@ function parseStartupArgs(argv) {
   }
   return args;
 }
-// src/evals/dataset.ts
-function matchesAny(value, matchers) {
-  return matchers.some(
-    (matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
-  );
-}
-function matchesAnyPath(filePath, matchers) {
-  return matchers.some((matcher) => {
-    if (typeof matcher === "string") {
-      return simpleGlobMatch(matcher, filePath);
-    }
-    return matcher.test(filePath);
-  });
-}
-function simpleGlobMatch(pattern, value) {
-  const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
-  return new RegExp(`^${escaped}$`).test(value);
-}
-var Dataset = class _Dataset {
-  constructor(config) {
-    this._config = config;
-  }
-  static define(config) {
-    return new _Dataset({
-      name: config.name,
-      includedTags: config.includedTags ?? [],
-      excludedTags: config.excludedTags ?? [],
-      includedPaths: config.includedPaths ?? [],
-      excludedPaths: config.excludedPaths ?? []
-    });
-  }
-  getName() {
-    return this._config.name;
-  }
-  getIncludedTags() {
-    return this._config.includedTags;
-  }
-  getExcludedTags() {
-    return this._config.excludedTags;
-  }
-  getIncludedPaths() {
-    return this._config.includedPaths;
-  }
-  getExcludedPaths() {
-    return this._config.excludedPaths;
-  }
-  matchesTestCase(testCase, filePath) {
-    const tags = testCase.getTags();
-    if (this._config.excludedTags.length > 0) {
-      if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
-        return false;
-      }
-    }
-    if (this._config.excludedPaths.length > 0) {
-      if (matchesAnyPath(filePath, this._config.excludedPaths)) {
-        return false;
-      }
-    }
-    const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
-    const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
-    return tagMatch && pathMatch;
-  }
-};
 function preprocessForDiff(value, options) {
   if (options?.sort && Array.isArray(value)) {
     return [...value].sort((a, b) => {
@@ -1545,7 +1566,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
               meta: {
                 triggerId: task.triggerId,
                 runId: evaluatorRunId,
-                datasetId: task.datasetId,
+                datasetName: task.dataset.getDisplayLabel(),
                 repetitionId,
                 repetitionIndex,
                 repetitionCount,
@@ -1960,7 +1981,7 @@ var EffectRunner = class {
       );
       if (!dsCollected) {
         throw new Error(
-          `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
+          `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
         );
       }
       let evaluatorIds;
@@ -2095,7 +2116,7 @@ var EffectRunner = class {
     const snapshot = {
       runId,
       datasetId: params.datasetId,
-      datasetName: dataset.dataset.getName(),
+      datasetName: dataset.dataset.getDisplayLabel(),
       evaluatorIds: selectedEvaluators.map((item) => item.id),
       queuedAt: Date.now(),
       totalTestCases: totalEvaluations,
@@ -2116,7 +2137,7 @@ var EffectRunner = class {
       type: "RunQueued",
       runId,
       datasetId: params.datasetId,
-      datasetName: dataset.dataset.getName(),
+      datasetName: dataset.dataset.getDisplayLabel(),
       evaluatorIds: selectedEvaluators.map((item) => item.id),
       totalTestCases: totalEvaluations,
       artifactPath
@@ -2219,6 +2240,6 @@ var PROGRAMMATIC_RUN_CONFIG = {
   runConfigName: "programmatic"
 };
-export { Dataset, Evaluator, EvaluatorNameSchema, Metric, PROGRAMMATIC_RUN_CONFIG, RunConfig, RunConfigNameSchema, Score, TagSet, TestCase, TestCaseNameSchema, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
+export { Dataset, DatasetNameSchema, Evaluator, EvaluatorNameSchema, Metric, PROGRAMMATIC_RUN_CONFIG, RunConfig, RunConfigNameSchema, Score, TagSet, TestCase, TestCaseNameSchema, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getDatasetDisplayLabel, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateDatasetName, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
 //# sourceMappingURL=out.js.map
 //# sourceMappingURL=index.js.map