npm - @m4trix/evals - Versions diffs - 0.26.0 → 0.28.0 - Mend

@m4trix/evals 0.26.0 → 0.28.0

This diff shows the differences in content between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (15)

package/dist/index.cjs CHANGED Viewed

@@ -51,6 +51,7 @@ function makeEntityIdSchema(brand, label) {
 var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
 var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
 var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
+var DatasetNameSchema = makeEntityIdSchema("DatasetName", "Dataset name");
 function validateWithSchema(schema, raw, context) {
   const trimmed = raw.trim();
   const decode = effect.Schema.decodeUnknownEither(
@@ -71,6 +72,9 @@ function validateEvaluatorName(raw, context) {
 function validateTestCaseName(raw, context) {
   return validateWithSchema(TestCaseNameSchema, raw, context);
 }
+function validateDatasetName(raw, context) {
+  return validateWithSchema(DatasetNameSchema, raw, context);
+}
 function normalizeOptionalDisplayName(raw) {
   if (raw === void 0) {
     return void 0;
@@ -79,6 +83,87 @@ function normalizeOptionalDisplayName(raw) {
   return t.length === 0 ? void 0 : t;
 }
+// src/evals/dataset.ts
+function matchesAny(value, matchers) {
+  return matchers.some(
+    (matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
+  );
+}
+function matchesAnyPath(filePath, matchers) {
+  return matchers.some((matcher) => {
+    if (typeof matcher === "string") {
+      return simpleGlobMatch(matcher, filePath);
+    }
+    return matcher.test(filePath);
+  });
+}
+function simpleGlobMatch(pattern, value) {
+  const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
+  return new RegExp(`^${escaped}$`).test(value);
+}
+var Dataset = class _Dataset {
+  constructor(config) {
+    this._config = config;
+  }
+  static define(config) {
+    const name = validateDatasetName(config.name, "Dataset.define");
+    const displayName = normalizeOptionalDisplayName(config.displayName);
+    return new _Dataset({
+      name,
+      displayName,
+      includedTags: config.includedTags ?? [],
+      excludedTags: config.excludedTags ?? [],
+      includedPaths: config.includedPaths ?? [],
+      excludedPaths: config.excludedPaths ?? []
+    });
+  }
+  /** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
+  getName() {
+    return this._config.name;
+  }
+  getDisplayName() {
+    return this._config.displayName;
+  }
+  /** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
+  getDisplayLabel() {
+    return this._config.displayName ?? this._config.name;
+  }
+  getIncludedTags() {
+    return this._config.includedTags;
+  }
+  getExcludedTags() {
+    return this._config.excludedTags;
+  }
+  getIncludedPaths() {
+    return this._config.includedPaths;
+  }
+  getExcludedPaths() {
+    return this._config.excludedPaths;
+  }
+  matchesTestCase(testCase, filePath) {
+    const tags = testCase.getTags();
+    if (this._config.excludedTags.length > 0) {
+      if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
+        return false;
+      }
+    }
+    if (this._config.excludedPaths.length > 0) {
+      if (matchesAnyPath(filePath, this._config.excludedPaths)) {
+        return false;
+      }
+    }
+    const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
+    const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
+    return tagMatch && pathMatch;
+  }
+};
+function getDatasetDisplayLabel(dataset) {
+  if (typeof dataset.getDisplayLabel === "function") {
+    return dataset.getDisplayLabel();
+  }
+  return typeof dataset.getName === "function" ? dataset.getName() : "";
+}
 // src/evals/evaluator.ts
 var Evaluator = class _Evaluator {
   constructor(config) {
@@ -438,7 +523,7 @@ function toEvalDataset(item, snapshots) {
   const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
   return {
     id: item.id,
-    name: item.dataset.getName(),
+    name: getDatasetDisplayLabel(item.dataset),
     overview: `Discovered from ${item.filePath}`,
     runs
   };
@@ -491,70 +576,6 @@ function parseStartupArgs(argv) {
   }
   return args;
 }
-// src/evals/dataset.ts
-function matchesAny(value, matchers) {
-  return matchers.some(
-    (matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
-  );
-}
-function matchesAnyPath(filePath, matchers) {
-  return matchers.some((matcher) => {
-    if (typeof matcher === "string") {
-      return simpleGlobMatch(matcher, filePath);
-    }
-    return matcher.test(filePath);
-  });
-}
-function simpleGlobMatch(pattern, value) {
-  const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
-  return new RegExp(`^${escaped}$`).test(value);
-}
-var Dataset = class _Dataset {
-  constructor(config) {
-    this._config = config;
-  }
-  static define(config) {
-    return new _Dataset({
-      name: config.name,
-      includedTags: config.includedTags ?? [],
-      excludedTags: config.excludedTags ?? [],
-      includedPaths: config.includedPaths ?? [],
-      excludedPaths: config.excludedPaths ?? []
-    });
-  }
-  getName() {
-    return this._config.name;
-  }
-  getIncludedTags() {
-    return this._config.includedTags;
-  }
-  getExcludedTags() {
-    return this._config.excludedTags;
-  }
-  getIncludedPaths() {
-    return this._config.includedPaths;
-  }
-  getExcludedPaths() {
-    return this._config.excludedPaths;
-  }
-  matchesTestCase(testCase, filePath) {
-    const tags = testCase.getTags();
-    if (this._config.excludedTags.length > 0) {
-      if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
-        return false;
-      }
-    }
-    if (this._config.excludedPaths.length > 0) {
-      if (matchesAnyPath(filePath, this._config.excludedPaths)) {
-        return false;
-      }
-    }
-    const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
-    const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
-    return tagMatch && pathMatch;
-  }
-};
 function preprocessForDiff(value, options) {
   if (options?.sort && Array.isArray(value)) {
     return [...value].sort((a, b) => {
@@ -820,7 +841,7 @@ var RunConfig = class _RunConfig {
   getDisplayLabel() {
     return this._displayName ?? this._name;
   }
-  /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
+  /** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
   getTags() {
     return [...this._tags];
   }
@@ -993,10 +1014,11 @@ var TestCase = class _TestCase {
   static describe(config) {
     const name = validateTestCaseName(config.name, "TestCase.describe");
     const displayName = normalizeOptionalDisplayName(config.displayName);
+    const tags = config.tags !== void 0 ? [...config.tags] : [];
     return new _TestCase({
       name,
       displayName,
-      tags: config.tags,
+      tags,
       inputSchema: config.inputSchema,
       input: config.input,
       outputSchema: config.outputSchema,
@@ -1013,7 +1035,7 @@ var TestCase = class _TestCase {
     return this._config.displayName ?? this._config.name;
   }
   getTags() {
-    return this._config.tags;
+    return [...this._config.tags];
   }
   getInputSchema() {
     return this._config.inputSchema;
@@ -1570,15 +1592,16 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
               meta: {
                 triggerId: task.triggerId,
                 runId: evaluatorRunId,
-                datasetId: task.datasetId,
+                datasetName: task.dataset.getDisplayLabel(),
                 repetitionId,
                 repetitionIndex,
                 repetitionCount,
-                runConfigName: task.runConfigName
+                runConfigName: task.runConfigName,
+                ...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
+                testCaseTags: getTestCaseTagList(testCaseItem.testCase),
+                runConfigTags: task.runConfigTags,
+                evaluatorTags: getEvaluatorTagList(evaluator)
               },
-              testCaseTags: getTestCaseTagList(testCaseItem.testCase),
-              runConfigTags: task.runConfigTags,
-              evaluatorTags: getEvaluatorTagList(evaluator),
               logDiff,
               log,
               createError
@@ -1985,7 +2008,7 @@ var EffectRunner = class {
       );
       if (!dsCollected) {
         throw new Error(
-          `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
+          `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
         );
       }
       let evaluatorIds;
@@ -2057,7 +2080,8 @@ var EffectRunner = class {
           globalEvaluationSemaphore: sem,
           runConfigName: job.runConfigName,
           runConfigTags: job.runConfigTags,
-          repetitions: job.repetitions
+          repetitions: job.repetitions,
+          experimentName: request.experimentName
         })
       );
     }
@@ -2092,7 +2116,8 @@ var EffectRunner = class {
       maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
       repetitions: request.repetitions,
       runConfigName,
-      runConfigTags: request.runConfigTags
+      runConfigTags: request.runConfigTags,
+      experimentName: request.experimentName
     });
   }
   async startDatasetRun(params) {
@@ -2120,7 +2145,7 @@ var EffectRunner = class {
     const snapshot = {
       runId,
       datasetId: params.datasetId,
-      datasetName: dataset.dataset.getName(),
+      datasetName: dataset.dataset.getDisplayLabel(),
       evaluatorIds: selectedEvaluators.map((item) => item.id),
       queuedAt: Date.now(),
       totalTestCases: totalEvaluations,
@@ -2141,7 +2166,7 @@ var EffectRunner = class {
       type: "RunQueued",
       runId,
       datasetId: params.datasetId,
-      datasetName: dataset.dataset.getName(),
+      datasetName: dataset.dataset.getDisplayLabel(),
       evaluatorIds: selectedEvaluators.map((item) => item.id),
       totalTestCases: totalEvaluations,
       artifactPath
@@ -2167,7 +2192,8 @@ var EffectRunner = class {
         globalEvaluationSemaphore: params.globalEvaluationSemaphore,
         runConfigName: params.runConfigName,
         runConfigTags,
-        repetitions
+        repetitions,
+        experimentName: params.experimentName
       })
     );
     return snapshot;
@@ -2249,6 +2275,7 @@ Object.defineProperty(exports, 'S', {
   get: function () { return effect.Schema; }
 });
 exports.Dataset = Dataset;
+exports.DatasetNameSchema = DatasetNameSchema;
 exports.Evaluator = Evaluator;
 exports.EvaluatorNameSchema = EvaluatorNameSchema;
 exports.Metric = Metric;
@@ -2266,6 +2293,7 @@ exports.defaultRunnerConfig = defaultRunnerConfig;
 exports.defineConfig = defineConfig;
 exports.deltaScore = deltaScore;
 exports.formatScoreData = formatScoreData;
+exports.getDatasetDisplayLabel = getDatasetDisplayLabel;
 exports.getEvaluatorDisplayLabel = getEvaluatorDisplayLabel;
 exports.getEvaluatorTagList = getEvaluatorTagList;
 exports.getLogLines = getLogLines;
@@ -2281,6 +2309,7 @@ exports.parseStartupArgs = parseStartupArgs;
 exports.percentScore = percentScore;
 exports.printJsonDiff = printJsonDiff;
 exports.tokenCountMetric = tokenCountMetric;
+exports.validateDatasetName = validateDatasetName;
 exports.validateEvaluatorName = validateEvaluatorName;
 exports.validateRunConfigName = validateRunConfigName;
 exports.validateTestCaseName = validateTestCaseName;