@m4trix/evals 0.26.0 → 0.28.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -51,6 +51,7 @@ function makeEntityIdSchema(brand, label) {
51
51
  var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
52
52
  var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
53
53
  var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
54
+ var DatasetNameSchema = makeEntityIdSchema("DatasetName", "Dataset name");
54
55
  function validateWithSchema(schema, raw, context) {
55
56
  const trimmed = raw.trim();
56
57
  const decode = effect.Schema.decodeUnknownEither(
@@ -71,6 +72,9 @@ function validateEvaluatorName(raw, context) {
71
72
  function validateTestCaseName(raw, context) {
72
73
  return validateWithSchema(TestCaseNameSchema, raw, context);
73
74
  }
75
+ function validateDatasetName(raw, context) {
76
+ return validateWithSchema(DatasetNameSchema, raw, context);
77
+ }
74
78
  function normalizeOptionalDisplayName(raw) {
75
79
  if (raw === void 0) {
76
80
  return void 0;
@@ -79,6 +83,87 @@ function normalizeOptionalDisplayName(raw) {
79
83
  return t.length === 0 ? void 0 : t;
80
84
  }
81
85
 
86
+ // src/evals/dataset.ts
87
+ function matchesAny(value, matchers) {
88
+ return matchers.some(
89
+ (matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
90
+ );
91
+ }
92
+ function matchesAnyPath(filePath, matchers) {
93
+ return matchers.some((matcher) => {
94
+ if (typeof matcher === "string") {
95
+ return simpleGlobMatch(matcher, filePath);
96
+ }
97
+ return matcher.test(filePath);
98
+ });
99
+ }
100
+ function simpleGlobMatch(pattern, value) {
101
+ const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
102
+ return new RegExp(`^${escaped}$`).test(value);
103
+ }
104
+ var Dataset = class _Dataset {
105
+ constructor(config) {
106
+ this._config = config;
107
+ }
108
+ static define(config) {
109
+ const name = validateDatasetName(config.name, "Dataset.define");
110
+ const displayName = normalizeOptionalDisplayName(config.displayName);
111
+ return new _Dataset({
112
+ name,
113
+ displayName,
114
+ includedTags: config.includedTags ?? [],
115
+ excludedTags: config.excludedTags ?? [],
116
+ includedPaths: config.includedPaths ?? [],
117
+ excludedPaths: config.excludedPaths ?? []
118
+ });
119
+ }
120
+ /** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
121
+ getName() {
122
+ return this._config.name;
123
+ }
124
+ getDisplayName() {
125
+ return this._config.displayName;
126
+ }
127
+ /** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
128
+ getDisplayLabel() {
129
+ return this._config.displayName ?? this._config.name;
130
+ }
131
+ getIncludedTags() {
132
+ return this._config.includedTags;
133
+ }
134
+ getExcludedTags() {
135
+ return this._config.excludedTags;
136
+ }
137
+ getIncludedPaths() {
138
+ return this._config.includedPaths;
139
+ }
140
+ getExcludedPaths() {
141
+ return this._config.excludedPaths;
142
+ }
143
+ matchesTestCase(testCase, filePath) {
144
+ const tags = testCase.getTags();
145
+ if (this._config.excludedTags.length > 0) {
146
+ if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
147
+ return false;
148
+ }
149
+ }
150
+ if (this._config.excludedPaths.length > 0) {
151
+ if (matchesAnyPath(filePath, this._config.excludedPaths)) {
152
+ return false;
153
+ }
154
+ }
155
+ const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
156
+ const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
157
+ return tagMatch && pathMatch;
158
+ }
159
+ };
160
+ function getDatasetDisplayLabel(dataset) {
161
+ if (typeof dataset.getDisplayLabel === "function") {
162
+ return dataset.getDisplayLabel();
163
+ }
164
+ return typeof dataset.getName === "function" ? dataset.getName() : "";
165
+ }
166
+
82
167
  // src/evals/evaluator.ts
83
168
  var Evaluator = class _Evaluator {
84
169
  constructor(config) {
@@ -438,7 +523,7 @@ function toEvalDataset(item, snapshots) {
438
523
  const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
439
524
  return {
440
525
  id: item.id,
441
- name: item.dataset.getName(),
526
+ name: getDatasetDisplayLabel(item.dataset),
442
527
  overview: `Discovered from ${item.filePath}`,
443
528
  runs
444
529
  };
@@ -491,70 +576,6 @@ function parseStartupArgs(argv) {
491
576
  }
492
577
  return args;
493
578
  }
494
-
495
- // src/evals/dataset.ts
496
- function matchesAny(value, matchers) {
497
- return matchers.some(
498
- (matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
499
- );
500
- }
501
- function matchesAnyPath(filePath, matchers) {
502
- return matchers.some((matcher) => {
503
- if (typeof matcher === "string") {
504
- return simpleGlobMatch(matcher, filePath);
505
- }
506
- return matcher.test(filePath);
507
- });
508
- }
509
- function simpleGlobMatch(pattern, value) {
510
- const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
511
- return new RegExp(`^${escaped}$`).test(value);
512
- }
513
- var Dataset = class _Dataset {
514
- constructor(config) {
515
- this._config = config;
516
- }
517
- static define(config) {
518
- return new _Dataset({
519
- name: config.name,
520
- includedTags: config.includedTags ?? [],
521
- excludedTags: config.excludedTags ?? [],
522
- includedPaths: config.includedPaths ?? [],
523
- excludedPaths: config.excludedPaths ?? []
524
- });
525
- }
526
- getName() {
527
- return this._config.name;
528
- }
529
- getIncludedTags() {
530
- return this._config.includedTags;
531
- }
532
- getExcludedTags() {
533
- return this._config.excludedTags;
534
- }
535
- getIncludedPaths() {
536
- return this._config.includedPaths;
537
- }
538
- getExcludedPaths() {
539
- return this._config.excludedPaths;
540
- }
541
- matchesTestCase(testCase, filePath) {
542
- const tags = testCase.getTags();
543
- if (this._config.excludedTags.length > 0) {
544
- if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
545
- return false;
546
- }
547
- }
548
- if (this._config.excludedPaths.length > 0) {
549
- if (matchesAnyPath(filePath, this._config.excludedPaths)) {
550
- return false;
551
- }
552
- }
553
- const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
554
- const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
555
- return tagMatch && pathMatch;
556
- }
557
- };
558
579
  function preprocessForDiff(value, options) {
559
580
  if (options?.sort && Array.isArray(value)) {
560
581
  return [...value].sort((a, b) => {
@@ -820,7 +841,7 @@ var RunConfig = class _RunConfig {
820
841
  getDisplayLabel() {
821
842
  return this._displayName ?? this._name;
822
843
  }
823
- /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
844
+ /** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
824
845
  getTags() {
825
846
  return [...this._tags];
826
847
  }
@@ -993,10 +1014,11 @@ var TestCase = class _TestCase {
993
1014
  static describe(config) {
994
1015
  const name = validateTestCaseName(config.name, "TestCase.describe");
995
1016
  const displayName = normalizeOptionalDisplayName(config.displayName);
1017
+ const tags = config.tags !== void 0 ? [...config.tags] : [];
996
1018
  return new _TestCase({
997
1019
  name,
998
1020
  displayName,
999
- tags: config.tags,
1021
+ tags,
1000
1022
  inputSchema: config.inputSchema,
1001
1023
  input: config.input,
1002
1024
  outputSchema: config.outputSchema,
@@ -1013,7 +1035,7 @@ var TestCase = class _TestCase {
1013
1035
  return this._config.displayName ?? this._config.name;
1014
1036
  }
1015
1037
  getTags() {
1016
- return this._config.tags;
1038
+ return [...this._config.tags];
1017
1039
  }
1018
1040
  getInputSchema() {
1019
1041
  return this._config.inputSchema;
@@ -1570,15 +1592,16 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1570
1592
  meta: {
1571
1593
  triggerId: task.triggerId,
1572
1594
  runId: evaluatorRunId,
1573
- datasetId: task.datasetId,
1595
+ datasetName: task.dataset.getDisplayLabel(),
1574
1596
  repetitionId,
1575
1597
  repetitionIndex,
1576
1598
  repetitionCount,
1577
- runConfigName: task.runConfigName
1599
+ runConfigName: task.runConfigName,
1600
+ ...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
1601
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1602
+ runConfigTags: task.runConfigTags,
1603
+ evaluatorTags: getEvaluatorTagList(evaluator)
1578
1604
  },
1579
- testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1580
- runConfigTags: task.runConfigTags,
1581
- evaluatorTags: getEvaluatorTagList(evaluator),
1582
1605
  logDiff,
1583
1606
  log,
1584
1607
  createError
@@ -1985,7 +2008,7 @@ var EffectRunner = class {
1985
2008
  );
1986
2009
  if (!dsCollected) {
1987
2010
  throw new Error(
1988
- `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
2011
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1989
2012
  );
1990
2013
  }
1991
2014
  let evaluatorIds;
@@ -2057,7 +2080,8 @@ var EffectRunner = class {
2057
2080
  globalEvaluationSemaphore: sem,
2058
2081
  runConfigName: job.runConfigName,
2059
2082
  runConfigTags: job.runConfigTags,
2060
- repetitions: job.repetitions
2083
+ repetitions: job.repetitions,
2084
+ experimentName: request.experimentName
2061
2085
  })
2062
2086
  );
2063
2087
  }
@@ -2092,7 +2116,8 @@ var EffectRunner = class {
2092
2116
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2093
2117
  repetitions: request.repetitions,
2094
2118
  runConfigName,
2095
- runConfigTags: request.runConfigTags
2119
+ runConfigTags: request.runConfigTags,
2120
+ experimentName: request.experimentName
2096
2121
  });
2097
2122
  }
2098
2123
  async startDatasetRun(params) {
@@ -2120,7 +2145,7 @@ var EffectRunner = class {
2120
2145
  const snapshot = {
2121
2146
  runId,
2122
2147
  datasetId: params.datasetId,
2123
- datasetName: dataset.dataset.getName(),
2148
+ datasetName: dataset.dataset.getDisplayLabel(),
2124
2149
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2125
2150
  queuedAt: Date.now(),
2126
2151
  totalTestCases: totalEvaluations,
@@ -2141,7 +2166,7 @@ var EffectRunner = class {
2141
2166
  type: "RunQueued",
2142
2167
  runId,
2143
2168
  datasetId: params.datasetId,
2144
- datasetName: dataset.dataset.getName(),
2169
+ datasetName: dataset.dataset.getDisplayLabel(),
2145
2170
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2146
2171
  totalTestCases: totalEvaluations,
2147
2172
  artifactPath
@@ -2167,7 +2192,8 @@ var EffectRunner = class {
2167
2192
  globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2168
2193
  runConfigName: params.runConfigName,
2169
2194
  runConfigTags,
2170
- repetitions
2195
+ repetitions,
2196
+ experimentName: params.experimentName
2171
2197
  })
2172
2198
  );
2173
2199
  return snapshot;
@@ -2249,6 +2275,7 @@ Object.defineProperty(exports, 'S', {
2249
2275
  get: function () { return effect.Schema; }
2250
2276
  });
2251
2277
  exports.Dataset = Dataset;
2278
+ exports.DatasetNameSchema = DatasetNameSchema;
2252
2279
  exports.Evaluator = Evaluator;
2253
2280
  exports.EvaluatorNameSchema = EvaluatorNameSchema;
2254
2281
  exports.Metric = Metric;
@@ -2266,6 +2293,7 @@ exports.defaultRunnerConfig = defaultRunnerConfig;
2266
2293
  exports.defineConfig = defineConfig;
2267
2294
  exports.deltaScore = deltaScore;
2268
2295
  exports.formatScoreData = formatScoreData;
2296
+ exports.getDatasetDisplayLabel = getDatasetDisplayLabel;
2269
2297
  exports.getEvaluatorDisplayLabel = getEvaluatorDisplayLabel;
2270
2298
  exports.getEvaluatorTagList = getEvaluatorTagList;
2271
2299
  exports.getLogLines = getLogLines;
@@ -2281,6 +2309,7 @@ exports.parseStartupArgs = parseStartupArgs;
2281
2309
  exports.percentScore = percentScore;
2282
2310
  exports.printJsonDiff = printJsonDiff;
2283
2311
  exports.tokenCountMetric = tokenCountMetric;
2312
+ exports.validateDatasetName = validateDatasetName;
2284
2313
  exports.validateEvaluatorName = validateEvaluatorName;
2285
2314
  exports.validateRunConfigName = validateRunConfigName;
2286
2315
  exports.validateTestCaseName = validateTestCaseName;