@m4trix/evals 0.26.0 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -51,6 +51,7 @@ function makeEntityIdSchema(brand, label) {
51
51
  var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
52
52
  var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
53
53
  var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
54
+ var DatasetNameSchema = makeEntityIdSchema("DatasetName", "Dataset name");
54
55
  function validateWithSchema(schema, raw, context) {
55
56
  const trimmed = raw.trim();
56
57
  const decode = effect.Schema.decodeUnknownEither(
@@ -71,6 +72,9 @@ function validateEvaluatorName(raw, context) {
71
72
  function validateTestCaseName(raw, context) {
72
73
  return validateWithSchema(TestCaseNameSchema, raw, context);
73
74
  }
75
/**
 * Validate a raw dataset name against `DatasetNameSchema`.
 *
 * Thin wrapper: all work is delegated to `validateWithSchema`, and its
 * result is returned unchanged.
 *
 * @param raw - Candidate dataset name as supplied by the caller.
 * @param context - Caller label forwarded to `validateWithSchema`
 *   (e.g. "Dataset.define").
 * @returns Whatever `validateWithSchema` returns for this schema.
 */
function validateDatasetName(raw, context) {
  const validated = validateWithSchema(DatasetNameSchema, raw, context);
  return validated;
}
74
78
  function normalizeOptionalDisplayName(raw) {
75
79
  if (raw === void 0) {
76
80
  return void 0;
@@ -79,6 +83,87 @@ function normalizeOptionalDisplayName(raw) {
79
83
  return t.length === 0 ? void 0 : t;
80
84
  }
81
85
 
86
+ // src/evals/dataset.ts
87
/**
 * Report whether `value` satisfies at least one matcher.
 *
 * String matchers must equal `value` exactly; any other matcher is invoked
 * through its `test(value)` method (in practice a RegExp).
 *
 * @param value - The string to check (e.g. a test-case tag).
 * @param matchers - Array of strings and/or RegExp-like objects.
 * @returns `true` if any matcher accepts `value`; `false` otherwise,
 *   including when `matchers` is empty.
 */
function matchesAny(value, matchers) {
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return value === matcher;
    }
    // Global/sticky regexes resume from `lastIndex` on every `.test()` call,
    // so a reused matcher would otherwise alternate between hit and miss.
    if (matcher.global || matcher.sticky) {
      matcher.lastIndex = 0;
    }
    return matcher.test(value);
  });
}
92
/**
 * Report whether `filePath` satisfies at least one path matcher.
 *
 * String matchers are treated as glob patterns and evaluated with
 * `simpleGlobMatch`; any other matcher is invoked through its
 * `test(filePath)` method (in practice a RegExp).
 *
 * @param filePath - The path to check.
 * @param matchers - Array of glob strings and/or RegExp-like objects.
 * @returns `true` if any matcher accepts `filePath`; `false` otherwise,
 *   including when `matchers` is empty.
 */
function matchesAnyPath(filePath, matchers) {
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return simpleGlobMatch(matcher, filePath);
    }
    // Global/sticky regexes resume from `lastIndex` on every `.test()` call,
    // so a reused matcher would otherwise alternate between hit and miss.
    if (matcher.global || matcher.sticky) {
      matcher.lastIndex = 0;
    }
    return matcher.test(filePath);
  });
}
100
/**
 * Match `value` against a minimal glob `pattern`.
 *
 * Supported syntax:
 *   - `**` + `/` matches zero or more leading directory segments
 *   - `**`       matches any run of characters, including `/`
 *   - `*`        matches any run of characters except `/`
 *   - `?`        matches exactly one character except `/`
 * Every other character is matched literally. The whole value must match.
 *
 * @param pattern - Glob pattern.
 * @param value - String (typically a file path) to test.
 * @returns `true` when the entire `value` matches `pattern`.
 */
function simpleGlobMatch(pattern, value) {
  // Translate in a single pass. Chaining separate `.replace()` calls lets a
  // later pass rewrite the `*` inside the `.*` emitted for `**`, which
  // silently stops `**` from crossing more than one path segment.
  const source = pattern.replace(/\*\*\/|\*\*|\*|\?|[.+^${}()|[\]\\]/g, (token) => {
    switch (token) {
      case "**/":
        return "(?:.*/)?";
      case "**":
        return ".*";
      case "*":
        return "[^/]*";
      case "?":
        return "[^/]";
      default:
        // Regex metacharacter: escape so it matches literally.
        return `\\${token}`;
    }
  });
  return new RegExp(`^${source}$`).test(value);
}
104
/**
 * A named filter over test cases, selecting them by tag and by file path.
 *
 * Instances are normally created through `Dataset.define`, which validates
 * the name and fills in empty filter lists.
 */
var Dataset = class _Dataset {
  /** Stores the given config object as-is; no validation happens here. */
  constructor(config) {
    this._config = config;
  }

  /**
   * Build a dataset from user-supplied config.
   *
   * The name goes through `validateDatasetName` (context "Dataset.define"),
   * the display name through `normalizeOptionalDisplayName`, and every
   * filter list that is null/undefined becomes an empty array.
   */
  static define(config) {
    const name = validateDatasetName(config.name, "Dataset.define");
    const displayName = normalizeOptionalDisplayName(config.displayName);
    const includedTags = config.includedTags ?? [];
    const excludedTags = config.excludedTags ?? [];
    const includedPaths = config.includedPaths ?? [];
    const excludedPaths = config.excludedPaths ?? [];
    return new _Dataset({
      name,
      displayName,
      includedTags,
      excludedTags,
      includedPaths,
      excludedPaths
    });
  }

  /** Canonical dataset id (validated by the same kind of schema as `RunConfig` / `TestCase` names). */
  getName() {
    return this._config.name;
  }

  /** Optional human-readable name; may be undefined. */
  getDisplayName() {
    return this._config.displayName;
  }

  /** Label for display and for evaluator `meta.datasetName`: the display name when set, else the canonical name. */
  getDisplayLabel() {
    const preferred = this._config.displayName;
    return preferred ?? this._config.name;
  }

  /** Tag matchers a test case must satisfy (empty means no tag restriction). */
  getIncludedTags() {
    return this._config.includedTags;
  }

  /** Tag matchers that reject a test case. */
  getExcludedTags() {
    return this._config.excludedTags;
  }

  /** Path matchers a test case file must satisfy (empty means no path restriction). */
  getIncludedPaths() {
    return this._config.includedPaths;
  }

  /** Path matchers that reject a test case file. */
  getExcludedPaths() {
    return this._config.excludedPaths;
  }

  /**
   * Decide whether a test case belongs to this dataset.
   *
   * Exclusions are checked first and win outright. After that the test case
   * must pass both the tag inclusion filter and the path inclusion filter;
   * an empty inclusion list counts as a pass.
   *
   * @param testCase - Object exposing `getTags()`.
   * @param filePath - Path of the file the test case came from.
   * @returns `true` when the test case is selected by this dataset.
   */
  matchesTestCase(testCase, filePath) {
    const filters = this._config;
    const tags = testCase.getTags();
    const anyTagIn = (matchers) => tags.some((tag) => matchesAny(tag, matchers));

    if (filters.excludedTags.length > 0 && anyTagIn(filters.excludedTags)) {
      return false;
    }
    if (filters.excludedPaths.length > 0 && matchesAnyPath(filePath, filters.excludedPaths)) {
      return false;
    }

    // Both inclusion checks are always evaluated, in this order.
    const tagIncluded = filters.includedTags.length === 0 || anyTagIn(filters.includedTags);
    const pathIncluded = filters.includedPaths.length === 0 || matchesAnyPath(filePath, filters.includedPaths);
    return tagIncluded && pathIncluded;
  }
};
160
/**
 * Resolve a human-facing label for a dataset-like object.
 *
 * Uses duck typing so objects without `getDisplayLabel` still work:
 *   1. `dataset.getDisplayLabel()` when that method exists,
 *   2. otherwise `dataset.getName()` when that method exists,
 *   3. otherwise the empty string.
 *
 * @param dataset - Dataset-like object; `null`/`undefined` are tolerated.
 * @returns The resolved label, or `""` when none can be derived.
 */
function getDatasetDisplayLabel(dataset) {
  // Optional chaining keeps the "no label available" fallback consistent for
  // a missing dataset instead of throwing a TypeError on property access.
  if (typeof dataset?.getDisplayLabel === "function") {
    return dataset.getDisplayLabel();
  }
  if (typeof dataset?.getName === "function") {
    return dataset.getName();
  }
  return "";
}
166
+
82
167
  // src/evals/evaluator.ts
83
168
  var Evaluator = class _Evaluator {
84
169
  constructor(config) {
@@ -438,7 +523,7 @@ function toEvalDataset(item, snapshots) {
438
523
  const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
439
524
  return {
440
525
  id: item.id,
441
- name: item.dataset.getName(),
526
+ name: getDatasetDisplayLabel(item.dataset),
442
527
  overview: `Discovered from ${item.filePath}`,
443
528
  runs
444
529
  };
@@ -491,70 +576,6 @@ function parseStartupArgs(argv) {
491
576
  }
492
577
  return args;
493
578
  }
494
-
495
- // src/evals/dataset.ts
496
- function matchesAny(value, matchers) {
497
- return matchers.some(
498
- (matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
499
- );
500
- }
501
- function matchesAnyPath(filePath, matchers) {
502
- return matchers.some((matcher) => {
503
- if (typeof matcher === "string") {
504
- return simpleGlobMatch(matcher, filePath);
505
- }
506
- return matcher.test(filePath);
507
- });
508
- }
509
- function simpleGlobMatch(pattern, value) {
510
- const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
511
- return new RegExp(`^${escaped}$`).test(value);
512
- }
513
- var Dataset = class _Dataset {
514
- constructor(config) {
515
- this._config = config;
516
- }
517
- static define(config) {
518
- return new _Dataset({
519
- name: config.name,
520
- includedTags: config.includedTags ?? [],
521
- excludedTags: config.excludedTags ?? [],
522
- includedPaths: config.includedPaths ?? [],
523
- excludedPaths: config.excludedPaths ?? []
524
- });
525
- }
526
- getName() {
527
- return this._config.name;
528
- }
529
- getIncludedTags() {
530
- return this._config.includedTags;
531
- }
532
- getExcludedTags() {
533
- return this._config.excludedTags;
534
- }
535
- getIncludedPaths() {
536
- return this._config.includedPaths;
537
- }
538
- getExcludedPaths() {
539
- return this._config.excludedPaths;
540
- }
541
- matchesTestCase(testCase, filePath) {
542
- const tags = testCase.getTags();
543
- if (this._config.excludedTags.length > 0) {
544
- if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
545
- return false;
546
- }
547
- }
548
- if (this._config.excludedPaths.length > 0) {
549
- if (matchesAnyPath(filePath, this._config.excludedPaths)) {
550
- return false;
551
- }
552
- }
553
- const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
554
- const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
555
- return tagMatch && pathMatch;
556
- }
557
- };
558
579
  function preprocessForDiff(value, options) {
559
580
  if (options?.sort && Array.isArray(value)) {
560
581
  return [...value].sort((a, b) => {
@@ -1570,7 +1591,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1570
1591
  meta: {
1571
1592
  triggerId: task.triggerId,
1572
1593
  runId: evaluatorRunId,
1573
- datasetId: task.datasetId,
1594
+ datasetName: task.dataset.getDisplayLabel(),
1574
1595
  repetitionId,
1575
1596
  repetitionIndex,
1576
1597
  repetitionCount,
@@ -1985,7 +2006,7 @@ var EffectRunner = class {
1985
2006
  );
1986
2007
  if (!dsCollected) {
1987
2008
  throw new Error(
1988
- `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
2009
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1989
2010
  );
1990
2011
  }
1991
2012
  let evaluatorIds;
@@ -2120,7 +2141,7 @@ var EffectRunner = class {
2120
2141
  const snapshot = {
2121
2142
  runId,
2122
2143
  datasetId: params.datasetId,
2123
- datasetName: dataset.dataset.getName(),
2144
+ datasetName: dataset.dataset.getDisplayLabel(),
2124
2145
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2125
2146
  queuedAt: Date.now(),
2126
2147
  totalTestCases: totalEvaluations,
@@ -2141,7 +2162,7 @@ var EffectRunner = class {
2141
2162
  type: "RunQueued",
2142
2163
  runId,
2143
2164
  datasetId: params.datasetId,
2144
- datasetName: dataset.dataset.getName(),
2165
+ datasetName: dataset.dataset.getDisplayLabel(),
2145
2166
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2146
2167
  totalTestCases: totalEvaluations,
2147
2168
  artifactPath
@@ -2249,6 +2270,7 @@ Object.defineProperty(exports, 'S', {
2249
2270
  get: function () { return effect.Schema; }
2250
2271
  });
2251
2272
  exports.Dataset = Dataset;
2273
+ exports.DatasetNameSchema = DatasetNameSchema;
2252
2274
  exports.Evaluator = Evaluator;
2253
2275
  exports.EvaluatorNameSchema = EvaluatorNameSchema;
2254
2276
  exports.Metric = Metric;
@@ -2266,6 +2288,7 @@ exports.defaultRunnerConfig = defaultRunnerConfig;
2266
2288
  exports.defineConfig = defineConfig;
2267
2289
  exports.deltaScore = deltaScore;
2268
2290
  exports.formatScoreData = formatScoreData;
2291
+ exports.getDatasetDisplayLabel = getDatasetDisplayLabel;
2269
2292
  exports.getEvaluatorDisplayLabel = getEvaluatorDisplayLabel;
2270
2293
  exports.getEvaluatorTagList = getEvaluatorTagList;
2271
2294
  exports.getLogLines = getLogLines;
@@ -2281,6 +2304,7 @@ exports.parseStartupArgs = parseStartupArgs;
2281
2304
  exports.percentScore = percentScore;
2282
2305
  exports.printJsonDiff = printJsonDiff;
2283
2306
  exports.tokenCountMetric = tokenCountMetric;
2307
+ exports.validateDatasetName = validateDatasetName;
2284
2308
  exports.validateEvaluatorName = validateEvaluatorName;
2285
2309
  exports.validateRunConfigName = validateRunConfigName;
2286
2310
  exports.validateTestCaseName = validateTestCaseName;