@m4trix/evals 0.26.0 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -123,19 +123,14 @@ declare function defineConfig<TConfig extends ConfigType>(factory: M4trixEvalCon
123
123
  declare const defaultRunnerConfig: RunnerConfig;
124
124
  declare function withRunnerConfig(overrides?: RunnerConfigOverrides): RunnerConfig;
125
125
 
126
- /** Matches a tag by exact string equality or regex test */
127
- type TagMatcher = string | RegExp;
128
- /** Matches a file path by glob string or regex test */
129
- type PathMatcher = string | RegExp;
130
-
131
126
  type InputOrBuilder<T> = T | (() => T);
132
127
  interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
133
128
  /**
134
- * Stable id (letters, digits, `_`, `-`).
129
+ * Stable id (letters, digits, `_`, `-`); used in discovery and matching.
135
130
  * For an unrestricted UI label, set {@link displayName}.
136
131
  */
137
132
  name: string;
138
- /** Optional human-readable label for CLI/TUI (any characters). */
133
+ /** Optional human-readable label for CLI/TUI and evaluator args (any characters). */
139
134
  displayName?: string;
140
135
  tags: string[];
141
136
  inputSchema: TI;
@@ -166,8 +161,19 @@ declare function getTestCaseTagList(testCase: {
166
161
  getTags?: () => ReadonlyArray<string>;
167
162
  }): string[];
168
163
 
164
+ /** Matches a tag by exact string equality or regex test */
165
+ type TagMatcher = string | RegExp;
166
+ /** Matches a file path by glob string or regex test */
167
+ type PathMatcher = string | RegExp;
168
+
169
169
  interface DatasetDefineConfig {
170
+ /**
171
+ * Stable id (letters, digits, `_`, `-`); used for discovery ids and `resolveDatasetByName`.
172
+ * For an unrestricted UI label, set {@link displayName}.
173
+ */
170
174
  name: string;
175
+ /** Optional human-readable label for CLI/TUI (any characters). */
176
+ displayName?: string;
171
177
  includedTags?: TagMatcher[];
172
178
  excludedTags?: TagMatcher[];
173
179
  includedPaths?: PathMatcher[];
@@ -177,13 +183,22 @@ declare class Dataset {
177
183
  private readonly _config;
178
184
  private constructor();
179
185
  static define(config: DatasetDefineConfig): Dataset;
186
+ /** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
180
187
  getName(): string;
188
+ getDisplayName(): string | undefined;
189
+ /** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
190
+ getDisplayLabel(): string;
181
191
  getIncludedTags(): ReadonlyArray<TagMatcher>;
182
192
  getExcludedTags(): ReadonlyArray<TagMatcher>;
183
193
  getIncludedPaths(): ReadonlyArray<PathMatcher>;
184
194
  getExcludedPaths(): ReadonlyArray<PathMatcher>;
185
195
  matchesTestCase(testCase: TestCase<unknown>, filePath: string): boolean;
186
196
  }
197
+ /** CLI / runner: display label for a dataset-shaped object (supports discovery duck-types). */
198
+ declare function getDatasetDisplayLabel(dataset: {
199
+ getDisplayLabel?: () => string;
200
+ getName?: () => string;
201
+ }): string;
187
202
 
188
203
  /**
189
204
  * Options for customizing JSON diff output. Passed to logDiff, createDiffLogEntry, and printJsonDiff.
@@ -259,8 +274,8 @@ interface EvaluateMeta {
259
274
  * for this specific test-case run.
260
275
  */
261
276
  runId: string;
262
- /** Identifier of the dataset currently being evaluated. */
263
- datasetId: string;
277
+ /** Display label for the dataset (`Dataset.getDisplayLabel()`, i.e. `displayName ?? name`). */
278
+ datasetName: string;
264
279
  /** Canonical `RunConfig` name (or `programmatic` for API/TUI-only runs). */
265
280
  runConfigName: string;
266
281
  /**
@@ -384,12 +399,16 @@ declare const RunConfigNameSchema: Schema.brand<Schema.filter<Schema.filter<Sche
384
399
  declare const EvaluatorNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "EvaluatorName">;
385
400
  /** Branded id for `TestCase.describe({ name })` (decode with {@link TestCaseNameSchema}). */
386
401
  declare const TestCaseNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "TestCaseName">;
402
+ /** Branded id for `Dataset.define({ name })` (decode with {@link DatasetNameSchema}). */
403
+ declare const DatasetNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "DatasetName">;
387
404
  type RunConfigName = Schema.Schema.Type<typeof RunConfigNameSchema>;
388
405
  type EvaluatorName = Schema.Schema.Type<typeof EvaluatorNameSchema>;
389
406
  type TestCaseName = Schema.Schema.Type<typeof TestCaseNameSchema>;
407
+ type DatasetName = Schema.Schema.Type<typeof DatasetNameSchema>;
390
408
  declare function validateRunConfigName(raw: string, context: string): RunConfigName;
391
409
  declare function validateEvaluatorName(raw: string, context: string): EvaluatorName;
392
410
  declare function validateTestCaseName(raw: string, context: string): TestCaseName;
411
+ declare function validateDatasetName(raw: string, context: string): DatasetName;
393
412
  /** Optional UI label: trim; empty after trim becomes undefined. */
394
413
  declare function normalizeOptionalDisplayName(raw: string | undefined): string | undefined;
395
414
 
@@ -660,6 +679,7 @@ interface RunnerApi {
660
679
  collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
661
680
  collectEvaluators(): Promise<ReadonlyArray<CollectedEvaluator>>;
662
681
  collectRunConfigs(): Promise<ReadonlyArray<CollectedRunConfig>>;
682
+ /** Resolves a dataset by canonical **`Dataset` `name`** (id), case-insensitive. */
663
683
  resolveDatasetByName(name: string): Promise<CollectedDataset | undefined>;
664
684
  resolveEvaluatorsByNamePattern(pattern: string): Promise<ReadonlyArray<CollectedEvaluator>>;
665
685
  /**
@@ -732,4 +752,4 @@ declare class TagSet {
732
752
  static define<const T extends readonly string[]>(tags: T): TagSetMembers<T>;
733
753
  }
734
754
 
735
- export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedRunConfig, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorName, EvaluatorNameSchema, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PROGRAMMATIC_RUN_CONFIG, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunConfig, RunConfigDefineConfig, RunConfigName, RunConfigNameSchema, RunConfigRow, RunConfigRowEvaluators, RunConfigRowPattern, RunDatasetJob, RunDatasetJobsWithSharedConcurrencyRequest, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TagSet, TagSetMembers, TestCase, TestCaseName, TestCaseNameSchema, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
755
+ export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedRunConfig, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DatasetDefineConfig, DatasetName, DatasetNameSchema, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorName, EvaluatorNameSchema, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PROGRAMMATIC_RUN_CONFIG, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunConfig, RunConfigDefineConfig, RunConfigName, RunConfigNameSchema, RunConfigRow, RunConfigRowEvaluators, RunConfigRowPattern, RunDatasetJob, RunDatasetJobsWithSharedConcurrencyRequest, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TagSet, TagSetMembers, TestCase, TestCaseName, TestCaseNameSchema, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getDatasetDisplayLabel, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateDatasetName, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
package/dist/index.js CHANGED
@@ -26,6 +26,7 @@ function makeEntityIdSchema(brand, label) {
26
26
  var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
27
27
  var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
28
28
  var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
29
+ var DatasetNameSchema = makeEntityIdSchema("DatasetName", "Dataset name");
29
30
  function validateWithSchema(schema, raw, context) {
30
31
  const trimmed = raw.trim();
31
32
  const decode = Schema.decodeUnknownEither(
@@ -46,6 +47,9 @@ function validateEvaluatorName(raw, context) {
46
47
  function validateTestCaseName(raw, context) {
47
48
  return validateWithSchema(TestCaseNameSchema, raw, context);
48
49
  }
50
+ function validateDatasetName(raw, context) {
51
+ return validateWithSchema(DatasetNameSchema, raw, context);
52
+ }
49
53
  function normalizeOptionalDisplayName(raw) {
50
54
  if (raw === void 0) {
51
55
  return void 0;
@@ -54,6 +58,87 @@ function normalizeOptionalDisplayName(raw) {
54
58
  return t.length === 0 ? void 0 : t;
55
59
  }
56
60
 
61
+ // src/evals/dataset.ts
62
+ function matchesAny(value, matchers) {
63
+ return matchers.some(
64
+ (matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
65
+ );
66
+ }
67
+ function matchesAnyPath(filePath, matchers) {
68
+ return matchers.some((matcher) => {
69
+ if (typeof matcher === "string") {
70
+ return simpleGlobMatch(matcher, filePath);
71
+ }
72
+ return matcher.test(filePath);
73
+ });
74
+ }
75
+ function simpleGlobMatch(pattern, value) {
76
+ const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
77
+ return new RegExp(`^${escaped}$`).test(value);
78
+ }
79
+ var Dataset = class _Dataset {
80
+ constructor(config) {
81
+ this._config = config;
82
+ }
83
+ static define(config) {
84
+ const name = validateDatasetName(config.name, "Dataset.define");
85
+ const displayName = normalizeOptionalDisplayName(config.displayName);
86
+ return new _Dataset({
87
+ name,
88
+ displayName,
89
+ includedTags: config.includedTags ?? [],
90
+ excludedTags: config.excludedTags ?? [],
91
+ includedPaths: config.includedPaths ?? [],
92
+ excludedPaths: config.excludedPaths ?? []
93
+ });
94
+ }
95
+ /** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
96
+ getName() {
97
+ return this._config.name;
98
+ }
99
+ getDisplayName() {
100
+ return this._config.displayName;
101
+ }
102
+ /** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
103
+ getDisplayLabel() {
104
+ return this._config.displayName ?? this._config.name;
105
+ }
106
+ getIncludedTags() {
107
+ return this._config.includedTags;
108
+ }
109
+ getExcludedTags() {
110
+ return this._config.excludedTags;
111
+ }
112
+ getIncludedPaths() {
113
+ return this._config.includedPaths;
114
+ }
115
+ getExcludedPaths() {
116
+ return this._config.excludedPaths;
117
+ }
118
+ matchesTestCase(testCase, filePath) {
119
+ const tags = testCase.getTags();
120
+ if (this._config.excludedTags.length > 0) {
121
+ if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
122
+ return false;
123
+ }
124
+ }
125
+ if (this._config.excludedPaths.length > 0) {
126
+ if (matchesAnyPath(filePath, this._config.excludedPaths)) {
127
+ return false;
128
+ }
129
+ }
130
+ const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
131
+ const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
132
+ return tagMatch && pathMatch;
133
+ }
134
+ };
135
+ function getDatasetDisplayLabel(dataset) {
136
+ if (typeof dataset.getDisplayLabel === "function") {
137
+ return dataset.getDisplayLabel();
138
+ }
139
+ return typeof dataset.getName === "function" ? dataset.getName() : "";
140
+ }
141
+
57
142
  // src/evals/evaluator.ts
58
143
  var Evaluator = class _Evaluator {
59
144
  constructor(config) {
@@ -413,7 +498,7 @@ function toEvalDataset(item, snapshots) {
413
498
  const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
414
499
  return {
415
500
  id: item.id,
416
- name: item.dataset.getName(),
501
+ name: getDatasetDisplayLabel(item.dataset),
417
502
  overview: `Discovered from ${item.filePath}`,
418
503
  runs
419
504
  };
@@ -466,70 +551,6 @@ function parseStartupArgs(argv) {
466
551
  }
467
552
  return args;
468
553
  }
469
-
470
- // src/evals/dataset.ts
471
- function matchesAny(value, matchers) {
472
- return matchers.some(
473
- (matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
474
- );
475
- }
476
- function matchesAnyPath(filePath, matchers) {
477
- return matchers.some((matcher) => {
478
- if (typeof matcher === "string") {
479
- return simpleGlobMatch(matcher, filePath);
480
- }
481
- return matcher.test(filePath);
482
- });
483
- }
484
- function simpleGlobMatch(pattern, value) {
485
- const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
486
- return new RegExp(`^${escaped}$`).test(value);
487
- }
488
- var Dataset = class _Dataset {
489
- constructor(config) {
490
- this._config = config;
491
- }
492
- static define(config) {
493
- return new _Dataset({
494
- name: config.name,
495
- includedTags: config.includedTags ?? [],
496
- excludedTags: config.excludedTags ?? [],
497
- includedPaths: config.includedPaths ?? [],
498
- excludedPaths: config.excludedPaths ?? []
499
- });
500
- }
501
- getName() {
502
- return this._config.name;
503
- }
504
- getIncludedTags() {
505
- return this._config.includedTags;
506
- }
507
- getExcludedTags() {
508
- return this._config.excludedTags;
509
- }
510
- getIncludedPaths() {
511
- return this._config.includedPaths;
512
- }
513
- getExcludedPaths() {
514
- return this._config.excludedPaths;
515
- }
516
- matchesTestCase(testCase, filePath) {
517
- const tags = testCase.getTags();
518
- if (this._config.excludedTags.length > 0) {
519
- if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
520
- return false;
521
- }
522
- }
523
- if (this._config.excludedPaths.length > 0) {
524
- if (matchesAnyPath(filePath, this._config.excludedPaths)) {
525
- return false;
526
- }
527
- }
528
- const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
529
- const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
530
- return tagMatch && pathMatch;
531
- }
532
- };
533
554
  function preprocessForDiff(value, options) {
534
555
  if (options?.sort && Array.isArray(value)) {
535
556
  return [...value].sort((a, b) => {
@@ -1545,7 +1566,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1545
1566
  meta: {
1546
1567
  triggerId: task.triggerId,
1547
1568
  runId: evaluatorRunId,
1548
- datasetId: task.datasetId,
1569
+ datasetName: task.dataset.getDisplayLabel(),
1549
1570
  repetitionId,
1550
1571
  repetitionIndex,
1551
1572
  repetitionCount,
@@ -1960,7 +1981,7 @@ var EffectRunner = class {
1960
1981
  );
1961
1982
  if (!dsCollected) {
1962
1983
  throw new Error(
1963
- `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1984
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1964
1985
  );
1965
1986
  }
1966
1987
  let evaluatorIds;
@@ -2095,7 +2116,7 @@ var EffectRunner = class {
2095
2116
  const snapshot = {
2096
2117
  runId,
2097
2118
  datasetId: params.datasetId,
2098
- datasetName: dataset.dataset.getName(),
2119
+ datasetName: dataset.dataset.getDisplayLabel(),
2099
2120
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2100
2121
  queuedAt: Date.now(),
2101
2122
  totalTestCases: totalEvaluations,
@@ -2116,7 +2137,7 @@ var EffectRunner = class {
2116
2137
  type: "RunQueued",
2117
2138
  runId,
2118
2139
  datasetId: params.datasetId,
2119
- datasetName: dataset.dataset.getName(),
2140
+ datasetName: dataset.dataset.getDisplayLabel(),
2120
2141
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2121
2142
  totalTestCases: totalEvaluations,
2122
2143
  artifactPath
@@ -2219,6 +2240,6 @@ var PROGRAMMATIC_RUN_CONFIG = {
2219
2240
  runConfigName: "programmatic"
2220
2241
  };
2221
2242
 
2222
- export { Dataset, Evaluator, EvaluatorNameSchema, Metric, PROGRAMMATIC_RUN_CONFIG, RunConfig, RunConfigNameSchema, Score, TagSet, TestCase, TestCaseNameSchema, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
2243
+ export { Dataset, DatasetNameSchema, Evaluator, EvaluatorNameSchema, Metric, PROGRAMMATIC_RUN_CONFIG, RunConfig, RunConfigNameSchema, Score, TagSet, TestCase, TestCaseNameSchema, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getDatasetDisplayLabel, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateDatasetName, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
2223
2244
  //# sourceMappingURL=out.js.map
2224
2245
  //# sourceMappingURL=index.js.map