@m4trix/evals 0.26.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -4
- package/dist/cli-simple.cjs +53 -23
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +53 -23
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +25 -12
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +25 -12
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +108 -79
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +65 -24
- package/dist/index.js +106 -80
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -51,6 +51,7 @@ function makeEntityIdSchema(brand, label) {
|
|
|
51
51
|
// Name schema for run configs, built by makeEntityIdSchema with brand "RunConfigName" and label "RunConfig name".
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
52
52
|
// Name schema for evaluators, built by makeEntityIdSchema with brand "EvaluatorName" and label "Evaluator name".
var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
53
53
|
// Name schema for test cases (brand "TestCaseName", label "Test case name"); validateTestCaseName decodes against it.
var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
|
|
54
|
+
// Name schema for datasets (brand "DatasetName", label "Dataset name"); validateDatasetName decodes against it.
var DatasetNameSchema = makeEntityIdSchema("DatasetName", "Dataset name");
|
|
54
55
|
function validateWithSchema(schema, raw, context) {
|
|
55
56
|
const trimmed = raw.trim();
|
|
56
57
|
const decode = effect.Schema.decodeUnknownEither(
|
|
@@ -71,6 +72,9 @@ function validateEvaluatorName(raw, context) {
|
|
|
71
72
|
/**
 * Validates a raw test case name against `TestCaseNameSchema`.
 *
 * @param raw - Candidate test case name.
 * @param context - Forwarded unchanged to `validateWithSchema`.
 * @returns The result of `validateWithSchema` for the test-case-name schema.
 */
function validateTestCaseName(raw, context) {
  const validated = validateWithSchema(TestCaseNameSchema, raw, context);
  return validated;
}
|
|
75
|
+
/**
 * Validates a raw dataset name against `DatasetNameSchema`.
 *
 * @param raw - Candidate dataset name.
 * @param context - Forwarded unchanged to `validateWithSchema`.
 * @returns The result of `validateWithSchema` for the dataset-name schema.
 */
function validateDatasetName(raw, context) {
  const validated = validateWithSchema(DatasetNameSchema, raw, context);
  return validated;
}
|
|
74
78
|
function normalizeOptionalDisplayName(raw) {
|
|
75
79
|
if (raw === void 0) {
|
|
76
80
|
return void 0;
|
|
@@ -79,6 +83,87 @@ function normalizeOptionalDisplayName(raw) {
|
|
|
79
83
|
return t.length === 0 ? void 0 : t;
|
|
80
84
|
}
|
|
81
85
|
|
|
86
|
+
// src/evals/dataset.ts
|
|
87
|
+
/**
 * Reports whether `value` satisfies at least one matcher.
 *
 * String matchers require strict equality; any other matcher is treated as a
 * RegExp and tested against `value`.
 *
 * @param {string} value - Candidate to check (e.g. a test case tag).
 * @param {Array<string | RegExp>} matchers - Exact strings and/or regular expressions.
 * @returns {boolean} `true` when any matcher accepts `value`; `false` for an empty list.
 */
function matchesAny(value, matchers) {
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return value === matcher;
    }
    // Global/sticky regexes resume from `lastIndex`, so a matcher reused across
    // calls could skip a valid match; rewind it before every test.
    matcher.lastIndex = 0;
    return matcher.test(value);
  });
}
|
|
92
|
+
/**
 * Reports whether `filePath` satisfies at least one path matcher.
 *
 * String matchers are interpreted as glob patterns via `simpleGlobMatch`;
 * any other matcher is treated as a RegExp and tested against `filePath`.
 *
 * @param {string} filePath - Path to check.
 * @param {Array<string | RegExp>} matchers - Glob strings and/or regular expressions.
 * @returns {boolean} `true` when any matcher accepts `filePath`; `false` for an empty list.
 */
function matchesAnyPath(filePath, matchers) {
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return simpleGlobMatch(matcher, filePath);
    }
    // Global/sticky regexes resume from `lastIndex`, so a matcher reused across
    // calls could skip a valid match; rewind it before every test.
    matcher.lastIndex = 0;
    return matcher.test(filePath);
  });
}
|
|
100
|
+
/**
 * Tests `value` against a minimal glob `pattern`, anchored at both ends.
 *
 * Supported tokens:
 * - `**` followed by `/` — zero or more leading path segments
 * - `**`  — any characters, including `/`
 * - `*`   — any run of characters except `/`
 * - `?`   — exactly one character except `/`
 * Every other regex metacharacter is matched literally.
 *
 * @param {string} pattern - Glob pattern.
 * @param {string} value - Path (or other string) to test.
 * @returns {boolean} `true` when the whole of `value` matches `pattern`.
 */
function simpleGlobMatch(pattern, value) {
  // Translate in ONE pass. Chaining separate replaces lets a later pass rewrite
  // the output of an earlier one (the `*` inside the `.*` emitted for `**`),
  // which silently stops `**` from crossing directory separators.
  const source = pattern.replace(/\*\*\/|\*\*|\*|\?|[.+^${}()|[\]\\]/g, (token) => {
    switch (token) {
      case "**/":
        return "(?:.*/)?";
      case "**":
        return ".*";
      case "*":
        return "[^/]*";
      case "?":
        return "[^/]";
      default:
        // Regex metacharacter that must match itself.
        return `\\${token}`;
    }
  });
  return new RegExp(`^${source}$`).test(value);
}
|
|
104
|
+
/**
 * Named selection of test cases, filtered by tag and file-path matchers.
 *
 * Each matcher is a string or a RegExp. Tag strings are compared exactly
 * (`matchesAny`); path strings are treated as glob patterns (`matchesAnyPath`).
 */
var Dataset = class _Dataset {
  /**
   * Stores `config` as-is, without validation; `define` is the validating factory.
   *
   * @param config - Object with `name`, `displayName`, `includedTags`,
   *   `excludedTags`, `includedPaths` and `excludedPaths`.
   */
  constructor(config) {
    this._config = config;
  }
  /**
   * Builds a dataset from user configuration.
   *
   * `name` is checked with `validateDatasetName`, `displayName` is normalized
   * with `normalizeOptionalDisplayName`, and every omitted matcher list
   * defaults to an empty array.
   *
   * @param config - `{ name, displayName?, includedTags?, excludedTags?, includedPaths?, excludedPaths? }`.
   * @returns A new dataset instance.
   */
  static define(config) {
    const name = validateDatasetName(config.name, "Dataset.define");
    const displayName = normalizeOptionalDisplayName(config.displayName);
    return new _Dataset({
      name,
      displayName,
      // Copy each list so later mutation of the caller's arrays cannot change
      // which test cases this dataset selects.
      includedTags: [...(config.includedTags ?? [])],
      excludedTags: [...(config.excludedTags ?? [])],
      includedPaths: [...(config.includedPaths ?? [])],
      excludedPaths: [...(config.excludedPaths ?? [])]
    });
  }
  /** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
  getName() {
    return this._config.name;
  }
  /** Optional human-readable name; `undefined` when none was configured. */
  getDisplayName() {
    return this._config.displayName;
  }
  /** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
  getDisplayLabel() {
    return this._config.displayName ?? this._config.name;
  }
  /** Copy of the tag matchers a test case must satisfy (empty = no tag restriction). */
  getIncludedTags() {
    return [...this._config.includedTags];
  }
  /** Copy of the tag matchers that reject a test case. */
  getExcludedTags() {
    return [...this._config.excludedTags];
  }
  /** Copy of the path matchers a test case file must satisfy (empty = no path restriction). */
  getIncludedPaths() {
    return [...this._config.includedPaths];
  }
  /** Copy of the path matchers that reject a test case file. */
  getExcludedPaths() {
    return [...this._config.excludedPaths];
  }
  /**
   * Decides whether a test case belongs to this dataset.
   *
   * Exclusions win: a test case is rejected if any of its tags matches an
   * excluded tag or its file path matches an excluded path. Otherwise it must
   * satisfy the included tags AND the included paths, where an empty include
   * list accepts everything.
   *
   * @param testCase - Object exposing `getTags()`.
   * @param {string} filePath - Path of the file the test case came from.
   * @returns {boolean} `true` when the test case is part of the dataset.
   */
  matchesTestCase(testCase, filePath) {
    const { includedTags, excludedTags, includedPaths, excludedPaths } = this._config;
    const tags = testCase.getTags();
    if (excludedTags.length > 0 && tags.some((tag) => matchesAny(tag, excludedTags))) {
      return false;
    }
    if (excludedPaths.length > 0 && matchesAnyPath(filePath, excludedPaths)) {
      return false;
    }
    const tagMatch = includedTags.length === 0 || tags.some((tag) => matchesAny(tag, includedTags));
    const pathMatch = includedPaths.length === 0 || matchesAnyPath(filePath, includedPaths);
    return tagMatch && pathMatch;
  }
};
|
|
160
|
+
/**
 * Resolves the label to show for a dataset-like object.
 *
 * Duck-typed on purpose: uses `getDisplayLabel()` when the object has it,
 * falls back to `getName()`, and otherwise yields an empty string. A `null`
 * or `undefined` dataset also yields an empty string instead of throwing.
 *
 * @param dataset - Object that may expose `getDisplayLabel()` and/or `getName()`.
 * @returns {string} The resolved label, or `""` when none is available.
 */
function getDatasetDisplayLabel(dataset) {
  if (typeof dataset?.getDisplayLabel === "function") {
    return dataset.getDisplayLabel();
  }
  return typeof dataset?.getName === "function" ? dataset.getName() : "";
}
|
|
166
|
+
|
|
82
167
|
// src/evals/evaluator.ts
|
|
83
168
|
var Evaluator = class _Evaluator {
|
|
84
169
|
constructor(config) {
|
|
@@ -438,7 +523,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
438
523
|
const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
|
|
439
524
|
return {
|
|
440
525
|
id: item.id,
|
|
441
|
-
name: item.dataset
|
|
526
|
+
name: getDatasetDisplayLabel(item.dataset),
|
|
442
527
|
overview: `Discovered from ${item.filePath}`,
|
|
443
528
|
runs
|
|
444
529
|
};
|
|
@@ -491,70 +576,6 @@ function parseStartupArgs(argv) {
|
|
|
491
576
|
}
|
|
492
577
|
return args;
|
|
493
578
|
}
|
|
494
|
-
|
|
495
|
-
// src/evals/dataset.ts
|
|
496
|
-
function matchesAny(value, matchers) {
|
|
497
|
-
return matchers.some(
|
|
498
|
-
(matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
|
|
499
|
-
);
|
|
500
|
-
}
|
|
501
|
-
function matchesAnyPath(filePath, matchers) {
|
|
502
|
-
return matchers.some((matcher) => {
|
|
503
|
-
if (typeof matcher === "string") {
|
|
504
|
-
return simpleGlobMatch(matcher, filePath);
|
|
505
|
-
}
|
|
506
|
-
return matcher.test(filePath);
|
|
507
|
-
});
|
|
508
|
-
}
|
|
509
|
-
function simpleGlobMatch(pattern, value) {
|
|
510
|
-
const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
|
|
511
|
-
return new RegExp(`^${escaped}$`).test(value);
|
|
512
|
-
}
|
|
513
|
-
var Dataset = class _Dataset {
|
|
514
|
-
constructor(config) {
|
|
515
|
-
this._config = config;
|
|
516
|
-
}
|
|
517
|
-
static define(config) {
|
|
518
|
-
return new _Dataset({
|
|
519
|
-
name: config.name,
|
|
520
|
-
includedTags: config.includedTags ?? [],
|
|
521
|
-
excludedTags: config.excludedTags ?? [],
|
|
522
|
-
includedPaths: config.includedPaths ?? [],
|
|
523
|
-
excludedPaths: config.excludedPaths ?? []
|
|
524
|
-
});
|
|
525
|
-
}
|
|
526
|
-
getName() {
|
|
527
|
-
return this._config.name;
|
|
528
|
-
}
|
|
529
|
-
getIncludedTags() {
|
|
530
|
-
return this._config.includedTags;
|
|
531
|
-
}
|
|
532
|
-
getExcludedTags() {
|
|
533
|
-
return this._config.excludedTags;
|
|
534
|
-
}
|
|
535
|
-
getIncludedPaths() {
|
|
536
|
-
return this._config.includedPaths;
|
|
537
|
-
}
|
|
538
|
-
getExcludedPaths() {
|
|
539
|
-
return this._config.excludedPaths;
|
|
540
|
-
}
|
|
541
|
-
matchesTestCase(testCase, filePath) {
|
|
542
|
-
const tags = testCase.getTags();
|
|
543
|
-
if (this._config.excludedTags.length > 0) {
|
|
544
|
-
if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
|
|
545
|
-
return false;
|
|
546
|
-
}
|
|
547
|
-
}
|
|
548
|
-
if (this._config.excludedPaths.length > 0) {
|
|
549
|
-
if (matchesAnyPath(filePath, this._config.excludedPaths)) {
|
|
550
|
-
return false;
|
|
551
|
-
}
|
|
552
|
-
}
|
|
553
|
-
const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
|
|
554
|
-
const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
|
|
555
|
-
return tagMatch && pathMatch;
|
|
556
|
-
}
|
|
557
|
-
};
|
|
558
579
|
function preprocessForDiff(value, options) {
|
|
559
580
|
if (options?.sort && Array.isArray(value)) {
|
|
560
581
|
return [...value].sort((a, b) => {
|
|
@@ -820,7 +841,7 @@ var RunConfig = class _RunConfig {
|
|
|
820
841
|
getDisplayLabel() {
|
|
821
842
|
return this._displayName ?? this._name;
|
|
822
843
|
}
|
|
823
|
-
/** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
|
|
844
|
+
/** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
|
|
824
845
|
getTags() {
|
|
825
846
|
return [...this._tags];
|
|
826
847
|
}
|
|
@@ -993,10 +1014,11 @@ var TestCase = class _TestCase {
|
|
|
993
1014
|
static describe(config) {
|
|
994
1015
|
const name = validateTestCaseName(config.name, "TestCase.describe");
|
|
995
1016
|
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
1017
|
+
const tags = config.tags !== void 0 ? [...config.tags] : [];
|
|
996
1018
|
return new _TestCase({
|
|
997
1019
|
name,
|
|
998
1020
|
displayName,
|
|
999
|
-
tags
|
|
1021
|
+
tags,
|
|
1000
1022
|
inputSchema: config.inputSchema,
|
|
1001
1023
|
input: config.input,
|
|
1002
1024
|
outputSchema: config.outputSchema,
|
|
@@ -1013,7 +1035,7 @@ var TestCase = class _TestCase {
|
|
|
1013
1035
|
return this._config.displayName ?? this._config.name;
|
|
1014
1036
|
}
|
|
1015
1037
|
getTags() {
|
|
1016
|
-
return this._config.tags;
|
|
1038
|
+
return [...this._config.tags];
|
|
1017
1039
|
}
|
|
1018
1040
|
getInputSchema() {
|
|
1019
1041
|
return this._config.inputSchema;
|
|
@@ -1570,15 +1592,16 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1570
1592
|
meta: {
|
|
1571
1593
|
triggerId: task.triggerId,
|
|
1572
1594
|
runId: evaluatorRunId,
|
|
1573
|
-
|
|
1595
|
+
datasetName: task.dataset.getDisplayLabel(),
|
|
1574
1596
|
repetitionId,
|
|
1575
1597
|
repetitionIndex,
|
|
1576
1598
|
repetitionCount,
|
|
1577
|
-
runConfigName: task.runConfigName
|
|
1599
|
+
runConfigName: task.runConfigName,
|
|
1600
|
+
...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
|
|
1601
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1602
|
+
runConfigTags: task.runConfigTags,
|
|
1603
|
+
evaluatorTags: getEvaluatorTagList(evaluator)
|
|
1578
1604
|
},
|
|
1579
|
-
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1580
|
-
runConfigTags: task.runConfigTags,
|
|
1581
|
-
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1582
1605
|
logDiff,
|
|
1583
1606
|
log,
|
|
1584
1607
|
createError
|
|
@@ -1985,7 +2008,7 @@ var EffectRunner = class {
|
|
|
1985
2008
|
);
|
|
1986
2009
|
if (!dsCollected) {
|
|
1987
2010
|
throw new Error(
|
|
1988
|
-
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.
|
|
2011
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1989
2012
|
);
|
|
1990
2013
|
}
|
|
1991
2014
|
let evaluatorIds;
|
|
@@ -2057,7 +2080,8 @@ var EffectRunner = class {
|
|
|
2057
2080
|
globalEvaluationSemaphore: sem,
|
|
2058
2081
|
runConfigName: job.runConfigName,
|
|
2059
2082
|
runConfigTags: job.runConfigTags,
|
|
2060
|
-
repetitions: job.repetitions
|
|
2083
|
+
repetitions: job.repetitions,
|
|
2084
|
+
experimentName: request.experimentName
|
|
2061
2085
|
})
|
|
2062
2086
|
);
|
|
2063
2087
|
}
|
|
@@ -2092,7 +2116,8 @@ var EffectRunner = class {
|
|
|
2092
2116
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2093
2117
|
repetitions: request.repetitions,
|
|
2094
2118
|
runConfigName,
|
|
2095
|
-
runConfigTags: request.runConfigTags
|
|
2119
|
+
runConfigTags: request.runConfigTags,
|
|
2120
|
+
experimentName: request.experimentName
|
|
2096
2121
|
});
|
|
2097
2122
|
}
|
|
2098
2123
|
async startDatasetRun(params) {
|
|
@@ -2120,7 +2145,7 @@ var EffectRunner = class {
|
|
|
2120
2145
|
const snapshot = {
|
|
2121
2146
|
runId,
|
|
2122
2147
|
datasetId: params.datasetId,
|
|
2123
|
-
datasetName: dataset.dataset.
|
|
2148
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
2124
2149
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2125
2150
|
queuedAt: Date.now(),
|
|
2126
2151
|
totalTestCases: totalEvaluations,
|
|
@@ -2141,7 +2166,7 @@ var EffectRunner = class {
|
|
|
2141
2166
|
type: "RunQueued",
|
|
2142
2167
|
runId,
|
|
2143
2168
|
datasetId: params.datasetId,
|
|
2144
|
-
datasetName: dataset.dataset.
|
|
2169
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
2145
2170
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2146
2171
|
totalTestCases: totalEvaluations,
|
|
2147
2172
|
artifactPath
|
|
@@ -2167,7 +2192,8 @@ var EffectRunner = class {
|
|
|
2167
2192
|
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2168
2193
|
runConfigName: params.runConfigName,
|
|
2169
2194
|
runConfigTags,
|
|
2170
|
-
repetitions
|
|
2195
|
+
repetitions,
|
|
2196
|
+
experimentName: params.experimentName
|
|
2171
2197
|
})
|
|
2172
2198
|
);
|
|
2173
2199
|
return snapshot;
|
|
@@ -2249,6 +2275,7 @@ Object.defineProperty(exports, 'S', {
|
|
|
2249
2275
|
get: function () { return effect.Schema; }
|
|
2250
2276
|
});
|
|
2251
2277
|
exports.Dataset = Dataset;
|
|
2278
|
+
exports.DatasetNameSchema = DatasetNameSchema;
|
|
2252
2279
|
exports.Evaluator = Evaluator;
|
|
2253
2280
|
exports.EvaluatorNameSchema = EvaluatorNameSchema;
|
|
2254
2281
|
exports.Metric = Metric;
|
|
@@ -2266,6 +2293,7 @@ exports.defaultRunnerConfig = defaultRunnerConfig;
|
|
|
2266
2293
|
exports.defineConfig = defineConfig;
|
|
2267
2294
|
exports.deltaScore = deltaScore;
|
|
2268
2295
|
exports.formatScoreData = formatScoreData;
|
|
2296
|
+
exports.getDatasetDisplayLabel = getDatasetDisplayLabel;
|
|
2269
2297
|
exports.getEvaluatorDisplayLabel = getEvaluatorDisplayLabel;
|
|
2270
2298
|
exports.getEvaluatorTagList = getEvaluatorTagList;
|
|
2271
2299
|
exports.getLogLines = getLogLines;
|
|
@@ -2281,6 +2309,7 @@ exports.parseStartupArgs = parseStartupArgs;
|
|
|
2281
2309
|
exports.percentScore = percentScore;
|
|
2282
2310
|
exports.printJsonDiff = printJsonDiff;
|
|
2283
2311
|
exports.tokenCountMetric = tokenCountMetric;
|
|
2312
|
+
exports.validateDatasetName = validateDatasetName;
|
|
2284
2313
|
exports.validateEvaluatorName = validateEvaluatorName;
|
|
2285
2314
|
exports.validateRunConfigName = validateRunConfigName;
|
|
2286
2315
|
exports.validateTestCaseName = validateTestCaseName;
|