@m4trix/evals 0.26.0 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -3
- package/dist/cli-simple.cjs +17 -8
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +17 -8
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +14 -5
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +14 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +93 -69
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +30 -10
- package/dist/index.js +91 -70
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -51,6 +51,7 @@ function makeEntityIdSchema(brand, label) {
|
|
|
51
51
|
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
52
52
|
var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
53
53
|
var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
|
|
54
|
+
var DatasetNameSchema = makeEntityIdSchema("DatasetName", "Dataset name");
|
|
54
55
|
function validateWithSchema(schema, raw, context) {
|
|
55
56
|
const trimmed = raw.trim();
|
|
56
57
|
const decode = effect.Schema.decodeUnknownEither(
|
|
@@ -71,6 +72,9 @@ function validateEvaluatorName(raw, context) {
|
|
|
71
72
|
/**
 * Validates a raw test case name by delegating to `validateWithSchema`
 * with `TestCaseNameSchema`.
 *
 * @param raw - Unvalidated name as supplied by the caller.
 * @param context - Forwarded unchanged to `validateWithSchema`
 *   (presumably used to label validation errors; confirm against that helper).
 * @returns Whatever `validateWithSchema` returns for the given input.
 */
function validateTestCaseName(raw, context) {
  const checkedName = validateWithSchema(TestCaseNameSchema, raw, context);
  return checkedName;
}
|
|
75
|
+
/**
 * Validates a raw dataset name by delegating to `validateWithSchema`
 * with `DatasetNameSchema`.
 *
 * @param raw - Unvalidated name as supplied by the caller.
 * @param context - Forwarded unchanged to `validateWithSchema`
 *   (presumably used to label validation errors; confirm against that helper).
 * @returns Whatever `validateWithSchema` returns for the given input.
 */
function validateDatasetName(raw, context) {
  const checkedName = validateWithSchema(DatasetNameSchema, raw, context);
  return checkedName;
}
|
|
74
78
|
function normalizeOptionalDisplayName(raw) {
|
|
75
79
|
if (raw === void 0) {
|
|
76
80
|
return void 0;
|
|
@@ -79,6 +83,87 @@ function normalizeOptionalDisplayName(raw) {
|
|
|
79
83
|
return t.length === 0 ? void 0 : t;
|
|
80
84
|
}
|
|
81
85
|
|
|
86
|
+
// src/evals/dataset.ts
|
|
87
|
+
/**
 * Reports whether `value` is accepted by at least one matcher.
 *
 * String matchers must equal `value` exactly. Any other matcher is expected to
 * be a RegExp (or an object exposing a compatible `test` method).
 *
 * @param value - Candidate string to check.
 * @param matchers - List of exact strings and/or regular expressions.
 * @returns `true` if any matcher accepts `value`; `false` for an empty list.
 */
function matchesAny(value, matchers) {
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return value === matcher;
    }
    // Global/sticky regexes resume from `lastIndex`; without a reset, a matcher
    // reused across calls gives alternating answers for the same input.
    if (matcher.global || matcher.sticky) {
      matcher.lastIndex = 0;
    }
    return matcher.test(value);
  });
}
|
|
92
|
+
/**
 * Reports whether `filePath` is accepted by at least one path matcher.
 *
 * String matchers are treated as glob patterns and evaluated with
 * `simpleGlobMatch`. Any other matcher is expected to be a RegExp (or an
 * object exposing a compatible `test` method).
 *
 * @param filePath - Path string to check.
 * @param matchers - List of glob strings and/or regular expressions.
 * @returns `true` if any matcher accepts `filePath`; `false` for an empty list.
 */
function matchesAnyPath(filePath, matchers) {
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return simpleGlobMatch(matcher, filePath);
    }
    // Global/sticky regexes resume from `lastIndex`; without a reset, a matcher
    // reused across calls gives alternating answers for the same path.
    if (matcher.global || matcher.sticky) {
      matcher.lastIndex = 0;
    }
    return matcher.test(filePath);
  });
}
|
|
100
|
+
/**
 * Tests `value` against a minimal glob `pattern`.
 *
 * Supported wildcards:
 * - `**` followed by `/` matches zero or more leading directory segments
 * - `**` matches any run of characters, including `/`
 * - `*` matches any run of characters except `/`
 * - `?` matches exactly one character other than `/`
 *
 * Every other regex metacharacter in the pattern is matched literally, and the
 * whole of `value` must match (the generated expression is anchored).
 *
 * @param pattern - Glob pattern.
 * @param value - String to test, typically a file path.
 * @returns `true` when `value` matches `pattern` in full.
 */
function simpleGlobMatch(pattern, value) {
  // Translate in a single pass. Chained replaces would rewrite the `*` inside
  // the `.*` fragments emitted for `**`, stopping `**` from crossing `/`.
  const source = pattern.replace(/\*\*\/|\*\*|\*|\?|[.+^${}()|[\]\\]/g, (token) => {
    switch (token) {
      case "**/":
        return "(?:.*/)?";
      case "**":
        return ".*";
      case "*":
        return "[^/]*";
      case "?":
        return "[^/]";
      default:
        return `\\${token}`;
    }
  });
  return new RegExp(`^${source}$`).test(value);
}
|
|
104
|
+
/**
 * A named selection of test cases, described by tag and path include/exclude
 * matcher lists.
 */
var Dataset = class _Dataset {
  /**
   * Stores the given config object as-is; `define` is the factory that
   * validates and normalizes user input before calling this.
   */
  constructor(config) {
    this._config = config;
  }

  /**
   * Builds a dataset from user config: validates `name`, normalizes the
   * optional `displayName`, and defaults each absent matcher list to `[]`.
   */
  static define(config) {
    const orEmpty = (list) => list ?? [];
    const name = validateDatasetName(config.name, "Dataset.define");
    const displayName = normalizeOptionalDisplayName(config.displayName);
    const includedTags = orEmpty(config.includedTags);
    const excludedTags = orEmpty(config.excludedTags);
    const includedPaths = orEmpty(config.includedPaths);
    const excludedPaths = orEmpty(config.excludedPaths);
    return new _Dataset({
      name,
      displayName,
      includedTags,
      excludedTags,
      includedPaths,
      excludedPaths
    });
  }

  /** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
  getName() {
    const { name } = this._config;
    return name;
  }

  /** Optional display name as stored in the config; may be `undefined`. */
  getDisplayName() {
    const { displayName } = this._config;
    return displayName;
  }

  /** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
  getDisplayLabel() {
    const { displayName, name } = this._config;
    return displayName ?? name;
  }

  /** Matchers a test case tag must satisfy to be included (empty = no tag restriction). */
  getIncludedTags() {
    const { includedTags } = this._config;
    return includedTags;
  }

  /** Matchers that reject a test case when any of its tags satisfies one. */
  getExcludedTags() {
    const { excludedTags } = this._config;
    return excludedTags;
  }

  /** Matchers a file path must satisfy to be included (empty = no path restriction). */
  getIncludedPaths() {
    const { includedPaths } = this._config;
    return includedPaths;
  }

  /** Matchers that reject a test case when its file path satisfies one. */
  getExcludedPaths() {
    const { excludedPaths } = this._config;
    return excludedPaths;
  }

  /**
   * Decides whether a test case belongs to this dataset.
   *
   * Exclusions are checked first (tags, then path) and reject immediately.
   * Otherwise the test case must satisfy both the tag and the path inclusion
   * lists, where an empty inclusion list accepts everything.
   *
   * @param testCase - Object exposing `getTags()`.
   * @param filePath - Path the test case is associated with.
   * @returns `true` when the test case is selected by this dataset.
   */
  matchesTestCase(testCase, filePath) {
    const { includedTags, excludedTags, includedPaths, excludedPaths } = this._config;
    const tags = testCase.getTags();

    if (excludedTags.length > 0 && tags.some((tag) => matchesAny(tag, excludedTags))) {
      return false;
    }
    if (excludedPaths.length > 0 && matchesAnyPath(filePath, excludedPaths)) {
      return false;
    }

    // Both inclusion checks are always evaluated, even when the first fails.
    const tagsAccepted = includedTags.length === 0 ? true : tags.some((tag) => matchesAny(tag, includedTags));
    const pathAccepted = includedPaths.length === 0 ? true : matchesAnyPath(filePath, includedPaths);
    return tagsAccepted && pathAccepted;
  }
};
|
|
160
|
+
/**
 * Resolves a human-readable label for a dataset-like object.
 *
 * Prefers `getDisplayLabel()` when the object has that method, falls back to
 * `getName()`, and returns an empty string when neither method exists.
 *
 * @param dataset - Object that may expose `getDisplayLabel` and/or `getName`.
 * @returns The label produced by the first available method, or `""`.
 */
function getDatasetDisplayLabel(dataset) {
  const hasMethod = (key) => typeof dataset[key] === "function";
  if (hasMethod("getDisplayLabel")) {
    return dataset.getDisplayLabel();
  }
  if (hasMethod("getName")) {
    return dataset.getName();
  }
  return "";
}
|
|
166
|
+
|
|
82
167
|
// src/evals/evaluator.ts
|
|
83
168
|
var Evaluator = class _Evaluator {
|
|
84
169
|
constructor(config) {
|
|
@@ -438,7 +523,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
438
523
|
const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
|
|
439
524
|
return {
|
|
440
525
|
id: item.id,
|
|
441
|
-
name: item.dataset
|
|
526
|
+
name: getDatasetDisplayLabel(item.dataset),
|
|
442
527
|
overview: `Discovered from ${item.filePath}`,
|
|
443
528
|
runs
|
|
444
529
|
};
|
|
@@ -491,70 +576,6 @@ function parseStartupArgs(argv) {
|
|
|
491
576
|
}
|
|
492
577
|
return args;
|
|
493
578
|
}
|
|
494
|
-
|
|
495
|
-
// src/evals/dataset.ts
|
|
496
|
-
function matchesAny(value, matchers) {
|
|
497
|
-
return matchers.some(
|
|
498
|
-
(matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
|
|
499
|
-
);
|
|
500
|
-
}
|
|
501
|
-
function matchesAnyPath(filePath, matchers) {
|
|
502
|
-
return matchers.some((matcher) => {
|
|
503
|
-
if (typeof matcher === "string") {
|
|
504
|
-
return simpleGlobMatch(matcher, filePath);
|
|
505
|
-
}
|
|
506
|
-
return matcher.test(filePath);
|
|
507
|
-
});
|
|
508
|
-
}
|
|
509
|
-
function simpleGlobMatch(pattern, value) {
|
|
510
|
-
const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
|
|
511
|
-
return new RegExp(`^${escaped}$`).test(value);
|
|
512
|
-
}
|
|
513
|
-
var Dataset = class _Dataset {
|
|
514
|
-
constructor(config) {
|
|
515
|
-
this._config = config;
|
|
516
|
-
}
|
|
517
|
-
static define(config) {
|
|
518
|
-
return new _Dataset({
|
|
519
|
-
name: config.name,
|
|
520
|
-
includedTags: config.includedTags ?? [],
|
|
521
|
-
excludedTags: config.excludedTags ?? [],
|
|
522
|
-
includedPaths: config.includedPaths ?? [],
|
|
523
|
-
excludedPaths: config.excludedPaths ?? []
|
|
524
|
-
});
|
|
525
|
-
}
|
|
526
|
-
getName() {
|
|
527
|
-
return this._config.name;
|
|
528
|
-
}
|
|
529
|
-
getIncludedTags() {
|
|
530
|
-
return this._config.includedTags;
|
|
531
|
-
}
|
|
532
|
-
getExcludedTags() {
|
|
533
|
-
return this._config.excludedTags;
|
|
534
|
-
}
|
|
535
|
-
getIncludedPaths() {
|
|
536
|
-
return this._config.includedPaths;
|
|
537
|
-
}
|
|
538
|
-
getExcludedPaths() {
|
|
539
|
-
return this._config.excludedPaths;
|
|
540
|
-
}
|
|
541
|
-
matchesTestCase(testCase, filePath) {
|
|
542
|
-
const tags = testCase.getTags();
|
|
543
|
-
if (this._config.excludedTags.length > 0) {
|
|
544
|
-
if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
|
|
545
|
-
return false;
|
|
546
|
-
}
|
|
547
|
-
}
|
|
548
|
-
if (this._config.excludedPaths.length > 0) {
|
|
549
|
-
if (matchesAnyPath(filePath, this._config.excludedPaths)) {
|
|
550
|
-
return false;
|
|
551
|
-
}
|
|
552
|
-
}
|
|
553
|
-
const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
|
|
554
|
-
const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
|
|
555
|
-
return tagMatch && pathMatch;
|
|
556
|
-
}
|
|
557
|
-
};
|
|
558
579
|
function preprocessForDiff(value, options) {
|
|
559
580
|
if (options?.sort && Array.isArray(value)) {
|
|
560
581
|
return [...value].sort((a, b) => {
|
|
@@ -1570,7 +1591,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1570
1591
|
meta: {
|
|
1571
1592
|
triggerId: task.triggerId,
|
|
1572
1593
|
runId: evaluatorRunId,
|
|
1573
|
-
|
|
1594
|
+
datasetName: task.dataset.getDisplayLabel(),
|
|
1574
1595
|
repetitionId,
|
|
1575
1596
|
repetitionIndex,
|
|
1576
1597
|
repetitionCount,
|
|
@@ -1985,7 +2006,7 @@ var EffectRunner = class {
|
|
|
1985
2006
|
);
|
|
1986
2007
|
if (!dsCollected) {
|
|
1987
2008
|
throw new Error(
|
|
1988
|
-
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.
|
|
2009
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1989
2010
|
);
|
|
1990
2011
|
}
|
|
1991
2012
|
let evaluatorIds;
|
|
@@ -2120,7 +2141,7 @@ var EffectRunner = class {
|
|
|
2120
2141
|
const snapshot = {
|
|
2121
2142
|
runId,
|
|
2122
2143
|
datasetId: params.datasetId,
|
|
2123
|
-
datasetName: dataset.dataset.
|
|
2144
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
2124
2145
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2125
2146
|
queuedAt: Date.now(),
|
|
2126
2147
|
totalTestCases: totalEvaluations,
|
|
@@ -2141,7 +2162,7 @@ var EffectRunner = class {
|
|
|
2141
2162
|
type: "RunQueued",
|
|
2142
2163
|
runId,
|
|
2143
2164
|
datasetId: params.datasetId,
|
|
2144
|
-
datasetName: dataset.dataset.
|
|
2165
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
2145
2166
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2146
2167
|
totalTestCases: totalEvaluations,
|
|
2147
2168
|
artifactPath
|
|
@@ -2249,6 +2270,7 @@ Object.defineProperty(exports, 'S', {
|
|
|
2249
2270
|
get: function () { return effect.Schema; }
|
|
2250
2271
|
});
|
|
2251
2272
|
exports.Dataset = Dataset;
|
|
2273
|
+
exports.DatasetNameSchema = DatasetNameSchema;
|
|
2252
2274
|
exports.Evaluator = Evaluator;
|
|
2253
2275
|
exports.EvaluatorNameSchema = EvaluatorNameSchema;
|
|
2254
2276
|
exports.Metric = Metric;
|
|
@@ -2266,6 +2288,7 @@ exports.defaultRunnerConfig = defaultRunnerConfig;
|
|
|
2266
2288
|
exports.defineConfig = defineConfig;
|
|
2267
2289
|
exports.deltaScore = deltaScore;
|
|
2268
2290
|
exports.formatScoreData = formatScoreData;
|
|
2291
|
+
exports.getDatasetDisplayLabel = getDatasetDisplayLabel;
|
|
2269
2292
|
exports.getEvaluatorDisplayLabel = getEvaluatorDisplayLabel;
|
|
2270
2293
|
exports.getEvaluatorTagList = getEvaluatorTagList;
|
|
2271
2294
|
exports.getLogLines = getLogLines;
|
|
@@ -2281,6 +2304,7 @@ exports.parseStartupArgs = parseStartupArgs;
|
|
|
2281
2304
|
exports.percentScore = percentScore;
|
|
2282
2305
|
exports.printJsonDiff = printJsonDiff;
|
|
2283
2306
|
exports.tokenCountMetric = tokenCountMetric;
|
|
2307
|
+
exports.validateDatasetName = validateDatasetName;
|
|
2284
2308
|
exports.validateEvaluatorName = validateEvaluatorName;
|
|
2285
2309
|
exports.validateRunConfigName = validateRunConfigName;
|
|
2286
2310
|
exports.validateTestCaseName = validateTestCaseName;
|