@m4trix/evals 0.26.0 → 0.28.0
This diff shows the changes between two publicly released versions of this package, as they appear in the public registry they were published to. It is provided for informational purposes only.
- package/README.md +8 -4
- package/dist/cli-simple.cjs +53 -23
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +53 -23
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +25 -12
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +25 -12
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +108 -79
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +65 -24
- package/dist/index.js +106 -80
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -123,21 +123,21 @@ declare function defineConfig<TConfig extends ConfigType>(factory: M4trixEvalCon
|
|
|
123
123
|
declare const defaultRunnerConfig: RunnerConfig;
|
|
124
124
|
declare function withRunnerConfig(overrides?: RunnerConfigOverrides): RunnerConfig;
|
|
125
125
|
|
|
126
|
-
/** Matches a tag by exact string equality or regex test */
|
|
127
|
-
type TagMatcher = string | RegExp;
|
|
128
|
-
/** Matches a file path by glob string or regex test */
|
|
129
|
-
type PathMatcher = string | RegExp;
|
|
130
|
-
|
|
131
126
|
type InputOrBuilder<T> = T | (() => T);
|
|
132
127
|
interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
|
|
133
128
|
/**
|
|
134
|
-
* Stable id (letters, digits, `_`, `-`).
|
|
129
|
+
* Stable id (letters, digits, `_`, `-`); used in discovery and matching.
|
|
135
130
|
* For an unrestricted UI label, set {@link displayName}.
|
|
136
131
|
*/
|
|
137
132
|
name: string;
|
|
138
|
-
/** Optional human-readable label for CLI/TUI (any characters). */
|
|
133
|
+
/** Optional human-readable label for CLI/TUI and evaluator args (any characters). */
|
|
139
134
|
displayName?: string;
|
|
140
|
-
|
|
135
|
+
/**
|
|
136
|
+
* Declared tags on this test case (not `Dataset` filter options). Use `Dataset` `includedTags` /
|
|
137
|
+
* `excludedTags` to select which cases belong to a dataset; evaluators read the resolved tags as
|
|
138
|
+
* `meta.testCaseTags`.
|
|
139
|
+
*/
|
|
140
|
+
tags?: ReadonlyArray<string>;
|
|
141
141
|
inputSchema: TI;
|
|
142
142
|
input: InputOrBuilder<Schema.Schema.Type<TI>>;
|
|
143
143
|
outputSchema?: TO;
|
|
@@ -161,13 +161,24 @@ declare function getTestCaseDisplayLabel(testCase: {
|
|
|
161
161
|
getDisplayLabel?: () => string;
|
|
162
162
|
getName?: () => string;
|
|
163
163
|
}): string;
|
|
164
|
-
/** Tags for evaluator `testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
|
|
164
|
+
/** Tags for evaluator `meta.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
|
|
165
165
|
declare function getTestCaseTagList(testCase: {
|
|
166
166
|
getTags?: () => ReadonlyArray<string>;
|
|
167
167
|
}): string[];
|
|
168
168
|
|
|
169
|
+
/** Matches a tag by exact string equality or regex test */
|
|
170
|
+
type TagMatcher = string | RegExp;
|
|
171
|
+
/** Matches a file path by glob string or regex test */
|
|
172
|
+
type PathMatcher = string | RegExp;
|
|
173
|
+
|
|
169
174
|
interface DatasetDefineConfig {
|
|
175
|
+
/**
|
|
176
|
+
* Stable id (letters, digits, `_`, `-`); used for discovery ids and `resolveDatasetByName`.
|
|
177
|
+
* For an unrestricted UI label, set {@link displayName}.
|
|
178
|
+
*/
|
|
170
179
|
name: string;
|
|
180
|
+
/** Optional human-readable label for CLI/TUI (any characters). */
|
|
181
|
+
displayName?: string;
|
|
171
182
|
includedTags?: TagMatcher[];
|
|
172
183
|
excludedTags?: TagMatcher[];
|
|
173
184
|
includedPaths?: PathMatcher[];
|
|
@@ -177,13 +188,22 @@ declare class Dataset {
|
|
|
177
188
|
private readonly _config;
|
|
178
189
|
private constructor();
|
|
179
190
|
static define(config: DatasetDefineConfig): Dataset;
|
|
191
|
+
/** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
|
|
180
192
|
getName(): string;
|
|
193
|
+
getDisplayName(): string | undefined;
|
|
194
|
+
/** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
|
|
195
|
+
getDisplayLabel(): string;
|
|
181
196
|
getIncludedTags(): ReadonlyArray<TagMatcher>;
|
|
182
197
|
getExcludedTags(): ReadonlyArray<TagMatcher>;
|
|
183
198
|
getIncludedPaths(): ReadonlyArray<PathMatcher>;
|
|
184
199
|
getExcludedPaths(): ReadonlyArray<PathMatcher>;
|
|
185
200
|
matchesTestCase(testCase: TestCase<unknown>, filePath: string): boolean;
|
|
186
201
|
}
|
|
202
|
+
/** CLI / runner: display label for a dataset-shaped object (supports discovery duck-types). */
|
|
203
|
+
declare function getDatasetDisplayLabel(dataset: {
|
|
204
|
+
getDisplayLabel?: () => string;
|
|
205
|
+
getName?: () => string;
|
|
206
|
+
}): string;
|
|
187
207
|
|
|
188
208
|
/**
|
|
189
209
|
* Options for customizing JSON diff output. Passed to logDiff, createDiffLogEntry, and printJsonDiff.
|
|
@@ -259,10 +279,14 @@ interface EvaluateMeta {
|
|
|
259
279
|
* for this specific test-case run.
|
|
260
280
|
*/
|
|
261
281
|
runId: string;
|
|
262
|
-
/**
|
|
263
|
-
|
|
282
|
+
/** Display label for the dataset (`Dataset.getDisplayLabel()`, i.e. `displayName ?? name`). */
|
|
283
|
+
datasetName: string;
|
|
264
284
|
/** Canonical `RunConfig` name (or `programmatic` for API/TUI-only runs). */
|
|
265
285
|
runConfigName: string;
|
|
286
|
+
/**
|
|
287
|
+
* Optional label for this invocation (e.g. CLI `--experiment`); omitted when not set.
|
|
288
|
+
*/
|
|
289
|
+
experimentName?: string;
|
|
266
290
|
/**
|
|
267
291
|
* Stable id shared by every execution of the same logical test case when `repetitionCount > 1`
|
|
268
292
|
* (and present with count 1 for consistency).
|
|
@@ -272,6 +296,15 @@ interface EvaluateMeta {
|
|
|
272
296
|
repetitionIndex: number;
|
|
273
297
|
/** Total scheduled executions for this logical test case in the current run. */
|
|
274
298
|
repetitionCount: number;
|
|
299
|
+
/** Declared tags on the current test case (`TestCase.describe({ tags })`); empty when none. */
|
|
300
|
+
testCaseTags: string[];
|
|
301
|
+
/**
|
|
302
|
+
* Declared tags on the current run config or programmatic request (`RunConfig.define({ tags })`,
|
|
303
|
+
* `RunDatasetRequest.runConfigTags`); empty when none.
|
|
304
|
+
*/
|
|
305
|
+
runConfigTags: string[];
|
|
306
|
+
/** Declared tags on this evaluator (`Evaluator.define({ tags })`); empty when none. */
|
|
307
|
+
evaluatorTags: string[];
|
|
275
308
|
}
|
|
276
309
|
interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
|
|
277
310
|
input: TInput;
|
|
@@ -279,12 +312,6 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
|
|
|
279
312
|
output?: TOutput;
|
|
280
313
|
/** Metadata about the current evaluator invocation. */
|
|
281
314
|
meta: EvaluateMeta;
|
|
282
|
-
/** Tags from `TestCase.describe({ tags })` for the current test case. */
|
|
283
|
-
testCaseTags: string[];
|
|
284
|
-
/** Tags from `RunConfig.define({ tags })` for this job; empty for programmatic runs unless set on the request. */
|
|
285
|
-
runConfigTags: string[];
|
|
286
|
-
/** Tags from `Evaluator.define({ tags })` for this evaluator. */
|
|
287
|
-
evaluatorTags: string[];
|
|
288
315
|
/** Records a diff for this test case; stored in run artifact and shown by CLI */
|
|
289
316
|
logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
|
|
290
317
|
/** Logs a message or object for this test case; stored in run artifact and shown by CLI */
|
|
@@ -313,7 +340,10 @@ interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.
|
|
|
313
340
|
scoreSchema: TS;
|
|
314
341
|
passThreshold?: number;
|
|
315
342
|
passCriterion?: (score: unknown) => boolean;
|
|
316
|
-
/**
|
|
343
|
+
/**
|
|
344
|
+
* Declared tags for this evaluator (not dataset filter rules); echoed on every `evaluate` call as
|
|
345
|
+
* `meta.evaluatorTags`.
|
|
346
|
+
*/
|
|
317
347
|
tags?: ReadonlyArray<string>;
|
|
318
348
|
}
|
|
319
349
|
declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, TCtx = Record<string, never>> {
|
|
@@ -345,7 +375,7 @@ declare function getEvaluatorDisplayLabel(evaluator: {
|
|
|
345
375
|
getDisplayLabel?: () => string | undefined;
|
|
346
376
|
getName?: () => string | undefined;
|
|
347
377
|
}): string | undefined;
|
|
348
|
-
/** Tags for evaluator `evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
|
|
378
|
+
/** Tags for evaluator `meta.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
|
|
349
379
|
declare function getEvaluatorTagList(evaluator: {
|
|
350
380
|
getTags?: () => ReadonlyArray<string>;
|
|
351
381
|
}): string[];
|
|
@@ -384,12 +414,16 @@ declare const RunConfigNameSchema: Schema.brand<Schema.filter<Schema.filter<Sche
|
|
|
384
414
|
declare const EvaluatorNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "EvaluatorName">;
|
|
385
415
|
/** Branded id for `TestCase.describe({ name })` (decode with {@link TestCaseNameSchema}). */
|
|
386
416
|
declare const TestCaseNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "TestCaseName">;
|
|
417
|
+
/** Branded id for `Dataset.define({ name })` (decode with {@link DatasetNameSchema}). */
|
|
418
|
+
declare const DatasetNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "DatasetName">;
|
|
387
419
|
type RunConfigName = Schema.Schema.Type<typeof RunConfigNameSchema>;
|
|
388
420
|
type EvaluatorName = Schema.Schema.Type<typeof EvaluatorNameSchema>;
|
|
389
421
|
type TestCaseName = Schema.Schema.Type<typeof TestCaseNameSchema>;
|
|
422
|
+
type DatasetName = Schema.Schema.Type<typeof DatasetNameSchema>;
|
|
390
423
|
declare function validateRunConfigName(raw: string, context: string): RunConfigName;
|
|
391
424
|
declare function validateEvaluatorName(raw: string, context: string): EvaluatorName;
|
|
392
425
|
declare function validateTestCaseName(raw: string, context: string): TestCaseName;
|
|
426
|
+
declare function validateDatasetName(raw: string, context: string): DatasetName;
|
|
393
427
|
/** Optional UI label: trim; empty after trim becomes undefined. */
|
|
394
428
|
declare function normalizeOptionalDisplayName(raw: string | undefined): string | undefined;
|
|
395
429
|
|
|
@@ -422,7 +456,7 @@ interface RunConfigDefineConfig {
|
|
|
422
456
|
name: string;
|
|
423
457
|
/** Optional human-readable label for CLI/TUI (any characters). */
|
|
424
458
|
displayName?: string;
|
|
425
|
-
/** Optional tags; copied to every evaluation as `runConfigTags`. */
|
|
459
|
+
/** Optional declared tags for this run config; copied to every evaluation as `meta.runConfigTags`. */
|
|
426
460
|
tags?: ReadonlyArray<string>;
|
|
427
461
|
runs: ReadonlyArray<RunConfigRow>;
|
|
428
462
|
}
|
|
@@ -439,7 +473,7 @@ declare class RunConfig {
|
|
|
439
473
|
getDisplayName(): string | undefined;
|
|
440
474
|
/** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
|
|
441
475
|
getDisplayLabel(): string;
|
|
442
|
-
/** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
|
|
476
|
+
/** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
|
|
443
477
|
getTags(): string[];
|
|
444
478
|
getRuns(): ReadonlyArray<RunConfigRow>;
|
|
445
479
|
}
|
|
@@ -528,7 +562,7 @@ interface RunDatasetJob {
|
|
|
528
562
|
*/
|
|
529
563
|
runConfigDisplayLabel?: string;
|
|
530
564
|
/**
|
|
531
|
-
* Tags from `RunConfig.define({ tags })` for this job; forwarded as `runConfigTags` on evaluator callbacks.
|
|
565
|
+
* Tags from `RunConfig.define({ tags })` for this job; forwarded as `meta.runConfigTags` on evaluator callbacks.
|
|
532
566
|
*/
|
|
533
567
|
runConfigTags?: ReadonlyArray<string>;
|
|
534
568
|
/** Evaluates each matching test case this many times (default 1). */
|
|
@@ -565,9 +599,13 @@ interface RunDatasetRequest {
|
|
|
565
599
|
*/
|
|
566
600
|
repetitions?: number;
|
|
567
601
|
/**
|
|
568
|
-
* Optional tags for this run; forwarded as `runConfigTags` on evaluator callbacks (e.g. suite labels).
|
|
602
|
+
* Optional tags for this run; forwarded as `meta.runConfigTags` on evaluator callbacks (e.g. suite labels).
|
|
569
603
|
*/
|
|
570
604
|
runConfigTags?: ReadonlyArray<string>;
|
|
605
|
+
/**
|
|
606
|
+
* Optional label for this run; forwarded as `experimentName` on evaluator `meta`.
|
|
607
|
+
*/
|
|
608
|
+
experimentName?: string;
|
|
571
609
|
}
|
|
572
610
|
interface RunSnapshot {
|
|
573
611
|
runId: string;
|
|
@@ -655,11 +693,14 @@ interface RunDatasetJobsWithSharedConcurrencyRequest {
|
|
|
655
693
|
jobs: ReadonlyArray<RunDatasetJob>;
|
|
656
694
|
globalConcurrency: number;
|
|
657
695
|
triggerId?: string;
|
|
696
|
+
/** Applied to every job in this batch (e.g. CLI `--experiment`). */
|
|
697
|
+
experimentName?: string;
|
|
658
698
|
}
|
|
659
699
|
interface RunnerApi {
|
|
660
700
|
collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
|
|
661
701
|
collectEvaluators(): Promise<ReadonlyArray<CollectedEvaluator>>;
|
|
662
702
|
collectRunConfigs(): Promise<ReadonlyArray<CollectedRunConfig>>;
|
|
703
|
+
/** Resolves a dataset by canonical **`Dataset` `name`** (id), case-insensitive. */
|
|
663
704
|
resolveDatasetByName(name: string): Promise<CollectedDataset | undefined>;
|
|
664
705
|
resolveEvaluatorsByNamePattern(pattern: string): Promise<ReadonlyArray<CollectedEvaluator>>;
|
|
665
706
|
/**
|
|
@@ -732,4 +773,4 @@ declare class TagSet {
|
|
|
732
773
|
static define<const T extends readonly string[]>(tags: T): TagSetMembers<T>;
|
|
733
774
|
}
|
|
734
775
|
|
|
735
|
-
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedRunConfig, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorName, EvaluatorNameSchema, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PROGRAMMATIC_RUN_CONFIG, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunConfig, RunConfigDefineConfig, RunConfigName, RunConfigNameSchema, RunConfigRow, RunConfigRowEvaluators, RunConfigRowPattern, RunDatasetJob, RunDatasetJobsWithSharedConcurrencyRequest, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TagSet, TagSetMembers, TestCase, TestCaseName, TestCaseNameSchema, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
|
|
776
|
+
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedRunConfig, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DatasetDefineConfig, DatasetName, DatasetNameSchema, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorName, EvaluatorNameSchema, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PROGRAMMATIC_RUN_CONFIG, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunConfig, RunConfigDefineConfig, RunConfigName, RunConfigNameSchema, RunConfigRow, RunConfigRowEvaluators, RunConfigRowPattern, RunDatasetJob, RunDatasetJobsWithSharedConcurrencyRequest, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TagSet, TagSetMembers, TestCase, TestCaseName, TestCaseNameSchema, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getDatasetDisplayLabel, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateDatasetName, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
|
package/dist/index.js
CHANGED
|
@@ -26,6 +26,7 @@ function makeEntityIdSchema(brand, label) {
|
|
|
26
26
|
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
27
27
|
var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
28
28
|
var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
|
|
29
|
+
var DatasetNameSchema = makeEntityIdSchema("DatasetName", "Dataset name");
|
|
29
30
|
function validateWithSchema(schema, raw, context) {
|
|
30
31
|
const trimmed = raw.trim();
|
|
31
32
|
const decode = Schema.decodeUnknownEither(
|
|
@@ -46,6 +47,9 @@ function validateEvaluatorName(raw, context) {
|
|
|
46
47
|
function validateTestCaseName(raw, context) {
|
|
47
48
|
return validateWithSchema(TestCaseNameSchema, raw, context);
|
|
48
49
|
}
|
|
50
|
+
function validateDatasetName(raw, context) {
|
|
51
|
+
return validateWithSchema(DatasetNameSchema, raw, context);
|
|
52
|
+
}
|
|
49
53
|
function normalizeOptionalDisplayName(raw) {
|
|
50
54
|
if (raw === void 0) {
|
|
51
55
|
return void 0;
|
|
@@ -54,6 +58,87 @@ function normalizeOptionalDisplayName(raw) {
|
|
|
54
58
|
return t.length === 0 ? void 0 : t;
|
|
55
59
|
}
|
|
56
60
|
|
|
61
|
+
// src/evals/dataset.ts
|
|
62
|
+
function matchesAny(value, matchers) {
|
|
63
|
+
return matchers.some(
|
|
64
|
+
(matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
|
|
65
|
+
);
|
|
66
|
+
}
|
|
67
|
+
function matchesAnyPath(filePath, matchers) {
|
|
68
|
+
return matchers.some((matcher) => {
|
|
69
|
+
if (typeof matcher === "string") {
|
|
70
|
+
return simpleGlobMatch(matcher, filePath);
|
|
71
|
+
}
|
|
72
|
+
return matcher.test(filePath);
|
|
73
|
+
});
|
|
74
|
+
}
|
|
75
|
+
function simpleGlobMatch(pattern, value) {
|
|
76
|
+
const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
|
|
77
|
+
return new RegExp(`^${escaped}$`).test(value);
|
|
78
|
+
}
|
|
79
|
+
var Dataset = class _Dataset {
|
|
80
|
+
constructor(config) {
|
|
81
|
+
this._config = config;
|
|
82
|
+
}
|
|
83
|
+
static define(config) {
|
|
84
|
+
const name = validateDatasetName(config.name, "Dataset.define");
|
|
85
|
+
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
86
|
+
return new _Dataset({
|
|
87
|
+
name,
|
|
88
|
+
displayName,
|
|
89
|
+
includedTags: config.includedTags ?? [],
|
|
90
|
+
excludedTags: config.excludedTags ?? [],
|
|
91
|
+
includedPaths: config.includedPaths ?? [],
|
|
92
|
+
excludedPaths: config.excludedPaths ?? []
|
|
93
|
+
});
|
|
94
|
+
}
|
|
95
|
+
/** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
|
|
96
|
+
getName() {
|
|
97
|
+
return this._config.name;
|
|
98
|
+
}
|
|
99
|
+
getDisplayName() {
|
|
100
|
+
return this._config.displayName;
|
|
101
|
+
}
|
|
102
|
+
/** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
|
|
103
|
+
getDisplayLabel() {
|
|
104
|
+
return this._config.displayName ?? this._config.name;
|
|
105
|
+
}
|
|
106
|
+
getIncludedTags() {
|
|
107
|
+
return this._config.includedTags;
|
|
108
|
+
}
|
|
109
|
+
getExcludedTags() {
|
|
110
|
+
return this._config.excludedTags;
|
|
111
|
+
}
|
|
112
|
+
getIncludedPaths() {
|
|
113
|
+
return this._config.includedPaths;
|
|
114
|
+
}
|
|
115
|
+
getExcludedPaths() {
|
|
116
|
+
return this._config.excludedPaths;
|
|
117
|
+
}
|
|
118
|
+
matchesTestCase(testCase, filePath) {
|
|
119
|
+
const tags = testCase.getTags();
|
|
120
|
+
if (this._config.excludedTags.length > 0) {
|
|
121
|
+
if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
|
|
122
|
+
return false;
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
if (this._config.excludedPaths.length > 0) {
|
|
126
|
+
if (matchesAnyPath(filePath, this._config.excludedPaths)) {
|
|
127
|
+
return false;
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
|
|
131
|
+
const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
|
|
132
|
+
return tagMatch && pathMatch;
|
|
133
|
+
}
|
|
134
|
+
};
|
|
135
|
+
function getDatasetDisplayLabel(dataset) {
|
|
136
|
+
if (typeof dataset.getDisplayLabel === "function") {
|
|
137
|
+
return dataset.getDisplayLabel();
|
|
138
|
+
}
|
|
139
|
+
return typeof dataset.getName === "function" ? dataset.getName() : "";
|
|
140
|
+
}
|
|
141
|
+
|
|
57
142
|
// src/evals/evaluator.ts
|
|
58
143
|
var Evaluator = class _Evaluator {
|
|
59
144
|
constructor(config) {
|
|
@@ -413,7 +498,7 @@ function toEvalDataset(item, snapshots) {
|
|
|
413
498
|
const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
|
|
414
499
|
return {
|
|
415
500
|
id: item.id,
|
|
416
|
-
name: item.dataset.getName(),
|
|
501
|
+
name: getDatasetDisplayLabel(item.dataset),
|
|
417
502
|
overview: `Discovered from ${item.filePath}`,
|
|
418
503
|
runs
|
|
419
504
|
};
|
|
@@ -466,70 +551,6 @@ function parseStartupArgs(argv) {
|
|
|
466
551
|
}
|
|
467
552
|
return args;
|
|
468
553
|
}
|
|
469
|
-
|
|
470
|
-
// src/evals/dataset.ts
|
|
471
|
-
function matchesAny(value, matchers) {
|
|
472
|
-
return matchers.some(
|
|
473
|
-
(matcher) => typeof matcher === "string" ? value === matcher : matcher.test(value)
|
|
474
|
-
);
|
|
475
|
-
}
|
|
476
|
-
function matchesAnyPath(filePath, matchers) {
|
|
477
|
-
return matchers.some((matcher) => {
|
|
478
|
-
if (typeof matcher === "string") {
|
|
479
|
-
return simpleGlobMatch(matcher, filePath);
|
|
480
|
-
}
|
|
481
|
-
return matcher.test(filePath);
|
|
482
|
-
});
|
|
483
|
-
}
|
|
484
|
-
function simpleGlobMatch(pattern, value) {
|
|
485
|
-
const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\?/g, "[^/]").replace(/\*\*\//g, "(?:.*/)?").replace(/\*\*/g, ".*").replace(/\*/g, "[^/]*");
|
|
486
|
-
return new RegExp(`^${escaped}$`).test(value);
|
|
487
|
-
}
|
|
488
|
-
var Dataset = class _Dataset {
|
|
489
|
-
constructor(config) {
|
|
490
|
-
this._config = config;
|
|
491
|
-
}
|
|
492
|
-
static define(config) {
|
|
493
|
-
return new _Dataset({
|
|
494
|
-
name: config.name,
|
|
495
|
-
includedTags: config.includedTags ?? [],
|
|
496
|
-
excludedTags: config.excludedTags ?? [],
|
|
497
|
-
includedPaths: config.includedPaths ?? [],
|
|
498
|
-
excludedPaths: config.excludedPaths ?? []
|
|
499
|
-
});
|
|
500
|
-
}
|
|
501
|
-
getName() {
|
|
502
|
-
return this._config.name;
|
|
503
|
-
}
|
|
504
|
-
getIncludedTags() {
|
|
505
|
-
return this._config.includedTags;
|
|
506
|
-
}
|
|
507
|
-
getExcludedTags() {
|
|
508
|
-
return this._config.excludedTags;
|
|
509
|
-
}
|
|
510
|
-
getIncludedPaths() {
|
|
511
|
-
return this._config.includedPaths;
|
|
512
|
-
}
|
|
513
|
-
getExcludedPaths() {
|
|
514
|
-
return this._config.excludedPaths;
|
|
515
|
-
}
|
|
516
|
-
matchesTestCase(testCase, filePath) {
|
|
517
|
-
const tags = testCase.getTags();
|
|
518
|
-
if (this._config.excludedTags.length > 0) {
|
|
519
|
-
if (tags.some((tag) => matchesAny(tag, this._config.excludedTags))) {
|
|
520
|
-
return false;
|
|
521
|
-
}
|
|
522
|
-
}
|
|
523
|
-
if (this._config.excludedPaths.length > 0) {
|
|
524
|
-
if (matchesAnyPath(filePath, this._config.excludedPaths)) {
|
|
525
|
-
return false;
|
|
526
|
-
}
|
|
527
|
-
}
|
|
528
|
-
const tagMatch = this._config.includedTags.length === 0 || tags.some((tag) => matchesAny(tag, this._config.includedTags));
|
|
529
|
-
const pathMatch = this._config.includedPaths.length === 0 || matchesAnyPath(filePath, this._config.includedPaths);
|
|
530
|
-
return tagMatch && pathMatch;
|
|
531
|
-
}
|
|
532
|
-
};
|
|
533
554
|
function preprocessForDiff(value, options) {
|
|
534
555
|
if (options?.sort && Array.isArray(value)) {
|
|
535
556
|
return [...value].sort((a, b) => {
|
|
@@ -795,7 +816,7 @@ var RunConfig = class _RunConfig {
|
|
|
795
816
|
getDisplayLabel() {
|
|
796
817
|
return this._displayName ?? this._name;
|
|
797
818
|
}
|
|
798
|
-
/** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
|
|
819
|
+
/** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
|
|
799
820
|
getTags() {
|
|
800
821
|
return [...this._tags];
|
|
801
822
|
}
|
|
@@ -968,10 +989,11 @@ var TestCase = class _TestCase {
|
|
|
968
989
|
static describe(config) {
|
|
969
990
|
const name = validateTestCaseName(config.name, "TestCase.describe");
|
|
970
991
|
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
992
|
+
const tags = config.tags !== void 0 ? [...config.tags] : [];
|
|
971
993
|
return new _TestCase({
|
|
972
994
|
name,
|
|
973
995
|
displayName,
|
|
974
|
-
tags
|
|
996
|
+
tags,
|
|
975
997
|
inputSchema: config.inputSchema,
|
|
976
998
|
input: config.input,
|
|
977
999
|
outputSchema: config.outputSchema,
|
|
@@ -988,7 +1010,7 @@ var TestCase = class _TestCase {
|
|
|
988
1010
|
return this._config.displayName ?? this._config.name;
|
|
989
1011
|
}
|
|
990
1012
|
getTags() {
|
|
991
|
-
return this._config.tags;
|
|
1013
|
+
return [...this._config.tags];
|
|
992
1014
|
}
|
|
993
1015
|
getInputSchema() {
|
|
994
1016
|
return this._config.inputSchema;
|
|
@@ -1545,15 +1567,16 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1545
1567
|
meta: {
|
|
1546
1568
|
triggerId: task.triggerId,
|
|
1547
1569
|
runId: evaluatorRunId,
|
|
1548
|
-
|
|
1570
|
+
datasetName: task.dataset.getDisplayLabel(),
|
|
1549
1571
|
repetitionId,
|
|
1550
1572
|
repetitionIndex,
|
|
1551
1573
|
repetitionCount,
|
|
1552
|
-
runConfigName: task.runConfigName
|
|
1574
|
+
runConfigName: task.runConfigName,
|
|
1575
|
+
...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
|
|
1576
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1577
|
+
runConfigTags: task.runConfigTags,
|
|
1578
|
+
evaluatorTags: getEvaluatorTagList(evaluator)
|
|
1553
1579
|
},
|
|
1554
|
-
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1555
|
-
runConfigTags: task.runConfigTags,
|
|
1556
|
-
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1557
1580
|
logDiff,
|
|
1558
1581
|
log,
|
|
1559
1582
|
createError
|
|
@@ -1960,7 +1983,7 @@ var EffectRunner = class {
|
|
|
1960
1983
|
);
|
|
1961
1984
|
if (!dsCollected) {
|
|
1962
1985
|
throw new Error(
|
|
1963
|
-
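`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`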
|
|
1986
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1964
1987
|
);
|
|
1965
1988
|
}
|
|
1966
1989
|
let evaluatorIds;
|
|
@@ -2032,7 +2055,8 @@ var EffectRunner = class {
|
|
|
2032
2055
|
globalEvaluationSemaphore: sem,
|
|
2033
2056
|
runConfigName: job.runConfigName,
|
|
2034
2057
|
runConfigTags: job.runConfigTags,
|
|
2035
|
-
repetitions: job.repetitions
|
|
2058
|
+
repetitions: job.repetitions,
|
|
2059
|
+
experimentName: request.experimentName
|
|
2036
2060
|
})
|
|
2037
2061
|
);
|
|
2038
2062
|
}
|
|
@@ -2067,7 +2091,8 @@ var EffectRunner = class {
|
|
|
2067
2091
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2068
2092
|
repetitions: request.repetitions,
|
|
2069
2093
|
runConfigName,
|
|
2070
|
-
runConfigTags: request.runConfigTags
|
|
2094
|
+
runConfigTags: request.runConfigTags,
|
|
2095
|
+
experimentName: request.experimentName
|
|
2071
2096
|
});
|
|
2072
2097
|
}
|
|
2073
2098
|
async startDatasetRun(params) {
|
|
@@ -2095,7 +2120,7 @@ var EffectRunner = class {
|
|
|
2095
2120
|
const snapshot = {
|
|
2096
2121
|
runId,
|
|
2097
2122
|
datasetId: params.datasetId,
|
|
2098
|
-
datasetName: dataset.dataset.getName(),
|
|
2123
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
2099
2124
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2100
2125
|
queuedAt: Date.now(),
|
|
2101
2126
|
totalTestCases: totalEvaluations,
|
|
@@ -2116,7 +2141,7 @@ var EffectRunner = class {
|
|
|
2116
2141
|
type: "RunQueued",
|
|
2117
2142
|
runId,
|
|
2118
2143
|
datasetId: params.datasetId,
|
|
2119
|
-
datasetName: dataset.dataset.getName(),
|
|
2144
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
2120
2145
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
2121
2146
|
totalTestCases: totalEvaluations,
|
|
2122
2147
|
artifactPath
|
|
@@ -2142,7 +2167,8 @@ var EffectRunner = class {
|
|
|
2142
2167
|
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2143
2168
|
runConfigName: params.runConfigName,
|
|
2144
2169
|
runConfigTags,
|
|
2145
|
-
repetitions
|
|
2170
|
+
repetitions,
|
|
2171
|
+
experimentName: params.experimentName
|
|
2146
2172
|
})
|
|
2147
2173
|
);
|
|
2148
2174
|
return snapshot;
|
|
@@ -2219,6 +2245,6 @@ var PROGRAMMATIC_RUN_CONFIG = {
|
|
|
2219
2245
|
runConfigName: "programmatic"
|
|
2220
2246
|
};
|
|
2221
2247
|
|
|
2222
|
-
export { Dataset, Evaluator, EvaluatorNameSchema, Metric, PROGRAMMATIC_RUN_CONFIG, RunConfig, RunConfigNameSchema, Score, TagSet, TestCase, TestCaseNameSchema, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
|
|
2248
|
+
export { Dataset, DatasetNameSchema, Evaluator, EvaluatorNameSchema, Metric, PROGRAMMATIC_RUN_CONFIG, RunConfig, RunConfigNameSchema, Score, TagSet, TestCase, TestCaseNameSchema, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getDatasetDisplayLabel, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateDatasetName, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
|
|
2223
2249
|
//# sourceMappingURL=out.js.map
|
|
2224
2250
|
//# sourceMappingURL=index.js.map
|