@m4trix/evals 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +113 -76
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +113 -76
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +87 -47
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +87 -47
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +86 -44
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +54 -5
- package/dist/index.js +85 -45
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -234,10 +234,23 @@ interface EvalMiddleware<TCtx> {
|
|
|
234
234
|
name: string;
|
|
235
235
|
resolve: () => TCtx | Promise<TCtx>;
|
|
236
236
|
}
|
|
237
|
+
interface EvaluateMeta {
|
|
238
|
+
/** Identifier of the trigger that started the run (for example, a CLI invocation). */
|
|
239
|
+
triggerId: string;
|
|
240
|
+
/**
|
|
241
|
+
* Identifier of the current test-case execution shared across all evaluators
|
|
242
|
+
* for this specific test-case run.
|
|
243
|
+
*/
|
|
244
|
+
runId: string;
|
|
245
|
+
/** Identifier of the dataset currently being evaluated. */
|
|
246
|
+
datasetId: string;
|
|
247
|
+
}
|
|
237
248
|
interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
|
|
238
249
|
input: TInput;
|
|
239
250
|
ctx: TCtx;
|
|
240
251
|
output?: TOutput;
|
|
252
|
+
/** Metadata about the current evaluator invocation. */
|
|
253
|
+
meta: EvaluateMeta;
|
|
241
254
|
/** Records a diff for this test case; stored in run artifact and shown by CLI */
|
|
242
255
|
logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
|
|
243
256
|
/** Logs a message or object for this test case; stored in run artifact and shown by CLI */
|
|
@@ -302,6 +315,8 @@ interface ScoreItem<TData = unknown> {
|
|
|
302
315
|
readonly id: string;
|
|
303
316
|
readonly data: TData;
|
|
304
317
|
readonly passed?: boolean;
|
|
318
|
+
/** Attached def for formatting/aggregation without registry lookup (avoids n/a across module boundaries) */
|
|
319
|
+
readonly def?: ScoreDef<TData>;
|
|
305
320
|
}
|
|
306
321
|
interface FormatScoreOptions {
|
|
307
322
|
isAggregated?: boolean;
|
|
@@ -310,19 +325,43 @@ interface ScoreDef<TData = unknown> {
|
|
|
310
325
|
readonly id: string;
|
|
311
326
|
readonly name?: string;
|
|
312
327
|
readonly displayStrategy: ScoreDisplayStrategy;
|
|
313
|
-
readonly
|
|
314
|
-
|
|
328
|
+
readonly formatValue: (data: TData) => string;
|
|
329
|
+
readonly formatAggregate: (data: TData) => string;
|
|
330
|
+
readonly aggregateValues: (values: ReadonlyArray<TData>) => TData;
|
|
315
331
|
make(data: TData, options?: {
|
|
316
332
|
definePassed?: (data: TData) => boolean;
|
|
317
333
|
}): ScoreItem<TData>;
|
|
318
334
|
}
|
|
335
|
+
/** Helper to format using the right method based on isAggregated (for consumers that need a single entry point) */
|
|
336
|
+
declare function formatScoreData<TData>(def: ScoreDef<TData>, data: TData, options?: FormatScoreOptions): string;
|
|
319
337
|
declare const Score: {
|
|
338
|
+
aggregate: {
|
|
339
|
+
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
340
|
+
averageFields<K extends string>(fields: readonly K[]): (values: readonly Record<K, number>[]) => Record<K, number>;
|
|
341
|
+
/** Average `value` with sample std dev. Use for percent-style scores. */
|
|
342
|
+
averageWithVariance<T extends {
|
|
343
|
+
value: number;
|
|
344
|
+
}>(values: readonly T[]): T & {
|
|
345
|
+
stdDev?: number | undefined;
|
|
346
|
+
count: number;
|
|
347
|
+
};
|
|
348
|
+
/** All runs must pass. Use for binary scores. */
|
|
349
|
+
all<T_1 extends {
|
|
350
|
+
passed: boolean;
|
|
351
|
+
}>(values: readonly T_1[]): T_1 & {
|
|
352
|
+
passedCount?: number | undefined;
|
|
353
|
+
totalCount?: number | undefined;
|
|
354
|
+
};
|
|
355
|
+
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
356
|
+
last<T_2>(values: readonly T_2[]): T_2;
|
|
357
|
+
};
|
|
320
358
|
of<TData>(config: {
|
|
321
359
|
id: string;
|
|
322
360
|
name?: string | undefined;
|
|
323
361
|
displayStrategy: ScoreDisplayStrategy;
|
|
324
|
-
|
|
325
|
-
|
|
362
|
+
formatValue: (data: TData) => string;
|
|
363
|
+
formatAggregate: (data: TData) => string;
|
|
364
|
+
aggregateValues: (values: readonly TData[]) => TData;
|
|
326
365
|
}): ScoreDef<TData>;
|
|
327
366
|
};
|
|
328
367
|
declare function getScoreById(id: string): ScoreDef<unknown> | undefined;
|
|
@@ -349,6 +388,11 @@ interface SearchTestCasesQuery {
|
|
|
349
388
|
excludedPaths?: ReadonlyArray<string | RegExp>;
|
|
350
389
|
}
|
|
351
390
|
interface RunDatasetRequest {
|
|
391
|
+
/**
|
|
392
|
+
* Identifier for what triggered the run request (for example, a CLI command).
|
|
393
|
+
* When omitted, the runner generates one in the format `trg-[uuid]`.
|
|
394
|
+
*/
|
|
395
|
+
triggerId?: string;
|
|
352
396
|
datasetId: string;
|
|
353
397
|
evaluatorIds: ReadonlyArray<string>;
|
|
354
398
|
concurrency?: number;
|
|
@@ -462,6 +506,11 @@ interface PercentScoreData {
|
|
|
462
506
|
count?: number;
|
|
463
507
|
}
|
|
464
508
|
declare const percentScore: ScoreDef<PercentScoreData>;
|
|
509
|
+
interface DeltaScoreData {
|
|
510
|
+
value: number;
|
|
511
|
+
delta: number;
|
|
512
|
+
}
|
|
513
|
+
declare const deltaScore: ScoreDef<DeltaScoreData>;
|
|
465
514
|
interface BinaryScoreData {
|
|
466
515
|
passed: boolean;
|
|
467
516
|
passedCount?: number;
|
|
@@ -469,4 +518,4 @@ interface BinaryScoreData {
|
|
|
469
518
|
}
|
|
470
519
|
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
471
520
|
|
|
472
|
-
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
521
|
+
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
package/dist/index.js
CHANGED
|
@@ -513,20 +513,70 @@ function getMetricById(id) {
|
|
|
513
513
|
|
|
514
514
|
// src/evals/score.ts
|
|
515
515
|
var registry2 = /* @__PURE__ */ new Map();
|
|
516
|
+
function formatScoreData(def, data, options) {
|
|
517
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
518
|
+
}
|
|
519
|
+
var ScoreAggregate = {
|
|
520
|
+
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
521
|
+
averageFields(fields) {
|
|
522
|
+
return (values) => {
|
|
523
|
+
const count = values.length || 1;
|
|
524
|
+
const result = {};
|
|
525
|
+
for (const field of fields) {
|
|
526
|
+
result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
|
|
527
|
+
}
|
|
528
|
+
return result;
|
|
529
|
+
};
|
|
530
|
+
},
|
|
531
|
+
/** Average `value` with sample std dev. Use for percent-style scores. */
|
|
532
|
+
averageWithVariance(values) {
|
|
533
|
+
if (values.length === 0) {
|
|
534
|
+
return { value: 0, stdDev: void 0, count: 0 };
|
|
535
|
+
}
|
|
536
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
537
|
+
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
538
|
+
const mean = sum / values.length;
|
|
539
|
+
let stdDev;
|
|
540
|
+
if (values.length >= 2) {
|
|
541
|
+
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
542
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
543
|
+
}
|
|
544
|
+
return { ...values[0], value: mean, stdDev, count: values.length };
|
|
545
|
+
},
|
|
546
|
+
/** All runs must pass. Use for binary scores. */
|
|
547
|
+
all(values) {
|
|
548
|
+
const total = values.length;
|
|
549
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
550
|
+
return {
|
|
551
|
+
...values[0],
|
|
552
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
553
|
+
passedCount,
|
|
554
|
+
totalCount: total
|
|
555
|
+
};
|
|
556
|
+
},
|
|
557
|
+
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
558
|
+
last(values) {
|
|
559
|
+
return values[values.length - 1] ?? {};
|
|
560
|
+
}
|
|
561
|
+
};
|
|
516
562
|
var Score = {
|
|
563
|
+
aggregate: ScoreAggregate,
|
|
517
564
|
of(config) {
|
|
518
565
|
const def = {
|
|
519
566
|
id: config.id,
|
|
520
567
|
name: config.name,
|
|
521
568
|
displayStrategy: config.displayStrategy,
|
|
522
|
-
|
|
523
|
-
|
|
569
|
+
formatValue: config.formatValue,
|
|
570
|
+
formatAggregate: config.formatAggregate,
|
|
571
|
+
aggregateValues: config.aggregateValues,
|
|
524
572
|
make: (data, options) => {
|
|
525
573
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
526
574
|
return {
|
|
527
575
|
id: config.id,
|
|
528
576
|
data,
|
|
529
|
-
...passed !== void 0 && { passed }
|
|
577
|
+
...passed !== void 0 && { passed },
|
|
578
|
+
def
|
|
579
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
530
580
|
};
|
|
531
581
|
}
|
|
532
582
|
};
|
|
@@ -539,29 +589,6 @@ function getScoreById(id) {
|
|
|
539
589
|
}
|
|
540
590
|
|
|
541
591
|
// src/evals/aggregators.ts
|
|
542
|
-
function aggregateAverageWithVariance(values) {
|
|
543
|
-
if (values.length === 0) {
|
|
544
|
-
return { value: 0, count: 0 };
|
|
545
|
-
}
|
|
546
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
547
|
-
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
548
|
-
const mean = sum / values.length;
|
|
549
|
-
let stdDev;
|
|
550
|
-
if (values.length >= 2) {
|
|
551
|
-
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
552
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
553
|
-
}
|
|
554
|
-
return { value: mean, stdDev, count: values.length };
|
|
555
|
-
}
|
|
556
|
-
function aggregateAll(values) {
|
|
557
|
-
const total = values.length;
|
|
558
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
559
|
-
return {
|
|
560
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
561
|
-
passedCount,
|
|
562
|
-
totalCount: total
|
|
563
|
-
};
|
|
564
|
-
}
|
|
565
592
|
function aggregateTokenCountSum(values) {
|
|
566
593
|
const initial = {
|
|
567
594
|
input: 0,
|
|
@@ -614,29 +641,31 @@ var percentScore = Score.of({
|
|
|
614
641
|
id: "percent",
|
|
615
642
|
name: "Score",
|
|
616
643
|
displayStrategy: "bar",
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
644
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
645
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
646
|
+
aggregateValues: Score.aggregate.averageWithVariance
|
|
647
|
+
});
|
|
648
|
+
var deltaScore = Score.of({
|
|
649
|
+
id: "delta",
|
|
650
|
+
name: "Delta",
|
|
651
|
+
displayStrategy: "number",
|
|
652
|
+
formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
|
|
653
|
+
formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
|
|
654
|
+
aggregateValues: Score.aggregate.averageFields(["value", "delta"])
|
|
624
655
|
});
|
|
625
656
|
var binaryScore = Score.of({
|
|
626
657
|
id: "binary",
|
|
627
658
|
name: "Result",
|
|
628
659
|
displayStrategy: "passFail",
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
}
|
|
635
|
-
return base;
|
|
660
|
+
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
661
|
+
formatAggregate: (data) => {
|
|
662
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
663
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
664
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
636
665
|
}
|
|
637
|
-
return
|
|
666
|
+
return base;
|
|
638
667
|
},
|
|
639
|
-
|
|
668
|
+
aggregateValues: Score.aggregate.all
|
|
640
669
|
});
|
|
641
670
|
function createDiffString(expected, actual, diffOptions) {
|
|
642
671
|
const opts = { ...diffOptions, color: false };
|
|
@@ -952,9 +981,12 @@ async function collectTestCasesFromFiles(config) {
|
|
|
952
981
|
}
|
|
953
982
|
|
|
954
983
|
// src/runner/score-utils.ts
|
|
984
|
+
function getScoreDef(item) {
|
|
985
|
+
return item.def ?? getScoreById(item.id);
|
|
986
|
+
}
|
|
955
987
|
function toNumericScoreFromScores(scores) {
|
|
956
988
|
for (const item of scores) {
|
|
957
|
-
const def =
|
|
989
|
+
const def = getScoreDef(item);
|
|
958
990
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
959
991
|
const value = item.data.value;
|
|
960
992
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -1035,6 +1067,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1035
1067
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1036
1068
|
const rerunPassed = [];
|
|
1037
1069
|
for (let r = 0; r < reruns; r++) {
|
|
1070
|
+
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1038
1071
|
const started = Date.now();
|
|
1039
1072
|
const evaluatorScores = [];
|
|
1040
1073
|
let testCaseError;
|
|
@@ -1061,6 +1094,11 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1061
1094
|
input: testCaseItem.testCase.getInput(),
|
|
1062
1095
|
ctx,
|
|
1063
1096
|
output,
|
|
1097
|
+
meta: {
|
|
1098
|
+
triggerId: task.triggerId,
|
|
1099
|
+
runId: evaluatorRunId,
|
|
1100
|
+
datasetId: task.datasetId
|
|
1101
|
+
},
|
|
1064
1102
|
logDiff,
|
|
1065
1103
|
log
|
|
1066
1104
|
})
|
|
@@ -1336,7 +1374,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
|
|
|
1336
1374
|
() => appendJsonLine(message.artifactPath, {
|
|
1337
1375
|
runId: message.runId,
|
|
1338
1376
|
ts: Date.now(),
|
|
1339
|
-
...message.payload
|
|
1377
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
1340
1378
|
})
|
|
1341
1379
|
);
|
|
1342
1380
|
})
|
|
@@ -1520,6 +1558,7 @@ var EffectRunner = class {
|
|
|
1520
1558
|
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1521
1559
|
0
|
|
1522
1560
|
);
|
|
1561
|
+
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
1523
1562
|
const runId = `run-${randomUUID()}`;
|
|
1524
1563
|
const artifactPath = createArtifactPath(
|
|
1525
1564
|
this.config.artifactDirectory,
|
|
@@ -1561,6 +1600,7 @@ var EffectRunner = class {
|
|
|
1561
1600
|
await Effect.runPromise(
|
|
1562
1601
|
Queue.offer(this.runQueue, {
|
|
1563
1602
|
runId,
|
|
1603
|
+
triggerId,
|
|
1564
1604
|
datasetId: request.datasetId,
|
|
1565
1605
|
dataset: dataset.dataset,
|
|
1566
1606
|
evaluators: selectedEvaluators,
|
|
@@ -1634,6 +1674,6 @@ var EffectRunner = class {
|
|
|
1634
1674
|
}
|
|
1635
1675
|
};
|
|
1636
1676
|
|
|
1637
|
-
export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
1677
|
+
export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
1638
1678
|
//# sourceMappingURL=out.js.map
|
|
1639
1679
|
//# sourceMappingURL=index.js.map
|