@m4trix/evals 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +179 -88
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +179 -88
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +124 -50
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +124 -50
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +120 -45
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +42 -6
- package/dist/index.js +119 -46
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -289,6 +289,8 @@ declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, T
|
|
|
289
289
|
interface MetricItem<TData = unknown> {
|
|
290
290
|
readonly id: string;
|
|
291
291
|
readonly data: TData;
|
|
292
|
+
/** Per-item display name override (wins over def.name in rendering) */
|
|
293
|
+
readonly name?: string;
|
|
292
294
|
}
|
|
293
295
|
interface FormatMetricOptions {
|
|
294
296
|
isAggregated?: boolean;
|
|
@@ -298,7 +300,9 @@ interface MetricDef<TData = unknown> {
|
|
|
298
300
|
readonly name?: string;
|
|
299
301
|
readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
|
|
300
302
|
format(data: TData, options?: FormatMetricOptions): string;
|
|
301
|
-
make(data: TData
|
|
303
|
+
make(data: TData, options?: {
|
|
304
|
+
name?: string;
|
|
305
|
+
}): MetricItem<TData>;
|
|
302
306
|
}
|
|
303
307
|
declare const Metric: {
|
|
304
308
|
of<TData>(config: {
|
|
@@ -315,6 +319,10 @@ interface ScoreItem<TData = unknown> {
|
|
|
315
319
|
readonly id: string;
|
|
316
320
|
readonly data: TData;
|
|
317
321
|
readonly passed?: boolean;
|
|
322
|
+
/** Per-item display name override (wins over def.name in rendering) */
|
|
323
|
+
readonly name?: string;
|
|
324
|
+
/** Attached def for formatting/aggregation without registry lookup (avoids n/a across module boundaries) */
|
|
325
|
+
readonly def?: ScoreDef<TData>;
|
|
318
326
|
}
|
|
319
327
|
interface FormatScoreOptions {
|
|
320
328
|
isAggregated?: boolean;
|
|
@@ -323,19 +331,42 @@ interface ScoreDef<TData = unknown> {
|
|
|
323
331
|
readonly id: string;
|
|
324
332
|
readonly name?: string;
|
|
325
333
|
readonly displayStrategy: ScoreDisplayStrategy;
|
|
326
|
-
readonly
|
|
327
|
-
|
|
334
|
+
readonly formatValue: (data: TData) => string;
|
|
335
|
+
readonly formatAggregate: (data: TData) => string;
|
|
336
|
+
readonly aggregateValues: (values: ReadonlyArray<TData>) => TData;
|
|
328
337
|
make(data: TData, options?: {
|
|
329
338
|
definePassed?: (data: TData) => boolean;
|
|
339
|
+
name?: string;
|
|
330
340
|
}): ScoreItem<TData>;
|
|
331
341
|
}
|
|
342
|
+
/** Helper to format using the right method based on isAggregated (for consumers that need a single entry point) */
|
|
343
|
+
declare function formatScoreData<TData>(def: ScoreDef<TData>, data: TData, options?: FormatScoreOptions): string;
|
|
332
344
|
declare const Score: {
|
|
345
|
+
aggregate: {
|
|
346
|
+
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
347
|
+
averageFields<K extends string>(fields: readonly K[]): (values: readonly Record<K, number>[]) => Record<K, number>;
|
|
348
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
349
|
+
averageWithVariance<K_1 extends string>(fields: readonly K_1[]): (values: readonly Record<K_1, number>[]) => Record<K_1, number> & {
|
|
350
|
+
stdDev?: number | undefined;
|
|
351
|
+
count: number;
|
|
352
|
+
};
|
|
353
|
+
/** All runs must pass. Use for binary scores. */
|
|
354
|
+
all<T extends {
|
|
355
|
+
passed: boolean;
|
|
356
|
+
}>(values: readonly T[]): T & {
|
|
357
|
+
passedCount?: number | undefined;
|
|
358
|
+
totalCount?: number | undefined;
|
|
359
|
+
};
|
|
360
|
+
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
361
|
+
last<T_1>(values: readonly T_1[]): T_1;
|
|
362
|
+
};
|
|
333
363
|
of<TData>(config: {
|
|
334
364
|
id: string;
|
|
335
365
|
name?: string | undefined;
|
|
336
366
|
displayStrategy: ScoreDisplayStrategy;
|
|
337
|
-
|
|
338
|
-
|
|
367
|
+
formatValue: (data: TData) => string;
|
|
368
|
+
formatAggregate: (data: TData) => string;
|
|
369
|
+
aggregateValues: (values: readonly TData[]) => TData;
|
|
339
370
|
}): ScoreDef<TData>;
|
|
340
371
|
};
|
|
341
372
|
declare function getScoreById(id: string): ScoreDef<unknown> | undefined;
|
|
@@ -480,6 +511,11 @@ interface PercentScoreData {
|
|
|
480
511
|
count?: number;
|
|
481
512
|
}
|
|
482
513
|
declare const percentScore: ScoreDef<PercentScoreData>;
|
|
514
|
+
interface DeltaScoreData {
|
|
515
|
+
value: number;
|
|
516
|
+
delta: number;
|
|
517
|
+
}
|
|
518
|
+
declare const deltaScore: ScoreDef<DeltaScoreData>;
|
|
483
519
|
interface BinaryScoreData {
|
|
484
520
|
passed: boolean;
|
|
485
521
|
passedCount?: number;
|
|
@@ -487,4 +523,4 @@ interface BinaryScoreData {
|
|
|
487
523
|
}
|
|
488
524
|
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
489
525
|
|
|
490
|
-
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
526
|
+
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
package/dist/index.js
CHANGED
|
@@ -501,7 +501,11 @@ var Metric = {
|
|
|
501
501
|
name: config.name,
|
|
502
502
|
aggregate: config.aggregate,
|
|
503
503
|
format: config.format,
|
|
504
|
-
make: (data) => ({
|
|
504
|
+
make: (data, options) => ({
|
|
505
|
+
id: config.id,
|
|
506
|
+
data,
|
|
507
|
+
...options?.name !== void 0 && { name: options.name }
|
|
508
|
+
})
|
|
505
509
|
};
|
|
506
510
|
registry.set(config.id, def);
|
|
507
511
|
return def;
|
|
@@ -513,20 +517,107 @@ function getMetricById(id) {
|
|
|
513
517
|
|
|
514
518
|
// src/evals/score.ts
|
|
515
519
|
var registry2 = /* @__PURE__ */ new Map();
|
|
520
|
+
function formatScoreData(def, data, options) {
|
|
521
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
522
|
+
}
|
|
523
|
+
var ScoreAggregate = {
|
|
524
|
+
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
525
|
+
averageFields(fields) {
|
|
526
|
+
return (values) => {
|
|
527
|
+
const count = values.length || 1;
|
|
528
|
+
const result = {};
|
|
529
|
+
for (const field of fields) {
|
|
530
|
+
result[field] = values.reduce(
|
|
531
|
+
(s, v) => s + (v[field] ?? 0),
|
|
532
|
+
0
|
|
533
|
+
) / count;
|
|
534
|
+
}
|
|
535
|
+
return result;
|
|
536
|
+
};
|
|
537
|
+
},
|
|
538
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
539
|
+
averageWithVariance(fields) {
|
|
540
|
+
return (values) => {
|
|
541
|
+
const count = values.length;
|
|
542
|
+
const result = {};
|
|
543
|
+
for (const field of fields) {
|
|
544
|
+
result[field] = count === 0 ? 0 : values.reduce(
|
|
545
|
+
(sum, item) => sum + (item[field] ?? 0),
|
|
546
|
+
0
|
|
547
|
+
) / count;
|
|
548
|
+
}
|
|
549
|
+
const valueField = "value";
|
|
550
|
+
const hasValueField = fields.includes(valueField);
|
|
551
|
+
if (count === 0) {
|
|
552
|
+
if (hasValueField) {
|
|
553
|
+
result[valueField] = 0;
|
|
554
|
+
}
|
|
555
|
+
return {
|
|
556
|
+
...result,
|
|
557
|
+
stdDev: void 0,
|
|
558
|
+
count: 0
|
|
559
|
+
};
|
|
560
|
+
}
|
|
561
|
+
let stdDev;
|
|
562
|
+
if (hasValueField && count >= 2) {
|
|
563
|
+
const sum = values.reduce(
|
|
564
|
+
(s, v) => s + (v[valueField] ?? 0),
|
|
565
|
+
0
|
|
566
|
+
);
|
|
567
|
+
const sumSq = values.reduce(
|
|
568
|
+
(s, v) => {
|
|
569
|
+
const value = v[valueField] ?? 0;
|
|
570
|
+
return s + value * value;
|
|
571
|
+
},
|
|
572
|
+
0
|
|
573
|
+
);
|
|
574
|
+
const mean = sum / count;
|
|
575
|
+
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
576
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
577
|
+
}
|
|
578
|
+
return {
|
|
579
|
+
...values[0],
|
|
580
|
+
...result,
|
|
581
|
+
stdDev,
|
|
582
|
+
count
|
|
583
|
+
};
|
|
584
|
+
};
|
|
585
|
+
},
|
|
586
|
+
/** All runs must pass. Use for binary scores. */
|
|
587
|
+
all(values) {
|
|
588
|
+
const total = values.length;
|
|
589
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
590
|
+
return {
|
|
591
|
+
...values[0],
|
|
592
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
593
|
+
passedCount,
|
|
594
|
+
totalCount: total
|
|
595
|
+
};
|
|
596
|
+
},
|
|
597
|
+
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
598
|
+
last(values) {
|
|
599
|
+
return values[values.length - 1] ?? {};
|
|
600
|
+
}
|
|
601
|
+
};
|
|
516
602
|
var Score = {
|
|
603
|
+
aggregate: ScoreAggregate,
|
|
517
604
|
of(config) {
|
|
518
605
|
const def = {
|
|
519
606
|
id: config.id,
|
|
520
607
|
name: config.name,
|
|
521
608
|
displayStrategy: config.displayStrategy,
|
|
522
|
-
|
|
523
|
-
|
|
609
|
+
formatValue: config.formatValue,
|
|
610
|
+
formatAggregate: config.formatAggregate,
|
|
611
|
+
aggregateValues: config.aggregateValues,
|
|
524
612
|
make: (data, options) => {
|
|
525
613
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
526
614
|
return {
|
|
527
615
|
id: config.id,
|
|
528
616
|
data,
|
|
529
|
-
...passed !== void 0 && { passed }
|
|
617
|
+
...passed !== void 0 && { passed },
|
|
618
|
+
...options?.name !== void 0 && { name: options.name },
|
|
619
|
+
def
|
|
620
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
530
621
|
};
|
|
531
622
|
}
|
|
532
623
|
};
|
|
@@ -539,29 +630,6 @@ function getScoreById(id) {
|
|
|
539
630
|
}
|
|
540
631
|
|
|
541
632
|
// src/evals/aggregators.ts
|
|
542
|
-
function aggregateAverageWithVariance(values) {
|
|
543
|
-
if (values.length === 0) {
|
|
544
|
-
return { value: 0, count: 0 };
|
|
545
|
-
}
|
|
546
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
547
|
-
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
548
|
-
const mean = sum / values.length;
|
|
549
|
-
let stdDev;
|
|
550
|
-
if (values.length >= 2) {
|
|
551
|
-
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
552
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
553
|
-
}
|
|
554
|
-
return { value: mean, stdDev, count: values.length };
|
|
555
|
-
}
|
|
556
|
-
function aggregateAll(values) {
|
|
557
|
-
const total = values.length;
|
|
558
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
559
|
-
return {
|
|
560
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
561
|
-
passedCount,
|
|
562
|
-
totalCount: total
|
|
563
|
-
};
|
|
564
|
-
}
|
|
565
633
|
function aggregateTokenCountSum(values) {
|
|
566
634
|
const initial = {
|
|
567
635
|
input: 0,
|
|
@@ -614,29 +682,31 @@ var percentScore = Score.of({
|
|
|
614
682
|
id: "percent",
|
|
615
683
|
name: "Score",
|
|
616
684
|
displayStrategy: "bar",
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
685
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
686
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
687
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
688
|
+
});
|
|
689
|
+
var deltaScore = Score.of({
|
|
690
|
+
id: "delta",
|
|
691
|
+
name: "Delta",
|
|
692
|
+
displayStrategy: "number",
|
|
693
|
+
formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
|
|
694
|
+
formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
|
|
695
|
+
aggregateValues: Score.aggregate.averageFields(["value", "delta"])
|
|
624
696
|
});
|
|
625
697
|
var binaryScore = Score.of({
|
|
626
698
|
id: "binary",
|
|
627
699
|
name: "Result",
|
|
628
700
|
displayStrategy: "passFail",
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
}
|
|
635
|
-
return base;
|
|
701
|
+
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
702
|
+
formatAggregate: (data) => {
|
|
703
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
704
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
705
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
636
706
|
}
|
|
637
|
-
return
|
|
707
|
+
return base;
|
|
638
708
|
},
|
|
639
|
-
|
|
709
|
+
aggregateValues: Score.aggregate.all
|
|
640
710
|
});
|
|
641
711
|
function createDiffString(expected, actual, diffOptions) {
|
|
642
712
|
const opts = { ...diffOptions, color: false };
|
|
@@ -952,9 +1022,12 @@ async function collectTestCasesFromFiles(config) {
|
|
|
952
1022
|
}
|
|
953
1023
|
|
|
954
1024
|
// src/runner/score-utils.ts
|
|
1025
|
+
function getScoreDef(item) {
|
|
1026
|
+
return item.def ?? getScoreById(item.id);
|
|
1027
|
+
}
|
|
955
1028
|
function toNumericScoreFromScores(scores) {
|
|
956
1029
|
for (const item of scores) {
|
|
957
|
-
const def =
|
|
1030
|
+
const def = getScoreDef(item);
|
|
958
1031
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
959
1032
|
const value = item.data.value;
|
|
960
1033
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -1342,7 +1415,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
|
|
|
1342
1415
|
() => appendJsonLine(message.artifactPath, {
|
|
1343
1416
|
runId: message.runId,
|
|
1344
1417
|
ts: Date.now(),
|
|
1345
|
-
...message.payload
|
|
1418
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
1346
1419
|
})
|
|
1347
1420
|
);
|
|
1348
1421
|
})
|
|
@@ -1642,6 +1715,6 @@ var EffectRunner = class {
|
|
|
1642
1715
|
}
|
|
1643
1716
|
};
|
|
1644
1717
|
|
|
1645
|
-
export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
1718
|
+
export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
1646
1719
|
//# sourceMappingURL=out.js.map
|
|
1647
1720
|
//# sourceMappingURL=index.js.map
|