@m4trix/evals 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +105 -76
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +105 -76
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +79 -47
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +79 -47
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +78 -44
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +36 -5
- package/dist/index.js +77 -45
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -315,6 +315,8 @@ interface ScoreItem<TData = unknown> {
|
|
|
315
315
|
readonly id: string;
|
|
316
316
|
readonly data: TData;
|
|
317
317
|
readonly passed?: boolean;
|
|
318
|
+
/** Attached def for formatting/aggregation without registry lookup (avoids n/a across module boundaries) */
|
|
319
|
+
readonly def?: ScoreDef<TData>;
|
|
318
320
|
}
|
|
319
321
|
interface FormatScoreOptions {
|
|
320
322
|
isAggregated?: boolean;
|
|
@@ -323,19 +325,43 @@ interface ScoreDef<TData = unknown> {
|
|
|
323
325
|
readonly id: string;
|
|
324
326
|
readonly name?: string;
|
|
325
327
|
readonly displayStrategy: ScoreDisplayStrategy;
|
|
326
|
-
readonly
|
|
327
|
-
|
|
328
|
+
readonly formatValue: (data: TData) => string;
|
|
329
|
+
readonly formatAggregate: (data: TData) => string;
|
|
330
|
+
readonly aggregateValues: (values: ReadonlyArray<TData>) => TData;
|
|
328
331
|
make(data: TData, options?: {
|
|
329
332
|
definePassed?: (data: TData) => boolean;
|
|
330
333
|
}): ScoreItem<TData>;
|
|
331
334
|
}
|
|
335
|
+
/** Helper to format using the right method based on isAggregated (for consumers that need a single entry point) */
|
|
336
|
+
declare function formatScoreData<TData>(def: ScoreDef<TData>, data: TData, options?: FormatScoreOptions): string;
|
|
332
337
|
declare const Score: {
|
|
338
|
+
aggregate: {
|
|
339
|
+
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
340
|
+
averageFields<K extends string>(fields: readonly K[]): (values: readonly Record<K, number>[]) => Record<K, number>;
|
|
341
|
+
/** Average `value` with sample std dev. Use for percent-style scores. */
|
|
342
|
+
averageWithVariance<T extends {
|
|
343
|
+
value: number;
|
|
344
|
+
}>(values: readonly T[]): T & {
|
|
345
|
+
stdDev?: number | undefined;
|
|
346
|
+
count: number;
|
|
347
|
+
};
|
|
348
|
+
/** All runs must pass. Use for binary scores. */
|
|
349
|
+
all<T_1 extends {
|
|
350
|
+
passed: boolean;
|
|
351
|
+
}>(values: readonly T_1[]): T_1 & {
|
|
352
|
+
passedCount?: number | undefined;
|
|
353
|
+
totalCount?: number | undefined;
|
|
354
|
+
};
|
|
355
|
+
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
356
|
+
last<T_2>(values: readonly T_2[]): T_2;
|
|
357
|
+
};
|
|
333
358
|
of<TData>(config: {
|
|
334
359
|
id: string;
|
|
335
360
|
name?: string | undefined;
|
|
336
361
|
displayStrategy: ScoreDisplayStrategy;
|
|
337
|
-
|
|
338
|
-
|
|
362
|
+
formatValue: (data: TData) => string;
|
|
363
|
+
formatAggregate: (data: TData) => string;
|
|
364
|
+
aggregateValues: (values: readonly TData[]) => TData;
|
|
339
365
|
}): ScoreDef<TData>;
|
|
340
366
|
};
|
|
341
367
|
declare function getScoreById(id: string): ScoreDef<unknown> | undefined;
|
|
@@ -480,6 +506,11 @@ interface PercentScoreData {
|
|
|
480
506
|
count?: number;
|
|
481
507
|
}
|
|
482
508
|
declare const percentScore: ScoreDef<PercentScoreData>;
|
|
509
|
+
interface DeltaScoreData {
|
|
510
|
+
value: number;
|
|
511
|
+
delta: number;
|
|
512
|
+
}
|
|
513
|
+
declare const deltaScore: ScoreDef<DeltaScoreData>;
|
|
483
514
|
interface BinaryScoreData {
|
|
484
515
|
passed: boolean;
|
|
485
516
|
passedCount?: number;
|
|
@@ -487,4 +518,4 @@ interface BinaryScoreData {
|
|
|
487
518
|
}
|
|
488
519
|
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
489
520
|
|
|
490
|
-
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
521
|
+
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
package/dist/index.js
CHANGED
|
@@ -513,20 +513,70 @@ function getMetricById(id) {
|
|
|
513
513
|
|
|
514
514
|
// src/evals/score.ts
|
|
515
515
|
var registry2 = /* @__PURE__ */ new Map();
|
|
516
|
+
/**
 * Single entry point for turning score data into a display string.
 *
 * Delegates to `def.formatAggregate` when `options.isAggregated` is truthy,
 * and to `def.formatValue` otherwise (including when `options` is omitted).
 *
 * @param def - Score definition providing `formatValue` and `formatAggregate`.
 * @param data - Score payload handed unchanged to the selected formatter.
 * @param options - Optional flags; only `isAggregated` is read here.
 * @returns Whatever the selected formatter returns.
 */
function formatScoreData(def, data, options) {
  if (options?.isAggregated) {
    return def.formatAggregate(data);
  }
  return def.formatValue(data);
}
|
|
519
|
+
/**
 * Built-in aggregation strategies for combining the score data of several
 * runs into a single value. Exposed to consumers as `Score.aggregate`.
 */
var ScoreAggregate = {
  /**
   * Average numeric fields. Use for scores like { value, delta }.
   *
   * @param fields - Names of the fields to average.
   * @returns An aggregator that maps a list of records to one record holding
   *   the arithmetic mean of each requested field. A field that is missing
   *   (null/undefined) on a record counts as 0. An empty list yields 0 for
   *   every field.
   */
  averageFields(fields) {
    return (values) => {
      // Fall back to a divisor of 1 so an empty input produces 0, not NaN.
      const count = values.length || 1;
      const result = {};
      for (const field of fields) {
        result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
      }
      return result;
    };
  },
  /**
   * Average `value` with sample std dev. Use for percent-style scores.
   *
   * @param values - Records carrying a numeric `value`.
   * @returns The first record's fields, with `value` replaced by the mean,
   *   plus `count` (number of records) and `stdDev` (sample standard
   *   deviation, n - 1 denominator). `stdDev` is undefined when fewer than two
   *   records are given. An empty input yields `{ value: 0, count: 0 }` with
   *   an undefined `stdDev`.
   */
  averageWithVariance(values) {
    if (values.length === 0) {
      return { value: 0, stdDev: void 0, count: 0 };
    }
    const count = values.length;
    const mean = values.reduce((s, v) => s + v.value, 0) / count;
    let stdDev;
    if (count >= 2) {
      // Two-pass form: sum squared deviations from the mean. The one-pass
      // `sumSq - n * mean^2` form loses all significant digits when the values
      // are large compared to their spread (catastrophic cancellation).
      const squaredDeviations = values.reduce((s, v) => {
        const deviation = v.value - mean;
        return s + deviation * deviation;
      }, 0);
      const variance = squaredDeviations / (count - 1);
      // A NaN variance (non-finite inputs) is reported as 0, never as NaN.
      stdDev = variance > 0 ? Math.sqrt(variance) : 0;
    }
    return { ...values[0], value: mean, stdDev, count };
  },
  /**
   * All runs must pass. Use for binary scores.
   *
   * @param values - Records carrying a `passed` flag.
   * @returns The first record's fields, with `passed` true only when the list
   *   is non-empty and every record passed, plus `passedCount` and
   *   `totalCount`.
   */
  all(values) {
    const total = values.length;
    const passedCount = values.filter((v) => v.passed).length;
    return {
      ...values[0],
      // Every record passed exactly when the passing count equals the total.
      passed: total > 0 && passedCount === total,
      passedCount,
      totalCount: total
    };
  },
  /**
   * Take last value (no aggregation). Use when aggregation is not meaningful.
   *
   * @param values - Values in run order.
   * @returns The final element, or an empty object when the list is empty or
   *   its final element is null/undefined.
   */
  last(values) {
    return values[values.length - 1] ?? {};
  }
};
|
|
516
562
|
var Score = {
|
|
563
|
+
aggregate: ScoreAggregate,
|
|
517
564
|
of(config) {
|
|
518
565
|
const def = {
|
|
519
566
|
id: config.id,
|
|
520
567
|
name: config.name,
|
|
521
568
|
displayStrategy: config.displayStrategy,
|
|
522
|
-
|
|
523
|
-
|
|
569
|
+
formatValue: config.formatValue,
|
|
570
|
+
formatAggregate: config.formatAggregate,
|
|
571
|
+
aggregateValues: config.aggregateValues,
|
|
524
572
|
make: (data, options) => {
|
|
525
573
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
526
574
|
return {
|
|
527
575
|
id: config.id,
|
|
528
576
|
data,
|
|
529
|
-
...passed !== void 0 && { passed }
|
|
577
|
+
...passed !== void 0 && { passed },
|
|
578
|
+
def
|
|
579
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
530
580
|
};
|
|
531
581
|
}
|
|
532
582
|
};
|
|
@@ -539,29 +589,6 @@ function getScoreById(id) {
|
|
|
539
589
|
}
|
|
540
590
|
|
|
541
591
|
// src/evals/aggregators.ts
|
|
542
|
-
function aggregateAverageWithVariance(values) {
|
|
543
|
-
if (values.length === 0) {
|
|
544
|
-
return { value: 0, count: 0 };
|
|
545
|
-
}
|
|
546
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
547
|
-
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
548
|
-
const mean = sum / values.length;
|
|
549
|
-
let stdDev;
|
|
550
|
-
if (values.length >= 2) {
|
|
551
|
-
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
552
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
553
|
-
}
|
|
554
|
-
return { value: mean, stdDev, count: values.length };
|
|
555
|
-
}
|
|
556
|
-
function aggregateAll(values) {
|
|
557
|
-
const total = values.length;
|
|
558
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
559
|
-
return {
|
|
560
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
561
|
-
passedCount,
|
|
562
|
-
totalCount: total
|
|
563
|
-
};
|
|
564
|
-
}
|
|
565
592
|
function aggregateTokenCountSum(values) {
|
|
566
593
|
const initial = {
|
|
567
594
|
input: 0,
|
|
@@ -614,29 +641,31 @@ var percentScore = Score.of({
|
|
|
614
641
|
id: "percent",
|
|
615
642
|
name: "Score",
|
|
616
643
|
displayStrategy: "bar",
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
644
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
645
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
646
|
+
aggregateValues: Score.aggregate.averageWithVariance
|
|
647
|
+
});
|
|
648
|
+
/**
 * Score definition with id "delta" for a `{ value, delta }` payload.
 * Both numbers are rendered with two decimals, and a "+" is prefixed to
 * `delta` when it is greater than or equal to zero. Aggregation averages the
 * `value` and `delta` fields across runs.
 */
var deltaScore = Score.of({
  id: "delta",
  name: "Delta",
  displayStrategy: "number",
  formatValue: (data) => {
    const sign = data.delta >= 0 ? "+" : "";
    return `${data.value.toFixed(2)} (${sign}${data.delta.toFixed(2)} vs baseline)`;
  },
  formatAggregate: (data) => {
    const sign = data.delta >= 0 ? "+" : "";
    return `Avg: ${data.value.toFixed(2)} (Delta: ${sign}${data.delta.toFixed(2)})`;
  },
  aggregateValues: Score.aggregate.averageFields(["value", "delta"])
});
|
|
625
656
|
var binaryScore = Score.of({
|
|
626
657
|
id: "binary",
|
|
627
658
|
name: "Result",
|
|
628
659
|
displayStrategy: "passFail",
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
}
|
|
635
|
-
return base;
|
|
660
|
+
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
661
|
+
formatAggregate: (data) => {
|
|
662
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
663
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
664
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
636
665
|
}
|
|
637
|
-
return
|
|
666
|
+
return base;
|
|
638
667
|
},
|
|
639
|
-
|
|
668
|
+
aggregateValues: Score.aggregate.all
|
|
640
669
|
});
|
|
641
670
|
function createDiffString(expected, actual, diffOptions) {
|
|
642
671
|
const opts = { ...diffOptions, color: false };
|
|
@@ -952,9 +981,12 @@ async function collectTestCasesFromFiles(config) {
|
|
|
952
981
|
}
|
|
953
982
|
|
|
954
983
|
// src/runner/score-utils.ts
|
|
984
|
+
/**
 * Resolve the score definition for a score item.
 *
 * Uses the definition attached to the item when one is present (not null or
 * undefined); otherwise looks it up by `item.id` via `getScoreById`.
 *
 * @param item - Score item with an `id` and an optional attached `def`.
 * @returns The attached definition, or the result of the lookup by id.
 */
function getScoreDef(item) {
  const attached = item.def;
  if (attached !== null && attached !== void 0) {
    return attached;
  }
  return getScoreById(item.id);
}
|
|
955
987
|
function toNumericScoreFromScores(scores) {
|
|
956
988
|
for (const item of scores) {
|
|
957
|
-
const def =
|
|
989
|
+
const def = getScoreDef(item);
|
|
958
990
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
959
991
|
const value = item.data.value;
|
|
960
992
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -1342,7 +1374,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
|
|
|
1342
1374
|
() => appendJsonLine(message.artifactPath, {
|
|
1343
1375
|
runId: message.runId,
|
|
1344
1376
|
ts: Date.now(),
|
|
1345
|
-
...message.payload
|
|
1377
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
1346
1378
|
})
|
|
1347
1379
|
);
|
|
1348
1380
|
})
|
|
@@ -1642,6 +1674,6 @@ var EffectRunner = class {
|
|
|
1642
1674
|
}
|
|
1643
1675
|
};
|
|
1644
1676
|
|
|
1645
|
-
export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
1677
|
+
export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
1646
1678
|
//# sourceMappingURL=out.js.map
|
|
1647
1679
|
//# sourceMappingURL=index.js.map
|