@m4trix/evals 0.18.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +91 -29
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +91 -29
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +61 -19
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +61 -19
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +58 -17
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +13 -8
- package/dist/index.js +58 -17
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -289,6 +289,8 @@ declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, T
|
|
|
289
289
|
interface MetricItem<TData = unknown> {
|
|
290
290
|
readonly id: string;
|
|
291
291
|
readonly data: TData;
|
|
292
|
+
/** Per-item display name override (wins over def.name in rendering) */
|
|
293
|
+
readonly name?: string;
|
|
292
294
|
}
|
|
293
295
|
interface FormatMetricOptions {
|
|
294
296
|
isAggregated?: boolean;
|
|
@@ -298,7 +300,9 @@ interface MetricDef<TData = unknown> {
|
|
|
298
300
|
readonly name?: string;
|
|
299
301
|
readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
|
|
300
302
|
format(data: TData, options?: FormatMetricOptions): string;
|
|
301
|
-
make(data: TData
|
|
303
|
+
make(data: TData, options?: {
|
|
304
|
+
name?: string;
|
|
305
|
+
}): MetricItem<TData>;
|
|
302
306
|
}
|
|
303
307
|
declare const Metric: {
|
|
304
308
|
of<TData>(config: {
|
|
@@ -315,6 +319,8 @@ interface ScoreItem<TData = unknown> {
|
|
|
315
319
|
readonly id: string;
|
|
316
320
|
readonly data: TData;
|
|
317
321
|
readonly passed?: boolean;
|
|
322
|
+
/** Per-item display name override (wins over def.name in rendering) */
|
|
323
|
+
readonly name?: string;
|
|
318
324
|
/** Attached def for formatting/aggregation without registry lookup (avoids n/a across module boundaries) */
|
|
319
325
|
readonly def?: ScoreDef<TData>;
|
|
320
326
|
}
|
|
@@ -330,6 +336,7 @@ interface ScoreDef<TData = unknown> {
|
|
|
330
336
|
readonly aggregateValues: (values: ReadonlyArray<TData>) => TData;
|
|
331
337
|
make(data: TData, options?: {
|
|
332
338
|
definePassed?: (data: TData) => boolean;
|
|
339
|
+
name?: string;
|
|
333
340
|
}): ScoreItem<TData>;
|
|
334
341
|
}
|
|
335
342
|
/** Helper to format using the right method based on isAggregated (for consumers that need a single entry point) */
|
|
@@ -338,22 +345,20 @@ declare const Score: {
|
|
|
338
345
|
aggregate: {
|
|
339
346
|
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
340
347
|
averageFields<K extends string>(fields: readonly K[]): (values: readonly Record<K, number>[]) => Record<K, number>;
|
|
341
|
-
/** Average
|
|
342
|
-
averageWithVariance<
|
|
343
|
-
value: number;
|
|
344
|
-
}>(values: readonly T[]): T & {
|
|
348
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
349
|
+
averageWithVariance<K_1 extends string>(fields: readonly K_1[]): (values: readonly Record<K_1, number>[]) => Record<K_1, number> & {
|
|
345
350
|
stdDev?: number | undefined;
|
|
346
351
|
count: number;
|
|
347
352
|
};
|
|
348
353
|
/** All runs must pass. Use for binary scores. */
|
|
349
|
-
all<
|
|
354
|
+
all<T extends {
|
|
350
355
|
passed: boolean;
|
|
351
|
-
}>(values: readonly
|
|
356
|
+
}>(values: readonly T[]): T & {
|
|
352
357
|
passedCount?: number | undefined;
|
|
353
358
|
totalCount?: number | undefined;
|
|
354
359
|
};
|
|
355
360
|
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
356
|
-
last<
|
|
361
|
+
last<T_1>(values: readonly T_1[]): T_1;
|
|
357
362
|
};
|
|
358
363
|
of<TData>(config: {
|
|
359
364
|
id: string;
|
package/dist/index.js
CHANGED
|
@@ -501,7 +501,11 @@ var Metric = {
|
|
|
501
501
|
name: config.name,
|
|
502
502
|
aggregate: config.aggregate,
|
|
503
503
|
format: config.format,
|
|
504
|
-
make: (data) => ({
|
|
504
|
+
make: (data, options) => ({
|
|
505
|
+
id: config.id,
|
|
506
|
+
data,
|
|
507
|
+
...options?.name !== void 0 && { name: options.name }
|
|
508
|
+
})
|
|
505
509
|
};
|
|
506
510
|
registry.set(config.id, def);
|
|
507
511
|
return def;
|
|
@@ -523,25 +527,61 @@ var ScoreAggregate = {
|
|
|
523
527
|
const count = values.length || 1;
|
|
524
528
|
const result = {};
|
|
525
529
|
for (const field of fields) {
|
|
526
|
-
result[field] = values.reduce(
|
|
530
|
+
result[field] = values.reduce(
|
|
531
|
+
(s, v) => s + (v[field] ?? 0),
|
|
532
|
+
0
|
|
533
|
+
) / count;
|
|
527
534
|
}
|
|
528
535
|
return result;
|
|
529
536
|
};
|
|
530
537
|
},
|
|
531
|
-
/** Average
|
|
532
|
-
averageWithVariance(
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
538
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
539
|
+
averageWithVariance(fields) {
|
|
540
|
+
return (values) => {
|
|
541
|
+
const count = values.length;
|
|
542
|
+
const result = {};
|
|
543
|
+
for (const field of fields) {
|
|
544
|
+
result[field] = count === 0 ? 0 : values.reduce(
|
|
545
|
+
(sum, item) => sum + (item[field] ?? 0),
|
|
546
|
+
0
|
|
547
|
+
) / count;
|
|
548
|
+
}
|
|
549
|
+
const valueField = "value";
|
|
550
|
+
const hasValueField = fields.includes(valueField);
|
|
551
|
+
if (count === 0) {
|
|
552
|
+
if (hasValueField) {
|
|
553
|
+
result[valueField] = 0;
|
|
554
|
+
}
|
|
555
|
+
return {
|
|
556
|
+
...result,
|
|
557
|
+
stdDev: void 0,
|
|
558
|
+
count: 0
|
|
559
|
+
};
|
|
560
|
+
}
|
|
561
|
+
let stdDev;
|
|
562
|
+
if (hasValueField && count >= 2) {
|
|
563
|
+
const sum = values.reduce(
|
|
564
|
+
(s, v) => s + (v[valueField] ?? 0),
|
|
565
|
+
0
|
|
566
|
+
);
|
|
567
|
+
const sumSq = values.reduce(
|
|
568
|
+
(s, v) => {
|
|
569
|
+
const value = v[valueField] ?? 0;
|
|
570
|
+
return s + value * value;
|
|
571
|
+
},
|
|
572
|
+
0
|
|
573
|
+
);
|
|
574
|
+
const mean = sum / count;
|
|
575
|
+
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
576
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
577
|
+
}
|
|
578
|
+
return {
|
|
579
|
+
...values[0],
|
|
580
|
+
...result,
|
|
581
|
+
stdDev,
|
|
582
|
+
count
|
|
583
|
+
};
|
|
584
|
+
};
|
|
545
585
|
},
|
|
546
586
|
/** All runs must pass. Use for binary scores. */
|
|
547
587
|
all(values) {
|
|
@@ -575,6 +615,7 @@ var Score = {
|
|
|
575
615
|
id: config.id,
|
|
576
616
|
data,
|
|
577
617
|
...passed !== void 0 && { passed },
|
|
618
|
+
...options?.name !== void 0 && { name: options.name },
|
|
578
619
|
def
|
|
579
620
|
// Attach def so rendering/aggregation works without registry lookup
|
|
580
621
|
};
|
|
@@ -643,7 +684,7 @@ var percentScore = Score.of({
|
|
|
643
684
|
displayStrategy: "bar",
|
|
644
685
|
formatValue: (data) => data.value.toFixed(2),
|
|
645
686
|
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
646
|
-
aggregateValues: Score.aggregate.averageWithVariance
|
|
687
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
647
688
|
});
|
|
648
689
|
var deltaScore = Score.of({
|
|
649
690
|
id: "delta",
|