@m4trix/evals 0.18.0 → 0.19.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -289,6 +289,8 @@ declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, T
289
289
  interface MetricItem<TData = unknown> {
290
290
  readonly id: string;
291
291
  readonly data: TData;
292
+ /** Per-item display name override (wins over def.name in rendering) */
293
+ readonly name?: string;
292
294
  }
293
295
  interface FormatMetricOptions {
294
296
  isAggregated?: boolean;
@@ -298,7 +300,9 @@ interface MetricDef<TData = unknown> {
298
300
  readonly name?: string;
299
301
  readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
300
302
  format(data: TData, options?: FormatMetricOptions): string;
301
- make(data: TData): MetricItem<TData>;
303
+ make(data: TData, options?: {
304
+ name?: string;
305
+ }): MetricItem<TData>;
302
306
  }
303
307
  declare const Metric: {
304
308
  of<TData>(config: {
@@ -315,6 +319,8 @@ interface ScoreItem<TData = unknown> {
315
319
  readonly id: string;
316
320
  readonly data: TData;
317
321
  readonly passed?: boolean;
322
+ /** Per-item display name override (wins over def.name in rendering) */
323
+ readonly name?: string;
318
324
  /** Attached def for formatting/aggregation without registry lookup (avoids n/a across module boundaries) */
319
325
  readonly def?: ScoreDef<TData>;
320
326
  }
@@ -330,6 +336,7 @@ interface ScoreDef<TData = unknown> {
330
336
  readonly aggregateValues: (values: ReadonlyArray<TData>) => TData;
331
337
  make(data: TData, options?: {
332
338
  definePassed?: (data: TData) => boolean;
339
+ name?: string;
333
340
  }): ScoreItem<TData>;
334
341
  }
335
342
  /** Helper to format using the right method based on isAggregated (for consumers that need a single entry point) */
@@ -338,22 +345,20 @@ declare const Score: {
338
345
  aggregate: {
339
346
  /** Average numeric fields. Use for scores like { value, delta }. */
340
347
  averageFields<K extends string>(fields: readonly K[]): (values: readonly Record<K, number>[]) => Record<K, number>;
341
- /** Average `value` with sample std dev. Use for percent-style scores. */
342
- averageWithVariance<T extends {
343
- value: number;
344
- }>(values: readonly T[]): T & {
348
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
349
+ averageWithVariance<K_1 extends string>(fields: readonly K_1[]): (values: readonly Record<K_1, number>[]) => Record<K_1, number> & {
345
350
  stdDev?: number | undefined;
346
351
  count: number;
347
352
  };
348
353
  /** All runs must pass. Use for binary scores. */
349
- all<T_1 extends {
354
+ all<T extends {
350
355
  passed: boolean;
351
- }>(values: readonly T_1[]): T_1 & {
356
+ }>(values: readonly T[]): T & {
352
357
  passedCount?: number | undefined;
353
358
  totalCount?: number | undefined;
354
359
  };
355
360
  /** Take last value (no aggregation). Use when aggregation is not meaningful. */
356
- last<T_2>(values: readonly T_2[]): T_2;
361
+ last<T_1>(values: readonly T_1[]): T_1;
357
362
  };
358
363
  of<TData>(config: {
359
364
  id: string;
package/dist/index.js CHANGED
@@ -501,7 +501,11 @@ var Metric = {
501
501
  name: config.name,
502
502
  aggregate: config.aggregate,
503
503
  format: config.format,
504
- make: (data) => ({ id: config.id, data })
504
+ make: (data, options) => ({
505
+ id: config.id,
506
+ data,
507
+ ...options?.name !== void 0 && { name: options.name }
508
+ })
505
509
  };
506
510
  registry.set(config.id, def);
507
511
  return def;
@@ -523,25 +527,61 @@ var ScoreAggregate = {
523
527
  const count = values.length || 1;
524
528
  const result = {};
525
529
  for (const field of fields) {
526
- result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
530
+ result[field] = values.reduce(
531
+ (s, v) => s + (v[field] ?? 0),
532
+ 0
533
+ ) / count;
527
534
  }
528
535
  return result;
529
536
  };
530
537
  },
531
- /** Average `value` with sample std dev. Use for percent-style scores. */
532
- averageWithVariance(values) {
533
- if (values.length === 0) {
534
- return { value: 0, stdDev: void 0, count: 0 };
535
- }
536
- const sum = values.reduce((s, v) => s + v.value, 0);
537
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
538
- const mean = sum / values.length;
539
- let stdDev;
540
- if (values.length >= 2) {
541
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
542
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
543
- }
544
- return { ...values[0], value: mean, stdDev, count: values.length };
538
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
539
+ averageWithVariance(fields) {
540
+ return (values) => {
541
+ const count = values.length;
542
+ const result = {};
543
+ for (const field of fields) {
544
+ result[field] = count === 0 ? 0 : values.reduce(
545
+ (sum, item) => sum + (item[field] ?? 0),
546
+ 0
547
+ ) / count;
548
+ }
549
+ const valueField = "value";
550
+ const hasValueField = fields.includes(valueField);
551
+ if (count === 0) {
552
+ if (hasValueField) {
553
+ result[valueField] = 0;
554
+ }
555
+ return {
556
+ ...result,
557
+ stdDev: void 0,
558
+ count: 0
559
+ };
560
+ }
561
+ let stdDev;
562
+ if (hasValueField && count >= 2) {
563
+ const sum = values.reduce(
564
+ (s, v) => s + (v[valueField] ?? 0),
565
+ 0
566
+ );
567
+ const sumSq = values.reduce(
568
+ (s, v) => {
569
+ const value = v[valueField] ?? 0;
570
+ return s + value * value;
571
+ },
572
+ 0
573
+ );
574
+ const mean = sum / count;
575
+ const variance = (sumSq - count * mean * mean) / (count - 1);
576
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
577
+ }
578
+ return {
579
+ ...values[0],
580
+ ...result,
581
+ stdDev,
582
+ count
583
+ };
584
+ };
545
585
  },
546
586
  /** All runs must pass. Use for binary scores. */
547
587
  all(values) {
@@ -575,6 +615,7 @@ var Score = {
575
615
  id: config.id,
576
616
  data,
577
617
  ...passed !== void 0 && { passed },
618
+ ...options?.name !== void 0 && { name: options.name },
578
619
  def
579
620
  // Attach def so rendering/aggregation works without registry lookup
580
621
  };
@@ -643,7 +684,7 @@ var percentScore = Score.of({
643
684
  displayStrategy: "bar",
644
685
  formatValue: (data) => data.value.toFixed(2),
645
686
  formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
646
- aggregateValues: Score.aggregate.averageWithVariance
687
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
647
688
  });
648
689
  var deltaScore = Score.of({
649
690
  id: "delta",