@contractspec/lib.provider-ranking 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/README.md +44 -0
  2. package/dist/browser/eval/index.js +101 -0
  3. package/dist/browser/eval/runner.js +101 -0
  4. package/dist/browser/eval/types.js +0 -0
  5. package/dist/browser/in-memory-store.js +92 -0
  6. package/dist/browser/index.js +105 -0
  7. package/dist/browser/ingesters/artificial-analysis.js +149 -0
  8. package/dist/browser/ingesters/chatbot-arena.js +142 -0
  9. package/dist/browser/ingesters/fetch-utils.js +39 -0
  10. package/dist/browser/ingesters/index.js +418 -0
  11. package/dist/browser/ingesters/open-llm-leaderboard.js +108 -0
  12. package/dist/browser/ingesters/registry.js +412 -0
  13. package/dist/browser/ingesters/swe-bench.js +105 -0
  14. package/dist/browser/ingesters/types.js +0 -0
  15. package/dist/browser/scoring/composite-scorer.js +122 -0
  16. package/dist/browser/scoring/dimension-weights.js +39 -0
  17. package/dist/browser/scoring/index.js +161 -0
  18. package/dist/browser/scoring/normalizer.js +37 -0
  19. package/dist/browser/store.js +0 -0
  20. package/dist/browser/types.js +14 -0
  21. package/dist/eval/index.d.ts +2 -0
  22. package/dist/eval/index.js +102 -0
  23. package/dist/eval/runner.d.ts +18 -0
  24. package/dist/eval/runner.js +102 -0
  25. package/dist/eval/types.d.ts +51 -0
  26. package/dist/eval/types.js +1 -0
  27. package/dist/in-memory-store.d.ts +17 -0
  28. package/dist/in-memory-store.js +93 -0
  29. package/dist/index.d.ts +4 -0
  30. package/dist/index.js +106 -0
  31. package/dist/ingesters/artificial-analysis.d.ts +8 -0
  32. package/dist/ingesters/artificial-analysis.js +150 -0
  33. package/dist/ingesters/chatbot-arena.d.ts +8 -0
  34. package/dist/ingesters/chatbot-arena.js +143 -0
  35. package/dist/ingesters/fetch-utils.d.ts +11 -0
  36. package/dist/ingesters/fetch-utils.js +40 -0
  37. package/dist/ingesters/index.d.ts +7 -0
  38. package/dist/ingesters/index.js +419 -0
  39. package/dist/ingesters/open-llm-leaderboard.d.ts +8 -0
  40. package/dist/ingesters/open-llm-leaderboard.js +109 -0
  41. package/dist/ingesters/registry.d.ts +17 -0
  42. package/dist/ingesters/registry.js +413 -0
  43. package/dist/ingesters/swe-bench.d.ts +8 -0
  44. package/dist/ingesters/swe-bench.js +106 -0
  45. package/dist/ingesters/types.d.ts +31 -0
  46. package/dist/ingesters/types.js +1 -0
  47. package/dist/node/eval/index.js +101 -0
  48. package/dist/node/eval/runner.js +101 -0
  49. package/dist/node/eval/types.js +0 -0
  50. package/dist/node/in-memory-store.js +92 -0
  51. package/dist/node/index.js +105 -0
  52. package/dist/node/ingesters/artificial-analysis.js +149 -0
  53. package/dist/node/ingesters/chatbot-arena.js +142 -0
  54. package/dist/node/ingesters/fetch-utils.js +39 -0
  55. package/dist/node/ingesters/index.js +418 -0
  56. package/dist/node/ingesters/open-llm-leaderboard.js +108 -0
  57. package/dist/node/ingesters/registry.js +412 -0
  58. package/dist/node/ingesters/swe-bench.js +105 -0
  59. package/dist/node/ingesters/types.js +0 -0
  60. package/dist/node/scoring/composite-scorer.js +122 -0
  61. package/dist/node/scoring/dimension-weights.js +39 -0
  62. package/dist/node/scoring/index.js +161 -0
  63. package/dist/node/scoring/normalizer.js +37 -0
  64. package/dist/node/store.js +0 -0
  65. package/dist/node/types.js +14 -0
  66. package/dist/scoring/composite-scorer.d.ts +10 -0
  67. package/dist/scoring/composite-scorer.js +123 -0
  68. package/dist/scoring/dimension-weights.d.ts +8 -0
  69. package/dist/scoring/dimension-weights.js +40 -0
  70. package/dist/scoring/index.d.ts +3 -0
  71. package/dist/scoring/index.js +162 -0
  72. package/dist/scoring/normalizer.d.ts +20 -0
  73. package/dist/scoring/normalizer.js +38 -0
  74. package/dist/store.d.ts +19 -0
  75. package/dist/store.js +1 -0
  76. package/dist/types.d.ts +100 -0
  77. package/dist/types.js +15 -0
  78. package/package.json +362 -0
@@ -0,0 +1,161 @@
// src/scoring/normalizer.ts
// Known raw-score ranges per benchmark source, used to rescale onto 0-100.
var SOURCE_NORMALIZATION = {
  "chatbot-arena": { min: 800, max: 1400, invertScale: false },
  "swe-bench": { min: 0, max: 100, invertScale: false },
  "human-eval": { min: 0, max: 100, invertScale: false },
  mmlu: { min: 0, max: 100, invertScale: false },
  gpqa: { min: 0, max: 100, invertScale: false },
  arc: { min: 0, max: 100, invertScale: false },
  truthfulqa: { min: 0, max: 100, invertScale: false },
  "tau-bench": { min: 0, max: 100, invertScale: false },
  "artificial-analysis": { min: 0, max: 100, invertScale: false }
};
// Clamp a value into [0, 100].
function clampToScale(value) {
  return Math.max(0, Math.min(100, value));
}
/**
 * Map a raw benchmark score onto the 0-100 scale using the source's
 * known range. Unknown sources fall back to simple clamping.
 */
function normalizeScore(rawScore, source, configOverride) {
  const config = configOverride ?? SOURCE_NORMALIZATION[source];
  if (!config) {
    return clampToScale(rawScore);
  }
  const { min, max, invertScale } = config;
  const span = max - min;
  if (span === 0) {
    return 50; // degenerate range: every raw score maps to the midpoint
  }
  const scaled = ((rawScore - min) / span) * 100;
  return clampToScale(invertScale ? 100 - scaled : scaled);
}
/**
 * Return copies of the given results with `score` recomputed from
 * `rawScore` (falling back to the existing `score` when no numeric
 * `rawScore` is present). The input array is not mutated.
 */
function normalizeBenchmarkResults(results) {
  return results.map((result) => {
    const raw = typeof result.rawScore === "number" ? result.rawScore : result.score;
    return { ...result, score: normalizeScore(raw, result.source) };
  });
}

// src/scoring/dimension-weights.ts
// Default per-dimension weights (relative units; rescaled at scoring time).
var DEFAULT_DIMENSION_WEIGHTS = [
  { dimension: "coding", weight: 20 },
  { dimension: "reasoning", weight: 20 },
  { dimension: "agentic", weight: 15 },
  { dimension: "cost", weight: 10 },
  { dimension: "latency", weight: 10 },
  { dimension: "context", weight: 10 },
  { dimension: "safety", weight: 10 },
  { dimension: "custom", weight: 5 }
];
// Build a dimension -> weight lookup from the defaults, applying overrides last.
function getWeightMap(overrides) {
  const merged = [...DEFAULT_DIMENSION_WEIGHTS, ...(overrides ?? [])];
  return new Map(merged.map((entry) => [entry.dimension, entry.weight]));
}
/**
 * Rescale the weights of `activeDimensions` so they sum to 1.
 * Returns an empty map when the active dimensions carry no weight at all.
 */
function normalizeWeights(weights, activeDimensions) {
  let total = 0;
  for (const dim of activeDimensions) {
    total += weights.get(dim) ?? 0;
  }
  const normalized = new Map();
  if (total === 0) {
    return normalized;
  }
  for (const dim of activeDimensions) {
    normalized.set(dim, (weights.get(dim) ?? 0) / total);
  }
  return normalized;
}

// src/scoring/composite-scorer.ts
/**
 * Groups benchmark results by model, scores each dimension, and returns
 * rankings ordered by weighted composite score (rank starts at 1).
 */
function computeModelRankings(results, options, existingRankings) {
  const weights = getWeightMap(options?.weightOverrides);
  const rankings = [];
  for (const [modelId, modelResults] of groupByModel(results)) {
    const dimensionScores = computeDimensionScores(modelResults);
    const activeDimensions = Object.keys(dimensionScores);
    const normalizedWeights = normalizeWeights(weights, activeDimensions);
    let compositeScore = 0;
    for (const dim of activeDimensions) {
      const entry = dimensionScores[dim];
      if (entry) {
        compositeScore += entry.score * (normalizedWeights.get(dim) ?? 0);
      }
    }
    rankings.push({
      modelId,
      providerKey: modelResults[0]?.providerKey ?? "unknown",
      compositeScore: Math.round(compositeScore * 100) / 100,
      dimensionScores,
      rank: 0, // assigned after sorting
      previousRank: existingRankings?.get(modelId)?.rank ?? null,
      updatedAt: new Date()
    });
  }
  rankings.sort((a, b) => b.compositeScore - a.compositeScore);
  return rankings.map((ranking, index) => ({ ...ranking, rank: index + 1 }));
}
// Bucket results by modelId.
function groupByModel(results) {
  const byModel = new Map();
  for (const result of results) {
    const bucket = byModel.get(result.modelId);
    if (bucket) {
      bucket.push(result);
    } else {
      byModel.set(result.modelId, [result]);
    }
  }
  return byModel;
}
// Average scores per dimension and attach a confidence estimate
// (half recency, half source diversity capped at 3 sources).
function computeDimensionScores(results) {
  const byDimension = new Map();
  for (const result of results) {
    const bucket = byDimension.get(result.dimension);
    if (bucket) {
      bucket.push(result);
    } else {
      byDimension.set(result.dimension, [result]);
    }
  }
  const scores = {};
  for (const [dimension, dimResults] of byDimension) {
    let total = 0;
    for (const r of dimResults) {
      total += r.score;
    }
    const avgScore = total / dimResults.length;
    const sources = [...new Set(dimResults.map((r) => r.source))];
    const recencyFactor = computeRecencyFactor(dimResults);
    const sourceDiversity = Math.min(sources.length / 3, 1);
    scores[dimension] = {
      score: Math.round(avgScore * 100) / 100,
      confidence: Math.round((recencyFactor * 0.5 + sourceDiversity * 0.5) * 100) / 100,
      sources
    };
  }
  return scores;
}
// 1.0 for data at most 30 days old, 0.3 beyond 180 days, linear in between.
function computeRecencyFactor(results) {
  if (results.length === 0) {
    return 0;
  }
  const mostRecent = Math.max(...results.map((r) => r.measuredAt.getTime()));
  const ageDays = (Date.now() - mostRecent) / (1000 * 60 * 60 * 24);
  if (ageDays <= 30) {
    return 1;
  }
  if (ageDays >= 180) {
    return 0.3;
  }
  return 1 - ((ageDays - 30) / (180 - 30)) * 0.7;
}
export {
  normalizeWeights,
  normalizeScore,
  normalizeBenchmarkResults,
  getWeightMap,
  computeModelRankings,
  DEFAULT_DIMENSION_WEIGHTS
};
@@ -0,0 +1,37 @@
// src/scoring/normalizer.ts
// Known raw-score ranges per benchmark source, used to rescale onto 0-100.
var SOURCE_NORMALIZATION = {
  "chatbot-arena": { min: 800, max: 1400, invertScale: false },
  "swe-bench": { min: 0, max: 100, invertScale: false },
  "human-eval": { min: 0, max: 100, invertScale: false },
  mmlu: { min: 0, max: 100, invertScale: false },
  gpqa: { min: 0, max: 100, invertScale: false },
  arc: { min: 0, max: 100, invertScale: false },
  truthfulqa: { min: 0, max: 100, invertScale: false },
  "tau-bench": { min: 0, max: 100, invertScale: false },
  "artificial-analysis": { min: 0, max: 100, invertScale: false }
};
// Clamp a value into [0, 100].
function clampToScale(value) {
  return Math.max(0, Math.min(100, value));
}
/**
 * Map a raw benchmark score onto the 0-100 scale using the source's
 * known range. Unknown sources fall back to simple clamping.
 */
function normalizeScore(rawScore, source, configOverride) {
  const config = configOverride ?? SOURCE_NORMALIZATION[source];
  if (!config) {
    return clampToScale(rawScore);
  }
  const { min, max, invertScale } = config;
  const span = max - min;
  if (span === 0) {
    return 50; // degenerate range: every raw score maps to the midpoint
  }
  const scaled = ((rawScore - min) / span) * 100;
  return clampToScale(invertScale ? 100 - scaled : scaled);
}
/**
 * Return copies of the given results with `score` recomputed from
 * `rawScore` (falling back to the existing `score` when no numeric
 * `rawScore` is present). The input array is not mutated.
 */
function normalizeBenchmarkResults(results) {
  return results.map((result) => {
    const raw = typeof result.rawScore === "number" ? result.rawScore : result.score;
    return { ...result, score: normalizeScore(raw, result.source) };
  });
}
export {
  normalizeScore,
  normalizeBenchmarkResults
};
File without changes
@@ -0,0 +1,14 @@
// src/types.ts
// Canonical list of benchmark dimensions used by the scoring layer.
var BENCHMARK_DIMENSIONS = [
  "coding", "reasoning", "agentic", "cost",
  "latency", "context", "safety", "custom"
];
export {
  BENCHMARK_DIMENSIONS
};
@@ -0,0 +1,10 @@
1
+ import type { BenchmarkResult, DimensionWeightConfig, ModelRanking } from '../types';
2
+ interface ScorerOptions {
3
+ weightOverrides?: DimensionWeightConfig[];
4
+ }
5
+ /**
6
+ * Groups benchmark results by model, computes per-dimension scores,
7
+ * and produces a weighted composite ranking.
8
+ */
9
+ export declare function computeModelRankings(results: BenchmarkResult[], options?: ScorerOptions, existingRankings?: Map<string, ModelRanking>): ModelRanking[];
10
+ export {};
@@ -0,0 +1,123 @@
// @bun
// src/scoring/dimension-weights.ts
// Default per-dimension weights (relative units; rescaled at scoring time).
var DEFAULT_DIMENSION_WEIGHTS = [
  { dimension: "coding", weight: 20 },
  { dimension: "reasoning", weight: 20 },
  { dimension: "agentic", weight: 15 },
  { dimension: "cost", weight: 10 },
  { dimension: "latency", weight: 10 },
  { dimension: "context", weight: 10 },
  { dimension: "safety", weight: 10 },
  { dimension: "custom", weight: 5 }
];
// Build a dimension -> weight lookup from the defaults, applying overrides last.
function getWeightMap(overrides) {
  const merged = [...DEFAULT_DIMENSION_WEIGHTS, ...(overrides ?? [])];
  return new Map(merged.map((entry) => [entry.dimension, entry.weight]));
}
/**
 * Rescale the weights of `activeDimensions` so they sum to 1.
 * Returns an empty map when the active dimensions carry no weight at all.
 */
function normalizeWeights(weights, activeDimensions) {
  let total = 0;
  for (const dim of activeDimensions) {
    total += weights.get(dim) ?? 0;
  }
  const normalized = new Map();
  if (total === 0) {
    return normalized;
  }
  for (const dim of activeDimensions) {
    normalized.set(dim, (weights.get(dim) ?? 0) / total);
  }
  return normalized;
}

// src/scoring/composite-scorer.ts
/**
 * Groups benchmark results by model, scores each dimension, and returns
 * rankings ordered by weighted composite score (rank starts at 1).
 */
function computeModelRankings(results, options, existingRankings) {
  const weights = getWeightMap(options?.weightOverrides);
  const rankings = [];
  for (const [modelId, modelResults] of groupByModel(results)) {
    const dimensionScores = computeDimensionScores(modelResults);
    const activeDimensions = Object.keys(dimensionScores);
    const normalizedWeights = normalizeWeights(weights, activeDimensions);
    let compositeScore = 0;
    for (const dim of activeDimensions) {
      const entry = dimensionScores[dim];
      if (entry) {
        compositeScore += entry.score * (normalizedWeights.get(dim) ?? 0);
      }
    }
    rankings.push({
      modelId,
      providerKey: modelResults[0]?.providerKey ?? "unknown",
      compositeScore: Math.round(compositeScore * 100) / 100,
      dimensionScores,
      rank: 0, // assigned after sorting
      previousRank: existingRankings?.get(modelId)?.rank ?? null,
      updatedAt: new Date()
    });
  }
  rankings.sort((a, b) => b.compositeScore - a.compositeScore);
  return rankings.map((ranking, index) => ({ ...ranking, rank: index + 1 }));
}
// Bucket results by modelId.
function groupByModel(results) {
  const byModel = new Map();
  for (const result of results) {
    const bucket = byModel.get(result.modelId);
    if (bucket) {
      bucket.push(result);
    } else {
      byModel.set(result.modelId, [result]);
    }
  }
  return byModel;
}
// Average scores per dimension and attach a confidence estimate
// (half recency, half source diversity capped at 3 sources).
function computeDimensionScores(results) {
  const byDimension = new Map();
  for (const result of results) {
    const bucket = byDimension.get(result.dimension);
    if (bucket) {
      bucket.push(result);
    } else {
      byDimension.set(result.dimension, [result]);
    }
  }
  const scores = {};
  for (const [dimension, dimResults] of byDimension) {
    let total = 0;
    for (const r of dimResults) {
      total += r.score;
    }
    const avgScore = total / dimResults.length;
    const sources = [...new Set(dimResults.map((r) => r.source))];
    const recencyFactor = computeRecencyFactor(dimResults);
    const sourceDiversity = Math.min(sources.length / 3, 1);
    scores[dimension] = {
      score: Math.round(avgScore * 100) / 100,
      confidence: Math.round((recencyFactor * 0.5 + sourceDiversity * 0.5) * 100) / 100,
      sources
    };
  }
  return scores;
}
// 1.0 for data at most 30 days old, 0.3 beyond 180 days, linear in between.
function computeRecencyFactor(results) {
  if (results.length === 0) {
    return 0;
  }
  const mostRecent = Math.max(...results.map((r) => r.measuredAt.getTime()));
  const ageDays = (Date.now() - mostRecent) / (1000 * 60 * 60 * 24);
  if (ageDays <= 30) {
    return 1;
  }
  if (ageDays >= 180) {
    return 0.3;
  }
  return 1 - ((ageDays - 30) / (180 - 30)) * 0.7;
}
export {
  computeModelRankings
};
@@ -0,0 +1,8 @@
1
+ import type { BenchmarkDimension, DimensionWeightConfig } from '../types';
2
+ /**
3
+ * Default weights for composite score calculation.
4
+ * Weights are normalized to sum to 1.0 at scoring time.
5
+ */
6
+ export declare const DEFAULT_DIMENSION_WEIGHTS: DimensionWeightConfig[];
7
+ export declare function getWeightMap(overrides?: DimensionWeightConfig[]): Map<BenchmarkDimension, number>;
8
+ export declare function normalizeWeights(weights: Map<BenchmarkDimension, number>, activeDimensions: BenchmarkDimension[]): Map<BenchmarkDimension, number>;
@@ -0,0 +1,40 @@
// @bun
// src/scoring/dimension-weights.ts
// Default per-dimension weights (relative units; rescaled at scoring time).
var DEFAULT_DIMENSION_WEIGHTS = [
  { dimension: "coding", weight: 20 },
  { dimension: "reasoning", weight: 20 },
  { dimension: "agentic", weight: 15 },
  { dimension: "cost", weight: 10 },
  { dimension: "latency", weight: 10 },
  { dimension: "context", weight: 10 },
  { dimension: "safety", weight: 10 },
  { dimension: "custom", weight: 5 }
];
// Build a dimension -> weight lookup from the defaults, applying overrides last.
function getWeightMap(overrides) {
  const merged = [...DEFAULT_DIMENSION_WEIGHTS, ...(overrides ?? [])];
  return new Map(merged.map((entry) => [entry.dimension, entry.weight]));
}
/**
 * Rescale the weights of `activeDimensions` so they sum to 1.
 * Returns an empty map when the active dimensions carry no weight at all.
 */
function normalizeWeights(weights, activeDimensions) {
  let total = 0;
  for (const dim of activeDimensions) {
    total += weights.get(dim) ?? 0;
  }
  const normalized = new Map();
  if (total === 0) {
    return normalized;
  }
  for (const dim of activeDimensions) {
    normalized.set(dim, (weights.get(dim) ?? 0) / total);
  }
  return normalized;
}
export {
  normalizeWeights,
  getWeightMap,
  DEFAULT_DIMENSION_WEIGHTS
};
@@ -0,0 +1,3 @@
1
+ export { computeModelRankings } from './composite-scorer';
2
+ export { DEFAULT_DIMENSION_WEIGHTS, getWeightMap, normalizeWeights, } from './dimension-weights';
3
+ export { normalizeScore, normalizeBenchmarkResults } from './normalizer';
@@ -0,0 +1,162 @@
// @bun
// src/scoring/normalizer.ts
// Known raw-score ranges per benchmark source, used to rescale onto 0-100.
var SOURCE_NORMALIZATION = {
  "chatbot-arena": { min: 800, max: 1400, invertScale: false },
  "swe-bench": { min: 0, max: 100, invertScale: false },
  "human-eval": { min: 0, max: 100, invertScale: false },
  mmlu: { min: 0, max: 100, invertScale: false },
  gpqa: { min: 0, max: 100, invertScale: false },
  arc: { min: 0, max: 100, invertScale: false },
  truthfulqa: { min: 0, max: 100, invertScale: false },
  "tau-bench": { min: 0, max: 100, invertScale: false },
  "artificial-analysis": { min: 0, max: 100, invertScale: false }
};
// Clamp a value into [0, 100].
function clampToScale(value) {
  return Math.max(0, Math.min(100, value));
}
/**
 * Map a raw benchmark score onto the 0-100 scale using the source's
 * known range. Unknown sources fall back to simple clamping.
 */
function normalizeScore(rawScore, source, configOverride) {
  const config = configOverride ?? SOURCE_NORMALIZATION[source];
  if (!config) {
    return clampToScale(rawScore);
  }
  const { min, max, invertScale } = config;
  const span = max - min;
  if (span === 0) {
    return 50; // degenerate range: every raw score maps to the midpoint
  }
  const scaled = ((rawScore - min) / span) * 100;
  return clampToScale(invertScale ? 100 - scaled : scaled);
}
/**
 * Return copies of the given results with `score` recomputed from
 * `rawScore` (falling back to the existing `score` when no numeric
 * `rawScore` is present). The input array is not mutated.
 */
function normalizeBenchmarkResults(results) {
  return results.map((result) => {
    const raw = typeof result.rawScore === "number" ? result.rawScore : result.score;
    return { ...result, score: normalizeScore(raw, result.source) };
  });
}

// src/scoring/dimension-weights.ts
// Default per-dimension weights (relative units; rescaled at scoring time).
var DEFAULT_DIMENSION_WEIGHTS = [
  { dimension: "coding", weight: 20 },
  { dimension: "reasoning", weight: 20 },
  { dimension: "agentic", weight: 15 },
  { dimension: "cost", weight: 10 },
  { dimension: "latency", weight: 10 },
  { dimension: "context", weight: 10 },
  { dimension: "safety", weight: 10 },
  { dimension: "custom", weight: 5 }
];
// Build a dimension -> weight lookup from the defaults, applying overrides last.
function getWeightMap(overrides) {
  const merged = [...DEFAULT_DIMENSION_WEIGHTS, ...(overrides ?? [])];
  return new Map(merged.map((entry) => [entry.dimension, entry.weight]));
}
/**
 * Rescale the weights of `activeDimensions` so they sum to 1.
 * Returns an empty map when the active dimensions carry no weight at all.
 */
function normalizeWeights(weights, activeDimensions) {
  let total = 0;
  for (const dim of activeDimensions) {
    total += weights.get(dim) ?? 0;
  }
  const normalized = new Map();
  if (total === 0) {
    return normalized;
  }
  for (const dim of activeDimensions) {
    normalized.set(dim, (weights.get(dim) ?? 0) / total);
  }
  return normalized;
}

// src/scoring/composite-scorer.ts
/**
 * Groups benchmark results by model, scores each dimension, and returns
 * rankings ordered by weighted composite score (rank starts at 1).
 */
function computeModelRankings(results, options, existingRankings) {
  const weights = getWeightMap(options?.weightOverrides);
  const rankings = [];
  for (const [modelId, modelResults] of groupByModel(results)) {
    const dimensionScores = computeDimensionScores(modelResults);
    const activeDimensions = Object.keys(dimensionScores);
    const normalizedWeights = normalizeWeights(weights, activeDimensions);
    let compositeScore = 0;
    for (const dim of activeDimensions) {
      const entry = dimensionScores[dim];
      if (entry) {
        compositeScore += entry.score * (normalizedWeights.get(dim) ?? 0);
      }
    }
    rankings.push({
      modelId,
      providerKey: modelResults[0]?.providerKey ?? "unknown",
      compositeScore: Math.round(compositeScore * 100) / 100,
      dimensionScores,
      rank: 0, // assigned after sorting
      previousRank: existingRankings?.get(modelId)?.rank ?? null,
      updatedAt: new Date()
    });
  }
  rankings.sort((a, b) => b.compositeScore - a.compositeScore);
  return rankings.map((ranking, index) => ({ ...ranking, rank: index + 1 }));
}
// Bucket results by modelId.
function groupByModel(results) {
  const byModel = new Map();
  for (const result of results) {
    const bucket = byModel.get(result.modelId);
    if (bucket) {
      bucket.push(result);
    } else {
      byModel.set(result.modelId, [result]);
    }
  }
  return byModel;
}
// Average scores per dimension and attach a confidence estimate
// (half recency, half source diversity capped at 3 sources).
function computeDimensionScores(results) {
  const byDimension = new Map();
  for (const result of results) {
    const bucket = byDimension.get(result.dimension);
    if (bucket) {
      bucket.push(result);
    } else {
      byDimension.set(result.dimension, [result]);
    }
  }
  const scores = {};
  for (const [dimension, dimResults] of byDimension) {
    let total = 0;
    for (const r of dimResults) {
      total += r.score;
    }
    const avgScore = total / dimResults.length;
    const sources = [...new Set(dimResults.map((r) => r.source))];
    const recencyFactor = computeRecencyFactor(dimResults);
    const sourceDiversity = Math.min(sources.length / 3, 1);
    scores[dimension] = {
      score: Math.round(avgScore * 100) / 100,
      confidence: Math.round((recencyFactor * 0.5 + sourceDiversity * 0.5) * 100) / 100,
      sources
    };
  }
  return scores;
}
// 1.0 for data at most 30 days old, 0.3 beyond 180 days, linear in between.
function computeRecencyFactor(results) {
  if (results.length === 0) {
    return 0;
  }
  const mostRecent = Math.max(...results.map((r) => r.measuredAt.getTime()));
  const ageDays = (Date.now() - mostRecent) / (1000 * 60 * 60 * 24);
  if (ageDays <= 30) {
    return 1;
  }
  if (ageDays >= 180) {
    return 0.3;
  }
  return 1 - ((ageDays - 30) / (180 - 30)) * 0.7;
}
export {
  normalizeWeights,
  normalizeScore,
  normalizeBenchmarkResults,
  getWeightMap,
  computeModelRankings,
  DEFAULT_DIMENSION_WEIGHTS
};
@@ -0,0 +1,20 @@
1
+ import type { BenchmarkResult, BenchmarkSource } from '../types';
2
+ interface NormalizationConfig {
3
+ /** Minimum possible raw score for the source. */
4
+ min: number;
5
+ /** Maximum possible raw score for the source. */
6
+ max: number;
7
+ /** If true, lower raw scores are better (e.g. latency, cost). */
8
+ invertScale: boolean;
9
+ }
10
+ /**
11
+ * Normalize a raw score to the 0-100 scale.
12
+ * Falls back to clamping if no source config is known.
13
+ */
14
+ export declare function normalizeScore(rawScore: number, source: BenchmarkSource, configOverride?: NormalizationConfig): number;
15
+ /**
16
+ * Normalize an array of benchmark results in place,
17
+ * setting the `score` field from `rawScore`.
18
+ */
19
+ export declare function normalizeBenchmarkResults(results: BenchmarkResult[]): BenchmarkResult[];
20
+ export {};
@@ -0,0 +1,38 @@
// @bun
// src/scoring/normalizer.ts
// Known raw-score ranges per benchmark source, used to rescale onto 0-100.
var SOURCE_NORMALIZATION = {
  "chatbot-arena": { min: 800, max: 1400, invertScale: false },
  "swe-bench": { min: 0, max: 100, invertScale: false },
  "human-eval": { min: 0, max: 100, invertScale: false },
  mmlu: { min: 0, max: 100, invertScale: false },
  gpqa: { min: 0, max: 100, invertScale: false },
  arc: { min: 0, max: 100, invertScale: false },
  truthfulqa: { min: 0, max: 100, invertScale: false },
  "tau-bench": { min: 0, max: 100, invertScale: false },
  "artificial-analysis": { min: 0, max: 100, invertScale: false }
};
// Clamp a value into [0, 100].
function clampToScale(value) {
  return Math.max(0, Math.min(100, value));
}
/**
 * Map a raw benchmark score onto the 0-100 scale using the source's
 * known range. Unknown sources fall back to simple clamping.
 */
function normalizeScore(rawScore, source, configOverride) {
  const config = configOverride ?? SOURCE_NORMALIZATION[source];
  if (!config) {
    return clampToScale(rawScore);
  }
  const { min, max, invertScale } = config;
  const span = max - min;
  if (span === 0) {
    return 50; // degenerate range: every raw score maps to the midpoint
  }
  const scaled = ((rawScore - min) / span) * 100;
  return clampToScale(invertScale ? 100 - scaled : scaled);
}
/**
 * Return copies of the given results with `score` recomputed from
 * `rawScore` (falling back to the existing `score` when no numeric
 * `rawScore` is present). The input array is not mutated.
 */
function normalizeBenchmarkResults(results) {
  return results.map((result) => {
    const raw = typeof result.rawScore === "number" ? result.rawScore : result.score;
    return { ...result, score: normalizeScore(raw, result.source) };
  });
}
export {
  normalizeScore,
  normalizeBenchmarkResults
};
@@ -0,0 +1,19 @@
1
+ import type { BenchmarkResult, BenchmarkResultListResult, BenchmarkResultQuery, IngestionRun, ModelProfile, ModelRanking, RankingListResult, RankingQuery } from './types';
2
+ /**
3
+ * Storage interface for the provider ranking system.
4
+ *
5
+ * Lib provides an in-memory implementation; the module layer
6
+ * adds a Postgres-backed implementation.
7
+ */
8
+ export interface ProviderRankingStore {
9
+ upsertBenchmarkResult(result: BenchmarkResult): Promise<void>;
10
+ getBenchmarkResult(id: string): Promise<BenchmarkResult | null>;
11
+ listBenchmarkResults(query: BenchmarkResultQuery): Promise<BenchmarkResultListResult>;
12
+ upsertModelRanking(ranking: ModelRanking): Promise<void>;
13
+ getModelRanking(modelId: string): Promise<ModelRanking | null>;
14
+ listModelRankings(query: RankingQuery): Promise<RankingListResult>;
15
+ getModelProfile(modelId: string): Promise<ModelProfile | null>;
16
+ createIngestionRun(run: IngestionRun): Promise<void>;
17
+ updateIngestionRun(id: string, update: Partial<IngestionRun>): Promise<void>;
18
+ getIngestionRun(id: string): Promise<IngestionRun | null>;
19
+ }
package/dist/store.js ADDED
@@ -0,0 +1 @@
1
+ // @bun