@contractspec/lib.provider-ranking 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +44 -0
- package/dist/browser/eval/index.js +101 -0
- package/dist/browser/eval/runner.js +101 -0
- package/dist/browser/eval/types.js +0 -0
- package/dist/browser/in-memory-store.js +92 -0
- package/dist/browser/index.js +105 -0
- package/dist/browser/ingesters/artificial-analysis.js +149 -0
- package/dist/browser/ingesters/chatbot-arena.js +142 -0
- package/dist/browser/ingesters/fetch-utils.js +39 -0
- package/dist/browser/ingesters/index.js +418 -0
- package/dist/browser/ingesters/open-llm-leaderboard.js +108 -0
- package/dist/browser/ingesters/registry.js +412 -0
- package/dist/browser/ingesters/swe-bench.js +105 -0
- package/dist/browser/ingesters/types.js +0 -0
- package/dist/browser/scoring/composite-scorer.js +122 -0
- package/dist/browser/scoring/dimension-weights.js +39 -0
- package/dist/browser/scoring/index.js +161 -0
- package/dist/browser/scoring/normalizer.js +37 -0
- package/dist/browser/store.js +0 -0
- package/dist/browser/types.js +14 -0
- package/dist/eval/index.d.ts +2 -0
- package/dist/eval/index.js +102 -0
- package/dist/eval/runner.d.ts +18 -0
- package/dist/eval/runner.js +102 -0
- package/dist/eval/types.d.ts +51 -0
- package/dist/eval/types.js +1 -0
- package/dist/in-memory-store.d.ts +17 -0
- package/dist/in-memory-store.js +93 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +106 -0
- package/dist/ingesters/artificial-analysis.d.ts +8 -0
- package/dist/ingesters/artificial-analysis.js +150 -0
- package/dist/ingesters/chatbot-arena.d.ts +8 -0
- package/dist/ingesters/chatbot-arena.js +143 -0
- package/dist/ingesters/fetch-utils.d.ts +11 -0
- package/dist/ingesters/fetch-utils.js +40 -0
- package/dist/ingesters/index.d.ts +7 -0
- package/dist/ingesters/index.js +419 -0
- package/dist/ingesters/open-llm-leaderboard.d.ts +8 -0
- package/dist/ingesters/open-llm-leaderboard.js +109 -0
- package/dist/ingesters/registry.d.ts +17 -0
- package/dist/ingesters/registry.js +413 -0
- package/dist/ingesters/swe-bench.d.ts +8 -0
- package/dist/ingesters/swe-bench.js +106 -0
- package/dist/ingesters/types.d.ts +31 -0
- package/dist/ingesters/types.js +1 -0
- package/dist/node/eval/index.js +101 -0
- package/dist/node/eval/runner.js +101 -0
- package/dist/node/eval/types.js +0 -0
- package/dist/node/in-memory-store.js +92 -0
- package/dist/node/index.js +105 -0
- package/dist/node/ingesters/artificial-analysis.js +149 -0
- package/dist/node/ingesters/chatbot-arena.js +142 -0
- package/dist/node/ingesters/fetch-utils.js +39 -0
- package/dist/node/ingesters/index.js +418 -0
- package/dist/node/ingesters/open-llm-leaderboard.js +108 -0
- package/dist/node/ingesters/registry.js +412 -0
- package/dist/node/ingesters/swe-bench.js +105 -0
- package/dist/node/ingesters/types.js +0 -0
- package/dist/node/scoring/composite-scorer.js +122 -0
- package/dist/node/scoring/dimension-weights.js +39 -0
- package/dist/node/scoring/index.js +161 -0
- package/dist/node/scoring/normalizer.js +37 -0
- package/dist/node/store.js +0 -0
- package/dist/node/types.js +14 -0
- package/dist/scoring/composite-scorer.d.ts +10 -0
- package/dist/scoring/composite-scorer.js +123 -0
- package/dist/scoring/dimension-weights.d.ts +8 -0
- package/dist/scoring/dimension-weights.js +40 -0
- package/dist/scoring/index.d.ts +3 -0
- package/dist/scoring/index.js +162 -0
- package/dist/scoring/normalizer.d.ts +20 -0
- package/dist/scoring/normalizer.js +38 -0
- package/dist/store.d.ts +19 -0
- package/dist/store.js +1 -0
- package/dist/types.d.ts +100 -0
- package/dist/types.js +15 -0
- package/package.json +362 -0
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
// src/scoring/dimension-weights.ts
// Default relative weights per benchmark dimension (they sum to 100).
var DEFAULT_DIMENSION_WEIGHTS = [
  { dimension: "coding", weight: 20 },
  { dimension: "reasoning", weight: 20 },
  { dimension: "agentic", weight: 15 },
  { dimension: "cost", weight: 10 },
  { dimension: "latency", weight: 10 },
  { dimension: "context", weight: 10 },
  { dimension: "safety", weight: 10 },
  { dimension: "custom", weight: 5 }
];
// Build a dimension -> weight map from the defaults, then layer overrides on top.
function getWeightMap(overrides) {
  const weightByDimension = new Map();
  const applyAll = (entries) => {
    for (const entry of entries) {
      weightByDimension.set(entry.dimension, entry.weight);
    }
  };
  applyAll(DEFAULT_DIMENSION_WEIGHTS);
  if (overrides) {
    applyAll(overrides);
  }
  return weightByDimension;
}
// Rescale the weights of the active dimensions so they sum to 1.
// Returns an empty map when the active dimensions carry no weight at all.
function normalizeWeights(weights, activeDimensions) {
  let total = 0;
  for (const dim of activeDimensions) {
    total += weights.get(dim) ?? 0;
  }
  if (total === 0) {
    return new Map();
  }
  return new Map(activeDimensions.map((dim) => [dim, (weights.get(dim) ?? 0) / total]));
}

// src/scoring/composite-scorer.ts
// Rank models by a weighted average of their per-dimension scores.
// Weights are normalized per model over the dimensions it actually has data for.
function computeModelRankings(results, options, existingRankings) {
  const resultsByModel = groupByModel(results);
  const weightMap = getWeightMap(options?.weightOverrides);
  const rankings = [];
  for (const [modelId, modelResults] of resultsByModel) {
    const providerKey = modelResults[0]?.providerKey ?? "unknown";
    const dimensionScores = computeDimensionScores(modelResults);
    const activeDimensions = Object.keys(dimensionScores);
    const normalizedWeights = normalizeWeights(weightMap, activeDimensions);
    // Weighted sum over the dimensions this model has scores for.
    const compositeScore = activeDimensions.reduce((acc, dim) => {
      const dimScore = dimensionScores[dim];
      return dimScore ? acc + dimScore.score * (normalizedWeights.get(dim) ?? 0) : acc;
    }, 0);
    rankings.push({
      modelId,
      providerKey,
      compositeScore: Math.round(compositeScore * 100) / 100,
      dimensionScores,
      rank: 0, // assigned after sorting below
      previousRank: existingRankings?.get(modelId)?.rank ?? null,
      updatedAt: new Date()
    });
  }
  rankings.sort((a, b) => b.compositeScore - a.compositeScore);
  return rankings.map((ranking, index) => ({
    ...ranking,
    rank: index + 1
  }));
}
// Bucket benchmark results by model id, preserving input order within each bucket.
function groupByModel(results) {
  const grouped = new Map();
  for (const result of results) {
    const bucket = grouped.get(result.modelId);
    if (bucket === undefined) {
      grouped.set(result.modelId, [result]);
    } else {
      bucket.push(result);
    }
  }
  return grouped;
}
// Average scores per dimension and attach a confidence estimate that blends
// measurement recency with source diversity (half weight each).
function computeDimensionScores(results) {
  const resultsByDimension = new Map();
  for (const result of results) {
    const bucket = resultsByDimension.get(result.dimension);
    if (bucket === undefined) {
      resultsByDimension.set(result.dimension, [result]);
    } else {
      bucket.push(result);
    }
  }
  const scores = {};
  for (const [dimension, dimResults] of resultsByDimension) {
    let total = 0;
    for (const r of dimResults) {
      total += r.score;
    }
    const avgScore = total / dimResults.length;
    const sources = [...new Set(dimResults.map((r) => r.source))];
    const recencyFactor = computeRecencyFactor(dimResults);
    // Diversity saturates at 3 distinct sources.
    const sourceDiversity = Math.min(sources.length / 3, 1);
    const confidence = Math.round((recencyFactor * 0.5 + sourceDiversity * 0.5) * 100) / 100;
    scores[dimension] = {
      score: Math.round(avgScore * 100) / 100,
      confidence,
      sources
    };
  }
  return scores;
}
// Freshness factor: 1 within 30 days of the newest measurement, decaying
// linearly to 0.3 at 180 days; 0 only when there are no results at all.
function computeRecencyFactor(results) {
  if (results.length === 0) {
    return 0;
  }
  const mostRecent = Math.max(...results.map((r) => r.measuredAt.getTime()));
  const ageDays = (Date.now() - mostRecent) / 86400000;
  if (ageDays <= 30) {
    return 1;
  }
  if (ageDays >= 180) {
    return 0.3;
  }
  return 1 - ((ageDays - 30) / 150) * 0.7;
}
export {
  computeModelRankings
};
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
// src/scoring/dimension-weights.ts
// Default relative weights per benchmark dimension (they sum to 100).
var DEFAULT_DIMENSION_WEIGHTS = [
  { dimension: "coding", weight: 20 },
  { dimension: "reasoning", weight: 20 },
  { dimension: "agentic", weight: 15 },
  { dimension: "cost", weight: 10 },
  { dimension: "latency", weight: 10 },
  { dimension: "context", weight: 10 },
  { dimension: "safety", weight: 10 },
  { dimension: "custom", weight: 5 }
];
// Build a dimension -> weight map from the defaults, then layer overrides on top.
function getWeightMap(overrides) {
  const weightByDimension = new Map();
  const applyAll = (entries) => {
    for (const entry of entries) {
      weightByDimension.set(entry.dimension, entry.weight);
    }
  };
  applyAll(DEFAULT_DIMENSION_WEIGHTS);
  if (overrides) {
    applyAll(overrides);
  }
  return weightByDimension;
}
// Rescale the weights of the active dimensions so they sum to 1.
// Returns an empty map when the active dimensions carry no weight at all.
function normalizeWeights(weights, activeDimensions) {
  let total = 0;
  for (const dim of activeDimensions) {
    total += weights.get(dim) ?? 0;
  }
  if (total === 0) {
    return new Map();
  }
  return new Map(activeDimensions.map((dim) => [dim, (weights.get(dim) ?? 0) / total]));
}
export {
  normalizeWeights,
  getWeightMap,
  DEFAULT_DIMENSION_WEIGHTS
};
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
// src/scoring/normalizer.ts
// Per-source raw score ranges used to map results onto a common 0-100 scale.
var SOURCE_NORMALIZATION = {
  "chatbot-arena": { min: 800, max: 1400, invertScale: false },
  "swe-bench": { min: 0, max: 100, invertScale: false },
  "human-eval": { min: 0, max: 100, invertScale: false },
  mmlu: { min: 0, max: 100, invertScale: false },
  gpqa: { min: 0, max: 100, invertScale: false },
  arc: { min: 0, max: 100, invertScale: false },
  truthfulqa: { min: 0, max: 100, invertScale: false },
  "tau-bench": { min: 0, max: 100, invertScale: false },
  "artificial-analysis": { min: 0, max: 100, invertScale: false }
};
// Map a raw benchmark score onto 0-100 using the source's known range.
// Unknown sources are assumed to already be 0-100 and are simply clamped.
function normalizeScore(rawScore, source, configOverride) {
  const clamp = (value) => Math.max(0, Math.min(100, value));
  const config = configOverride ?? SOURCE_NORMALIZATION[source];
  if (!config) {
    return clamp(rawScore);
  }
  const { min, max, invertScale } = config;
  const range = max - min;
  if (range === 0) {
    return 50; // degenerate range carries no information; use the midpoint
  }
  const scaled = ((rawScore - min) / range) * 100;
  return clamp(invertScale ? 100 - scaled : scaled);
}
// Re-score each result from its raw value, falling back to the existing score.
function normalizeBenchmarkResults(results) {
  return results.map((result) => {
    const raw = typeof result.rawScore === "number" ? result.rawScore : result.score;
    return { ...result, score: normalizeScore(raw, result.source, undefined) };
  });
}

// src/scoring/dimension-weights.ts
// Default relative weights per benchmark dimension (they sum to 100).
var DEFAULT_DIMENSION_WEIGHTS = [
  { dimension: "coding", weight: 20 },
  { dimension: "reasoning", weight: 20 },
  { dimension: "agentic", weight: 15 },
  { dimension: "cost", weight: 10 },
  { dimension: "latency", weight: 10 },
  { dimension: "context", weight: 10 },
  { dimension: "safety", weight: 10 },
  { dimension: "custom", weight: 5 }
];
// Build a dimension -> weight map from the defaults, then layer overrides on top.
function getWeightMap(overrides) {
  const weightByDimension = new Map();
  const applyAll = (entries) => {
    for (const entry of entries) {
      weightByDimension.set(entry.dimension, entry.weight);
    }
  };
  applyAll(DEFAULT_DIMENSION_WEIGHTS);
  if (overrides) {
    applyAll(overrides);
  }
  return weightByDimension;
}
// Rescale the weights of the active dimensions so they sum to 1.
// Returns an empty map when the active dimensions carry no weight at all.
function normalizeWeights(weights, activeDimensions) {
  let total = 0;
  for (const dim of activeDimensions) {
    total += weights.get(dim) ?? 0;
  }
  if (total === 0) {
    return new Map();
  }
  return new Map(activeDimensions.map((dim) => [dim, (weights.get(dim) ?? 0) / total]));
}

// src/scoring/composite-scorer.ts
// Rank models by a weighted average of their per-dimension scores.
// Weights are normalized per model over the dimensions it actually has data for.
function computeModelRankings(results, options, existingRankings) {
  const resultsByModel = groupByModel(results);
  const weightMap = getWeightMap(options?.weightOverrides);
  const rankings = [];
  for (const [modelId, modelResults] of resultsByModel) {
    const providerKey = modelResults[0]?.providerKey ?? "unknown";
    const dimensionScores = computeDimensionScores(modelResults);
    const activeDimensions = Object.keys(dimensionScores);
    const normalizedWeights = normalizeWeights(weightMap, activeDimensions);
    // Weighted sum over the dimensions this model has scores for.
    const compositeScore = activeDimensions.reduce((acc, dim) => {
      const dimScore = dimensionScores[dim];
      return dimScore ? acc + dimScore.score * (normalizedWeights.get(dim) ?? 0) : acc;
    }, 0);
    rankings.push({
      modelId,
      providerKey,
      compositeScore: Math.round(compositeScore * 100) / 100,
      dimensionScores,
      rank: 0, // assigned after sorting below
      previousRank: existingRankings?.get(modelId)?.rank ?? null,
      updatedAt: new Date()
    });
  }
  rankings.sort((a, b) => b.compositeScore - a.compositeScore);
  return rankings.map((ranking, index) => ({
    ...ranking,
    rank: index + 1
  }));
}
// Bucket benchmark results by model id, preserving input order within each bucket.
function groupByModel(results) {
  const grouped = new Map();
  for (const result of results) {
    const bucket = grouped.get(result.modelId);
    if (bucket === undefined) {
      grouped.set(result.modelId, [result]);
    } else {
      bucket.push(result);
    }
  }
  return grouped;
}
// Average scores per dimension and attach a confidence estimate that blends
// measurement recency with source diversity (half weight each).
function computeDimensionScores(results) {
  const resultsByDimension = new Map();
  for (const result of results) {
    const bucket = resultsByDimension.get(result.dimension);
    if (bucket === undefined) {
      resultsByDimension.set(result.dimension, [result]);
    } else {
      bucket.push(result);
    }
  }
  const scores = {};
  for (const [dimension, dimResults] of resultsByDimension) {
    let total = 0;
    for (const r of dimResults) {
      total += r.score;
    }
    const avgScore = total / dimResults.length;
    const sources = [...new Set(dimResults.map((r) => r.source))];
    const recencyFactor = computeRecencyFactor(dimResults);
    // Diversity saturates at 3 distinct sources.
    const sourceDiversity = Math.min(sources.length / 3, 1);
    const confidence = Math.round((recencyFactor * 0.5 + sourceDiversity * 0.5) * 100) / 100;
    scores[dimension] = {
      score: Math.round(avgScore * 100) / 100,
      confidence,
      sources
    };
  }
  return scores;
}
// Freshness factor: 1 within 30 days of the newest measurement, decaying
// linearly to 0.3 at 180 days; 0 only when there are no results at all.
function computeRecencyFactor(results) {
  if (results.length === 0) {
    return 0;
  }
  const mostRecent = Math.max(...results.map((r) => r.measuredAt.getTime()));
  const ageDays = (Date.now() - mostRecent) / 86400000;
  if (ageDays <= 30) {
    return 1;
  }
  if (ageDays >= 180) {
    return 0.3;
  }
  return 1 - ((ageDays - 30) / 150) * 0.7;
}
export {
  normalizeWeights,
  normalizeScore,
  normalizeBenchmarkResults,
  getWeightMap,
  computeModelRankings,
  DEFAULT_DIMENSION_WEIGHTS
};
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
// src/scoring/normalizer.ts
// Per-source raw score ranges used to map results onto a common 0-100 scale.
var SOURCE_NORMALIZATION = {
  "chatbot-arena": { min: 800, max: 1400, invertScale: false },
  "swe-bench": { min: 0, max: 100, invertScale: false },
  "human-eval": { min: 0, max: 100, invertScale: false },
  mmlu: { min: 0, max: 100, invertScale: false },
  gpqa: { min: 0, max: 100, invertScale: false },
  arc: { min: 0, max: 100, invertScale: false },
  truthfulqa: { min: 0, max: 100, invertScale: false },
  "tau-bench": { min: 0, max: 100, invertScale: false },
  "artificial-analysis": { min: 0, max: 100, invertScale: false }
};
// Map a raw benchmark score onto 0-100 using the source's known range.
// Unknown sources are assumed to already be 0-100 and are simply clamped.
function normalizeScore(rawScore, source, configOverride) {
  const clamp = (value) => Math.max(0, Math.min(100, value));
  const config = configOverride ?? SOURCE_NORMALIZATION[source];
  if (!config) {
    return clamp(rawScore);
  }
  const { min, max, invertScale } = config;
  const range = max - min;
  if (range === 0) {
    return 50; // degenerate range carries no information; use the midpoint
  }
  const scaled = ((rawScore - min) / range) * 100;
  return clamp(invertScale ? 100 - scaled : scaled);
}
// Re-score each result from its raw value, falling back to the existing score.
function normalizeBenchmarkResults(results) {
  return results.map((result) => {
    const raw = typeof result.rawScore === "number" ? result.rawScore : result.score;
    return { ...result, score: normalizeScore(raw, result.source, undefined) };
  });
}
export {
  normalizeScore,
  normalizeBenchmarkResults
};
|
|
File without changes
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
// @bun
// src/eval/runner.ts

/**
 * Executes an eval suite against an LLM adapter and produces scored results
 * (per-case pass/fail plus aggregated pass counts, average score, average latency).
 */
class EvalRunner {
  adapter;
  options;
  constructor(adapter, options = {}) {
    this.adapter = adapter;
    this.options = options;
  }
  /**
   * Run every case in the suite against the model and aggregate the results.
   * Adapter failures are recorded as failed cases rather than aborting the run.
   */
  async run(suite, modelId, providerKey) {
    const runId = `eval-${suite.key}-${modelId}-${Date.now()}`;
    const startedAt = new Date();
    const concurrency = this.options.maxConcurrency ?? 5;
    const caseResults = await this.runCasesWithConcurrency(suite.cases, suite.defaultGrader, concurrency);
    const passedCases = caseResults.filter((r) => r.passed).length;
    const averageScore = caseResults.length > 0 ? caseResults.reduce((sum, r) => sum + r.score, 0) / caseResults.length : 0;
    const averageLatencyMs = caseResults.length > 0 ? caseResults.reduce((sum, r) => sum + r.latencyMs, 0) / caseResults.length : 0;
    return {
      runId,
      evalSuiteKey: suite.key,
      modelId,
      providerKey,
      totalCases: suite.cases.length,
      passedCases,
      averageScore: Math.round(averageScore * 100) / 100,
      averageLatencyMs: Math.round(averageLatencyMs),
      caseResults,
      startedAt,
      completedAt: new Date()
    };
  }
  /**
   * Run cases through a bounded pool of workers.
   * Fix over the previous version: each result is written back at its case's
   * index, so caseResults always lines up with the input order instead of
   * whatever order the cases happened to complete in.
   */
  async runCasesWithConcurrency(cases, defaultGrader, concurrency) {
    const workerCount = Math.min(concurrency, cases.length);
    if (workerCount <= 0) {
      return [];
    }
    const results = new Array(cases.length);
    let nextIndex = 0;
    // Single-threaded JS: claiming an index before the first await is race-free.
    const workers = Array.from({ length: workerCount }, async () => {
      while (nextIndex < cases.length) {
        const index = nextIndex++;
        results[index] = await this.runSingleCase(cases[index], defaultGrader);
      }
    });
    await Promise.all(workers);
    return results;
  }
  /** Execute one case; adapter errors become a failed result instead of throwing. */
  async runSingleCase(evalCase, defaultGrader) {
    try {
      const { text, latencyMs } = await this.adapter.chat(evalCase.prompt);
      const { passed, score } = this.grade(evalCase, text, defaultGrader);
      return {
        caseId: evalCase.id,
        passed,
        score,
        response: text,
        latencyMs
      };
    } catch (error) {
      return {
        caseId: evalCase.id,
        passed: false,
        score: 0,
        response: "",
        latencyMs: 0,
        error: error instanceof Error ? error.message : String(error)
      };
    }
  }
  /**
   * Grade a response with the case's grader, falling back to the suite default.
   * Cases whose grader has no expectation to check pass vacuously; unknown
   * graders and 'llm-judge' (not implemented here) yield a neutral 0.5.
   */
  grade(evalCase, response, defaultGrader) {
    const grader = evalCase.graderKey ?? defaultGrader;
    switch (grader) {
      case "exact": {
        if (!evalCase.expectedOutput)
          return { passed: true, score: 1 };
        const matches = response.trim() === evalCase.expectedOutput.trim();
        return { passed: matches, score: matches ? 1 : 0 };
      }
      case "contains": {
        if (!evalCase.expectedOutput)
          return { passed: true, score: 1 };
        const matches = response.includes(evalCase.expectedOutput);
        return { passed: matches, score: matches ? 1 : 0 };
      }
      case "regex": {
        if (!evalCase.expectedPattern)
          return { passed: true, score: 1 };
        const matches = new RegExp(evalCase.expectedPattern).test(response);
        return { passed: matches, score: matches ? 1 : 0 };
      }
      case "llm-judge":
        return { passed: true, score: 0.5 };
      default:
        return { passed: true, score: 0.5 };
    }
  }
}
export {
  EvalRunner
};
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { EvalLLMAdapter, EvalRunResult, EvalSuite } from './types';
interface EvalRunnerOptions {
    /** Maximum number of eval cases executed in parallel (the shipped runner defaults to 5). */
    maxConcurrency?: number;
}
/**
 * Executes an eval suite against an LLM adapter and produces
 * scored results that can be stored as BenchmarkResults.
 */
export declare class EvalRunner {
    /** Adapter used to send each case's prompt to the model under test. */
    private readonly adapter;
    /** Runner tuning options supplied at construction time. */
    private readonly options;
    constructor(adapter: EvalLLMAdapter, options?: EvalRunnerOptions);
    /**
     * Run every case in `suite` against the model identified by `modelId`/`providerKey`
     * and aggregate pass counts, average score, and average latency into an EvalRunResult.
     */
    run(suite: EvalSuite, modelId: string, providerKey: string): Promise<EvalRunResult>;
    private runCasesWithConcurrency;
    private runSingleCase;
    private grade;
}
export {};
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
// @bun
// src/eval/runner.ts

/**
 * Executes an eval suite against an LLM adapter and produces scored results
 * (per-case pass/fail plus aggregated pass counts, average score, average latency).
 */
class EvalRunner {
  adapter;
  options;
  constructor(adapter, options = {}) {
    this.adapter = adapter;
    this.options = options;
  }
  /**
   * Run every case in the suite against the model and aggregate the results.
   * Adapter failures are recorded as failed cases rather than aborting the run.
   */
  async run(suite, modelId, providerKey) {
    const runId = `eval-${suite.key}-${modelId}-${Date.now()}`;
    const startedAt = new Date();
    const concurrency = this.options.maxConcurrency ?? 5;
    const caseResults = await this.runCasesWithConcurrency(suite.cases, suite.defaultGrader, concurrency);
    const passedCases = caseResults.filter((r) => r.passed).length;
    const averageScore = caseResults.length > 0 ? caseResults.reduce((sum, r) => sum + r.score, 0) / caseResults.length : 0;
    const averageLatencyMs = caseResults.length > 0 ? caseResults.reduce((sum, r) => sum + r.latencyMs, 0) / caseResults.length : 0;
    return {
      runId,
      evalSuiteKey: suite.key,
      modelId,
      providerKey,
      totalCases: suite.cases.length,
      passedCases,
      averageScore: Math.round(averageScore * 100) / 100,
      averageLatencyMs: Math.round(averageLatencyMs),
      caseResults,
      startedAt,
      completedAt: new Date()
    };
  }
  /**
   * Run cases through a bounded pool of workers.
   * Fix over the previous version: each result is written back at its case's
   * index, so caseResults always lines up with the input order instead of
   * whatever order the cases happened to complete in.
   */
  async runCasesWithConcurrency(cases, defaultGrader, concurrency) {
    const workerCount = Math.min(concurrency, cases.length);
    if (workerCount <= 0) {
      return [];
    }
    const results = new Array(cases.length);
    let nextIndex = 0;
    // Single-threaded JS: claiming an index before the first await is race-free.
    const workers = Array.from({ length: workerCount }, async () => {
      while (nextIndex < cases.length) {
        const index = nextIndex++;
        results[index] = await this.runSingleCase(cases[index], defaultGrader);
      }
    });
    await Promise.all(workers);
    return results;
  }
  /** Execute one case; adapter errors become a failed result instead of throwing. */
  async runSingleCase(evalCase, defaultGrader) {
    try {
      const { text, latencyMs } = await this.adapter.chat(evalCase.prompt);
      const { passed, score } = this.grade(evalCase, text, defaultGrader);
      return {
        caseId: evalCase.id,
        passed,
        score,
        response: text,
        latencyMs
      };
    } catch (error) {
      return {
        caseId: evalCase.id,
        passed: false,
        score: 0,
        response: "",
        latencyMs: 0,
        error: error instanceof Error ? error.message : String(error)
      };
    }
  }
  /**
   * Grade a response with the case's grader, falling back to the suite default.
   * Cases whose grader has no expectation to check pass vacuously; unknown
   * graders and 'llm-judge' (not implemented here) yield a neutral 0.5.
   */
  grade(evalCase, response, defaultGrader) {
    const grader = evalCase.graderKey ?? defaultGrader;
    switch (grader) {
      case "exact": {
        if (!evalCase.expectedOutput)
          return { passed: true, score: 1 };
        const matches = response.trim() === evalCase.expectedOutput.trim();
        return { passed: matches, score: matches ? 1 : 0 };
      }
      case "contains": {
        if (!evalCase.expectedOutput)
          return { passed: true, score: 1 };
        const matches = response.includes(evalCase.expectedOutput);
        return { passed: matches, score: matches ? 1 : 0 };
      }
      case "regex": {
        if (!evalCase.expectedPattern)
          return { passed: true, score: 1 };
        const matches = new RegExp(evalCase.expectedPattern).test(response);
        return { passed: matches, score: matches ? 1 : 0 };
      }
      case "llm-judge":
        return { passed: true, score: 0.5 };
      default:
        return { passed: true, score: 0.5 };
    }
  }
}
export {
  EvalRunner
};
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import type { BenchmarkDimension } from '../types';
/** A single prompt/expectation pair within an eval suite. */
export interface EvalCase {
    id: string;
    prompt: string;
    /** Expected response text, used by the 'exact' and 'contains' graders. */
    expectedOutput?: string;
    /** Regex or substring that must appear in the response. */
    expectedPattern?: string;
    /** Custom grading function key (resolved at runtime). */
    graderKey?: string;
    metadata?: Record<string, unknown>;
}
/** A named collection of eval cases targeting one benchmark dimension. */
export interface EvalSuite {
    key: string;
    displayName: string;
    description: string;
    dimension: BenchmarkDimension;
    cases: EvalCase[];
    /** Default grading strategy when individual cases don't specify one. */
    defaultGrader: 'exact' | 'contains' | 'regex' | 'llm-judge';
}
/** Graded outcome of a single eval case. */
export interface EvalCaseResult {
    caseId: string;
    passed: boolean;
    /** Grader score in the 0..1 range (the shipped graders emit 0, 0.5, or 1). */
    score: number;
    response: string;
    latencyMs: number;
    /** Present when the adapter call threw; such cases fail with score 0. */
    error?: string;
}
/** Aggregated result of running a full suite against one model. */
export interface EvalRunResult {
    runId: string;
    evalSuiteKey: string;
    modelId: string;
    providerKey: string;
    totalCases: number;
    passedCases: number;
    averageScore: number;
    averageLatencyMs: number;
    caseResults: EvalCaseResult[];
    startedAt: Date;
    completedAt: Date;
}
/**
 * Abstraction over the LLM provider for eval execution.
 * Kept minimal to avoid coupling to a specific SDK.
 */
export interface EvalLLMAdapter {
    chat(prompt: string): Promise<{
        text: string;
        latencyMs: number;
    }>;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// @bun
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { ProviderRankingStore } from './store';
import type { BenchmarkResult, BenchmarkResultListResult, BenchmarkResultQuery, IngestionRun, ModelProfile, ModelRanking, RankingListResult, RankingQuery } from './types';
/**
 * ProviderRankingStore implementation backed by in-process collections
 * (per the class name; nothing here suggests persistence — confirm against
 * the implementation if durability matters).
 */
export declare class InMemoryProviderRankingStore implements ProviderRankingStore {
    /** Stored benchmark results, keyed/queried by id. */
    private benchmarkResults;
    /** Stored model rankings, keyed/queried by modelId. */
    private modelRankings;
    /** Stored ingestion runs, keyed/queried by id. */
    private ingestionRuns;
    upsertBenchmarkResult(result: BenchmarkResult): Promise<void>;
    /** Returns null when no result with the given id exists. */
    getBenchmarkResult(id: string): Promise<BenchmarkResult | null>;
    listBenchmarkResults(query: BenchmarkResultQuery): Promise<BenchmarkResultListResult>;
    upsertModelRanking(ranking: ModelRanking): Promise<void>;
    /** Returns null when the model has no ranking. */
    getModelRanking(modelId: string): Promise<ModelRanking | null>;
    listModelRankings(query: RankingQuery): Promise<RankingListResult>;
    getModelProfile(modelId: string): Promise<ModelProfile | null>;
    createIngestionRun(run: IngestionRun): Promise<void>;
    /** Applies a partial update to an existing run identified by id. */
    updateIngestionRun(id: string, update: Partial<IngestionRun>): Promise<void>;
    getIngestionRun(id: string): Promise<IngestionRun | null>;
}
|