@contractspec/lib.provider-ranking 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +44 -0
- package/dist/browser/eval/index.js +101 -0
- package/dist/browser/eval/runner.js +101 -0
- package/dist/browser/eval/types.js +0 -0
- package/dist/browser/in-memory-store.js +92 -0
- package/dist/browser/index.js +105 -0
- package/dist/browser/ingesters/artificial-analysis.js +149 -0
- package/dist/browser/ingesters/chatbot-arena.js +142 -0
- package/dist/browser/ingesters/fetch-utils.js +39 -0
- package/dist/browser/ingesters/index.js +418 -0
- package/dist/browser/ingesters/open-llm-leaderboard.js +108 -0
- package/dist/browser/ingesters/registry.js +412 -0
- package/dist/browser/ingesters/swe-bench.js +105 -0
- package/dist/browser/ingesters/types.js +0 -0
- package/dist/browser/scoring/composite-scorer.js +122 -0
- package/dist/browser/scoring/dimension-weights.js +39 -0
- package/dist/browser/scoring/index.js +161 -0
- package/dist/browser/scoring/normalizer.js +37 -0
- package/dist/browser/store.js +0 -0
- package/dist/browser/types.js +14 -0
- package/dist/eval/index.d.ts +2 -0
- package/dist/eval/index.js +102 -0
- package/dist/eval/runner.d.ts +18 -0
- package/dist/eval/runner.js +102 -0
- package/dist/eval/types.d.ts +51 -0
- package/dist/eval/types.js +1 -0
- package/dist/in-memory-store.d.ts +17 -0
- package/dist/in-memory-store.js +93 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +106 -0
- package/dist/ingesters/artificial-analysis.d.ts +8 -0
- package/dist/ingesters/artificial-analysis.js +150 -0
- package/dist/ingesters/chatbot-arena.d.ts +8 -0
- package/dist/ingesters/chatbot-arena.js +143 -0
- package/dist/ingesters/fetch-utils.d.ts +11 -0
- package/dist/ingesters/fetch-utils.js +40 -0
- package/dist/ingesters/index.d.ts +7 -0
- package/dist/ingesters/index.js +419 -0
- package/dist/ingesters/open-llm-leaderboard.d.ts +8 -0
- package/dist/ingesters/open-llm-leaderboard.js +109 -0
- package/dist/ingesters/registry.d.ts +17 -0
- package/dist/ingesters/registry.js +413 -0
- package/dist/ingesters/swe-bench.d.ts +8 -0
- package/dist/ingesters/swe-bench.js +106 -0
- package/dist/ingesters/types.d.ts +31 -0
- package/dist/ingesters/types.js +1 -0
- package/dist/node/eval/index.js +101 -0
- package/dist/node/eval/runner.js +101 -0
- package/dist/node/eval/types.js +0 -0
- package/dist/node/in-memory-store.js +92 -0
- package/dist/node/index.js +105 -0
- package/dist/node/ingesters/artificial-analysis.js +149 -0
- package/dist/node/ingesters/chatbot-arena.js +142 -0
- package/dist/node/ingesters/fetch-utils.js +39 -0
- package/dist/node/ingesters/index.js +418 -0
- package/dist/node/ingesters/open-llm-leaderboard.js +108 -0
- package/dist/node/ingesters/registry.js +412 -0
- package/dist/node/ingesters/swe-bench.js +105 -0
- package/dist/node/ingesters/types.js +0 -0
- package/dist/node/scoring/composite-scorer.js +122 -0
- package/dist/node/scoring/dimension-weights.js +39 -0
- package/dist/node/scoring/index.js +161 -0
- package/dist/node/scoring/normalizer.js +37 -0
- package/dist/node/store.js +0 -0
- package/dist/node/types.js +14 -0
- package/dist/scoring/composite-scorer.d.ts +10 -0
- package/dist/scoring/composite-scorer.js +123 -0
- package/dist/scoring/dimension-weights.d.ts +8 -0
- package/dist/scoring/dimension-weights.js +40 -0
- package/dist/scoring/index.d.ts +3 -0
- package/dist/scoring/index.js +162 -0
- package/dist/scoring/normalizer.d.ts +20 -0
- package/dist/scoring/normalizer.js +38 -0
- package/dist/store.d.ts +19 -0
- package/dist/store.js +1 -0
- package/dist/types.d.ts +100 -0
- package/dist/types.js +15 -0
- package/package.json +362 -0
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
// @bun
// src/in-memory-store.ts

/**
 * Apply offset/limit pagination to an already-filtered list.
 * Defaults: offset 0, limit 50. `nextOffset` is the offset of the next
 * page, or undefined once the returned page reaches the end.
 */
function paginate(items, query) {
  const total = items.length;
  const offset = query.offset ?? 0;
  const limit = query.limit ?? 50;
  const page = items.slice(offset, offset + limit);
  const nextOffset = offset + page.length < total ? offset + page.length : undefined;
  return { page, total, nextOffset };
}

/**
 * Volatile, Map-backed store for benchmark results, model rankings and
 * ingestion runs. Everything lives in process memory and is lost on
 * exit; suited to tests and local development.
 */
class InMemoryProviderRankingStore {
  benchmarkResults = new Map();
  modelRankings = new Map();
  ingestionRuns = new Map();

  /** Insert or replace a benchmark result, keyed by its id. */
  async upsertBenchmarkResult(result) {
    this.benchmarkResults.set(result.id, result);
  }

  /** Look up a single benchmark result; null when absent. */
  async getBenchmarkResult(id) {
    return this.benchmarkResults.get(id) ?? null;
  }

  /**
   * List benchmark results matching every provided filter
   * (source, modelId, dimension, providerKey), paginated.
   */
  async listBenchmarkResults(query) {
    const matching = [...this.benchmarkResults.values()].filter(
      (r) =>
        (!query.source || r.source === query.source) &&
        (!query.modelId || r.modelId === query.modelId) &&
        (!query.dimension || r.dimension === query.dimension) &&
        (!query.providerKey || r.providerKey === query.providerKey)
    );
    const { page, total, nextOffset } = paginate(matching, query);
    return { results: page, total, nextOffset };
  }

  /** Insert or replace a model ranking, keyed by modelId. */
  async upsertModelRanking(ranking) {
    this.modelRankings.set(ranking.modelId, ranking);
  }

  /** Look up a single model ranking; null when absent. */
  async getModelRanking(modelId) {
    return this.modelRankings.get(modelId) ?? null;
  }

  /**
   * List model rankings, optionally filtered by providerKey.
   * When query.dimension is set, orders by that dimension's score
   * descending (missing scores sort last via a -1 sentinel); otherwise
   * orders by the precomputed overall rank ascending. Paginated.
   */
  async listModelRankings(query) {
    let candidates = [...this.modelRankings.values()];
    if (query.providerKey) {
      candidates = candidates.filter((r) => r.providerKey === query.providerKey);
    }
    const dim = query.dimension;
    const comparator = dim
      ? (a, b) => (b.dimensionScores[dim]?.score ?? -1) - (a.dimensionScores[dim]?.score ?? -1)
      : (a, b) => a.rank - b.rank;
    candidates.sort(comparator);
    const { page, total, nextOffset } = paginate(candidates, query);
    return { rankings: page, total, nextOffset };
  }

  /**
   * Build a profile for one model: its ranking plus every stored
   * benchmark result for it. Display/context/cost fields are
   * placeholders since this store holds no model metadata.
   * Returns null when the model has no ranking.
   */
  async getModelProfile(modelId) {
    const ranking = this.modelRankings.get(modelId);
    if (!ranking) {
      return null;
    }
    const benchmarkResults = [...this.benchmarkResults.values()].filter(
      (r) => r.modelId === modelId
    );
    return {
      modelId,
      providerKey: ranking.providerKey,
      displayName: modelId,
      contextWindow: 0,
      costPerMillion: null,
      capabilities: [],
      ranking,
      benchmarkResults
    };
  }

  /** Record a new ingestion run, keyed by its id. */
  async createIngestionRun(run) {
    this.ingestionRuns.set(run.id, run);
  }

  /** Shallow-merge `update` into an existing run; no-op when absent. */
  async updateIngestionRun(id, update) {
    const existing = this.ingestionRuns.get(id);
    if (existing) {
      this.ingestionRuns.set(id, { ...existing, ...update });
    }
  }

  /** Look up a single ingestion run; null when absent. */
  async getIngestionRun(id) {
    return this.ingestionRuns.get(id) ?? null;
  }
}
export {
  InMemoryProviderRankingStore
};
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
// Root API surface for @contractspec/lib.provider-ranking:
// domain types, the runtime dimension list, the store contract, and the
// bundled in-memory reference implementation.
export type { BenchmarkDimension, BenchmarkSource, BenchmarkResult, DimensionScore, ModelRanking, ModelProfile, BenchmarkResultQuery, BenchmarkResultListResult, RankingQuery, RankingListResult, IngestionRun, DimensionWeightConfig, } from './types';
export { BENCHMARK_DIMENSIONS } from './types';
export type { ProviderRankingStore } from './store';
export { InMemoryProviderRankingStore } from './in-memory-store';
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
// @bun
// src/in-memory-store.ts

/**
 * Apply offset/limit pagination to an already-filtered list.
 * Defaults: offset 0, limit 50. `nextOffset` is the offset of the next
 * page, or undefined once the returned page reaches the end.
 */
function paginate(items, query) {
  const total = items.length;
  const offset = query.offset ?? 0;
  const limit = query.limit ?? 50;
  const page = items.slice(offset, offset + limit);
  const nextOffset = offset + page.length < total ? offset + page.length : undefined;
  return { page, total, nextOffset };
}

/**
 * Volatile, Map-backed store for benchmark results, model rankings and
 * ingestion runs. Everything lives in process memory and is lost on
 * exit; suited to tests and local development.
 */
class InMemoryProviderRankingStore {
  benchmarkResults = new Map();
  modelRankings = new Map();
  ingestionRuns = new Map();

  /** Insert or replace a benchmark result, keyed by its id. */
  async upsertBenchmarkResult(result) {
    this.benchmarkResults.set(result.id, result);
  }

  /** Look up a single benchmark result; null when absent. */
  async getBenchmarkResult(id) {
    return this.benchmarkResults.get(id) ?? null;
  }

  /**
   * List benchmark results matching every provided filter
   * (source, modelId, dimension, providerKey), paginated.
   */
  async listBenchmarkResults(query) {
    const matching = [...this.benchmarkResults.values()].filter(
      (r) =>
        (!query.source || r.source === query.source) &&
        (!query.modelId || r.modelId === query.modelId) &&
        (!query.dimension || r.dimension === query.dimension) &&
        (!query.providerKey || r.providerKey === query.providerKey)
    );
    const { page, total, nextOffset } = paginate(matching, query);
    return { results: page, total, nextOffset };
  }

  /** Insert or replace a model ranking, keyed by modelId. */
  async upsertModelRanking(ranking) {
    this.modelRankings.set(ranking.modelId, ranking);
  }

  /** Look up a single model ranking; null when absent. */
  async getModelRanking(modelId) {
    return this.modelRankings.get(modelId) ?? null;
  }

  /**
   * List model rankings, optionally filtered by providerKey.
   * When query.dimension is set, orders by that dimension's score
   * descending (missing scores sort last via a -1 sentinel); otherwise
   * orders by the precomputed overall rank ascending. Paginated.
   */
  async listModelRankings(query) {
    let candidates = [...this.modelRankings.values()];
    if (query.providerKey) {
      candidates = candidates.filter((r) => r.providerKey === query.providerKey);
    }
    const dim = query.dimension;
    const comparator = dim
      ? (a, b) => (b.dimensionScores[dim]?.score ?? -1) - (a.dimensionScores[dim]?.score ?? -1)
      : (a, b) => a.rank - b.rank;
    candidates.sort(comparator);
    const { page, total, nextOffset } = paginate(candidates, query);
    return { rankings: page, total, nextOffset };
  }

  /**
   * Build a profile for one model: its ranking plus every stored
   * benchmark result for it. Display/context/cost fields are
   * placeholders since this store holds no model metadata.
   * Returns null when the model has no ranking.
   */
  async getModelProfile(modelId) {
    const ranking = this.modelRankings.get(modelId);
    if (!ranking) {
      return null;
    }
    const benchmarkResults = [...this.benchmarkResults.values()].filter(
      (r) => r.modelId === modelId
    );
    return {
      modelId,
      providerKey: ranking.providerKey,
      displayName: modelId,
      contextWindow: 0,
      costPerMillion: null,
      capabilities: [],
      ranking,
      benchmarkResults
    };
  }

  /** Record a new ingestion run, keyed by its id. */
  async createIngestionRun(run) {
    this.ingestionRuns.set(run.id, run);
  }

  /** Shallow-merge `update` into an existing run; no-op when absent. */
  async updateIngestionRun(id, update) {
    const existing = this.ingestionRuns.get(id);
    if (existing) {
      this.ingestionRuns.set(id, { ...existing, ...update });
    }
  }

  /** Look up a single ingestion run; null when absent. */
  async getIngestionRun(id) {
    return this.ingestionRuns.get(id) ?? null;
  }
}

// src/types.ts
// Runtime list of every benchmark dimension the package recognizes.
var BENCHMARK_DIMENSIONS = [
  "coding",
  "reasoning",
  "agentic",
  "cost",
  "latency",
  "context",
  "safety",
  "custom"
];
export {
  InMemoryProviderRankingStore,
  BENCHMARK_DIMENSIONS
};
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
// Declarations for dist/ingesters/artificial-analysis.js.
import type { BenchmarkIngester } from './types';
/**
 * Ingests Artificial Analysis data covering quality, speed, and cost.
 *
 * Produces results across multiple dimensions: reasoning (quality),
 * latency (speed/TTFT), cost (pricing), and context (window size).
 */
export declare const artificialAnalysisIngester: BenchmarkIngester;
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
// @bun
// src/ingesters/fetch-utils.ts

/**
 * Fetch `url`, retrying transient failures with exponential backoff.
 *
 * Network errors and HTTP 5xx responses are retried up to `maxRetries`
 * additional attempts (delay doubles each attempt from `baseDelayMs`).
 * Other HTTP errors (4xx) fail immediately: the original version threw
 * inside its own `try`, so its `catch` swallowed the error and retried
 * non-retryable client errors such as 404.
 */
async function fetchWithRetry(url, options) {
  const fetchFn = options?.fetch ?? globalThis.fetch;
  const maxRetries = options?.maxRetries ?? 2;
  const baseDelay = options?.baseDelayMs ?? 500;
  let lastError;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    let response;
    try {
      response = await fetchFn(url);
    } catch (error) {
      // Network-level failure: remember it and retry with backoff.
      lastError = error instanceof Error ? error : new Error(String(error));
      if (attempt < maxRetries) {
        await sleep(baseDelay * Math.pow(2, attempt));
      }
      continue;
    }
    if (response.ok) {
      return response;
    }
    if (response.status >= 500 && attempt < maxRetries) {
      await sleep(baseDelay * Math.pow(2, attempt));
      continue;
    }
    // Non-retryable status (4xx) or retries exhausted on a 5xx.
    throw new Error(`Fetch failed: ${response.status} ${response.statusText} (${url})`);
  }
  throw lastError ?? new Error(`Fetch failed after ${maxRetries + 1} attempts: ${url}`);
}

/** Parse JSON, rethrowing with the source label and a 200-char snippet. */
function parseJsonSafe(text, label) {
  try {
    return JSON.parse(text);
  } catch {
    throw new Error(`Failed to parse JSON response from ${label}: ${text.slice(0, 200)}`);
  }
}

/** Promise-based delay used for backoff between retries. */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

// src/ingesters/artificial-analysis.ts
const DEFAULT_AA_URL = "https://artificialanalysis.ai/api/models";

/**
 * Ingests Artificial Analysis data covering quality, speed, and cost.
 * Emits one BenchmarkResult per available dimension per model:
 * reasoning (quality_score, clamped to 0-100), latency (tokens/s with a
 * TTFT penalty), cost (avg price per million tokens), and context
 * (window size relative to 1M tokens).
 */
const artificialAnalysisIngester = {
  source: "artificial-analysis",
  displayName: "Artificial Analysis",
  description: "Quality, speed, and cost benchmarks from Artificial Analysis.",
  async ingest(options) {
    const url = options?.sourceUrl ?? DEFAULT_AA_URL;
    const response = await fetchWithRetry(url, { fetch: options?.fetch });
    const text = await response.text();
    const data = parseJsonSafe(text, "Artificial Analysis");
    const now = new Date();
    const results = [];
    const dims = options?.dimensions ? new Set(options.dimensions) : null;
    // `dims === null` means "no filter": every dimension is wanted.
    const wants = (dimension) => !dims || dims.has(dimension);
    let entries = data.filter((e) => e.model_id && e.provider);
    if (options?.modelFilter?.length) {
      const filterSet = new Set(options.modelFilter);
      entries = entries.filter((e) => filterSet.has(e.model_id));
    }
    for (const entry of entries) {
      const providerKey = entry.provider.toLowerCase();
      // Shared shape for every dimension emitted from this entry.
      const push = (dimension, score, rawScore) =>
        results.push({
          id: `artificial-analysis:${entry.model_id}:${dimension}`,
          modelId: entry.model_id,
          providerKey,
          source: "artificial-analysis",
          dimension,
          score,
          rawScore,
          metadata: { model_name: entry.model_name },
          measuredAt: now,
          ingestedAt: now
        });
      if (entry.quality_score != null && wants("reasoning")) {
        push("reasoning", Math.max(0, Math.min(100, entry.quality_score)), entry.quality_score);
      }
      if ((entry.tokens_per_second != null || entry.ttft_ms != null) && wants("latency")) {
        push("latency", computeLatencyScore(entry.tokens_per_second, entry.ttft_ms), {
          tokens_per_second: entry.tokens_per_second,
          ttft_ms: entry.ttft_ms
        });
      }
      if ((entry.price_per_million_input_tokens != null || entry.price_per_million_output_tokens != null) && wants("cost")) {
        push("cost", computeCostScore(entry.price_per_million_input_tokens, entry.price_per_million_output_tokens), {
          input: entry.price_per_million_input_tokens,
          output: entry.price_per_million_output_tokens
        });
      }
      if (entry.context_window != null && wants("context")) {
        push("context", computeContextScore(entry.context_window), entry.context_window);
      }
    }
    return options?.maxResults ? results.slice(0, options.maxResults) : results;
  }
};

/**
 * Score throughput on a 0-100 scale (200 tok/s = 100), then subtract up
 * to 30 points for time-to-first-token above 200ms. Defaults to 50 when
 * throughput is unknown. Rounded to 2 decimals.
 */
function computeLatencyScore(tokensPerSec, ttftMs) {
  let score = 50;
  if (tokensPerSec != null) {
    score = Math.min(100, tokensPerSec / 200 * 100);
  }
  if (ttftMs != null) {
    const ttftPenalty = Math.max(0, Math.min(30, (ttftMs - 200) / 100 * 10));
    score = Math.max(0, score - ttftPenalty);
  }
  return Math.round(score * 100) / 100;
}

/**
 * Score cost on a 0-100 scale: averages input/output price per million
 * tokens ($30 avg or more scores 0). A missing price counts as 0, which
 * halves the effective average. Rounded to 2 decimals.
 */
function computeCostScore(inputCost, outputCost) {
  const avgCost = ((inputCost ?? 0) + (outputCost ?? 0)) / 2;
  const score = Math.max(0, 100 - avgCost / 30 * 100);
  return Math.round(score * 100) / 100;
}

/**
 * Score context window linearly against a 1M-token ceiling.
 * Rounded to 2 decimals.
 */
function computeContextScore(contextWindow) {
  const score = Math.min(100, contextWindow / 1e6 * 100);
  return Math.round(score * 100) / 100;
}
export {
  artificialAnalysisIngester
};
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
// Declarations for dist/ingesters/chatbot-arena.js.
import type { BenchmarkIngester } from './types';
/**
 * Ingests Chatbot Arena (LMSYS) Elo ratings.
 *
 * Maps Elo ratings to the "reasoning" dimension since Arena
 * measures general conversational/reasoning ability.
 */
export declare const chatbotArenaIngester: BenchmarkIngester;
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
// @bun
// src/ingesters/fetch-utils.ts

/**
 * Fetch `url`, retrying transient failures with exponential backoff.
 *
 * Network errors and HTTP 5xx responses are retried up to `maxRetries`
 * additional attempts (delay doubles each attempt from `baseDelayMs`).
 * Other HTTP errors (4xx) fail immediately: the original version threw
 * inside its own `try`, so its `catch` swallowed the error and retried
 * non-retryable client errors such as 404.
 */
async function fetchWithRetry(url, options) {
  const fetchFn = options?.fetch ?? globalThis.fetch;
  const maxRetries = options?.maxRetries ?? 2;
  const baseDelay = options?.baseDelayMs ?? 500;
  let lastError;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    let response;
    try {
      response = await fetchFn(url);
    } catch (error) {
      // Network-level failure: remember it and retry with backoff.
      lastError = error instanceof Error ? error : new Error(String(error));
      if (attempt < maxRetries) {
        await sleep(baseDelay * Math.pow(2, attempt));
      }
      continue;
    }
    if (response.ok) {
      return response;
    }
    if (response.status >= 500 && attempt < maxRetries) {
      await sleep(baseDelay * Math.pow(2, attempt));
      continue;
    }
    // Non-retryable status (4xx) or retries exhausted on a 5xx.
    throw new Error(`Fetch failed: ${response.status} ${response.statusText} (${url})`);
  }
  throw lastError ?? new Error(`Fetch failed after ${maxRetries + 1} attempts: ${url}`);
}

/** Parse JSON, rethrowing with the source label and a 200-char snippet. */
function parseJsonSafe(text, label) {
  try {
    return JSON.parse(text);
  } catch {
    throw new Error(`Failed to parse JSON response from ${label}: ${text.slice(0, 200)}`);
  }
}

/** Promise-based delay used for backoff between retries. */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

// src/scoring/normalizer.ts
// Per-source raw-score ranges used to map raw values onto a 0-100 scale.
const SOURCE_NORMALIZATION = {
  "chatbot-arena": { min: 800, max: 1400, invertScale: false },
  "swe-bench": { min: 0, max: 100, invertScale: false },
  "human-eval": { min: 0, max: 100, invertScale: false },
  mmlu: { min: 0, max: 100, invertScale: false },
  gpqa: { min: 0, max: 100, invertScale: false },
  arc: { min: 0, max: 100, invertScale: false },
  truthfulqa: { min: 0, max: 100, invertScale: false },
  "tau-bench": { min: 0, max: 100, invertScale: false },
  "artificial-analysis": { min: 0, max: 100, invertScale: false }
};

/**
 * Map a raw benchmark score onto 0-100 using the source's configured
 * range (or `configOverride`). Unknown sources are only clamped.
 */
function normalizeScore(rawScore, source, configOverride) {
  const config = configOverride ?? SOURCE_NORMALIZATION[source];
  if (!config) {
    return Math.max(0, Math.min(100, rawScore));
  }
  const { min, max, invertScale } = config;
  const range = max - min;
  if (range === 0) {
    return 50; // degenerate config: treat every score as average
  }
  let normalized = (rawScore - min) / range * 100;
  if (invertScale) {
    normalized = 100 - normalized;
  }
  return Math.max(0, Math.min(100, normalized));
}

/** Re-score a batch, preferring a numeric rawScore when present. */
function normalizeBenchmarkResults(results) {
  return results.map((result) => ({
    ...result,
    score: normalizeScore(typeof result.rawScore === "number" ? result.rawScore : result.score, result.source)
  }));
}

// src/ingesters/chatbot-arena.ts
const DEFAULT_ARENA_URL = "https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard/resolve/main/results.json";

/**
 * Ingests Chatbot Arena (LMSYS) Elo ratings, mapped onto the
 * "reasoning" dimension (Elo 800-1400 normalized to 0-100).
 */
const chatbotArenaIngester = {
  source: "chatbot-arena",
  displayName: "Chatbot Arena (LMSYS)",
  description: "Elo ratings from the LMSYS Chatbot Arena human preference leaderboard.",
  async ingest(options) {
    // Arena informs only "reasoning"; skip the fetch entirely when the
    // caller requested other dimensions exclusively.
    if (options?.dimensions?.length && !options.dimensions.includes("reasoning")) {
      return [];
    }
    const url = options?.sourceUrl ?? DEFAULT_ARENA_URL;
    const response = await fetchWithRetry(url, { fetch: options?.fetch });
    const text = await response.text();
    const data = parseJsonSafe(text, "Chatbot Arena");
    const now = new Date();
    let entries = data.filter((entry) => entry["Arena Elo rating"] != null && entry.Model);
    if (options?.modelFilter?.length) {
      const filterSet = new Set(options.modelFilter);
      entries = entries.filter((e) => filterSet.has(e.key ?? e.Model));
    }
    if (options?.maxResults) {
      entries = entries.slice(0, options.maxResults);
    }
    let results = entries.map((entry) => {
      const elo = entry["Arena Elo rating"];
      // Prefer the leaderboard's stable key; fall back to a slug of the
      // display name.
      const modelId = entry.key ?? entry.Model.toLowerCase().replace(/\s+/g, "-");
      const org = entry.Organization?.toLowerCase() ?? "unknown";
      return {
        id: `chatbot-arena:${modelId}:reasoning`,
        modelId,
        providerKey: mapOrganizationToProvider(org),
        source: "chatbot-arena",
        dimension: "reasoning",
        score: normalizeScore(elo, "chatbot-arena"),
        rawScore: elo,
        metadata: {
          organization: entry.Organization,
          license: entry.License
        },
        measuredAt: now,
        ingestedAt: now
      };
    });
    // NOTE(review): every result is stamped with `now`, so these date
    // filters can only keep or drop the whole batch — confirm intent.
    const { fromDate, toDate } = options ?? {};
    if (fromDate) {
      results = results.filter((r) => r.measuredAt >= fromDate);
    }
    if (toDate) {
      results = results.filter((r) => r.measuredAt <= toDate);
    }
    return results;
  }
};

/**
 * Map a leaderboard Organization string onto a known provider key;
 * unrecognized organizations pass through unchanged.
 */
function mapOrganizationToProvider(org) {
  const normalized = org.toLowerCase();
  if (normalized.includes("openai"))
    return "openai";
  if (normalized.includes("anthropic"))
    return "anthropic";
  if (normalized.includes("google") || normalized.includes("deepmind"))
    return "gemini";
  if (normalized.includes("mistral"))
    return "mistral";
  if (normalized.includes("meta"))
    return "meta";
  if (normalized.includes("cohere"))
    return "cohere";
  return org;
}
export {
  chatbotArenaIngester
};
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
 * Fetch helper with retry and exponential backoff for ingester robustness.
 */
interface FetchWithRetryOptions {
    /** Injectable fetch implementation (defaults to globalThis.fetch); useful in tests. */
    fetch?: typeof globalThis.fetch;
    /** Extra attempts after the first (implementation default: 2). */
    maxRetries?: number;
    /** Initial backoff delay in ms, doubled each attempt (implementation default: 500). */
    baseDelayMs?: number;
}
export declare function fetchWithRetry(url: string, options?: FetchWithRetryOptions): Promise<Response>;
export declare function parseJsonSafe<T>(text: string, label: string): T;
export {};
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
// @bun
// src/ingesters/fetch-utils.ts

/**
 * Fetch `url`, retrying transient failures with exponential backoff.
 *
 * Network errors and HTTP 5xx responses are retried up to `maxRetries`
 * additional attempts (delay doubles each attempt from `baseDelayMs`).
 * Other HTTP errors (4xx) fail immediately: the original version threw
 * inside its own `try`, so its `catch` swallowed the error and retried
 * non-retryable client errors such as 404.
 */
async function fetchWithRetry(url, options) {
  const fetchFn = options?.fetch ?? globalThis.fetch;
  const maxRetries = options?.maxRetries ?? 2;
  const baseDelay = options?.baseDelayMs ?? 500;
  let lastError;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    let response;
    try {
      response = await fetchFn(url);
    } catch (error) {
      // Network-level failure: remember it and retry with backoff.
      lastError = error instanceof Error ? error : new Error(String(error));
      if (attempt < maxRetries) {
        await sleep(baseDelay * Math.pow(2, attempt));
      }
      continue;
    }
    if (response.ok) {
      return response;
    }
    if (response.status >= 500 && attempt < maxRetries) {
      await sleep(baseDelay * Math.pow(2, attempt));
      continue;
    }
    // Non-retryable status (4xx) or retries exhausted on a 5xx.
    throw new Error(`Fetch failed: ${response.status} ${response.statusText} (${url})`);
  }
  throw lastError ?? new Error(`Fetch failed after ${maxRetries + 1} attempts: ${url}`);
}

/** Parse JSON, rethrowing with the source label and a 200-char snippet. */
function parseJsonSafe(text, label) {
  try {
    return JSON.parse(text);
  } catch {
    throw new Error(`Failed to parse JSON response from ${label}: ${text.slice(0, 200)}`);
  }
}

/** Promise-based delay used for backoff between retries. */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
export {
  parseJsonSafe,
  fetchWithRetry
};
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
// Barrel file: the public ingester API surface (types, the four
// bundled ingesters, the registry, and shared fetch helpers).
export type { BenchmarkIngester, IngesterOptions } from './types';
export { chatbotArenaIngester } from './chatbot-arena';
export { artificialAnalysisIngester } from './artificial-analysis';
export { sweBenchIngester } from './swe-bench';
export { openLlmLeaderboardIngester } from './open-llm-leaderboard';
export { IngesterRegistry, createDefaultIngesterRegistry } from './registry';
export { fetchWithRetry, parseJsonSafe } from './fetch-utils';
|