@contractspec/lib.provider-ranking 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/README.md +44 -0
  2. package/dist/browser/eval/index.js +101 -0
  3. package/dist/browser/eval/runner.js +101 -0
  4. package/dist/browser/eval/types.js +0 -0
  5. package/dist/browser/in-memory-store.js +92 -0
  6. package/dist/browser/index.js +105 -0
  7. package/dist/browser/ingesters/artificial-analysis.js +149 -0
  8. package/dist/browser/ingesters/chatbot-arena.js +142 -0
  9. package/dist/browser/ingesters/fetch-utils.js +39 -0
  10. package/dist/browser/ingesters/index.js +418 -0
  11. package/dist/browser/ingesters/open-llm-leaderboard.js +108 -0
  12. package/dist/browser/ingesters/registry.js +412 -0
  13. package/dist/browser/ingesters/swe-bench.js +105 -0
  14. package/dist/browser/ingesters/types.js +0 -0
  15. package/dist/browser/scoring/composite-scorer.js +122 -0
  16. package/dist/browser/scoring/dimension-weights.js +39 -0
  17. package/dist/browser/scoring/index.js +161 -0
  18. package/dist/browser/scoring/normalizer.js +37 -0
  19. package/dist/browser/store.js +0 -0
  20. package/dist/browser/types.js +14 -0
  21. package/dist/eval/index.d.ts +2 -0
  22. package/dist/eval/index.js +102 -0
  23. package/dist/eval/runner.d.ts +18 -0
  24. package/dist/eval/runner.js +102 -0
  25. package/dist/eval/types.d.ts +51 -0
  26. package/dist/eval/types.js +1 -0
  27. package/dist/in-memory-store.d.ts +17 -0
  28. package/dist/in-memory-store.js +93 -0
  29. package/dist/index.d.ts +4 -0
  30. package/dist/index.js +106 -0
  31. package/dist/ingesters/artificial-analysis.d.ts +8 -0
  32. package/dist/ingesters/artificial-analysis.js +150 -0
  33. package/dist/ingesters/chatbot-arena.d.ts +8 -0
  34. package/dist/ingesters/chatbot-arena.js +143 -0
  35. package/dist/ingesters/fetch-utils.d.ts +11 -0
  36. package/dist/ingesters/fetch-utils.js +40 -0
  37. package/dist/ingesters/index.d.ts +7 -0
  38. package/dist/ingesters/index.js +419 -0
  39. package/dist/ingesters/open-llm-leaderboard.d.ts +8 -0
  40. package/dist/ingesters/open-llm-leaderboard.js +109 -0
  41. package/dist/ingesters/registry.d.ts +17 -0
  42. package/dist/ingesters/registry.js +413 -0
  43. package/dist/ingesters/swe-bench.d.ts +8 -0
  44. package/dist/ingesters/swe-bench.js +106 -0
  45. package/dist/ingesters/types.d.ts +31 -0
  46. package/dist/ingesters/types.js +1 -0
  47. package/dist/node/eval/index.js +101 -0
  48. package/dist/node/eval/runner.js +101 -0
  49. package/dist/node/eval/types.js +0 -0
  50. package/dist/node/in-memory-store.js +92 -0
  51. package/dist/node/index.js +105 -0
  52. package/dist/node/ingesters/artificial-analysis.js +149 -0
  53. package/dist/node/ingesters/chatbot-arena.js +142 -0
  54. package/dist/node/ingesters/fetch-utils.js +39 -0
  55. package/dist/node/ingesters/index.js +418 -0
  56. package/dist/node/ingesters/open-llm-leaderboard.js +108 -0
  57. package/dist/node/ingesters/registry.js +412 -0
  58. package/dist/node/ingesters/swe-bench.js +105 -0
  59. package/dist/node/ingesters/types.js +0 -0
  60. package/dist/node/scoring/composite-scorer.js +122 -0
  61. package/dist/node/scoring/dimension-weights.js +39 -0
  62. package/dist/node/scoring/index.js +161 -0
  63. package/dist/node/scoring/normalizer.js +37 -0
  64. package/dist/node/store.js +0 -0
  65. package/dist/node/types.js +14 -0
  66. package/dist/scoring/composite-scorer.d.ts +10 -0
  67. package/dist/scoring/composite-scorer.js +123 -0
  68. package/dist/scoring/dimension-weights.d.ts +8 -0
  69. package/dist/scoring/dimension-weights.js +40 -0
  70. package/dist/scoring/index.d.ts +3 -0
  71. package/dist/scoring/index.js +162 -0
  72. package/dist/scoring/normalizer.d.ts +20 -0
  73. package/dist/scoring/normalizer.js +38 -0
  74. package/dist/store.d.ts +19 -0
  75. package/dist/store.js +1 -0
  76. package/dist/types.d.ts +100 -0
  77. package/dist/types.js +15 -0
  78. package/package.json +362 -0
@@ -0,0 +1,108 @@
1
// src/ingesters/fetch-utils.ts

/**
 * Fetch `url`, retrying transient failures with exponential backoff.
 *
 * Transient means network-level failures (the fetch rejected) or HTTP 5xx
 * responses. Non-5xx HTTP errors (e.g. 404) are permanent, so they are
 * thrown immediately instead of burning retries on a request that cannot
 * succeed. (Previously the 4xx error was thrown inside the `try`, caught
 * by this function's own `catch`, and retried.)
 *
 * @param {string} url - Target URL.
 * @param {{ fetch?: typeof fetch, maxRetries?: number, baseDelayMs?: number }} [options]
 *   `fetch` defaults to globalThis.fetch; `maxRetries` (extra attempts)
 *   defaults to 2; `baseDelayMs` defaults to 500 and doubles per attempt.
 * @returns {Promise<Response>} The first OK response.
 * @throws {Error} Immediately on a non-5xx HTTP error, or the last error
 *   once all attempts are exhausted.
 */
async function fetchWithRetry(url, options) {
  const fetchFn = options?.fetch ?? globalThis.fetch;
  const maxRetries = options?.maxRetries ?? 2;
  const baseDelay = options?.baseDelayMs ?? 500;
  let lastError;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    let response;
    try {
      response = await fetchFn(url);
    } catch (error) {
      // Network failure: remember it and retry if attempts remain.
      lastError = error instanceof Error ? error : new Error(String(error));
      if (attempt < maxRetries) {
        await sleep(baseDelay * Math.pow(2, attempt));
      }
      continue;
    }
    if (response.ok) {
      return response;
    }
    const httpError = new Error(`Fetch failed: ${response.status} ${response.statusText} (${url})`);
    if (response.status < 500) {
      // Client errors will not change on retry; fail fast.
      throw httpError;
    }
    lastError = httpError;
    if (attempt < maxRetries) {
      await sleep(baseDelay * Math.pow(2, attempt));
    }
  }
  throw lastError ?? new Error(`Fetch failed after ${maxRetries + 1} attempts: ${url}`);
}

/**
 * Parse `text` as JSON, rethrowing with a message that identifies the
 * source (`label`) and a 200-char preview of the offending payload.
 */
function parseJsonSafe(text, label) {
  try {
    return JSON.parse(text);
  } catch {
    throw new Error(`Failed to parse JSON response from ${label}: ${text.slice(0, 200)}`);
  }
}

/** Resolve after `ms` milliseconds. */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
36
+
37
// src/ingesters/open-llm-leaderboard.ts
var DEFAULT_HF_URL = "https://huggingface.co/api/spaces/open-llm-leaderboard/open_llm_leaderboard/results";

// Leaderboard columns → ranking dimensions. Each mapping emits one
// result per model, tagged with its own source key.
var BENCHMARK_MAPPINGS = [
  { field: "mmlu", dimension: "reasoning", sourceKey: "mmlu" },
  { field: "arc", dimension: "reasoning", sourceKey: "arc" },
  { field: "gpqa", dimension: "reasoning", sourceKey: "gpqa" },
  { field: "truthfulqa", dimension: "safety", sourceKey: "truthfulqa" }
];

// Ingester for the HuggingFace Open LLM Leaderboard.
// NOTE(review): assumes the endpoint returns an array of rows with
// model_name / organization / average plus the benchmark fields above —
// confirm against the live API shape.
var openLlmLeaderboardIngester = {
  source: "mmlu",
  displayName: "Open LLM Leaderboard",
  description: "Aggregated benchmark scores from the HuggingFace Open LLM Leaderboard.",
  async ingest(options) {
    const response = await fetchWithRetry(options?.sourceUrl ?? DEFAULT_HF_URL, { fetch: options?.fetch });
    const payload = parseJsonSafe(await response.text(), "Open LLM Leaderboard");
    const timestamp = new Date();
    const wantedDims = options?.dimensions ? new Set(options.dimensions) : null;
    const slugOf = (name) => name.toLowerCase().replace(/\s+/g, "-");

    let rows = payload.filter((row) => row.model_name);
    if (options?.modelFilter?.length) {
      const allowed = new Set(options.modelFilter);
      rows = rows.filter((row) => allowed.has(slugOf(row.model_name)));
    }

    const results = rows.flatMap((row) => {
      const modelId = slugOf(row.model_name);
      const providerKey = mapOrganizationToProvider(row.organization?.toLowerCase() ?? "unknown");
      return BENCHMARK_MAPPINGS.flatMap((mapping) => {
        if (wantedDims && !wantedDims.has(mapping.dimension)) {
          return [];
        }
        const value = row[mapping.field];
        if (typeof value !== "number") {
          return [];
        }
        return [{
          id: `open-llm:${modelId}:${mapping.sourceKey}`,
          modelId,
          providerKey,
          source: mapping.sourceKey,
          dimension: mapping.dimension,
          score: Math.max(0, Math.min(100, value)),
          rawScore: value,
          metadata: {
            organization: row.organization,
            leaderboard_average: row.average
          },
          measuredAt: timestamp,
          ingestedAt: timestamp
        }];
      });
    });
    return options?.maxResults ? results.slice(0, options.maxResults) : results;
  }
};
92
/**
 * Canonicalize a leaderboard organization name to a provider key.
 * Unrecognized organizations pass through unchanged.
 */
function mapOrganizationToProvider(org) {
  // Substring → canonical provider key; first match wins.
  const rules = [
    ["openai", "openai"],
    ["anthropic", "anthropic"],
    ["google", "gemini"],
    ["deepmind", "gemini"],
    ["mistral", "mistral"],
    ["meta", "meta"]
  ];
  const needle = org.toLowerCase();
  for (const [fragment, provider] of rules) {
    if (needle.includes(fragment)) {
      return provider;
    }
  }
  return org;
}
106
+ export {
107
+ openLlmLeaderboardIngester
108
+ };
@@ -0,0 +1,412 @@
1
// src/ingesters/fetch-utils.ts

/**
 * Fetch `url`, retrying transient failures with exponential backoff.
 *
 * Transient means network-level failures (the fetch rejected) or HTTP 5xx
 * responses. Non-5xx HTTP errors (e.g. 404) are permanent, so they are
 * thrown immediately instead of burning retries on a request that cannot
 * succeed. (Previously the 4xx error was thrown inside the `try`, caught
 * by this function's own `catch`, and retried.)
 *
 * @param {string} url - Target URL.
 * @param {{ fetch?: typeof fetch, maxRetries?: number, baseDelayMs?: number }} [options]
 *   `fetch` defaults to globalThis.fetch; `maxRetries` (extra attempts)
 *   defaults to 2; `baseDelayMs` defaults to 500 and doubles per attempt.
 * @returns {Promise<Response>} The first OK response.
 * @throws {Error} Immediately on a non-5xx HTTP error, or the last error
 *   once all attempts are exhausted.
 */
async function fetchWithRetry(url, options) {
  const fetchFn = options?.fetch ?? globalThis.fetch;
  const maxRetries = options?.maxRetries ?? 2;
  const baseDelay = options?.baseDelayMs ?? 500;
  let lastError;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    let response;
    try {
      response = await fetchFn(url);
    } catch (error) {
      // Network failure: remember it and retry if attempts remain.
      lastError = error instanceof Error ? error : new Error(String(error));
      if (attempt < maxRetries) {
        await sleep(baseDelay * Math.pow(2, attempt));
      }
      continue;
    }
    if (response.ok) {
      return response;
    }
    const httpError = new Error(`Fetch failed: ${response.status} ${response.statusText} (${url})`);
    if (response.status < 500) {
      // Client errors will not change on retry; fail fast.
      throw httpError;
    }
    lastError = httpError;
    if (attempt < maxRetries) {
      await sleep(baseDelay * Math.pow(2, attempt));
    }
  }
  throw lastError ?? new Error(`Fetch failed after ${maxRetries + 1} attempts: ${url}`);
}

/**
 * Parse `text` as JSON, rethrowing with a message that identifies the
 * source (`label`) and a 200-char preview of the offending payload.
 */
function parseJsonSafe(text, label) {
  try {
    return JSON.parse(text);
  } catch {
    throw new Error(`Failed to parse JSON response from ${label}: ${text.slice(0, 200)}`);
  }
}

/** Resolve after `ms` milliseconds. */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
36
+
37
// src/ingesters/artificial-analysis.ts
var DEFAULT_AA_URL = "https://artificialanalysis.ai/api/models";

// Ingester for the Artificial Analysis model API; emits up to four
// results per model (reasoning, latency, cost, context).
// NOTE(review): assumes the endpoint returns an array of rows with
// model_id / provider / model_name plus the metric fields read below —
// confirm against the live API shape.
var artificialAnalysisIngester = {
  source: "artificial-analysis",
  displayName: "Artificial Analysis",
  description: "Quality, speed, and cost benchmarks from Artificial Analysis.",
  async ingest(options) {
    const response = await fetchWithRetry(options?.sourceUrl ?? DEFAULT_AA_URL, { fetch: options?.fetch });
    const payload = parseJsonSafe(await response.text(), "Artificial Analysis");
    const timestamp = new Date();
    const wantedDims = options?.dimensions ? new Set(options.dimensions) : null;
    const wants = (dimension) => !wantedDims || wantedDims.has(dimension);

    let rows = payload.filter((row) => row.model_id && row.provider);
    if (options?.modelFilter?.length) {
      const allowed = new Set(options.modelFilter);
      rows = rows.filter((row) => allowed.has(row.model_id));
    }

    const results = [];
    for (const row of rows) {
      // Shared shape for every dimension emitted for this model.
      const emit = (dimension, score, rawScore) => {
        results.push({
          id: `artificial-analysis:${row.model_id}:${dimension}`,
          modelId: row.model_id,
          providerKey: row.provider.toLowerCase(),
          source: "artificial-analysis",
          dimension,
          score,
          rawScore,
          metadata: { model_name: row.model_name },
          measuredAt: timestamp,
          ingestedAt: timestamp
        });
      };
      if (row.quality_score != null && wants("reasoning")) {
        emit("reasoning", Math.max(0, Math.min(100, row.quality_score)), row.quality_score);
      }
      if ((row.tokens_per_second != null || row.ttft_ms != null) && wants("latency")) {
        emit("latency", computeLatencyScore(row.tokens_per_second, row.ttft_ms), {
          tokens_per_second: row.tokens_per_second,
          ttft_ms: row.ttft_ms
        });
      }
      if ((row.price_per_million_input_tokens != null || row.price_per_million_output_tokens != null) && wants("cost")) {
        emit("cost", computeCostScore(row.price_per_million_input_tokens, row.price_per_million_output_tokens), {
          input: row.price_per_million_input_tokens,
          output: row.price_per_million_output_tokens
        });
      }
      if (row.context_window != null && wants("context")) {
        emit("context", computeContextScore(row.context_window), row.context_window);
      }
    }
    return options?.maxResults ? results.slice(0, options.maxResults) : results;
  }
};
127
/**
 * Score latency on 0–100 from throughput and time-to-first-token.
 * 200 tokens/s maps to a full throughput score (unknown → neutral 50);
 * each 100 ms of TTFT beyond 200 ms costs 10 points, capped at 30.
 */
function computeLatencyScore(tokensPerSec, ttftMs) {
  const throughputScore =
    tokensPerSec == null ? 50 : Math.min(100, (tokensPerSec / 200) * 100);
  const ttftPenalty =
    ttftMs == null ? 0 : Math.max(0, Math.min(30, ((ttftMs - 200) / 100) * 10));
  const score = Math.max(0, throughputScore - ttftPenalty);
  return Math.round(score * 100) / 100;
}
138
/**
 * Score cost on 0–100: $0 average per-million-token price → 100,
 * a $30+ average → 0, linear in between. Missing prices count as $0.
 */
function computeCostScore(inputCost, outputCost) {
  const input = inputCost ?? 0;
  const output = outputCost ?? 0;
  const average = (input + output) / 2;
  const raw = 100 - (average / 30) * 100;
  const clamped = raw > 0 ? raw : 0;
  return Math.round(clamped * 100) / 100;
}
143
/**
 * Score context window on 0–100: 1M tokens (or more) → 100, linear below.
 */
function computeContextScore(contextWindow) {
  const scaled = (contextWindow / 1e6) * 100;
  const capped = scaled > 100 ? 100 : scaled;
  return Math.round(capped * 100) / 100;
}
147
+
148
// src/scoring/normalizer.ts

// Known raw-score ranges per source, used to project scores onto 0–100.
// `invertScale` would flip the scale for lower-is-better sources (none yet).
var SOURCE_NORMALIZATION = {
  "chatbot-arena": { min: 800, max: 1400, invertScale: false },
  "swe-bench": { min: 0, max: 100, invertScale: false },
  "human-eval": { min: 0, max: 100, invertScale: false },
  mmlu: { min: 0, max: 100, invertScale: false },
  gpqa: { min: 0, max: 100, invertScale: false },
  arc: { min: 0, max: 100, invertScale: false },
  truthfulqa: { min: 0, max: 100, invertScale: false },
  "tau-bench": { min: 0, max: 100, invertScale: false },
  "artificial-analysis": { min: 0, max: 100, invertScale: false }
};

/**
 * Project `rawScore` onto 0–100 using the source's known range (or
 * `configOverride` when given). Sources without calibration data are
 * simply clamped to [0, 100].
 */
function normalizeScore(rawScore, source, configOverride) {
  const config = configOverride ?? SOURCE_NORMALIZATION[source];
  if (!config) {
    return Math.min(100, Math.max(0, rawScore));
  }
  const span = config.max - config.min;
  if (span === 0) {
    return 50; // degenerate range: treat every score as average
  }
  const scaled = ((rawScore - config.min) / span) * 100;
  const oriented = config.invertScale ? 100 - scaled : scaled;
  return Math.min(100, Math.max(0, oriented));
}

/**
 * Re-derive `score` for each result from its numeric rawScore (falling
 * back to the existing score when rawScore is not a number).
 */
function normalizeBenchmarkResults(results) {
  return results.map((result) => {
    const raw = typeof result.rawScore === "number" ? result.rawScore : result.score;
    return { ...result, score: normalizeScore(raw, result.source) };
  });
}
181
+
182
// src/ingesters/chatbot-arena.ts
var DEFAULT_ARENA_URL = "https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard/resolve/main/results.json";

// Human-preference Elo leaderboard; contributes only to "reasoning".
// NOTE(review): assumes rows expose "Arena Elo rating", Model,
// Organization, License, and optionally `key` — confirm payload shape.
var chatbotArenaIngester = {
  source: "chatbot-arena",
  displayName: "Chatbot Arena (LMSYS)",
  description: "Elo ratings from the LMSYS Chatbot Arena human preference leaderboard.",
  async ingest(options) {
    // Every arena score lands on "reasoning": bail out early when the
    // caller asked only for other dimensions.
    if (options?.dimensions?.length && !options.dimensions.includes("reasoning")) {
      return [];
    }
    const response = await fetchWithRetry(options?.sourceUrl ?? DEFAULT_ARENA_URL, { fetch: options?.fetch });
    const payload = parseJsonSafe(await response.text(), "Chatbot Arena");
    const timestamp = new Date();

    let rows = payload.filter((row) => row["Arena Elo rating"] != null && row.Model);
    if (options?.modelFilter?.length) {
      const allowed = new Set(options.modelFilter);
      rows = rows.filter((row) => allowed.has(row.key ?? row.Model));
    }
    if (options?.maxResults) {
      rows = rows.slice(0, options.maxResults);
    }

    let results = rows.map((row) => {
      const elo = row["Arena Elo rating"];
      const modelId = row.key ?? row.Model.toLowerCase().replace(/\s+/g, "-");
      return {
        id: `chatbot-arena:${modelId}:reasoning`,
        modelId,
        providerKey: mapOrganizationToProvider(row.Organization?.toLowerCase() ?? "unknown"),
        source: "chatbot-arena",
        dimension: "reasoning",
        score: normalizeScore(elo, "chatbot-arena"),
        rawScore: elo,
        metadata: {
          organization: row.Organization,
          license: row.License
        },
        measuredAt: timestamp,
        ingestedAt: timestamp
      };
    });

    // NOTE(review): measuredAt is always "now" here, so these date filters
    // can only keep everything or drop everything — verify intent.
    const { fromDate, toDate } = options ?? {};
    if (fromDate) {
      results = results.filter((r) => r.measuredAt >= fromDate);
    }
    if (toDate) {
      results = results.filter((r) => r.measuredAt <= toDate);
    }
    return results;
  }
};
235
/**
 * Canonicalize a leaderboard organization name to a provider key.
 * Unrecognized organizations pass through unchanged.
 */
function mapOrganizationToProvider(org) {
  // Substring → canonical provider key; first match wins.
  const rules = [
    ["openai", "openai"],
    ["anthropic", "anthropic"],
    ["google", "gemini"],
    ["deepmind", "gemini"],
    ["mistral", "mistral"],
    ["meta", "meta"],
    ["cohere", "cohere"]
  ];
  const needle = org.toLowerCase();
  for (const [fragment, provider] of rules) {
    if (needle.includes(fragment)) {
      return provider;
    }
  }
  return org;
}
251
+
252
// src/ingesters/swe-bench.ts
var DEFAULT_SWE_BENCH_URL = "https://raw.githubusercontent.com/princeton-nlp/SWE-bench/main/docs/leaderboard.json";

// Coding-dimension ingester fed by SWE-bench resolved rates.
// NOTE(review): assumes rows expose model / resolved_rate and optionally
// organization / date — confirm leaderboard JSON shape.
var sweBenchIngester = {
  source: "swe-bench",
  displayName: "SWE-bench",
  description: "Software engineering task completion rates from SWE-bench.",
  async ingest(options) {
    // SWE-bench only informs "coding"; skip entirely when not requested.
    if (options?.dimensions?.length && !options.dimensions.includes("coding")) {
      return [];
    }
    const response = await fetchWithRetry(options?.sourceUrl ?? DEFAULT_SWE_BENCH_URL, { fetch: options?.fetch });
    const payload = parseJsonSafe(await response.text(), "SWE-bench");
    const timestamp = new Date();
    const slugOf = (name) => name.toLowerCase().replace(/\s+/g, "-");

    let rows = payload.filter((row) => row.model && row.resolved_rate != null);
    if (options?.modelFilter?.length) {
      const allowed = new Set(options.modelFilter);
      rows = rows.filter((row) => allowed.has(slugOf(row.model)));
    }
    if (options?.maxResults) {
      rows = rows.slice(0, options.maxResults);
    }

    let results = rows.map((row) => {
      const modelId = slugOf(row.model);
      return {
        id: `swe-bench:${modelId}:coding`,
        modelId,
        providerKey: mapOrganizationToProvider2(row.organization?.toLowerCase() ?? "unknown"),
        source: "swe-bench",
        dimension: "coding",
        score: Math.max(0, Math.min(100, row.resolved_rate)),
        rawScore: row.resolved_rate,
        metadata: {
          organization: row.organization,
          date: row.date
        },
        // Prefer the leaderboard's own submission date when present.
        measuredAt: row.date ? new Date(row.date) : timestamp,
        ingestedAt: timestamp
      };
    });

    const { fromDate, toDate } = options ?? {};
    if (fromDate) {
      results = results.filter((r) => r.measuredAt >= fromDate);
    }
    if (toDate) {
      results = results.filter((r) => r.measuredAt <= toDate);
    }
    return results;
  }
};
304
/**
 * Canonicalize a leaderboard organization name to a provider key.
 * Unrecognized organizations pass through unchanged.
 */
function mapOrganizationToProvider2(org) {
  // Substring → canonical provider key; first match wins.
  const rules = [
    ["openai", "openai"],
    ["anthropic", "anthropic"],
    ["google", "gemini"],
    ["deepmind", "gemini"],
    ["mistral", "mistral"],
    ["meta", "meta"]
  ];
  const needle = org.toLowerCase();
  for (const [fragment, provider] of rules) {
    if (needle.includes(fragment)) {
      return provider;
    }
  }
  return org;
}
318
+
319
// src/ingesters/open-llm-leaderboard.ts
var DEFAULT_HF_URL = "https://huggingface.co/api/spaces/open-llm-leaderboard/open_llm_leaderboard/results";

// Leaderboard columns → ranking dimensions. Each mapping emits one
// result per model, tagged with its own source key.
var BENCHMARK_MAPPINGS = [
  { field: "mmlu", dimension: "reasoning", sourceKey: "mmlu" },
  { field: "arc", dimension: "reasoning", sourceKey: "arc" },
  { field: "gpqa", dimension: "reasoning", sourceKey: "gpqa" },
  { field: "truthfulqa", dimension: "safety", sourceKey: "truthfulqa" }
];

// Ingester for the HuggingFace Open LLM Leaderboard.
// NOTE(review): assumes the endpoint returns an array of rows with
// model_name / organization / average plus the benchmark fields above —
// confirm against the live API shape.
var openLlmLeaderboardIngester = {
  source: "mmlu",
  displayName: "Open LLM Leaderboard",
  description: "Aggregated benchmark scores from the HuggingFace Open LLM Leaderboard.",
  async ingest(options) {
    const response = await fetchWithRetry(options?.sourceUrl ?? DEFAULT_HF_URL, { fetch: options?.fetch });
    const payload = parseJsonSafe(await response.text(), "Open LLM Leaderboard");
    const timestamp = new Date();
    const wantedDims = options?.dimensions ? new Set(options.dimensions) : null;
    const slugOf = (name) => name.toLowerCase().replace(/\s+/g, "-");

    let rows = payload.filter((row) => row.model_name);
    if (options?.modelFilter?.length) {
      const allowed = new Set(options.modelFilter);
      rows = rows.filter((row) => allowed.has(slugOf(row.model_name)));
    }

    const results = rows.flatMap((row) => {
      const modelId = slugOf(row.model_name);
      const providerKey = mapOrganizationToProvider3(row.organization?.toLowerCase() ?? "unknown");
      return BENCHMARK_MAPPINGS.flatMap((mapping) => {
        if (wantedDims && !wantedDims.has(mapping.dimension)) {
          return [];
        }
        const value = row[mapping.field];
        if (typeof value !== "number") {
          return [];
        }
        return [{
          id: `open-llm:${modelId}:${mapping.sourceKey}`,
          modelId,
          providerKey,
          source: mapping.sourceKey,
          dimension: mapping.dimension,
          score: Math.max(0, Math.min(100, value)),
          rawScore: value,
          metadata: {
            organization: row.organization,
            leaderboard_average: row.average
          },
          measuredAt: timestamp,
          ingestedAt: timestamp
        }];
      });
    });
    return options?.maxResults ? results.slice(0, options.maxResults) : results;
  }
};
374
/**
 * Canonicalize a leaderboard organization name to a provider key.
 * Unrecognized organizations pass through unchanged.
 */
function mapOrganizationToProvider3(org) {
  // Substring → canonical provider key; first match wins.
  const rules = [
    ["openai", "openai"],
    ["anthropic", "anthropic"],
    ["google", "gemini"],
    ["deepmind", "gemini"],
    ["mistral", "mistral"],
    ["meta", "meta"]
  ];
  const needle = org.toLowerCase();
  for (const [fragment, provider] of rules) {
    if (needle.includes(fragment)) {
      return provider;
    }
  }
  return org;
}
388
+
389
// src/ingesters/registry.ts

/** Collection of benchmark ingesters keyed by their `source` string. */
class IngesterRegistry {
  constructor() {
    // source → ingester; Map preserves registration order for list().
    this.ingesters = new Map();
  }

  /** Add (or replace) an ingester under its `source` key; chainable. */
  register(ingester) {
    this.ingesters.set(ingester.source, ingester);
    return this;
  }

  /** Look up an ingester by source key; `undefined` when absent. */
  get(source) {
    return this.ingesters.get(source);
  }

  /** All registered ingesters, in registration order. */
  list() {
    return [...this.ingesters.values()];
  }

  /** Whether an ingester is registered for `source`. */
  has(source) {
    return this.ingesters.has(source);
  }
}
406
/** Build a registry preloaded with every built-in leaderboard ingester. */
function createDefaultIngesterRegistry() {
  const registry = new IngesterRegistry();
  for (const ingester of [
    chatbotArenaIngester,
    artificialAnalysisIngester,
    sweBenchIngester,
    openLlmLeaderboardIngester
  ]) {
    registry.register(ingester);
  }
  return registry;
}
409
+ export {
410
+ createDefaultIngesterRegistry,
411
+ IngesterRegistry
412
+ };
@@ -0,0 +1,105 @@
1
// src/ingesters/fetch-utils.ts

/**
 * Fetch `url`, retrying transient failures with exponential backoff.
 *
 * Transient means network-level failures (the fetch rejected) or HTTP 5xx
 * responses. Non-5xx HTTP errors (e.g. 404) are permanent, so they are
 * thrown immediately instead of burning retries on a request that cannot
 * succeed. (Previously the 4xx error was thrown inside the `try`, caught
 * by this function's own `catch`, and retried.)
 *
 * @param {string} url - Target URL.
 * @param {{ fetch?: typeof fetch, maxRetries?: number, baseDelayMs?: number }} [options]
 *   `fetch` defaults to globalThis.fetch; `maxRetries` (extra attempts)
 *   defaults to 2; `baseDelayMs` defaults to 500 and doubles per attempt.
 * @returns {Promise<Response>} The first OK response.
 * @throws {Error} Immediately on a non-5xx HTTP error, or the last error
 *   once all attempts are exhausted.
 */
async function fetchWithRetry(url, options) {
  const fetchFn = options?.fetch ?? globalThis.fetch;
  const maxRetries = options?.maxRetries ?? 2;
  const baseDelay = options?.baseDelayMs ?? 500;
  let lastError;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    let response;
    try {
      response = await fetchFn(url);
    } catch (error) {
      // Network failure: remember it and retry if attempts remain.
      lastError = error instanceof Error ? error : new Error(String(error));
      if (attempt < maxRetries) {
        await sleep(baseDelay * Math.pow(2, attempt));
      }
      continue;
    }
    if (response.ok) {
      return response;
    }
    const httpError = new Error(`Fetch failed: ${response.status} ${response.statusText} (${url})`);
    if (response.status < 500) {
      // Client errors will not change on retry; fail fast.
      throw httpError;
    }
    lastError = httpError;
    if (attempt < maxRetries) {
      await sleep(baseDelay * Math.pow(2, attempt));
    }
  }
  throw lastError ?? new Error(`Fetch failed after ${maxRetries + 1} attempts: ${url}`);
}

/**
 * Parse `text` as JSON, rethrowing with a message that identifies the
 * source (`label`) and a 200-char preview of the offending payload.
 */
function parseJsonSafe(text, label) {
  try {
    return JSON.parse(text);
  } catch {
    throw new Error(`Failed to parse JSON response from ${label}: ${text.slice(0, 200)}`);
  }
}

/** Resolve after `ms` milliseconds. */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
36
+
37
// src/ingesters/swe-bench.ts
var DEFAULT_SWE_BENCH_URL = "https://raw.githubusercontent.com/princeton-nlp/SWE-bench/main/docs/leaderboard.json";

// Coding-dimension ingester fed by SWE-bench resolved rates.
// NOTE(review): assumes rows expose model / resolved_rate and optionally
// organization / date — confirm leaderboard JSON shape.
var sweBenchIngester = {
  source: "swe-bench",
  displayName: "SWE-bench",
  description: "Software engineering task completion rates from SWE-bench.",
  async ingest(options) {
    // SWE-bench only informs "coding"; skip entirely when not requested.
    if (options?.dimensions?.length && !options.dimensions.includes("coding")) {
      return [];
    }
    const response = await fetchWithRetry(options?.sourceUrl ?? DEFAULT_SWE_BENCH_URL, { fetch: options?.fetch });
    const payload = parseJsonSafe(await response.text(), "SWE-bench");
    const timestamp = new Date();
    const slugOf = (name) => name.toLowerCase().replace(/\s+/g, "-");

    let rows = payload.filter((row) => row.model && row.resolved_rate != null);
    if (options?.modelFilter?.length) {
      const allowed = new Set(options.modelFilter);
      rows = rows.filter((row) => allowed.has(slugOf(row.model)));
    }
    if (options?.maxResults) {
      rows = rows.slice(0, options.maxResults);
    }

    let results = rows.map((row) => {
      const modelId = slugOf(row.model);
      return {
        id: `swe-bench:${modelId}:coding`,
        modelId,
        providerKey: mapOrganizationToProvider(row.organization?.toLowerCase() ?? "unknown"),
        source: "swe-bench",
        dimension: "coding",
        score: Math.max(0, Math.min(100, row.resolved_rate)),
        rawScore: row.resolved_rate,
        metadata: {
          organization: row.organization,
          date: row.date
        },
        // Prefer the leaderboard's own submission date when present.
        measuredAt: row.date ? new Date(row.date) : timestamp,
        ingestedAt: timestamp
      };
    });

    const { fromDate, toDate } = options ?? {};
    if (fromDate) {
      results = results.filter((r) => r.measuredAt >= fromDate);
    }
    if (toDate) {
      results = results.filter((r) => r.measuredAt <= toDate);
    }
    return results;
  }
};
89
/**
 * Canonicalize a leaderboard organization name to a provider key.
 * Unrecognized organizations pass through unchanged.
 */
function mapOrganizationToProvider(org) {
  // Substring → canonical provider key; first match wins.
  const rules = [
    ["openai", "openai"],
    ["anthropic", "anthropic"],
    ["google", "gemini"],
    ["deepmind", "gemini"],
    ["mistral", "mistral"],
    ["meta", "meta"]
  ];
  const needle = org.toLowerCase();
  for (const [fragment, provider] of rules) {
    if (needle.includes(fragment)) {
      return provider;
    }
  }
  return org;
}
103
+ export {
104
+ sweBenchIngester
105
+ };
File without changes