@contractspec/lib.provider-ranking 0.7.5 → 0.7.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +60 -27
- package/dist/browser/ingesters/index.js +68 -68
- package/dist/browser/ingesters/registry.js +68 -68
- package/dist/eval/index.d.ts +1 -1
- package/dist/index.d.ts +3 -3
- package/dist/ingesters/index.d.ts +5 -5
- package/dist/ingesters/index.js +68 -68
- package/dist/ingesters/registry.js +68 -68
- package/dist/node/ingesters/index.js +68 -68
- package/dist/node/ingesters/registry.js +68 -68
- package/dist/scoring/index.d.ts +1 -1
- package/package.json +5 -5
package/README.md
CHANGED
|
@@ -1,44 +1,77 @@
|
|
|
1
1
|
# @contractspec/lib.provider-ranking
|
|
2
2
|
|
|
3
|
-
Website: https://contractspec.io
|
|
3
|
+
Website: https://contractspec.io
|
|
4
4
|
|
|
5
5
|
**AI provider ranking: benchmark ingestion, scoring, and model comparison.**
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
## What It Provides
|
|
8
|
+
|
|
9
|
+
- **Layer**: lib.
|
|
10
|
+
- **Consumers**: module.provider-ranking.
|
|
11
|
+
- Related ContractSpec packages include `@contractspec/tool.bun`, `@contractspec/tool.typescript`.
|
|
12
|
+
- Related ContractSpec packages include `@contractspec/tool.bun`, `@contractspec/tool.typescript`.
|
|
8
13
|
|
|
9
14
|
## Installation
|
|
10
15
|
|
|
11
|
-
|
|
12
|
-
bun add @contractspec/lib.provider-ranking
|
|
13
|
-
```
|
|
16
|
+
`npm install @contractspec/lib.provider-ranking`
|
|
14
17
|
|
|
15
|
-
|
|
18
|
+
or
|
|
16
19
|
|
|
17
|
-
|
|
18
|
-
- `./types` -- `BenchmarkResult`, `ModelRanking`, `ModelProfile`, `BenchmarkDimension`, `DimensionWeightConfig`
|
|
19
|
-
- `./store` -- `ProviderRankingStore` interface
|
|
20
|
-
- `./in-memory-store` -- `InMemoryProviderRankingStore` class
|
|
21
|
-
- `./scoring` -- `computeModelRankings()`, `normalizeScore()`, `DEFAULT_DIMENSION_WEIGHTS`
|
|
22
|
-
- `./ingesters` -- `chatbotArenaIngester`, `sweBenchIngester`, `artificialAnalysisIngester`, `IngesterRegistry`
|
|
23
|
-
- `./eval` -- `EvalRunner`, `EvalSuite`, `EvalCase` for custom evaluation
|
|
20
|
+
`bun add @contractspec/lib.provider-ranking`
|
|
24
21
|
|
|
25
22
|
## Usage
|
|
26
23
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
24
|
+
Import the root entrypoint from `@contractspec/lib.provider-ranking`, or choose a documented subpath when you only need one part of the package surface.
|
|
25
|
+
|
|
26
|
+
## Architecture
|
|
27
|
+
|
|
28
|
+
- `src/eval` is part of the package's public or composition surface.
|
|
29
|
+
- `src/in-memory-store.ts` is part of the package's public or composition surface.
|
|
30
|
+
- `src/index.ts` is the root public barrel and package entrypoint.
|
|
31
|
+
- `src/ingesters` is part of the package's public or composition surface.
|
|
32
|
+
- `src/scoring` is part of the package's public or composition surface.
|
|
33
|
+
- `src/store.ts` is part of the package's public or composition surface.
|
|
34
|
+
- `src/types.ts` is shared public type definitions.
|
|
35
|
+
|
|
36
|
+
## Public Entry Points
|
|
37
|
+
|
|
38
|
+
- Export `.` resolves through `./src/index.ts`.
|
|
39
|
+
- Export `./eval` resolves through `./src/eval/index.ts`.
|
|
40
|
+
- Export `./eval/runner` resolves through `./src/eval/runner.ts`.
|
|
41
|
+
- Export `./eval/types` resolves through `./src/eval/types.ts`.
|
|
42
|
+
- Export `./in-memory-store` resolves through `./src/in-memory-store.ts`.
|
|
43
|
+
- Export `./ingesters` resolves through `./src/ingesters/index.ts`.
|
|
44
|
+
- Export `./ingesters/artificial-analysis` resolves through `./src/ingesters/artificial-analysis.ts`.
|
|
45
|
+
- Export `./ingesters/chatbot-arena` resolves through `./src/ingesters/chatbot-arena.ts`.
|
|
46
|
+
- Export `./ingesters/fetch-utils` resolves through `./src/ingesters/fetch-utils.ts`.
|
|
47
|
+
- Export `./ingesters/open-llm-leaderboard` resolves through `./src/ingesters/open-llm-leaderboard.ts`.
|
|
48
|
+
- The package publishes 19 total export subpaths; keep docs aligned with `package.json`.
|
|
49
|
+
|
|
50
|
+
## Local Commands
|
|
51
|
+
|
|
52
|
+
- `bun run dev` — contractspec-bun-build dev
|
|
53
|
+
- `bun run build` — bun run prebuild && bun run build:bundle && bun run build:types
|
|
54
|
+
- `bun run test` — bun test --pass-with-no-tests
|
|
55
|
+
- `bun run lint` — bun lint:fix
|
|
56
|
+
- `bun run lint:check` — biome check .
|
|
57
|
+
- `bun run lint:fix` — biome check --write --unsafe --only=nursery/useSortedClasses . && biome check --write .
|
|
58
|
+
- `bun run typecheck` — tsc --noEmit
|
|
59
|
+
- `bun run publish:pkg` — bun publish --tolerate-republish --ignore-scripts --verbose
|
|
60
|
+
- `bun run publish:pkg:canary` — bun publish:pkg --tag canary
|
|
61
|
+
- `bun run clean` — rimraf dist .turbo
|
|
62
|
+
- `bun run build:bundle` — contractspec-bun-build transpile
|
|
63
|
+
- `bun run build:types` — contractspec-bun-build types
|
|
64
|
+
- `bun run prebuild` — contractspec-bun-build prebuild
|
|
31
65
|
|
|
32
|
-
|
|
33
|
-
const registry = createDefaultIngesterRegistry();
|
|
66
|
+
## Recent Updates
|
|
34
67
|
|
|
35
|
-
|
|
36
|
-
|
|
68
|
+
- Replace eslint+prettier by biomejs to optimize speed.
|
|
69
|
+
- Resolve lint, build, and type errors across nine packages.
|
|
70
|
+
- Add first-class transport, auth, versioning, and BYOK support across all integrations.
|
|
71
|
+
- Add AI provider ranking system with ranking-driven model selection.
|
|
37
72
|
|
|
38
|
-
|
|
39
|
-
await store.addBenchmarkResult(result);
|
|
40
|
-
}
|
|
73
|
+
## Notes
|
|
41
74
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
75
|
+
- Store interface is the adapter boundary — do not leak implementation details.
|
|
76
|
+
- Scoring algorithms must stay deterministic (no randomness, no side effects).
|
|
77
|
+
- Benchmark dimension enum is shared across ingesters and scoring — keep in sync.
|
|
@@ -249,73 +249,6 @@ function mapOrganizationToProvider(org) {
|
|
|
249
249
|
return org;
|
|
250
250
|
}
|
|
251
251
|
|
|
252
|
-
// src/ingesters/swe-bench.ts
|
|
253
|
-
var DEFAULT_SWE_BENCH_URL = "https://raw.githubusercontent.com/princeton-nlp/SWE-bench/main/docs/leaderboard.json";
|
|
254
|
-
var sweBenchIngester = {
|
|
255
|
-
source: "swe-bench",
|
|
256
|
-
displayName: "SWE-bench",
|
|
257
|
-
description: "Software engineering task completion rates from SWE-bench.",
|
|
258
|
-
async ingest(options) {
|
|
259
|
-
if (options?.dimensions?.length && !options.dimensions.includes("coding")) {
|
|
260
|
-
return [];
|
|
261
|
-
}
|
|
262
|
-
const url = options?.sourceUrl ?? DEFAULT_SWE_BENCH_URL;
|
|
263
|
-
const response = await fetchWithRetry(url, { fetch: options?.fetch });
|
|
264
|
-
const text = await response.text();
|
|
265
|
-
const data = parseJsonSafe(text, "SWE-bench");
|
|
266
|
-
const now = new Date;
|
|
267
|
-
let entries = data.filter((entry) => entry.model && entry.resolved_rate != null);
|
|
268
|
-
if (options?.modelFilter?.length) {
|
|
269
|
-
const filterSet = new Set(options.modelFilter);
|
|
270
|
-
entries = entries.filter((e) => filterSet.has(e.model.toLowerCase().replace(/\s+/g, "-")));
|
|
271
|
-
}
|
|
272
|
-
if (options?.maxResults) {
|
|
273
|
-
entries = entries.slice(0, options.maxResults);
|
|
274
|
-
}
|
|
275
|
-
let results = entries.map((entry) => {
|
|
276
|
-
const modelId = entry.model.toLowerCase().replace(/\s+/g, "-");
|
|
277
|
-
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
278
|
-
return {
|
|
279
|
-
id: `swe-bench:${modelId}:coding`,
|
|
280
|
-
modelId,
|
|
281
|
-
providerKey: mapOrganizationToProvider2(org),
|
|
282
|
-
source: "swe-bench",
|
|
283
|
-
dimension: "coding",
|
|
284
|
-
score: Math.max(0, Math.min(100, entry.resolved_rate)),
|
|
285
|
-
rawScore: entry.resolved_rate,
|
|
286
|
-
metadata: {
|
|
287
|
-
organization: entry.organization,
|
|
288
|
-
date: entry.date
|
|
289
|
-
},
|
|
290
|
-
measuredAt: entry.date ? new Date(entry.date) : now,
|
|
291
|
-
ingestedAt: now
|
|
292
|
-
};
|
|
293
|
-
});
|
|
294
|
-
const { fromDate, toDate } = options ?? {};
|
|
295
|
-
if (fromDate) {
|
|
296
|
-
results = results.filter((r) => r.measuredAt >= fromDate);
|
|
297
|
-
}
|
|
298
|
-
if (toDate) {
|
|
299
|
-
results = results.filter((r) => r.measuredAt <= toDate);
|
|
300
|
-
}
|
|
301
|
-
return results;
|
|
302
|
-
}
|
|
303
|
-
};
|
|
304
|
-
function mapOrganizationToProvider2(org) {
|
|
305
|
-
const normalized = org.toLowerCase();
|
|
306
|
-
if (normalized.includes("openai"))
|
|
307
|
-
return "openai";
|
|
308
|
-
if (normalized.includes("anthropic"))
|
|
309
|
-
return "anthropic";
|
|
310
|
-
if (normalized.includes("google") || normalized.includes("deepmind"))
|
|
311
|
-
return "gemini";
|
|
312
|
-
if (normalized.includes("mistral"))
|
|
313
|
-
return "mistral";
|
|
314
|
-
if (normalized.includes("meta"))
|
|
315
|
-
return "meta";
|
|
316
|
-
return org;
|
|
317
|
-
}
|
|
318
|
-
|
|
319
252
|
// src/ingesters/open-llm-leaderboard.ts
|
|
320
253
|
var DEFAULT_HF_URL = "https://huggingface.co/api/spaces/open-llm-leaderboard/open_llm_leaderboard/results";
|
|
321
254
|
var BENCHMARK_MAPPINGS = [
|
|
@@ -344,7 +277,7 @@ var openLlmLeaderboardIngester = {
|
|
|
344
277
|
for (const entry of entries) {
|
|
345
278
|
const modelId = entry.model_name.toLowerCase().replace(/\s+/g, "-");
|
|
346
279
|
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
347
|
-
const providerKey = mapOrganizationToProvider3(org);
|
|
280
|
+
const providerKey = mapOrganizationToProvider2(org);
|
|
348
281
|
for (const mapping of BENCHMARK_MAPPINGS) {
|
|
349
282
|
if (dims && !dims.has(mapping.dimension))
|
|
350
283
|
continue;
|
|
@@ -371,6 +304,73 @@ var openLlmLeaderboardIngester = {
|
|
|
371
304
|
return options?.maxResults ? results.slice(0, options.maxResults) : results;
|
|
372
305
|
}
|
|
373
306
|
};
|
|
307
|
+
function mapOrganizationToProvider2(org) {
|
|
308
|
+
const normalized = org.toLowerCase();
|
|
309
|
+
if (normalized.includes("openai"))
|
|
310
|
+
return "openai";
|
|
311
|
+
if (normalized.includes("anthropic"))
|
|
312
|
+
return "anthropic";
|
|
313
|
+
if (normalized.includes("google") || normalized.includes("deepmind"))
|
|
314
|
+
return "gemini";
|
|
315
|
+
if (normalized.includes("mistral"))
|
|
316
|
+
return "mistral";
|
|
317
|
+
if (normalized.includes("meta"))
|
|
318
|
+
return "meta";
|
|
319
|
+
return org;
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
// src/ingesters/swe-bench.ts
|
|
323
|
+
var DEFAULT_SWE_BENCH_URL = "https://raw.githubusercontent.com/princeton-nlp/SWE-bench/main/docs/leaderboard.json";
|
|
324
|
+
var sweBenchIngester = {
|
|
325
|
+
source: "swe-bench",
|
|
326
|
+
displayName: "SWE-bench",
|
|
327
|
+
description: "Software engineering task completion rates from SWE-bench.",
|
|
328
|
+
async ingest(options) {
|
|
329
|
+
if (options?.dimensions?.length && !options.dimensions.includes("coding")) {
|
|
330
|
+
return [];
|
|
331
|
+
}
|
|
332
|
+
const url = options?.sourceUrl ?? DEFAULT_SWE_BENCH_URL;
|
|
333
|
+
const response = await fetchWithRetry(url, { fetch: options?.fetch });
|
|
334
|
+
const text = await response.text();
|
|
335
|
+
const data = parseJsonSafe(text, "SWE-bench");
|
|
336
|
+
const now = new Date;
|
|
337
|
+
let entries = data.filter((entry) => entry.model && entry.resolved_rate != null);
|
|
338
|
+
if (options?.modelFilter?.length) {
|
|
339
|
+
const filterSet = new Set(options.modelFilter);
|
|
340
|
+
entries = entries.filter((e) => filterSet.has(e.model.toLowerCase().replace(/\s+/g, "-")));
|
|
341
|
+
}
|
|
342
|
+
if (options?.maxResults) {
|
|
343
|
+
entries = entries.slice(0, options.maxResults);
|
|
344
|
+
}
|
|
345
|
+
let results = entries.map((entry) => {
|
|
346
|
+
const modelId = entry.model.toLowerCase().replace(/\s+/g, "-");
|
|
347
|
+
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
348
|
+
return {
|
|
349
|
+
id: `swe-bench:${modelId}:coding`,
|
|
350
|
+
modelId,
|
|
351
|
+
providerKey: mapOrganizationToProvider3(org),
|
|
352
|
+
source: "swe-bench",
|
|
353
|
+
dimension: "coding",
|
|
354
|
+
score: Math.max(0, Math.min(100, entry.resolved_rate)),
|
|
355
|
+
rawScore: entry.resolved_rate,
|
|
356
|
+
metadata: {
|
|
357
|
+
organization: entry.organization,
|
|
358
|
+
date: entry.date
|
|
359
|
+
},
|
|
360
|
+
measuredAt: entry.date ? new Date(entry.date) : now,
|
|
361
|
+
ingestedAt: now
|
|
362
|
+
};
|
|
363
|
+
});
|
|
364
|
+
const { fromDate, toDate } = options ?? {};
|
|
365
|
+
if (fromDate) {
|
|
366
|
+
results = results.filter((r) => r.measuredAt >= fromDate);
|
|
367
|
+
}
|
|
368
|
+
if (toDate) {
|
|
369
|
+
results = results.filter((r) => r.measuredAt <= toDate);
|
|
370
|
+
}
|
|
371
|
+
return results;
|
|
372
|
+
}
|
|
373
|
+
};
|
|
374
374
|
function mapOrganizationToProvider3(org) {
|
|
375
375
|
const normalized = org.toLowerCase();
|
|
376
376
|
if (normalized.includes("openai"))
|
|
@@ -249,73 +249,6 @@ function mapOrganizationToProvider(org) {
|
|
|
249
249
|
return org;
|
|
250
250
|
}
|
|
251
251
|
|
|
252
|
-
// src/ingesters/swe-bench.ts
|
|
253
|
-
var DEFAULT_SWE_BENCH_URL = "https://raw.githubusercontent.com/princeton-nlp/SWE-bench/main/docs/leaderboard.json";
|
|
254
|
-
var sweBenchIngester = {
|
|
255
|
-
source: "swe-bench",
|
|
256
|
-
displayName: "SWE-bench",
|
|
257
|
-
description: "Software engineering task completion rates from SWE-bench.",
|
|
258
|
-
async ingest(options) {
|
|
259
|
-
if (options?.dimensions?.length && !options.dimensions.includes("coding")) {
|
|
260
|
-
return [];
|
|
261
|
-
}
|
|
262
|
-
const url = options?.sourceUrl ?? DEFAULT_SWE_BENCH_URL;
|
|
263
|
-
const response = await fetchWithRetry(url, { fetch: options?.fetch });
|
|
264
|
-
const text = await response.text();
|
|
265
|
-
const data = parseJsonSafe(text, "SWE-bench");
|
|
266
|
-
const now = new Date;
|
|
267
|
-
let entries = data.filter((entry) => entry.model && entry.resolved_rate != null);
|
|
268
|
-
if (options?.modelFilter?.length) {
|
|
269
|
-
const filterSet = new Set(options.modelFilter);
|
|
270
|
-
entries = entries.filter((e) => filterSet.has(e.model.toLowerCase().replace(/\s+/g, "-")));
|
|
271
|
-
}
|
|
272
|
-
if (options?.maxResults) {
|
|
273
|
-
entries = entries.slice(0, options.maxResults);
|
|
274
|
-
}
|
|
275
|
-
let results = entries.map((entry) => {
|
|
276
|
-
const modelId = entry.model.toLowerCase().replace(/\s+/g, "-");
|
|
277
|
-
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
278
|
-
return {
|
|
279
|
-
id: `swe-bench:${modelId}:coding`,
|
|
280
|
-
modelId,
|
|
281
|
-
providerKey: mapOrganizationToProvider2(org),
|
|
282
|
-
source: "swe-bench",
|
|
283
|
-
dimension: "coding",
|
|
284
|
-
score: Math.max(0, Math.min(100, entry.resolved_rate)),
|
|
285
|
-
rawScore: entry.resolved_rate,
|
|
286
|
-
metadata: {
|
|
287
|
-
organization: entry.organization,
|
|
288
|
-
date: entry.date
|
|
289
|
-
},
|
|
290
|
-
measuredAt: entry.date ? new Date(entry.date) : now,
|
|
291
|
-
ingestedAt: now
|
|
292
|
-
};
|
|
293
|
-
});
|
|
294
|
-
const { fromDate, toDate } = options ?? {};
|
|
295
|
-
if (fromDate) {
|
|
296
|
-
results = results.filter((r) => r.measuredAt >= fromDate);
|
|
297
|
-
}
|
|
298
|
-
if (toDate) {
|
|
299
|
-
results = results.filter((r) => r.measuredAt <= toDate);
|
|
300
|
-
}
|
|
301
|
-
return results;
|
|
302
|
-
}
|
|
303
|
-
};
|
|
304
|
-
function mapOrganizationToProvider2(org) {
|
|
305
|
-
const normalized = org.toLowerCase();
|
|
306
|
-
if (normalized.includes("openai"))
|
|
307
|
-
return "openai";
|
|
308
|
-
if (normalized.includes("anthropic"))
|
|
309
|
-
return "anthropic";
|
|
310
|
-
if (normalized.includes("google") || normalized.includes("deepmind"))
|
|
311
|
-
return "gemini";
|
|
312
|
-
if (normalized.includes("mistral"))
|
|
313
|
-
return "mistral";
|
|
314
|
-
if (normalized.includes("meta"))
|
|
315
|
-
return "meta";
|
|
316
|
-
return org;
|
|
317
|
-
}
|
|
318
|
-
|
|
319
252
|
// src/ingesters/open-llm-leaderboard.ts
|
|
320
253
|
var DEFAULT_HF_URL = "https://huggingface.co/api/spaces/open-llm-leaderboard/open_llm_leaderboard/results";
|
|
321
254
|
var BENCHMARK_MAPPINGS = [
|
|
@@ -344,7 +277,7 @@ var openLlmLeaderboardIngester = {
|
|
|
344
277
|
for (const entry of entries) {
|
|
345
278
|
const modelId = entry.model_name.toLowerCase().replace(/\s+/g, "-");
|
|
346
279
|
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
347
|
-
const providerKey = mapOrganizationToProvider3(org);
|
|
280
|
+
const providerKey = mapOrganizationToProvider2(org);
|
|
348
281
|
for (const mapping of BENCHMARK_MAPPINGS) {
|
|
349
282
|
if (dims && !dims.has(mapping.dimension))
|
|
350
283
|
continue;
|
|
@@ -371,6 +304,73 @@ var openLlmLeaderboardIngester = {
|
|
|
371
304
|
return options?.maxResults ? results.slice(0, options.maxResults) : results;
|
|
372
305
|
}
|
|
373
306
|
};
|
|
307
|
+
function mapOrganizationToProvider2(org) {
|
|
308
|
+
const normalized = org.toLowerCase();
|
|
309
|
+
if (normalized.includes("openai"))
|
|
310
|
+
return "openai";
|
|
311
|
+
if (normalized.includes("anthropic"))
|
|
312
|
+
return "anthropic";
|
|
313
|
+
if (normalized.includes("google") || normalized.includes("deepmind"))
|
|
314
|
+
return "gemini";
|
|
315
|
+
if (normalized.includes("mistral"))
|
|
316
|
+
return "mistral";
|
|
317
|
+
if (normalized.includes("meta"))
|
|
318
|
+
return "meta";
|
|
319
|
+
return org;
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
// src/ingesters/swe-bench.ts
|
|
323
|
+
var DEFAULT_SWE_BENCH_URL = "https://raw.githubusercontent.com/princeton-nlp/SWE-bench/main/docs/leaderboard.json";
|
|
324
|
+
var sweBenchIngester = {
|
|
325
|
+
source: "swe-bench",
|
|
326
|
+
displayName: "SWE-bench",
|
|
327
|
+
description: "Software engineering task completion rates from SWE-bench.",
|
|
328
|
+
async ingest(options) {
|
|
329
|
+
if (options?.dimensions?.length && !options.dimensions.includes("coding")) {
|
|
330
|
+
return [];
|
|
331
|
+
}
|
|
332
|
+
const url = options?.sourceUrl ?? DEFAULT_SWE_BENCH_URL;
|
|
333
|
+
const response = await fetchWithRetry(url, { fetch: options?.fetch });
|
|
334
|
+
const text = await response.text();
|
|
335
|
+
const data = parseJsonSafe(text, "SWE-bench");
|
|
336
|
+
const now = new Date;
|
|
337
|
+
let entries = data.filter((entry) => entry.model && entry.resolved_rate != null);
|
|
338
|
+
if (options?.modelFilter?.length) {
|
|
339
|
+
const filterSet = new Set(options.modelFilter);
|
|
340
|
+
entries = entries.filter((e) => filterSet.has(e.model.toLowerCase().replace(/\s+/g, "-")));
|
|
341
|
+
}
|
|
342
|
+
if (options?.maxResults) {
|
|
343
|
+
entries = entries.slice(0, options.maxResults);
|
|
344
|
+
}
|
|
345
|
+
let results = entries.map((entry) => {
|
|
346
|
+
const modelId = entry.model.toLowerCase().replace(/\s+/g, "-");
|
|
347
|
+
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
348
|
+
return {
|
|
349
|
+
id: `swe-bench:${modelId}:coding`,
|
|
350
|
+
modelId,
|
|
351
|
+
providerKey: mapOrganizationToProvider3(org),
|
|
352
|
+
source: "swe-bench",
|
|
353
|
+
dimension: "coding",
|
|
354
|
+
score: Math.max(0, Math.min(100, entry.resolved_rate)),
|
|
355
|
+
rawScore: entry.resolved_rate,
|
|
356
|
+
metadata: {
|
|
357
|
+
organization: entry.organization,
|
|
358
|
+
date: entry.date
|
|
359
|
+
},
|
|
360
|
+
measuredAt: entry.date ? new Date(entry.date) : now,
|
|
361
|
+
ingestedAt: now
|
|
362
|
+
};
|
|
363
|
+
});
|
|
364
|
+
const { fromDate, toDate } = options ?? {};
|
|
365
|
+
if (fromDate) {
|
|
366
|
+
results = results.filter((r) => r.measuredAt >= fromDate);
|
|
367
|
+
}
|
|
368
|
+
if (toDate) {
|
|
369
|
+
results = results.filter((r) => r.measuredAt <= toDate);
|
|
370
|
+
}
|
|
371
|
+
return results;
|
|
372
|
+
}
|
|
373
|
+
};
|
|
374
374
|
function mapOrganizationToProvider3(org) {
|
|
375
375
|
const normalized = org.toLowerCase();
|
|
376
376
|
if (normalized.includes("openai"))
|
package/dist/eval/index.d.ts
CHANGED
package/dist/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
export type { BenchmarkDimension, BenchmarkSource, BenchmarkResult, DimensionScore, ModelRanking, ModelProfile, BenchmarkResultQuery, BenchmarkResultListResult, RankingQuery, RankingListResult, IngestionRun, DimensionWeightConfig, } from './types';
|
|
2
|
-
export { BENCHMARK_DIMENSIONS } from './types';
|
|
3
|
-
export type { ProviderRankingStore } from './store';
|
|
4
1
|
export { InMemoryProviderRankingStore } from './in-memory-store';
|
|
2
|
+
export type { ProviderRankingStore } from './store';
|
|
3
|
+
export type { BenchmarkDimension, BenchmarkResult, BenchmarkResultListResult, BenchmarkResultQuery, BenchmarkSource, DimensionScore, DimensionWeightConfig, IngestionRun, ModelProfile, ModelRanking, RankingListResult, RankingQuery, } from './types';
|
|
4
|
+
export { BENCHMARK_DIMENSIONS } from './types';
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
export type { BenchmarkIngester, IngesterOptions } from './types';
|
|
2
|
-
export { chatbotArenaIngester } from './chatbot-arena';
|
|
3
1
|
export { artificialAnalysisIngester } from './artificial-analysis';
|
|
4
|
-
export { sweBenchIngester } from './swe-bench';
|
|
5
|
-
export { openLlmLeaderboardIngester } from './open-llm-leaderboard';
|
|
6
|
-
export { IngesterRegistry, createDefaultIngesterRegistry } from './registry';
|
|
2
|
+
export { chatbotArenaIngester } from './chatbot-arena';
|
|
7
3
|
export { fetchWithRetry, parseJsonSafe } from './fetch-utils';
|
|
4
|
+
export { openLlmLeaderboardIngester } from './open-llm-leaderboard';
|
|
5
|
+
export { createDefaultIngesterRegistry, IngesterRegistry } from './registry';
|
|
6
|
+
export { sweBenchIngester } from './swe-bench';
|
|
7
|
+
export type { BenchmarkIngester, IngesterOptions } from './types';
|
package/dist/ingesters/index.js
CHANGED
|
@@ -250,73 +250,6 @@ function mapOrganizationToProvider(org) {
|
|
|
250
250
|
return org;
|
|
251
251
|
}
|
|
252
252
|
|
|
253
|
-
// src/ingesters/swe-bench.ts
|
|
254
|
-
var DEFAULT_SWE_BENCH_URL = "https://raw.githubusercontent.com/princeton-nlp/SWE-bench/main/docs/leaderboard.json";
|
|
255
|
-
var sweBenchIngester = {
|
|
256
|
-
source: "swe-bench",
|
|
257
|
-
displayName: "SWE-bench",
|
|
258
|
-
description: "Software engineering task completion rates from SWE-bench.",
|
|
259
|
-
async ingest(options) {
|
|
260
|
-
if (options?.dimensions?.length && !options.dimensions.includes("coding")) {
|
|
261
|
-
return [];
|
|
262
|
-
}
|
|
263
|
-
const url = options?.sourceUrl ?? DEFAULT_SWE_BENCH_URL;
|
|
264
|
-
const response = await fetchWithRetry(url, { fetch: options?.fetch });
|
|
265
|
-
const text = await response.text();
|
|
266
|
-
const data = parseJsonSafe(text, "SWE-bench");
|
|
267
|
-
const now = new Date;
|
|
268
|
-
let entries = data.filter((entry) => entry.model && entry.resolved_rate != null);
|
|
269
|
-
if (options?.modelFilter?.length) {
|
|
270
|
-
const filterSet = new Set(options.modelFilter);
|
|
271
|
-
entries = entries.filter((e) => filterSet.has(e.model.toLowerCase().replace(/\s+/g, "-")));
|
|
272
|
-
}
|
|
273
|
-
if (options?.maxResults) {
|
|
274
|
-
entries = entries.slice(0, options.maxResults);
|
|
275
|
-
}
|
|
276
|
-
let results = entries.map((entry) => {
|
|
277
|
-
const modelId = entry.model.toLowerCase().replace(/\s+/g, "-");
|
|
278
|
-
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
279
|
-
return {
|
|
280
|
-
id: `swe-bench:${modelId}:coding`,
|
|
281
|
-
modelId,
|
|
282
|
-
providerKey: mapOrganizationToProvider2(org),
|
|
283
|
-
source: "swe-bench",
|
|
284
|
-
dimension: "coding",
|
|
285
|
-
score: Math.max(0, Math.min(100, entry.resolved_rate)),
|
|
286
|
-
rawScore: entry.resolved_rate,
|
|
287
|
-
metadata: {
|
|
288
|
-
organization: entry.organization,
|
|
289
|
-
date: entry.date
|
|
290
|
-
},
|
|
291
|
-
measuredAt: entry.date ? new Date(entry.date) : now,
|
|
292
|
-
ingestedAt: now
|
|
293
|
-
};
|
|
294
|
-
});
|
|
295
|
-
const { fromDate, toDate } = options ?? {};
|
|
296
|
-
if (fromDate) {
|
|
297
|
-
results = results.filter((r) => r.measuredAt >= fromDate);
|
|
298
|
-
}
|
|
299
|
-
if (toDate) {
|
|
300
|
-
results = results.filter((r) => r.measuredAt <= toDate);
|
|
301
|
-
}
|
|
302
|
-
return results;
|
|
303
|
-
}
|
|
304
|
-
};
|
|
305
|
-
function mapOrganizationToProvider2(org) {
|
|
306
|
-
const normalized = org.toLowerCase();
|
|
307
|
-
if (normalized.includes("openai"))
|
|
308
|
-
return "openai";
|
|
309
|
-
if (normalized.includes("anthropic"))
|
|
310
|
-
return "anthropic";
|
|
311
|
-
if (normalized.includes("google") || normalized.includes("deepmind"))
|
|
312
|
-
return "gemini";
|
|
313
|
-
if (normalized.includes("mistral"))
|
|
314
|
-
return "mistral";
|
|
315
|
-
if (normalized.includes("meta"))
|
|
316
|
-
return "meta";
|
|
317
|
-
return org;
|
|
318
|
-
}
|
|
319
|
-
|
|
320
253
|
// src/ingesters/open-llm-leaderboard.ts
|
|
321
254
|
var DEFAULT_HF_URL = "https://huggingface.co/api/spaces/open-llm-leaderboard/open_llm_leaderboard/results";
|
|
322
255
|
var BENCHMARK_MAPPINGS = [
|
|
@@ -345,7 +278,7 @@ var openLlmLeaderboardIngester = {
|
|
|
345
278
|
for (const entry of entries) {
|
|
346
279
|
const modelId = entry.model_name.toLowerCase().replace(/\s+/g, "-");
|
|
347
280
|
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
348
|
-
const providerKey = mapOrganizationToProvider3(org);
|
|
281
|
+
const providerKey = mapOrganizationToProvider2(org);
|
|
349
282
|
for (const mapping of BENCHMARK_MAPPINGS) {
|
|
350
283
|
if (dims && !dims.has(mapping.dimension))
|
|
351
284
|
continue;
|
|
@@ -372,6 +305,73 @@ var openLlmLeaderboardIngester = {
|
|
|
372
305
|
return options?.maxResults ? results.slice(0, options.maxResults) : results;
|
|
373
306
|
}
|
|
374
307
|
};
|
|
308
|
+
function mapOrganizationToProvider2(org) {
|
|
309
|
+
const normalized = org.toLowerCase();
|
|
310
|
+
if (normalized.includes("openai"))
|
|
311
|
+
return "openai";
|
|
312
|
+
if (normalized.includes("anthropic"))
|
|
313
|
+
return "anthropic";
|
|
314
|
+
if (normalized.includes("google") || normalized.includes("deepmind"))
|
|
315
|
+
return "gemini";
|
|
316
|
+
if (normalized.includes("mistral"))
|
|
317
|
+
return "mistral";
|
|
318
|
+
if (normalized.includes("meta"))
|
|
319
|
+
return "meta";
|
|
320
|
+
return org;
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
// src/ingesters/swe-bench.ts
|
|
324
|
+
var DEFAULT_SWE_BENCH_URL = "https://raw.githubusercontent.com/princeton-nlp/SWE-bench/main/docs/leaderboard.json";
|
|
325
|
+
var sweBenchIngester = {
|
|
326
|
+
source: "swe-bench",
|
|
327
|
+
displayName: "SWE-bench",
|
|
328
|
+
description: "Software engineering task completion rates from SWE-bench.",
|
|
329
|
+
async ingest(options) {
|
|
330
|
+
if (options?.dimensions?.length && !options.dimensions.includes("coding")) {
|
|
331
|
+
return [];
|
|
332
|
+
}
|
|
333
|
+
const url = options?.sourceUrl ?? DEFAULT_SWE_BENCH_URL;
|
|
334
|
+
const response = await fetchWithRetry(url, { fetch: options?.fetch });
|
|
335
|
+
const text = await response.text();
|
|
336
|
+
const data = parseJsonSafe(text, "SWE-bench");
|
|
337
|
+
const now = new Date;
|
|
338
|
+
let entries = data.filter((entry) => entry.model && entry.resolved_rate != null);
|
|
339
|
+
if (options?.modelFilter?.length) {
|
|
340
|
+
const filterSet = new Set(options.modelFilter);
|
|
341
|
+
entries = entries.filter((e) => filterSet.has(e.model.toLowerCase().replace(/\s+/g, "-")));
|
|
342
|
+
}
|
|
343
|
+
if (options?.maxResults) {
|
|
344
|
+
entries = entries.slice(0, options.maxResults);
|
|
345
|
+
}
|
|
346
|
+
let results = entries.map((entry) => {
|
|
347
|
+
const modelId = entry.model.toLowerCase().replace(/\s+/g, "-");
|
|
348
|
+
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
349
|
+
return {
|
|
350
|
+
id: `swe-bench:${modelId}:coding`,
|
|
351
|
+
modelId,
|
|
352
|
+
providerKey: mapOrganizationToProvider3(org),
|
|
353
|
+
source: "swe-bench",
|
|
354
|
+
dimension: "coding",
|
|
355
|
+
score: Math.max(0, Math.min(100, entry.resolved_rate)),
|
|
356
|
+
rawScore: entry.resolved_rate,
|
|
357
|
+
metadata: {
|
|
358
|
+
organization: entry.organization,
|
|
359
|
+
date: entry.date
|
|
360
|
+
},
|
|
361
|
+
measuredAt: entry.date ? new Date(entry.date) : now,
|
|
362
|
+
ingestedAt: now
|
|
363
|
+
};
|
|
364
|
+
});
|
|
365
|
+
const { fromDate, toDate } = options ?? {};
|
|
366
|
+
if (fromDate) {
|
|
367
|
+
results = results.filter((r) => r.measuredAt >= fromDate);
|
|
368
|
+
}
|
|
369
|
+
if (toDate) {
|
|
370
|
+
results = results.filter((r) => r.measuredAt <= toDate);
|
|
371
|
+
}
|
|
372
|
+
return results;
|
|
373
|
+
}
|
|
374
|
+
};
|
|
375
375
|
function mapOrganizationToProvider3(org) {
|
|
376
376
|
const normalized = org.toLowerCase();
|
|
377
377
|
if (normalized.includes("openai"))
|
|
@@ -250,73 +250,6 @@ function mapOrganizationToProvider(org) {
|
|
|
250
250
|
return org;
|
|
251
251
|
}
|
|
252
252
|
|
|
253
|
-
// src/ingesters/swe-bench.ts
|
|
254
|
-
var DEFAULT_SWE_BENCH_URL = "https://raw.githubusercontent.com/princeton-nlp/SWE-bench/main/docs/leaderboard.json";
|
|
255
|
-
var sweBenchIngester = {
|
|
256
|
-
source: "swe-bench",
|
|
257
|
-
displayName: "SWE-bench",
|
|
258
|
-
description: "Software engineering task completion rates from SWE-bench.",
|
|
259
|
-
async ingest(options) {
|
|
260
|
-
if (options?.dimensions?.length && !options.dimensions.includes("coding")) {
|
|
261
|
-
return [];
|
|
262
|
-
}
|
|
263
|
-
const url = options?.sourceUrl ?? DEFAULT_SWE_BENCH_URL;
|
|
264
|
-
const response = await fetchWithRetry(url, { fetch: options?.fetch });
|
|
265
|
-
const text = await response.text();
|
|
266
|
-
const data = parseJsonSafe(text, "SWE-bench");
|
|
267
|
-
const now = new Date;
|
|
268
|
-
let entries = data.filter((entry) => entry.model && entry.resolved_rate != null);
|
|
269
|
-
if (options?.modelFilter?.length) {
|
|
270
|
-
const filterSet = new Set(options.modelFilter);
|
|
271
|
-
entries = entries.filter((e) => filterSet.has(e.model.toLowerCase().replace(/\s+/g, "-")));
|
|
272
|
-
}
|
|
273
|
-
if (options?.maxResults) {
|
|
274
|
-
entries = entries.slice(0, options.maxResults);
|
|
275
|
-
}
|
|
276
|
-
let results = entries.map((entry) => {
|
|
277
|
-
const modelId = entry.model.toLowerCase().replace(/\s+/g, "-");
|
|
278
|
-
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
279
|
-
return {
|
|
280
|
-
id: `swe-bench:${modelId}:coding`,
|
|
281
|
-
modelId,
|
|
282
|
-
providerKey: mapOrganizationToProvider2(org),
|
|
283
|
-
source: "swe-bench",
|
|
284
|
-
dimension: "coding",
|
|
285
|
-
score: Math.max(0, Math.min(100, entry.resolved_rate)),
|
|
286
|
-
rawScore: entry.resolved_rate,
|
|
287
|
-
metadata: {
|
|
288
|
-
organization: entry.organization,
|
|
289
|
-
date: entry.date
|
|
290
|
-
},
|
|
291
|
-
measuredAt: entry.date ? new Date(entry.date) : now,
|
|
292
|
-
ingestedAt: now
|
|
293
|
-
};
|
|
294
|
-
});
|
|
295
|
-
const { fromDate, toDate } = options ?? {};
|
|
296
|
-
if (fromDate) {
|
|
297
|
-
results = results.filter((r) => r.measuredAt >= fromDate);
|
|
298
|
-
}
|
|
299
|
-
if (toDate) {
|
|
300
|
-
results = results.filter((r) => r.measuredAt <= toDate);
|
|
301
|
-
}
|
|
302
|
-
return results;
|
|
303
|
-
}
|
|
304
|
-
};
|
|
305
|
-
function mapOrganizationToProvider2(org) {
|
|
306
|
-
const normalized = org.toLowerCase();
|
|
307
|
-
if (normalized.includes("openai"))
|
|
308
|
-
return "openai";
|
|
309
|
-
if (normalized.includes("anthropic"))
|
|
310
|
-
return "anthropic";
|
|
311
|
-
if (normalized.includes("google") || normalized.includes("deepmind"))
|
|
312
|
-
return "gemini";
|
|
313
|
-
if (normalized.includes("mistral"))
|
|
314
|
-
return "mistral";
|
|
315
|
-
if (normalized.includes("meta"))
|
|
316
|
-
return "meta";
|
|
317
|
-
return org;
|
|
318
|
-
}
|
|
319
|
-
|
|
320
253
|
// src/ingesters/open-llm-leaderboard.ts
|
|
321
254
|
var DEFAULT_HF_URL = "https://huggingface.co/api/spaces/open-llm-leaderboard/open_llm_leaderboard/results";
|
|
322
255
|
var BENCHMARK_MAPPINGS = [
|
|
@@ -345,7 +278,7 @@ var openLlmLeaderboardIngester = {
|
|
|
345
278
|
for (const entry of entries) {
|
|
346
279
|
const modelId = entry.model_name.toLowerCase().replace(/\s+/g, "-");
|
|
347
280
|
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
348
|
-
const providerKey = mapOrganizationToProvider3(org);
|
|
281
|
+
const providerKey = mapOrganizationToProvider2(org);
|
|
349
282
|
for (const mapping of BENCHMARK_MAPPINGS) {
|
|
350
283
|
if (dims && !dims.has(mapping.dimension))
|
|
351
284
|
continue;
|
|
@@ -372,6 +305,73 @@ var openLlmLeaderboardIngester = {
|
|
|
372
305
|
return options?.maxResults ? results.slice(0, options.maxResults) : results;
|
|
373
306
|
}
|
|
374
307
|
};
|
|
308
|
+
function mapOrganizationToProvider2(org) {
|
|
309
|
+
const normalized = org.toLowerCase();
|
|
310
|
+
if (normalized.includes("openai"))
|
|
311
|
+
return "openai";
|
|
312
|
+
if (normalized.includes("anthropic"))
|
|
313
|
+
return "anthropic";
|
|
314
|
+
if (normalized.includes("google") || normalized.includes("deepmind"))
|
|
315
|
+
return "gemini";
|
|
316
|
+
if (normalized.includes("mistral"))
|
|
317
|
+
return "mistral";
|
|
318
|
+
if (normalized.includes("meta"))
|
|
319
|
+
return "meta";
|
|
320
|
+
return org;
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
// src/ingesters/swe-bench.ts
|
|
324
|
+
var DEFAULT_SWE_BENCH_URL = "https://raw.githubusercontent.com/princeton-nlp/SWE-bench/main/docs/leaderboard.json";
|
|
325
|
+
var sweBenchIngester = {
|
|
326
|
+
source: "swe-bench",
|
|
327
|
+
displayName: "SWE-bench",
|
|
328
|
+
description: "Software engineering task completion rates from SWE-bench.",
|
|
329
|
+
async ingest(options) {
|
|
330
|
+
if (options?.dimensions?.length && !options.dimensions.includes("coding")) {
|
|
331
|
+
return [];
|
|
332
|
+
}
|
|
333
|
+
const url = options?.sourceUrl ?? DEFAULT_SWE_BENCH_URL;
|
|
334
|
+
const response = await fetchWithRetry(url, { fetch: options?.fetch });
|
|
335
|
+
const text = await response.text();
|
|
336
|
+
const data = parseJsonSafe(text, "SWE-bench");
|
|
337
|
+
const now = new Date;
|
|
338
|
+
let entries = data.filter((entry) => entry.model && entry.resolved_rate != null);
|
|
339
|
+
if (options?.modelFilter?.length) {
|
|
340
|
+
const filterSet = new Set(options.modelFilter);
|
|
341
|
+
entries = entries.filter((e) => filterSet.has(e.model.toLowerCase().replace(/\s+/g, "-")));
|
|
342
|
+
}
|
|
343
|
+
if (options?.maxResults) {
|
|
344
|
+
entries = entries.slice(0, options.maxResults);
|
|
345
|
+
}
|
|
346
|
+
let results = entries.map((entry) => {
|
|
347
|
+
const modelId = entry.model.toLowerCase().replace(/\s+/g, "-");
|
|
348
|
+
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
349
|
+
return {
|
|
350
|
+
id: `swe-bench:${modelId}:coding`,
|
|
351
|
+
modelId,
|
|
352
|
+
providerKey: mapOrganizationToProvider3(org),
|
|
353
|
+
source: "swe-bench",
|
|
354
|
+
dimension: "coding",
|
|
355
|
+
score: Math.max(0, Math.min(100, entry.resolved_rate)),
|
|
356
|
+
rawScore: entry.resolved_rate,
|
|
357
|
+
metadata: {
|
|
358
|
+
organization: entry.organization,
|
|
359
|
+
date: entry.date
|
|
360
|
+
},
|
|
361
|
+
measuredAt: entry.date ? new Date(entry.date) : now,
|
|
362
|
+
ingestedAt: now
|
|
363
|
+
};
|
|
364
|
+
});
|
|
365
|
+
const { fromDate, toDate } = options ?? {};
|
|
366
|
+
if (fromDate) {
|
|
367
|
+
results = results.filter((r) => r.measuredAt >= fromDate);
|
|
368
|
+
}
|
|
369
|
+
if (toDate) {
|
|
370
|
+
results = results.filter((r) => r.measuredAt <= toDate);
|
|
371
|
+
}
|
|
372
|
+
return results;
|
|
373
|
+
}
|
|
374
|
+
};
|
|
375
375
|
function mapOrganizationToProvider3(org) {
|
|
376
376
|
const normalized = org.toLowerCase();
|
|
377
377
|
if (normalized.includes("openai"))
|
|
@@ -249,73 +249,6 @@ function mapOrganizationToProvider(org) {
|
|
|
249
249
|
return org;
|
|
250
250
|
}
|
|
251
251
|
|
|
252
|
-
// src/ingesters/swe-bench.ts
|
|
253
|
-
var DEFAULT_SWE_BENCH_URL = "https://raw.githubusercontent.com/princeton-nlp/SWE-bench/main/docs/leaderboard.json";
|
|
254
|
-
var sweBenchIngester = {
|
|
255
|
-
source: "swe-bench",
|
|
256
|
-
displayName: "SWE-bench",
|
|
257
|
-
description: "Software engineering task completion rates from SWE-bench.",
|
|
258
|
-
async ingest(options) {
|
|
259
|
-
if (options?.dimensions?.length && !options.dimensions.includes("coding")) {
|
|
260
|
-
return [];
|
|
261
|
-
}
|
|
262
|
-
const url = options?.sourceUrl ?? DEFAULT_SWE_BENCH_URL;
|
|
263
|
-
const response = await fetchWithRetry(url, { fetch: options?.fetch });
|
|
264
|
-
const text = await response.text();
|
|
265
|
-
const data = parseJsonSafe(text, "SWE-bench");
|
|
266
|
-
const now = new Date;
|
|
267
|
-
let entries = data.filter((entry) => entry.model && entry.resolved_rate != null);
|
|
268
|
-
if (options?.modelFilter?.length) {
|
|
269
|
-
const filterSet = new Set(options.modelFilter);
|
|
270
|
-
entries = entries.filter((e) => filterSet.has(e.model.toLowerCase().replace(/\s+/g, "-")));
|
|
271
|
-
}
|
|
272
|
-
if (options?.maxResults) {
|
|
273
|
-
entries = entries.slice(0, options.maxResults);
|
|
274
|
-
}
|
|
275
|
-
let results = entries.map((entry) => {
|
|
276
|
-
const modelId = entry.model.toLowerCase().replace(/\s+/g, "-");
|
|
277
|
-
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
278
|
-
return {
|
|
279
|
-
id: `swe-bench:${modelId}:coding`,
|
|
280
|
-
modelId,
|
|
281
|
-
providerKey: mapOrganizationToProvider2(org),
|
|
282
|
-
source: "swe-bench",
|
|
283
|
-
dimension: "coding",
|
|
284
|
-
score: Math.max(0, Math.min(100, entry.resolved_rate)),
|
|
285
|
-
rawScore: entry.resolved_rate,
|
|
286
|
-
metadata: {
|
|
287
|
-
organization: entry.organization,
|
|
288
|
-
date: entry.date
|
|
289
|
-
},
|
|
290
|
-
measuredAt: entry.date ? new Date(entry.date) : now,
|
|
291
|
-
ingestedAt: now
|
|
292
|
-
};
|
|
293
|
-
});
|
|
294
|
-
const { fromDate, toDate } = options ?? {};
|
|
295
|
-
if (fromDate) {
|
|
296
|
-
results = results.filter((r) => r.measuredAt >= fromDate);
|
|
297
|
-
}
|
|
298
|
-
if (toDate) {
|
|
299
|
-
results = results.filter((r) => r.measuredAt <= toDate);
|
|
300
|
-
}
|
|
301
|
-
return results;
|
|
302
|
-
}
|
|
303
|
-
};
|
|
304
|
-
function mapOrganizationToProvider2(org) {
|
|
305
|
-
const normalized = org.toLowerCase();
|
|
306
|
-
if (normalized.includes("openai"))
|
|
307
|
-
return "openai";
|
|
308
|
-
if (normalized.includes("anthropic"))
|
|
309
|
-
return "anthropic";
|
|
310
|
-
if (normalized.includes("google") || normalized.includes("deepmind"))
|
|
311
|
-
return "gemini";
|
|
312
|
-
if (normalized.includes("mistral"))
|
|
313
|
-
return "mistral";
|
|
314
|
-
if (normalized.includes("meta"))
|
|
315
|
-
return "meta";
|
|
316
|
-
return org;
|
|
317
|
-
}
|
|
318
|
-
|
|
319
252
|
// src/ingesters/open-llm-leaderboard.ts
|
|
320
253
|
var DEFAULT_HF_URL = "https://huggingface.co/api/spaces/open-llm-leaderboard/open_llm_leaderboard/results";
|
|
321
254
|
var BENCHMARK_MAPPINGS = [
|
|
@@ -344,7 +277,7 @@ var openLlmLeaderboardIngester = {
|
|
|
344
277
|
for (const entry of entries) {
|
|
345
278
|
const modelId = entry.model_name.toLowerCase().replace(/\s+/g, "-");
|
|
346
279
|
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
347
|
-
const providerKey = mapOrganizationToProvider3(org);
|
|
280
|
+
const providerKey = mapOrganizationToProvider2(org);
|
|
348
281
|
for (const mapping of BENCHMARK_MAPPINGS) {
|
|
349
282
|
if (dims && !dims.has(mapping.dimension))
|
|
350
283
|
continue;
|
|
@@ -371,6 +304,73 @@ var openLlmLeaderboardIngester = {
|
|
|
371
304
|
return options?.maxResults ? results.slice(0, options.maxResults) : results;
|
|
372
305
|
}
|
|
373
306
|
};
|
|
307
|
+
function mapOrganizationToProvider2(org) {
|
|
308
|
+
const normalized = org.toLowerCase();
|
|
309
|
+
if (normalized.includes("openai"))
|
|
310
|
+
return "openai";
|
|
311
|
+
if (normalized.includes("anthropic"))
|
|
312
|
+
return "anthropic";
|
|
313
|
+
if (normalized.includes("google") || normalized.includes("deepmind"))
|
|
314
|
+
return "gemini";
|
|
315
|
+
if (normalized.includes("mistral"))
|
|
316
|
+
return "mistral";
|
|
317
|
+
if (normalized.includes("meta"))
|
|
318
|
+
return "meta";
|
|
319
|
+
return org;
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
// src/ingesters/swe-bench.ts
|
|
323
|
+
var DEFAULT_SWE_BENCH_URL = "https://raw.githubusercontent.com/princeton-nlp/SWE-bench/main/docs/leaderboard.json";
|
|
324
|
+
var sweBenchIngester = {
|
|
325
|
+
source: "swe-bench",
|
|
326
|
+
displayName: "SWE-bench",
|
|
327
|
+
description: "Software engineering task completion rates from SWE-bench.",
|
|
328
|
+
async ingest(options) {
|
|
329
|
+
if (options?.dimensions?.length && !options.dimensions.includes("coding")) {
|
|
330
|
+
return [];
|
|
331
|
+
}
|
|
332
|
+
const url = options?.sourceUrl ?? DEFAULT_SWE_BENCH_URL;
|
|
333
|
+
const response = await fetchWithRetry(url, { fetch: options?.fetch });
|
|
334
|
+
const text = await response.text();
|
|
335
|
+
const data = parseJsonSafe(text, "SWE-bench");
|
|
336
|
+
const now = new Date;
|
|
337
|
+
let entries = data.filter((entry) => entry.model && entry.resolved_rate != null);
|
|
338
|
+
if (options?.modelFilter?.length) {
|
|
339
|
+
const filterSet = new Set(options.modelFilter);
|
|
340
|
+
entries = entries.filter((e) => filterSet.has(e.model.toLowerCase().replace(/\s+/g, "-")));
|
|
341
|
+
}
|
|
342
|
+
if (options?.maxResults) {
|
|
343
|
+
entries = entries.slice(0, options.maxResults);
|
|
344
|
+
}
|
|
345
|
+
let results = entries.map((entry) => {
|
|
346
|
+
const modelId = entry.model.toLowerCase().replace(/\s+/g, "-");
|
|
347
|
+
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
348
|
+
return {
|
|
349
|
+
id: `swe-bench:${modelId}:coding`,
|
|
350
|
+
modelId,
|
|
351
|
+
providerKey: mapOrganizationToProvider3(org),
|
|
352
|
+
source: "swe-bench",
|
|
353
|
+
dimension: "coding",
|
|
354
|
+
score: Math.max(0, Math.min(100, entry.resolved_rate)),
|
|
355
|
+
rawScore: entry.resolved_rate,
|
|
356
|
+
metadata: {
|
|
357
|
+
organization: entry.organization,
|
|
358
|
+
date: entry.date
|
|
359
|
+
},
|
|
360
|
+
measuredAt: entry.date ? new Date(entry.date) : now,
|
|
361
|
+
ingestedAt: now
|
|
362
|
+
};
|
|
363
|
+
});
|
|
364
|
+
const { fromDate, toDate } = options ?? {};
|
|
365
|
+
if (fromDate) {
|
|
366
|
+
results = results.filter((r) => r.measuredAt >= fromDate);
|
|
367
|
+
}
|
|
368
|
+
if (toDate) {
|
|
369
|
+
results = results.filter((r) => r.measuredAt <= toDate);
|
|
370
|
+
}
|
|
371
|
+
return results;
|
|
372
|
+
}
|
|
373
|
+
};
|
|
374
374
|
function mapOrganizationToProvider3(org) {
|
|
375
375
|
const normalized = org.toLowerCase();
|
|
376
376
|
if (normalized.includes("openai"))
|
|
@@ -249,73 +249,6 @@ function mapOrganizationToProvider(org) {
|
|
|
249
249
|
return org;
|
|
250
250
|
}
|
|
251
251
|
|
|
252
|
-
// src/ingesters/swe-bench.ts
|
|
253
|
-
var DEFAULT_SWE_BENCH_URL = "https://raw.githubusercontent.com/princeton-nlp/SWE-bench/main/docs/leaderboard.json";
|
|
254
|
-
var sweBenchIngester = {
|
|
255
|
-
source: "swe-bench",
|
|
256
|
-
displayName: "SWE-bench",
|
|
257
|
-
description: "Software engineering task completion rates from SWE-bench.",
|
|
258
|
-
async ingest(options) {
|
|
259
|
-
if (options?.dimensions?.length && !options.dimensions.includes("coding")) {
|
|
260
|
-
return [];
|
|
261
|
-
}
|
|
262
|
-
const url = options?.sourceUrl ?? DEFAULT_SWE_BENCH_URL;
|
|
263
|
-
const response = await fetchWithRetry(url, { fetch: options?.fetch });
|
|
264
|
-
const text = await response.text();
|
|
265
|
-
const data = parseJsonSafe(text, "SWE-bench");
|
|
266
|
-
const now = new Date;
|
|
267
|
-
let entries = data.filter((entry) => entry.model && entry.resolved_rate != null);
|
|
268
|
-
if (options?.modelFilter?.length) {
|
|
269
|
-
const filterSet = new Set(options.modelFilter);
|
|
270
|
-
entries = entries.filter((e) => filterSet.has(e.model.toLowerCase().replace(/\s+/g, "-")));
|
|
271
|
-
}
|
|
272
|
-
if (options?.maxResults) {
|
|
273
|
-
entries = entries.slice(0, options.maxResults);
|
|
274
|
-
}
|
|
275
|
-
let results = entries.map((entry) => {
|
|
276
|
-
const modelId = entry.model.toLowerCase().replace(/\s+/g, "-");
|
|
277
|
-
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
278
|
-
return {
|
|
279
|
-
id: `swe-bench:${modelId}:coding`,
|
|
280
|
-
modelId,
|
|
281
|
-
providerKey: mapOrganizationToProvider2(org),
|
|
282
|
-
source: "swe-bench",
|
|
283
|
-
dimension: "coding",
|
|
284
|
-
score: Math.max(0, Math.min(100, entry.resolved_rate)),
|
|
285
|
-
rawScore: entry.resolved_rate,
|
|
286
|
-
metadata: {
|
|
287
|
-
organization: entry.organization,
|
|
288
|
-
date: entry.date
|
|
289
|
-
},
|
|
290
|
-
measuredAt: entry.date ? new Date(entry.date) : now,
|
|
291
|
-
ingestedAt: now
|
|
292
|
-
};
|
|
293
|
-
});
|
|
294
|
-
const { fromDate, toDate } = options ?? {};
|
|
295
|
-
if (fromDate) {
|
|
296
|
-
results = results.filter((r) => r.measuredAt >= fromDate);
|
|
297
|
-
}
|
|
298
|
-
if (toDate) {
|
|
299
|
-
results = results.filter((r) => r.measuredAt <= toDate);
|
|
300
|
-
}
|
|
301
|
-
return results;
|
|
302
|
-
}
|
|
303
|
-
};
|
|
304
|
-
function mapOrganizationToProvider2(org) {
|
|
305
|
-
const normalized = org.toLowerCase();
|
|
306
|
-
if (normalized.includes("openai"))
|
|
307
|
-
return "openai";
|
|
308
|
-
if (normalized.includes("anthropic"))
|
|
309
|
-
return "anthropic";
|
|
310
|
-
if (normalized.includes("google") || normalized.includes("deepmind"))
|
|
311
|
-
return "gemini";
|
|
312
|
-
if (normalized.includes("mistral"))
|
|
313
|
-
return "mistral";
|
|
314
|
-
if (normalized.includes("meta"))
|
|
315
|
-
return "meta";
|
|
316
|
-
return org;
|
|
317
|
-
}
|
|
318
|
-
|
|
319
252
|
// src/ingesters/open-llm-leaderboard.ts
|
|
320
253
|
var DEFAULT_HF_URL = "https://huggingface.co/api/spaces/open-llm-leaderboard/open_llm_leaderboard/results";
|
|
321
254
|
var BENCHMARK_MAPPINGS = [
|
|
@@ -344,7 +277,7 @@ var openLlmLeaderboardIngester = {
|
|
|
344
277
|
for (const entry of entries) {
|
|
345
278
|
const modelId = entry.model_name.toLowerCase().replace(/\s+/g, "-");
|
|
346
279
|
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
347
|
-
const providerKey = mapOrganizationToProvider3(org);
|
|
280
|
+
const providerKey = mapOrganizationToProvider2(org);
|
|
348
281
|
for (const mapping of BENCHMARK_MAPPINGS) {
|
|
349
282
|
if (dims && !dims.has(mapping.dimension))
|
|
350
283
|
continue;
|
|
@@ -371,6 +304,73 @@ var openLlmLeaderboardIngester = {
|
|
|
371
304
|
return options?.maxResults ? results.slice(0, options.maxResults) : results;
|
|
372
305
|
}
|
|
373
306
|
};
|
|
307
|
+
function mapOrganizationToProvider2(org) {
|
|
308
|
+
const normalized = org.toLowerCase();
|
|
309
|
+
if (normalized.includes("openai"))
|
|
310
|
+
return "openai";
|
|
311
|
+
if (normalized.includes("anthropic"))
|
|
312
|
+
return "anthropic";
|
|
313
|
+
if (normalized.includes("google") || normalized.includes("deepmind"))
|
|
314
|
+
return "gemini";
|
|
315
|
+
if (normalized.includes("mistral"))
|
|
316
|
+
return "mistral";
|
|
317
|
+
if (normalized.includes("meta"))
|
|
318
|
+
return "meta";
|
|
319
|
+
return org;
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
// src/ingesters/swe-bench.ts
|
|
323
|
+
var DEFAULT_SWE_BENCH_URL = "https://raw.githubusercontent.com/princeton-nlp/SWE-bench/main/docs/leaderboard.json";
|
|
324
|
+
var sweBenchIngester = {
|
|
325
|
+
source: "swe-bench",
|
|
326
|
+
displayName: "SWE-bench",
|
|
327
|
+
description: "Software engineering task completion rates from SWE-bench.",
|
|
328
|
+
async ingest(options) {
|
|
329
|
+
if (options?.dimensions?.length && !options.dimensions.includes("coding")) {
|
|
330
|
+
return [];
|
|
331
|
+
}
|
|
332
|
+
const url = options?.sourceUrl ?? DEFAULT_SWE_BENCH_URL;
|
|
333
|
+
const response = await fetchWithRetry(url, { fetch: options?.fetch });
|
|
334
|
+
const text = await response.text();
|
|
335
|
+
const data = parseJsonSafe(text, "SWE-bench");
|
|
336
|
+
const now = new Date;
|
|
337
|
+
let entries = data.filter((entry) => entry.model && entry.resolved_rate != null);
|
|
338
|
+
if (options?.modelFilter?.length) {
|
|
339
|
+
const filterSet = new Set(options.modelFilter);
|
|
340
|
+
entries = entries.filter((e) => filterSet.has(e.model.toLowerCase().replace(/\s+/g, "-")));
|
|
341
|
+
}
|
|
342
|
+
if (options?.maxResults) {
|
|
343
|
+
entries = entries.slice(0, options.maxResults);
|
|
344
|
+
}
|
|
345
|
+
let results = entries.map((entry) => {
|
|
346
|
+
const modelId = entry.model.toLowerCase().replace(/\s+/g, "-");
|
|
347
|
+
const org = entry.organization?.toLowerCase() ?? "unknown";
|
|
348
|
+
return {
|
|
349
|
+
id: `swe-bench:${modelId}:coding`,
|
|
350
|
+
modelId,
|
|
351
|
+
providerKey: mapOrganizationToProvider3(org),
|
|
352
|
+
source: "swe-bench",
|
|
353
|
+
dimension: "coding",
|
|
354
|
+
score: Math.max(0, Math.min(100, entry.resolved_rate)),
|
|
355
|
+
rawScore: entry.resolved_rate,
|
|
356
|
+
metadata: {
|
|
357
|
+
organization: entry.organization,
|
|
358
|
+
date: entry.date
|
|
359
|
+
},
|
|
360
|
+
measuredAt: entry.date ? new Date(entry.date) : now,
|
|
361
|
+
ingestedAt: now
|
|
362
|
+
};
|
|
363
|
+
});
|
|
364
|
+
const { fromDate, toDate } = options ?? {};
|
|
365
|
+
if (fromDate) {
|
|
366
|
+
results = results.filter((r) => r.measuredAt >= fromDate);
|
|
367
|
+
}
|
|
368
|
+
if (toDate) {
|
|
369
|
+
results = results.filter((r) => r.measuredAt <= toDate);
|
|
370
|
+
}
|
|
371
|
+
return results;
|
|
372
|
+
}
|
|
373
|
+
};
|
|
374
374
|
function mapOrganizationToProvider3(org) {
|
|
375
375
|
const normalized = org.toLowerCase();
|
|
376
376
|
if (normalized.includes("openai"))
|
package/dist/scoring/index.d.ts
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
1
|
export { computeModelRankings } from './composite-scorer';
|
|
2
2
|
export { DEFAULT_DIMENSION_WEIGHTS, getWeightMap, normalizeWeights, } from './dimension-weights';
|
|
3
|
-
export {
|
|
3
|
+
export { normalizeBenchmarkResults, normalizeScore } from './normalizer';
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@contractspec/lib.provider-ranking",
|
|
3
|
-
"version": "0.7.5",
|
|
3
|
+
"version": "0.7.8",
|
|
4
4
|
"description": "AI provider ranking: benchmark ingestion, scoring, and model comparison",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"contractspec",
|
|
@@ -24,16 +24,16 @@
|
|
|
24
24
|
"dev": "contractspec-bun-build dev",
|
|
25
25
|
"clean": "rimraf dist .turbo",
|
|
26
26
|
"lint": "bun lint:fix",
|
|
27
|
-
"lint:fix": "
|
|
28
|
-
"lint:check": "
|
|
27
|
+
"lint:fix": "biome check --write --unsafe --only=nursery/useSortedClasses . && biome check --write .",
|
|
28
|
+
"lint:check": "biome check .",
|
|
29
29
|
"test": "bun test --pass-with-no-tests",
|
|
30
30
|
"prebuild": "contractspec-bun-build prebuild",
|
|
31
31
|
"typecheck": "tsc --noEmit"
|
|
32
32
|
},
|
|
33
33
|
"dependencies": {},
|
|
34
34
|
"devDependencies": {
|
|
35
|
-
"@contractspec/tool.typescript": "3.7.
|
|
36
|
-
"@contractspec/tool.bun": "3.7.
|
|
35
|
+
"@contractspec/tool.typescript": "3.7.8",
|
|
36
|
+
"@contractspec/tool.bun": "3.7.8",
|
|
37
37
|
"typescript": "^5.9.3"
|
|
38
38
|
},
|
|
39
39
|
"exports": {
|