@contractspec/lib.provider-ranking 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +44 -0
- package/dist/browser/eval/index.js +101 -0
- package/dist/browser/eval/runner.js +101 -0
- package/dist/browser/eval/types.js +0 -0
- package/dist/browser/in-memory-store.js +92 -0
- package/dist/browser/index.js +105 -0
- package/dist/browser/ingesters/artificial-analysis.js +149 -0
- package/dist/browser/ingesters/chatbot-arena.js +142 -0
- package/dist/browser/ingesters/fetch-utils.js +39 -0
- package/dist/browser/ingesters/index.js +418 -0
- package/dist/browser/ingesters/open-llm-leaderboard.js +108 -0
- package/dist/browser/ingesters/registry.js +412 -0
- package/dist/browser/ingesters/swe-bench.js +105 -0
- package/dist/browser/ingesters/types.js +0 -0
- package/dist/browser/scoring/composite-scorer.js +122 -0
- package/dist/browser/scoring/dimension-weights.js +39 -0
- package/dist/browser/scoring/index.js +161 -0
- package/dist/browser/scoring/normalizer.js +37 -0
- package/dist/browser/store.js +0 -0
- package/dist/browser/types.js +14 -0
- package/dist/eval/index.d.ts +2 -0
- package/dist/eval/index.js +102 -0
- package/dist/eval/runner.d.ts +18 -0
- package/dist/eval/runner.js +102 -0
- package/dist/eval/types.d.ts +51 -0
- package/dist/eval/types.js +1 -0
- package/dist/in-memory-store.d.ts +17 -0
- package/dist/in-memory-store.js +93 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +106 -0
- package/dist/ingesters/artificial-analysis.d.ts +8 -0
- package/dist/ingesters/artificial-analysis.js +150 -0
- package/dist/ingesters/chatbot-arena.d.ts +8 -0
- package/dist/ingesters/chatbot-arena.js +143 -0
- package/dist/ingesters/fetch-utils.d.ts +11 -0
- package/dist/ingesters/fetch-utils.js +40 -0
- package/dist/ingesters/index.d.ts +7 -0
- package/dist/ingesters/index.js +419 -0
- package/dist/ingesters/open-llm-leaderboard.d.ts +8 -0
- package/dist/ingesters/open-llm-leaderboard.js +109 -0
- package/dist/ingesters/registry.d.ts +17 -0
- package/dist/ingesters/registry.js +413 -0
- package/dist/ingesters/swe-bench.d.ts +8 -0
- package/dist/ingesters/swe-bench.js +106 -0
- package/dist/ingesters/types.d.ts +31 -0
- package/dist/ingesters/types.js +1 -0
- package/dist/node/eval/index.js +101 -0
- package/dist/node/eval/runner.js +101 -0
- package/dist/node/eval/types.js +0 -0
- package/dist/node/in-memory-store.js +92 -0
- package/dist/node/index.js +105 -0
- package/dist/node/ingesters/artificial-analysis.js +149 -0
- package/dist/node/ingesters/chatbot-arena.js +142 -0
- package/dist/node/ingesters/fetch-utils.js +39 -0
- package/dist/node/ingesters/index.js +418 -0
- package/dist/node/ingesters/open-llm-leaderboard.js +108 -0
- package/dist/node/ingesters/registry.js +412 -0
- package/dist/node/ingesters/swe-bench.js +105 -0
- package/dist/node/ingesters/types.js +0 -0
- package/dist/node/scoring/composite-scorer.js +122 -0
- package/dist/node/scoring/dimension-weights.js +39 -0
- package/dist/node/scoring/index.js +161 -0
- package/dist/node/scoring/normalizer.js +37 -0
- package/dist/node/store.js +0 -0
- package/dist/node/types.js +14 -0
- package/dist/scoring/composite-scorer.d.ts +10 -0
- package/dist/scoring/composite-scorer.js +123 -0
- package/dist/scoring/dimension-weights.d.ts +8 -0
- package/dist/scoring/dimension-weights.js +40 -0
- package/dist/scoring/index.d.ts +3 -0
- package/dist/scoring/index.js +162 -0
- package/dist/scoring/normalizer.d.ts +20 -0
- package/dist/scoring/normalizer.js +38 -0
- package/dist/store.d.ts +19 -0
- package/dist/store.js +1 -0
- package/dist/types.d.ts +100 -0
- package/dist/types.js +15 -0
- package/package.json +362 -0
package/README.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# @contractspec/lib.provider-ranking
|
|
2
|
+
|
|
3
|
+
Website: https://contractspec.io/
|
|
4
|
+
|
|
5
|
+
**AI provider ranking: benchmark ingestion, scoring, and model comparison.**
|
|
6
|
+
|
|
7
|
+
Ingests benchmark data from multiple sources (Chatbot Arena, SWE-bench, Artificial Analysis, Open LLM Leaderboard), normalizes scores to a 0-100 scale, and computes composite rankings across dimensions like coding, reasoning, cost, and latency.
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
bun add @contractspec/lib.provider-ranking
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Exports
|
|
16
|
+
|
|
17
|
+
- `.` -- Core types, store interface, and in-memory store
|
|
18
|
+
- `./types` -- `BenchmarkResult`, `ModelRanking`, `ModelProfile`, `BenchmarkDimension`, `DimensionWeightConfig`
|
|
19
|
+
- `./store` -- `ProviderRankingStore` interface
|
|
20
|
+
- `./in-memory-store` -- `InMemoryProviderRankingStore` class
|
|
21
|
+
- `./scoring` -- `computeModelRankings()`, `normalizeScore()`, `DEFAULT_DIMENSION_WEIGHTS`
|
|
22
|
+
- `./ingesters` -- `chatbotArenaIngester`, `sweBenchIngester`, `artificialAnalysisIngester`, `IngesterRegistry`
|
|
23
|
+
- `./eval` -- `EvalRunner`, `EvalSuite`, `EvalCase` for custom evaluation
|
|
24
|
+
|
|
25
|
+
## Usage
|
|
26
|
+
|
|
27
|
+
```ts
|
|
28
|
+
import { InMemoryProviderRankingStore } from "@contractspec/lib.provider-ranking/in-memory-store";
|
|
29
|
+
import { createDefaultIngesterRegistry } from "@contractspec/lib.provider-ranking/ingesters";
|
|
30
|
+
import { computeModelRankings } from "@contractspec/lib.provider-ranking/scoring";
|
|
31
|
+
|
|
32
|
+
const store = new InMemoryProviderRankingStore();
|
|
33
|
+
const registry = createDefaultIngesterRegistry();
|
|
34
|
+
|
|
35
|
+
const ingester = registry.get("swe-bench");
|
|
36
|
+
const results = await ingester.ingest();
|
|
37
|
+
|
|
38
|
+
for (const result of results) {
|
|
39
|
+
await store.addBenchmarkResult(result);
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
const rankings = computeModelRankings(await store.listBenchmarkResults({}));
|
|
43
|
+
console.log(rankings);
|
|
44
|
+
```
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
// src/eval/runner.ts

/**
 * Runs an evaluation suite against a model through a chat adapter and
 * aggregates per-case grades into a single run report.
 */
class EvalRunner {
  // Chat adapter: must expose chat(prompt) -> { text, latencyMs }.
  adapter;
  // Runner options; only maxConcurrency is read (default 5).
  options;
  /**
   * @param {{ chat(prompt: string): Promise<{ text: string, latencyMs: number }> }} adapter
   * @param {{ maxConcurrency?: number }} [options]
   */
  constructor(adapter, options = {}) {
    this.adapter = adapter;
    this.options = options;
  }
  /**
   * Executes every case in `suite` and returns an aggregate report.
   * averageScore is rounded to 2 decimals, averageLatencyMs to whole ms.
   * @param {object} suite - suite with `key`, `cases`, `defaultGrader`.
   * @param {string} modelId
   * @param {string} providerKey
   */
  async run(suite, modelId, providerKey) {
    const runId = `eval-${suite.key}-${modelId}-${Date.now()}`;
    const startedAt = new Date();
    const concurrency = this.options.maxConcurrency ?? 5;
    const caseResults = await this.runCasesWithConcurrency(suite.cases, suite.defaultGrader, concurrency);
    const passedCases = caseResults.filter((r) => r.passed).length;
    const averageScore = caseResults.length > 0 ? caseResults.reduce((sum, r) => sum + r.score, 0) / caseResults.length : 0;
    const averageLatencyMs = caseResults.length > 0 ? caseResults.reduce((sum, r) => sum + r.latencyMs, 0) / caseResults.length : 0;
    return {
      runId,
      evalSuiteKey: suite.key,
      modelId,
      providerKey,
      totalCases: suite.cases.length,
      passedCases,
      averageScore: Math.round(averageScore * 100) / 100,
      averageLatencyMs: Math.round(averageLatencyMs),
      caseResults,
      startedAt,
      completedAt: new Date()
    };
  }
  /**
   * Runs cases with a bounded worker pool. Results are written by case
   * index so the returned array always matches the order of `cases`
   * (previously results were pushed in completion order, which made the
   * output order depend on per-case latency).
   */
  async runCasesWithConcurrency(cases, defaultGrader, concurrency) {
    const results = new Array(cases.length);
    // Shared cursor; safe because the increment is synchronous between awaits.
    let next = 0;
    const workers = Array.from({ length: Math.min(concurrency, cases.length) }, async () => {
      while (next < cases.length) {
        const index = next++;
        results[index] = await this.runSingleCase(cases[index], defaultGrader);
      }
    });
    await Promise.all(workers);
    return results;
  }
  /**
   * Runs one case. Adapter errors are converted into a failed result
   * (score 0, latency 0, error message captured) rather than thrown.
   */
  async runSingleCase(evalCase, defaultGrader) {
    try {
      const { text, latencyMs } = await this.adapter.chat(evalCase.prompt);
      const { passed, score } = this.grade(evalCase, text, defaultGrader);
      return {
        caseId: evalCase.id,
        passed,
        score,
        response: text,
        latencyMs
      };
    } catch (error) {
      return {
        caseId: evalCase.id,
        passed: false,
        score: 0,
        response: "",
        latencyMs: 0,
        error: error instanceof Error ? error.message : String(error)
      };
    }
  }
  /**
   * Grades a response with the case's grader (falling back to the suite
   * default). Cases missing their expected value auto-pass with score 1;
   * "llm-judge" and unknown graders pass with a neutral 0.5.
   */
  grade(evalCase, response, defaultGrader) {
    const grader = evalCase.graderKey ?? defaultGrader;
    switch (grader) {
      case "exact": {
        if (!evalCase.expectedOutput)
          return { passed: true, score: 1 };
        const passed = response.trim() === evalCase.expectedOutput.trim();
        return { passed, score: passed ? 1 : 0 };
      }
      case "contains": {
        if (!evalCase.expectedOutput)
          return { passed: true, score: 1 };
        const passed = response.includes(evalCase.expectedOutput);
        return { passed, score: passed ? 1 : 0 };
      }
      case "regex": {
        if (!evalCase.expectedPattern)
          return { passed: true, score: 1 };
        const matches = new RegExp(evalCase.expectedPattern).test(response);
        return { passed: matches, score: matches ? 1 : 0 };
      }
      case "llm-judge":
        return { passed: true, score: 0.5 };
      default:
        return { passed: true, score: 0.5 };
    }
  }
}
export {
  EvalRunner
};
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
// src/eval/runner.ts
// Executes an eval suite against a model through a chat adapter and
// aggregates per-case grades into a run summary.
class EvalRunner {
  // Chat adapter: must expose chat(prompt) -> { text, latencyMs }.
  adapter;
  // Runner options; only maxConcurrency is read (default 5).
  options;
  constructor(adapter, options = {}) {
    this.adapter = adapter;
    this.options = options;
  }
  // Runs every case in `suite`, returning an aggregate report whose
  // averageScore is rounded to 2 decimals and averageLatencyMs to whole ms.
  async run(suite, modelId, providerKey) {
    const runId = `eval-${suite.key}-${modelId}-${Date.now()}`;
    const startedAt = new Date;
    const concurrency = this.options.maxConcurrency ?? 5;
    const caseResults = await this.runCasesWithConcurrency(suite.cases, suite.defaultGrader, concurrency);
    const passedCases = caseResults.filter((r) => r.passed).length;
    const averageScore = caseResults.length > 0 ? caseResults.reduce((sum, r) => sum + r.score, 0) / caseResults.length : 0;
    const averageLatencyMs = caseResults.length > 0 ? caseResults.reduce((sum, r) => sum + r.latencyMs, 0) / caseResults.length : 0;
    return {
      runId,
      evalSuiteKey: suite.key,
      modelId,
      providerKey,
      totalCases: suite.cases.length,
      passedCases,
      averageScore: Math.round(averageScore * 100) / 100,
      averageLatencyMs: Math.round(averageLatencyMs),
      caseResults,
      startedAt,
      completedAt: new Date
    };
  }
  // Worker-pool execution: up to `concurrency` workers pull cases off a
  // shared queue (safe: shift() is synchronous between awaits).
  // NOTE(review): results are pushed in completion order, so caseResults
  // may not match the order of suite.cases — confirm callers don't rely
  // on ordering.
  async runCasesWithConcurrency(cases, defaultGrader, concurrency) {
    const results = [];
    const queue = [...cases];
    const workers = Array.from({ length: Math.min(concurrency, queue.length) }, async () => {
      while (queue.length > 0) {
        const evalCase = queue.shift();
        if (!evalCase)
          break;
        const result = await this.runSingleCase(evalCase, defaultGrader);
        results.push(result);
      }
    });
    await Promise.all(workers);
    return results;
  }
  // Runs one case; adapter errors are converted into a failed result
  // (score 0, latency 0, error message captured) rather than thrown.
  async runSingleCase(evalCase, defaultGrader) {
    try {
      const { text, latencyMs } = await this.adapter.chat(evalCase.prompt);
      const { passed, score } = this.grade(evalCase, text, defaultGrader);
      return {
        caseId: evalCase.id,
        passed,
        score,
        response: text,
        latencyMs
      };
    } catch (error) {
      return {
        caseId: evalCase.id,
        passed: false,
        score: 0,
        response: "",
        latencyMs: 0,
        error: error instanceof Error ? error.message : String(error)
      };
    }
  }
  // Grades a response using the case's grader (falling back to the suite
  // default). Cases missing their expected value auto-pass with score 1;
  // "llm-judge" and unknown graders pass with a neutral 0.5.
  grade(evalCase, response, defaultGrader) {
    const grader = evalCase.graderKey ?? defaultGrader;
    switch (grader) {
      case "exact":
        if (!evalCase.expectedOutput)
          return { passed: true, score: 1 };
        return {
          passed: response.trim() === evalCase.expectedOutput.trim(),
          score: response.trim() === evalCase.expectedOutput.trim() ? 1 : 0
        };
      case "contains":
        if (!evalCase.expectedOutput)
          return { passed: true, score: 1 };
        return {
          passed: response.includes(evalCase.expectedOutput),
          score: response.includes(evalCase.expectedOutput) ? 1 : 0
        };
      case "regex": {
        if (!evalCase.expectedPattern)
          return { passed: true, score: 1 };
        const regex = new RegExp(evalCase.expectedPattern);
        const matches = regex.test(response);
        return { passed: matches, score: matches ? 1 : 0 };
      }
      case "llm-judge":
        return { passed: true, score: 0.5 };
      default:
        return { passed: true, score: 0.5 };
    }
  }
}
export {
  EvalRunner
};
|
|
File without changes
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
// src/in-memory-store.ts

/**
 * In-memory (Map-backed) implementation of the provider-ranking store.
 * Nothing is persisted; data lives for the lifetime of the process.
 */
class InMemoryProviderRankingStore {
  benchmarkResults = new Map;
  modelRankings = new Map;
  ingestionRuns = new Map;

  /** Applies offset/limit pagination; `nextOffset` is undefined on the last page. */
  #paginate(items, query) {
    const total = items.length;
    const offset = query.offset ?? 0;
    const limit = query.limit ?? 50;
    const page = items.slice(offset, offset + limit);
    const nextOffset = offset + page.length < total ? offset + page.length : undefined;
    return { page, total, nextOffset };
  }

  /** Inserts or replaces a benchmark result, keyed by its id. */
  async upsertBenchmarkResult(result) {
    this.benchmarkResults.set(result.id, result);
  }

  /** Returns the benchmark result with `id`, or null when absent. */
  async getBenchmarkResult(id) {
    return this.benchmarkResults.get(id) ?? null;
  }

  /**
   * Lists benchmark results matching the optional `source`, `modelId`,
   * `dimension`, and `providerKey` filters, paginated via `offset`
   * (default 0) and `limit` (default 50).
   */
  async listBenchmarkResults(query) {
    const predicates = [
      (r) => !query.source || r.source === query.source,
      (r) => !query.modelId || r.modelId === query.modelId,
      (r) => !query.dimension || r.dimension === query.dimension,
      (r) => !query.providerKey || r.providerKey === query.providerKey
    ];
    const matched = [...this.benchmarkResults.values()].filter((r) => predicates.every((p) => p(r)));
    const { page, total, nextOffset } = this.#paginate(matched, query);
    return { results: page, total, nextOffset };
  }

  /** Inserts or replaces a model ranking, keyed by its modelId. */
  async upsertModelRanking(ranking) {
    this.modelRankings.set(ranking.modelId, ranking);
  }

  /** Returns the ranking for `modelId`, or null when absent. */
  async getModelRanking(modelId) {
    return this.modelRankings.get(modelId) ?? null;
  }

  /**
   * Lists rankings, optionally filtered by `providerKey`. With a
   * `dimension`, sorts by that dimension's score descending (missing
   * scores sort last); otherwise by overall rank ascending.
   */
  async listModelRankings(query) {
    let ordered = [...this.modelRankings.values()];
    if (query.providerKey) {
      ordered = ordered.filter((r) => r.providerKey === query.providerKey);
    }
    const dim = query.dimension;
    if (dim) {
      const scoreOf = (r) => r.dimensionScores[dim]?.score ?? -1;
      ordered.sort((a, b) => scoreOf(b) - scoreOf(a));
    } else {
      ordered.sort((a, b) => a.rank - b.rank);
    }
    const { page, total, nextOffset } = this.#paginate(ordered, query);
    return { rankings: page, total, nextOffset };
  }

  /**
   * Assembles a profile from the stored ranking plus every benchmark
   * result for the model; null when the model has no ranking.
   * displayName/contextWindow/costPerMillion/capabilities are
   * placeholder values — this store keeps no model metadata.
   */
  async getModelProfile(modelId) {
    const ranking = this.modelRankings.get(modelId);
    if (!ranking)
      return null;
    const benchmarkResults = [...this.benchmarkResults.values()].filter((r) => r.modelId === modelId);
    return {
      modelId,
      providerKey: ranking.providerKey,
      displayName: modelId,
      contextWindow: 0,
      costPerMillion: null,
      capabilities: [],
      ranking,
      benchmarkResults
    };
  }

  /** Records a new ingestion run, keyed by its id. */
  async createIngestionRun(run) {
    this.ingestionRuns.set(run.id, run);
  }

  /** Shallow-merges `update` into an existing run; no-op when the id is unknown. */
  async updateIngestionRun(id, update) {
    const existing = this.ingestionRuns.get(id);
    if (!existing)
      return;
    this.ingestionRuns.set(id, { ...existing, ...update });
  }

  /** Returns the ingestion run with `id`, or null when absent. */
  async getIngestionRun(id) {
    return this.ingestionRuns.get(id) ?? null;
  }
}
export {
  InMemoryProviderRankingStore
};
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
// src/in-memory-store.ts
// Map-backed, process-local implementation of the provider-ranking store.
class InMemoryProviderRankingStore {
  // BenchmarkResult records keyed by result id.
  benchmarkResults = new Map;
  // ModelRanking records keyed by modelId.
  modelRankings = new Map;
  // IngestionRun records keyed by run id.
  ingestionRuns = new Map;
  // Inserts or replaces a benchmark result (keyed by result.id).
  async upsertBenchmarkResult(result) {
    this.benchmarkResults.set(result.id, result);
  }
  // Returns the benchmark result with `id`, or null when absent.
  async getBenchmarkResult(id) {
    return this.benchmarkResults.get(id) ?? null;
  }
  // Lists benchmark results matching the optional source/modelId/
  // dimension/providerKey filters, paginated by offset (default 0) and
  // limit (default 50). nextOffset is undefined on the last page.
  async listBenchmarkResults(query) {
    let results = Array.from(this.benchmarkResults.values());
    if (query.source) {
      results = results.filter((r) => r.source === query.source);
    }
    if (query.modelId) {
      results = results.filter((r) => r.modelId === query.modelId);
    }
    if (query.dimension) {
      results = results.filter((r) => r.dimension === query.dimension);
    }
    if (query.providerKey) {
      results = results.filter((r) => r.providerKey === query.providerKey);
    }
    const total = results.length;
    const offset = query.offset ?? 0;
    const limit = query.limit ?? 50;
    results = results.slice(offset, offset + limit);
    const nextOffset = offset + results.length < total ? offset + results.length : undefined;
    return { results, total, nextOffset };
  }
  // Inserts or replaces a model ranking (keyed by modelId).
  async upsertModelRanking(ranking) {
    this.modelRankings.set(ranking.modelId, ranking);
  }
  // Returns the ranking for `modelId`, or null when absent.
  async getModelRanking(modelId) {
    return this.modelRankings.get(modelId) ?? null;
  }
  // Lists rankings, optionally filtered by providerKey. When a dimension
  // is given, sorts by that dimension's score descending (missing scores
  // sort last via the -1 fallback); otherwise sorts by overall rank
  // ascending. Paginated like listBenchmarkResults.
  async listModelRankings(query) {
    let rankings = Array.from(this.modelRankings.values());
    if (query.providerKey) {
      rankings = rankings.filter((r) => r.providerKey === query.providerKey);
    }
    if (query.dimension) {
      const dim = query.dimension;
      rankings.sort((a, b) => {
        const scoreA = a.dimensionScores[dim]?.score ?? -1;
        const scoreB = b.dimensionScores[dim]?.score ?? -1;
        return scoreB - scoreA;
      });
    } else {
      rankings.sort((a, b) => a.rank - b.rank);
    }
    const total = rankings.length;
    const offset = query.offset ?? 0;
    const limit = query.limit ?? 50;
    rankings = rankings.slice(offset, offset + limit);
    const nextOffset = offset + rankings.length < total ? offset + rankings.length : undefined;
    return { rankings, total, nextOffset };
  }
  // Builds a ModelProfile from the stored ranking plus all benchmark
  // results for the model. Returns null when no ranking exists.
  // NOTE(review): displayName/contextWindow/costPerMillion/capabilities
  // are placeholders — this store keeps no model metadata.
  async getModelProfile(modelId) {
    const ranking = this.modelRankings.get(modelId);
    if (!ranking)
      return null;
    const benchmarkResults = Array.from(this.benchmarkResults.values()).filter((r) => r.modelId === modelId);
    return {
      modelId,
      providerKey: ranking.providerKey,
      displayName: modelId,
      contextWindow: 0,
      costPerMillion: null,
      capabilities: [],
      ranking,
      benchmarkResults
    };
  }
  // Records a new ingestion run (keyed by run.id).
  async createIngestionRun(run) {
    this.ingestionRuns.set(run.id, run);
  }
  // Shallow-merges `update` into an existing run; no-op when absent.
  async updateIngestionRun(id, update) {
    const existing = this.ingestionRuns.get(id);
    if (existing) {
      this.ingestionRuns.set(id, { ...existing, ...update });
    }
  }
  // Returns the ingestion run with `id`, or null when absent.
  async getIngestionRun(id) {
    return this.ingestionRuns.get(id) ?? null;
  }
}

// src/types.ts
// Canonical set of benchmark dimensions a result can be scored on.
var BENCHMARK_DIMENSIONS = [
  "coding",
  "reasoning",
  "agentic",
  "cost",
  "latency",
  "context",
  "safety",
  "custom"
];
export {
  InMemoryProviderRankingStore,
  BENCHMARK_DIMENSIONS
};
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
// src/ingesters/fetch-utils.ts

/**
 * Fetches `url`, retrying transient failures — network errors and 5xx
 * responses — with exponential backoff (baseDelayMs * 2^attempt).
 * Non-5xx HTTP failures (e.g. 404) fail immediately. The previous
 * implementation threw the "Fetch failed" error inside its own try
 * block, so the retry catch caught it and retried client errors too,
 * contradicting the explicit 5xx-only retry branch.
 *
 * @param {string} url - target URL.
 * @param {{ fetch?: typeof fetch, maxRetries?: number, baseDelayMs?: number }} [options]
 * @returns {Promise<Response>} the first successful (response.ok) response.
 * @throws {Error} when the request ultimately fails.
 */
async function fetchWithRetry(url, options) {
  const fetchFn = options?.fetch ?? globalThis.fetch;
  const maxRetries = options?.maxRetries ?? 2;
  const baseDelay = options?.baseDelayMs ?? 500;
  let lastError;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    let response;
    try {
      response = await fetchFn(url);
    } catch (error) {
      // Network-level failure: remember it and back off before retrying.
      lastError = error instanceof Error ? error : new Error(String(error));
      if (attempt < maxRetries) {
        await sleep(baseDelay * Math.pow(2, attempt));
      }
      continue;
    }
    if (response.ok)
      return response;
    if (response.status >= 500 && attempt < maxRetries) {
      // Server error: retryable with backoff.
      await sleep(baseDelay * Math.pow(2, attempt));
      continue;
    }
    // Client error, or server error with retries exhausted: fail now.
    throw new Error(`Fetch failed: ${response.status} ${response.statusText} (${url})`);
  }
  throw lastError ?? new Error(`Fetch failed after ${maxRetries + 1} attempts: ${url}`);
}

/**
 * Parses `text` as JSON; on failure throws an Error naming the source
 * (`label`) and quoting the first 200 characters of the body.
 */
function parseJsonSafe(text, label) {
  try {
    return JSON.parse(text);
  } catch {
    throw new Error(`Failed to parse JSON response from ${label}: ${text.slice(0, 200)}`);
  }
}

/** Resolves after `ms` milliseconds. */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
|
|
36
|
+
|
|
37
|
+
// src/ingesters/artificial-analysis.ts
var DEFAULT_AA_URL = "https://artificialanalysis.ai/api/models";

/**
 * Builds one benchmark-result record for a single (entry, dimension)
 * pair. Factored out of `ingest`, which previously repeated this
 * ~15-line object literal four times (once per dimension).
 */
function toBenchmarkResult(entry, dimension, score, rawScore, now) {
  return {
    id: `artificial-analysis:${entry.model_id}:${dimension}`,
    modelId: entry.model_id,
    providerKey: entry.provider.toLowerCase(),
    source: "artificial-analysis",
    dimension,
    score,
    rawScore,
    metadata: { model_name: entry.model_name },
    measuredAt: now,
    ingestedAt: now
  };
}

/**
 * Ingester for the Artificial Analysis model feed. Fetches the JSON
 * feed and emits benchmark results across the reasoning, latency, cost,
 * and context dimensions, honoring the options' `dimensions`,
 * `modelFilter`, `sourceUrl`, `fetch`, and `maxResults` overrides.
 */
var artificialAnalysisIngester = {
  source: "artificial-analysis",
  displayName: "Artificial Analysis",
  description: "Quality, speed, and cost benchmarks from Artificial Analysis.",
  async ingest(options) {
    const url = options?.sourceUrl ?? DEFAULT_AA_URL;
    const response = await fetchWithRetry(url, { fetch: options?.fetch });
    const text = await response.text();
    const data = parseJsonSafe(text, "Artificial Analysis");
    const now = new Date;
    const results = [];
    // When options.dimensions is provided, only those dimensions are emitted.
    const dims = options?.dimensions ? new Set(options.dimensions) : null;
    const wanted = (d) => !dims || dims.has(d);
    // Entries must carry both a model id and a provider to be usable.
    let entries = data.filter((e) => e.model_id && e.provider);
    if (options?.modelFilter?.length) {
      const filterSet = new Set(options.modelFilter);
      entries = entries.filter((e) => filterSet.has(e.model_id));
    }
    for (const entry of entries) {
      if (entry.quality_score != null && wanted("reasoning")) {
        // Quality score is already on a 0-100 scale; clamp defensively.
        results.push(toBenchmarkResult(entry, "reasoning", Math.max(0, Math.min(100, entry.quality_score)), entry.quality_score, now));
      }
      if ((entry.tokens_per_second != null || entry.ttft_ms != null) && wanted("latency")) {
        results.push(toBenchmarkResult(entry, "latency", computeLatencyScore(entry.tokens_per_second, entry.ttft_ms), {
          tokens_per_second: entry.tokens_per_second,
          ttft_ms: entry.ttft_ms
        }, now));
      }
      if ((entry.price_per_million_input_tokens != null || entry.price_per_million_output_tokens != null) && wanted("cost")) {
        results.push(toBenchmarkResult(entry, "cost", computeCostScore(entry.price_per_million_input_tokens, entry.price_per_million_output_tokens), {
          input: entry.price_per_million_input_tokens,
          output: entry.price_per_million_output_tokens
        }, now));
      }
      if (entry.context_window != null && wanted("context")) {
        results.push(toBenchmarkResult(entry, "context", computeContextScore(entry.context_window), entry.context_window, now));
      }
    }
    return options?.maxResults ? results.slice(0, options.maxResults) : results;
  }
};
|
|
127
|
+
/**
 * Maps throughput and responsiveness onto a 0-100 latency score.
 * Throughput earns full credit at 200+ tokens/sec (defaulting to 50
 * when unknown); time-to-first-token beyond 200ms subtracts up to 30
 * points. The result is rounded to 2 decimals.
 */
function computeLatencyScore(tokensPerSec, ttftMs) {
  let score = tokensPerSec == null ? 50 : Math.min(100, (tokensPerSec / 200) * 100);
  if (ttftMs != null) {
    const ttftPenalty = Math.max(0, Math.min(30, ((ttftMs - 200) / 100) * 10));
    score = Math.max(0, score - ttftPenalty);
  }
  return Math.round(score * 100) / 100;
}
|
|
138
|
+
/**
 * Converts $/M-token pricing into a 0-100 score: a $0 blended price
 * scores 100, $30+ scores 0 (linear in between). A missing input or
 * output price is treated as $0 when blending. Rounded to 2 decimals.
 */
function computeCostScore(inputCost, outputCost) {
  const blended = ((inputCost ?? 0) + (outputCost ?? 0)) / 2;
  const raw = 100 - (blended / 30) * 100;
  return Math.round(Math.max(0, raw) * 100) / 100;
}
|
|
143
|
+
/**
 * Scores context-window size linearly on 0-100, saturating at one
 * million tokens. Rounded to 2 decimals.
 */
function computeContextScore(contextWindow) {
  const raw = (contextWindow / 1e6) * 100;
  return Math.round(Math.min(100, raw) * 100) / 100;
}
|
|
147
|
+
export {
|
|
148
|
+
artificialAnalysisIngester
|
|
149
|
+
};
|