@contractspec/lib.provider-ranking 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. package/README.md +44 -0
  2. package/dist/browser/eval/index.js +101 -0
  3. package/dist/browser/eval/runner.js +101 -0
  4. package/dist/browser/eval/types.js +0 -0
  5. package/dist/browser/in-memory-store.js +92 -0
  6. package/dist/browser/index.js +105 -0
  7. package/dist/browser/ingesters/artificial-analysis.js +149 -0
  8. package/dist/browser/ingesters/chatbot-arena.js +142 -0
  9. package/dist/browser/ingesters/fetch-utils.js +39 -0
  10. package/dist/browser/ingesters/index.js +418 -0
  11. package/dist/browser/ingesters/open-llm-leaderboard.js +108 -0
  12. package/dist/browser/ingesters/registry.js +412 -0
  13. package/dist/browser/ingesters/swe-bench.js +105 -0
  14. package/dist/browser/ingesters/types.js +0 -0
  15. package/dist/browser/scoring/composite-scorer.js +122 -0
  16. package/dist/browser/scoring/dimension-weights.js +39 -0
  17. package/dist/browser/scoring/index.js +161 -0
  18. package/dist/browser/scoring/normalizer.js +37 -0
  19. package/dist/browser/store.js +0 -0
  20. package/dist/browser/types.js +14 -0
  21. package/dist/eval/index.d.ts +2 -0
  22. package/dist/eval/index.js +102 -0
  23. package/dist/eval/runner.d.ts +18 -0
  24. package/dist/eval/runner.js +102 -0
  25. package/dist/eval/types.d.ts +51 -0
  26. package/dist/eval/types.js +1 -0
  27. package/dist/in-memory-store.d.ts +17 -0
  28. package/dist/in-memory-store.js +93 -0
  29. package/dist/index.d.ts +4 -0
  30. package/dist/index.js +106 -0
  31. package/dist/ingesters/artificial-analysis.d.ts +8 -0
  32. package/dist/ingesters/artificial-analysis.js +150 -0
  33. package/dist/ingesters/chatbot-arena.d.ts +8 -0
  34. package/dist/ingesters/chatbot-arena.js +143 -0
  35. package/dist/ingesters/fetch-utils.d.ts +11 -0
  36. package/dist/ingesters/fetch-utils.js +40 -0
  37. package/dist/ingesters/index.d.ts +7 -0
  38. package/dist/ingesters/index.js +419 -0
  39. package/dist/ingesters/open-llm-leaderboard.d.ts +8 -0
  40. package/dist/ingesters/open-llm-leaderboard.js +109 -0
  41. package/dist/ingesters/registry.d.ts +17 -0
  42. package/dist/ingesters/registry.js +413 -0
  43. package/dist/ingesters/swe-bench.d.ts +8 -0
  44. package/dist/ingesters/swe-bench.js +106 -0
  45. package/dist/ingesters/types.d.ts +31 -0
  46. package/dist/ingesters/types.js +1 -0
  47. package/dist/node/eval/index.js +101 -0
  48. package/dist/node/eval/runner.js +101 -0
  49. package/dist/node/eval/types.js +0 -0
  50. package/dist/node/in-memory-store.js +92 -0
  51. package/dist/node/index.js +105 -0
  52. package/dist/node/ingesters/artificial-analysis.js +149 -0
  53. package/dist/node/ingesters/chatbot-arena.js +142 -0
  54. package/dist/node/ingesters/fetch-utils.js +39 -0
  55. package/dist/node/ingesters/index.js +418 -0
  56. package/dist/node/ingesters/open-llm-leaderboard.js +108 -0
  57. package/dist/node/ingesters/registry.js +412 -0
  58. package/dist/node/ingesters/swe-bench.js +105 -0
  59. package/dist/node/ingesters/types.js +0 -0
  60. package/dist/node/scoring/composite-scorer.js +122 -0
  61. package/dist/node/scoring/dimension-weights.js +39 -0
  62. package/dist/node/scoring/index.js +161 -0
  63. package/dist/node/scoring/normalizer.js +37 -0
  64. package/dist/node/store.js +0 -0
  65. package/dist/node/types.js +14 -0
  66. package/dist/scoring/composite-scorer.d.ts +10 -0
  67. package/dist/scoring/composite-scorer.js +123 -0
  68. package/dist/scoring/dimension-weights.d.ts +8 -0
  69. package/dist/scoring/dimension-weights.js +40 -0
  70. package/dist/scoring/index.d.ts +3 -0
  71. package/dist/scoring/index.js +162 -0
  72. package/dist/scoring/normalizer.d.ts +20 -0
  73. package/dist/scoring/normalizer.js +38 -0
  74. package/dist/store.d.ts +19 -0
  75. package/dist/store.js +1 -0
  76. package/dist/types.d.ts +100 -0
  77. package/dist/types.js +15 -0
  78. package/package.json +362 -0
@@ -0,0 +1,122 @@
1
+ // src/scoring/dimension-weights.ts
2
+ var DEFAULT_DIMENSION_WEIGHTS = [
3
+ { dimension: "coding", weight: 20 },
4
+ { dimension: "reasoning", weight: 20 },
5
+ { dimension: "agentic", weight: 15 },
6
+ { dimension: "cost", weight: 10 },
7
+ { dimension: "latency", weight: 10 },
8
+ { dimension: "context", weight: 10 },
9
+ { dimension: "safety", weight: 10 },
10
+ { dimension: "custom", weight: 5 }
11
+ ];
12
/**
 * Builds the dimension → weight lookup used for scoring.
 *
 * Starts from DEFAULT_DIMENSION_WEIGHTS and then applies `overrides`, so an
 * override replaces the default weight of the same dimension and may also
 * introduce dimensions that have no default.
 *
 * @param overrides Optional iterable of `{ dimension, weight }` entries.
 * @returns A new Map keyed by dimension name.
 */
function getWeightMap(overrides) {
  const weightByDimension = new Map();
  // Later layers win: defaults first, caller overrides second.
  const layers = [DEFAULT_DIMENSION_WEIGHTS, overrides || []];
  for (const layer of layers) {
    for (const { dimension, weight } of layer) {
      weightByDimension.set(dimension, weight);
    }
  }
  return weightByDimension;
}
24
+ function normalizeWeights(weights, activeDimensions) {
25
+ const totalWeight = activeDimensions.reduce((sum, dim) => sum + (weights.get(dim) ?? 0), 0);
26
+ if (totalWeight === 0)
27
+ return new Map;
28
+ const normalized = new Map;
29
+ for (const dim of activeDimensions) {
30
+ const raw = weights.get(dim) ?? 0;
31
+ normalized.set(dim, raw / totalWeight);
32
+ }
33
+ return normalized;
34
+ }
35
+
36
+ // src/scoring/composite-scorer.ts
37
/**
 * Turns a flat list of benchmark results into a ranked list of models.
 *
 * Results are grouped per model, aggregated per dimension, and combined into
 * a composite score: the weighted average of the dimension scores, with the
 * weights renormalised over the dimensions the model actually has data for.
 * Models are ordered by composite score, highest first, and ranked from 1.
 *
 * @param results Benchmark results to aggregate.
 * @param options Optional settings; `weightOverrides` replaces default weights.
 * @param existingRankings Optional Map of modelId → previous ranking, used
 *   only to fill `previousRank` (null when the model has no previous entry).
 * @returns New ranking objects sorted by descending composite score.
 */
function computeModelRankings(results, options, existingRankings) {
  const resultsByModel = groupByModel(results);
  const weightMap = getWeightMap(options?.weightOverrides);
  const roundTo2 = (value) => Math.round(value * 100) / 100;
  const rankings = [];
  resultsByModel.forEach((modelResults, modelId) => {
    const dimensionScores = computeDimensionScores(modelResults);
    const activeDimensions = Object.keys(dimensionScores);
    const shares = normalizeWeights(weightMap, activeDimensions);
    let weightedTotal = 0;
    for (const dim of activeDimensions) {
      const entry = dimensionScores[dim];
      if (!entry) {
        continue;
      }
      weightedTotal += entry.score * (shares.get(dim) ?? 0);
    }
    rankings.push({
      modelId,
      providerKey: modelResults[0]?.providerKey ?? "unknown",
      compositeScore: roundTo2(weightedTotal),
      dimensionScores,
      rank: 0, // placeholder, assigned after sorting
      previousRank: existingRankings?.get(modelId)?.rank ?? null,
      updatedAt: new Date()
    });
  });
  rankings.sort((left, right) => right.compositeScore - left.compositeScore);
  return rankings.map((ranking, position) => Object.assign({}, ranking, { rank: position + 1 }));
}
71
/**
 * Buckets benchmark results by their `modelId`.
 *
 * @param results Iterable of results carrying a `modelId` property.
 * @returns Map of modelId → results of that model. Keys appear in order of
 *   first occurrence and every bucket keeps the input order.
 */
function groupByModel(results) {
  const buckets = new Map();
  for (const entry of results) {
    const key = entry.modelId;
    if (!buckets.has(key)) {
      buckets.set(key, []);
    }
    buckets.get(key).push(entry);
  }
  return buckets;
}
83
/**
 * Aggregates one model's benchmark results into a score per dimension.
 *
 * For each dimension the entry contains:
 * - `score`: mean of the result scores, rounded to two decimals;
 * - `confidence`: half recency factor, half source diversity (number of
 *   distinct sources divided by 3, capped at 1), rounded to two decimals;
 * - `sources`: the distinct sources in order of first appearance.
 *
 * @param results Results of a single model.
 * @returns Plain object keyed by dimension name.
 */
function computeDimensionScores(results) {
  const resultsByDimension = new Map();
  for (const entry of results) {
    if (!resultsByDimension.has(entry.dimension)) {
      resultsByDimension.set(entry.dimension, []);
    }
    resultsByDimension.get(entry.dimension).push(entry);
  }
  const roundTo2 = (value) => Math.round(value * 100) / 100;
  const scores = {};
  resultsByDimension.forEach((dimResults, dimension) => {
    let scoreSum = 0;
    for (const entry of dimResults) {
      scoreSum += entry.score;
    }
    const sources = Array.from(new Set(dimResults.map((entry) => entry.source)));
    const recencyFactor = computeRecencyFactor(dimResults);
    const sourceDiversity = Math.min(sources.length / 3, 1);
    scores[dimension] = {
      score: roundTo2(scoreSum / dimResults.length),
      confidence: roundTo2(recencyFactor * 0.5 + sourceDiversity * 0.5),
      sources
    };
  });
  return scores;
}
108
+ function computeRecencyFactor(results) {
109
+ if (results.length === 0)
110
+ return 0;
111
+ const now = Date.now();
112
+ const mostRecent = Math.max(...results.map((r) => r.measuredAt.getTime()));
113
+ const daysSinceMostRecent = (now - mostRecent) / (1000 * 60 * 60 * 24);
114
+ if (daysSinceMostRecent <= 30)
115
+ return 1;
116
+ if (daysSinceMostRecent >= 180)
117
+ return 0.3;
118
+ return 1 - (daysSinceMostRecent - 30) / (180 - 30) * 0.7;
119
+ }
120
+ export {
121
+ computeModelRankings
122
+ };
@@ -0,0 +1,39 @@
1
+ // src/scoring/dimension-weights.ts
2
+ var DEFAULT_DIMENSION_WEIGHTS = [
3
+ { dimension: "coding", weight: 20 },
4
+ { dimension: "reasoning", weight: 20 },
5
+ { dimension: "agentic", weight: 15 },
6
+ { dimension: "cost", weight: 10 },
7
+ { dimension: "latency", weight: 10 },
8
+ { dimension: "context", weight: 10 },
9
+ { dimension: "safety", weight: 10 },
10
+ { dimension: "custom", weight: 5 }
11
+ ];
12
/**
 * Builds the dimension → weight lookup used for scoring.
 *
 * Starts from DEFAULT_DIMENSION_WEIGHTS and then applies `overrides`, so an
 * override replaces the default weight of the same dimension and may also
 * introduce dimensions that have no default.
 *
 * @param overrides Optional iterable of `{ dimension, weight }` entries.
 * @returns A new Map keyed by dimension name.
 */
function getWeightMap(overrides) {
  const weightByDimension = new Map();
  // Later layers win: defaults first, caller overrides second.
  const layers = [DEFAULT_DIMENSION_WEIGHTS, overrides || []];
  for (const layer of layers) {
    for (const { dimension, weight } of layer) {
      weightByDimension.set(dimension, weight);
    }
  }
  return weightByDimension;
}
24
+ function normalizeWeights(weights, activeDimensions) {
25
+ const totalWeight = activeDimensions.reduce((sum, dim) => sum + (weights.get(dim) ?? 0), 0);
26
+ if (totalWeight === 0)
27
+ return new Map;
28
+ const normalized = new Map;
29
+ for (const dim of activeDimensions) {
30
+ const raw = weights.get(dim) ?? 0;
31
+ normalized.set(dim, raw / totalWeight);
32
+ }
33
+ return normalized;
34
+ }
35
+ export {
36
+ normalizeWeights,
37
+ getWeightMap,
38
+ DEFAULT_DIMENSION_WEIGHTS
39
+ };
@@ -0,0 +1,161 @@
1
+ // src/scoring/normalizer.ts
2
/**
 * Raw score range of each known benchmark source. `min`/`max` bound the raw
 * scale and `invertScale` marks scales where a lower raw value is better.
 */
var SOURCE_NORMALIZATION = {
  "chatbot-arena": { min: 800, max: 1400, invertScale: false },
  "swe-bench": { min: 0, max: 100, invertScale: false },
  "human-eval": { min: 0, max: 100, invertScale: false },
  mmlu: { min: 0, max: 100, invertScale: false },
  gpqa: { min: 0, max: 100, invertScale: false },
  arc: { min: 0, max: 100, invertScale: false },
  truthfulqa: { min: 0, max: 100, invertScale: false },
  "tau-bench": { min: 0, max: 100, invertScale: false },
  "artificial-analysis": { min: 0, max: 100, invertScale: false }
};
/**
 * Maps a raw benchmark score onto the 0–100 scale.
 *
 * @param rawScore Score as reported by the source.
 * @param source Source key used to look up the range in SOURCE_NORMALIZATION.
 * @param configOverride Optional `{ min, max, invertScale }` that takes
 *   precedence over the built-in table.
 * @returns The rescaled score clamped to [0, 100]. Unknown sources are only
 *   clamped; a zero-width range yields 50.
 */
function normalizeScore(rawScore, source, configOverride) {
  // Own-property check: a bare `SOURCE_NORMALIZATION[source]` would resolve
  // inherited keys such as "constructor" or "toString" to truthy non-config
  // values and produce NaN instead of falling back to plain clamping.
  const knownConfig = Object.prototype.hasOwnProperty.call(SOURCE_NORMALIZATION, source) ? SOURCE_NORMALIZATION[source] : undefined;
  const config = configOverride ?? knownConfig;
  if (!config) {
    return Math.max(0, Math.min(100, rawScore));
  }
  const { min, max, invertScale } = config;
  const range = max - min;
  if (range === 0)
    return 50;
  let normalized = (rawScore - min) / range * 100;
  if (invertScale) {
    normalized = 100 - normalized;
  }
  return Math.max(0, Math.min(100, normalized));
}
28
/**
 * Returns copies of the given results with `score` rescaled to 0–100.
 *
 * The value fed to normalizeScore is `rawScore` when it is a number and the
 * existing `score` otherwise; the range is chosen by the result's `source`.
 *
 * @param results Array of benchmark results.
 * @returns New array of new result objects; the inputs are not mutated.
 */
function normalizeBenchmarkResults(results) {
  return results.map((result) => {
    const hasRawScore = typeof result.rawScore === "number";
    const input = hasRawScore ? result.rawScore : result.score;
    return { ...result, score: normalizeScore(input, result.source) };
  });
}
34
+
35
+ // src/scoring/dimension-weights.ts
36
+ var DEFAULT_DIMENSION_WEIGHTS = [
37
+ { dimension: "coding", weight: 20 },
38
+ { dimension: "reasoning", weight: 20 },
39
+ { dimension: "agentic", weight: 15 },
40
+ { dimension: "cost", weight: 10 },
41
+ { dimension: "latency", weight: 10 },
42
+ { dimension: "context", weight: 10 },
43
+ { dimension: "safety", weight: 10 },
44
+ { dimension: "custom", weight: 5 }
45
+ ];
46
/**
 * Builds the dimension → weight lookup used for scoring.
 *
 * Starts from DEFAULT_DIMENSION_WEIGHTS and then applies `overrides`, so an
 * override replaces the default weight of the same dimension and may also
 * introduce dimensions that have no default.
 *
 * @param overrides Optional iterable of `{ dimension, weight }` entries.
 * @returns A new Map keyed by dimension name.
 */
function getWeightMap(overrides) {
  const weightByDimension = new Map();
  // Later layers win: defaults first, caller overrides second.
  const layers = [DEFAULT_DIMENSION_WEIGHTS, overrides || []];
  for (const layer of layers) {
    for (const { dimension, weight } of layer) {
      weightByDimension.set(dimension, weight);
    }
  }
  return weightByDimension;
}
58
+ function normalizeWeights(weights, activeDimensions) {
59
+ const totalWeight = activeDimensions.reduce((sum, dim) => sum + (weights.get(dim) ?? 0), 0);
60
+ if (totalWeight === 0)
61
+ return new Map;
62
+ const normalized = new Map;
63
+ for (const dim of activeDimensions) {
64
+ const raw = weights.get(dim) ?? 0;
65
+ normalized.set(dim, raw / totalWeight);
66
+ }
67
+ return normalized;
68
+ }
69
+
70
+ // src/scoring/composite-scorer.ts
71
/**
 * Turns a flat list of benchmark results into a ranked list of models.
 *
 * Results are grouped per model, aggregated per dimension, and combined into
 * a composite score: the weighted average of the dimension scores, with the
 * weights renormalised over the dimensions the model actually has data for.
 * Models are ordered by composite score, highest first, and ranked from 1.
 *
 * @param results Benchmark results to aggregate.
 * @param options Optional settings; `weightOverrides` replaces default weights.
 * @param existingRankings Optional Map of modelId → previous ranking, used
 *   only to fill `previousRank` (null when the model has no previous entry).
 * @returns New ranking objects sorted by descending composite score.
 */
function computeModelRankings(results, options, existingRankings) {
  const resultsByModel = groupByModel(results);
  const weightMap = getWeightMap(options?.weightOverrides);
  const roundTo2 = (value) => Math.round(value * 100) / 100;
  const rankings = [];
  resultsByModel.forEach((modelResults, modelId) => {
    const dimensionScores = computeDimensionScores(modelResults);
    const activeDimensions = Object.keys(dimensionScores);
    const shares = normalizeWeights(weightMap, activeDimensions);
    let weightedTotal = 0;
    for (const dim of activeDimensions) {
      const entry = dimensionScores[dim];
      if (!entry) {
        continue;
      }
      weightedTotal += entry.score * (shares.get(dim) ?? 0);
    }
    rankings.push({
      modelId,
      providerKey: modelResults[0]?.providerKey ?? "unknown",
      compositeScore: roundTo2(weightedTotal),
      dimensionScores,
      rank: 0, // placeholder, assigned after sorting
      previousRank: existingRankings?.get(modelId)?.rank ?? null,
      updatedAt: new Date()
    });
  });
  rankings.sort((left, right) => right.compositeScore - left.compositeScore);
  return rankings.map((ranking, position) => Object.assign({}, ranking, { rank: position + 1 }));
}
105
/**
 * Buckets benchmark results by their `modelId`.
 *
 * @param results Iterable of results carrying a `modelId` property.
 * @returns Map of modelId → results of that model. Keys appear in order of
 *   first occurrence and every bucket keeps the input order.
 */
function groupByModel(results) {
  const buckets = new Map();
  for (const entry of results) {
    const key = entry.modelId;
    if (!buckets.has(key)) {
      buckets.set(key, []);
    }
    buckets.get(key).push(entry);
  }
  return buckets;
}
117
/**
 * Aggregates one model's benchmark results into a score per dimension.
 *
 * For each dimension the entry contains:
 * - `score`: mean of the result scores, rounded to two decimals;
 * - `confidence`: half recency factor, half source diversity (number of
 *   distinct sources divided by 3, capped at 1), rounded to two decimals;
 * - `sources`: the distinct sources in order of first appearance.
 *
 * @param results Results of a single model.
 * @returns Plain object keyed by dimension name.
 */
function computeDimensionScores(results) {
  const resultsByDimension = new Map();
  for (const entry of results) {
    if (!resultsByDimension.has(entry.dimension)) {
      resultsByDimension.set(entry.dimension, []);
    }
    resultsByDimension.get(entry.dimension).push(entry);
  }
  const roundTo2 = (value) => Math.round(value * 100) / 100;
  const scores = {};
  resultsByDimension.forEach((dimResults, dimension) => {
    let scoreSum = 0;
    for (const entry of dimResults) {
      scoreSum += entry.score;
    }
    const sources = Array.from(new Set(dimResults.map((entry) => entry.source)));
    const recencyFactor = computeRecencyFactor(dimResults);
    const sourceDiversity = Math.min(sources.length / 3, 1);
    scores[dimension] = {
      score: roundTo2(scoreSum / dimResults.length),
      confidence: roundTo2(recencyFactor * 0.5 + sourceDiversity * 0.5),
      sources
    };
  });
  return scores;
}
142
+ function computeRecencyFactor(results) {
143
+ if (results.length === 0)
144
+ return 0;
145
+ const now = Date.now();
146
+ const mostRecent = Math.max(...results.map((r) => r.measuredAt.getTime()));
147
+ const daysSinceMostRecent = (now - mostRecent) / (1000 * 60 * 60 * 24);
148
+ if (daysSinceMostRecent <= 30)
149
+ return 1;
150
+ if (daysSinceMostRecent >= 180)
151
+ return 0.3;
152
+ return 1 - (daysSinceMostRecent - 30) / (180 - 30) * 0.7;
153
+ }
154
+ export {
155
+ normalizeWeights,
156
+ normalizeScore,
157
+ normalizeBenchmarkResults,
158
+ getWeightMap,
159
+ computeModelRankings,
160
+ DEFAULT_DIMENSION_WEIGHTS
161
+ };
@@ -0,0 +1,37 @@
1
+ // src/scoring/normalizer.ts
2
/**
 * Raw score range of each known benchmark source. `min`/`max` bound the raw
 * scale and `invertScale` marks scales where a lower raw value is better.
 */
var SOURCE_NORMALIZATION = {
  "chatbot-arena": { min: 800, max: 1400, invertScale: false },
  "swe-bench": { min: 0, max: 100, invertScale: false },
  "human-eval": { min: 0, max: 100, invertScale: false },
  mmlu: { min: 0, max: 100, invertScale: false },
  gpqa: { min: 0, max: 100, invertScale: false },
  arc: { min: 0, max: 100, invertScale: false },
  truthfulqa: { min: 0, max: 100, invertScale: false },
  "tau-bench": { min: 0, max: 100, invertScale: false },
  "artificial-analysis": { min: 0, max: 100, invertScale: false }
};
/**
 * Maps a raw benchmark score onto the 0–100 scale.
 *
 * @param rawScore Score as reported by the source.
 * @param source Source key used to look up the range in SOURCE_NORMALIZATION.
 * @param configOverride Optional `{ min, max, invertScale }` that takes
 *   precedence over the built-in table.
 * @returns The rescaled score clamped to [0, 100]. Unknown sources are only
 *   clamped; a zero-width range yields 50.
 */
function normalizeScore(rawScore, source, configOverride) {
  // Own-property check: a bare `SOURCE_NORMALIZATION[source]` would resolve
  // inherited keys such as "constructor" or "toString" to truthy non-config
  // values and produce NaN instead of falling back to plain clamping.
  const knownConfig = Object.prototype.hasOwnProperty.call(SOURCE_NORMALIZATION, source) ? SOURCE_NORMALIZATION[source] : undefined;
  const config = configOverride ?? knownConfig;
  if (!config) {
    return Math.max(0, Math.min(100, rawScore));
  }
  const { min, max, invertScale } = config;
  const range = max - min;
  if (range === 0)
    return 50;
  let normalized = (rawScore - min) / range * 100;
  if (invertScale) {
    normalized = 100 - normalized;
  }
  return Math.max(0, Math.min(100, normalized));
}
28
/**
 * Returns copies of the given results with `score` rescaled to 0–100.
 *
 * The value fed to normalizeScore is `rawScore` when it is a number and the
 * existing `score` otherwise; the range is chosen by the result's `source`.
 *
 * @param results Array of benchmark results.
 * @returns New array of new result objects; the inputs are not mutated.
 */
function normalizeBenchmarkResults(results) {
  return results.map((result) => {
    const hasRawScore = typeof result.rawScore === "number";
    const input = hasRawScore ? result.rawScore : result.score;
    return { ...result, score: normalizeScore(input, result.source) };
  });
}
34
+ export {
35
+ normalizeScore,
36
+ normalizeBenchmarkResults
37
+ };
File without changes
@@ -0,0 +1,14 @@
1
+ // src/types.ts
2
/** The eight benchmark dimension identifiers exposed by the package. */
var BENCHMARK_DIMENSIONS = [
  "coding", "reasoning", "agentic", "cost",
  "latency", "context", "safety", "custom"
];
12
+ export {
13
+ BENCHMARK_DIMENSIONS
14
+ };
@@ -0,0 +1,2 @@
1
+ export type { EvalCase, EvalSuite, EvalCaseResult, EvalRunResult, EvalLLMAdapter, } from './types';
2
+ export { EvalRunner } from './runner';
@@ -0,0 +1,102 @@
1
+ // @bun
2
+ // src/eval/runner.ts
3
+ class EvalRunner {
4
+ adapter;
5
+ options;
6
+ constructor(adapter, options = {}) {
7
+ this.adapter = adapter;
8
+ this.options = options;
9
+ }
10
+ async run(suite, modelId, providerKey) {
11
+ const runId = `eval-${suite.key}-${modelId}-${Date.now()}`;
12
+ const startedAt = new Date;
13
+ const concurrency = this.options.maxConcurrency ?? 5;
14
+ const caseResults = await this.runCasesWithConcurrency(suite.cases, suite.defaultGrader, concurrency);
15
+ const passedCases = caseResults.filter((r) => r.passed).length;
16
+ const averageScore = caseResults.length > 0 ? caseResults.reduce((sum, r) => sum + r.score, 0) / caseResults.length : 0;
17
+ const averageLatencyMs = caseResults.length > 0 ? caseResults.reduce((sum, r) => sum + r.latencyMs, 0) / caseResults.length : 0;
18
+ return {
19
+ runId,
20
+ evalSuiteKey: suite.key,
21
+ modelId,
22
+ providerKey,
23
+ totalCases: suite.cases.length,
24
+ passedCases,
25
+ averageScore: Math.round(averageScore * 100) / 100,
26
+ averageLatencyMs: Math.round(averageLatencyMs),
27
+ caseResults,
28
+ startedAt,
29
+ completedAt: new Date
30
+ };
31
+ }
32
+ async runCasesWithConcurrency(cases, defaultGrader, concurrency) {
33
+ const results = [];
34
+ const queue = [...cases];
35
+ const workers = Array.from({ length: Math.min(concurrency, queue.length) }, async () => {
36
+ while (queue.length > 0) {
37
+ const evalCase = queue.shift();
38
+ if (!evalCase)
39
+ break;
40
+ const result = await this.runSingleCase(evalCase, defaultGrader);
41
+ results.push(result);
42
+ }
43
+ });
44
+ await Promise.all(workers);
45
+ return results;
46
+ }
47
+ async runSingleCase(evalCase, defaultGrader) {
48
+ try {
49
+ const { text, latencyMs } = await this.adapter.chat(evalCase.prompt);
50
+ const { passed, score } = this.grade(evalCase, text, defaultGrader);
51
+ return {
52
+ caseId: evalCase.id,
53
+ passed,
54
+ score,
55
+ response: text,
56
+ latencyMs
57
+ };
58
+ } catch (error) {
59
+ return {
60
+ caseId: evalCase.id,
61
+ passed: false,
62
+ score: 0,
63
+ response: "",
64
+ latencyMs: 0,
65
+ error: error instanceof Error ? error.message : String(error)
66
+ };
67
+ }
68
+ }
69
+ grade(evalCase, response, defaultGrader) {
70
+ const grader = evalCase.graderKey ?? defaultGrader;
71
+ switch (grader) {
72
+ case "exact":
73
+ if (!evalCase.expectedOutput)
74
+ return { passed: true, score: 1 };
75
+ return {
76
+ passed: response.trim() === evalCase.expectedOutput.trim(),
77
+ score: response.trim() === evalCase.expectedOutput.trim() ? 1 : 0
78
+ };
79
+ case "contains":
80
+ if (!evalCase.expectedOutput)
81
+ return { passed: true, score: 1 };
82
+ return {
83
+ passed: response.includes(evalCase.expectedOutput),
84
+ score: response.includes(evalCase.expectedOutput) ? 1 : 0
85
+ };
86
+ case "regex": {
87
+ if (!evalCase.expectedPattern)
88
+ return { passed: true, score: 1 };
89
+ const regex = new RegExp(evalCase.expectedPattern);
90
+ const matches = regex.test(response);
91
+ return { passed: matches, score: matches ? 1 : 0 };
92
+ }
93
+ case "llm-judge":
94
+ return { passed: true, score: 0.5 };
95
+ default:
96
+ return { passed: true, score: 0.5 };
97
+ }
98
+ }
99
+ }
100
+ export {
101
+ EvalRunner
102
+ };
@@ -0,0 +1,18 @@
1
+ import type { EvalLLMAdapter, EvalRunResult, EvalSuite } from './types';
2
/** Optional settings accepted by the `EvalRunner` constructor. */
interface EvalRunnerOptions {
    /**
     * Upper bound on the number of eval cases processed in parallel.
     * The compiled runner falls back to 5 when this is omitted.
     */
    maxConcurrency?: number;
}
5
/**
 * Runs every case of an eval suite through an LLM adapter and returns the
 * scored outcome, which can then be stored as BenchmarkResults.
 */
export declare class EvalRunner {
    /** Adapter used to obtain model responses. */
    private readonly adapter;
    /** Options supplied at construction time. */
    private readonly options;
    /**
     * @param adapter LLM adapter that answers the suite's prompts.
     * @param options Optional runner settings (e.g. `maxConcurrency`).
     */
    constructor(adapter: EvalLLMAdapter, options?: EvalRunnerOptions);
    /**
     * Executes `suite` and resolves with the aggregated run result.
     *
     * @param suite Suite to execute.
     * @param modelId Identifier of the evaluated model.
     * @param providerKey Identifier of the model's provider.
     */
    run(suite: EvalSuite, modelId: string, providerKey: string): Promise<EvalRunResult>;
    private runCasesWithConcurrency;
    private runSingleCase;
    private grade;
}
18
+ export {};
@@ -0,0 +1,102 @@
1
+ // @bun
2
+ // src/eval/runner.ts
3
+ class EvalRunner {
4
+ adapter;
5
+ options;
6
+ constructor(adapter, options = {}) {
7
+ this.adapter = adapter;
8
+ this.options = options;
9
+ }
10
+ async run(suite, modelId, providerKey) {
11
+ const runId = `eval-${suite.key}-${modelId}-${Date.now()}`;
12
+ const startedAt = new Date;
13
+ const concurrency = this.options.maxConcurrency ?? 5;
14
+ const caseResults = await this.runCasesWithConcurrency(suite.cases, suite.defaultGrader, concurrency);
15
+ const passedCases = caseResults.filter((r) => r.passed).length;
16
+ const averageScore = caseResults.length > 0 ? caseResults.reduce((sum, r) => sum + r.score, 0) / caseResults.length : 0;
17
+ const averageLatencyMs = caseResults.length > 0 ? caseResults.reduce((sum, r) => sum + r.latencyMs, 0) / caseResults.length : 0;
18
+ return {
19
+ runId,
20
+ evalSuiteKey: suite.key,
21
+ modelId,
22
+ providerKey,
23
+ totalCases: suite.cases.length,
24
+ passedCases,
25
+ averageScore: Math.round(averageScore * 100) / 100,
26
+ averageLatencyMs: Math.round(averageLatencyMs),
27
+ caseResults,
28
+ startedAt,
29
+ completedAt: new Date
30
+ };
31
+ }
32
+ async runCasesWithConcurrency(cases, defaultGrader, concurrency) {
33
+ const results = [];
34
+ const queue = [...cases];
35
+ const workers = Array.from({ length: Math.min(concurrency, queue.length) }, async () => {
36
+ while (queue.length > 0) {
37
+ const evalCase = queue.shift();
38
+ if (!evalCase)
39
+ break;
40
+ const result = await this.runSingleCase(evalCase, defaultGrader);
41
+ results.push(result);
42
+ }
43
+ });
44
+ await Promise.all(workers);
45
+ return results;
46
+ }
47
+ async runSingleCase(evalCase, defaultGrader) {
48
+ try {
49
+ const { text, latencyMs } = await this.adapter.chat(evalCase.prompt);
50
+ const { passed, score } = this.grade(evalCase, text, defaultGrader);
51
+ return {
52
+ caseId: evalCase.id,
53
+ passed,
54
+ score,
55
+ response: text,
56
+ latencyMs
57
+ };
58
+ } catch (error) {
59
+ return {
60
+ caseId: evalCase.id,
61
+ passed: false,
62
+ score: 0,
63
+ response: "",
64
+ latencyMs: 0,
65
+ error: error instanceof Error ? error.message : String(error)
66
+ };
67
+ }
68
+ }
69
+ grade(evalCase, response, defaultGrader) {
70
+ const grader = evalCase.graderKey ?? defaultGrader;
71
+ switch (grader) {
72
+ case "exact":
73
+ if (!evalCase.expectedOutput)
74
+ return { passed: true, score: 1 };
75
+ return {
76
+ passed: response.trim() === evalCase.expectedOutput.trim(),
77
+ score: response.trim() === evalCase.expectedOutput.trim() ? 1 : 0
78
+ };
79
+ case "contains":
80
+ if (!evalCase.expectedOutput)
81
+ return { passed: true, score: 1 };
82
+ return {
83
+ passed: response.includes(evalCase.expectedOutput),
84
+ score: response.includes(evalCase.expectedOutput) ? 1 : 0
85
+ };
86
+ case "regex": {
87
+ if (!evalCase.expectedPattern)
88
+ return { passed: true, score: 1 };
89
+ const regex = new RegExp(evalCase.expectedPattern);
90
+ const matches = regex.test(response);
91
+ return { passed: matches, score: matches ? 1 : 0 };
92
+ }
93
+ case "llm-judge":
94
+ return { passed: true, score: 0.5 };
95
+ default:
96
+ return { passed: true, score: 0.5 };
97
+ }
98
+ }
99
+ }
100
+ export {
101
+ EvalRunner
102
+ };
@@ -0,0 +1,51 @@
1
+ import type { BenchmarkDimension } from '../types';
2
/** A single prompt of an eval suite together with its expectations. */
export interface EvalCase {
    /** Identifier copied into the matching case result as `caseId`. */
    id: string;
    /** Prompt text sent to the LLM adapter. */
    prompt: string;
    /**
     * Expected response text. The compiled runner uses it for the "exact"
     * grader (compared after trimming both sides) and the "contains" grader
     * (substring check).
     */
    expectedOutput?: string;
    /**
     * Regular-expression source for the "regex" grader; the compiled runner
     * passes it to `new RegExp(...)` and tests the response against it.
     * Plain substring matching goes through `expectedOutput` with the
     * "contains" grader instead.
     */
    expectedPattern?: string;
    /**
     * Per-case grader selection that takes precedence over the suite's
     * `defaultGrader`. The compiled runner recognises "exact", "contains",
     * "regex" and "llm-judge"; any other key is not evaluated and is reported
     * as a pass with score 0.5.
     */
    graderKey?: string;
    /** Free-form data attached to the case; not read by the compiled runner. */
    metadata?: Record<string, unknown>;
}
12
/** A named collection of eval cases targeting one benchmark dimension. */
export interface EvalSuite {
    /** Suite identifier; the runner embeds it in the run id and result. */
    key: string;
    /** Human-readable suite name. */
    displayName: string;
    /** Human-readable explanation of the suite. */
    description: string;
    /** Benchmark dimension this suite is associated with. */
    dimension: BenchmarkDimension;
    /** Cases executed when the suite is run. */
    cases: EvalCase[];
    /** Grading strategy applied to cases that do not name their own grader. */
    defaultGrader: 'exact' | 'contains' | 'regex' | 'llm-judge';
}
21
/** Outcome of running and grading one eval case. */
export interface EvalCaseResult {
    /** `id` of the case this result belongs to. */
    caseId: string;
    /** Whether the grader accepted the response. */
    passed: boolean;
    /** Numeric grade; the compiled runner emits 0, 0.5 or 1. */
    score: number;
    /** Response text returned by the adapter (empty when the call failed). */
    response: string;
    /** Latency reported by the adapter, in milliseconds (0 on failure). */
    latencyMs: number;
    /** Error message, present only when running the case threw. */
    error?: string;
}
29
/** Aggregated outcome of executing one eval suite against one model. */
export interface EvalRunResult {
    /** Identifier of this run. */
    runId: string;
    /** `key` of the suite that was executed. */
    evalSuiteKey: string;
    /** Identifier of the evaluated model. */
    modelId: string;
    /** Identifier of the model's provider. */
    providerKey: string;
    /** Number of cases in the suite. */
    totalCases: number;
    /** Number of case results flagged as passed. */
    passedCases: number;
    /** Mean case score (the compiled runner rounds it to two decimals). */
    averageScore: number;
    /** Mean case latency in milliseconds (rounded to an integer by the runner). */
    averageLatencyMs: number;
    /** Individual case outcomes. */
    caseResults: EvalCaseResult[];
    /** Time at which the run began. */
    startedAt: Date;
    /** Time at which the run finished. */
    completedAt: Date;
}
42
/**
 * Minimal interface to an LLM provider as needed for eval execution.
 * Deliberately small so that evals are not tied to any particular SDK.
 */
export interface EvalLLMAdapter {
    /**
     * Sends a prompt to the model.
     *
     * @param prompt Prompt text.
     * @returns The response text and the latency in milliseconds.
     */
    chat(prompt: string): Promise<{
        text: string;
        latencyMs: number;
    }>;
}
@@ -0,0 +1 @@
1
+ // @bun
@@ -0,0 +1,17 @@
1
+ import type { ProviderRankingStore } from './store';
2
+ import type { BenchmarkResult, BenchmarkResultListResult, BenchmarkResultQuery, IngestionRun, ModelProfile, ModelRanking, RankingListResult, RankingQuery } from './types';
3
/**
 * Type declaration of the in-memory `ProviderRankingStore` implementation.
 * Only signatures are visible here; the behaviour notes below are inferred
 * from names and types — NOTE(review): confirm against the implementation.
 */
export declare class InMemoryProviderRankingStore implements ProviderRankingStore {
    private benchmarkResults;
    private modelRankings;
    private ingestionRuns;
    /** Stores a benchmark result (presumably insert-or-replace). */
    upsertBenchmarkResult(result: BenchmarkResult): Promise<void>;
    /** Resolves with the benchmark result for `id`, or null. */
    getBenchmarkResult(id: string): Promise<BenchmarkResult | null>;
    /** Lists benchmark results selected by `query`. */
    listBenchmarkResults(query: BenchmarkResultQuery): Promise<BenchmarkResultListResult>;
    /** Stores a model ranking (presumably insert-or-replace). */
    upsertModelRanking(ranking: ModelRanking): Promise<void>;
    /** Resolves with the ranking for `modelId`, or null. */
    getModelRanking(modelId: string): Promise<ModelRanking | null>;
    /** Lists model rankings selected by `query`. */
    listModelRankings(query: RankingQuery): Promise<RankingListResult>;
    /** Resolves with the profile for `modelId`, or null. */
    getModelProfile(modelId: string): Promise<ModelProfile | null>;
    /** Records a new ingestion run. */
    createIngestionRun(run: IngestionRun): Promise<void>;
    /** Applies a partial update to the ingestion run identified by `id`. */
    updateIngestionRun(id: string, update: Partial<IngestionRun>): Promise<void>;
    /** Resolves with the ingestion run for `id`, or null. */
    getIngestionRun(id: string): Promise<IngestionRun | null>;
}