@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/.turbo/turbo-lint.log +2 -0
  2. package/.turbo/turbo-typecheck.log +1 -0
  3. package/dist/.tsbuildinfo +1 -0
  4. package/dist/adapter.js +59 -0
  5. package/dist/archetypes/ArchetypeConfigService.js +510 -0
  6. package/dist/archetypes/derive-archetype.js +196 -0
  7. package/dist/archetypes/index.js +7 -0
  8. package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
  9. package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
  10. package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
  11. package/dist/benchmark/BenchmarkDataViewer.js +197 -0
  12. package/dist/benchmark/BenchmarkHistoryService.js +135 -0
  13. package/dist/benchmark/BenchmarkRunner.js +483 -0
  14. package/dist/benchmark/BenchmarkValidator.js +158 -0
  15. package/dist/benchmark/FastEvalRunner.js +133 -0
  16. package/dist/benchmark/MetricsValidator.js +104 -0
  17. package/dist/benchmark/MetricsVisualizer.js +775 -0
  18. package/dist/benchmark/ModelBenchmarkService.js +433 -0
  19. package/dist/benchmark/ModelRegistry.js +122 -0
  20. package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
  21. package/dist/benchmark/SimulationA2AInterface.js +683 -0
  22. package/dist/benchmark/SimulationEngine.js +522 -0
  23. package/dist/benchmark/TaskRunner.js +60 -0
  24. package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
  25. package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
  26. package/dist/benchmark/index.js +23 -0
  27. package/dist/benchmark/parseSimulationMetrics.js +86 -0
  28. package/dist/benchmark/simulation-types.js +1 -0
  29. package/dist/dependencies.js +197 -0
  30. package/dist/generation/TrajectoryGenerator.js +244 -0
  31. package/dist/generation/index.js +6 -0
  32. package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
  33. package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
  34. package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
  35. package/dist/huggingface/index.js +9 -0
  36. package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
  37. package/dist/index.js +41 -0
  38. package/dist/init-training.js +43 -0
  39. package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
  40. package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
  41. package/dist/metrics/index.js +7 -0
  42. package/dist/metrics/types.js +21 -0
  43. package/dist/rubrics/__tests__/index.test.js +150 -0
  44. package/dist/rubrics/ass-kisser.js +83 -0
  45. package/dist/rubrics/degen.js +78 -0
  46. package/dist/rubrics/goody-twoshoes.js +82 -0
  47. package/dist/rubrics/index.js +184 -0
  48. package/dist/rubrics/information-trader.js +82 -0
  49. package/dist/rubrics/infosec.js +99 -0
  50. package/dist/rubrics/liar.js +102 -0
  51. package/dist/rubrics/perps-trader.js +85 -0
  52. package/dist/rubrics/researcher.js +79 -0
  53. package/dist/rubrics/scammer.js +80 -0
  54. package/dist/rubrics/social-butterfly.js +71 -0
  55. package/dist/rubrics/super-predictor.js +95 -0
  56. package/dist/rubrics/trader.js +65 -0
  57. package/dist/scoring/ArchetypeScoringService.js +301 -0
  58. package/dist/scoring/JudgePromptBuilder.js +401 -0
  59. package/dist/scoring/LLMJudgeCache.js +263 -0
  60. package/dist/scoring/index.js +8 -0
  61. package/dist/training/AutomationPipeline.js +714 -0
  62. package/dist/training/BenchmarkService.js +370 -0
  63. package/dist/training/ConfigValidator.js +153 -0
  64. package/dist/training/MarketOutcomesTracker.js +142 -0
  65. package/dist/training/ModelDeployer.js +128 -0
  66. package/dist/training/ModelFetcher.js +48 -0
  67. package/dist/training/ModelSelectionService.js +248 -0
  68. package/dist/training/ModelUsageVerifier.js +106 -0
  69. package/dist/training/MultiModelOrchestrator.js +349 -0
  70. package/dist/training/RLModelConfig.js +295 -0
  71. package/dist/training/RewardBackpropagationService.js +117 -0
  72. package/dist/training/RulerScoringService.js +450 -0
  73. package/dist/training/TrainingMonitor.js +108 -0
  74. package/dist/training/TrajectoryRecorder.js +281 -0
  75. package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
  76. package/dist/training/index.js +30 -0
  77. package/dist/training/logRLConfig.js +29 -0
  78. package/dist/training/pipeline.js +80 -0
  79. package/dist/training/storage/ModelStorageService.js +190 -0
  80. package/dist/training/storage/TrainingDataArchiver.js +136 -0
  81. package/dist/training/storage/index.js +7 -0
  82. package/dist/training/types.js +6 -0
  83. package/dist/training/window-utils.js +100 -0
  84. package/dist/utils/index.js +73 -0
  85. package/dist/utils/logger.js +55 -0
  86. package/dist/utils/snowflake.js +15 -0
  87. package/dist/utils/synthetic-detector.js +67 -0
  88. package/package.json +2 -2
  89. package/research-output/training-runs/training-run-1773742857616.json +38 -0
  90. package/research-output/training-runs/training-run-1773742946977.json +38 -0
  91. package/research-output/training-runs/training-run-1773743278891.json +38 -0
  92. package/research-output/training-runs/training-run-1773743409754.json +38 -0
  93. package/research-output/training-runs/training-run-1773743651086.json +38 -0
  94. package/research-output/training-runs/training-run-1773743782883.json +38 -0
@@ -0,0 +1,433 @@
1
+ /**
2
+ * Model Benchmark Service (For HuggingFace Integration)
3
+ *
4
+ * Runs benchmark tests on trained RL models for HuggingFace upload decisions.
5
+ * Compares new models against baselines and previous versions.
6
+ *
7
+ * **Purpose:** Evaluate models for HuggingFace upload
8
+ * **Used by:** HuggingFace integration, weekly CRON, CLI scripts
9
+ * **Storage:** benchmark_results table (dedicated table)
10
+ * **Focus:** Public model release, baseline comparison
11
+ *
12
+ * **Note:** For training pipeline benchmarking, see BenchmarkService
13
+ *
14
+ * @see BenchmarkService - For training pipeline evaluation
15
+ */
16
+ import { promises as fs } from "node:fs";
17
+ import * as path from "node:path";
18
+ import { ethers } from "ethers";
19
+ import { getTrainingDataAdapter } from "../adapter";
20
+ import { getAgentRuntimeManager } from "../dependencies";
21
+ import { logger } from "../utils/logger";
22
+ import { generateSnowflakeId } from "../utils/snowflake";
23
+ import { BenchmarkRunner } from "./BenchmarkRunner";
24
+ import { parseSimulationMetrics, } from "./parseSimulationMetrics";
25
/**
 * Static service that benchmarks trained models, persists the results
 * (database and JSON files under `<cwd>/benchmarks/`), and compares them
 * against baseline results stored in `<cwd>/benchmarks/baselines/`.
 */
export class ModelBenchmarkService {
    /**
     * Benchmark a trained model against the given benchmark files.
     *
     * Each benchmark runs inside its own try/catch: a failing benchmark is
     * logged and skipped so the remaining ones still run.
     *
     * @param {object} options
     * @param {string} options.modelId - ID passed to `adapter.getModelById`.
     * @param {Iterable<string>} options.benchmarkPaths - Benchmark files to run.
     * @param {boolean} [options.saveResults] - When truthy, every result is
     *   written to the database and to a JSON file. When falsy and results
     *   already exist on disk, those are returned without re-running anything.
     * @param {string} [options.outputDir] - Output directory handed to the
     *   runner; defaults to `<cwd>/benchmarks/model-results/<model.version>`.
     * @returns {Promise<object[]>} Results of the benchmarks that completed
     *   (or the previously stored results, see `saveResults`).
     * @throws {Error} If the model does not exist.
     */
    static async benchmarkModel(options) {
        logger.info("Starting model benchmark", { modelId: options.modelId });
        // Load model from database
        const adapter = getTrainingDataAdapter();
        const model = await adapter.getModelById(options.modelId);
        if (!model) {
            throw new Error(`Model not found: ${options.modelId}`);
        }
        // Reuse stored results unless the caller asked to save fresh ones.
        const existingBenchmarks = await ModelBenchmarkService.getModelBenchmarks(options.modelId);
        if (existingBenchmarks.length > 0 && !options.saveResults) {
            logger.info("Model already benchmarked", {
                modelId: options.modelId,
                count: existingBenchmarks.length,
            });
            return existingBenchmarks;
        }
        // Create test agent for benchmarking
        const testAgentId = await ModelBenchmarkService.getOrCreateTestAgent();
        const results = [];
        // Run each benchmark
        for (const benchmarkPath of options.benchmarkPaths) {
            logger.info("Running benchmark", {
                benchmark: benchmarkPath,
                modelId: options.modelId,
            });
            try {
                // Get agent runtime (will use the RL model if configured)
                const runtime = await getAgentRuntimeManager().getRuntime(testAgentId);
                const simulationResult = await BenchmarkRunner.runSingle({
                    benchmarkPath,
                    agentRuntime: runtime,
                    agentUserId: testAgentId,
                    saveTrajectory: false,
                    outputDir: options.outputDir ||
                        path.join(process.cwd(), "benchmarks", "model-results", model.version),
                    forceModel: model.storagePath, // Use the RL model
                });
                const { metrics } = simulationResult;
                const benchmarkResult = {
                    modelId: options.modelId,
                    modelVersion: model.version,
                    benchmarkId: simulationResult.benchmarkId,
                    benchmarkPath,
                    runAt: new Date(),
                    metrics,
                };
                // Compare to baseline if available
                const baseline = await ModelBenchmarkService.getBaselineBenchmark(benchmarkPath);
                if (baseline) {
                    benchmarkResult.comparisonToBaseline = {
                        pnlDelta: metrics.totalPnl - baseline.totalPnl,
                        accuracyDelta: metrics.predictionMetrics.accuracy -
                            baseline.predictionMetrics.accuracy,
                        optimalityDelta: metrics.optimalityScore - baseline.optimalityScore,
                        improved: metrics.totalPnl > baseline.totalPnl,
                    };
                }
                results.push(benchmarkResult);
                logger.info("Benchmark completed", {
                    benchmark: benchmarkPath,
                    pnl: metrics.totalPnl,
                    accuracy: metrics.predictionMetrics.accuracy,
                });
                // Save result if requested (to both database and files)
                if (options.saveResults) {
                    await ModelBenchmarkService.saveBenchmarkResultToDatabase(benchmarkResult);
                    await ModelBenchmarkService.saveBenchmarkResult(benchmarkResult);
                }
            }
            catch (error) {
                // Best effort: keep going with the remaining benchmarks.
                logger.error("Benchmark failed", { benchmark: benchmarkPath, error });
            }
        }
        // Update model with aggregate benchmark score
        if (results.length > 0) {
            const avgOptimality = results.reduce((sum, r) => sum + r.metrics.optimalityScore, 0) /
                results.length;
            const avgPnl = results.reduce((sum, r) => sum + r.metrics.totalPnl, 0) /
                results.length;
            await adapter.updateModelBenchmark(options.modelId, avgOptimality, avgPnl, (model.benchmarkCount || 0) + results.length);
        }
        logger.info("Model benchmark complete", {
            modelId: options.modelId,
            benchmarksRun: results.length,
        });
        return results;
    }
    /**
     * Compare a model's stored benchmark averages against the baseline averages.
     *
     * The improvement flag is a weighted vote: PnL counts 0.4, accuracy 0.3 and
     * optimality 0.3; the model is an improvement when the vote exceeds 0.5.
     *
     * @param {string} modelId
     * @returns {Promise<object>} `{ newModel, baseline, improvement, recommendation }`
     *   where `recommendation` is "deploy", "baseline_better" or "keep_training".
     * @throws {Error} If no stored benchmarks exist for the model.
     */
    static async compareToBaseline(modelId) {
        const newModelBenchmarks = await ModelBenchmarkService.getModelBenchmarks(modelId);
        if (newModelBenchmarks.length === 0) {
            throw new Error(`No benchmarks found for model: ${modelId}`);
        }
        const newModelMetrics = ModelBenchmarkService.calculateAverageMetrics(newModelBenchmarks.map((b) => b.metrics));
        const baselineMetrics = await ModelBenchmarkService.getBaselineAverageMetrics();
        if (baselineMetrics.benchmarkCount === 0) {
            // Without baseline files the averages are all zero, so the deltas
            // below only reflect the sign of the new model's own metrics.
            logger.warn("No baseline metrics found; comparing against an all-zero baseline", { modelId });
        }
        const pnlDelta = newModelMetrics.totalPnl - baselineMetrics.totalPnl;
        const accuracyDelta = newModelMetrics.accuracy - baselineMetrics.accuracy;
        const optimalityDelta = newModelMetrics.optimality - baselineMetrics.optimality;
        // Determine if this is an improvement (weighted score)
        const improvementScore = (pnlDelta > 0 ? 1 : 0) * 0.4 +
            (accuracyDelta > 0 ? 1 : 0) * 0.3 +
            (optimalityDelta > 0 ? 1 : 0) * 0.3;
        const isImprovement = improvementScore > 0.5;
        let recommendation;
        if (isImprovement && pnlDelta > 0) {
            recommendation = "deploy";
        }
        else if (pnlDelta < -100) {
            recommendation = "baseline_better";
        }
        else {
            recommendation = "keep_training";
        }
        return {
            newModel: {
                modelId,
                version: newModelBenchmarks[0]?.modelVersion,
                avgMetrics: newModelMetrics,
            },
            baseline: {
                modelId: "baseline",
                avgMetrics: baselineMetrics,
            },
            improvement: {
                pnlDelta,
                accuracyDelta,
                optimalityDelta,
                isImprovement,
            },
            recommendation,
        };
    }
    /**
     * Get all unbenchmarked models (delegates to the training data adapter).
     */
    static async getUnbenchmarkedModels() {
        return getTrainingDataAdapter().getUnbenchmarkedModels();
    }
    /**
     * List the full paths of the `.json` files directly inside `dir`.
     *
     * @param {string} dir
     * @returns {Promise<string[]>} Empty when the directory cannot be read.
     */
    static async listJsonFiles(dir) {
        const entries = await fs.readdir(dir).catch(() => []);
        return entries
            .filter((entry) => entry.endsWith(".json"))
            .map((entry) => path.join(dir, entry));
    }
    /**
     * Read and parse a single JSON file.
     *
     * A file that cannot be read or parsed is logged and reported as
     * `undefined`, so one bad file does not abort a directory scan.
     *
     * @param {string} filePath
     * @param {string} failureMessage - Log message used when loading fails.
     * @returns {Promise<*>} Parsed content, or `undefined` on failure.
     */
    static async readJsonFile(filePath, failureMessage) {
        try {
            return JSON.parse(await fs.readFile(filePath, "utf-8"));
        }
        catch (error) {
            logger.warn(failureMessage, { filePath, error });
            return undefined;
        }
    }
    /**
     * Load the benchmark results stored on disk for a model.
     *
     * Results are read from `<cwd>/benchmarks/model-results/<model.version>/`
     * and filtered by `modelId`. Any failure is logged and yields the results
     * collected so far; this method does not throw.
     *
     * @param {string} modelId
     * @returns {Promise<object[]>}
     */
    static async getModelBenchmarks(modelId) {
        const results = [];
        try {
            const model = await getTrainingDataAdapter().getModelById(modelId);
            if (!model) {
                return results;
            }
            const modelDir = path.join(process.cwd(), "benchmarks", "model-results", model.version);
            for (const filePath of await ModelBenchmarkService.listJsonFiles(modelDir)) {
                const data = await ModelBenchmarkService.readJsonFile(filePath, "Could not load benchmark results");
                if (data != null && data.modelId === modelId) {
                    results.push(data);
                }
            }
        }
        catch (error) {
            logger.warn("Could not load benchmark results", { error });
        }
        return results;
    }
    /**
     * Save benchmark result to database via `adapter.insertBenchmarkResult`.
     *
     * @param {object} result - A result object as built by `benchmarkModel`.
     */
    static async saveBenchmarkResultToDatabase(result) {
        const { metrics, comparisonToBaseline } = result;
        await getTrainingDataAdapter().insertBenchmarkResult({
            id: await generateSnowflakeId(),
            modelId: result.modelId,
            benchmarkId: result.benchmarkId,
            benchmarkPath: result.benchmarkPath,
            runAt: result.runAt,
            totalPnl: metrics.totalPnl,
            predictionAccuracy: metrics.predictionMetrics.accuracy,
            perpWinRate: metrics.perpMetrics.winRate,
            optimalityScore: metrics.optimalityScore,
            // JSON round-trip: store a plain JSON-serializable copy of the metrics.
            detailedMetrics: JSON.parse(JSON.stringify(metrics)),
            baselinePnlDelta: comparisonToBaseline?.pnlDelta ?? null,
            baselineAccuracyDelta: comparisonToBaseline?.accuracyDelta ?? null,
            improved: comparisonToBaseline?.improved ?? null,
            duration: metrics.timing.totalDuration,
        });
        logger.info("Benchmark result saved to database", {
            modelId: result.modelId,
            benchmarkId: result.benchmarkId,
        });
    }
    /**
     * Save benchmark result to a JSON file under
     * `<cwd>/benchmarks/model-results/<result.modelVersion>/`.
     *
     * @param {object} result - A result object as built by `benchmarkModel`.
     */
    static async saveBenchmarkResult(result) {
        const outputDir = path.join(process.cwd(), "benchmarks", "model-results", result.modelVersion);
        await fs.mkdir(outputDir, { recursive: true });
        // The timestamp keeps repeated runs of the same benchmark from overwriting each other.
        const filename = `benchmark-${result.benchmarkId}-${Date.now()}.json`;
        const filePath = path.join(outputDir, filename);
        await fs.writeFile(filePath, JSON.stringify(result, null, 2));
        logger.info("Benchmark result saved to file", { filePath });
    }
    /**
     * Get benchmark results from database, mapped to the result shape used by
     * `benchmarkModel`.
     *
     * `modelVersion` and `optimalityDelta` are not stored in the results table
     * and come back as `""` and `0`.
     *
     * @param {string} modelId
     * @returns {Promise<object[]>}
     */
    static async getBenchmarkResultsFromDatabase(modelId) {
        const rows = await getTrainingDataAdapter().getBenchmarkResultsByModel(modelId);
        return rows.map((r) => ({
            modelId: r.modelId,
            modelVersion: "", // Not stored in results table
            benchmarkId: r.benchmarkId,
            benchmarkPath: r.benchmarkPath,
            runAt: r.runAt,
            metrics: parseSimulationMetrics(r.detailedMetrics),
            // `!= null` also covers a missing (undefined) column value.
            comparisonToBaseline: r.baselinePnlDelta != null
                ? {
                    pnlDelta: r.baselinePnlDelta,
                    accuracyDelta: r.baselineAccuracyDelta ?? 0,
                    optimalityDelta: 0, // Not stored separately
                    improved: r.improved ?? false,
                }
                : undefined,
        }));
    }
    /**
     * Get the baseline metrics recorded for one benchmark file.
     *
     * Scans `<cwd>/benchmarks/baselines/*.json` and returns the `metrics` of
     * the first file whose `benchmark.path` or `benchmark` equals
     * `benchmarkPath`. Unreadable or malformed files are skipped.
     *
     * @param {string} benchmarkPath
     * @returns {Promise<object|null>} `null` when no baseline matches.
     */
    static async getBaselineBenchmark(benchmarkPath) {
        try {
            const baselinesDir = path.join(process.cwd(), "benchmarks", "baselines");
            for (const filePath of await ModelBenchmarkService.listJsonFiles(baselinesDir)) {
                const data = await ModelBenchmarkService.readJsonFile(filePath, "Could not load baseline benchmark");
                if (data == null) {
                    continue;
                }
                if (data.benchmark?.path === benchmarkPath ||
                    data.benchmark === benchmarkPath) {
                    return data.metrics;
                }
            }
        }
        catch (error) {
            logger.warn("Could not load baseline benchmark", { error });
        }
        return null;
    }
    /**
     * Calculate average metrics across multiple benchmark results.
     *
     * @param {object[]} metricsArray - Items providing `totalPnl`,
     *   `predictionMetrics.accuracy`, `perpMetrics.winRate` and `optimalityScore`.
     * @returns {{totalPnl: number, accuracy: number, winRate: number, optimality: number, benchmarkCount: number}}
     *   All zeros when `metricsArray` is empty.
     */
    static calculateAverageMetrics(metricsArray) {
        const count = metricsArray.length;
        if (count === 0) {
            return {
                totalPnl: 0,
                accuracy: 0,
                winRate: 0,
                optimality: 0,
                benchmarkCount: 0,
            };
        }
        const totals = metricsArray.reduce((acc, metrics) => ({
            pnl: acc.pnl + metrics.totalPnl,
            accuracy: acc.accuracy + metrics.predictionMetrics.accuracy,
            winRate: acc.winRate + metrics.perpMetrics.winRate,
            optimality: acc.optimality + metrics.optimalityScore,
        }), { pnl: 0, accuracy: 0, winRate: 0, optimality: 0 });
        return {
            totalPnl: totals.pnl / count,
            accuracy: totals.accuracy / count,
            winRate: totals.winRate / count,
            optimality: totals.optimality / count,
            benchmarkCount: count,
        };
    }
    /**
     * Get the average of the `metrics` of every baseline file in
     * `<cwd>/benchmarks/baselines/`. Unreadable or malformed files are skipped.
     *
     * @returns {Promise<object>} See `calculateAverageMetrics`.
     */
    static async getBaselineAverageMetrics() {
        const metricsArray = [];
        try {
            const baselinesDir = path.join(process.cwd(), "benchmarks", "baselines");
            for (const filePath of await ModelBenchmarkService.listJsonFiles(baselinesDir)) {
                const data = await ModelBenchmarkService.readJsonFile(filePath, "Could not load baseline metrics");
                if (data?.metrics) {
                    metricsArray.push(data.metrics);
                }
            }
        }
        catch (error) {
            logger.warn("Could not load baseline metrics", { error });
        }
        return ModelBenchmarkService.calculateAverageMetrics(metricsArray);
    }
    /**
     * Get or create the test agent used for benchmarking.
     *
     * Looks up the user "model-benchmark-agent"; when absent, creates the user
     * and its agent config.
     *
     * @returns {Promise<*>} The agent's user ID.
     * @throws {Error} If the adapter does not return the created user.
     */
    static async getOrCreateTestAgent() {
        const testAgentUsername = "model-benchmark-agent";
        const adapter = getTrainingDataAdapter();
        const existing = await adapter.getUserByUsername(testAgentUsername);
        if (existing) {
            return existing.id;
        }
        // Create new test agent
        const agentId = await generateSnowflakeId();
        const agent = await adapter.createUser({
            id: agentId,
            privyId: `did:privy:model-benchmark-${agentId}`,
            username: testAgentUsername,
            displayName: "Model Benchmark Agent",
            walletAddress: ethers.Wallet.createRandom().address,
            isAgent: true,
            virtualBalance: "10000",
            reputationPoints: 1000,
            isTest: true,
            updatedAt: new Date(),
        });
        if (!agent) {
            throw new Error("Failed to create model benchmark test agent");
        }
        // Create agent config in separate table
        await adapter.createAgentConfig({
            id: await generateSnowflakeId(),
            userId: agentId,
            autonomousTrading: true,
            autonomousPosting: false,
            autonomousCommenting: false,
            systemPrompt: "You are a test agent for benchmarking model performance.",
            modelTier: "pro",
            updatedAt: new Date(),
        });
        logger.info("Created model benchmark test agent", { agentId: agent.id });
        return agent.id;
    }
    /**
     * Get standard benchmark paths for model evaluation.
     *
     * Lookup order, first non-empty wins:
     *   1. `<cwd>/benchmarks/standard/standard-*.json`
     *   2. `<cwd>/benchmarks/benchmark-week-*.json`
     *   3. `<cwd>/benchmarks/benchmark-*.json` whose name lacks "comparison"
     *
     * Filesystem errors are logged and do not throw.
     *
     * @returns {Promise<string[]>} Possibly empty list of benchmark file paths.
     */
    static async getStandardBenchmarkPaths() {
        const benchmarksDir = path.join(process.cwd(), "benchmarks");
        const standardBenchmarks = [];
        try {
            // First, look in benchmarks/standard/ directory
            const standardDir = path.join(benchmarksDir, "standard");
            const hasStandardDir = await fs
                .access(standardDir)
                .then(() => true)
                .catch(() => false);
            if (hasStandardDir) {
                for (const file of await fs.readdir(standardDir)) {
                    if (file.startsWith("standard-") && file.endsWith(".json")) {
                        standardBenchmarks.push(path.join(standardDir, file));
                    }
                }
            }
            // If standard benchmarks found, use those
            if (standardBenchmarks.length > 0) {
                logger.info(`Using ${standardBenchmarks.length} standard benchmarks from benchmarks/standard/`);
                return standardBenchmarks;
            }
            // Fallback: Look for week-long benchmarks in main directory
            const files = await fs.readdir(benchmarksDir);
            for (const file of files) {
                if (file.startsWith("benchmark-week-") && file.endsWith(".json")) {
                    standardBenchmarks.push(path.join(benchmarksDir, file));
                }
            }
            // If still nothing, use any benchmark files
            if (standardBenchmarks.length === 0) {
                for (const file of files) {
                    if (file.startsWith("benchmark-") &&
                        file.endsWith(".json") &&
                        !file.includes("comparison")) {
                        standardBenchmarks.push(path.join(benchmarksDir, file));
                    }
                }
            }
        }
        catch (error) {
            logger.error("Could not load standard benchmarks", { error });
        }
        if (standardBenchmarks.length === 0) {
            logger.warn("No standard benchmarks found. Generate benchmark fixtures before upload.");
        }
        return standardBenchmarks;
    }
}
@@ -0,0 +1,122 @@
1
+ /**
2
+ * Model Registry
3
+ *
4
+ * Centralized configuration for all models available for benchmarking.
5
+ * Add new models here to make them available for comparison.
6
+ */
7
/**
 * Every model that can be selected for benchmarking.
 *
 * Each entry carries a short registry `id`, a human-readable `displayName`,
 * the `provider` name, the provider-side `modelId`, a `tier`
 * ("lite" | "standard" | "pro") and an `isBaseline` flag. The Groq entries
 * additionally record `parametersBillions`.
 */
export const MODEL_REGISTRY = [
    // --- provider: groq ---
    {
        id: "llama-8b",
        displayName: "LLaMA 3.1 8B",
        provider: "groq",
        modelId: "llama-3.1-8b-instant",
        tier: "lite",
        parametersBillions: 8,
        isBaseline: true,
    },
    {
        id: "llama-70b",
        displayName: "LLaMA 3.1 70B",
        provider: "groq",
        modelId: "llama-3.1-70b-versatile",
        tier: "standard",
        parametersBillions: 70,
        isBaseline: false,
    },
    {
        id: "qwen-32b",
        displayName: "Qwen 3 32B",
        provider: "groq",
        modelId: "qwen/qwen3-32b",
        tier: "standard",
        parametersBillions: 32,
        isBaseline: true,
    },
    {
        id: "mixtral-8x7b",
        displayName: "Mixtral 8x7B",
        provider: "groq",
        modelId: "mixtral-8x7b-32768",
        tier: "standard",
        parametersBillions: 46,
        isBaseline: false,
    },
    // --- provider: openai ---
    {
        id: "gpt-4o",
        displayName: "GPT-4o",
        provider: "openai",
        modelId: "gpt-4o",
        tier: "pro",
        isBaseline: false,
    },
    {
        id: "gpt-4o-mini",
        displayName: "GPT-4o Mini",
        provider: "openai",
        modelId: "gpt-4o-mini",
        tier: "lite",
        isBaseline: false,
    },
    // --- provider: anthropic ---
    {
        id: "claude-sonnet",
        displayName: "Claude 3.5 Sonnet",
        provider: "anthropic",
        modelId: "claude-3-5-sonnet-20241022",
        tier: "pro",
        isBaseline: false,
    },
    {
        id: "claude-haiku",
        displayName: "Claude 3.5 Haiku",
        provider: "anthropic",
        modelId: "claude-3-5-haiku-20241022",
        tier: "lite",
        isBaseline: false,
    },
];
80
/**
 * Look up a registry entry by its short registry `id`.
 *
 * @param {string} id
 * @returns {object|undefined} The first entry whose `id` strictly equals the
 *   argument, or `undefined` when none matches.
 */
export function getModelById(id) {
    for (const entry of MODEL_REGISTRY) {
        if (entry.id === id) {
            return entry;
        }
    }
    return undefined;
}
86
/**
 * Look up a registry entry by its provider-side `modelId` (API identifier).
 *
 * @param {string} modelId
 * @returns {object|undefined} The first matching entry, or `undefined`.
 */
export function getModelByModelId(modelId) {
    const position = MODEL_REGISTRY.findIndex((entry) => entry.modelId === modelId);
    return position === -1 ? undefined : MODEL_REGISTRY[position];
}
92
/**
 * Collect every registry entry flagged with a truthy `isBaseline`.
 *
 * @returns {object[]} A new array, in registry order.
 */
export function getBaselineModels() {
    const baselines = [];
    for (const entry of MODEL_REGISTRY) {
        if (entry.isBaseline) {
            baselines.push(entry);
        }
    }
    return baselines;
}
98
/**
 * Collect the registry entries served by one provider.
 *
 * @param {string} provider - Compared with strict equality against `provider`.
 * @returns {object[]} A new array, in registry order (empty when none match).
 */
export function getModelsByProvider(provider) {
    const isFromProvider = (entry) => entry.provider === provider;
    return MODEL_REGISTRY.filter(isFromProvider);
}
104
/**
 * Collect the registry entries that belong to one tier.
 *
 * @param {string} tier - Compared with strict equality against `tier`.
 * @returns {object[]} A new array, in registry order (empty when none match).
 */
export function getModelsByTier(tier) {
    const inTier = [];
    for (const entry of MODEL_REGISTRY) {
        if (entry.tier === tier) {
            inTier.push(entry);
        }
    }
    return inTier;
}
110
/**
 * Check whether an identifier is known to the registry.
 *
 * @param {string} id - Either a registry `id` or a provider-side `modelId`.
 * @returns {boolean} `true` when some entry matches on either field.
 */
export function validateModelId(id) {
    for (const entry of MODEL_REGISTRY) {
        if (entry.id === id || entry.modelId === id) {
            return true;
        }
    }
    return false;
}
116
/**
 * Resolve a display name for a registry `id` or a provider-side `modelId`.
 *
 * The registry `id` lookup is tried first, then the `modelId` lookup.
 *
 * @param {string} idOrModelId
 * @returns {string} The entry's `displayName`, or the argument itself when no
 *   entry matches (or the entry has no display name).
 */
export function getModelDisplayName(idOrModelId) {
    const byRegistryId = getModelById(idOrModelId);
    const match = byRegistryId ?? getModelByModelId(idOrModelId);
    if (match == null) {
        return idOrModelId;
    }
    return match.displayName ?? idOrModelId;
}