@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/package.json +2 -2
  2. package/research-output/training-runs/training-run-1773726941205.json +38 -0
  3. package/scripts/rank_trajectories.ts +0 -1
  4. package/scripts/run_task_benchmark.ts +4 -11
  5. package/src/adapter.ts +96 -49
  6. package/src/archetypes/ArchetypeConfigService.ts +188 -185
  7. package/src/archetypes/derive-archetype.ts +47 -47
  8. package/src/archetypes/index.ts +2 -2
  9. package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
  10. package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
  11. package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
  12. package/src/benchmark/BenchmarkDataViewer.ts +32 -30
  13. package/src/benchmark/BenchmarkHistoryService.ts +13 -12
  14. package/src/benchmark/BenchmarkRunner.ts +87 -83
  15. package/src/benchmark/BenchmarkValidator.ts +48 -46
  16. package/src/benchmark/FastEvalRunner.ts +17 -16
  17. package/src/benchmark/MetricsValidator.ts +20 -21
  18. package/src/benchmark/MetricsVisualizer.ts +92 -85
  19. package/src/benchmark/ModelBenchmarkService.ts +90 -82
  20. package/src/benchmark/ModelRegistry.ts +44 -44
  21. package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
  22. package/src/benchmark/SimulationA2AInterface.ts +118 -118
  23. package/src/benchmark/SimulationEngine.ts +51 -51
  24. package/src/benchmark/TaskRunner.ts +87 -79
  25. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
  26. package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
  27. package/src/benchmark/index.ts +27 -27
  28. package/src/benchmark/parseSimulationMetrics.ts +32 -32
  29. package/src/benchmark/simulation-types.ts +10 -10
  30. package/src/dependencies.ts +34 -34
  31. package/src/generation/TrajectoryGenerator.ts +39 -37
  32. package/src/generation/index.ts +1 -1
  33. package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
  34. package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
  35. package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
  36. package/src/huggingface/index.ts +6 -6
  37. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
  38. package/src/index.ts +27 -27
  39. package/src/init-training.ts +6 -6
  40. package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
  41. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
  42. package/src/metrics/index.ts +2 -2
  43. package/src/rubrics/__tests__/index.test.ts +73 -73
  44. package/src/rubrics/ass-kisser.ts +6 -6
  45. package/src/rubrics/degen.ts +6 -6
  46. package/src/rubrics/goody-twoshoes.ts +6 -6
  47. package/src/rubrics/index.ts +50 -50
  48. package/src/rubrics/information-trader.ts +6 -6
  49. package/src/rubrics/infosec.ts +6 -6
  50. package/src/rubrics/liar.ts +6 -6
  51. package/src/rubrics/perps-trader.ts +6 -6
  52. package/src/rubrics/researcher.ts +6 -6
  53. package/src/rubrics/scammer.ts +6 -6
  54. package/src/rubrics/social-butterfly.ts +7 -7
  55. package/src/rubrics/super-predictor.ts +6 -6
  56. package/src/rubrics/trader.ts +5 -5
  57. package/src/scoring/ArchetypeScoringService.ts +56 -54
  58. package/src/scoring/JudgePromptBuilder.ts +96 -96
  59. package/src/scoring/LLMJudgeCache.ts +26 -23
  60. package/src/scoring/index.ts +3 -3
  61. package/src/training/AutomationPipeline.ts +149 -140
  62. package/src/training/BenchmarkService.ts +49 -45
  63. package/src/training/ConfigValidator.ts +38 -32
  64. package/src/training/MarketOutcomesTracker.ts +22 -12
  65. package/src/training/ModelDeployer.ts +15 -15
  66. package/src/training/ModelFetcher.ts +7 -7
  67. package/src/training/ModelSelectionService.ts +32 -32
  68. package/src/training/ModelUsageVerifier.ts +31 -24
  69. package/src/training/MultiModelOrchestrator.ts +44 -44
  70. package/src/training/RLModelConfig.ts +57 -57
  71. package/src/training/RewardBackpropagationService.ts +18 -17
  72. package/src/training/RulerScoringService.ts +73 -72
  73. package/src/training/TrainingMonitor.ts +29 -29
  74. package/src/training/TrajectoryRecorder.ts +25 -27
  75. package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
  76. package/src/training/index.ts +36 -36
  77. package/src/training/logRLConfig.ts +7 -7
  78. package/src/training/pipeline.ts +13 -16
  79. package/src/training/storage/ModelStorageService.ts +32 -32
  80. package/src/training/storage/TrainingDataArchiver.ts +21 -21
  81. package/src/training/storage/index.ts +2 -2
  82. package/src/training/types.ts +6 -6
  83. package/src/training/window-utils.ts +14 -14
  84. package/src/utils/index.ts +7 -7
  85. package/src/utils/logger.ts +5 -5
  86. package/src/utils/snowflake.ts +1 -1
  87. package/src/utils/synthetic-detector.ts +7 -7
@@ -14,19 +14,19 @@
14
14
  * @see BenchmarkService - For training pipeline evaluation
15
15
  */
16
16
 
17
- import { getTrainingDataAdapter } from '../adapter';
18
- import { ethers } from 'ethers';
19
- import { promises as fs } from 'fs';
20
- import * as path from 'path';
21
- import { getAgentRuntimeManager } from '../dependencies';
22
- import { logger } from '../utils/logger';
23
- import { generateSnowflakeId } from '../utils/snowflake';
24
- import { BenchmarkRunner } from './BenchmarkRunner';
17
+ import { promises as fs } from "node:fs";
18
+ import * as path from "node:path";
19
+ import { ethers } from "ethers";
20
+ import { getTrainingDataAdapter } from "../adapter";
21
+ import { getAgentRuntimeManager } from "../dependencies";
22
+ import { logger } from "../utils/logger";
23
+ import { generateSnowflakeId } from "../utils/snowflake";
24
+ import { BenchmarkRunner } from "./BenchmarkRunner";
25
25
  import {
26
26
  type JsonValue,
27
27
  parseSimulationMetrics,
28
- } from './parseSimulationMetrics';
29
- import type { SimulationMetrics, SimulationResult } from './SimulationEngine';
28
+ } from "./parseSimulationMetrics";
29
+ import type { SimulationMetrics, SimulationResult } from "./SimulationEngine";
30
30
 
31
31
  export interface ModelBenchmarkOptions {
32
32
  modelId: string;
@@ -66,7 +66,7 @@ export interface ModelComparisonResult {
66
66
  optimalityDelta: number;
67
67
  isImprovement: boolean;
68
68
  };
69
- recommendation: 'deploy' | 'keep_training' | 'baseline_better';
69
+ recommendation: "deploy" | "keep_training" | "baseline_better";
70
70
  }
71
71
 
72
72
  export interface AverageMetrics {
@@ -82,9 +82,9 @@ export class ModelBenchmarkService {
82
82
  * Benchmark a trained model against standard benchmarks
83
83
  */
84
84
  static async benchmarkModel(
85
- options: ModelBenchmarkOptions
85
+ options: ModelBenchmarkOptions,
86
86
  ): Promise<ModelBenchmarkResult[]> {
87
- logger.info('Starting model benchmark', { modelId: options.modelId });
87
+ logger.info("Starting model benchmark", { modelId: options.modelId });
88
88
 
89
89
  // Load model from database
90
90
  const adapter = getTrainingDataAdapter();
@@ -95,9 +95,11 @@ export class ModelBenchmarkService {
95
95
  }
96
96
 
97
97
  // Check if model already benchmarked
98
- const existingBenchmarks = await this.getModelBenchmarks(options.modelId);
98
+ const existingBenchmarks = await ModelBenchmarkService.getModelBenchmarks(
99
+ options.modelId,
100
+ );
99
101
  if (existingBenchmarks.length > 0 && !options.saveResults) {
100
- logger.info('Model already benchmarked', {
102
+ logger.info("Model already benchmarked", {
101
103
  modelId: options.modelId,
102
104
  count: existingBenchmarks.length,
103
105
  });
@@ -105,13 +107,13 @@ export class ModelBenchmarkService {
105
107
  }
106
108
 
107
109
  // Create test agent for benchmarking
108
- const testAgentId = await this.getOrCreateTestAgent();
110
+ const testAgentId = await ModelBenchmarkService.getOrCreateTestAgent();
109
111
 
110
112
  const results: ModelBenchmarkResult[] = [];
111
113
 
112
114
  // Run each benchmark
113
115
  for (const benchmarkPath of options.benchmarkPaths) {
114
- logger.info('Running benchmark', {
116
+ logger.info("Running benchmark", {
115
117
  benchmark: benchmarkPath,
116
118
  modelId: options.modelId,
117
119
  });
@@ -131,9 +133,9 @@ export class ModelBenchmarkService {
131
133
  options.outputDir ||
132
134
  path.join(
133
135
  process.cwd(),
134
- 'benchmarks',
135
- 'model-results',
136
- model.version
136
+ "benchmarks",
137
+ "model-results",
138
+ model.version,
137
139
  ),
138
140
  forceModel: model.storagePath, // Use the RL model
139
141
  });
@@ -149,7 +151,8 @@ export class ModelBenchmarkService {
149
151
  };
150
152
 
151
153
  // Compare to baseline if available
152
- const baseline = await this.getBaselineBenchmark(benchmarkPath);
154
+ const baseline =
155
+ await ModelBenchmarkService.getBaselineBenchmark(benchmarkPath);
153
156
  if (baseline) {
154
157
  benchmarkResult.comparisonToBaseline = {
155
158
  pnlDelta: simulationResult.metrics.totalPnl - baseline.totalPnl,
@@ -165,7 +168,7 @@ export class ModelBenchmarkService {
165
168
 
166
169
  results.push(benchmarkResult);
167
170
 
168
- logger.info('Benchmark completed', {
171
+ logger.info("Benchmark completed", {
169
172
  benchmark: benchmarkPath,
170
173
  pnl: simulationResult.metrics.totalPnl,
171
174
  accuracy: simulationResult.metrics.predictionMetrics.accuracy,
@@ -173,11 +176,13 @@ export class ModelBenchmarkService {
173
176
 
174
177
  // Save result if requested (to both database and files)
175
178
  if (options.saveResults) {
176
- await this.saveBenchmarkResultToDatabase(benchmarkResult);
177
- await this.saveBenchmarkResult(benchmarkResult);
179
+ await ModelBenchmarkService.saveBenchmarkResultToDatabase(
180
+ benchmarkResult,
181
+ );
182
+ await ModelBenchmarkService.saveBenchmarkResult(benchmarkResult);
178
183
  }
179
184
  } catch (error) {
180
- logger.error('Benchmark failed', { benchmark: benchmarkPath, error });
185
+ logger.error("Benchmark failed", { benchmark: benchmarkPath, error });
181
186
  }
182
187
  }
183
188
 
@@ -198,7 +203,7 @@ export class ModelBenchmarkService {
198
203
  );
199
204
  }
200
205
 
201
- logger.info('Model benchmark complete', {
206
+ logger.info("Model benchmark complete", {
202
207
  modelId: options.modelId,
203
208
  benchmarksRun: results.length,
204
209
  });
@@ -210,22 +215,24 @@ export class ModelBenchmarkService {
210
215
  * Compare new model against baseline
211
216
  */
212
217
  static async compareToBaseline(
213
- modelId: string
218
+ modelId: string,
214
219
  ): Promise<ModelComparisonResult> {
215
220
  // Get new model benchmarks
216
- const newModelBenchmarks = await this.getModelBenchmarks(modelId);
221
+ const newModelBenchmarks =
222
+ await ModelBenchmarkService.getModelBenchmarks(modelId);
217
223
 
218
224
  if (newModelBenchmarks.length === 0) {
219
225
  throw new Error(`No benchmarks found for model: ${modelId}`);
220
226
  }
221
227
 
222
228
  // Calculate new model average metrics
223
- const newModelMetrics = this.calculateAverageMetrics(
224
- newModelBenchmarks.map((b) => b.metrics)
229
+ const newModelMetrics = ModelBenchmarkService.calculateAverageMetrics(
230
+ newModelBenchmarks.map((b) => b.metrics),
225
231
  );
226
232
 
227
233
  // Get baseline benchmarks (use best baseline model)
228
- const baselineMetrics = await this.getBaselineAverageMetrics();
234
+ const baselineMetrics =
235
+ await ModelBenchmarkService.getBaselineAverageMetrics();
229
236
 
230
237
  // Calculate improvement
231
238
  const pnlDelta = newModelMetrics.totalPnl - baselineMetrics.totalPnl;
@@ -241,23 +248,23 @@ export class ModelBenchmarkService {
241
248
 
242
249
  const isImprovement = improvementScore > 0.5;
243
250
 
244
- let recommendation: 'deploy' | 'keep_training' | 'baseline_better';
251
+ let recommendation: "deploy" | "keep_training" | "baseline_better";
245
252
  if (isImprovement && pnlDelta > 0) {
246
- recommendation = 'deploy';
253
+ recommendation = "deploy";
247
254
  } else if (pnlDelta < -100) {
248
- recommendation = 'baseline_better';
255
+ recommendation = "baseline_better";
249
256
  } else {
250
- recommendation = 'keep_training';
257
+ recommendation = "keep_training";
251
258
  }
252
259
 
253
260
  return {
254
261
  newModel: {
255
262
  modelId,
256
- version: newModelBenchmarks[0]!.modelVersion,
263
+ version: newModelBenchmarks[0]?.modelVersion,
257
264
  avgMetrics: newModelMetrics,
258
265
  },
259
266
  baseline: {
260
- modelId: 'baseline',
267
+ modelId: "baseline",
261
268
  avgMetrics: baselineMetrics,
262
269
  },
263
270
  improvement: {
@@ -281,15 +288,15 @@ export class ModelBenchmarkService {
281
288
  * Get model benchmark results
282
289
  */
283
290
  private static async getModelBenchmarks(
284
- modelId: string
291
+ modelId: string,
285
292
  ): Promise<ModelBenchmarkResult[]> {
286
293
  // For now, read from files
287
294
  // In production, you'd store these in a database table
288
295
 
289
296
  const benchmarksDir = path.join(
290
297
  process.cwd(),
291
- 'benchmarks',
292
- 'model-results'
298
+ "benchmarks",
299
+ "model-results",
293
300
  );
294
301
  const results: ModelBenchmarkResult[] = [];
295
302
 
@@ -302,9 +309,9 @@ export class ModelBenchmarkService {
302
309
  const files = await fs.readdir(modelDir).catch(() => []);
303
310
 
304
311
  for (const file of files) {
305
- if (file.endsWith('.json')) {
312
+ if (file.endsWith(".json")) {
306
313
  const filePath = path.join(modelDir, file);
307
- const data = JSON.parse(await fs.readFile(filePath, 'utf-8'));
314
+ const data = JSON.parse(await fs.readFile(filePath, "utf-8"));
308
315
 
309
316
  if (data.modelId === modelId) {
310
317
  results.push(data);
@@ -312,7 +319,7 @@ export class ModelBenchmarkService {
312
319
  }
313
320
  }
314
321
  } catch (error) {
315
- logger.warn('Could not load benchmark results', { error });
322
+ logger.warn("Could not load benchmark results", { error });
316
323
  }
317
324
 
318
325
  return results;
@@ -322,7 +329,7 @@ export class ModelBenchmarkService {
322
329
  * Save benchmark result to database
323
330
  */
324
331
  private static async saveBenchmarkResultToDatabase(
325
- result: ModelBenchmarkResult
332
+ result: ModelBenchmarkResult,
326
333
  ): Promise<void> {
327
334
  await getTrainingDataAdapter().insertBenchmarkResult({
328
335
  id: await generateSnowflakeId(),
@@ -341,7 +348,7 @@ export class ModelBenchmarkService {
341
348
  duration: result.metrics.timing.totalDuration,
342
349
  });
343
350
 
344
- logger.info('Benchmark result saved to database', {
351
+ logger.info("Benchmark result saved to database", {
345
352
  modelId: result.modelId,
346
353
  benchmarkId: result.benchmarkId,
347
354
  });
@@ -351,13 +358,13 @@ export class ModelBenchmarkService {
351
358
  * Save benchmark result to file
352
359
  */
353
360
  private static async saveBenchmarkResult(
354
- result: ModelBenchmarkResult
361
+ result: ModelBenchmarkResult,
355
362
  ): Promise<void> {
356
363
  const outputDir = path.join(
357
364
  process.cwd(),
358
- 'benchmarks',
359
- 'model-results',
360
- result.modelVersion
365
+ "benchmarks",
366
+ "model-results",
367
+ result.modelVersion,
361
368
  );
362
369
  await fs.mkdir(outputDir, { recursive: true });
363
370
 
@@ -366,20 +373,21 @@ export class ModelBenchmarkService {
366
373
 
367
374
  await fs.writeFile(filePath, JSON.stringify(result, null, 2));
368
375
 
369
- logger.info('Benchmark result saved to file', { filePath });
376
+ logger.info("Benchmark result saved to file", { filePath });
370
377
  }
371
378
 
372
379
  /**
373
380
  * Get benchmark results from database
374
381
  */
375
382
  static async getBenchmarkResultsFromDatabase(
376
- modelId: string
383
+ modelId: string,
377
384
  ): Promise<ModelBenchmarkResult[]> {
378
- const results = await getTrainingDataAdapter().getBenchmarkResultsByModel(modelId);
385
+ const results =
386
+ await getTrainingDataAdapter().getBenchmarkResultsByModel(modelId);
379
387
 
380
388
  return results.map((r) => ({
381
389
  modelId: r.modelId,
382
- modelVersion: '', // Not stored in results table
390
+ modelVersion: "", // Not stored in results table
383
391
  benchmarkId: r.benchmarkId,
384
392
  benchmarkPath: r.benchmarkPath,
385
393
  runAt: r.runAt,
@@ -400,17 +408,17 @@ export class ModelBenchmarkService {
400
408
  * Get baseline benchmark for comparison
401
409
  */
402
410
  private static async getBaselineBenchmark(
403
- benchmarkPath: string
411
+ benchmarkPath: string,
404
412
  ): Promise<SimulationMetrics | null> {
405
413
  try {
406
414
  // Look for baseline result for this benchmark
407
- const baselinesDir = path.join(process.cwd(), 'benchmarks', 'baselines');
415
+ const baselinesDir = path.join(process.cwd(), "benchmarks", "baselines");
408
416
  const files = await fs.readdir(baselinesDir).catch(() => []);
409
417
 
410
418
  for (const file of files) {
411
- if (file.endsWith('.json')) {
419
+ if (file.endsWith(".json")) {
412
420
  const filePath = path.join(baselinesDir, file);
413
- const data = JSON.parse(await fs.readFile(filePath, 'utf-8'));
421
+ const data = JSON.parse(await fs.readFile(filePath, "utf-8"));
414
422
 
415
423
  if (
416
424
  data.benchmark?.path === benchmarkPath ||
@@ -421,7 +429,7 @@ export class ModelBenchmarkService {
421
429
  }
422
430
  }
423
431
  } catch (error) {
424
- logger.warn('Could not load baseline benchmark', { error });
432
+ logger.warn("Could not load baseline benchmark", { error });
425
433
  }
426
434
 
427
435
  return null;
@@ -431,7 +439,7 @@ export class ModelBenchmarkService {
431
439
  * Calculate average metrics across multiple benchmark results
432
440
  */
433
441
  private static calculateAverageMetrics(
434
- metricsArray: SimulationMetrics[]
442
+ metricsArray: SimulationMetrics[],
435
443
  ): AverageMetrics {
436
444
  if (metricsArray.length === 0) {
437
445
  return {
@@ -450,7 +458,7 @@ export class ModelBenchmarkService {
450
458
  winRate: acc.winRate + metrics.perpMetrics.winRate,
451
459
  optimality: acc.optimality + metrics.optimalityScore,
452
460
  }),
453
- { pnl: 0, accuracy: 0, winRate: 0, optimality: 0 }
461
+ { pnl: 0, accuracy: 0, winRate: 0, optimality: 0 },
454
462
  );
455
463
 
456
464
  const count = metricsArray.length;
@@ -468,16 +476,16 @@ export class ModelBenchmarkService {
468
476
  * Get baseline average metrics
469
477
  */
470
478
  private static async getBaselineAverageMetrics(): Promise<AverageMetrics> {
471
- const baselinesDir = path.join(process.cwd(), 'benchmarks', 'baselines');
479
+ const baselinesDir = path.join(process.cwd(), "benchmarks", "baselines");
472
480
  const metricsArray: SimulationMetrics[] = [];
473
481
 
474
482
  try {
475
483
  const files = await fs.readdir(baselinesDir).catch(() => []);
476
484
 
477
485
  for (const file of files) {
478
- if (file.endsWith('.json')) {
486
+ if (file.endsWith(".json")) {
479
487
  const filePath = path.join(baselinesDir, file);
480
- const data = JSON.parse(await fs.readFile(filePath, 'utf-8'));
488
+ const data = JSON.parse(await fs.readFile(filePath, "utf-8"));
481
489
 
482
490
  if (data.metrics) {
483
491
  metricsArray.push(data.metrics);
@@ -485,17 +493,17 @@ export class ModelBenchmarkService {
485
493
  }
486
494
  }
487
495
  } catch (error) {
488
- logger.warn('Could not load baseline metrics', { error });
496
+ logger.warn("Could not load baseline metrics", { error });
489
497
  }
490
498
 
491
- return this.calculateAverageMetrics(metricsArray);
499
+ return ModelBenchmarkService.calculateAverageMetrics(metricsArray);
492
500
  }
493
501
 
494
502
  /**
495
503
  * Get or create test agent for benchmarking
496
504
  */
497
505
  private static async getOrCreateTestAgent(): Promise<string> {
498
- const testAgentUsername = 'model-benchmark-agent';
506
+ const testAgentUsername = "model-benchmark-agent";
499
507
  const adapter = getTrainingDataAdapter();
500
508
 
501
509
  const existing = await adapter.getUserByUsername(testAgentUsername);
@@ -510,10 +518,10 @@ export class ModelBenchmarkService {
510
518
  id: agentId,
511
519
  privyId: `did:privy:model-benchmark-${agentId}`,
512
520
  username: testAgentUsername,
513
- displayName: 'Model Benchmark Agent',
521
+ displayName: "Model Benchmark Agent",
514
522
  walletAddress: ethers.Wallet.createRandom().address,
515
523
  isAgent: true,
516
- virtualBalance: '10000',
524
+ virtualBalance: "10000",
517
525
  reputationPoints: 1000,
518
526
  isTest: true,
519
527
  updatedAt: new Date(),
@@ -528,17 +536,17 @@ export class ModelBenchmarkService {
528
536
  autonomousPosting: false,
529
537
  autonomousCommenting: false,
530
538
  systemPrompt:
531
- 'You are a test agent for benchmarking model performance.',
532
- modelTier: 'pro',
539
+ "You are a test agent for benchmarking model performance.",
540
+ modelTier: "pro",
533
541
  updatedAt: new Date(),
534
542
  });
535
543
  }
536
544
 
537
545
  if (!agent) {
538
- throw new Error('Failed to create model benchmark test agent');
546
+ throw new Error("Failed to create model benchmark test agent");
539
547
  }
540
548
 
541
- logger.info('Created model benchmark test agent', { agentId: agent.id });
549
+ logger.info("Created model benchmark test agent", { agentId: agent.id });
542
550
 
543
551
  return agent.id;
544
552
  }
@@ -547,12 +555,12 @@ export class ModelBenchmarkService {
547
555
  * Get standard benchmark paths for model evaluation
548
556
  */
549
557
  static async getStandardBenchmarkPaths(): Promise<string[]> {
550
- const benchmarksDir = path.join(process.cwd(), 'benchmarks');
558
+ const benchmarksDir = path.join(process.cwd(), "benchmarks");
551
559
  const standardBenchmarks: string[] = [];
552
560
 
553
561
  try {
554
562
  // First, look in benchmarks/standard/ directory
555
- const standardDir = path.join(benchmarksDir, 'standard');
563
+ const standardDir = path.join(benchmarksDir, "standard");
556
564
  if (
557
565
  await fs
558
566
  .access(standardDir)
@@ -561,7 +569,7 @@ export class ModelBenchmarkService {
561
569
  ) {
562
570
  const standardFiles = await fs.readdir(standardDir);
563
571
  for (const file of standardFiles) {
564
- if (file.startsWith('standard-') && file.endsWith('.json')) {
572
+ if (file.startsWith("standard-") && file.endsWith(".json")) {
565
573
  standardBenchmarks.push(path.join(standardDir, file));
566
574
  }
567
575
  }
@@ -570,7 +578,7 @@ export class ModelBenchmarkService {
570
578
  // If standard benchmarks found, use those
571
579
  if (standardBenchmarks.length > 0) {
572
580
  logger.info(
573
- `Using ${standardBenchmarks.length} standard benchmarks from benchmarks/standard/`
581
+ `Using ${standardBenchmarks.length} standard benchmarks from benchmarks/standard/`,
574
582
  );
575
583
  return standardBenchmarks;
576
584
  }
@@ -578,7 +586,7 @@ export class ModelBenchmarkService {
578
586
  // Fallback: Look for week-long benchmarks in main directory
579
587
  const files = await fs.readdir(benchmarksDir);
580
588
  for (const file of files) {
581
- if (file.startsWith('benchmark-week-') && file.endsWith('.json')) {
589
+ if (file.startsWith("benchmark-week-") && file.endsWith(".json")) {
582
590
  standardBenchmarks.push(path.join(benchmarksDir, file));
583
591
  }
584
592
  }
@@ -587,9 +595,9 @@ export class ModelBenchmarkService {
587
595
  if (standardBenchmarks.length === 0) {
588
596
  for (const file of files) {
589
597
  if (
590
- file.startsWith('benchmark-') &&
591
- file.endsWith('.json') &&
592
- !file.includes('comparison')
598
+ file.startsWith("benchmark-") &&
599
+ file.endsWith(".json") &&
600
+ !file.includes("comparison")
593
601
  ) {
594
602
  const filePath = path.join(benchmarksDir, file);
595
603
  standardBenchmarks.push(filePath);
@@ -597,12 +605,12 @@ export class ModelBenchmarkService {
597
605
  }
598
606
  }
599
607
  } catch (error) {
600
- logger.error('Could not load standard benchmarks', { error });
608
+ logger.error("Could not load standard benchmarks", { error });
601
609
  }
602
610
 
603
611
  if (standardBenchmarks.length === 0) {
604
612
  logger.warn(
605
- 'No standard benchmarks found. Generate benchmark fixtures before upload.'
613
+ "No standard benchmarks found. Generate benchmark fixtures before upload.",
606
614
  );
607
615
  }
608
616
 
@@ -13,13 +13,13 @@ export interface ModelConfig {
13
13
  displayName: string;
14
14
 
15
15
  /** Provider (groq, openai, anthropic, etc.) */
16
- provider: 'groq' | 'openai' | 'anthropic' | 'together' | 'local';
16
+ provider: "groq" | "openai" | "anthropic" | "together" | "local";
17
17
 
18
18
  /** Model identifier for the provider's API */
19
19
  modelId: string;
20
20
 
21
21
  /** Model tier (lite, standard, pro) */
22
- tier: 'lite' | 'standard' | 'pro';
22
+ tier: "lite" | "standard" | "pro";
23
23
 
24
24
  /** Approximate parameters in billions */
25
25
  parametersBillions?: number;
@@ -36,71 +36,71 @@ export interface ModelConfig {
36
36
  */
37
37
  export const MODEL_REGISTRY: ModelConfig[] = [
38
38
  {
39
- id: 'llama-8b',
40
- displayName: 'LLaMA 3.1 8B',
41
- provider: 'groq',
42
- modelId: 'llama-3.1-8b-instant',
43
- tier: 'lite',
39
+ id: "llama-8b",
40
+ displayName: "LLaMA 3.1 8B",
41
+ provider: "groq",
42
+ modelId: "llama-3.1-8b-instant",
43
+ tier: "lite",
44
44
  parametersBillions: 8,
45
45
  isBaseline: true,
46
46
  },
47
47
  {
48
- id: 'llama-70b',
49
- displayName: 'LLaMA 3.1 70B',
50
- provider: 'groq',
51
- modelId: 'llama-3.1-70b-versatile',
52
- tier: 'standard',
48
+ id: "llama-70b",
49
+ displayName: "LLaMA 3.1 70B",
50
+ provider: "groq",
51
+ modelId: "llama-3.1-70b-versatile",
52
+ tier: "standard",
53
53
  parametersBillions: 70,
54
54
  isBaseline: false,
55
55
  },
56
56
  {
57
- id: 'qwen-32b',
58
- displayName: 'Qwen 3 32B',
59
- provider: 'groq',
60
- modelId: 'qwen/qwen3-32b',
61
- tier: 'standard',
57
+ id: "qwen-32b",
58
+ displayName: "Qwen 3 32B",
59
+ provider: "groq",
60
+ modelId: "qwen/qwen3-32b",
61
+ tier: "standard",
62
62
  parametersBillions: 32,
63
63
  isBaseline: true,
64
64
  },
65
65
  {
66
- id: 'mixtral-8x7b',
67
- displayName: 'Mixtral 8x7B',
68
- provider: 'groq',
69
- modelId: 'mixtral-8x7b-32768',
70
- tier: 'standard',
66
+ id: "mixtral-8x7b",
67
+ displayName: "Mixtral 8x7B",
68
+ provider: "groq",
69
+ modelId: "mixtral-8x7b-32768",
70
+ tier: "standard",
71
71
  parametersBillions: 46,
72
72
  isBaseline: false,
73
73
  },
74
74
  {
75
- id: 'gpt-4o',
76
- displayName: 'GPT-4o',
77
- provider: 'openai',
78
- modelId: 'gpt-4o',
79
- tier: 'pro',
75
+ id: "gpt-4o",
76
+ displayName: "GPT-4o",
77
+ provider: "openai",
78
+ modelId: "gpt-4o",
79
+ tier: "pro",
80
80
  isBaseline: false,
81
81
  },
82
82
  {
83
- id: 'gpt-4o-mini',
84
- displayName: 'GPT-4o Mini',
85
- provider: 'openai',
86
- modelId: 'gpt-4o-mini',
87
- tier: 'lite',
83
+ id: "gpt-4o-mini",
84
+ displayName: "GPT-4o Mini",
85
+ provider: "openai",
86
+ modelId: "gpt-4o-mini",
87
+ tier: "lite",
88
88
  isBaseline: false,
89
89
  },
90
90
  {
91
- id: 'claude-sonnet',
92
- displayName: 'Claude 3.5 Sonnet',
93
- provider: 'anthropic',
94
- modelId: 'claude-3-5-sonnet-20241022',
95
- tier: 'pro',
91
+ id: "claude-sonnet",
92
+ displayName: "Claude 3.5 Sonnet",
93
+ provider: "anthropic",
94
+ modelId: "claude-3-5-sonnet-20241022",
95
+ tier: "pro",
96
96
  isBaseline: false,
97
97
  },
98
98
  {
99
- id: 'claude-haiku',
100
- displayName: 'Claude 3.5 Haiku',
101
- provider: 'anthropic',
102
- modelId: 'claude-3-5-haiku-20241022',
103
- tier: 'lite',
99
+ id: "claude-haiku",
100
+ displayName: "Claude 3.5 Haiku",
101
+ provider: "anthropic",
102
+ modelId: "claude-3-5-haiku-20241022",
103
+ tier: "lite",
104
104
  isBaseline: false,
105
105
  },
106
106
  ];
@@ -130,7 +130,7 @@ export function getBaselineModels(): ModelConfig[] {
130
130
  * Get models by provider
131
131
  */
132
132
  export function getModelsByProvider(
133
- provider: ModelConfig['provider']
133
+ provider: ModelConfig["provider"],
134
134
  ): ModelConfig[] {
135
135
  return MODEL_REGISTRY.filter((m) => m.provider === provider);
136
136
  }
@@ -138,7 +138,7 @@ export function getModelsByProvider(
138
138
  /**
139
139
  * Get models by tier
140
140
  */
141
- export function getModelsByTier(tier: ModelConfig['tier']): ModelConfig[] {
141
+ export function getModelsByTier(tier: ModelConfig["tier"]): ModelConfig[] {
142
142
  return MODEL_REGISTRY.filter((m) => m.tier === tier);
143
143
  }
144
144