@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-lint.log +2 -0
- package/.turbo/turbo-typecheck.log +1 -0
- package/dist/.tsbuildinfo +1 -0
- package/dist/adapter.js +59 -0
- package/dist/archetypes/ArchetypeConfigService.js +510 -0
- package/dist/archetypes/derive-archetype.js +196 -0
- package/dist/archetypes/index.js +7 -0
- package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
- package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
- package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
- package/dist/benchmark/BenchmarkDataViewer.js +197 -0
- package/dist/benchmark/BenchmarkHistoryService.js +135 -0
- package/dist/benchmark/BenchmarkRunner.js +483 -0
- package/dist/benchmark/BenchmarkValidator.js +158 -0
- package/dist/benchmark/FastEvalRunner.js +133 -0
- package/dist/benchmark/MetricsValidator.js +104 -0
- package/dist/benchmark/MetricsVisualizer.js +775 -0
- package/dist/benchmark/ModelBenchmarkService.js +433 -0
- package/dist/benchmark/ModelRegistry.js +122 -0
- package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
- package/dist/benchmark/SimulationA2AInterface.js +683 -0
- package/dist/benchmark/SimulationEngine.js +522 -0
- package/dist/benchmark/TaskRunner.js +60 -0
- package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
- package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
- package/dist/benchmark/index.js +23 -0
- package/dist/benchmark/parseSimulationMetrics.js +86 -0
- package/dist/benchmark/simulation-types.js +1 -0
- package/dist/dependencies.js +197 -0
- package/dist/generation/TrajectoryGenerator.js +244 -0
- package/dist/generation/index.js +6 -0
- package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
- package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
- package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
- package/dist/huggingface/index.js +9 -0
- package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
- package/dist/index.js +41 -0
- package/dist/init-training.js +43 -0
- package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
- package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
- package/dist/metrics/index.js +7 -0
- package/dist/metrics/types.js +21 -0
- package/dist/rubrics/__tests__/index.test.js +150 -0
- package/dist/rubrics/ass-kisser.js +83 -0
- package/dist/rubrics/degen.js +78 -0
- package/dist/rubrics/goody-twoshoes.js +82 -0
- package/dist/rubrics/index.js +184 -0
- package/dist/rubrics/information-trader.js +82 -0
- package/dist/rubrics/infosec.js +99 -0
- package/dist/rubrics/liar.js +102 -0
- package/dist/rubrics/perps-trader.js +85 -0
- package/dist/rubrics/researcher.js +79 -0
- package/dist/rubrics/scammer.js +80 -0
- package/dist/rubrics/social-butterfly.js +71 -0
- package/dist/rubrics/super-predictor.js +95 -0
- package/dist/rubrics/trader.js +65 -0
- package/dist/scoring/ArchetypeScoringService.js +301 -0
- package/dist/scoring/JudgePromptBuilder.js +401 -0
- package/dist/scoring/LLMJudgeCache.js +263 -0
- package/dist/scoring/index.js +8 -0
- package/dist/training/AutomationPipeline.js +714 -0
- package/dist/training/BenchmarkService.js +370 -0
- package/dist/training/ConfigValidator.js +153 -0
- package/dist/training/MarketOutcomesTracker.js +142 -0
- package/dist/training/ModelDeployer.js +128 -0
- package/dist/training/ModelFetcher.js +48 -0
- package/dist/training/ModelSelectionService.js +248 -0
- package/dist/training/ModelUsageVerifier.js +106 -0
- package/dist/training/MultiModelOrchestrator.js +349 -0
- package/dist/training/RLModelConfig.js +295 -0
- package/dist/training/RewardBackpropagationService.js +117 -0
- package/dist/training/RulerScoringService.js +450 -0
- package/dist/training/TrainingMonitor.js +108 -0
- package/dist/training/TrajectoryRecorder.js +281 -0
- package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
- package/dist/training/index.js +30 -0
- package/dist/training/logRLConfig.js +29 -0
- package/dist/training/pipeline.js +80 -0
- package/dist/training/storage/ModelStorageService.js +190 -0
- package/dist/training/storage/TrainingDataArchiver.js +136 -0
- package/dist/training/storage/index.js +7 -0
- package/dist/training/types.js +6 -0
- package/dist/training/window-utils.js +100 -0
- package/dist/utils/index.js +73 -0
- package/dist/utils/logger.js +55 -0
- package/dist/utils/snowflake.js +15 -0
- package/dist/utils/synthetic-detector.js +67 -0
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773742857616.json +38 -0
- package/research-output/training-runs/training-run-1773742946977.json +38 -0
- package/research-output/training-runs/training-run-1773743278891.json +38 -0
- package/research-output/training-runs/training-run-1773743409754.json +38 -0
- package/research-output/training-runs/training-run-1773743651086.json +38 -0
- package/research-output/training-runs/training-run-1773743782883.json +38 -0
|
@@ -0,0 +1,433 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Model Benchmark Service (For HuggingFace Integration)
|
|
3
|
+
*
|
|
4
|
+
* Runs benchmark tests on trained RL models for HuggingFace upload decisions.
|
|
5
|
+
* Compares new models against baselines and previous versions.
|
|
6
|
+
*
|
|
7
|
+
* **Purpose:** Evaluate models for HuggingFace upload
|
|
8
|
+
* **Used by:** HuggingFace integration, weekly CRON, CLI scripts
|
|
9
|
+
* **Storage:** benchmark_results table (dedicated table)
|
|
10
|
+
* **Focus:** Public model release, baseline comparison
|
|
11
|
+
*
|
|
12
|
+
* **Note:** For training pipeline benchmarking, see BenchmarkService
|
|
13
|
+
*
|
|
14
|
+
* @see BenchmarkService - For training pipeline evaluation
|
|
15
|
+
*/
|
|
16
|
+
import { promises as fs } from "node:fs";
|
|
17
|
+
import * as path from "node:path";
|
|
18
|
+
import { ethers } from "ethers";
|
|
19
|
+
import { getTrainingDataAdapter } from "../adapter";
|
|
20
|
+
import { getAgentRuntimeManager } from "../dependencies";
|
|
21
|
+
import { logger } from "../utils/logger";
|
|
22
|
+
import { generateSnowflakeId } from "../utils/snowflake";
|
|
23
|
+
import { BenchmarkRunner } from "./BenchmarkRunner";
|
|
24
|
+
import { parseSimulationMetrics, } from "./parseSimulationMetrics";
|
|
25
|
+
export class ModelBenchmarkService {
    /**
     * Benchmark a trained model against a list of benchmark fixtures.
     *
     * Benchmarks run one after another. A failing benchmark is logged and
     * skipped, so the returned array only contains the runs that completed.
     * When at least one run completed, the model record is updated with the
     * average optimality score and average PnL of this invocation.
     *
     * @param {object} options
     * @param options.modelId - Identifier of the model to load through the training data adapter.
     * @param {Iterable<string>} options.benchmarkPaths - Benchmark fixture paths to run.
     * @param {boolean} [options.saveResults] - When truthy, every result is written to the
     *   database and to a JSON file. When falsy and results already exist on disk, those
     *   stored results are returned and nothing is re-run.
     * @param {string} [options.outputDir] - Output directory handed to BenchmarkRunner. Defaults to
     *   `<cwd>/benchmarks/model-results/<model.version>`.
     * @returns {Promise<object[]>} The benchmark results (stored ones on the early-return path).
     * @throws {Error} If no model exists for `options.modelId`.
     */
    static async benchmarkModel(options) {
        logger.info("Starting model benchmark", { modelId: options.modelId });
        // Load model from database
        const adapter = getTrainingDataAdapter();
        const model = await adapter.getModelById(options.modelId);
        if (!model) {
            throw new Error(`Model not found: ${options.modelId}`);
        }
        // Reuse stored results unless the caller explicitly asked to save fresh ones.
        const existingBenchmarks = await ModelBenchmarkService.getModelBenchmarks(options.modelId);
        if (existingBenchmarks.length > 0 && !options.saveResults) {
            logger.info("Model already benchmarked", {
                modelId: options.modelId,
                count: existingBenchmarks.length,
            });
            return existingBenchmarks;
        }
        // Create test agent for benchmarking
        const testAgentId = await ModelBenchmarkService.getOrCreateTestAgent();
        const results = [];
        // Run each benchmark
        for (const benchmarkPath of options.benchmarkPaths) {
            logger.info("Running benchmark", {
                benchmark: benchmarkPath,
                modelId: options.modelId,
            });
            try {
                // Get agent runtime (will use the RL model if configured)
                const runtime = await getAgentRuntimeManager().getRuntime(testAgentId);
                // Run benchmark
                const simulationResult = await BenchmarkRunner.runSingle({
                    benchmarkPath,
                    agentRuntime: runtime,
                    agentUserId: testAgentId,
                    saveTrajectory: false,
                    outputDir: options.outputDir ||
                        path.join(process.cwd(), "benchmarks", "model-results", model.version),
                    forceModel: model.storagePath, // Use the RL model
                });
                // Create benchmark result
                const benchmarkResult = {
                    modelId: options.modelId,
                    modelVersion: model.version,
                    benchmarkId: simulationResult.benchmarkId,
                    benchmarkPath,
                    runAt: new Date(),
                    metrics: simulationResult.metrics,
                };
                // Compare to baseline if available
                const baseline = await ModelBenchmarkService.getBaselineBenchmark(benchmarkPath);
                if (baseline) {
                    benchmarkResult.comparisonToBaseline = {
                        pnlDelta: simulationResult.metrics.totalPnl - baseline.totalPnl,
                        accuracyDelta: simulationResult.metrics.predictionMetrics.accuracy -
                            baseline.predictionMetrics.accuracy,
                        optimalityDelta: simulationResult.metrics.optimalityScore -
                            baseline.optimalityScore,
                        improved: simulationResult.metrics.totalPnl > baseline.totalPnl,
                    };
                }
                results.push(benchmarkResult);
                logger.info("Benchmark completed", {
                    benchmark: benchmarkPath,
                    pnl: simulationResult.metrics.totalPnl,
                    accuracy: simulationResult.metrics.predictionMetrics.accuracy,
                });
                // Save result if requested (to both database and files)
                if (options.saveResults) {
                    await ModelBenchmarkService.saveBenchmarkResultToDatabase(benchmarkResult);
                    await ModelBenchmarkService.saveBenchmarkResult(benchmarkResult);
                }
            }
            catch (error) {
                // Best effort: one failing benchmark must not stop the remaining ones.
                logger.error("Benchmark failed", { benchmark: benchmarkPath, error });
            }
        }
        // Update model with aggregate benchmark score
        if (results.length > 0) {
            const avgOptimality = results.reduce((sum, r) => sum + r.metrics.optimalityScore, 0) /
                results.length;
            const avgPnl = results.reduce((sum, r) => sum + r.metrics.totalPnl, 0) /
                results.length;
            await adapter.updateModelBenchmark(options.modelId, avgOptimality, avgPnl, (model.benchmarkCount || 0) + results.length);
        }
        logger.info("Model benchmark complete", {
            modelId: options.modelId,
            benchmarksRun: results.length,
        });
        return results;
    }
    /**
     * Compare a model's stored benchmark results against the averaged baseline.
     *
     * The improvement score weights a positive PnL delta at 0.4 and positive
     * accuracy / optimality deltas at 0.3 each; a score above 0.5 counts as an
     * improvement. The recommendation is "deploy" when the model improved and
     * PnL is up, "baseline_better" when PnL dropped by more than 100, and
     * "keep_training" otherwise.
     *
     * @param modelId - Identifier of the model whose stored results are compared.
     * @returns {Promise<object>} `{ newModel, baseline, improvement, recommendation }`.
     * @throws {Error} If the model has no stored benchmark results.
     */
    static async compareToBaseline(modelId) {
        // Get new model benchmarks
        const newModelBenchmarks = await ModelBenchmarkService.getModelBenchmarks(modelId);
        if (newModelBenchmarks.length === 0) {
            throw new Error(`No benchmarks found for model: ${modelId}`);
        }
        // Calculate new model average metrics
        const newModelMetrics = ModelBenchmarkService.calculateAverageMetrics(newModelBenchmarks.map((b) => b.metrics));
        // Average over every baseline file found on disk
        const baselineMetrics = await ModelBenchmarkService.getBaselineAverageMetrics();
        if (baselineMetrics.benchmarkCount === 0) {
            // With no baseline files the averages are all zero, so the deltas below
            // are measured against zero rather than a real baseline. Surface that.
            logger.warn("No baseline benchmarks found; comparing against zeroed baseline metrics", { modelId });
        }
        // Calculate improvement
        const pnlDelta = newModelMetrics.totalPnl - baselineMetrics.totalPnl;
        const accuracyDelta = newModelMetrics.accuracy - baselineMetrics.accuracy;
        const optimalityDelta = newModelMetrics.optimality - baselineMetrics.optimality;
        // Determine if this is an improvement (weighted score)
        const improvementScore = (pnlDelta > 0 ? 1 : 0) * 0.4 +
            (accuracyDelta > 0 ? 1 : 0) * 0.3 +
            (optimalityDelta > 0 ? 1 : 0) * 0.3;
        const isImprovement = improvementScore > 0.5;
        let recommendation;
        if (isImprovement && pnlDelta > 0) {
            recommendation = "deploy";
        }
        else if (pnlDelta < -100) {
            recommendation = "baseline_better";
        }
        else {
            recommendation = "keep_training";
        }
        return {
            newModel: {
                modelId,
                version: newModelBenchmarks[0]?.modelVersion,
                avgMetrics: newModelMetrics,
            },
            baseline: {
                modelId: "baseline",
                avgMetrics: baselineMetrics,
            },
            improvement: {
                pnlDelta,
                accuracyDelta,
                optimalityDelta,
                isImprovement,
            },
            recommendation,
        };
    }
    /**
     * Get all unbenchmarked models.
     *
     * @returns Whatever the training data adapter's `getUnbenchmarkedModels()` returns.
     */
    static async getUnbenchmarkedModels() {
        return getTrainingDataAdapter().getUnbenchmarkedModels();
    }
    /**
     * Read and parse one JSON file, returning `null` instead of throwing.
     *
     * Directory scans use this so that a single corrupt or unreadable file is
     * skipped (with a warning) rather than aborting the whole scan.
     *
     * @param {string} filePath - Path of the JSON file to read.
     * @returns {Promise<*>} The parsed value, or `null` when reading or parsing failed.
     */
    static async readJsonFileOrNull(filePath) {
        try {
            return JSON.parse(await fs.readFile(filePath, "utf-8"));
        }
        catch (error) {
            logger.warn("Skipping unreadable benchmark file", { filePath, error });
            return null;
        }
    }
    /**
     * Load the benchmark results stored on disk for a model.
     *
     * Results are read from `<cwd>/benchmarks/model-results/<model.version>/*.json`
     * and filtered to entries whose `modelId` matches. A missing directory, an
     * unknown model, or an adapter failure all yield an empty (or partial) array.
     *
     * @param modelId - Identifier of the model whose results are loaded.
     * @returns {Promise<object[]>} Parsed result objects for that model.
     */
    static async getModelBenchmarks(modelId) {
        // For now, read from files
        // In production, you'd store these in a database table
        const benchmarksDir = path.join(process.cwd(), "benchmarks", "model-results");
        const results = [];
        try {
            const model = await getTrainingDataAdapter().getModelById(modelId);
            if (!model) {
                return results;
            }
            const modelDir = path.join(benchmarksDir, model.version);
            const files = await fs.readdir(modelDir).catch(() => []);
            for (const file of files) {
                if (!file.endsWith(".json")) {
                    continue;
                }
                const data = await ModelBenchmarkService.readJsonFileOrNull(path.join(modelDir, file));
                if (data !== null && data.modelId === modelId) {
                    results.push(data);
                }
            }
        }
        catch (error) {
            logger.warn("Could not load benchmark results", { error });
        }
        return results;
    }
    /**
     * Save benchmark result to database.
     *
     * @param {object} result - Benchmark result as built by `benchmarkModel`.
     * @returns {Promise<void>}
     */
    static async saveBenchmarkResultToDatabase(result) {
        await getTrainingDataAdapter().insertBenchmarkResult({
            id: await generateSnowflakeId(),
            modelId: result.modelId,
            benchmarkId: result.benchmarkId,
            benchmarkPath: result.benchmarkPath,
            runAt: result.runAt,
            totalPnl: result.metrics.totalPnl,
            predictionAccuracy: result.metrics.predictionMetrics.accuracy,
            perpWinRate: result.metrics.perpMetrics.winRate,
            optimalityScore: result.metrics.optimalityScore,
            // JSON round-trip stores a plain, JSON-safe copy of the metrics object.
            detailedMetrics: JSON.parse(JSON.stringify(result.metrics)),
            baselinePnlDelta: result.comparisonToBaseline?.pnlDelta ?? null,
            baselineAccuracyDelta: result.comparisonToBaseline?.accuracyDelta ?? null,
            improved: result.comparisonToBaseline?.improved ?? null,
            duration: result.metrics.timing.totalDuration,
        });
        logger.info("Benchmark result saved to database", {
            modelId: result.modelId,
            benchmarkId: result.benchmarkId,
        });
    }
    /**
     * Save benchmark result to a JSON file under
     * `<cwd>/benchmarks/model-results/<result.modelVersion>/`.
     *
     * @param {object} result - Benchmark result as built by `benchmarkModel`.
     * @returns {Promise<void>}
     */
    static async saveBenchmarkResult(result) {
        const outputDir = path.join(process.cwd(), "benchmarks", "model-results", result.modelVersion);
        await fs.mkdir(outputDir, { recursive: true });
        // The timestamp keeps repeated runs of the same benchmark from overwriting each other.
        const filename = `benchmark-${result.benchmarkId}-${Date.now()}.json`;
        const filePath = path.join(outputDir, filename);
        await fs.writeFile(filePath, JSON.stringify(result, null, 2));
        logger.info("Benchmark result saved to file", { filePath });
    }
    /**
     * Get benchmark results from database, mapped to the in-memory result shape.
     *
     * `modelVersion` and `optimalityDelta` are not stored in the results table,
     * so they come back as `""` and `0` respectively.
     *
     * @param modelId - Identifier of the model whose rows are fetched.
     * @returns {Promise<object[]>} One result object per stored row.
     */
    static async getBenchmarkResultsFromDatabase(modelId) {
        const results = await getTrainingDataAdapter().getBenchmarkResultsByModel(modelId);
        return results.map((r) => ({
            modelId: r.modelId,
            modelVersion: "", // Not stored in results table
            benchmarkId: r.benchmarkId,
            benchmarkPath: r.benchmarkPath,
            runAt: r.runAt,
            metrics: parseSimulationMetrics(r.detailedMetrics),
            // `!= null` treats both null and undefined as "no baseline comparison stored",
            // so a missing column never produces a comparison with an undefined pnlDelta.
            comparisonToBaseline: r.baselinePnlDelta != null
                ? {
                    pnlDelta: r.baselinePnlDelta,
                    accuracyDelta: r.baselineAccuracyDelta ?? 0,
                    optimalityDelta: 0, // Not stored separately
                    improved: r.improved || false,
                }
                : undefined,
        }));
    }
    /**
     * Get the baseline metrics recorded for one benchmark.
     *
     * Scans `<cwd>/benchmarks/baselines/*.json` and returns the `metrics` of the
     * first file whose `benchmark` (or `benchmark.path`) equals `benchmarkPath`.
     * Unreadable or malformed files are skipped individually.
     *
     * @param {string} benchmarkPath - Benchmark fixture path to look up.
     * @returns {Promise<object|null>} The baseline metrics, or `null` when none match.
     */
    static async getBaselineBenchmark(benchmarkPath) {
        const baselinesDir = path.join(process.cwd(), "benchmarks", "baselines");
        const files = await fs.readdir(baselinesDir).catch(() => []);
        for (const file of files) {
            if (!file.endsWith(".json")) {
                continue;
            }
            const data = await ModelBenchmarkService.readJsonFileOrNull(path.join(baselinesDir, file));
            if (data === null) {
                continue;
            }
            if (data.benchmark?.path === benchmarkPath ||
                data.benchmark === benchmarkPath) {
                return data.metrics;
            }
        }
        return null;
    }
    /**
     * Calculate average metrics across multiple benchmark results.
     *
     * @param {object[]} metricsArray - Metrics objects exposing `totalPnl`,
     *   `predictionMetrics.accuracy`, `perpMetrics.winRate` and `optimalityScore`.
     * @returns {{totalPnl: number, accuracy: number, winRate: number, optimality: number, benchmarkCount: number}}
     *   The per-field means; all zeros with `benchmarkCount: 0` for an empty array.
     */
    static calculateAverageMetrics(metricsArray) {
        if (metricsArray.length === 0) {
            return {
                totalPnl: 0,
                accuracy: 0,
                winRate: 0,
                optimality: 0,
                benchmarkCount: 0,
            };
        }
        const totals = metricsArray.reduce((acc, metrics) => ({
            pnl: acc.pnl + metrics.totalPnl,
            accuracy: acc.accuracy + metrics.predictionMetrics.accuracy,
            winRate: acc.winRate + metrics.perpMetrics.winRate,
            optimality: acc.optimality + metrics.optimalityScore,
        }), { pnl: 0, accuracy: 0, winRate: 0, optimality: 0 });
        const count = metricsArray.length;
        return {
            totalPnl: totals.pnl / count,
            accuracy: totals.accuracy / count,
            winRate: totals.winRate / count,
            optimality: totals.optimality / count,
            benchmarkCount: count,
        };
    }
    /**
     * Get baseline average metrics across every readable file in
     * `<cwd>/benchmarks/baselines/` that carries a `metrics` entry.
     *
     * @returns {Promise<object>} Averages in the shape produced by `calculateAverageMetrics`.
     */
    static async getBaselineAverageMetrics() {
        const baselinesDir = path.join(process.cwd(), "benchmarks", "baselines");
        const metricsArray = [];
        const files = await fs.readdir(baselinesDir).catch(() => []);
        for (const file of files) {
            if (!file.endsWith(".json")) {
                continue;
            }
            const data = await ModelBenchmarkService.readJsonFileOrNull(path.join(baselinesDir, file));
            if (data !== null && data.metrics) {
                metricsArray.push(data.metrics);
            }
        }
        return ModelBenchmarkService.calculateAverageMetrics(metricsArray);
    }
    /**
     * Get or create test agent for benchmarking.
     *
     * Looks up the user named "model-benchmark-agent"; when absent, creates the
     * user and its agent config.
     *
     * @returns {Promise<*>} The test agent's user id.
     * @throws {Error} If the adapter does not return a created user.
     */
    static async getOrCreateTestAgent() {
        const testAgentUsername = "model-benchmark-agent";
        const adapter = getTrainingDataAdapter();
        const existing = await adapter.getUserByUsername(testAgentUsername);
        if (existing) {
            return existing.id;
        }
        // Create new test agent
        const agentId = await generateSnowflakeId();
        const agent = await adapter.createUser({
            id: agentId,
            privyId: `did:privy:model-benchmark-${agentId}`,
            username: testAgentUsername,
            displayName: "Model Benchmark Agent",
            walletAddress: ethers.Wallet.createRandom().address,
            isAgent: true,
            virtualBalance: "10000",
            reputationPoints: 1000,
            isTest: true,
            updatedAt: new Date(),
        });
        if (!agent) {
            throw new Error("Failed to create model benchmark test agent");
        }
        // Create agent config in separate table
        await adapter.createAgentConfig({
            id: await generateSnowflakeId(),
            userId: agentId,
            autonomousTrading: true,
            autonomousPosting: false,
            autonomousCommenting: false,
            systemPrompt: "You are a test agent for benchmarking model performance.",
            modelTier: "pro",
            updatedAt: new Date(),
        });
        logger.info("Created model benchmark test agent", { agentId: agent.id });
        return agent.id;
    }
    /**
     * Get standard benchmark paths for model evaluation.
     *
     * Lookup order, stopping at the first tier that yields files:
     *   1. `<cwd>/benchmarks/standard/standard-*.json`
     *   2. `<cwd>/benchmarks/benchmark-week-*.json`
     *   3. any `<cwd>/benchmarks/benchmark-*.json` whose name does not contain "comparison"
     *
     * Filesystem errors are logged and result in whatever was collected so far.
     *
     * @returns {Promise<string[]>} Benchmark file paths (possibly empty).
     */
    static async getStandardBenchmarkPaths() {
        const benchmarksDir = path.join(process.cwd(), "benchmarks");
        const standardBenchmarks = [];
        try {
            // First, look in benchmarks/standard/ directory
            const standardDir = path.join(benchmarksDir, "standard");
            if (await fs
                .access(standardDir)
                .then(() => true)
                .catch(() => false)) {
                const standardFiles = await fs.readdir(standardDir);
                for (const file of standardFiles) {
                    if (file.startsWith("standard-") && file.endsWith(".json")) {
                        standardBenchmarks.push(path.join(standardDir, file));
                    }
                }
            }
            // If standard benchmarks found, use those
            if (standardBenchmarks.length > 0) {
                logger.info(`Using ${standardBenchmarks.length} standard benchmarks from benchmarks/standard/`);
                return standardBenchmarks;
            }
            // Fallback: Look for week-long benchmarks in main directory
            const files = await fs.readdir(benchmarksDir);
            for (const file of files) {
                if (file.startsWith("benchmark-week-") && file.endsWith(".json")) {
                    standardBenchmarks.push(path.join(benchmarksDir, file));
                }
            }
            // If still nothing, use any benchmark files
            if (standardBenchmarks.length === 0) {
                for (const file of files) {
                    if (file.startsWith("benchmark-") &&
                        file.endsWith(".json") &&
                        !file.includes("comparison")) {
                        standardBenchmarks.push(path.join(benchmarksDir, file));
                    }
                }
            }
        }
        catch (error) {
            logger.error("Could not load standard benchmarks", { error });
        }
        if (standardBenchmarks.length === 0) {
            logger.warn("No standard benchmarks found. Generate benchmark fixtures before upload.");
        }
        return standardBenchmarks;
    }
}
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Model Registry
|
|
3
|
+
*
|
|
4
|
+
* Centralized configuration for all models available for benchmarking.
|
|
5
|
+
* Add new models here to make them available for comparison.
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Every model that can take part in a benchmark comparison.
 *
 * Each entry carries a short registry `id`, a human-readable `displayName`,
 * the `provider` name, the provider-side `modelId`, a `tier`
 * ("lite" | "standard" | "pro"), and an `isBaseline` flag. The Groq entries
 * additionally list `parametersBillions`; the OpenAI and Anthropic entries
 * omit it.
 */
export const MODEL_REGISTRY = [
    // --- Groq ---
    {
        id: "llama-8b",
        displayName: "LLaMA 3.1 8B",
        provider: "groq",
        modelId: "llama-3.1-8b-instant",
        tier: "lite",
        parametersBillions: 8,
        isBaseline: true,
    },
    {
        id: "llama-70b",
        displayName: "LLaMA 3.1 70B",
        provider: "groq",
        modelId: "llama-3.1-70b-versatile",
        tier: "standard",
        parametersBillions: 70,
        isBaseline: false,
    },
    {
        id: "qwen-32b",
        displayName: "Qwen 3 32B",
        provider: "groq",
        modelId: "qwen/qwen3-32b",
        tier: "standard",
        parametersBillions: 32,
        isBaseline: true,
    },
    {
        id: "mixtral-8x7b",
        displayName: "Mixtral 8x7B",
        provider: "groq",
        modelId: "mixtral-8x7b-32768",
        tier: "standard",
        parametersBillions: 46,
        isBaseline: false,
    },
    // --- OpenAI ---
    {
        id: "gpt-4o",
        displayName: "GPT-4o",
        provider: "openai",
        modelId: "gpt-4o",
        tier: "pro",
        isBaseline: false,
    },
    {
        id: "gpt-4o-mini",
        displayName: "GPT-4o Mini",
        provider: "openai",
        modelId: "gpt-4o-mini",
        tier: "lite",
        isBaseline: false,
    },
    // --- Anthropic ---
    {
        id: "claude-sonnet",
        displayName: "Claude 3.5 Sonnet",
        provider: "anthropic",
        modelId: "claude-3-5-sonnet-20241022",
        tier: "pro",
        isBaseline: false,
    },
    {
        id: "claude-haiku",
        displayName: "Claude 3.5 Haiku",
        provider: "anthropic",
        modelId: "claude-3-5-haiku-20241022",
        tier: "lite",
        isBaseline: false,
    },
];
|
|
80
|
+
/**
 * Look up a registry entry by its short registry `id`.
 *
 * @param id - Registry id to match exactly (e.g. "gpt-4o").
 * @returns The first matching entry, or `undefined` when none matches.
 */
export function getModelById(id) {
    for (const entry of MODEL_REGISTRY) {
        if (entry.id === id) {
            return entry;
        }
    }
    return undefined;
}
|
|
86
|
+
/**
 * Look up a registry entry by its provider-side `modelId` (the API identifier).
 *
 * @param modelId - API identifier to match exactly (e.g. "llama-3.1-8b-instant").
 * @returns The first matching entry, or `undefined` when none matches.
 */
export function getModelByModelId(modelId) {
    for (const entry of MODEL_REGISTRY) {
        if (entry.modelId === modelId) {
            return entry;
        }
    }
    return undefined;
}
|
|
92
|
+
/**
 * Collect the registry entries flagged as baselines.
 *
 * @returns {object[]} A new array holding every entry whose `isBaseline` is truthy,
 *   in registry order.
 */
export function getBaselineModels() {
    const baselines = [];
    for (const entry of MODEL_REGISTRY) {
        if (entry.isBaseline) {
            baselines.push(entry);
        }
    }
    return baselines;
}
|
|
98
|
+
/**
 * Collect the registry entries served by one provider.
 *
 * @param provider - Provider name to match exactly (e.g. "groq").
 * @returns {object[]} A new array of matching entries, in registry order.
 */
export function getModelsByProvider(provider) {
    const matches = [];
    for (const entry of MODEL_REGISTRY) {
        if (entry.provider === provider) {
            matches.push(entry);
        }
    }
    return matches;
}
|
|
104
|
+
/**
 * Collect the registry entries belonging to one tier.
 *
 * @param tier - Tier name to match exactly (e.g. "lite").
 * @returns {object[]} A new array of matching entries, in registry order.
 */
export function getModelsByTier(tier) {
    const matches = [];
    for (const entry of MODEL_REGISTRY) {
        if (entry.tier === tier) {
            matches.push(entry);
        }
    }
    return matches;
}
|
|
110
|
+
/**
 * Check whether an identifier names a registered model.
 *
 * @param id - Either a registry `id` or a provider-side `modelId`.
 * @returns {boolean} `true` when any registry entry matches on either field.
 */
export function validateModelId(id) {
    for (const entry of MODEL_REGISTRY) {
        if (entry.id === id) {
            return true;
        }
        if (entry.modelId === id) {
            return true;
        }
    }
    return false;
}
|
|
116
|
+
/**
 * Resolve a human-readable name for a model.
 *
 * The registry `id` lookup is tried first, then the provider-side `modelId`
 * lookup. When neither finds an entry (or the entry has no display name),
 * the input is returned unchanged.
 *
 * @param idOrModelId - Registry id or API identifier.
 * @returns The entry's `displayName`, or `idOrModelId` as a fallback.
 */
export function getModelDisplayName(idOrModelId) {
    let entry = getModelById(idOrModelId);
    if (entry == null) {
        entry = getModelByModelId(idOrModelId);
    }
    const label = entry?.displayName;
    if (label == null) {
        return idOrModelId;
    }
    return label;
}
|