@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773726941205.json +38 -0
- package/scripts/rank_trajectories.ts +0 -1
- package/scripts/run_task_benchmark.ts +4 -11
- package/src/adapter.ts +96 -49
- package/src/archetypes/ArchetypeConfigService.ts +188 -185
- package/src/archetypes/derive-archetype.ts +47 -47
- package/src/archetypes/index.ts +2 -2
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
- package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
- package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
- package/src/benchmark/BenchmarkDataViewer.ts +32 -30
- package/src/benchmark/BenchmarkHistoryService.ts +13 -12
- package/src/benchmark/BenchmarkRunner.ts +87 -83
- package/src/benchmark/BenchmarkValidator.ts +48 -46
- package/src/benchmark/FastEvalRunner.ts +17 -16
- package/src/benchmark/MetricsValidator.ts +20 -21
- package/src/benchmark/MetricsVisualizer.ts +92 -85
- package/src/benchmark/ModelBenchmarkService.ts +90 -82
- package/src/benchmark/ModelRegistry.ts +44 -44
- package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
- package/src/benchmark/SimulationA2AInterface.ts +118 -118
- package/src/benchmark/SimulationEngine.ts +51 -51
- package/src/benchmark/TaskRunner.ts +87 -79
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
- package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
- package/src/benchmark/index.ts +27 -27
- package/src/benchmark/parseSimulationMetrics.ts +32 -32
- package/src/benchmark/simulation-types.ts +10 -10
- package/src/dependencies.ts +34 -34
- package/src/generation/TrajectoryGenerator.ts +39 -37
- package/src/generation/index.ts +1 -1
- package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
- package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
- package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
- package/src/huggingface/index.ts +6 -6
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
- package/src/index.ts +27 -27
- package/src/init-training.ts +6 -6
- package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
- package/src/metrics/index.ts +2 -2
- package/src/rubrics/__tests__/index.test.ts +73 -73
- package/src/rubrics/ass-kisser.ts +6 -6
- package/src/rubrics/degen.ts +6 -6
- package/src/rubrics/goody-twoshoes.ts +6 -6
- package/src/rubrics/index.ts +50 -50
- package/src/rubrics/information-trader.ts +6 -6
- package/src/rubrics/infosec.ts +6 -6
- package/src/rubrics/liar.ts +6 -6
- package/src/rubrics/perps-trader.ts +6 -6
- package/src/rubrics/researcher.ts +6 -6
- package/src/rubrics/scammer.ts +6 -6
- package/src/rubrics/social-butterfly.ts +7 -7
- package/src/rubrics/super-predictor.ts +6 -6
- package/src/rubrics/trader.ts +5 -5
- package/src/scoring/ArchetypeScoringService.ts +56 -54
- package/src/scoring/JudgePromptBuilder.ts +96 -96
- package/src/scoring/LLMJudgeCache.ts +26 -23
- package/src/scoring/index.ts +3 -3
- package/src/training/AutomationPipeline.ts +149 -140
- package/src/training/BenchmarkService.ts +49 -45
- package/src/training/ConfigValidator.ts +38 -32
- package/src/training/MarketOutcomesTracker.ts +22 -12
- package/src/training/ModelDeployer.ts +15 -15
- package/src/training/ModelFetcher.ts +7 -7
- package/src/training/ModelSelectionService.ts +32 -32
- package/src/training/ModelUsageVerifier.ts +31 -24
- package/src/training/MultiModelOrchestrator.ts +44 -44
- package/src/training/RLModelConfig.ts +57 -57
- package/src/training/RewardBackpropagationService.ts +18 -17
- package/src/training/RulerScoringService.ts +73 -72
- package/src/training/TrainingMonitor.ts +29 -29
- package/src/training/TrajectoryRecorder.ts +25 -27
- package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
- package/src/training/index.ts +36 -36
- package/src/training/logRLConfig.ts +7 -7
- package/src/training/pipeline.ts +13 -16
- package/src/training/storage/ModelStorageService.ts +32 -32
- package/src/training/storage/TrainingDataArchiver.ts +21 -21
- package/src/training/storage/index.ts +2 -2
- package/src/training/types.ts +6 -6
- package/src/training/window-utils.ts +14 -14
- package/src/utils/index.ts +7 -7
- package/src/utils/logger.ts +5 -5
- package/src/utils/snowflake.ts +1 -1
- package/src/utils/synthetic-detector.ts +7 -7
|
@@ -14,19 +14,19 @@
|
|
|
14
14
|
* @see BenchmarkService - For training pipeline evaluation
|
|
15
15
|
*/
|
|
16
16
|
|
|
17
|
-
import {
|
|
18
|
-
import
|
|
19
|
-
import {
|
|
20
|
-
import
|
|
21
|
-
import { getAgentRuntimeManager } from
|
|
22
|
-
import { logger } from
|
|
23
|
-
import { generateSnowflakeId } from
|
|
24
|
-
import { BenchmarkRunner } from
|
|
17
|
+
import { promises as fs } from "node:fs";
|
|
18
|
+
import * as path from "node:path";
|
|
19
|
+
import { ethers } from "ethers";
|
|
20
|
+
import { getTrainingDataAdapter } from "../adapter";
|
|
21
|
+
import { getAgentRuntimeManager } from "../dependencies";
|
|
22
|
+
import { logger } from "../utils/logger";
|
|
23
|
+
import { generateSnowflakeId } from "../utils/snowflake";
|
|
24
|
+
import { BenchmarkRunner } from "./BenchmarkRunner";
|
|
25
25
|
import {
|
|
26
26
|
type JsonValue,
|
|
27
27
|
parseSimulationMetrics,
|
|
28
|
-
} from
|
|
29
|
-
import type { SimulationMetrics, SimulationResult } from
|
|
28
|
+
} from "./parseSimulationMetrics";
|
|
29
|
+
import type { SimulationMetrics, SimulationResult } from "./SimulationEngine";
|
|
30
30
|
|
|
31
31
|
export interface ModelBenchmarkOptions {
|
|
32
32
|
modelId: string;
|
|
@@ -66,7 +66,7 @@ export interface ModelComparisonResult {
|
|
|
66
66
|
optimalityDelta: number;
|
|
67
67
|
isImprovement: boolean;
|
|
68
68
|
};
|
|
69
|
-
recommendation:
|
|
69
|
+
recommendation: "deploy" | "keep_training" | "baseline_better";
|
|
70
70
|
}
|
|
71
71
|
|
|
72
72
|
export interface AverageMetrics {
|
|
@@ -82,9 +82,9 @@ export class ModelBenchmarkService {
|
|
|
82
82
|
* Benchmark a trained model against standard benchmarks
|
|
83
83
|
*/
|
|
84
84
|
static async benchmarkModel(
|
|
85
|
-
options: ModelBenchmarkOptions
|
|
85
|
+
options: ModelBenchmarkOptions,
|
|
86
86
|
): Promise<ModelBenchmarkResult[]> {
|
|
87
|
-
logger.info(
|
|
87
|
+
logger.info("Starting model benchmark", { modelId: options.modelId });
|
|
88
88
|
|
|
89
89
|
// Load model from database
|
|
90
90
|
const adapter = getTrainingDataAdapter();
|
|
@@ -95,9 +95,11 @@ export class ModelBenchmarkService {
|
|
|
95
95
|
}
|
|
96
96
|
|
|
97
97
|
// Check if model already benchmarked
|
|
98
|
-
const existingBenchmarks = await
|
|
98
|
+
const existingBenchmarks = await ModelBenchmarkService.getModelBenchmarks(
|
|
99
|
+
options.modelId,
|
|
100
|
+
);
|
|
99
101
|
if (existingBenchmarks.length > 0 && !options.saveResults) {
|
|
100
|
-
logger.info(
|
|
102
|
+
logger.info("Model already benchmarked", {
|
|
101
103
|
modelId: options.modelId,
|
|
102
104
|
count: existingBenchmarks.length,
|
|
103
105
|
});
|
|
@@ -105,13 +107,13 @@ export class ModelBenchmarkService {
|
|
|
105
107
|
}
|
|
106
108
|
|
|
107
109
|
// Create test agent for benchmarking
|
|
108
|
-
const testAgentId = await
|
|
110
|
+
const testAgentId = await ModelBenchmarkService.getOrCreateTestAgent();
|
|
109
111
|
|
|
110
112
|
const results: ModelBenchmarkResult[] = [];
|
|
111
113
|
|
|
112
114
|
// Run each benchmark
|
|
113
115
|
for (const benchmarkPath of options.benchmarkPaths) {
|
|
114
|
-
logger.info(
|
|
116
|
+
logger.info("Running benchmark", {
|
|
115
117
|
benchmark: benchmarkPath,
|
|
116
118
|
modelId: options.modelId,
|
|
117
119
|
});
|
|
@@ -131,9 +133,9 @@ export class ModelBenchmarkService {
|
|
|
131
133
|
options.outputDir ||
|
|
132
134
|
path.join(
|
|
133
135
|
process.cwd(),
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
model.version
|
|
136
|
+
"benchmarks",
|
|
137
|
+
"model-results",
|
|
138
|
+
model.version,
|
|
137
139
|
),
|
|
138
140
|
forceModel: model.storagePath, // Use the RL model
|
|
139
141
|
});
|
|
@@ -149,7 +151,8 @@ export class ModelBenchmarkService {
|
|
|
149
151
|
};
|
|
150
152
|
|
|
151
153
|
// Compare to baseline if available
|
|
152
|
-
const baseline =
|
|
154
|
+
const baseline =
|
|
155
|
+
await ModelBenchmarkService.getBaselineBenchmark(benchmarkPath);
|
|
153
156
|
if (baseline) {
|
|
154
157
|
benchmarkResult.comparisonToBaseline = {
|
|
155
158
|
pnlDelta: simulationResult.metrics.totalPnl - baseline.totalPnl,
|
|
@@ -165,7 +168,7 @@ export class ModelBenchmarkService {
|
|
|
165
168
|
|
|
166
169
|
results.push(benchmarkResult);
|
|
167
170
|
|
|
168
|
-
logger.info(
|
|
171
|
+
logger.info("Benchmark completed", {
|
|
169
172
|
benchmark: benchmarkPath,
|
|
170
173
|
pnl: simulationResult.metrics.totalPnl,
|
|
171
174
|
accuracy: simulationResult.metrics.predictionMetrics.accuracy,
|
|
@@ -173,11 +176,13 @@ export class ModelBenchmarkService {
|
|
|
173
176
|
|
|
174
177
|
// Save result if requested (to both database and files)
|
|
175
178
|
if (options.saveResults) {
|
|
176
|
-
await
|
|
177
|
-
|
|
179
|
+
await ModelBenchmarkService.saveBenchmarkResultToDatabase(
|
|
180
|
+
benchmarkResult,
|
|
181
|
+
);
|
|
182
|
+
await ModelBenchmarkService.saveBenchmarkResult(benchmarkResult);
|
|
178
183
|
}
|
|
179
184
|
} catch (error) {
|
|
180
|
-
logger.error(
|
|
185
|
+
logger.error("Benchmark failed", { benchmark: benchmarkPath, error });
|
|
181
186
|
}
|
|
182
187
|
}
|
|
183
188
|
|
|
@@ -198,7 +203,7 @@ export class ModelBenchmarkService {
|
|
|
198
203
|
);
|
|
199
204
|
}
|
|
200
205
|
|
|
201
|
-
logger.info(
|
|
206
|
+
logger.info("Model benchmark complete", {
|
|
202
207
|
modelId: options.modelId,
|
|
203
208
|
benchmarksRun: results.length,
|
|
204
209
|
});
|
|
@@ -210,22 +215,24 @@ export class ModelBenchmarkService {
|
|
|
210
215
|
* Compare new model against baseline
|
|
211
216
|
*/
|
|
212
217
|
static async compareToBaseline(
|
|
213
|
-
modelId: string
|
|
218
|
+
modelId: string,
|
|
214
219
|
): Promise<ModelComparisonResult> {
|
|
215
220
|
// Get new model benchmarks
|
|
216
|
-
const newModelBenchmarks =
|
|
221
|
+
const newModelBenchmarks =
|
|
222
|
+
await ModelBenchmarkService.getModelBenchmarks(modelId);
|
|
217
223
|
|
|
218
224
|
if (newModelBenchmarks.length === 0) {
|
|
219
225
|
throw new Error(`No benchmarks found for model: ${modelId}`);
|
|
220
226
|
}
|
|
221
227
|
|
|
222
228
|
// Calculate new model average metrics
|
|
223
|
-
const newModelMetrics =
|
|
224
|
-
newModelBenchmarks.map((b) => b.metrics)
|
|
229
|
+
const newModelMetrics = ModelBenchmarkService.calculateAverageMetrics(
|
|
230
|
+
newModelBenchmarks.map((b) => b.metrics),
|
|
225
231
|
);
|
|
226
232
|
|
|
227
233
|
// Get baseline benchmarks (use best baseline model)
|
|
228
|
-
const baselineMetrics =
|
|
234
|
+
const baselineMetrics =
|
|
235
|
+
await ModelBenchmarkService.getBaselineAverageMetrics();
|
|
229
236
|
|
|
230
237
|
// Calculate improvement
|
|
231
238
|
const pnlDelta = newModelMetrics.totalPnl - baselineMetrics.totalPnl;
|
|
@@ -241,23 +248,23 @@ export class ModelBenchmarkService {
|
|
|
241
248
|
|
|
242
249
|
const isImprovement = improvementScore > 0.5;
|
|
243
250
|
|
|
244
|
-
let recommendation:
|
|
251
|
+
let recommendation: "deploy" | "keep_training" | "baseline_better";
|
|
245
252
|
if (isImprovement && pnlDelta > 0) {
|
|
246
|
-
recommendation =
|
|
253
|
+
recommendation = "deploy";
|
|
247
254
|
} else if (pnlDelta < -100) {
|
|
248
|
-
recommendation =
|
|
255
|
+
recommendation = "baseline_better";
|
|
249
256
|
} else {
|
|
250
|
-
recommendation =
|
|
257
|
+
recommendation = "keep_training";
|
|
251
258
|
}
|
|
252
259
|
|
|
253
260
|
return {
|
|
254
261
|
newModel: {
|
|
255
262
|
modelId,
|
|
256
|
-
version: newModelBenchmarks[0]
|
|
263
|
+
version: newModelBenchmarks[0]?.modelVersion,
|
|
257
264
|
avgMetrics: newModelMetrics,
|
|
258
265
|
},
|
|
259
266
|
baseline: {
|
|
260
|
-
modelId:
|
|
267
|
+
modelId: "baseline",
|
|
261
268
|
avgMetrics: baselineMetrics,
|
|
262
269
|
},
|
|
263
270
|
improvement: {
|
|
@@ -281,15 +288,15 @@ export class ModelBenchmarkService {
|
|
|
281
288
|
* Get model benchmark results
|
|
282
289
|
*/
|
|
283
290
|
private static async getModelBenchmarks(
|
|
284
|
-
modelId: string
|
|
291
|
+
modelId: string,
|
|
285
292
|
): Promise<ModelBenchmarkResult[]> {
|
|
286
293
|
// For now, read from files
|
|
287
294
|
// In production, you'd store these in a database table
|
|
288
295
|
|
|
289
296
|
const benchmarksDir = path.join(
|
|
290
297
|
process.cwd(),
|
|
291
|
-
|
|
292
|
-
|
|
298
|
+
"benchmarks",
|
|
299
|
+
"model-results",
|
|
293
300
|
);
|
|
294
301
|
const results: ModelBenchmarkResult[] = [];
|
|
295
302
|
|
|
@@ -302,9 +309,9 @@ export class ModelBenchmarkService {
|
|
|
302
309
|
const files = await fs.readdir(modelDir).catch(() => []);
|
|
303
310
|
|
|
304
311
|
for (const file of files) {
|
|
305
|
-
if (file.endsWith(
|
|
312
|
+
if (file.endsWith(".json")) {
|
|
306
313
|
const filePath = path.join(modelDir, file);
|
|
307
|
-
const data = JSON.parse(await fs.readFile(filePath,
|
|
314
|
+
const data = JSON.parse(await fs.readFile(filePath, "utf-8"));
|
|
308
315
|
|
|
309
316
|
if (data.modelId === modelId) {
|
|
310
317
|
results.push(data);
|
|
@@ -312,7 +319,7 @@ export class ModelBenchmarkService {
|
|
|
312
319
|
}
|
|
313
320
|
}
|
|
314
321
|
} catch (error) {
|
|
315
|
-
logger.warn(
|
|
322
|
+
logger.warn("Could not load benchmark results", { error });
|
|
316
323
|
}
|
|
317
324
|
|
|
318
325
|
return results;
|
|
@@ -322,7 +329,7 @@ export class ModelBenchmarkService {
|
|
|
322
329
|
* Save benchmark result to database
|
|
323
330
|
*/
|
|
324
331
|
private static async saveBenchmarkResultToDatabase(
|
|
325
|
-
result: ModelBenchmarkResult
|
|
332
|
+
result: ModelBenchmarkResult,
|
|
326
333
|
): Promise<void> {
|
|
327
334
|
await getTrainingDataAdapter().insertBenchmarkResult({
|
|
328
335
|
id: await generateSnowflakeId(),
|
|
@@ -341,7 +348,7 @@ export class ModelBenchmarkService {
|
|
|
341
348
|
duration: result.metrics.timing.totalDuration,
|
|
342
349
|
});
|
|
343
350
|
|
|
344
|
-
logger.info(
|
|
351
|
+
logger.info("Benchmark result saved to database", {
|
|
345
352
|
modelId: result.modelId,
|
|
346
353
|
benchmarkId: result.benchmarkId,
|
|
347
354
|
});
|
|
@@ -351,13 +358,13 @@ export class ModelBenchmarkService {
|
|
|
351
358
|
* Save benchmark result to file
|
|
352
359
|
*/
|
|
353
360
|
private static async saveBenchmarkResult(
|
|
354
|
-
result: ModelBenchmarkResult
|
|
361
|
+
result: ModelBenchmarkResult,
|
|
355
362
|
): Promise<void> {
|
|
356
363
|
const outputDir = path.join(
|
|
357
364
|
process.cwd(),
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
result.modelVersion
|
|
365
|
+
"benchmarks",
|
|
366
|
+
"model-results",
|
|
367
|
+
result.modelVersion,
|
|
361
368
|
);
|
|
362
369
|
await fs.mkdir(outputDir, { recursive: true });
|
|
363
370
|
|
|
@@ -366,20 +373,21 @@ export class ModelBenchmarkService {
|
|
|
366
373
|
|
|
367
374
|
await fs.writeFile(filePath, JSON.stringify(result, null, 2));
|
|
368
375
|
|
|
369
|
-
logger.info(
|
|
376
|
+
logger.info("Benchmark result saved to file", { filePath });
|
|
370
377
|
}
|
|
371
378
|
|
|
372
379
|
/**
|
|
373
380
|
* Get benchmark results from database
|
|
374
381
|
*/
|
|
375
382
|
static async getBenchmarkResultsFromDatabase(
|
|
376
|
-
modelId: string
|
|
383
|
+
modelId: string,
|
|
377
384
|
): Promise<ModelBenchmarkResult[]> {
|
|
378
|
-
const results =
|
|
385
|
+
const results =
|
|
386
|
+
await getTrainingDataAdapter().getBenchmarkResultsByModel(modelId);
|
|
379
387
|
|
|
380
388
|
return results.map((r) => ({
|
|
381
389
|
modelId: r.modelId,
|
|
382
|
-
modelVersion:
|
|
390
|
+
modelVersion: "", // Not stored in results table
|
|
383
391
|
benchmarkId: r.benchmarkId,
|
|
384
392
|
benchmarkPath: r.benchmarkPath,
|
|
385
393
|
runAt: r.runAt,
|
|
@@ -400,17 +408,17 @@ export class ModelBenchmarkService {
|
|
|
400
408
|
* Get baseline benchmark for comparison
|
|
401
409
|
*/
|
|
402
410
|
private static async getBaselineBenchmark(
|
|
403
|
-
benchmarkPath: string
|
|
411
|
+
benchmarkPath: string,
|
|
404
412
|
): Promise<SimulationMetrics | null> {
|
|
405
413
|
try {
|
|
406
414
|
// Look for baseline result for this benchmark
|
|
407
|
-
const baselinesDir = path.join(process.cwd(),
|
|
415
|
+
const baselinesDir = path.join(process.cwd(), "benchmarks", "baselines");
|
|
408
416
|
const files = await fs.readdir(baselinesDir).catch(() => []);
|
|
409
417
|
|
|
410
418
|
for (const file of files) {
|
|
411
|
-
if (file.endsWith(
|
|
419
|
+
if (file.endsWith(".json")) {
|
|
412
420
|
const filePath = path.join(baselinesDir, file);
|
|
413
|
-
const data = JSON.parse(await fs.readFile(filePath,
|
|
421
|
+
const data = JSON.parse(await fs.readFile(filePath, "utf-8"));
|
|
414
422
|
|
|
415
423
|
if (
|
|
416
424
|
data.benchmark?.path === benchmarkPath ||
|
|
@@ -421,7 +429,7 @@ export class ModelBenchmarkService {
|
|
|
421
429
|
}
|
|
422
430
|
}
|
|
423
431
|
} catch (error) {
|
|
424
|
-
logger.warn(
|
|
432
|
+
logger.warn("Could not load baseline benchmark", { error });
|
|
425
433
|
}
|
|
426
434
|
|
|
427
435
|
return null;
|
|
@@ -431,7 +439,7 @@ export class ModelBenchmarkService {
|
|
|
431
439
|
* Calculate average metrics across multiple benchmark results
|
|
432
440
|
*/
|
|
433
441
|
private static calculateAverageMetrics(
|
|
434
|
-
metricsArray: SimulationMetrics[]
|
|
442
|
+
metricsArray: SimulationMetrics[],
|
|
435
443
|
): AverageMetrics {
|
|
436
444
|
if (metricsArray.length === 0) {
|
|
437
445
|
return {
|
|
@@ -450,7 +458,7 @@ export class ModelBenchmarkService {
|
|
|
450
458
|
winRate: acc.winRate + metrics.perpMetrics.winRate,
|
|
451
459
|
optimality: acc.optimality + metrics.optimalityScore,
|
|
452
460
|
}),
|
|
453
|
-
{ pnl: 0, accuracy: 0, winRate: 0, optimality: 0 }
|
|
461
|
+
{ pnl: 0, accuracy: 0, winRate: 0, optimality: 0 },
|
|
454
462
|
);
|
|
455
463
|
|
|
456
464
|
const count = metricsArray.length;
|
|
@@ -468,16 +476,16 @@ export class ModelBenchmarkService {
|
|
|
468
476
|
* Get baseline average metrics
|
|
469
477
|
*/
|
|
470
478
|
private static async getBaselineAverageMetrics(): Promise<AverageMetrics> {
|
|
471
|
-
const baselinesDir = path.join(process.cwd(),
|
|
479
|
+
const baselinesDir = path.join(process.cwd(), "benchmarks", "baselines");
|
|
472
480
|
const metricsArray: SimulationMetrics[] = [];
|
|
473
481
|
|
|
474
482
|
try {
|
|
475
483
|
const files = await fs.readdir(baselinesDir).catch(() => []);
|
|
476
484
|
|
|
477
485
|
for (const file of files) {
|
|
478
|
-
if (file.endsWith(
|
|
486
|
+
if (file.endsWith(".json")) {
|
|
479
487
|
const filePath = path.join(baselinesDir, file);
|
|
480
|
-
const data = JSON.parse(await fs.readFile(filePath,
|
|
488
|
+
const data = JSON.parse(await fs.readFile(filePath, "utf-8"));
|
|
481
489
|
|
|
482
490
|
if (data.metrics) {
|
|
483
491
|
metricsArray.push(data.metrics);
|
|
@@ -485,17 +493,17 @@ export class ModelBenchmarkService {
|
|
|
485
493
|
}
|
|
486
494
|
}
|
|
487
495
|
} catch (error) {
|
|
488
|
-
logger.warn(
|
|
496
|
+
logger.warn("Could not load baseline metrics", { error });
|
|
489
497
|
}
|
|
490
498
|
|
|
491
|
-
return
|
|
499
|
+
return ModelBenchmarkService.calculateAverageMetrics(metricsArray);
|
|
492
500
|
}
|
|
493
501
|
|
|
494
502
|
/**
|
|
495
503
|
* Get or create test agent for benchmarking
|
|
496
504
|
*/
|
|
497
505
|
private static async getOrCreateTestAgent(): Promise<string> {
|
|
498
|
-
const testAgentUsername =
|
|
506
|
+
const testAgentUsername = "model-benchmark-agent";
|
|
499
507
|
const adapter = getTrainingDataAdapter();
|
|
500
508
|
|
|
501
509
|
const existing = await adapter.getUserByUsername(testAgentUsername);
|
|
@@ -510,10 +518,10 @@ export class ModelBenchmarkService {
|
|
|
510
518
|
id: agentId,
|
|
511
519
|
privyId: `did:privy:model-benchmark-${agentId}`,
|
|
512
520
|
username: testAgentUsername,
|
|
513
|
-
displayName:
|
|
521
|
+
displayName: "Model Benchmark Agent",
|
|
514
522
|
walletAddress: ethers.Wallet.createRandom().address,
|
|
515
523
|
isAgent: true,
|
|
516
|
-
virtualBalance:
|
|
524
|
+
virtualBalance: "10000",
|
|
517
525
|
reputationPoints: 1000,
|
|
518
526
|
isTest: true,
|
|
519
527
|
updatedAt: new Date(),
|
|
@@ -528,17 +536,17 @@ export class ModelBenchmarkService {
|
|
|
528
536
|
autonomousPosting: false,
|
|
529
537
|
autonomousCommenting: false,
|
|
530
538
|
systemPrompt:
|
|
531
|
-
|
|
532
|
-
modelTier:
|
|
539
|
+
"You are a test agent for benchmarking model performance.",
|
|
540
|
+
modelTier: "pro",
|
|
533
541
|
updatedAt: new Date(),
|
|
534
542
|
});
|
|
535
543
|
}
|
|
536
544
|
|
|
537
545
|
if (!agent) {
|
|
538
|
-
throw new Error(
|
|
546
|
+
throw new Error("Failed to create model benchmark test agent");
|
|
539
547
|
}
|
|
540
548
|
|
|
541
|
-
logger.info(
|
|
549
|
+
logger.info("Created model benchmark test agent", { agentId: agent.id });
|
|
542
550
|
|
|
543
551
|
return agent.id;
|
|
544
552
|
}
|
|
@@ -547,12 +555,12 @@ export class ModelBenchmarkService {
|
|
|
547
555
|
* Get standard benchmark paths for model evaluation
|
|
548
556
|
*/
|
|
549
557
|
static async getStandardBenchmarkPaths(): Promise<string[]> {
|
|
550
|
-
const benchmarksDir = path.join(process.cwd(),
|
|
558
|
+
const benchmarksDir = path.join(process.cwd(), "benchmarks");
|
|
551
559
|
const standardBenchmarks: string[] = [];
|
|
552
560
|
|
|
553
561
|
try {
|
|
554
562
|
// First, look in benchmarks/standard/ directory
|
|
555
|
-
const standardDir = path.join(benchmarksDir,
|
|
563
|
+
const standardDir = path.join(benchmarksDir, "standard");
|
|
556
564
|
if (
|
|
557
565
|
await fs
|
|
558
566
|
.access(standardDir)
|
|
@@ -561,7 +569,7 @@ export class ModelBenchmarkService {
|
|
|
561
569
|
) {
|
|
562
570
|
const standardFiles = await fs.readdir(standardDir);
|
|
563
571
|
for (const file of standardFiles) {
|
|
564
|
-
if (file.startsWith(
|
|
572
|
+
if (file.startsWith("standard-") && file.endsWith(".json")) {
|
|
565
573
|
standardBenchmarks.push(path.join(standardDir, file));
|
|
566
574
|
}
|
|
567
575
|
}
|
|
@@ -570,7 +578,7 @@ export class ModelBenchmarkService {
|
|
|
570
578
|
// If standard benchmarks found, use those
|
|
571
579
|
if (standardBenchmarks.length > 0) {
|
|
572
580
|
logger.info(
|
|
573
|
-
`Using ${standardBenchmarks.length} standard benchmarks from benchmarks/standard
|
|
581
|
+
`Using ${standardBenchmarks.length} standard benchmarks from benchmarks/standard/`,
|
|
574
582
|
);
|
|
575
583
|
return standardBenchmarks;
|
|
576
584
|
}
|
|
@@ -578,7 +586,7 @@ export class ModelBenchmarkService {
|
|
|
578
586
|
// Fallback: Look for week-long benchmarks in main directory
|
|
579
587
|
const files = await fs.readdir(benchmarksDir);
|
|
580
588
|
for (const file of files) {
|
|
581
|
-
if (file.startsWith(
|
|
589
|
+
if (file.startsWith("benchmark-week-") && file.endsWith(".json")) {
|
|
582
590
|
standardBenchmarks.push(path.join(benchmarksDir, file));
|
|
583
591
|
}
|
|
584
592
|
}
|
|
@@ -587,9 +595,9 @@ export class ModelBenchmarkService {
|
|
|
587
595
|
if (standardBenchmarks.length === 0) {
|
|
588
596
|
for (const file of files) {
|
|
589
597
|
if (
|
|
590
|
-
file.startsWith(
|
|
591
|
-
file.endsWith(
|
|
592
|
-
!file.includes(
|
|
598
|
+
file.startsWith("benchmark-") &&
|
|
599
|
+
file.endsWith(".json") &&
|
|
600
|
+
!file.includes("comparison")
|
|
593
601
|
) {
|
|
594
602
|
const filePath = path.join(benchmarksDir, file);
|
|
595
603
|
standardBenchmarks.push(filePath);
|
|
@@ -597,12 +605,12 @@ export class ModelBenchmarkService {
|
|
|
597
605
|
}
|
|
598
606
|
}
|
|
599
607
|
} catch (error) {
|
|
600
|
-
logger.error(
|
|
608
|
+
logger.error("Could not load standard benchmarks", { error });
|
|
601
609
|
}
|
|
602
610
|
|
|
603
611
|
if (standardBenchmarks.length === 0) {
|
|
604
612
|
logger.warn(
|
|
605
|
-
|
|
613
|
+
"No standard benchmarks found. Generate benchmark fixtures before upload.",
|
|
606
614
|
);
|
|
607
615
|
}
|
|
608
616
|
|
|
@@ -13,13 +13,13 @@ export interface ModelConfig {
|
|
|
13
13
|
displayName: string;
|
|
14
14
|
|
|
15
15
|
/** Provider (groq, openai, anthropic, etc.) */
|
|
16
|
-
provider:
|
|
16
|
+
provider: "groq" | "openai" | "anthropic" | "together" | "local";
|
|
17
17
|
|
|
18
18
|
/** Model identifier for the provider's API */
|
|
19
19
|
modelId: string;
|
|
20
20
|
|
|
21
21
|
/** Model tier (lite, standard, pro) */
|
|
22
|
-
tier:
|
|
22
|
+
tier: "lite" | "standard" | "pro";
|
|
23
23
|
|
|
24
24
|
/** Approximate parameters in billions */
|
|
25
25
|
parametersBillions?: number;
|
|
@@ -36,71 +36,71 @@ export interface ModelConfig {
|
|
|
36
36
|
*/
|
|
37
37
|
export const MODEL_REGISTRY: ModelConfig[] = [
|
|
38
38
|
{
|
|
39
|
-
id:
|
|
40
|
-
displayName:
|
|
41
|
-
provider:
|
|
42
|
-
modelId:
|
|
43
|
-
tier:
|
|
39
|
+
id: "llama-8b",
|
|
40
|
+
displayName: "LLaMA 3.1 8B",
|
|
41
|
+
provider: "groq",
|
|
42
|
+
modelId: "llama-3.1-8b-instant",
|
|
43
|
+
tier: "lite",
|
|
44
44
|
parametersBillions: 8,
|
|
45
45
|
isBaseline: true,
|
|
46
46
|
},
|
|
47
47
|
{
|
|
48
|
-
id:
|
|
49
|
-
displayName:
|
|
50
|
-
provider:
|
|
51
|
-
modelId:
|
|
52
|
-
tier:
|
|
48
|
+
id: "llama-70b",
|
|
49
|
+
displayName: "LLaMA 3.1 70B",
|
|
50
|
+
provider: "groq",
|
|
51
|
+
modelId: "llama-3.1-70b-versatile",
|
|
52
|
+
tier: "standard",
|
|
53
53
|
parametersBillions: 70,
|
|
54
54
|
isBaseline: false,
|
|
55
55
|
},
|
|
56
56
|
{
|
|
57
|
-
id:
|
|
58
|
-
displayName:
|
|
59
|
-
provider:
|
|
60
|
-
modelId:
|
|
61
|
-
tier:
|
|
57
|
+
id: "qwen-32b",
|
|
58
|
+
displayName: "Qwen 3 32B",
|
|
59
|
+
provider: "groq",
|
|
60
|
+
modelId: "qwen/qwen3-32b",
|
|
61
|
+
tier: "standard",
|
|
62
62
|
parametersBillions: 32,
|
|
63
63
|
isBaseline: true,
|
|
64
64
|
},
|
|
65
65
|
{
|
|
66
|
-
id:
|
|
67
|
-
displayName:
|
|
68
|
-
provider:
|
|
69
|
-
modelId:
|
|
70
|
-
tier:
|
|
66
|
+
id: "mixtral-8x7b",
|
|
67
|
+
displayName: "Mixtral 8x7B",
|
|
68
|
+
provider: "groq",
|
|
69
|
+
modelId: "mixtral-8x7b-32768",
|
|
70
|
+
tier: "standard",
|
|
71
71
|
parametersBillions: 46,
|
|
72
72
|
isBaseline: false,
|
|
73
73
|
},
|
|
74
74
|
{
|
|
75
|
-
id:
|
|
76
|
-
displayName:
|
|
77
|
-
provider:
|
|
78
|
-
modelId:
|
|
79
|
-
tier:
|
|
75
|
+
id: "gpt-4o",
|
|
76
|
+
displayName: "GPT-4o",
|
|
77
|
+
provider: "openai",
|
|
78
|
+
modelId: "gpt-4o",
|
|
79
|
+
tier: "pro",
|
|
80
80
|
isBaseline: false,
|
|
81
81
|
},
|
|
82
82
|
{
|
|
83
|
-
id:
|
|
84
|
-
displayName:
|
|
85
|
-
provider:
|
|
86
|
-
modelId:
|
|
87
|
-
tier:
|
|
83
|
+
id: "gpt-4o-mini",
|
|
84
|
+
displayName: "GPT-4o Mini",
|
|
85
|
+
provider: "openai",
|
|
86
|
+
modelId: "gpt-4o-mini",
|
|
87
|
+
tier: "lite",
|
|
88
88
|
isBaseline: false,
|
|
89
89
|
},
|
|
90
90
|
{
|
|
91
|
-
id:
|
|
92
|
-
displayName:
|
|
93
|
-
provider:
|
|
94
|
-
modelId:
|
|
95
|
-
tier:
|
|
91
|
+
id: "claude-sonnet",
|
|
92
|
+
displayName: "Claude 3.5 Sonnet",
|
|
93
|
+
provider: "anthropic",
|
|
94
|
+
modelId: "claude-3-5-sonnet-20241022",
|
|
95
|
+
tier: "pro",
|
|
96
96
|
isBaseline: false,
|
|
97
97
|
},
|
|
98
98
|
{
|
|
99
|
-
id:
|
|
100
|
-
displayName:
|
|
101
|
-
provider:
|
|
102
|
-
modelId:
|
|
103
|
-
tier:
|
|
99
|
+
id: "claude-haiku",
|
|
100
|
+
displayName: "Claude 3.5 Haiku",
|
|
101
|
+
provider: "anthropic",
|
|
102
|
+
modelId: "claude-3-5-haiku-20241022",
|
|
103
|
+
tier: "lite",
|
|
104
104
|
isBaseline: false,
|
|
105
105
|
},
|
|
106
106
|
];
|
|
@@ -130,7 +130,7 @@ export function getBaselineModels(): ModelConfig[] {
|
|
|
130
130
|
* Get models by provider
|
|
131
131
|
*/
|
|
132
132
|
export function getModelsByProvider(
|
|
133
|
-
provider: ModelConfig[
|
|
133
|
+
provider: ModelConfig["provider"],
|
|
134
134
|
): ModelConfig[] {
|
|
135
135
|
return MODEL_REGISTRY.filter((m) => m.provider === provider);
|
|
136
136
|
}
|
|
@@ -138,7 +138,7 @@ export function getModelsByProvider(
|
|
|
138
138
|
/**
|
|
139
139
|
* Get models by tier
|
|
140
140
|
*/
|
|
141
|
-
export function getModelsByTier(tier: ModelConfig[
|
|
141
|
+
export function getModelsByTier(tier: ModelConfig["tier"]): ModelConfig[] {
|
|
142
142
|
return MODEL_REGISTRY.filter((m) => m.tier === tier);
|
|
143
143
|
}
|
|
144
144
|
|