@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-lint.log +2 -0
- package/.turbo/turbo-typecheck.log +1 -0
- package/dist/.tsbuildinfo +1 -0
- package/dist/adapter.js +59 -0
- package/dist/archetypes/ArchetypeConfigService.js +510 -0
- package/dist/archetypes/derive-archetype.js +196 -0
- package/dist/archetypes/index.js +7 -0
- package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
- package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
- package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
- package/dist/benchmark/BenchmarkDataViewer.js +197 -0
- package/dist/benchmark/BenchmarkHistoryService.js +135 -0
- package/dist/benchmark/BenchmarkRunner.js +483 -0
- package/dist/benchmark/BenchmarkValidator.js +158 -0
- package/dist/benchmark/FastEvalRunner.js +133 -0
- package/dist/benchmark/MetricsValidator.js +104 -0
- package/dist/benchmark/MetricsVisualizer.js +775 -0
- package/dist/benchmark/ModelBenchmarkService.js +433 -0
- package/dist/benchmark/ModelRegistry.js +122 -0
- package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
- package/dist/benchmark/SimulationA2AInterface.js +683 -0
- package/dist/benchmark/SimulationEngine.js +522 -0
- package/dist/benchmark/TaskRunner.js +60 -0
- package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
- package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
- package/dist/benchmark/index.js +23 -0
- package/dist/benchmark/parseSimulationMetrics.js +86 -0
- package/dist/benchmark/simulation-types.js +1 -0
- package/dist/dependencies.js +197 -0
- package/dist/generation/TrajectoryGenerator.js +244 -0
- package/dist/generation/index.js +6 -0
- package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
- package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
- package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
- package/dist/huggingface/index.js +9 -0
- package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
- package/dist/index.js +41 -0
- package/dist/init-training.js +43 -0
- package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
- package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
- package/dist/metrics/index.js +7 -0
- package/dist/metrics/types.js +21 -0
- package/dist/rubrics/__tests__/index.test.js +150 -0
- package/dist/rubrics/ass-kisser.js +83 -0
- package/dist/rubrics/degen.js +78 -0
- package/dist/rubrics/goody-twoshoes.js +82 -0
- package/dist/rubrics/index.js +184 -0
- package/dist/rubrics/information-trader.js +82 -0
- package/dist/rubrics/infosec.js +99 -0
- package/dist/rubrics/liar.js +102 -0
- package/dist/rubrics/perps-trader.js +85 -0
- package/dist/rubrics/researcher.js +79 -0
- package/dist/rubrics/scammer.js +80 -0
- package/dist/rubrics/social-butterfly.js +71 -0
- package/dist/rubrics/super-predictor.js +95 -0
- package/dist/rubrics/trader.js +65 -0
- package/dist/scoring/ArchetypeScoringService.js +301 -0
- package/dist/scoring/JudgePromptBuilder.js +401 -0
- package/dist/scoring/LLMJudgeCache.js +263 -0
- package/dist/scoring/index.js +8 -0
- package/dist/training/AutomationPipeline.js +714 -0
- package/dist/training/BenchmarkService.js +370 -0
- package/dist/training/ConfigValidator.js +153 -0
- package/dist/training/MarketOutcomesTracker.js +142 -0
- package/dist/training/ModelDeployer.js +128 -0
- package/dist/training/ModelFetcher.js +48 -0
- package/dist/training/ModelSelectionService.js +248 -0
- package/dist/training/ModelUsageVerifier.js +106 -0
- package/dist/training/MultiModelOrchestrator.js +349 -0
- package/dist/training/RLModelConfig.js +295 -0
- package/dist/training/RewardBackpropagationService.js +117 -0
- package/dist/training/RulerScoringService.js +450 -0
- package/dist/training/TrainingMonitor.js +108 -0
- package/dist/training/TrajectoryRecorder.js +281 -0
- package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
- package/dist/training/index.js +30 -0
- package/dist/training/logRLConfig.js +29 -0
- package/dist/training/pipeline.js +80 -0
- package/dist/training/storage/ModelStorageService.js +190 -0
- package/dist/training/storage/TrainingDataArchiver.js +136 -0
- package/dist/training/storage/index.js +7 -0
- package/dist/training/types.js +6 -0
- package/dist/training/window-utils.js +100 -0
- package/dist/utils/index.js +73 -0
- package/dist/utils/logger.js +55 -0
- package/dist/utils/snowflake.js +15 -0
- package/dist/utils/synthetic-detector.js +67 -0
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773742857616.json +38 -0
- package/research-output/training-runs/training-run-1773742946977.json +38 -0
- package/research-output/training-runs/training-run-1773743278891.json +38 -0
- package/research-output/training-runs/training-run-1773743409754.json +38 -0
- package/research-output/training-runs/training-run-1773743651086.json +38 -0
- package/research-output/training-runs/training-run-1773743782883.json +38 -0
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Training Benchmark Service
|
|
3
|
+
*
|
|
4
|
+
* Handles model benchmarking during the training pipeline.
|
|
5
|
+
*
|
|
6
|
+
* **Purpose:** Evaluate models as part of continuous training
|
|
7
|
+
* **Used by:** AutomationPipeline, continuous training scripts
|
|
8
|
+
* **Storage:** trainedModel.evalMetrics (JSON field)
|
|
9
|
+
* **Focus:** Training pipeline integration, deployment decisions
|
|
10
|
+
*
|
|
11
|
+
* **Note:** For HuggingFace upload benchmarking, see ModelBenchmarkService
|
|
12
|
+
*
|
|
13
|
+
* @see ModelBenchmarkService - For HuggingFace upload evaluation
|
|
14
|
+
*/
|
|
15
|
+
import fs from "node:fs/promises";
|
|
16
|
+
import path from "node:path";
|
|
17
|
+
import { getTrainingDataAdapter } from "../adapter";
|
|
18
|
+
import { BenchmarkRunner } from "../benchmark/BenchmarkRunner";
|
|
19
|
+
import { getAgentRuntimeManager } from "../dependencies";
|
|
20
|
+
import { logger } from "../utils/logger";
|
|
21
|
+
/**
 * Benchmarks trained models and decides whether they are good enough to deploy.
 *
 * All persistence goes through the adapter returned by getTrainingDataAdapter();
 * benchmark execution is delegated to BenchmarkRunner.runSingle().
 */
export class BenchmarkService {
    /** Deploy if the new model scores at least this fraction of the previous best. */
    DEPLOYMENT_THRESHOLD = 0.95;
    /** Preferred benchmark file (the 1-week benchmark), resolved against the working directory. */
    DEFAULT_BENCHMARK_PATH = path.resolve(process.cwd(), "benchmarks/benchmark-week-10080-60-10-5-8-12345.json");
    /** Root directory for per-model benchmark output. */
    RESULTS_DIR = path.resolve(process.cwd(), "benchmark-results/models");
    /**
     * Get benchmark path with fallback to the first available benchmark.
     *
     * Uses the default benchmark file when it is accessible. Otherwise scans
     * the "benchmarks" directory for `benchmark-*.json` files and returns the
     * alphabetically first one, so the fallback does not depend on the order
     * in which the directory listing happens to be returned.
     *
     * @returns Path to benchmark JSON file
     * @throws Error if no benchmark files are found (including when the
     *         benchmarks directory cannot be read; the underlying error is
     *         attached as `cause`)
     */
    async getBenchmarkPath() {
        try {
            await fs.access(this.DEFAULT_BENCHMARK_PATH);
            return this.DEFAULT_BENCHMARK_PATH;
        }
        catch {
            // Default benchmark is missing or inaccessible; fall through to the directory scan.
        }
        const noBenchmarksMessage = "No benchmark files found. Generate benchmark data before running evaluation.";
        const benchmarkDir = path.resolve(process.cwd(), "benchmarks");
        let entries;
        try {
            entries = await fs.readdir(benchmarkDir);
        }
        catch (error) {
            // An unreadable/missing directory means there is nothing to benchmark against.
            throw new Error(noBenchmarksMessage, { cause: error });
        }
        const [firstFile] = entries
            .filter((f) => f.startsWith("benchmark-") && f.endsWith(".json"))
            .sort();
        if (firstFile === undefined) {
            throw new Error(noBenchmarksMessage);
        }
        const fallbackPath = path.join(benchmarkDir, firstFile);
        logger.warn(`Default benchmark not found, using: ${fallbackPath}`, undefined, "BenchmarkService");
        return fallbackPath;
    }
    /**
     * Run benchmark on a trained model
     *
     * Looks the model up, runs BenchmarkRunner.runSingle() with the model
     * forced as the inference model, computes a composite score and stores
     * the results through the training data adapter.
     *
     * @param modelId - Unique identifier for the trained model
     * @param benchmarkPath - Optional path to benchmark file (uses getBenchmarkPath() if falsy)
     * @returns Benchmark results (score, pnl, accuracy, optimality, trade counts, duration, timestamp)
     * @throws Error if model not found or benchmark fails
     *
     * @example
     * ```typescript
     * const results = await benchmarkService.benchmarkModel('model-123');
     * console.log(`Score: ${results.benchmarkScore}`);
     * console.log(`Accuracy: ${results.accuracy}`);
     * ```
     */
    async benchmarkModel(modelId, benchmarkPath) {
        logger.info(`Benchmarking model: ${modelId}`, undefined, "BenchmarkService");
        const startTime = Date.now();
        // Resolve the model first so an unknown id fails before any output
        // directory is created or an agent runtime is acquired.
        const model = await getTrainingDataAdapter().getModelById(modelId);
        if (!model) {
            throw new Error(`Model not found: ${modelId}`);
        }
        // Validate and get model identifier for inference
        const modelIdentifier = this.getValidModelIdentifier(model);
        // Get benchmark file (with fallback logic)
        const bmPath = benchmarkPath || (await this.getBenchmarkPath());
        const agent = await this.getTestAgent();
        // One output directory per run, keyed by the run's start timestamp.
        const outputDir = path.join(this.RESULTS_DIR, modelId, startTime.toString());
        await fs.mkdir(outputDir, { recursive: true });
        const runtime = await getAgentRuntimeManager().getRuntime(agent.id);
        logger.info("Running benchmark...", {
            modelId,
            modelIdentifier,
            agent: agent.username,
        }, "BenchmarkService");
        const result = await BenchmarkRunner.runSingle({
            benchmarkPath: bmPath,
            agentRuntime: runtime,
            agentUserId: agent.id,
            saveTrajectory: true,
            outputDir,
            forceModel: modelIdentifier, // Force the runtime to use the model being benchmarked
        });
        const duration = Date.now() - startTime;
        const { totalPnl, optimalityScore, predictionMetrics, perpMetrics } = result.metrics;
        // Composite score: 0.4 * normalized_pnl + 0.3 * accuracy + 0.3 * optimality
        // (optimalityScore is divided by 100 to bring it onto the same 0-1 scale).
        const benchmarkScore = 0.4 * this.normalizePnl(totalPnl) +
            0.3 * predictionMetrics.accuracy +
            0.3 * (optimalityScore / 100);
        const benchmarkResults = {
            modelId,
            benchmarkScore,
            pnl: totalPnl,
            accuracy: predictionMetrics.accuracy,
            optimality: optimalityScore,
            perpTrades: perpMetrics.totalTrades,
            correctPredictions: predictionMetrics.correctPredictions,
            totalPositions: predictionMetrics.totalPositions,
            duration,
            timestamp: new Date(),
        };
        logger.info("Benchmark complete", {
            modelId,
            score: benchmarkScore.toFixed(3),
            pnl: totalPnl.toFixed(2),
            accuracy: `${(predictionMetrics.accuracy * 100).toFixed(1)}%`,
            optimality: `${optimalityScore.toFixed(1)}%`,
            duration: `${(duration / 1000).toFixed(1)}s`,
        }, "BenchmarkService");
        await this.storeBenchmarkResults(modelId, benchmarkResults);
        return benchmarkResults;
    }
    /**
     * Compare new model performance against previous best
     *
     * The new model should be deployed when its score is at least
     * `threshold` times the best score among the other benchmarked models,
     * or when there is no other benchmarked model.
     *
     * @param newModelId - Unique identifier for the new model to compare
     * @param threshold - Deployment threshold (default: 0.95, meaning 95% of best)
     * @returns Comparison result with deployment recommendation. `improvement`
     *          is the relative change in percent, or null when there is no
     *          previous model or the previous score is 0 (relative change undefined).
     * @throws Error if model not found or not benchmarked
     *
     * @example
     * ```typescript
     * const comparison = await benchmarkService.compareModels('model-123');
     * if (comparison.shouldDeploy) {
     *   console.log(`Deploying: ${comparison.reason}`);
     * }
     * ```
     */
    async compareModels(newModelId, threshold = this.DEPLOYMENT_THRESHOLD) {
        logger.info(`Comparing model: ${newModelId}`, undefined, "BenchmarkService");
        const adapter = getTrainingDataAdapter();
        const newModel = await adapter.getModelById(newModelId);
        if (!newModel) {
            throw new Error(`Model not found: ${newModelId}`);
        }
        // `== null` also rejects an absent (undefined) score, which would
        // otherwise crash later on `.toFixed`.
        if (newModel.benchmarkScore == null) {
            throw new Error(`Model has not been benchmarked: ${newModelId}`);
        }
        const newScore = newModel.benchmarkScore;
        // Get previous best model (excluding the new one)
        const previousBest = await adapter.getBestBenchmarkedModel(newModelId);
        // If no previous model, always deploy
        if (!previousBest) {
            logger.info("No previous model to compare - will deploy", { newScore }, "BenchmarkService");
            return {
                newModel: newModelId,
                previousModel: null,
                newScore,
                previousScore: null,
                improvement: null,
                shouldDeploy: true,
                reason: "First model - no comparison available",
            };
        }
        const previousScore = previousBest.benchmarkScore ?? 0;
        // A relative improvement over a zero baseline is undefined; report
        // null instead of Infinity/NaN.
        const improvement = previousScore === 0
            ? null
            : ((newScore - previousScore) / previousScore) * 100;
        const thresholdScore = previousScore * threshold;
        const shouldDeploy = newScore >= thresholdScore;
        const scoreText = newScore.toFixed(3);
        let reason;
        if (!shouldDeploy) {
            reason = `Performance too low (${scoreText} < ${thresholdScore.toFixed(3)}, need ${threshold * 100}% of best)`;
        }
        else if (newScore > previousScore) {
            reason = improvement === null
                ? `Improved over zero-score baseline (${scoreText} > ${previousScore.toFixed(3)})`
                : `Improved by ${improvement.toFixed(1)}% (${scoreText} > ${previousScore.toFixed(3)})`;
        }
        else {
            reason = `Within acceptable range (${scoreText} >= ${thresholdScore.toFixed(3)}, threshold: ${threshold * 100}%)`;
        }
        logger.info("Model comparison complete", {
            newModel: newModelId,
            newScore: scoreText,
            previousModel: previousBest.modelId,
            previousScore: previousScore.toFixed(3),
            improvement: improvement === null ? "n/a" : `${improvement.toFixed(1)}%`,
            shouldDeploy,
            reason,
        }, "BenchmarkService");
        return {
            newModel: newModelId,
            previousModel: previousBest.modelId,
            newScore,
            previousScore,
            improvement,
            shouldDeploy,
            reason,
        };
    }
    /**
     * Store benchmark results in database
     *
     * Writes the score and accuracy plus an `evalMetrics` object through the
     * adapter's updateModelBenchmarkResults().
     *
     * @param modelId - Unique identifier for the trained model
     * @param results - Benchmark results to store (as produced by benchmarkModel)
     * @throws Whatever the adapter update throws
     */
    async storeBenchmarkResults(modelId, results) {
        await getTrainingDataAdapter().updateModelBenchmarkResults(modelId, {
            benchmarkScore: results.benchmarkScore,
            accuracy: results.accuracy,
            evalMetrics: {
                pnl: results.pnl,
                accuracy: results.accuracy,
                optimality: results.optimality,
                perpTrades: results.perpTrades,
                correctPredictions: results.correctPredictions,
                totalPositions: results.totalPositions,
                duration: results.duration,
                benchmarkedAt: results.timestamp.toISOString(),
            },
        });
        logger.info("Stored benchmark results", { modelId, score: results.benchmarkScore }, "BenchmarkService");
    }
    /**
     * Determine if model should be deployed based on performance
     *
     * Convenience wrapper around compareModels() that returns only
     * the deployment decision boolean.
     *
     * @param modelId - Unique identifier for the model to evaluate
     * @param threshold - Deployment threshold (default: 0.95)
     * @returns True if model should be deployed, false otherwise
     * @throws Error if model not found or not benchmarked
     */
    async shouldDeploy(modelId, threshold = this.DEPLOYMENT_THRESHOLD) {
        const { shouldDeploy } = await this.compareModels(modelId, threshold);
        return shouldDeploy;
    }
    /**
     * Pick the identifier to use for inference with this model.
     *
     * Resolution order:
     * 1. `storagePath` (trimmed) when it contains "/" or ":" (e.g.
     *    "entity/project/model-name:version" or "org/model-name")
     * 2. `modelId` when it contains "/"
     * 3. `baseModel` as a last resort
     *
     * @param model - Model object with storagePath, modelId, and baseModel
     * @returns Model identifier string for inference
     */
    getValidModelIdentifier(model) {
        // Return the trimmed value: the validity check is done on the trimmed
        // string, so surrounding whitespace must not leak into the identifier.
        const storagePath = typeof model.storagePath === "string" ? model.storagePath.trim() : "";
        if (storagePath.length > 0) {
            if (storagePath.includes("/") || storagePath.includes(":")) {
                return storagePath;
            }
            logger.warn(`Invalid storagePath format: ${storagePath}, falling back to modelId`, { modelId: model.modelId }, "BenchmarkService");
        }
        if (typeof model.modelId === "string" && model.modelId.includes("/")) {
            return model.modelId;
        }
        // Last resort: use base model from training
        logger.warn(`No valid model identifier found, using baseModel`, { modelId: model.modelId, baseModel: model.baseModel }, "BenchmarkService");
        return model.baseModel;
    }
    /**
     * Get test agent for benchmarking
     *
     * Walks the preferred usernames in priority order and returns the first
     * agent that matches, so the choice does not depend on the order in which
     * the adapter returns agents. Falls back to the first agent returned.
     *
     * @returns Agent user record for the test agent
     * @throws Error if the adapter returns no agents
     */
    async getTestAgent() {
        const allAgents = await getTrainingDataAdapter().getAgentUsers();
        const preferredUsernames = [
            "trader-aggressive",
            "test-agent",
            "benchmark-agent",
        ];
        for (const username of preferredUsernames) {
            const preferred = allAgents.find((a) => a.username === username);
            if (preferred) {
                return preferred;
            }
        }
        // Fall back to any agent
        const [fallback] = allAgents;
        if (!fallback) {
            throw new Error("No test agent available for benchmarking");
        }
        return fallback;
    }
    /**
     * Normalize P&L to 0-1 scale
     *
     * Linearly maps [min, max] onto [0, 1] and clamps values outside the range.
     * A NaN input yields NaN.
     *
     * @param pnl - Profit and loss value
     * @param min - Value mapped to 0 (default: -5000)
     * @param max - Value mapped to 1 (default: 5000)
     * @returns Normalized value in [0, 1]
     * @throws RangeError if max is not greater than min
     */
    normalizePnl(pnl, min = -5000, max = 5000) {
        if (!(max > min)) {
            throw new RangeError("normalizePnl requires max > min");
        }
        const normalized = (pnl - min) / (max - min);
        return Math.max(0, Math.min(1, normalized)); // Clamp to [0, 1]
    }
    /**
     * Get benchmark summary for monitoring
     *
     * Reads up to 10 benchmarked models from the adapter. `topModels` keeps
     * the adapter's ordering; `recentModels` is sorted by `createdAt`, newest
     * first. `totalBenchmarked` is the number of models returned by that
     * query, so it never exceeds 10.
     *
     * @returns Object with totalBenchmarked, topModels (max 5) and recentModels (max 5)
     */
    async getBenchmarkSummary() {
        const models = await getTrainingDataAdapter().getBenchmarkedModels(10);
        const summary = models.map((m) => ({
            modelId: m.modelId,
            version: m.version,
            score: m.benchmarkScore,
            accuracy: m.accuracy,
            status: m.status,
            createdAt: m.createdAt,
        }));
        // Wrapping in `new Date` accepts both Date instances and timestamp strings.
        const createdAtMs = (entry) => new Date(entry.createdAt).getTime();
        const newestFirst = [...summary].sort((a, b) => createdAtMs(b) - createdAtMs(a));
        return {
            totalBenchmarked: models.length,
            topModels: summary.slice(0, 5),
            recentModels: newestFirst.slice(0, 5),
        };
    }
    /**
     * Run benchmark on multiple models for comparison
     *
     * Models are benchmarked one after another; the first failure rejects
     * the whole call.
     *
     * @param modelIds - Model identifiers to benchmark
     * @param benchmarkPath - Optional benchmark file shared by all runs
     * @returns Object mapping each modelId to its benchmark results
     */
    async benchmarkMultipleModels(modelIds, benchmarkPath) {
        const results = {};
        for (const modelId of modelIds) {
            results[modelId] = await this.benchmarkModel(modelId, benchmarkPath);
        }
        return results;
    }
}
|
|
369
|
+
// Export singleton instance
|
|
370
|
+
// Shared instance, constructed when this module is first evaluated. The class
// field initializers call process.cwd(), so the benchmark and results paths of
// this instance are fixed to the working directory at that moment.
export const benchmarkService = new BenchmarkService();
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Configuration Validator
|
|
3
|
+
*
|
|
4
|
+
* Validates RL pipeline configuration before execution.
|
|
5
|
+
*/
|
|
6
|
+
import { logger } from "../utils/logger";
|
|
7
|
+
/**
 * Validate training configuration
 *
 * Checks each hyperparameter against its allowed range. Fields that are
 * absent (undefined) are not checked. A field that is NaN is reported as an
 * error, because NaN would otherwise compare false against every bound and
 * slip through as "valid".
 *
 * @param config - Training config with snake_case numeric fields
 *                 (batch_size, learning_rate, kl_penalty, iterations_per_window,
 *                 warmup_steps, max_grad_norm, gamma, min_trajectories_per_batch)
 * @returns `{ valid, errors, warnings }`; `valid` is true when `errors` is empty
 */
export function validateTrainingConfig(config) {
    const errors = [];
    const warnings = [];
    const numericFields = [
        "batch_size",
        "learning_rate",
        "kl_penalty",
        "iterations_per_window",
        "warmup_steps",
        "max_grad_norm",
        "gamma",
        "min_trajectories_per_batch",
    ];
    for (const field of numericFields) {
        if (Number.isNaN(config[field])) {
            errors.push(`${field} must be a valid number`);
        }
    }
    // Validate batch size
    if (config.batch_size <= 0) {
        errors.push("batch_size must be greater than 0");
    }
    if (config.batch_size > 64) {
        warnings.push("batch_size > 64 may cause memory issues");
    }
    // Validate learning rate
    if (config.learning_rate <= 0) {
        errors.push("learning_rate must be greater than 0");
    }
    if (config.learning_rate > 1e-3) {
        warnings.push("learning_rate > 1e-3 may cause training instability");
    }
    if (config.learning_rate < 1e-8) {
        warnings.push("learning_rate < 1e-8 may be too small for effective learning");
    }
    // Validate KL penalty
    if (config.kl_penalty < 0) {
        errors.push("kl_penalty must be non-negative");
    }
    if (config.kl_penalty > 1.0) {
        warnings.push("kl_penalty > 1.0 may be too high");
    }
    // Validate iterations
    if (config.iterations_per_window <= 0) {
        errors.push("iterations_per_window must be greater than 0");
    }
    // Validate warmup steps
    if (config.warmup_steps < 0) {
        errors.push("warmup_steps must be non-negative");
    }
    // Validate max grad norm
    if (config.max_grad_norm <= 0) {
        errors.push("max_grad_norm must be greater than 0");
    }
    // Validate gamma
    if (config.gamma < 0 || config.gamma > 1) {
        errors.push("gamma must be between 0 and 1");
    }
    // Validate min trajectories
    if (config.min_trajectories_per_batch <= 0) {
        errors.push("min_trajectories_per_batch must be greater than 0");
    }
    return {
        valid: errors.length === 0,
        errors,
        warnings,
    };
}
|
|
63
|
+
/**
 * Validate benchmark configuration
 *
 * Checks the benchmark generation parameters. Fields that are absent
 * (undefined) are not checked. A field that is NaN is reported as an error,
 * because NaN compares false against every bound and would otherwise pass.
 *
 * @param config - Benchmark config with duration_minutes, tick_interval_seconds,
 *                 num_prediction_markets and num_perpetual_markets
 * @returns `{ valid, errors, warnings }`; `valid` is true when `errors` is empty
 */
export function validateBenchmarkConfig(config) {
    const errors = [];
    const warnings = [];
    const numericFields = [
        "duration_minutes",
        "tick_interval_seconds",
        "num_prediction_markets",
        "num_perpetual_markets",
    ];
    for (const field of numericFields) {
        if (Number.isNaN(config[field])) {
            errors.push(`${field} must be a valid number`);
        }
    }
    if (config.duration_minutes <= 0) {
        errors.push("duration_minutes must be greater than 0");
    }
    if (config.duration_minutes > 10080) {
        warnings.push("duration_minutes > 10080 (1 week) may take a long time to generate");
    }
    if (config.tick_interval_seconds <= 0) {
        errors.push("tick_interval_seconds must be greater than 0");
    }
    if (config.num_prediction_markets <= 0) {
        errors.push("num_prediction_markets must be greater than 0");
    }
    if (config.num_perpetual_markets <= 0) {
        errors.push("num_perpetual_markets must be greater than 0");
    }
    return {
        valid: errors.length === 0,
        errors,
        warnings,
    };
}
|
|
90
|
+
/**
 * Validate full pipeline config
 *
 * Aggregates the results of validateBenchmarkConfig() (when `config.benchmark`
 * is present) and validateTrainingConfig() (when `config.training` is present),
 * then checks the agent settings. A missing `agents` section is reported as a
 * validation error rather than crashing with a TypeError.
 *
 * @param config - Pipeline config with optional `benchmark` (camelCase fields)
 *                 and `training` sections, and an `agents` section
 * @returns `{ valid, errors, warnings }`; `valid` is true when `errors` is empty
 */
export function validatePipelineConfig(config) {
    const errors = [];
    const warnings = [];
    const collect = (result) => {
        errors.push(...result.errors);
        warnings.push(...result.warnings);
    };
    // Validate benchmark config (camelCase pipeline fields map to the
    // snake_case names the benchmark validator expects).
    if (config.benchmark) {
        const { durationMinutes, tickInterval, numPredictionMarkets, numPerpetualMarkets } = config.benchmark;
        collect(validateBenchmarkConfig({
            duration_minutes: durationMinutes,
            tick_interval_seconds: tickInterval,
            num_prediction_markets: numPredictionMarkets,
            num_perpetual_markets: numPerpetualMarkets,
        }));
    }
    // Validate training config
    if (config.training) {
        collect(validateTrainingConfig(config.training));
    }
    // Validate agent config
    const agents = config.agents;
    if (agents == null) {
        errors.push("agents configuration is required");
    }
    else {
        if (agents.test_agent_count <= 0) {
            errors.push("test_agent_count must be greater than 0");
        }
        if (agents.test_agent_count > 10) {
            warnings.push("test_agent_count > 10 may be slow");
        }
    }
    return {
        valid: errors.length === 0,
        errors,
        warnings,
    };
}
|
|
126
|
+
/**
 * Validate and log results
 *
 * Runs validatePipelineConfig() and reports the outcome: warnings and errors
 * are sent to the logger as a batch and also printed one per line on the
 * console.
 *
 * @param config - Pipeline configuration to validate
 * @returns true when there are no errors, false otherwise
 */
export function validateAndLog(config) {
    const { warnings, errors } = validatePipelineConfig(config);
    const hasWarnings = warnings.length > 0;
    const hasErrors = errors.length > 0;
    if (hasWarnings) {
        logger.warn("Configuration warnings", { warnings }, "ConfigValidator");
        for (const warning of warnings) {
            console.log(`  ⚠️  ${warning}`);
        }
    }
    if (!hasErrors) {
        logger.info("Configuration validation passed", undefined, "ConfigValidator");
        return true;
    }
    logger.error("Configuration errors", { errors }, "ConfigValidator");
    for (const error of errors) {
        console.error(`  ❌ ${error}`);
    }
    return false;
}
|
|
147
|
+
/**
 * Object grouping the four validation functions under one name.
 *
 * @deprecated Use standalone functions instead
 */
export const ConfigValidator = {
    validateTrainingConfig: validateTrainingConfig,
    validateBenchmarkConfig: validateBenchmarkConfig,
    validatePipelineConfig: validatePipelineConfig,
    validateAndLog: validateAndLog,
};
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Market Outcomes Tracker
|
|
3
|
+
*
|
|
4
|
+
* Tracks market outcomes per time window for context-rich RULER judging.
|
|
5
|
+
* This gives RULER the ground truth to evaluate agent decisions.
|
|
6
|
+
*/
|
|
7
|
+
import { getMarketDataAdapter } from "../adapter";
|
|
8
|
+
import { generateSnowflakeId, logger } from "../utils";
|
|
9
|
+
import { getPreviousWindowId } from "./window-utils";
|
|
10
|
+
/**
 * Records per-window market outcomes (stock price movements and resolved
 * prediction markets) through the market data adapter, and reads them back.
 */
export class MarketOutcomesTracker {
    /**
     * Track outcomes for a specific window
     *
     * The window starts at `new Date(windowId)` and lasts one hour. For each
     * ticker seen in the window's perpetual positions, one outcome row is
     * written using the entry price of the first position encountered as the
     * start price and the price of the last position encountered as the end
     * price. Tickers whose percentage change cannot be computed (zero or
     * non-numeric prices) are skipped with a warning instead of being stored
     * as "Infinity"/"NaN". One row is also written per resolved prediction
     * market. Does nothing when no market data adapter is available.
     *
     * @param windowId - Window identifier; must be parseable by `new Date()`
     */
    async trackWindowOutcomes(windowId) {
        logger.info(`Tracking market outcomes for window: ${windowId}`);
        const marketAdapter = getMarketDataAdapter();
        if (!marketAdapter) {
            logger.warn("Market data adapter not available, skipping outcome tracking");
            return;
        }
        const windowStart = new Date(windowId);
        const windowEnd = new Date(windowStart.getTime() + 60 * 60 * 1000);
        // Get stock price movements from perpetual positions
        const perpTrades = await marketAdapter.getPerpPositionsForWindow(windowStart, windowEnd);
        // Group by ticker: first position fixes the start price, every later
        // position overwrites the end price.
        const stockMovements = new Map();
        for (const trade of perpTrades) {
            if (!trade.ticker) {
                continue;
            }
            const endPrice = Number(trade.currentPrice ?? trade.exitPrice ?? trade.entryPrice);
            const movement = stockMovements.get(trade.ticker);
            if (movement) {
                movement.end = endPrice;
            }
            else {
                stockMovements.set(trade.ticker, { start: Number(trade.entryPrice), end: endPrice });
            }
        }
        // Save stock outcomes
        let savedStocks = 0;
        for (const [ticker, { start, end }] of stockMovements) {
            const changePercent = ((end - start) / start) * 100;
            if (!Number.isFinite(changePercent)) {
                // start === 0 or a non-numeric price: no meaningful ground truth to record.
                logger.warn(`Skipping stock outcome for ${ticker}: invalid price data`, { windowId, start, end });
                continue;
            }
            await marketAdapter.insertMarketOutcome({
                id: await generateSnowflakeId(),
                windowId,
                stockTicker: ticker,
                startPrice: String(start),
                endPrice: String(end),
                changePercent: String(changePercent),
                // NOTE(review): an unchanged price (0%) is labelled BEARISH — confirm this is intended.
                sentiment: changePercent > 0 ? "BULLISH" : "BEARISH",
            });
            savedStocks++;
        }
        // Get prediction market resolutions
        const resolvedMarkets = await marketAdapter.getResolvedMarketsForWindow(windowStart, windowEnd);
        // Save prediction outcomes
        for (const market of resolvedMarkets) {
            await marketAdapter.insertMarketOutcome({
                id: await generateSnowflakeId(),
                windowId,
                predictionMarketId: market.id,
                question: market.question,
                outcome: market.outcome ? "YES" : "NO",
                finalProbability: String(market.finalProbability ?? 0.5),
            });
        }
        logger.info(`Tracked outcomes for ${windowId}`, {
            stocks: savedStocks,
            predictions: resolvedMarkets.length,
        });
    }
    /**
     * Sync outcomes for recent windows
     *
     * Iterates window offsets 0..hours-1 via getPreviousWindowId() and tracks
     * every window the adapter reports as having no outcomes yet.
     *
     * @param hours - Number of windows to look back (default: 24)
     * @returns Number of windows that were tracked (0 when no adapter is available)
     */
    async syncRecentWindows(hours = 24) {
        logger.info(`Syncing market outcomes for last ${hours} hours`);
        const marketAdapter = getMarketDataAdapter();
        if (!marketAdapter) {
            logger.warn("Market data adapter not available");
            return 0;
        }
        let synced = 0;
        for (let offset = 0; offset < hours; offset++) {
            const windowId = getPreviousWindowId(offset);
            // Skip windows that already have outcomes
            if (await marketAdapter.hasOutcomesForWindow(windowId)) {
                continue;
            }
            await this.trackWindowOutcomes(windowId);
            synced++;
        }
        logger.info(`Synced ${synced} windows`);
        return synced;
    }
    /**
     * Get outcomes for a window
     *
     * @param windowId - Window identifier
     * @returns `{ windowId, stocks, predictions }` with numeric fields parsed
     *          via Number(), or null when no adapter is available or the
     *          window has no stored outcomes
     */
    async getWindowOutcomes(windowId) {
        const marketAdapter = getMarketDataAdapter();
        if (!marketAdapter) {
            return null;
        }
        const outcomes = await marketAdapter.getMarketOutcomesByWindow(windowId);
        if (outcomes.length === 0) {
            return null;
        }
        // Rows with a stockTicker are stock outcomes.
        const stocks = outcomes
            .filter((row) => row.stockTicker)
            .map((row) => ({
                ticker: row.stockTicker,
                startPrice: Number(row.startPrice),
                endPrice: Number(row.endPrice),
                changePercent: Number(row.changePercent),
                sentiment: row.sentiment || undefined,
                news: row.newsEvents,
            }));
        // Rows with a predictionMarketId are prediction outcomes.
        const predictions = outcomes
            .filter((row) => row.predictionMarketId)
            .map((row) => ({
                marketId: row.predictionMarketId,
                question: row.question || "",
                outcome: row.outcome || "UNRESOLVED",
                finalProbability: Number(row.finalProbability || 0),
            }));
        return {
            windowId,
            stocks,
            predictions,
        };
    }
}
|