@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/.turbo/turbo-lint.log +2 -0
  2. package/.turbo/turbo-typecheck.log +1 -0
  3. package/dist/.tsbuildinfo +1 -0
  4. package/dist/adapter.js +59 -0
  5. package/dist/archetypes/ArchetypeConfigService.js +510 -0
  6. package/dist/archetypes/derive-archetype.js +196 -0
  7. package/dist/archetypes/index.js +7 -0
  8. package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
  9. package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
  10. package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
  11. package/dist/benchmark/BenchmarkDataViewer.js +197 -0
  12. package/dist/benchmark/BenchmarkHistoryService.js +135 -0
  13. package/dist/benchmark/BenchmarkRunner.js +483 -0
  14. package/dist/benchmark/BenchmarkValidator.js +158 -0
  15. package/dist/benchmark/FastEvalRunner.js +133 -0
  16. package/dist/benchmark/MetricsValidator.js +104 -0
  17. package/dist/benchmark/MetricsVisualizer.js +775 -0
  18. package/dist/benchmark/ModelBenchmarkService.js +433 -0
  19. package/dist/benchmark/ModelRegistry.js +122 -0
  20. package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
  21. package/dist/benchmark/SimulationA2AInterface.js +683 -0
  22. package/dist/benchmark/SimulationEngine.js +522 -0
  23. package/dist/benchmark/TaskRunner.js +60 -0
  24. package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
  25. package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
  26. package/dist/benchmark/index.js +23 -0
  27. package/dist/benchmark/parseSimulationMetrics.js +86 -0
  28. package/dist/benchmark/simulation-types.js +1 -0
  29. package/dist/dependencies.js +197 -0
  30. package/dist/generation/TrajectoryGenerator.js +244 -0
  31. package/dist/generation/index.js +6 -0
  32. package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
  33. package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
  34. package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
  35. package/dist/huggingface/index.js +9 -0
  36. package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
  37. package/dist/index.js +41 -0
  38. package/dist/init-training.js +43 -0
  39. package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
  40. package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
  41. package/dist/metrics/index.js +7 -0
  42. package/dist/metrics/types.js +21 -0
  43. package/dist/rubrics/__tests__/index.test.js +150 -0
  44. package/dist/rubrics/ass-kisser.js +83 -0
  45. package/dist/rubrics/degen.js +78 -0
  46. package/dist/rubrics/goody-twoshoes.js +82 -0
  47. package/dist/rubrics/index.js +184 -0
  48. package/dist/rubrics/information-trader.js +82 -0
  49. package/dist/rubrics/infosec.js +99 -0
  50. package/dist/rubrics/liar.js +102 -0
  51. package/dist/rubrics/perps-trader.js +85 -0
  52. package/dist/rubrics/researcher.js +79 -0
  53. package/dist/rubrics/scammer.js +80 -0
  54. package/dist/rubrics/social-butterfly.js +71 -0
  55. package/dist/rubrics/super-predictor.js +95 -0
  56. package/dist/rubrics/trader.js +65 -0
  57. package/dist/scoring/ArchetypeScoringService.js +301 -0
  58. package/dist/scoring/JudgePromptBuilder.js +401 -0
  59. package/dist/scoring/LLMJudgeCache.js +263 -0
  60. package/dist/scoring/index.js +8 -0
  61. package/dist/training/AutomationPipeline.js +714 -0
  62. package/dist/training/BenchmarkService.js +370 -0
  63. package/dist/training/ConfigValidator.js +153 -0
  64. package/dist/training/MarketOutcomesTracker.js +142 -0
  65. package/dist/training/ModelDeployer.js +128 -0
  66. package/dist/training/ModelFetcher.js +48 -0
  67. package/dist/training/ModelSelectionService.js +248 -0
  68. package/dist/training/ModelUsageVerifier.js +106 -0
  69. package/dist/training/MultiModelOrchestrator.js +349 -0
  70. package/dist/training/RLModelConfig.js +295 -0
  71. package/dist/training/RewardBackpropagationService.js +117 -0
  72. package/dist/training/RulerScoringService.js +450 -0
  73. package/dist/training/TrainingMonitor.js +108 -0
  74. package/dist/training/TrajectoryRecorder.js +281 -0
  75. package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
  76. package/dist/training/index.js +30 -0
  77. package/dist/training/logRLConfig.js +29 -0
  78. package/dist/training/pipeline.js +80 -0
  79. package/dist/training/storage/ModelStorageService.js +190 -0
  80. package/dist/training/storage/TrainingDataArchiver.js +136 -0
  81. package/dist/training/storage/index.js +7 -0
  82. package/dist/training/types.js +6 -0
  83. package/dist/training/window-utils.js +100 -0
  84. package/dist/utils/index.js +73 -0
  85. package/dist/utils/logger.js +55 -0
  86. package/dist/utils/snowflake.js +15 -0
  87. package/dist/utils/synthetic-detector.js +67 -0
  88. package/package.json +2 -2
  89. package/research-output/training-runs/training-run-1773742857616.json +38 -0
  90. package/research-output/training-runs/training-run-1773742946977.json +38 -0
  91. package/research-output/training-runs/training-run-1773743278891.json +38 -0
  92. package/research-output/training-runs/training-run-1773743409754.json +38 -0
  93. package/research-output/training-runs/training-run-1773743651086.json +38 -0
  94. package/research-output/training-runs/training-run-1773743782883.json +38 -0
@@ -0,0 +1,370 @@
1
+ /**
2
+ * Training Benchmark Service
3
+ *
4
+ * Handles model benchmarking during the training pipeline.
5
+ *
6
+ * **Purpose:** Evaluate models as part of continuous training
7
+ * **Used by:** AutomationPipeline, continuous training scripts
8
+ * **Storage:** trainedModel.evalMetrics (JSON field)
9
+ * **Focus:** Training pipeline integration, deployment decisions
10
+ *
11
+ * **Note:** For HuggingFace upload benchmarking, see ModelBenchmarkService
12
+ *
13
+ * @see ModelBenchmarkService - For HuggingFace upload evaluation
14
+ */
15
+ import fs from "node:fs/promises";
16
+ import path from "node:path";
17
+ import { getTrainingDataAdapter } from "../adapter";
18
+ import { BenchmarkRunner } from "../benchmark/BenchmarkRunner";
19
+ import { getAgentRuntimeManager } from "../dependencies";
20
+ import { logger } from "../utils/logger";
21
/**
 * Benchmarks trained models and turns the resulting scores into deployment
 * decisions.
 *
 * Benchmark runs are delegated to BenchmarkRunner; model lookups and result
 * persistence go through the training data adapter.
 */
export class BenchmarkService {
    /** Deploy when the new model scores at least this fraction of the best score. */
    DEPLOYMENT_THRESHOLD = 0.95;
    /** Preferred benchmark fixture (the 1-week benchmark), resolved from the working directory. */
    DEFAULT_BENCHMARK_PATH = path.resolve(process.cwd(), "benchmarks/benchmark-week-10080-60-10-5-8-12345.json");
    /** Root directory under which per-model benchmark output is written. */
    RESULTS_DIR = path.resolve(process.cwd(), "benchmark-results/models");

    /**
     * Resolve the benchmark file to use.
     *
     * Uses the default benchmark when it is accessible; otherwise falls back to
     * the alphabetically first `benchmark-*.json` file in `./benchmarks`, so the
     * choice does not depend on directory enumeration order.
     *
     * @returns Path to benchmark JSON file
     * @throws Error if no benchmark files are found (including when the
     *   benchmarks directory cannot be read; the underlying error is attached
     *   as `cause`)
     */
    async getBenchmarkPath() {
        try {
            await fs.access(this.DEFAULT_BENCHMARK_PATH);
            return this.DEFAULT_BENCHMARK_PATH;
        }
        catch {
            // Default benchmark is not accessible; look for any other benchmark below.
        }
        const notFoundMessage = "No benchmark files found. Generate benchmark data before running evaluation.";
        const benchmarkDir = path.resolve(process.cwd(), "benchmarks");
        let entries;
        try {
            entries = await fs.readdir(benchmarkDir);
        }
        catch (err) {
            // A missing/unreadable directory means the same thing to callers as an empty one.
            throw new Error(notFoundMessage, { cause: err });
        }
        const [firstFile] = entries
            .filter((name) => name.startsWith("benchmark-") && name.endsWith(".json"))
            .sort();
        if (firstFile === undefined) {
            throw new Error(notFoundMessage);
        }
        const fallbackPath = path.join(benchmarkDir, firstFile);
        logger.warn(`Default benchmark not found, using: ${fallbackPath}`, undefined, "BenchmarkService");
        return fallbackPath;
    }

    /**
     * Run a benchmark on a trained model and persist the results.
     *
     * The model is looked up before any agent runtime is resolved or output
     * directory is created, so an unknown model id fails fast without leaving
     * an empty results directory behind.
     *
     * Composite score: 0.4 * normalized P&L + 0.3 * accuracy + 0.3 * optimality / 100.
     *
     * @param modelId - Unique identifier for the trained model
     * @param benchmarkPath - Optional path to benchmark file (resolved via getBenchmarkPath() if falsy)
     * @returns Benchmark results (score, P&L, accuracy, optimality, trade counts, duration, timestamp)
     * @throws Error if the model is not found, no benchmark/agent is available, or the run fails
     */
    async benchmarkModel(modelId, benchmarkPath) {
        logger.info(`Benchmarking model: ${modelId}`, undefined, "BenchmarkService");
        const startTime = Date.now();
        // Validate the model first: everything below is wasted work if it does not exist.
        const model = await getTrainingDataAdapter().getModelById(modelId);
        if (!model) {
            throw new Error(`Model not found: ${modelId}`);
        }
        const modelIdentifier = this.getValidModelIdentifier(model);
        const bmPath = benchmarkPath || (await this.getBenchmarkPath());
        const agent = await this.getTestAgent();
        const runtime = await getAgentRuntimeManager().getRuntime(agent.id);
        // One output directory per run, keyed by model and start timestamp.
        const outputDir = path.join(this.RESULTS_DIR, modelId, Date.now().toString());
        await fs.mkdir(outputDir, { recursive: true });
        logger.info("Running benchmark...", {
            modelId,
            modelIdentifier,
            agent: agent.username,
        }, "BenchmarkService");
        const result = await BenchmarkRunner.runSingle({
            benchmarkPath: bmPath,
            agentRuntime: runtime,
            agentUserId: agent.id,
            saveTrajectory: true,
            outputDir,
            forceModel: modelIdentifier, // Make the runtime use the model under test
        });
        const duration = Date.now() - startTime;
        const { totalPnl, optimalityScore, predictionMetrics, perpMetrics } = result.metrics;
        const benchmarkScore = 0.4 * this.normalizePnl(totalPnl) +
            0.3 * predictionMetrics.accuracy +
            0.3 * (optimalityScore / 100);
        const benchmarkResults = {
            modelId,
            benchmarkScore,
            pnl: totalPnl,
            accuracy: predictionMetrics.accuracy,
            optimality: optimalityScore,
            perpTrades: perpMetrics.totalTrades,
            correctPredictions: predictionMetrics.correctPredictions,
            totalPositions: predictionMetrics.totalPositions,
            duration,
            timestamp: new Date(),
        };
        logger.info("Benchmark complete", {
            modelId,
            score: benchmarkScore.toFixed(3),
            pnl: totalPnl.toFixed(2),
            accuracy: `${(predictionMetrics.accuracy * 100).toFixed(1)}%`,
            optimality: `${optimalityScore.toFixed(1)}%`,
            duration: `${(duration / 1000).toFixed(1)}s`,
        }, "BenchmarkService");
        await this.storeBenchmarkResults(modelId, benchmarkResults);
        return benchmarkResults;
    }

    /**
     * Compare a new model's stored benchmark score against the previous best.
     *
     * The model is recommended for deployment when its score is at least
     * `threshold` times the previous best score, or when there is no previous
     * benchmarked model at all.
     *
     * @param newModelId - Unique identifier for the new model to compare
     * @param threshold - Deployment threshold (default: DEPLOYMENT_THRESHOLD, i.e. 95% of best)
     * @returns Comparison result. `improvement` is the relative change in percent,
     *   or null when it is undefined (no previous model, or previous score of 0).
     * @throws Error if model not found or not benchmarked
     */
    async compareModels(newModelId, threshold = this.DEPLOYMENT_THRESHOLD) {
        logger.info(`Comparing model: ${newModelId}`, undefined, "BenchmarkService");
        const adapter = getTrainingDataAdapter();
        const newModel = await adapter.getModelById(newModelId);
        if (!newModel) {
            throw new Error(`Model not found: ${newModelId}`);
        }
        // `== null` also covers a score that was never set (undefined).
        if (newModel.benchmarkScore == null) {
            throw new Error(`Model has not been benchmarked: ${newModelId}`);
        }
        const newScore = newModel.benchmarkScore;
        // Previous best, excluding the model being evaluated.
        const previousBest = await adapter.getBestBenchmarkedModel(newModelId);
        if (!previousBest) {
            logger.info("No previous model to compare - will deploy", { newScore }, "BenchmarkService");
            return {
                newModel: newModelId,
                previousModel: null,
                newScore,
                previousScore: null,
                improvement: null,
                shouldDeploy: true,
                reason: "First model - no comparison available",
            };
        }
        const previousScore = previousBest.benchmarkScore ?? 0;
        // A relative improvement over a zero baseline is undefined; report null
        // rather than Infinity/NaN.
        const improvement = previousScore !== 0
            ? ((newScore - previousScore) / previousScore) * 100
            : null;
        const thresholdScore = previousScore * threshold;
        const shouldDeploy = newScore >= thresholdScore;
        let reason;
        if (!shouldDeploy) {
            reason = `Performance too low (${newScore.toFixed(3)} < ${thresholdScore.toFixed(3)}, need ${threshold * 100}% of best)`;
        }
        else if (newScore > previousScore) {
            reason = improvement === null
                ? `Improved (${newScore.toFixed(3)} > ${previousScore.toFixed(3)})`
                : `Improved by ${improvement.toFixed(1)}% (${newScore.toFixed(3)} > ${previousScore.toFixed(3)})`;
        }
        else {
            reason = `Within acceptable range (${newScore.toFixed(3)} >= ${thresholdScore.toFixed(3)}, threshold: ${threshold * 100}%)`;
        }
        logger.info("Model comparison complete", {
            newModel: newModelId,
            newScore: newScore.toFixed(3),
            previousModel: previousBest.modelId,
            previousScore: previousScore.toFixed(3),
            improvement: improvement === null ? "n/a" : `${improvement.toFixed(1)}%`,
            shouldDeploy,
            reason,
        }, "BenchmarkService");
        return {
            newModel: newModelId,
            previousModel: previousBest.modelId,
            newScore,
            previousScore,
            improvement,
            shouldDeploy,
            reason,
        };
    }

    /**
     * Persist benchmark results through the training data adapter.
     *
     * @param modelId - Unique identifier for the trained model
     * @param results - Benchmark results to store (as returned by benchmarkModel)
     * @throws Whatever the adapter's update call throws
     */
    async storeBenchmarkResults(modelId, results) {
        const { benchmarkScore, accuracy, pnl, optimality, perpTrades, correctPredictions, totalPositions, duration, timestamp, } = results;
        await getTrainingDataAdapter().updateModelBenchmarkResults(modelId, {
            benchmarkScore,
            accuracy,
            evalMetrics: {
                pnl,
                accuracy,
                optimality,
                perpTrades,
                correctPredictions,
                totalPositions,
                duration,
                benchmarkedAt: timestamp.toISOString(),
            },
        });
        logger.info("Stored benchmark results", { modelId, score: benchmarkScore }, "BenchmarkService");
    }

    /**
     * Convenience wrapper around compareModels() that returns only the
     * deployment decision.
     *
     * @param modelId - Unique identifier for the model to evaluate
     * @param threshold - Deployment threshold (default: DEPLOYMENT_THRESHOLD)
     * @returns True if model should be deployed, false otherwise
     * @throws Error if model not found or not benchmarked
     */
    async shouldDeploy(modelId, threshold = this.DEPLOYMENT_THRESHOLD) {
        const { shouldDeploy } = await this.compareModels(modelId, threshold);
        return shouldDeploy;
    }

    /**
     * Pick the identifier to hand to the inference runtime for a model.
     *
     * Preference order:
     * 1. `storagePath`, when it is non-blank and contains "/" or ":"
     *    (e.g. "entity/project/model-name:version" or "org/model-name")
     * 2. `modelId`, when it contains "/"
     * 3. `baseModel` as the last resort
     *
     * @param model - Model object with storagePath, modelId, and baseModel
     * @returns Model identifier string for inference
     */
    getValidModelIdentifier(model) {
        const { storagePath, modelId, baseModel } = model;
        if (storagePath && storagePath.trim().length > 0) {
            if (storagePath.includes("/") || storagePath.includes(":")) {
                return storagePath;
            }
            logger.warn(`Invalid storagePath format: ${storagePath}, falling back to modelId`, { modelId }, "BenchmarkService");
        }
        // Guard the type so a record without a string modelId falls through to
        // baseModel instead of throwing.
        if (typeof modelId === "string" && modelId.includes("/")) {
            return modelId;
        }
        logger.warn(`No valid model identifier found, using baseModel`, { modelId, baseModel }, "BenchmarkService");
        return baseModel;
    }

    /**
     * Find an agent to run benchmarks with.
     *
     * Returns the first agent whose username is one of the preferred test
     * usernames, otherwise the first agent returned by the adapter.
     *
     * @returns Agent record for the test agent
     * @throws Error if the adapter returns no agents
     */
    async getTestAgent() {
        const allAgents = await getTrainingDataAdapter().getAgentUsers();
        const preferredUsernames = [
            "trader-aggressive",
            "test-agent",
            "benchmark-agent",
        ];
        const agent = allAgents.find((a) => a.username && preferredUsernames.includes(a.username)) ??
            allAgents[0];
        if (!agent) {
            throw new Error("No test agent available for benchmarking");
        }
        return agent;
    }

    /**
     * Normalize P&L to a 0-1 scale, assuming a typical range of -5000 to +5000.
     * Values outside that range are clamped.
     *
     * @param pnl - Raw profit and loss
     * @returns Value in [0, 1]
     */
    normalizePnl(pnl) {
        const min = -5000;
        const max = 5000;
        const normalized = (pnl - min) / (max - min);
        return Math.max(0, Math.min(1, normalized));
    }

    /**
     * Summarize up to 10 benchmarked models for monitoring.
     *
     * @returns Total count, the first five models in adapter order (`topModels`),
     *   and the five most recently created (`recentModels`)
     */
    async getBenchmarkSummary() {
        const models = await getTrainingDataAdapter().getBenchmarkedModels(10);
        const summary = models.map(({ modelId, version, benchmarkScore, accuracy, status, createdAt }) => ({
            modelId,
            version,
            score: benchmarkScore,
            accuracy,
            status,
            createdAt,
        }));
        // Sort a copy so the adapter-ordered summary is never reordered in place.
        const byNewest = [...summary].sort((a, b) => b.createdAt.getTime() - a.createdAt.getTime());
        return {
            totalBenchmarked: models.length,
            topModels: summary.slice(0, 5),
            recentModels: byNewest.slice(0, 5),
        };
    }

    /**
     * Benchmark several models one after another.
     *
     * Runs are sequential on purpose: each run resolves an agent runtime and
     * forces a specific model on it.
     *
     * @param modelIds - Model identifiers to benchmark
     * @param benchmarkPath - Optional benchmark file shared by all runs
     * @returns Object mapping each model id to its benchmark results
     */
    async benchmarkMultipleModels(modelIds, benchmarkPath) {
        const results = {};
        for (const modelId of modelIds) {
            results[modelId] = await this.benchmarkModel(modelId, benchmarkPath);
        }
        return results;
    }
}
// Shared singleton instance for callers that do not need their own service.
export const benchmarkService = new BenchmarkService();
@@ -0,0 +1,153 @@
1
+ /**
2
+ * Configuration Validator
3
+ *
4
+ * Validates RL pipeline configuration before execution.
5
+ */
6
+ import { logger } from "../utils/logger";
7
/**
 * Validate training hyperparameters.
 *
 * Each field is checked against hard limits (reported in `errors`) and soft
 * limits (reported in `warnings`). A field whose value is NaN is rejected
 * outright, because NaN fails every comparison and would otherwise be
 * reported as valid. Fields that are absent are not flagged.
 *
 * @param config - Training config with snake_case numeric fields
 *   (batch_size, learning_rate, kl_penalty, iterations_per_window,
 *   warmup_steps, max_grad_norm, gamma, min_trajectories_per_batch)
 * @returns `{ valid, errors, warnings }`; `valid` is true when there are no errors
 */
export function validateTrainingConfig(config) {
    // One entry per field: [predicate, message] pairs, evaluated in order.
    const fieldRules = [
        {
            field: "batch_size",
            errorRules: [[(v) => v <= 0, "batch_size must be greater than 0"]],
            warningRules: [[(v) => v > 64, "batch_size > 64 may cause memory issues"]],
        },
        {
            field: "learning_rate",
            errorRules: [[(v) => v <= 0, "learning_rate must be greater than 0"]],
            warningRules: [
                [(v) => v > 1e-3, "learning_rate > 1e-3 may cause training instability"],
                [(v) => v < 1e-8, "learning_rate < 1e-8 may be too small for effective learning"],
            ],
        },
        {
            field: "kl_penalty",
            errorRules: [[(v) => v < 0, "kl_penalty must be non-negative"]],
            warningRules: [[(v) => v > 1.0, "kl_penalty > 1.0 may be too high"]],
        },
        {
            field: "iterations_per_window",
            errorRules: [[(v) => v <= 0, "iterations_per_window must be greater than 0"]],
            warningRules: [],
        },
        {
            field: "warmup_steps",
            errorRules: [[(v) => v < 0, "warmup_steps must be non-negative"]],
            warningRules: [],
        },
        {
            field: "max_grad_norm",
            errorRules: [[(v) => v <= 0, "max_grad_norm must be greater than 0"]],
            warningRules: [],
        },
        {
            field: "gamma",
            errorRules: [[(v) => v < 0 || v > 1, "gamma must be between 0 and 1"]],
            warningRules: [],
        },
        {
            field: "min_trajectories_per_batch",
            errorRules: [[(v) => v <= 0, "min_trajectories_per_batch must be greater than 0"]],
            warningRules: [],
        },
    ];
    const errors = [];
    const warnings = [];
    for (const { field, errorRules, warningRules } of fieldRules) {
        const value = config[field];
        if (Number.isNaN(value)) {
            // Range checks are meaningless for NaN; report it once and move on.
            errors.push(`${field} must be a valid number`);
            continue;
        }
        for (const [violates, message] of errorRules) {
            if (violates(value)) {
                errors.push(message);
            }
        }
        for (const [violates, message] of warningRules) {
            if (violates(value)) {
                warnings.push(message);
            }
        }
    }
    return {
        valid: errors.length === 0,
        errors,
        warnings,
    };
}
63
/**
 * Validate benchmark generation settings.
 *
 * Every field must be greater than 0. A duration above 10080 minutes (1 week)
 * only produces a warning. NaN values are rejected explicitly, because NaN
 * fails every comparison and would otherwise be reported as valid. Fields
 * that are absent are not flagged.
 *
 * @param config - Benchmark config with duration_minutes,
 *   tick_interval_seconds, num_prediction_markets and num_perpetual_markets
 * @returns `{ valid, errors, warnings }`; `valid` is true when there are no errors
 */
export function validateBenchmarkConfig(config) {
    const positiveFields = [
        "duration_minutes",
        "tick_interval_seconds",
        "num_prediction_markets",
        "num_perpetual_markets",
    ];
    const errors = [];
    const warnings = [];
    for (const field of positiveFields) {
        const value = config[field];
        if (Number.isNaN(value)) {
            errors.push(`${field} must be a valid number`);
        }
        else if (value <= 0) {
            errors.push(`${field} must be greater than 0`);
        }
    }
    if (config.duration_minutes > 10080) {
        warnings.push("duration_minutes > 10080 (1 week) may take a long time to generate");
    }
    return {
        valid: errors.length === 0,
        errors,
        warnings,
    };
}
90
/**
 * Validate a full pipeline config.
 *
 * The optional `benchmark` and `training` sections are validated when present
 * and their errors/warnings are merged into the result. The `agents` section
 * is required: when it is missing, a validation error is reported instead of
 * throwing a TypeError, so callers always receive a result object.
 *
 * @param config - Pipeline config with optional `benchmark` (camelCase
 *   fields), optional `training`, and `agents.test_agent_count`
 * @returns `{ valid, errors, warnings }`; `valid` is true when there are no errors
 */
export function validatePipelineConfig(config) {
    const errors = [];
    const warnings = [];
    const merge = (result) => {
        errors.push(...result.errors);
        warnings.push(...result.warnings);
    };
    if (config.benchmark) {
        const { durationMinutes, tickInterval, numPredictionMarkets, numPerpetualMarkets } = config.benchmark;
        // The benchmark validator expects snake_case field names.
        merge(validateBenchmarkConfig({
            duration_minutes: durationMinutes,
            tick_interval_seconds: tickInterval,
            num_prediction_markets: numPredictionMarkets,
            num_perpetual_markets: numPerpetualMarkets,
        }));
    }
    if (config.training) {
        merge(validateTrainingConfig(config.training));
    }
    if (!config.agents) {
        errors.push("agents configuration is required");
    }
    else {
        const testAgentCount = config.agents.test_agent_count;
        if (testAgentCount <= 0) {
            errors.push("test_agent_count must be greater than 0");
        }
        if (testAgentCount > 10) {
            warnings.push("test_agent_count > 10 may be slow");
        }
    }
    return {
        valid: errors.length === 0,
        errors,
        warnings,
    };
}
126
/**
 * Validate a pipeline config and report the outcome.
 *
 * Warnings and errors are sent to the structured logger and also echoed
 * line by line to the console.
 *
 * @param config - Pipeline config, passed through to validatePipelineConfig
 * @returns false when validation produced at least one error, true otherwise
 */
export function validateAndLog(config) {
    const { warnings, errors } = validatePipelineConfig(config);
    if (warnings.length > 0) {
        logger.warn("Configuration warnings", { warnings }, "ConfigValidator");
        for (const warning of warnings) {
            console.log(` ⚠️ ${warning}`);
        }
    }
    if (errors.length === 0) {
        logger.info("Configuration validation passed", undefined, "ConfigValidator");
        return true;
    }
    logger.error("Configuration errors", { errors }, "ConfigValidator");
    for (const error of errors) {
        console.error(` ❌ ${error}`);
    }
    return false;
}
147
+ /** Object grouping the four validator functions under one name. @deprecated Use the standalone exported functions instead. */
148
+ export const ConfigValidator = {
149
+ validateTrainingConfig,
150
+ validateBenchmarkConfig,
151
+ validatePipelineConfig,
152
+ validateAndLog,
153
+ };
@@ -0,0 +1,142 @@
1
+ /**
2
+ * Market Outcomes Tracker
3
+ *
4
+ * Tracks market outcomes per time window for context-rich RULER judging.
5
+ * This gives RULER the ground truth to evaluate agent decisions.
6
+ */
7
+ import { getMarketDataAdapter } from "../adapter";
8
+ import { generateSnowflakeId, logger } from "../utils";
9
+ import { getPreviousWindowId } from "./window-utils";
10
/**
 * Records and retrieves per-window market outcomes (stock price movements and
 * prediction market resolutions) through the market data adapter.
 */
export class MarketOutcomesTracker {
    /**
     * Compute and store outcomes for one window.
     *
     * The window is treated as the hour starting at `new Date(windowId)`.
     * For each ticker, the start price is the entry price of the first
     * position seen and the end price is taken from the last position seen.
     * Tickers whose percentage change is not a finite number (zero or
     * non-numeric prices) are skipped with a warning instead of being stored
     * as "NaN"/"Infinity".
     *
     * Does nothing (apart from a warning) when no market adapter is available.
     *
     * @param windowId - Window identifier; must be parseable by `Date`
     */
    async trackWindowOutcomes(windowId) {
        logger.info(`Tracking market outcomes for window: ${windowId}`);
        const marketAdapter = getMarketDataAdapter();
        if (!marketAdapter) {
            logger.warn("Market data adapter not available, skipping outcome tracking");
            return;
        }
        const windowStart = new Date(windowId);
        const windowEnd = new Date(windowStart.getTime() + 60 * 60 * 1000);
        // Stock price movements are derived from perpetual positions in the window.
        const perpTrades = await marketAdapter.getPerpPositionsForWindow(windowStart, windowEnd);
        const stockMovements = new Map();
        for (const trade of perpTrades) {
            if (!trade.ticker) {
                continue;
            }
            const endPrice = Number(trade.currentPrice ?? trade.exitPrice ?? trade.entryPrice);
            const movement = stockMovements.get(trade.ticker);
            if (movement) {
                // Keep the first start price; the latest position defines the end price.
                movement.end = endPrice;
                movement.count++;
            }
            else {
                stockMovements.set(trade.ticker, {
                    start: Number(trade.entryPrice),
                    end: endPrice,
                    count: 1,
                });
            }
        }
        let savedStocks = 0;
        for (const [ticker, { start, end }] of stockMovements) {
            const changePercent = ((end - start) / start) * 100;
            if (!Number.isFinite(changePercent)) {
                logger.warn(`Skipping stock outcome for ${ticker}: invalid price data`, { start, end });
                continue;
            }
            await marketAdapter.insertMarketOutcome({
                id: await generateSnowflakeId(),
                windowId,
                stockTicker: ticker,
                startPrice: String(start),
                endPrice: String(end),
                changePercent: String(changePercent),
                // NOTE(review): an unchanged price is labelled BEARISH here; confirm that is intended.
                sentiment: changePercent > 0 ? "BULLISH" : "BEARISH",
            });
            savedStocks++;
        }
        const resolvedMarkets = await marketAdapter.getResolvedMarketsForWindow(windowStart, windowEnd);
        for (const market of resolvedMarkets) {
            await marketAdapter.insertMarketOutcome({
                id: await generateSnowflakeId(),
                windowId,
                predictionMarketId: market.id,
                question: market.question,
                outcome: market.outcome ? "YES" : "NO",
                finalProbability: String(market.finalProbability ?? 0.5),
            });
        }
        logger.info(`Tracked outcomes for ${windowId}`, {
            stocks: savedStocks,
            predictions: resolvedMarkets.length,
        });
    }

    /**
     * Track outcomes for every recent window that has none stored yet.
     *
     * @param hours - Number of windows to look back over (default 24)
     * @returns Number of windows that were tracked by this call (0 when no
     *   market adapter is available)
     */
    async syncRecentWindows(hours = 24) {
        logger.info(`Syncing market outcomes for last ${hours} hours`);
        const marketAdapter = getMarketDataAdapter();
        if (!marketAdapter) {
            logger.warn("Market data adapter not available");
            return 0;
        }
        let synced = 0;
        for (let offset = 0; offset < hours; offset++) {
            const windowId = getPreviousWindowId(offset);
            if (await marketAdapter.hasOutcomesForWindow(windowId)) {
                continue;
            }
            await this.trackWindowOutcomes(windowId);
            synced++;
        }
        logger.info(`Synced ${synced} windows`);
        return synced;
    }

    /**
     * Load the stored outcomes for a window.
     *
     * @param windowId - Window identifier
     * @returns `{ windowId, stocks, predictions }`, or null when no adapter is
     *   available or nothing is stored for the window
     */
    async getWindowOutcomes(windowId) {
        const marketAdapter = getMarketDataAdapter();
        if (!marketAdapter) {
            return null;
        }
        const outcomes = await marketAdapter.getMarketOutcomesByWindow(windowId);
        if (outcomes.length === 0) {
            return null;
        }
        // Rows with a stockTicker are stock outcomes; rows with a
        // predictionMarketId are prediction outcomes.
        const stocks = outcomes
            .filter((row) => row.stockTicker)
            .map((row) => ({
                ticker: row.stockTicker,
                startPrice: Number(row.startPrice),
                endPrice: Number(row.endPrice),
                changePercent: Number(row.changePercent),
                sentiment: row.sentiment || undefined,
                news: row.newsEvents,
            }));
        const predictions = outcomes
            .filter((row) => row.predictionMarketId)
            .map((row) => ({
                marketId: row.predictionMarketId,
                question: row.question || "",
                outcome: row.outcome || "UNRESOLVED",
                finalProbability: Number(row.finalProbability || 0),
            }));
        return {
            windowId,
            stocks,
            predictions,
        };
    }
}