@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773726941205.json +38 -0
- package/scripts/rank_trajectories.ts +0 -1
- package/scripts/run_task_benchmark.ts +4 -11
- package/src/adapter.ts +96 -49
- package/src/archetypes/ArchetypeConfigService.ts +188 -185
- package/src/archetypes/derive-archetype.ts +47 -47
- package/src/archetypes/index.ts +2 -2
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
- package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
- package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
- package/src/benchmark/BenchmarkDataViewer.ts +32 -30
- package/src/benchmark/BenchmarkHistoryService.ts +13 -12
- package/src/benchmark/BenchmarkRunner.ts +87 -83
- package/src/benchmark/BenchmarkValidator.ts +48 -46
- package/src/benchmark/FastEvalRunner.ts +17 -16
- package/src/benchmark/MetricsValidator.ts +20 -21
- package/src/benchmark/MetricsVisualizer.ts +92 -85
- package/src/benchmark/ModelBenchmarkService.ts +90 -82
- package/src/benchmark/ModelRegistry.ts +44 -44
- package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
- package/src/benchmark/SimulationA2AInterface.ts +118 -118
- package/src/benchmark/SimulationEngine.ts +51 -51
- package/src/benchmark/TaskRunner.ts +87 -79
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
- package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
- package/src/benchmark/index.ts +27 -27
- package/src/benchmark/parseSimulationMetrics.ts +32 -32
- package/src/benchmark/simulation-types.ts +10 -10
- package/src/dependencies.ts +34 -34
- package/src/generation/TrajectoryGenerator.ts +39 -37
- package/src/generation/index.ts +1 -1
- package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
- package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
- package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
- package/src/huggingface/index.ts +6 -6
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
- package/src/index.ts +27 -27
- package/src/init-training.ts +6 -6
- package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
- package/src/metrics/index.ts +2 -2
- package/src/rubrics/__tests__/index.test.ts +73 -73
- package/src/rubrics/ass-kisser.ts +6 -6
- package/src/rubrics/degen.ts +6 -6
- package/src/rubrics/goody-twoshoes.ts +6 -6
- package/src/rubrics/index.ts +50 -50
- package/src/rubrics/information-trader.ts +6 -6
- package/src/rubrics/infosec.ts +6 -6
- package/src/rubrics/liar.ts +6 -6
- package/src/rubrics/perps-trader.ts +6 -6
- package/src/rubrics/researcher.ts +6 -6
- package/src/rubrics/scammer.ts +6 -6
- package/src/rubrics/social-butterfly.ts +7 -7
- package/src/rubrics/super-predictor.ts +6 -6
- package/src/rubrics/trader.ts +5 -5
- package/src/scoring/ArchetypeScoringService.ts +56 -54
- package/src/scoring/JudgePromptBuilder.ts +96 -96
- package/src/scoring/LLMJudgeCache.ts +26 -23
- package/src/scoring/index.ts +3 -3
- package/src/training/AutomationPipeline.ts +149 -140
- package/src/training/BenchmarkService.ts +49 -45
- package/src/training/ConfigValidator.ts +38 -32
- package/src/training/MarketOutcomesTracker.ts +22 -12
- package/src/training/ModelDeployer.ts +15 -15
- package/src/training/ModelFetcher.ts +7 -7
- package/src/training/ModelSelectionService.ts +32 -32
- package/src/training/ModelUsageVerifier.ts +31 -24
- package/src/training/MultiModelOrchestrator.ts +44 -44
- package/src/training/RLModelConfig.ts +57 -57
- package/src/training/RewardBackpropagationService.ts +18 -17
- package/src/training/RulerScoringService.ts +73 -72
- package/src/training/TrainingMonitor.ts +29 -29
- package/src/training/TrajectoryRecorder.ts +25 -27
- package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
- package/src/training/index.ts +36 -36
- package/src/training/logRLConfig.ts +7 -7
- package/src/training/pipeline.ts +13 -16
- package/src/training/storage/ModelStorageService.ts +32 -32
- package/src/training/storage/TrainingDataArchiver.ts +21 -21
- package/src/training/storage/index.ts +2 -2
- package/src/training/types.ts +6 -6
- package/src/training/window-utils.ts +14 -14
- package/src/utils/index.ts +7 -7
- package/src/utils/logger.ts +5 -5
- package/src/utils/snowflake.ts +1 -1
- package/src/utils/synthetic-detector.ts +7 -7
|
@@ -13,12 +13,12 @@
|
|
|
13
13
|
* @see ModelBenchmarkService - For HuggingFace upload evaluation
|
|
14
14
|
*/
|
|
15
15
|
|
|
16
|
-
import fs from
|
|
17
|
-
import path from
|
|
18
|
-
import { getTrainingDataAdapter, type JsonValue } from
|
|
19
|
-
import { BenchmarkRunner } from
|
|
20
|
-
import { getAgentRuntimeManager } from
|
|
21
|
-
import { logger } from
|
|
16
|
+
import fs from "node:fs/promises";
|
|
17
|
+
import path from "node:path";
|
|
18
|
+
import { getTrainingDataAdapter, type JsonValue } from "../adapter";
|
|
19
|
+
import { BenchmarkRunner } from "../benchmark/BenchmarkRunner";
|
|
20
|
+
import { getAgentRuntimeManager } from "../dependencies";
|
|
21
|
+
import { logger } from "../utils/logger";
|
|
22
22
|
|
|
23
23
|
export interface BenchmarkResults {
|
|
24
24
|
modelId: string;
|
|
@@ -48,11 +48,11 @@ export class BenchmarkService {
|
|
|
48
48
|
// Use the 1-week benchmark we generated for comprehensive evaluation
|
|
49
49
|
private readonly DEFAULT_BENCHMARK_PATH = path.resolve(
|
|
50
50
|
process.cwd(),
|
|
51
|
-
|
|
51
|
+
"benchmarks/benchmark-week-10080-60-10-5-8-12345.json",
|
|
52
52
|
);
|
|
53
53
|
private readonly RESULTS_DIR = path.resolve(
|
|
54
54
|
process.cwd(),
|
|
55
|
-
|
|
55
|
+
"benchmark-results/models",
|
|
56
56
|
);
|
|
57
57
|
|
|
58
58
|
/**
|
|
@@ -71,10 +71,10 @@ export class BenchmarkService {
|
|
|
71
71
|
return this.DEFAULT_BENCHMARK_PATH;
|
|
72
72
|
} catch {
|
|
73
73
|
// Fallback: find any benchmark file
|
|
74
|
-
const benchmarkDir = path.resolve(process.cwd(),
|
|
74
|
+
const benchmarkDir = path.resolve(process.cwd(), "benchmarks");
|
|
75
75
|
const files = await fs.readdir(benchmarkDir);
|
|
76
76
|
const benchmarkFiles = files.filter(
|
|
77
|
-
(f) => f.startsWith(
|
|
77
|
+
(f) => f.startsWith("benchmark-") && f.endsWith(".json"),
|
|
78
78
|
);
|
|
79
79
|
|
|
80
80
|
if (benchmarkFiles.length > 0) {
|
|
@@ -82,14 +82,14 @@ export class BenchmarkService {
|
|
|
82
82
|
logger.warn(
|
|
83
83
|
`Default benchmark not found, using: ${fallbackPath}`,
|
|
84
84
|
undefined,
|
|
85
|
-
|
|
85
|
+
"BenchmarkService",
|
|
86
86
|
);
|
|
87
87
|
return fallbackPath;
|
|
88
88
|
}
|
|
89
89
|
}
|
|
90
90
|
|
|
91
91
|
throw new Error(
|
|
92
|
-
|
|
92
|
+
"No benchmark files found. Generate benchmark data before running evaluation.",
|
|
93
93
|
);
|
|
94
94
|
}
|
|
95
95
|
|
|
@@ -113,12 +113,12 @@ export class BenchmarkService {
|
|
|
113
113
|
*/
|
|
114
114
|
async benchmarkModel(
|
|
115
115
|
modelId: string,
|
|
116
|
-
benchmarkPath?: string
|
|
116
|
+
benchmarkPath?: string,
|
|
117
117
|
): Promise<BenchmarkResults> {
|
|
118
118
|
logger.info(
|
|
119
119
|
`Benchmarking model: ${modelId}`,
|
|
120
120
|
undefined,
|
|
121
|
-
|
|
121
|
+
"BenchmarkService",
|
|
122
122
|
);
|
|
123
123
|
|
|
124
124
|
const startTime = Date.now();
|
|
@@ -133,7 +133,7 @@ export class BenchmarkService {
|
|
|
133
133
|
const outputDir = path.join(
|
|
134
134
|
this.RESULTS_DIR,
|
|
135
135
|
modelId,
|
|
136
|
-
Date.now().toString()
|
|
136
|
+
Date.now().toString(),
|
|
137
137
|
);
|
|
138
138
|
await fs.mkdir(outputDir, { recursive: true });
|
|
139
139
|
|
|
@@ -153,13 +153,13 @@ export class BenchmarkService {
|
|
|
153
153
|
|
|
154
154
|
// Run benchmark
|
|
155
155
|
logger.info(
|
|
156
|
-
|
|
156
|
+
"Running benchmark...",
|
|
157
157
|
{
|
|
158
158
|
modelId,
|
|
159
159
|
modelIdentifier,
|
|
160
160
|
agent: agent.username,
|
|
161
161
|
},
|
|
162
|
-
|
|
162
|
+
"BenchmarkService",
|
|
163
163
|
);
|
|
164
164
|
|
|
165
165
|
const result = await BenchmarkRunner.runSingle({
|
|
@@ -195,17 +195,16 @@ export class BenchmarkService {
|
|
|
195
195
|
};
|
|
196
196
|
|
|
197
197
|
logger.info(
|
|
198
|
-
|
|
198
|
+
"Benchmark complete",
|
|
199
199
|
{
|
|
200
200
|
modelId,
|
|
201
201
|
score: benchmarkScore.toFixed(3),
|
|
202
202
|
pnl: result.metrics.totalPnl.toFixed(2),
|
|
203
|
-
accuracy:
|
|
204
|
-
|
|
205
|
-
optimality: result.metrics.optimalityScore.toFixed(1) + '%',
|
|
203
|
+
accuracy: `${(result.metrics.predictionMetrics.accuracy * 100).toFixed(1)}%`,
|
|
204
|
+
optimality: `${result.metrics.optimalityScore.toFixed(1)}%`,
|
|
206
205
|
duration: `${(duration / 1000).toFixed(1)}s`,
|
|
207
206
|
},
|
|
208
|
-
|
|
207
|
+
"BenchmarkService",
|
|
209
208
|
);
|
|
210
209
|
|
|
211
210
|
// Store results
|
|
@@ -235,12 +234,12 @@ export class BenchmarkService {
|
|
|
235
234
|
*/
|
|
236
235
|
async compareModels(
|
|
237
236
|
newModelId: string,
|
|
238
|
-
threshold: number = this.DEPLOYMENT_THRESHOLD
|
|
237
|
+
threshold: number = this.DEPLOYMENT_THRESHOLD,
|
|
239
238
|
): Promise<ComparisonResults> {
|
|
240
239
|
logger.info(
|
|
241
240
|
`Comparing model: ${newModelId}`,
|
|
242
241
|
undefined,
|
|
243
|
-
|
|
242
|
+
"BenchmarkService",
|
|
244
243
|
);
|
|
245
244
|
|
|
246
245
|
// Get new model's benchmark results
|
|
@@ -257,14 +256,15 @@ export class BenchmarkService {
|
|
|
257
256
|
const newScore = newModel.benchmarkScore;
|
|
258
257
|
|
|
259
258
|
// Get previous best model (excluding the new one)
|
|
260
|
-
const previousBest =
|
|
259
|
+
const previousBest =
|
|
260
|
+
await getTrainingDataAdapter().getBestBenchmarkedModel(newModelId);
|
|
261
261
|
|
|
262
262
|
// If no previous model, always deploy
|
|
263
263
|
if (!previousBest) {
|
|
264
264
|
logger.info(
|
|
265
|
-
|
|
265
|
+
"No previous model to compare - will deploy",
|
|
266
266
|
{ newScore },
|
|
267
|
-
|
|
267
|
+
"BenchmarkService",
|
|
268
268
|
);
|
|
269
269
|
return {
|
|
270
270
|
newModel: newModelId,
|
|
@@ -273,7 +273,7 @@ export class BenchmarkService {
|
|
|
273
273
|
previousScore: null,
|
|
274
274
|
improvement: null,
|
|
275
275
|
shouldDeploy: true,
|
|
276
|
-
reason:
|
|
276
|
+
reason: "First model - no comparison available",
|
|
277
277
|
};
|
|
278
278
|
}
|
|
279
279
|
|
|
@@ -282,7 +282,7 @@ export class BenchmarkService {
|
|
|
282
282
|
const thresholdScore = previousScore * threshold;
|
|
283
283
|
const shouldDeploy = newScore >= thresholdScore;
|
|
284
284
|
|
|
285
|
-
let reason =
|
|
285
|
+
let reason = "";
|
|
286
286
|
if (shouldDeploy) {
|
|
287
287
|
if (newScore > previousScore) {
|
|
288
288
|
reason = `Improved by ${improvement.toFixed(1)}% (${newScore.toFixed(3)} > ${previousScore.toFixed(3)})`;
|
|
@@ -294,17 +294,17 @@ export class BenchmarkService {
|
|
|
294
294
|
}
|
|
295
295
|
|
|
296
296
|
logger.info(
|
|
297
|
-
|
|
297
|
+
"Model comparison complete",
|
|
298
298
|
{
|
|
299
299
|
newModel: newModelId,
|
|
300
300
|
newScore: newScore.toFixed(3),
|
|
301
301
|
previousModel: previousBest.modelId,
|
|
302
302
|
previousScore: previousScore.toFixed(3),
|
|
303
|
-
improvement: improvement.toFixed(1)
|
|
303
|
+
improvement: `${improvement.toFixed(1)}%`,
|
|
304
304
|
shouldDeploy,
|
|
305
305
|
reason,
|
|
306
306
|
},
|
|
307
|
-
|
|
307
|
+
"BenchmarkService",
|
|
308
308
|
);
|
|
309
309
|
|
|
310
310
|
return {
|
|
@@ -330,7 +330,7 @@ export class BenchmarkService {
|
|
|
330
330
|
*/
|
|
331
331
|
async storeBenchmarkResults(
|
|
332
332
|
modelId: string,
|
|
333
|
-
results: BenchmarkResults
|
|
333
|
+
results: BenchmarkResults,
|
|
334
334
|
): Promise<void> {
|
|
335
335
|
await getTrainingDataAdapter().updateModelBenchmarkResults(modelId, {
|
|
336
336
|
benchmarkScore: results.benchmarkScore,
|
|
@@ -348,9 +348,9 @@ export class BenchmarkService {
|
|
|
348
348
|
});
|
|
349
349
|
|
|
350
350
|
logger.info(
|
|
351
|
-
|
|
351
|
+
"Stored benchmark results",
|
|
352
352
|
{ modelId, score: results.benchmarkScore },
|
|
353
|
-
|
|
353
|
+
"BenchmarkService",
|
|
354
354
|
);
|
|
355
355
|
}
|
|
356
356
|
|
|
@@ -367,7 +367,7 @@ export class BenchmarkService {
|
|
|
367
367
|
*/
|
|
368
368
|
async shouldDeploy(
|
|
369
369
|
modelId: string,
|
|
370
|
-
threshold: number = this.DEPLOYMENT_THRESHOLD
|
|
370
|
+
threshold: number = this.DEPLOYMENT_THRESHOLD,
|
|
371
371
|
): Promise<boolean> {
|
|
372
372
|
const comparison = await this.compareModels(modelId, threshold);
|
|
373
373
|
return comparison.shouldDeploy;
|
|
@@ -401,7 +401,7 @@ export class BenchmarkService {
|
|
|
401
401
|
|
|
402
402
|
if (storagePath && storagePath.trim().length > 0) {
|
|
403
403
|
// Check if it looks like a valid model ID
|
|
404
|
-
if (storagePath.includes(
|
|
404
|
+
if (storagePath.includes("/") || storagePath.includes(":")) {
|
|
405
405
|
return storagePath;
|
|
406
406
|
}
|
|
407
407
|
|
|
@@ -409,12 +409,12 @@ export class BenchmarkService {
|
|
|
409
409
|
logger.warn(
|
|
410
410
|
`Invalid storagePath format: ${storagePath}, falling back to modelId`,
|
|
411
411
|
{ modelId: model.modelId },
|
|
412
|
-
|
|
412
|
+
"BenchmarkService",
|
|
413
413
|
);
|
|
414
414
|
}
|
|
415
415
|
|
|
416
416
|
// Fallback to base model if modelId also doesn't look valid
|
|
417
|
-
if (model.modelId.includes(
|
|
417
|
+
if (model.modelId.includes("/")) {
|
|
418
418
|
return model.modelId;
|
|
419
419
|
}
|
|
420
420
|
|
|
@@ -422,7 +422,7 @@ export class BenchmarkService {
|
|
|
422
422
|
logger.warn(
|
|
423
423
|
`No valid model identifier found, using baseModel`,
|
|
424
424
|
{ modelId: model.modelId, baseModel: model.baseModel },
|
|
425
|
-
|
|
425
|
+
"BenchmarkService",
|
|
426
426
|
);
|
|
427
427
|
return model.baseModel;
|
|
428
428
|
}
|
|
@@ -441,9 +441,13 @@ export class BenchmarkService {
|
|
|
441
441
|
const allAgents = await adapter.getAgentUsers();
|
|
442
442
|
|
|
443
443
|
// Try to find a specific test agent
|
|
444
|
-
const preferredUsernames = [
|
|
444
|
+
const preferredUsernames = [
|
|
445
|
+
"trader-aggressive",
|
|
446
|
+
"test-agent",
|
|
447
|
+
"benchmark-agent",
|
|
448
|
+
];
|
|
445
449
|
let agent = allAgents.find(
|
|
446
|
-
(a) => a.username && preferredUsernames.includes(a.username)
|
|
450
|
+
(a) => a.username && preferredUsernames.includes(a.username),
|
|
447
451
|
);
|
|
448
452
|
|
|
449
453
|
// Fall back to any agent
|
|
@@ -452,7 +456,7 @@ export class BenchmarkService {
|
|
|
452
456
|
}
|
|
453
457
|
|
|
454
458
|
if (!agent) {
|
|
455
|
-
throw new Error(
|
|
459
|
+
throw new Error("No test agent available for benchmarking");
|
|
456
460
|
}
|
|
457
461
|
|
|
458
462
|
return agent;
|
|
@@ -490,7 +494,7 @@ export class BenchmarkService {
|
|
|
490
494
|
recentModels: summary
|
|
491
495
|
.sort(
|
|
492
496
|
(a: (typeof summary)[number], b: (typeof summary)[number]) =>
|
|
493
|
-
b.createdAt.getTime() - a.createdAt.getTime()
|
|
497
|
+
b.createdAt.getTime() - a.createdAt.getTime(),
|
|
494
498
|
)
|
|
495
499
|
.slice(0, 5),
|
|
496
500
|
};
|
|
@@ -501,7 +505,7 @@ export class BenchmarkService {
|
|
|
501
505
|
*/
|
|
502
506
|
async benchmarkMultipleModels(
|
|
503
507
|
modelIds: string[],
|
|
504
|
-
benchmarkPath?: string
|
|
508
|
+
benchmarkPath?: string,
|
|
505
509
|
): Promise<Record<string, BenchmarkResults>> {
|
|
506
510
|
const results: Record<string, BenchmarkResults> = {};
|
|
507
511
|
|
|
@@ -4,8 +4,8 @@
|
|
|
4
4
|
* Validates RL pipeline configuration before execution.
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
|
-
import type { BenchmarkConfig } from
|
|
8
|
-
import { logger } from
|
|
7
|
+
import type { BenchmarkConfig } from "../benchmark/BenchmarkDataGenerator";
|
|
8
|
+
import { logger } from "../utils/logger";
|
|
9
9
|
|
|
10
10
|
export interface TrainingConfig {
|
|
11
11
|
min_trajectories_per_batch: number;
|
|
@@ -37,56 +37,56 @@ export class ConfigValidator {
|
|
|
37
37
|
|
|
38
38
|
// Validate batch size
|
|
39
39
|
if (config.batch_size <= 0) {
|
|
40
|
-
errors.push(
|
|
40
|
+
errors.push("batch_size must be greater than 0");
|
|
41
41
|
}
|
|
42
42
|
if (config.batch_size > 64) {
|
|
43
|
-
warnings.push(
|
|
43
|
+
warnings.push("batch_size > 64 may cause memory issues");
|
|
44
44
|
}
|
|
45
45
|
|
|
46
46
|
// Validate learning rate
|
|
47
47
|
if (config.learning_rate <= 0) {
|
|
48
|
-
errors.push(
|
|
48
|
+
errors.push("learning_rate must be greater than 0");
|
|
49
49
|
}
|
|
50
50
|
if (config.learning_rate > 1e-3) {
|
|
51
|
-
warnings.push(
|
|
51
|
+
warnings.push("learning_rate > 1e-3 may cause training instability");
|
|
52
52
|
}
|
|
53
53
|
if (config.learning_rate < 1e-8) {
|
|
54
54
|
warnings.push(
|
|
55
|
-
|
|
55
|
+
"learning_rate < 1e-8 may be too small for effective learning",
|
|
56
56
|
);
|
|
57
57
|
}
|
|
58
58
|
|
|
59
59
|
// Validate KL penalty
|
|
60
60
|
if (config.kl_penalty < 0) {
|
|
61
|
-
errors.push(
|
|
61
|
+
errors.push("kl_penalty must be non-negative");
|
|
62
62
|
}
|
|
63
63
|
if (config.kl_penalty > 1.0) {
|
|
64
|
-
warnings.push(
|
|
64
|
+
warnings.push("kl_penalty > 1.0 may be too high");
|
|
65
65
|
}
|
|
66
66
|
|
|
67
67
|
// Validate iterations
|
|
68
68
|
if (config.iterations_per_window <= 0) {
|
|
69
|
-
errors.push(
|
|
69
|
+
errors.push("iterations_per_window must be greater than 0");
|
|
70
70
|
}
|
|
71
71
|
|
|
72
72
|
// Validate warmup steps
|
|
73
73
|
if (config.warmup_steps < 0) {
|
|
74
|
-
errors.push(
|
|
74
|
+
errors.push("warmup_steps must be non-negative");
|
|
75
75
|
}
|
|
76
76
|
|
|
77
77
|
// Validate max grad norm
|
|
78
78
|
if (config.max_grad_norm <= 0) {
|
|
79
|
-
errors.push(
|
|
79
|
+
errors.push("max_grad_norm must be greater than 0");
|
|
80
80
|
}
|
|
81
81
|
|
|
82
82
|
// Validate gamma
|
|
83
83
|
if (config.gamma < 0 || config.gamma > 1) {
|
|
84
|
-
errors.push(
|
|
84
|
+
errors.push("gamma must be between 0 and 1");
|
|
85
85
|
}
|
|
86
86
|
|
|
87
87
|
// Validate min trajectories
|
|
88
88
|
if (config.min_trajectories_per_batch <= 0) {
|
|
89
|
-
errors.push(
|
|
89
|
+
errors.push("min_trajectories_per_batch must be greater than 0");
|
|
90
90
|
}
|
|
91
91
|
|
|
92
92
|
return {
|
|
@@ -109,24 +109,24 @@ export class ConfigValidator {
|
|
|
109
109
|
const warnings: string[] = [];
|
|
110
110
|
|
|
111
111
|
if (config.duration_minutes <= 0) {
|
|
112
|
-
errors.push(
|
|
112
|
+
errors.push("duration_minutes must be greater than 0");
|
|
113
113
|
}
|
|
114
114
|
if (config.duration_minutes > 10080) {
|
|
115
115
|
warnings.push(
|
|
116
|
-
|
|
116
|
+
"duration_minutes > 10080 (1 week) may take a long time to generate",
|
|
117
117
|
);
|
|
118
118
|
}
|
|
119
119
|
|
|
120
120
|
if (config.tick_interval_seconds <= 0) {
|
|
121
|
-
errors.push(
|
|
121
|
+
errors.push("tick_interval_seconds must be greater than 0");
|
|
122
122
|
}
|
|
123
123
|
|
|
124
124
|
if (config.num_prediction_markets <= 0) {
|
|
125
|
-
errors.push(
|
|
125
|
+
errors.push("num_prediction_markets must be greater than 0");
|
|
126
126
|
}
|
|
127
127
|
|
|
128
128
|
if (config.num_perpetual_markets <= 0) {
|
|
129
|
-
errors.push(
|
|
129
|
+
errors.push("num_perpetual_markets must be greater than 0");
|
|
130
130
|
}
|
|
131
131
|
|
|
132
132
|
return {
|
|
@@ -149,7 +149,7 @@ export class ConfigValidator {
|
|
|
149
149
|
|
|
150
150
|
// Validate benchmark config
|
|
151
151
|
if (config.benchmark) {
|
|
152
|
-
const benchmarkResult =
|
|
152
|
+
const benchmarkResult = ConfigValidator.validateBenchmarkConfig({
|
|
153
153
|
duration_minutes: config.benchmark.durationMinutes,
|
|
154
154
|
tick_interval_seconds: config.benchmark.tickInterval,
|
|
155
155
|
num_prediction_markets: config.benchmark.numPredictionMarkets,
|
|
@@ -161,17 +161,19 @@ export class ConfigValidator {
|
|
|
161
161
|
|
|
162
162
|
// Validate training config
|
|
163
163
|
if (config.training) {
|
|
164
|
-
const trainingResult =
|
|
164
|
+
const trainingResult = ConfigValidator.validateTrainingConfig(
|
|
165
|
+
config.training,
|
|
166
|
+
);
|
|
165
167
|
errors.push(...trainingResult.errors);
|
|
166
168
|
warnings.push(...trainingResult.warnings);
|
|
167
169
|
}
|
|
168
170
|
|
|
169
171
|
// Validate agent config
|
|
170
172
|
if (config.agents.test_agent_count <= 0) {
|
|
171
|
-
errors.push(
|
|
173
|
+
errors.push("test_agent_count must be greater than 0");
|
|
172
174
|
}
|
|
173
175
|
if (config.agents.test_agent_count > 10) {
|
|
174
|
-
warnings.push(
|
|
176
|
+
warnings.push("test_agent_count > 10 may be slow");
|
|
175
177
|
}
|
|
176
178
|
|
|
177
179
|
return {
|
|
@@ -189,31 +191,35 @@ export class ConfigValidator {
|
|
|
189
191
|
training: TrainingConfig;
|
|
190
192
|
agents: { test_agent_count: number };
|
|
191
193
|
}): boolean {
|
|
192
|
-
const result =
|
|
194
|
+
const result = ConfigValidator.validatePipelineConfig(config);
|
|
193
195
|
|
|
194
196
|
if (result.warnings.length > 0) {
|
|
195
197
|
logger.warn(
|
|
196
|
-
|
|
198
|
+
"Configuration warnings",
|
|
197
199
|
{ warnings: result.warnings },
|
|
198
|
-
|
|
200
|
+
"ConfigValidator",
|
|
199
201
|
);
|
|
200
|
-
result.warnings.forEach((w) =>
|
|
202
|
+
result.warnings.forEach((w) => {
|
|
203
|
+
console.log(` ⚠️ ${w}`);
|
|
204
|
+
});
|
|
201
205
|
}
|
|
202
206
|
|
|
203
207
|
if (result.errors.length > 0) {
|
|
204
208
|
logger.error(
|
|
205
|
-
|
|
209
|
+
"Configuration errors",
|
|
206
210
|
{ errors: result.errors },
|
|
207
|
-
|
|
211
|
+
"ConfigValidator",
|
|
208
212
|
);
|
|
209
|
-
result.errors.forEach((e) =>
|
|
213
|
+
result.errors.forEach((e) => {
|
|
214
|
+
console.error(` ❌ ${e}`);
|
|
215
|
+
});
|
|
210
216
|
return false;
|
|
211
217
|
}
|
|
212
218
|
|
|
213
219
|
logger.info(
|
|
214
|
-
|
|
220
|
+
"Configuration validation passed",
|
|
215
221
|
undefined,
|
|
216
|
-
|
|
222
|
+
"ConfigValidator",
|
|
217
223
|
);
|
|
218
224
|
return true;
|
|
219
225
|
}
|
|
@@ -5,9 +5,9 @@
|
|
|
5
5
|
* This gives RULER the ground truth to evaluate agent decisions.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
-
import { getMarketDataAdapter } from
|
|
9
|
-
import { generateSnowflakeId, logger } from
|
|
10
|
-
import { getPreviousWindowId } from
|
|
8
|
+
import { getMarketDataAdapter } from "../adapter";
|
|
9
|
+
import { generateSnowflakeId, logger } from "../utils";
|
|
10
|
+
import { getPreviousWindowId } from "./window-utils";
|
|
11
11
|
|
|
12
12
|
export interface WindowOutcomes {
|
|
13
13
|
windowId: string;
|
|
@@ -36,7 +36,9 @@ export class MarketOutcomesTracker {
|
|
|
36
36
|
|
|
37
37
|
const marketAdapter = getMarketDataAdapter();
|
|
38
38
|
if (!marketAdapter) {
|
|
39
|
-
logger.warn(
|
|
39
|
+
logger.warn(
|
|
40
|
+
"Market data adapter not available, skipping outcome tracking",
|
|
41
|
+
);
|
|
40
42
|
return;
|
|
41
43
|
}
|
|
42
44
|
|
|
@@ -44,7 +46,10 @@ export class MarketOutcomesTracker {
|
|
|
44
46
|
const windowEnd = new Date(windowStart.getTime() + 60 * 60 * 1000);
|
|
45
47
|
|
|
46
48
|
// Get stock price movements from perpetual positions
|
|
47
|
-
const perpTrades = await marketAdapter.getPerpPositionsForWindow(
|
|
49
|
+
const perpTrades = await marketAdapter.getPerpPositionsForWindow(
|
|
50
|
+
windowStart,
|
|
51
|
+
windowEnd,
|
|
52
|
+
);
|
|
48
53
|
|
|
49
54
|
// Group by ticker and calculate movements
|
|
50
55
|
const stockMovements = new Map<
|
|
@@ -56,7 +61,9 @@ export class MarketOutcomesTracker {
|
|
|
56
61
|
if (!trade.ticker) continue;
|
|
57
62
|
|
|
58
63
|
const existing = stockMovements.get(trade.ticker);
|
|
59
|
-
const endPrice = Number(
|
|
64
|
+
const endPrice = Number(
|
|
65
|
+
trade.currentPrice ?? trade.exitPrice ?? trade.entryPrice,
|
|
66
|
+
);
|
|
60
67
|
if (!existing) {
|
|
61
68
|
stockMovements.set(trade.ticker, {
|
|
62
69
|
start: Number(trade.entryPrice),
|
|
@@ -81,12 +88,15 @@ export class MarketOutcomesTracker {
|
|
|
81
88
|
startPrice: String(data.start),
|
|
82
89
|
endPrice: String(data.end),
|
|
83
90
|
changePercent: String(changePercent),
|
|
84
|
-
sentiment: changePercent > 0 ?
|
|
91
|
+
sentiment: changePercent > 0 ? "BULLISH" : "BEARISH",
|
|
85
92
|
});
|
|
86
93
|
}
|
|
87
94
|
|
|
88
95
|
// Get prediction market resolutions
|
|
89
|
-
const resolvedMarkets = await marketAdapter.getResolvedMarketsForWindow(
|
|
96
|
+
const resolvedMarkets = await marketAdapter.getResolvedMarketsForWindow(
|
|
97
|
+
windowStart,
|
|
98
|
+
windowEnd,
|
|
99
|
+
);
|
|
90
100
|
|
|
91
101
|
// Save prediction outcomes
|
|
92
102
|
for (const market of resolvedMarkets) {
|
|
@@ -95,7 +105,7 @@ export class MarketOutcomesTracker {
|
|
|
95
105
|
windowId,
|
|
96
106
|
predictionMarketId: market.id,
|
|
97
107
|
question: market.question,
|
|
98
|
-
outcome: market.outcome ?
|
|
108
|
+
outcome: market.outcome ? "YES" : "NO",
|
|
99
109
|
finalProbability: String(market.finalProbability ?? 0.5),
|
|
100
110
|
});
|
|
101
111
|
}
|
|
@@ -114,7 +124,7 @@ export class MarketOutcomesTracker {
|
|
|
114
124
|
|
|
115
125
|
const marketAdapter = getMarketDataAdapter();
|
|
116
126
|
if (!marketAdapter) {
|
|
117
|
-
logger.warn(
|
|
127
|
+
logger.warn("Market data adapter not available");
|
|
118
128
|
return 0;
|
|
119
129
|
}
|
|
120
130
|
|
|
@@ -172,8 +182,8 @@ export class MarketOutcomesTracker {
|
|
|
172
182
|
const r = o as Record<string, unknown>;
|
|
173
183
|
return {
|
|
174
184
|
marketId: r.predictionMarketId as string,
|
|
175
|
-
question: (r.question as string) ||
|
|
176
|
-
outcome: (r.outcome as string) ||
|
|
185
|
+
question: (r.question as string) || "",
|
|
186
|
+
outcome: (r.outcome as string) || "UNRESOLVED",
|
|
177
187
|
finalProbability: Number(r.finalProbability || 0),
|
|
178
188
|
};
|
|
179
189
|
});
|
|
@@ -5,13 +5,13 @@
|
|
|
5
5
|
* Handles gradual rollout and rollback if needed.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
-
import { getTrainingDataAdapter } from
|
|
9
|
-
import { getAgentRuntimeManager } from
|
|
10
|
-
import { logger } from
|
|
8
|
+
import { getTrainingDataAdapter } from "../adapter";
|
|
9
|
+
import { getAgentRuntimeManager } from "../dependencies";
|
|
10
|
+
import { logger } from "../utils/logger";
|
|
11
11
|
|
|
12
12
|
export interface DeploymentOptions {
|
|
13
13
|
modelVersion: string;
|
|
14
|
-
strategy:
|
|
14
|
+
strategy: "immediate" | "gradual" | "test";
|
|
15
15
|
rolloutPercentage?: number;
|
|
16
16
|
testAgentIds?: string[];
|
|
17
17
|
}
|
|
@@ -26,7 +26,7 @@ export interface DeploymentResult {
|
|
|
26
26
|
interface DeploymentStatusRecord {
|
|
27
27
|
deploymentId: string;
|
|
28
28
|
modelVersion: string;
|
|
29
|
-
status:
|
|
29
|
+
status: "in_progress" | "deployed" | "degraded" | "failed";
|
|
30
30
|
agentsUpdated: number;
|
|
31
31
|
agentsFailed: number;
|
|
32
32
|
performance: {
|
|
@@ -47,7 +47,7 @@ export class ModelDeployer {
|
|
|
47
47
|
async deploy(options: DeploymentOptions): Promise<DeploymentResult> {
|
|
48
48
|
const da = getTrainingDataAdapter();
|
|
49
49
|
|
|
50
|
-
logger.info(
|
|
50
|
+
logger.info("Starting model deployment", {
|
|
51
51
|
version: options.modelVersion,
|
|
52
52
|
strategy: options.strategy,
|
|
53
53
|
});
|
|
@@ -59,7 +59,7 @@ export class ModelDeployer {
|
|
|
59
59
|
}
|
|
60
60
|
|
|
61
61
|
const strategy =
|
|
62
|
-
options.strategy ===
|
|
62
|
+
options.strategy === "immediate" ? "all" : options.strategy;
|
|
63
63
|
|
|
64
64
|
const targetAgents = await da.getAgentUsers({
|
|
65
65
|
strategy,
|
|
@@ -73,7 +73,7 @@ export class ModelDeployer {
|
|
|
73
73
|
this.deploymentStatus.set(deploymentId, {
|
|
74
74
|
deploymentId,
|
|
75
75
|
modelVersion: options.modelVersion,
|
|
76
|
-
status:
|
|
76
|
+
status: "in_progress",
|
|
77
77
|
agentsUpdated: 0,
|
|
78
78
|
agentsFailed: 0,
|
|
79
79
|
performance: {
|
|
@@ -84,7 +84,7 @@ export class ModelDeployer {
|
|
|
84
84
|
completedAt: null,
|
|
85
85
|
});
|
|
86
86
|
|
|
87
|
-
await da.updateModelStatus(model.modelId,
|
|
87
|
+
await da.updateModelStatus(model.modelId, "deployed", {
|
|
88
88
|
deployedAt: new Date(),
|
|
89
89
|
agentsUsing: targetAgents.length,
|
|
90
90
|
});
|
|
@@ -99,14 +99,14 @@ export class ModelDeployer {
|
|
|
99
99
|
runtimesReset++;
|
|
100
100
|
} catch (err) {
|
|
101
101
|
runtimeResetFailures++;
|
|
102
|
-
logger.warn(
|
|
102
|
+
logger.warn("Failed to reset runtime for agent", {
|
|
103
103
|
agentId: agent.id,
|
|
104
104
|
error: err instanceof Error ? err.message : String(err),
|
|
105
105
|
});
|
|
106
106
|
}
|
|
107
107
|
}
|
|
108
108
|
|
|
109
|
-
logger.info(
|
|
109
|
+
logger.info("Model deployed successfully", {
|
|
110
110
|
version: options.modelVersion,
|
|
111
111
|
agentsUpdated: targetAgents.length,
|
|
112
112
|
deploymentId,
|
|
@@ -118,7 +118,7 @@ export class ModelDeployer {
|
|
|
118
118
|
this.deploymentStatus.set(deploymentId, {
|
|
119
119
|
deploymentId,
|
|
120
120
|
modelVersion: options.modelVersion,
|
|
121
|
-
status: runtimeResetFailures > 0 ?
|
|
121
|
+
status: runtimeResetFailures > 0 ? "degraded" : "deployed",
|
|
122
122
|
agentsUpdated: runtimesReset,
|
|
123
123
|
agentsFailed: runtimeResetFailures,
|
|
124
124
|
performance: {
|
|
@@ -146,16 +146,16 @@ export class ModelDeployer {
|
|
|
146
146
|
*/
|
|
147
147
|
async rollback(
|
|
148
148
|
currentVersion: string,
|
|
149
|
-
targetVersion: string
|
|
149
|
+
targetVersion: string,
|
|
150
150
|
): Promise<DeploymentResult> {
|
|
151
|
-
logger.info(
|
|
151
|
+
logger.info("Rolling back model", {
|
|
152
152
|
from: currentVersion,
|
|
153
153
|
to: targetVersion,
|
|
154
154
|
});
|
|
155
155
|
|
|
156
156
|
return await this.deploy({
|
|
157
157
|
modelVersion: targetVersion,
|
|
158
|
-
strategy:
|
|
158
|
+
strategy: "immediate",
|
|
159
159
|
});
|
|
160
160
|
}
|
|
161
161
|
|