@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773726941205.json +38 -0
- package/scripts/rank_trajectories.ts +0 -1
- package/scripts/run_task_benchmark.ts +4 -11
- package/src/adapter.ts +96 -49
- package/src/archetypes/ArchetypeConfigService.ts +188 -185
- package/src/archetypes/derive-archetype.ts +47 -47
- package/src/archetypes/index.ts +2 -2
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
- package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
- package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
- package/src/benchmark/BenchmarkDataViewer.ts +32 -30
- package/src/benchmark/BenchmarkHistoryService.ts +13 -12
- package/src/benchmark/BenchmarkRunner.ts +87 -83
- package/src/benchmark/BenchmarkValidator.ts +48 -46
- package/src/benchmark/FastEvalRunner.ts +17 -16
- package/src/benchmark/MetricsValidator.ts +20 -21
- package/src/benchmark/MetricsVisualizer.ts +92 -85
- package/src/benchmark/ModelBenchmarkService.ts +90 -82
- package/src/benchmark/ModelRegistry.ts +44 -44
- package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
- package/src/benchmark/SimulationA2AInterface.ts +118 -118
- package/src/benchmark/SimulationEngine.ts +51 -51
- package/src/benchmark/TaskRunner.ts +87 -79
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
- package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
- package/src/benchmark/index.ts +27 -27
- package/src/benchmark/parseSimulationMetrics.ts +32 -32
- package/src/benchmark/simulation-types.ts +10 -10
- package/src/dependencies.ts +34 -34
- package/src/generation/TrajectoryGenerator.ts +39 -37
- package/src/generation/index.ts +1 -1
- package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
- package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
- package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
- package/src/huggingface/index.ts +6 -6
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
- package/src/index.ts +27 -27
- package/src/init-training.ts +6 -6
- package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
- package/src/metrics/index.ts +2 -2
- package/src/rubrics/__tests__/index.test.ts +73 -73
- package/src/rubrics/ass-kisser.ts +6 -6
- package/src/rubrics/degen.ts +6 -6
- package/src/rubrics/goody-twoshoes.ts +6 -6
- package/src/rubrics/index.ts +50 -50
- package/src/rubrics/information-trader.ts +6 -6
- package/src/rubrics/infosec.ts +6 -6
- package/src/rubrics/liar.ts +6 -6
- package/src/rubrics/perps-trader.ts +6 -6
- package/src/rubrics/researcher.ts +6 -6
- package/src/rubrics/scammer.ts +6 -6
- package/src/rubrics/social-butterfly.ts +7 -7
- package/src/rubrics/super-predictor.ts +6 -6
- package/src/rubrics/trader.ts +5 -5
- package/src/scoring/ArchetypeScoringService.ts +56 -54
- package/src/scoring/JudgePromptBuilder.ts +96 -96
- package/src/scoring/LLMJudgeCache.ts +26 -23
- package/src/scoring/index.ts +3 -3
- package/src/training/AutomationPipeline.ts +149 -140
- package/src/training/BenchmarkService.ts +49 -45
- package/src/training/ConfigValidator.ts +38 -32
- package/src/training/MarketOutcomesTracker.ts +22 -12
- package/src/training/ModelDeployer.ts +15 -15
- package/src/training/ModelFetcher.ts +7 -7
- package/src/training/ModelSelectionService.ts +32 -32
- package/src/training/ModelUsageVerifier.ts +31 -24
- package/src/training/MultiModelOrchestrator.ts +44 -44
- package/src/training/RLModelConfig.ts +57 -57
- package/src/training/RewardBackpropagationService.ts +18 -17
- package/src/training/RulerScoringService.ts +73 -72
- package/src/training/TrainingMonitor.ts +29 -29
- package/src/training/TrajectoryRecorder.ts +25 -27
- package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
- package/src/training/index.ts +36 -36
- package/src/training/logRLConfig.ts +7 -7
- package/src/training/pipeline.ts +13 -16
- package/src/training/storage/ModelStorageService.ts +32 -32
- package/src/training/storage/TrainingDataArchiver.ts +21 -21
- package/src/training/storage/index.ts +2 -2
- package/src/training/types.ts +6 -6
- package/src/training/window-utils.ts +14 -14
- package/src/utils/index.ts +7 -7
- package/src/utils/logger.ts +5 -5
- package/src/utils/snowflake.ts +1 -1
- package/src/utils/synthetic-detector.ts +7 -7
|
@@ -11,18 +11,16 @@
|
|
|
11
11
|
* 7. Monitor performance
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
|
-
import
|
|
15
|
-
import
|
|
16
|
-
import
|
|
17
|
-
import {
|
|
18
|
-
import
|
|
19
|
-
import {
|
|
20
|
-
import {
|
|
21
|
-
import {
|
|
22
|
-
import {
|
|
23
|
-
import {
|
|
24
|
-
import { rewardBackpropagationService } from './RewardBackpropagationService';
|
|
25
|
-
import { rulerScoringService } from './RulerScoringService';
|
|
14
|
+
import { spawn } from "node:child_process";
|
|
15
|
+
import fs from "node:fs/promises";
|
|
16
|
+
import path from "node:path";
|
|
17
|
+
import { getMarketDataAdapter, getTrainingDataAdapter } from "../adapter";
|
|
18
|
+
import { getExportGroupedForGRPO } from "../dependencies";
|
|
19
|
+
import { logger } from "../utils/logger";
|
|
20
|
+
import { benchmarkService } from "./BenchmarkService";
|
|
21
|
+
import { modelSelectionService } from "./ModelSelectionService";
|
|
22
|
+
import { rewardBackpropagationService } from "./RewardBackpropagationService";
|
|
23
|
+
import { rulerScoringService } from "./RulerScoringService";
|
|
26
24
|
import type {
|
|
27
25
|
AutomationConfig,
|
|
28
26
|
AutomationStatus,
|
|
@@ -31,8 +29,8 @@ import type {
|
|
|
31
29
|
TrainingTriggerOptions,
|
|
32
30
|
TrainingTriggerResult,
|
|
33
31
|
TrajectoryStep,
|
|
34
|
-
} from
|
|
35
|
-
import { getCurrentWindowId, getPreviousWindowId } from
|
|
32
|
+
} from "./types";
|
|
33
|
+
import { getCurrentWindowId, getPreviousWindowId } from "./window-utils";
|
|
36
34
|
|
|
37
35
|
export type { AutomationConfig };
|
|
38
36
|
|
|
@@ -42,12 +40,12 @@ export class AutomationPipeline {
|
|
|
42
40
|
|
|
43
41
|
constructor(config: Partial<AutomationConfig> = {}) {
|
|
44
42
|
const envMinTrajectories = parseInt(
|
|
45
|
-
process.env.TRAINING_MIN_TRAJECTORIES ??
|
|
46
|
-
10
|
|
43
|
+
process.env.TRAINING_MIN_TRAJECTORIES ?? "",
|
|
44
|
+
10,
|
|
47
45
|
);
|
|
48
46
|
const envMinGroupSize = parseInt(
|
|
49
|
-
process.env.TRAINING_MIN_GROUP_SIZE ??
|
|
50
|
-
10
|
|
47
|
+
process.env.TRAINING_MIN_GROUP_SIZE ?? "",
|
|
48
|
+
10,
|
|
51
49
|
);
|
|
52
50
|
|
|
53
51
|
this.config = {
|
|
@@ -64,39 +62,41 @@ export class AutomationPipeline {
|
|
|
64
62
|
dataQualityThreshold: config.dataQualityThreshold ?? 0.95,
|
|
65
63
|
autoTriggerTraining: config.autoTriggerTraining !== false,
|
|
66
64
|
trainingInterval: config.trainingInterval || 24, // Daily by default
|
|
67
|
-
baseModel: config.baseModel ||
|
|
68
|
-
modelNamePrefix: config.modelNamePrefix ||
|
|
65
|
+
baseModel: config.baseModel || "unsloth/Qwen3-4B-128K", // 4B params, 128K context - ideal for fine-tuning
|
|
66
|
+
modelNamePrefix: config.modelNamePrefix || "eliza-agent",
|
|
69
67
|
modelIdPrefix:
|
|
70
68
|
config.modelIdPrefix ||
|
|
71
69
|
process.env.TRAINING_MODEL_ID_PREFIX ||
|
|
72
70
|
config.modelNamePrefix ||
|
|
73
|
-
|
|
71
|
+
"eliza-agent",
|
|
74
72
|
modelStoragePath:
|
|
75
73
|
config.modelStoragePath ||
|
|
76
|
-
path.resolve(process.cwd(),
|
|
74
|
+
path.resolve(process.cwd(), "storage/models"),
|
|
77
75
|
dataStoragePath:
|
|
78
76
|
config.dataStoragePath ||
|
|
79
|
-
path.resolve(process.cwd(),
|
|
77
|
+
path.resolve(process.cwd(), "storage/training-data"),
|
|
80
78
|
pythonProjectRoot:
|
|
81
79
|
config.pythonProjectRoot ||
|
|
82
80
|
process.env.TRAINING_PYTHON_ROOT ||
|
|
83
|
-
path.resolve(process.cwd(),
|
|
81
|
+
path.resolve(process.cwd(), "packages/training/python"),
|
|
84
82
|
trainerScriptPath:
|
|
85
|
-
config.trainerScriptPath ||
|
|
83
|
+
config.trainerScriptPath ||
|
|
84
|
+
process.env.TRAINING_SCRIPT_PATH ||
|
|
85
|
+
undefined,
|
|
86
86
|
trainerPythonExecutable:
|
|
87
87
|
config.trainerPythonExecutable ||
|
|
88
88
|
process.env.TRAINING_PYTHON_EXECUTABLE ||
|
|
89
|
-
(process.platform ===
|
|
89
|
+
(process.platform === "win32" ? "python" : "python3"),
|
|
90
90
|
trainingMode:
|
|
91
91
|
config.trainingMode ||
|
|
92
|
-
(process.env.TRAINING_MODE as
|
|
93
|
-
|
|
92
|
+
(process.env.TRAINING_MODE as "atropos" | "tinker") ||
|
|
93
|
+
"atropos",
|
|
94
94
|
atroposApiUrl:
|
|
95
95
|
config.atroposApiUrl ||
|
|
96
96
|
process.env.ATROPOS_API_URL ||
|
|
97
|
-
|
|
97
|
+
"http://localhost:8000",
|
|
98
98
|
vllmPort:
|
|
99
|
-
config.vllmPort || parseInt(process.env.VLLM_PORT ||
|
|
99
|
+
config.vllmPort || parseInt(process.env.VLLM_PORT || "9001", 10),
|
|
100
100
|
};
|
|
101
101
|
}
|
|
102
102
|
|
|
@@ -108,7 +108,9 @@ export class AutomationPipeline {
|
|
|
108
108
|
|
|
109
109
|
const scoredAndReady = await adapter.countScoredTrajectoriesReady();
|
|
110
110
|
const unscored = await adapter.countUnscoredTrajectories();
|
|
111
|
-
const scenarioGroups = await adapter.getScenarioGroups(
|
|
111
|
+
const scenarioGroups = await adapter.getScenarioGroups(
|
|
112
|
+
this.config.minGroupSize,
|
|
113
|
+
);
|
|
112
114
|
const quality = await this.calculateDataQuality();
|
|
113
115
|
|
|
114
116
|
const stats = {
|
|
@@ -144,7 +146,7 @@ export class AutomationPipeline {
|
|
|
144
146
|
|
|
145
147
|
return {
|
|
146
148
|
ready: true,
|
|
147
|
-
reason:
|
|
149
|
+
reason: "Ready to train!",
|
|
148
150
|
stats,
|
|
149
151
|
};
|
|
150
152
|
}
|
|
@@ -165,14 +167,14 @@ export class AutomationPipeline {
|
|
|
165
167
|
// Validate stepsJson exists and is valid before parsing
|
|
166
168
|
if (
|
|
167
169
|
!traj.stepsJson ||
|
|
168
|
-
traj.stepsJson ===
|
|
169
|
-
traj.stepsJson ===
|
|
170
|
+
traj.stepsJson === "null" ||
|
|
171
|
+
traj.stepsJson === "[]"
|
|
170
172
|
) {
|
|
171
173
|
continue; // Skip invalid trajectories
|
|
172
174
|
}
|
|
173
175
|
|
|
174
176
|
const steps: TrajectoryStep[] = JSON.parse(
|
|
175
|
-
traj.stepsJson
|
|
177
|
+
traj.stepsJson,
|
|
176
178
|
) as TrajectoryStep[];
|
|
177
179
|
|
|
178
180
|
if (!Array.isArray(steps)) {
|
|
@@ -186,7 +188,7 @@ export class AutomationPipeline {
|
|
|
186
188
|
// Check 2: Steps have LLM calls
|
|
187
189
|
totalChecks++;
|
|
188
190
|
const hasLLMCalls = steps.every(
|
|
189
|
-
(s) => s.llmCalls && Array.isArray(s.llmCalls) && s.llmCalls.length > 0
|
|
191
|
+
(s) => s.llmCalls && Array.isArray(s.llmCalls) && s.llmCalls.length > 0,
|
|
190
192
|
);
|
|
191
193
|
if (hasLLMCalls) qualityScore++;
|
|
192
194
|
|
|
@@ -200,8 +202,8 @@ export class AutomationPipeline {
|
|
|
200
202
|
llm.systemPrompt &&
|
|
201
203
|
llm.systemPrompt.length > 50 &&
|
|
202
204
|
llm.userPrompt &&
|
|
203
|
-
llm.userPrompt.length > 100
|
|
204
|
-
)
|
|
205
|
+
llm.userPrompt.length > 100,
|
|
206
|
+
),
|
|
205
207
|
);
|
|
206
208
|
if (hasGoodPrompts) qualityScore++;
|
|
207
209
|
|
|
@@ -211,14 +213,14 @@ export class AutomationPipeline {
|
|
|
211
213
|
(s) =>
|
|
212
214
|
s.providerAccesses &&
|
|
213
215
|
Array.isArray(s.providerAccesses) &&
|
|
214
|
-
s.providerAccesses.length > 0
|
|
216
|
+
s.providerAccesses.length > 0,
|
|
215
217
|
);
|
|
216
218
|
if (hasProviders) qualityScore++;
|
|
217
219
|
|
|
218
220
|
// Check 5: Actions have results
|
|
219
221
|
totalChecks++;
|
|
220
222
|
const hasResults = steps.every(
|
|
221
|
-
(s) => s.action && (s.action.result || s.action.error)
|
|
223
|
+
(s) => s.action && (s.action.result || s.action.error),
|
|
222
224
|
);
|
|
223
225
|
if (hasResults) qualityScore++;
|
|
224
226
|
}
|
|
@@ -230,7 +232,7 @@ export class AutomationPipeline {
|
|
|
230
232
|
* Trigger training job
|
|
231
233
|
*/
|
|
232
234
|
async triggerTraining(
|
|
233
|
-
options: TrainingTriggerOptions = {}
|
|
235
|
+
options: TrainingTriggerOptions = {},
|
|
234
236
|
): Promise<TrainingTriggerResult> {
|
|
235
237
|
// Check readiness
|
|
236
238
|
const readiness = await this.checkTrainingReadiness();
|
|
@@ -249,11 +251,11 @@ export class AutomationPipeline {
|
|
|
249
251
|
readiness.stats.unscoredTrajectories > 0
|
|
250
252
|
) {
|
|
251
253
|
logger.info(
|
|
252
|
-
|
|
254
|
+
"Force mode: Attempting to score unscored trajectories first",
|
|
253
255
|
{
|
|
254
256
|
unscored: readiness.stats.unscoredTrajectories,
|
|
255
257
|
},
|
|
256
|
-
|
|
258
|
+
"AutomationPipeline",
|
|
257
259
|
);
|
|
258
260
|
|
|
259
261
|
// Score recent trajectories
|
|
@@ -267,19 +269,19 @@ export class AutomationPipeline {
|
|
|
267
269
|
// Re-check readiness after scoring
|
|
268
270
|
const newReadiness = await this.checkTrainingReadiness();
|
|
269
271
|
logger.info(
|
|
270
|
-
|
|
272
|
+
"After scoring",
|
|
271
273
|
{
|
|
272
274
|
scored: newReadiness.stats.totalTrajectories,
|
|
273
275
|
stillUnscored: newReadiness.stats.unscoredTrajectories,
|
|
274
276
|
},
|
|
275
|
-
|
|
277
|
+
"AutomationPipeline",
|
|
276
278
|
);
|
|
277
279
|
}
|
|
278
280
|
|
|
279
281
|
// Use ModelSelectionService for smart model selection
|
|
280
282
|
const modelSelection = await modelSelectionService.selectBaseModel();
|
|
281
283
|
|
|
282
|
-
logger.info(
|
|
284
|
+
logger.info("Model selection for training", {
|
|
283
285
|
strategy: modelSelection.strategy,
|
|
284
286
|
modelPath: modelSelection.modelPath,
|
|
285
287
|
bundleCount: modelSelection.metadata?.bundleCount,
|
|
@@ -289,7 +291,7 @@ export class AutomationPipeline {
|
|
|
289
291
|
const dataLimit = await modelSelectionService.getTrainingDataLimit();
|
|
290
292
|
|
|
291
293
|
// Prepare data
|
|
292
|
-
logger.info(
|
|
294
|
+
logger.info("Preparing training data...", {
|
|
293
295
|
...readiness.stats,
|
|
294
296
|
selectedModel: modelSelection.modelPath,
|
|
295
297
|
strategy: modelSelection.strategy,
|
|
@@ -314,14 +316,15 @@ export class AutomationPipeline {
|
|
|
314
316
|
if (!exportResult.success) {
|
|
315
317
|
return {
|
|
316
318
|
success: false,
|
|
317
|
-
error:
|
|
319
|
+
error: `Export failed: ${exportResult.error}`,
|
|
318
320
|
};
|
|
319
321
|
}
|
|
320
322
|
|
|
321
323
|
// Create training batch record
|
|
322
324
|
const adapterForBatch = getTrainingDataAdapter();
|
|
323
325
|
const nextVersion = await this.getNextModelVersion();
|
|
324
|
-
const trajectoryIds =
|
|
326
|
+
const trajectoryIds =
|
|
327
|
+
await adapterForBatch.getTrajectoryIdsForTraining(maxTrajectories);
|
|
325
328
|
|
|
326
329
|
const insertedBatchId = await adapterForBatch.insertBatch({
|
|
327
330
|
id: batchId,
|
|
@@ -334,19 +337,22 @@ export class AutomationPipeline {
|
|
|
334
337
|
rewardsJson: JSON.stringify([]),
|
|
335
338
|
trainingLoss: null,
|
|
336
339
|
policyImprovement: null,
|
|
337
|
-
status:
|
|
340
|
+
status: "pending",
|
|
338
341
|
error: null,
|
|
339
342
|
createdAt: new Date(),
|
|
340
343
|
});
|
|
341
344
|
|
|
342
345
|
const batch = await adapterForBatch.getBatchById(insertedBatchId);
|
|
343
346
|
if (!batch) {
|
|
344
|
-
return {
|
|
345
|
-
|
|
347
|
+
return {
|
|
348
|
+
success: false,
|
|
349
|
+
error: "Failed to create training batch record",
|
|
350
|
+
};
|
|
351
|
+
}
|
|
346
352
|
|
|
347
353
|
// Determine training mode: 'tinker' for cloud-based or 'atropos' for local vLLM
|
|
348
|
-
const trainingMode = this.config.trainingMode ||
|
|
349
|
-
const useTinker = trainingMode.toLowerCase() ===
|
|
354
|
+
const trainingMode = this.config.trainingMode || "atropos";
|
|
355
|
+
const useTinker = trainingMode.toLowerCase() === "tinker";
|
|
350
356
|
|
|
351
357
|
// Trigger appropriate Python training script based on mode.
|
|
352
358
|
// Allow explicit override for packaged/runtime deployments.
|
|
@@ -354,10 +360,10 @@ export class AutomationPipeline {
|
|
|
354
360
|
this.config.trainerScriptPath ||
|
|
355
361
|
path.resolve(
|
|
356
362
|
this.config.pythonProjectRoot ||
|
|
357
|
-
path.resolve(process.cwd(),
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
useTinker ?
|
|
363
|
+
path.resolve(process.cwd(), "packages/training/python"),
|
|
364
|
+
"src",
|
|
365
|
+
"training",
|
|
366
|
+
useTinker ? "tinker_trainer.py" : "atropos_trainer.py",
|
|
361
367
|
);
|
|
362
368
|
|
|
363
369
|
try {
|
|
@@ -372,24 +378,24 @@ export class AutomationPipeline {
|
|
|
372
378
|
// Set environment variables for Python script
|
|
373
379
|
const env = {
|
|
374
380
|
...process.env,
|
|
375
|
-
MODE:
|
|
381
|
+
MODE: "single",
|
|
376
382
|
BATCH_ID: batchId,
|
|
377
383
|
MODEL_VERSION: nextVersion,
|
|
378
384
|
WINDOW_ID: windowId,
|
|
379
385
|
BASE_MODEL: modelSelection.modelPath,
|
|
380
|
-
MAX_EXAMPLES: dataLimit ? dataLimit.toString() :
|
|
381
|
-
DATABASE_URL: process.env.DATABASE_URL ||
|
|
382
|
-
ATROPOS_API_URL: this.config.atroposApiUrl ||
|
|
386
|
+
MAX_EXAMPLES: dataLimit ? dataLimit.toString() : "2000",
|
|
387
|
+
DATABASE_URL: process.env.DATABASE_URL || "",
|
|
388
|
+
ATROPOS_API_URL: this.config.atroposApiUrl || "http://localhost:8000",
|
|
383
389
|
VLLM_PORT: String(this.config.vllmPort || 9001),
|
|
384
|
-
FORCE_TRAINING: options.force ?
|
|
385
|
-
MIN_AGENTS_PER_WINDOW:
|
|
390
|
+
FORCE_TRAINING: options.force ? "true" : "false",
|
|
391
|
+
MIN_AGENTS_PER_WINDOW: "1",
|
|
386
392
|
TRAINING_MODE: trainingMode,
|
|
387
393
|
};
|
|
388
394
|
|
|
389
395
|
logger.info(
|
|
390
396
|
useTinker
|
|
391
|
-
?
|
|
392
|
-
:
|
|
397
|
+
? "Training will use Tinker cloud-based GRPO"
|
|
398
|
+
: "Training will use Atropos GRPO with vLLM",
|
|
393
399
|
{
|
|
394
400
|
trainingMode,
|
|
395
401
|
...(useTinker
|
|
@@ -400,36 +406,40 @@ export class AutomationPipeline {
|
|
|
400
406
|
model: env.BASE_MODEL,
|
|
401
407
|
}),
|
|
402
408
|
},
|
|
403
|
-
|
|
409
|
+
"AutomationPipeline",
|
|
404
410
|
);
|
|
405
411
|
|
|
406
412
|
const pythonCmd =
|
|
407
413
|
this.config.trainerPythonExecutable ||
|
|
408
|
-
(process.platform ===
|
|
414
|
+
(process.platform === "win32" ? "python" : "python3");
|
|
409
415
|
|
|
410
416
|
const trainingProcess = spawn(pythonCmd, [pythonScript], {
|
|
411
417
|
detached: false,
|
|
412
|
-
stdio: [
|
|
418
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
413
419
|
env,
|
|
414
420
|
});
|
|
415
421
|
|
|
416
422
|
// Capture and log training process output
|
|
417
|
-
trainingProcess.stdout?.on(
|
|
418
|
-
logger.info(
|
|
423
|
+
trainingProcess.stdout?.on("data", (data: Buffer) => {
|
|
424
|
+
logger.info("Training stdout", { output: data.toString().trim() });
|
|
419
425
|
});
|
|
420
426
|
|
|
421
|
-
trainingProcess.stderr?.on(
|
|
422
|
-
logger.warn(
|
|
427
|
+
trainingProcess.stderr?.on("data", (data: Buffer) => {
|
|
428
|
+
logger.warn("Training stderr", { output: data.toString().trim() });
|
|
423
429
|
});
|
|
424
430
|
|
|
425
|
-
trainingProcess.on(
|
|
426
|
-
logger.error(
|
|
431
|
+
trainingProcess.on("error", (error: Error) => {
|
|
432
|
+
logger.error("Training process error", { error: error.message });
|
|
427
433
|
getTrainingDataAdapter()
|
|
428
|
-
.updateBatchStatus(
|
|
434
|
+
.updateBatchStatus(
|
|
435
|
+
batchId,
|
|
436
|
+
"failed",
|
|
437
|
+
`Process spawn failed: ${error.message}`,
|
|
438
|
+
)
|
|
429
439
|
.catch((err: unknown) =>
|
|
430
|
-
logger.error(
|
|
440
|
+
logger.error("Failed to update batch status", {
|
|
431
441
|
error: err instanceof Error ? err : String(err),
|
|
432
|
-
})
|
|
442
|
+
}),
|
|
433
443
|
);
|
|
434
444
|
});
|
|
435
445
|
|
|
@@ -437,7 +447,7 @@ export class AutomationPipeline {
|
|
|
437
447
|
|
|
438
448
|
this.currentTrainingJob = batch.id;
|
|
439
449
|
|
|
440
|
-
logger.info(
|
|
450
|
+
logger.info("Training job triggered", {
|
|
441
451
|
batchId: batch.id,
|
|
442
452
|
version: nextVersion,
|
|
443
453
|
trajectories: exportResult.trajectoriesExported,
|
|
@@ -456,24 +466,17 @@ export class AutomationPipeline {
|
|
|
456
466
|
const latestModel = await getTrainingDataAdapter().getLatestModel();
|
|
457
467
|
|
|
458
468
|
if (!latestModel) {
|
|
459
|
-
return
|
|
469
|
+
return "v1.0.0";
|
|
460
470
|
}
|
|
461
471
|
|
|
462
472
|
// Increment patch version
|
|
463
473
|
const [major, minor, patch] = latestModel.version
|
|
464
474
|
.substring(1)
|
|
465
|
-
.split(
|
|
475
|
+
.split(".")
|
|
466
476
|
.map(Number);
|
|
467
477
|
return `v${major}.${minor}.${patch! + 1}`;
|
|
468
478
|
}
|
|
469
479
|
|
|
470
|
-
/**
|
|
471
|
-
* Get trajectory IDs for training
|
|
472
|
-
*/
|
|
473
|
-
private async getTrajectoryIds(limit?: number): Promise<string[]> {
|
|
474
|
-
return getTrainingDataAdapter().getTrajectoryIdsForTraining(limit);
|
|
475
|
-
}
|
|
476
|
-
|
|
477
480
|
/**
|
|
478
481
|
* Monitor training job.
|
|
479
482
|
*
|
|
@@ -484,18 +487,22 @@ export class AutomationPipeline {
|
|
|
484
487
|
const batch = await getTrainingDataAdapter().getBatchById(batchId);
|
|
485
488
|
|
|
486
489
|
if (!batch) {
|
|
487
|
-
return { status:
|
|
490
|
+
return { status: "not_found" };
|
|
488
491
|
}
|
|
489
492
|
|
|
490
493
|
// Terminal states – return immediately
|
|
491
|
-
if (batch.status ===
|
|
492
|
-
return { status:
|
|
494
|
+
if (batch.status === "completed") {
|
|
495
|
+
return { status: "completed", progress: 1.0, error: undefined };
|
|
493
496
|
}
|
|
494
|
-
if (batch.status ===
|
|
495
|
-
return {
|
|
497
|
+
if (batch.status === "failed") {
|
|
498
|
+
return {
|
|
499
|
+
status: "failed",
|
|
500
|
+
progress: 0,
|
|
501
|
+
error: batch.error || "Training failed",
|
|
502
|
+
};
|
|
496
503
|
}
|
|
497
|
-
if (batch.status ===
|
|
498
|
-
return { status:
|
|
504
|
+
if (batch.status === "pending") {
|
|
505
|
+
return { status: "pending", progress: 0 };
|
|
499
506
|
}
|
|
500
507
|
|
|
501
508
|
// For 'training' status, attempt to read the metrics log written by
|
|
@@ -506,12 +513,12 @@ export class AutomationPipeline {
|
|
|
506
513
|
const metricsLogPath = path.resolve(
|
|
507
514
|
this.config.dataStoragePath,
|
|
508
515
|
batchId,
|
|
509
|
-
|
|
516
|
+
"training_metrics.jsonl",
|
|
510
517
|
);
|
|
511
518
|
|
|
512
519
|
try {
|
|
513
|
-
const logContent = await fs.readFile(metricsLogPath,
|
|
514
|
-
const lines = logContent.trim().split(
|
|
520
|
+
const logContent = await fs.readFile(metricsLogPath, "utf-8");
|
|
521
|
+
const lines = logContent.trim().split("\n").filter(Boolean);
|
|
515
522
|
if (lines.length > 0) {
|
|
516
523
|
const lastLine = lines[lines.length - 1]!;
|
|
517
524
|
const lastMetric = JSON.parse(lastLine) as {
|
|
@@ -520,13 +527,13 @@ export class AutomationPipeline {
|
|
|
520
527
|
elapsed_ms?: number;
|
|
521
528
|
};
|
|
522
529
|
if (
|
|
523
|
-
typeof lastMetric.step ===
|
|
524
|
-
typeof lastMetric.total_steps ===
|
|
530
|
+
typeof lastMetric.step === "number" &&
|
|
531
|
+
typeof lastMetric.total_steps === "number" &&
|
|
525
532
|
lastMetric.total_steps > 0
|
|
526
533
|
) {
|
|
527
534
|
progress = lastMetric.step / lastMetric.total_steps;
|
|
528
535
|
// Estimate remaining time from elapsed
|
|
529
|
-
if (typeof lastMetric.elapsed_ms ===
|
|
536
|
+
if (typeof lastMetric.elapsed_ms === "number" && progress > 0) {
|
|
530
537
|
const totalEstimatedMs = lastMetric.elapsed_ms / progress;
|
|
531
538
|
eta = Math.max(0, totalEstimatedMs - lastMetric.elapsed_ms);
|
|
532
539
|
}
|
|
@@ -568,12 +575,12 @@ export class AutomationPipeline {
|
|
|
568
575
|
}
|
|
569
576
|
await fs.rmdir(batchDir);
|
|
570
577
|
logger.info(
|
|
571
|
-
|
|
578
|
+
"Cleaned up export files",
|
|
572
579
|
{ batchId, filesRemoved: files.length, dir: batchDir },
|
|
573
|
-
|
|
580
|
+
"AutomationPipeline",
|
|
574
581
|
);
|
|
575
582
|
} catch (err) {
|
|
576
|
-
logger.warn(
|
|
583
|
+
logger.warn("Failed to clean up export files", {
|
|
577
584
|
batchId,
|
|
578
585
|
dir: batchDir,
|
|
579
586
|
error: err instanceof Error ? err.message : String(err),
|
|
@@ -585,17 +592,17 @@ export class AutomationPipeline {
|
|
|
585
592
|
* Automation loop (called by cron)
|
|
586
593
|
*/
|
|
587
594
|
async runAutomationCycle(): Promise<void> {
|
|
588
|
-
logger.info(
|
|
595
|
+
logger.info("Running automation cycle");
|
|
589
596
|
|
|
590
597
|
// Check if training is already running
|
|
591
598
|
if (this.currentTrainingJob) {
|
|
592
599
|
const status = await this.monitorTraining(this.currentTrainingJob);
|
|
593
|
-
if (status.status ===
|
|
600
|
+
if (status.status === "completed") {
|
|
594
601
|
await this.deployModel(this.currentTrainingJob);
|
|
595
602
|
await this.cleanupExportFiles(this.currentTrainingJob);
|
|
596
603
|
this.currentTrainingJob = null;
|
|
597
|
-
} else if (status.status ===
|
|
598
|
-
logger.error(
|
|
604
|
+
} else if (status.status === "failed") {
|
|
605
|
+
logger.error("Training job failed", {
|
|
599
606
|
batchId: this.currentTrainingJob,
|
|
600
607
|
});
|
|
601
608
|
await this.cleanupExportFiles(this.currentTrainingJob);
|
|
@@ -612,11 +619,11 @@ export class AutomationPipeline {
|
|
|
612
619
|
if (newlyCompleted) {
|
|
613
620
|
const alreadyDeployed = await da.getModelByBatchAndStatus(
|
|
614
621
|
newlyCompleted.batchId,
|
|
615
|
-
|
|
622
|
+
"deployed",
|
|
616
623
|
);
|
|
617
624
|
|
|
618
625
|
if (!alreadyDeployed) {
|
|
619
|
-
logger.info(
|
|
626
|
+
logger.info("Found newly completed training batch", {
|
|
620
627
|
batchId: newlyCompleted.batchId,
|
|
621
628
|
});
|
|
622
629
|
await this.deployModel(newlyCompleted.batchId);
|
|
@@ -633,7 +640,7 @@ export class AutomationPipeline {
|
|
|
633
640
|
: 999;
|
|
634
641
|
|
|
635
642
|
if (hoursSinceLastTraining >= this.config.trainingInterval) {
|
|
636
|
-
logger.info(
|
|
643
|
+
logger.info("Triggering automatic training", readiness.stats);
|
|
637
644
|
await this.triggerTraining();
|
|
638
645
|
}
|
|
639
646
|
}
|
|
@@ -641,11 +648,13 @@ export class AutomationPipeline {
|
|
|
641
648
|
// Track market outcomes for recent windows (optional — only if market adapter registered)
|
|
642
649
|
const marketAdapter = getMarketDataAdapter();
|
|
643
650
|
if (marketAdapter) {
|
|
644
|
-
const { MarketOutcomesTracker: MOT } = await import(
|
|
651
|
+
const { MarketOutcomesTracker: MOT } = await import(
|
|
652
|
+
"./MarketOutcomesTracker"
|
|
653
|
+
);
|
|
645
654
|
const outcomesTracker = new MOT();
|
|
646
655
|
const synced = await outcomesTracker.syncRecentWindows(24);
|
|
647
656
|
if (synced > 0) {
|
|
648
|
-
logger.info(
|
|
657
|
+
logger.info("Synced market outcomes for windows", {
|
|
649
658
|
windowsSynced: synced,
|
|
650
659
|
});
|
|
651
660
|
}
|
|
@@ -653,7 +662,7 @@ export class AutomationPipeline {
|
|
|
653
662
|
const processed =
|
|
654
663
|
await rewardBackpropagationService.processPendingWindows();
|
|
655
664
|
if (processed > 0) {
|
|
656
|
-
logger.info(
|
|
665
|
+
logger.info("Updated rewards for trajectories", {
|
|
657
666
|
windowsProcessed: processed,
|
|
658
667
|
});
|
|
659
668
|
}
|
|
@@ -664,7 +673,7 @@ export class AutomationPipeline {
|
|
|
664
673
|
const windowId = getPreviousWindowId(hoursAgo);
|
|
665
674
|
const scored = await rulerScoringService.scoreWindow(windowId);
|
|
666
675
|
if (scored > 0) {
|
|
667
|
-
logger.info(
|
|
676
|
+
logger.info("Scored trajectories with RULER", { windowId, scored });
|
|
668
677
|
}
|
|
669
678
|
}
|
|
670
679
|
|
|
@@ -682,18 +691,18 @@ export class AutomationPipeline {
|
|
|
682
691
|
const batch = await da.getBatchById(batchId);
|
|
683
692
|
|
|
684
693
|
if (!batch) {
|
|
685
|
-
logger.warn(
|
|
694
|
+
logger.warn("Batch not found for deployment", { batchId });
|
|
686
695
|
return;
|
|
687
696
|
}
|
|
688
697
|
|
|
689
|
-
const model = await da.getModelByBatchAndStatus(batch.id,
|
|
698
|
+
const model = await da.getModelByBatchAndStatus(batch.id, "ready");
|
|
690
699
|
|
|
691
700
|
if (!model) {
|
|
692
|
-
logger.warn(
|
|
701
|
+
logger.warn("Model not found for batch", { batchId });
|
|
693
702
|
return;
|
|
694
703
|
}
|
|
695
704
|
|
|
696
|
-
logger.info(
|
|
705
|
+
logger.info("Deploying model", {
|
|
697
706
|
version: batch.modelVersion,
|
|
698
707
|
modelId: model.modelId,
|
|
699
708
|
batchId,
|
|
@@ -703,17 +712,17 @@ export class AutomationPipeline {
|
|
|
703
712
|
let trajectoryIds: string[];
|
|
704
713
|
if (
|
|
705
714
|
!batch.trajectoryIds ||
|
|
706
|
-
batch.trajectoryIds ===
|
|
707
|
-
batch.trajectoryIds ===
|
|
715
|
+
batch.trajectoryIds === "null" ||
|
|
716
|
+
batch.trajectoryIds === "[]"
|
|
708
717
|
) {
|
|
709
|
-
logger.warn(
|
|
718
|
+
logger.warn("Training batch has invalid trajectoryIds", {
|
|
710
719
|
batchId: batch.id,
|
|
711
720
|
});
|
|
712
721
|
trajectoryIds = [];
|
|
713
722
|
} else {
|
|
714
723
|
trajectoryIds = JSON.parse(batch.trajectoryIds) as string[];
|
|
715
724
|
if (!Array.isArray(trajectoryIds)) {
|
|
716
|
-
logger.warn(
|
|
725
|
+
logger.warn("Training batch trajectoryIds is not an array", {
|
|
717
726
|
batchId: batch.id,
|
|
718
727
|
});
|
|
719
728
|
trajectoryIds = [];
|
|
@@ -724,11 +733,11 @@ export class AutomationPipeline {
|
|
|
724
733
|
await da.markTrajectoriesAsUsed(trajectoryIds, batch.id);
|
|
725
734
|
}
|
|
726
735
|
|
|
727
|
-
await da.updateModelStatus(model.modelId,
|
|
736
|
+
await da.updateModelStatus(model.modelId, "deployed", {
|
|
728
737
|
deployedAt: new Date(),
|
|
729
738
|
});
|
|
730
739
|
|
|
731
|
-
logger.info(
|
|
740
|
+
logger.info("Model deployed", {
|
|
732
741
|
version: batch.modelVersion,
|
|
733
742
|
modelId: model.modelId,
|
|
734
743
|
});
|
|
@@ -740,7 +749,7 @@ export class AutomationPipeline {
|
|
|
740
749
|
*/
|
|
741
750
|
async benchmarkAndDeploy(
|
|
742
751
|
batchId: string,
|
|
743
|
-
autoDeploy = true
|
|
752
|
+
autoDeploy = true,
|
|
744
753
|
): Promise<{
|
|
745
754
|
benchmarked: boolean;
|
|
746
755
|
deployed: boolean;
|
|
@@ -750,37 +759,37 @@ export class AutomationPipeline {
|
|
|
750
759
|
const batch = await da.getBatchById(batchId);
|
|
751
760
|
|
|
752
761
|
if (!batch) {
|
|
753
|
-
return { benchmarked: false, deployed: false, reason:
|
|
762
|
+
return { benchmarked: false, deployed: false, reason: "Batch not found" };
|
|
754
763
|
}
|
|
755
764
|
|
|
756
|
-
const model = await da.getModelByBatchAndStatus(batch.id,
|
|
765
|
+
const model = await da.getModelByBatchAndStatus(batch.id, "ready");
|
|
757
766
|
|
|
758
767
|
if (!model) {
|
|
759
|
-
return { benchmarked: false, deployed: false, reason:
|
|
768
|
+
return { benchmarked: false, deployed: false, reason: "Model not found" };
|
|
760
769
|
}
|
|
761
770
|
|
|
762
771
|
// Benchmark the model
|
|
763
772
|
logger.info(
|
|
764
|
-
|
|
773
|
+
"Benchmarking model...",
|
|
765
774
|
{ modelId: model.modelId },
|
|
766
|
-
|
|
775
|
+
"AutomationPipeline",
|
|
767
776
|
);
|
|
768
777
|
const benchmarkResults = await benchmarkService.benchmarkModel(
|
|
769
|
-
model.modelId
|
|
778
|
+
model.modelId,
|
|
770
779
|
);
|
|
771
780
|
|
|
772
781
|
// Compare with previous models
|
|
773
782
|
const comparison = await benchmarkService.compareModels(model.modelId);
|
|
774
783
|
|
|
775
784
|
logger.info(
|
|
776
|
-
|
|
785
|
+
"Benchmark complete",
|
|
777
786
|
{
|
|
778
787
|
modelId: model.modelId,
|
|
779
788
|
score: benchmarkResults.benchmarkScore,
|
|
780
789
|
shouldDeploy: comparison.shouldDeploy,
|
|
781
790
|
reason: comparison.reason,
|
|
782
791
|
},
|
|
783
|
-
|
|
792
|
+
"AutomationPipeline",
|
|
784
793
|
);
|
|
785
794
|
|
|
786
795
|
// Deploy if performance is good enough (and autoDeploy is enabled)
|
|
@@ -796,7 +805,7 @@ export class AutomationPipeline {
|
|
|
796
805
|
return {
|
|
797
806
|
benchmarked: true,
|
|
798
807
|
deployed: false,
|
|
799
|
-
reason: comparison.reason ||
|
|
808
|
+
reason: comparison.reason || "Performance below threshold",
|
|
800
809
|
};
|
|
801
810
|
}
|
|
802
811
|
|
|
@@ -821,13 +830,13 @@ export class AutomationPipeline {
|
|
|
821
830
|
const da = getTrainingDataAdapter();
|
|
822
831
|
const dbOk = await da.healthCheck();
|
|
823
832
|
if (!dbOk) {
|
|
824
|
-
logger.warn(
|
|
833
|
+
logger.warn("Health check: database unreachable");
|
|
825
834
|
}
|
|
826
835
|
|
|
827
836
|
const oneHourAgo = new Date(Date.now() - 60 * 60 * 1000);
|
|
828
837
|
const last1h = await da.countTrajectoriesSince(oneHourAgo);
|
|
829
838
|
if (last1h < 1) {
|
|
830
|
-
logger.warn(
|
|
839
|
+
logger.warn("Low data collection rate", { trajectoriesLastHour: last1h });
|
|
831
840
|
}
|
|
832
841
|
|
|
833
842
|
// Ensure storage directories exist
|
|
@@ -891,10 +900,10 @@ export class AutomationPipeline {
|
|
|
891
900
|
training: {
|
|
892
901
|
currentJob: this.currentTrainingJob,
|
|
893
902
|
lastCompleted: lastCompleted?.completedAt || null,
|
|
894
|
-
nextScheduled: lastCompleted
|
|
903
|
+
nextScheduled: lastCompleted?.completedAt
|
|
895
904
|
? new Date(
|
|
896
|
-
lastCompleted.completedAt
|
|
897
|
-
this.config.trainingInterval * 60 * 60 * 1000
|
|
905
|
+
lastCompleted.completedAt.getTime() +
|
|
906
|
+
this.config.trainingInterval * 60 * 60 * 1000,
|
|
898
907
|
)
|
|
899
908
|
: null,
|
|
900
909
|
},
|