@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/scripts/rank_trajectories.ts +0 -1
- package/scripts/run_task_benchmark.ts +4 -11
- package/src/adapter.ts +96 -49
- package/src/archetypes/ArchetypeConfigService.ts +188 -185
- package/src/archetypes/derive-archetype.ts +47 -47
- package/src/archetypes/index.ts +2 -2
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
- package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
- package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
- package/src/benchmark/BenchmarkDataViewer.ts +32 -30
- package/src/benchmark/BenchmarkHistoryService.ts +13 -12
- package/src/benchmark/BenchmarkRunner.ts +87 -83
- package/src/benchmark/BenchmarkValidator.ts +48 -46
- package/src/benchmark/FastEvalRunner.ts +17 -16
- package/src/benchmark/MetricsValidator.ts +20 -21
- package/src/benchmark/MetricsVisualizer.ts +92 -85
- package/src/benchmark/ModelBenchmarkService.ts +90 -82
- package/src/benchmark/ModelRegistry.ts +44 -44
- package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
- package/src/benchmark/SimulationA2AInterface.ts +118 -118
- package/src/benchmark/SimulationEngine.ts +51 -51
- package/src/benchmark/TaskRunner.ts +87 -79
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
- package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
- package/src/benchmark/index.ts +27 -27
- package/src/benchmark/parseSimulationMetrics.ts +32 -32
- package/src/benchmark/simulation-types.ts +10 -10
- package/src/dependencies.ts +34 -34
- package/src/generation/TrajectoryGenerator.ts +39 -37
- package/src/generation/index.ts +1 -1
- package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
- package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
- package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
- package/src/huggingface/index.ts +6 -6
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
- package/src/index.ts +27 -27
- package/src/init-training.ts +6 -6
- package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
- package/src/metrics/index.ts +2 -2
- package/src/rubrics/__tests__/index.test.ts +73 -73
- package/src/rubrics/ass-kisser.ts +6 -6
- package/src/rubrics/degen.ts +6 -6
- package/src/rubrics/goody-twoshoes.ts +6 -6
- package/src/rubrics/index.ts +50 -50
- package/src/rubrics/information-trader.ts +6 -6
- package/src/rubrics/infosec.ts +6 -6
- package/src/rubrics/liar.ts +6 -6
- package/src/rubrics/perps-trader.ts +6 -6
- package/src/rubrics/researcher.ts +6 -6
- package/src/rubrics/scammer.ts +6 -6
- package/src/rubrics/social-butterfly.ts +7 -7
- package/src/rubrics/super-predictor.ts +6 -6
- package/src/rubrics/trader.ts +5 -5
- package/src/scoring/ArchetypeScoringService.ts +56 -54
- package/src/scoring/JudgePromptBuilder.ts +96 -96
- package/src/scoring/LLMJudgeCache.ts +26 -23
- package/src/scoring/index.ts +3 -3
- package/src/training/AutomationPipeline.ts +149 -140
- package/src/training/BenchmarkService.ts +49 -45
- package/src/training/ConfigValidator.ts +38 -32
- package/src/training/MarketOutcomesTracker.ts +22 -12
- package/src/training/ModelDeployer.ts +15 -15
- package/src/training/ModelFetcher.ts +7 -7
- package/src/training/ModelSelectionService.ts +32 -32
- package/src/training/ModelUsageVerifier.ts +31 -24
- package/src/training/MultiModelOrchestrator.ts +44 -44
- package/src/training/RLModelConfig.ts +57 -57
- package/src/training/RewardBackpropagationService.ts +18 -17
- package/src/training/RulerScoringService.ts +73 -72
- package/src/training/TrainingMonitor.ts +29 -29
- package/src/training/TrajectoryRecorder.ts +25 -27
- package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
- package/src/training/index.ts +36 -36
- package/src/training/logRLConfig.ts +7 -7
- package/src/training/pipeline.ts +13 -16
- package/src/training/storage/ModelStorageService.ts +32 -32
- package/src/training/storage/TrainingDataArchiver.ts +21 -21
- package/src/training/storage/index.ts +2 -2
- package/src/training/types.ts +6 -6
- package/src/training/window-utils.ts +14 -14
- package/src/utils/index.ts +7 -7
- package/src/utils/logger.ts +5 -5
- package/src/utils/snowflake.ts +1 -1
- package/src/utils/synthetic-detector.ts +7 -7
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@elizaos/training",
|
|
3
|
-
"version": "2.0.0-alpha.13",
|
|
3
|
+
"version": "2.0.0-alpha.14",
|
|
4
4
|
"description": "ElizaOS RL training pipeline with benchmarking and model publishing support",
|
|
5
5
|
"main": "./src/index.ts",
|
|
6
6
|
"types": "./src/index.ts",
|
|
@@ -53,5 +53,5 @@
|
|
|
53
53
|
"bun-types": "^1.3.2",
|
|
54
54
|
"typescript": "^5.9.3"
|
|
55
55
|
},
|
|
56
|
-
"gitHead": "
|
|
56
|
+
"gitHead": "4eb31c47081d48bec956e6b9751f3c9aee3eb38d"
|
|
57
57
|
}
|
|
package/scripts/run_task_benchmark.ts
CHANGED
|
@@ -50,7 +50,6 @@ class BenchmarkRuntimeManager implements IAgentRuntimeManager {
|
|
|
50
50
|
// Create a new runtime
|
|
51
51
|
const character = {
|
|
52
52
|
name: 'BenchmarkAgent',
|
|
53
|
-
modelProvider: "openai" as any,
|
|
54
53
|
bio: 'A helpful assistant for benchmarking.',
|
|
55
54
|
settings: {
|
|
56
55
|
secrets: {
|
|
@@ -60,13 +59,7 @@ class BenchmarkRuntimeManager implements IAgentRuntimeManager {
|
|
|
60
59
|
};
|
|
61
60
|
|
|
62
61
|
const runtime = new AgentRuntime({
|
|
63
|
-
token: process.env.OPENAI_API_KEY || '',
|
|
64
|
-
modelProvider: "openai" as any,
|
|
65
62
|
character,
|
|
66
|
-
plugins: [],
|
|
67
|
-
providers: [],
|
|
68
|
-
actions: [],
|
|
69
|
-
evaluators: [],
|
|
70
63
|
});
|
|
71
64
|
|
|
72
65
|
// We must initialize with allowNoDatabase to avoid DB error
|
|
@@ -227,11 +220,11 @@ async function main() {
|
|
|
227
220
|
agentService: new BenchmarkAgentService(),
|
|
228
221
|
agentRuntimeManager: new BenchmarkRuntimeManager(),
|
|
229
222
|
autonomousCoordinator: {
|
|
230
|
-
executeAutonomousTick: async () => ({ success: true })
|
|
231
|
-
}
|
|
223
|
+
executeAutonomousTick: async () => ({ success: true }),
|
|
224
|
+
},
|
|
232
225
|
llmCaller: {
|
|
233
|
-
callGroqDirect: async () => "mock response"
|
|
234
|
-
}
|
|
226
|
+
callGroqDirect: async () => "mock response",
|
|
227
|
+
},
|
|
235
228
|
});
|
|
236
229
|
|
|
237
230
|
// Import task interactor config
|
package/src/adapter.ts
CHANGED
|
@@ -24,7 +24,7 @@ export type JsonValue =
|
|
|
24
24
|
/**
|
|
25
25
|
* UUID-like string identifier.
|
|
26
26
|
*/
|
|
27
|
-
export type UUID = string & { readonly __brand:
|
|
27
|
+
export type UUID = string & { readonly __brand: "UUID" };
|
|
28
28
|
|
|
29
29
|
// ─── Record types (replace schema-derived types from @elizaos/db) ───────
|
|
30
30
|
|
|
@@ -170,7 +170,9 @@ export interface ITrainingDataAdapter {
|
|
|
170
170
|
* Get scenario groups with counts.
|
|
171
171
|
* Returns groups where count >= minGroupSize.
|
|
172
172
|
*/
|
|
173
|
-
getScenarioGroups(
|
|
173
|
+
getScenarioGroups(
|
|
174
|
+
minGroupSize: number,
|
|
175
|
+
): Promise<Array<{ scenarioId: string | null; count: number }>>;
|
|
174
176
|
|
|
175
177
|
/**
|
|
176
178
|
* Sample recent trajectories for data quality assessment.
|
|
@@ -194,10 +196,17 @@ export interface ITrainingDataAdapter {
|
|
|
194
196
|
getTrajectoryById(trajectoryId: string): Promise<TrajectoryRecord | null>;
|
|
195
197
|
|
|
196
198
|
/** Mark trajectories as used in a training batch. */
|
|
197
|
-
markTrajectoriesAsUsed(
|
|
199
|
+
markTrajectoriesAsUsed(
|
|
200
|
+
trajectoryIds: string[],
|
|
201
|
+
batchId: string,
|
|
202
|
+
): Promise<void>;
|
|
198
203
|
|
|
199
204
|
/** Update trajectory reward data. */
|
|
200
|
-
updateTrajectoryRewards(
|
|
205
|
+
updateTrajectoryRewards(
|
|
206
|
+
id: string,
|
|
207
|
+
stepsJson: string,
|
|
208
|
+
totalReward: number,
|
|
209
|
+
): Promise<void>;
|
|
201
210
|
|
|
202
211
|
/** Update trajectory with judge score. */
|
|
203
212
|
updateTrajectoryScore(
|
|
@@ -207,7 +216,9 @@ export interface ITrainingDataAdapter {
|
|
|
207
216
|
): Promise<void>;
|
|
208
217
|
|
|
209
218
|
/** Insert a new trajectory record. */
|
|
210
|
-
insertTrajectory(
|
|
219
|
+
insertTrajectory(
|
|
220
|
+
data: Omit<TrajectoryRecord, "createdAt" | "updatedAt">,
|
|
221
|
+
): Promise<void>;
|
|
211
222
|
|
|
212
223
|
/**
|
|
213
224
|
* Count trajectories created since a given timestamp.
|
|
@@ -227,7 +238,10 @@ export interface ITrainingDataAdapter {
|
|
|
227
238
|
getModelByVersion(version: string): Promise<TrainedModelRecord | null>;
|
|
228
239
|
|
|
229
240
|
/** Get model associated with a training batch and status. */
|
|
230
|
-
getModelByBatchAndStatus(
|
|
241
|
+
getModelByBatchAndStatus(
|
|
242
|
+
batchId: string,
|
|
243
|
+
status: string,
|
|
244
|
+
): Promise<TrainedModelRecord | null>;
|
|
231
245
|
|
|
232
246
|
/** Count deployed models. */
|
|
233
247
|
countDeployedModels(): Promise<number>;
|
|
@@ -254,7 +268,7 @@ export interface ITrainingDataAdapter {
|
|
|
254
268
|
updateModelHuggingFaceRepo(modelId: string, repoName: string): Promise<void>;
|
|
255
269
|
|
|
256
270
|
/** Insert a new trained model record. */
|
|
257
|
-
insertModel(data: Omit<TrainedModelRecord,
|
|
271
|
+
insertModel(data: Omit<TrainedModelRecord, "createdAt">): Promise<void>;
|
|
258
272
|
|
|
259
273
|
// ── Batch operations ───────────────────────────────────────────────
|
|
260
274
|
|
|
@@ -271,10 +285,16 @@ export interface ITrainingDataAdapter {
|
|
|
271
285
|
getLastCompletedBatch(): Promise<TrainingBatchRecord | null>;
|
|
272
286
|
|
|
273
287
|
/** Update batch status. */
|
|
274
|
-
updateBatchStatus(
|
|
288
|
+
updateBatchStatus(
|
|
289
|
+
batchId: string,
|
|
290
|
+
status: string,
|
|
291
|
+
error?: string,
|
|
292
|
+
): Promise<void>;
|
|
275
293
|
|
|
276
294
|
/** Insert a new training batch. */
|
|
277
|
-
insertBatch(
|
|
295
|
+
insertBatch(
|
|
296
|
+
data: Omit<TrainingBatchRecord, "startedAt" | "completedAt">,
|
|
297
|
+
): Promise<string>;
|
|
278
298
|
|
|
279
299
|
// ── Benchmark operations ───────────────────────────────────────────
|
|
280
300
|
|
|
@@ -285,13 +305,15 @@ export interface ITrainingDataAdapter {
|
|
|
285
305
|
countBenchmarksSince(since: Date): Promise<number>;
|
|
286
306
|
|
|
287
307
|
/** Insert a benchmark result. */
|
|
288
|
-
insertBenchmarkResult(
|
|
308
|
+
insertBenchmarkResult(
|
|
309
|
+
data: Omit<BenchmarkResultRecord, "createdAt">,
|
|
310
|
+
): Promise<void>;
|
|
289
311
|
|
|
290
312
|
// ── User/Agent operations ──────────────────────────────────────────
|
|
291
313
|
|
|
292
314
|
/** Get agent users (isAgent=true). Supports optional strategy filtering. */
|
|
293
315
|
getAgentUsers(filter?: {
|
|
294
|
-
strategy?:
|
|
316
|
+
strategy?: "all" | "gradual" | "test";
|
|
295
317
|
rolloutPercentage?: number;
|
|
296
318
|
testAgentIds?: string[];
|
|
297
319
|
}): Promise<UserRecord[]>;
|
|
@@ -317,7 +339,10 @@ export interface ITrainingDataAdapter {
|
|
|
317
339
|
createAgentConfig(data: Record<string, unknown>): Promise<void>;
|
|
318
340
|
|
|
319
341
|
/** Update an agent configuration by userId. */
|
|
320
|
-
updateAgentConfig(
|
|
342
|
+
updateAgentConfig(
|
|
343
|
+
userId: string,
|
|
344
|
+
data: Record<string, unknown>,
|
|
345
|
+
): Promise<void>;
|
|
321
346
|
|
|
322
347
|
/**
|
|
323
348
|
* Flexible benchmark result query with optional filters.
|
|
@@ -332,15 +357,17 @@ export interface ITrainingDataAdapter {
|
|
|
332
357
|
}): Promise<BenchmarkResultRecord[]>;
|
|
333
358
|
|
|
334
359
|
/** Aggregate benchmark statistics per model, ordered by avgPnl descending. */
|
|
335
|
-
getBenchmarkModelSummary(): Promise<
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
360
|
+
getBenchmarkModelSummary(): Promise<
|
|
361
|
+
Array<{
|
|
362
|
+
modelId: string;
|
|
363
|
+
runCount: number;
|
|
364
|
+
avgPnl: number;
|
|
365
|
+
avgAccuracy: number;
|
|
366
|
+
avgOptimality: number;
|
|
367
|
+
bestPnl: number;
|
|
368
|
+
latestRun: Date;
|
|
369
|
+
}>
|
|
370
|
+
>;
|
|
344
371
|
|
|
345
372
|
/**
|
|
346
373
|
* Get scored training trajectories (isTrainingData=true with judge scores).
|
|
@@ -366,10 +393,15 @@ export interface ITrainingDataAdapter {
|
|
|
366
393
|
// ── Additional operations (added for service refactoring) ────────
|
|
367
394
|
|
|
368
395
|
/** Get the best benchmarked model, optionally excluding a model ID. Status 'ready'/'deployed', non-null benchmarkScore, ordered by score desc. */
|
|
369
|
-
getBestBenchmarkedModel(
|
|
396
|
+
getBestBenchmarkedModel(
|
|
397
|
+
excludeModelId?: string,
|
|
398
|
+
): Promise<TrainedModelRecord | null>;
|
|
370
399
|
|
|
371
400
|
/** Update model with detailed benchmark results (score, accuracy, eval metrics). */
|
|
372
|
-
updateModelBenchmarkResults(
|
|
401
|
+
updateModelBenchmarkResults(
|
|
402
|
+
modelId: string,
|
|
403
|
+
data: { benchmarkScore: number; accuracy: number; evalMetrics: JsonValue },
|
|
404
|
+
): Promise<void>;
|
|
373
405
|
|
|
374
406
|
/** Get models with benchmark scores, ordered by score descending. */
|
|
375
407
|
getBenchmarkedModels(limit: number): Promise<TrainedModelRecord[]>;
|
|
@@ -387,7 +419,10 @@ export interface ITrainingDataAdapter {
|
|
|
387
419
|
getTrajectoriesByIds(trajectoryIds: string[]): Promise<TrajectoryRecord[]>;
|
|
388
420
|
|
|
389
421
|
/** Get unscored trajectories, optionally filtered by IDs or limited. */
|
|
390
|
-
getUnscoredTrajectories(options?: {
|
|
422
|
+
getUnscoredTrajectories(options?: {
|
|
423
|
+
trajectoryIds?: string[];
|
|
424
|
+
limit?: number;
|
|
425
|
+
}): Promise<TrajectoryRecord[]>;
|
|
391
426
|
|
|
392
427
|
/** Get unscored trajectory IDs for a specific window. */
|
|
393
428
|
getUnscoredWindowTrajectoryIds(windowId: string): Promise<string[]>;
|
|
@@ -399,32 +434,44 @@ export interface ITrainingDataAdapter {
|
|
|
399
434
|
*/
|
|
400
435
|
export interface IMarketDataAdapter {
|
|
401
436
|
/** Get perpetual positions within a time window. */
|
|
402
|
-
getPerpPositionsForWindow(
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
437
|
+
getPerpPositionsForWindow(
|
|
438
|
+
windowStart: Date,
|
|
439
|
+
windowEnd: Date,
|
|
440
|
+
): Promise<
|
|
441
|
+
Array<{
|
|
442
|
+
id: string;
|
|
443
|
+
ticker?: string;
|
|
444
|
+
direction: string;
|
|
445
|
+
entryPrice: number;
|
|
446
|
+
currentPrice?: number | null;
|
|
447
|
+
exitPrice: number | null;
|
|
448
|
+
closedAt?: Date | null;
|
|
449
|
+
pnl: number | null;
|
|
450
|
+
[key: string]: JsonValue | Date | null | undefined;
|
|
451
|
+
}>
|
|
452
|
+
>;
|
|
413
453
|
|
|
414
454
|
/** Get resolved prediction markets within a time window. */
|
|
415
|
-
getResolvedMarketsForWindow(
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
455
|
+
getResolvedMarketsForWindow(
|
|
456
|
+
windowStart: Date,
|
|
457
|
+
windowEnd: Date,
|
|
458
|
+
): Promise<
|
|
459
|
+
Array<{
|
|
460
|
+
id: string;
|
|
461
|
+
question: string;
|
|
462
|
+
outcome: boolean | null;
|
|
463
|
+
finalProbability: number | null;
|
|
464
|
+
[key: string]: JsonValue | boolean | Date | null | undefined;
|
|
465
|
+
}>
|
|
466
|
+
>;
|
|
422
467
|
|
|
423
468
|
/** Get market outcomes for a window ID. */
|
|
424
|
-
getMarketOutcomesByWindow(windowId: string): Promise<
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
469
|
+
getMarketOutcomesByWindow(windowId: string): Promise<
|
|
470
|
+
Array<{
|
|
471
|
+
windowId: string;
|
|
472
|
+
[key: string]: JsonValue | undefined;
|
|
473
|
+
}>
|
|
474
|
+
>;
|
|
428
475
|
|
|
429
476
|
/** Insert a market outcome record. */
|
|
430
477
|
insertMarketOutcome(data: Record<string, JsonValue>): Promise<void>;
|
|
@@ -462,9 +509,9 @@ let _llmLogAdapter: ILlmLogAdapter | null = null;
|
|
|
462
509
|
* Must be called before any training operations that need database access.
|
|
463
510
|
*/
|
|
464
511
|
export function setTrainingDataAdapter(adapter: ITrainingDataAdapter): void {
|
|
465
|
-
if (!adapter || typeof adapter.countScoredTrajectoriesReady !==
|
|
512
|
+
if (!adapter || typeof adapter.countScoredTrajectoriesReady !== "function") {
|
|
466
513
|
throw new TypeError(
|
|
467
|
-
|
|
514
|
+
"setTrainingDataAdapter: provided object does not implement ITrainingDataAdapter",
|
|
468
515
|
);
|
|
469
516
|
}
|
|
470
517
|
_dataAdapter = adapter;
|
|
@@ -487,7 +534,7 @@ export function setLlmLogAdapter(adapter: ILlmLogAdapter): void {
|
|
|
487
534
|
export function getTrainingDataAdapter(): ITrainingDataAdapter {
|
|
488
535
|
if (!_dataAdapter) {
|
|
489
536
|
throw new Error(
|
|
490
|
-
|
|
537
|
+
"Training data adapter not registered. Call setTrainingDataAdapter() before using training operations.",
|
|
491
538
|
);
|
|
492
539
|
}
|
|
493
540
|
return _dataAdapter;
|