@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773726941205.json +38 -0
- package/scripts/rank_trajectories.ts +0 -1
- package/scripts/run_task_benchmark.ts +4 -11
- package/src/adapter.ts +96 -49
- package/src/archetypes/ArchetypeConfigService.ts +188 -185
- package/src/archetypes/derive-archetype.ts +47 -47
- package/src/archetypes/index.ts +2 -2
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
- package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
- package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
- package/src/benchmark/BenchmarkDataViewer.ts +32 -30
- package/src/benchmark/BenchmarkHistoryService.ts +13 -12
- package/src/benchmark/BenchmarkRunner.ts +87 -83
- package/src/benchmark/BenchmarkValidator.ts +48 -46
- package/src/benchmark/FastEvalRunner.ts +17 -16
- package/src/benchmark/MetricsValidator.ts +20 -21
- package/src/benchmark/MetricsVisualizer.ts +92 -85
- package/src/benchmark/ModelBenchmarkService.ts +90 -82
- package/src/benchmark/ModelRegistry.ts +44 -44
- package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
- package/src/benchmark/SimulationA2AInterface.ts +118 -118
- package/src/benchmark/SimulationEngine.ts +51 -51
- package/src/benchmark/TaskRunner.ts +87 -79
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
- package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
- package/src/benchmark/index.ts +27 -27
- package/src/benchmark/parseSimulationMetrics.ts +32 -32
- package/src/benchmark/simulation-types.ts +10 -10
- package/src/dependencies.ts +34 -34
- package/src/generation/TrajectoryGenerator.ts +39 -37
- package/src/generation/index.ts +1 -1
- package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
- package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
- package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
- package/src/huggingface/index.ts +6 -6
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
- package/src/index.ts +27 -27
- package/src/init-training.ts +6 -6
- package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
- package/src/metrics/index.ts +2 -2
- package/src/rubrics/__tests__/index.test.ts +73 -73
- package/src/rubrics/ass-kisser.ts +6 -6
- package/src/rubrics/degen.ts +6 -6
- package/src/rubrics/goody-twoshoes.ts +6 -6
- package/src/rubrics/index.ts +50 -50
- package/src/rubrics/information-trader.ts +6 -6
- package/src/rubrics/infosec.ts +6 -6
- package/src/rubrics/liar.ts +6 -6
- package/src/rubrics/perps-trader.ts +6 -6
- package/src/rubrics/researcher.ts +6 -6
- package/src/rubrics/scammer.ts +6 -6
- package/src/rubrics/social-butterfly.ts +7 -7
- package/src/rubrics/super-predictor.ts +6 -6
- package/src/rubrics/trader.ts +5 -5
- package/src/scoring/ArchetypeScoringService.ts +56 -54
- package/src/scoring/JudgePromptBuilder.ts +96 -96
- package/src/scoring/LLMJudgeCache.ts +26 -23
- package/src/scoring/index.ts +3 -3
- package/src/training/AutomationPipeline.ts +149 -140
- package/src/training/BenchmarkService.ts +49 -45
- package/src/training/ConfigValidator.ts +38 -32
- package/src/training/MarketOutcomesTracker.ts +22 -12
- package/src/training/ModelDeployer.ts +15 -15
- package/src/training/ModelFetcher.ts +7 -7
- package/src/training/ModelSelectionService.ts +32 -32
- package/src/training/ModelUsageVerifier.ts +31 -24
- package/src/training/MultiModelOrchestrator.ts +44 -44
- package/src/training/RLModelConfig.ts +57 -57
- package/src/training/RewardBackpropagationService.ts +18 -17
- package/src/training/RulerScoringService.ts +73 -72
- package/src/training/TrainingMonitor.ts +29 -29
- package/src/training/TrajectoryRecorder.ts +25 -27
- package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
- package/src/training/index.ts +36 -36
- package/src/training/logRLConfig.ts +7 -7
- package/src/training/pipeline.ts +13 -16
- package/src/training/storage/ModelStorageService.ts +32 -32
- package/src/training/storage/TrainingDataArchiver.ts +21 -21
- package/src/training/storage/index.ts +2 -2
- package/src/training/types.ts +6 -6
- package/src/training/window-utils.ts +14 -14
- package/src/utils/index.ts +7 -7
- package/src/utils/logger.ts +5 -5
- package/src/utils/snowflake.ts +1 -1
- package/src/utils/synthetic-detector.ts +7 -7
|
@@ -5,15 +5,15 @@
|
|
|
5
5
|
* Useful for validation and understanding benchmark structure.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
-
import
|
|
9
|
-
import {
|
|
8
|
+
import { promises as fs } from "node:fs";
|
|
9
|
+
import type { JsonValue } from "../adapter";
|
|
10
10
|
import type {
|
|
11
11
|
BenchmarkGameSnapshot,
|
|
12
12
|
GameState,
|
|
13
13
|
GroundTruth,
|
|
14
14
|
Tick,
|
|
15
|
-
} from
|
|
16
|
-
import * as BenchmarkValidator from
|
|
15
|
+
} from "./BenchmarkDataGenerator";
|
|
16
|
+
import * as BenchmarkValidator from "./BenchmarkValidator";
|
|
17
17
|
|
|
18
18
|
export interface BenchmarkViewOptions {
|
|
19
19
|
/** Show detailed information */
|
|
@@ -81,9 +81,9 @@ export class BenchmarkDataViewer {
|
|
|
81
81
|
*/
|
|
82
82
|
static async view(
|
|
83
83
|
filePath: string,
|
|
84
|
-
options: BenchmarkViewOptions = {}
|
|
84
|
+
options: BenchmarkViewOptions = {},
|
|
85
85
|
): Promise<BenchmarkView> {
|
|
86
|
-
const data = await fs.readFile(filePath,
|
|
86
|
+
const data = await fs.readFile(filePath, "utf-8");
|
|
87
87
|
const snapshot = JSON.parse(data) as BenchmarkGameSnapshot;
|
|
88
88
|
|
|
89
89
|
// Validate
|
|
@@ -105,13 +105,15 @@ export class BenchmarkDataViewer {
|
|
|
105
105
|
groupChats: snapshot.initialState.groupChats?.length || 0,
|
|
106
106
|
},
|
|
107
107
|
|
|
108
|
-
ticks:
|
|
108
|
+
ticks: BenchmarkDataViewer.analyzeTicks(snapshot.ticks),
|
|
109
109
|
|
|
110
110
|
validation,
|
|
111
111
|
};
|
|
112
112
|
|
|
113
113
|
if (options.showGroundTruth || options.verbose) {
|
|
114
|
-
view.groundTruth =
|
|
114
|
+
view.groundTruth = BenchmarkDataViewer.analyzeGroundTruth(
|
|
115
|
+
snapshot.groundTruth,
|
|
116
|
+
);
|
|
115
117
|
}
|
|
116
118
|
|
|
117
119
|
return view;
|
|
@@ -120,7 +122,7 @@ export class BenchmarkDataViewer {
|
|
|
120
122
|
/**
|
|
121
123
|
* Analyze ticks
|
|
122
124
|
*/
|
|
123
|
-
private static analyzeTicks(ticks: Tick[]): BenchmarkView[
|
|
125
|
+
private static analyzeTicks(ticks: Tick[]): BenchmarkView["ticks"] {
|
|
124
126
|
const eventTypes: Record<string, number> = {};
|
|
125
127
|
let withEvents = 0;
|
|
126
128
|
|
|
@@ -145,15 +147,15 @@ export class BenchmarkDataViewer {
|
|
|
145
147
|
* Analyze ground truth
|
|
146
148
|
*/
|
|
147
149
|
private static analyzeGroundTruth(
|
|
148
|
-
groundTruth: GroundTruth
|
|
149
|
-
): BenchmarkView[
|
|
150
|
+
groundTruth: GroundTruth,
|
|
151
|
+
): BenchmarkView["groundTruth"] {
|
|
150
152
|
return {
|
|
151
153
|
marketOutcomes: Object.keys(groundTruth.marketOutcomes).length,
|
|
152
154
|
priceHistory: Object.fromEntries(
|
|
153
155
|
Object.entries(groundTruth.priceHistory).map(([ticker, history]) => [
|
|
154
156
|
ticker,
|
|
155
157
|
history.length,
|
|
156
|
-
])
|
|
158
|
+
]),
|
|
157
159
|
),
|
|
158
160
|
optimalActions: groundTruth.optimalActions.length,
|
|
159
161
|
socialOpportunities: groundTruth.socialOpportunities.length,
|
|
@@ -167,21 +169,21 @@ export class BenchmarkDataViewer {
|
|
|
167
169
|
* Print view to console
|
|
168
170
|
*/
|
|
169
171
|
static print(view: BenchmarkView, options: BenchmarkViewOptions = {}): void {
|
|
170
|
-
console.log(
|
|
172
|
+
console.log("\n📊 Benchmark Data View\n");
|
|
171
173
|
console.log(`ID: ${view.id}`);
|
|
172
174
|
console.log(`Version: ${view.version}`);
|
|
173
175
|
console.log(`Created: ${new Date(view.createdAt).toISOString()}`);
|
|
174
176
|
console.log(`Duration: ${(view.duration / 60).toFixed(1)} minutes`);
|
|
175
177
|
console.log(`Tick Interval: ${view.tickInterval}s`);
|
|
176
178
|
|
|
177
|
-
console.log(
|
|
179
|
+
console.log("\n📈 Initial State:");
|
|
178
180
|
console.log(` Prediction Markets: ${view.initialState.predictionMarkets}`);
|
|
179
181
|
console.log(` Perpetual Markets: ${view.initialState.perpetualMarkets}`);
|
|
180
182
|
console.log(` Agents: ${view.initialState.agents}`);
|
|
181
183
|
console.log(` Posts: ${view.initialState.posts}`);
|
|
182
184
|
console.log(` Group Chats: ${view.initialState.groupChats}`);
|
|
183
185
|
|
|
184
|
-
console.log(
|
|
186
|
+
console.log("\n⏱️ Ticks:");
|
|
185
187
|
console.log(` Total: ${view.ticks.total}`);
|
|
186
188
|
console.log(` With Events: ${view.ticks.withEvents}`);
|
|
187
189
|
if (options.verbose) {
|
|
@@ -192,27 +194,27 @@ export class BenchmarkDataViewer {
|
|
|
192
194
|
}
|
|
193
195
|
|
|
194
196
|
if (view.groundTruth) {
|
|
195
|
-
console.log(
|
|
197
|
+
console.log("\n🎯 Ground Truth:");
|
|
196
198
|
console.log(` Market Outcomes: ${view.groundTruth.marketOutcomes}`);
|
|
197
199
|
console.log(` Price History:`);
|
|
198
200
|
for (const [ticker, count] of Object.entries(
|
|
199
|
-
view.groundTruth.priceHistory
|
|
201
|
+
view.groundTruth.priceHistory,
|
|
200
202
|
)) {
|
|
201
203
|
console.log(` ${ticker}: ${count} ticks`);
|
|
202
204
|
}
|
|
203
205
|
console.log(` Optimal Actions: ${view.groundTruth.optimalActions}`);
|
|
204
206
|
console.log(
|
|
205
|
-
` Social Opportunities: ${view.groundTruth.socialOpportunities}
|
|
207
|
+
` Social Opportunities: ${view.groundTruth.socialOpportunities}`,
|
|
206
208
|
);
|
|
207
209
|
if (options.showHidden) {
|
|
208
210
|
console.log(` Hidden Facts: ${view.groundTruth.hiddenFacts}`);
|
|
209
211
|
console.log(` Hidden Events: ${view.groundTruth.hiddenEvents}`);
|
|
210
|
-
console.log(` True Facts: ${view.groundTruth.trueFacts.join(
|
|
212
|
+
console.log(` True Facts: ${view.groundTruth.trueFacts.join(", ")}`);
|
|
211
213
|
}
|
|
212
214
|
}
|
|
213
215
|
|
|
214
|
-
console.log(
|
|
215
|
-
console.log(` Valid: ${view.validation.valid ?
|
|
216
|
+
console.log("\n✅ Validation:");
|
|
217
|
+
console.log(` Valid: ${view.validation.valid ? "✅" : "❌"}`);
|
|
216
218
|
if (view.validation.errors.length > 0) {
|
|
217
219
|
console.log(` Errors: ${view.validation.errors.length}`);
|
|
218
220
|
if (options.verbose) {
|
|
@@ -230,7 +232,7 @@ export class BenchmarkDataViewer {
|
|
|
230
232
|
}
|
|
231
233
|
}
|
|
232
234
|
|
|
233
|
-
console.log(
|
|
235
|
+
console.log("");
|
|
234
236
|
}
|
|
235
237
|
|
|
236
238
|
/**
|
|
@@ -238,7 +240,7 @@ export class BenchmarkDataViewer {
|
|
|
238
240
|
*/
|
|
239
241
|
static getTickDetails(
|
|
240
242
|
snapshot: BenchmarkGameSnapshot,
|
|
241
|
-
tickNumber: number
|
|
243
|
+
tickNumber: number,
|
|
242
244
|
): {
|
|
243
245
|
tick: Tick | null;
|
|
244
246
|
state: GameState | null;
|
|
@@ -265,7 +267,7 @@ export class BenchmarkDataViewer {
|
|
|
265
267
|
*/
|
|
266
268
|
static getGroundTruthForTick(
|
|
267
269
|
snapshot: BenchmarkGameSnapshot,
|
|
268
|
-
tickNumber: number
|
|
270
|
+
tickNumber: number,
|
|
269
271
|
): {
|
|
270
272
|
hiddenFacts: Array<{ fact: string; category: string }>;
|
|
271
273
|
hiddenEvents: Array<{ type: string; description: string }>;
|
|
@@ -302,14 +304,14 @@ export class BenchmarkDataViewer {
|
|
|
302
304
|
// Check if ground truth is accidentally in state
|
|
303
305
|
const stateKeys = Object.keys(state);
|
|
304
306
|
const hasGroundTruthInState =
|
|
305
|
-
stateKeys.includes(
|
|
306
|
-
stateKeys.includes(
|
|
307
|
-
stateKeys.includes(
|
|
307
|
+
stateKeys.includes("groundTruth") ||
|
|
308
|
+
stateKeys.includes("hiddenFacts") ||
|
|
309
|
+
stateKeys.includes("hiddenEvents");
|
|
308
310
|
|
|
309
311
|
if (hasGroundTruthInState) {
|
|
310
312
|
return {
|
|
311
313
|
canAccess: true,
|
|
312
|
-
reason:
|
|
314
|
+
reason: "Ground truth found in game state (security issue!)",
|
|
313
315
|
};
|
|
314
316
|
}
|
|
315
317
|
|
|
@@ -317,8 +319,8 @@ export class BenchmarkDataViewer {
|
|
|
317
319
|
canAccess: false,
|
|
318
320
|
reason:
|
|
319
321
|
hasGroundTruth && hasHiddenFacts
|
|
320
|
-
?
|
|
321
|
-
:
|
|
322
|
+
? "Ground truth exists but is properly isolated from game state"
|
|
323
|
+
: "No ground truth data found",
|
|
322
324
|
};
|
|
323
325
|
}
|
|
324
326
|
}
|
|
@@ -5,13 +5,13 @@
|
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
7
|
import {
|
|
8
|
-
getTrainingDataAdapter,
|
|
9
8
|
type BenchmarkResultRecord,
|
|
9
|
+
getTrainingDataAdapter,
|
|
10
10
|
type JsonValue,
|
|
11
|
-
} from
|
|
12
|
-
import { logger } from
|
|
13
|
-
import { generateSnowflakeId } from
|
|
14
|
-
import type { SimulationMetrics } from
|
|
11
|
+
} from "../adapter";
|
|
12
|
+
import { logger } from "../utils/logger";
|
|
13
|
+
import { generateSnowflakeId } from "../utils/snowflake";
|
|
14
|
+
import type { SimulationMetrics } from "./SimulationEngine";
|
|
15
15
|
|
|
16
16
|
export interface BenchmarkResultInput {
|
|
17
17
|
modelId: string;
|
|
@@ -45,12 +45,13 @@ export interface BenchmarkTrendData {
|
|
|
45
45
|
/**
|
|
46
46
|
* Service for managing benchmark result history
|
|
47
47
|
*/
|
|
48
|
+
// biome-ignore lint/complexity/noStaticOnlyClass: Service namespace - methods are logically grouped
|
|
48
49
|
export class BenchmarkHistoryService {
|
|
49
50
|
/**
|
|
50
51
|
* Save a benchmark result to the database
|
|
51
52
|
*/
|
|
52
53
|
static async saveResult(
|
|
53
|
-
input: BenchmarkResultInput
|
|
54
|
+
input: BenchmarkResultInput,
|
|
54
55
|
): Promise<BenchmarkResultRecord> {
|
|
55
56
|
const id = await generateSnowflakeId();
|
|
56
57
|
const now = new Date();
|
|
@@ -74,7 +75,7 @@ export class BenchmarkHistoryService {
|
|
|
74
75
|
|
|
75
76
|
await getTrainingDataAdapter().insertBenchmarkResult(insertData);
|
|
76
77
|
|
|
77
|
-
logger.info(
|
|
78
|
+
logger.info("Saved benchmark result", {
|
|
78
79
|
id,
|
|
79
80
|
modelId: input.modelId,
|
|
80
81
|
benchmarkId: input.benchmarkId,
|
|
@@ -88,7 +89,7 @@ export class BenchmarkHistoryService {
|
|
|
88
89
|
* Get benchmark results by query
|
|
89
90
|
*/
|
|
90
91
|
static async getResults(
|
|
91
|
-
query: BenchmarkHistoryQuery
|
|
92
|
+
query: BenchmarkHistoryQuery,
|
|
92
93
|
): Promise<BenchmarkResultRecord[]> {
|
|
93
94
|
return getTrainingDataAdapter().queryBenchmarkResults({
|
|
94
95
|
modelId: query.modelId,
|
|
@@ -103,7 +104,7 @@ export class BenchmarkHistoryService {
|
|
|
103
104
|
* Get the latest result for a model
|
|
104
105
|
*/
|
|
105
106
|
static async getLatestResult(
|
|
106
|
-
modelId: string
|
|
107
|
+
modelId: string,
|
|
107
108
|
): Promise<BenchmarkResultRecord | null> {
|
|
108
109
|
const results = await getTrainingDataAdapter().queryBenchmarkResults({
|
|
109
110
|
modelId,
|
|
@@ -117,7 +118,7 @@ export class BenchmarkHistoryService {
|
|
|
117
118
|
*/
|
|
118
119
|
static async getTrendData(
|
|
119
120
|
modelId: string,
|
|
120
|
-
limit = 20
|
|
121
|
+
limit = 20,
|
|
121
122
|
): Promise<BenchmarkTrendData> {
|
|
122
123
|
const results = await getTrainingDataAdapter().queryBenchmarkResults({
|
|
123
124
|
modelId,
|
|
@@ -141,7 +142,7 @@ export class BenchmarkHistoryService {
|
|
|
141
142
|
*/
|
|
142
143
|
static async getModelComparison(
|
|
143
144
|
modelIds: string[],
|
|
144
|
-
benchmarkId?: string
|
|
145
|
+
benchmarkId?: string,
|
|
145
146
|
): Promise<Map<string, BenchmarkResultRecord[]>> {
|
|
146
147
|
const adapter = getTrainingDataAdapter();
|
|
147
148
|
const comparison = new Map<string, BenchmarkResultRecord[]>();
|
|
@@ -181,7 +182,7 @@ export class BenchmarkHistoryService {
|
|
|
181
182
|
static async checkImprovement(
|
|
182
183
|
modelId: string,
|
|
183
184
|
baselineModelId: string,
|
|
184
|
-
benchmarkId: string
|
|
185
|
+
benchmarkId: string,
|
|
185
186
|
): Promise<{
|
|
186
187
|
improved: boolean;
|
|
187
188
|
modelPnl: number;
|