@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-lint.log +2 -0
- package/.turbo/turbo-typecheck.log +1 -0
- package/dist/.tsbuildinfo +1 -0
- package/dist/adapter.js +59 -0
- package/dist/archetypes/ArchetypeConfigService.js +510 -0
- package/dist/archetypes/derive-archetype.js +196 -0
- package/dist/archetypes/index.js +7 -0
- package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
- package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
- package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
- package/dist/benchmark/BenchmarkDataViewer.js +197 -0
- package/dist/benchmark/BenchmarkHistoryService.js +135 -0
- package/dist/benchmark/BenchmarkRunner.js +483 -0
- package/dist/benchmark/BenchmarkValidator.js +158 -0
- package/dist/benchmark/FastEvalRunner.js +133 -0
- package/dist/benchmark/MetricsValidator.js +104 -0
- package/dist/benchmark/MetricsVisualizer.js +775 -0
- package/dist/benchmark/ModelBenchmarkService.js +433 -0
- package/dist/benchmark/ModelRegistry.js +122 -0
- package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
- package/dist/benchmark/SimulationA2AInterface.js +683 -0
- package/dist/benchmark/SimulationEngine.js +522 -0
- package/dist/benchmark/TaskRunner.js +60 -0
- package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
- package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
- package/dist/benchmark/index.js +23 -0
- package/dist/benchmark/parseSimulationMetrics.js +86 -0
- package/dist/benchmark/simulation-types.js +1 -0
- package/dist/dependencies.js +197 -0
- package/dist/generation/TrajectoryGenerator.js +244 -0
- package/dist/generation/index.js +6 -0
- package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
- package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
- package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
- package/dist/huggingface/index.js +9 -0
- package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
- package/dist/index.js +41 -0
- package/dist/init-training.js +43 -0
- package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
- package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
- package/dist/metrics/index.js +7 -0
- package/dist/metrics/types.js +21 -0
- package/dist/rubrics/__tests__/index.test.js +150 -0
- package/dist/rubrics/ass-kisser.js +83 -0
- package/dist/rubrics/degen.js +78 -0
- package/dist/rubrics/goody-twoshoes.js +82 -0
- package/dist/rubrics/index.js +184 -0
- package/dist/rubrics/information-trader.js +82 -0
- package/dist/rubrics/infosec.js +99 -0
- package/dist/rubrics/liar.js +102 -0
- package/dist/rubrics/perps-trader.js +85 -0
- package/dist/rubrics/researcher.js +79 -0
- package/dist/rubrics/scammer.js +80 -0
- package/dist/rubrics/social-butterfly.js +71 -0
- package/dist/rubrics/super-predictor.js +95 -0
- package/dist/rubrics/trader.js +65 -0
- package/dist/scoring/ArchetypeScoringService.js +301 -0
- package/dist/scoring/JudgePromptBuilder.js +401 -0
- package/dist/scoring/LLMJudgeCache.js +263 -0
- package/dist/scoring/index.js +8 -0
- package/dist/training/AutomationPipeline.js +714 -0
- package/dist/training/BenchmarkService.js +370 -0
- package/dist/training/ConfigValidator.js +153 -0
- package/dist/training/MarketOutcomesTracker.js +142 -0
- package/dist/training/ModelDeployer.js +128 -0
- package/dist/training/ModelFetcher.js +48 -0
- package/dist/training/ModelSelectionService.js +248 -0
- package/dist/training/ModelUsageVerifier.js +106 -0
- package/dist/training/MultiModelOrchestrator.js +349 -0
- package/dist/training/RLModelConfig.js +295 -0
- package/dist/training/RewardBackpropagationService.js +117 -0
- package/dist/training/RulerScoringService.js +450 -0
- package/dist/training/TrainingMonitor.js +108 -0
- package/dist/training/TrajectoryRecorder.js +281 -0
- package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
- package/dist/training/index.js +30 -0
- package/dist/training/logRLConfig.js +29 -0
- package/dist/training/pipeline.js +80 -0
- package/dist/training/storage/ModelStorageService.js +190 -0
- package/dist/training/storage/TrainingDataArchiver.js +136 -0
- package/dist/training/storage/index.js +7 -0
- package/dist/training/types.js +6 -0
- package/dist/training/window-utils.js +100 -0
- package/dist/utils/index.js +73 -0
- package/dist/utils/logger.js +55 -0
- package/dist/utils/snowflake.js +15 -0
- package/dist/utils/synthetic-detector.js +67 -0
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773742857616.json +38 -0
- package/research-output/training-runs/training-run-1773742946977.json +38 -0
- package/research-output/training-runs/training-run-1773743278891.json +38 -0
- package/research-output/training-runs/training-run-1773743409754.json +38 -0
- package/research-output/training-runs/training-run-1773743651086.json +38 -0
- package/research-output/training-runs/training-run-1773743782883.json +38 -0
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HuggingFace Integration Service
|
|
3
|
+
*
|
|
4
|
+
* Orchestrates the complete HuggingFace integration pipeline.
|
|
5
|
+
* Main entry point for all HuggingFace operations.
|
|
6
|
+
*/
|
|
7
|
+
import { getTrainingDataAdapter } from "../adapter";
|
|
8
|
+
import { ModelBenchmarkService } from "../benchmark/ModelBenchmarkService";
|
|
9
|
+
import { getExportToHuggingFace } from "../dependencies";
|
|
10
|
+
import { logger } from "../utils";
|
|
11
|
+
import { HuggingFaceDatasetUploader } from "./HuggingFaceDatasetUploader";
|
|
12
|
+
import { HuggingFaceModelUploader } from "./HuggingFaceModelUploader";
|
|
13
|
+
import { getHuggingFaceToken } from "./shared/HuggingFaceUploadUtil";
|
|
14
|
+
/**
 * Coordinates the HuggingFace publishing workflow: uploads the benchmark and
 * trajectory datasets, benchmarks models that have no results yet, and
 * publishes the models whose baseline comparison recommends deployment.
 */
export class HuggingFaceIntegrationService {
    /** Uploader used for the benchmark results dataset. */
    datasetUploader;
    /** Uploader used to publish models. */
    modelUploader;
    constructor() {
        this.datasetUploader = new HuggingFaceDatasetUploader();
        this.modelUploader = new HuggingFaceModelUploader();
    }
    /**
     * Execute the complete weekly upload pipeline.
     *
     * Errors raised while processing a single model are recorded in
     * `result.errors` and do not stop the remaining models. Any other error
     * raised inside the pipeline aborts it; the partially filled result is
     * still returned with the error message appended.
     *
     * Note: with `dryRun` set, uploads are skipped but models are still
     * benchmarked (with `saveResults: true`).
     *
     * @param {object} [options]
     * @param {boolean} [options.dryRun] Skip every upload step.
     * @param {string} [options.datasetName] Benchmark dataset repository name.
     * @param {string} [options.trajectoryDatasetName] Trajectory dataset repository name.
     * @param {string} [options.modelNamePrefix] Prefix for published model names.
     * @param {string} [options.modelDescriptionPrefix] Prefix for published model descriptions.
     * @returns {Promise<object>} Summary with `success`, `datasets`, `models`, `errors` and `duration` (ms).
     */
    async executeWeeklyUpload(options = {}) {
        const startTime = Date.now();
        logger.info("Starting weekly upload pipeline", options, "HuggingFaceIntegration");
        const result = {
            success: false,
            datasets: {
                benchmarks: { success: false },
                trajectories: { success: false },
            },
            models: {
                processed: 0,
                benchmarked: 0,
                uploaded: 0,
            },
            errors: [],
            duration: 0,
        };
        try {
            // Step 1: Upload benchmark dataset
            result.datasets.benchmarks = await this.#uploadBenchmarkDataset(options);
            if (!result.datasets.benchmarks.success) {
                result.errors.push(`Benchmark dataset upload: ${result.datasets.benchmarks.error}`);
            }
            // Step 2: Upload trajectory dataset
            result.datasets.trajectories = await this.#uploadTrajectoryDataset(options);
            if (!result.datasets.trajectories.success) {
                result.errors.push(`Trajectory dataset upload: ${result.datasets.trajectories.error}`);
            }
            // Step 3: Process models
            await this.#processUnbenchmarkedModels(options, result);
            result.success = result.errors.length === 0;
            result.duration = Date.now() - startTime;
            logger.info("Weekly upload pipeline complete", {
                success: result.success,
                benchmarkDataset: result.datasets.benchmarks.success,
                trajectoryDataset: result.datasets.trajectories.success,
                modelsProcessed: result.models.processed,
                modelsBenchmarked: result.models.benchmarked,
                modelsUploaded: result.models.uploaded,
                errors: result.errors.length,
                duration: result.duration,
            }, "HuggingFaceIntegration");
            return result;
        }
        catch (error) {
            result.duration = Date.now() - startTime;
            result.errors.push(error instanceof Error ? error.message : String(error));
            logger.error("Weekly upload pipeline failed", { error }, "HuggingFaceIntegration");
            return result;
        }
    }
    /**
     * Step 1 helper: upload the benchmark dataset, or report success
     * immediately in a dry run.
     *
     * @param {object} options Pipeline options (`dryRun`, `datasetName`).
     * @returns {Promise<{success: boolean, url?: string, error?: string}>}
     */
    async #uploadBenchmarkDataset(options) {
        if (options.dryRun) {
            logger.info("DRY RUN: Skipping benchmark dataset upload", undefined, "HuggingFaceIntegration");
            return { success: true };
        }
        logger.info("Step 1: Uploading benchmark dataset", undefined, "HuggingFaceIntegration");
        const benchmarkResult = await this.datasetUploader.uploadDataset({
            datasetName: options.datasetName ||
                process.env.HF_DATASET_NAME ||
                "elizaos/agent-benchmarks",
            description: "Weekly benchmark results for autonomous ElizaOS agents",
        });
        return {
            success: benchmarkResult.success,
            url: benchmarkResult.datasetUrl,
            error: benchmarkResult.error,
        };
    }
    /**
     * Step 2 helper: export the trajectory dataset, or report success
     * immediately in a dry run.
     *
     * @param {object} options Pipeline options (`dryRun`, `trajectoryDatasetName`).
     * @returns {Promise<{success: boolean, url?: string, error?: string}>}
     */
    async #uploadTrajectoryDataset(options) {
        if (options.dryRun) {
            logger.info("DRY RUN: Skipping trajectory dataset upload", undefined, "HuggingFaceIntegration");
            return { success: true };
        }
        logger.info("Step 2: Uploading trajectory dataset", undefined, "HuggingFaceIntegration");
        const exportToHuggingFace = getExportToHuggingFace();
        const trajectoryResult = await exportToHuggingFace({
            datasetName: options.trajectoryDatasetName ||
                process.env.HF_TRAJECTORY_DATASET_NAME ||
                "elizaos/agent-trajectories",
            format: "jsonl",
        });
        return {
            success: trajectoryResult.success,
            url: trajectoryResult.url,
            error: trajectoryResult.error,
        };
    }
    /**
     * Step 3 helper: benchmark every model without results and publish the
     * ones recommended for deployment. Counters and error messages are
     * written directly into `result`.
     *
     * @param {object} options Pipeline options.
     * @param {object} result Pipeline result being accumulated (mutated).
     * @returns {Promise<void>}
     */
    async #processUnbenchmarkedModels(options, result) {
        const unbenchmarkedModels = await ModelBenchmarkService.getUnbenchmarkedModels();
        result.models.processed = unbenchmarkedModels.length;
        logger.info(`Step 3: Found ${unbenchmarkedModels.length} unbenchmarked models`, undefined, "HuggingFaceIntegration");
        if (unbenchmarkedModels.length === 0) {
            return;
        }
        const standardBenchmarks = await ModelBenchmarkService.getStandardBenchmarkPaths();
        if (standardBenchmarks.length === 0) {
            const error = "No standard benchmarks available for model evaluation";
            logger.error(error, undefined, "HuggingFaceIntegration");
            result.errors.push(error);
            return;
        }
        for (const modelId of unbenchmarkedModels) {
            // A failure on one model must not prevent the others from being processed.
            try {
                await this.#processModel(modelId, standardBenchmarks, options, result);
            }
            catch (error) {
                const errorMsg = error instanceof Error ? error.message : String(error);
                logger.error(`Failed to process model ${modelId}`, { error }, "HuggingFaceIntegration");
                result.errors.push(`Model ${modelId}: ${errorMsg}`);
            }
        }
    }
    /**
     * Benchmark one model, compare it to the baseline and upload it when the
     * comparison recommends deployment (and this is not a dry run).
     *
     * @param {string} modelId Model to process.
     * @param {string[]} benchmarkPaths Standard benchmark paths to run.
     * @param {object} options Pipeline options.
     * @param {object} result Pipeline result being accumulated (mutated).
     * @returns {Promise<void>}
     */
    async #processModel(modelId, benchmarkPaths, options, result) {
        // Benchmark model
        logger.info(`Benchmarking model: ${modelId}`, undefined, "HuggingFaceIntegration");
        await ModelBenchmarkService.benchmarkModel({
            modelId,
            benchmarkPaths,
            saveResults: true,
        });
        result.models.benchmarked++;
        // Compare to baseline
        const comparison = await ModelBenchmarkService.compareToBaseline(modelId);
        if (comparison.recommendation !== "deploy") {
            logger.info(`Model ${modelId} not ready for deployment: ${comparison.recommendation}`, undefined, "HuggingFaceIntegration");
            return;
        }
        if (options.dryRun) {
            // Keep the dry-run log truthful: the model is deployable, only the upload is skipped.
            logger.info(`DRY RUN: Skipping upload of model ${modelId}`, undefined, "HuggingFaceIntegration");
            return;
        }
        // Upload if improved
        logger.info(`Model ${modelId} improved, uploading`, undefined, "HuggingFaceIntegration");
        const adapter = getTrainingDataAdapter();
        const model = await adapter.getModelById(modelId);
        if (!model) {
            // A deployable model that cannot be loaded is reported instead of silently dropped.
            const error = `Model ${modelId}: not found in database, upload skipped`;
            logger.warn(error, undefined, "HuggingFaceIntegration");
            result.errors.push(error);
            return;
        }
        const namePrefix = options.modelNamePrefix || process.env.HF_MODEL_NAME;
        const modelName = namePrefix
            ? `${namePrefix}-${model.version}`
            : `elizaos/agent-${model.version}`;
        const modelDescription = options.modelDescriptionPrefix ||
            process.env.HF_MODEL_DESCRIPTION_PREFIX ||
            "Autonomous ElizaOS agent";
        const uploadResult = await this.modelUploader.uploadModel({
            modelId,
            modelName,
            description: `${modelDescription} - v${model.version}`,
            includeWeights: true,
        });
        if (!uploadResult.success) {
            result.errors.push(`Model upload ${modelId}: ${uploadResult.error}`);
            return;
        }
        result.models.uploaded++;
        // Update model with HuggingFace repo
        await adapter.updateModelHuggingFaceRepo(modelId, modelName);
    }
    /**
     * Check if new data is available for upload.
     *
     * The reference time is the last deployed model date reported by the
     * adapter, or the Unix epoch when there is none.
     *
     * @returns {Promise<object>} Flags for new benchmarks, new trajectories
     *   and unbenchmarked models, plus a `details` object.
     */
    async hasNewDataToUpload() {
        const adapter = getTrainingDataAdapter();
        // Get last upload time from database
        const lastUploadTime = (await adapter.getLastDeployedModelDate()) || new Date(0);
        // Check for new benchmarks since last upload
        const newBenchmarksCount = await adapter.countBenchmarksSince(lastUploadTime);
        // Check for new trajectories since last upload
        const newTrajectoriesCount = await adapter.countTrajectoriesSince(lastUploadTime);
        // Check for unbenchmarked models
        const unbenchmarkedModels = await ModelBenchmarkService.getUnbenchmarkedModels();
        return {
            hasNewBenchmarks: newBenchmarksCount > 0,
            hasNewTrajectories: newTrajectoriesCount > 0,
            hasUnbenchmarkedModels: unbenchmarkedModels.length > 0,
            details: {
                newBenchmarksSince: lastUploadTime,
                newTrajectoriesCount,
                unbenchmarkedModels: unbenchmarkedModels.length,
            },
        };
    }
    /**
     * Validate system is ready for HuggingFace operations.
     *
     * Every probe is isolated: a probe that throws is reported as an issue
     * rather than rejecting the whole validation.
     *
     * @returns {Promise<{ready: boolean, issues: string[], warnings: string[]}>}
     *   `ready` is true only when `issues` is empty; warnings do not block.
     */
    async validateSystemReadiness() {
        const issues = [];
        const warnings = [];
        // Check HuggingFace token
        if (!getHuggingFaceToken()) {
            issues.push("HUGGING_FACE_TOKEN or HF_TOKEN environment variable not set");
        }
        const adapter = getTrainingDataAdapter();
        // Check database connection
        try {
            const healthy = await adapter.healthCheck();
            if (!healthy) {
                issues.push("Cannot connect to database");
            }
        }
        catch (error) {
            logger.warn("Database health check failed", { error }, "HuggingFaceIntegration");
            issues.push("Cannot connect to database");
        }
        // Check for standard benchmarks
        try {
            const standardBenchmarks = await ModelBenchmarkService.getStandardBenchmarkPaths();
            if (standardBenchmarks.length === 0) {
                warnings.push("No standard benchmarks found. Generate benchmark fixtures before upload.");
            }
        }
        catch (error) {
            logger.warn("Could not load standard benchmarks", { error }, "HuggingFaceIntegration");
            issues.push("Could not load standard benchmarks");
        }
        // Check for data using training statistics
        try {
            const stats = await adapter.getTrainingStatistics();
            if (stats.benchmarkCount === 0) {
                warnings.push("No benchmark results in database. Run some benchmarks first.");
            }
            if (stats.trajectoryTraining === 0) {
                warnings.push("No training trajectories in database. Generate with agents or test data.");
            }
            if (stats.modelTotal === 0) {
                warnings.push("No trained models in database.");
            }
        }
        catch (error) {
            logger.warn("Could not retrieve training statistics", { error }, "HuggingFaceIntegration");
            issues.push("Could not retrieve training statistics");
        }
        return {
            ready: issues.length === 0,
            issues,
            warnings,
        };
    }
    /**
     * Get integration statistics derived from the adapter's training statistics.
     *
     * `huggingface.datasetsPublished` is an estimate: it counts one dataset
     * when any benchmark exists and one when any training trajectory exists.
     *
     * @returns {Promise<object>} Counts grouped under `benchmarks`,
     *   `trajectories`, `models` and `huggingface`.
     */
    async getStatistics() {
        const stats = await getTrainingDataAdapter().getTrainingStatistics();
        const benchmarkDatasets = stats.benchmarkCount > 0 ? 1 : 0;
        const trajectoryDatasets = stats.trajectoryTraining > 0 ? 1 : 0;
        return {
            benchmarks: {
                total: stats.benchmarkCount,
                lastUpload: stats.lastBenchmarkDate ?? undefined,
            },
            trajectories: {
                total: stats.trajectoryTotal,
                training: stats.trajectoryTraining,
            },
            models: {
                total: stats.modelTotal,
                benchmarked: stats.modelBenchmarked,
                deployed: stats.modelDeployed,
            },
            huggingface: {
                datasetsPublished: benchmarkDatasets + trajectoryDatasets,
                modelsPublished: stats.publishedRepoCount,
            },
        };
    }
}
|
|
272
|
+
/** Module-level service instance, constructed when this module is evaluated and exported for reuse by importers. */
export const huggingFaceIntegration = new HuggingFaceIntegrationService();
|
|
@@ -0,0 +1,385 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HuggingFace Model Uploader
|
|
3
|
+
*
|
|
4
|
+
* Uploads trained RL models to HuggingFace Hub with benchmark results and model cards.
|
|
5
|
+
*/
|
|
6
|
+
import { promises as fs } from "node:fs";
|
|
7
|
+
import * as path from "node:path";
|
|
8
|
+
import { getTrainingDataAdapter } from "../adapter";
|
|
9
|
+
import { parseSimulationMetrics, } from "../benchmark/parseSimulationMetrics";
|
|
10
|
+
import { logger } from "../utils";
|
|
11
|
+
import { getHuggingFaceToken, HuggingFaceUploadUtil, requireHuggingFaceToken, } from "./shared/HuggingFaceUploadUtil";
|
|
12
|
+
/**
 * Prepares a trained model for publication on HuggingFace Hub: loads the
 * model record and its benchmark results, writes a model card plus metadata
 * files to a local directory, and optionally uploads that directory.
 */
export class HuggingFaceModelUploader {
    /** HuggingFace access token; may be unset until `uploadModel` resolves it. */
    huggingFaceToken;
    /**
     * @param {string} [huggingFaceToken] Explicit token; falls back to `getHuggingFaceToken()`.
     */
    constructor(huggingFaceToken) {
        this.huggingFaceToken = huggingFaceToken || getHuggingFaceToken();
    }
    /**
     * True when `metrics` carries every numeric field that the averages and
     * the benchmark table read.
     *
     * @param {unknown} metrics Candidate metrics object.
     * @returns {boolean}
     */
    static #hasRequiredMetrics(metrics) {
        return (Number.isFinite(metrics?.totalPnl) &&
            Number.isFinite(metrics?.optimalityScore) &&
            Number.isFinite(metrics?.predictionMetrics?.accuracy) &&
            Number.isFinite(metrics?.perpMetrics?.winRate));
    }
    /**
     * Upload model to HuggingFace with benchmarks and model card.
     *
     * The model card, `model_metadata.json` and `benchmark_results.json` are
     * always written to the output directory. The directory is uploaded only
     * when `options.includeWeights` is set and the model record has a
     * `storagePath`; only then is the model marked as `deployed`. When the
     * upload is skipped the call still succeeds, with `filesUploaded: 0`.
     *
     * @param {object} options
     * @param {string} options.modelId Database ID of the model.
     * @param {string} options.modelName Target HuggingFace repository name.
     * @param {string} [options.outputDir] Local directory for generated files;
     *   defaults to `<cwd>/exports/models/<version>`.
     * @param {boolean} [options.includeWeights] Request the upload step.
     * @param {boolean} [options.private] Requested repository visibility.
     * @returns {Promise<object>} `{ success, modelId, filesUploaded, modelUrl?, error? }`.
     *   Never rejects: failures are returned with `success: false`.
     */
    async uploadModel(options) {
        try {
            logger.info("Starting HuggingFace model upload", {
                modelId: options.modelId,
            });
            // Validate token (throws if not set)
            const token = this.huggingFaceToken || requireHuggingFaceToken();
            this.huggingFaceToken = token;
            // Step 1: Load model from database
            const adapter = getTrainingDataAdapter();
            const model = await adapter.getModelById(options.modelId);
            if (!model) {
                throw new Error(`Model not found: ${options.modelId}`);
            }
            // Accept either a Date or a date string/number from the adapter.
            const trainedAt = new Date(model.createdAt);
            // Step 2: Get benchmark results
            logger.info("Loading benchmark results", { modelId: options.modelId });
            const modelBenchmarks = await this.getBenchmarkResults(options.modelId);
            if (modelBenchmarks.length === 0) {
                logger.warn("No benchmark results found for model", {
                    modelId: options.modelId,
                });
            }
            // Step 3: Prepare model card data
            const cardData = {
                modelId: model.modelId,
                modelName: options.modelName,
                version: model.version,
                baseModel: model.baseModel,
                trainedAt,
                trainingRunId: model.trainingBatch || undefined,
                benchmarkResults: modelBenchmarks,
                metrics: this.calculateAverageMetrics(modelBenchmarks),
            };
            // Step 4: Create output directory
            const outputDir = options.outputDir ||
                path.join(process.cwd(), "exports", "models", model.version);
            await fs.mkdir(outputDir, { recursive: true });
            // Step 5: Generate model card
            logger.info("Generating model card");
            await this.generateModelCard(cardData, outputDir);
            // Step 6: Save metadata
            const metadataPath = path.join(outputDir, "model_metadata.json");
            await fs.writeFile(metadataPath, JSON.stringify({
                modelId: model.modelId,
                version: model.version,
                baseModel: model.baseModel,
                storagePath: model.storagePath,
                trainingBatch: model.trainingBatch,
                trainedAt: trainedAt.toISOString(),
                benchmarkScore: model.benchmarkScore,
                avgReward: model.avgReward,
                accuracy: model.accuracy,
            }, null, 2));
            // Step 7: Save benchmark results
            const benchmarksPath = path.join(outputDir, "benchmark_results.json");
            await fs.writeFile(benchmarksPath, JSON.stringify(modelBenchmarks, null, 2));
            const modelUrl = `https://huggingface.co/${options.modelName}`;
            // Step 8: Upload to HuggingFace (if weights available and requested)
            if (!(options.includeWeights && model.storagePath)) {
                logger.info("Skipping model weight upload (not requested or no weights available)");
                // Nothing was sent to the hub: report zero uploaded files and
                // leave the model's deployment status untouched.
                return {
                    success: true,
                    modelUrl,
                    modelId: options.modelId,
                    filesUploaded: 0,
                };
            }
            logger.info("Uploading model to HuggingFace", {
                modelName: options.modelName,
            });
            const filesUploaded = await this.uploadToHub(options.modelName, outputDir, options.private ?? false);
            logger.info("Model uploaded successfully", { modelUrl, filesUploaded });
            // Update model status in database
            await adapter.updateModelStatus(options.modelId, "deployed", {
                deployedAt: new Date(),
            });
            return {
                success: true,
                modelUrl,
                modelId: options.modelId,
                filesUploaded,
            };
        }
        catch (error) {
            logger.error("Failed to upload model", { error });
            return {
                success: false,
                modelId: options.modelId,
                filesUploaded: 0,
                error: error instanceof Error ? error.message : String(error),
            };
        }
    }
    /**
     * Get benchmark results for a model.
     *
     * Reads from the database first; if that throws, falls back to JSON files
     * via `getBenchmarkResultsFromFiles`.
     *
     * @param {string} modelId Model to look up.
     * @returns {Promise<Array<{benchmarkId: string, runAt: string, metrics: object}>>}
     */
    async getBenchmarkResults(modelId) {
        // Query benchmark results from database
        try {
            const results = await getTrainingDataAdapter().getBenchmarkResultsByModel(modelId);
            return results.map((r) => ({
                benchmarkId: r.benchmarkId,
                runAt: r.runAt.toISOString(),
                // detailedMetrics is stored as JSON in database, validate it matches SimulationMetrics
                metrics: parseSimulationMetrics(r.detailedMetrics),
            }));
        }
        catch (error) {
            logger.warn("Could not load benchmark results from database", { error });
            // Fallback to files if database fails
            return await this.getBenchmarkResultsFromFiles(modelId);
        }
    }
    /**
     * Fallback: Get benchmark results from files.
     *
     * Scans `<cwd>/benchmarks` for `.json` files whose name contains
     * `modelId`. Each file is handled independently, so one unreadable or
     * malformed file does not discard the others. Files without a `metrics`
     * entry are ignored; files whose metrics lack the numeric fields used by
     * the model card are skipped with a warning.
     *
     * @param {string} modelId Model ID expected to appear in the file names.
     * @returns {Promise<Array<{benchmarkId: string, runAt: string, metrics: object}>>}
     *   Empty when the directory cannot be read.
     */
    async getBenchmarkResultsFromFiles(modelId) {
        const benchmarksDir = path.join(process.cwd(), "benchmarks");
        let files;
        try {
            files = await fs.readdir(benchmarksDir);
        }
        catch (error) {
            logger.warn("Could not load benchmark results from files either", {
                error,
            });
            return [];
        }
        const results = [];
        for (const file of files) {
            if (!file.endsWith(".json") || !file.includes(modelId)) {
                continue;
            }
            try {
                const filePath = path.join(benchmarksDir, file);
                const data = JSON.parse(await fs.readFile(filePath, "utf-8"));
                if (!data?.metrics) {
                    continue;
                }
                if (!HuggingFaceModelUploader.#hasRequiredMetrics(data.metrics)) {
                    logger.warn("Skipping benchmark file with incomplete metrics", { file });
                    continue;
                }
                results.push({
                    benchmarkId: data.benchmarkId || file,
                    runAt: data.runAt || new Date().toISOString(),
                    metrics: data.metrics,
                });
            }
            catch (error) {
                logger.warn("Skipping unreadable benchmark file", { file, error });
            }
        }
        return results;
    }
    /**
     * Calculate average metrics across benchmarks.
     *
     * @param {Array<{metrics: object}>} benchmarkResults Results whose metrics
     *   provide `totalPnl`, `predictionMetrics.accuracy` and `optimalityScore`.
     * @returns {{avgPnl: number, avgAccuracy: number, avgOptimality: number, benchmarkCount: number}}
     *   All zeros when the list is empty.
     */
    calculateAverageMetrics(benchmarkResults) {
        const benchmarkCount = benchmarkResults.length;
        if (benchmarkCount === 0) {
            return {
                avgPnl: 0,
                avgAccuracy: 0,
                avgOptimality: 0,
                benchmarkCount: 0,
            };
        }
        let totalPnl = 0;
        let totalAccuracy = 0;
        let totalOptimality = 0;
        for (const { metrics } of benchmarkResults) {
            totalPnl += metrics.totalPnl;
            totalAccuracy += metrics.predictionMetrics.accuracy;
            totalOptimality += metrics.optimalityScore;
        }
        return {
            avgPnl: totalPnl / benchmarkCount,
            avgAccuracy: totalAccuracy / benchmarkCount,
            avgOptimality: totalOptimality / benchmarkCount,
            benchmarkCount,
        };
    }
    /**
     * Generate model card for HuggingFace and write it to `<outputDir>/README.md`.
     *
     * Branding is read from `TRAINING_BRAND_NAME`, `TRAINING_BRAND_ORG` and
     * `TRAINING_PLATFORM_NAME`, each with an ElizaOS default.
     *
     * @param {object} data Card data: `modelId`, `modelName`, `version`,
     *   `baseModel`, `trainedAt`, optional `trainingRunId`, `benchmarkResults`
     *   and averaged `metrics`.
     * @param {string} outputDir Existing directory to write the card into.
     * @returns {Promise<void>}
     */
    async generateModelCard(data, outputDir) {
        const brandName = process.env.TRAINING_BRAND_NAME || "ElizaOS";
        const brandOrg = process.env.TRAINING_BRAND_ORG || "ElizaOS Contributors";
        const platformName = process.env.TRAINING_PLATFORM_NAME || "ElizaOS-compatible runtimes";
        const brandTag = brandName.toLowerCase().replace(/\s+/g, "-");
        const citationKey = `${brandTag}_agent_${data.version.replace(/\./g, "_")}`;
        // Accept either a Date or a date string/number.
        const trainingDate = new Date(data.trainedAt).toISOString().split("T")[0];
        const trainingRunLine = data.trainingRunId ? `- **Training Run:** ${data.trainingRunId}` : "";
        const performanceSection = data.benchmarkResults.length > 0
            ? `
### Benchmark Results (${data.benchmarkResults.length} runs)

| Metric | Value |
|--------|-------|
| Average P&L | ${data.metrics.avgPnl.toFixed(2)} |
| Average Accuracy | ${(data.metrics.avgAccuracy * 100).toFixed(1)}% |
| Average Optimality | ${data.metrics.avgOptimality.toFixed(1)} |

### Detailed Benchmark Results

${this.generateBenchmarkTable(data.benchmarkResults)}
`
            : "No benchmark results available yet.";
        // The template is flush-left on purpose: its leading whitespace is part of the card.
        const card = `---
license: mit
library_name: transformers
tags:
- ${brandTag}
- reinforcement-learning
- trading-agent
- prediction-markets
base_model: ${data.baseModel}
---

# ${data.modelName}

Autonomous agent trained with reinforcement learning for market-style decision making.

## Model Details

- **Version:** ${data.version}
- **Base Model:** ${data.baseModel}
- **Training Date:** ${trainingDate}
- **Model ID:** ${data.modelId}
${trainingRunLine}

## Performance Metrics

${performanceSection}

## Training Details

### Training Data

- **Source:** Autonomous agent trajectories
- **Collection Method:** Live agent gameplay on prediction markets
- **Training Framework:** Atropos GRPO
- **Base Model:** ${data.baseModel}

### Training Procedure

This model was trained using Group Relative Policy Optimization (GRPO) via the Atropos framework on trajectories collected from autonomous agents. The training process:

1. Agents generate trajectories through market interactions
2. Trajectories are scored using RLAIF with an LLM judge based on P&L, prediction accuracy, and decision quality
3. GRPO training optimizes policy to maximize expected rewards
4. Model checkpoints are evaluated on standardized benchmarks

### Compute Infrastructure

- **Platform:** ${data.trainingRunId ? "Atropos GRPO Training" : "Local training"}
- **Training Time:** Continuous learning with hourly updates

## Intended Use

This model is designed for:

- Autonomous market decision support and simulation
- Research on RL-based trading strategies
- Benchmarking agent decision-making
- Educational purposes

**Not intended for:**
- Production trading without human oversight
- Financial advice
- Real-money trading without risk management

## Evaluation

The model is evaluated on standardized benchmarks that include:

- **Prediction Market Trading:** Betting on binary outcomes with LMSR pricing
- **Perpetual Trading:** Long/short positions on crypto perps
- **Social Interaction:** Posts, group chats, and reputation building
- **Risk Management:** Position sizing and portfolio optimization

### Metrics

- **Total P&L:** Cumulative profit/loss across all positions
- **Prediction Accuracy:** Percentage of correct market predictions
- **Optimality Score:** Alignment with theoretically optimal actions (0-100)
- **Response Time:** Decision-making latency

## Usage

### Via ${platformName}

The model can be deployed in compatible runtimes and accessed via an agent API:

\`\`\`typescript
import { agentRuntimeManager } from '@elizaos/agents';

const runtime = await agentRuntimeManager.getRuntime(agentId);
const response = await runtime.chat({
messages: [{ role: 'user', content: 'Analyze this market...' }]
});
\`\`\`

### Direct Inference

If you have downloaded the model weights:

\`\`\`python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("${data.modelName}")
tokenizer = AutoTokenizer.from_pretrained("${data.modelName}")

# Use model for inference
inputs = tokenizer("Should I bet YES on this market?", return_tensors="pt")
outputs = model.generate(**inputs)
response = tokenizer.decode(outputs[0])
\`\`\`

## Limitations

- Trained on simulated market data; real-world performance may vary
- May not generalize to markets significantly different from training distribution
- Decision quality depends on market information quality
- No guarantees of profitability

## Ethical Considerations

This model is part of a research project on autonomous agents in prediction markets. Users should:

- Understand the risks of algorithmic trading
- Not rely solely on model decisions for financial outcomes
- Use appropriate risk management and position sizing
- Consider market impact and fairness implications

## Citation

\`\`\`bibtex
@model{${citationKey},
title = {${brandName} Trading Agent},
author = {${brandOrg}},
year = {${new Date().getFullYear()}},
version = {${data.version}},
url = {https://huggingface.co/${data.modelName}}
}
\`\`\`

## Model Card Contact

For questions or issues, please open an issue on the repository.
`;
        const cardPath = path.join(outputDir, "README.md");
        await fs.writeFile(cardPath, card);
    }
    /**
     * Generate benchmark results table (Markdown).
     *
     * Benchmark IDs longer than 20 characters are truncated and marked with
     * an ellipsis; shorter IDs are shown unchanged. Pipe characters in an ID
     * are escaped so they cannot break the table, and an unparsable `runAt`
     * renders as `unknown` instead of throwing.
     *
     * @param {Array<{benchmarkId: string, runAt: string, metrics: object}>} results
     * @returns {string} Markdown table, or an empty string when there are no results.
     */
    generateBenchmarkTable(results) {
        if (results.length === 0) {
            return "";
        }
        const maxIdLength = 20;
        const header = "| Benchmark | Date | P&L | Accuracy | Win Rate | Optimality |\n" +
            "|-----------|------|-----|----------|----------|------------|\n";
        const rows = results.map(({ benchmarkId, runAt, metrics }) => {
            const id = String(benchmarkId);
            const shortId = id.length > maxIdLength ? `${id.substring(0, maxIdLength)}...` : id;
            const label = shortId.replaceAll("|", "\\|");
            const runDate = new Date(runAt);
            const date = Number.isNaN(runDate.getTime())
                ? "unknown"
                : runDate.toISOString().split("T")[0];
            const accuracy = (metrics.predictionMetrics.accuracy * 100).toFixed(1);
            const winRate = (metrics.perpMetrics.winRate * 100).toFixed(1);
            return `| ${label} | ${date} | ${metrics.totalPnl.toFixed(2)} | ${accuracy}% | ${winRate}% | ${metrics.optimalityScore.toFixed(1)} |\n`;
        });
        return header + rows.join("");
    }
    /**
     * Upload files to HuggingFace Hub.
     * Uses shared utility for consistent upload behavior.
     *
     * @param {string} modelName Target repository name.
     * @param {string} localDir Directory whose contents are uploaded.
     * @param {boolean} _isPrivate Requested visibility. It is NOT forwarded to
     *   the upload utility; a warning is logged when it is set.
     * @returns {Promise<number>} Value returned by `HuggingFaceUploadUtil.uploadDirectory`
     *   (used by `uploadModel` as the uploaded file count).
     * @throws {Error} When no token is configured, or when the upload fails
     *   (the original error is rethrown after logging manual instructions).
     */
    async uploadToHub(modelName, localDir, _isPrivate) {
        if (!this.huggingFaceToken) {
            throw new Error("HuggingFace token not configured");
        }
        if (_isPrivate) {
            // NOTE(review): the visibility flag has no effect here — confirm how
            // the upload utility decides repository visibility.
            logger.warn("Private visibility was requested but is not forwarded to the upload utility; verify the repository visibility on HuggingFace", { modelName });
        }
        try {
            // Use shared upload utility
            return await HuggingFaceUploadUtil.uploadDirectory(modelName, "model", localDir, this.huggingFaceToken);
        }
        catch (error) {
            logger.error("Failed to upload to HuggingFace Hub", { error });
            // Provide helpful manual upload instructions
            const instructions = HuggingFaceUploadUtil.getManualUploadInstructions(modelName, "model", localDir);
            logger.info("To upload manually:", { instructions });
            throw error;
        }
    }
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HuggingFace Integration Module
|
|
3
|
+
*
|
|
4
|
+
* Tools for uploading models and datasets to HuggingFace Hub.
|
|
5
|
+
*/
|
|
6
|
+
export { HuggingFaceDatasetUploader } from "./HuggingFaceDatasetUploader";
|
|
7
|
+
export { HuggingFaceIntegrationService, huggingFaceIntegration, } from "./HuggingFaceIntegrationService";
|
|
8
|
+
export { HuggingFaceModelUploader } from "./HuggingFaceModelUploader";
|
|
9
|
+
export { getHuggingFaceToken, HuggingFaceUploadUtil, requireHuggingFaceToken, } from "./shared/HuggingFaceUploadUtil";
|