@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-lint.log +2 -0
- package/.turbo/turbo-typecheck.log +1 -0
- package/dist/.tsbuildinfo +1 -0
- package/dist/adapter.js +59 -0
- package/dist/archetypes/ArchetypeConfigService.js +510 -0
- package/dist/archetypes/derive-archetype.js +196 -0
- package/dist/archetypes/index.js +7 -0
- package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
- package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
- package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
- package/dist/benchmark/BenchmarkDataViewer.js +197 -0
- package/dist/benchmark/BenchmarkHistoryService.js +135 -0
- package/dist/benchmark/BenchmarkRunner.js +483 -0
- package/dist/benchmark/BenchmarkValidator.js +158 -0
- package/dist/benchmark/FastEvalRunner.js +133 -0
- package/dist/benchmark/MetricsValidator.js +104 -0
- package/dist/benchmark/MetricsVisualizer.js +775 -0
- package/dist/benchmark/ModelBenchmarkService.js +433 -0
- package/dist/benchmark/ModelRegistry.js +122 -0
- package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
- package/dist/benchmark/SimulationA2AInterface.js +683 -0
- package/dist/benchmark/SimulationEngine.js +522 -0
- package/dist/benchmark/TaskRunner.js +60 -0
- package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
- package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
- package/dist/benchmark/index.js +23 -0
- package/dist/benchmark/parseSimulationMetrics.js +86 -0
- package/dist/benchmark/simulation-types.js +1 -0
- package/dist/dependencies.js +197 -0
- package/dist/generation/TrajectoryGenerator.js +244 -0
- package/dist/generation/index.js +6 -0
- package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
- package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
- package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
- package/dist/huggingface/index.js +9 -0
- package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
- package/dist/index.js +41 -0
- package/dist/init-training.js +43 -0
- package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
- package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
- package/dist/metrics/index.js +7 -0
- package/dist/metrics/types.js +21 -0
- package/dist/rubrics/__tests__/index.test.js +150 -0
- package/dist/rubrics/ass-kisser.js +83 -0
- package/dist/rubrics/degen.js +78 -0
- package/dist/rubrics/goody-twoshoes.js +82 -0
- package/dist/rubrics/index.js +184 -0
- package/dist/rubrics/information-trader.js +82 -0
- package/dist/rubrics/infosec.js +99 -0
- package/dist/rubrics/liar.js +102 -0
- package/dist/rubrics/perps-trader.js +85 -0
- package/dist/rubrics/researcher.js +79 -0
- package/dist/rubrics/scammer.js +80 -0
- package/dist/rubrics/social-butterfly.js +71 -0
- package/dist/rubrics/super-predictor.js +95 -0
- package/dist/rubrics/trader.js +65 -0
- package/dist/scoring/ArchetypeScoringService.js +301 -0
- package/dist/scoring/JudgePromptBuilder.js +401 -0
- package/dist/scoring/LLMJudgeCache.js +263 -0
- package/dist/scoring/index.js +8 -0
- package/dist/training/AutomationPipeline.js +714 -0
- package/dist/training/BenchmarkService.js +370 -0
- package/dist/training/ConfigValidator.js +153 -0
- package/dist/training/MarketOutcomesTracker.js +142 -0
- package/dist/training/ModelDeployer.js +128 -0
- package/dist/training/ModelFetcher.js +48 -0
- package/dist/training/ModelSelectionService.js +248 -0
- package/dist/training/ModelUsageVerifier.js +106 -0
- package/dist/training/MultiModelOrchestrator.js +349 -0
- package/dist/training/RLModelConfig.js +295 -0
- package/dist/training/RewardBackpropagationService.js +117 -0
- package/dist/training/RulerScoringService.js +450 -0
- package/dist/training/TrainingMonitor.js +108 -0
- package/dist/training/TrajectoryRecorder.js +281 -0
- package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
- package/dist/training/index.js +30 -0
- package/dist/training/logRLConfig.js +29 -0
- package/dist/training/pipeline.js +80 -0
- package/dist/training/storage/ModelStorageService.js +190 -0
- package/dist/training/storage/TrainingDataArchiver.js +136 -0
- package/dist/training/storage/index.js +7 -0
- package/dist/training/types.js +6 -0
- package/dist/training/window-utils.js +100 -0
- package/dist/utils/index.js +73 -0
- package/dist/utils/logger.js +55 -0
- package/dist/utils/snowflake.js +15 -0
- package/dist/utils/synthetic-detector.js +67 -0
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773742857616.json +38 -0
- package/research-output/training-runs/training-run-1773742946977.json +38 -0
- package/research-output/training-runs/training-run-1773743278891.json +38 -0
- package/research-output/training-runs/training-run-1773743409754.json +38 -0
- package/research-output/training-runs/training-run-1773743651086.json +38 -0
- package/research-output/training-runs/training-run-1773743782883.json +38 -0
|
@@ -0,0 +1,463 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HuggingFace Dataset Uploader
|
|
3
|
+
*
|
|
4
|
+
* Prepares and uploads benchmark datasets to HuggingFace Hub for public access.
|
|
5
|
+
* Creates dataset cards with visualizations, metrics, and usage examples.
|
|
6
|
+
*/
|
|
7
|
+
import { promises as fs } from "node:fs";
|
|
8
|
+
import * as path from "node:path";
|
|
9
|
+
import { calculateArrayStats, logger } from "../utils";
|
|
10
|
+
import { getHuggingFaceToken, HuggingFaceUploadUtil, requireHuggingFaceToken, } from "./shared/HuggingFaceUploadUtil";
|
|
11
|
+
/**
 * Collects benchmark result files from disk, converts them into a dataset
 * directory (data.jsonl, metadata.json, summary.json, README.md) and uploads
 * that directory to a HuggingFace dataset repository.
 */
export class HuggingFaceDatasetUploader {
    /** Token passed to the shared upload utility; may be empty until uploadDataset() resolves it. */
    huggingFaceToken;
    /**
     * @param {string} [huggingFaceToken] - Explicit token. When falsy, the value
     *   returned by getHuggingFaceToken() is used instead.
     */
    constructor(huggingFaceToken) {
        this.huggingFaceToken = huggingFaceToken || getHuggingFaceToken();
    }
    /**
     * Build the benchmarkSnapshot object attached to every record.
     * tickInterval and markets are fixed values, exactly as in each collector branch.
     *
     * NOTE(review): ticks divides the duration by 60 while the dataset card
     * divides the same field by 60000 to print minutes; the unit of
     * totalDuration should be confirmed against the benchmark writer.
     *
     * @param {number} [totalDuration] - Duration read from the benchmark file.
     * @returns {{duration: number, tickInterval: number, markets: number, ticks: number}}
     */
    static #buildSnapshot(totalDuration) {
        const duration = totalDuration || 0;
        return {
            duration,
            tickInterval: 60,
            markets: 10,
            ticks: Math.floor(duration / 60),
        };
    }
    /** Append value to bucket only when it is a finite number. */
    static #pushIfFinite(bucket, value) {
        if (Number.isFinite(value)) {
            bucket.push(value);
        }
    }
    /** Arithmetic mean of values; NaN for an empty list so callers can render "N/A". */
    static #mean(values) {
        if (values.length === 0) {
            return Number.NaN;
        }
        return values.reduce((total, value) => total + value, 0) / values.length;
    }
    /** Format value * scale with a fixed number of digits, or "N/A" when it is not a finite number. */
    static #formatStat(value, digits, scale = 1, suffix = "") {
        return Number.isFinite(value)
            ? `${(value * scale).toFixed(digits)}${suffix}`
            : "N/A";
    }
    /**
     * Prepare and upload benchmark dataset to HuggingFace.
     *
     * Never rejects for upload failures: any error raised by a step is logged
     * and reported through the returned object.
     *
     * @param {object} options
     * @param {string} options.datasetName - Target dataset repository name.
     * @param {string} [options.version] - Defaults to generateVersion().
     * @param {string} [options.benchmarkDir] - Defaults to <cwd>/benchmarks.
     * @param {string} [options.outputDir] - Defaults to <cwd>/exports/huggingface/<version>.
     * @param {string} [options.description] - Description written to metadata and card.
     * @param {boolean} [options.private] - Repository visibility, defaults to false.
     * @returns {Promise<object>} On success { success: true, datasetUrl, version, filesUploaded };
     *   on failure { success: false, version, filesUploaded: 0, error }.
     */
    async uploadDataset(options) {
        try {
            logger.info("Starting HuggingFace dataset upload", {
                datasetName: options.datasetName,
            });
            // Validate token (requireHuggingFaceToken is expected to throw if none is set)
            const token = this.huggingFaceToken || requireHuggingFaceToken();
            this.huggingFaceToken = token;
            // Set defaults
            const version = options.version || this.generateVersion();
            const benchmarkDir = options.benchmarkDir || path.join(process.cwd(), "benchmarks");
            const outputDir = options.outputDir ||
                path.join(process.cwd(), "exports", "huggingface", version);
            // Step 1: Collect benchmark data
            logger.info("Collecting benchmark data", { benchmarkDir });
            const benchmarks = await this.collectBenchmarkData(benchmarkDir);
            logger.info(`Collected ${benchmarks.length} benchmark records`);
            if (benchmarks.length === 0) {
                throw new Error("No benchmark data found to upload");
            }
            // Step 2: Prepare dataset files
            logger.info("Preparing dataset files", { outputDir });
            await fs.mkdir(outputDir, { recursive: true });
            const metadata = await this.prepareDatasetFiles(benchmarks, outputDir, {
                datasetName: options.datasetName,
                version,
                description: options.description || "Autonomous agent benchmark results",
            });
            // Step 3: Generate dataset card
            logger.info("Generating dataset card");
            await this.generateDatasetCard(metadata, benchmarks, outputDir);
            // Step 4: Create repository if it doesn't exist
            logger.info("Ensuring repository exists", {
                datasetName: options.datasetName,
            });
            await this.ensureRepository(options.datasetName, options.private ?? false);
            // Step 5: Upload to HuggingFace
            logger.info("Uploading to HuggingFace", {
                datasetName: options.datasetName,
            });
            const filesUploaded = await this.uploadToHub(options.datasetName, outputDir, options.private ?? false);
            const datasetUrl = `https://huggingface.co/datasets/${options.datasetName}`;
            logger.info("Dataset uploaded successfully", {
                datasetUrl,
                filesUploaded,
            });
            return {
                success: true,
                datasetUrl,
                version,
                filesUploaded,
            };
        }
        catch (error) {
            logger.error("Failed to upload dataset", { error });
            return {
                success: false,
                version: options.version || "unknown",
                filesUploaded: 0,
                error: error instanceof Error ? error.message : "Unknown error",
            };
        }
    }
    /**
     * Collect benchmark data from files.
     *
     * Reads three optional locations under benchmarkDir, in this order:
     *   - model-comparison/comparison.json (one record per result with metrics)
     *   - baselines/baseline-*.json        (one record per file with metrics)
     *   - test-baselines/<name>/metrics.json (the file itself is the metrics object)
     * Missing locations are skipped; unreadable or malformed JSON rejects.
     *
     * @param {string} benchmarkDir - Root directory holding the benchmark output.
     * @returns {Promise<object[]>} Normalised benchmark records.
     */
    async collectBenchmarkData(benchmarkDir) {
        const records = [];
        // Collect from model-comparison directory
        const comparisonFile = path.join(benchmarkDir, "model-comparison", "comparison.json");
        if (await this.fileExists(comparisonFile)) {
            const data = JSON.parse(await fs.readFile(comparisonFile, "utf-8"));
            for (const result of data.results || []) {
                if (!result.metrics) {
                    continue;
                }
                // Same fallbacks as the baselines branch so a result without
                // model info does not abort the whole collection.
                const modelId = result.model?.modelId || "unknown";
                records.push({
                    benchmarkId: data.benchmark || "comparison",
                    modelId,
                    modelVersion: "baseline",
                    modelName: result.model?.displayName || modelId,
                    runAt: data.runAt,
                    metrics: result.metrics,
                    benchmarkSnapshot: HuggingFaceDatasetUploader.#buildSnapshot(result.metrics.timing?.totalDuration),
                });
            }
        }
        // Collect from baselines directory
        const baselinesDir = path.join(benchmarkDir, "baselines");
        if (await this.fileExists(baselinesDir)) {
            const files = await fs.readdir(baselinesDir);
            for (const file of files) {
                if (!file.endsWith(".json") || !file.startsWith("baseline-")) {
                    continue;
                }
                const data = JSON.parse(await fs.readFile(path.join(baselinesDir, file), "utf-8"));
                // Skip if no metrics
                if (!data.metrics) {
                    continue;
                }
                // Strip only the trailing extension; String.replace would remove
                // the first ".json" found anywhere in the name.
                const fallbackLabel = path.basename(file, ".json");
                records.push({
                    benchmarkId: data.benchmark?.id || data.benchmark?.path || fallbackLabel,
                    modelId: data.model?.modelId || "unknown",
                    modelVersion: data.model?.version || "baseline",
                    modelName: data.model?.displayName || data.model?.name || fallbackLabel,
                    runAt: data.runAt || new Date().toISOString(),
                    metrics: data.metrics,
                    benchmarkSnapshot: HuggingFaceDatasetUploader.#buildSnapshot(data.timing?.totalDuration || data.metrics.timing?.totalDuration),
                });
            }
        }
        // Collect from test-baselines directory
        const testBaselinesDir = path.join(benchmarkDir, "test-baselines");
        if (await this.fileExists(testBaselinesDir)) {
            const subdirs = await fs.readdir(testBaselinesDir);
            for (const subdir of subdirs) {
                const metricsFile = path.join(testBaselinesDir, subdir, "metrics.json");
                if (!(await this.fileExists(metricsFile))) {
                    continue;
                }
                const data = JSON.parse(await fs.readFile(metricsFile, "utf-8"));
                // Skip if no required fields. A P&L of exactly 0 is a real
                // result, so test for absence rather than falsiness.
                if (data.totalPnl == null && !data.predictionMetrics) {
                    continue;
                }
                records.push({
                    benchmarkId: data.benchmarkId || "test-benchmark",
                    modelId: subdir,
                    modelVersion: "test-baseline",
                    modelName: subdir,
                    runAt: data.runAt || new Date().toISOString(),
                    metrics: data,
                    benchmarkSnapshot: HuggingFaceDatasetUploader.#buildSnapshot(data.timing?.totalDuration),
                });
            }
        }
        return records;
    }
    /**
     * Prepare dataset files in HuggingFace format.
     *
     * Writes data.jsonl (one JSON record per line), metadata.json and
     * summary.json into outputDir.
     *
     * @param {object[]} benchmarks - Records from collectBenchmarkData().
     * @param {string} outputDir - Existing directory to write into.
     * @param {{datasetName: string, version: string, description: string}} options
     * @returns {Promise<object>} The metadata object that was written.
     */
    async prepareDatasetFiles(benchmarks, outputDir, options) {
        // Create data.jsonl with all benchmark records
        const jsonlPath = path.join(outputDir, "data.jsonl");
        const jsonlLines = benchmarks.map((b) => JSON.stringify(b)).join("\n");
        await fs.writeFile(jsonlPath, jsonlLines);
        // Create metadata.json
        const metadata = {
            datasetName: options.datasetName,
            version: options.version,
            description: options.description,
            createdAt: new Date().toISOString(),
            totalBenchmarks: benchmarks.length,
            models: Array.from(new Set(benchmarks.map((b) => b.modelName))),
            benchmarkTypes: Array.from(new Set(benchmarks.map((b) => b.benchmarkId))),
            license: "MIT",
        };
        const metadataPath = path.join(outputDir, "metadata.json");
        await fs.writeFile(metadataPath, JSON.stringify(metadata, null, 2));
        // Create summary statistics
        const summary = this.calculateSummaryStatistics(benchmarks);
        const summaryPath = path.join(outputDir, "summary.json");
        await fs.writeFile(summaryPath, JSON.stringify(summary, null, 2));
        return metadata;
    }
    /**
     * Generate README.md dataset card for HuggingFace.
     *
     * Branding text comes from TRAINING_BRAND_NAME, TRAINING_BRAND_ORG and
     * TRAINING_PLATFORM_NAME, each with an ElizaOS default.
     *
     * @param {object} metadata - Object returned by prepareDatasetFiles().
     * @param {object[]} benchmarks - Records from collectBenchmarkData().
     * @param {string} outputDir - Directory that receives README.md.
     * @returns {Promise<void>}
     */
    async generateDatasetCard(metadata, benchmarks, outputDir) {
        const summary = this.calculateSummaryStatistics(benchmarks);
        const brandName = process.env.TRAINING_BRAND_NAME || "ElizaOS";
        const brandOrg = process.env.TRAINING_BRAND_ORG || "ElizaOS Contributors";
        const platformName = process.env.TRAINING_PLATFORM_NAME || "ElizaOS-compatible runtimes";
        const brandTag = brandName.toLowerCase().replace(/\s+/g, "-");
        // Mean | Median | Std Dev | Min | Max cells for one metric; a statistic
        // that is not a finite number is printed as N/A instead of throwing.
        const statCells = (stats, digits, scale = 1, suffix = "") => ["mean", "median", "std", "min", "max"]
            .map((key) => HuggingFaceDatasetUploader.#formatStat(stats?.[key], digits, scale, suffix))
            .join(" | ");
        const firstSnapshot = benchmarks[0]?.benchmarkSnapshot;
        // NOTE(review): duration is divided by 60000 here (milliseconds -> minutes)
        // but by 60 when ticks are derived in collectBenchmarkData; confirm the unit.
        const durationMinutes = Math.floor((firstSnapshot?.duration || 0) / 60000);
        const card = `---
license: ${metadata.license}
task_categories:
- reinforcement-learning
- agent-evaluation
tags:
- ${brandTag}
- prediction-markets
- trading-agents
- benchmarks
size_categories:
- n<1K
---

# ${metadata.datasetName}

## Dataset Description

${metadata.description}

This dataset contains benchmark results for autonomous trading agents on prediction-market style environments. Each record includes comprehensive performance metrics, market conditions, and agent behavior data.

**Version:** ${metadata.version}
**Created:** ${metadata.createdAt}
**Total Benchmarks:** ${metadata.totalBenchmarks}
**Models Evaluated:** ${metadata.models.length}

## Dataset Statistics

### Overall Performance

| Metric | Mean | Median | Std Dev | Min | Max |
|--------|------|--------|---------|-----|-----|
| Total P&L | ${statCells(summary.pnl, 2)} |
| Prediction Accuracy | ${statCells(summary.accuracy, 1, 100, "%")} |
| Optimality Score | ${statCells(summary.optimality, 1)} |

### Model Leaderboard

${this.generateLeaderboardTable(benchmarks)}

## Dataset Structure

### Data Fields

- \`benchmarkId\`: Unique identifier for the benchmark scenario
- \`modelId\`: Model identifier
- \`modelVersion\`: Model version (baseline, trained, etc.)
- \`modelName\`: Human-readable model name
- \`runAt\`: ISO timestamp of benchmark execution
- \`metrics\`: Performance metrics object
  - \`totalPnl\`: Total profit/loss across all positions
  - \`predictionMetrics\`: Prediction market performance
    - \`totalPositions\`: Number of prediction positions taken
    - \`correctPredictions\`: Number of correct predictions
    - \`accuracy\`: Prediction accuracy (0-1)
  - \`perpMetrics\`: Perpetual trading performance
    - \`totalTrades\`: Number of perpetual trades
    - \`winRate\`: Win rate for perpetual trades
  - \`socialMetrics\`: Social engagement metrics
  - \`timing\`: Execution timing statistics
  - \`optimalityScore\`: How close to optimal play (0-100)

### Data Splits

This dataset does not have predefined splits. Use for model evaluation and comparison.

## Usage

### Load Dataset

\`\`\`python
from datasets import load_dataset

dataset = load_dataset("${metadata.datasetName}")
\`\`\`

### Example Analysis

\`\`\`python
import pandas as pd

# Load records, then flatten the nested metrics object into dotted columns
records = pd.read_json("hf://datasets/${metadata.datasetName}/data.jsonl", lines=True)
df = pd.json_normalize(records.to_dict(orient="records"))

# Compare models
model_performance = df.groupby('modelName').agg({
    'metrics.totalPnl': 'mean',
    'metrics.predictionMetrics.accuracy': 'mean',
    'metrics.optimalityScore': 'mean'
})

print(model_performance.sort_values('metrics.totalPnl', ascending=False))
\`\`\`

## Benchmark Details

### Environment

- **Platform:** ${platformName}
- **Market Types:** Prediction markets + perpetual futures
- **Tick Interval:** ${firstSnapshot?.tickInterval || 60} seconds
- **Duration:** ${durationMinutes} minutes

### Evaluation Metrics

1. **Total P&L:** Cumulative profit/loss across all positions
2. **Prediction Accuracy:** Percentage of correct market outcome predictions
3. **Perp Win Rate:** Percentage of profitable perpetual trades
4. **Optimality Score:** Alignment with theoretically optimal actions (0-100)
5. **Response Time:** Agent decision-making speed

## Citation

If you use this dataset in your research, please cite:

\`\`\`bibtex
@dataset{${brandTag}_benchmarks_${metadata.version.replace(/\./g, "_")},
  title = {${brandName} Agent Benchmarks},
  author = {${brandOrg}},
  year = {${new Date().getFullYear()}},
  version = {${metadata.version}},
  url = {https://huggingface.co/datasets/${metadata.datasetName}}
}
\`\`\`

## License

${metadata.license}

## Contact

For questions or issues, please open an issue on the repository.
`;
        const cardPath = path.join(outputDir, "README.md");
        await fs.writeFile(cardPath, card);
    }
    /**
     * Generate leaderboard table for dataset card.
     *
     * Records are grouped by modelName and ranked by average P&L, highest
     * first. A metric that is missing or not a finite number is left out of
     * that model's average; a model with no usable values for a metric shows
     * "N/A" and, for P&L, ranks last. "Runs" counts every record of the model.
     *
     * @param {object[]} benchmarks - Records from collectBenchmarkData().
     * @returns {string} Markdown table, each row terminated by a newline.
     */
    generateLeaderboardTable(benchmarks) {
        // Group by model and collect the usable metric values
        const modelStats = new Map();
        for (const benchmark of benchmarks) {
            let stats = modelStats.get(benchmark.modelName);
            if (!stats) {
                stats = { runs: 0, pnl: [], accuracy: [], optimality: [] };
                modelStats.set(benchmark.modelName, stats);
            }
            const metrics = benchmark.metrics;
            stats.runs += 1;
            HuggingFaceDatasetUploader.#pushIfFinite(stats.pnl, metrics?.totalPnl);
            HuggingFaceDatasetUploader.#pushIfFinite(stats.accuracy, metrics?.predictionMetrics?.accuracy);
            HuggingFaceDatasetUploader.#pushIfFinite(stats.optimality, metrics?.optimalityScore);
        }
        // NaN averages must not reach the comparator (it would become inconsistent)
        const sortKey = (value) => (Number.isFinite(value) ? value : Number.NEGATIVE_INFINITY);
        // Calculate averages and sort by P&L
        const leaderboard = Array.from(modelStats.entries())
            .map(([model, stats]) => ({
                model,
                avgPnl: HuggingFaceDatasetUploader.#mean(stats.pnl),
                avgAccuracy: HuggingFaceDatasetUploader.#mean(stats.accuracy),
                avgOptimality: HuggingFaceDatasetUploader.#mean(stats.optimality),
                runs: stats.runs,
            }))
            .sort((a, b) => {
                const left = sortKey(a.avgPnl);
                const right = sortKey(b.avgPnl);
                if (left === right) {
                    return 0;
                }
                return right > left ? 1 : -1;
            });
        let table = "| Rank | Model | Avg P&L | Accuracy | Optimality | Runs |\n";
        table += "|------|-------|---------|----------|------------|------|\n";
        leaderboard.forEach((entry, index) => {
            // A literal pipe in the model name would split the markdown cell
            const modelCell = String(entry.model).replace(/\|/g, "\\|");
            const pnlCell = HuggingFaceDatasetUploader.#formatStat(entry.avgPnl, 2);
            const accuracyCell = HuggingFaceDatasetUploader.#formatStat(entry.avgAccuracy, 1, 100, "%");
            const optimalityCell = HuggingFaceDatasetUploader.#formatStat(entry.avgOptimality, 1);
            table += `| ${index + 1} | ${modelCell} | ${pnlCell} | ${accuracyCell} | ${optimalityCell} | ${entry.runs} |\n`;
        });
        return table;
    }
    /**
     * Calculate summary statistics.
     *
     * Each metric is extracted from every record, values that are missing or
     * not finite numbers are dropped, and the ascending-sorted list is handed
     * to calculateArrayStats().
     *
     * @param {object[]} benchmarks - Records from collectBenchmarkData().
     * @returns {{pnl: object, accuracy: object, optimality: object}} Results of
     *   calculateArrayStats() for each metric.
     */
    calculateSummaryStatistics(benchmarks) {
        const sortedValues = (pick) => benchmarks
            .map((b) => pick(b.metrics))
            .filter((value) => Number.isFinite(value))
            .sort((a, b) => a - b);
        return {
            pnl: calculateArrayStats(sortedValues((m) => m?.totalPnl)),
            accuracy: calculateArrayStats(sortedValues((m) => m?.predictionMetrics?.accuracy)),
            optimality: calculateArrayStats(sortedValues((m) => m?.optimalityScore)),
        };
    }
    /**
     * Ensure repository exists on HuggingFace.
     * Uses shared utility for consistent behavior.
     *
     * @param {string} datasetName - Dataset repository name.
     * @param {boolean} isPrivate - Forwarded to the shared utility.
     * @throws {Error} When no token is configured.
     */
    async ensureRepository(datasetName, isPrivate) {
        if (!this.huggingFaceToken) {
            throw new Error("HuggingFace token not configured");
        }
        await HuggingFaceUploadUtil.ensureRepository(datasetName, "dataset", this.huggingFaceToken, isPrivate);
    }
    /**
     * Upload files to HuggingFace Hub.
     * Uses shared utility for consistent upload behavior.
     *
     * On failure the error is logged together with manual upload instructions
     * and then rethrown unchanged.
     *
     * @param {string} datasetName - Dataset repository name.
     * @param {string} localDir - Directory whose contents are uploaded.
     * @param {boolean} [_isPrivate] - Unused; kept for signature compatibility.
     * @returns {Promise<*>} Whatever HuggingFaceUploadUtil.uploadDirectory()
     *   resolves to (reported by uploadDataset() as filesUploaded).
     * @throws {Error} When no token is configured or the upload fails.
     */
    async uploadToHub(datasetName, localDir, _isPrivate) {
        if (!this.huggingFaceToken) {
            throw new Error("HuggingFace token not configured");
        }
        try {
            // Use shared upload utility
            return await HuggingFaceUploadUtil.uploadDirectory(datasetName, "dataset", localDir, this.huggingFaceToken);
        }
        catch (error) {
            logger.error("Failed to upload to HuggingFace Hub", { error });
            // Provide helpful manual upload instructions
            const instructions = HuggingFaceUploadUtil.getManualUploadInstructions(datasetName, "dataset", localDir);
            logger.info("To upload manually:", { instructions });
            throw error;
        }
    }
    /**
     * Generate version string (YYYY.MM.DD format) from the current local date.
     *
     * @returns {string}
     */
    generateVersion() {
        const now = new Date();
        const year = now.getFullYear();
        const month = String(now.getMonth() + 1).padStart(2, "0");
        const day = String(now.getDate()).padStart(2, "0");
        return `${year}.${month}.${day}`;
    }
    /**
     * Check if file exists.
     *
     * @param {string} filePath - File or directory path.
     * @returns {Promise<boolean>} true when fs.access() succeeds, false on any error.
     */
    async fileExists(filePath) {
        try {
            await fs.access(filePath);
            return true;
        }
        catch {
            return false;
        }
    }
}
|