@elizaos/training 2.0.0-alpha.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. package/Dockerfile +75 -0
  2. package/LICENSE +21 -0
  3. package/Makefile +374 -0
  4. package/README.md +346 -0
  5. package/config/rubrics.json +137 -0
  6. package/docker-compose.test.yml +57 -0
  7. package/package.json +57 -0
  8. package/python/config/babylon_atropos.yaml +90 -0
  9. package/python/config/profiles/12gb.json +11 -0
  10. package/python/config/profiles/16gb.json +10 -0
  11. package/python/config/profiles/24gb.json +10 -0
  12. package/python/config/profiles/48gb.json +10 -0
  13. package/python/config/profiles/cpu.json +11 -0
  14. package/python/config/profiles/l40-2gpu-safe.json +20 -0
  15. package/python/config/profiles/l40-2gpu.json +22 -0
  16. package/python/config/profiles/l40-4gpu.json +21 -0
  17. package/python/config/profiles/l40.json +17 -0
  18. package/python/config/tinker_training.yaml +143 -0
  19. package/python/curriculum_state.json +165 -0
  20. package/python/env.template +86 -0
  21. package/python/env.training.template +46 -0
  22. package/python/pyproject.toml +41 -0
  23. package/python/requirements-ci.txt +31 -0
  24. package/python/requirements.txt +87 -0
  25. package/python/scripts/__init__.py +4 -0
  26. package/python/scripts/benchmark_should_respond.py +190 -0
  27. package/python/scripts/debug_inference.py +62 -0
  28. package/python/scripts/import_json_trajectories.py +412 -0
  29. package/python/scripts/local-finetune/README.md +63 -0
  30. package/python/scripts/local-finetune/ingest_and_score.py +139 -0
  31. package/python/scripts/local-finetune/merge_model.py +32 -0
  32. package/python/scripts/local-finetune/test_adapter.py +91 -0
  33. package/python/scripts/local-finetune/train_from_csv.py +132 -0
  34. package/python/scripts/merge_trajectories.py +318 -0
  35. package/python/scripts/optimize_prompt_grpo.py +269 -0
  36. package/python/scripts/run_ab_test.py +143 -0
  37. package/python/scripts/run_full_pipeline.py +544 -0
  38. package/python/scripts/run_tinker_training.py +192 -0
  39. package/python/scripts/run_training.py +914 -0
  40. package/python/scripts/test_generation.py +29 -0
  41. package/python/scripts/test_judge.py +155 -0
  42. package/python/scripts/test_pipeline.py +356 -0
  43. package/python/scripts/test_trained_model.py +380 -0
  44. package/python/scripts/train_grpo.py +360 -0
  45. package/python/scripts/train_jsonl.py +223 -0
  46. package/python/scripts/train_local.py +528 -0
  47. package/python/setup.py +20 -0
  48. package/python/src/__init__.py +190 -0
  49. package/python/src/data_bridge/__init__.py +24 -0
  50. package/python/src/data_bridge/converter.py +435 -0
  51. package/python/src/data_bridge/reader.py +393 -0
  52. package/python/src/models.py +283 -0
  53. package/python/src/training/__init__.py +605 -0
  54. package/python/src/training/ab_testing.py +404 -0
  55. package/python/src/training/action_executor.py +621 -0
  56. package/python/src/training/archetype_trainer.py +347 -0
  57. package/python/src/training/atropos_trainer.py +980 -0
  58. package/python/src/training/babylon_env.py +1254 -0
  59. package/python/src/training/error_recovery.py +647 -0
  60. package/python/src/training/evaluation.py +856 -0
  61. package/python/src/training/fast_simulator.py +880 -0
  62. package/python/src/training/format_validator.py +584 -0
  63. package/python/src/training/hybrid_env.py +522 -0
  64. package/python/src/training/kl_controller.py +628 -0
  65. package/python/src/training/multi_prompt_dataset.py +883 -0
  66. package/python/src/training/multi_turn.py +656 -0
  67. package/python/src/training/online_env.py +1084 -0
  68. package/python/src/training/quality_scorer.py +391 -0
  69. package/python/src/training/quality_utils.py +633 -0
  70. package/python/src/training/rewards.py +1344 -0
  71. package/python/src/training/rlaif_env.py +17 -0
  72. package/python/src/training/rollout_generator.py +502 -0
  73. package/python/src/training/rubric_loader.py +198 -0
  74. package/python/src/training/scenario_pool.py +1072 -0
  75. package/python/src/training/schemas.py +481 -0
  76. package/python/src/training/service_manager.py +552 -0
  77. package/python/src/training/simulation_bridge.py +535 -0
  78. package/python/src/training/tick_reward_attribution.py +399 -0
  79. package/python/src/training/tinker_client.py +575 -0
  80. package/python/src/training/tinker_trainer.py +646 -0
  81. package/python/src/training/tokenization_utils.py +402 -0
  82. package/python/tests/e2e/__init__.py +13 -0
  83. package/python/tests/e2e/conftest.py +258 -0
  84. package/python/tests/e2e/test_full_pipeline.py +643 -0
  85. package/python/tests/e2e/test_online_training_e2e.py +365 -0
  86. package/python/tests/integration/__init__.py +12 -0
  87. package/python/tests/integration/conftest.py +383 -0
  88. package/python/tests/integration/test_db_integration.py +649 -0
  89. package/python/tests/integration/test_json_mode_integration.py +554 -0
  90. package/python/tests/test_action_executor.py +594 -0
  91. package/python/tests/test_archetype_scoring.py +1027 -0
  92. package/python/tests/test_atropos_integration.py +360 -0
  93. package/python/tests/test_evaluation.py +727 -0
  94. package/python/tests/test_format_validator.py +486 -0
  95. package/python/tests/test_kl_controller.py +432 -0
  96. package/python/tests/test_lr_scheduler.py +579 -0
  97. package/python/tests/test_multi_turn.py +590 -0
  98. package/python/tests/test_online_env.py +519 -0
  99. package/python/tests/test_quality_scorer.py +474 -0
  100. package/python/tests/test_scenario_pool.py +735 -0
  101. package/python/tests/test_service_manager.py +585 -0
  102. package/python/tests/test_simulation_rollout.py +581 -0
  103. package/python/tests/test_tokenization_utils.py +501 -0
  104. package/python/tests/test_training_orchestrator.py +497 -0
  105. package/python/tests/test_training_output_structure.py +661 -0
  106. package/research-output/training-runs/training-run-1770772042899.json +26 -0
  107. package/research-output/training-runs/training-run-1770930079670.json +32 -0
  108. package/research-output/training-runs/training-run-1770930143700.json +44 -0
  109. package/research-output/training-runs/training-run-1770930183638.json +38 -0
  110. package/research-output/training-runs/training-run-1770930442049.json +38 -0
  111. package/research-output/training-runs/training-run-1770930793243.json +38 -0
  112. package/research-output/training-runs/training-run-1771276293257.json +38 -0
  113. package/research-output/training-runs/training-run-1771276389280.json +38 -0
  114. package/research-output/training-runs/training-run-1771276502776.json +38 -0
  115. package/research-output/training-runs/training-run-1771277340748.json +38 -0
  116. package/research-output/training-runs/training-run-1773013658993.json +38 -0
  117. package/research-output/training-runs/training-run-1773013861014.json +38 -0
  118. package/research-output/training-runs/training-run-1773014215983.json +38 -0
  119. package/scripts/assess-training-data.ts +422 -0
  120. package/scripts/e2e-training-test.ts +550 -0
  121. package/scripts/export-rubrics.ts +64 -0
  122. package/scripts/generate-research-report.ts +1523 -0
  123. package/scripts/generate_dataset.sh +173 -0
  124. package/scripts/generate_should_respond.ts +267 -0
  125. package/scripts/generate_should_respond_dataset.ts +162 -0
  126. package/scripts/json-mode-benchmark.ts +399 -0
  127. package/scripts/rank_trajectories.ts +207 -0
  128. package/scripts/real-archetype-benchmark.ts +210 -0
  129. package/scripts/run-baseline-comparison.ts +116 -0
  130. package/scripts/run-full-pipeline.ts +272 -0
  131. package/scripts/run_rlaif_loop.ts +78 -0
  132. package/scripts/run_task_benchmark.ts +247 -0
  133. package/scripts/runpod_setup.sh +137 -0
  134. package/scripts/runpod_validate.sh +147 -0
  135. package/scripts/test-model-in-game.ts +955 -0
  136. package/scripts/test-scoring.ts +73 -0
  137. package/scripts/test-trained-model.ts +209 -0
  138. package/scripts/train-and-test.ts +824 -0
  139. package/scripts/verify-final.ts +118 -0
  140. package/src/adapter.ts +516 -0
  141. package/src/archetypes/ArchetypeConfigService.ts +626 -0
  142. package/src/archetypes/derive-archetype.ts +249 -0
  143. package/src/archetypes/index.ts +22 -0
  144. package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
  145. package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
  146. package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
  147. package/src/benchmark/BenchmarkDataViewer.ts +324 -0
  148. package/src/benchmark/BenchmarkHistoryService.ts +221 -0
  149. package/src/benchmark/BenchmarkRunner.ts +685 -0
  150. package/src/benchmark/BenchmarkValidator.ts +204 -0
  151. package/src/benchmark/FastEvalRunner.ts +225 -0
  152. package/src/benchmark/MetricsValidator.ts +165 -0
  153. package/src/benchmark/MetricsVisualizer.ts +909 -0
  154. package/src/benchmark/ModelBenchmarkService.ts +611 -0
  155. package/src/benchmark/ModelRegistry.ts +158 -0
  156. package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
  157. package/src/benchmark/SimulationA2AInterface.ts +1169 -0
  158. package/src/benchmark/SimulationEngine.ts +832 -0
  159. package/src/benchmark/TaskRunner.ts +94 -0
  160. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
  161. package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
  162. package/src/benchmark/index.ts +91 -0
  163. package/src/benchmark/parseSimulationMetrics.ts +124 -0
  164. package/src/benchmark/simulation-types.ts +78 -0
  165. package/src/dependencies.ts +475 -0
  166. package/src/generation/TrajectoryGenerator.ts +387 -0
  167. package/src/generation/index.ts +12 -0
  168. package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
  169. package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
  170. package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
  171. package/src/huggingface/index.ts +27 -0
  172. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
  173. package/src/index.ts +102 -0
  174. package/src/init-training.ts +53 -0
  175. package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
  176. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
  177. package/src/metrics/index.ts +8 -0
  178. package/src/metrics/types.ts +200 -0
  179. package/src/rubrics/__tests__/index.test.ts +184 -0
  180. package/src/rubrics/ass-kisser.ts +85 -0
  181. package/src/rubrics/degen.ts +80 -0
  182. package/src/rubrics/goody-twoshoes.ts +84 -0
  183. package/src/rubrics/index.ts +236 -0
  184. package/src/rubrics/information-trader.ts +84 -0
  185. package/src/rubrics/infosec.ts +101 -0
  186. package/src/rubrics/liar.ts +104 -0
  187. package/src/rubrics/perps-trader.ts +87 -0
  188. package/src/rubrics/researcher.ts +81 -0
  189. package/src/rubrics/scammer.ts +82 -0
  190. package/src/rubrics/social-butterfly.ts +73 -0
  191. package/src/rubrics/super-predictor.ts +97 -0
  192. package/src/rubrics/trader.ts +67 -0
  193. package/src/scoring/ArchetypeScoringService.ts +486 -0
  194. package/src/scoring/JudgePromptBuilder.ts +556 -0
  195. package/src/scoring/LLMJudgeCache.ts +401 -0
  196. package/src/scoring/index.ts +9 -0
  197. package/src/training/AutomationPipeline.ts +916 -0
  198. package/src/training/BenchmarkService.ts +518 -0
  199. package/src/training/ConfigValidator.ts +220 -0
  200. package/src/training/MarketOutcomesTracker.ts +187 -0
  201. package/src/training/ModelDeployer.ts +186 -0
  202. package/src/training/ModelFetcher.ts +76 -0
  203. package/src/training/ModelSelectionService.ts +341 -0
  204. package/src/training/ModelUsageVerifier.ts +160 -0
  205. package/src/training/MultiModelOrchestrator.ts +580 -0
  206. package/src/training/RLModelConfig.ts +407 -0
  207. package/src/training/RewardBackpropagationService.ts +149 -0
  208. package/src/training/RulerScoringService.ts +666 -0
  209. package/src/training/TrainingMonitor.ts +166 -0
  210. package/src/training/TrajectoryRecorder.ts +399 -0
  211. package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
  212. package/src/training/index.ts +100 -0
  213. package/src/training/logRLConfig.ts +34 -0
  214. package/src/training/pipeline.ts +129 -0
  215. package/src/training/storage/ModelStorageService.ts +279 -0
  216. package/src/training/storage/TrainingDataArchiver.ts +197 -0
  217. package/src/training/storage/index.ts +17 -0
  218. package/src/training/types.ts +207 -0
  219. package/src/training/window-utils.ts +138 -0
  220. package/src/utils/index.ts +101 -0
  221. package/src/utils/logger.ts +59 -0
  222. package/src/utils/snowflake.ts +17 -0
  223. package/src/utils/synthetic-detector.ts +111 -0
  224. package/tsconfig.json +20 -0
@@ -0,0 +1,909 @@
1
+ /**
2
+ * Metrics Visualizer
3
+ *
4
+ * Generates visualizations and reports from benchmark results:
5
+ * - P&L over time charts
6
+ * - Prediction accuracy graphs
7
+ * - Social metrics
8
+ * - Comparison tables
9
+ * - Performance scorecards
10
+ * - Head-to-Head Baseline vs Challenger reports
11
+ *
12
+ * Outputs HTML reports, JSON data, and ASCII terminal charts for analysis.
13
+ */
14
+
15
+ import { promises as fs } from 'fs';
16
+ import * as path from 'path';
17
+ import { logger } from '../utils/logger';
18
+ import type { BenchmarkComparisonResult } from './BenchmarkRunner';
19
+ import type { SimulationResult } from './SimulationEngine';
20
+
/**
 * Options describing where visualization output is written and which
 * output formats are requested.
 */
export interface VisualizationConfig {
  /** Directory that receives the generated files. */
  outputDir: string;

  /** Whether an HTML report is requested. */
  generateHtml: boolean;

  /** Whether CSV exports are requested. */
  generateCsv: boolean;

  /** Whether charts are requested (a chart library is required). */
  generateCharts: boolean;
}
34
+
35
+ export class MetricsVisualizer {
/**
 * Write the complete set of report files for one simulation run.
 *
 * Creates `config.outputDir` if needed, then writes `summary.html`,
 * `detailed.html` and `timeline.html`, optionally the CSV exports (only when
 * `config.generateCsv` is truthy), and finally `index.html`, which links the
 * other pages.
 *
 * NOTE(review): `config.generateHtml` and `config.generateCharts` are not
 * read by this method; the HTML pages are always written.
 *
 * @param result - Simulation run to visualize.
 * @param config - Output directory and format switches.
 */
static async visualizeSingleRun(
  result: SimulationResult,
  config: VisualizationConfig
): Promise<void> {
  // Small helper so each page is a single "render, then write" statement.
  const writePage = (fileName: string, html: string): Promise<void> =>
    fs.writeFile(path.join(config.outputDir, fileName), html);

  logger.info('Generating visualizations', { resultId: result.id });

  await fs.mkdir(config.outputDir, { recursive: true });

  // Pages are rendered and written strictly one after another.
  await writePage('summary.html', this.generateMetricsSummary(result));
  await writePage('detailed.html', this.generateDetailedMetrics(result));
  await writePage('timeline.html', this.generateActionTimeline(result));

  if (config.generateCsv) {
    await this.exportToCsv(result, config.outputDir);
  }

  // The index page goes last because it links to the pages written above.
  await writePage('index.html', this.generateMasterReport(result));

  logger.info('Visualizations generated', { outputDir: config.outputDir });
}
79
+
/**
 * Write the batch-mode comparison reports for a multi-run benchmark.
 *
 * Creates `config.outputDir` if needed, writes `comparison.html` and
 * `distribution.html`, and adds the CSV export when `config.generateCsv`
 * is truthy.
 *
 * @param comparison - Aggregated result of several benchmark runs.
 * @param config - Output directory and format switches.
 */
static async visualizeComparison(
  comparison: BenchmarkComparisonResult,
  config: VisualizationConfig
): Promise<void> {
  const writePage = (fileName: string, html: string): Promise<void> =>
    fs.writeFile(path.join(config.outputDir, fileName), html);

  logger.info('Generating comparison visualizations');

  await fs.mkdir(config.outputDir, { recursive: true });

  // Summary first, then the distribution charts.
  await writePage('comparison.html', this.generateComparisonSummary(comparison));
  await writePage(
    'distribution.html',
    this.generateDistributionCharts(comparison)
  );

  if (config.generateCsv) {
    await this.exportComparisonToCsv(comparison, config.outputDir);
  }

  logger.info('Comparison visualizations generated');
}
112
+
/**
 * Produce the head-to-head (baseline vs challenger) report.
 *
 * Prints the ASCII comparison to the terminal, then writes two files into
 * `outputDir` (created if needed): `comparison.json` with the per-run
 * figures, their deltas and the merged PnL history, and `report.txt` with
 * the same ASCII text that was printed.
 *
 * @param baseline - Run used as the reference.
 * @param challenger - Run being compared against the reference.
 * @param outputDir - Directory that receives the report files.
 */
static async generateComparisonReport(
  baseline: SimulationResult,
  challenger: SimulationResult,
  outputDir: string
): Promise<void> {
  logger.info('Generating head-to-head comparison report...');
  await fs.mkdir(outputDir, { recursive: true });

  // Terminal output comes first; the same text is persisted further down.
  const asciiReport = this.generateAsciiComparison(baseline, challenger);
  console.log(asciiReport);

  // Headline figures extracted identically for both runs.
  const scorecard = (run: SimulationResult) => ({
    agentId: run.agentId,
    pnl: run.metrics.totalPnl,
    accuracy: run.metrics.predictionMetrics.accuracy,
    winRate: run.metrics.perpMetrics.winRate,
    optimality: run.metrics.optimalityScore,
  });

  const base = baseline.metrics;
  const chal = challenger.metrics;

  const payload = {
    timestamp: new Date().toISOString(),
    benchmarkId: baseline.benchmarkId,
    baseline: scorecard(baseline),
    challenger: scorecard(challenger),
    // Deltas are always challenger minus baseline.
    delta: {
      pnl: chal.totalPnl - base.totalPnl,
      accuracy:
        chal.predictionMetrics.accuracy - base.predictionMetrics.accuracy,
      winRate: chal.perpMetrics.winRate - base.perpMetrics.winRate,
    },
    pnlHistory: this.mergePnlHistory(baseline, challenger),
  };

  const jsonPath = path.join(outputDir, 'comparison.json');
  await fs.writeFile(jsonPath, JSON.stringify(payload, null, 2));

  const textPath = path.join(outputDir, 'report.txt');
  await fs.writeFile(textPath, asciiReport);

  logger.info(`Comparison report saved to ${outputDir}`);
}
169
+
/**
 * Render the head-to-head result as a plain-text table for the terminal.
 *
 * The table shows a sample of the merged PnL history (about ten rows: the
 * stride is `max(1, floor(length / 10))`), followed by a FINAL row built
 * from each run's total PnL. The challenger is named the winner when its
 * total PnL is greater than or equal to the baseline's, so ties go to the
 * challenger.
 *
 * Public for testing purposes.
 *
 * @param baseline - Run used as the reference.
 * @param challenger - Run being compared against the reference.
 * @returns The formatted multi-line report.
 */
static generateAsciiComparison(
  baseline: SimulationResult,
  challenger: SimulationResult
): string {
  const finalGap = challenger.metrics.totalPnl - baseline.metrics.totalPnl;
  const victor = finalGap >= 0 ? 'Challenger (LLM)' : 'Baseline';

  const header = `
=== 🥊 HEAD-TO-HEAD RESULTS ===
Benchmark: ${baseline.benchmarkId}
Baseline: ${baseline.agentId} | Challenger: ${challenger.agentId}

💰 Cumulative PnL:
Tick | Baseline | Challenger | Delta
----------------------------------------------------------------------
`;

  // Sample the timeline so the table stays short enough for a terminal.
  const timeline = this.mergePnlHistory(baseline, challenger);
  const stride = Math.max(1, Math.floor(timeline.length / 10));

  const rows: string[] = [];
  for (let idx = 0; idx < timeline.length; idx += stride) {
    const sample = timeline[idx];
    if (!sample) continue; // Defensive: ignore a missing entry.

    const gap = sample.challenger - sample.baseline;
    const mark = gap >= 0 ? '+' : '';
    const tickCol = sample.tick.toString().padEnd(5);
    const baseCol = sample.baseline.toFixed(0).padEnd(21);
    const chalCol = sample.challenger.toFixed(0).padEnd(21);

    rows.push(
      `${tickCol} | $${baseCol} | $${chalCol} | ${mark}$${gap.toFixed(0)}\n`
    );
  }

  // The closing row uses the runs' total PnL with two decimals.
  const finalBaseCol = baseline.metrics.totalPnl.toFixed(2).padEnd(21);
  const finalChalCol = challenger.metrics.totalPnl.toFixed(2).padEnd(21);
  const finalGapText = finalGap.toFixed(2);
  const finalMark = finalGap >= 0 ? '+' : '';

  const footer = `
----------------------------------------------------------------------
FINAL | $${finalBaseCol} | $${finalChalCol} | ${finalMark}$${finalGapText}

🏆 WINNER: ${victor}
🚀 Alpha Generated: ${finalMark}$${finalGapText}
`;

  return header + rows.join('') + footer;
}
229
+
/**
 * Combine the PnL histories of two runs into one tick-indexed series.
 *
 * The result has as many entries as the longer of the two histories (zero
 * when neither run has one). Where a run has no entry at a given index,
 * that run's total PnL is used in its place.
 *
 * Public for testing purposes.
 *
 * @param baseline - Run used as the reference.
 * @param challenger - Run being compared against the reference.
 * @returns One `{ tick, baseline, challenger }` entry per index.
 */
static mergePnlHistory(
  baseline: SimulationResult,
  challenger: SimulationResult
): Array<{ tick: number; baseline: number; challenger: number }> {
  const baseSeries = baseline.pnlHistory;
  const chalSeries = challenger.pnlHistory;
  const span = Math.max(baseSeries?.length || 0, chalSeries?.length || 0);

  return Array.from({ length: span }, (_slot, tick) => {
    // Either entry may be undefined when one history is shorter or absent.
    const baseEntry = baseSeries?.[tick];
    const chalEntry = chalSeries?.[tick];

    return {
      tick,
      baseline: baseEntry ? baseEntry.pnl : baseline.metrics.totalPnl,
      challenger: chalEntry ? chalEntry.pnl : challenger.metrics.totalPnl,
    };
  });
}
261
+
262
+ // =========================================================================
263
+ // Existing Single-Run and Batch Visualizations
264
+ // =========================================================================
265
+
/**
 * Render the summary page for one run as a standalone HTML document.
 *
 * The page shows overall performance, prediction-market and perpetual-futures
 * figures, and social engagement counters taken from `result.metrics`.
 * The agent and benchmark identifiers are HTML-escaped before they are
 * placed in the page so that unusual identifiers cannot break the markup.
 *
 * @param result - Simulation run whose metrics are rendered.
 * @returns The complete HTML document as a string.
 */
private static generateMetricsSummary(result: SimulationResult): string {
  const { metrics } = result;

  // Minimal HTML entity encoding for values interpolated into the page.
  const escapeHtml = (value: unknown): string =>
    String(value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');

  return `
<!DOCTYPE html>
<html>
<head>
<title>Benchmark Metrics Summary</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
max-width: 1200px;
margin: 40px auto;
padding: 20px;
background: #f5f5f5;
}
.card {
background: white;
border-radius: 8px;
padding: 24px;
margin-bottom: 20px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.metric {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 20px;
}
.metric-group {
background: #f9f9f9;
padding: 16px;
border-radius: 6px;
}
.metric-group h3 {
margin-top: 0;
color: #333;
font-size: 14px;
text-transform: uppercase;
letter-spacing: 0.5px;
}
.metric-item {
display: flex;
justify-content: space-between;
padding: 8px 0;
border-bottom: 1px solid #eee;
}
.metric-item:last-child {
border-bottom: none;
}
.metric-label {
color: #666;
font-size: 14px;
}
.metric-value {
font-weight: 600;
font-size: 16px;
color: #333;
}
.metric-value.positive {
color: #10b981;
}
.metric-value.negative {
color: #ef4444;
}
.score-badge {
display: inline-block;
padding: 4px 12px;
border-radius: 12px;
font-size: 12px;
font-weight: 600;
}
.score-excellent { background: #d1fae5; color: #065f46; }
.score-good { background: #dbeafe; color: #1e40af; }
.score-fair { background: #fef3c7; color: #92400e; }
.score-poor { background: #fee2e2; color: #991b1b; }
h1 {
color: #111;
margin-bottom: 8px;
}
.subtitle {
color: #666;
margin-bottom: 32px;
}
</style>
</head>
<body>
<h1>📊 Benchmark Results</h1>
<p class="subtitle">Agent: ${escapeHtml(result.agentId)} | Benchmark: ${escapeHtml(result.benchmarkId)}</p>

<div class="card">
<h2>Overall Performance</h2>
<div class="metric-item">
<span class="metric-label">Total P&L</span>
<span class="metric-value ${metrics.totalPnl >= 0 ? 'positive' : 'negative'}">
${metrics.totalPnl >= 0 ? '+' : ''}$${metrics.totalPnl.toFixed(2)}
</span>
</div>
<div class="metric-item">
<span class="metric-label">Optimality Score</span>
<span class="metric-value">
${metrics.optimalityScore.toFixed(1)}%
${this.getScoreBadge(metrics.optimalityScore)}
</span>
</div>
<div class="metric-item">
<span class="metric-label">Total Duration</span>
<span class="metric-value">${(metrics.timing.totalDuration / 1000).toFixed(1)}s</span>
</div>
<div class="metric-item">
<span class="metric-label">Avg Response Time</span>
<span class="metric-value">${metrics.timing.avgResponseTime.toFixed(0)}ms</span>
</div>
</div>

<div class="card">
<div class="metric">
<div class="metric-group">
<h3>Prediction Markets</h3>
<div class="metric-item">
<span class="metric-label">Total Positions</span>
<span class="metric-value">${metrics.predictionMetrics.totalPositions}</span>
</div>
<div class="metric-item">
<span class="metric-label">Accuracy</span>
<span class="metric-value ${metrics.predictionMetrics.accuracy >= 0.6 ? 'positive' : ''}">${(metrics.predictionMetrics.accuracy * 100).toFixed(1)}%</span>
</div>
<div class="metric-item">
<span class="metric-label">Correct</span>
<span class="metric-value positive">${metrics.predictionMetrics.correctPredictions}</span>
</div>
<div class="metric-item">
<span class="metric-label">Incorrect</span>
<span class="metric-value negative">${metrics.predictionMetrics.incorrectPredictions}</span>
</div>
<div class="metric-item">
<span class="metric-label">Avg P&L per Position</span>
<span class="metric-value ${metrics.predictionMetrics.avgPnlPerPosition >= 0 ? 'positive' : 'negative'}">
${metrics.predictionMetrics.avgPnlPerPosition >= 0 ? '+' : ''}$${metrics.predictionMetrics.avgPnlPerPosition.toFixed(2)}
</span>
</div>
</div>

<div class="metric-group">
<h3>Perpetual Futures</h3>
<div class="metric-item">
<span class="metric-label">Total Trades</span>
<span class="metric-value">${metrics.perpMetrics.totalTrades}</span>
</div>
<div class="metric-item">
<span class="metric-label">Win Rate</span>
<span class="metric-value ${metrics.perpMetrics.winRate >= 0.5 ? 'positive' : ''}">${(metrics.perpMetrics.winRate * 100).toFixed(1)}%</span>
</div>
<div class="metric-item">
<span class="metric-label">Profitable Trades</span>
<span class="metric-value positive">${metrics.perpMetrics.profitableTrades}</span>
</div>
<div class="metric-item">
<span class="metric-label">Avg P&L per Trade</span>
<span class="metric-value ${metrics.perpMetrics.avgPnlPerTrade >= 0 ? 'positive' : 'negative'}">
${metrics.perpMetrics.avgPnlPerTrade >= 0 ? '+' : ''}$${metrics.perpMetrics.avgPnlPerTrade.toFixed(2)}
</span>
</div>
<div class="metric-item">
<span class="metric-label">Max Drawdown</span>
<span class="metric-value negative">$${metrics.perpMetrics.maxDrawdown.toFixed(2)}</span>
</div>
</div>
</div>
</div>

<div class="card">
<h2>Social Engagement</h2>
<div class="metric-item">
<span class="metric-label">Posts Created</span>
<span class="metric-value">${metrics.socialMetrics.postsCreated}</span>
</div>
<div class="metric-item">
<span class="metric-label">Groups Joined</span>
<span class="metric-value">${metrics.socialMetrics.groupsJoined}</span>
</div>
<div class="metric-item">
<span class="metric-label">Reputation Gained</span>
<span class="metric-value ${metrics.socialMetrics.reputationGained >= 0 ? 'positive' : 'negative'}">
${metrics.socialMetrics.reputationGained >= 0 ? '+' : ''}${metrics.socialMetrics.reputationGained}
</span>
</div>
</div>

<p style="text-align: center; color: #999; margin-top: 40px;">
Generated: ${new Date().toLocaleString()}
</p>
</body>
</html>`;
}
463
+
/**
 * Render the detailed action log for one run as a standalone HTML document.
 *
 * Each entry of `result.actions` becomes one table row showing its tick,
 * type, JSON-serialized data and duration. Every interpolated value is
 * HTML-escaped: action data is arbitrary JSON and may contain characters
 * such as `<` or `&` that would otherwise break the table or inject markup
 * into the report.
 *
 * @param result - Simulation run whose actions are listed.
 * @returns The complete HTML document as a string.
 */
private static generateDetailedMetrics(result: SimulationResult): string {
  // Minimal HTML entity encoding for values interpolated into the page.
  const escapeHtml = (value: unknown): string =>
    String(value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');

  const rows = result.actions
    .map(
      (action) => `
<tr>
<td>#${escapeHtml(action.tick)}</td>
<td>${escapeHtml(action.type)}</td>
<td><code>${escapeHtml(JSON.stringify(action.data))}</code></td>
<td>${escapeHtml(action.duration)}ms</td>
</tr>
`
    )
    .join('');

  return `
<!DOCTYPE html>
<html>
<head>
<title>Detailed Metrics</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
max-width: 1400px;
margin: 40px auto;
padding: 20px;
background: #f5f5f5;
}
table {
width: 100%;
background: white;
border-radius: 8px;
overflow: hidden;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
margin-bottom: 20px;
}
th, td {
padding: 12px;
text-align: left;
border-bottom: 1px solid #eee;
}
th {
background: #f9f9f9;
font-weight: 600;
font-size: 12px;
text-transform: uppercase;
letter-spacing: 0.5px;
color: #666;
}
tr:last-child td {
border-bottom: none;
}
.positive { color: #10b981; }
.negative { color: #ef4444; }
</style>
</head>
<body>
<h1>Detailed Action Log</h1>

<table>
<thead>
<tr>
<th>Tick</th>
<th>Type</th>
<th>Details</th>
<th>Duration</th>
</tr>
</thead>
<tbody>
${rows}
</tbody>
</table>
</body>
</html>`;
}
539
+
/**
 * Render the action timeline for one run as a standalone HTML document.
 *
 * Each entry of `result.actions` becomes one timeline card carrying its tick
 * in a `data-tick` attribute (shown through CSS), its type and its
 * JSON-serialized data. Every interpolated value is HTML-escaped, including
 * the attribute value, because action data is arbitrary JSON and may contain
 * characters that would otherwise break or inject markup.
 *
 * @param result - Simulation run whose actions are shown.
 * @returns The complete HTML document as a string.
 */
private static generateActionTimeline(result: SimulationResult): string {
  // Minimal HTML entity encoding; quotes are included so the helper is also
  // safe inside double-quoted attribute values.
  const escapeHtml = (value: unknown): string =>
    String(value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');

  const cards = result.actions
    .map(
      (action) => `
<div class="timeline-item" data-tick="${escapeHtml(action.tick)}">
<div class="action-type">${escapeHtml(action.type)}</div>
<div class="action-details">${escapeHtml(JSON.stringify(action.data))}</div>
</div>
`
    )
    .join('');

  return `
<!DOCTYPE html>
<html>
<head>
<title>Action Timeline</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
max-width: 1200px;
margin: 40px auto;
padding: 20px;
background: #f5f5f5;
}
.timeline {
position: relative;
padding: 20px 0;
}
.timeline-item {
background: white;
border-radius: 8px;
padding: 16px;
margin-bottom: 12px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
position: relative;
padding-left: 80px;
}
.timeline-item::before {
content: '#' attr(data-tick);
position: absolute;
left: 16px;
top: 16px;
font-weight: 600;
color: #666;
font-size: 14px;
}
.action-type {
font-weight: 600;
color: #333;
margin-bottom: 4px;
}
.action-details {
color: #666;
font-size: 14px;
}
</style>
</head>
<body>
<h1>Action Timeline</h1>
<div class="timeline">
${cards}
</div>
</body>
</html>`;
}
607
+
608
+ /**
609
+ * Generate master report
610
+ */
611
+ private static generateMasterReport(result: SimulationResult): string {
612
+ return `
613
+ <!DOCTYPE html>
614
+ <html>
615
+ <head>
616
+ <title>Benchmark Report</title>
617
+ <style>
618
+ body {
619
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
620
+ max-width: 800px;
621
+ margin: 40px auto;
622
+ padding: 20px;
623
+ background: #f5f5f5;
624
+ }
625
+ .nav {
626
+ background: white;
627
+ border-radius: 8px;
628
+ padding: 24px;
629
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
630
+ }
631
+ .nav a {
632
+ display: block;
633
+ padding: 12px 16px;
634
+ color: #333;
635
+ text-decoration: none;
636
+ border-radius: 6px;
637
+ margin-bottom: 8px;
638
+ transition: background 0.2s;
639
+ }
640
+ .nav a:hover {
641
+ background: #f9f9f9;
642
+ }
643
+ h1 {
644
+ color: #111;
645
+ }
646
+ </style>
647
+ </head>
648
+ <body>
649
+ <h1>📊 Benchmark Report</h1>
650
+ <p>Agent: <strong>${result.agentId}</strong></p>
651
+ <p>Benchmark: <strong>${result.benchmarkId}</strong></p>
652
+ <p>Date: ${new Date(result.startTime).toLocaleString()}</p>
653
+
654
+ <div class="nav">
655
+ <h2>Reports</h2>
656
+ <a href="summary.html">📈 Summary</a>
657
+ <a href="detailed.html">📋 Detailed Metrics</a>
658
+ <a href="timeline.html">⏱️ Action Timeline</a>
659
+ </div>
660
+ </body>
661
+ </html>`;
662
+ }
663
+
664
+ /**
665
+ * Generate comparison summary
666
+ */
667
+ private static generateComparisonSummary(
668
+ comparison: BenchmarkComparisonResult
669
+ ): string {
670
+ return `
671
+ <!DOCTYPE html>
672
+ <html>
673
+ <head>
674
+ <title>Benchmark Comparison</title>
675
+ <style>
676
+ body {
677
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
678
+ max-width: 1200px;
679
+ margin: 40px auto;
680
+ padding: 20px;
681
+ background: #f5f5f5;
682
+ }
683
+ .card {
684
+ background: white;
685
+ border-radius: 8px;
686
+ padding: 24px;
687
+ margin-bottom: 20px;
688
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
689
+ }
690
+ table {
691
+ width: 100%;
692
+ border-collapse: collapse;
693
+ }
694
+ th, td {
695
+ padding: 12px;
696
+ text-align: left;
697
+ border-bottom: 1px solid #eee;
698
+ }
699
+ th {
700
+ background: #f9f9f9;
701
+ font-weight: 600;
702
+ }
703
+ .positive { color: #10b981; }
704
+ .negative { color: #ef4444; }
705
+ </style>
706
+ </head>
707
+ <body>
708
+ <h1>Benchmark Comparison (${comparison.runs.length} runs)</h1>
709
+
710
+ <div class="card">
711
+ <h2>Summary Statistics</h2>
712
+ <table>
713
+ <tr>
714
+ <th>Metric</th>
715
+ <th>Average</th>
716
+ <th>Best</th>
717
+ <th>Worst</th>
718
+ </tr>
719
+ <tr>
720
+ <td>P&L</td>
721
+ <td class="${comparison.comparison.avgPnl >= 0 ? 'positive' : 'negative'}">$${comparison.comparison.avgPnl.toFixed(2)}</td>
722
+ <td>${comparison.comparison.bestRun}</td>
723
+ <td>${comparison.comparison.worstRun}</td>
724
+ </tr>
725
+ <tr>
726
+ <td>Accuracy</td>
727
+ <td>${(comparison.comparison.avgAccuracy * 100).toFixed(1)}%</td>
728
+ <td>-</td>
729
+ <td>-</td>
730
+ </tr>
731
+ <tr>
732
+ <td>Optimality</td>
733
+ <td>${comparison.comparison.avgOptimality.toFixed(1)}%</td>
734
+ <td>-</td>
735
+ <td>-</td>
736
+ </tr>
737
+ </table>
738
+ </div>
739
+
740
+ <div class="card">
741
+ <h2>Individual Runs</h2>
742
+ <table>
743
+ <thead>
744
+ <tr>
745
+ <th>Run</th>
746
+ <th>Total P&L</th>
747
+ <th>Accuracy</th>
748
+ <th>Optimality</th>
749
+ <th>Duration</th>
750
+ </tr>
751
+ </thead>
752
+ <tbody>
753
+ ${comparison.runs
754
+ .map(
755
+ (run, i) => `
756
+ <tr>
757
+ <td>Run ${i + 1}</td>
758
+ <td class="${run.metrics.totalPnl >= 0 ? 'positive' : 'negative'}">$${run.metrics.totalPnl.toFixed(2)}</td>
759
+ <td>${(run.metrics.predictionMetrics.accuracy * 100).toFixed(1)}%</td>
760
+ <td>${run.metrics.optimalityScore.toFixed(1)}%</td>
761
+ <td>${(run.metrics.timing.totalDuration / 1000).toFixed(1)}s</td>
762
+ </tr>
763
+ `
764
+ )
765
+ .join('')}
766
+ </tbody>
767
+ </table>
768
+ </div>
769
+ </body>
770
+ </html>`;
771
+ }
772
+
773
+ /**
774
+ * Generate distribution charts
775
+ */
776
+ private static generateDistributionCharts(
777
+ comparison: BenchmarkComparisonResult
778
+ ): string {
779
+ const pnls = comparison.runs.map((r) => r.metrics.totalPnl);
780
+ const accuracies = comparison.runs.map(
781
+ (r) => r.metrics.predictionMetrics.accuracy * 100
782
+ );
783
+
784
+ return `
785
+ <!DOCTYPE html>
786
+ <html>
787
+ <head>
788
+ <title>Performance Distribution</title>
789
+ <style>
790
+ body {
791
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
792
+ max-width: 1200px;
793
+ margin: 40px auto;
794
+ padding: 20px;
795
+ background: #f5f5f5;
796
+ }
797
+ .chart {
798
+ background: white;
799
+ border-radius: 8px;
800
+ padding: 24px;
801
+ margin-bottom: 20px;
802
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
803
+ }
804
+ .bar {
805
+ height: 30px;
806
+ background: #3b82f6;
807
+ border-radius: 4px;
808
+ margin-bottom: 8px;
809
+ display: flex;
810
+ align-items: center;
811
+ padding: 0 12px;
812
+ color: white;
813
+ font-size: 14px;
814
+ font-weight: 600;
815
+ }
816
+ </style>
817
+ </head>
818
+ <body>
819
+ <h1>Performance Distribution</h1>
820
+
821
+ <div class="chart">
822
+ <h2>P&L Distribution</h2>
823
+ ${pnls
824
+ .map(
825
+ (pnl, i) => `
826
+ <div class="bar" style="width: ${(Math.abs(pnl) / Math.max(...pnls.map(Math.abs))) * 100}%">
827
+ Run ${i + 1}: $${pnl.toFixed(2)}
828
+ </div>
829
+ `
830
+ )
831
+ .join('')}
832
+ </div>
833
+
834
+ <div class="chart">
835
+ <h2>Accuracy Distribution</h2>
836
+ ${accuracies
837
+ .map(
838
+ (acc, i) => `
839
+ <div class="bar" style="width: ${acc}%">
840
+ Run ${i + 1}: ${acc.toFixed(1)}%
841
+ </div>
842
+ `
843
+ )
844
+ .join('')}
845
+ </div>
846
+ </body>
847
+ </html>`;
848
+ }
849
+
850
+ /**
851
+ * Export to CSV
852
+ */
853
+ private static async exportToCsv(
854
+ result: SimulationResult,
855
+ outputDir: string
856
+ ): Promise<void> {
857
+ // Actions CSV
858
+ const actionsCsv = [
859
+ 'tick,type,data,duration',
860
+ ...result.actions.map(
861
+ (a) =>
862
+ `${a.tick},"${a.type}","${JSON.stringify(a.data).replace(/"/g, '""')}",${a.duration}`
863
+ ),
864
+ ].join('\n');
865
+
866
+ await fs.writeFile(path.join(outputDir, 'actions.csv'), actionsCsv);
867
+
868
+ // Metrics CSV
869
+ const metricsCsv = [
870
+ 'metric,value',
871
+ `total_pnl,${result.metrics.totalPnl}`,
872
+ `prediction_accuracy,${result.metrics.predictionMetrics.accuracy}`,
873
+ `perp_win_rate,${result.metrics.perpMetrics.winRate}`,
874
+ `optimality_score,${result.metrics.optimalityScore}`,
875
+ `avg_response_time,${result.metrics.timing.avgResponseTime}`,
876
+ ].join('\n');
877
+
878
+ await fs.writeFile(path.join(outputDir, 'metrics.csv'), metricsCsv);
879
+ }
880
+
881
+ /**
882
+ * Export comparison to CSV
883
+ */
884
+ private static async exportComparisonToCsv(
885
+ comparison: BenchmarkComparisonResult,
886
+ outputDir: string
887
+ ): Promise<void> {
888
+ const csv = [
889
+ 'run,total_pnl,accuracy,optimality,duration',
890
+ ...comparison.runs.map(
891
+ (run, i) =>
892
+ `${i + 1},${run.metrics.totalPnl},${run.metrics.predictionMetrics.accuracy},${run.metrics.optimalityScore},${run.metrics.timing.totalDuration}`
893
+ ),
894
+ ].join('\n');
895
+
896
+ await fs.writeFile(path.join(outputDir, 'comparison.csv'), csv);
897
+ }
898
+
899
+ /**
900
+ * Get score badge HTML
901
+ */
902
+ private static getScoreBadge(score: number): string {
903
+ if (score >= 80)
904
+ return '<span class="score-badge score-excellent">Excellent</span>';
905
+ if (score >= 60) return '<span class="score-badge score-good">Good</span>';
906
+ if (score >= 40) return '<span class="score-badge score-fair">Fair</span>';
907
+ return '<span class="score-badge score-poor">Poor</span>';
908
+ }
909
+ }