@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. package/.turbo/turbo-lint.log +2 -0
  2. package/.turbo/turbo-typecheck.log +1 -0
  3. package/dist/.tsbuildinfo +1 -0
  4. package/dist/adapter.js +59 -0
  5. package/dist/archetypes/ArchetypeConfigService.js +510 -0
  6. package/dist/archetypes/derive-archetype.js +196 -0
  7. package/dist/archetypes/index.js +7 -0
  8. package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
  9. package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
  10. package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
  11. package/dist/benchmark/BenchmarkDataViewer.js +197 -0
  12. package/dist/benchmark/BenchmarkHistoryService.js +135 -0
  13. package/dist/benchmark/BenchmarkRunner.js +483 -0
  14. package/dist/benchmark/BenchmarkValidator.js +158 -0
  15. package/dist/benchmark/FastEvalRunner.js +133 -0
  16. package/dist/benchmark/MetricsValidator.js +104 -0
  17. package/dist/benchmark/MetricsVisualizer.js +775 -0
  18. package/dist/benchmark/ModelBenchmarkService.js +433 -0
  19. package/dist/benchmark/ModelRegistry.js +122 -0
  20. package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
  21. package/dist/benchmark/SimulationA2AInterface.js +683 -0
  22. package/dist/benchmark/SimulationEngine.js +522 -0
  23. package/dist/benchmark/TaskRunner.js +60 -0
  24. package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
  25. package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
  26. package/dist/benchmark/index.js +23 -0
  27. package/dist/benchmark/parseSimulationMetrics.js +86 -0
  28. package/dist/benchmark/simulation-types.js +1 -0
  29. package/dist/dependencies.js +197 -0
  30. package/dist/generation/TrajectoryGenerator.js +244 -0
  31. package/dist/generation/index.js +6 -0
  32. package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
  33. package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
  34. package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
  35. package/dist/huggingface/index.js +9 -0
  36. package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
  37. package/dist/index.js +41 -0
  38. package/dist/init-training.js +43 -0
  39. package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
  40. package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
  41. package/dist/metrics/index.js +7 -0
  42. package/dist/metrics/types.js +21 -0
  43. package/dist/rubrics/__tests__/index.test.js +150 -0
  44. package/dist/rubrics/ass-kisser.js +83 -0
  45. package/dist/rubrics/degen.js +78 -0
  46. package/dist/rubrics/goody-twoshoes.js +82 -0
  47. package/dist/rubrics/index.js +184 -0
  48. package/dist/rubrics/information-trader.js +82 -0
  49. package/dist/rubrics/infosec.js +99 -0
  50. package/dist/rubrics/liar.js +102 -0
  51. package/dist/rubrics/perps-trader.js +85 -0
  52. package/dist/rubrics/researcher.js +79 -0
  53. package/dist/rubrics/scammer.js +80 -0
  54. package/dist/rubrics/social-butterfly.js +71 -0
  55. package/dist/rubrics/super-predictor.js +95 -0
  56. package/dist/rubrics/trader.js +65 -0
  57. package/dist/scoring/ArchetypeScoringService.js +301 -0
  58. package/dist/scoring/JudgePromptBuilder.js +401 -0
  59. package/dist/scoring/LLMJudgeCache.js +263 -0
  60. package/dist/scoring/index.js +8 -0
  61. package/dist/training/AutomationPipeline.js +714 -0
  62. package/dist/training/BenchmarkService.js +370 -0
  63. package/dist/training/ConfigValidator.js +153 -0
  64. package/dist/training/MarketOutcomesTracker.js +142 -0
  65. package/dist/training/ModelDeployer.js +128 -0
  66. package/dist/training/ModelFetcher.js +48 -0
  67. package/dist/training/ModelSelectionService.js +248 -0
  68. package/dist/training/ModelUsageVerifier.js +106 -0
  69. package/dist/training/MultiModelOrchestrator.js +349 -0
  70. package/dist/training/RLModelConfig.js +295 -0
  71. package/dist/training/RewardBackpropagationService.js +117 -0
  72. package/dist/training/RulerScoringService.js +450 -0
  73. package/dist/training/TrainingMonitor.js +108 -0
  74. package/dist/training/TrajectoryRecorder.js +281 -0
  75. package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
  76. package/dist/training/index.js +30 -0
  77. package/dist/training/logRLConfig.js +29 -0
  78. package/dist/training/pipeline.js +80 -0
  79. package/dist/training/storage/ModelStorageService.js +190 -0
  80. package/dist/training/storage/TrainingDataArchiver.js +136 -0
  81. package/dist/training/storage/index.js +7 -0
  82. package/dist/training/types.js +6 -0
  83. package/dist/training/window-utils.js +100 -0
  84. package/dist/utils/index.js +73 -0
  85. package/dist/utils/logger.js +55 -0
  86. package/dist/utils/snowflake.js +15 -0
  87. package/dist/utils/synthetic-detector.js +67 -0
  88. package/package.json +2 -2
  89. package/research-output/training-runs/training-run-1773742857616.json +38 -0
  90. package/research-output/training-runs/training-run-1773742946977.json +38 -0
  91. package/research-output/training-runs/training-run-1773743278891.json +38 -0
  92. package/research-output/training-runs/training-run-1773743409754.json +38 -0
  93. package/research-output/training-runs/training-run-1773743651086.json +38 -0
  94. package/research-output/training-runs/training-run-1773743782883.json +38 -0
@@ -0,0 +1,775 @@
1
+ /**
2
+ * Metrics Visualizer
3
+ *
4
+ * Generates visualizations and reports from benchmark results:
5
+ * - P&L over time charts
6
+ * - Prediction accuracy graphs
7
+ * - Social metrics
8
+ * - Comparison tables
9
+ * - Performance scorecards
10
+ * - Head-to-Head Baseline vs Challenger reports
11
+ *
12
+ * Outputs HTML reports, JSON data, and ASCII terminal charts for analysis.
13
+ */
14
+ import { promises as fs } from "node:fs";
15
+ import * as path from "node:path";
16
+ import { logger } from "../utils/logger";
17
/** Replacement entities for the characters that are unsafe inside HTML text or attributes. */
const HTML_ESCAPES = {
  "&": "&amp;",
  "<": "&lt;",
  ">": "&gt;",
  '"': "&quot;",
  "'": "&#39;",
};

/** Convert any value to a string that is safe to place in HTML text or a quoted attribute. */
function escapeHtml(value) {
  return String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);
}

/**
 * Serialize an action payload for display/export.
 * JSON.stringify returns undefined (not a string) for undefined, functions and
 * symbols, so fall back to an empty string instead of letting callers crash.
 */
function stringifyData(data) {
  return JSON.stringify(data) ?? "";
}

/** Wrap a value in double quotes for CSV, doubling any embedded quotes. */
function csvQuote(value) {
  return `"${String(value).replace(/"/g, '""')}"`;
}

/** CSS class for a value that is coloured by its sign (zero counts as positive). */
function tone(value) {
  return value >= 0 ? "positive" : "negative";
}

/** Format a dollar amount with two decimals and a leading "+" when non-negative. */
function signedMoney(amount) {
  return `${amount >= 0 ? "+" : ""}$${amount.toFixed(2)}`;
}

/**
 * Render one label/value row of a summary card.
 * `label` and `valueHtml` are inserted as markup, so callers must pass either
 * trusted literals or already-escaped text.
 */
function metricRow(label, valueHtml, toneClass = "") {
  const valueClass = toneClass ? `metric-value ${toneClass}` : "metric-value";
  return `<div class="metric-item">
  <span class="metric-label">${label}</span>
  <span class="${valueClass}">${valueHtml}</span>
</div>`;
}

/**
 * Wrap page-specific CSS and body markup in the HTML shell shared by every report.
 * `title`, `css` and `body` are trusted markup supplied by this module.
 */
function renderPage({ title, maxWidth, css, body }) {
  return `
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>${title}</title>
<style>
body {
  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
  max-width: ${maxWidth}px;
  margin: 40px auto;
  padding: 20px;
  background: #f5f5f5;
}
${css}
</style>
</head>
<body>
${body}
</body>
</html>`;
}

/** Score thresholds (inclusive lower bounds) mapped to badge class suffix and label. */
const SCORE_BADGES = [
  { min: 80, cls: "excellent", label: "Excellent" },
  { min: 60, cls: "good", label: "Good" },
  { min: 40, cls: "fair", label: "Fair" },
];

/**
 * Static helpers that turn benchmark results into HTML reports, CSV/JSON
 * exports and an ASCII head-to-head chart.
 *
 * All dynamic text (agent ids, benchmark ids, action types and payloads, run
 * names) is HTML-escaped before being placed into a page.
 */
export class MetricsVisualizer {
  /**
   * Write the full report set for one run into `config.outputDir`:
   * summary.html, detailed.html, timeline.html, index.html and, when
   * `config.generateCsv` is truthy, actions.csv and metrics.csv.
   *
   * @param result Run result; reads `id`, `agentId`, `benchmarkId`, `startTime`, `actions` and `metrics`.
   * @param config Reads `outputDir` and `generateCsv`.
   */
  static async visualizeSingleRun(result, config) {
    logger.info("Generating visualizations", { resultId: result.id });
    const { outputDir } = config;
    await fs.mkdir(outputDir, { recursive: true });
    const pages = [
      ["summary.html", MetricsVisualizer.generateMetricsSummary(result)],
      ["detailed.html", MetricsVisualizer.generateDetailedMetrics(result)],
      ["timeline.html", MetricsVisualizer.generateActionTimeline(result)],
      ["index.html", MetricsVisualizer.generateMasterReport(result)],
    ];
    // The files are independent of each other, so the writes are issued together.
    const writes = pages.map(([file, html]) => fs.writeFile(path.join(outputDir, file), html));
    if (config.generateCsv) {
      writes.push(MetricsVisualizer.exportToCsv(result, outputDir));
    }
    await Promise.all(writes);
    logger.info("Visualizations generated", { outputDir });
  }

  /**
   * Write the batch-mode comparison pages (comparison.html, distribution.html)
   * and, when `config.generateCsv` is truthy, comparison.csv.
   *
   * @param comparison Reads `runs` and the `comparison` aggregate object.
   * @param config Reads `outputDir` and `generateCsv`.
   */
  static async visualizeComparison(comparison, config) {
    logger.info("Generating comparison visualizations");
    const { outputDir } = config;
    await fs.mkdir(outputDir, { recursive: true });
    const writes = [
      fs.writeFile(path.join(outputDir, "comparison.html"), MetricsVisualizer.generateComparisonSummary(comparison)),
      fs.writeFile(path.join(outputDir, "distribution.html"), MetricsVisualizer.generateDistributionCharts(comparison)),
    ];
    if (config.generateCsv) {
      writes.push(MetricsVisualizer.exportComparisonToCsv(comparison, outputDir));
    }
    await Promise.all(writes);
    logger.info("Comparison visualizations generated");
  }

  /**
   * Produce the head-to-head (baseline vs challenger) report: prints the ASCII
   * chart to stdout and writes comparison.json and report.txt to `outputDir`.
   *
   * @param baseline Baseline run result.
   * @param challenger Challenger run result.
   * @param outputDir Directory to create (recursively) and write into.
   */
  static async generateComparisonReport(baseline, challenger, outputDir) {
    logger.info("Generating head-to-head comparison report...");
    await fs.mkdir(outputDir, { recursive: true });

    const asciiReport = MetricsVisualizer.generateAsciiComparison(baseline, challenger);
    console.log(asciiReport);

    const pick = (run) => ({
      agentId: run.agentId,
      pnl: run.metrics.totalPnl,
      accuracy: run.metrics.predictionMetrics.accuracy,
      winRate: run.metrics.perpMetrics.winRate,
      optimality: run.metrics.optimalityScore,
    });
    const base = pick(baseline);
    const chal = pick(challenger);
    const jsonReport = {
      timestamp: new Date().toISOString(),
      benchmarkId: baseline.benchmarkId,
      baseline: base,
      challenger: chal,
      delta: {
        pnl: chal.pnl - base.pnl,
        accuracy: chal.accuracy - base.accuracy,
        winRate: chal.winRate - base.winRate,
      },
      pnlHistory: MetricsVisualizer.mergePnlHistory(baseline, challenger),
    };

    await Promise.all([
      fs.writeFile(path.join(outputDir, "comparison.json"), JSON.stringify(jsonReport, null, 2)),
      fs.writeFile(path.join(outputDir, "report.txt"), asciiReport),
    ]);
    logger.info(`Comparison report saved to ${outputDir}`);
  }

  /**
   * Build the plain-text head-to-head table (public for testing).
   *
   * Roughly ten evenly spaced ticks from the merged PnL history are listed,
   * followed by a FINAL row taken from each run's `metrics.totalPnl`.
   * A tie (delta of exactly 0) is reported as a Challenger win. Negative
   * amounts render with the sign after the dollar symbol (e.g. "$-5").
   *
   * @returns {string} The multi-line report.
   */
  static generateAsciiComparison(baseline, challenger) {
    const RULE = "-".repeat(70);
    const pnlDelta = challenger.metrics.totalPnl - baseline.metrics.totalPnl;
    const winner = pnlDelta >= 0 ? "Challenger (LLM)" : "Baseline";
    // Column widths mirror the data rows: 5 for the tick, "$" + 21 for each amount.
    const header = `${"Tick".padEnd(5)} | ${"Baseline".padEnd(22)} | ${"Challenger".padEnd(22)} | Delta`;

    let output = `
=== 🥊 HEAD-TO-HEAD RESULTS ===
Benchmark: ${baseline.benchmarkId}
Baseline: ${baseline.agentId} | Challenger: ${challenger.agentId}

💰 Cumulative PnL:
${header}
${RULE}
`;

    const history = MetricsVisualizer.mergePnlHistory(baseline, challenger);
    // Sample about ten rows so the table fits a terminal vertically.
    const step = Math.max(1, Math.floor(history.length / 10));
    for (let i = 0; i < history.length; i += step) {
      const point = history[i];
      const gap = point.challenger - point.baseline;
      const tickCol = point.tick.toString().padEnd(5);
      const baseCol = point.baseline.toFixed(0).padEnd(21);
      const chalCol = point.challenger.toFixed(0).padEnd(21);
      output += `${tickCol} | $${baseCol} | $${chalCol} | ${gap >= 0 ? "+" : ""}$${gap.toFixed(0)}\n`;
    }

    const finalBase = baseline.metrics.totalPnl.toFixed(2);
    const finalChal = challenger.metrics.totalPnl.toFixed(2);
    const finalDelta = `${pnlDelta >= 0 ? "+" : ""}$${pnlDelta.toFixed(2)}`;
    output += `
${RULE}
FINAL | $${finalBase.padEnd(21)} | $${finalChal.padEnd(21)} | ${finalDelta}

🏆 WINNER: ${winner}
🚀 Alpha Generated: ${finalDelta}
`;
    return output;
  }

  /**
   * Zip two runs' `pnlHistory` arrays into one timeline (public for testing).
   *
   * The result has one entry per index up to the longer history. Where a run
   * has no entry at an index (history missing or shorter), that run's
   * `metrics.totalPnl` is used instead.
   *
   * @returns {{tick: number, baseline: number, challenger: number}[]}
   */
  static mergePnlHistory(baseline, challenger) {
    const baseHistory = baseline.pnlHistory ?? [];
    const chalHistory = challenger.pnlHistory ?? [];
    const length = Math.max(baseHistory.length, chalHistory.length);
    return Array.from({ length }, (_, tick) => {
      const baseTick = baseHistory[tick];
      const chalTick = chalHistory[tick];
      return {
        tick,
        baseline: baseTick ? baseTick.pnl : baseline.metrics.totalPnl,
        challenger: chalTick ? chalTick.pnl : challenger.metrics.totalPnl,
      };
    });
  }

  // =========================================================================
  // Single-run and batch HTML pages
  // =========================================================================

  /**
   * Render the summary page for one run: overall performance, prediction
   * market and perpetual futures metrics, and social engagement.
   *
   * @param result Reads `agentId`, `benchmarkId` and `metrics`.
   * @returns {string} Complete HTML document.
   */
  static generateMetricsSummary(result) {
    const { metrics } = result;
    const { predictionMetrics: pred, perpMetrics: perp, socialMetrics: social, timing } = metrics;
    const percent = (ratio) => `${(ratio * 100).toFixed(1)}%`;

    const css = `.card {
  background: white;
  border-radius: 8px;
  padding: 24px;
  margin-bottom: 20px;
  box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.metric {
  display: grid;
  grid-template-columns: 1fr 1fr;
  gap: 20px;
}
.metric-group {
  background: #f9f9f9;
  padding: 16px;
  border-radius: 6px;
}
.metric-group h3 {
  margin-top: 0;
  color: #333;
  font-size: 14px;
  text-transform: uppercase;
  letter-spacing: 0.5px;
}
.metric-item {
  display: flex;
  justify-content: space-between;
  padding: 8px 0;
  border-bottom: 1px solid #eee;
}
.metric-item:last-child {
  border-bottom: none;
}
.metric-label {
  color: #666;
  font-size: 14px;
}
.metric-value {
  font-weight: 600;
  font-size: 16px;
  color: #333;
}
.metric-value.positive {
  color: #10b981;
}
.metric-value.negative {
  color: #ef4444;
}
.score-badge {
  display: inline-block;
  padding: 4px 12px;
  border-radius: 12px;
  font-size: 12px;
  font-weight: 600;
}
.score-excellent { background: #d1fae5; color: #065f46; }
.score-good { background: #dbeafe; color: #1e40af; }
.score-fair { background: #fef3c7; color: #92400e; }
.score-poor { background: #fee2e2; color: #991b1b; }
h1 {
  color: #111;
  margin-bottom: 8px;
}
.subtitle {
  color: #666;
  margin-bottom: 32px;
}`;

    const overallRows = [
      metricRow("Total P&L", signedMoney(metrics.totalPnl), tone(metrics.totalPnl)),
      metricRow(
        "Optimality Score",
        `${metrics.optimalityScore.toFixed(1)}% ${MetricsVisualizer.getScoreBadge(metrics.optimalityScore)}`,
      ),
      metricRow("Total Duration", `${(timing.totalDuration / 1000).toFixed(1)}s`),
      metricRow("Avg Response Time", `${timing.avgResponseTime.toFixed(0)}ms`),
    ];
    const predictionRows = [
      metricRow("Total Positions", escapeHtml(pred.totalPositions)),
      metricRow("Accuracy", percent(pred.accuracy), pred.accuracy >= 0.6 ? "positive" : ""),
      metricRow("Correct", escapeHtml(pred.correctPredictions), "positive"),
      metricRow("Incorrect", escapeHtml(pred.incorrectPredictions), "negative"),
      metricRow("Avg P&L per Position", signedMoney(pred.avgPnlPerPosition), tone(pred.avgPnlPerPosition)),
    ];
    const perpRows = [
      metricRow("Total Trades", escapeHtml(perp.totalTrades)),
      metricRow("Win Rate", percent(perp.winRate), perp.winRate >= 0.5 ? "positive" : ""),
      metricRow("Profitable Trades", escapeHtml(perp.profitableTrades), "positive"),
      metricRow("Avg P&L per Trade", signedMoney(perp.avgPnlPerTrade), tone(perp.avgPnlPerTrade)),
      metricRow("Max Drawdown", `$${perp.maxDrawdown.toFixed(2)}`, "negative"),
    ];
    const socialRows = [
      metricRow("Posts Created", escapeHtml(social.postsCreated)),
      metricRow("Groups Joined", escapeHtml(social.groupsJoined)),
      metricRow(
        "Reputation Gained",
        `${social.reputationGained >= 0 ? "+" : ""}${escapeHtml(social.reputationGained)}`,
        tone(social.reputationGained),
      ),
    ];

    const body = `<h1>📊 Benchmark Results</h1>
<p class="subtitle">Agent: ${escapeHtml(result.agentId)} | Benchmark: ${escapeHtml(result.benchmarkId)}</p>

<div class="card">
<h2>Overall Performance</h2>
${overallRows.join("\n")}
</div>

<div class="card">
<div class="metric">
<div class="metric-group">
<h3>Prediction Markets</h3>
${predictionRows.join("\n")}
</div>

<div class="metric-group">
<h3>Perpetual Futures</h3>
${perpRows.join("\n")}
</div>
</div>
</div>

<div class="card">
<h2>Social Engagement</h2>
${socialRows.join("\n")}
</div>

<p style="text-align: center; color: #999; margin-top: 40px;">
Generated: ${new Date().toLocaleString()}
</p>`;

    return renderPage({ title: "Benchmark Metrics Summary", maxWidth: 1200, css, body });
  }

  /**
   * Render the detailed action log: one table row per entry of `result.actions`
   * showing its tick, type, JSON-serialized data and duration.
   *
   * @returns {string} Complete HTML document.
   */
  static generateDetailedMetrics(result) {
    const css = `table {
  width: 100%;
  background: white;
  border-radius: 8px;
  overflow: hidden;
  box-shadow: 0 2px 4px rgba(0,0,0,0.1);
  margin-bottom: 20px;
}
th, td {
  padding: 12px;
  text-align: left;
  border-bottom: 1px solid #eee;
}
th {
  background: #f9f9f9;
  font-weight: 600;
  font-size: 12px;
  text-transform: uppercase;
  letter-spacing: 0.5px;
  color: #666;
}
tr:last-child td {
  border-bottom: none;
}
.positive { color: #10b981; }
.negative { color: #ef4444; }`;

    const rows = result.actions
      .map(
        (action) => `
<tr>
<td>#${escapeHtml(action.tick)}</td>
<td>${escapeHtml(action.type)}</td>
<td><code>${escapeHtml(stringifyData(action.data))}</code></td>
<td>${escapeHtml(action.duration)}ms</td>
</tr>
`,
      )
      .join("");

    const body = `<h1>Detailed Action Log</h1>

<table>
<thead>
<tr>
<th>Tick</th>
<th>Type</th>
<th>Details</th>
<th>Duration</th>
</tr>
</thead>
<tbody>
${rows}
</tbody>
</table>`;

    return renderPage({ title: "Detailed Metrics", maxWidth: 1400, css, body });
  }

  /**
   * Render the action timeline: one card per entry of `result.actions`, with
   * the tick shown through the `data-tick` attribute.
   *
   * @returns {string} Complete HTML document.
   */
  static generateActionTimeline(result) {
    const css = `.timeline {
  position: relative;
  padding: 20px 0;
}
.timeline-item {
  background: white;
  border-radius: 8px;
  padding: 16px;
  margin-bottom: 12px;
  box-shadow: 0 2px 4px rgba(0,0,0,0.1);
  position: relative;
  padding-left: 80px;
}
.timeline-item::before {
  content: '#' attr(data-tick);
  position: absolute;
  left: 16px;
  top: 16px;
  font-weight: 600;
  color: #666;
  font-size: 14px;
}
.action-type {
  font-weight: 600;
  color: #333;
  margin-bottom: 4px;
}
.action-details {
  color: #666;
  font-size: 14px;
}`;

    const items = result.actions
      .map(
        (action) => `
<div class="timeline-item" data-tick="${escapeHtml(action.tick)}">
<div class="action-type">${escapeHtml(action.type)}</div>
<div class="action-details">${escapeHtml(stringifyData(action.data))}</div>
</div>
`,
      )
      .join("");

    const body = `<h1>Action Timeline</h1>
<div class="timeline">
${items}
</div>`;

    return renderPage({ title: "Action Timeline", maxWidth: 1200, css, body });
  }

  /**
   * Render index.html: run identification plus links to the summary,
   * detailed and timeline pages written alongside it.
   *
   * @param result Reads `agentId`, `benchmarkId` and `startTime`.
   * @returns {string} Complete HTML document.
   */
  static generateMasterReport(result) {
    const css = `.nav {
  background: white;
  border-radius: 8px;
  padding: 24px;
  box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.nav a {
  display: block;
  padding: 12px 16px;
  color: #333;
  text-decoration: none;
  border-radius: 6px;
  margin-bottom: 8px;
  transition: background 0.2s;
}
.nav a:hover {
  background: #f9f9f9;
}
h1 {
  color: #111;
}`;

    const body = `<h1>📊 Benchmark Report</h1>
<p>Agent: <strong>${escapeHtml(result.agentId)}</strong></p>
<p>Benchmark: <strong>${escapeHtml(result.benchmarkId)}</strong></p>
<p>Date: ${escapeHtml(new Date(result.startTime).toLocaleString())}</p>

<div class="nav">
<h2>Reports</h2>
<a href="summary.html">📈 Summary</a>
<a href="detailed.html">📋 Detailed Metrics</a>
<a href="timeline.html">⏱️ Action Timeline</a>
</div>`;

    return renderPage({ title: "Benchmark Report", maxWidth: 800, css, body });
  }

  /**
   * Render the batch comparison page: aggregate statistics from
   * `comparison.comparison` and one table row per entry of `comparison.runs`.
   *
   * @returns {string} Complete HTML document.
   */
  static generateComparisonSummary(comparison) {
    const stats = comparison.comparison;
    const css = `.card {
  background: white;
  border-radius: 8px;
  padding: 24px;
  margin-bottom: 20px;
  box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
table {
  width: 100%;
  border-collapse: collapse;
}
th, td {
  padding: 12px;
  text-align: left;
  border-bottom: 1px solid #eee;
}
th {
  background: #f9f9f9;
  font-weight: 600;
}
.positive { color: #10b981; }
.negative { color: #ef4444; }`;

    const runRows = comparison.runs
      .map(
        ({ metrics }, i) => `
<tr>
<td>Run ${i + 1}</td>
<td class="${tone(metrics.totalPnl)}">$${metrics.totalPnl.toFixed(2)}</td>
<td>${(metrics.predictionMetrics.accuracy * 100).toFixed(1)}%</td>
<td>${metrics.optimalityScore.toFixed(1)}%</td>
<td>${(metrics.timing.totalDuration / 1000).toFixed(1)}s</td>
</tr>
`,
      )
      .join("");

    const body = `<h1>Benchmark Comparison (${comparison.runs.length} runs)</h1>

<div class="card">
<h2>Summary Statistics</h2>
<table>
<tr>
<th>Metric</th>
<th>Average</th>
<th>Best</th>
<th>Worst</th>
</tr>
<tr>
<td>P&L</td>
<td class="${tone(stats.avgPnl)}">$${stats.avgPnl.toFixed(2)}</td>
<td>${escapeHtml(stats.bestRun)}</td>
<td>${escapeHtml(stats.worstRun)}</td>
</tr>
<tr>
<td>Accuracy</td>
<td>${(stats.avgAccuracy * 100).toFixed(1)}%</td>
<td>-</td>
<td>-</td>
</tr>
<tr>
<td>Optimality</td>
<td>${stats.avgOptimality.toFixed(1)}%</td>
<td>-</td>
<td>-</td>
</tr>
</table>
</div>

<div class="card">
<h2>Individual Runs</h2>
<table>
<thead>
<tr>
<th>Run</th>
<th>Total P&L</th>
<th>Accuracy</th>
<th>Optimality</th>
<th>Duration</th>
</tr>
</thead>
<tbody>
${runRows}
</tbody>
</table>
</div>`;

    return renderPage({ title: "Benchmark Comparison", maxWidth: 1200, css, body });
  }

  /**
   * Render horizontal bar charts of per-run P&L and prediction accuracy.
   *
   * P&L bars are scaled against the largest absolute P&L among the runs; when
   * that maximum is 0 every bar gets width 0 rather than an invalid "NaN%".
   * Accuracy bars use the accuracy percentage directly as their width.
   *
   * @returns {string} Complete HTML document.
   */
  static generateDistributionCharts(comparison) {
    const pnls = comparison.runs.map((r) => r.metrics.totalPnl);
    const accuracies = comparison.runs.map((r) => r.metrics.predictionMetrics.accuracy * 100);
    // Computed once: the scale is the same for every bar.
    const maxAbsPnl = pnls.reduce((max, pnl) => Math.max(max, Math.abs(pnl)), 0);
    const pnlWidth = (pnl) => (maxAbsPnl > 0 ? (Math.abs(pnl) / maxAbsPnl) * 100 : 0);

    const css = `.chart {
  background: white;
  border-radius: 8px;
  padding: 24px;
  margin-bottom: 20px;
  box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.bar {
  height: 30px;
  background: #3b82f6;
  border-radius: 4px;
  margin-bottom: 8px;
  display: flex;
  align-items: center;
  padding: 0 12px;
  color: white;
  font-size: 14px;
  font-weight: 600;
}`;

    const pnlBars = pnls
      .map(
        (pnl, i) => `
<div class="bar" style="width: ${pnlWidth(pnl)}%">
Run ${i + 1}: $${pnl.toFixed(2)}
</div>
`,
      )
      .join("");
    const accuracyBars = accuracies
      .map(
        (acc, i) => `
<div class="bar" style="width: ${acc}%">
Run ${i + 1}: ${acc.toFixed(1)}%
</div>
`,
      )
      .join("");

    const body = `<h1>Performance Distribution</h1>

<div class="chart">
<h2>P&L Distribution</h2>
${pnlBars}
</div>

<div class="chart">
<h2>Accuracy Distribution</h2>
${accuracyBars}
</div>`;

    return renderPage({ title: "Performance Distribution", maxWidth: 1200, css, body });
  }

  /**
   * Write actions.csv (tick, type, data, duration per action) and metrics.csv
   * (headline metrics as metric,value pairs) into `outputDir`.
   *
   * The type and JSON data columns are always double-quoted with embedded
   * quotes doubled; an action without data exports an empty quoted field.
   */
  static async exportToCsv(result, outputDir) {
    const actionLines = result.actions.map(
      (a) => `${a.tick},${csvQuote(a.type)},${csvQuote(stringifyData(a.data))},${a.duration}`,
    );
    const actionsCsv = ["tick,type,data,duration", ...actionLines].join("\n");

    const { metrics } = result;
    const metricsCsv = [
      "metric,value",
      `total_pnl,${metrics.totalPnl}`,
      `prediction_accuracy,${metrics.predictionMetrics.accuracy}`,
      `perp_win_rate,${metrics.perpMetrics.winRate}`,
      `optimality_score,${metrics.optimalityScore}`,
      `avg_response_time,${metrics.timing.avgResponseTime}`,
    ].join("\n");

    await Promise.all([
      fs.writeFile(path.join(outputDir, "actions.csv"), actionsCsv),
      fs.writeFile(path.join(outputDir, "metrics.csv"), metricsCsv),
    ]);
  }

  /**
   * Write comparison.csv into `outputDir` with one line per run:
   * 1-based run number, total P&L, accuracy, optimality and total duration.
   */
  static async exportComparisonToCsv(comparison, outputDir) {
    const lines = comparison.runs.map(({ metrics }, i) =>
      [
        i + 1,
        metrics.totalPnl,
        metrics.predictionMetrics.accuracy,
        metrics.optimalityScore,
        metrics.timing.totalDuration,
      ].join(","),
    );
    const csv = ["run,total_pnl,accuracy,optimality,duration", ...lines].join("\n");
    await fs.writeFile(path.join(outputDir, "comparison.csv"), csv);
  }

  /**
   * Map a score to its badge markup: >= 80 Excellent, >= 60 Good,
   * >= 40 Fair, anything else (including NaN) Poor.
   *
   * @param {number} score
   * @returns {string} A `<span class="score-badge ...">` element.
   */
  static getScoreBadge(score) {
    const badge = SCORE_BADGES.find(({ min }) => score >= min) ?? { cls: "poor", label: "Poor" };
    return `<span class="score-badge score-${badge.cls}">${badge.label}</span>`;
  }
}