@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/.turbo/turbo-lint.log +2 -0
  2. package/.turbo/turbo-typecheck.log +1 -0
  3. package/dist/.tsbuildinfo +1 -0
  4. package/dist/adapter.js +59 -0
  5. package/dist/archetypes/ArchetypeConfigService.js +510 -0
  6. package/dist/archetypes/derive-archetype.js +196 -0
  7. package/dist/archetypes/index.js +7 -0
  8. package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
  9. package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
  10. package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
  11. package/dist/benchmark/BenchmarkDataViewer.js +197 -0
  12. package/dist/benchmark/BenchmarkHistoryService.js +135 -0
  13. package/dist/benchmark/BenchmarkRunner.js +483 -0
  14. package/dist/benchmark/BenchmarkValidator.js +158 -0
  15. package/dist/benchmark/FastEvalRunner.js +133 -0
  16. package/dist/benchmark/MetricsValidator.js +104 -0
  17. package/dist/benchmark/MetricsVisualizer.js +775 -0
  18. package/dist/benchmark/ModelBenchmarkService.js +433 -0
  19. package/dist/benchmark/ModelRegistry.js +122 -0
  20. package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
  21. package/dist/benchmark/SimulationA2AInterface.js +683 -0
  22. package/dist/benchmark/SimulationEngine.js +522 -0
  23. package/dist/benchmark/TaskRunner.js +60 -0
  24. package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
  25. package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
  26. package/dist/benchmark/index.js +23 -0
  27. package/dist/benchmark/parseSimulationMetrics.js +86 -0
  28. package/dist/benchmark/simulation-types.js +1 -0
  29. package/dist/dependencies.js +197 -0
  30. package/dist/generation/TrajectoryGenerator.js +244 -0
  31. package/dist/generation/index.js +6 -0
  32. package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
  33. package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
  34. package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
  35. package/dist/huggingface/index.js +9 -0
  36. package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
  37. package/dist/index.js +41 -0
  38. package/dist/init-training.js +43 -0
  39. package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
  40. package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
  41. package/dist/metrics/index.js +7 -0
  42. package/dist/metrics/types.js +21 -0
  43. package/dist/rubrics/__tests__/index.test.js +150 -0
  44. package/dist/rubrics/ass-kisser.js +83 -0
  45. package/dist/rubrics/degen.js +78 -0
  46. package/dist/rubrics/goody-twoshoes.js +82 -0
  47. package/dist/rubrics/index.js +184 -0
  48. package/dist/rubrics/information-trader.js +82 -0
  49. package/dist/rubrics/infosec.js +99 -0
  50. package/dist/rubrics/liar.js +102 -0
  51. package/dist/rubrics/perps-trader.js +85 -0
  52. package/dist/rubrics/researcher.js +79 -0
  53. package/dist/rubrics/scammer.js +80 -0
  54. package/dist/rubrics/social-butterfly.js +71 -0
  55. package/dist/rubrics/super-predictor.js +95 -0
  56. package/dist/rubrics/trader.js +65 -0
  57. package/dist/scoring/ArchetypeScoringService.js +301 -0
  58. package/dist/scoring/JudgePromptBuilder.js +401 -0
  59. package/dist/scoring/LLMJudgeCache.js +263 -0
  60. package/dist/scoring/index.js +8 -0
  61. package/dist/training/AutomationPipeline.js +714 -0
  62. package/dist/training/BenchmarkService.js +370 -0
  63. package/dist/training/ConfigValidator.js +153 -0
  64. package/dist/training/MarketOutcomesTracker.js +142 -0
  65. package/dist/training/ModelDeployer.js +128 -0
  66. package/dist/training/ModelFetcher.js +48 -0
  67. package/dist/training/ModelSelectionService.js +248 -0
  68. package/dist/training/ModelUsageVerifier.js +106 -0
  69. package/dist/training/MultiModelOrchestrator.js +349 -0
  70. package/dist/training/RLModelConfig.js +295 -0
  71. package/dist/training/RewardBackpropagationService.js +117 -0
  72. package/dist/training/RulerScoringService.js +450 -0
  73. package/dist/training/TrainingMonitor.js +108 -0
  74. package/dist/training/TrajectoryRecorder.js +281 -0
  75. package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
  76. package/dist/training/index.js +30 -0
  77. package/dist/training/logRLConfig.js +29 -0
  78. package/dist/training/pipeline.js +80 -0
  79. package/dist/training/storage/ModelStorageService.js +190 -0
  80. package/dist/training/storage/TrainingDataArchiver.js +136 -0
  81. package/dist/training/storage/index.js +7 -0
  82. package/dist/training/types.js +6 -0
  83. package/dist/training/window-utils.js +100 -0
  84. package/dist/utils/index.js +73 -0
  85. package/dist/utils/logger.js +55 -0
  86. package/dist/utils/snowflake.js +15 -0
  87. package/dist/utils/synthetic-detector.js +67 -0
  88. package/package.json +2 -2
  89. package/research-output/training-runs/training-run-1773742857616.json +38 -0
  90. package/research-output/training-runs/training-run-1773742946977.json +38 -0
  91. package/research-output/training-runs/training-run-1773743278891.json +38 -0
  92. package/research-output/training-runs/training-run-1773743409754.json +38 -0
  93. package/research-output/training-runs/training-run-1773743651086.json +38 -0
  94. package/research-output/training-runs/training-run-1773743782883.json +38 -0
@@ -0,0 +1,168 @@
1
+ /**
2
+ * RULER Benchmark Integration
3
+ *
4
+ * Provides utilities to integrate benchmark ground truth data with RULER scoring.
5
+ * This allows RULER to evaluate agent trajectories against known benchmark outcomes.
6
+ */
7
/**
 * Extract market outcomes from benchmark ground truth for RULER scoring.
 *
 * Converts the snapshot's ground truth into two arrays:
 *  - `predictions`: one entry per key of `groundTruth.marketOutcomes`, with a
 *    truthy outcome mapped to "YES" and a falsy one to "NO".
 *  - `stocks`: one entry per key of `groundTruth.priceHistory`, carrying the
 *    percentage change between the first and last price point.
 *
 * A missing `marketOutcomes` or `priceHistory` map is treated as empty, the
 * same way the rest of this module treats missing hidden facts/events.
 *
 * @param snapshot - Benchmark game snapshot with a `groundTruth` object
 * @returns `{ stocks, predictions }`
 *
 * @example
 * ```typescript
 * const outcomes = extractMarketOutcomesFromBenchmark(snapshot);
 * // Returns: { stocks: [...], predictions: [...] }
 * ```
 */
export function extractMarketOutcomesFromBenchmark(snapshot) {
  const gt = snapshot.groundTruth;

  // Extract prediction market outcomes
  const predictions = Object.entries(gt.marketOutcomes || {}).map(
    ([marketId, outcome]) => ({
      marketId,
      outcome: outcome ? "YES" : "NO",
    }),
  );

  // Extract stock/perpetual outcomes from price history
  const stocks = Object.entries(gt.priceHistory || {}).map(([ticker, history]) => {
    // No data points: report a flat (0%) change.
    if (!Array.isArray(history) || history.length === 0) {
      return { ticker, changePercent: 0 };
    }

    const startPrice = history[0]?.price || 0;
    const lastPrice = history[history.length - 1]?.price;
    // Fall back to the start price only when the last price is genuinely
    // absent or not a usable number. A closing price of exactly 0 is real
    // data (-100%) and must not be replaced by the fallback.
    const endPrice = Number.isFinite(lastPrice) ? lastPrice : startPrice;
    // A non-positive start price has no meaningful percentage change.
    const changePercent =
      startPrice > 0 ? ((endPrice - startPrice) / startPrice) * 100 : 0;

    return { ticker, changePercent };
  });

  return { stocks, predictions };
}
51
/**
 * Look up the hidden facts recorded for one tick (for RULER analysis).
 *
 * Reads `snapshot.groundTruth.hiddenFacts` (treated as an empty list when
 * absent) and keeps only the entries whose `tick` strictly equals
 * `tickNumber`.
 *
 * @param snapshot - Benchmark game snapshot
 * @param tickNumber - Tick number to get facts for
 * @returns Array of hidden facts whose `tick` matches
 */
export function getHiddenFactsForTick(snapshot, tickNumber) {
  const allFacts = snapshot.groundTruth.hiddenFacts || [];
  const isForTick = (fact) => fact.tick === tickNumber;
  return allFacts.filter(isForTick);
}
64
/**
 * Look up the hidden events recorded for one tick (for RULER analysis).
 *
 * Reads `snapshot.groundTruth.hiddenEvents` (treated as an empty list when
 * absent) and keeps only the entries whose `tick` strictly equals
 * `tickNumber`.
 *
 * @param snapshot - Benchmark game snapshot
 * @param tickNumber - Tick number to get events for
 * @returns Array of hidden events whose `tick` matches
 */
export function getHiddenEventsForTick(snapshot, tickNumber) {
  const allEvents = snapshot.groundTruth.hiddenEvents || [];
  const isForTick = (event) => event.tick === tickNumber;
  return allEvents.filter(isForTick);
}
77
/**
 * Check if an agent decision was optimal given ground truth.
 *
 * An action counts as optimal when `groundTruth.optimalActions` contains an
 * entry with the same `type` and `target` whose `tick` lies within
 * `tickWindow` ticks (inclusive) of `tickNumber`. A snapshot without
 * `optimalActions` has no optimal actions, so the result is `false`.
 *
 * @param snapshot - Benchmark game snapshot
 * @param tickNumber - Tick when action occurred
 * @param actionType - Type of action taken
 * @param target - Target of the action (market ID, ticker, etc.)
 * @param tickWindow - Allowed tick distance on either side (default 2)
 * @returns True if the action matches an optimal action within the window
 */
export function wasDecisionOptimal(snapshot, tickNumber, actionType, target, tickWindow = 2) {
  // Treat a missing list as empty instead of throwing, matching how hidden
  // facts/events are handled elsewhere in this module.
  const optimalActions = snapshot.groundTruth.optimalActions || [];

  // `some` stops at the first match; only existence matters here.
  return optimalActions.some(
    (a) =>
      Math.abs(a.tick - tickNumber) <= tickWindow &&
      a.type === actionType &&
      a.target === target,
  );
}
98
/**
 * Return the true-facts record stored in the benchmark ground truth
 * (for RULER context).
 *
 * Yields `snapshot.groundTruth.trueFacts` as-is when it is truthy, and a
 * fresh empty object otherwise.
 *
 * @param snapshot - Benchmark game snapshot
 * @returns The `trueFacts` value, or `{}` when none is present
 */
export function getTrueFacts(snapshot) {
  const facts = snapshot.groundTruth.trueFacts;
  if (facts) {
    return facts;
  }
  return {};
}
110
/**
 * Create RULER evaluation context from benchmark.
 *
 * Aggregates the snapshot's ground truth into a single object for scoring
 * agent trajectories.
 *
 * @param snapshot - Benchmark game snapshot
 * @returns Object with:
 *   - `marketOutcomes`: result of `extractMarketOutcomesFromBenchmark(snapshot)`
 *   - `trueFacts`: result of `getTrueFacts(snapshot)`
 *   - `hiddenFacts`, `hiddenEvents`, `optimalActions`: the corresponding
 *     ground-truth arrays, each defaulting to `[]` when absent
 *
 * @remarks
 * The returned arrays are the snapshot's own arrays (not copies) when present.
 * This is ground-truth data intended for evaluation; nothing in this function
 * enforces that agents cannot see it — that is up to the caller.
 */
export function createRulerContext(snapshot) {
  const { groundTruth } = snapshot;
  return {
    marketOutcomes: extractMarketOutcomesFromBenchmark(snapshot),
    trueFacts: getTrueFacts(snapshot),
    hiddenFacts: groundTruth.hiddenFacts || [],
    hiddenEvents: groundTruth.hiddenEvents || [],
    // Default like the sibling lists so consumers always receive an array.
    optimalActions: groundTruth.optimalActions || [],
  };
}
133
/**
 * Score a single agent action against the benchmark ground truth.
 *
 * @param snapshot - Benchmark game snapshot
 * @param tickNumber - Tick when action occurred
 * @param actionType - Type of action taken
 * @param target - Target of the action (market ID, ticker, etc.)
 * @returns 1.0, 0.5 or 0.0 (see remarks)
 *
 * @remarks
 * - 1.0 when `wasDecisionOptimal` reports a match for the action
 * - 0.5 when some hidden fact for exactly this tick has an object `value`
 *   whose `marketId` equals `target`
 * - 0.0 otherwise
 */
export function scoreActionAgainstGroundTruth(snapshot, tickNumber, actionType, target) {
  // Full credit: the action matches an optimal action.
  if (wasDecisionOptimal(snapshot, tickNumber, actionType, target)) {
    return 1.0;
  }

  // A hidden fact is relevant when its value is an object naming this target.
  const referencesTarget = (fact) => {
    const payload = fact.value;
    if (!payload || typeof payload !== "object") {
      return false;
    }
    return "marketId" in payload && payload.marketId === target;
  };

  const factsAtTick = getHiddenFactsForTick(snapshot, tickNumber);
  // Partial credit when any hidden fact at this tick references the target;
  // no credit otherwise.
  return factsAtTick.some(referencesTarget) ? 0.5 : 0.0;
}