@elizaos/training 2.0.0-alpha.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207) hide show
  1. package/Dockerfile +75 -0
  2. package/Makefile +374 -0
  3. package/README.md +346 -0
  4. package/config/rubrics.json +137 -0
  5. package/data/.gitkeep +0 -0
  6. package/data/degen/.gitkeep +2 -0
  7. package/data/trader/.gitkeep +2 -0
  8. package/docker-compose.test.yml +57 -0
  9. package/package.json +58 -0
  10. package/python/config/babylon_atropos.yaml +90 -0
  11. package/python/config/profiles/12gb.json +11 -0
  12. package/python/config/profiles/16gb.json +10 -0
  13. package/python/config/profiles/24gb.json +10 -0
  14. package/python/config/profiles/48gb.json +10 -0
  15. package/python/config/profiles/cpu.json +11 -0
  16. package/python/config/profiles/l40-2gpu-safe.json +20 -0
  17. package/python/config/profiles/l40-2gpu.json +22 -0
  18. package/python/config/profiles/l40-4gpu.json +21 -0
  19. package/python/config/profiles/l40.json +17 -0
  20. package/python/config/tinker_training.yaml +143 -0
  21. package/python/curriculum_state.json +165 -0
  22. package/python/env.template +86 -0
  23. package/python/env.training.template +46 -0
  24. package/python/pyproject.toml +41 -0
  25. package/python/requirements-ci.txt +31 -0
  26. package/python/requirements.txt +87 -0
  27. package/python/scripts/__init__.py +4 -0
  28. package/python/scripts/import_json_trajectories.py +412 -0
  29. package/python/scripts/local-finetune/README.md +63 -0
  30. package/python/scripts/local-finetune/ingest_and_score.py +139 -0
  31. package/python/scripts/local-finetune/merge_model.py +32 -0
  32. package/python/scripts/local-finetune/test_adapter.py +91 -0
  33. package/python/scripts/local-finetune/train_from_csv.py +132 -0
  34. package/python/scripts/merge_trajectories.py +318 -0
  35. package/python/scripts/run_ab_test.py +143 -0
  36. package/python/scripts/run_full_pipeline.py +544 -0
  37. package/python/scripts/run_tinker_training.py +192 -0
  38. package/python/scripts/run_training.py +914 -0
  39. package/python/scripts/test_judge.py +155 -0
  40. package/python/scripts/test_pipeline.py +356 -0
  41. package/python/scripts/test_trained_model.py +380 -0
  42. package/python/scripts/train_local.py +528 -0
  43. package/python/setup.py +20 -0
  44. package/python/src/__init__.py +190 -0
  45. package/python/src/data_bridge/__init__.py +24 -0
  46. package/python/src/data_bridge/converter.py +435 -0
  47. package/python/src/data_bridge/reader.py +393 -0
  48. package/python/src/models.py +283 -0
  49. package/python/src/training/__init__.py +605 -0
  50. package/python/src/training/ab_testing.py +404 -0
  51. package/python/src/training/action_executor.py +621 -0
  52. package/python/src/training/archetype_trainer.py +347 -0
  53. package/python/src/training/atropos_trainer.py +980 -0
  54. package/python/src/training/babylon_env.py +1254 -0
  55. package/python/src/training/error_recovery.py +647 -0
  56. package/python/src/training/evaluation.py +856 -0
  57. package/python/src/training/fast_simulator.py +880 -0
  58. package/python/src/training/format_validator.py +584 -0
  59. package/python/src/training/hybrid_env.py +522 -0
  60. package/python/src/training/kl_controller.py +628 -0
  61. package/python/src/training/multi_prompt_dataset.py +883 -0
  62. package/python/src/training/multi_turn.py +656 -0
  63. package/python/src/training/online_env.py +1084 -0
  64. package/python/src/training/quality_scorer.py +391 -0
  65. package/python/src/training/quality_utils.py +633 -0
  66. package/python/src/training/rewards.py +1344 -0
  67. package/python/src/training/rlaif_env.py +17 -0
  68. package/python/src/training/rollout_generator.py +502 -0
  69. package/python/src/training/rubric_loader.py +198 -0
  70. package/python/src/training/scenario_pool.py +1072 -0
  71. package/python/src/training/schemas.py +481 -0
  72. package/python/src/training/service_manager.py +552 -0
  73. package/python/src/training/simulation_bridge.py +535 -0
  74. package/python/src/training/tick_reward_attribution.py +399 -0
  75. package/python/src/training/tinker_client.py +575 -0
  76. package/python/src/training/tinker_trainer.py +646 -0
  77. package/python/src/training/tokenization_utils.py +402 -0
  78. package/python/tests/e2e/__init__.py +13 -0
  79. package/python/tests/e2e/conftest.py +258 -0
  80. package/python/tests/e2e/test_full_pipeline.py +643 -0
  81. package/python/tests/e2e/test_online_training_e2e.py +365 -0
  82. package/python/tests/integration/__init__.py +12 -0
  83. package/python/tests/integration/conftest.py +383 -0
  84. package/python/tests/integration/test_db_integration.py +649 -0
  85. package/python/tests/integration/test_json_mode_integration.py +554 -0
  86. package/python/tests/test_action_executor.py +594 -0
  87. package/python/tests/test_archetype_scoring.py +1027 -0
  88. package/python/tests/test_atropos_integration.py +360 -0
  89. package/python/tests/test_evaluation.py +727 -0
  90. package/python/tests/test_format_validator.py +486 -0
  91. package/python/tests/test_kl_controller.py +432 -0
  92. package/python/tests/test_lr_scheduler.py +579 -0
  93. package/python/tests/test_multi_turn.py +590 -0
  94. package/python/tests/test_online_env.py +519 -0
  95. package/python/tests/test_quality_scorer.py +474 -0
  96. package/python/tests/test_scenario_pool.py +735 -0
  97. package/python/tests/test_service_manager.py +585 -0
  98. package/python/tests/test_simulation_rollout.py +581 -0
  99. package/python/tests/test_tokenization_utils.py +501 -0
  100. package/python/tests/test_training_orchestrator.py +497 -0
  101. package/python/tests/test_training_output_structure.py +661 -0
  102. package/research-output/training-runs/training-run-1770772042899.json +26 -0
  103. package/research-output/training-runs/training-run-1770930079670.json +32 -0
  104. package/research-output/training-runs/training-run-1770930143700.json +44 -0
  105. package/research-output/training-runs/training-run-1770930183638.json +38 -0
  106. package/research-output/training-runs/training-run-1770930442049.json +38 -0
  107. package/research-output/training-runs/training-run-1770930793243.json +38 -0
  108. package/scripts/assess-training-data.ts +422 -0
  109. package/scripts/e2e-training-test.ts +550 -0
  110. package/scripts/export-rubrics.ts +64 -0
  111. package/scripts/generate-research-report.ts +1523 -0
  112. package/scripts/generate_dataset.sh +173 -0
  113. package/scripts/json-mode-benchmark.ts +399 -0
  114. package/scripts/real-archetype-benchmark.ts +210 -0
  115. package/scripts/run-baseline-comparison.ts +116 -0
  116. package/scripts/run-full-pipeline.ts +272 -0
  117. package/scripts/runpod_setup.sh +137 -0
  118. package/scripts/runpod_validate.sh +147 -0
  119. package/scripts/test-model-in-game.ts +955 -0
  120. package/scripts/test-scoring.ts +73 -0
  121. package/scripts/test-trained-model.ts +209 -0
  122. package/scripts/train-and-test.ts +824 -0
  123. package/scripts/verify-final.ts +118 -0
  124. package/src/adapter.ts +516 -0
  125. package/src/archetypes/ArchetypeConfigService.ts +626 -0
  126. package/src/archetypes/derive-archetype.ts +249 -0
  127. package/src/archetypes/index.ts +22 -0
  128. package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
  129. package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
  130. package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
  131. package/src/benchmark/BenchmarkDataViewer.ts +324 -0
  132. package/src/benchmark/BenchmarkHistoryService.ts +221 -0
  133. package/src/benchmark/BenchmarkRunner.ts +685 -0
  134. package/src/benchmark/BenchmarkValidator.ts +206 -0
  135. package/src/benchmark/FastEvalRunner.ts +225 -0
  136. package/src/benchmark/MetricsValidator.ts +165 -0
  137. package/src/benchmark/MetricsVisualizer.ts +909 -0
  138. package/src/benchmark/ModelBenchmarkService.ts +611 -0
  139. package/src/benchmark/ModelRegistry.ts +158 -0
  140. package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
  141. package/src/benchmark/SimulationA2AInterface.ts +1169 -0
  142. package/src/benchmark/SimulationEngine.ts +832 -0
  143. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
  144. package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
  145. package/src/benchmark/index.ts +89 -0
  146. package/src/benchmark/parseSimulationMetrics.ts +124 -0
  147. package/src/benchmark/simulation-types.ts +78 -0
  148. package/src/dependencies.ts +439 -0
  149. package/src/generation/TrajectoryGenerator.ts +387 -0
  150. package/src/generation/index.ts +12 -0
  151. package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
  152. package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
  153. package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
  154. package/src/huggingface/index.ts +27 -0
  155. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
  156. package/src/index.ts +102 -0
  157. package/src/init-training.ts +53 -0
  158. package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
  159. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
  160. package/src/metrics/index.ts +8 -0
  161. package/src/metrics/types.ts +200 -0
  162. package/src/rubrics/__tests__/index.test.ts +184 -0
  163. package/src/rubrics/ass-kisser.ts +85 -0
  164. package/src/rubrics/degen.ts +80 -0
  165. package/src/rubrics/goody-twoshoes.ts +84 -0
  166. package/src/rubrics/index.ts +236 -0
  167. package/src/rubrics/information-trader.ts +84 -0
  168. package/src/rubrics/infosec.ts +101 -0
  169. package/src/rubrics/liar.ts +104 -0
  170. package/src/rubrics/perps-trader.ts +87 -0
  171. package/src/rubrics/researcher.ts +81 -0
  172. package/src/rubrics/scammer.ts +82 -0
  173. package/src/rubrics/social-butterfly.ts +73 -0
  174. package/src/rubrics/super-predictor.ts +97 -0
  175. package/src/rubrics/trader.ts +67 -0
  176. package/src/scoring/ArchetypeScoringService.ts +486 -0
  177. package/src/scoring/JudgePromptBuilder.ts +556 -0
  178. package/src/scoring/LLMJudgeCache.ts +401 -0
  179. package/src/scoring/index.ts +9 -0
  180. package/src/training/AutomationPipeline.ts +916 -0
  181. package/src/training/BenchmarkService.ts +518 -0
  182. package/src/training/ConfigValidator.ts +220 -0
  183. package/src/training/MarketOutcomesTracker.ts +187 -0
  184. package/src/training/ModelDeployer.ts +186 -0
  185. package/src/training/ModelFetcher.ts +76 -0
  186. package/src/training/ModelSelectionService.ts +341 -0
  187. package/src/training/ModelUsageVerifier.ts +160 -0
  188. package/src/training/MultiModelOrchestrator.ts +580 -0
  189. package/src/training/RLModelConfig.ts +407 -0
  190. package/src/training/RewardBackpropagationService.ts +149 -0
  191. package/src/training/RulerScoringService.ts +666 -0
  192. package/src/training/TrainingMonitor.ts +166 -0
  193. package/src/training/TrajectoryRecorder.ts +399 -0
  194. package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
  195. package/src/training/index.ts +100 -0
  196. package/src/training/logRLConfig.ts +34 -0
  197. package/src/training/pipeline.ts +129 -0
  198. package/src/training/storage/ModelStorageService.ts +279 -0
  199. package/src/training/storage/TrainingDataArchiver.ts +197 -0
  200. package/src/training/storage/index.ts +17 -0
  201. package/src/training/types.ts +207 -0
  202. package/src/training/window-utils.ts +138 -0
  203. package/src/utils/index.ts +101 -0
  204. package/src/utils/logger.ts +59 -0
  205. package/src/utils/snowflake.ts +17 -0
  206. package/src/utils/synthetic-detector.ts +111 -0
  207. package/tsconfig.json +20 -0
@@ -0,0 +1,158 @@
1
+ /**
2
+ * Model Registry
3
+ *
4
+ * Centralized configuration for all models available for benchmarking.
5
+ * Add new models here to make them available for comparison.
6
+ */
7
+
8
/**
 * Shape of a single entry in the benchmarking model registry.
 */
export interface ModelConfig {
  /** Identifier for this entry within the registry */
  id: string;

  /** Human-readable name used in reports */
  displayName: string;

  /** Provider the model is accessed through */
  provider: 'groq' | 'openai' | 'anthropic' | 'together' | 'local';

  /** Identifier the provider's API expects for this model */
  modelId: string;

  /** Capability tier of the model */
  tier: 'lite' | 'standard' | 'pro';

  /** Approximate parameter count, expressed in billions */
  parametersBillions?: number;

  /** True when the model serves as a baseline in comparisons */
  isBaseline: boolean;

  /** Optional free-form attributes attached to the entry */
  metadata?: Record<string, string | number | boolean>;
}
33
+
34
/**
 * Every model that can be selected for a benchmark run.
 *
 * Entries are grouped by provider. The lookup helpers in this module scan
 * the array in order, so the first matching entry wins.
 *
 * NOTE(review): provider model ids change over time — confirm that
 * `llama-3.1-70b-versatile` and `mixtral-8x7b-32768` are still served by
 * Groq before relying on them.
 */
export const MODEL_REGISTRY: ModelConfig[] = [
  // --- Groq-hosted models ---
  {
    id: 'llama-8b',
    displayName: 'LLaMA 3.1 8B',
    provider: 'groq',
    modelId: 'llama-3.1-8b-instant',
    tier: 'lite',
    parametersBillions: 8,
    isBaseline: true,
  },
  {
    id: 'llama-70b',
    displayName: 'LLaMA 3.1 70B',
    provider: 'groq',
    modelId: 'llama-3.1-70b-versatile',
    tier: 'standard',
    parametersBillions: 70,
    isBaseline: false,
  },
  {
    id: 'qwen-32b',
    displayName: 'Qwen 3 32B',
    provider: 'groq',
    modelId: 'qwen/qwen3-32b',
    tier: 'standard',
    parametersBillions: 32,
    isBaseline: true,
  },
  {
    id: 'mixtral-8x7b',
    displayName: 'Mixtral 8x7B',
    provider: 'groq',
    modelId: 'mixtral-8x7b-32768',
    tier: 'standard',
    parametersBillions: 46,
    isBaseline: false,
  },

  // --- OpenAI models ---
  {
    id: 'gpt-4o',
    displayName: 'GPT-4o',
    provider: 'openai',
    modelId: 'gpt-4o',
    tier: 'pro',
    isBaseline: false,
  },
  {
    id: 'gpt-4o-mini',
    displayName: 'GPT-4o Mini',
    provider: 'openai',
    modelId: 'gpt-4o-mini',
    tier: 'lite',
    isBaseline: false,
  },

  // --- Anthropic models ---
  {
    id: 'claude-sonnet',
    displayName: 'Claude 3.5 Sonnet',
    provider: 'anthropic',
    modelId: 'claude-3-5-sonnet-20241022',
    tier: 'pro',
    isBaseline: false,
  },
  {
    id: 'claude-haiku',
    displayName: 'Claude 3.5 Haiku',
    provider: 'anthropic',
    modelId: 'claude-3-5-haiku-20241022',
    tier: 'lite',
    isBaseline: false,
  },
];
107
+
108
/**
 * Look up a registry entry by its registry `id`.
 *
 * @param id - Registry identifier to search for
 * @returns The first entry whose `id` matches, or `undefined` when none does
 */
export function getModelById(id: string): ModelConfig | undefined {
  for (const entry of MODEL_REGISTRY) {
    if (entry.id === id) {
      return entry;
    }
  }
  return undefined;
}
114
+
115
/**
 * Look up a registry entry by the provider-facing `modelId`.
 *
 * @param modelId - API model identifier to search for
 * @returns The first entry whose `modelId` matches, or `undefined` when none does
 */
export function getModelByModelId(modelId: string): ModelConfig | undefined {
  for (const entry of MODEL_REGISTRY) {
    if (entry.modelId === modelId) {
      return entry;
    }
  }
  return undefined;
}
121
+
122
/**
 * Collect every registry entry flagged as a baseline.
 *
 * @returns A new array of the entries whose `isBaseline` is truthy, in registry order
 */
export function getBaselineModels(): ModelConfig[] {
  const baselines: ModelConfig[] = [];
  for (const entry of MODEL_REGISTRY) {
    if (entry.isBaseline) {
      baselines.push(entry);
    }
  }
  return baselines;
}
128
+
129
/**
 * Collect the registry entries served by a given provider.
 *
 * @param provider - Provider to match against each entry's `provider`
 * @returns A new array of matching entries, in registry order
 */
export function getModelsByProvider(
  provider: ModelConfig['provider']
): ModelConfig[] {
  const servedByProvider = (entry: ModelConfig): boolean =>
    entry.provider === provider;
  return MODEL_REGISTRY.filter(servedByProvider);
}
137
+
138
/**
 * Collect the registry entries that belong to a given tier.
 *
 * @param tier - Tier to match against each entry's `tier`
 * @returns A new array of matching entries, in registry order
 */
export function getModelsByTier(tier: ModelConfig['tier']): ModelConfig[] {
  const inTier = (entry: ModelConfig): boolean => entry.tier === tier;
  return MODEL_REGISTRY.filter(inTier);
}
144
+
145
/**
 * Report whether an identifier is known to the registry.
 *
 * The identifier is accepted if it equals either the `id` or the `modelId`
 * of any entry.
 *
 * @param id - Registry id or provider model id to check
 * @returns `true` when at least one entry matches, otherwise `false`
 */
export function validateModelId(id: string): boolean {
  for (const entry of MODEL_REGISTRY) {
    if (entry.id === id || entry.modelId === id) {
      return true;
    }
  }
  return false;
}
151
+
152
/**
 * Resolve a display name from either a registry id or a provider model id.
 *
 * The registry `id` lookup is tried first; the `modelId` lookup is only
 * attempted when that finds nothing.
 *
 * @param idOrModelId - Registry id or provider model id
 * @returns The matching entry's `displayName`, or the input string unchanged
 *   when no entry matches
 */
export function getModelDisplayName(idOrModelId: string): string {
  const resolved =
    getModelById(idOrModelId) ?? getModelByModelId(idOrModelId);
  if (resolved == null) {
    return idOrModelId;
  }
  return resolved.displayName ?? idOrModelId;
}
@@ -0,0 +1,235 @@
1
+ /**
2
+ * RULER Benchmark Integration
3
+ *
4
+ * Provides utilities to integrate benchmark ground truth data with RULER scoring.
5
+ * This allows RULER to evaluate agent trajectories against known benchmark outcomes.
6
+ */
7
+
8
+ import type { MarketOutcomes } from '../training/RulerScoringService';
9
+ import type {
10
+ BenchmarkGameSnapshot,
11
+ GroundTruth,
12
+ } from './BenchmarkDataGenerator';
13
+
14
/**
 * Extract market outcomes from benchmark ground truth for RULER scoring
 *
 * Converts benchmark ground truth data into the format expected by the RULER
 * scoring service:
 *
 * - each entry of `groundTruth.marketOutcomes` becomes a prediction whose
 *   outcome is `'YES'` for a truthy value and `'NO'` otherwise;
 * - each entry of `groundTruth.priceHistory` becomes a stock whose
 *   `changePercent` is the percentage move from the first recorded price to
 *   the last recorded price.
 *
 * @param snapshot - Benchmark game snapshot with ground truth data
 * @returns MarketOutcomes with stocks and predictions arrays
 *
 * @remarks
 * `changePercent` is `0` when the history is empty or the starting price is
 * not a positive finite number. A closing price of `0` is treated as a real
 * observation (yielding `-100`); only a missing or non-finite closing price
 * falls back to the starting price.
 *
 * @example
 * ```typescript
 * const outcomes = extractMarketOutcomesFromBenchmark(snapshot);
 * // Returns: { stocks: [...], predictions: [...] }
 * ```
 */
export function extractMarketOutcomesFromBenchmark(
  snapshot: BenchmarkGameSnapshot
): MarketOutcomes {
  const gt = snapshot.groundTruth;

  // Use `fallback` only when the price is absent or not a finite number.
  // A plain `||` would also discard a legitimate price of 0.
  const finiteOr = (value: number | undefined, fallback: number): number =>
    typeof value === 'number' && Number.isFinite(value) ? value : fallback;

  // Extract prediction market outcomes
  const predictions: Array<{ marketId: string; outcome: 'YES' | 'NO' }> =
    Object.entries(gt.marketOutcomes).map(([marketId, outcome]) => ({
      marketId,
      outcome: outcome ? 'YES' : 'NO',
    }));

  // Extract stock/perpetual outcomes from price history
  const stocks = Object.entries(gt.priceHistory).map(([ticker, history]) => {
    // An empty history leaves both prices at 0, which yields a 0% change.
    const startPrice = finiteOr(history[0]?.price, 0);
    const endPrice = finiteOr(history[history.length - 1]?.price, startPrice);

    // Guard the division: a non-positive start price has no meaningful
    // percentage change.
    const changePercent =
      startPrice > 0 ? ((endPrice - startPrice) / startPrice) * 100 : 0;

    return {
      ticker,
      changePercent,
    };
  });

  return {
    stocks,
    predictions,
  };
}
67
+
68
/**
 * Select the hidden facts recorded for one tick (for RULER analysis)
 *
 * Filters the snapshot's ground-truth hidden facts down to those whose
 * `tick` equals the requested tick. A snapshot without hidden facts yields
 * an empty array.
 *
 * @param snapshot - Benchmark game snapshot
 * @param tickNumber - Tick number to get facts for
 * @returns Array of hidden facts whose `tick` equals `tickNumber`
 */
export function getHiddenFactsForTick(
  snapshot: BenchmarkGameSnapshot,
  tickNumber: number
): GroundTruth['hiddenFacts'] {
  const allFacts = snapshot.groundTruth.hiddenFacts || [];
  return allFacts.filter((fact) => fact.tick === tickNumber);
}
86
+
87
/**
 * Select the hidden events recorded for one tick (for RULER analysis)
 *
 * Filters the snapshot's ground-truth hidden events down to those whose
 * `tick` equals the requested tick. A snapshot without hidden events yields
 * an empty array.
 *
 * @param snapshot - Benchmark game snapshot
 * @param tickNumber - Tick number to get events for
 * @returns Array of hidden events whose `tick` equals `tickNumber`
 */
export function getHiddenEventsForTick(
  snapshot: BenchmarkGameSnapshot,
  tickNumber: number
): GroundTruth['hiddenEvents'] {
  const allEvents = snapshot.groundTruth.hiddenEvents || [];
  return allEvents.filter((event) => event.tick === tickNumber);
}
105
+
106
/**
 * Check if agent decision was optimal given ground truth
 *
 * Compares an agent's action against the optimal actions defined in the
 * benchmark ground truth. An action counts as optimal when some optimal
 * action has the same `type` and `target` and its `tick` is at most two
 * ticks away from `tickNumber` (in either direction).
 *
 * @param snapshot - Benchmark game snapshot
 * @param tickNumber - Tick when action occurred
 * @param actionType - Type of action taken
 * @param target - Target of the action (market ID, ticker, etc.)
 * @returns True if action matches an optimal action within the time window;
 *   false otherwise, including when the snapshot lists no optimal actions
 */
export function wasDecisionOptimal(
  snapshot: BenchmarkGameSnapshot,
  tickNumber: number,
  actionType: string,
  target: string
): boolean {
  // Allowed timing slack, in ticks, on either side of the optimal tick.
  const tickWindow = 2;

  // Default a missing list to empty, matching how the sibling accessors
  // treat absent hidden facts/events, instead of throwing.
  const optimalActions = snapshot.groundTruth.optimalActions ?? [];

  return optimalActions.some(
    (action) =>
      Math.abs(action.tick - tickNumber) <= tickWindow &&
      action.type === actionType &&
      action.target === target
  );
}
137
+
138
/**
 * Read the ground-truth "true facts" from a snapshot (for RULER context)
 *
 * Returns the snapshot's `groundTruth.trueFacts`, substituting an empty
 * object when that value is missing or otherwise falsy.
 *
 * @param snapshot - Benchmark game snapshot
 * @returns The snapshot's true facts, or `{}` when none are present
 */
export function getTrueFacts(
  snapshot: BenchmarkGameSnapshot
): GroundTruth['trueFacts'] {
  const { trueFacts } = snapshot.groundTruth;
  return trueFacts || {};
}
152
+
153
/**
 * Build the RULER evaluation context for a benchmark snapshot
 *
 * Gathers the snapshot's ground truth into a single object: derived market
 * outcomes, true facts, hidden facts, hidden events, and optimal actions.
 *
 * @param snapshot - Benchmark game snapshot
 * @returns Context object holding the ground truth data listed above
 *
 * @remarks
 * Missing hidden facts and hidden events are replaced by empty arrays;
 * `optimalActions` is passed through as stored on the snapshot. This data is
 * intended for scoring only and should not be exposed to agents while they
 * run.
 */
export function createRulerContext(snapshot: BenchmarkGameSnapshot): {
  marketOutcomes: MarketOutcomes;
  trueFacts: GroundTruth['trueFacts'];
  hiddenFacts: GroundTruth['hiddenFacts'];
  hiddenEvents: GroundTruth['hiddenEvents'];
  optimalActions: GroundTruth['optimalActions'];
} {
  const marketOutcomes = extractMarketOutcomesFromBenchmark(snapshot);
  const trueFacts = getTrueFacts(snapshot);
  const { hiddenFacts, hiddenEvents, optimalActions } = snapshot.groundTruth;

  return {
    marketOutcomes,
    trueFacts,
    hiddenFacts: hiddenFacts || [],
    hiddenEvents: hiddenEvents || [],
    optimalActions,
  };
}
182
+
183
/**
 * Score agent action against ground truth
 *
 * Grades a single agent action in three steps: full credit when the action
 * matches an optimal action, partial credit when a hidden fact recorded at
 * the same tick refers to the action's target, and no credit otherwise.
 *
 * @param snapshot - Benchmark game snapshot
 * @param tickNumber - Tick when action occurred
 * @param actionType - Type of action taken
 * @param target - Target of the action (market ID, ticker, etc.)
 * @returns Score from 0-1 (1.0 = optimal, 0.5 = reasonable, 0.0 = poor)
 *
 * @remarks
 * - Returns 1.0 if `wasDecisionOptimal` accepts the action
 * - Returns 0.5 if a hidden fact at `tickNumber` has an object `value` whose
 *   `marketId` equals `target`
 * - Returns 0.0 otherwise
 */
export function scoreActionAgainstGroundTruth(
  snapshot: BenchmarkGameSnapshot,
  tickNumber: number,
  actionType: string,
  target: string
): number {
  // Full credit: the action matches optimal play.
  if (wasDecisionOptimal(snapshot, tickNumber, actionType, target)) {
    return 1.0;
  }

  // Partial credit: a hidden fact at this tick refers to the same target.
  const factsAtTick = getHiddenFactsForTick(snapshot, tickNumber);
  const factMentionsTarget = factsAtTick.some((fact) => {
    const payload = fact.value;
    if (!payload || typeof payload !== 'object' || !('marketId' in payload)) {
      return false;
    }
    return (payload as { marketId: string }).marketId === target;
  });

  // No credit when neither optimal play nor a hidden fact backs the action.
  return factMentionsTarget ? 0.5 : 0.0;
}