@elizaos/training 2.0.0-alpha.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Dockerfile +75 -0
- package/Makefile +374 -0
- package/README.md +346 -0
- package/config/rubrics.json +137 -0
- package/data/.gitkeep +0 -0
- package/data/degen/.gitkeep +2 -0
- package/data/trader/.gitkeep +2 -0
- package/docker-compose.test.yml +57 -0
- package/package.json +58 -0
- package/python/config/babylon_atropos.yaml +90 -0
- package/python/config/profiles/12gb.json +11 -0
- package/python/config/profiles/16gb.json +10 -0
- package/python/config/profiles/24gb.json +10 -0
- package/python/config/profiles/48gb.json +10 -0
- package/python/config/profiles/cpu.json +11 -0
- package/python/config/profiles/l40-2gpu-safe.json +20 -0
- package/python/config/profiles/l40-2gpu.json +22 -0
- package/python/config/profiles/l40-4gpu.json +21 -0
- package/python/config/profiles/l40.json +17 -0
- package/python/config/tinker_training.yaml +143 -0
- package/python/curriculum_state.json +165 -0
- package/python/env.template +86 -0
- package/python/env.training.template +46 -0
- package/python/pyproject.toml +41 -0
- package/python/requirements-ci.txt +31 -0
- package/python/requirements.txt +87 -0
- package/python/scripts/__init__.py +4 -0
- package/python/scripts/import_json_trajectories.py +412 -0
- package/python/scripts/local-finetune/README.md +63 -0
- package/python/scripts/local-finetune/ingest_and_score.py +139 -0
- package/python/scripts/local-finetune/merge_model.py +32 -0
- package/python/scripts/local-finetune/test_adapter.py +91 -0
- package/python/scripts/local-finetune/train_from_csv.py +132 -0
- package/python/scripts/merge_trajectories.py +318 -0
- package/python/scripts/run_ab_test.py +143 -0
- package/python/scripts/run_full_pipeline.py +544 -0
- package/python/scripts/run_tinker_training.py +192 -0
- package/python/scripts/run_training.py +914 -0
- package/python/scripts/test_judge.py +155 -0
- package/python/scripts/test_pipeline.py +356 -0
- package/python/scripts/test_trained_model.py +380 -0
- package/python/scripts/train_local.py +528 -0
- package/python/setup.py +20 -0
- package/python/src/__init__.py +190 -0
- package/python/src/data_bridge/__init__.py +24 -0
- package/python/src/data_bridge/converter.py +435 -0
- package/python/src/data_bridge/reader.py +393 -0
- package/python/src/models.py +283 -0
- package/python/src/training/__init__.py +605 -0
- package/python/src/training/ab_testing.py +404 -0
- package/python/src/training/action_executor.py +621 -0
- package/python/src/training/archetype_trainer.py +347 -0
- package/python/src/training/atropos_trainer.py +980 -0
- package/python/src/training/babylon_env.py +1254 -0
- package/python/src/training/error_recovery.py +647 -0
- package/python/src/training/evaluation.py +856 -0
- package/python/src/training/fast_simulator.py +880 -0
- package/python/src/training/format_validator.py +584 -0
- package/python/src/training/hybrid_env.py +522 -0
- package/python/src/training/kl_controller.py +628 -0
- package/python/src/training/multi_prompt_dataset.py +883 -0
- package/python/src/training/multi_turn.py +656 -0
- package/python/src/training/online_env.py +1084 -0
- package/python/src/training/quality_scorer.py +391 -0
- package/python/src/training/quality_utils.py +633 -0
- package/python/src/training/rewards.py +1344 -0
- package/python/src/training/rlaif_env.py +17 -0
- package/python/src/training/rollout_generator.py +502 -0
- package/python/src/training/rubric_loader.py +198 -0
- package/python/src/training/scenario_pool.py +1072 -0
- package/python/src/training/schemas.py +481 -0
- package/python/src/training/service_manager.py +552 -0
- package/python/src/training/simulation_bridge.py +535 -0
- package/python/src/training/tick_reward_attribution.py +399 -0
- package/python/src/training/tinker_client.py +575 -0
- package/python/src/training/tinker_trainer.py +646 -0
- package/python/src/training/tokenization_utils.py +402 -0
- package/python/tests/e2e/__init__.py +13 -0
- package/python/tests/e2e/conftest.py +258 -0
- package/python/tests/e2e/test_full_pipeline.py +643 -0
- package/python/tests/e2e/test_online_training_e2e.py +365 -0
- package/python/tests/integration/__init__.py +12 -0
- package/python/tests/integration/conftest.py +383 -0
- package/python/tests/integration/test_db_integration.py +649 -0
- package/python/tests/integration/test_json_mode_integration.py +554 -0
- package/python/tests/test_action_executor.py +594 -0
- package/python/tests/test_archetype_scoring.py +1027 -0
- package/python/tests/test_atropos_integration.py +360 -0
- package/python/tests/test_evaluation.py +727 -0
- package/python/tests/test_format_validator.py +486 -0
- package/python/tests/test_kl_controller.py +432 -0
- package/python/tests/test_lr_scheduler.py +579 -0
- package/python/tests/test_multi_turn.py +590 -0
- package/python/tests/test_online_env.py +519 -0
- package/python/tests/test_quality_scorer.py +474 -0
- package/python/tests/test_scenario_pool.py +735 -0
- package/python/tests/test_service_manager.py +585 -0
- package/python/tests/test_simulation_rollout.py +581 -0
- package/python/tests/test_tokenization_utils.py +501 -0
- package/python/tests/test_training_orchestrator.py +497 -0
- package/python/tests/test_training_output_structure.py +661 -0
- package/research-output/training-runs/training-run-1770772042899.json +26 -0
- package/research-output/training-runs/training-run-1770930079670.json +32 -0
- package/research-output/training-runs/training-run-1770930143700.json +44 -0
- package/research-output/training-runs/training-run-1770930183638.json +38 -0
- package/research-output/training-runs/training-run-1770930442049.json +38 -0
- package/research-output/training-runs/training-run-1770930793243.json +38 -0
- package/scripts/assess-training-data.ts +422 -0
- package/scripts/e2e-training-test.ts +550 -0
- package/scripts/export-rubrics.ts +64 -0
- package/scripts/generate-research-report.ts +1523 -0
- package/scripts/generate_dataset.sh +173 -0
- package/scripts/json-mode-benchmark.ts +399 -0
- package/scripts/real-archetype-benchmark.ts +210 -0
- package/scripts/run-baseline-comparison.ts +116 -0
- package/scripts/run-full-pipeline.ts +272 -0
- package/scripts/runpod_setup.sh +137 -0
- package/scripts/runpod_validate.sh +147 -0
- package/scripts/test-model-in-game.ts +955 -0
- package/scripts/test-scoring.ts +73 -0
- package/scripts/test-trained-model.ts +209 -0
- package/scripts/train-and-test.ts +824 -0
- package/scripts/verify-final.ts +118 -0
- package/src/adapter.ts +516 -0
- package/src/archetypes/ArchetypeConfigService.ts +626 -0
- package/src/archetypes/derive-archetype.ts +249 -0
- package/src/archetypes/index.ts +22 -0
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
- package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
- package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
- package/src/benchmark/BenchmarkDataViewer.ts +324 -0
- package/src/benchmark/BenchmarkHistoryService.ts +221 -0
- package/src/benchmark/BenchmarkRunner.ts +685 -0
- package/src/benchmark/BenchmarkValidator.ts +206 -0
- package/src/benchmark/FastEvalRunner.ts +225 -0
- package/src/benchmark/MetricsValidator.ts +165 -0
- package/src/benchmark/MetricsVisualizer.ts +909 -0
- package/src/benchmark/ModelBenchmarkService.ts +611 -0
- package/src/benchmark/ModelRegistry.ts +158 -0
- package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
- package/src/benchmark/SimulationA2AInterface.ts +1169 -0
- package/src/benchmark/SimulationEngine.ts +832 -0
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
- package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
- package/src/benchmark/index.ts +89 -0
- package/src/benchmark/parseSimulationMetrics.ts +124 -0
- package/src/benchmark/simulation-types.ts +78 -0
- package/src/dependencies.ts +439 -0
- package/src/generation/TrajectoryGenerator.ts +387 -0
- package/src/generation/index.ts +12 -0
- package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
- package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
- package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
- package/src/huggingface/index.ts +27 -0
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
- package/src/index.ts +102 -0
- package/src/init-training.ts +53 -0
- package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
- package/src/metrics/index.ts +8 -0
- package/src/metrics/types.ts +200 -0
- package/src/rubrics/__tests__/index.test.ts +184 -0
- package/src/rubrics/ass-kisser.ts +85 -0
- package/src/rubrics/degen.ts +80 -0
- package/src/rubrics/goody-twoshoes.ts +84 -0
- package/src/rubrics/index.ts +236 -0
- package/src/rubrics/information-trader.ts +84 -0
- package/src/rubrics/infosec.ts +101 -0
- package/src/rubrics/liar.ts +104 -0
- package/src/rubrics/perps-trader.ts +87 -0
- package/src/rubrics/researcher.ts +81 -0
- package/src/rubrics/scammer.ts +82 -0
- package/src/rubrics/social-butterfly.ts +73 -0
- package/src/rubrics/super-predictor.ts +97 -0
- package/src/rubrics/trader.ts +67 -0
- package/src/scoring/ArchetypeScoringService.ts +486 -0
- package/src/scoring/JudgePromptBuilder.ts +556 -0
- package/src/scoring/LLMJudgeCache.ts +401 -0
- package/src/scoring/index.ts +9 -0
- package/src/training/AutomationPipeline.ts +916 -0
- package/src/training/BenchmarkService.ts +518 -0
- package/src/training/ConfigValidator.ts +220 -0
- package/src/training/MarketOutcomesTracker.ts +187 -0
- package/src/training/ModelDeployer.ts +186 -0
- package/src/training/ModelFetcher.ts +76 -0
- package/src/training/ModelSelectionService.ts +341 -0
- package/src/training/ModelUsageVerifier.ts +160 -0
- package/src/training/MultiModelOrchestrator.ts +580 -0
- package/src/training/RLModelConfig.ts +407 -0
- package/src/training/RewardBackpropagationService.ts +149 -0
- package/src/training/RulerScoringService.ts +666 -0
- package/src/training/TrainingMonitor.ts +166 -0
- package/src/training/TrajectoryRecorder.ts +399 -0
- package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
- package/src/training/index.ts +100 -0
- package/src/training/logRLConfig.ts +34 -0
- package/src/training/pipeline.ts +129 -0
- package/src/training/storage/ModelStorageService.ts +279 -0
- package/src/training/storage/TrainingDataArchiver.ts +197 -0
- package/src/training/storage/index.ts +17 -0
- package/src/training/types.ts +207 -0
- package/src/training/window-utils.ts +138 -0
- package/src/utils/index.ts +101 -0
- package/src/utils/logger.ts +59 -0
- package/src/utils/snowflake.ts +17 -0
- package/src/utils/synthetic-detector.ts +111 -0
- package/tsconfig.json +20 -0
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Model Registry
|
|
3
|
+
*
|
|
4
|
+
* Centralized configuration for all models available for benchmarking.
|
|
5
|
+
* Add new models here to make them available for comparison.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/** API providers a registry entry can be served from. */
type ModelProvider = 'groq' | 'openai' | 'anthropic' | 'together' | 'local';

/** Tier labels used to group registry entries. */
type ModelTier = 'lite' | 'standard' | 'pro';

/**
 * Shape of a single entry in the benchmark model registry.
 */
export interface ModelConfig {
  /** Unique identifier for the model within the registry. */
  id: string;

  /** Human-readable name shown in reports. */
  displayName: string;

  /** Provider the model is accessed through. */
  provider: ModelProvider;

  /** Identifier passed to the provider's API to select this model. */
  modelId: string;

  /** Tier the model is grouped under. */
  tier: ModelTier;

  /** Approximate parameter count, in billions; omitted when not specified. */
  parametersBillions?: number;

  /** Whether this entry is treated as a baseline model. */
  isBaseline: boolean;

  /** Free-form extra attributes (primitive values only). */
  metadata?: Record<string, string | number | boolean>;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Every model that can be selected for benchmarking.
 *
 * Entries are grouped by provider. `parametersBillions` is only set for the
 * Groq-hosted entries; the OpenAI and Anthropic entries omit it.
 */
export const MODEL_REGISTRY: ModelConfig[] = [
  // ---- Groq ----
  {
    id: 'llama-8b',
    displayName: 'LLaMA 3.1 8B',
    provider: 'groq',
    modelId: 'llama-3.1-8b-instant',
    tier: 'lite',
    parametersBillions: 8,
    isBaseline: true,
  },
  {
    id: 'llama-70b',
    displayName: 'LLaMA 3.1 70B',
    provider: 'groq',
    modelId: 'llama-3.1-70b-versatile',
    tier: 'standard',
    parametersBillions: 70,
    isBaseline: false,
  },
  {
    id: 'qwen-32b',
    displayName: 'Qwen 3 32B',
    provider: 'groq',
    modelId: 'qwen/qwen3-32b',
    tier: 'standard',
    parametersBillions: 32,
    isBaseline: true,
  },
  {
    id: 'mixtral-8x7b',
    displayName: 'Mixtral 8x7B',
    provider: 'groq',
    modelId: 'mixtral-8x7b-32768',
    tier: 'standard',
    parametersBillions: 46,
    isBaseline: false,
  },

  // ---- OpenAI ----
  {
    id: 'gpt-4o',
    displayName: 'GPT-4o',
    provider: 'openai',
    modelId: 'gpt-4o',
    tier: 'pro',
    isBaseline: false,
  },
  {
    id: 'gpt-4o-mini',
    displayName: 'GPT-4o Mini',
    provider: 'openai',
    modelId: 'gpt-4o-mini',
    tier: 'lite',
    isBaseline: false,
  },

  // ---- Anthropic ----
  {
    id: 'claude-sonnet',
    displayName: 'Claude 3.5 Sonnet',
    provider: 'anthropic',
    modelId: 'claude-3-5-sonnet-20241022',
    tier: 'pro',
    isBaseline: false,
  },
  {
    id: 'claude-haiku',
    displayName: 'Claude 3.5 Haiku',
    provider: 'anthropic',
    modelId: 'claude-3-5-haiku-20241022',
    tier: 'lite',
    isBaseline: false,
  },
];
|
|
107
|
+
|
|
108
|
+
/**
 * Look up a registry entry by its registry `id`.
 *
 * @param id - Value compared (strict equality) against each entry's `id`
 * @returns The first entry whose `id` matches, or `undefined` if none does
 */
export function getModelById(id: string): ModelConfig | undefined {
  for (const entry of MODEL_REGISTRY) {
    if (entry.id === id) {
      return entry;
    }
  }
  return undefined;
}
|
|
114
|
+
|
|
115
|
+
/**
 * Look up a registry entry by the provider-facing `modelId` (API identifier).
 *
 * @param modelId - Value compared (strict equality) against each entry's `modelId`
 * @returns The first entry whose `modelId` matches, or `undefined` if none does
 */
export function getModelByModelId(modelId: string): ModelConfig | undefined {
  const position = MODEL_REGISTRY.findIndex(
    (entry) => entry.modelId === modelId
  );
  return position === -1 ? undefined : MODEL_REGISTRY[position];
}
|
|
121
|
+
|
|
122
|
+
/**
 * Collect every registry entry flagged as a baseline.
 *
 * @returns A new array of the entries with a truthy `isBaseline`, in registry order
 */
export function getBaselineModels(): ModelConfig[] {
  const baselines: ModelConfig[] = [];
  for (const entry of MODEL_REGISTRY) {
    if (entry.isBaseline) {
      baselines.push(entry);
    }
  }
  return baselines;
}
|
|
128
|
+
|
|
129
|
+
/**
 * Collect every registry entry served by the given provider.
 *
 * @param provider - Provider to match against each entry's `provider`
 * @returns A new array of matching entries, in registry order (empty if none match)
 */
export function getModelsByProvider(
  provider: ModelConfig['provider']
): ModelConfig[] {
  const servedByProvider = (entry: ModelConfig): boolean =>
    entry.provider === provider;
  return MODEL_REGISTRY.filter(servedByProvider);
}
|
|
137
|
+
|
|
138
|
+
/**
 * Collect every registry entry belonging to the given tier.
 *
 * @param tier - Tier to match against each entry's `tier`
 * @returns A new array of matching entries, in registry order (empty if none match)
 */
export function getModelsByTier(tier: ModelConfig['tier']): ModelConfig[] {
  const inTier: ModelConfig[] = [];
  for (const entry of MODEL_REGISTRY) {
    if (entry.tier === tier) {
      inTier.push(entry);
    }
  }
  return inTier;
}
|
|
144
|
+
|
|
145
|
+
/**
 * Check whether an identifier refers to a registered model.
 *
 * The identifier is accepted if it equals either the registry `id` or the
 * provider-facing `modelId` of any entry.
 *
 * @param id - Identifier to check
 * @returns `true` when some entry matches on `id` or `modelId`, otherwise `false`
 */
export function validateModelId(id: string): boolean {
  for (const { id: registryId, modelId } of MODEL_REGISTRY) {
    if (registryId === id || modelId === id) {
      return true;
    }
  }
  return false;
}
|
|
151
|
+
|
|
152
|
+
/**
 * Resolve a human-readable name for a model identifier.
 *
 * The identifier is first looked up as a registry `id`; only if that finds
 * nothing is it looked up as a provider `modelId`.
 *
 * @param idOrModelId - Registry id or provider model id
 * @returns The matching entry's `displayName`, or the input string unchanged
 *          when no entry matches
 */
export function getModelDisplayName(idOrModelId: string): string {
  let match = getModelById(idOrModelId);
  if (match == null) {
    // No entry with that registry id; fall back to the API identifier.
    match = getModelByModelId(idOrModelId);
  }

  const label = match?.displayName;
  return label ?? idOrModelId;
}
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RULER Benchmark Integration
|
|
3
|
+
*
|
|
4
|
+
* Provides utilities to integrate benchmark ground truth data with RULER scoring.
|
|
5
|
+
* This allows RULER to evaluate agent trajectories against known benchmark outcomes.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { MarketOutcomes } from '../training/RulerScoringService';
|
|
9
|
+
import type {
|
|
10
|
+
BenchmarkGameSnapshot,
|
|
11
|
+
GroundTruth,
|
|
12
|
+
} from './BenchmarkDataGenerator';
|
|
13
|
+
|
|
14
|
+
/**
 * Extract market outcomes from benchmark ground truth for RULER scoring.
 *
 * Converts the snapshot's ground truth into the `MarketOutcomes` shape:
 * - `predictions`: one entry per key of `groundTruth.marketOutcomes`, with a
 *   truthy outcome mapped to `'YES'` and a falsy one to `'NO'`.
 * - `stocks`: one entry per key of `groundTruth.priceHistory`, with
 *   `changePercent` computed from the first and last recorded prices.
 *
 * Price handling:
 * - A missing or non-finite first price is treated as `0`.
 * - A missing or non-finite last price falls back to the first price.
 * - A last price of exactly `0` is a real value, so a ticker that fell to zero
 *   reports `-100`. (Previously `||` treated `0` as "missing" and reported `0`.)
 * - When the first price is not strictly positive (including an empty
 *   history), `changePercent` is `0` to avoid dividing by zero.
 *
 * @param snapshot - Benchmark game snapshot with ground truth data
 * @returns MarketOutcomes with `stocks` and `predictions` arrays
 *
 * @example
 * ```typescript
 * const outcomes = extractMarketOutcomesFromBenchmark(snapshot);
 * // Returns: { stocks: [...], predictions: [...] }
 * ```
 */
export function extractMarketOutcomesFromBenchmark(
  snapshot: BenchmarkGameSnapshot
): MarketOutcomes {
  const gt = snapshot.groundTruth;

  // Use the price only if it is an actual finite number; 0 counts as valid.
  const finitePriceOr = (price: unknown, fallback: number): number =>
    typeof price === 'number' && Number.isFinite(price) ? price : fallback;

  // Prediction markets: boolean-ish ground truth -> 'YES' / 'NO'
  const predictions: Array<{ marketId: string; outcome: 'YES' | 'NO' }> =
    Object.entries(gt.marketOutcomes).map(([marketId, outcome]) => ({
      marketId,
      outcome: outcome ? 'YES' : 'NO',
    }));

  // Stocks/perpetuals: percentage move between first and last recorded price.
  // An empty history yields startPrice = endPrice = 0 and therefore 0% change.
  const stocks = Object.entries(gt.priceHistory).map(([ticker, history]) => {
    const startPrice = finitePriceOr(history[0]?.price, 0);
    const endPrice = finitePriceOr(
      history[history.length - 1]?.price,
      startPrice
    );
    const changePercent =
      startPrice > 0 ? ((endPrice - startPrice) / startPrice) * 100 : 0;

    return {
      ticker,
      changePercent,
    };
  });

  return {
    stocks,
    predictions,
  };
}
|
|
67
|
+
|
|
68
|
+
/**
 * Get the hidden facts recorded for one tick (for RULER analysis).
 *
 * Selects the entries of `groundTruth.hiddenFacts` whose `tick` equals
 * `tickNumber`. A missing `hiddenFacts` list is treated as empty.
 *
 * @param snapshot - Benchmark game snapshot
 * @param tickNumber - Tick number to get facts for
 * @returns New array of the hidden facts for that tick (empty if there are none)
 */
export function getHiddenFactsForTick(
  snapshot: BenchmarkGameSnapshot,
  tickNumber: number
): GroundTruth['hiddenFacts'] {
  const allFacts = snapshot.groundTruth.hiddenFacts || [];
  return allFacts.filter((fact) => fact.tick === tickNumber);
}
|
|
86
|
+
|
|
87
|
+
/**
 * Get the hidden events recorded for one tick (for RULER analysis).
 *
 * Selects the entries of `groundTruth.hiddenEvents` whose `tick` equals
 * `tickNumber`. A missing `hiddenEvents` list is treated as empty.
 *
 * @param snapshot - Benchmark game snapshot
 * @param tickNumber - Tick number to get events for
 * @returns New array of the hidden events for that tick (empty if there are none)
 */
export function getHiddenEventsForTick(
  snapshot: BenchmarkGameSnapshot,
  tickNumber: number
): GroundTruth['hiddenEvents'] {
  const allEvents = snapshot.groundTruth.hiddenEvents || [];
  return allEvents.filter((event) => event.tick === tickNumber);
}
|
|
105
|
+
|
|
106
|
+
/**
 * Check whether an agent decision matches an optimal action from ground truth.
 *
 * An action counts as optimal when `groundTruth.optimalActions` contains an
 * entry with the same `type` and `target` whose `tick` is within `tickWindow`
 * ticks (inclusive, in either direction) of `tickNumber`.
 *
 * @param snapshot - Benchmark game snapshot
 * @param tickNumber - Tick when the action occurred
 * @param actionType - Type of action taken
 * @param target - Target of the action (market ID, ticker, etc.)
 * @param tickWindow - Maximum allowed tick distance between the action and an
 *   optimal action. Defaults to 2, the tolerance that was previously hard-coded.
 * @returns True if the action matches an optimal action within the window
 */
export function wasDecisionOptimal(
  snapshot: BenchmarkGameSnapshot,
  tickNumber: number,
  actionType: string,
  target: string,
  tickWindow: number = 2
): boolean {
  // `some` stops at the first match instead of building a throwaway array.
  return snapshot.groundTruth.optimalActions.some(
    (optimal) =>
      Math.abs(optimal.tick - tickNumber) <= tickWindow &&
      optimal.type === actionType &&
      optimal.target === target
  );
}
|
|
137
|
+
|
|
138
|
+
/**
 * Get the true facts about the world state (for RULER context).
 *
 * Returns `groundTruth.trueFacts` as stored on the snapshot, or an empty
 * object when the snapshot has none.
 *
 * @param snapshot - Benchmark game snapshot
 * @returns The snapshot's true facts, or `{}` when absent
 */
export function getTrueFacts(
  snapshot: BenchmarkGameSnapshot
): GroundTruth['trueFacts'] {
  const { trueFacts } = snapshot.groundTruth;
  return trueFacts ? trueFacts : {};
}
|
|
152
|
+
|
|
153
|
+
/**
 * Build the RULER evaluation context from a benchmark snapshot.
 *
 * Bundles the snapshot's ground truth into one object: derived market
 * outcomes, true facts, hidden facts, hidden events and optimal actions.
 * Missing `hiddenFacts` / `hiddenEvents` lists become empty arrays;
 * `optimalActions` is passed through as-is.
 *
 * @param snapshot - Benchmark game snapshot
 * @returns Context object holding all ground truth data used for scoring
 *
 * @remarks
 * This is evaluation-side data. Per the module's intent it is meant for the
 * scorer only and should not be exposed to agents during execution.
 */
export function createRulerContext(snapshot: BenchmarkGameSnapshot): {
  marketOutcomes: MarketOutcomes;
  trueFacts: GroundTruth['trueFacts'];
  hiddenFacts: GroundTruth['hiddenFacts'];
  hiddenEvents: GroundTruth['hiddenEvents'];
  optimalActions: GroundTruth['optimalActions'];
} {
  const marketOutcomes = extractMarketOutcomesFromBenchmark(snapshot);
  const trueFacts = getTrueFacts(snapshot);

  const groundTruth = snapshot.groundTruth;
  const hiddenFacts = groundTruth.hiddenFacts || [];
  const hiddenEvents = groundTruth.hiddenEvents || [];
  const optimalActions = groundTruth.optimalActions;

  return { marketOutcomes, trueFacts, hiddenFacts, hiddenEvents, optimalActions };
}
|
|
182
|
+
|
|
183
|
+
/**
 * Score a single agent action against the benchmark ground truth.
 *
 * Scoring rules, checked in order:
 * - `1.0` when `wasDecisionOptimal` reports a matching optimal action.
 * - `0.5` when a hidden fact recorded for exactly `tickNumber` has an object
 *   `value` whose `marketId` equals `target`.
 * - `0.0` otherwise.
 *
 * @param snapshot - Benchmark game snapshot
 * @param tickNumber - Tick when the action occurred
 * @param actionType - Type of action taken
 * @param target - Target of the action (market ID, ticker, etc.)
 * @returns Score from 0-1 (1.0 = optimal, 0.5 = reasonable, 0.0 = poor)
 */
export function scoreActionAgainstGroundTruth(
  snapshot: BenchmarkGameSnapshot,
  tickNumber: number,
  actionType: string,
  target: string
): number {
  const OPTIMAL_SCORE = 1.0;
  const PARTIAL_SCORE = 0.5;
  const NO_SCORE = 0.0;

  // Full credit: the action matches an optimal action.
  if (wasDecisionOptimal(snapshot, tickNumber, actionType, target)) {
    return OPTIMAL_SCORE;
  }

  // Partial credit: a hidden fact at this tick refers to the same market.
  const touchesHiddenFact = getHiddenFactsForTick(snapshot, tickNumber).some(
    (fact) => {
      const payload = fact.value;
      if (!payload || typeof payload !== 'object' || !('marketId' in payload)) {
        return false;
      }
      return (payload as { marketId: string }).marketId === target;
    }
  );
  if (touchesHiddenFact) {
    return PARTIAL_SCORE;
  }

  // Neither optimal nor related to a hidden fact.
  return NO_SCORE;
}
|