@elizaos/training 2.0.0-alpha.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Dockerfile +75 -0
- package/Makefile +374 -0
- package/README.md +346 -0
- package/config/rubrics.json +137 -0
- package/data/.gitkeep +0 -0
- package/data/degen/.gitkeep +2 -0
- package/data/trader/.gitkeep +2 -0
- package/docker-compose.test.yml +57 -0
- package/package.json +58 -0
- package/python/config/babylon_atropos.yaml +90 -0
- package/python/config/profiles/12gb.json +11 -0
- package/python/config/profiles/16gb.json +10 -0
- package/python/config/profiles/24gb.json +10 -0
- package/python/config/profiles/48gb.json +10 -0
- package/python/config/profiles/cpu.json +11 -0
- package/python/config/profiles/l40-2gpu-safe.json +20 -0
- package/python/config/profiles/l40-2gpu.json +22 -0
- package/python/config/profiles/l40-4gpu.json +21 -0
- package/python/config/profiles/l40.json +17 -0
- package/python/config/tinker_training.yaml +143 -0
- package/python/curriculum_state.json +165 -0
- package/python/env.template +86 -0
- package/python/env.training.template +46 -0
- package/python/pyproject.toml +41 -0
- package/python/requirements-ci.txt +31 -0
- package/python/requirements.txt +87 -0
- package/python/scripts/__init__.py +4 -0
- package/python/scripts/import_json_trajectories.py +412 -0
- package/python/scripts/local-finetune/README.md +63 -0
- package/python/scripts/local-finetune/ingest_and_score.py +139 -0
- package/python/scripts/local-finetune/merge_model.py +32 -0
- package/python/scripts/local-finetune/test_adapter.py +91 -0
- package/python/scripts/local-finetune/train_from_csv.py +132 -0
- package/python/scripts/merge_trajectories.py +318 -0
- package/python/scripts/run_ab_test.py +143 -0
- package/python/scripts/run_full_pipeline.py +544 -0
- package/python/scripts/run_tinker_training.py +192 -0
- package/python/scripts/run_training.py +914 -0
- package/python/scripts/test_judge.py +155 -0
- package/python/scripts/test_pipeline.py +356 -0
- package/python/scripts/test_trained_model.py +380 -0
- package/python/scripts/train_local.py +528 -0
- package/python/setup.py +20 -0
- package/python/src/__init__.py +190 -0
- package/python/src/data_bridge/__init__.py +24 -0
- package/python/src/data_bridge/converter.py +435 -0
- package/python/src/data_bridge/reader.py +393 -0
- package/python/src/models.py +283 -0
- package/python/src/training/__init__.py +605 -0
- package/python/src/training/ab_testing.py +404 -0
- package/python/src/training/action_executor.py +621 -0
- package/python/src/training/archetype_trainer.py +347 -0
- package/python/src/training/atropos_trainer.py +980 -0
- package/python/src/training/babylon_env.py +1254 -0
- package/python/src/training/error_recovery.py +647 -0
- package/python/src/training/evaluation.py +856 -0
- package/python/src/training/fast_simulator.py +880 -0
- package/python/src/training/format_validator.py +584 -0
- package/python/src/training/hybrid_env.py +522 -0
- package/python/src/training/kl_controller.py +628 -0
- package/python/src/training/multi_prompt_dataset.py +883 -0
- package/python/src/training/multi_turn.py +656 -0
- package/python/src/training/online_env.py +1084 -0
- package/python/src/training/quality_scorer.py +391 -0
- package/python/src/training/quality_utils.py +633 -0
- package/python/src/training/rewards.py +1344 -0
- package/python/src/training/rlaif_env.py +17 -0
- package/python/src/training/rollout_generator.py +502 -0
- package/python/src/training/rubric_loader.py +198 -0
- package/python/src/training/scenario_pool.py +1072 -0
- package/python/src/training/schemas.py +481 -0
- package/python/src/training/service_manager.py +552 -0
- package/python/src/training/simulation_bridge.py +535 -0
- package/python/src/training/tick_reward_attribution.py +399 -0
- package/python/src/training/tinker_client.py +575 -0
- package/python/src/training/tinker_trainer.py +646 -0
- package/python/src/training/tokenization_utils.py +402 -0
- package/python/tests/e2e/__init__.py +13 -0
- package/python/tests/e2e/conftest.py +258 -0
- package/python/tests/e2e/test_full_pipeline.py +643 -0
- package/python/tests/e2e/test_online_training_e2e.py +365 -0
- package/python/tests/integration/__init__.py +12 -0
- package/python/tests/integration/conftest.py +383 -0
- package/python/tests/integration/test_db_integration.py +649 -0
- package/python/tests/integration/test_json_mode_integration.py +554 -0
- package/python/tests/test_action_executor.py +594 -0
- package/python/tests/test_archetype_scoring.py +1027 -0
- package/python/tests/test_atropos_integration.py +360 -0
- package/python/tests/test_evaluation.py +727 -0
- package/python/tests/test_format_validator.py +486 -0
- package/python/tests/test_kl_controller.py +432 -0
- package/python/tests/test_lr_scheduler.py +579 -0
- package/python/tests/test_multi_turn.py +590 -0
- package/python/tests/test_online_env.py +519 -0
- package/python/tests/test_quality_scorer.py +474 -0
- package/python/tests/test_scenario_pool.py +735 -0
- package/python/tests/test_service_manager.py +585 -0
- package/python/tests/test_simulation_rollout.py +581 -0
- package/python/tests/test_tokenization_utils.py +501 -0
- package/python/tests/test_training_orchestrator.py +497 -0
- package/python/tests/test_training_output_structure.py +661 -0
- package/research-output/training-runs/training-run-1770772042899.json +26 -0
- package/research-output/training-runs/training-run-1770930079670.json +32 -0
- package/research-output/training-runs/training-run-1770930143700.json +44 -0
- package/research-output/training-runs/training-run-1770930183638.json +38 -0
- package/research-output/training-runs/training-run-1770930442049.json +38 -0
- package/research-output/training-runs/training-run-1770930793243.json +38 -0
- package/scripts/assess-training-data.ts +422 -0
- package/scripts/e2e-training-test.ts +550 -0
- package/scripts/export-rubrics.ts +64 -0
- package/scripts/generate-research-report.ts +1523 -0
- package/scripts/generate_dataset.sh +173 -0
- package/scripts/json-mode-benchmark.ts +399 -0
- package/scripts/real-archetype-benchmark.ts +210 -0
- package/scripts/run-baseline-comparison.ts +116 -0
- package/scripts/run-full-pipeline.ts +272 -0
- package/scripts/runpod_setup.sh +137 -0
- package/scripts/runpod_validate.sh +147 -0
- package/scripts/test-model-in-game.ts +955 -0
- package/scripts/test-scoring.ts +73 -0
- package/scripts/test-trained-model.ts +209 -0
- package/scripts/train-and-test.ts +824 -0
- package/scripts/verify-final.ts +118 -0
- package/src/adapter.ts +516 -0
- package/src/archetypes/ArchetypeConfigService.ts +626 -0
- package/src/archetypes/derive-archetype.ts +249 -0
- package/src/archetypes/index.ts +22 -0
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
- package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
- package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
- package/src/benchmark/BenchmarkDataViewer.ts +324 -0
- package/src/benchmark/BenchmarkHistoryService.ts +221 -0
- package/src/benchmark/BenchmarkRunner.ts +685 -0
- package/src/benchmark/BenchmarkValidator.ts +206 -0
- package/src/benchmark/FastEvalRunner.ts +225 -0
- package/src/benchmark/MetricsValidator.ts +165 -0
- package/src/benchmark/MetricsVisualizer.ts +909 -0
- package/src/benchmark/ModelBenchmarkService.ts +611 -0
- package/src/benchmark/ModelRegistry.ts +158 -0
- package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
- package/src/benchmark/SimulationA2AInterface.ts +1169 -0
- package/src/benchmark/SimulationEngine.ts +832 -0
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
- package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
- package/src/benchmark/index.ts +89 -0
- package/src/benchmark/parseSimulationMetrics.ts +124 -0
- package/src/benchmark/simulation-types.ts +78 -0
- package/src/dependencies.ts +439 -0
- package/src/generation/TrajectoryGenerator.ts +387 -0
- package/src/generation/index.ts +12 -0
- package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
- package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
- package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
- package/src/huggingface/index.ts +27 -0
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
- package/src/index.ts +102 -0
- package/src/init-training.ts +53 -0
- package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
- package/src/metrics/index.ts +8 -0
- package/src/metrics/types.ts +200 -0
- package/src/rubrics/__tests__/index.test.ts +184 -0
- package/src/rubrics/ass-kisser.ts +85 -0
- package/src/rubrics/degen.ts +80 -0
- package/src/rubrics/goody-twoshoes.ts +84 -0
- package/src/rubrics/index.ts +236 -0
- package/src/rubrics/information-trader.ts +84 -0
- package/src/rubrics/infosec.ts +101 -0
- package/src/rubrics/liar.ts +104 -0
- package/src/rubrics/perps-trader.ts +87 -0
- package/src/rubrics/researcher.ts +81 -0
- package/src/rubrics/scammer.ts +82 -0
- package/src/rubrics/social-butterfly.ts +73 -0
- package/src/rubrics/super-predictor.ts +97 -0
- package/src/rubrics/trader.ts +67 -0
- package/src/scoring/ArchetypeScoringService.ts +486 -0
- package/src/scoring/JudgePromptBuilder.ts +556 -0
- package/src/scoring/LLMJudgeCache.ts +401 -0
- package/src/scoring/index.ts +9 -0
- package/src/training/AutomationPipeline.ts +916 -0
- package/src/training/BenchmarkService.ts +518 -0
- package/src/training/ConfigValidator.ts +220 -0
- package/src/training/MarketOutcomesTracker.ts +187 -0
- package/src/training/ModelDeployer.ts +186 -0
- package/src/training/ModelFetcher.ts +76 -0
- package/src/training/ModelSelectionService.ts +341 -0
- package/src/training/ModelUsageVerifier.ts +160 -0
- package/src/training/MultiModelOrchestrator.ts +580 -0
- package/src/training/RLModelConfig.ts +407 -0
- package/src/training/RewardBackpropagationService.ts +149 -0
- package/src/training/RulerScoringService.ts +666 -0
- package/src/training/TrainingMonitor.ts +166 -0
- package/src/training/TrajectoryRecorder.ts +399 -0
- package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
- package/src/training/index.ts +100 -0
- package/src/training/logRLConfig.ts +34 -0
- package/src/training/pipeline.ts +129 -0
- package/src/training/storage/ModelStorageService.ts +279 -0
- package/src/training/storage/TrainingDataArchiver.ts +197 -0
- package/src/training/storage/index.ts +17 -0
- package/src/training/types.ts +207 -0
- package/src/training/window-utils.ts +138 -0
- package/src/utils/index.ts +101 -0
- package/src/utils/logger.ts +59 -0
- package/src/utils/snowflake.ts +17 -0
- package/src/utils/synthetic-detector.ts +111 -0
- package/tsconfig.json +20 -0
|
@@ -0,0 +1,825 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Archetype Matchup Benchmark
|
|
3
|
+
*
|
|
4
|
+
* Simulates multiple archetypes competing against each other to understand:
|
|
5
|
+
* - Which archetypes perform best in different market conditions
|
|
6
|
+
* - How archetypes interact (trader vs scammer, social-butterfly vs contrarian, etc.)
|
|
7
|
+
* - Relative strengths and weaknesses
|
|
8
|
+
*
|
|
9
|
+
* Uses the multi-model orchestrator to efficiently run multiple archetype models.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import {
|
|
13
|
+
type ArchetypeConfig,
|
|
14
|
+
ArchetypeConfigService,
|
|
15
|
+
} from '../archetypes/ArchetypeConfigService';
|
|
16
|
+
import {
|
|
17
|
+
createMultiModelOrchestrator,
|
|
18
|
+
type MultiModelOrchestrator,
|
|
19
|
+
} from '../training/MultiModelOrchestrator';
|
|
20
|
+
import { logger } from '../utils/logger';
|
|
21
|
+
import {
|
|
22
|
+
type BenchmarkConfig,
|
|
23
|
+
BenchmarkDataGenerator,
|
|
24
|
+
type BenchmarkGameSnapshot,
|
|
25
|
+
type Tick,
|
|
26
|
+
} from './BenchmarkDataGenerator';
|
|
27
|
+
|
|
28
|
+
/**
|
|
29
|
+
* Individual agent in the matchup simulation
|
|
30
|
+
*/
|
|
31
|
+
export interface MatchupAgent {
  /** Identifier of the agent within the matchup. */
  id: string;
  /** Name of the archetype this agent plays. */
  archetype: string;
  /** Archetype configuration attached to the agent. */
  config: ArchetypeConfig;
}
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* Result for a single agent in the matchup
|
|
39
|
+
*/
|
|
40
|
+
export interface MatchupAgentResult {
  /** Identifier of the agent this result belongs to. */
  agentId: string;
  /** Archetype the agent played. */
  archetype: string;
  /** Profit and loss accumulated by the agent. */
  pnl: number;
  /** Trading activity summary. */
  tradingMetrics: {
    /** Number of trades taken. */
    totalTrades: number;
    /** Share of trades counted as wins, expressed as a fraction. */
    winRate: number;
    /** PnL attributed to a single trade on average. */
    avgPnlPerTrade: number;
  };
  /** Social activity summary. */
  socialMetrics: {
    /** Number of posts the agent created. */
    postsCreated: number;
    /** Engagement score credited for those posts. */
    engagementReceived: number;
    /** Reputation score credited to the agent. */
    reputationGained: number;
  };
  /** Total number of actions taken. */
  actions: number;
  rank: number; // 1-based rank in this matchup
}
|
|
57
|
+
|
|
58
|
+
/**
|
|
59
|
+
* Head-to-head comparison between two archetypes
|
|
60
|
+
*/
|
|
61
|
+
export interface ArchetypeVsResult {
  /** First archetype of the pair. */
  archetype1: string;
  /** Second archetype of the pair. */
  archetype2: string;
  /** Number of comparisons won by archetype1. */
  archetype1Wins: number;
  /** Number of comparisons won by archetype2. */
  archetype2Wins: number;
  /** Number of comparisons that ended level. */
  ties: number;
  /** Average winning margin of archetype1 over the comparisons it won. */
  archetype1AvgMargin: number;
  /** Average winning margin of archetype2 over the comparisons it won. */
  archetype2AvgMargin: number;
  /** archetype1 wins as a fraction of all comparisons (wins and ties). */
  winRate1: number;
  /** archetype2 wins as a fraction of all comparisons (wins and ties). */
  winRate2: number;
}
|
|
72
|
+
|
|
73
|
+
/**
|
|
74
|
+
* Complete matchup benchmark result
|
|
75
|
+
*/
|
|
76
|
+
export interface MatchupBenchmarkResult {
  /** Identifier of this benchmark run. */
  benchmarkId: string;
  /** When the benchmark was recorded (numeric timestamp; unit not shown here). */
  timestamp: number;
  /** How long the benchmark took (unit not shown here). */
  duration: number;

  /** Per-agent results for every participant. */
  agents: MatchupAgentResult[];

  /** Aggregate standing of each archetype across all matchups. */
  archetypeRankings: {
    archetype: string;
    avgRank: number;
    avgPnl: number;
    totalWins: number;
    totalLosses: number;
    winRate: number;
  }[];

  /** Pairwise archetype-versus-archetype results. */
  headToHead: ArchetypeVsResult[];

  /** Market condition the benchmark was run under. */
  marketCondition: 'bull' | 'bear' | 'volatile' | 'stable';

  /** Human-readable observations derived from the results. */
  insights: string[];
}
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* Configuration for matchup benchmark
|
|
106
|
+
*/
|
|
107
|
+
export interface MatchupBenchmarkConfig {
  /** Archetype names to pit against each other, or 'all' to use every available archetype. */
  archetypes: string[] | 'all';

  /** How many agents to create for each archetype. */
  agentsPerArchetype: number;

  /** How many simulation rounds to run. */
  rounds: number;

  /** How many ticks make up one round. */
  ticksPerRound: number;

  /** Market conditions to run the benchmark under. */
  marketConditions: ('bull' | 'bear' | 'volatile' | 'stable')[];

  /** VRAM budget, in GB, available for loading models. */
  availableVramGb: number;
}
|
|
126
|
+
|
|
127
|
+
/**
|
|
128
|
+
* Runs multi-archetype benchmark simulations
|
|
129
|
+
*/
|
|
130
|
+
export class ArchetypeMatchupBenchmark {
|
|
131
|
+
private config: MatchupBenchmarkConfig;
|
|
132
|
+
private orchestrator: MultiModelOrchestrator;
|
|
133
|
+
|
|
134
|
+
constructor(config: MatchupBenchmarkConfig) {
  // The orchestrator is sized from the configured VRAM budget; the full
  // config is retained for the benchmark methods.
  const { availableVramGb } = config;
  this.orchestrator = createMultiModelOrchestrator(availableVramGb);
  this.config = config;
}
|
|
138
|
+
|
|
139
|
+
/**
|
|
140
|
+
* Get all archetypes to benchmark
|
|
141
|
+
*/
|
|
142
|
+
private getArchetypes(): string[] {
  // 'all' is a sentinel that expands to every archetype known to
  // ArchetypeConfigService; otherwise the configured list is returned as-is.
  const { archetypes } = this.config;
  return archetypes === 'all'
    ? ArchetypeConfigService.getAvailableArchetypes()
    : archetypes;
}
|
|
148
|
+
|
|
149
|
+
/**
|
|
150
|
+
* Create agents for the matchup
|
|
151
|
+
*/
|
|
152
|
+
private createAgents(): MatchupAgent[] {
  // Builds `agentsPerArchetype` agents for each benchmarked archetype.
  // Ids take the form `<archetype>-<n>` with n starting at 1, and all agents
  // of one archetype share the same config object.
  const perArchetype = this.config.agentsPerArchetype;
  const roster: MatchupAgent[] = [];

  this.getArchetypes().forEach((archetype) => {
    const config = ArchetypeConfigService.getConfig(archetype);

    for (let index = 0; index < perArchetype; index += 1) {
      roster.push({ id: `${archetype}-${index + 1}`, archetype, config });
    }
  });

  return roster;
}
|
|
170
|
+
|
|
171
|
+
/**
|
|
172
|
+
* Generate benchmark data for a market condition
|
|
173
|
+
* Market condition affects seed to create different scenarios
|
|
174
|
+
*/
|
|
175
|
+
private async generateBenchmarkData(
  condition: 'bull' | 'bear' | 'volatile' | 'stable'
): Promise<BenchmarkGameSnapshot> {
  // Each market condition has its own base seed; 1000 is the fallback for an
  // unrecognised condition.
  const seedByCondition: Record<string, number> = {
    bull: 1001,
    bear: 2002,
    volatile: 3003,
    stable: 4004,
  };

  // The millisecond clock adds 0-999 to the seed, so runs are only
  // semi-reproducible.
  const timeJitter = Date.now() % 1000;

  // Volatile conditions get more markets of both kinds.
  const isVolatile = condition === 'volatile';

  const settings: BenchmarkConfig = {
    // Ticks are treated as one per second, rounded up to whole minutes.
    durationMinutes: Math.ceil(this.config.ticksPerRound / 60),
    tickInterval: 1,
    numPredictionMarkets: isVolatile ? 8 : 5,
    numPerpetualMarkets: isVolatile ? 5 : 3,
    numAgents: 10,
    seed: (seedByCondition[condition] || 1000) + timeJitter,
  };

  return new BenchmarkDataGenerator(settings).generate();
}
|
|
202
|
+
|
|
203
|
+
/**
|
|
204
|
+
* Simulate a single round of the matchup
|
|
205
|
+
*/
|
|
206
|
+
private async simulateRound(
  agents: MatchupAgent[],
  snapshot: BenchmarkGameSnapshot,
  roundNumber: number
): Promise<MatchupAgentResult[]> {
  // Runs every agent against the snapshot and returns their results sorted by
  // PnL (highest first) with 1-based ranks filled in.
  const distinctArchetypes = Array.from(
    new Set(agents.map((agent) => agent.archetype))
  );
  logger.info(
    `Simulating round ${roundNumber} with ${agents.length} agents`,
    { archetypes: distinctArchetypes },
    'ArchetypeMatchupBenchmark'
  );

  const outcomes: MatchupAgentResult[] = [];

  // USE_REAL_INFERENCE=true switches from the statistical simulation to real
  // model inference; agents are then run one after another.
  if (process.env.USE_REAL_INFERENCE === 'true') {
    for (const agent of agents) {
      outcomes.push(await this.runAgentWithRealModel(agent, snapshot));
    }
  } else {
    for (const agent of agents) {
      outcomes.push(this.simulateAgentPerformance(agent, snapshot));
    }
  }

  // Best PnL first, then number the positions from 1.
  outcomes.sort((left, right) => right.pnl - left.pnl);
  let nextRank = 1;
  for (const outcome of outcomes) {
    outcome.rank = nextRank;
    nextRank += 1;
  }

  return outcomes;
}
|
|
244
|
+
|
|
245
|
+
/**
|
|
246
|
+
* Run an agent with real model inference
|
|
247
|
+
*/
|
|
248
|
+
private async runAgentWithRealModel(
  agent: MatchupAgent,
  snapshot: BenchmarkGameSnapshot
): Promise<MatchupAgentResult> {
  // Asks the agent's model for a decision on a sample of ticks and scores the
  // decisions. The returned rank is 0; the caller assigns real ranks.
  const tally = { pnl: 0, trades: 0, wins: 0, posts: 0 };

  // Only every 10th tick is sampled, and at most 10 of those, to keep the
  // number of inference calls small.
  const sampledTicks = snapshot.ticks
    .filter((_tick, index) => index % 10 === 0)
    .slice(0, 10);

  for (const tick of sampledTicks) {
    const { response } = await this.orchestrator.inference({
      archetype: agent.archetype,
      prompt: this.buildDecisionPrompt(agent, tick),
      systemPrompt: agent.config.system,
      maxTokens: 256,
      temperature: 0.7,
    });
    const decision = this.parseAgentDecision(response);

    if (decision.action === 'post') {
      tally.posts += 1;
      continue;
    }
    if (decision.action !== 'trade') {
      continue;
    }

    tally.trades += 1;

    // A trade wins when its direction agrees with the sign of the trend.
    // Wins earn twice what losses cost, and both scale with confidence.
    const trend = this.getMarketTrend(tick);
    const magnitude = Math.abs(trend);
    const weight = decision.confidence || 1;
    const calledIt =
      decision.direction === 'long'
        ? trend > 0
        : decision.direction === 'short' && trend < 0;

    if (calledIt) {
      tally.wins += 1;
      tally.pnl += magnitude * 100 * weight;
    } else {
      tally.pnl -= magnitude * 50 * weight;
    }
  }

  const hasTrades = tally.trades > 0;

  return {
    agentId: agent.id,
    archetype: agent.archetype,
    pnl: tally.pnl,
    tradingMetrics: {
      totalTrades: tally.trades,
      winRate: hasTrades ? tally.wins / tally.trades : 0,
      avgPnlPerTrade: hasTrades ? tally.pnl / tally.trades : 0,
    },
    socialMetrics: {
      postsCreated: tally.posts,
      engagementReceived: tally.posts * 5,
      reputationGained: tally.posts * 10 + tally.wins * 5,
    },
    actions: tally.trades + tally.posts,
    rank: 0,
  };
}
|
|
316
|
+
|
|
317
|
+
/**
|
|
318
|
+
* Build a decision prompt for the agent
|
|
319
|
+
*/
|
|
320
|
+
private buildDecisionPrompt(agent: MatchupAgent, tick: Tick): string {
  // Renders the tick's state into the text prompt that asks the model for a
  // JSON decision.
  const { agents, perpetualMarkets, posts } = tick.state;

  // Balance is 1000 plus the agent's recorded PnL, or just 1000 when the
  // agent is not in the tick's agent list.
  const ownState = agents.find((candidate) => candidate.id === agent.id);
  let agentBalance = 1000;
  if (ownState?.totalPnl !== undefined) {
    agentBalance += ownState.totalPnl;
  }

  // Perpetual market prices keyed by ticker.
  const priceEntries = perpetualMarkets.map((market) => [
    market.ticker,
    market.price,
  ]);
  const marketPrices = Object.fromEntries(priceEntries);

  // The five most recent posts stand in for "news".
  const recentNews = (posts?.slice(-5) ?? []).map((post) => post.content);

  return `
Current game state:
- Timestamp: ${tick.timestamp}
- Your balance: ${agentBalance}
- Market prices: ${JSON.stringify(marketPrices)}
- Recent news: ${JSON.stringify(recentNews)}

As a ${agent.archetype} agent, what action would you take?
Respond with a JSON object containing:
- action: "trade" | "post" | "observe"
- direction: "long" | "short" (if trading)
- confidence: 0.0 to 1.0
- reasoning: brief explanation
`;
}
|
|
350
|
+
|
|
351
|
+
/**
|
|
352
|
+
* Parse agent decision from model response
|
|
353
|
+
*/
|
|
354
|
+
private parseAgentDecision(response: string): {
  action: 'trade' | 'post' | 'observe';
  direction?: 'long' | 'short';
  confidence?: number;
} {
  // Turns free-form model output into a validated decision.
  //
  // @param response Raw text returned by the model.
  // @returns A decision whose fields are guaranteed to be in range:
  //   - action is 'trade', 'post' or 'observe' (anything else becomes 'observe');
  //   - direction is 'long', 'short' or undefined;
  //   - confidence, when present, is a finite number in (0, 1].
  // Never throws on malformed output; it degrades to keyword matching and
  // finally to 'observe'.
  //
  // Model output is untrusted, so every field is checked before it can reach
  // the PnL arithmetic. An unchecked confidence such as "high", -3 or 40 would
  // otherwise produce NaN or outsized PnL.
  const normalise = (value: unknown): string | undefined =>
    typeof value === 'string' ? value.trim().toLowerCase() : undefined;

  const embeddedJson = response.match(/\{[\s\S]*\}/);
  if (embeddedJson) {
    try {
      const parsed: unknown = JSON.parse(embeddedJson[0]);
      if (typeof parsed === 'object' && parsed !== null) {
        const fields = parsed as Record<string, unknown>;
        const actionText = normalise(fields.action);
        const directionText = normalise(fields.direction);

        // Accept numbers and numeric strings. Missing, non-finite and
        // non-positive values fall back to 0.5 (zero was already treated as
        // "unset"), and anything above 1 is capped at 1.
        const rawConfidence =
          typeof fields.confidence === 'string'
            ? Number(fields.confidence)
            : fields.confidence;
        const confidence =
          typeof rawConfidence === 'number' &&
          Number.isFinite(rawConfidence) &&
          rawConfidence > 0
            ? Math.min(rawConfidence, 1)
            : 0.5;

        return {
          action:
            actionText === 'trade' || actionText === 'post'
              ? actionText
              : 'observe',
          direction:
            directionText === 'long' || directionText === 'short'
              ? directionText
              : undefined,
          confidence,
        };
      }
    } catch {
      // Deliberate best effort: text that looks like JSON but does not parse
      // is handled by the keyword heuristics below.
    }
  }

  // Keyword fallback for responses without usable JSON.
  const text = response.toLowerCase();

  if (['trade', 'buy', 'sell'].some((keyword) => text.includes(keyword))) {
    return {
      action: 'trade',
      direction: text.includes('short') ? 'short' : 'long',
      confidence: 0.5,
    };
  }

  if (text.includes('post') || text.includes('share')) {
    return { action: 'post' };
  }

  return { action: 'observe' };
}
|
|
396
|
+
|
|
397
|
+
/**
|
|
398
|
+
* Get market trend from tick data
|
|
399
|
+
*/
|
|
400
|
+
private getMarketTrend(tick: Tick): number {
  // Returns a signed trend figure for the tick: the mean perpetual-market
  // price expressed relative to a baseline of 100. Returns 0 when there are
  // no perpetual markets.
  const { perpetualMarkets } = tick.state;
  if (perpetualMarkets.length === 0) return 0;

  // Keyed by ticker, so a ticker listed more than once contributes only its
  // last price.
  const latestByTicker = Object.fromEntries(
    perpetualMarkets.map((market) => [market.ticker, market.price])
  );
  const prices = Object.values(latestByTicker);

  let sum = 0;
  for (const price of prices) {
    sum += price;
  }
  const meanPrice = sum / prices.length;

  // Stays within -1..1 only while the mean price is between 0 and 200.
  return (meanPrice - 100) / 100;
}
|
|
418
|
+
|
|
419
|
+
/**
|
|
420
|
+
* Simulate agent performance based on archetype characteristics
|
|
421
|
+
* Used when real model inference is not available
|
|
422
|
+
*/
|
|
423
|
+
private simulateAgentPerformance(
  agent: MatchupAgent,
  snapshot: BenchmarkGameSnapshot
): MatchupAgentResult {
  // Produces a randomised result for an agent from its archetype traits,
  // without running a model.
  //
  // @param agent Agent whose config supplies riskTolerance and actionWeights.
  // @param snapshot Benchmark snapshot; only its tick count is used.
  // @returns A result with rank 0 (the caller assigns ranks after sorting).
  //
  // Values depend on Math.random(), so repeated calls differ.
  const { riskTolerance, actionWeights } = agent.config;
  const tickCount = snapshot.ticks.length;

  // Higher risk tolerance widens the spread of the base PnL around zero.
  const basePnl = (Math.random() - 0.5) * 1000 * riskTolerance;

  // Activity scales with the tick count and the archetype's action weights.
  const totalTrades = Math.floor(tickCount * actionWeights.trade * 0.1);

  // Low-risk archetypes (tolerance below 0.5) get a higher simulated win rate.
  const winRate =
    0.45 + (riskTolerance < 0.5 ? 0.15 : -0.05) + Math.random() * 0.1;

  const postsCreated = Math.floor(tickCount * actionWeights.post * 0.05);

  // Final PnL: base PnL nudged up or down by up to 100 depending on win rate.
  const pnl = basePnl + (winRate > 0.5 ? 100 : -100) * Math.random();

  return {
    agentId: agent.id,
    archetype: agent.archetype,
    pnl,
    tradingMetrics: {
      totalTrades,
      winRate,
      // Derived from the reported pnl so the two figures agree, and 0 when no
      // trades were made, matching the real-inference path.
      avgPnlPerTrade: totalTrades > 0 ? pnl / totalTrades : 0,
    },
    socialMetrics: {
      postsCreated,
      engagementReceived: postsCreated * (2 + Math.random() * 5),
      reputationGained: postsCreated * 10,
    },
    actions: totalTrades + postsCreated,
    rank: 0, // Set after sorting
  };
}
|
|
463
|
+
|
|
464
|
+
/**
|
|
465
|
+
* Calculate head-to-head results between archetypes
|
|
466
|
+
*/
|
|
467
|
+
private calculateHeadToHead(
  allResults: MatchupAgentResult[][]
): ArchetypeVsResult[] {
  // Compares every unordered pair of benchmarked archetypes round by round.
  // In each round the archetype with the higher mean agent PnL wins; rounds
  // where either archetype has no results are skipped.
  const archetypes = this.getArchetypes();

  // Per-round PnL totals and agent counts by archetype, gathered once so each
  // pair can look them up.
  const roundTotals = allResults.map((roundResults) => {
    const totals = new Map<string, { total: number; count: number }>();
    for (const { archetype, pnl } of roundResults) {
      const entry = totals.get(archetype) ?? { total: 0, count: 0 };
      entry.total += pnl;
      entry.count += 1;
      totals.set(archetype, entry);
    }
    return totals;
  });

  const pairings: ArchetypeVsResult[] = [];

  archetypes.forEach((archetype1, firstIndex) => {
    for (const archetype2 of archetypes.slice(firstIndex + 1)) {
      let archetype1Wins = 0;
      let archetype2Wins = 0;
      let ties = 0;
      let firstMargin = 0;
      let secondMargin = 0;

      for (const totals of roundTotals) {
        const first = totals.get(archetype1);
        const second = totals.get(archetype2);
        if (!first || !second) continue;

        const firstAvg = first.total / first.count;
        const secondAvg = second.total / second.count;

        if (firstAvg > secondAvg) {
          archetype1Wins += 1;
          firstMargin += firstAvg - secondAvg;
        } else if (secondAvg > firstAvg) {
          archetype2Wins += 1;
          secondMargin += secondAvg - firstAvg;
        } else {
          ties += 1;
        }
      }

      // Win rates are taken over all counted rounds, ties included.
      const played = archetype1Wins + archetype2Wins + ties;

      pairings.push({
        archetype1,
        archetype2,
        archetype1Wins,
        archetype2Wins,
        ties,
        archetype1AvgMargin:
          archetype1Wins > 0 ? firstMargin / archetype1Wins : 0,
        archetype2AvgMargin:
          archetype2Wins > 0 ? secondMargin / archetype2Wins : 0,
        winRate1: played > 0 ? archetype1Wins / played : 0,
        winRate2: played > 0 ? archetype2Wins / played : 0,
      });
    }
  });

  return pairings;
}
|
|
530
|
+
|
|
531
|
+
/**
|
|
532
|
+
* Calculate overall archetype rankings
|
|
533
|
+
*/
|
|
534
|
+
private calculateRankings(
  allResults: MatchupAgentResult[][]
): MatchupBenchmarkResult['archetypeRankings'] {
  // Aggregates per-agent round results into one ranking row per archetype.
  //
  // @param allResults One array of ranked agent results per round.
  // @returns Rows sorted by ascending average rank (best first). Archetypes
  //   with no results keep zeroed figures and are placed last.
  //
  // A win is an agent finishing rank 1 in a round and a loss is an agent
  // finishing last; winRate is wins divided by the archetype's agent results.
  const tallies = new Map<
    string,
    {
      totalRank: number;
      totalPnl: number;
      wins: number;
      losses: number;
      count: number;
    }
  >();

  for (const archetype of this.getArchetypes()) {
    tallies.set(archetype, {
      totalRank: 0,
      totalPnl: 0,
      wins: 0,
      losses: 0,
      count: 0,
    });
  }

  for (const roundResults of allResults) {
    const lastRank = roundResults.length;

    for (const result of roundResults) {
      const tally = tallies.get(result.archetype);
      // Results for archetypes outside the benchmarked set are ignored.
      if (!tally) continue;

      tally.totalRank += result.rank;
      tally.totalPnl += result.pnl;
      tally.count += 1;

      // `else if` keeps the sole entrant of a single-agent round from being
      // counted as both first and last.
      if (result.rank === 1) {
        tally.wins += 1;
      } else if (result.rank === lastRank) {
        tally.losses += 1;
      }
    }
  }

  return Array.from(tallies.entries())
    .map(([archetype, tally]) => {
      const hasData = tally.count > 0;
      return {
        hasData,
        row: {
          archetype,
          avgRank: hasData ? tally.totalRank / tally.count : 0,
          avgPnl: hasData ? tally.totalPnl / tally.count : 0,
          totalWins: tally.wins,
          totalLosses: tally.losses,
          winRate: hasData ? tally.wins / tally.count : 0,
        },
      };
    })
    .sort((left, right) => {
      // An archetype with no results has avgRank 0; without this guard it
      // would sort ahead of every real participant and be reported as best.
      if (left.hasData !== right.hasData) return left.hasData ? -1 : 1;
      return left.row.avgRank - right.row.avgRank;
    })
    .map(({ row }) => row);
}
|
|
591
|
+
|
|
592
|
+
/**
 * Build the human-readable insight strings for one market condition.
 *
 * Output order:
 *   1. a "performed best" line for the first entry of `rankings`, if any;
 *   2. one "dominates" line per head-to-head pairing in which either side
 *      has a win rate of at least 0.7 (side 1 is checked first);
 *   3. every string returned by `findCounterArchetypes`.
 *
 * @param rankings - Archetype rankings; the first element is reported as the top performer.
 * @param headToHead - Pairwise archetype results carrying a win rate per side.
 * @param marketCondition - Label interpolated into the top-performer message.
 * @returns The collected insight strings, in the order described above.
 */
private generateInsights(
  rankings: MatchupBenchmarkResult['archetypeRankings'],
  headToHead: ArchetypeVsResult[],
  marketCondition: string
): string[] {
  const messages: string[] = [];

  // Headline: whichever archetype sits first in the rankings list.
  const leader = rankings[0];
  if (leader) {
    const avgRankText = leader.avgRank.toFixed(2);
    messages.push(
      `${leader.archetype} performed best in ${marketCondition} conditions with avg rank ${avgRankText}`
    );
  }

  // Lopsided pairings: at most one message per pairing, side 1 takes priority.
  for (const pairing of headToHead) {
    let winner = pairing.archetype1;
    let loser = pairing.archetype2;
    let rate = pairing.winRate1;

    if (!(rate >= 0.7)) {
      if (!(pairing.winRate2 >= 0.7)) {
        continue;
      }
      winner = pairing.archetype2;
      loser = pairing.archetype1;
      rate = pairing.winRate2;
    }

    messages.push(
      `${winner} dominates ${loser} (${(rate * 100).toFixed(0)}% win rate)`
    );
  }

  // Append the cyclic (rock-paper-scissors) relationships verbatim.
  messages.push(...this.findCounterArchetypes(headToHead));

  return messages;
}
|
|
631
|
+
|
|
632
|
+
/**
 * Find rock-paper-scissors style counter relationships between archetypes
 * (A beats B, B beats C, C beats A).
 *
 * An archetype "beats" another when its head-to-head win rate against it is
 * strictly greater than 0.6. Every directed 3-cycle in that win graph is
 * reported exactly once: the three rotations of the same cycle
 * (A → B → C, B → C → A, C → A → B) collapse to a single insight, written
 * starting from whichever member is visited first.
 *
 * @param headToHead - Pairwise archetype results carrying a win rate per side.
 * @returns One "Counter triangle found: ..." string per distinct cycle.
 */
private findCounterArchetypes(headToHead: ArchetypeVsResult[]): string[] {
  const insights: string[] = [];
  const wins = new Map<string, Set<string>>();

  // Record a directed edge winner -> loser in the win graph.
  const recordWin = (winner: string, loser: string): void => {
    let beaten = wins.get(winner);
    if (!beaten) {
      beaten = new Set<string>();
      wins.set(winner, beaten);
    }
    beaten.add(loser);
  };

  // Build win graph
  for (const h2h of headToHead) {
    if (h2h.winRate1 > 0.6) {
      recordWin(h2h.archetype1, h2h.archetype2);
    }
    if (h2h.winRate2 > 0.6) {
      recordWin(h2h.archetype2, h2h.archetype1);
    }
  }

  // Canonical keys of the cycles already reported, used to drop rotations.
  const reported = new Set<string>();

  // Find triangles (rock-paper-scissors patterns)
  for (const [a, aWins] of wins) {
    for (const b of aWins) {
      const bWins = wins.get(b);
      if (!bWins) {
        continue;
      }

      for (const c of bWins) {
        // A triangle needs three distinct archetypes; a self-matchup edge
        // would otherwise produce a degenerate "cycle".
        if (a === b || b === c || a === c) {
          continue;
        }
        if (!wins.get(c)?.has(a)) {
          continue;
        }

        // The same cycle is reachable from each of its three members, so
        // key it by the lexicographically smallest rotation.
        const key = [
          [a, b, c],
          [b, c, a],
          [c, a, b],
        ]
          .map((rotation) => rotation.join('\u0000'))
          .sort()[0] as string;

        if (reported.has(key)) {
          continue;
        }
        reported.add(key);

        insights.push(`Counter triangle found: ${a} → ${b} → ${c} → ${a}`);
      }
    }
  }

  return insights;
}
|
|
672
|
+
|
|
673
|
+
/**
 * Run the complete matchup benchmark.
 *
 * For every configured market condition this generates fresh benchmark data
 * and simulates `config.rounds` rounds with the same set of agents, then
 * derives head-to-head results, archetype rankings and insights from those
 * rounds and appends one `MatchupBenchmarkResult` to the returned list.
 *
 * The orchestrator is unloaded in a `finally` block so cleanup also happens
 * when data generation or a round simulation throws.
 *
 * Note: each result's `duration` is measured from the start of `run()`, so
 * it is cumulative across the conditions processed so far.
 *
 * @returns One result per entry in `config.marketConditions`, in order.
 */
async run(): Promise<MatchupBenchmarkResult[]> {
  const startTime = Date.now();
  const results: MatchupBenchmarkResult[] = [];

  logger.info(
    'Starting Archetype Matchup Benchmark',
    {
      archetypes: this.getArchetypes(),
      agentsPerArchetype: this.config.agentsPerArchetype,
      rounds: this.config.rounds,
      conditions: this.config.marketConditions,
    },
    'ArchetypeMatchupBenchmark'
  );

  const agents = this.createAgents();

  try {
    for (const condition of this.config.marketConditions) {
      logger.info(
        `Testing in ${condition} market conditions`,
        {},
        'ArchetypeMatchupBenchmark'
      );

      const allRoundResults: MatchupAgentResult[][] = [];

      // Rounds run sequentially; each one gets its own data snapshot.
      for (let round = 0; round < this.config.rounds; round++) {
        const snapshot = await this.generateBenchmarkData(condition);
        const roundResults = await this.simulateRound(
          agents,
          snapshot,
          round + 1
        );
        allRoundResults.push(roundResults);
      }

      // Flatten agent results for this condition
      const flatAgentResults = allRoundResults.flat();

      // Calculate aggregated results
      const headToHead = this.calculateHeadToHead(allRoundResults);
      const rankings = this.calculateRankings(allRoundResults);
      const insights = this.generateInsights(rankings, headToHead, condition);

      // Sample the clock once so id, timestamp and duration agree.
      const completedAt = Date.now();

      results.push({
        benchmarkId: `matchup-${condition}-${completedAt}`,
        timestamp: completedAt,
        duration: completedAt - startTime,
        agents: flatAgentResults,
        archetypeRankings: rankings,
        headToHead,
        marketCondition: condition,
        insights,
      });

      logger.info(
        `Completed ${condition} market benchmark`,
        {
          topArchetype: rankings[0]?.archetype,
          avgPnl: rankings[0]?.avgPnl.toFixed(2),
        },
        'ArchetypeMatchupBenchmark'
      );
    }
  } finally {
    // Cleanup
    // NOTE(review): the return value is not awaited, as before — confirm
    // whether `unloadAll` is synchronous.
    this.orchestrator.unloadAll();
  }

  const totalDuration = Date.now() - startTime;
  logger.info(
    'Archetype Matchup Benchmark complete',
    {
      totalDurationMs: totalDuration,
      conditionsTested: this.config.marketConditions.length,
      totalRounds: this.config.rounds * this.config.marketConditions.length,
    },
    'ArchetypeMatchupBenchmark'
  );

  return results;
}
|
|
757
|
+
|
|
758
|
+
/**
 * Generate a Markdown summary report of the matchup results.
 *
 * For each result the report contains a heading with the upper-cased market
 * condition, an "Overall Rankings" table, a "Head-to-Head Results" table
 * and, when any insights exist, a "Key Insights" bullet list.
 *
 * In the head-to-head table the winner is the side with the strictly higher
 * win rate; when both sides have the same win rate the row shows `Tie`
 * rather than crediting either archetype.
 *
 * @param results - Benchmark results, typically the value returned by `run()`.
 * @returns The report as a single newline-joined Markdown string.
 */
static generateReport(results: MatchupBenchmarkResult[]): string {
  const lines: string[] = [];
  lines.push('# Archetype Matchup Benchmark Report\n');

  for (const result of results) {
    lines.push(
      `## ${result.marketCondition.toUpperCase()} Market Conditions\n`
    );

    // Rankings table (the first column shows the average rank)
    lines.push('### Overall Rankings\n');
    lines.push('| Rank | Archetype | Avg PnL | Win Rate |');
    lines.push('|------|-----------|---------|----------|');
    for (const ranking of result.archetypeRankings) {
      lines.push(
        `| ${ranking.avgRank.toFixed(1)} | ${ranking.archetype} | ${ranking.avgPnl.toFixed(2)} | ${(ranking.winRate * 100).toFixed(1)}% |`
      );
    }
    lines.push('');

    // Head-to-head table
    lines.push('### Head-to-Head Results\n');
    lines.push('| Matchup | Winner | Win Rate |');
    lines.push('|---------|--------|----------|');
    for (const h2h of result.headToHead) {
      // Equal win rates have no winner; do not default to archetype2.
      let winner: string;
      if (h2h.winRate1 === h2h.winRate2) {
        winner = 'Tie';
      } else if (h2h.winRate1 > h2h.winRate2) {
        winner = h2h.archetype1;
      } else {
        winner = h2h.archetype2;
      }
      const winRate = Math.max(h2h.winRate1, h2h.winRate2);
      lines.push(
        `| ${h2h.archetype1} vs ${h2h.archetype2} | ${winner} | ${(winRate * 100).toFixed(1)}% |`
      );
    }
    lines.push('');

    // Insights
    if (result.insights.length > 0) {
      lines.push('### Key Insights\n');
      for (const insight of result.insights) {
        lines.push(`- ${insight}`);
      }
      lines.push('');
    }
  }

  return lines.join('\n');
}
|
|
807
|
+
}
|
|
808
|
+
|
|
809
|
+
/**
 * Convenience entry point: build an `ArchetypeMatchupBenchmark` with a fixed
 * preset and run it.
 *
 * Preset: all archetypes, 2 agents per archetype, 5 rounds of 100 ticks,
 * the four market conditions bull / bear / volatile / stable, and 16 GB of
 * available VRAM.
 *
 * @returns The results produced by `ArchetypeMatchupBenchmark.run()`.
 */
export async function runQuickMatchupBenchmark(): Promise<MatchupBenchmarkResult[]> {
  return new ArchetypeMatchupBenchmark({
    archetypes: 'all',
    agentsPerArchetype: 2,
    rounds: 5,
    ticksPerRound: 100,
    marketConditions: ['bull', 'bear', 'volatile', 'stable'],
    availableVramGb: 16,
  }).run();
}
|