npm - @elizaos/training - Versions diffs - 2.0.0-alpha.11 - Mend

@elizaos/training 2.0.0-alpha.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (207) hide show

package/Dockerfile +75 -0
package/Makefile +374 -0
package/README.md +346 -0
package/config/rubrics.json +137 -0
package/data/.gitkeep +0 -0
package/data/degen/.gitkeep +2 -0
package/data/trader/.gitkeep +2 -0
package/docker-compose.test.yml +57 -0
package/package.json +58 -0
package/python/config/babylon_atropos.yaml +90 -0
package/python/config/profiles/12gb.json +11 -0
package/python/config/profiles/16gb.json +10 -0
package/python/config/profiles/24gb.json +10 -0
package/python/config/profiles/48gb.json +10 -0
package/python/config/profiles/cpu.json +11 -0
package/python/config/profiles/l40-2gpu-safe.json +20 -0
package/python/config/profiles/l40-2gpu.json +22 -0
package/python/config/profiles/l40-4gpu.json +21 -0
package/python/config/profiles/l40.json +17 -0
package/python/config/tinker_training.yaml +143 -0
package/python/curriculum_state.json +165 -0
package/python/env.template +86 -0
package/python/env.training.template +46 -0
package/python/pyproject.toml +41 -0
package/python/requirements-ci.txt +31 -0
package/python/requirements.txt +87 -0
package/python/scripts/__init__.py +4 -0
package/python/scripts/import_json_trajectories.py +412 -0
package/python/scripts/local-finetune/README.md +63 -0
package/python/scripts/local-finetune/ingest_and_score.py +139 -0
package/python/scripts/local-finetune/merge_model.py +32 -0
package/python/scripts/local-finetune/test_adapter.py +91 -0
package/python/scripts/local-finetune/train_from_csv.py +132 -0
package/python/scripts/merge_trajectories.py +318 -0
package/python/scripts/run_ab_test.py +143 -0
package/python/scripts/run_full_pipeline.py +544 -0
package/python/scripts/run_tinker_training.py +192 -0
package/python/scripts/run_training.py +914 -0
package/python/scripts/test_judge.py +155 -0
package/python/scripts/test_pipeline.py +356 -0
package/python/scripts/test_trained_model.py +380 -0
package/python/scripts/train_local.py +528 -0
package/python/setup.py +20 -0
package/python/src/__init__.py +190 -0
package/python/src/data_bridge/__init__.py +24 -0
package/python/src/data_bridge/converter.py +435 -0
package/python/src/data_bridge/reader.py +393 -0
package/python/src/models.py +283 -0
package/python/src/training/__init__.py +605 -0
package/python/src/training/ab_testing.py +404 -0
package/python/src/training/action_executor.py +621 -0
package/python/src/training/archetype_trainer.py +347 -0
package/python/src/training/atropos_trainer.py +980 -0
package/python/src/training/babylon_env.py +1254 -0
package/python/src/training/error_recovery.py +647 -0
package/python/src/training/evaluation.py +856 -0
package/python/src/training/fast_simulator.py +880 -0
package/python/src/training/format_validator.py +584 -0
package/python/src/training/hybrid_env.py +522 -0
package/python/src/training/kl_controller.py +628 -0
package/python/src/training/multi_prompt_dataset.py +883 -0
package/python/src/training/multi_turn.py +656 -0
package/python/src/training/online_env.py +1084 -0
package/python/src/training/quality_scorer.py +391 -0
package/python/src/training/quality_utils.py +633 -0
package/python/src/training/rewards.py +1344 -0
package/python/src/training/rlaif_env.py +17 -0
package/python/src/training/rollout_generator.py +502 -0
package/python/src/training/rubric_loader.py +198 -0
package/python/src/training/scenario_pool.py +1072 -0
package/python/src/training/schemas.py +481 -0
package/python/src/training/service_manager.py +552 -0
package/python/src/training/simulation_bridge.py +535 -0
package/python/src/training/tick_reward_attribution.py +399 -0
package/python/src/training/tinker_client.py +575 -0
package/python/src/training/tinker_trainer.py +646 -0
package/python/src/training/tokenization_utils.py +402 -0
package/python/tests/e2e/__init__.py +13 -0
package/python/tests/e2e/conftest.py +258 -0
package/python/tests/e2e/test_full_pipeline.py +643 -0
package/python/tests/e2e/test_online_training_e2e.py +365 -0
package/python/tests/integration/__init__.py +12 -0
package/python/tests/integration/conftest.py +383 -0
package/python/tests/integration/test_db_integration.py +649 -0
package/python/tests/integration/test_json_mode_integration.py +554 -0
package/python/tests/test_action_executor.py +594 -0
package/python/tests/test_archetype_scoring.py +1027 -0
package/python/tests/test_atropos_integration.py +360 -0
package/python/tests/test_evaluation.py +727 -0
package/python/tests/test_format_validator.py +486 -0
package/python/tests/test_kl_controller.py +432 -0
package/python/tests/test_lr_scheduler.py +579 -0
package/python/tests/test_multi_turn.py +590 -0
package/python/tests/test_online_env.py +519 -0
package/python/tests/test_quality_scorer.py +474 -0
package/python/tests/test_scenario_pool.py +735 -0
package/python/tests/test_service_manager.py +585 -0
package/python/tests/test_simulation_rollout.py +581 -0
package/python/tests/test_tokenization_utils.py +501 -0
package/python/tests/test_training_orchestrator.py +497 -0
package/python/tests/test_training_output_structure.py +661 -0
package/research-output/training-runs/training-run-1770772042899.json +26 -0
package/research-output/training-runs/training-run-1770930079670.json +32 -0
package/research-output/training-runs/training-run-1770930143700.json +44 -0
package/research-output/training-runs/training-run-1770930183638.json +38 -0
package/research-output/training-runs/training-run-1770930442049.json +38 -0
package/research-output/training-runs/training-run-1770930793243.json +38 -0
package/scripts/assess-training-data.ts +422 -0
package/scripts/e2e-training-test.ts +550 -0
package/scripts/export-rubrics.ts +64 -0
package/scripts/generate-research-report.ts +1523 -0
package/scripts/generate_dataset.sh +173 -0
package/scripts/json-mode-benchmark.ts +399 -0
package/scripts/real-archetype-benchmark.ts +210 -0
package/scripts/run-baseline-comparison.ts +116 -0
package/scripts/run-full-pipeline.ts +272 -0
package/scripts/runpod_setup.sh +137 -0
package/scripts/runpod_validate.sh +147 -0
package/scripts/test-model-in-game.ts +955 -0
package/scripts/test-scoring.ts +73 -0
package/scripts/test-trained-model.ts +209 -0
package/scripts/train-and-test.ts +824 -0
package/scripts/verify-final.ts +118 -0
package/src/adapter.ts +516 -0
package/src/archetypes/ArchetypeConfigService.ts +626 -0
package/src/archetypes/derive-archetype.ts +249 -0
package/src/archetypes/index.ts +22 -0
package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
package/src/benchmark/BenchmarkDataViewer.ts +324 -0
package/src/benchmark/BenchmarkHistoryService.ts +221 -0
package/src/benchmark/BenchmarkRunner.ts +685 -0
package/src/benchmark/BenchmarkValidator.ts +206 -0
package/src/benchmark/FastEvalRunner.ts +225 -0
package/src/benchmark/MetricsValidator.ts +165 -0
package/src/benchmark/MetricsVisualizer.ts +909 -0
package/src/benchmark/ModelBenchmarkService.ts +611 -0
package/src/benchmark/ModelRegistry.ts +158 -0
package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
package/src/benchmark/SimulationA2AInterface.ts +1169 -0
package/src/benchmark/SimulationEngine.ts +832 -0
package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
package/src/benchmark/index.ts +89 -0
package/src/benchmark/parseSimulationMetrics.ts +124 -0
package/src/benchmark/simulation-types.ts +78 -0
package/src/dependencies.ts +439 -0
package/src/generation/TrajectoryGenerator.ts +387 -0
package/src/generation/index.ts +12 -0
package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
package/src/huggingface/index.ts +27 -0
package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
package/src/index.ts +102 -0
package/src/init-training.ts +53 -0
package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
package/src/metrics/index.ts +8 -0
package/src/metrics/types.ts +200 -0
package/src/rubrics/__tests__/index.test.ts +184 -0
package/src/rubrics/ass-kisser.ts +85 -0
package/src/rubrics/degen.ts +80 -0
package/src/rubrics/goody-twoshoes.ts +84 -0
package/src/rubrics/index.ts +236 -0
package/src/rubrics/information-trader.ts +84 -0
package/src/rubrics/infosec.ts +101 -0
package/src/rubrics/liar.ts +104 -0
package/src/rubrics/perps-trader.ts +87 -0
package/src/rubrics/researcher.ts +81 -0
package/src/rubrics/scammer.ts +82 -0
package/src/rubrics/social-butterfly.ts +73 -0
package/src/rubrics/super-predictor.ts +97 -0
package/src/rubrics/trader.ts +67 -0
package/src/scoring/ArchetypeScoringService.ts +486 -0
package/src/scoring/JudgePromptBuilder.ts +556 -0
package/src/scoring/LLMJudgeCache.ts +401 -0
package/src/scoring/index.ts +9 -0
package/src/training/AutomationPipeline.ts +916 -0
package/src/training/BenchmarkService.ts +518 -0
package/src/training/ConfigValidator.ts +220 -0
package/src/training/MarketOutcomesTracker.ts +187 -0
package/src/training/ModelDeployer.ts +186 -0
package/src/training/ModelFetcher.ts +76 -0
package/src/training/ModelSelectionService.ts +341 -0
package/src/training/ModelUsageVerifier.ts +160 -0
package/src/training/MultiModelOrchestrator.ts +580 -0
package/src/training/RLModelConfig.ts +407 -0
package/src/training/RewardBackpropagationService.ts +149 -0
package/src/training/RulerScoringService.ts +666 -0
package/src/training/TrainingMonitor.ts +166 -0
package/src/training/TrajectoryRecorder.ts +399 -0
package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
package/src/training/index.ts +100 -0
package/src/training/logRLConfig.ts +34 -0
package/src/training/pipeline.ts +129 -0
package/src/training/storage/ModelStorageService.ts +279 -0
package/src/training/storage/TrainingDataArchiver.ts +197 -0
package/src/training/storage/index.ts +17 -0
package/src/training/types.ts +207 -0
package/src/training/window-utils.ts +138 -0
package/src/utils/index.ts +101 -0
package/src/utils/logger.ts +59 -0
package/src/utils/snowflake.ts +17 -0
package/src/utils/synthetic-detector.ts +111 -0
package/tsconfig.json +20 -0

package/python/tests/test_online_env.py ADDED Viewed

@@ -0,0 +1,519 @@
+"""
+Tests for BabylonOnlineEnv.
+
+Tests cover:
+- Prompt building
+- Action parsing
+- Thinking extraction
+- Response scoring
+- Environment configuration and setup
+- Integration with scenario pool
+"""
+import pytest
+from unittest.mock import AsyncMock, MagicMock, patch, Mock
+from src.training.online_env import (
+    build_trading_system_prompt,
+    build_observation_prompt,
+    parse_action_from_response,
+    extract_thinking,
+    score_response,
+    BabylonOnlineEnv,
+    BabylonOnlineEnvConfig,
+)
+from src.training.scenario_pool import (
+    Scenario,
+    ScenarioPoolConfig,
+    MarketState,
+    PerpetualState,
+    NewsItem,
+    PortfolioState,
+)
+# =============================================================================
+# Prompt Building Tests
+# =============================================================================
+class TestBuildTradingSystemPrompt:
+    """Tests for build_trading_system_prompt"""
+    def test_default_trader(self):
+        prompt = build_trading_system_prompt("trader")
+        assert "trading agent" in prompt.lower()
+        assert "trader" in prompt.lower()
+        assert "<think>" in prompt
+        assert "</think>" in prompt
+        assert "action" in prompt.lower()
+    def test_degen_archetype(self):
+        prompt = build_trading_system_prompt("degen")
+        assert "high-frequency" in prompt.lower() or "volume" in prompt.lower()
+    def test_analyst_archetype(self):
+        prompt = build_trading_system_prompt("analyst")
+        assert "research" in prompt.lower() or "analysis" in prompt.lower()
+    def test_unknown_archetype_defaults(self):
+        prompt = build_trading_system_prompt("unknown")
+        # Should get default (trader) behavior
+        assert "trading agent" in prompt.lower()
+    def test_contains_action_examples(self):
+        prompt = build_trading_system_prompt()
+        assert "buy" in prompt
+        assert "sell" in prompt
+        assert "wait" in prompt
+class TestBuildObservationPrompt:
+    """Tests for build_observation_prompt"""
+    def test_basic_scenario(self):
+        scenario = Scenario(
+            id="test-1",
+            source="synthetic",
+            portfolio=PortfolioState(balance=15000.0, total_pnl=500.0),
+        )
+        prompt = build_observation_prompt(scenario)
+        assert "15000" in prompt or "15,000" in prompt
+        assert "500" in prompt
+        assert "MARKET UPDATE" in prompt
+    def test_with_markets(self):
+        scenario = Scenario(
+            id="test-markets",
+            source="synthetic",
+            markets=[
+                MarketState(
+                    market_id="m1",
+                    question="Will BTC hit $100K?",
+                    yes_price=0.65,
+                    no_price=0.35,
+                    volume_24h=100000.0,
+                    liquidity=500000.0,
+                    expires_at=1735689600000,
+                )
+            ],
+        )
+        prompt = build_observation_prompt(scenario)
+        assert "PREDICTION MARKETS" in prompt
+        assert "BTC" in prompt
+        assert "0.65" in prompt
+    def test_with_perpetuals(self):
+        scenario = Scenario(
+            id="test-perps",
+            source="synthetic",
+            perpetuals=[
+                PerpetualState(
+                    ticker="ETH",
+                    mark_price=3500.0,
+                    index_price=3495.0,
+                    funding_rate=0.0001,
+                    open_interest=25000000.0,
+                    volume_24h=50000000.0,
+                    change_24h=0.02,
+                    high_24h=3600.0,
+                    low_24h=3400.0,
+                )
+            ],
+        )
+        prompt = build_observation_prompt(scenario)
+        assert "PERPETUAL MARKETS" in prompt
+        assert "ETH" in prompt
+        assert "3,500" in prompt or "3500" in prompt
+    def test_with_news(self):
+        scenario = Scenario(
+            id="test-news",
+            source="synthetic",
+            news=[
+                NewsItem(
+                    headline="Bitcoin Rally Continues",
+                    sentiment="bullish",
+                    impact="high",
+                    source="CryptoNews",
+                    timestamp=1735689600000,
+                )
+            ],
+        )
+        prompt = build_observation_prompt(scenario)
+        assert "RECENT NEWS" in prompt
+        assert "Bitcoin Rally" in prompt
+        assert "CryptoNews" in prompt
+# =============================================================================
+# Action Parsing Tests
+# =============================================================================
+class TestParseActionFromResponse:
+    """Tests for parse_action_from_response"""
+    def test_simple_json(self):
+        response = '{"action": "buy", "market": "m1", "amount": 100}'
+        action = parse_action_from_response(response)
+        assert action is not None
+        assert action["action"] == "buy"
+        assert action["market"] == "m1"
+        assert action["amount"] == 100
+    def test_with_think_tags(self):
+        response = """<think>
+I should analyze the market carefully.
+BTC is showing bullish momentum.
+</think>
+{"action": "open_perp", "ticker": "BTC", "size": 0.1, "direction": "long"}"""
+        action = parse_action_from_response(response)
+        assert action is not None
+        assert action["action"] == "open_perp"
+        assert action["ticker"] == "BTC"
+    def test_wait_action(self):
+        response = '{"action": "wait", "reason": "Need more data"}'
+        action = parse_action_from_response(response)
+        assert action is not None
+        assert action["action"] == "wait"
+    def test_invalid_json(self):
+        response = "This is not JSON at all"
+        action = parse_action_from_response(response)
+        assert action is None
+    def test_json_without_action_key(self):
+        response = '{"type": "buy", "amount": 100}'
+        action = parse_action_from_response(response)
+        assert action is None
+    def test_nested_json_in_text(self):
+        response = """Here's my analysis and decision:
+Based on the data, I'll buy.
+{"action": "buy", "market": "market-1", "amount": 50, "side": "yes"}
+This should be profitable."""
+        action = parse_action_from_response(response)
+        assert action is not None
+        assert action["action"] == "buy"
+    def test_multiple_json_objects_takes_first_valid(self):
+        # The function finds the first JSON with an "action" key
+        # when first JSON doesn't have action, it may not find the second
+        # This is expected behavior - we look for action JSON in specific patterns
+        response = """Some text here
+{"action": "sell", "market": "m2", "amount": 25}"""
+        action = parse_action_from_response(response)
+        assert action is not None
+        assert action["action"] == "sell"
+class TestExtractThinking:
+    """Tests for extract_thinking"""
+    def test_valid_think_tags(self):
+        response = "<think>This is my analysis</think>\n{\"action\": \"wait\"}"
+        thinking = extract_thinking(response)
+        assert thinking == "This is my analysis"
+    def test_multiline_thinking(self):
+        response = """<think>
+Line 1
+Line 2
+Line 3
+</think>
+{"action": "buy"}"""
+        thinking = extract_thinking(response)
+        assert "Line 1" in thinking
+        assert "Line 3" in thinking
+    def test_no_think_tags(self):
+        response = '{"action": "wait"}'
+        thinking = extract_thinking(response)
+        assert thinking == ""
+    def test_empty_think_tags(self):
+        response = "<think></think>action"
+        thinking = extract_thinking(response)
+        assert thinking == ""
+# =============================================================================
+# Scoring Tests
+# =============================================================================
+class TestScoreResponse:
+    """Tests for score_response"""
+    def test_well_formatted_response(self):
+        scenario = Scenario(id="test", source="synthetic")
+        response = """<think>
+The market is showing bullish momentum with BTC trading at $100,000.
+I should consider opening a long position because the funding rate is low.
+Looking at the risk, a small position of 0.1 BTC seems reasonable.
+</think>
+{"action": "open_perp", "ticker": "BTC", "size": 0.1, "direction": "long"}"""
+        score, metrics = score_response(response, scenario, "trader")
+        assert metrics["has_thinking"] is True
+        assert metrics["has_valid_action"] is True
+        assert metrics["action_type"] == "open_perp"
+        assert metrics["format_score"] > 0.5
+        assert metrics["reasoning_score"] > 0.3
+    def test_no_thinking_tags(self):
+        scenario = Scenario(id="test", source="synthetic")
+        response = '{"action": "wait", "reason": "unclear"}'
+        score, metrics = score_response(response, scenario, "trader")
+        assert metrics["has_thinking"] is False
+        assert metrics["has_valid_action"] is True
+        assert metrics["format_score"] < 0.5
+    def test_invalid_action(self):
+        scenario = Scenario(id="test", source="synthetic")
+        response = "<think>Analysis here</think>\nI'll wait for now."
+        score, metrics = score_response(response, scenario, "trader")
+        assert metrics["has_thinking"] is True
+        assert metrics["has_valid_action"] is False
+        assert metrics["action_type"] is None
+    def test_very_short_response_penalized(self):
+        scenario = Scenario(id="test", source="synthetic")
+        response = '{"action": "wait"}'
+        score, metrics = score_response(response, scenario, "trader")
+        # Short responses should have lower format scores
+        assert metrics["format_score"] <= 0.3
+    def test_reasoning_with_analysis_terms(self):
+        scenario = Scenario(id="test", source="synthetic")
+        response = """<think>
+The price is showing strong momentum with high volume.
+The trend is bullish and the market sentiment is positive.
+Given the probability of success and managing risk, I'll proceed.
+</think>
+{"action": "buy", "market": "m1", "amount": 100, "side": "yes"}"""
+        score, metrics = score_response(response, scenario, "trader")
+        # Should have high reasoning score due to analysis terms
+        assert metrics["reasoning_score"] > 0.4
+    def test_different_archetypes_affect_score(self):
+        scenario = Scenario(id="test", source="synthetic")
+        # A trade-heavy response
+        response = """<think>Quick analysis - buying now.</think>
+{"action": "buy", "market": "m1", "amount": 1000, "side": "yes"}"""
+        trader_score, _ = score_response(response, scenario, "trader")
+        degen_score, _ = score_response(response, scenario, "degen")
+        # Both should be scored (actual values depend on reward weights)
+        assert trader_score is not None
+        assert degen_score is not None
+# =============================================================================
+# Environment Tests
+# =============================================================================
+class TestBabylonOnlineEnvConfig:
+    """Tests for BabylonOnlineEnvConfig"""
+    def test_default_config(self):
+        config = BabylonOnlineEnvConfig()
+        assert config.group_size == 4
+        assert config.max_response_tokens == 512
+        assert config.temperature == 0.8
+        assert config.default_archetype == "trader"
+    def test_archetype_distribution(self):
+        config = BabylonOnlineEnvConfig()
+        assert "trader" in config.archetype_distribution
+        assert "degen" in config.archetype_distribution
+        assert sum(config.archetype_distribution.values()) == pytest.approx(1.0)
+class TestBabylonOnlineEnv:
+    """Tests for BabylonOnlineEnv (mock-based)"""
+    def test_config_init(self):
+        env_config, server_configs = BabylonOnlineEnv.config_init()
+        assert isinstance(env_config, BabylonOnlineEnvConfig)
+        assert len(server_configs) > 0
+        assert env_config.group_size >= 2
+    @pytest.mark.asyncio
+    async def test_setup_initializes_pool(self):
+        """Test that setup creates and initializes scenario pool"""
+        # Use config_init to get proper configs that include server_configs
+        config, server_configs = BabylonOnlineEnv.config_init()
+        # Mock the server manager to avoid actual vLLM calls
+        with patch('atroposlib.envs.base.ServerManager'):
+            env = BabylonOnlineEnv(config, server_configs, testing=True)
+            await env.setup()
+            assert env.scenario_pool is not None
+            assert len(env.scenario_pool.scenarios) > 0
+    @pytest.mark.asyncio
+    async def test_get_next_item_returns_scenario(self):
+        """Test get_next_item returns a scenario and archetype"""
+        config, server_configs = BabylonOnlineEnv.config_init()
+        with patch('atroposlib.envs.base.ServerManager'):
+            env = BabylonOnlineEnv(config, server_configs, testing=True)
+            await env.setup()
+            item = await env.get_next_item()
+            assert item is not None
+            scenario, archetype = item
+            assert isinstance(scenario, Scenario)
+            assert archetype in config.archetype_distribution
+class TestIntegration:
+    """Integration tests for online environment components"""
+    def test_full_prompt_building_flow(self):
+        """Test building prompts from scenario to final messages"""
+        scenario = Scenario(
+            id="integration-test",
+            source="synthetic",
+            markets=[
+                MarketState(
+                    market_id="btc-100k",
+                    question="Will BTC exceed $100K by EOY?",
+                    yes_price=0.72,
+                    no_price=0.28,
+                    volume_24h=500000.0,
+                    liquidity=1000000.0,
+                    expires_at=1735689600000,
+                )
+            ],
+            perpetuals=[
+                PerpetualState(
+                    ticker="BTC",
+                    mark_price=98000.0,
+                    index_price=97950.0,
+                    funding_rate=0.0002,
+                    open_interest=100000000.0,
+                    volume_24h=500000000.0,
+                    change_24h=0.03,
+                    high_24h=99000.0,
+                    low_24h=95000.0,
+                )
+            ],
+            news=[
+                NewsItem(
+                    headline="Institutional Buying Accelerates",
+                    sentiment="bullish",
+                    impact="high",
+                    source="Bloomberg",
+                    timestamp=1735689600000,
+                )
+            ],
+            portfolio=PortfolioState(balance=50000.0, total_pnl=2500.0),
+            difficulty="hard",
+        )
+        system_prompt = build_trading_system_prompt("trader")
+        user_prompt = build_observation_prompt(scenario)
+        # Verify prompts are valid
+        assert len(system_prompt) > 100
+        assert len(user_prompt) > 100
+        # Verify key content is present
+        assert "98,000" in user_prompt or "98000" in user_prompt  # BTC price
+        assert "100K" in user_prompt  # Question
+        assert "Institutional" in user_prompt  # News
+        assert "50,000" in user_prompt or "50000" in user_prompt  # Balance
+    def test_scoring_pipeline(self):
+        """Test full scoring pipeline"""
+        scenario = Scenario(id="scoring-test", source="synthetic")
+        # Simulate different quality responses
+        responses = [
+            # High quality
+            """<think>
+The market shows strong bullish momentum. BTC is at $98K with positive funding.
+The prediction market for $100K is at 72% YES which seems fair given momentum.
+I'll take a small long position because the risk/reward is favorable.
+</think>
+{"action": "open_perp", "ticker": "BTC", "size": 0.05, "direction": "long"}""",
+            # Medium quality
+            """<think>Buying BTC looks good.</think>
+{"action": "buy", "market": "btc-100k", "amount": 100, "side": "yes"}""",
+            # Low quality
+            '{"action": "wait"}',
+        ]
+        scores = []
+        for resp in responses:
+            score, _ = score_response(resp, scenario, "trader")
+            scores.append(score)
+        # Higher quality responses should generally score higher
+        # (Though exact ordering depends on reward weights)
+        assert all(isinstance(s, float) for s in scores)