synth-ai 0.2.12__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/multi_step/configs/crafter_rl_outcome.toml +74 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +186 -0
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +83 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +78 -0
- examples/multi_step/crafter_rl_lora.md +51 -10
- examples/multi_step/sse_metrics_streaming_notes.md +357 -0
- examples/multi_step/task_app_config_notes.md +7 -1
- examples/swe/task_app/grpo_swe_mini.py +55 -26
- examples/swe/task_app/hosted/rollout.py +40 -0
- examples/swe/task_app/hosted/test_service.py +5 -6
- examples/task_apps/TESTING.md +275 -0
- examples/task_apps/__init__.py +0 -0
- examples/task_apps/crafter/__init__.py +0 -0
- examples/task_apps/crafter/task_app/__init__.py +2 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +21 -46
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +109 -45
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +67 -49
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +242 -193
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
- examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
- examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
- examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
- examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
- examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
- examples/task_apps/enron/task_app/README.md +14 -0
- examples/task_apps/enron/task_app/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron.py +906 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
- examples/task_apps/enron/tests/__init__.py +2 -0
- examples/task_apps/enron/tests/conftest.py +115 -0
- examples/task_apps/enron/tests/integration/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
- examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
- examples/task_apps/enron/tests/unit/__init__.py +2 -0
- examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
- examples/task_apps/math/__init__.py +0 -0
- examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
- examples/task_apps/pokemon_battle/__init__.py +2 -0
- examples/task_apps/pokemon_battle/modal_app.py +104 -0
- examples/task_apps/pokemon_battle/task_app/README.md +68 -0
- examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
- examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
- examples/task_apps/pokemon_red/README.md +357 -0
- examples/task_apps/pokemon_red/__init__.py +3 -0
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
- examples/task_apps/pokemon_red/task_app.py +606 -0
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
- examples/task_apps/sokoban/README.md +307 -0
- examples/task_apps/sokoban/__init__.py +3 -0
- examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
- examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
- examples/task_apps/sokoban/task_app.py +1058 -0
- examples/task_apps/sokoban/tests/__init__.py +2 -0
- examples/task_apps/sokoban/tests/conftest.py +113 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
- examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
- examples/task_apps/verilog/__init__.py +1 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
- examples/task_apps/verilog/task_app/README.md +12 -0
- examples/task_apps/verilog/task_app/__init__.py +1 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
- examples/task_apps/verilog/tests/__init__.py +2 -0
- examples/task_apps/verilog/tests/conftest.py +115 -0
- examples/task_apps/verilog/tests/integration/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
- examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
- examples/task_apps/verilog/tests/unit/__init__.py +2 -0
- examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
- examples/vlm/crafter_openai_vlm_agent.py +4 -4
- examples/vlm/run_crafter_vlm_benchmark.py +4 -4
- examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +4 -2
- examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +4 -2
- examples/warming_up_to_rl/run_eval.py +127 -18
- examples/workflows/__init__.py +0 -0
- examples/workflows/math_rl/__init__.py +0 -0
- examples/workflows/math_rl/download_dataset.py +80 -0
- synth_ai/__init__.py +41 -1
- synth_ai/api/train/builders.py +73 -29
- synth_ai/api/train/cli.py +12 -6
- synth_ai/api/train/configs/__init__.py +44 -0
- synth_ai/api/train/configs/rl.py +134 -0
- synth_ai/api/train/configs/sft.py +95 -0
- synth_ai/api/train/configs/shared.py +24 -0
- synth_ai/api/train/env_resolver.py +5 -2
- synth_ai/api/train/supported_algos.py +10 -5
- synth_ai/api/train/utils.py +7 -4
- synth_ai/cli/__init__.py +7 -51
- synth_ai/cli/_storage.py +4 -3
- synth_ai/cli/_validate_task_app.py +11 -0
- synth_ai/cli/balance.py +4 -3
- synth_ai/cli/calc.py +2 -2
- synth_ai/cli/demo.py +49 -43
- synth_ai/cli/legacy_root_backup.py +1 -1
- synth_ai/cli/rl_demo.py +86 -106
- synth_ai/cli/root.py +0 -97
- synth_ai/cli/task_apps.py +1710 -186
- synth_ai/demos/core/cli.py +121 -159
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
- synth_ai/environments/examples/crafter_classic/environment.py +16 -0
- synth_ai/environments/examples/enron/engine.py +7 -2
- synth_ai/environments/examples/enron/environment.py +68 -0
- synth_ai/environments/examples/red/engine.py +27 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
- synth_ai/environments/examples/red/environment.py +60 -0
- synth_ai/environments/examples/sokoban/taskset.py +116 -0
- synth_ai/environments/examples/verilog/engine.py +30 -4
- synth_ai/evals/__init__.py +15 -0
- synth_ai/evals/client.py +82 -0
- synth_ai/evals/types.py +42 -0
- synth_ai/jobs/client.py +16 -4
- synth_ai/judge_schemas.py +127 -0
- synth_ai/py.typed +0 -0
- synth_ai/task/__init__.py +14 -5
- synth_ai/task/contracts.py +124 -38
- synth_ai/task/proxy.py +48 -56
- synth_ai/task/rubrics/__init__.py +53 -0
- synth_ai/task/rubrics/loaders.py +133 -0
- synth_ai/task/rubrics/models.py +57 -0
- synth_ai/task/rubrics/scoring.py +113 -0
- synth_ai/task/rubrics/strict.py +149 -0
- synth_ai/task/server.py +8 -7
- synth_ai/task/validators.py +269 -6
- synth_ai/tracing_v3/decorators.py +7 -3
- synth_ai/tracing_v3/replica_sync.py +4 -4
- synth_ai/tracing_v3/serialization.py +130 -0
- synth_ai/tracing_v3/trace_utils.py +317 -0
- synth_ai/tracing_v3/turso/native_manager.py +3 -3
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +228 -89
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -1
- synth_ai/task/rubrics.py +0 -219
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
- /examples/{rl/task_app → task_apps/math}/README.md +0 -0
- /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
- /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
# Pokémon Red Task App
|
|
2
|
+
|
|
3
|
+
A reinforcement learning environment for Pokémon Red using PyBoy emulation with VLM support.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Full Game Boy Emulation**: Uses PyBoy to run authentic Pokémon Red ROM
|
|
8
|
+
- **VLM Support**: Base64-encoded PNG frames for vision models (GPT-4V, Qwen-VL, etc.)
|
|
9
|
+
- **Policy Proxy**: OpenAI/Groq API integration for LLM-driven gameplay
|
|
10
|
+
- **Rich State Extraction**: Comprehensive game state from RAM (HP, position, party, battle data)
|
|
11
|
+
- **Reward Shaping**: Ultra-dense reward functions for RL training
|
|
12
|
+
- **Instant Start**: Pre-configured init state skips intro (starts in Red's bedroom)
|
|
13
|
+
|
|
14
|
+
## Quick Start
|
|
15
|
+
|
|
16
|
+
### 1. Start the Task App Server
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
# From synth-ai root
|
|
20
|
+
uv run -m synth_ai task-app serve pokemon_red --port 8913
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
### 2. Run a Random Rollout
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
import httpx
|
|
27
|
+
import asyncio
|
|
28
|
+
|
|
29
|
+
async def test_rollout():
|
|
30
|
+
async with httpx.AsyncClient(timeout=60.0) as client:
|
|
31
|
+
response = await client.post(
|
|
32
|
+
"http://127.0.0.1:8913/rollout",
|
|
33
|
+
json={
|
|
34
|
+
"ops": [
|
|
35
|
+
{"button": "DOWN", "frames": 10},
|
|
36
|
+
{"button": "A", "frames": 20},
|
|
37
|
+
{"button": "RIGHT", "frames": 15},
|
|
38
|
+
],
|
|
39
|
+
"policy": {"config": {}},
|
|
40
|
+
},
|
|
41
|
+
)
|
|
42
|
+
result = response.json()
|
|
43
|
+
print(f"Steps: {len(result['trajectories'][0]['steps'])}")
|
|
44
|
+
|
|
45
|
+
asyncio.run(test_rollout())
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### 3. Run with VLM Policy
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# Using Qwen-VL via Groq
|
|
52
|
+
uv run python examples/task_apps/pokemon_red/test_pallet_town_rewards.py
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Reward Functions
|
|
56
|
+
|
|
57
|
+
### Pallet Town Progression (Recommended for Beginners)
|
|
58
|
+
|
|
59
|
+
**Location**: `synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py`
|
|
60
|
+
|
|
61
|
+
Ultra-rich reward shaping for the opening sequence:
|
|
62
|
+
|
|
63
|
+
| Milestone | Reward | Description |
|
|
64
|
+
|-----------|--------|-------------|
|
|
65
|
+
| Leave bedroom | +20 | Go downstairs |
|
|
66
|
+
| Exit house | +30 | Enter Pallet Town |
|
|
67
|
+
| Find Oak's lab | +40 | Discover and enter lab |
|
|
68
|
+
| Talk to Oak | +50 | First dialogue |
|
|
69
|
+
| Get starter | +100 | Receive your first Pokémon |
|
|
70
|
+
| Enter battle | +75 | Start rival battle |
|
|
71
|
+
| Deal damage | +50 | Attack rival (10×5) |
|
|
72
|
+
| Half HP | +25 | Reduce enemy to <50% HP |
|
|
73
|
+
| Low HP | +35 | Reduce enemy to <25% HP |
|
|
74
|
+
| Win battle | +150 | Defeat rival |
|
|
75
|
+
| Exit lab | +60 | Leave with Pokémon |
|
|
76
|
+
| **Efficiency bonuses** | +100 | Fast navigation, healthy Pokémon |
|
|
77
|
+
|
|
78
|
+
**Total: ~635 points from milestones, plus up to ~100 in efficiency bonuses (~735 max)**
|
|
79
|
+
|
|
80
|
+
See [`PALLET_TOWN_REWARDS.md`](../../../synth_ai/environments/examples/red/engine_helpers/reward_library/PALLET_TOWN_REWARDS.md) for full documentation.
|
|
81
|
+
|
|
82
|
+
### Usage in Training
|
|
83
|
+
|
|
84
|
+
```toml
|
|
85
|
+
# pallet_town_rl_config.toml
|
|
86
|
+
[reward]
|
|
87
|
+
reward_type = "composite"
|
|
88
|
+
reward_class = "synth_ai.environments.examples.red.engine_helpers.reward_library.pallet_town_progression.PalletTownProgressionCompositeReward"
|
|
89
|
+
|
|
90
|
+
[training]
|
|
91
|
+
algorithm = "ppo"
|
|
92
|
+
max_steps_per_episode = 500
|
|
93
|
+
num_episodes = 1000
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## State Schema
|
|
97
|
+
|
|
98
|
+
The environment exposes comprehensive game state:
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
{
|
|
102
|
+
# Position
|
|
103
|
+
"map_id": int, # Current location
|
|
104
|
+
"player_x": int,
|
|
105
|
+
"player_y": int,
|
|
106
|
+
|
|
107
|
+
# Party
|
|
108
|
+
"party_count": int,
|
|
109
|
+
"party_pokemon": [
|
|
110
|
+
{
|
|
111
|
+
"species_id": int,
|
|
112
|
+
"level": int,
|
|
113
|
+
"hp_current": int,
|
|
114
|
+
"hp_max": int,
|
|
115
|
+
"hp_percentage": float,
|
|
116
|
+
"xp": int,
|
|
117
|
+
}
|
|
118
|
+
],
|
|
119
|
+
|
|
120
|
+
# Battle
|
|
121
|
+
"in_battle": bool,
|
|
122
|
+
"battle_outcome": int, # 0=ongoing, 1=win, 2=lose
|
|
123
|
+
"enemy_hp_current": int,
|
|
124
|
+
"enemy_hp_max": int,
|
|
125
|
+
"enemy_hp_percentage": float,
|
|
126
|
+
"enemy_level": int,
|
|
127
|
+
"enemy_species_id": int,
|
|
128
|
+
"battle_turn": int,
|
|
129
|
+
|
|
130
|
+
# Dialogue & UI
|
|
131
|
+
"text_box_active": bool,
|
|
132
|
+
"menu_state": int,
|
|
133
|
+
|
|
134
|
+
# Progress
|
|
135
|
+
"badges": int, # Bitfield of earned badges
|
|
136
|
+
"money": int,
|
|
137
|
+
|
|
138
|
+
# VLM Support
|
|
139
|
+
"observation_image_base64": str, # PNG frame for vision models
|
|
140
|
+
}
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Action Space
|
|
144
|
+
|
|
145
|
+
### Button Actions
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
{
|
|
149
|
+
"button": "A" | "B" | "START" | "SELECT" | "UP" | "DOWN" | "LEFT" | "RIGHT",
|
|
150
|
+
"frames": int, # How long to hold the button (60fps)
|
|
151
|
+
}
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Policy-Driven Actions
|
|
155
|
+
|
|
156
|
+
When using LLM policies, the task app proxies requests to OpenAI/Groq:
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
{
|
|
160
|
+
"policy": {
|
|
161
|
+
"config": {
|
|
162
|
+
"model": "gpt-4-turbo",
|
|
163
|
+
"api_key": "...",
|
|
164
|
+
# or for Groq:
|
|
165
|
+
# "model": "qwen-2.5-7b",
|
|
166
|
+
# "base_url": "https://api.groq.com/openai/v1",
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Files
|
|
173
|
+
|
|
174
|
+
- **`task_app.py`**: Main task app entry point
|
|
175
|
+
- **`pallet_town_rl_config.toml`**: Training config for Pallet Town sequence
|
|
176
|
+
- **`test_pallet_town_rewards.py`**: Reward function test/demo script
|
|
177
|
+
- **`create_red_init_state.py`** (repo root): Script to generate init state
|
|
178
|
+
- **`Pokemon - Red Version (USA, Europe) (SGB Enhanced).gb`**: Your ROM (not committed)
|
|
179
|
+
|
|
180
|
+
## Creating Init States
|
|
181
|
+
|
|
182
|
+
The default init state starts in Red's bedroom with intro skipped. To create custom states:
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
# See create_red_init_state.py in the synth-ai repo root
|
|
186
|
+
from pyboy import PyBoy
|
|
187
|
+
|
|
188
|
+
emulator = PyBoy("path/to/rom.gb", window="null")
|
|
189
|
+
|
|
190
|
+
# Navigate to desired starting point
|
|
191
|
+
# ... (button presses)
|
|
192
|
+
|
|
193
|
+
# Save state
|
|
194
|
+
with open("custom_init.state", "wb") as f:
|
|
195
|
+
emulator.save_state(f)
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
## Memory Addresses
|
|
199
|
+
|
|
200
|
+
Key RAM addresses are defined in `synth_ai/environments/examples/red/engine_helpers/memory_map.py`:
|
|
201
|
+
|
|
202
|
+
- `MAP_ID = 0xD35E`
|
|
203
|
+
- `PLAYER_X/Y = 0xD362/0xD361`
|
|
204
|
+
- `IN_BATTLE_FLAG = 0xD057`
|
|
205
|
+
- `ENEMY_HP_CURRENT = 0xCFE6`
|
|
206
|
+
- `PARTY_COUNT = 0xD163`
|
|
207
|
+
- `BADGE_FLAGS = 0xD356`
|
|
208
|
+
- (and many more)
|
|
209
|
+
|
|
210
|
+
## Troubleshooting
|
|
211
|
+
|
|
212
|
+
### ROM Not Found
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
# Set environment variable
|
|
216
|
+
export POKEMON_RED_ROM_PATH="/path/to/pokemon_red.gb"
|
|
217
|
+
|
|
218
|
+
# Or copy ROM to expected location
|
|
219
|
+
cp "Pokemon - Red Version.gb" synth_ai/environments/examples/red/roms/pokemon_red.gb
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
### PyBoy Not Installed
|
|
223
|
+
|
|
224
|
+
```bash
|
|
225
|
+
uv add pyboy
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
### Server Won't Start (Port in Use)
|
|
229
|
+
|
|
230
|
+
```bash
|
|
231
|
+
# Kill existing server
|
|
232
|
+
lsof -ti :8913 | xargs -r kill -9
|
|
233
|
+
|
|
234
|
+
# Or use a different port
|
|
235
|
+
uv run -m synth_ai task-app serve pokemon_red --port 8914
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
## Examples
|
|
239
|
+
|
|
240
|
+
### 1. Policy Evaluation with GPT-5-nano
|
|
241
|
+
|
|
242
|
+
Evaluate a GPT-5-nano policy across 10 episodes (10 policy calls each):
|
|
243
|
+
|
|
244
|
+
```bash
|
|
245
|
+
# From synth-ai root
|
|
246
|
+
cd /path/to/synth-ai
|
|
247
|
+
|
|
248
|
+
# 1. Make sure OpenAI API key is in .env
|
|
249
|
+
echo "OPENAI_API_KEY=sk-..." >> .env
|
|
250
|
+
|
|
251
|
+
# 2. Start the task app server (in background)
|
|
252
|
+
nohup sh -c 'printf "n\n" | uv run -m synth_ai task-app serve pokemon_red --port 8913 --no-reload' > nohup_pokemon.log 2>&1 &
|
|
253
|
+
|
|
254
|
+
# Wait for startup
|
|
255
|
+
sleep 8
|
|
256
|
+
|
|
257
|
+
# 3. Run the evaluation
|
|
258
|
+
uv run python examples/task_apps/pokemon_red/eval_pokemon_red_policy.py
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
**Expected Output:**
|
|
262
|
+
```
|
|
263
|
+
================================================================================
|
|
264
|
+
POKÉMON RED - POLICY EVALUATION
|
|
265
|
+
================================================================================
|
|
266
|
+
|
|
267
|
+
Task: Pallet Town Progression
|
|
268
|
+
Policy: gpt-5-nano
|
|
269
|
+
Episodes: 10
|
|
270
|
+
Max steps per episode: 10
|
|
271
|
+
|
|
272
|
+
✓ Server is healthy
|
|
273
|
+
✓ API key loaded
|
|
274
|
+
|
|
275
|
+
🎮 Running 10 episodes in parallel...
|
|
276
|
+
|
|
277
|
+
================================================================================
|
|
278
|
+
RESULTS SUMMARY
|
|
279
|
+
================================================================================
|
|
280
|
+
|
|
281
|
+
+-----------+----------+---------+-------------+---------+----------+--------------+
|
|
282
|
+
| Episode | Reward | Steps | Final Map | Party | Badges | Milestones |
|
|
283
|
+
+===========+==========+=========+=============+=========+==========+==============+
|
|
284
|
+
| 1 | 0 | 10 | Map38 | 0 | 0 | 0 |
|
|
285
|
+
| 2 | 0 | 9 | Map38 | 0 | 0 | 0 |
|
|
286
|
+
| 9 | 20 | 10 | Map38 | 0 | 0 | 1 |
|
|
287
|
+
+-----------+----------+---------+-------------+---------+----------+--------------+
|
|
288
|
+
|
|
289
|
+
Statistics:
|
|
290
|
+
Mean reward: 2.00
|
|
291
|
+
Max reward: 20.00
|
|
292
|
+
Success rate: 10% reached first milestone
|
|
293
|
+
|
|
294
|
+
Best Episode (#9):
|
|
295
|
+
Total reward: 20.0
|
|
296
|
+
Milestones achieved:
|
|
297
|
+
Step 5: Moved from Map38 to Map37 (+20.0)
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
**Key Features:**
|
|
301
|
+
- ✅ **Action Batching**: Each policy call returns 5-10 actions via `execute_sequence` tool
|
|
302
|
+
- ✅ **Parallel Execution**: All 10 episodes run concurrently
|
|
303
|
+
- ✅ **Rich Metrics**: Rewards, steps, maps, party status, milestones tracked
|
|
304
|
+
- ✅ **Fast Evaluation**: ~2-3 minutes for 10 episodes (vs 50+ min without batching)
|
|
305
|
+
|
|
306
|
+
**Customize the Evaluation:**
|
|
307
|
+
|
|
308
|
+
```python
|
|
309
|
+
# In eval_pokemon_red_policy.py
|
|
310
|
+
NUM_EPISODES = 10 # Number of episodes to run
|
|
311
|
+
MAX_STEPS_PER_EPISODE = 10 # Policy calls per episode (each returns 5-10 actions)
|
|
312
|
+
MODEL = "gpt-5-nano" # Or "gpt-4-turbo", "qwen-2.5-7b", etc.
|
|
313
|
+
```
|
|
314
|
+
|
|
315
|
+
### 2. Test Script (Random Actions)
|
|
316
|
+
|
|
317
|
+
```bash
|
|
318
|
+
cd /path/to/synth-ai  # synth-ai repo root
|
|
319
|
+
uv run python test_pokemon_red_rollout.py
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
### 3. Reward Function Demo
|
|
323
|
+
|
|
324
|
+
```bash
|
|
325
|
+
uv run python examples/task_apps/pokemon_red/test_pallet_town_rewards.py
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
Output:
|
|
329
|
+
```
|
|
330
|
+
======================================================================
|
|
331
|
+
PALLET TOWN PROGRESSION - REWARD SIMULATION
|
|
332
|
+
======================================================================
|
|
333
|
+
|
|
334
|
+
✓ Leave bedroom (Map 1→2): +20 points
|
|
335
|
+
✓ Exit house to Pallet Town (Map 2→0): +30 points
|
|
336
|
+
✓ Find and enter Oak's Lab (Map 0→3): +40 points
|
|
337
|
+
...
|
|
338
|
+
======================================================================
|
|
339
|
+
TOTAL REWARD: 705 points
|
|
340
|
+
======================================================================
|
|
341
|
+
```
|
|
342
|
+
|
|
343
|
+
## Future Work
|
|
344
|
+
|
|
345
|
+
- [ ] Route 1 exploration rewards
|
|
346
|
+
- [ ] Wild Pokémon encounter rewards
|
|
347
|
+
- [ ] Capture mechanics rewards
|
|
348
|
+
- [ ] Gym battle rewards
|
|
349
|
+
- [ ] Badge collection rewards
|
|
350
|
+
- [ ] Multi-environment curriculum (Pallet → Viridian → Pewter)
|
|
351
|
+
|
|
352
|
+
## Credits
|
|
353
|
+
|
|
354
|
+
- **PyBoy**: Game Boy emulator - https://github.com/Baekalfen/PyBoy
|
|
355
|
+
- **Pokémon Red Disassembly**: RAM map reference - https://github.com/pret/pokered
|
|
356
|
+
- **Datacrystal.org**: Memory address documentation
|
|
357
|
+
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Evaluate GPT-5-nano policy on Pokemon Red Pallet Town progression.
|
|
3
|
+
|
|
4
|
+
Runs 10 parallel rollouts and reports rewards in a table.
|
|
5
|
+
"""
|
|
6
|
+
import asyncio
|
|
7
|
+
import os
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
from dotenv import load_dotenv
|
|
12
|
+
from tabulate import tabulate
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Load environment variables
|
|
16
|
+
load_dotenv()
|
|
17
|
+
|
|
18
|
+
# Configuration
|
|
19
|
+
TASK_APP_URL = "http://127.0.0.1:8913"
|
|
20
|
+
NUM_EPISODES = 10
|
|
21
|
+
MAX_STEPS_PER_EPISODE = 10 # 10 policy calls per episode (each may return 5-10 actions)
|
|
22
|
+
MODEL = "gpt-5-nano"
|
|
23
|
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _summarize_rollout(result: dict[str, Any], episode_id: int) -> dict[str, Any]:
    """Flatten a ``/rollout`` response body into one summary row.

    Args:
        result: Decoded JSON body of the rollout response.
        episode_id: Identifier of the episode the response belongs to.

    Returns:
        A dict with ``status == "success"`` and the reward/progress fields,
        or a dict with ``status == "error"`` when no trajectory was returned.
    """
    # ``or`` (instead of a .get default) also covers keys that are present
    # but set to null/empty by the server.
    trajectories = result.get("trajectories") or []
    if not trajectories:
        return {
            "episode_id": episode_id,
            "status": "error",
            "error": "No trajectories returned",
        }

    steps = trajectories[0].get("steps") or []
    # The first step is counted as the initial observation; clamp so that an
    # empty step list reports 0 steps rather than -1.
    num_steps = max(len(steps) - 1, 0)

    metrics = result.get("metrics") or {}
    # An empty ``episode_returns`` list would otherwise raise IndexError.
    episode_returns = metrics.get("episode_returns") or [0.0]
    details = metrics.get("details") or {}
    milestone_events = details.get("milestone_events") or []

    return {
        "episode_id": episode_id,
        "status": "success",
        "total_reward": episode_returns[0],
        "outcome_score": metrics.get("outcome_score") or 0.0,
        "num_steps": num_steps,
        "final_map": details.get("final_map", -1),
        "party_count": details.get("party_count", 0),
        "badges": details.get("badges", 0),
        "num_milestones": len(milestone_events),
        "reward_components": details.get("reward_components") or [],
        "milestone_events": milestone_events,
    }


async def run_single_rollout(
    client: httpx.AsyncClient,
    episode_id: int,
) -> dict[str, Any]:
    """Run a single rollout with policy-driven actions.

    Posts one rollout request to ``{TASK_APP_URL}/rollout`` and condenses the
    response into a flat summary dict. This coroutine never raises for
    request or parsing failures: they are reported through the ``status``
    field so that one bad episode cannot abort a batch of parallel rollouts.

    Args:
        client: Shared HTTP client used to send the request.
        episode_id: Episode number; embedded in the run and instance ids.

    Returns:
        A dict that always contains ``episode_id`` and ``status``
        (``"success"``, ``"timeout"`` or ``"error"``). Successful rows carry
        the reward/progress fields; other rows carry an ``error`` message.
    """
    # One "policy" op per step asks the task app to let the policy choose
    # every action in the episode.
    rollout_request = {
        "run_id": f"eval_episode_{episode_id:03d}",
        "env": {"instance_id": f"pallet_town_{episode_id:03d}"},
        "ops": ["policy"] * MAX_STEPS_PER_EPISODE,
        "policy": {
            "type": "llm",
            "model": MODEL,
            "config": {
                "model": MODEL,
                "temperature": 0.7,
                "max_tokens": 500,
            },
        },
    }

    try:
        response = await client.post(
            f"{TASK_APP_URL}/rollout",
            json=rollout_request,
            timeout=300.0,  # 5 minutes per rollout
        )
        response.raise_for_status()
        return _summarize_rollout(response.json(), episode_id)
    except httpx.TimeoutException:
        return {
            "episode_id": episode_id,
            "status": "timeout",
            "error": "Rollout timed out after 5 minutes",
        }
    except Exception as e:
        # Deliberately broad: this is the per-episode boundary. Some
        # transport errors stringify to "", so fall back to the class name
        # to keep the failure report informative.
        return {
            "episode_id": episode_id,
            "status": "error",
            "error": str(e) or type(e).__name__,
        }
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _print_success_report(successful: list[dict[str, Any]]) -> None:
    """Print the per-episode table, aggregate statistics and best episode."""
    table_data = [
        [
            r["episode_id"],
            f"{r['total_reward']:.1f}",
            r["num_steps"],
            f"Map{r['final_map']}",
            r["party_count"],
            r["badges"],
            r["num_milestones"],
            f"{r['outcome_score']:.3f}",
        ]
        for r in successful
    ]
    headers = [
        "Episode",
        "Reward",
        "Steps",
        "Final Map",
        "Party",
        "Badges",
        "Milestones",
        "Outcome Score",
    ]
    print(tabulate(table_data, headers=headers, tablefmt="grid"))
    print()

    rewards = [r["total_reward"] for r in successful]
    steps = [r["num_steps"] for r in successful]
    outcome_scores = [r["outcome_score"] for r in successful]

    print("Statistics:")
    print(f" Mean reward: {sum(rewards) / len(rewards):.2f}")
    print(f" Max reward: {max(rewards):.2f}")
    print(f" Min reward: {min(rewards):.2f}")
    print(f" Mean steps: {sum(steps) / len(steps):.1f}")
    print(f" Mean outcome score: {sum(outcome_scores) / len(outcome_scores):.4f}")
    print()

    best_episode = max(successful, key=lambda r: r["total_reward"])
    print(f"Best Episode (#{best_episode['episode_id']}):")
    print(f" Total reward: {best_episode['total_reward']:.1f}")
    print(f" Steps taken: {best_episode['num_steps']}")
    print(" Milestones achieved:")
    for milestone in best_episode["milestone_events"]:
        # Tolerate milestone records with missing fields: a partial record
        # must not crash the report after all rollouts have already run.
        step = milestone.get("step", "?")
        description = milestone.get("description", "")
        reward = float(milestone.get("reward") or 0.0)
        print(f" Step {step}: {description} (+{reward:.1f})")
    print()


async def main() -> None:
    """Evaluate the policy over ``NUM_EPISODES`` parallel rollouts and report.

    Prints the run configuration, checks that the task app answers on
    ``/health`` and that ``OPENAI_API_KEY`` is set, then launches all
    rollouts concurrently and prints a results table, summary statistics,
    the best episode's milestones, and any failed episodes. Returns early
    (after printing a hint) if the server or the API key is unavailable.
    """
    banner = "=" * 80
    print(banner)
    print("POKÉMON RED - POLICY EVALUATION")
    print(banner)
    print()
    print("Task: Pallet Town Progression")
    print(f"Policy: {MODEL}")
    print(f"Episodes: {NUM_EPISODES}")
    print(f"Max steps per episode: {MAX_STEPS_PER_EPISODE}")
    print(f"Server: {TASK_APP_URL}")
    print()

    async with httpx.AsyncClient() as client:
        # Check server health
        try:
            response = await client.get(f"{TASK_APP_URL}/health", timeout=5.0)
            response.raise_for_status()
            print("✓ Server is healthy")
        except Exception as e:
            print(f"❌ Server not responding: {e}")
            print(" Start it with: uv run -m synth_ai task-app serve pokemon_red --port 8913")
            return

        # Check API key
        if not OPENAI_API_KEY:
            print("❌ OPENAI_API_KEY not found in environment")
            print(" Make sure .env file contains OPENAI_API_KEY")
            return
        # Only the trailing four characters of the key are echoed.
        print(f"✓ API key loaded (sk_env...{OPENAI_API_KEY[-4:]})")
        print()

        # Run rollouts in parallel. run_single_rollout reports failures via
        # its return value, so gather() does not need return_exceptions.
        print(f"🎮 Running {NUM_EPISODES} episodes in parallel...")
        print()
        results = await asyncio.gather(
            *(
                run_single_rollout(client, episode_id)
                for episode_id in range(1, NUM_EPISODES + 1)
            )
        )

    # Separate successful and failed results
    successful = [r for r in results if r.get("status") == "success"]
    failed = [r for r in results if r.get("status") != "success"]

    print()
    print(banner)
    print("RESULTS SUMMARY")
    print(banner)
    print()

    if successful:
        _print_success_report(successful)

    if failed:
        print(f"Failed episodes: {len(failed)}")
        for r in failed:
            print(f" Episode {r['episode_id']}: {r.get('error', 'Unknown error')}")
        print()
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
if __name__ == "__main__":
|
|
224
|
+
asyncio.run(main())
|
|
225
|
+
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Pokemon Red: Pallet Town Progression RL Configuration
|
|
2
|
+
#
|
|
3
|
+
# This configuration uses the ultra-rich reward function that tracks
|
|
4
|
+
# all major milestones in the Pallet Town opening sequence:
|
|
5
|
+
# - Leaving bedroom
|
|
6
|
+
# - Exiting house
|
|
7
|
+
# - Finding Oak's lab
|
|
8
|
+
# - Talking to Oak
|
|
9
|
+
# - Receiving starter Pokemon
|
|
10
|
+
# - Entering first battle
|
|
11
|
+
# - Dealing damage to rival
|
|
12
|
+
# - Winning first battle
|
|
13
|
+
# - Exiting lab with Pokemon
|
|
14
|
+
#
|
|
15
|
+
# Total possible reward: ~735 points (sum of the per-component maxima in the reward breakdown at the end of this file)
|
|
16
|
+
|
|
17
|
+
[experiment]
|
|
18
|
+
name = "pokemon_red_pallet_town_progression"
|
|
19
|
+
description = "Learn to complete the Pallet Town intro sequence with dense reward shaping"
|
|
20
|
+
|
|
21
|
+
[environment]
|
|
22
|
+
task_app_id = "pokemon_red"
|
|
23
|
+
# Start state is already configured to begin in Red's bedroom (Map26)
|
|
24
|
+
|
|
25
|
+
[reward]
|
|
26
|
+
# Use the comprehensive Pallet Town progression reward
|
|
27
|
+
reward_type = "composite"
|
|
28
|
+
reward_class = "synth_ai.environments.examples.red.engine_helpers.reward_library.pallet_town_progression.PalletTownProgressionCompositeReward"
|
|
29
|
+
|
|
30
|
+
[policy]
|
|
31
|
+
model = "gpt-4-turbo"
|
|
32
|
+
# or use "qwen-2.5-7b" for faster, cheaper training
|
|
33
|
+
|
|
34
|
+
[training]
|
|
35
|
+
algorithm = "ppo" # or "grpo"
|
|
36
|
+
max_steps_per_episode = 500 # Plenty of steps for the Pallet Town sequence
|
|
37
|
+
num_episodes = 1000
|
|
38
|
+
batch_size = 32
|
|
39
|
+
|
|
40
|
+
[training.hyperparameters]
|
|
41
|
+
learning_rate = 3e-4
|
|
42
|
+
gamma = 0.99
|
|
43
|
+
clip_epsilon = 0.2
|
|
44
|
+
|
|
45
|
+
[logging]
|
|
46
|
+
log_interval = 10
|
|
47
|
+
save_interval = 100
|
|
48
|
+
wandb_project = "pokemon-red-rl"
|
|
49
|
+
wandb_run_name = "pallet-town-progression"
|
|
50
|
+
|
|
51
|
+
[evaluation]
|
|
52
|
+
eval_interval = 50
|
|
53
|
+
num_eval_episodes = 10
|
|
54
|
+
|
|
55
|
+
# Reward breakdown (for reference):
|
|
56
|
+
# ============================================
|
|
57
|
+
# LeaveBedroomReward +20
|
|
58
|
+
# ExitHouseFirstTimeReward +30
|
|
59
|
+
# FindOakLabReward +40
|
|
60
|
+
# TalkToOakReward +50
|
|
61
|
+
# ReceiveStarterPokemonReward +100
|
|
62
|
+
# EnterFirstBattleReward +75
|
|
63
|
+
# DealDamageToRivalReward +50 (10×5)
|
|
64
|
+
# ReduceEnemyHPByHalfReward +25
|
|
65
|
+
# ReduceEnemyHPToLowReward +35
|
|
66
|
+
# WinFirstBattleReward +150
|
|
67
|
+
# ExitLabAfterBattleReward +60
|
|
68
|
+
# FirstBattleEfficiencyReward +20
|
|
69
|
+
# KeepPokemonHealthyReward +30
|
|
70
|
+
# NavigationSpeedReward +50
|
|
71
|
+
# ============================================
|
|
72
|
+
# TOTAL POSSIBLE ~735 (sum of the component maxima listed above)
|
|
73
|
+
|