synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -1
- examples/swe/task_app/grpo_swe_mini.py +55 -26
- examples/swe/task_app/hosted/rollout.py +40 -0
- examples/swe/task_app/hosted/test_service.py +5 -6
- examples/task_apps/TESTING.md +275 -0
- examples/task_apps/__init__.py +0 -0
- examples/task_apps/crafter/__init__.py +0 -0
- examples/task_apps/crafter/task_app/__init__.py +2 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +18 -13
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +25 -3
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +10 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
- examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
- examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
- examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
- examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
- examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
- examples/task_apps/enron/task_app/README.md +14 -0
- examples/task_apps/enron/task_app/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron.py +906 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
- examples/task_apps/enron/tests/__init__.py +2 -0
- examples/task_apps/enron/tests/conftest.py +115 -0
- examples/task_apps/enron/tests/integration/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
- examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
- examples/task_apps/enron/tests/unit/__init__.py +2 -0
- examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
- examples/task_apps/math/__init__.py +0 -0
- examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
- examples/task_apps/pokemon_battle/__init__.py +2 -0
- examples/task_apps/pokemon_battle/modal_app.py +104 -0
- examples/task_apps/pokemon_battle/task_app/README.md +68 -0
- examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
- examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
- examples/task_apps/pokemon_red/README.md +357 -0
- examples/task_apps/pokemon_red/__init__.py +3 -0
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
- examples/task_apps/pokemon_red/task_app.py +606 -0
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
- examples/task_apps/sokoban/README.md +307 -0
- examples/task_apps/sokoban/__init__.py +3 -0
- examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
- examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
- examples/task_apps/sokoban/task_app.py +1058 -0
- examples/task_apps/sokoban/tests/__init__.py +2 -0
- examples/task_apps/sokoban/tests/conftest.py +113 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
- examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
- examples/task_apps/verilog/__init__.py +1 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
- examples/task_apps/verilog/task_app/README.md +12 -0
- examples/task_apps/verilog/task_app/__init__.py +1 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
- examples/task_apps/verilog/tests/__init__.py +2 -0
- examples/task_apps/verilog/tests/conftest.py +115 -0
- examples/task_apps/verilog/tests/integration/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
- examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
- examples/task_apps/verilog/tests/unit/__init__.py +2 -0
- examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
- examples/vlm/crafter_openai_vlm_agent.py +4 -4
- examples/vlm/run_crafter_vlm_benchmark.py +4 -4
- examples/workflows/__init__.py +0 -0
- examples/workflows/math_rl/__init__.py +0 -0
- examples/workflows/math_rl/download_dataset.py +80 -0
- synth_ai/__init__.py +2 -2
- synth_ai/api/train/builders.py +25 -11
- synth_ai/api/train/cli.py +12 -6
- synth_ai/api/train/configs/__init__.py +10 -10
- synth_ai/api/train/configs/rl.py +5 -4
- synth_ai/api/train/configs/sft.py +4 -3
- synth_ai/api/train/env_resolver.py +5 -2
- synth_ai/api/train/supported_algos.py +10 -5
- synth_ai/api/train/utils.py +7 -4
- synth_ai/cli/__init__.py +7 -51
- synth_ai/cli/_storage.py +4 -3
- synth_ai/cli/_validate_task_app.py +11 -0
- synth_ai/cli/balance.py +4 -3
- synth_ai/cli/calc.py +2 -2
- synth_ai/cli/demo.py +14 -7
- synth_ai/cli/legacy_root_backup.py +1 -1
- synth_ai/cli/rl_demo.py +8 -7
- synth_ai/cli/root.py +0 -97
- synth_ai/cli/task_apps.py +1707 -186
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
- synth_ai/environments/examples/enron/engine.py +7 -2
- synth_ai/environments/examples/enron/environment.py +68 -0
- synth_ai/environments/examples/red/engine.py +27 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
- synth_ai/environments/examples/red/environment.py +60 -0
- synth_ai/environments/examples/sokoban/taskset.py +116 -0
- synth_ai/environments/examples/verilog/engine.py +30 -4
- synth_ai/evals/client.py +58 -61
- synth_ai/jobs/client.py +16 -4
- synth_ai/judge_schemas.py +16 -16
- synth_ai/py.typed +0 -0
- synth_ai/task/__init__.py +14 -5
- synth_ai/task/contracts.py +124 -38
- synth_ai/task/proxy.py +48 -56
- synth_ai/task/rubrics/__init__.py +53 -0
- synth_ai/task/rubrics/loaders.py +133 -0
- synth_ai/task/rubrics/models.py +57 -0
- synth_ai/task/rubrics/scoring.py +113 -0
- synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
- synth_ai/task/server.py +8 -7
- synth_ai/task/validators.py +269 -6
- synth_ai/tracing_v3/decorators.py +7 -3
- synth_ai/tracing_v3/replica_sync.py +4 -4
- synth_ai/tracing_v3/serialization.py +5 -5
- synth_ai/tracing_v3/trace_utils.py +317 -0
- synth_ai/tracing_v3/turso/native_manager.py +3 -3
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +214 -101
- examples/agora_ex/README_MoE.md +0 -224
- examples/agora_ex/__init__.py +0 -7
- examples/agora_ex/agora_ex.py +0 -65
- examples/agora_ex/agora_ex_task_app.py +0 -590
- examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
- examples/agora_ex/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/system_prompt_CURRENT.md +0 -63
- examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
- examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
- synth_ai/rubrics/__init__.py +0 -22
- synth_ai/task/rubrics.py +0 -219
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
- /examples/{rl/task_app → task_apps/math}/README.md +0 -0
- /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
- /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Test script for Pallet Town Progression Rewards
|
|
3
|
+
|
|
4
|
+
This script demonstrates the reward function by simulating
|
|
5
|
+
a sequence of states representing the ideal Pallet Town progression.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
from synth_ai.environments.examples.red.engine_helpers.reward_library.pallet_town_progression import (
|
|
10
|
+
PalletTownProgressionCompositeReward,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
async def main():
|
|
15
|
+
"""Simulate a perfect Pallet Town run and show rewards"""
|
|
16
|
+
|
|
17
|
+
reward_fn = PalletTownProgressionCompositeReward()
|
|
18
|
+
total_reward = 0.0
|
|
19
|
+
|
|
20
|
+
print("=" * 70)
|
|
21
|
+
print("PALLET TOWN PROGRESSION - REWARD SIMULATION")
|
|
22
|
+
print("=" * 70)
|
|
23
|
+
print()
|
|
24
|
+
|
|
25
|
+
# Step 1: Start in bedroom (Map 1)
|
|
26
|
+
state1 = {
|
|
27
|
+
"map_id": 1,
|
|
28
|
+
"player_x": 3,
|
|
29
|
+
"player_y": 4,
|
|
30
|
+
"party_count": 0,
|
|
31
|
+
"in_battle": False,
|
|
32
|
+
"text_box_active": False,
|
|
33
|
+
"battle_outcome": 0,
|
|
34
|
+
"enemy_hp_current": 0,
|
|
35
|
+
"enemy_hp_max": 0,
|
|
36
|
+
"enemy_hp_percentage": 0.0,
|
|
37
|
+
}
|
|
38
|
+
action1 = {
|
|
39
|
+
"prev_map_id": 1,
|
|
40
|
+
"prev_party_count": 0,
|
|
41
|
+
"prev_in_battle": False,
|
|
42
|
+
"prev_text_box_active": False,
|
|
43
|
+
"prev_enemy_hp_current": 0,
|
|
44
|
+
"prev_enemy_hp_percentage": 0.0,
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
# Step 2: Go downstairs (Map 1 -> Map 2)
|
|
48
|
+
state2 = {**state1, "map_id": 2, "player_y": 8}
|
|
49
|
+
action2 = {**action1, "prev_map_id": 1}
|
|
50
|
+
|
|
51
|
+
r = await reward_fn.score(state2, action2)
|
|
52
|
+
total_reward += r
|
|
53
|
+
print(f"✓ Leave bedroom (Map 1→2): +{r:.0f} points")
|
|
54
|
+
|
|
55
|
+
# Step 3: Exit house (Map 2 -> Map 0)
|
|
56
|
+
state3 = {**state2, "map_id": 0, "player_x": 5, "player_y": 7}
|
|
57
|
+
action3 = {**action2, "prev_map_id": 2}
|
|
58
|
+
|
|
59
|
+
r = await reward_fn.score(state3, action3)
|
|
60
|
+
total_reward += r
|
|
61
|
+
print(f"✓ Exit house to Pallet Town (Map 2→0): +{r:.0f} points")
|
|
62
|
+
|
|
63
|
+
# Step 4: Navigate to and enter Oak's Lab (Map 0 -> Map 3)
|
|
64
|
+
state4 = {**state3, "map_id": 3, "player_x": 4, "player_y": 11}
|
|
65
|
+
action4 = {**action3, "prev_map_id": 0}
|
|
66
|
+
|
|
67
|
+
r = await reward_fn.score(state4, action4)
|
|
68
|
+
total_reward += r
|
|
69
|
+
print(f"✓ Find and enter Oak's Lab (Map 0→3): +{r:.0f} points")
|
|
70
|
+
|
|
71
|
+
# Step 5: Talk to Oak (text box appears)
|
|
72
|
+
state5 = {**state4, "text_box_active": True}
|
|
73
|
+
action5 = {**action4, "prev_text_box_active": False}
|
|
74
|
+
|
|
75
|
+
r = await reward_fn.score(state5, action5)
|
|
76
|
+
total_reward += r
|
|
77
|
+
print(f"✓ Talk to Professor Oak: +{r:.0f} points")
|
|
78
|
+
|
|
79
|
+
# Step 6: Receive starter Pokemon (party count 0 -> 1)
|
|
80
|
+
state6 = {
|
|
81
|
+
**state5,
|
|
82
|
+
"party_count": 1,
|
|
83
|
+
"party_pokemon": [
|
|
84
|
+
{
|
|
85
|
+
"species_id": 4, # Charmander
|
|
86
|
+
"level": 5,
|
|
87
|
+
"hp_current": 20,
|
|
88
|
+
"hp_max": 20,
|
|
89
|
+
"hp_percentage": 100.0,
|
|
90
|
+
}
|
|
91
|
+
],
|
|
92
|
+
}
|
|
93
|
+
action6 = {**action5, "prev_party_count": 0}
|
|
94
|
+
|
|
95
|
+
r = await reward_fn.score(state6, action6)
|
|
96
|
+
total_reward += r
|
|
97
|
+
print(f"✓ Receive starter Pokemon: +{r:.0f} points")
|
|
98
|
+
|
|
99
|
+
# Step 7: Enter first battle
|
|
100
|
+
state7 = {**state6, "in_battle": True, "text_box_active": False,
|
|
101
|
+
"enemy_hp_current": 20, "enemy_hp_max": 20, "enemy_hp_percentage": 100.0}
|
|
102
|
+
action7 = {**action6, "prev_in_battle": False, "prev_text_box_active": True}
|
|
103
|
+
|
|
104
|
+
r = await reward_fn.score(state7, action7)
|
|
105
|
+
total_reward += r
|
|
106
|
+
print(f"✓ Enter first battle with rival: +{r:.0f} points")
|
|
107
|
+
|
|
108
|
+
# Steps 8-12: Deal damage (5 attacks, one step per attack)
|
|
109
|
+
print()
|
|
110
|
+
print("Battle sequence:")
|
|
111
|
+
for i in range(5):
|
|
112
|
+
prev_hp = 20 - (i * 4)
|
|
113
|
+
curr_hp = 20 - ((i + 1) * 4)
|
|
114
|
+
state_dmg = {
|
|
115
|
+
**state7,
|
|
116
|
+
"enemy_hp_current": curr_hp,
|
|
117
|
+
"enemy_hp_percentage": (curr_hp / 20) * 100,
|
|
118
|
+
}
|
|
119
|
+
action_dmg = {
|
|
120
|
+
**action7,
|
|
121
|
+
"prev_in_battle": True,
|
|
122
|
+
"prev_enemy_hp_current": prev_hp,
|
|
123
|
+
"prev_enemy_hp_percentage": (prev_hp / 20) * 100,
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
r = await reward_fn.score(state_dmg, action_dmg)
|
|
127
|
+
total_reward += r
|
|
128
|
+
|
|
129
|
+
# Check for half HP and low HP milestones
|
|
130
|
+
if r > 5: # Got bonus reward
|
|
131
|
+
if (prev_hp / 20) >= 0.5 and (curr_hp / 20) < 0.5:
|
|
132
|
+
print(f" → Attack {i+1}: Enemy HP {prev_hp}→{curr_hp} (+5) + Half HP bonus (+25) = +{r:.0f}")
|
|
133
|
+
elif (prev_hp / 20) >= 0.25 and (curr_hp / 20) < 0.25:
|
|
134
|
+
print(f" → Attack {i+1}: Enemy HP {prev_hp}→{curr_hp} (+5) + Low HP bonus (+35) = +{r:.0f}")
|
|
135
|
+
else:
|
|
136
|
+
print(f" → Attack {i+1}: Enemy HP {prev_hp}→{curr_hp} +{r:.0f} points")
|
|
137
|
+
|
|
138
|
+
print()
|
|
139
|
+
|
|
140
|
+
# Step 13: Win battle
|
|
141
|
+
state13 = {
|
|
142
|
+
**state7,
|
|
143
|
+
"in_battle": False,
|
|
144
|
+
"battle_outcome": 1, # Win
|
|
145
|
+
"enemy_hp_current": 0,
|
|
146
|
+
"enemy_hp_percentage": 0.0,
|
|
147
|
+
"battle_turn": 4,
|
|
148
|
+
"party_pokemon": [
|
|
149
|
+
{
|
|
150
|
+
"species_id": 4,
|
|
151
|
+
"level": 5,
|
|
152
|
+
"hp_current": 15, # 75% HP
|
|
153
|
+
"hp_max": 20,
|
|
154
|
+
"hp_percentage": 75.0,
|
|
155
|
+
}
|
|
156
|
+
],
|
|
157
|
+
}
|
|
158
|
+
action13 = {
|
|
159
|
+
**action7,
|
|
160
|
+
"prev_in_battle": True,
|
|
161
|
+
"prev_enemy_hp_current": 0,
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
r = await reward_fn.score(state13, action13)
|
|
165
|
+
total_reward += r
|
|
166
|
+
print(f"✓ Win first battle: +{r:.0f} points")
|
|
167
|
+
|
|
168
|
+
# Step 14: Exit lab with Pokemon (Map 3 -> Map 0)
|
|
169
|
+
state14 = {**state13, "map_id": 0, "player_x": 5, "player_y": 11}
|
|
170
|
+
action14 = {**action13, "prev_map_id": 3}
|
|
171
|
+
|
|
172
|
+
r = await reward_fn.score(state14, action14)
|
|
173
|
+
total_reward += r
|
|
174
|
+
print(f"✓ Exit Oak's Lab with Pokemon (Map 3→0): +{r:.0f} points")
|
|
175
|
+
|
|
176
|
+
print()
|
|
177
|
+
print("=" * 70)
|
|
178
|
+
print(f"TOTAL REWARD: {total_reward:.0f} points")
|
|
179
|
+
print("=" * 70)
|
|
180
|
+
print()
|
|
181
|
+
print("Breakdown by category:")
|
|
182
|
+
print(" Navigation: 150 points (bedroom, house, lab, exit)")
|
|
183
|
+
print(" Story: 150 points (talk to Oak, get Pokemon)")
|
|
184
|
+
print(" Battle: 335 points (enter, damage, milestones, win)")
|
|
185
|
+
print(" Efficiency: ~100 points (battle speed, health, navigation)")
|
|
186
|
+
print()
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
if __name__ == "__main__":
|
|
190
|
+
asyncio.run(main())
|
|
191
|
+
|
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
# Sokoban Task App
|
|
2
|
+
|
|
3
|
+
A task app for training and evaluating LLM agents on Sokoban puzzles.
|
|
4
|
+
|
|
5
|
+
Sokoban is a classic puzzle game where the player must push boxes onto target locations. It's a good benchmark for spatial reasoning, planning, and sequential decision-making.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- 🎮 Multiple difficulty levels (easy, medium, hard)
|
|
10
|
+
- 🤖 LLM policy support (GPT-5-mini, Qwen)
|
|
11
|
+
- 📊 Supports both RL training and evaluation rollouts
|
|
12
|
+
- 🎯 Rich observations with ASCII grid visualization
|
|
13
|
+
- ⚡ Batched actions (up to 8 actions per LLM call)
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
### 1. Start the Server
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
cd /path/to/synth-ai
|
|
21
|
+
|
|
22
|
+
# Start the Sokoban task app on port 8911
|
|
23
|
+
uvx synth-ai task-app serve sokoban --port 8911
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
The server will be available at `http://localhost:8911`.
|
|
27
|
+
|
|
28
|
+
### 2. Run a Test Rollout
|
|
29
|
+
|
|
30
|
+
#### Option A: Using GPT-5-mini
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
export OPENAI_API_KEY="your-api-key"
|
|
34
|
+
|
|
35
|
+
python3 << 'EOF'
|
|
36
|
+
import httpx
|
|
37
|
+
import asyncio
|
|
38
|
+
|
|
39
|
+
async def test_gpt5mini():
|
|
40
|
+
async with httpx.AsyncClient(timeout=600.0) as client: # Longer timeout
|
|
41
|
+
print("🎮 Testing with GPT-5-mini (slower due to reasoning tokens)...\n")
|
|
42
|
+
|
|
43
|
+
response = await client.post(
|
|
44
|
+
"http://localhost:8911/rollout",
|
|
45
|
+
json={
|
|
46
|
+
"run_id": "test_gpt5mini",
|
|
47
|
+
"env": {"seed": 123, "config": {"difficulty": "easy", "max_steps": 100}},
|
|
48
|
+
"ops": ["policy"] * 5, # Fewer calls due to slowness
|
|
49
|
+
"policy": {
|
|
50
|
+
"config": {
|
|
51
|
+
"provider": "openai",
|
|
52
|
+
"model": "gpt-5-mini",
|
|
53
|
+
"max_actions_per_call": 8
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
},
|
|
57
|
+
headers={"Authorization": "Bearer sk_env_your_key_here"}
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
result = response.json()
|
|
61
|
+
traj = result["trajectories"][0]
|
|
62
|
+
final = traj["final"]["observation"]
|
|
63
|
+
|
|
64
|
+
print(f"Boxes: {final['boxes_on_target']}/{final['num_boxes']}")
|
|
65
|
+
print(f"Steps: {final['steps_taken']}")
|
|
66
|
+
|
|
67
|
+
asyncio.run(test_gpt5mini())
|
|
68
|
+
EOF
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
#### Option B: Using Qwen via Groq (Fast & Cheap)
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
export GROQ_API_KEY="your-groq-key"
|
|
75
|
+
|
|
76
|
+
python3 << 'EOF'
|
|
77
|
+
import httpx
|
|
78
|
+
import asyncio
|
|
79
|
+
|
|
80
|
+
async def test_qwen():
|
|
81
|
+
async with httpx.AsyncClient(timeout=300.0) as client:
|
|
82
|
+
response = await client.post(
|
|
83
|
+
"http://localhost:8911/rollout",
|
|
84
|
+
json={
|
|
85
|
+
"run_id": "test_qwen",
|
|
86
|
+
"env": {"seed": 123, "config": {"difficulty": "easy", "max_steps": 100}},
|
|
87
|
+
"ops": ["policy"] * 15,
|
|
88
|
+
"policy": {
|
|
89
|
+
"config": {
|
|
90
|
+
"provider": "groq",
|
|
91
|
+
"model": "qwen-2.5-7b",
|
|
92
|
+
"max_actions_per_call": 8
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
},
|
|
96
|
+
headers={"Authorization": "Bearer sk_env_your_key_here"}
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
result = response.json()
|
|
100
|
+
traj = result["trajectories"][0]
|
|
101
|
+
final = traj["final"]["observation"]
|
|
102
|
+
|
|
103
|
+
print(f"Result: {'✅ SOLVED!' if final['boxes_on_target'] == final['num_boxes'] else '❌ Not solved'}")
|
|
104
|
+
print(f"Boxes: {final['boxes_on_target']}/{final['num_boxes']}")
|
|
105
|
+
|
|
106
|
+
asyncio.run(test_qwen())
|
|
107
|
+
EOF
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## Configuration Options
|
|
111
|
+
|
|
112
|
+
### Environment Config
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
{
|
|
116
|
+
"seed": 123, # Random seed for puzzle generation
|
|
117
|
+
"config": {
|
|
118
|
+
"difficulty": "easy", # "easy", "medium", or "hard"
|
|
119
|
+
"max_steps": 100 # Maximum steps before truncation
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Policy Config
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
{
|
|
128
|
+
"provider": "openai", # "openai" or "groq"
|
|
129
|
+
"model": "gpt-5-mini", # Model name
|
|
130
|
+
"max_actions_per_call": 8, # Actions per policy call (1-8)
|
|
131
|
+
"temperature": 0.7, # Temperature (optional)
|
|
132
|
+
"max_completion_tokens": 4000 # Max tokens (optional)
|
|
133
|
+
}
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Model Recommendations
|
|
137
|
+
|
|
138
|
+
| Model | Status | Speed | Notes |
|
|
139
|
+
|-------|--------|-------|-------|
|
|
140
|
+
| **gpt-5-mini** | ✅ Recommended | Slow (30-50s/call) | Uses 1500-2750 reasoning tokens per call |
|
|
141
|
+
| **gpt-5** | ❌ Not supported | N/A | Doesn't support tool calling reliably (see Troubleshooting) |
|
|
142
|
+
| **gpt-5-nano** | ❌ Not supported | N/A | Doesn't support tool calling reliably (see Troubleshooting) |
|
|
143
|
+
| **qwen-2.5-7b** (Groq) | ✅ Works | Very fast | Cheap and fast alternative |
|
|
144
|
+
|
|
145
|
+
### Why is GPT-5-mini slow?
|
|
146
|
+
|
|
147
|
+
GPT-5-mini uses extensive internal reasoning (1500-2750 reasoning tokens per call) before generating actions. While this could lead to better puzzle-solving, it makes each policy call take 30-50 seconds.
|
|
148
|
+
|
|
149
|
+
Example usage breakdown:
|
|
150
|
+
```jsonc
|
|
151
|
+
{
|
|
152
|
+
"usage": {
|
|
153
|
+
"completion_tokens": 2465,
|
|
154
|
+
"reasoning_tokens": 2432, // Deep thinking!
|
|
155
|
+
"prompt_tokens": 470
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## Observation Format
|
|
161
|
+
|
|
162
|
+
Each observation includes:
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
{
|
|
166
|
+
"room_text": str, # ASCII visualization of the puzzle
|
|
167
|
+
"player_position": [x, y], # Player coordinates
|
|
168
|
+
"boxes_on_target": int, # Number of boxes on target squares
|
|
169
|
+
"num_boxes": int, # Total number of boxes
|
|
170
|
+
"steps_taken": int, # Steps taken so far
|
|
171
|
+
"max_steps": int, # Maximum allowed steps
|
|
172
|
+
"last_action": str, # Last action taken
|
|
173
|
+
"reward_last": float, # Reward from last step
|
|
174
|
+
"total_reward": float, # Cumulative reward
|
|
175
|
+
"terminated": bool, # Puzzle solved?
|
|
176
|
+
"truncated": bool # Max steps reached?
|
|
177
|
+
}
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
### ASCII Legend
|
|
181
|
+
|
|
182
|
+
- `P` = Player
|
|
183
|
+
- `O` = Box
|
|
184
|
+
- `X` = Target square
|
|
185
|
+
- `@` = Box on target
|
|
186
|
+
- `+` = Player on target
|
|
187
|
+
- `#` = Wall
|
|
188
|
+
- `_` = Floor
|
|
189
|
+
|
|
190
|
+
## Action Space
|
|
191
|
+
|
|
192
|
+
The agent uses the `interact_many` tool to execute multiple actions in sequence:
|
|
193
|
+
|
|
194
|
+
```python
|
|
195
|
+
{
|
|
196
|
+
"tool": "interact_many",
|
|
197
|
+
"args": {
|
|
198
|
+
"actions": [0, 1, 2, 3] # 0=left, 1=up, 2=right, 3=down
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
Or with string names:
|
|
204
|
+
```python
|
|
205
|
+
{
|
|
206
|
+
"actions": ["left", "up", "right", "down"]
|
|
207
|
+
}
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
## Training with RL
|
|
211
|
+
|
|
212
|
+
The Sokoban task app supports RL training. Example config:
|
|
213
|
+
|
|
214
|
+
```toml
|
|
215
|
+
# sokoban_rl_config.toml
|
|
216
|
+
[task_app]
|
|
217
|
+
url = "http://localhost:8911"
|
|
218
|
+
auth_token = "sk_env_your_key_here"
|
|
219
|
+
|
|
220
|
+
[rl]
|
|
221
|
+
algorithm = "grpo"
|
|
222
|
+
num_episodes = 1000
|
|
223
|
+
batch_size = 32
|
|
224
|
+
|
|
225
|
+
[policy]
|
|
226
|
+
provider = "groq"
|
|
227
|
+
model = "qwen-2.5-7b"
|
|
228
|
+
max_actions_per_call = 8
|
|
229
|
+
|
|
230
|
+
[env]
|
|
231
|
+
difficulty = "easy"
|
|
232
|
+
max_steps = 100
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
Run training:
|
|
236
|
+
```bash
|
|
237
|
+
uvx synth-ai train --config sokoban_rl_config.toml
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
## Debugging
|
|
241
|
+
|
|
242
|
+
### Check server health
|
|
243
|
+
```bash
|
|
244
|
+
curl http://localhost:8911/health
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
### View server logs
|
|
248
|
+
```bash
|
|
249
|
+
# If running with nohup
|
|
250
|
+
tail -f nohup_sokoban.log
|
|
251
|
+
|
|
252
|
+
# Filter for important logs
|
|
253
|
+
tail -f nohup_sokoban.log | grep -E "extract|debug|error"
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
### Test with explicit actions
|
|
257
|
+
```python
|
|
258
|
+
# Instead of "policy", provide explicit actions
|
|
259
|
+
"ops": [
|
|
260
|
+
{"button": "right", "count": 3},
|
|
261
|
+
{"button": "down", "count": 2}
|
|
262
|
+
]
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
## Troubleshooting
|
|
266
|
+
|
|
267
|
+
### Empty responses from LLM
|
|
268
|
+
- **GPT-5/GPT-5-nano**: These models don't support tool calling reliably. Use GPT-5-mini instead.
|
|
269
|
+
- **Timeout errors**: GPT-5-mini is slow. Increase client timeout to 600+ seconds or use fewer policy calls.
|
|
270
|
+
|
|
271
|
+
### Puzzle not solving
|
|
272
|
+
- Try more policy calls (15-30)
|
|
273
|
+
- Use a different seed
|
|
274
|
+
- Try "easy" difficulty first
|
|
275
|
+
- Check if the agent is stuck in a loop (repeating same actions)
|
|
276
|
+
|
|
277
|
+
### Server won't start
|
|
278
|
+
```bash
|
|
279
|
+
# Check if port is in use
|
|
280
|
+
lsof -i :8911
|
|
281
|
+
|
|
282
|
+
# Kill existing process
|
|
283
|
+
kill -9 $(lsof -ti :8911)
|
|
284
|
+
|
|
285
|
+
# Restart
|
|
286
|
+
uvx synth-ai task-app serve sokoban --port 8911
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
## Examples
|
|
290
|
+
|
|
291
|
+
See the `examples/workflows/` directory for:
|
|
292
|
+
- RL training scripts
|
|
293
|
+
- Evaluation scripts
|
|
294
|
+
- Multi-episode parallel evaluation
|
|
295
|
+
|
|
296
|
+
## Contributing
|
|
297
|
+
|
|
298
|
+
To add new features:
|
|
299
|
+
1. Edit `task_app.py` for core logic
|
|
300
|
+
2. Update `_base_task_info()` for new observation/action specs
|
|
301
|
+
3. Modify `rollout_executor()` for custom rollout behavior
|
|
302
|
+
4. Add tests in `tests/integration/`
|
|
303
|
+
|
|
304
|
+
## License
|
|
305
|
+
|
|
306
|
+
MIT
|
|
307
|
+
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Evaluation config for running Groq Qwen/Qwen3-32B against the Sokoban task app.
|
|
2
|
+
|
|
3
|
+
provider = "groq"
|
|
4
|
+
task_app_url = "http://127.0.0.1:8911"
|
|
5
|
+
model = "qwen/qwen3-32b"
|
|
6
|
+
seeds = [123]
|
|
7
|
+
max_turns = 60
|
|
8
|
+
concurrency = 1
|
|
9
|
+
|
|
10
|
+
[policy]
|
|
11
|
+
provider = "groq"
|
|
12
|
+
model = "qwen/qwen3-32b"
|
|
13
|
+
temperature = 0.2
|
|
14
|
+
top_p = 0.95
|
|
15
|
+
max_tokens = 8000
|
|
16
|
+
max_actions_per_call = 4
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Evaluation config for running OpenAI GPT-5 against the Sokoban task app.
|
|
2
|
+
|
|
3
|
+
provider = "openai"
|
|
4
|
+
task_app_url = "http://127.0.0.1:8911"
|
|
5
|
+
model = "gpt-5"
|
|
6
|
+
seeds = [123]
|
|
7
|
+
max_turns = 60
|
|
8
|
+
concurrency = 1
|
|
9
|
+
|
|
10
|
+
[policy]
|
|
11
|
+
provider = "openai"
|
|
12
|
+
model = "gpt-5"
|
|
13
|
+
temperature = 0.2
|
|
14
|
+
top_p = 0.9
|
|
15
|
+
max_completion_tokens = 4000
|
|
16
|
+
max_actions_per_call = 4
|