synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (226) hide show
  1. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -1
  2. examples/swe/task_app/grpo_swe_mini.py +55 -26
  3. examples/swe/task_app/hosted/rollout.py +40 -0
  4. examples/swe/task_app/hosted/test_service.py +5 -6
  5. examples/task_apps/TESTING.md +275 -0
  6. examples/task_apps/__init__.py +0 -0
  7. examples/task_apps/crafter/__init__.py +0 -0
  8. examples/task_apps/crafter/task_app/__init__.py +2 -0
  9. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +18 -13
  10. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  11. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
  12. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +25 -3
  13. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +10 -0
  14. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  15. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  16. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  17. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  18. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  19. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  20. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  21. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  22. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  23. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  24. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  25. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  26. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  27. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  28. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  29. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  30. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  31. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  32. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  33. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  34. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  35. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  36. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  37. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  38. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  39. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  40. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  41. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  42. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  43. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  44. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  45. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  71. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  72. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  73. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  74. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  75. examples/task_apps/enron/__init__.py +1 -0
  76. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  77. examples/task_apps/enron/task_app/README.md +14 -0
  78. examples/task_apps/enron/task_app/__init__.py +1 -0
  79. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  80. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  81. examples/task_apps/enron/tests/__init__.py +2 -0
  82. examples/task_apps/enron/tests/conftest.py +115 -0
  83. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  84. examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
  85. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  86. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  87. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  88. examples/task_apps/math/__init__.py +0 -0
  89. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  90. examples/task_apps/pokemon_battle/__init__.py +2 -0
  91. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  92. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  93. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  94. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  95. examples/task_apps/pokemon_red/README.md +357 -0
  96. examples/task_apps/pokemon_red/__init__.py +3 -0
  97. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  98. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
  99. examples/task_apps/pokemon_red/task_app.py +606 -0
  100. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
  101. examples/task_apps/sokoban/README.md +307 -0
  102. examples/task_apps/sokoban/__init__.py +3 -0
  103. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  104. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  105. examples/task_apps/sokoban/task_app.py +1058 -0
  106. examples/task_apps/sokoban/tests/__init__.py +2 -0
  107. examples/task_apps/sokoban/tests/conftest.py +113 -0
  108. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  109. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  110. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  111. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  112. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  113. examples/task_apps/verilog/__init__.py +1 -0
  114. examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
  115. examples/task_apps/verilog/task_app/README.md +12 -0
  116. examples/task_apps/verilog/task_app/__init__.py +1 -0
  117. examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
  118. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  119. examples/task_apps/verilog/tests/__init__.py +2 -0
  120. examples/task_apps/verilog/tests/conftest.py +115 -0
  121. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  122. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
  123. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  124. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  125. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  126. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  127. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  128. examples/workflows/__init__.py +0 -0
  129. examples/workflows/math_rl/__init__.py +0 -0
  130. examples/workflows/math_rl/download_dataset.py +80 -0
  131. synth_ai/__init__.py +2 -2
  132. synth_ai/api/train/builders.py +25 -11
  133. synth_ai/api/train/cli.py +12 -6
  134. synth_ai/api/train/configs/__init__.py +10 -10
  135. synth_ai/api/train/configs/rl.py +5 -4
  136. synth_ai/api/train/configs/sft.py +4 -3
  137. synth_ai/api/train/env_resolver.py +5 -2
  138. synth_ai/api/train/supported_algos.py +10 -5
  139. synth_ai/api/train/utils.py +7 -4
  140. synth_ai/cli/__init__.py +7 -51
  141. synth_ai/cli/_storage.py +4 -3
  142. synth_ai/cli/_validate_task_app.py +11 -0
  143. synth_ai/cli/balance.py +4 -3
  144. synth_ai/cli/calc.py +2 -2
  145. synth_ai/cli/demo.py +14 -7
  146. synth_ai/cli/legacy_root_backup.py +1 -1
  147. synth_ai/cli/rl_demo.py +8 -7
  148. synth_ai/cli/root.py +0 -97
  149. synth_ai/cli/task_apps.py +1707 -186
  150. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
  151. synth_ai/environments/examples/enron/engine.py +7 -2
  152. synth_ai/environments/examples/enron/environment.py +68 -0
  153. synth_ai/environments/examples/red/engine.py +27 -0
  154. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  155. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  156. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  157. synth_ai/environments/examples/red/environment.py +60 -0
  158. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  159. synth_ai/environments/examples/verilog/engine.py +30 -4
  160. synth_ai/evals/client.py +58 -61
  161. synth_ai/jobs/client.py +16 -4
  162. synth_ai/judge_schemas.py +16 -16
  163. synth_ai/py.typed +0 -0
  164. synth_ai/task/__init__.py +14 -5
  165. synth_ai/task/contracts.py +124 -38
  166. synth_ai/task/proxy.py +48 -56
  167. synth_ai/task/rubrics/__init__.py +53 -0
  168. synth_ai/task/rubrics/loaders.py +133 -0
  169. synth_ai/task/rubrics/models.py +57 -0
  170. synth_ai/task/rubrics/scoring.py +113 -0
  171. synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
  172. synth_ai/task/server.py +8 -7
  173. synth_ai/task/validators.py +269 -6
  174. synth_ai/tracing_v3/decorators.py +7 -3
  175. synth_ai/tracing_v3/replica_sync.py +4 -4
  176. synth_ai/tracing_v3/serialization.py +5 -5
  177. synth_ai/tracing_v3/trace_utils.py +317 -0
  178. synth_ai/tracing_v3/turso/native_manager.py +3 -3
  179. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
  180. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +214 -101
  181. examples/agora_ex/README_MoE.md +0 -224
  182. examples/agora_ex/__init__.py +0 -7
  183. examples/agora_ex/agora_ex.py +0 -65
  184. examples/agora_ex/agora_ex_task_app.py +0 -590
  185. examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
  186. examples/agora_ex/reward_fn_grpo-human.py +0 -129
  187. examples/agora_ex/system_prompt_CURRENT.md +0 -63
  188. examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
  189. examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
  190. examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
  191. synth_ai/rubrics/__init__.py +0 -22
  192. synth_ai/task/rubrics.py +0 -219
  193. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  194. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  195. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  196. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  197. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  198. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  199. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  200. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  201. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
  202. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
  203. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  204. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  205. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  206. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  207. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +0 -0
  208. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  209. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  210. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  211. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  212. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  213. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
  214. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  215. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  216. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  217. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  218. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  219. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  220. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  221. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  222. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  223. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
  224. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -0
  225. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
  226. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,122 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test runner for Pokemon Emerald emulator tests
4
+
5
+ Usage:
6
+ python tests/run_tests.py # Run all tests
7
+ python tests/run_tests.py test_fps_adjustment # Run specific test
8
+ """
9
+
10
+ import os
11
+ import sys
12
+ import subprocess
13
+ import importlib.util
14
+
15
def run_test(test_name):
    """Execute ``tests/<test_name>.py`` as a script and report the outcome.

    The script is launched with the current interpreter and its output is
    not captured, so it goes straight to this process's stdout/stderr.

    Args:
        test_name: Name of the test module without the ``.py`` extension.

    Returns:
        True when the script exits with status 0. False when the file does
        not exist, the script exits non-zero, or launching it raises.
    """
    script_path = f"tests/{test_name}.py"

    # Guard clause: nothing to run if the script is not there.
    if not os.path.exists(script_path):
        print(f"❌ Test file not found: {script_path}")
        return False

    print(f"🧪 Running test: {test_name}")
    print("=" * 50)

    command = [sys.executable, script_path]
    try:
        completed = subprocess.run(command, capture_output=False, text=True)
        passed = completed.returncode == 0
        if passed:
            print(f"✅ Test {test_name} passed!")
        else:
            print(f"❌ Test {test_name} failed!")
        return passed
    except Exception as e:
        print(f"❌ Error running test {test_name}: {e}")
        return False
42
+
43
def run_all_tests():
    """Discover and run every ``tests/test_*.py`` script, then print a summary.

    Discovery looks only at the top level of the ``tests`` directory
    (relative to the current working directory). Each discovered module is
    handed to ``run_test`` in alphabetical order.

    Returns:
        True when every discovered test passed; False when at least one
        failed or when no test files were found.
    """
    print("🧪 Pokemon Emerald Emulator Test Suite")
    print("=" * 50)

    # os.listdir() returns entries in arbitrary order, so sort the names to
    # make the execution order and the summary reproducible between runs.
    test_files = sorted(
        entry[:-3]  # Remove .py extension
        for entry in os.listdir("tests")
        if entry.startswith("test_") and entry.endswith(".py")
    )

    if not test_files:
        print("❌ No test files found in tests/ directory")
        return False

    print(f"Found {len(test_files)} test(s):")
    for test in test_files:
        print(f" - {test}")
    print()
    print("💡 Note: For pytest-style tests, run:")
    print(" python -m pytest tests/test_fps_adjustment_pytest.py -v")
    print(" python -m pytest tests/test_server_map_validation.py -v")
    print("💡 Note: Test state files are located in tests/states/")
    print("💡 Note: Map reference files are saved in tests/map_references/")
    print()

    # Run each test, keeping (name, passed) pairs for the summary.
    results = []
    for test in test_files:
        success = run_test(test)
        results.append((test, success))
        print()  # Add spacing between tests

    # Print summary
    print("📋 Test Summary")
    print("=" * 30)

    passed_count = sum(1 for _, success in results if success)
    total_count = len(results)

    for test, success in results:
        status = "✅ PASS" if success else "❌ FAIL"
        print(f"{test}: {status}")

    print(f"\nResults: {passed_count}/{total_count} tests passed")

    if passed_count == total_count:
        print("🎉 All tests passed!")
        return True

    print("⚠️ Some tests failed.")
    return False
96
+
97
def main():
    """Command-line entry point for the test runner.

    Verifies that the current working directory contains ``server/app.py``
    and a ``tests`` directory, then runs either the single test named by the
    first command-line argument or the whole suite. Always terminates the
    process: exit status 0 on success, 1 on failure or a failed precondition.
    """
    # Each precondition: (path that must exist, lines to print when it does not).
    preconditions = (
        (
            "server/app.py",
            (
                "❌ Error: This test runner must be run from the project root directory",
                "Please run: python tests/run_tests.py",
            ),
        ),
        (
            "tests",
            ("❌ Error: tests/ directory not found",),
        ),
    )
    for required_path, complaints in preconditions:
        if not os.path.exists(required_path):
            for line in complaints:
                print(line)
            sys.exit(1)

    # First argument, when present, selects a single test; otherwise run all.
    cli_args = sys.argv[1:]
    if cli_args:
        success = run_test(cli_args[0])
    else:
        success = run_all_tests()
    sys.exit(0 if success else 1)
120
+
121
+ if __name__ == "__main__":
122
+ main()
@@ -0,0 +1,76 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Pytest for agent functionality (legacy test file)
4
+ """
5
+
6
+ import pytest
7
+ import requests
8
+ import json
9
+ import time
10
+
11
class TestAgentDirectAPI:
    """Checks for the legacy ``agent`` module and its HTTP endpoints.

    The first three tests import ``agent`` and inspect its module-level
    attributes. The endpoint tests issue HTTP requests against ``base_url``
    and are marked as skipped because they need a running server.
    """

    base_url = "http://localhost:8080"

    def _fetch_json(self, send, path):
        """Call ``send`` on ``base_url + path`` and return the decoded JSON.

        ``send`` is ``requests.get`` or ``requests.post``. The current test is
        skipped when the connection fails; a non-200 status fails the test.
        """
        try:
            response = send(f"{self.base_url}{path}", timeout=2)
        except requests.exceptions.ConnectionError:
            pytest.skip("agent server not running")
        assert response.status_code == 200
        return response.json()

    def test_imports_work(self):
        """The ``agent`` module imports and exposes the expected attributes."""
        import agent
        for attribute in ('app', 'agent_mode', 'websocket_connections'):
            assert hasattr(agent, attribute)

    def test_global_state_initialized(self):
        """Module-level state has its expected initial values."""
        import agent
        # Equality (not identity) comparisons are kept on purpose so the
        # checks accept exactly the same values as before.
        assert agent.agent_mode == True  # Should start in agent mode by default
        assert agent.agent_auto_enabled == False  # Should start with auto disabled
        connections = agent.websocket_connections
        assert isinstance(connections, set)
        assert len(connections) == 0  # Should start empty

    def test_broadcast_function_exists(self):
        """``broadcast_state_update`` is present and callable."""
        import agent
        assert hasattr(agent, 'broadcast_state_update')
        assert callable(agent.broadcast_state_update)

    @pytest.mark.skip(reason="Requires running server")
    def test_status_endpoint(self):
        """GET /status returns the step counter and initialization flag."""
        payload = self._fetch_json(requests.get, "/status")
        assert "step" in payload
        assert "agent_initialized" in payload

    @pytest.mark.skip(reason="Requires running server")
    def test_toggle_mode_endpoint(self):
        """POST /toggle_mode reports the resulting mode."""
        payload = self._fetch_json(requests.post, "/toggle_mode")
        assert "mode" in payload
        assert "agent_mode" in payload
        assert payload["mode"] in ["MANUAL", "AGENT"]

    @pytest.mark.skip(reason="Requires running server")
    def test_toggle_auto_endpoint(self):
        """POST /toggle_auto reports the resulting auto status."""
        payload = self._fetch_json(requests.post, "/toggle_auto")
        assert "auto_enabled" in payload
        assert "status" in payload
        assert payload["status"] in ["ENABLED", "DISABLED"]
74
+
75
+ if __name__ == "__main__":
76
+ pytest.main([__file__, "-v"])
@@ -0,0 +1,413 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Pytest for Agent Prompts Validation
4
+
5
+ Tests that validate the actual prompt outputs from agent modules:
6
+ - action.py: Action decision prompts
7
+ - memory.py: Memory context generation
8
+ - perception.py: Observation and scene analysis
9
+ - planning.py: Strategic planning prompts
10
+
11
+ This test validates that the agent modules generate proper prompts without "Unknown" values.
12
+ """
13
+
14
+ import pytest
15
+ import json
16
+ import sys
17
+ import os
18
+ import requests
19
+ import time
20
+ import subprocess
21
+ from typing import Dict, Any, List, Set
22
+ from unittest.mock import Mock, patch
23
+ from pathlib import Path
24
+
25
+ # Add the project root to the path
26
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
27
+
28
+ # Import agent modules
29
+ from agent.action import action_step
30
+ from agent.memory import memory_step, extract_key_state_info
31
+ from agent.perception import perception_step
32
+ from agent.planning import planning_step
33
+ from utils.vlm import VLM
34
+ from utils.state_formatter import format_state_for_llm, format_state_summary
35
+
36
+
37
+ class TestAgentPrompts:
38
+ """Test class for agent prompts validation"""
39
+
40
@pytest.fixture
def server_url(self):
    """Fixture: base URL that the tests use to reach the server."""
    url = "http://localhost:8000"
    return url
44
+
45
@pytest.fixture
def mock_vlm(self):
    """Fixture: a ``Mock(spec=VLM)`` that records prompts and returns canned replies.

    Every call to ``get_query`` or ``get_text_query`` appends a dict with the
    keys ``'context'``, ``'prompt'`` and ``'frame'`` to
    ``mock_vlm.captured_prompts``. The reply is chosen by the first keyword
    found in ``context``; ``"Default response"`` is returned when none match.
    """
    mock_vlm = Mock(spec=VLM)

    # Create the capture list up front. A spec'd Mock raises AttributeError
    # when an attribute outside the spec is read before being set, so a test
    # whose code under test never calls the VLM would otherwise crash on
    # ``mock_vlm.captured_prompts`` instead of failing its own assertion.
    mock_vlm.captured_prompts = []

    # Keyword -> reply, in the order the keywords are checked.
    canned_replies = (
        ("PERCEPTION", "I can see the player character on a grassy route with trees and paths."),
        ("ACTION", "UP"),
        ("PLANNING", "Continue exploring the route and look for items or trainers."),
        ("MEMORY", "Updated memory context with current observations."),
    )

    def _record_and_reply(prompt, context, has_frame, keywords):
        """Store the prompt, then return the reply for the first matching keyword."""
        # Look the list up on the mock each time so a test may replace it
        # with a fresh list between calls.
        mock_vlm.captured_prompts.append({
            'context': context,
            'prompt': prompt,
            'frame': has_frame,
        })
        for keyword, reply in canned_replies:
            if keyword in keywords and keyword in context:
                return reply
        return "Default response"

    def mock_get_query(frame, prompt, context=""):
        # Image queries recognise all four contexts.
        return _record_and_reply(
            prompt, context, frame is not None,
            ("PERCEPTION", "ACTION", "PLANNING", "MEMORY"),
        )

    def mock_get_text_query(prompt, context=""):
        # Text-only queries have no PERCEPTION reply and never carry a frame.
        return _record_and_reply(
            prompt, context, False,
            ("ACTION", "PLANNING", "MEMORY"),
        )

    mock_vlm.get_query = mock_get_query
    mock_vlm.get_text_query = mock_get_text_query

    return mock_vlm
96
+
97
def find_state_files(self):
    """Return the sorted ``*.state`` paths in the ``states`` folder beside this file.

    An empty list is returned when that folder does not exist.
    """
    states_dir = Path(__file__).parent / "states"
    return sorted(states_dir.glob("*.state")) if states_dir.exists() else []
105
+
106
def start_server_with_state(self, state_file_path: str):
    """Launch ``server.app`` with ``--load-state`` and return its process handle.

    Any process whose command line matches ``server.app`` is killed first
    (best effort), then a new server is started through
    ``conda run -n mgba`` and given three seconds to come up.

    Args:
        state_file_path: Path passed to the server's ``--load-state`` option.

    Returns:
        The ``subprocess.Popen`` object of the newly started server.
    """
    import subprocess

    # Kill any existing server processes. Only OS-level launch failures
    # (e.g. ``pkill`` not being installed) are ignored; a bare ``except``
    # here would also swallow KeyboardInterrupt and SystemExit.
    try:
        subprocess.run(["pkill", "-f", "server.app"], check=False)
        time.sleep(1)
    except OSError:
        pass

    # Start new server.
    # NOTE(review): stdout/stderr are piped but nothing visible here reads
    # them; a very chatty server could block once the pipe buffer fills.
    # Confirm whether other code consumes these pipes before changing this.
    server_process = subprocess.Popen([
        "conda", "run", "-n", "mgba", "python", "-m", "server.app",
        "--load-state", state_file_path
    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Wait for server to start
    time.sleep(3)

    return server_process
127
+
128
def stop_server(self, server_process):
    """Stop a server started by ``start_server_with_state``.

    Asks the process to terminate and waits up to five seconds; if that
    times out or signalling fails, the process is killed and reaped.

    Args:
        server_process: A ``subprocess.Popen``-like handle, or a falsy value
            (in which case nothing is done).
    """
    if not server_process:
        return

    try:
        server_process.terminate()
        server_process.wait(timeout=5)
    except (subprocess.TimeoutExpired, OSError):
        # Catch only the expected failures so KeyboardInterrupt/SystemExit
        # still propagate, then force the process down and wait for it so
        # no zombie is left behind.
        server_process.kill()
        server_process.wait()
136
+
137
def get_state_from_server(self, server_url: str) -> Dict[str, Any]:
    """Fetch ``<server_url>/state`` and return its JSON body.

    Best effort: an empty dict is returned for any non-200 response and for
    any exception raised while requesting or decoding.
    """
    try:
        reply = requests.get(f"{server_url}/state", timeout=10)
        return reply.json() if reply.status_code == 200 else {}
    except Exception:
        return {}
147
+
148
def test_action_module_prompts(self, mock_vlm, server_url):
    """Test that action module generates proper prompts without 'Unknown' values.

    For every state file a server is started, its state is fetched, and
    ``action_step`` is called with the mock VLM. The first ACTION prompt
    captured for that state file must contain no "Unknown" text and must
    include all required section headers; the returned actions must be a
    non-empty list. The whole test is skipped as soon as a state cannot be
    fetched from the server.
    """
    state_files = self.find_state_files()

    for state_file in state_files:
        print(f"\nTesting ACTION module with {state_file}")

        # Start server
        server_process = self.start_server_with_state(str(state_file))

        try:
            # Get state data
            state_data = self.get_state_from_server(server_url)
            if not state_data:
                pytest.skip(f"Could not get state data for {state_file}")

            # Prompts accumulate on the mock across calls. Reset the list so
            # that index 0 below is the prompt produced for *this* state
            # file rather than the one from the first iteration.
            mock_vlm.captured_prompts = []

            # Mock inputs for action_step
            memory_context = "Test memory context"
            current_plan = "Test plan"
            latest_observation = "Test observation"
            frame = None
            recent_actions = ["UP", "A", "RIGHT"]

            # Call action_step
            actions = action_step(memory_context, current_plan, latest_observation, frame, state_data, recent_actions, mock_vlm)

            # Check captured prompts
            action_prompts = [p for p in mock_vlm.captured_prompts if "ACTION" in p['context']]

            assert action_prompts, f"No action prompts captured for {state_file}"

            # Analyze the action prompt for issues
            action_prompt = action_prompts[0]['prompt']

            # Check for "Unknown" values in the prompt
            assert "Unknown" not in action_prompt, f"Action prompt contains 'Unknown' values in {state_file}"

            # Check for required sections
            required_sections = ["COMPREHENSIVE GAME STATE DATA", "ENHANCED ACTION CONTEXT", "ACTION DECISION TASK"]
            for section in required_sections:
                assert section in action_prompt, f"Action prompt missing section '{section}' in {state_file}"

            # Check that actions were returned
            assert actions, f"Action module returned no actions for {state_file}"
            assert isinstance(actions, list), f"Action module returned non-list actions for {state_file}"

        finally:
            # Always shut the server down, including on failure or skip.
            self.stop_server(server_process)
196
+
197
def test_memory_module_prompts(self, mock_vlm, server_url):
    """Test that memory module generates proper prompts without 'Unknown' values.

    For every state file a server is started and its state is fetched. The
    result of ``extract_key_state_info`` and the string returned by
    ``memory_step`` are then checked for "Unknown" text, required fields or
    sections, and a minimum length.
    """
    # Expectations shared by every state file.
    expected_fields = ['state_summary', 'player_name', 'money', 'current_map', 'in_battle', 'party_health']
    expected_sections = ["COMPREHENSIVE MEMORY CONTEXT", "CURRENT STATE", "CURRENT PLAN", "KEY EVENTS", "RECENT MEMORY"]

    for state_file in self.find_state_files():
        print(f"\nTesting MEMORY module with {state_file}")

        server_process = self.start_server_with_state(str(state_file))

        try:
            state_data = self.get_state_from_server(server_url)
            if not state_data:
                pytest.skip(f"Could not get state data for {state_file}")

            # --- extract_key_state_info ---
            key_info = extract_key_state_info(state_data)

            assert "Unknown" not in str(key_info), f"Memory key_info contains 'Unknown' values in {state_file}"

            for field in expected_fields:
                assert field in key_info, f"Memory key_info missing field '{field}' in {state_file}"

            # --- memory_step ---
            single_observation = {
                "frame_id": 1,
                "observation": "Test observation",
                "state": state_data,
            }
            updated_memory = memory_step(
                "Test memory context",
                "Test plan",
                ["UP", "A", "RIGHT"],
                [single_observation],
                mock_vlm,
            )

            assert "Unknown" not in updated_memory, f"Memory context contains 'Unknown' values in {state_file}"

            for section in expected_sections:
                assert section in updated_memory, f"Memory context missing section '{section}' in {state_file}"

            # More than 100 non-blank characters are expected.
            assert len(updated_memory.strip()) > 100, f"Memory context seems too short in {state_file}"

        finally:
            # Shut the server down whether the checks passed, failed or skipped.
            self.stop_server(server_process)
252
+
253
def test_perception_module_prompts(self, mock_vlm, server_url):
    """Test that perception module generates proper prompts without 'Unknown' values.

    For each state file a server is started, the state is fetched from
    ``server_url`` and ``perception_step`` is called with a ``None`` frame.
    The prompt captured by ``mock_vlm`` during that call is then checked for
    ``"Unknown"`` text, required section headings and analysis keywords, and
    the returned ``(observation, slow_thinking)`` pair is type-checked.

    Args:
        mock_vlm: Fixture passed to ``perception_step``; its
            ``captured_prompts`` entries are read via the ``'prompt'`` and
            ``'context'`` keys.
        server_url: Fixture passed to ``self.get_state_from_server``.
    """
    state_files = self.find_state_files()

    for state_file in state_files:
        print(f"\nTesting PERCEPTION module with {state_file}")

        # Start server
        server_process = self.start_server_with_state(str(state_file))

        try:
            # Get state data
            state_data = self.get_state_from_server(server_url)
            if not state_data:
                # pytest.skip raises, so this ends the whole test, not just
                # this state file.
                pytest.skip(f"Could not get state data for {state_file}")

            # Mock frame
            frame = None

            # Remember how many prompts were already captured so that the
            # checks below only see prompts produced for THIS state file.
            # Without this, prompts from earlier loop iterations would
            # satisfy the assertions for later state files.
            # NOTE(review): assumes captured_prompts is an append-only list
            # - confirm against the mock_vlm fixture.
            prompts_before = len(mock_vlm.captured_prompts)

            # Call perception_step
            observation, slow_thinking = perception_step(frame, state_data, mock_vlm)

            # Check prompts captured during this call only
            new_prompts = mock_vlm.captured_prompts[prompts_before:]
            perception_prompts = [p for p in new_prompts if "PERCEPTION" in p['context']]

            assert perception_prompts, f"No perception prompts captured for {state_file}"

            # Analyze the perception prompt for issues
            perception_prompt = perception_prompts[0]['prompt']

            # Check for "Unknown" values in the prompt
            assert "Unknown" not in perception_prompt, f"Perception prompt contains 'Unknown' values in {state_file}"

            # Check for required sections
            required_sections = ["COMPREHENSIVE GAME STATE DATA", "VISUAL ANALYSIS TASK"]
            for section in required_sections:
                assert section in perception_prompt, f"Perception prompt missing section '{section}' in {state_file}"

            # Check for analysis instructions (at least 3 of the 5 keywords)
            analysis_keywords = ["CUTSCENE", "MAP", "BATTLE", "DIALOGUE", "MENU"]
            found_keywords = [kw for kw in analysis_keywords if kw in perception_prompt]
            assert len(found_keywords) >= 3, f"Perception prompt missing analysis keywords in {state_file}. Found: {found_keywords}"

            # Check that observation was returned
            assert observation, f"Perception module returned no observation for {state_file}"
            assert isinstance(observation, dict), f"Perception module returned non-dict observation for {state_file}"

            # Check that slow_thinking is boolean
            assert isinstance(slow_thinking, bool), f"Perception module returned non-boolean slow_thinking for {state_file}"

        finally:
            self.stop_server(server_process)
305
+
306
def test_planning_module_prompts(self, mock_vlm, server_url):
    """Test that planning module generates proper prompts without 'Unknown' values.

    For each state file a server is started, the state is fetched from
    ``server_url`` and ``planning_step`` is called with no current plan and
    slow thinking requested. The prompt captured by ``mock_vlm`` during that
    call is checked for ``"Unknown"`` text, required section headings and
    planning keywords, and the returned plan must be a non-empty string.

    Args:
        mock_vlm: Fixture passed to ``planning_step``; its
            ``captured_prompts`` entries are read via the ``'prompt'`` and
            ``'context'`` keys.
        server_url: Fixture passed to ``self.get_state_from_server``.
    """
    state_files = self.find_state_files()

    for state_file in state_files:
        print(f"\nTesting PLANNING module with {state_file}")

        # Start server
        server_process = self.start_server_with_state(str(state_file))

        try:
            # Get state data
            state_data = self.get_state_from_server(server_url)
            if not state_data:
                # pytest.skip raises, so this ends the whole test, not just
                # this state file.
                pytest.skip(f"Could not get state data for {state_file}")

            # Mock inputs for planning_step
            memory_context = "Test memory context"
            current_plan = None  # Start with no plan
            slow_thinking_needed = True

            # Remember how many prompts were already captured so that the
            # checks below only see prompts produced for THIS state file.
            # Without this, prompts from earlier loop iterations would
            # satisfy the assertions for later state files.
            # NOTE(review): assumes captured_prompts is an append-only list
            # - confirm against the mock_vlm fixture.
            prompts_before = len(mock_vlm.captured_prompts)

            # Call planning_step
            plan = planning_step(memory_context, current_plan, slow_thinking_needed, state_data, mock_vlm)

            # Check prompts captured during this call only
            new_prompts = mock_vlm.captured_prompts[prompts_before:]
            planning_prompts = [p for p in new_prompts if "PLANNING" in p['context']]

            assert planning_prompts, f"No planning prompts captured for {state_file}"

            # Analyze the planning prompt for issues
            planning_prompt = planning_prompts[0]['prompt']

            # Check for "Unknown" values in the prompt
            assert "Unknown" not in planning_prompt, f"Planning prompt contains 'Unknown' values in {state_file}"

            # Check for required sections
            required_sections = ["COMPREHENSIVE GAME STATE DATA", "STRATEGIC PLANNING TASK"]
            for section in required_sections:
                assert section in planning_prompt, f"Planning prompt missing section '{section}' in {state_file}"

            # Check for planning instructions (at least 3 of the 4 keywords)
            planning_keywords = ["IMMEDIATE GOAL", "SHORT-TERM OBJECTIVES", "LONG-TERM STRATEGY", "EFFICIENCY NOTES"]
            found_keywords = [kw for kw in planning_keywords if kw in planning_prompt]
            assert len(found_keywords) >= 3, f"Planning prompt missing planning keywords in {state_file}. Found: {found_keywords}"

            # Check that plan was returned
            assert plan, f"Planning module returned no plan for {state_file}"
            assert isinstance(plan, str), f"Planning module returned non-string plan for {state_file}"

        finally:
            self.stop_server(server_process)
357
+
358
def test_all_modules_integration(self, mock_vlm, server_url):
    """Test that all modules work together without 'Unknown' values.

    For each state file a server is started, the state is fetched from
    ``server_url`` and the four steps are run in order: perception, memory,
    planning, action. The output of each step is type-checked, and finally
    every prompt recorded in ``mock_vlm.captured_prompts`` is checked for
    ``"Unknown"`` text.

    Args:
        mock_vlm: Fixture passed to every step as the VLM; its
            ``captured_prompts`` entries are read via the ``'prompt'`` and
            ``'context'`` keys.
        server_url: Fixture passed to ``self.get_state_from_server``.
    """
    state_files = self.find_state_files()

    for state_file in state_files:
        print(f"\nTesting ALL MODULES integration with {state_file}")

        # Start server
        server_process = self.start_server_with_state(str(state_file))

        try:
            # Get state data
            state_data = self.get_state_from_server(server_url)
            if not state_data:
                # pytest.skip raises, so this ends the whole test, not just
                # this state file.
                pytest.skip(f"Could not get state data for {state_file}")

            # Test all modules in sequence
            memory_context = "Test memory context"
            current_plan = None
            frame = None
            recent_actions = ["UP", "A", "RIGHT"]
            observation_buffer = [
                {
                    "frame_id": 1,
                    "observation": "Test observation",
                    "state": state_data
                }
            ]

            # 1. Perception
            observation, slow_thinking = perception_step(frame, state_data, mock_vlm)
            assert observation and isinstance(observation, dict), (
                f"Perception module returned no dict observation for {state_file}"
            )

            # 2. Memory
            updated_memory = memory_step(memory_context, current_plan, recent_actions, observation_buffer, mock_vlm)
            assert "Unknown" not in updated_memory, f"Memory context contains 'Unknown' values in {state_file}"

            # 3. Planning (fed with the memory and slow_thinking produced above)
            plan = planning_step(updated_memory, current_plan, slow_thinking, state_data, mock_vlm)
            assert plan and isinstance(plan, str), f"Planning module returned no string plan for {state_file}"

            # 4. Action
            actions = action_step(updated_memory, plan, observation, frame, state_data, recent_actions, mock_vlm)
            assert actions and isinstance(actions, list), f"Action module returned no action list for {state_file}"

            # Check that no prompts contain "Unknown"
            for prompt_data in mock_vlm.captured_prompts:
                assert "Unknown" not in prompt_data['prompt'], f"Found 'Unknown' in {prompt_data['context']} prompt for {state_file}"

        finally:
            self.stop_server(server_process)
410
+
411
+
412
if __name__ == "__main__":
    # pytest.main returns the exit code instead of exiting; propagate it so
    # that running this file as a script fails when the tests fail.
    raise SystemExit(pytest.main([__file__]))