synth-ai 0.2.12__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (229) hide show
  1. examples/multi_step/configs/crafter_rl_outcome.toml +74 -0
  2. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +186 -0
  3. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +83 -0
  4. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +78 -0
  5. examples/multi_step/crafter_rl_lora.md +51 -10
  6. examples/multi_step/sse_metrics_streaming_notes.md +357 -0
  7. examples/multi_step/task_app_config_notes.md +7 -1
  8. examples/swe/task_app/grpo_swe_mini.py +55 -26
  9. examples/swe/task_app/hosted/rollout.py +40 -0
  10. examples/swe/task_app/hosted/test_service.py +5 -6
  11. examples/task_apps/TESTING.md +275 -0
  12. examples/task_apps/__init__.py +0 -0
  13. examples/task_apps/crafter/__init__.py +0 -0
  14. examples/task_apps/crafter/task_app/__init__.py +2 -0
  15. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +21 -46
  16. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  17. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
  18. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +109 -45
  19. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +67 -49
  20. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +242 -193
  21. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  22. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  23. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  24. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  25. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  26. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  27. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  28. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  29. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  30. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  31. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  32. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  33. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  34. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  35. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  36. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  37. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  38. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  39. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  40. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  41. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  42. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  43. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  44. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  45. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  71. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  72. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  73. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  74. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  75. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  76. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  77. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  78. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  79. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  80. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  81. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  82. examples/task_apps/enron/__init__.py +1 -0
  83. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  84. examples/task_apps/enron/task_app/README.md +14 -0
  85. examples/task_apps/enron/task_app/__init__.py +1 -0
  86. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  87. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  88. examples/task_apps/enron/tests/__init__.py +2 -0
  89. examples/task_apps/enron/tests/conftest.py +115 -0
  90. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  91. examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
  92. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  93. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  94. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  95. examples/task_apps/math/__init__.py +0 -0
  96. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  97. examples/task_apps/pokemon_battle/__init__.py +2 -0
  98. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  99. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  100. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  101. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  102. examples/task_apps/pokemon_red/README.md +357 -0
  103. examples/task_apps/pokemon_red/__init__.py +3 -0
  104. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  105. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
  106. examples/task_apps/pokemon_red/task_app.py +606 -0
  107. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
  108. examples/task_apps/sokoban/README.md +307 -0
  109. examples/task_apps/sokoban/__init__.py +3 -0
  110. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  111. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  112. examples/task_apps/sokoban/task_app.py +1058 -0
  113. examples/task_apps/sokoban/tests/__init__.py +2 -0
  114. examples/task_apps/sokoban/tests/conftest.py +113 -0
  115. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  116. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  117. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  118. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  119. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  120. examples/task_apps/verilog/__init__.py +1 -0
  121. examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
  122. examples/task_apps/verilog/task_app/README.md +12 -0
  123. examples/task_apps/verilog/task_app/__init__.py +1 -0
  124. examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
  125. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  126. examples/task_apps/verilog/tests/__init__.py +2 -0
  127. examples/task_apps/verilog/tests/conftest.py +115 -0
  128. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  129. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
  130. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  131. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  132. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  133. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  134. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  135. examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +4 -2
  136. examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +4 -2
  137. examples/warming_up_to_rl/run_eval.py +127 -18
  138. examples/workflows/__init__.py +0 -0
  139. examples/workflows/math_rl/__init__.py +0 -0
  140. examples/workflows/math_rl/download_dataset.py +80 -0
  141. synth_ai/__init__.py +41 -1
  142. synth_ai/api/train/builders.py +73 -29
  143. synth_ai/api/train/cli.py +12 -6
  144. synth_ai/api/train/configs/__init__.py +44 -0
  145. synth_ai/api/train/configs/rl.py +134 -0
  146. synth_ai/api/train/configs/sft.py +95 -0
  147. synth_ai/api/train/configs/shared.py +24 -0
  148. synth_ai/api/train/env_resolver.py +5 -2
  149. synth_ai/api/train/supported_algos.py +10 -5
  150. synth_ai/api/train/utils.py +7 -4
  151. synth_ai/cli/__init__.py +7 -51
  152. synth_ai/cli/_storage.py +4 -3
  153. synth_ai/cli/_validate_task_app.py +11 -0
  154. synth_ai/cli/balance.py +4 -3
  155. synth_ai/cli/calc.py +2 -2
  156. synth_ai/cli/demo.py +49 -43
  157. synth_ai/cli/legacy_root_backup.py +1 -1
  158. synth_ai/cli/rl_demo.py +86 -106
  159. synth_ai/cli/root.py +0 -97
  160. synth_ai/cli/task_apps.py +1710 -186
  161. synth_ai/demos/core/cli.py +121 -159
  162. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
  163. synth_ai/environments/examples/crafter_classic/environment.py +16 -0
  164. synth_ai/environments/examples/enron/engine.py +7 -2
  165. synth_ai/environments/examples/enron/environment.py +68 -0
  166. synth_ai/environments/examples/red/engine.py +27 -0
  167. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  168. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  169. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  170. synth_ai/environments/examples/red/environment.py +60 -0
  171. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  172. synth_ai/environments/examples/verilog/engine.py +30 -4
  173. synth_ai/evals/__init__.py +15 -0
  174. synth_ai/evals/client.py +82 -0
  175. synth_ai/evals/types.py +42 -0
  176. synth_ai/jobs/client.py +16 -4
  177. synth_ai/judge_schemas.py +127 -0
  178. synth_ai/py.typed +0 -0
  179. synth_ai/task/__init__.py +14 -5
  180. synth_ai/task/contracts.py +124 -38
  181. synth_ai/task/proxy.py +48 -56
  182. synth_ai/task/rubrics/__init__.py +53 -0
  183. synth_ai/task/rubrics/loaders.py +133 -0
  184. synth_ai/task/rubrics/models.py +57 -0
  185. synth_ai/task/rubrics/scoring.py +113 -0
  186. synth_ai/task/rubrics/strict.py +149 -0
  187. synth_ai/task/server.py +8 -7
  188. synth_ai/task/validators.py +269 -6
  189. synth_ai/tracing_v3/decorators.py +7 -3
  190. synth_ai/tracing_v3/replica_sync.py +4 -4
  191. synth_ai/tracing_v3/serialization.py +130 -0
  192. synth_ai/tracing_v3/trace_utils.py +317 -0
  193. synth_ai/tracing_v3/turso/native_manager.py +3 -3
  194. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
  195. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +228 -89
  196. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -1
  197. synth_ai/task/rubrics.py +0 -219
  198. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  199. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  200. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  201. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  202. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  203. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  204. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  205. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  206. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
  207. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
  208. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  209. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  210. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  211. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  212. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  213. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  214. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  215. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  216. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  217. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
  218. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  219. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  220. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  221. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  222. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  223. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  224. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  225. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  226. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  227. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
  228. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
  229. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,204 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test battle state formatting - separate presentation for battle vs normal gameplay.
4
+ """
5
+
6
+ import pytest
7
+ from utils.state_formatter import format_state_for_llm
8
+ from pokemon_env.enums import MetatileBehavior
9
+
10
+
11
def test_battle_mode_hides_map():
    """Battle-mode output must contain battle sections and omit every map section."""
    player = {
        'name': 'Red',
        'position': {'x': 10, 'y': 10},
        'party': [
            {'species_name': 'Pikachu', 'level': 25, 'current_hp': 50, 'max_hp': 75, 'status': 'Normal'}
        ],
    }
    game = {
        'is_in_battle': True,
        'battle_info': {
            'player_pokemon': {'species': 'Pikachu', 'level': 25, 'current_hp': 50, 'max_hp': 75},
            'opponent_pokemon': {'species': 'Zubat', 'level': 10, 'current_hp': 20, 'max_hp': 30},
        },
    }
    # 3x3 grid of identical "normal" tiles; map data is present in the state
    # so the test proves the formatter actively leaves it out during battle.
    tile = (1, MetatileBehavior.NORMAL, 0, 0)
    world = {
        'tiles': [[tile] * 3] * 3,
        'current_map': 'Route 1',
    }

    formatted = format_state_for_llm({'player': player, 'game': game, 'map': world})

    # Battle indicator and battle status must be rendered.
    for expected in (
        "=== BATTLE MODE ===",
        "Currently in battle",
        "=== BATTLE STATUS ===",
        "Your Pokemon: Pikachu",
        "Opponent: Zubat",
    ):
        assert expected in formatted

    # No trace of the map (section headers or the map name itself).
    for forbidden in ("LOCATION & MAP INFO", "TRAVERSABILITY MAP", "Route 1"):
        assert forbidden not in formatted
49
+
50
+
51
def test_battle_mode_hides_dialogue():
    """Battle-mode output must not include dialogue, even if dialog text is in the state."""
    battle_info = {
        'player_pokemon': {'species': 'Charmander', 'level': 5, 'current_hp': 18, 'max_hp': 20},
        'opponent_pokemon': {'species': 'Rattata', 'level': 3, 'current_hp': 10, 'max_hp': 15},
    }
    game = {
        'is_in_battle': True,
        # Dialog text that may be left over from before the battle started.
        'dialog_text': 'Trainer wants to battle!',
        'battle_info': battle_info,
    }

    formatted = format_state_for_llm({'player': {'name': 'Red'}, 'game': game})

    # Both combatants are shown.
    for species in ("Charmander", "Rattata"):
        assert species in formatted

    # Neither the dialogue section, its text, nor the residual-text marker appears.
    for forbidden in ("--- DIALOGUE ---", "Trainer wants to battle", "RESIDUAL TEXT"):
        assert forbidden not in formatted
75
+
76
+
77
def test_normal_mode_shows_everything():
    """Outside of battle, player info, map and dialogue sections are all rendered."""
    player = {
        'name': 'Red',
        'position': {'x': 10, 'y': 10},
        'facing': 'North',
        'party': [
            {'species_name': 'Squirtle', 'level': 10, 'current_hp': 30, 'max_hp': 35, 'status': 'Normal'}
        ],
    }
    game = {
        'is_in_battle': False,
        'dialog_text': 'Welcome to the Pokemon Center!',
        'dialogue_detected': {'has_dialogue': True, 'confidence': 0.9},
    }
    tile = (1, MetatileBehavior.NORMAL, 0, 0)
    world = {
        'tiles': [[tile] * 3] * 3,
        'current_map': 'Pokemon Center',
    }

    formatted = format_state_for_llm({'player': player, 'game': game, 'map': world})

    expected_fragments = (
        # Player section
        "=== PLAYER INFO ===",
        "Position: X=10, Y=10",
        "Facing: North",
        # Map section
        "LOCATION & MAP INFO",
        # Dialogue section, including the detection confidence as a percentage
        "--- DIALOGUE ---",
        "Welcome to the Pokemon Center",
        "Detection confidence: 90.0%",
    )
    for fragment in expected_fragments:
        assert fragment in formatted

    # The battle banner is reserved for battle mode.
    assert "=== BATTLE MODE ===" not in formatted
116
+
117
+
118
def test_battle_party_information():
    """Battle-mode output lists the whole party (with statuses) for switch decisions."""
    party = [
        {'species_name': 'Venusaur', 'level': 50, 'current_hp': 0, 'max_hp': 200, 'status': 'Fainted'},
        {'species_name': 'Charizard', 'level': 50, 'current_hp': 180, 'max_hp': 185, 'status': 'Normal'},
        {'species_name': 'Blastoise', 'level': 50, 'current_hp': 100, 'max_hp': 190, 'status': 'Poisoned'},
    ]
    game = {
        # Uses the alternative battle flag name rather than 'is_in_battle'.
        'in_battle': True,
        'battle_info': {
            'player_pokemon': {'species': 'Venusaur', 'level': 50, 'current_hp': 0, 'max_hp': 200},
            'opponent_pokemon': {'species': 'Alakazam', 'level': 55, 'current_hp': 150, 'max_hp': 160},
        },
    }

    formatted = format_state_for_llm({'player': {'name': 'Red', 'party': party}, 'game': game})

    # Party section header plus every party member.
    for expected in ("=== PARTY STATUS ===", "Venusaur", "Charizard", "Blastoise"):
        assert expected in formatted

    # Venusaur's faint may be conveyed by the status word or by its HP readout.
    assert any(token in formatted for token in ("Fainted", "0/200"))
    # Blastoise's status condition must be shown.
    assert "Poisoned" in formatted
151
+
152
+
153
def test_battle_mode_detection_variants():
    """Either battle flag ('is_in_battle' or 'in_battle') must switch on battle mode."""
    # (battle flag key, player name, active species, map name that must stay hidden)
    variants = (
        ('is_in_battle', 'Red', 'Mew', 'Should not appear'),
        ('in_battle', 'Blue', 'Mewtwo', 'Also should not appear'),
    )

    for flag_key, player_name, species, hidden_map in variants:
        state = {
            'player': {'name': player_name},
            'game': {
                flag_key: True,
                'battle_info': {'player_pokemon': {'species': species}},
            },
            'map': {'current_map': hidden_map},
        }

        formatted = format_state_for_llm(state)

        assert "=== BATTLE MODE ===" in formatted
        assert hidden_map not in formatted
182
+
183
+
184
def test_empty_battle_info():
    """Battle mode must still render when the state carries no 'battle_info' entry."""
    # Only the battle flag is set; 'battle_info' is deliberately absent.
    game = {'is_in_battle': True}

    formatted = format_state_for_llm({'player': {'name': 'Red'}, 'game': game})

    # Battle mode is entered, and the party section is still emitted
    # despite the missing battle details.
    for expected in ("=== BATTLE MODE ===", "=== PARTY STATUS ==="):
        assert expected in formatted
201
+
202
+
203
if __name__ == "__main__":
    # pytest.main() returns the session's exit status; re-raise it so that
    # running this file directly reports test failures to the shell / CI
    # instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
@@ -0,0 +1,133 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test dialogue detection functionality.
4
+ """
5
+
6
+ import pytest
7
+ import numpy as np
8
+ from utils.state_formatter import detect_dialogue_on_frame, format_state_for_llm
9
+
10
+
11
def test_dialogue_detection_with_blue_box():
    """Test dialogue detection with typical blue dialogue box."""
    # Mock frame of 160 rows x 240 columns x 3 channels (240x160 GBA resolution).
    frame = np.zeros((160, 240, 3), dtype=np.uint8)

    # Blue-dominant (R, G, B) background across the bottom 50 rows.
    frame[110:160, :] = [50, 70, 150]

    # Light band standing in for text inside the box.
    frame[120:130, 20:220] = [220, 220, 220]

    # Grey border lines; these overwrite the first and last two rows of the box.
    frame[110:112, :] = [100, 100, 100]  # Top border
    frame[158:160, :] = [100, 100, 100]  # Bottom border

    result = detect_dialogue_on_frame(frame_array=frame)

    # Truthiness check rather than `== True` (PEP 8 / E712); also accepts a
    # NumPy boolean if the detector returns one.
    assert result['has_dialogue']
    assert result['confidence'] > 0.5
    assert 'blue dialogue box' in result['reason'].lower()
32
+
33
+
34
def test_no_dialogue_detection():
    """Test that normal gameplay doesn't trigger dialogue detection."""
    # Mock gameplay frame (160 rows x 240 columns x 3 channels) with no dialogue.
    # The two fills below cover every row, so the frame is fully deterministic;
    # the original random initialisation was overwritten entirely and only
    # consumed global RNG state.
    frame = np.zeros((160, 240, 3), dtype=np.uint8)
    frame[0:80, :] = [100, 150, 100]    # Greenish top (grass/trees)
    frame[80:160, :] = [150, 130, 100]  # Brownish bottom (ground)

    result = detect_dialogue_on_frame(frame_array=frame)

    # Falsiness check rather than `== False` (PEP 8 / E712).
    assert not result['has_dialogue']
    assert result['confidence'] < 0.5
47
+
48
+
49
def test_dialogue_detection_grayscale():
    """Detection on a 2-D (single-channel) frame should report contrast or border cues."""
    gray = np.zeros((160, 240), dtype=np.uint8)

    # High-contrast pattern in the bottom rows: dark box with a light band inside.
    box_rows = slice(110, 160)
    gray[box_rows, :] = 50
    gray[120:130, 20:220] = 200

    # Horizontal edges on the first and last two rows of the box.
    for edge_rows in (slice(110, 112), slice(158, 160)):
        gray[edge_rows, :] = 150

    result = detect_dialogue_on_frame(frame_array=gray)

    # The stated reason should mention contrast or structure (borders).
    reason = result['reason'].lower()
    assert any(cue in reason for cue in ('text contrast', 'borders'))
66
+
67
+
68
def test_dialogue_validation_in_state():
    """State formatting must honour the 'dialogue_detected' verdict when rendering dialog text."""
    dialog_text = 'Hello trainer! Would you like to battle?'

    def build_state(detection=None):
        # Fresh state per scenario; 'dialogue_detected' is added only when given.
        game = {'dialog_text': dialog_text}
        if detection is not None:
            game['dialogue_detected'] = detection
        return {
            'player': {'name': 'Red', 'position': {'x': 10, 'y': 10}},
            'game': game,
            'map': {},
        }

    # Detection confirms the dialogue: shown as dialogue, with confidence.
    confirmed = format_state_for_llm(build_state({'has_dialogue': True, 'confidence': 0.8}))
    for expected in ('DIALOGUE', 'Hello trainer', 'Detection confidence: 80.0%'):
        assert expected in confirmed

    # Detection says nothing is on screen: text is flagged as residual.
    rejected = format_state_for_llm(build_state({'has_dialogue': False, 'confidence': 0.1}))
    for expected in ('RESIDUAL TEXT', 'not visible'):
        assert expected in rejected

    # No detection info at all (backwards compatibility): dialogue still shown.
    legacy = format_state_for_llm(build_state())
    assert 'Hello trainer' in legacy
110
+
111
+
112
def test_dialogue_detection_edge_cases():
    """Test edge cases for dialogue detection."""
    # No frame at all: must report "no dialogue" and say why.
    result = detect_dialogue_on_frame(frame_array=None)
    # Falsiness check rather than `== False` (PEP 8 / E712).
    assert not result['has_dialogue']
    assert 'No frame data' in result['reason']

    # Frame far smaller than a real screen: only the result shape is checked,
    # not the verdict.
    small_frame = np.zeros((10, 10, 3), dtype=np.uint8)
    result = detect_dialogue_on_frame(frame_array=small_frame)
    assert 'has_dialogue' in result
    assert 'confidence' in result

    # Full-width frame that is only 50 rows tall (very small dialogue region).
    tiny_frame = np.zeros((50, 240, 3), dtype=np.uint8)
    result = detect_dialogue_on_frame(frame_array=tiny_frame)
    assert 'has_dialogue' in result
130
+
131
+
132
if __name__ == "__main__":
    # pytest.main() returns the session's exit status; re-raise it so that
    # running this file directly reports test failures to the shell / CI
    # instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
@@ -0,0 +1,229 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Comprehensive pytest suite for the dialogue detection system across all provided states.
4
+ """
5
+
6
+ import pytest
7
+ import sys
8
+ import os
9
+ import io
10
+ import subprocess
11
+ import time
12
+ import requests
13
+ import json
14
+ import base64
15
+ from pathlib import Path
16
+ from PIL import Image
17
+
18
+ # Add parent directory to path for imports
19
+ sys.path.append(str(Path(__file__).parent.parent))
20
+
21
+ from utils.ocr_dialogue import create_ocr_detector
22
+
23
class TestDialogueDetection:
    """Test dialogue detection accuracy across all provided states.

    The state-file tests launch ``agent_direct.py`` as a subprocess, poll its
    HTTP endpoints on ``self.agent_port`` and run the OCR detector on the
    frame it returns. The remaining tests exercise the detector directly.
    """

    # Interpreter the suite was originally hard-wired to. It is used only when
    # it exists on this machine; otherwise the interpreter running pytest is
    # used so the suite can start the agent on other machines as well.
    _AGENT_PYTHON = "/home/milkkarten/anaconda3/envs/mgba/bin/python"

    @pytest.fixture(autouse=True)
    def setup(self):
        """Create the OCR detector and kill stale agent processes before each test."""
        self.detector = create_ocr_detector()
        self.agent_port = 8000
        assert self.detector is not None, "Could not create OCR detector"

        # Kill any existing agent_direct processes
        subprocess.run(["pkill", "-f", "agent_direct.py"], capture_output=True)
        time.sleep(1)

    def teardown_method(self):
        """Kill any agent_direct process left running after a test."""
        subprocess.run(["pkill", "-f", "agent_direct.py"], capture_output=True)
        time.sleep(0.5)

    def _test_state_file(self, state_file, expected_dialogue, description=""):
        """Start the agent on ``state_file`` and check dialogue-box detection.

        Args:
            state_file: Path passed to ``agent_direct.py --load-state``.
            expected_dialogue: Whether a dialogue box is expected to be visible.
            description: Free-form label printed with the results.

        Returns:
            A dict with the inputs plus ``box_detected``, ``ocr_text`` and
            ``memory_text`` (``"N/A"`` when the state endpoint is unavailable).

        Fails the running test if the agent never answers ``/status``, if no
        frame can be fetched, or if detection differs from ``expected_dialogue``.
        """
        print(f"\n🧪 Testing: {state_file}")
        print(f" Expected dialogue: {expected_dialogue}")
        print(f" Description: {description}")

        # Start agent_direct with this state
        interpreter = (
            self._AGENT_PYTHON
            if os.path.exists(self._AGENT_PYTHON)
            else sys.executable
        )
        cmd = [
            interpreter,
            "agent_direct.py",
            "--load-state", state_file,
            "--backend", "gemini",
            "--manual",
        ]

        # The agent's output is never read, so discard it instead of piping it:
        # an unread PIPE can fill up and block the child process.
        process = subprocess.Popen(
            cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
        )
        base_url = f"http://localhost:{self.agent_port}"

        try:
            # Wait for startup
            time.sleep(3)

            # Test server responsiveness
            for _attempt in range(5):
                try:
                    response = requests.get(f"{base_url}/status", timeout=2)
                    if response.status_code == 200:
                        break
                except requests.RequestException:
                    # Server not reachable yet; retry after a short pause.
                    pass
                time.sleep(1)
            else:
                pytest.fail(f"Agent_direct failed to start for {state_file}")

            # Get screenshot
            frame_response = requests.get(f"{base_url}/api/frame", timeout=5)
            assert frame_response.status_code == 200, f"Failed to get screenshot for {state_file}"

            frame_data = frame_response.json()
            assert frame_data.get('frame'), f"No frame data for {state_file}"

            # Decode screenshot (the 'frame' field is base64-encoded image bytes)
            image_data = base64.b64decode(frame_data['frame'])
            screenshot = Image.open(io.BytesIO(image_data))

            # Test dialogue detection
            box_detected = self.detector.is_dialogue_box_visible(screenshot)
            ocr_text = self.detector.detect_dialogue_from_screenshot(screenshot)

            print(f" 📦 Box detected: {box_detected}")
            print(f" 👁️ OCR text: '{ocr_text}'")

            # Get memory reading for comparison (best effort; never fails the test)
            memory_text = "N/A"
            try:
                state_response = requests.get(f"{base_url}/state", timeout=3)
                if state_response.status_code == 200:
                    state_data = state_response.json()
                    memory_text = state_data.get('game', {}).get('dialog_text', None)
                    print(f" 💾 Memory text: '{memory_text}'")
            except (requests.RequestException, ValueError, AttributeError):
                # Unreachable endpoint, invalid JSON or unexpected payload shape.
                memory_text = "N/A"

            # Verify detection accuracy
            assert box_detected == expected_dialogue, (
                f"Detection mismatch for {state_file}: expected {expected_dialogue}, got {box_detected}"
            )

            return {
                'state_file': state_file,
                'expected_dialogue': expected_dialogue,
                'box_detected': box_detected,
                'ocr_text': ocr_text,
                'memory_text': memory_text,
                'description': description,
            }

        finally:
            # Stop the agent and reap it so no zombie process is left behind.
            process.terminate()
            try:
                process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                process.kill()
                process.wait()

    def test_coordinate_tightness(self):
        """Test that OCR coordinates are properly tight around text area"""
        dialogue_coords = self.detector.DIALOGUE_BOX_COORDS
        ocr_coords = self.detector.OCR_TEXT_COORDS

        # Calculate margins between the dialogue box and the OCR text region
        left_margin = ocr_coords['x'] - dialogue_coords['x']
        top_margin = ocr_coords['y'] - dialogue_coords['y']
        right_margin = (dialogue_coords['x'] + dialogue_coords['width']) - (ocr_coords['x'] + ocr_coords['width'])
        bottom_margin = (dialogue_coords['y'] + dialogue_coords['height']) - (ocr_coords['y'] + ocr_coords['height'])

        print(f"📏 Margins - Left: {left_margin}px, Top: {top_margin}px, Right: {right_margin}px, Bottom: {bottom_margin}px")

        # Verify margins are reasonable (4-16 pixels to avoid borders but not cut text)
        assert 4 <= left_margin <= 16, f"Left margin {left_margin}px outside acceptable range (4-16px)"
        assert 4 <= top_margin <= 16, f"Top margin {top_margin}px outside acceptable range (4-16px)"
        assert 4 <= right_margin <= 16, f"Right margin {right_margin}px outside acceptable range (4-16px)"
        assert 4 <= bottom_margin <= 16, f"Bottom margin {bottom_margin}px outside acceptable range (4-16px)"

    def test_no_dialog_states(self):
        """Test states that should NOT have dialogue"""
        no_dialog_states = [
            ("tests/states/no_dialog1.state", "No dialogue state 1"),
            ("tests/states/no_dialog2.state", "No dialogue state 2"),
            ("tests/states/no_dialog3.state", "No dialogue state 3"),
        ]

        for state_file, description in no_dialog_states:
            if not os.path.exists(state_file):
                pytest.skip(f"State file not found: {state_file}")
            result = self._test_state_file(state_file, False, description)
            assert not result['box_detected'], f"False positive detected in {state_file}"

    def test_dialog_states(self):
        """Test states that SHOULD have dialogue"""
        dialog_states = [
            ("tests/states/dialog.state", "Original dialogue state"),
            ("tests/states/dialog2.state", "Second dialogue state"),
            ("tests/states/dialog3.state", "New dialogue state 3"),
        ]

        for state_file, description in dialog_states:
            if not os.path.exists(state_file):
                pytest.skip(f"State file not found: {state_file}")
            result = self._test_state_file(state_file, True, description)
            assert result['box_detected'], f"Failed to detect dialogue in {state_file}"

    def test_static_image_detection(self):
        """Test detection on static images (each check runs only if its file exists)"""
        # Test known dialogue frame
        if os.path.exists("dialog_frame.png"):
            with Image.open("dialog_frame.png") as image:
                box_detected = self.detector.is_dialogue_box_visible(image)
            assert box_detected, "Failed to detect dialogue in known dialogue frame"

        # Test emerald.png (should be no dialogue)
        if os.path.exists("emerald.png"):
            with Image.open("emerald.png") as image:
                box_detected = self.detector.is_dialogue_box_visible(image)
            assert not box_detected, "False positive detected in emerald.png"

    def test_ocr_preprocessing_quality(self):
        """Test that OCR preprocessing produces high-quality black/white output"""
        # Imported here because the module only imports numpy under
        # ``__main__``; without this the test raises NameError under pytest.
        import numpy as np

        if not os.path.exists("dialog_frame.png"):
            return

        with Image.open("dialog_frame.png") as image:
            image_np = np.array(image)

        # Extract OCR region
        ocr_coords = self.detector.OCR_TEXT_COORDS
        ocr_region = image_np[
            ocr_coords['y']:ocr_coords['y'] + ocr_coords['height'],
            ocr_coords['x']:ocr_coords['x'] + ocr_coords['width'],
        ]

        # Test preprocessing
        processed = self.detector._preprocess_for_ocr(ocr_region)

        # Verify it's binary (only 0 and 255 values)
        unique_values = np.unique(processed)
        assert len(unique_values) <= 2, f"Processed image should be binary, found {len(unique_values)} unique values"

        # Should have both black and white pixels (text and background)
        if len(unique_values) == 2:
            assert 0 in unique_values and 255 in unique_values, "Should have pure black (0) and white (255) pixels"
210
+
211
class TestDialogueIntegration:
    """Checks that the OCR detector exposes what the comprehensive state relies on."""

    def test_comprehensive_state_includes_dialog(self):
        """Verify the detector can be created and offers the dialogue-reading methods."""
        # No agent_direct process is needed here: building the detector and
        # inspecting its attributes is sufficient.
        ocr = create_ocr_detector()
        assert ocr is not None, "OCR detector should be available for comprehensive state"

        # Each required method, paired with the message shown when it is missing.
        required_methods = (
            ('is_dialogue_box_visible', "Detector should have dialogue box detection"),
            ('detect_dialogue_from_screenshot', "Detector should have text detection"),
            ('read_dialog_with_ocr_fallback', "Detector should have smart fallback logic"),
        )
        for method_name, failure_message in required_methods:
            assert hasattr(ocr, method_name), failure_message
225
+
226
if __name__ == "__main__":
    # Script entry point for debugging: numpy is imported as ``np`` first, then
    # this file is run through pytest verbosely with output capture disabled.
    import numpy as np

    debug_args = [__file__, "-v", "-s"]
    pytest.main(debug_args)