synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (226)
  1. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -1
  2. examples/swe/task_app/grpo_swe_mini.py +55 -26
  3. examples/swe/task_app/hosted/rollout.py +40 -0
  4. examples/swe/task_app/hosted/test_service.py +5 -6
  5. examples/task_apps/TESTING.md +275 -0
  6. examples/task_apps/__init__.py +0 -0
  7. examples/task_apps/crafter/__init__.py +0 -0
  8. examples/task_apps/crafter/task_app/__init__.py +2 -0
  9. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +18 -13
  10. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  11. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
  12. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +25 -3
  13. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +10 -0
  14. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  15. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  16. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  17. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  18. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  19. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  20. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  21. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  22. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  23. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  24. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  25. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  26. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  27. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  28. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  29. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  30. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  31. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  32. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  33. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  34. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  35. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  36. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  37. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  38. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  39. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  40. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  41. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  42. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  43. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  44. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  45. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  71. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  72. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  73. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  74. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  75. examples/task_apps/enron/__init__.py +1 -0
  76. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  77. examples/task_apps/enron/task_app/README.md +14 -0
  78. examples/task_apps/enron/task_app/__init__.py +1 -0
  79. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  80. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  81. examples/task_apps/enron/tests/__init__.py +2 -0
  82. examples/task_apps/enron/tests/conftest.py +115 -0
  83. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  84. examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
  85. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  86. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  87. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  88. examples/task_apps/math/__init__.py +0 -0
  89. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  90. examples/task_apps/pokemon_battle/__init__.py +2 -0
  91. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  92. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  93. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  94. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  95. examples/task_apps/pokemon_red/README.md +357 -0
  96. examples/task_apps/pokemon_red/__init__.py +3 -0
  97. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  98. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
  99. examples/task_apps/pokemon_red/task_app.py +606 -0
  100. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
  101. examples/task_apps/sokoban/README.md +307 -0
  102. examples/task_apps/sokoban/__init__.py +3 -0
  103. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  104. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  105. examples/task_apps/sokoban/task_app.py +1058 -0
  106. examples/task_apps/sokoban/tests/__init__.py +2 -0
  107. examples/task_apps/sokoban/tests/conftest.py +113 -0
  108. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  109. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  110. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  111. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  112. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  113. examples/task_apps/verilog/__init__.py +1 -0
  114. examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
  115. examples/task_apps/verilog/task_app/README.md +12 -0
  116. examples/task_apps/verilog/task_app/__init__.py +1 -0
  117. examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
  118. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  119. examples/task_apps/verilog/tests/__init__.py +2 -0
  120. examples/task_apps/verilog/tests/conftest.py +115 -0
  121. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  122. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
  123. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  124. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  125. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  126. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  127. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  128. examples/workflows/__init__.py +0 -0
  129. examples/workflows/math_rl/__init__.py +0 -0
  130. examples/workflows/math_rl/download_dataset.py +80 -0
  131. synth_ai/__init__.py +2 -2
  132. synth_ai/api/train/builders.py +25 -11
  133. synth_ai/api/train/cli.py +12 -6
  134. synth_ai/api/train/configs/__init__.py +10 -10
  135. synth_ai/api/train/configs/rl.py +5 -4
  136. synth_ai/api/train/configs/sft.py +4 -3
  137. synth_ai/api/train/env_resolver.py +5 -2
  138. synth_ai/api/train/supported_algos.py +10 -5
  139. synth_ai/api/train/utils.py +7 -4
  140. synth_ai/cli/__init__.py +7 -51
  141. synth_ai/cli/_storage.py +4 -3
  142. synth_ai/cli/_validate_task_app.py +11 -0
  143. synth_ai/cli/balance.py +4 -3
  144. synth_ai/cli/calc.py +2 -2
  145. synth_ai/cli/demo.py +14 -7
  146. synth_ai/cli/legacy_root_backup.py +1 -1
  147. synth_ai/cli/rl_demo.py +8 -7
  148. synth_ai/cli/root.py +0 -97
  149. synth_ai/cli/task_apps.py +1707 -186
  150. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
  151. synth_ai/environments/examples/enron/engine.py +7 -2
  152. synth_ai/environments/examples/enron/environment.py +68 -0
  153. synth_ai/environments/examples/red/engine.py +27 -0
  154. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  155. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  156. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  157. synth_ai/environments/examples/red/environment.py +60 -0
  158. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  159. synth_ai/environments/examples/verilog/engine.py +30 -4
  160. synth_ai/evals/client.py +58 -61
  161. synth_ai/jobs/client.py +16 -4
  162. synth_ai/judge_schemas.py +16 -16
  163. synth_ai/py.typed +0 -0
  164. synth_ai/task/__init__.py +14 -5
  165. synth_ai/task/contracts.py +124 -38
  166. synth_ai/task/proxy.py +48 -56
  167. synth_ai/task/rubrics/__init__.py +53 -0
  168. synth_ai/task/rubrics/loaders.py +133 -0
  169. synth_ai/task/rubrics/models.py +57 -0
  170. synth_ai/task/rubrics/scoring.py +113 -0
  171. synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
  172. synth_ai/task/server.py +8 -7
  173. synth_ai/task/validators.py +269 -6
  174. synth_ai/tracing_v3/decorators.py +7 -3
  175. synth_ai/tracing_v3/replica_sync.py +4 -4
  176. synth_ai/tracing_v3/serialization.py +5 -5
  177. synth_ai/tracing_v3/trace_utils.py +317 -0
  178. synth_ai/tracing_v3/turso/native_manager.py +3 -3
  179. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
  180. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +214 -101
  181. examples/agora_ex/README_MoE.md +0 -224
  182. examples/agora_ex/__init__.py +0 -7
  183. examples/agora_ex/agora_ex.py +0 -65
  184. examples/agora_ex/agora_ex_task_app.py +0 -590
  185. examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
  186. examples/agora_ex/reward_fn_grpo-human.py +0 -129
  187. examples/agora_ex/system_prompt_CURRENT.md +0 -63
  188. examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
  189. examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
  190. examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
  191. synth_ai/rubrics/__init__.py +0 -22
  192. synth_ai/task/rubrics.py +0 -219
  193. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  194. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  195. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  196. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  197. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  198. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  199. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  200. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  201. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
  202. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
  203. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  204. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  205. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  206. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  207. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +0 -0
  208. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  209. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  210. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  211. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  212. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  213. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
  214. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  215. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  216. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  217. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  218. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  219. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  220. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  221. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  222. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  223. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
  224. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -0
  225. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
  226. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,126 @@
1
+ """Unit tests for Enron environment tools and rewards."""
2
+ import pytest
3
+
4
+
5
@pytest.mark.asyncio
@pytest.mark.fast
async def test_enron_search_tool():
    """Smoke-test ``SearchEmailsTool``: its name, one invocation, and the payload key."""
    from synth_ai.environments.examples.enron.environment import SearchEmailsTool
    from synth_ai.environments.examples.enron.engine import EnronEngine
    from synth_ai.environments.tasks.core import TaskInstance, Impetus, Intent

    # Smallest task instance the engine constructor is given in this test.
    prompt = Impetus(instructions="Test question")
    goal = Intent(
        rubric={"goal": "test"},
        gold_trajectories=None,
        gold_state_diff={},
        deterministic_eval_functions=[],
    )
    task_metadata = {
        "question": "Test?",
        "gold_answer": "Test answer",
        "inbox_address": "test@enron.com",
    }
    instance = TaskInstance(
        id="test",
        impetus=prompt,
        intent=goal,
        metadata=task_metadata,
        is_reproducible=False,
        initial_engine_snapshot=None,
    )

    search_tool = SearchEmailsTool(EnronEngine(instance))

    # The tool must advertise the name that tool calls are routed by.
    assert search_tool.name == "search_emails"

    from synth_ai.environments.environment.tools import EnvToolCall

    call = EnvToolCall(tool="search_emails", args={"keywords": ["test"]})
    outcome = await search_tool(call)

    # Either outcome is acceptable; the call just must not raise.
    assert outcome.ok in (True, False)

    # Only a successful search is required to carry results.
    if not outcome.ok:
        return
    assert "search_results" in outcome.payload
48
+
49
+
50
@pytest.mark.asyncio
async def test_enron_answer_tool():
    """Check that ``AnswerQuestionTool`` accepts both an exact and a partial answer.

    Only the ``ok`` flag (and, for the exact answer, the presence of a
    ``status`` key in the payload) is asserted here.
    """
    from synth_ai.environments.examples.enron.environment import AnswerQuestionTool
    from synth_ai.environments.examples.enron.engine import EnronEngine
    from synth_ai.environments.tasks.core import TaskInstance, Impetus, Intent

    prompt = Impetus(instructions="Test question")
    goal = Intent(
        rubric={"goal": "test"},
        gold_trajectories=None,
        gold_state_diff={},
        deterministic_eval_functions=[],
    )
    task_metadata = {
        "question": "What is the answer?",
        "gold_answer": "The answer is 42",
        "inbox_address": "test@enron.com",
    }
    instance = TaskInstance(
        id="test",
        impetus=prompt,
        intent=goal,
        metadata=task_metadata,
        is_reproducible=False,
        initial_engine_snapshot=None,
    )

    answer_tool = AnswerQuestionTool(EnronEngine(instance))

    from synth_ai.environments.environment.tools import EnvToolCall

    # Answer identical to the gold answer.
    exact_call = EnvToolCall(tool="answer_question", args={"answer": "The answer is 42"})
    result_exact = await answer_tool(exact_call)
    assert result_exact.ok is True
    assert "status" in result_exact.payload

    # Answer that is a substring of the gold answer; the call must still succeed.
    partial_call = EnvToolCall(tool="answer_question", args={"answer": "answer is 42"})
    result_partial = await answer_tool(partial_call)
    assert result_partial.ok is True
87
+
88
+
89
@pytest.mark.asyncio
async def test_enron_reward_calculation():
    """Check ``EnronEngine._judge_answer`` against exact, partial, and wrong answers.

    Expected bands: exact match above 0.9, partial match above 0.5, and a
    wrong answer below 0.5.
    """
    from synth_ai.environments.examples.enron.engine import EnronEngine
    from synth_ai.environments.tasks.core import TaskInstance, Impetus, Intent

    prompt = Impetus(instructions="Test question")
    goal = Intent(
        rubric={"goal": "test"},
        gold_trajectories=None,
        gold_state_diff={},
        deterministic_eval_functions=[],
    )
    task_metadata = {
        "question": "What is the answer?",
        "gold_answer": "forty two",
        "inbox_address": "test@enron.com",
    }
    instance = TaskInstance(
        id="test",
        impetus=prompt,
        intent=goal,
        metadata=task_metadata,
        is_reproducible=False,
        initial_engine_snapshot=None,
    )

    judge = EnronEngine(instance)

    # Candidate equal to the gold answer.
    reward_exact = await judge._judge_answer("forty two")
    assert reward_exact > 0.9, f"Expected high reward for exact match, got {reward_exact}"

    # Candidate that contains the gold answer plus extra words.
    reward_partial = await judge._judge_answer("the answer is forty two")
    assert reward_partial > 0.5, f"Expected medium reward for partial match, got {reward_partial}"

    # Candidate unrelated to the gold answer.
    reward_wrong = await judge._judge_answer("completely wrong answer")
    assert reward_wrong < 0.5, f"Expected low reward for wrong answer, got {reward_wrong}"
126
+
File without changes
@@ -723,6 +723,9 @@ async def rollout_executor(request: RolloutRequest, fastapi_request: Request) ->
723
723
  },
724
724
  )
725
725
 
726
+ # Extract inference_url from policy config
727
+ inference_url = (request.policy.config or {}).get("inference_url")
728
+
726
729
  trajectory = RolloutTrajectory(
727
730
  env_id=f"math::{sample['split']}::{sample['index']}",
728
731
  policy_id=request.policy.policy_id or "policy",
@@ -732,6 +735,7 @@ async def rollout_executor(request: RolloutRequest, fastapi_request: Request) ->
732
735
  "reward": reward,
733
736
  },
734
737
  length=1,
738
+ inference_url=inference_url, # NEW: Required for trace correlation
735
739
  )
736
740
  metrics = RolloutMetrics(
737
741
  episode_returns=[reward],
@@ -800,7 +804,7 @@ def build_dataset() -> tuple[TaskDatasetRegistry, MathDataset]:
800
804
  def _base_task_info() -> TaskInfo:
801
805
  return TaskInfo(
802
806
  task={"id": "math_single_step", "name": "Math Single Step", "version": "1.0.0"},
803
- environments=["math"],
807
+ environment="math",
804
808
  action_space={
805
809
  "type": "tool_call",
806
810
  "tools": [
@@ -830,11 +834,6 @@ def _base_task_info() -> TaskInfo:
830
834
  "supports_proxy": True,
831
835
  "tool": {"name": TOOL_NAME, "parallel_tool_calls": False},
832
836
  },
833
- capabilities={
834
- "supports_rollout": True,
835
- "supports_env_lifecycle": True,
836
- "requires_api_key_header": True,
837
- },
838
837
  limits={"max_turns": 1},
839
838
  )
840
839
 
@@ -887,21 +886,31 @@ def describe_taskset(dataset: MathDataset) -> dict[str, Any]:
887
886
 
888
887
  def provide_task_instances(dataset: MathDataset, seeds: Sequence[int]) -> Iterable[TaskInfo]:
889
888
  info = _base_task_info()
889
+ base_observation = getattr(info, "observation", None)
890
+ if hasattr(base_observation, "model_dump"):
891
+ observation_template = base_observation.model_dump()
892
+ elif isinstance(base_observation, dict):
893
+ observation_template = dict(base_observation)
894
+ else:
895
+ observation_template = {}
896
+
890
897
  for seed in seeds:
891
898
  sample = dataset.sample(split=DEFAULT_SPLIT, index=seed)
892
899
  yield TaskInfo(
893
900
  task=info.task,
894
- environments=info.environments,
901
+ environment=info.environment,
895
902
  action_space=info.action_space,
896
- observation={**info.observation, "sample_index": sample["index"]},
903
+ observation={
904
+ **observation_template,
905
+ "sample_index": sample["index"],
906
+ },
897
907
  dataset={
898
- **info.dataset,
908
+ **info.dataset.model_dump(),
899
909
  "split": sample["split"],
900
910
  "index": sample["index"],
901
911
  },
902
912
  rubric=info.rubric,
903
913
  inference=info.inference,
904
- capabilities=info.capabilities,
905
914
  limits=info.limits,
906
915
  )
907
916
 
@@ -0,0 +1,2 @@
1
+ """Pokemon competitive battle task app examples."""
2
+
@@ -0,0 +1,104 @@
1
+ """Modal deployment helper for the Pokémon Showdown task app example.
2
+
3
+ This file mirrors the manual setup steps documented in the README:
4
+
5
+ - Clone `pokechamp` and install its Python dependencies.
6
+ - Clone the reference Pokémon Showdown server and install Node dependencies.
7
+ - Mount the local `synth-ai` repository so the task app code is available.
8
+
9
+ Deploy with:
10
+
11
+ ```
12
+ modal deploy examples/task_apps/pokemon_battle/modal_app.py
13
+ ```
14
+
15
+ After deployment the FastAPI service will be reachable at a URL similar to
16
+ `https://<org>--pokemon-showdown-task-app-example.modal.run`.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import subprocess
22
+ import sys
23
+ from pathlib import Path
24
+
25
+ import modal
26
+
27
# Repository root: this file lives at examples/task_apps/pokemon_battle/modal_app.py,
# so the fourth parent directory is the synth-ai checkout.
REPO_ROOT = Path(__file__).resolve().parents[3]
POKECHAMP_REPO = "https://github.com/sethkarten/pokechamp.git"
SHOWDOWN_REPO = "https://github.com/jakegrigsby/pokemon-showdown.git"

app = modal.App("pokemon-showdown-task-app-example")

# Container image: Python 3.11 plus git/Node, the web-serving dependencies, and
# shallow clones of pokechamp and the Pokémon Showdown server under /external.
#
# The clone commands intentionally have no "|| true" suffix: each clone is
# immediately followed by a step that needs the cloned files, so swallowing a
# clone failure would only move the error to the next command and hide its
# real cause. Failing at the clone keeps the build log pointing at the problem.
BASE_IMAGE = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("git", "nodejs", "npm")
    .pip_install(["uvicorn[standard]", "fastapi", "httpx", "horizons-ai"])
    .run_commands(
        [
            "mkdir -p /external",
            f"git clone --depth 1 {POKECHAMP_REPO} /external/pokechamp",
            "pip install --no-cache-dir -r /external/pokechamp/requirements.txt",
            f"git clone --depth 1 {SHOWDOWN_REPO} /external/pokemon-showdown",
            "cd /external/pokemon-showdown && npm ci --no-optional",
        ]
    )
)

# Local synth-ai checkout made available inside the container at /workspace/synth-ai.
REPO_MOUNT = modal.Mount.from_local_dir(REPO_ROOT, remote_path="/workspace/synth-ai")
49
+
50
+
51
@app.function(
    image=BASE_IMAGE,
    mounts=[REPO_MOUNT],
    timeout=900,
    memory=8192,
    cpu=4.0,
    secrets=[modal.Secret.from_name("environment-api-key")],
    keep_warm=1,
)
@modal.asgi_app()
def fastapi_app():
    """Build and return the ASGI app that serves the Synth task app on Modal.

    Steps, in order: put the mounted repo on ``sys.path``, pip-install it in
    editable mode once (guarded by a marker file), apply default environment
    variables, build the task app, and attach a ``/healthz`` route.
    """

    import os
    from fastapi import APIRouter

    mounted_repo = str(Path("/workspace/synth-ai").resolve())
    if mounted_repo not in sys.path:
        sys.path.insert(0, mounted_repo)

    # The marker file records that the editable install already ran, so it is
    # skipped on later calls that still see the same /tmp.
    install_marker = Path("/tmp/.synth_ai_editable")
    if not install_marker.exists():
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", mounted_repo])
        install_marker.touch()

    # setdefault: values already present in the environment take precedence.
    env_defaults = (
        ("PYTHONHASHSEED", "0"),
        ("OPENBLAS_NUM_THREADS", "1"),
        ("OMP_NUM_THREADS", "1"),
        ("POKECHAMP_ROOT", "/external/pokechamp"),
        ("POKEMON_SHOWDOWN_ROOT", "/external/pokemon-showdown"),
    )
    for env_name, env_value in env_defaults:
        os.environ.setdefault(env_name, env_value)

    # Imported only now, after the sys.path setup and editable install above.
    from examples.task_apps.pokemon_battle.task_app.pokemon_showdown import build_config
    from synth_ai.task.server import create_task_app

    task_app = create_task_app(build_config())

    probe_router = APIRouter()

    @probe_router.get("/healthz")
    def healthz():
        return {"status": "ok"}

    task_app.include_router(probe_router)
    return task_app
95
+
96
+
97
@app.local_entrypoint()
def main():
    """Print the deploy and serve commands for this Modal helper."""

    usage_lines = (
        "Pokémon Showdown task app Modal helper",
        "Deploy with: modal deploy examples/task_apps/pokemon_battle/modal_app.py",
        "Test locally: modal serve examples/task_apps/pokemon_battle/modal_app.py",
        "Once deployed, set TASK_APP_URL to the issued modal.run domain.",
    )
    for usage_line in usage_lines:
        print(usage_line)
@@ -0,0 +1,68 @@
1
+ # Pokemon Battle Task App
2
+
3
+ This example shows how to expose a Horizons-compatible Pokémon Showdown battle
4
+ environment through the Synth AI task app scaffolding. The adapter runs fully
5
+ locally by driving pokechamp’s deterministic `LocalSim`, so battles can be
6
+ snapshotted and restored without a live Showdown server.
7
+
8
+ ## Local setup (Track 1)
9
+
10
+ 1. Clone and install **PokeChamp** together with its `poke-env` fork:
11
+
12
+ ```bash
13
+ git clone https://github.com/sethkarten/pokechamp.git
14
+ cd pokechamp
15
+ pip install -r requirements.txt
16
+ ```
17
+
18
+ 2. Export environment variables so the task app can locate the cloned PokeChamp repo and your Pokémon Showdown checkout:
19
+
20
+ ```bash
21
+ export POKECHAMP_ROOT=/path/to/pokechamp
22
+ export POKEMON_SHOWDOWN_ROOT=/path/to/pokemon-showdown
23
+ ```
24
+
25
+ 3. Run a rollout to sanity-check the wiring:
26
+
27
+ ```bash
28
+ uv run python -m synth_ai.task.describe pokemon_showdown
29
+ uv run python -m synth_ai.task.rollout pokemon_showdown --seed 1001
30
+ ```
31
+
32
+ The adapter uses the pokechamp dataset teams bundled with the repository to
33
+ instantiate deterministic Gen 9 OU battles. You can point `POKECHAMP_ROOT` at a
34
+ fork with custom teams to experiment with other formats.
35
+
36
+ ## Modal deployment
37
+
38
+ A ready-to-use deployment helper is available at
39
+ `examples/task_apps/pokemon_battle/modal_app.py`. It mirrors the above manual
40
+ steps (cloning `pokechamp`, installing requirements, and mounting the Synth AI
41
+ repo) and additionally clones the Pokémon Showdown server and installs its Node dependencies. Deploy with:
42
+
43
+ ```bash
44
+ modal deploy examples/task_apps/pokemon_battle/modal_app.py
45
+ ```
46
+
47
+ The resulting URL can be plugged into Synth AI workflows via `TASK_APP_URL`.
48
+
49
+ ## Notes
50
+
51
+ - The dataset catalog resolves team files from the PokeChamp repo when available
52
+ (`POKECHAMP_ROOT`). If the assets are missing, `/info` marks the scenario as
53
+ unavailable.
54
+ - Snapshots serialise the entire deterministic battle state, allowing training
55
+ algorithms to branch or reset mid-match.
56
+ - Deterministic RNG seeding (Python, NumPy, PyTorch) keeps rollouts reproducible
57
+ across Modal replicas and local runs.
58
+ - The opponent policy now favours super-effective moves to provide a stronger
59
+ baseline; swap it out with a pokechamp minimax bot for ladder-level play.
60
+ - A `/healthz` endpoint is exposed in the Modal service for liveness probes.
61
+
62
+ ## Status & Next Steps
63
+
64
+ - **Observation polish**: expose richer per-turn summaries (hazards, stat boosts, tera states) and compact text strings tailored for language agents.
65
+ - **Action helpers**: surface explicit target slots/tera/mega toggles so higher formats (doubles, VGC) can plug in with minimal code.
66
+ - **Benchmark opponent**: replace the heuristic opponent with a pokechamp bot (e.g. minimax) or hook into the official PokéAgent ladder for eval parity.
67
+ - **Integration tests**: add pytest smoke tests covering `/snapshot` → `/restore` loops and multi-step rollouts.
68
+ - **Agent wiring**: ship a reference RL/LLM policy config (Synth CLI or Modal job) that exercises the adapter end-to-end and logs battle traces.
@@ -0,0 +1,6 @@
1
+ """Pokemon Showdown task app configuration."""
2
+
3
+ from .pokemon_showdown import build_config
4
+
5
+ __all__ = ["build_config"]
6
+