synth-ai 0.1.9__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266)
  1. synth_ai/__init__.py +28 -2
  2. synth_ai/core/system.py +4 -0
  3. synth_ai/environments/__init__.py +35 -0
  4. synth_ai/environments/environment/__init__.py +1 -0
  5. synth_ai/environments/environment/artifacts/__init__.py +1 -0
  6. synth_ai/environments/environment/artifacts/base.py +50 -0
  7. synth_ai/environments/environment/core.py +22 -0
  8. synth_ai/environments/environment/db/__init__.py +1 -0
  9. synth_ai/environments/environment/db/sqlite.py +45 -0
  10. synth_ai/environments/environment/registry.py +24 -0
  11. synth_ai/environments/environment/resources/sqlite.py +46 -0
  12. synth_ai/environments/environment/results.py +1 -0
  13. synth_ai/environments/environment/rewards/__init__.py +1 -0
  14. synth_ai/environments/environment/rewards/core.py +28 -0
  15. synth_ai/environments/environment/shared_engine.py +26 -0
  16. synth_ai/environments/environment/tools/__init__.py +34 -0
  17. synth_ai/environments/examples/__init__.py +1 -0
  18. synth_ai/environments/examples/crafter_classic/__init__.py +8 -0
  19. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +58 -0
  20. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
  21. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
  22. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +51 -0
  23. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +872 -0
  24. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
  25. synth_ai/environments/examples/crafter_classic/agent_demos/test_crafter_react_agent.py +1110 -0
  26. synth_ai/environments/examples/crafter_classic/config_logging.py +111 -0
  27. synth_ai/environments/examples/crafter_classic/engine.py +502 -0
  28. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +63 -0
  29. synth_ai/environments/examples/crafter_classic/engine_helpers/action_map.py +5 -0
  30. synth_ai/environments/examples/crafter_classic/engine_helpers/serialization.py +74 -0
  31. synth_ai/environments/examples/crafter_classic/environment.py +255 -0
  32. synth_ai/environments/examples/crafter_classic/taskset.py +228 -0
  33. synth_ai/environments/examples/enron/agent_demos/test_synth_react.py +535 -0
  34. synth_ai/environments/examples/enron/art_helpers/email_search_tools.py +156 -0
  35. synth_ai/environments/examples/enron/art_helpers/local_email_db.py +280 -0
  36. synth_ai/environments/examples/enron/art_helpers/types_enron.py +24 -0
  37. synth_ai/environments/examples/enron/engine.py +291 -0
  38. synth_ai/environments/examples/enron/environment.py +165 -0
  39. synth_ai/environments/examples/enron/taskset.py +112 -0
  40. synth_ai/environments/examples/enron/units/keyword_stats.py +111 -0
  41. synth_ai/environments/examples/enron/units/test_email_index.py +8 -0
  42. synth_ai/environments/examples/minigrid/__init__.py +48 -0
  43. synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
  44. synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +47 -0
  45. synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
  46. synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +220 -0
  47. synth_ai/environments/examples/minigrid/agent_demos/test_minigrid_react_agent.py +393 -0
  48. synth_ai/environments/examples/minigrid/engine.py +589 -0
  49. synth_ai/environments/examples/minigrid/environment.py +274 -0
  50. synth_ai/environments/examples/minigrid/environment_mapping.py +242 -0
  51. synth_ai/environments/examples/minigrid/puzzle_loader.py +416 -0
  52. synth_ai/environments/examples/minigrid/taskset.py +583 -0
  53. synth_ai/environments/examples/minigrid/units/test_action_behavior.py +226 -0
  54. synth_ai/environments/examples/minigrid/units/test_debug_messages.py +83 -0
  55. synth_ai/environments/examples/minigrid/units/test_exploration.py +120 -0
  56. synth_ai/environments/examples/minigrid/units/test_minigrid_engine.py +214 -0
  57. synth_ai/environments/examples/minigrid/units/test_minigrid_environment.py +238 -0
  58. synth_ai/environments/examples/minigrid/units/test_minigrid_environment_mapping.py +301 -0
  59. synth_ai/environments/examples/minigrid/units/test_minigrid_taskset.py +210 -0
  60. synth_ai/environments/examples/nethack/__init__.py +7 -0
  61. synth_ai/environments/examples/nethack/achievements.py +337 -0
  62. synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
  63. synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
  64. synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +832 -0
  65. synth_ai/environments/examples/nethack/agent_demos/test_nethack_react_agent.py +1112 -0
  66. synth_ai/environments/examples/nethack/engine.py +738 -0
  67. synth_ai/environments/examples/nethack/environment.py +255 -0
  68. synth_ai/environments/examples/nethack/helpers/__init__.py +42 -0
  69. synth_ai/environments/examples/nethack/helpers/action_mapping.py +301 -0
  70. synth_ai/environments/examples/nethack/helpers/nle_wrapper.py +401 -0
  71. synth_ai/environments/examples/nethack/helpers/observation_utils.py +433 -0
  72. synth_ai/environments/examples/nethack/helpers/recording_wrapper.py +201 -0
  73. synth_ai/environments/examples/nethack/helpers/trajectory_recorder.py +268 -0
  74. synth_ai/environments/examples/nethack/helpers/visualization/replay_viewer.py +308 -0
  75. synth_ai/environments/examples/nethack/helpers/visualization/visualizer.py +430 -0
  76. synth_ai/environments/examples/nethack/taskset.py +323 -0
  77. synth_ai/environments/examples/nethack/units/test_nethack_engine.py +277 -0
  78. synth_ai/environments/examples/nethack/units/test_nethack_environment.py +281 -0
  79. synth_ai/environments/examples/nethack/units/test_nethack_taskset.py +213 -0
  80. synth_ai/environments/examples/nethack/units/test_recording.py +307 -0
  81. synth_ai/environments/examples/red/__init__.py +7 -0
  82. synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
  83. synth_ai/environments/examples/red/agent_demos/test_synth_react.py +1471 -0
  84. synth_ai/environments/examples/red/config_logging.py +110 -0
  85. synth_ai/environments/examples/red/engine.py +693 -0
  86. synth_ai/environments/examples/red/engine_helpers/__init__.py +1 -0
  87. synth_ai/environments/examples/red/engine_helpers/memory_map.py +28 -0
  88. synth_ai/environments/examples/red/engine_helpers/reward_components.py +275 -0
  89. synth_ai/environments/examples/red/engine_helpers/reward_library/__init__.py +142 -0
  90. synth_ai/environments/examples/red/engine_helpers/reward_library/adaptive_rewards.py +56 -0
  91. synth_ai/environments/examples/red/engine_helpers/reward_library/battle_rewards.py +283 -0
  92. synth_ai/environments/examples/red/engine_helpers/reward_library/composite_rewards.py +149 -0
  93. synth_ai/environments/examples/red/engine_helpers/reward_library/economy_rewards.py +137 -0
  94. synth_ai/environments/examples/red/engine_helpers/reward_library/efficiency_rewards.py +56 -0
  95. synth_ai/environments/examples/red/engine_helpers/reward_library/exploration_rewards.py +330 -0
  96. synth_ai/environments/examples/red/engine_helpers/reward_library/novelty_rewards.py +120 -0
  97. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_rewards.py +558 -0
  98. synth_ai/environments/examples/red/engine_helpers/reward_library/pokemon_rewards.py +312 -0
  99. synth_ai/environments/examples/red/engine_helpers/reward_library/social_rewards.py +147 -0
  100. synth_ai/environments/examples/red/engine_helpers/reward_library/story_rewards.py +246 -0
  101. synth_ai/environments/examples/red/engine_helpers/screen_analysis.py +367 -0
  102. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +139 -0
  103. synth_ai/environments/examples/red/environment.py +235 -0
  104. synth_ai/environments/examples/red/taskset.py +77 -0
  105. synth_ai/environments/examples/red/test_fixes.py +125 -0
  106. synth_ai/environments/examples/red/test_fixes_mock.py +148 -0
  107. synth_ai/environments/examples/red/units/__init__.py +1 -0
  108. synth_ai/environments/examples/red/units/test_basic_functionality.py +97 -0
  109. synth_ai/environments/examples/red/units/test_button_press_requirements.py +217 -0
  110. synth_ai/environments/examples/red/units/test_engine.py +192 -0
  111. synth_ai/environments/examples/red/units/test_environment.py +455 -0
  112. synth_ai/environments/examples/red/units/test_exploration_strategy.py +227 -0
  113. synth_ai/environments/examples/red/units/test_integration.py +217 -0
  114. synth_ai/environments/examples/red/units/test_memory_extraction.py +111 -0
  115. synth_ai/environments/examples/red/units/test_menu_bug_reproduction.py +1100 -0
  116. synth_ai/environments/examples/red/units/test_movement_debug.py +255 -0
  117. synth_ai/environments/examples/red/units/test_pokemon_mcts_debug.py +163 -0
  118. synth_ai/environments/examples/red/units/test_pokemon_mcts_verbose.py +117 -0
  119. synth_ai/environments/examples/red/units/test_red_basic.py +145 -0
  120. synth_ai/environments/examples/red/units/test_red_comprehensive.py +323 -0
  121. synth_ai/environments/examples/red/units/test_retry_movement.py +195 -0
  122. synth_ai/environments/examples/red/units/test_reward_components.py +186 -0
  123. synth_ai/environments/examples/red/units/test_rom_integration.py +260 -0
  124. synth_ai/environments/examples/red/units/test_taskset.py +116 -0
  125. synth_ai/environments/examples/red/units/test_tree.py +448 -0
  126. synth_ai/environments/examples/sokoban/__init__.py +1 -0
  127. synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +900 -0
  128. synth_ai/environments/examples/sokoban/agent_demos/test_dspy_react.py +1 -0
  129. synth_ai/environments/examples/sokoban/agent_demos/test_sokoban_react_agent.py +498 -0
  130. synth_ai/environments/examples/sokoban/agent_demos/test_synth_lats.py +1 -0
  131. synth_ai/environments/examples/sokoban/agent_demos/test_synth_react_locally.py +748 -0
  132. synth_ai/environments/examples/sokoban/agent_demos/test_synth_react_service.py +296 -0
  133. synth_ai/environments/examples/sokoban/engine.py +675 -0
  134. synth_ai/environments/examples/sokoban/engine_helpers/__init__.py +1 -0
  135. synth_ai/environments/examples/sokoban/engine_helpers/room_utils.py +656 -0
  136. synth_ai/environments/examples/sokoban/engine_helpers/vendored/__init__.py +17 -0
  137. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/__init__.py +3 -0
  138. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/boxoban_env.py +129 -0
  139. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/render_utils.py +370 -0
  140. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/room_utils.py +331 -0
  141. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env.py +305 -0
  142. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_fixed_targets.py +66 -0
  143. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_pull.py +114 -0
  144. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_two_player.py +122 -0
  145. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_variations.py +394 -0
  146. synth_ai/environments/examples/sokoban/environment.py +228 -0
  147. synth_ai/environments/examples/sokoban/generate_verified_puzzles.py +438 -0
  148. synth_ai/environments/examples/sokoban/puzzle_loader.py +311 -0
  149. synth_ai/environments/examples/sokoban/taskset.py +425 -0
  150. synth_ai/environments/examples/sokoban/units/astar_common.py +94 -0
  151. synth_ai/environments/examples/sokoban/units/test_building_task_set.py +49 -0
  152. synth_ai/environments/examples/sokoban/units/test_false_positive.py +120 -0
  153. synth_ai/environments/examples/sokoban/units/test_simple_run_through_environment.py +119 -0
  154. synth_ai/environments/examples/sokoban/units/test_sokoban_environment.py +98 -0
  155. synth_ai/environments/examples/sokoban/units/test_tree.py +364 -0
  156. synth_ai/environments/examples/tictactoe/__init__.py +1 -0
  157. synth_ai/environments/examples/tictactoe/agent_demos/test_synth_react.py +266 -0
  158. synth_ai/environments/examples/tictactoe/agent_demos/test_tictactoe_react_agent.py +470 -0
  159. synth_ai/environments/examples/tictactoe/engine.py +368 -0
  160. synth_ai/environments/examples/tictactoe/environment.py +239 -0
  161. synth_ai/environments/examples/tictactoe/taskset.py +214 -0
  162. synth_ai/environments/examples/tictactoe/units/test_tictactoe_engine.py +393 -0
  163. synth_ai/environments/examples/tictactoe/units/test_tictactoe_environment.py +493 -0
  164. synth_ai/environments/examples/tictactoe/units/test_tictactoe_taskset.py +191 -0
  165. synth_ai/environments/examples/verilog/__init__.py +10 -0
  166. synth_ai/environments/examples/verilog/agent_demos/test_synth_react.py +520 -0
  167. synth_ai/environments/examples/verilog/engine.py +328 -0
  168. synth_ai/environments/examples/verilog/environment.py +349 -0
  169. synth_ai/environments/examples/verilog/taskset.py +418 -0
  170. synth_ai/environments/examples/verilog/units/test_verilog_engine.py +466 -0
  171. synth_ai/environments/examples/verilog/units/test_verilog_environment.py +585 -0
  172. synth_ai/environments/examples/verilog/units/test_verilog_integration.py +383 -0
  173. synth_ai/environments/examples/verilog/units/test_verilog_taskset.py +457 -0
  174. synth_ai/environments/reproducibility/core.py +42 -0
  175. synth_ai/environments/reproducibility/tree.py +364 -0
  176. synth_ai/environments/service/app.py +78 -0
  177. synth_ai/environments/service/core_routes.py +775 -0
  178. synth_ai/environments/service/external_registry.py +57 -0
  179. synth_ai/environments/service/registry.py +9 -0
  180. synth_ai/environments/stateful/__init__.py +1 -0
  181. synth_ai/environments/stateful/core.py +28 -0
  182. synth_ai/environments/stateful/engine.py +21 -0
  183. synth_ai/environments/stateful/state.py +7 -0
  184. synth_ai/environments/tasks/api.py +19 -0
  185. synth_ai/environments/tasks/core.py +78 -0
  186. synth_ai/environments/tasks/filters.py +39 -0
  187. synth_ai/environments/tasks/utils.py +89 -0
  188. synth_ai/environments/v0_observability/history.py +3 -0
  189. synth_ai/environments/v0_observability/log.py +2 -0
  190. synth_ai/lm/caching/constants.py +1 -0
  191. synth_ai/{zyk/lms → lm}/caching/ephemeral.py +4 -8
  192. synth_ai/{zyk/lms → lm}/caching/handler.py +15 -15
  193. synth_ai/{zyk/lms → lm}/caching/initialize.py +2 -4
  194. synth_ai/{zyk/lms → lm}/caching/persistent.py +4 -10
  195. synth_ai/{zyk/lms → lm}/config.py +2 -1
  196. synth_ai/{zyk/lms → lm}/constants.py +2 -2
  197. synth_ai/{zyk/lms → lm}/core/all.py +10 -10
  198. synth_ai/{zyk/lms → lm}/core/main.py +57 -33
  199. synth_ai/{zyk/lms → lm}/core/vendor_clients.py +12 -10
  200. synth_ai/lm/cost/monitor.py +1 -0
  201. synth_ai/lm/cost/statefulness.py +1 -0
  202. synth_ai/lm/provider_support/__init__.py +8 -0
  203. synth_ai/lm/provider_support/anthropic.py +945 -0
  204. synth_ai/lm/provider_support/openai.py +1115 -0
  205. synth_ai/lm/provider_support/suppress_logging.py +31 -0
  206. synth_ai/{zyk/lms → lm}/structured_outputs/handler.py +58 -80
  207. synth_ai/{zyk/lms → lm}/structured_outputs/inject.py +6 -20
  208. synth_ai/{zyk/lms → lm}/structured_outputs/rehabilitate.py +6 -12
  209. synth_ai/{zyk/lms → lm}/vendors/core/anthropic_api.py +21 -30
  210. synth_ai/{zyk/lms → lm}/vendors/core/gemini_api.py +37 -32
  211. synth_ai/{zyk/lms → lm}/vendors/core/mistral_api.py +19 -28
  212. synth_ai/{zyk/lms → lm}/vendors/core/openai_api.py +26 -36
  213. synth_ai/{zyk/lms → lm}/vendors/openai_standard.py +29 -33
  214. synth_ai/{zyk/lms → lm}/vendors/retries.py +1 -1
  215. synth_ai/lm/vendors/supported/__init__.py +0 -0
  216. synth_ai/{zyk/lms → lm}/vendors/supported/custom_endpoint.py +131 -118
  217. synth_ai/{zyk/lms → lm}/vendors/supported/deepseek.py +4 -8
  218. synth_ai/{zyk/lms → lm}/vendors/supported/grok.py +6 -8
  219. synth_ai/{zyk/lms → lm}/vendors/supported/groq.py +1 -1
  220. synth_ai/{zyk/lms → lm}/vendors/supported/ollama.py +2 -2
  221. synth_ai/{zyk/lms → lm}/vendors/supported/openrouter.py +18 -16
  222. synth_ai/{zyk/lms → lm}/vendors/supported/together.py +1 -1
  223. synth_ai/tracing/__init__.py +0 -0
  224. synth_ai/tracing/abstractions.py +224 -0
  225. synth_ai/tracing/base_client.py +91 -0
  226. synth_ai/tracing/client_manager.py +131 -0
  227. synth_ai/tracing/config.py +140 -0
  228. synth_ai/tracing/context.py +146 -0
  229. synth_ai/tracing/decorators.py +679 -0
  230. synth_ai/tracing/events/__init__.py +0 -0
  231. synth_ai/tracing/events/manage.py +147 -0
  232. synth_ai/tracing/events/scope.py +86 -0
  233. synth_ai/tracing/events/store.py +227 -0
  234. synth_ai/tracing/immediate_client.py +152 -0
  235. synth_ai/tracing/local.py +18 -0
  236. synth_ai/tracing/log_client_base.py +74 -0
  237. synth_ai/tracing/retry_queue.py +187 -0
  238. synth_ai/tracing/trackers.py +515 -0
  239. synth_ai/tracing/upload.py +504 -0
  240. synth_ai/tracing/utils.py +9 -0
  241. synth_ai/zyk/__init__.py +28 -2
  242. synth_ai-0.2.1.dev0.dist-info/METADATA +349 -0
  243. synth_ai-0.2.1.dev0.dist-info/RECORD +261 -0
  244. synth_ai/zyk/lms/caching/constants.py +0 -1
  245. synth_ai/zyk/lms/cost/monitor.py +0 -1
  246. synth_ai/zyk/lms/cost/statefulness.py +0 -1
  247. synth_ai-0.1.9.dist-info/METADATA +0 -37
  248. synth_ai-0.1.9.dist-info/RECORD +0 -50
  249. /synth_ai/{zyk/lms/__init__.py → environments/reproducibility/helpers.py} +0 -0
  250. /synth_ai/{zyk/lms/caching → lm}/__init__.py +0 -0
  251. /synth_ai/{zyk/lms/core → lm/caching}/__init__.py +0 -0
  252. /synth_ai/{zyk/lms → lm}/caching/dbs.py +0 -0
  253. /synth_ai/{zyk/lms/cost → lm/core}/__init__.py +0 -0
  254. /synth_ai/{zyk/lms → lm}/core/exceptions.py +0 -0
  255. /synth_ai/{zyk/lms/structured_outputs → lm/cost}/__init__.py +0 -0
  256. /synth_ai/{zyk/lms/vendors → lm/structured_outputs}/__init__.py +0 -0
  257. /synth_ai/{zyk/lms → lm}/tools/__init__.py +0 -0
  258. /synth_ai/{zyk/lms → lm}/tools/base.py +0 -0
  259. /synth_ai/{zyk/lms/vendors/core → lm/vendors}/__init__.py +0 -0
  260. /synth_ai/{zyk/lms → lm}/vendors/base.py +0 -0
  261. /synth_ai/{zyk/lms/vendors/local → lm/vendors/core}/__init__.py +0 -0
  262. /synth_ai/{zyk/lms/vendors/supported → lm/vendors/local}/__init__.py +0 -0
  263. /synth_ai/{zyk/lms → lm}/vendors/local/ollama.py +0 -0
  264. {synth_ai-0.1.9.dist-info → synth_ai-0.2.1.dev0.dist-info}/WHEEL +0 -0
  265. {synth_ai-0.1.9.dist-info → synth_ai-0.2.1.dev0.dist-info}/licenses/LICENSE +0 -0
  266. {synth_ai-0.1.9.dist-info → synth_ai-0.2.1.dev0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,296 @@
1
+ import pytest
2
+ import asyncio
3
+ import json
4
+
5
+ from httpx import AsyncClient
6
+
7
+ from synth_ai.environments.examples.sokoban.agent_demos.test_synth_react_locally import (
8
+ ReActAgent,
9
+ SIMPLE_SNAPSHOT,
10
+ )
11
+ from synth_sdk.tracing.abstractions import RewardSignal, Dataset, TrainingQuestion
12
+ from synth_ai.zyk import LM
13
+
14
+ # Demo: drive Sokoban via FastAPI service endpoints
15
+
16
+
17
+ # HTTP-mode formatting for service-based observations
18
def format_obs_http(public: dict, private: dict, total_boxes: int) -> str:
    """Render a service observation as the text prompt handed to the agent.

    Args:
        public: The ``"public"`` part of a reset/step response. The keys read
            are ``room_text`` (or ``room_text_final`` as a fallback),
            ``boxes_on_target``, ``steps_taken`` and ``max_steps``.
        private: The ``"private"`` part of the same response. The keys read
            are ``terminated`` and ``reward_last``.
        total_boxes: Number of boxes in the puzzle, used as the denominator
            of the "Boxes on Target" line.

    Returns:
        A five-line string: the room text followed by one status line each
        for boxes, steps, termination flag and last reward.
    """
    # Fall back to the final-room text when the live one is missing or empty.
    # The trailing `or ""` keeps an explicit None from being rendered as the
    # literal text "None" at the top of the prompt.
    room_text = public.get("room_text") or public.get("room_text_final") or ""
    return (
        f"{room_text}\n"
        f"Boxes on Target: {public.get('boxes_on_target', 0)} / {total_boxes}\n"
        f"Steps Taken: {public.get('steps_taken', 0)} / {public.get('max_steps', 0)}\n"
        f"Terminated: {private.get('terminated')}\n"
        f"Last Reward: {private.get('reward_last', 0)}"
    )
27
+
28
+
29
@pytest.mark.anyio
async def test_react_service_sokoban():
    """Drive one Sokoban episode through the HTTP service and expect a solve.

    The test creates an instance from ``SIMPLE_SNAPSHOT``, resets it, lets a
    ``ReActAgent`` pick actions until the agent stops, the environment reports
    termination, or ``agent.max_turns`` is exhausted, and then asserts that
    the episode terminated with every box on a target.
    """
    # The client targets http://localhost:8000; no app/transport is passed, so
    # the environment service must already be running there.
    async with AsyncClient(base_url="http://localhost:8000") as client:
        # 1) Health check
        health = await client.get("/env/health")
        assert health.status_code == 200
        supported = health.json()["supported_environments"]
        assert "Sokoban" in supported

        # 2) Create a Sokoban instance from a simple snapshot
        resp = await client.post(
            "/env/Sokoban/create",
            json={"initial_state": SIMPLE_SNAPSHOT},
        )
        assert resp.status_code == 200
        instance_id = resp.json()["instance_id"]

        # 3) Reset to get the initial observation
        reset_resp = await client.post(f"/env/Sokoban/{instance_id}/reset")
        assert reset_resp.status_code == 200
        obs = reset_resp.json()
        private = obs["private"]
        public = obs["public"]

        # 4) Instantiate the LLM and the ReAct agent
        llm = LM(model_name="gpt-4.1", formatting_model_name="gpt-4.1", temperature=0.0)
        agent = ReActAgent(llm)

        # Box count comes from the initial snapshot (0 when the key is absent)
        total_boxes = SIMPLE_SNAPSHOT.get("num_boxes", 0)

        # 5) Run the episode loop via service step calls
        prompt = format_obs_http(public, private, total_boxes)
        for _ in range(agent.max_turns):
            decision = await agent.decide(prompt)
            # run_service_episode in this file reads `.action_int` from the
            # value returned by decide(); accept either that record shape or
            # a bare integer so both call sites agree.
            action_idx = getattr(decision, "action_int", decision)
            # -1 is the agent's "stop" signal
            if action_idx == -1:
                break

            # POST a step carrying a single EnvToolCall as JSON
            step_resp = await client.post(
                f"/env/Sokoban/{instance_id}/step",
                json=[{"tool": "interact", "args": {"action": action_idx}}],
            )
            assert step_resp.status_code == 200
            obs = step_resp.json()
            private = obs["private"]
            public = obs["public"]

            # Refresh the prompt and stop once the environment terminates
            prompt = format_obs_http(public, private, total_boxes)
            if private.get("terminated"):
                break

        # 6) Final checkpoint (optional)
        ckpt = await client.get(f"/env/Sokoban/{instance_id}/checkpoint")
        assert ckpt.status_code == 200
        # Not asserted on; reading it only proves the payload parses as JSON.
        snapshot = ckpt.json().get("snapshot")

        # 7) Assertions: ensure solved state
        assert private.get("terminated") is True
        assert public.get("boxes_on_target") == total_boxes

        # 8) Optionally upload or record a dataset
        dataset = Dataset(
            questions=[TrainingQuestion(id="sokoban_ep", intent="solve", criteria="solved")],
            reward_signals=[
                RewardSignal(
                    question_id="sokoban_ep",
                    system_instance_id=agent.system_instance_id,
                    reward=1,
                    annotation=json.dumps({"agent_history": agent.history}),
                )
            ],
        )
        # upload(dataset=dataset)  # Uncomment to send logs
106
+
107
+
108
+ # --- single-episode runner for service-based Sokoban ---
109
async def run_service_episode(client, agent, snapshot, total_boxes):
    """Run a single Sokoban episode against the service and report success.

    Args:
        client: HTTP client exposing an awaitable ``post(url, json=...)``
            (an ``httpx.AsyncClient`` at the call sites in this file).
        agent: Agent exposing ``max_turns`` and an awaitable ``decide(prompt)``.
        snapshot: Initial state sent to ``/env/Sokoban/create``.
        total_boxes: Box count used when formatting the prompt.

    Returns:
        ``True`` when the last observation's ``private["terminated"]`` is
        truthy, ``False`` otherwise.

    Raises:
        Exception: If a step request returns a non-200 status.
    """
    # Create a new instance
    create_resp = await client.post(
        "/env/Sokoban/create",
        json={"initial_state": snapshot},
    )
    instance_id = create_resp.json()["instance_id"]

    try:
        # Reset the environment to obtain the first observation
        reset_resp = await client.post(f"/env/Sokoban/{instance_id}/reset")
        obs = reset_resp.json()
        private = obs["private"]
        public = obs["public"]
        prompt = format_obs_http(public, private, total_boxes)

        for _ in range(agent.max_turns):
            decision = await agent.decide(prompt)
            # Accept either a decision record carrying `.action_int` or a
            # bare integer action; the other caller of decide() in this file
            # treats the return value as the action itself.
            action_idx = getattr(decision, "action_int", decision)
            if action_idx == -1:
                break

            # Step via the service
            step_resp = await client.post(
                f"/env/Sokoban/{instance_id}/step",
                json=[{"tool": "interact", "args": {"action": action_idx}}],
            )
            if step_resp.status_code != 200:
                print(f"ERROR in STEP: Status {step_resp.status_code}, Response: {step_resp.text}")
                raise Exception(f"Step API call failed with status {step_resp.status_code}")

            obs = step_resp.json()
            private = obs["private"]
            public = obs["public"]
            prompt = format_obs_http(public, private, total_boxes)
            if private.get("terminated"):
                break

        return bool(private.get("terminated"))
    finally:
        # Always release the service-side instance, including when a step
        # fails part-way through the episode.
        await client.post(f"/env/Sokoban/{instance_id}/terminate")
147
+
148
+
149
+ # --- batch evaluation helper for service-based Sokoban ---
150
async def eval_react_service_sokoban(
    model_name: str = "gpt-4.1-nano",
    formatting_model_name: str = "gpt-4.1-nano",
    modes: list[str] = ["ultra-easy", "easy", "medium"],
):
    """Evaluate a ReAct agent on generated Sokoban puzzles via the service.

    For every requested difficulty, up to three 5x5 single-box rooms are
    generated whose shortest solution has exactly the target length. The
    episodes are run concurrently against ``http://localhost:8000`` and a
    per-difficulty success table is printed.

    Args:
        model_name: Model passed to ``LM`` for the agent.
        formatting_model_name: Formatting model passed to ``LM``.
        modes: Difficulty labels to evaluate. Labels not in the internal
            difficulty map are skipped with a warning. The default list is
            only read, never mutated.

    Returns:
        ``None``. Results are printed. Returns early when no valid mode was
        requested.
    """
    # The rest of this file imports from the installed `synth_ai` package, so
    # try that path first; keep the legacy repo-relative path as a fallback.
    try:
        from synth_ai.environments.examples.sokoban.engine_helpers.room_utils import (
            generate_room,
            get_shortest_action_path,
        )
    except ImportError:
        from examples.sokoban.engine_helpers.room_utils import (
            generate_room,
            get_shortest_action_path,
        )
    from tabulate import tabulate

    def make_agent():
        """Build a fresh LM-backed agent so no two episodes share state."""
        return ReActAgent(
            LM(
                model_name=model_name,
                formatting_model_name=formatting_model_name,
                temperature=0.0,
            )
        )

    # Only used for the report header printed at the end.
    llm = LM(
        model_name=model_name,
        formatting_model_name=formatting_model_name,
        temperature=0.0,
    )
    agent = ReActAgent(llm)
    total_boxes = 1
    instances_per_mode = 3
    # Upper bound on seeds tried per mode, so an unreachable target length
    # cannot spin forever.
    max_seed_attempts = 1000

    difficulty_to_length_map = {
        "ultra-easy": 1,
        "easy": 3,
        "medium": 5,
        "hard": 7,
        "ultra-hard": 10,
    }

    configs_for_modes = []
    for mode_label in modes:
        if mode_label in difficulty_to_length_map:
            configs_for_modes.append((mode_label, difficulty_to_length_map[mode_label]))
        else:
            print(f"Warning: Mode '{mode_label}' not found in difficulty_to_length_map. Skipping.")

    if not configs_for_modes:
        print("No valid modes selected for evaluation. Exiting.")
        return

    async def evaluate_single_mode(
        client,
        mode_label: str,
        target_len: int,
        boxes_for_mode: int,
    ) -> dict:
        """Generate instances for a mode, run them in parallel, summarise."""
        print(
            f"  Starting evaluation for mode: {mode_label} (target_len: {target_len}) for model {model_name}..."
        )
        snapshots = []
        for seed in range(max_seed_attempts):
            if len(snapshots) >= instances_per_mode:
                break
            room_struct, room_state, _, _ = generate_room(
                dim=(5, 5),
                initial_seed=seed,
                num_boxes=1,
                search_depth=max(10, target_len + 2),
            )
            path = get_shortest_action_path(room_struct, room_state, MAX_DEPTH=20)
            # Keep only rooms whose optimal solution has the wanted length.
            if len(path) == target_len:
                snapshots.append(
                    {
                        "dim_room": (5, 5),
                        "room_fixed": room_struct.tolist(),
                        "room_state": room_state.tolist(),
                        "boxes_on_target": 0,
                        "max_steps": 20,
                        "num_boxes": 1,
                    }
                )

        if len(snapshots) < instances_per_mode:
            print(
                f"  Warning: only {len(snapshots)}/{instances_per_mode} instances found for mode "
                f"'{mode_label}' within {max_seed_attempts} seeds."
            )

        # One agent per episode: the agent keeps a `history`, and the
        # episodes below run concurrently, so sharing one would interleave it.
        episode_tasks = [
            run_service_episode(client, make_agent(), snap, boxes_for_mode)
            for snap in snapshots
        ]
        solved_statuses = await asyncio.gather(*episode_tasks)
        num_solved = sum(solved_statuses)
        num_instances = len(snapshots)
        rate = num_solved / num_instances if num_instances > 0 else 0.0
        print(
            f"    Completed mode: {mode_label} for model {model_name} - Solved: {num_solved}/{num_instances} ({rate:.0%})"
        )
        return {
            "Difficulty": mode_label,
            "Solved": f"{num_solved}/{num_instances}",
            "Success Rate": f"{rate:.0%}",
        }

    async with AsyncClient(base_url="http://localhost:8000") as client:
        mode_evaluation_tasks = [
            evaluate_single_mode(client, mode_label, target_len, total_boxes)
            for mode_label, target_len in configs_for_modes
        ]
        # gather() returns results in the order the awaitables were passed,
        # so rows come out in the same order as `configs_for_modes`.
        all_mode_results_list = await asyncio.gather(*mode_evaluation_tasks)

    table_rows = [
        [result["Difficulty"], result["Solved"], result["Success Rate"]]
        for result in all_mode_results_list
    ]

    print(f"\nModel: {llm.model_name}, System: {agent.system_name}")
    print(
        tabulate(
            table_rows,
            headers=["Difficulty", "Solved", "Success Rate"],
            tablefmt="github",
        )
    )
285
+
286
+
287
if __name__ == "__main__":
    # Script entry point: evaluate gpt-4.1-mini on the two easiest modes.
    # `asyncio` is already imported at the top of the module.
    _cli_kwargs = {
        "model_name": "gpt-4.1-mini",
        "formatting_model_name": "gpt-4.1-mini",
        "modes": ["ultra-easy", "easy"],
    }
    asyncio.run(eval_react_service_sokoban(**_cli_kwargs))