synth-ai 0.2.0__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266) hide show
  1. synth_ai/__init__.py +28 -2
  2. synth_ai/core/system.py +4 -0
  3. synth_ai/environments/__init__.py +35 -0
  4. synth_ai/environments/environment/__init__.py +1 -0
  5. synth_ai/environments/environment/artifacts/__init__.py +1 -0
  6. synth_ai/environments/environment/artifacts/base.py +50 -0
  7. synth_ai/environments/environment/core.py +22 -0
  8. synth_ai/environments/environment/db/__init__.py +1 -0
  9. synth_ai/environments/environment/db/sqlite.py +45 -0
  10. synth_ai/environments/environment/registry.py +24 -0
  11. synth_ai/environments/environment/resources/sqlite.py +46 -0
  12. synth_ai/environments/environment/results.py +1 -0
  13. synth_ai/environments/environment/rewards/__init__.py +1 -0
  14. synth_ai/environments/environment/rewards/core.py +28 -0
  15. synth_ai/environments/environment/shared_engine.py +26 -0
  16. synth_ai/environments/environment/tools/__init__.py +34 -0
  17. synth_ai/environments/examples/__init__.py +1 -0
  18. synth_ai/environments/examples/crafter_classic/__init__.py +8 -0
  19. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +58 -0
  20. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
  21. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
  22. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +51 -0
  23. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +872 -0
  24. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
  25. synth_ai/environments/examples/crafter_classic/agent_demos/test_crafter_react_agent.py +1110 -0
  26. synth_ai/environments/examples/crafter_classic/config_logging.py +111 -0
  27. synth_ai/environments/examples/crafter_classic/engine.py +502 -0
  28. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +63 -0
  29. synth_ai/environments/examples/crafter_classic/engine_helpers/action_map.py +5 -0
  30. synth_ai/environments/examples/crafter_classic/engine_helpers/serialization.py +74 -0
  31. synth_ai/environments/examples/crafter_classic/environment.py +255 -0
  32. synth_ai/environments/examples/crafter_classic/taskset.py +228 -0
  33. synth_ai/environments/examples/enron/agent_demos/test_synth_react.py +535 -0
  34. synth_ai/environments/examples/enron/art_helpers/email_search_tools.py +156 -0
  35. synth_ai/environments/examples/enron/art_helpers/local_email_db.py +280 -0
  36. synth_ai/environments/examples/enron/art_helpers/types_enron.py +24 -0
  37. synth_ai/environments/examples/enron/engine.py +291 -0
  38. synth_ai/environments/examples/enron/environment.py +165 -0
  39. synth_ai/environments/examples/enron/taskset.py +112 -0
  40. synth_ai/environments/examples/enron/units/keyword_stats.py +111 -0
  41. synth_ai/environments/examples/enron/units/test_email_index.py +8 -0
  42. synth_ai/environments/examples/minigrid/__init__.py +48 -0
  43. synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
  44. synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +47 -0
  45. synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
  46. synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +220 -0
  47. synth_ai/environments/examples/minigrid/agent_demos/test_minigrid_react_agent.py +393 -0
  48. synth_ai/environments/examples/minigrid/engine.py +589 -0
  49. synth_ai/environments/examples/minigrid/environment.py +274 -0
  50. synth_ai/environments/examples/minigrid/environment_mapping.py +242 -0
  51. synth_ai/environments/examples/minigrid/puzzle_loader.py +416 -0
  52. synth_ai/environments/examples/minigrid/taskset.py +583 -0
  53. synth_ai/environments/examples/minigrid/units/test_action_behavior.py +226 -0
  54. synth_ai/environments/examples/minigrid/units/test_debug_messages.py +83 -0
  55. synth_ai/environments/examples/minigrid/units/test_exploration.py +120 -0
  56. synth_ai/environments/examples/minigrid/units/test_minigrid_engine.py +214 -0
  57. synth_ai/environments/examples/minigrid/units/test_minigrid_environment.py +238 -0
  58. synth_ai/environments/examples/minigrid/units/test_minigrid_environment_mapping.py +301 -0
  59. synth_ai/environments/examples/minigrid/units/test_minigrid_taskset.py +210 -0
  60. synth_ai/environments/examples/nethack/__init__.py +7 -0
  61. synth_ai/environments/examples/nethack/achievements.py +337 -0
  62. synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
  63. synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
  64. synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +832 -0
  65. synth_ai/environments/examples/nethack/agent_demos/test_nethack_react_agent.py +1112 -0
  66. synth_ai/environments/examples/nethack/engine.py +738 -0
  67. synth_ai/environments/examples/nethack/environment.py +255 -0
  68. synth_ai/environments/examples/nethack/helpers/__init__.py +42 -0
  69. synth_ai/environments/examples/nethack/helpers/action_mapping.py +301 -0
  70. synth_ai/environments/examples/nethack/helpers/nle_wrapper.py +401 -0
  71. synth_ai/environments/examples/nethack/helpers/observation_utils.py +433 -0
  72. synth_ai/environments/examples/nethack/helpers/recording_wrapper.py +201 -0
  73. synth_ai/environments/examples/nethack/helpers/trajectory_recorder.py +268 -0
  74. synth_ai/environments/examples/nethack/helpers/visualization/replay_viewer.py +308 -0
  75. synth_ai/environments/examples/nethack/helpers/visualization/visualizer.py +430 -0
  76. synth_ai/environments/examples/nethack/taskset.py +323 -0
  77. synth_ai/environments/examples/nethack/units/test_nethack_engine.py +277 -0
  78. synth_ai/environments/examples/nethack/units/test_nethack_environment.py +281 -0
  79. synth_ai/environments/examples/nethack/units/test_nethack_taskset.py +213 -0
  80. synth_ai/environments/examples/nethack/units/test_recording.py +307 -0
  81. synth_ai/environments/examples/red/__init__.py +7 -0
  82. synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
  83. synth_ai/environments/examples/red/agent_demos/test_synth_react.py +1471 -0
  84. synth_ai/environments/examples/red/config_logging.py +110 -0
  85. synth_ai/environments/examples/red/engine.py +693 -0
  86. synth_ai/environments/examples/red/engine_helpers/__init__.py +1 -0
  87. synth_ai/environments/examples/red/engine_helpers/memory_map.py +28 -0
  88. synth_ai/environments/examples/red/engine_helpers/reward_components.py +275 -0
  89. synth_ai/environments/examples/red/engine_helpers/reward_library/__init__.py +142 -0
  90. synth_ai/environments/examples/red/engine_helpers/reward_library/adaptive_rewards.py +56 -0
  91. synth_ai/environments/examples/red/engine_helpers/reward_library/battle_rewards.py +283 -0
  92. synth_ai/environments/examples/red/engine_helpers/reward_library/composite_rewards.py +149 -0
  93. synth_ai/environments/examples/red/engine_helpers/reward_library/economy_rewards.py +137 -0
  94. synth_ai/environments/examples/red/engine_helpers/reward_library/efficiency_rewards.py +56 -0
  95. synth_ai/environments/examples/red/engine_helpers/reward_library/exploration_rewards.py +330 -0
  96. synth_ai/environments/examples/red/engine_helpers/reward_library/novelty_rewards.py +120 -0
  97. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_rewards.py +558 -0
  98. synth_ai/environments/examples/red/engine_helpers/reward_library/pokemon_rewards.py +312 -0
  99. synth_ai/environments/examples/red/engine_helpers/reward_library/social_rewards.py +147 -0
  100. synth_ai/environments/examples/red/engine_helpers/reward_library/story_rewards.py +246 -0
  101. synth_ai/environments/examples/red/engine_helpers/screen_analysis.py +367 -0
  102. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +139 -0
  103. synth_ai/environments/examples/red/environment.py +235 -0
  104. synth_ai/environments/examples/red/taskset.py +77 -0
  105. synth_ai/environments/examples/red/test_fixes.py +125 -0
  106. synth_ai/environments/examples/red/test_fixes_mock.py +148 -0
  107. synth_ai/environments/examples/red/units/__init__.py +1 -0
  108. synth_ai/environments/examples/red/units/test_basic_functionality.py +97 -0
  109. synth_ai/environments/examples/red/units/test_button_press_requirements.py +217 -0
  110. synth_ai/environments/examples/red/units/test_engine.py +192 -0
  111. synth_ai/environments/examples/red/units/test_environment.py +455 -0
  112. synth_ai/environments/examples/red/units/test_exploration_strategy.py +227 -0
  113. synth_ai/environments/examples/red/units/test_integration.py +217 -0
  114. synth_ai/environments/examples/red/units/test_memory_extraction.py +111 -0
  115. synth_ai/environments/examples/red/units/test_menu_bug_reproduction.py +1100 -0
  116. synth_ai/environments/examples/red/units/test_movement_debug.py +255 -0
  117. synth_ai/environments/examples/red/units/test_pokemon_mcts_debug.py +163 -0
  118. synth_ai/environments/examples/red/units/test_pokemon_mcts_verbose.py +117 -0
  119. synth_ai/environments/examples/red/units/test_red_basic.py +145 -0
  120. synth_ai/environments/examples/red/units/test_red_comprehensive.py +323 -0
  121. synth_ai/environments/examples/red/units/test_retry_movement.py +195 -0
  122. synth_ai/environments/examples/red/units/test_reward_components.py +186 -0
  123. synth_ai/environments/examples/red/units/test_rom_integration.py +260 -0
  124. synth_ai/environments/examples/red/units/test_taskset.py +116 -0
  125. synth_ai/environments/examples/red/units/test_tree.py +448 -0
  126. synth_ai/environments/examples/sokoban/__init__.py +1 -0
  127. synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +900 -0
  128. synth_ai/environments/examples/sokoban/agent_demos/test_dspy_react.py +1 -0
  129. synth_ai/environments/examples/sokoban/agent_demos/test_sokoban_react_agent.py +498 -0
  130. synth_ai/environments/examples/sokoban/agent_demos/test_synth_lats.py +1 -0
  131. synth_ai/environments/examples/sokoban/agent_demos/test_synth_react_locally.py +748 -0
  132. synth_ai/environments/examples/sokoban/agent_demos/test_synth_react_service.py +296 -0
  133. synth_ai/environments/examples/sokoban/engine.py +675 -0
  134. synth_ai/environments/examples/sokoban/engine_helpers/__init__.py +1 -0
  135. synth_ai/environments/examples/sokoban/engine_helpers/room_utils.py +656 -0
  136. synth_ai/environments/examples/sokoban/engine_helpers/vendored/__init__.py +17 -0
  137. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/__init__.py +3 -0
  138. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/boxoban_env.py +129 -0
  139. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/render_utils.py +370 -0
  140. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/room_utils.py +331 -0
  141. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env.py +305 -0
  142. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_fixed_targets.py +66 -0
  143. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_pull.py +114 -0
  144. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_two_player.py +122 -0
  145. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_variations.py +394 -0
  146. synth_ai/environments/examples/sokoban/environment.py +228 -0
  147. synth_ai/environments/examples/sokoban/generate_verified_puzzles.py +438 -0
  148. synth_ai/environments/examples/sokoban/puzzle_loader.py +311 -0
  149. synth_ai/environments/examples/sokoban/taskset.py +425 -0
  150. synth_ai/environments/examples/sokoban/units/astar_common.py +94 -0
  151. synth_ai/environments/examples/sokoban/units/test_building_task_set.py +49 -0
  152. synth_ai/environments/examples/sokoban/units/test_false_positive.py +120 -0
  153. synth_ai/environments/examples/sokoban/units/test_simple_run_through_environment.py +119 -0
  154. synth_ai/environments/examples/sokoban/units/test_sokoban_environment.py +98 -0
  155. synth_ai/environments/examples/sokoban/units/test_tree.py +364 -0
  156. synth_ai/environments/examples/tictactoe/__init__.py +1 -0
  157. synth_ai/environments/examples/tictactoe/agent_demos/test_synth_react.py +266 -0
  158. synth_ai/environments/examples/tictactoe/agent_demos/test_tictactoe_react_agent.py +470 -0
  159. synth_ai/environments/examples/tictactoe/engine.py +368 -0
  160. synth_ai/environments/examples/tictactoe/environment.py +239 -0
  161. synth_ai/environments/examples/tictactoe/taskset.py +214 -0
  162. synth_ai/environments/examples/tictactoe/units/test_tictactoe_engine.py +393 -0
  163. synth_ai/environments/examples/tictactoe/units/test_tictactoe_environment.py +493 -0
  164. synth_ai/environments/examples/tictactoe/units/test_tictactoe_taskset.py +191 -0
  165. synth_ai/environments/examples/verilog/__init__.py +10 -0
  166. synth_ai/environments/examples/verilog/agent_demos/test_synth_react.py +520 -0
  167. synth_ai/environments/examples/verilog/engine.py +328 -0
  168. synth_ai/environments/examples/verilog/environment.py +349 -0
  169. synth_ai/environments/examples/verilog/taskset.py +418 -0
  170. synth_ai/environments/examples/verilog/units/test_verilog_engine.py +466 -0
  171. synth_ai/environments/examples/verilog/units/test_verilog_environment.py +585 -0
  172. synth_ai/environments/examples/verilog/units/test_verilog_integration.py +383 -0
  173. synth_ai/environments/examples/verilog/units/test_verilog_taskset.py +457 -0
  174. synth_ai/environments/reproducibility/core.py +42 -0
  175. synth_ai/environments/reproducibility/tree.py +364 -0
  176. synth_ai/environments/service/app.py +78 -0
  177. synth_ai/environments/service/core_routes.py +775 -0
  178. synth_ai/environments/service/external_registry.py +57 -0
  179. synth_ai/environments/service/registry.py +9 -0
  180. synth_ai/environments/stateful/__init__.py +1 -0
  181. synth_ai/environments/stateful/core.py +28 -0
  182. synth_ai/environments/stateful/engine.py +21 -0
  183. synth_ai/environments/stateful/state.py +7 -0
  184. synth_ai/environments/tasks/api.py +19 -0
  185. synth_ai/environments/tasks/core.py +78 -0
  186. synth_ai/environments/tasks/filters.py +39 -0
  187. synth_ai/environments/tasks/utils.py +89 -0
  188. synth_ai/environments/v0_observability/history.py +3 -0
  189. synth_ai/environments/v0_observability/log.py +2 -0
  190. synth_ai/lm/caching/constants.py +1 -0
  191. synth_ai/{zyk/lms → lm}/caching/ephemeral.py +4 -8
  192. synth_ai/{zyk/lms → lm}/caching/handler.py +15 -15
  193. synth_ai/{zyk/lms → lm}/caching/initialize.py +2 -4
  194. synth_ai/{zyk/lms → lm}/caching/persistent.py +4 -10
  195. synth_ai/{zyk/lms → lm}/config.py +2 -1
  196. synth_ai/{zyk/lms → lm}/constants.py +2 -2
  197. synth_ai/{zyk/lms → lm}/core/all.py +10 -10
  198. synth_ai/{zyk/lms → lm}/core/main.py +57 -33
  199. synth_ai/{zyk/lms → lm}/core/vendor_clients.py +12 -10
  200. synth_ai/lm/cost/monitor.py +1 -0
  201. synth_ai/lm/cost/statefulness.py +1 -0
  202. synth_ai/lm/provider_support/__init__.py +8 -0
  203. synth_ai/lm/provider_support/anthropic.py +945 -0
  204. synth_ai/lm/provider_support/openai.py +1115 -0
  205. synth_ai/lm/provider_support/suppress_logging.py +31 -0
  206. synth_ai/{zyk/lms → lm}/structured_outputs/handler.py +58 -80
  207. synth_ai/{zyk/lms → lm}/structured_outputs/inject.py +6 -20
  208. synth_ai/{zyk/lms → lm}/structured_outputs/rehabilitate.py +6 -12
  209. synth_ai/{zyk/lms → lm}/vendors/core/anthropic_api.py +21 -30
  210. synth_ai/{zyk/lms → lm}/vendors/core/gemini_api.py +35 -32
  211. synth_ai/{zyk/lms → lm}/vendors/core/mistral_api.py +19 -28
  212. synth_ai/{zyk/lms → lm}/vendors/core/openai_api.py +26 -36
  213. synth_ai/{zyk/lms → lm}/vendors/openai_standard.py +29 -33
  214. synth_ai/{zyk/lms → lm}/vendors/retries.py +1 -1
  215. synth_ai/lm/vendors/supported/__init__.py +0 -0
  216. synth_ai/{zyk/lms → lm}/vendors/supported/custom_endpoint.py +131 -118
  217. synth_ai/{zyk/lms → lm}/vendors/supported/deepseek.py +4 -8
  218. synth_ai/{zyk/lms → lm}/vendors/supported/grok.py +6 -8
  219. synth_ai/{zyk/lms → lm}/vendors/supported/groq.py +1 -1
  220. synth_ai/{zyk/lms → lm}/vendors/supported/ollama.py +2 -2
  221. synth_ai/{zyk/lms → lm}/vendors/supported/openrouter.py +18 -16
  222. synth_ai/{zyk/lms → lm}/vendors/supported/together.py +1 -1
  223. synth_ai/tracing/__init__.py +0 -0
  224. synth_ai/tracing/abstractions.py +224 -0
  225. synth_ai/tracing/base_client.py +91 -0
  226. synth_ai/tracing/client_manager.py +131 -0
  227. synth_ai/tracing/config.py +140 -0
  228. synth_ai/tracing/context.py +146 -0
  229. synth_ai/tracing/decorators.py +679 -0
  230. synth_ai/tracing/events/__init__.py +0 -0
  231. synth_ai/tracing/events/manage.py +147 -0
  232. synth_ai/tracing/events/scope.py +86 -0
  233. synth_ai/tracing/events/store.py +227 -0
  234. synth_ai/tracing/immediate_client.py +152 -0
  235. synth_ai/tracing/local.py +18 -0
  236. synth_ai/tracing/log_client_base.py +74 -0
  237. synth_ai/tracing/retry_queue.py +187 -0
  238. synth_ai/tracing/trackers.py +515 -0
  239. synth_ai/tracing/upload.py +504 -0
  240. synth_ai/tracing/utils.py +9 -0
  241. synth_ai/zyk/__init__.py +28 -2
  242. synth_ai-0.2.1.dev0.dist-info/METADATA +349 -0
  243. synth_ai-0.2.1.dev0.dist-info/RECORD +261 -0
  244. {synth_ai-0.2.0.dist-info → synth_ai-0.2.1.dev0.dist-info}/WHEEL +1 -1
  245. synth_ai/zyk/lms/caching/constants.py +0 -1
  246. synth_ai/zyk/lms/cost/monitor.py +0 -1
  247. synth_ai/zyk/lms/cost/statefulness.py +0 -1
  248. synth_ai-0.2.0.dist-info/METADATA +0 -36
  249. synth_ai-0.2.0.dist-info/RECORD +0 -50
  250. /synth_ai/{zyk/lms/__init__.py → environments/reproducibility/helpers.py} +0 -0
  251. /synth_ai/{zyk/lms/caching → lm}/__init__.py +0 -0
  252. /synth_ai/{zyk/lms/core → lm/caching}/__init__.py +0 -0
  253. /synth_ai/{zyk/lms → lm}/caching/dbs.py +0 -0
  254. /synth_ai/{zyk/lms/cost → lm/core}/__init__.py +0 -0
  255. /synth_ai/{zyk/lms → lm}/core/exceptions.py +0 -0
  256. /synth_ai/{zyk/lms/structured_outputs → lm/cost}/__init__.py +0 -0
  257. /synth_ai/{zyk/lms/vendors → lm/structured_outputs}/__init__.py +0 -0
  258. /synth_ai/{zyk/lms → lm}/tools/__init__.py +0 -0
  259. /synth_ai/{zyk/lms → lm}/tools/base.py +0 -0
  260. /synth_ai/{zyk/lms/vendors/core → lm/vendors}/__init__.py +0 -0
  261. /synth_ai/{zyk/lms → lm}/vendors/base.py +0 -0
  262. /synth_ai/{zyk/lms/vendors/local → lm/vendors/core}/__init__.py +0 -0
  263. /synth_ai/{zyk/lms/vendors/supported → lm/vendors/local}/__init__.py +0 -0
  264. /synth_ai/{zyk/lms → lm}/vendors/local/ollama.py +0 -0
  265. {synth_ai-0.2.0.dist-info → synth_ai-0.2.1.dev0.dist-info/licenses}/LICENSE +0 -0
  266. {synth_ai-0.2.0.dist-info → synth_ai-0.2.1.dev0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,498 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script to run ReAct agents against Sokoban environment on synth service (port 8901)
4
+ Tests the model configured in MODEL_NAME on multiple Sokoban instances of the configured DIFFICULTY
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ import uuid
10
+ from datetime import datetime
11
+ from typing import Dict, Any, Optional, List
12
+ from pydantic import BaseModel, Field
13
+ from httpx import AsyncClient
14
+ import sys
15
+ import os
16
+
17
+ # Add the src directory to the path
18
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))
19
+
20
from synth_ai.zyk import LM

# tools/base.py lives under synth_ai/lm/ in this release (it was moved out of
# synth_ai/zyk/lms/). Prefer the new location and fall back to the legacy path
# so the script keeps working against older installs.
try:
    from synth_ai.lm.tools.base import BaseTool
except ImportError:
    from synth_ai.zyk.lms.tools.base import BaseTool
22
+
23
+
24
# --- Service Configuration ---
# Base URL of the environment service the episodes are run against.
SERVICE_BASE_URL = "http://localhost:8901"
# Model used both for the agent and for output formatting.
MODEL_NAME = "gpt-4.1-mini"
# Number of seeds (0..NUM_INSTANCES-1) to build task instances from.
NUM_INSTANCES = 10
# Maximum number of agent decisions per episode.
MAX_TURNS = 15
DIFFICULTY = "ultra_easy"

# Author's notes (presumably observed solve rates per difficulty/model):
# ultra easy - gpt-4.1-nano - 0%, gpt-4.1-mini - 16%, o4-mini - 84%
# easy - o4-mini - 10%

# --- Action Mapping ---
# Action names the agent may emit, mapped to the integer codes sent to the
# environment; the position in the tuple is the code.
ACTION_STRING_TO_INT = {
    label: code
    for code, label in enumerate(
        (
            "no operation",
            "push up",
            "push down",
            "push left",
            "push right",
            "move up",
            "move down",
            "move left",
            "move right",
        )
    )
}
46
+
47
+
48
+ # --- Tool Definitions ---
49
# Argument schema for the "game_action" tool (see GameActionTool.arguments).
# NOTE(review): pydantic typically surfaces the class docstring and the Field
# descriptions in the generated JSON schema, so that text may be shown to the
# LLM — treat it as runtime text, not as a plain comment.
class GameActionArgs(BaseModel):
    """Arguments for game actions."""

    # Action name chosen by the model; in this script it is looked up in
    # ACTION_STRING_TO_INT and unknown names fall back to the no-op code.
    action: str = Field(description="The action to take")
    # Free-text justification; only printed and stored in the agent history.
    reasoning: str = Field(description="Brief explanation of why this action was chosen")
54
+
55
+
56
# Argument schema for the "terminate" tool (see TerminateTool.arguments).
# NOTE(review): as with other pydantic models, the docstring/Field description
# may end up in the schema sent to the LLM — confirm before rewording.
class TerminateArgs(BaseModel):
    """Arguments for termination."""

    # Why the agent is ending the episode; the episode runner only prints it.
    reason: str = Field(description="Reason for termination")
60
+
61
+
62
# Tool descriptor handed to the LM (via `tools=`) so it can request a game move.
class GameActionTool(BaseTool):
    """Tool for performing an action in the game."""

    # Name the LM must use in its tool call; the agent returns it as "name".
    name: str = "game_action"
    # Pydantic model describing the tool's arguments.
    arguments: type[BaseModel] = GameActionArgs
    description: str = "Perform an action in the game environment."
68
+
69
+
70
# Tool descriptor handed to the LM (via `tools=`) so it can stop the episode.
class TerminateTool(BaseTool):
    """Tool to terminate the episode."""

    # The episode runner compares the returned tool name against "terminate".
    name: str = "terminate"
    # Pydantic model describing the tool's arguments.
    arguments: type[BaseModel] = TerminateArgs
    description: str = "End the episode when finished or no progress can be made."
76
+
77
+
78
+ # --- Base ReAct Agent ---
79
class BaseReActAgent:
    """Base ReAct agent for game environments.

    Each call to :meth:`decide` sends the current observation (plus the two
    most recent history entries) to the LM together with the available tools
    and returns the first tool call as ``{"name": ..., "parameters": ...}``.
    """

    # Action used when the LM returns no tool call. It must be one of the
    # action names understood by the episode runner; "no operation" is the
    # explicit no-op, whereas the previous fallback ("up") was not a known
    # action name and only produced an "Unknown action" message downstream.
    default_action = "no operation"

    def __init__(self, llm: LM, max_turns: int = MAX_TURNS, verbose: bool = False):
        """Store the LM handle and per-agent bookkeeping.

        Args:
            llm: Language model exposing ``respond_async``.
            max_turns: Maximum number of decisions the episode runner will request.
            verbose: When true, print a warning if the LM returns no tool call.
        """
        self.llm = llm
        self.max_turns = max_turns
        self.verbose = verbose
        # One short string per executed action, appended by the episode runner.
        self.history: List[str] = []
        self.system_name = "base-react-agent"
        self.system_instance_id = str(uuid.uuid4())
        self.tools = [GameActionTool(), TerminateTool()]

    async def decide(self, obs: str, system_message: str, turn: int) -> Dict[str, Any]:
        """Get LLM decision for next action.

        Args:
            obs: Formatted observation text shown to the model.
            system_message: System prompt for the model.
            turn: Zero-based turn index (currently unused, kept for callers).

        Returns:
            A dict with ``"name"`` (tool name) and ``"parameters"`` (decoded
            tool arguments). When the LM returns no tool call, a
            ``game_action`` with :attr:`default_action` is returned instead.

        Raises:
            json.JSONDecodeError: If the tool arguments are a non-empty string
                that is not valid JSON.
        """
        # Build action history (only last 2 for brevity)
        recent = self.history[-2:]
        action_history = ""
        if recent:
            action_history = "\n\nRECENT HISTORY:\n" + "".join(
                f"{i}. {entry}\n" for i, entry in enumerate(recent, 1)
            )

        user_content = f"Current state:\n{obs}{action_history}\n\nWhat action should I take?"

        response_obj = await self.llm.respond_async(
            system_message=system_message, user_message=user_content, tools=self.tools
        )

        tool_calls = response_obj.tool_calls

        # Graceful fallback when the model answered without calling a tool.
        if not tool_calls:
            if self.verbose:
                print("[WARNING] No tool calls returned by LLM, using default action")
            return {
                "name": "game_action",
                "parameters": {
                    "action": self.default_action,
                    "reasoning": "Default action - no tool call received",
                },
            }

        tool_call_data = tool_calls[0]

        # Tool calls may arrive either as plain dicts or as objects.
        if isinstance(tool_call_data, dict):
            tool_name = tool_call_data["function"]["name"]
            raw_arguments = tool_call_data["function"]["arguments"]
        else:
            tool_name = tool_call_data.function.name
            raw_arguments = tool_call_data.function.arguments

        # Arguments are normally a JSON string, but tolerate an already-decoded
        # dict and an empty/missing payload (e.g. a tool call with no arguments).
        if isinstance(raw_arguments, dict):
            tool_arguments = raw_arguments
        elif not raw_arguments:
            tool_arguments = {}
        else:
            tool_arguments = json.loads(raw_arguments)

        return {"name": tool_name, "parameters": tool_arguments}
134
+
135
+
136
+ # --- Sokoban ReAct Agent ---
137
class SokobanReActAgent(BaseReActAgent):
    """ReAct agent specialised for the Sokoban environment.

    Supplies the Sokoban system prompt and turns the environment's observation
    dict into the text block shown to the model.
    """

    def __init__(self, llm: LM, max_turns: int = 15, verbose: bool = False):
        super().__init__(llm, max_turns=max_turns, verbose=verbose)
        self.system_name = "sokoban-react-agent"

    def get_system_message(self) -> str:
        """Return the fixed system prompt (paragraphs separated by blank lines)."""
        paragraphs = (
            "You are playing Sokoban. Push all boxes (X) onto targets (O) to win.",
            "RULES: Move/push in 4 directions. Cannot pull boxes or push into walls/boxes.",
            'ACTIONS: "move up", "move down", "move left", "move right", '
            '"push up", "push down", "push left", "push right", "no operation"',
            "SYMBOLS: # = wall, _ = empty, O = target, X = box, √ = box on target, P = you",
            "STRATEGY: Analyze layout, plan moves, avoid getting boxes stuck in corners. "
            "Use PUSH actions when next to a box to move it.",
            "Be concise and decisive. Always use the exact action names listed above.",
        )
        return "\n\n".join(paragraphs)

    def format_observation(self, obs: Dict[str, Any]) -> str:
        """Render the observation as text.

        Emits, in order and only when the needed keys are present: the board,
        the boxes-on-target progress line, and the step counter line.
        """

        def present(*keys: str) -> bool:
            return all(key in obs for key in keys)

        board = [f"Board:\n{obs['room_text']}"] if present("room_text") else []
        progress = (
            [f"Progress: {obs['boxes_on_target']}/{obs['num_boxes']} boxes on target"]
            if present("boxes_on_target", "num_boxes")
            else []
        )
        step_counter = (
            [f"Steps: {obs['steps_taken']}/{obs['max_steps']}"]
            if present("steps_taken", "max_steps")
            else []
        )
        return "\n".join(board + progress + step_counter)
171
+
172
+
173
+ # --- Episode Runner ---
174
async def _terminate_sokoban_env(client: AsyncClient, env_id: Any, instance_num: int) -> None:
    """Best-effort release of the service-side environment; never raises."""
    try:
        await client.post("/env/Sokoban/terminate", json={"env_id": env_id})
    except Exception as cleanup_error:
        # A failed cleanup must not replace the episode's real result.
        print(f" Instance {instance_num}: Cleanup failed - {cleanup_error}")


async def run_single_episode(
    client: AsyncClient, agent: SokobanReActAgent, config: Dict, instance_num: int
) -> Dict[str, Any]:
    """Run a single Sokoban episode and return episode metrics.

    Args:
        client: HTTP client bound to the environment service.
        agent: Agent that formats observations and chooses actions.
        config: Initial engine state posted to ``/env/Sokoban/initialize``.
        instance_num: 1-based index used only in log output.

    Returns:
        On a completed episode: ``eval_metric`` (1.0 solved / 0.0 not),
        ``rubric``, ``steps_taken``, ``boxes_on_target``, ``num_boxes``,
        ``solved`` and ``error=False``. If the environment could not be
        created or an exception occurred:
        ``{"eval_metric": 0.0, "rubric": {}, "error": True}``.
    """
    env_id = None
    try:
        # Create environment
        create_resp = await client.post("/env/Sokoban/initialize", json={"initial_state": config})

        if create_resp.status_code != 200:
            print(
                f" Instance {instance_num}: Failed to create environment - {create_resp.status_code}: {create_resp.text}"
            )
            return {"eval_metric": 0.0, "rubric": {}, "error": True}

        created = create_resp.json()
        env_id = created["env_id"]

        # Get initial observation
        obs = created["observation"]
        formatted_obs = agent.format_observation(obs)

        # DEBUG: Print initial state
        print(f"\n Instance {instance_num}: Starting puzzle")
        print(" Initial state:")
        print(f" {formatted_obs}")

        # Track episode metrics. Progress counters are seeded from the initial
        # observation/config so the failure rubric below is well-defined even
        # when the loop exits before the first successful step.
        steps_taken = 0
        max_steps = config.get("max_steps", 120)
        boxes_on_target = obs.get("boxes_on_target", config.get("boxes_on_target", 0))
        num_boxes = obs.get("num_boxes", config.get("num_boxes", 0))

        # Run episode
        for turn in range(agent.max_turns):
            # Get agent decision
            action = await agent.decide(formatted_obs, agent.get_system_message(), turn)
            params = action.get("parameters") or {}

            # Check for termination first: a terminate call carries a
            # "reason" but no "action", so it must not reach the action lookup.
            if action["name"] == "terminate":
                print(f" Agent terminated: {params.get('reason', 'no reason given')}")
                break

            action_name = params.get("action")

            # DEBUG: Print agent decision
            print(
                f" Turn {turn + 1}: Agent chose '{action_name}' - {params.get('reasoning', 'no reasoning')}"
            )

            # Convert action string to integer (Sokoban expects integers)
            action_int = (
                ACTION_STRING_TO_INT.get(action_name) if isinstance(action_name, str) else None
            )
            if action_int is None:
                print(f" ❌ Unknown action '{action_name}', using no-op")
                action_int = 0  # Default to "no operation"

            step_resp = await client.post(
                "/env/Sokoban/step",
                json={
                    "env_id": env_id,
                    "request_id": str(uuid.uuid4()),
                    "action": {
                        "tool_calls": [{"tool": "interact", "args": {"action": action_int}}]
                    },
                },
            )

            if step_resp.status_code != 200:
                print(f" ❌ Step failed: {step_resp.status_code}: {step_resp.text}")
                break

            obs = step_resp.json()["observation"]
            formatted_obs = agent.format_observation(obs)

            # DEBUG: Print state after action
            print(" After action:")
            print(f" {formatted_obs}")

            # Update history (reasoning truncated to keep prompts short)
            agent.history.append(f"{action_name}: {str(params.get('reasoning', ''))[:50]}")

            # Track steps
            steps_taken = obs.get("steps_taken", steps_taken + 1)

            # Check if game is won
            boxes_on_target = obs.get("boxes_on_target", 0)
            num_boxes = obs.get("num_boxes", 0)
            terminated = obs.get("terminated", False)

            if terminated and boxes_on_target == num_boxes:
                print(
                    f" ✅ Instance {instance_num}: SUCCESS! All boxes on target in {steps_taken} steps"
                )

                # No optimal solution length is available, so efficiency is
                # measured against a rough estimate.
                estimated_optimal = max(num_boxes * 3, 10)
                step_efficiency = min(1.0, estimated_optimal / max(steps_taken, 1))

                return {
                    "eval_metric": 1.0,
                    "rubric": {
                        "solved": 1.0,
                        "step_efficiency": step_efficiency,
                        "boxes_placed": float(boxes_on_target) / max(num_boxes, 1),
                        "completed_in_time": 1.0 if steps_taken <= max_steps else 0.0,
                    },
                    "steps_taken": steps_taken,
                    "boxes_on_target": boxes_on_target,
                    "num_boxes": num_boxes,
                    "solved": True,
                    "error": False,
                }

            if terminated:
                print(
                    f" ❌ Instance {instance_num}: Game terminated without success (boxes: {boxes_on_target}/{num_boxes})"
                )
                break
        else:
            # Only reached when every turn was used; early exits above have
            # already printed their own reason.
            print(f" ❌ Instance {instance_num}: Failed to solve in {agent.max_turns} turns")

        # Metrics for an unsolved episode
        return {
            "eval_metric": 0.0,
            "rubric": {
                "solved": 0.0,
                "step_efficiency": 0.0,
                "boxes_placed": float(boxes_on_target) / max(num_boxes, 1),
                "completed_in_time": 0.0,
            },
            "steps_taken": steps_taken,
            "boxes_on_target": boxes_on_target,
            "num_boxes": num_boxes,
            "solved": False,
            "error": False,
        }

    except Exception as e:
        print(f" Instance {instance_num}: Error - {e}")
        import traceback

        traceback.print_exc()
        return {"eval_metric": 0.0, "rubric": {}, "error": True}

    finally:
        # Release the environment on every exit path, including exceptions.
        if env_id is not None:
            await _terminate_sokoban_env(client, env_id, instance_num)
329
+
330
+
331
+ # --- Batch Evaluation ---
332
def _aggregate_episode_results(results: list) -> Dict[str, Any]:
    """Fold per-episode result dicts into batch-level statistics.

    Episodes whose ``"error"`` entry is truthy are discarded. Rubric keys
    missing from an individual episode count as ``0.0`` in that key's mean.

    Args:
        results: Result dicts as returned by ``run_single_episode``.

    Returns:
        A dict with ``eval_metrics`` (list of per-episode scores),
        ``mean_eval_metric``, ``mean_rubric`` (per-key means) and
        ``num_episodes`` (number of non-error episodes).
    """
    valid_results = [r for r in results if not r.get("error", False)]

    if not valid_results:
        return {
            "eval_metrics": [],
            "mean_eval_metric": 0.0,
            "mean_rubric": {},
            "num_episodes": 0,
        }

    eval_metrics = [r["eval_metric"] for r in valid_results]
    mean_eval_metric = sum(eval_metrics) / len(eval_metrics)

    # Union of every rubric key seen in any valid episode.
    all_rubric_keys = set()
    for r in valid_results:
        all_rubric_keys.update(r["rubric"].keys())

    mean_rubric = {}
    for key in all_rubric_keys:
        values = [r["rubric"].get(key, 0.0) for r in valid_results]
        mean_rubric[key] = sum(values) / len(values)

    return {
        "eval_metrics": eval_metrics,
        "mean_eval_metric": mean_eval_metric,
        "mean_rubric": mean_rubric,
        "num_episodes": len(valid_results),
    }


async def evaluate_sokoban_batch() -> Dict[str, Any]:
    """Evaluate the Sokoban agent on ``NUM_INSTANCES`` task instances.

    One task instance is requested per seed in ``range(NUM_INSTANCES)`` at
    difficulty ``DIFFICULTY``; seeds whose creation raises are reported and
    skipped. Every created instance is then run as one episode, all episodes
    being awaited together through ``asyncio.gather``.

    Returns:
        A dict with ``eval_metrics``, ``mean_eval_metric``, ``mean_rubric``
        and ``num_episodes``, computed over the episodes that did not report
        an error.
    """
    print(f"🎯 Evaluating Sokoban on {NUM_INSTANCES} easy instances...")

    llm = LM(model_name=MODEL_NAME, formatting_model_name=MODEL_NAME, temperature=0.0)

    # Get easy task instances using the taskset system
    from synth_ai.environments.examples.sokoban.taskset import create_task_instance_from_seed

    easy_task_instances = []
    created_seeds = []  # seeds that actually yielded an instance
    task_debug_info = []

    for seed in range(NUM_INSTANCES):
        try:
            print(f" 🔍 Creating task instance for seed {seed}...")
            task_instance = await create_task_instance_from_seed(DIFFICULTY, seed)
            easy_task_instances.append(task_instance)
            created_seeds.append(seed)

            # Extract debug info
            task_id = getattr(task_instance, "id", "unknown")
            metadata = getattr(task_instance, "metadata", {})
            initial_snapshot = getattr(task_instance, "initial_engine_snapshot", {})

            # NOTE: str hashes are salted per interpreter process (unless
            # PYTHONHASHSEED is fixed), so these fingerprints are only
            # comparable between seeds within a single run.
            debug_info = {
                "seed": seed,
                "task_id": task_id,
                "metadata": metadata,
                "room_state_hash": hash(str(initial_snapshot.get("room_state", []))),
                "room_fixed_hash": hash(str(initial_snapshot.get("room_fixed", []))),
                "num_boxes": initial_snapshot.get("num_boxes", 0),
                "max_steps": initial_snapshot.get("max_steps", 0),
            }
            task_debug_info.append(debug_info)

            print(
                f" ✅ Seed {seed}: task_id={task_id}, room_state_hash={debug_info['room_state_hash']}"
            )

        except Exception as e:
            print(f" ⚠️ Failed to get task instance for seed {seed}: {e}")
            continue

    # Report the seeds that were really used instead of a hard-coded "0,1,2".
    seed_list = ",".join(str(s) for s in created_seeds) or "none"
    print(
        f" 📝 Generated {len(easy_task_instances)} {DIFFICULTY} task instances from seeds {seed_list}"
    )

    # Print debug summary
    print(" 🔍 Task Debug Summary:")
    for info in task_debug_info:
        print(
            f" Seed {info['seed']}: ID={info['task_id']}, StateHash={info['room_state_hash']}, FixedHash={info['room_fixed_hash']}"
        )

    async with AsyncClient(base_url=SERVICE_BASE_URL, timeout=30.0) as client:
        tasks = []
        for i, task_instance in enumerate(easy_task_instances):
            # A fresh agent per episode; all of them share the one LM object.
            agent = SokobanReActAgent(llm, max_turns=MAX_TURNS, verbose=False)

            # Extract configuration from task instance
            config = {
                "dim_room": list(task_instance.metadata.dim_room),
                "max_steps": task_instance.metadata.max_steps,
                "num_boxes": task_instance.metadata.num_boxes,
                "room_fixed": task_instance.initial_engine_snapshot["room_fixed"],
                "room_state": task_instance.initial_engine_snapshot["room_state"],
                "boxes_on_target": task_instance.initial_engine_snapshot.get("boxes_on_target", 0),
            }

            tasks.append(run_single_episode(client, agent, config, i + 1))

        results = await asyncio.gather(*tasks)

    return _aggregate_episode_results(results)
433
+
434
+
435
async def main():
    """Probe the service for Sokoban support, run the batch evaluation and print a report.

    Returns early (after printing a message) when the health endpoint cannot
    be queried or does not list ``"Sokoban"`` among its supported
    environments. Any exception raised while evaluating or reporting is
    caught and printed rather than propagated.
    """
    print("🎮 Sokoban ReAct Agent Evaluation")
    print(f"Model: {MODEL_NAME}")
    print(f"Service: {SERVICE_BASE_URL}")
    print(f"Instances: {NUM_INSTANCES}")
    started_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"Time: {started_at}")
    print("=" * 50)

    # Health probe: bail out before creating any task instances if the
    # service is unreachable or does not serve Sokoban.
    async with AsyncClient(base_url=SERVICE_BASE_URL, timeout=10.0) as client:
        try:
            response = await client.get("/health")
            payload = response.json()
            supported = payload.get("supported_environments", [])

            if "Sokoban" not in supported:
                print("❌ Sokoban not available on service")
                return

            print("✅ Service health check passed")

        except Exception as exc:
            print(f"❌ Service health check failed: {exc}")
            return

    # Evaluation and reporting share one try block so that a malformed
    # summary dict is reported the same way as a failed evaluation.
    try:
        summary = await evaluate_sokoban_batch()

        rule = "=" * 80
        print("\n" + rule)
        print("🏆 FINAL SOKOBAN EVALUATION RESULTS")
        print(rule)

        # Eval metrics
        print("📊 EVAL METRICS:")
        print(f" Episodes: {summary['num_episodes']}")
        formatted_scores = [f"{x:.1f}" for x in summary["eval_metrics"]]
        print(f" Individual Scores: {formatted_scores}")
        mean_score = summary["mean_eval_metric"]
        print(f" Mean Eval Metric: {mean_score:.2f}")

        # Rubric results, sorted by metric name
        print("\n🎯 RUBRIC RESULTS:")
        mean_rubric = summary["mean_rubric"]
        if not mean_rubric:
            print(" No rubric data available")
        else:
            for name, value in sorted(mean_rubric.items()):
                print(f" {name}: {value:.2f}")

        # Overall assessment: first threshold the mean strictly exceeds wins.
        print("\n🔍 ASSESSMENT:")
        verdicts = (
            (0.5, "🎉 Excellent performance!"),
            (0.3, "✅ Good performance!"),
            (0.1, "⚠️ Moderate performance"),
        )
        verdict = next(
            (message for cutoff, message in verdicts if mean_score > cutoff),
            "❌ Poor performance - needs improvement",
        )
        print(verdict)

    except Exception as exc:
        print(f"❌ Evaluation failed: {exc}")
495
+
496
+
497
if __name__ == "__main__":
    # Run the evaluation only when executed as a script, not on import.
    asyncio.run(main())