synth-ai 0.1.9__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266)
  1. synth_ai/__init__.py +28 -2
  2. synth_ai/core/system.py +4 -0
  3. synth_ai/environments/__init__.py +35 -0
  4. synth_ai/environments/environment/__init__.py +1 -0
  5. synth_ai/environments/environment/artifacts/__init__.py +1 -0
  6. synth_ai/environments/environment/artifacts/base.py +50 -0
  7. synth_ai/environments/environment/core.py +22 -0
  8. synth_ai/environments/environment/db/__init__.py +1 -0
  9. synth_ai/environments/environment/db/sqlite.py +45 -0
  10. synth_ai/environments/environment/registry.py +24 -0
  11. synth_ai/environments/environment/resources/sqlite.py +46 -0
  12. synth_ai/environments/environment/results.py +1 -0
  13. synth_ai/environments/environment/rewards/__init__.py +1 -0
  14. synth_ai/environments/environment/rewards/core.py +28 -0
  15. synth_ai/environments/environment/shared_engine.py +26 -0
  16. synth_ai/environments/environment/tools/__init__.py +34 -0
  17. synth_ai/environments/examples/__init__.py +1 -0
  18. synth_ai/environments/examples/crafter_classic/__init__.py +8 -0
  19. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +58 -0
  20. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
  21. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
  22. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +51 -0
  23. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +872 -0
  24. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
  25. synth_ai/environments/examples/crafter_classic/agent_demos/test_crafter_react_agent.py +1110 -0
  26. synth_ai/environments/examples/crafter_classic/config_logging.py +111 -0
  27. synth_ai/environments/examples/crafter_classic/engine.py +502 -0
  28. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +63 -0
  29. synth_ai/environments/examples/crafter_classic/engine_helpers/action_map.py +5 -0
  30. synth_ai/environments/examples/crafter_classic/engine_helpers/serialization.py +74 -0
  31. synth_ai/environments/examples/crafter_classic/environment.py +255 -0
  32. synth_ai/environments/examples/crafter_classic/taskset.py +228 -0
  33. synth_ai/environments/examples/enron/agent_demos/test_synth_react.py +535 -0
  34. synth_ai/environments/examples/enron/art_helpers/email_search_tools.py +156 -0
  35. synth_ai/environments/examples/enron/art_helpers/local_email_db.py +280 -0
  36. synth_ai/environments/examples/enron/art_helpers/types_enron.py +24 -0
  37. synth_ai/environments/examples/enron/engine.py +291 -0
  38. synth_ai/environments/examples/enron/environment.py +165 -0
  39. synth_ai/environments/examples/enron/taskset.py +112 -0
  40. synth_ai/environments/examples/enron/units/keyword_stats.py +111 -0
  41. synth_ai/environments/examples/enron/units/test_email_index.py +8 -0
  42. synth_ai/environments/examples/minigrid/__init__.py +48 -0
  43. synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
  44. synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +47 -0
  45. synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
  46. synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +220 -0
  47. synth_ai/environments/examples/minigrid/agent_demos/test_minigrid_react_agent.py +393 -0
  48. synth_ai/environments/examples/minigrid/engine.py +589 -0
  49. synth_ai/environments/examples/minigrid/environment.py +274 -0
  50. synth_ai/environments/examples/minigrid/environment_mapping.py +242 -0
  51. synth_ai/environments/examples/minigrid/puzzle_loader.py +416 -0
  52. synth_ai/environments/examples/minigrid/taskset.py +583 -0
  53. synth_ai/environments/examples/minigrid/units/test_action_behavior.py +226 -0
  54. synth_ai/environments/examples/minigrid/units/test_debug_messages.py +83 -0
  55. synth_ai/environments/examples/minigrid/units/test_exploration.py +120 -0
  56. synth_ai/environments/examples/minigrid/units/test_minigrid_engine.py +214 -0
  57. synth_ai/environments/examples/minigrid/units/test_minigrid_environment.py +238 -0
  58. synth_ai/environments/examples/minigrid/units/test_minigrid_environment_mapping.py +301 -0
  59. synth_ai/environments/examples/minigrid/units/test_minigrid_taskset.py +210 -0
  60. synth_ai/environments/examples/nethack/__init__.py +7 -0
  61. synth_ai/environments/examples/nethack/achievements.py +337 -0
  62. synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
  63. synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
  64. synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +832 -0
  65. synth_ai/environments/examples/nethack/agent_demos/test_nethack_react_agent.py +1112 -0
  66. synth_ai/environments/examples/nethack/engine.py +738 -0
  67. synth_ai/environments/examples/nethack/environment.py +255 -0
  68. synth_ai/environments/examples/nethack/helpers/__init__.py +42 -0
  69. synth_ai/environments/examples/nethack/helpers/action_mapping.py +301 -0
  70. synth_ai/environments/examples/nethack/helpers/nle_wrapper.py +401 -0
  71. synth_ai/environments/examples/nethack/helpers/observation_utils.py +433 -0
  72. synth_ai/environments/examples/nethack/helpers/recording_wrapper.py +201 -0
  73. synth_ai/environments/examples/nethack/helpers/trajectory_recorder.py +268 -0
  74. synth_ai/environments/examples/nethack/helpers/visualization/replay_viewer.py +308 -0
  75. synth_ai/environments/examples/nethack/helpers/visualization/visualizer.py +430 -0
  76. synth_ai/environments/examples/nethack/taskset.py +323 -0
  77. synth_ai/environments/examples/nethack/units/test_nethack_engine.py +277 -0
  78. synth_ai/environments/examples/nethack/units/test_nethack_environment.py +281 -0
  79. synth_ai/environments/examples/nethack/units/test_nethack_taskset.py +213 -0
  80. synth_ai/environments/examples/nethack/units/test_recording.py +307 -0
  81. synth_ai/environments/examples/red/__init__.py +7 -0
  82. synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
  83. synth_ai/environments/examples/red/agent_demos/test_synth_react.py +1471 -0
  84. synth_ai/environments/examples/red/config_logging.py +110 -0
  85. synth_ai/environments/examples/red/engine.py +693 -0
  86. synth_ai/environments/examples/red/engine_helpers/__init__.py +1 -0
  87. synth_ai/environments/examples/red/engine_helpers/memory_map.py +28 -0
  88. synth_ai/environments/examples/red/engine_helpers/reward_components.py +275 -0
  89. synth_ai/environments/examples/red/engine_helpers/reward_library/__init__.py +142 -0
  90. synth_ai/environments/examples/red/engine_helpers/reward_library/adaptive_rewards.py +56 -0
  91. synth_ai/environments/examples/red/engine_helpers/reward_library/battle_rewards.py +283 -0
  92. synth_ai/environments/examples/red/engine_helpers/reward_library/composite_rewards.py +149 -0
  93. synth_ai/environments/examples/red/engine_helpers/reward_library/economy_rewards.py +137 -0
  94. synth_ai/environments/examples/red/engine_helpers/reward_library/efficiency_rewards.py +56 -0
  95. synth_ai/environments/examples/red/engine_helpers/reward_library/exploration_rewards.py +330 -0
  96. synth_ai/environments/examples/red/engine_helpers/reward_library/novelty_rewards.py +120 -0
  97. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_rewards.py +558 -0
  98. synth_ai/environments/examples/red/engine_helpers/reward_library/pokemon_rewards.py +312 -0
  99. synth_ai/environments/examples/red/engine_helpers/reward_library/social_rewards.py +147 -0
  100. synth_ai/environments/examples/red/engine_helpers/reward_library/story_rewards.py +246 -0
  101. synth_ai/environments/examples/red/engine_helpers/screen_analysis.py +367 -0
  102. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +139 -0
  103. synth_ai/environments/examples/red/environment.py +235 -0
  104. synth_ai/environments/examples/red/taskset.py +77 -0
  105. synth_ai/environments/examples/red/test_fixes.py +125 -0
  106. synth_ai/environments/examples/red/test_fixes_mock.py +148 -0
  107. synth_ai/environments/examples/red/units/__init__.py +1 -0
  108. synth_ai/environments/examples/red/units/test_basic_functionality.py +97 -0
  109. synth_ai/environments/examples/red/units/test_button_press_requirements.py +217 -0
  110. synth_ai/environments/examples/red/units/test_engine.py +192 -0
  111. synth_ai/environments/examples/red/units/test_environment.py +455 -0
  112. synth_ai/environments/examples/red/units/test_exploration_strategy.py +227 -0
  113. synth_ai/environments/examples/red/units/test_integration.py +217 -0
  114. synth_ai/environments/examples/red/units/test_memory_extraction.py +111 -0
  115. synth_ai/environments/examples/red/units/test_menu_bug_reproduction.py +1100 -0
  116. synth_ai/environments/examples/red/units/test_movement_debug.py +255 -0
  117. synth_ai/environments/examples/red/units/test_pokemon_mcts_debug.py +163 -0
  118. synth_ai/environments/examples/red/units/test_pokemon_mcts_verbose.py +117 -0
  119. synth_ai/environments/examples/red/units/test_red_basic.py +145 -0
  120. synth_ai/environments/examples/red/units/test_red_comprehensive.py +323 -0
  121. synth_ai/environments/examples/red/units/test_retry_movement.py +195 -0
  122. synth_ai/environments/examples/red/units/test_reward_components.py +186 -0
  123. synth_ai/environments/examples/red/units/test_rom_integration.py +260 -0
  124. synth_ai/environments/examples/red/units/test_taskset.py +116 -0
  125. synth_ai/environments/examples/red/units/test_tree.py +448 -0
  126. synth_ai/environments/examples/sokoban/__init__.py +1 -0
  127. synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +900 -0
  128. synth_ai/environments/examples/sokoban/agent_demos/test_dspy_react.py +1 -0
  129. synth_ai/environments/examples/sokoban/agent_demos/test_sokoban_react_agent.py +498 -0
  130. synth_ai/environments/examples/sokoban/agent_demos/test_synth_lats.py +1 -0
  131. synth_ai/environments/examples/sokoban/agent_demos/test_synth_react_locally.py +748 -0
  132. synth_ai/environments/examples/sokoban/agent_demos/test_synth_react_service.py +296 -0
  133. synth_ai/environments/examples/sokoban/engine.py +675 -0
  134. synth_ai/environments/examples/sokoban/engine_helpers/__init__.py +1 -0
  135. synth_ai/environments/examples/sokoban/engine_helpers/room_utils.py +656 -0
  136. synth_ai/environments/examples/sokoban/engine_helpers/vendored/__init__.py +17 -0
  137. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/__init__.py +3 -0
  138. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/boxoban_env.py +129 -0
  139. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/render_utils.py +370 -0
  140. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/room_utils.py +331 -0
  141. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env.py +305 -0
  142. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_fixed_targets.py +66 -0
  143. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_pull.py +114 -0
  144. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_two_player.py +122 -0
  145. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_variations.py +394 -0
  146. synth_ai/environments/examples/sokoban/environment.py +228 -0
  147. synth_ai/environments/examples/sokoban/generate_verified_puzzles.py +438 -0
  148. synth_ai/environments/examples/sokoban/puzzle_loader.py +311 -0
  149. synth_ai/environments/examples/sokoban/taskset.py +425 -0
  150. synth_ai/environments/examples/sokoban/units/astar_common.py +94 -0
  151. synth_ai/environments/examples/sokoban/units/test_building_task_set.py +49 -0
  152. synth_ai/environments/examples/sokoban/units/test_false_positive.py +120 -0
  153. synth_ai/environments/examples/sokoban/units/test_simple_run_through_environment.py +119 -0
  154. synth_ai/environments/examples/sokoban/units/test_sokoban_environment.py +98 -0
  155. synth_ai/environments/examples/sokoban/units/test_tree.py +364 -0
  156. synth_ai/environments/examples/tictactoe/__init__.py +1 -0
  157. synth_ai/environments/examples/tictactoe/agent_demos/test_synth_react.py +266 -0
  158. synth_ai/environments/examples/tictactoe/agent_demos/test_tictactoe_react_agent.py +470 -0
  159. synth_ai/environments/examples/tictactoe/engine.py +368 -0
  160. synth_ai/environments/examples/tictactoe/environment.py +239 -0
  161. synth_ai/environments/examples/tictactoe/taskset.py +214 -0
  162. synth_ai/environments/examples/tictactoe/units/test_tictactoe_engine.py +393 -0
  163. synth_ai/environments/examples/tictactoe/units/test_tictactoe_environment.py +493 -0
  164. synth_ai/environments/examples/tictactoe/units/test_tictactoe_taskset.py +191 -0
  165. synth_ai/environments/examples/verilog/__init__.py +10 -0
  166. synth_ai/environments/examples/verilog/agent_demos/test_synth_react.py +520 -0
  167. synth_ai/environments/examples/verilog/engine.py +328 -0
  168. synth_ai/environments/examples/verilog/environment.py +349 -0
  169. synth_ai/environments/examples/verilog/taskset.py +418 -0
  170. synth_ai/environments/examples/verilog/units/test_verilog_engine.py +466 -0
  171. synth_ai/environments/examples/verilog/units/test_verilog_environment.py +585 -0
  172. synth_ai/environments/examples/verilog/units/test_verilog_integration.py +383 -0
  173. synth_ai/environments/examples/verilog/units/test_verilog_taskset.py +457 -0
  174. synth_ai/environments/reproducibility/core.py +42 -0
  175. synth_ai/environments/reproducibility/tree.py +364 -0
  176. synth_ai/environments/service/app.py +78 -0
  177. synth_ai/environments/service/core_routes.py +775 -0
  178. synth_ai/environments/service/external_registry.py +57 -0
  179. synth_ai/environments/service/registry.py +9 -0
  180. synth_ai/environments/stateful/__init__.py +1 -0
  181. synth_ai/environments/stateful/core.py +28 -0
  182. synth_ai/environments/stateful/engine.py +21 -0
  183. synth_ai/environments/stateful/state.py +7 -0
  184. synth_ai/environments/tasks/api.py +19 -0
  185. synth_ai/environments/tasks/core.py +78 -0
  186. synth_ai/environments/tasks/filters.py +39 -0
  187. synth_ai/environments/tasks/utils.py +89 -0
  188. synth_ai/environments/v0_observability/history.py +3 -0
  189. synth_ai/environments/v0_observability/log.py +2 -0
  190. synth_ai/lm/caching/constants.py +1 -0
  191. synth_ai/{zyk/lms → lm}/caching/ephemeral.py +4 -8
  192. synth_ai/{zyk/lms → lm}/caching/handler.py +15 -15
  193. synth_ai/{zyk/lms → lm}/caching/initialize.py +2 -4
  194. synth_ai/{zyk/lms → lm}/caching/persistent.py +4 -10
  195. synth_ai/{zyk/lms → lm}/config.py +2 -1
  196. synth_ai/{zyk/lms → lm}/constants.py +2 -2
  197. synth_ai/{zyk/lms → lm}/core/all.py +10 -10
  198. synth_ai/{zyk/lms → lm}/core/main.py +57 -33
  199. synth_ai/{zyk/lms → lm}/core/vendor_clients.py +12 -10
  200. synth_ai/lm/cost/monitor.py +1 -0
  201. synth_ai/lm/cost/statefulness.py +1 -0
  202. synth_ai/lm/provider_support/__init__.py +8 -0
  203. synth_ai/lm/provider_support/anthropic.py +945 -0
  204. synth_ai/lm/provider_support/openai.py +1115 -0
  205. synth_ai/lm/provider_support/suppress_logging.py +31 -0
  206. synth_ai/{zyk/lms → lm}/structured_outputs/handler.py +58 -80
  207. synth_ai/{zyk/lms → lm}/structured_outputs/inject.py +6 -20
  208. synth_ai/{zyk/lms → lm}/structured_outputs/rehabilitate.py +6 -12
  209. synth_ai/{zyk/lms → lm}/vendors/core/anthropic_api.py +21 -30
  210. synth_ai/{zyk/lms → lm}/vendors/core/gemini_api.py +37 -32
  211. synth_ai/{zyk/lms → lm}/vendors/core/mistral_api.py +19 -28
  212. synth_ai/{zyk/lms → lm}/vendors/core/openai_api.py +26 -36
  213. synth_ai/{zyk/lms → lm}/vendors/openai_standard.py +29 -33
  214. synth_ai/{zyk/lms → lm}/vendors/retries.py +1 -1
  215. synth_ai/lm/vendors/supported/__init__.py +0 -0
  216. synth_ai/{zyk/lms → lm}/vendors/supported/custom_endpoint.py +131 -118
  217. synth_ai/{zyk/lms → lm}/vendors/supported/deepseek.py +4 -8
  218. synth_ai/{zyk/lms → lm}/vendors/supported/grok.py +6 -8
  219. synth_ai/{zyk/lms → lm}/vendors/supported/groq.py +1 -1
  220. synth_ai/{zyk/lms → lm}/vendors/supported/ollama.py +2 -2
  221. synth_ai/{zyk/lms → lm}/vendors/supported/openrouter.py +18 -16
  222. synth_ai/{zyk/lms → lm}/vendors/supported/together.py +1 -1
  223. synth_ai/tracing/__init__.py +0 -0
  224. synth_ai/tracing/abstractions.py +224 -0
  225. synth_ai/tracing/base_client.py +91 -0
  226. synth_ai/tracing/client_manager.py +131 -0
  227. synth_ai/tracing/config.py +140 -0
  228. synth_ai/tracing/context.py +146 -0
  229. synth_ai/tracing/decorators.py +679 -0
  230. synth_ai/tracing/events/__init__.py +0 -0
  231. synth_ai/tracing/events/manage.py +147 -0
  232. synth_ai/tracing/events/scope.py +86 -0
  233. synth_ai/tracing/events/store.py +227 -0
  234. synth_ai/tracing/immediate_client.py +152 -0
  235. synth_ai/tracing/local.py +18 -0
  236. synth_ai/tracing/log_client_base.py +74 -0
  237. synth_ai/tracing/retry_queue.py +187 -0
  238. synth_ai/tracing/trackers.py +515 -0
  239. synth_ai/tracing/upload.py +504 -0
  240. synth_ai/tracing/utils.py +9 -0
  241. synth_ai/zyk/__init__.py +28 -2
  242. synth_ai-0.2.1.dev0.dist-info/METADATA +349 -0
  243. synth_ai-0.2.1.dev0.dist-info/RECORD +261 -0
  244. synth_ai/zyk/lms/caching/constants.py +0 -1
  245. synth_ai/zyk/lms/cost/monitor.py +0 -1
  246. synth_ai/zyk/lms/cost/statefulness.py +0 -1
  247. synth_ai-0.1.9.dist-info/METADATA +0 -37
  248. synth_ai-0.1.9.dist-info/RECORD +0 -50
  249. /synth_ai/{zyk/lms/__init__.py → environments/reproducibility/helpers.py} +0 -0
  250. /synth_ai/{zyk/lms/caching → lm}/__init__.py +0 -0
  251. /synth_ai/{zyk/lms/core → lm/caching}/__init__.py +0 -0
  252. /synth_ai/{zyk/lms → lm}/caching/dbs.py +0 -0
  253. /synth_ai/{zyk/lms/cost → lm/core}/__init__.py +0 -0
  254. /synth_ai/{zyk/lms → lm}/core/exceptions.py +0 -0
  255. /synth_ai/{zyk/lms/structured_outputs → lm/cost}/__init__.py +0 -0
  256. /synth_ai/{zyk/lms/vendors → lm/structured_outputs}/__init__.py +0 -0
  257. /synth_ai/{zyk/lms → lm}/tools/__init__.py +0 -0
  258. /synth_ai/{zyk/lms → lm}/tools/base.py +0 -0
  259. /synth_ai/{zyk/lms/vendors/core → lm/vendors}/__init__.py +0 -0
  260. /synth_ai/{zyk/lms → lm}/vendors/base.py +0 -0
  261. /synth_ai/{zyk/lms/vendors/local → lm/vendors/core}/__init__.py +0 -0
  262. /synth_ai/{zyk/lms/vendors/supported → lm/vendors/local}/__init__.py +0 -0
  263. /synth_ai/{zyk/lms → lm}/vendors/local/ollama.py +0 -0
  264. {synth_ai-0.1.9.dist-info → synth_ai-0.2.1.dev0.dist-info}/WHEEL +0 -0
  265. {synth_ai-0.1.9.dist-info → synth_ai-0.2.1.dev0.dist-info}/licenses/LICENSE +0 -0
  266. {synth_ai-0.1.9.dist-info → synth_ai-0.2.1.dev0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,470 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script to run ReAct agents against TicTacToe environment on synth service (port 8901)
4
+ Tests on multiple TicTacToe instances with random opponent moves
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ import uuid
10
+ from datetime import datetime
11
+ from typing import Dict, Any, Optional, List
12
+ from pydantic import BaseModel, Field
13
+ from httpx import AsyncClient
14
+ import sys
15
+ import os
16
+
17
+ # Add the src directory to the path
18
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "src"))
19
+
20
+ from synth_ai.zyk import LM
21
+ from synth_ai.zyk.lms.tools.base import BaseTool
22
+
23
+
24
+ # --- Service Configuration ---
25
# Base URL of the environment service that every HTTP request is sent to.
SERVICE_BASE_URL = "http://localhost:8901"
# Model identifier handed to LM as both the main and the formatting model.
MODEL_NAME = "o3"
# Number of task instances taken from the front of the taskset.
NUM_INSTANCES = 5
MAX_TURNS = 9  # TicTacToe has at most 9 moves
# NOTE(review): not referenced anywhere in the visible part of this file.
DIFFICULTY = "random"
30
+
31
+
32
+ # --- Tool Definitions ---
33
class TicTacToeActionArgs(BaseModel):
    """Arguments for tictactoe actions."""

    # Ellipsis as the default spells out that both fields are required.
    action: str = Field(
        ...,
        description="Cell coordinate (e.g., A1, B2, C3)",
    )
    reasoning: str = Field(
        ...,
        description="Brief explanation of why this move was chosen",
    )
38
+
39
+
40
class TerminateArgs(BaseModel):
    """Arguments for termination."""

    # Required free-text explanation supplied by the agent when it gives up
    # or considers the game finished.
    reason: str = Field(
        ...,
        description="Reason for termination",
    )
44
+
45
+
46
class TicTacToeActionTool(BaseTool):
    """Tool for performing a move in the TicTacToe environment."""

    # Tool name the agent's decision is reported under.
    name: str = "tictactoe_interact"
    # Schema of the arguments accepted by this tool.
    arguments: type[BaseModel] = TicTacToeActionArgs
    # Split across two literals for line length only; the value is unchanged.
    description: str = (
        "Place your mark in a cell. "
        "Valid cells are A1-A3, B1-B3, C1-C3."
    )
52
+
53
+
54
class TerminateTool(BaseTool):
    """Tool to terminate the episode."""

    # Name checked by the episode runner to stop the turn loop.
    name: str = "terminate"
    # Schema of the arguments accepted by this tool.
    arguments: type[BaseModel] = TerminateArgs
    # Split across two literals for line length only; the value is unchanged.
    description: str = (
        "End the game when finished "
        "or no progress can be made."
    )
60
+
61
+
62
+ # --- Base ReAct Agent ---
63
class BaseReActAgent:
    """Base ReAct agent that asks an LLM for one tool call per turn.

    Attributes set by ``__init__``:
        llm: LLM handle used by ``decide``.
        max_turns: Turn budget, also shown to the LLM in each prompt.
        verbose: When true, a warning is printed if the LLM returns no tool
            call.
        history: List initialised empty; this class never reads it.
        system_name: Label identifying the agent.
        tools: Tool instances passed to the LLM on every call.
    """

    def __init__(self, llm: LM, max_turns: int = 9, verbose: bool = False):
        self.llm = llm
        self.max_turns = max_turns
        self.verbose = verbose
        self.history = []
        self.system_name = "base-react-agent"

        # Tools offered to the LLM on every call.
        self.tools = [
            TicTacToeActionTool(),
            TerminateTool(),
        ]

    async def decide(self, obs: str, system_message: str, turn: int) -> Dict[str, Any]:
        """Get the agent's decision for the current observation.

        Args:
            obs: Already formatted observation text.
            system_message: System prompt passed to the LLM.
            turn: Zero-based turn index (shown to the LLM as ``turn + 1``).

        Returns:
            ``{"name": <tool name>, "parameters": <tool arguments>}``. When the
            LLM returns no tool call, a default ``tictactoe_interact`` move on
            ``B2`` is returned instead.

        Raises:
            json.JSONDecodeError: If the tool-call arguments are a non-empty
                string that is not valid JSON.
        """
        context = f"Turn {turn + 1}/{self.max_turns}\n\n{obs}"

        response_obj = await self.llm.respond_async(
            system_message=system_message, user_message=context, tools=self.tools
        )

        tool_calls = response_obj.tool_calls

        # Graceful fallback when tool_calls is None or empty.
        if not tool_calls:
            if self.verbose:
                print("[WARNING] No tool calls returned by LLM, using default action")
            return {
                "name": "tictactoe_interact",
                "parameters": {
                    "action": "B2",  # Center is usually a safe default
                    "reasoning": "Default action - no tool call received",
                },
            }

        # Only the first tool call is used; any further calls are ignored.
        tool_call_data = tool_calls[0]

        # Tool calls may arrive either as plain dicts or as objects.
        if isinstance(tool_call_data, dict):
            function = tool_call_data["function"]
            tool_name = function["name"]
            raw_arguments = function["arguments"]
        else:
            tool_name = tool_call_data.function.name
            raw_arguments = tool_call_data.function.arguments

        # Arguments are normally a JSON string, but tolerate payloads that are
        # missing, empty, or already decoded instead of crashing in json.loads.
        if raw_arguments is None or raw_arguments == "":
            tool_arguments = {}
        elif isinstance(raw_arguments, (str, bytes, bytearray)):
            tool_arguments = json.loads(raw_arguments)
        else:
            tool_arguments = raw_arguments

        return {"name": tool_name, "parameters": tool_arguments}
116
+
117
+
118
+ # --- TicTacToe ReAct Agent ---
119
class TicTacToeReActAgent(BaseReActAgent):
    """ReAct agent for the TicTacToe environment.

    Adds the TicTacToe system prompt and an observation formatter on top of
    the base agent.
    """

    def __init__(self, llm: LM, max_turns: int = 9, verbose: bool = False):
        super().__init__(llm, max_turns, verbose)
        self.system_name = "tictactoe-react-agent"

    def get_system_message(self) -> str:
        """Return the fixed system prompt describing rules and strategy."""
        # The header row of the coordinate grid is indented by two spaces so
        # the column digits line up with the cells below them.
        return """You are playing TicTacToe against a random opponent. Your goal is to win or at least force a draw.

CRITICAL RULES:
- You play on a 3x3 grid with cells labeled A1-A3, B1-B3, C1-C3
- You MUST ONLY choose from cells listed as "Available" in the observation
- NEVER choose cells listed as "Occupied" - this will cause an illegal move and immediate loss
- Get three of your marks in a row (horizontally, vertically, or diagonally) to win
- If no one gets three in a row and the board is full, it's a draw

STRATEGY:
1. Try to get three in a row to win
2. Block your opponent from getting three in a row
3. Take center (B2) if available - it's usually the best opening
4. Take corners if center is not available
5. Avoid giving opponent easy wins

COORDINATE SYSTEM:
  1 2 3
A . . .
B . . .
C . . .

IMPORTANT: Always check the "Available" cells list in the observation and ONLY choose from those cells. Choosing an occupied cell will result in an illegal move and automatic loss."""

    @staticmethod
    def _cell_status_lines(board_text: str) -> List[str]:
        """Derive the "Cell Status" section from the rendered board text.

        The first line of the board is treated as a header and skipped; the
        next three lines are read as rows A, B and C. Within a row the two
        leading characters (row label and separator) are dropped and the
        cells are read at offsets 0, 2 and 4, giving columns 1, 2 and 3.

        Returns an empty list when the board has fewer than four lines.
        """
        # NOTE(review): the header row is never inspected, so this assumes the
        # displayed row/column order matches the A-C / 1-3 coordinates used in
        # the system prompt - confirm against the engine's board rendering.
        board_lines = board_text.strip().split("\n")
        if len(board_lines) < 4:
            return []

        occupied = []
        available = []
        for row_label, line in zip("ABC", board_lines[1:4]):
            # Slicing is safe on short lines. A row must never be skipped:
            # strip() above can reduce a blank last row to just its label,
            # and its cells still have to be reported as available.
            cell_chars = line[2:]
            for col_number, pos in enumerate((0, 2, 4), start=1):
                mark = cell_chars[pos].strip() if pos < len(cell_chars) else ""
                coord = f"{row_label}{col_number}"
                if mark in ("X", "O"):
                    occupied.append(f"{coord}={mark}")
                else:
                    available.append(coord)

        lines = ["\nCell Status:"]
        if occupied:
            lines.append(f" Occupied: {', '.join(occupied)}")
        if available:
            lines.append(f" Available: {', '.join(available)}")
        return lines

    def format_observation(self, obs: Dict[str, Any]) -> str:
        """Format a TicTacToe observation as text for the LLM.

        Reads the optional keys ``board_text``, ``current_player``,
        ``last_move``, ``move_count``, ``winner``, ``reward_last`` and
        ``error``; keys that are absent (or falsy, where checked) are omitted
        from the output.

        Args:
            obs: Observation dictionary returned by the environment service.

        Returns:
            The formatted sections joined by newlines.
        """
        parts = []

        if "board_text" in obs:
            parts.append("Current Board:")
            parts.append(obs["board_text"])
            # Add explicit cell status for clarity.
            parts.extend(self._cell_status_lines(obs["board_text"]))

        if "current_player" in obs:
            parts.append(f"\nCurrent Player: {obs['current_player']}")

        if "last_move" in obs and obs["last_move"]:
            parts.append(f"Last Move: {obs['last_move']}")

        if "move_count" in obs:
            parts.append(f"Move Count: {obs['move_count']}/9")

        if "winner" in obs and obs["winner"]:
            parts.append(f"\nGame Result: {obs['winner']}")

        if "reward_last" in obs and obs["reward_last"] != 0:
            parts.append(f"Reward: {obs['reward_last']}")

        if "error" in obs:
            parts.append(f"\nError: {obs['error']}")

        return "\n".join(parts)
232
+
233
+
234
+ # Random opponent moves are now handled by the TicTacToe environment internally
235
+
236
+
237
+ # --- Episode Runner ---
238
async def run_single_episode(
    client: AsyncClient, agent: TicTacToeReActAgent, task_instance, instance_num: int
) -> Dict[str, Any]:
    """Run a single TicTacToe episode and return episode metrics.

    Args:
        client: HTTP client used for the environment service requests.
        agent: Agent that formats observations and picks moves.
        task_instance: Object exposing an async ``serialize()`` plus
            ``metadata.starting_player`` and ``metadata.opening_moves``.
        instance_num: Number used only to label the printed output.

    Returns:
        A dict with ``eval_metric`` (1.0 win, 0.5 draw, 0.0 otherwise),
        an empty ``rubric`` and an ``error`` flag. Games that reach a
        terminal state also carry ``result`` and ``agent_player``.
        Failures to create the environment and unexpected exceptions are
        reported with ``error`` set to true.
    """
    env_id = None
    try:
        # Create environment using the task instance
        create_resp = await client.post(
            "/env/TicTacToe/initialize", json={"task_instance": await task_instance.serialize()}
        )

        if create_resp.status_code != 200:
            print(
                f" Instance {instance_num}: Failed to create environment - {create_resp.status_code}: {create_resp.text}"
            )
            return {"eval_metric": 0.0, "rubric": {}, "error": True}

        created = create_resp.json()
        env_id = created["env_id"]

        # Get initial observation
        obs = created["observation"]
        formatted_obs = agent.format_observation(obs)

        # DEBUG: Print initial state
        print(f"\n Instance {instance_num}: Starting TicTacToe game")
        print(f" Agent plays as: {task_instance.metadata.starting_player}")
        print(f" Opening moves: {task_instance.metadata.opening_moves}")
        print(" Initial observation:")
        print(f" {formatted_obs}")

        # Track game state
        agent_player = task_instance.metadata.starting_player
        print(f" DEBUG: agent_player = {agent_player}")

        # Run episode - TicTacToe handles opponent moves automatically
        for turn in range(agent.max_turns):
            # Check if game is already terminated
            if obs.get("terminated", False):
                break

            # Agent makes a move
            action = await agent.decide(formatted_obs, agent.get_system_message(), turn)
            params = action["parameters"]

            # Check for termination first: a terminate call carries "reason"
            # rather than "action", so reading params["action"] before this
            # check would raise KeyError.
            if action["name"] == "terminate":
                print(f" Agent terminated: {params.get('reason', 'no reason given')}")
                break

            action_name = params["action"]

            # DEBUG: Print agent decision
            print(
                f" Turn {turn + 1}: Agent chose '{action_name}' - {params.get('reasoning', 'no reasoning')}"
            )

            # Execute action in environment
            step_resp = await client.post(
                "/env/TicTacToe/step",
                json={
                    "env_id": env_id,
                    "request_id": str(uuid.uuid4()),
                    "action": {
                        "tool_calls": [{"tool": "interact", "args": {"action": action_name}}]
                    },
                },
            )

            if step_resp.status_code != 200:
                print(f" ❌ Step failed: {step_resp.status_code}: {step_resp.text}")
                break

            obs = step_resp.json()["observation"]
            formatted_obs = agent.format_observation(obs)

            # Update history
            agent.history.append(f"{action_name}: {params.get('reasoning', '')[:50]}")

            # DEBUG: Print state after action
            print(" After move:")
            print(f" {formatted_obs}")

            # Check if game ended
            terminated = obs.get("terminated", False)
            winner = obs.get("winner")

            if terminated:
                # DEBUG: Print evaluation details
                print(
                    f" DEBUG: Game ended - winner='{winner}', agent_player='{agent_player}', winner==agent_player={winner == agent_player}"
                )

                # Calculate eval metric
                if winner == agent_player:
                    eval_metric = 1.0
                    print(f" ✅ Instance {instance_num}: SUCCESS! Agent won as {agent_player}")
                elif winner == "draw":
                    eval_metric = 0.5
                    print(f" ⚪ Instance {instance_num}: DRAW - acceptable result")
                else:
                    eval_metric = 0.0
                    print(f" ❌ Instance {instance_num}: Agent lost to random opponent")

                await client.post("/env/TicTacToe/terminate", json={"env_id": env_id})
                return {
                    "eval_metric": eval_metric,
                    "rubric": {},  # No rubric for TicTacToe
                    "result": winner,
                    "agent_player": agent_player,
                    "error": False,
                }

        print(f" ❌ Instance {instance_num}: Game didn't finish in {agent.max_turns} turns")

        # Cleanup
        await client.post("/env/TicTacToe/terminate", json={"env_id": env_id})
        return {"eval_metric": 0.0, "rubric": {}, "error": False}

    except Exception as e:
        print(f" Instance {instance_num}: Error - {e}")
        import traceback

        traceback.print_exc()

        # Best-effort cleanup so a failed episode does not leave its
        # environment alive on the service.
        if env_id is not None:
            try:
                await client.post("/env/TicTacToe/terminate", json={"env_id": env_id})
            except Exception as cleanup_error:
                print(f" Instance {instance_num}: Cleanup failed - {cleanup_error}")

        return {"eval_metric": 0.0, "rubric": {}, "error": True}
363
+
364
+
365
+ # --- Batch Evaluation ---
366
async def evaluate_tictactoe_batch() -> Dict[str, Any]:
    """Play the first NUM_INSTANCES taskset games concurrently and aggregate.

    Returns:
        A dict with ``eval_metrics`` (per-episode scores of the episodes that
        did not report an error), ``mean_eval_metric``, an empty
        ``mean_rubric`` and ``num_episodes``. When every episode errored the
        list is empty and both the mean and the count are zero.
    """
    print(f"🎯 Evaluating TicTacToe on {NUM_INSTANCES} instances with random opponent...")

    # One LM handle is shared by all agents of the batch.
    llm = LM(model_name=MODEL_NAME, formatting_model_name=MODEL_NAME, temperature=0.0)

    # Imported here, at call time, rather than at module import.
    from synth_ai.environments.examples.tictactoe.taskset import create_tictactoe_taskset

    taskset = await create_tictactoe_taskset()
    selected = taskset.instances[:NUM_INSTANCES]

    print(f" 📝 Using {len(selected)} task instances from taskset")

    async with AsyncClient(base_url=SERVICE_BASE_URL, timeout=30.0) as client:
        # Each episode gets its own agent; instance numbers start at 1.
        episodes = [
            run_single_episode(
                client,
                TicTacToeReActAgent(llm, max_turns=MAX_TURNS, verbose=False),
                instance,
                number,
            )
            for number, instance in enumerate(selected, start=1)
        ]
        results = await asyncio.gather(*episodes)

    # Episodes that reported an error are excluded from the statistics.
    scores = [
        outcome["eval_metric"] for outcome in results if not outcome.get("error", False)
    ]

    if not scores:
        return {
            "eval_metrics": [],
            "mean_eval_metric": 0.0,
            "mean_rubric": {},
            "num_episodes": 0,
        }

    return {
        "eval_metrics": scores,
        "mean_eval_metric": sum(scores) / len(scores),
        "mean_rubric": {},  # No rubric for TicTacToe
        "num_episodes": len(scores),
    }
409
+
410
+
411
async def main():
    """Print a run banner, check the service, run the batch and report.

    Returns early, after printing a message, when the health request fails
    or the service does not list ``TicTacToe`` among its supported
    environments. Exceptions raised by the evaluation are printed, not
    propagated.
    """
    banner = (
        "🎮 TicTacToe ReAct Agent Evaluation",
        f"Model: {MODEL_NAME}",
        f"Service: {SERVICE_BASE_URL}",
        f"Instances: {NUM_INSTANCES}",
        f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "=" * 50,
    )
    for banner_line in banner:
        print(banner_line)

    # Test service health
    async with AsyncClient(base_url=SERVICE_BASE_URL, timeout=10.0) as client:
        try:
            health_resp = await client.get("/health")
            supported = health_resp.json().get("supported_environments", [])
        except Exception as e:
            print(f"❌ Service health check failed: {e}")
            return

        if "TicTacToe" not in supported:
            print("❌ TicTacToe not available on service")
            return

        print("✅ Service health check passed")

    # Run evaluation
    try:
        results = await evaluate_tictactoe_batch()

        rule = "=" * 80
        print("\n" + rule)
        print("🏆 FINAL TICTACTOE EVALUATION RESULTS")
        print(rule)

        # Eval metrics
        formatted_scores = [f"{score:.1f}" for score in results["eval_metrics"]]
        print("📊 EVAL METRICS:")
        print(f" Episodes: {results['num_episodes']}")
        print(f" Individual Scores: {formatted_scores}")
        print(f" Mean Eval Metric: {results['mean_eval_metric']:.2f}")

        # Rubric results (none for TicTacToe)
        print("\n🎯 RUBRIC RESULTS:")
        print(" No rubric for TicTacToe")

        # Overall assessment: the first threshold the mean strictly exceeds
        # decides the verdict; otherwise the "poor" message is used.
        print("\n🔍 ASSESSMENT:")
        mean_score = results["mean_eval_metric"]
        verdicts = (
            (0.8, "🎉 Excellent performance against random opponent!"),
            (0.6, "✅ Good performance!"),
            (0.4, "⚠️ Moderate performance"),
        )
        fallback = "❌ Poor performance - struggling against random moves"
        print(next((text for floor, text in verdicts if mean_score > floor), fallback))

    except Exception as e:
        print(f"❌ Evaluation failed: {e}")
467
+
468
+
469
# Script entry point: run the async evaluation only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())