synth-ai 0.2.0__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266) hide show
  1. synth_ai/__init__.py +28 -2
  2. synth_ai/core/system.py +4 -0
  3. synth_ai/environments/__init__.py +35 -0
  4. synth_ai/environments/environment/__init__.py +1 -0
  5. synth_ai/environments/environment/artifacts/__init__.py +1 -0
  6. synth_ai/environments/environment/artifacts/base.py +50 -0
  7. synth_ai/environments/environment/core.py +22 -0
  8. synth_ai/environments/environment/db/__init__.py +1 -0
  9. synth_ai/environments/environment/db/sqlite.py +45 -0
  10. synth_ai/environments/environment/registry.py +24 -0
  11. synth_ai/environments/environment/resources/sqlite.py +46 -0
  12. synth_ai/environments/environment/results.py +1 -0
  13. synth_ai/environments/environment/rewards/__init__.py +1 -0
  14. synth_ai/environments/environment/rewards/core.py +28 -0
  15. synth_ai/environments/environment/shared_engine.py +26 -0
  16. synth_ai/environments/environment/tools/__init__.py +34 -0
  17. synth_ai/environments/examples/__init__.py +1 -0
  18. synth_ai/environments/examples/crafter_classic/__init__.py +8 -0
  19. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +58 -0
  20. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
  21. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
  22. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +51 -0
  23. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +872 -0
  24. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
  25. synth_ai/environments/examples/crafter_classic/agent_demos/test_crafter_react_agent.py +1110 -0
  26. synth_ai/environments/examples/crafter_classic/config_logging.py +111 -0
  27. synth_ai/environments/examples/crafter_classic/engine.py +502 -0
  28. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +63 -0
  29. synth_ai/environments/examples/crafter_classic/engine_helpers/action_map.py +5 -0
  30. synth_ai/environments/examples/crafter_classic/engine_helpers/serialization.py +74 -0
  31. synth_ai/environments/examples/crafter_classic/environment.py +255 -0
  32. synth_ai/environments/examples/crafter_classic/taskset.py +228 -0
  33. synth_ai/environments/examples/enron/agent_demos/test_synth_react.py +535 -0
  34. synth_ai/environments/examples/enron/art_helpers/email_search_tools.py +156 -0
  35. synth_ai/environments/examples/enron/art_helpers/local_email_db.py +280 -0
  36. synth_ai/environments/examples/enron/art_helpers/types_enron.py +24 -0
  37. synth_ai/environments/examples/enron/engine.py +291 -0
  38. synth_ai/environments/examples/enron/environment.py +165 -0
  39. synth_ai/environments/examples/enron/taskset.py +112 -0
  40. synth_ai/environments/examples/enron/units/keyword_stats.py +111 -0
  41. synth_ai/environments/examples/enron/units/test_email_index.py +8 -0
  42. synth_ai/environments/examples/minigrid/__init__.py +48 -0
  43. synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
  44. synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +47 -0
  45. synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
  46. synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +220 -0
  47. synth_ai/environments/examples/minigrid/agent_demos/test_minigrid_react_agent.py +393 -0
  48. synth_ai/environments/examples/minigrid/engine.py +589 -0
  49. synth_ai/environments/examples/minigrid/environment.py +274 -0
  50. synth_ai/environments/examples/minigrid/environment_mapping.py +242 -0
  51. synth_ai/environments/examples/minigrid/puzzle_loader.py +416 -0
  52. synth_ai/environments/examples/minigrid/taskset.py +583 -0
  53. synth_ai/environments/examples/minigrid/units/test_action_behavior.py +226 -0
  54. synth_ai/environments/examples/minigrid/units/test_debug_messages.py +83 -0
  55. synth_ai/environments/examples/minigrid/units/test_exploration.py +120 -0
  56. synth_ai/environments/examples/minigrid/units/test_minigrid_engine.py +214 -0
  57. synth_ai/environments/examples/minigrid/units/test_minigrid_environment.py +238 -0
  58. synth_ai/environments/examples/minigrid/units/test_minigrid_environment_mapping.py +301 -0
  59. synth_ai/environments/examples/minigrid/units/test_minigrid_taskset.py +210 -0
  60. synth_ai/environments/examples/nethack/__init__.py +7 -0
  61. synth_ai/environments/examples/nethack/achievements.py +337 -0
  62. synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
  63. synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
  64. synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +832 -0
  65. synth_ai/environments/examples/nethack/agent_demos/test_nethack_react_agent.py +1112 -0
  66. synth_ai/environments/examples/nethack/engine.py +738 -0
  67. synth_ai/environments/examples/nethack/environment.py +255 -0
  68. synth_ai/environments/examples/nethack/helpers/__init__.py +42 -0
  69. synth_ai/environments/examples/nethack/helpers/action_mapping.py +301 -0
  70. synth_ai/environments/examples/nethack/helpers/nle_wrapper.py +401 -0
  71. synth_ai/environments/examples/nethack/helpers/observation_utils.py +433 -0
  72. synth_ai/environments/examples/nethack/helpers/recording_wrapper.py +201 -0
  73. synth_ai/environments/examples/nethack/helpers/trajectory_recorder.py +268 -0
  74. synth_ai/environments/examples/nethack/helpers/visualization/replay_viewer.py +308 -0
  75. synth_ai/environments/examples/nethack/helpers/visualization/visualizer.py +430 -0
  76. synth_ai/environments/examples/nethack/taskset.py +323 -0
  77. synth_ai/environments/examples/nethack/units/test_nethack_engine.py +277 -0
  78. synth_ai/environments/examples/nethack/units/test_nethack_environment.py +281 -0
  79. synth_ai/environments/examples/nethack/units/test_nethack_taskset.py +213 -0
  80. synth_ai/environments/examples/nethack/units/test_recording.py +307 -0
  81. synth_ai/environments/examples/red/__init__.py +7 -0
  82. synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
  83. synth_ai/environments/examples/red/agent_demos/test_synth_react.py +1471 -0
  84. synth_ai/environments/examples/red/config_logging.py +110 -0
  85. synth_ai/environments/examples/red/engine.py +693 -0
  86. synth_ai/environments/examples/red/engine_helpers/__init__.py +1 -0
  87. synth_ai/environments/examples/red/engine_helpers/memory_map.py +28 -0
  88. synth_ai/environments/examples/red/engine_helpers/reward_components.py +275 -0
  89. synth_ai/environments/examples/red/engine_helpers/reward_library/__init__.py +142 -0
  90. synth_ai/environments/examples/red/engine_helpers/reward_library/adaptive_rewards.py +56 -0
  91. synth_ai/environments/examples/red/engine_helpers/reward_library/battle_rewards.py +283 -0
  92. synth_ai/environments/examples/red/engine_helpers/reward_library/composite_rewards.py +149 -0
  93. synth_ai/environments/examples/red/engine_helpers/reward_library/economy_rewards.py +137 -0
  94. synth_ai/environments/examples/red/engine_helpers/reward_library/efficiency_rewards.py +56 -0
  95. synth_ai/environments/examples/red/engine_helpers/reward_library/exploration_rewards.py +330 -0
  96. synth_ai/environments/examples/red/engine_helpers/reward_library/novelty_rewards.py +120 -0
  97. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_rewards.py +558 -0
  98. synth_ai/environments/examples/red/engine_helpers/reward_library/pokemon_rewards.py +312 -0
  99. synth_ai/environments/examples/red/engine_helpers/reward_library/social_rewards.py +147 -0
  100. synth_ai/environments/examples/red/engine_helpers/reward_library/story_rewards.py +246 -0
  101. synth_ai/environments/examples/red/engine_helpers/screen_analysis.py +367 -0
  102. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +139 -0
  103. synth_ai/environments/examples/red/environment.py +235 -0
  104. synth_ai/environments/examples/red/taskset.py +77 -0
  105. synth_ai/environments/examples/red/test_fixes.py +125 -0
  106. synth_ai/environments/examples/red/test_fixes_mock.py +148 -0
  107. synth_ai/environments/examples/red/units/__init__.py +1 -0
  108. synth_ai/environments/examples/red/units/test_basic_functionality.py +97 -0
  109. synth_ai/environments/examples/red/units/test_button_press_requirements.py +217 -0
  110. synth_ai/environments/examples/red/units/test_engine.py +192 -0
  111. synth_ai/environments/examples/red/units/test_environment.py +455 -0
  112. synth_ai/environments/examples/red/units/test_exploration_strategy.py +227 -0
  113. synth_ai/environments/examples/red/units/test_integration.py +217 -0
  114. synth_ai/environments/examples/red/units/test_memory_extraction.py +111 -0
  115. synth_ai/environments/examples/red/units/test_menu_bug_reproduction.py +1100 -0
  116. synth_ai/environments/examples/red/units/test_movement_debug.py +255 -0
  117. synth_ai/environments/examples/red/units/test_pokemon_mcts_debug.py +163 -0
  118. synth_ai/environments/examples/red/units/test_pokemon_mcts_verbose.py +117 -0
  119. synth_ai/environments/examples/red/units/test_red_basic.py +145 -0
  120. synth_ai/environments/examples/red/units/test_red_comprehensive.py +323 -0
  121. synth_ai/environments/examples/red/units/test_retry_movement.py +195 -0
  122. synth_ai/environments/examples/red/units/test_reward_components.py +186 -0
  123. synth_ai/environments/examples/red/units/test_rom_integration.py +260 -0
  124. synth_ai/environments/examples/red/units/test_taskset.py +116 -0
  125. synth_ai/environments/examples/red/units/test_tree.py +448 -0
  126. synth_ai/environments/examples/sokoban/__init__.py +1 -0
  127. synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +900 -0
  128. synth_ai/environments/examples/sokoban/agent_demos/test_dspy_react.py +1 -0
  129. synth_ai/environments/examples/sokoban/agent_demos/test_sokoban_react_agent.py +498 -0
  130. synth_ai/environments/examples/sokoban/agent_demos/test_synth_lats.py +1 -0
  131. synth_ai/environments/examples/sokoban/agent_demos/test_synth_react_locally.py +748 -0
  132. synth_ai/environments/examples/sokoban/agent_demos/test_synth_react_service.py +296 -0
  133. synth_ai/environments/examples/sokoban/engine.py +675 -0
  134. synth_ai/environments/examples/sokoban/engine_helpers/__init__.py +1 -0
  135. synth_ai/environments/examples/sokoban/engine_helpers/room_utils.py +656 -0
  136. synth_ai/environments/examples/sokoban/engine_helpers/vendored/__init__.py +17 -0
  137. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/__init__.py +3 -0
  138. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/boxoban_env.py +129 -0
  139. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/render_utils.py +370 -0
  140. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/room_utils.py +331 -0
  141. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env.py +305 -0
  142. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_fixed_targets.py +66 -0
  143. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_pull.py +114 -0
  144. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_two_player.py +122 -0
  145. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_variations.py +394 -0
  146. synth_ai/environments/examples/sokoban/environment.py +228 -0
  147. synth_ai/environments/examples/sokoban/generate_verified_puzzles.py +438 -0
  148. synth_ai/environments/examples/sokoban/puzzle_loader.py +311 -0
  149. synth_ai/environments/examples/sokoban/taskset.py +425 -0
  150. synth_ai/environments/examples/sokoban/units/astar_common.py +94 -0
  151. synth_ai/environments/examples/sokoban/units/test_building_task_set.py +49 -0
  152. synth_ai/environments/examples/sokoban/units/test_false_positive.py +120 -0
  153. synth_ai/environments/examples/sokoban/units/test_simple_run_through_environment.py +119 -0
  154. synth_ai/environments/examples/sokoban/units/test_sokoban_environment.py +98 -0
  155. synth_ai/environments/examples/sokoban/units/test_tree.py +364 -0
  156. synth_ai/environments/examples/tictactoe/__init__.py +1 -0
  157. synth_ai/environments/examples/tictactoe/agent_demos/test_synth_react.py +266 -0
  158. synth_ai/environments/examples/tictactoe/agent_demos/test_tictactoe_react_agent.py +470 -0
  159. synth_ai/environments/examples/tictactoe/engine.py +368 -0
  160. synth_ai/environments/examples/tictactoe/environment.py +239 -0
  161. synth_ai/environments/examples/tictactoe/taskset.py +214 -0
  162. synth_ai/environments/examples/tictactoe/units/test_tictactoe_engine.py +393 -0
  163. synth_ai/environments/examples/tictactoe/units/test_tictactoe_environment.py +493 -0
  164. synth_ai/environments/examples/tictactoe/units/test_tictactoe_taskset.py +191 -0
  165. synth_ai/environments/examples/verilog/__init__.py +10 -0
  166. synth_ai/environments/examples/verilog/agent_demos/test_synth_react.py +520 -0
  167. synth_ai/environments/examples/verilog/engine.py +328 -0
  168. synth_ai/environments/examples/verilog/environment.py +349 -0
  169. synth_ai/environments/examples/verilog/taskset.py +418 -0
  170. synth_ai/environments/examples/verilog/units/test_verilog_engine.py +466 -0
  171. synth_ai/environments/examples/verilog/units/test_verilog_environment.py +585 -0
  172. synth_ai/environments/examples/verilog/units/test_verilog_integration.py +383 -0
  173. synth_ai/environments/examples/verilog/units/test_verilog_taskset.py +457 -0
  174. synth_ai/environments/reproducibility/core.py +42 -0
  175. synth_ai/environments/reproducibility/tree.py +364 -0
  176. synth_ai/environments/service/app.py +78 -0
  177. synth_ai/environments/service/core_routes.py +775 -0
  178. synth_ai/environments/service/external_registry.py +57 -0
  179. synth_ai/environments/service/registry.py +9 -0
  180. synth_ai/environments/stateful/__init__.py +1 -0
  181. synth_ai/environments/stateful/core.py +28 -0
  182. synth_ai/environments/stateful/engine.py +21 -0
  183. synth_ai/environments/stateful/state.py +7 -0
  184. synth_ai/environments/tasks/api.py +19 -0
  185. synth_ai/environments/tasks/core.py +78 -0
  186. synth_ai/environments/tasks/filters.py +39 -0
  187. synth_ai/environments/tasks/utils.py +89 -0
  188. synth_ai/environments/v0_observability/history.py +3 -0
  189. synth_ai/environments/v0_observability/log.py +2 -0
  190. synth_ai/lm/caching/constants.py +1 -0
  191. synth_ai/{zyk/lms → lm}/caching/ephemeral.py +4 -8
  192. synth_ai/{zyk/lms → lm}/caching/handler.py +15 -15
  193. synth_ai/{zyk/lms → lm}/caching/initialize.py +2 -4
  194. synth_ai/{zyk/lms → lm}/caching/persistent.py +4 -10
  195. synth_ai/{zyk/lms → lm}/config.py +2 -1
  196. synth_ai/{zyk/lms → lm}/constants.py +2 -2
  197. synth_ai/{zyk/lms → lm}/core/all.py +10 -10
  198. synth_ai/{zyk/lms → lm}/core/main.py +57 -33
  199. synth_ai/{zyk/lms → lm}/core/vendor_clients.py +12 -10
  200. synth_ai/lm/cost/monitor.py +1 -0
  201. synth_ai/lm/cost/statefulness.py +1 -0
  202. synth_ai/lm/provider_support/__init__.py +8 -0
  203. synth_ai/lm/provider_support/anthropic.py +945 -0
  204. synth_ai/lm/provider_support/openai.py +1115 -0
  205. synth_ai/lm/provider_support/suppress_logging.py +31 -0
  206. synth_ai/{zyk/lms → lm}/structured_outputs/handler.py +58 -80
  207. synth_ai/{zyk/lms → lm}/structured_outputs/inject.py +6 -20
  208. synth_ai/{zyk/lms → lm}/structured_outputs/rehabilitate.py +6 -12
  209. synth_ai/{zyk/lms → lm}/vendors/core/anthropic_api.py +21 -30
  210. synth_ai/{zyk/lms → lm}/vendors/core/gemini_api.py +35 -32
  211. synth_ai/{zyk/lms → lm}/vendors/core/mistral_api.py +19 -28
  212. synth_ai/{zyk/lms → lm}/vendors/core/openai_api.py +26 -36
  213. synth_ai/{zyk/lms → lm}/vendors/openai_standard.py +29 -33
  214. synth_ai/{zyk/lms → lm}/vendors/retries.py +1 -1
  215. synth_ai/lm/vendors/supported/__init__.py +0 -0
  216. synth_ai/{zyk/lms → lm}/vendors/supported/custom_endpoint.py +131 -118
  217. synth_ai/{zyk/lms → lm}/vendors/supported/deepseek.py +4 -8
  218. synth_ai/{zyk/lms → lm}/vendors/supported/grok.py +6 -8
  219. synth_ai/{zyk/lms → lm}/vendors/supported/groq.py +1 -1
  220. synth_ai/{zyk/lms → lm}/vendors/supported/ollama.py +2 -2
  221. synth_ai/{zyk/lms → lm}/vendors/supported/openrouter.py +18 -16
  222. synth_ai/{zyk/lms → lm}/vendors/supported/together.py +1 -1
  223. synth_ai/tracing/__init__.py +0 -0
  224. synth_ai/tracing/abstractions.py +224 -0
  225. synth_ai/tracing/base_client.py +91 -0
  226. synth_ai/tracing/client_manager.py +131 -0
  227. synth_ai/tracing/config.py +140 -0
  228. synth_ai/tracing/context.py +146 -0
  229. synth_ai/tracing/decorators.py +679 -0
  230. synth_ai/tracing/events/__init__.py +0 -0
  231. synth_ai/tracing/events/manage.py +147 -0
  232. synth_ai/tracing/events/scope.py +86 -0
  233. synth_ai/tracing/events/store.py +227 -0
  234. synth_ai/tracing/immediate_client.py +152 -0
  235. synth_ai/tracing/local.py +18 -0
  236. synth_ai/tracing/log_client_base.py +74 -0
  237. synth_ai/tracing/retry_queue.py +187 -0
  238. synth_ai/tracing/trackers.py +515 -0
  239. synth_ai/tracing/upload.py +504 -0
  240. synth_ai/tracing/utils.py +9 -0
  241. synth_ai/zyk/__init__.py +28 -2
  242. synth_ai-0.2.1.dev0.dist-info/METADATA +349 -0
  243. synth_ai-0.2.1.dev0.dist-info/RECORD +261 -0
  244. {synth_ai-0.2.0.dist-info → synth_ai-0.2.1.dev0.dist-info}/WHEEL +1 -1
  245. synth_ai/zyk/lms/caching/constants.py +0 -1
  246. synth_ai/zyk/lms/cost/monitor.py +0 -1
  247. synth_ai/zyk/lms/cost/statefulness.py +0 -1
  248. synth_ai-0.2.0.dist-info/METADATA +0 -36
  249. synth_ai-0.2.0.dist-info/RECORD +0 -50
  250. /synth_ai/{zyk/lms/__init__.py → environments/reproducibility/helpers.py} +0 -0
  251. /synth_ai/{zyk/lms/caching → lm}/__init__.py +0 -0
  252. /synth_ai/{zyk/lms/core → lm/caching}/__init__.py +0 -0
  253. /synth_ai/{zyk/lms → lm}/caching/dbs.py +0 -0
  254. /synth_ai/{zyk/lms/cost → lm/core}/__init__.py +0 -0
  255. /synth_ai/{zyk/lms → lm}/core/exceptions.py +0 -0
  256. /synth_ai/{zyk/lms/structured_outputs → lm/cost}/__init__.py +0 -0
  257. /synth_ai/{zyk/lms/vendors → lm/structured_outputs}/__init__.py +0 -0
  258. /synth_ai/{zyk/lms → lm}/tools/__init__.py +0 -0
  259. /synth_ai/{zyk/lms → lm}/tools/base.py +0 -0
  260. /synth_ai/{zyk/lms/vendors/core → lm/vendors}/__init__.py +0 -0
  261. /synth_ai/{zyk/lms → lm}/vendors/base.py +0 -0
  262. /synth_ai/{zyk/lms/vendors/local → lm/vendors/core}/__init__.py +0 -0
  263. /synth_ai/{zyk/lms/vendors/supported → lm/vendors/local}/__init__.py +0 -0
  264. /synth_ai/{zyk/lms → lm}/vendors/local/ollama.py +0 -0
  265. {synth_ai-0.2.0.dist-info → synth_ai-0.2.1.dev0.dist-info/licenses}/LICENSE +0 -0
  266. {synth_ai-0.2.0.dist-info → synth_ai-0.2.1.dev0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,220 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Simple MiniGrid evaluation script to generate traces.
4
+ """
5
+
6
+ import asyncio
7
+ import json
8
+ import os
9
+ import sys
10
+ import uuid
11
+ from datetime import datetime
12
+ from pathlib import Path
13
+
14
+ # Add parent directories to path
15
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent.parent))
16
+
17
+ import gymnasium as gym
18
+ import minigrid
19
+ from minigrid.wrappers import ImgObsWrapper, RGBImgPartialObsWrapper
20
+ import numpy as np
21
+ import base64
22
+ from PIL import Image
23
+ import io
24
+
25
+
26
+ # Environment setup
27
def create_minigrid_env(env_name="MiniGrid-Empty-6x6-v0"):
    """Build the named MiniGrid environment wrapped for image observations.

    The base environment is wrapped first in ``RGBImgPartialObsWrapper`` and
    then in ``ImgObsWrapper``, in that order.
    """
    return ImgObsWrapper(RGBImgPartialObsWrapper(gym.make(env_name)))
34
+
35
+
36
def image_to_base64(image_array):
    """Encode a numpy image array as a base64 string of its PNG bytes."""
    png_bytes = io.BytesIO()
    # Cast to uint8 before handing the array to PIL, then serialize as PNG.
    Image.fromarray(image_array.astype(np.uint8)).save(png_bytes, format="PNG")
    # getvalue() returns the whole buffer regardless of the stream position.
    return base64.b64encode(png_bytes.getvalue()).decode("utf-8")
47
+
48
+
49
def get_action_name(action_idx):
    """Translate an action index into its name.

    Indices 0-6 map to left, right, forward, pickup, drop, toggle and done.
    Any other index yields the placeholder ``action_<index>``.
    """
    known_actions = dict(
        enumerate(("left", "right", "forward", "pickup", "drop", "toggle", "done"))
    )
    try:
        return known_actions[action_idx]
    except KeyError:
        return f"action_{action_idx}"
61
+
62
+
63
def _build_step_partition(env, obs, action, reward, terminated, truncated):
    """Return the trace partition describing a single environment step."""
    frame = obs if isinstance(obs, np.ndarray) else obs["image"]
    step_outputs = {
        "observation": {
            "mission": getattr(env.unwrapped, "mission", "Reach the goal"),
            "image_base64": image_to_base64(frame),
        },
        # Cast to builtin types: a sampled action is a NumPy integer, which
        # the json module refuses to serialize.
        "action": int(action),
        "reward": float(reward),
        "terminated": bool(terminated),
        "truncated": bool(truncated),
    }
    return {
        "events": [
            {"environment_compute_steps": [{"compute_output": [{"outputs": step_outputs}]}]}
        ]
    }


async def run_simple_minigrid_eval(
    model_name="simple-agent",
    env_name="MiniGrid-Empty-6x6-v0",
    num_episodes=3,
    max_steps=50,
):
    """Run a simple evaluation to generate MiniGrid traces.

    Each episode follows a random policy biased towards moving forward
    (60% "forward", otherwise a uniformly sampled action). One JSON trace per
    episode is written to ``src/evals/minigrid/<run_id>/traces`` and an
    ``evaluation_summary.json`` is written next to that directory.

    Args:
        model_name: Label recorded in the trace metadata and the summary.
        env_name: Gymnasium id of the MiniGrid environment to create.
        num_episodes: Number of episodes to run; the episode index is used as
            the environment reset seed and recorded as ``seed``.
        max_steps: Upper bound on the number of steps per episode.

    Returns:
        The summary dict that was written to ``evaluation_summary.json``.
    """
    print("\n🎮 Running MiniGrid Evaluation")
    print(f" Environment: {env_name}")
    print(f" Episodes: {num_episodes}")
    print(f" Max steps: {max_steps}")

    # Create output directory
    started_at = datetime.now()
    timestamp = started_at.strftime("%Y%m%d_%H%M%S")
    run_id = f"run_{int(started_at.timestamp())}"
    output_dir = Path(f"src/evals/minigrid/{run_id}")
    traces_dir = output_dir / "traces"
    traces_dir.mkdir(parents=True, exist_ok=True)

    results = []

    for episode in range(num_episodes):
        print(f"\n📍 Episode {episode + 1}/{num_episodes}")

        trace_id = str(uuid.uuid4())
        trace_data = {
            "trace": {
                "metadata": {
                    "model_name": model_name,
                    "env_name": env_name,
                    "difficulty": "easy",
                    "seed": episode,
                    "max_steps": max_steps,
                },
                "partition": [],
            },
            "dataset": {"reward_signals": []},
        }

        total_reward = 0.0
        # Defined up front so the success check below is valid even when the
        # loop body never runs (max_steps <= 0).
        last_reward = 0.0
        done = False
        step = 0

        env = create_minigrid_env(env_name)
        try:
            # Seed the reset with the episode index so the recorded "seed"
            # metadata actually describes the environment that was run.
            obs, _ = env.reset(seed=episode)

            while not done and step < max_steps:
                # Simple policy: random actions with bias towards forward
                if np.random.random() < 0.6:
                    action = 2  # forward
                else:
                    action = env.action_space.sample()

                next_obs, last_reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                total_reward += last_reward

                trace_data["trace"]["partition"].append(
                    _build_step_partition(
                        env, obs, action, last_reward, terminated, truncated
                    )
                )

                obs = next_obs
                step += 1

                if done and last_reward > 0:
                    print(f" ✅ Success! Reached goal in {step} steps")
        finally:
            # Release the environment's resources even if a step fails.
            env.close()

        if not done:
            print(f" ⏰ Timeout after {step} steps")

        success = bool(last_reward > 0)

        # Update trace metadata
        trace_data["trace"]["metadata"]["success"] = success
        trace_data["trace"]["metadata"]["num_steps"] = step
        trace_data["dataset"]["reward_signals"].append({"reward": float(total_reward)})

        # Save trace
        trace_file = traces_dir / f"minigrid_trace_{trace_id}.json"
        with open(trace_file, "w") as f:
            json.dump(trace_data, f, indent=2)

        results.append(
            {
                "trace_id": trace_id,
                "success": success,
                "steps": step,
                "total_reward": float(total_reward),
            }
        )

        print(f" 💾 Saved trace: {trace_file.name}")

    # Guard the averages so an empty run reports zeros instead of dividing
    # by zero.
    episode_count = len(results)
    if episode_count:
        success_rate = sum(1 for r in results if r["success"]) / episode_count
        avg_steps = sum(r["steps"] for r in results) / episode_count
    else:
        success_rate = 0.0
        avg_steps = 0.0

    # Save evaluation summary
    summary = {
        "run_id": run_id,
        "timestamp": timestamp,
        "environment": env_name,
        "model_name": model_name,
        "num_episodes": num_episodes,
        "results": results,
        "success_rate": success_rate,
        "avg_steps": avg_steps,
        "models_evaluated": [model_name],
        "difficulties_evaluated": ["easy"],
    }

    summary_file = output_dir / "evaluation_summary.json"
    with open(summary_file, "w") as f:
        json.dump(summary, f, indent=2)

    print("\n✅ Evaluation complete!")
    print(f" Success rate: {summary['success_rate']:.1%}")
    print(f" Average steps: {summary['avg_steps']:.1f}")
    print(f" Output directory: {output_dir}")

    return summary
214
+
215
+
216
if __name__ == "__main__":
    # Script entry point: run the async evaluation to completion with a short
    # configuration (3 episodes of at most 30 steps on the empty 6x6 grid).
    asyncio.run(
        run_simple_minigrid_eval(env_name="MiniGrid-Empty-6x6-v0", num_episodes=3, max_steps=30)
    )
@@ -0,0 +1,393 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script to run ReAct agents against MiniGrid environment on synth service (port 8901)
4
+ Tests on multiple easy MiniGrid instances with enhanced debugging
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ import uuid
10
+ from datetime import datetime
11
+ from typing import Dict, Any, Optional, List
12
+ from pydantic import BaseModel, Field
13
+ from httpx import AsyncClient
14
+ import sys
15
+ import os
16
+
17
+ # Add the src directory to the path
18
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))
19
+
20
+ from synth_ai.zyk import LM
21
+ from synth_ai.zyk.lms.tools.base import BaseTool
22
+
23
+
24
# --- Service Configuration ---
# Base URL of the synth environment service (the module docstring names port 8901).
SERVICE_BASE_URL = "http://localhost:8901"
# Model identifier for the agent's LLM — presumably passed to LM(); usage is outside this view.
MODEL_NAME = "o3"
# NOTE(review): presumably the number of task instances to evaluate — confirm against the runner.
NUM_INSTANCES = 1
# NOTE(review): presumably the per-episode turn budget handed to the agent — confirm.
MAX_TURNS = 20
# NOTE(review): presumably selects which task difficulty to load — confirm against the taskset.
DIFFICULTY = "ultra_easy"
30
+
31
+
32
+ # --- Tool Definitions ---
33
class NavigationActionArgs(BaseModel):
    """Arguments for navigation actions."""

    # NOTE(review): pydantic may surface the class docstring and the field
    # descriptions in the generated schema — treat them as runtime text.
    # Name of the action to perform; the description lists the accepted values.
    action: str = Field(
        description="The action to take: left, right, forward, pickup, drop, toggle, done"
    )
    # Free-text rationale supplied alongside the chosen action.
    reasoning: str = Field(description="Brief explanation of why this action was chosen")
40
+
41
+
42
class TerminateArgs(BaseModel):
    """Arguments for termination."""

    # Free-text explanation of why the episode is being ended.
    reason: str = Field(description="Reason for termination")
46
+
47
+
48
class NavigationActionTool(BaseTool):
    """Tool for performing an action in the MiniGrid environment."""

    # Tool name; BaseReActAgent.decide uses the same string for its fallback action.
    name: str = "navigation_action"
    # Pydantic model describing the arguments this tool accepts.
    arguments: type[BaseModel] = NavigationActionArgs
    # Human-readable summary of what the tool does.
    description: str = "Perform a navigation action in the MiniGrid environment."
54
+
55
+
56
class TerminateTool(BaseTool):
    """Tool to terminate the episode."""

    # Tool name; the episode runner compares the chosen tool name against "terminate".
    name: str = "terminate"
    # Pydantic model describing the arguments this tool accepts.
    arguments: type[BaseModel] = TerminateArgs
    # Human-readable summary of when the tool should be used.
    description: str = "End the episode when finished or no progress can be made."
62
+
63
+
64
+ # --- Base ReAct Agent ---
65
class BaseReActAgent:
    """Minimal ReAct-style agent that asks an LLM for one tool call per turn."""

    def __init__(self, llm: LM, max_turns: int = 15, verbose: bool = False):
        self.llm = llm
        self.max_turns = max_turns
        self.verbose = verbose
        self.history = []
        self.system_name = "base-react-agent"

        # Tools offered to the LLM on every turn.
        self.tools = [NavigationActionTool(), TerminateTool()]

    async def decide(self, obs: str, system_message: str, turn: int) -> Dict[str, Any]:
        """Ask the LLM for the next tool call given the current observation.

        Returns a dict with the chosen tool's ``name`` and its decoded
        ``parameters``. When the LLM returns no tool call, a default
        ``navigation_action`` moving forward is returned instead.
        """
        prompt = f"Turn {turn + 1}/{self.max_turns}\n\n{obs}"

        reply = await self.llm.respond_async(
            system_message=system_message, user_message=prompt, tools=self.tools
        )
        calls = reply.tool_calls

        if calls:
            # Only the first tool call is honoured.
            first_call = calls[0]

            # A tool call may arrive as a plain dict or as an object with attributes.
            if isinstance(first_call, dict):
                chosen_name = first_call["function"]["name"]
                raw_arguments = first_call["function"]["arguments"]
            else:
                chosen_name = first_call.function.name
                raw_arguments = first_call.function.arguments

            # Arguments are delivered as a JSON-encoded string.
            return {"name": chosen_name, "parameters": json.loads(raw_arguments)}

        # No tool call (None or empty): fall back to a harmless default move.
        if self.verbose:
            print("[WARNING] No tool calls returned by LLM, using default action")
        return {
            "name": "navigation_action",
            "parameters": {
                "action": "forward",
                "reasoning": "Default action - no tool call received",
            },
        }
118
+
119
+
120
+ # --- MiniGrid ReAct Agent ---
121
class MiniGridReActAgent(BaseReActAgent):
    """ReAct agent specialised for the MiniGrid environment."""

    def __init__(self, llm: LM, max_turns: int = 15, verbose: bool = False):
        super().__init__(llm, max_turns, verbose)
        self.system_name = "minigrid-react-agent"

    def get_system_message(self) -> str:
        """Return the fixed system prompt describing actions, symbols and strategy."""
        system_prompt = """You are navigating a MiniGrid environment. Your goal is to reach the goal (G) to complete the mission successfully.

ACTIONS:
- "left": turn left (counter-clockwise)
- "right": turn right (clockwise)
- "forward": move forward one step
- "pickup": pick up object in front of you
- "drop": drop carried object
- "toggle": open/close door or interact with object
- "done": complete mission when you reach the goal

SYMBOLS:
- # = wall (blocks movement)
- . = empty space (can move through)
- G = goal (your destination)
- K = key (pick up to unlock doors)
- D = door (may need key to open)
- L = lava (avoid - will end mission)
- @ = you (your current position)

STRATEGY:
1. Analyze the grid layout to understand the environment
2. Plan a path to reach the goal (G)
3. Navigate systematically - turn to face the right direction, then move forward
4. Pick up keys (K) before trying to open doors (D)
5. Use "toggle" to open doors when you have the key
6. Avoid lava (L) at all costs
7. Use "done" when you reach the goal

IMPORTANT: You can only see a limited view around you. Move and explore to discover the full environment. Be systematic in your exploration."""
        return system_prompt

    def format_observation(self, obs: Dict[str, Any]) -> str:
        """Render the known fields of an observation dict as labelled text lines.

        Fields that are absent are skipped; if none are present a fixed
        placeholder message is returned.
        """
        lines = []

        # The grid view takes precedence over a generic observation field.
        if "grid" in obs:
            lines.append(f"Grid view:\n{obs['grid']}")
        elif "observation" in obs:
            lines.append(f"Observation:\n{obs['observation']}")

        if "direction" in obs:
            lines.append(f"Facing: {obs['direction']}")

        # Carrying is only reported when present and truthy.
        if "carrying" in obs and obs["carrying"]:
            lines.append(f"Carrying: {obs['carrying']}")

        # Remaining fields are reported whenever the key exists, in this order.
        labelled_fields = (
            ("step_count", "Steps"),
            ("mission", "Mission"),
            ("terminated", "Terminated"),
            ("success", "Success"),
            ("reward_last", "Last reward"),
        )
        for key, label in labelled_fields:
            if key in obs:
                lines.append(f"{label}: {obs[key]}")

        if not lines:
            return "No formatted observation available"
        return "\n".join(lines)
192
+
193
+
194
+ # --- Episode Runner ---
195
async def run_single_episode(
    client: AsyncClient, agent: MiniGridReActAgent, task_instance, instance_num: int
) -> bool:
    """Run a single MiniGrid episode and return success status.

    The environment is created through ``/env/MiniGrid/initialize``, the agent
    is queried for up to ``agent.max_turns`` actions, and each action is sent
    to ``/env/MiniGrid/step``. Once an environment id has been obtained the
    environment is always released through ``/env/MiniGrid/terminate``, even
    when the episode ends with an exception.

    Args:
        client: HTTP client used for the environment-service calls.
        agent: Agent providing ``decide``, ``get_system_message``,
            ``format_observation``, ``max_turns`` and ``history``.
        task_instance: Task to run; it is serialized into the initialize
            request and its metadata/instructions are printed.
        instance_num: Number used to label this episode in the printed output.

    Returns:
        True when the environment reports termination with a last reward
        above 0.1; False in every other case, including errors.
    """
    env_id = None  # stays None until the environment exists; drives cleanup below
    try:
        # Create environment using the task instance
        create_resp = await client.post(
            "/env/MiniGrid/initialize", json={"task_instance": await task_instance.serialize()}
        )

        if create_resp.status_code != 200:
            print(
                f" Instance {instance_num}: Failed to create environment - {create_resp.status_code}: {create_resp.text}"
            )
            return False

        # Parse the response body once and reuse it.
        create_data = create_resp.json()
        env_id = create_data["env_id"]

        # Get initial observation
        obs = create_data["observation"]
        formatted_obs = agent.format_observation(obs)

        # DEBUG: Print initial state
        print(f"\n Instance {instance_num}: Starting MiniGrid mission")
        print(f" Environment: {task_instance.metadata.env_name}")
        print(f" Mission: {task_instance.impetus.instructions[:100]}...")
        print(" Initial observation:")
        print(f" {formatted_obs}")

        # Run episode
        for turn in range(agent.max_turns):
            # Get agent decision
            action = await agent.decide(formatted_obs, agent.get_system_message(), turn)
            params = action.get("parameters") or {}

            # Check for termination BEFORE reading the "action" parameter:
            # a terminate call is not required to carry one.
            if action["name"] == "terminate":
                print(f" Agent terminated: {params.get('reason', 'no reason given')}")
                break

            action_name = params["action"]

            # DEBUG: Print agent decision
            print(
                f" Turn {turn + 1}: Agent chose '{action_name}' - {params.get('reasoning', 'no reasoning')}"
            )

            # Execute action in environment
            step_resp = await client.post(
                "/env/MiniGrid/step",
                json={
                    "env_id": env_id,
                    "request_id": str(uuid.uuid4()),
                    "action": {
                        "tool_calls": [{"tool": "minigrid_act", "args": {"action": action_name}}]
                    },
                },
            )

            if step_resp.status_code != 200:
                print(f" ❌ Step failed: {step_resp.status_code}: {step_resp.text}")
                break

            obs = step_resp.json()["observation"]
            formatted_obs = agent.format_observation(obs)

            # DEBUG: Print state after action
            print(f" After action: {formatted_obs}")

            # Update history (tolerate a missing or None reasoning value)
            agent.history.append(f"{action_name}: {str(params.get('reasoning') or '')[:50]}")

            # Check if goal is reached
            terminated = obs.get("terminated", False)
            success = obs.get("success", False)
            reward_last = obs.get("reward_last") or 0.0  # treat None like "no reward"

            if not terminated:
                continue

            # Success is judged by the final reward rather than the "success"
            # field: a terminated episode counts only if the reward exceeds 0.1.
            if reward_last > 0.1:
                print(
                    f" ✅ Instance {instance_num}: SUCCESS! Mission completed in {turn + 1} turns (reward: {reward_last:.3f})"
                )
                return True

            print(
                f" ❌ Instance {instance_num}: Terminated without success (success field: {success}, reward: {reward_last:.3f})"
            )
            break
        else:
            # Only reached when the loop ran out of turns without a break.
            print(
                f" ❌ Instance {instance_num}: Failed to complete mission in {agent.max_turns} turns"
            )

        return False

    except Exception as e:
        print(f" Instance {instance_num}: Error - {e}")
        import traceback

        traceback.print_exc()
        return False

    finally:
        # Cleanup: release the environment on every exit path. A failing
        # cleanup call is reported but must not change the episode result.
        if env_id is not None:
            try:
                await client.post("/env/MiniGrid/terminate", json={"env_id": env_id})
            except Exception as cleanup_err:
                print(f" Instance {instance_num}: Cleanup failed - {cleanup_err}")
304
+
305
+
306
+ # --- Batch Evaluation ---
307
async def evaluate_minigrid_batch() -> float:
    """Evaluate the MiniGrid agent on a batch of generated task instances.

    One task instance is requested per seed in ``range(NUM_INSTANCES)`` at the
    module-level ``DIFFICULTY``; seeds whose creation raises are reported and
    skipped. All remaining episodes are then run concurrently against the
    service at ``SERVICE_BASE_URL``.

    Returns:
        The fraction of episodes that succeeded, or ``0.0`` when no task
        instance could be created (instead of dividing by zero).
    """
    print(f"🎯 Evaluating MiniGrid on {NUM_INSTANCES} easy instances...")

    llm = LM(model_name=MODEL_NAME, formatting_model_name=MODEL_NAME, temperature=0.0)

    # Get easy task instances using the taskset system
    from synth_ai.environments.examples.minigrid.taskset import create_minigrid_task_from_seed

    easy_task_instances = []
    for seed in range(NUM_INSTANCES):
        try:
            easy_task_instances.append(await create_minigrid_task_from_seed(DIFFICULTY, seed))
        except Exception as e:
            # Best effort: one bad seed should not abort the whole batch.
            print(f" ⚠️ Failed to get task instance for seed {seed}: {e}")

    print(
        f" 📝 Generated {len(easy_task_instances)} {DIFFICULTY} task instances from seeds 0-{NUM_INSTANCES - 1}"
    )

    # Without any instance there is nothing to run and no rate to compute.
    if not easy_task_instances:
        print(" ❌ No task instances available - skipping evaluation")
        return 0.0

    async with AsyncClient(base_url=SERVICE_BASE_URL, timeout=30.0) as client:
        # Each episode gets its own agent; instance numbers are 1-based.
        episodes = [
            run_single_episode(
                client,
                MiniGridReActAgent(llm, max_turns=MAX_TURNS, verbose=False),
                task_instance,
                instance_num,
            )
            for instance_num, task_instance in enumerate(easy_task_instances, start=1)
        ]
        results = await asyncio.gather(*episodes)

    success_count = sum(results)
    success_rate = success_count / len(easy_task_instances)

    print(
        f" 📊 MiniGrid Results: {success_count}/{len(easy_task_instances)} solved ({success_rate:.1%})"
    )
    return success_rate
343
+
344
+
345
async def main():
    """Print the run configuration, check the service, then run the evaluation.

    The function returns early, after printing a message, when the
    ``/health`` request fails or its response does not list ``MiniGrid``
    among the supported environments. Errors raised during the evaluation
    are printed rather than propagated.
    """
    divider = "=" * 50

    print("🎮 MiniGrid ReAct Agent Evaluation")
    print(f"Model: {MODEL_NAME}")
    print(f"Service: {SERVICE_BASE_URL}")
    print(f"Instances: {NUM_INSTANCES}")
    print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(divider)

    # Test service health
    async with AsyncClient(base_url=SERVICE_BASE_URL, timeout=10.0) as http:
        try:
            response = await http.get("/health")
            supported = response.json().get("supported_environments", [])
        except Exception as exc:
            print(f"❌ Service health check failed: {exc}")
            return

        if "MiniGrid" not in supported:
            print("❌ MiniGrid not available on service")
            return

        print("✅ Service health check passed")

    # (exclusive lower bound, message), checked from best to worst.
    verdicts = (
        (0.5, "🎉 Excellent performance!"),
        (0.3, "✅ Good performance!"),
        (0.1, "⚠️ Moderate performance"),
    )

    # Run evaluation
    try:
        rate = await evaluate_minigrid_batch()

        print("\n" + divider)
        print("🏆 FINAL MINIGRID RESULTS")
        print(divider)
        print(f"Success Rate: {rate:.1%}")

        for floor, message in verdicts:
            if rate > floor:
                print(message)
                break
        else:
            print("❌ Poor performance - needs improvement")

    except Exception as exc:
        print(f"❌ Evaluation failed: {exc}")
390
+
391
+
392
# Script entry point: run the async ``main`` coroutine only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())