synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (293) hide show
  1. examples/README.md +1 -0
  2. examples/multi_step/SFT_README.md +147 -0
  3. examples/multi_step/configs/README_verilog_rl.md +77 -0
  4. examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
  5. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
  6. examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
  7. examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
  8. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -11
  9. examples/multi_step/configs/crafter_sft_qwen30b_lora.toml +62 -0
  10. examples/multi_step/configs/crafter_synth_backend.md +40 -0
  11. examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
  12. examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
  13. examples/multi_step/configs/verilog_rl_lora.toml +190 -0
  14. examples/multi_step/convert_traces_to_sft.py +84 -0
  15. examples/multi_step/judges/crafter_backend_judge.py +220 -0
  16. examples/multi_step/judges/verilog_backend_judge.py +234 -0
  17. examples/multi_step/readme.md +48 -0
  18. examples/multi_step/run_sft_qwen30b.sh +45 -0
  19. examples/multi_step/verilog_rl_lora.md +218 -0
  20. examples/qwen_coder/configs/coder_lora_30b.toml +3 -2
  21. examples/qwen_coder/configs/coder_lora_4b.toml +2 -1
  22. examples/qwen_coder/configs/coder_lora_small.toml +2 -1
  23. examples/qwen_vl/BUGS_AND_FIXES.md +232 -0
  24. examples/qwen_vl/IMAGE_VALIDATION_COMPLETE.md +271 -0
  25. examples/qwen_vl/IMAGE_VALIDATION_SUMMARY.md +260 -0
  26. examples/qwen_vl/INFERENCE_SFT_TESTS.md +412 -0
  27. examples/qwen_vl/NEXT_STEPS_2B.md +325 -0
  28. examples/qwen_vl/QUICKSTART.md +327 -0
  29. examples/qwen_vl/QUICKSTART_RL_VISION.md +110 -0
  30. examples/qwen_vl/README.md +154 -0
  31. examples/qwen_vl/RL_VISION_COMPLETE.md +475 -0
  32. examples/qwen_vl/RL_VISION_TESTING.md +333 -0
  33. examples/qwen_vl/SDK_VISION_INTEGRATION.md +328 -0
  34. examples/qwen_vl/SETUP_COMPLETE.md +275 -0
  35. examples/qwen_vl/VISION_TESTS_COMPLETE.md +490 -0
  36. examples/qwen_vl/VLM_PIPELINE_COMPLETE.md +242 -0
  37. examples/qwen_vl/__init__.py +2 -0
  38. examples/qwen_vl/collect_data_via_cli.md +423 -0
  39. examples/qwen_vl/collect_vision_traces.py +368 -0
  40. examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +127 -0
  41. examples/qwen_vl/configs/crafter_vlm_sft_example.toml +60 -0
  42. examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +43 -0
  43. examples/qwen_vl/configs/eval_gpt4o_vision_proper.toml +29 -0
  44. examples/qwen_vl/configs/eval_gpt5nano_vision.toml +45 -0
  45. examples/qwen_vl/configs/eval_qwen2vl_vision.toml +44 -0
  46. examples/qwen_vl/configs/filter_qwen2vl_sft.toml +50 -0
  47. examples/qwen_vl/configs/filter_vision_sft.toml +53 -0
  48. examples/qwen_vl/configs/filter_vision_test.toml +8 -0
  49. examples/qwen_vl/configs/sft_qwen3_vl_2b_test.toml +54 -0
  50. examples/qwen_vl/crafter_gpt5nano_agent.py +308 -0
  51. examples/qwen_vl/crafter_qwen_vl_agent.py +300 -0
  52. examples/qwen_vl/run_vision_comparison.sh +62 -0
  53. examples/qwen_vl/run_vision_sft_pipeline.sh +175 -0
  54. examples/qwen_vl/test_image_validation.py +201 -0
  55. examples/qwen_vl/test_sft_vision_data.py +110 -0
  56. examples/rl/README.md +1 -1
  57. examples/rl/configs/eval_base_qwen.toml +17 -0
  58. examples/rl/configs/eval_rl_qwen.toml +13 -0
  59. examples/rl/configs/rl_from_base_qwen.toml +37 -0
  60. examples/rl/configs/rl_from_base_qwen17.toml +76 -0
  61. examples/rl/configs/rl_from_ft_qwen.toml +37 -0
  62. examples/rl/run_eval.py +436 -0
  63. examples/rl/run_rl_and_save.py +111 -0
  64. examples/rl/task_app/README.md +22 -0
  65. examples/rl/task_app/math_single_step.py +990 -0
  66. examples/rl/task_app/math_task_app.py +111 -0
  67. examples/sft/README.md +5 -5
  68. examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -2
  69. examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -3
  70. examples/sft/evaluate.py +4 -4
  71. examples/sft/export_dataset.py +7 -4
  72. examples/sft/generate_traces.py +2 -0
  73. examples/swe/task_app/README.md +1 -1
  74. examples/swe/task_app/grpo_swe_mini.py +1 -1
  75. examples/swe/task_app/grpo_swe_mini_task_app.py +0 -12
  76. examples/swe/task_app/hosted/envs/mini_swe/environment.py +13 -13
  77. examples/swe/task_app/hosted/policy_routes.py +0 -2
  78. examples/swe/task_app/hosted/rollout.py +2 -8
  79. examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
  80. examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
  81. examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
  82. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
  83. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
  84. examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
  85. examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
  86. examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
  87. examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
  88. examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
  89. examples/task_apps/crafter/task_app/__init__.py +3 -0
  90. examples/task_apps/crafter/task_app/grpo_crafter.py +309 -14
  91. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
  92. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +75 -4
  93. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
  94. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +55 -3
  95. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +114 -32
  96. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +127 -27
  97. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +156 -0
  98. examples/task_apps/enron/__init__.py +1 -0
  99. examples/task_apps/enron/filter_sft.toml +5 -0
  100. examples/task_apps/enron/tests/__init__.py +2 -0
  101. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  102. examples/task_apps/enron/tests/integration/test_enron_eval.py +2 -0
  103. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  104. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
  105. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
  106. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
  107. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
  108. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +2 -0
  109. examples/task_apps/pokemon_red/task_app.py +199 -6
  110. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +2 -0
  111. examples/task_apps/sokoban/filter_sft.toml +5 -0
  112. examples/task_apps/sokoban/tests/__init__.py +2 -0
  113. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  114. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  115. examples/task_apps/verilog/eval_groq_qwen32b.toml +8 -4
  116. examples/task_apps/verilog/filter_sft.toml +5 -0
  117. examples/task_apps/verilog/task_app/grpo_verilog.py +258 -23
  118. examples/task_apps/verilog/tests/__init__.py +2 -0
  119. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  120. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +2 -0
  121. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  122. examples/vlm/README.md +3 -3
  123. examples/vlm/configs/crafter_vlm_gpt4o.toml +2 -0
  124. examples/vlm/crafter_openai_vlm_agent.py +3 -5
  125. examples/vlm/filter_image_rows.py +1 -1
  126. examples/vlm/run_crafter_vlm_benchmark.py +2 -2
  127. examples/warming_up_to_rl/_utils.py +92 -0
  128. examples/warming_up_to_rl/analyze_trace_db.py +1 -1
  129. examples/warming_up_to_rl/configs/crafter_fft.toml +2 -0
  130. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +2 -0
  131. examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +2 -0
  132. examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +2 -0
  133. examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +2 -1
  134. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +2 -1
  135. examples/warming_up_to_rl/configs/rl_from_ft.toml +2 -0
  136. examples/warming_up_to_rl/export_trace_sft.py +174 -60
  137. examples/warming_up_to_rl/groq_test.py +2 -0
  138. examples/warming_up_to_rl/readme.md +63 -132
  139. examples/warming_up_to_rl/run_fft_and_save.py +1 -1
  140. examples/warming_up_to_rl/run_local_rollout.py +2 -0
  141. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
  142. examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
  143. examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
  144. examples/warming_up_to_rl/run_rl_and_save.py +1 -1
  145. examples/warming_up_to_rl/run_rollout_remote.py +2 -0
  146. examples/warming_up_to_rl/task_app/README.md +42 -0
  147. examples/warming_up_to_rl/task_app/grpo_crafter.py +696 -0
  148. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +135 -0
  149. examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
  150. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
  151. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +143 -0
  152. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1226 -0
  153. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
  154. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
  155. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
  156. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +522 -0
  157. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +478 -0
  158. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +108 -0
  159. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
  160. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
  161. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +204 -0
  162. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
  163. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +618 -0
  164. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +100 -0
  165. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +1081 -0
  166. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +195 -0
  167. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1861 -0
  168. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
  169. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +211 -0
  170. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +161 -0
  171. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +137 -0
  172. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +62 -0
  173. synth_ai/__init__.py +44 -30
  174. synth_ai/_utils/__init__.py +47 -0
  175. synth_ai/_utils/base_url.py +10 -0
  176. synth_ai/_utils/http.py +10 -0
  177. synth_ai/_utils/prompts.py +10 -0
  178. synth_ai/_utils/task_app_state.py +12 -0
  179. synth_ai/_utils/user_config.py +10 -0
  180. synth_ai/api/models/supported.py +145 -7
  181. synth_ai/api/train/__init__.py +13 -1
  182. synth_ai/api/train/cli.py +30 -7
  183. synth_ai/api/train/config_finder.py +18 -11
  184. synth_ai/api/train/env_resolver.py +13 -10
  185. synth_ai/cli/__init__.py +66 -49
  186. synth_ai/cli/_modal_wrapper.py +9 -6
  187. synth_ai/cli/_typer_patch.py +0 -2
  188. synth_ai/cli/_validate_task_app.py +22 -4
  189. synth_ai/cli/legacy_root_backup.py +3 -1
  190. synth_ai/cli/lib/__init__.py +10 -0
  191. synth_ai/cli/lib/task_app_discovery.py +7 -0
  192. synth_ai/cli/lib/task_app_env.py +518 -0
  193. synth_ai/cli/recent.py +1 -0
  194. synth_ai/cli/setup.py +266 -0
  195. synth_ai/cli/task_app_deploy.py +16 -0
  196. synth_ai/cli/task_app_list.py +25 -0
  197. synth_ai/cli/task_app_modal_serve.py +16 -0
  198. synth_ai/cli/task_app_serve.py +18 -0
  199. synth_ai/cli/task_apps.py +392 -141
  200. synth_ai/cli/train.py +18 -0
  201. synth_ai/cli/tui.py +62 -0
  202. synth_ai/demos/__init__.py +10 -0
  203. synth_ai/demos/core/__init__.py +28 -1
  204. synth_ai/demos/crafter/__init__.py +1 -0
  205. synth_ai/demos/crafter/crafter_fft_4b.toml +55 -0
  206. synth_ai/demos/crafter/grpo_crafter_task_app.py +185 -0
  207. synth_ai/demos/crafter/rl_from_base_qwen4b.toml +74 -0
  208. synth_ai/demos/demo_registry.py +176 -0
  209. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
  210. synth_ai/demos/math/__init__.py +1 -0
  211. synth_ai/demos/math/_common.py +16 -0
  212. synth_ai/demos/math/app.py +38 -0
  213. synth_ai/demos/math/config.toml +76 -0
  214. synth_ai/demos/math/deploy_modal.py +54 -0
  215. synth_ai/demos/math/modal_task_app.py +702 -0
  216. synth_ai/demos/math/task_app_entry.py +51 -0
  217. synth_ai/environments/environment/core.py +7 -1
  218. synth_ai/environments/examples/bandit/engine.py +0 -1
  219. synth_ai/environments/examples/bandit/environment.py +0 -1
  220. synth_ai/environments/examples/crafter_classic/environment.py +1 -1
  221. synth_ai/environments/examples/verilog/engine.py +76 -10
  222. synth_ai/environments/examples/wordle/environment.py +0 -1
  223. synth_ai/evals/base.py +16 -5
  224. synth_ai/evals/client.py +1 -1
  225. synth_ai/inference/client.py +1 -1
  226. synth_ai/learning/client.py +1 -1
  227. synth_ai/learning/health.py +1 -1
  228. synth_ai/learning/jobs.py +1 -1
  229. synth_ai/learning/rl/client.py +1 -1
  230. synth_ai/learning/rl/env_keys.py +1 -1
  231. synth_ai/learning/rl/secrets.py +1 -1
  232. synth_ai/learning/sft/client.py +1 -1
  233. synth_ai/learning/sft/data.py +407 -4
  234. synth_ai/learning/validators.py +4 -1
  235. synth_ai/task/__init__.py +11 -1
  236. synth_ai/task/apps/__init__.py +5 -2
  237. synth_ai/task/config.py +259 -0
  238. synth_ai/task/contracts.py +15 -2
  239. synth_ai/task/rubrics/__init__.py +4 -2
  240. synth_ai/task/rubrics/loaders.py +27 -4
  241. synth_ai/task/rubrics/scoring.py +3 -0
  242. synth_ai/task/rubrics.py +219 -0
  243. synth_ai/task/trace_correlation_helpers.py +328 -0
  244. synth_ai/task/tracing_utils.py +14 -3
  245. synth_ai/task/validators.py +145 -2
  246. synth_ai/tracing_v3/config.py +15 -13
  247. synth_ai/tracing_v3/constants.py +21 -0
  248. synth_ai/tracing_v3/db_config.py +3 -1
  249. synth_ai/tracing_v3/decorators.py +10 -7
  250. synth_ai/tracing_v3/session_tracer.py +10 -0
  251. synth_ai/tracing_v3/turso/daemon.py +2 -2
  252. synth_ai/tracing_v3/turso/native_manager.py +108 -77
  253. synth_ai/tracing_v3/utils.py +1 -1
  254. synth_ai/tui/__init__.py +5 -0
  255. synth_ai/tui/__main__.py +13 -0
  256. synth_ai/tui/cli/__init__.py +1 -0
  257. synth_ai/tui/cli/query_experiments.py +164 -0
  258. synth_ai/tui/cli/query_experiments_v3.py +164 -0
  259. synth_ai/tui/dashboard.py +911 -0
  260. synth_ai/utils/__init__.py +101 -0
  261. synth_ai/utils/base_url.py +94 -0
  262. synth_ai/utils/cli.py +131 -0
  263. synth_ai/utils/env.py +287 -0
  264. synth_ai/utils/http.py +169 -0
  265. synth_ai/utils/modal.py +308 -0
  266. synth_ai/utils/process.py +212 -0
  267. synth_ai/utils/prompts.py +39 -0
  268. synth_ai/utils/sqld.py +122 -0
  269. synth_ai/utils/task_app_discovery.py +882 -0
  270. synth_ai/utils/task_app_env.py +186 -0
  271. synth_ai/utils/task_app_state.py +318 -0
  272. synth_ai/utils/user_config.py +137 -0
  273. synth_ai/v0/config/__init__.py +1 -5
  274. synth_ai/v0/config/base_url.py +1 -7
  275. synth_ai/v0/tracing/config.py +1 -1
  276. synth_ai/v0/tracing/decorators.py +1 -1
  277. synth_ai/v0/tracing/upload.py +1 -1
  278. synth_ai/v0/tracing_v1/config.py +1 -1
  279. synth_ai/v0/tracing_v1/decorators.py +1 -1
  280. synth_ai/v0/tracing_v1/upload.py +1 -1
  281. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/METADATA +85 -31
  282. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/RECORD +286 -135
  283. synth_ai/cli/man.py +0 -106
  284. synth_ai/compound/cais.py +0 -0
  285. synth_ai/core/experiment.py +0 -13
  286. synth_ai/core/system.py +0 -15
  287. synth_ai/demo_registry.py +0 -295
  288. synth_ai/handshake.py +0 -109
  289. synth_ai/http.py +0 -26
  290. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/WHEEL +0 -0
  291. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/entry_points.txt +0 -0
  292. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/licenses/LICENSE +0 -0
  293. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,300 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Crafter agent using Qwen-VL models via synth-ai's hosted inference.
4
+
5
+ This demonstrates vision-language models (Qwen2-VL, Qwen3-VL) playing Crafter
6
+ with image observations. The CrafterPolicy automatically detects vision capability
7
+ from the model name and includes base64-encoded PNG frames in the prompt.
8
+
9
+ Requirements:
10
+ - `SYNTH_API_KEY` environment variable (for synth-ai hosted inference)
11
+ - synth-ai package with Crafter task app dependencies
12
+
13
+ Usage:
14
+ uv run python examples/qwen_vl/crafter_qwen_vl_agent.py \
15
+ --model Qwen/Qwen2-VL-7B-Instruct --seeds 10 --steps 20
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import asyncio
22
+ import base64
23
+ import json
24
+ import os
25
+ from contextlib import suppress
26
+ from pathlib import Path
27
+ from typing import Any
28
+ from uuid import uuid4
29
+
30
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.envs.crafter.environment import (
31
+ CrafterEnvironmentWrapper,
32
+ )
33
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.envs.crafter.policy import CrafterPolicy
34
+ from synth_ai.environments.examples.crafter_classic.environment import CrafterClassicEnvironment
35
+ from synth_ai.environments.examples.crafter_classic.taskset import (
36
+ CrafterTaskInstance,
37
+ CrafterTaskInstanceMetadata,
38
+ )
39
+ from synth_ai.environments.tasks.core import Impetus, Intent
40
+
41
+ # Import synth-ai inference client
42
+ try:
43
+ from synth_ai.inference.client import InferenceClient
44
+ except ImportError:
45
+ print("Error: synth-ai inference client not found. Make sure synth-ai is installed.")
46
+ raise
47
+
48
+ DEFAULT_OUTPUT = Path("examples/qwen_vl/temp")
49
+ FRAME_SUBDIR = "qwen_vl_frames"
50
+
51
+
52
+ def _default_backend_base_url() -> str:
53
+ raw = os.getenv("BACKEND_BASE_URL", "https://agent-learning.onrender.com/api").strip()
54
+ return raw if raw.endswith("/api") else f"{raw}/api"
55
+
56
+
57
class EpisodeResult:
    """Running totals gathered while playing one Crafter episode."""

    def __init__(self, seed: int) -> None:
        """Start every counter at zero for the episode identified by *seed*."""
        self.seed = seed
        self.steps_taken: int = 0
        self.achievements: set[str] = set()
        self.total_reward: float = 0.0
        self.tool_calls: int = 0

    def record_observation(self, observation: dict[str, Any]) -> None:
        """Fold one environment packet into the running totals.

        Packets that are not dicts, or whose ``"observation"`` entry is not a
        dict, are ignored. Truthy entries of ``achievements_status`` are added
        to ``achievements`` (names are stringified), and a numeric
        ``reward_last_step`` is added to ``total_reward``.
        """
        if not isinstance(observation, dict):
            return
        inner = observation.get("observation")
        if not isinstance(inner, dict):
            return

        status = inner.get("achievements_status")
        if isinstance(status, dict):
            self.achievements.update(str(key) for key, flag in status.items() if flag)

        last_reward = inner.get("reward_last_step")
        if isinstance(last_reward, (int, float)):
            self.total_reward += float(last_reward)
77
+
78
+
79
def _ensure_synth_client() -> InferenceClient:
    """Build the synth-ai inference client from environment settings.

    The API key is read from ``SYNTH_API_KEY``. The base URL is read from
    ``SYNTH_BASE_URL`` and falls back to ``_default_backend_base_url()`` when
    that variable is unset.

    Raises:
        RuntimeError: If ``SYNTH_API_KEY`` is unset or empty.
    """
    key = os.getenv("SYNTH_API_KEY")
    if key:
        endpoint = os.getenv("SYNTH_BASE_URL", _default_backend_base_url())
        return InferenceClient(base_url=endpoint, api_key=key)
    raise RuntimeError(
        "SYNTH_API_KEY must be set for synth-ai hosted inference. "
        "Get your key from https://synth-ai.com"
    )
89
+
90
+
91
def _build_task_instance(seed: int) -> CrafterTaskInstance:
    """Assemble the Crafter task instance used for the episode seeded by *seed*.

    The instance carries a fixed instruction string and rubric, metadata with
    ``difficulty="custom"`` and zeroed radius counts, and a fresh UUID. A
    ``config`` attribute holding the seed, episode length and world area is
    attached after construction.
    """
    task = CrafterTaskInstance(
        id=uuid4(),
        impetus=Impetus(instructions="Explore, survive, and unlock achievements."),
        intent=Intent(
            rubric={"goal": "Maximise Crafter achievements."},
            gold_trajectories=None,
            gold_state_diff={},
        ),
        metadata=CrafterTaskInstanceMetadata(
            difficulty="custom",
            seed=seed,
            num_trees_radius=0,
            num_cows_radius=0,
            num_hostiles_radius=0,
        ),
        is_reproducible=True,
        initial_engine_snapshot=None,
    )
    # ``config`` is set dynamically rather than passed to the constructor.
    world_config = {"seed": seed, "length": 256, "area": [64, 64]}
    setattr(task, "config", world_config)
    return task
116
+
117
+
118
+ def _decode_and_save_image(observation: dict[str, Any], path: Path) -> None:
119
+ """Extract and save PNG frame from observation."""
120
+ obs = observation.get("observation") if isinstance(observation, dict) else None
121
+ if not isinstance(obs, dict):
122
+ return
123
+ base64_data = obs.get("observation_image_base64")
124
+ if not isinstance(base64_data, str) or not base64_data:
125
+ return
126
+ path.parent.mkdir(parents=True, exist_ok=True)
127
+ with suppress(Exception):
128
+ path.write_bytes(base64.b64decode(base64_data))
129
+
130
+
131
async def _run_episode(
    *,
    seed: int,
    client: InferenceClient,
    model: str,
    max_steps: int,
    output_dir: Path,
    temperature: float,
) -> EpisodeResult:
    """Run a single Crafter episode with Qwen-VL.

    Each step formats the latest observation, asks the policy for the
    inference request it prepared, sends that request through *client*,
    applies the parsed tool calls to the environment, and saves the resulting
    frame under ``output_dir / FRAME_SUBDIR / seed_XXXX``.

    The loop stops after *max_steps* iterations, or earlier when the
    observation payload is not a dict, the policy returns no
    ``inference_request``, the model returns no tool calls, or the
    environment reports ``done``. The environment wrapper is terminated even
    when a step raises.

    Args:
        seed: Seed for the task instance and environment wrapper.
        client: Inference client used for chat completions.
        model: Model name passed to both the policy and the client.
        max_steps: Upper bound on environment steps.
        output_dir: Root directory for saved frames.
        temperature: Sampling temperature for the policy and the client.

    Returns:
        The accumulated ``EpisodeResult`` for this seed.

    Raises:
        RuntimeError: If the environment step returns something other than a dict.
    """
    max_tokens = 512  # shared by the policy config and the completion call

    task_instance = _build_task_instance(seed)
    env = CrafterClassicEnvironment(task_instance)
    wrapper = CrafterEnvironmentWrapper(env, seed=seed)

    # Policy will auto-detect vision from model name (qwen-vl, qwen2-vl, qwen3-vl)
    policy = CrafterPolicy(inference_url="synth://inference", model=model)
    await policy.initialize(
        {
            "use_tools": True,
            "model": model,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
    )

    episode_result = EpisodeResult(seed=seed)

    observation_packet = await wrapper.initialize()
    # From here on the wrapper is live, so make sure it is always terminated,
    # including when inference or an environment step raises.
    try:
        episode_result.record_observation(observation_packet)

        frames_root = output_dir / FRAME_SUBDIR / f"seed_{seed:04d}"
        _decode_and_save_image(observation_packet, frames_root / "step_000.png")

        for step_idx in range(max_steps):
            obs_dict = observation_packet.get("observation")
            if not isinstance(obs_dict, dict):
                break

            # Format observation text
            obs_text = policy._format_observation_for_llm(observation_packet)  # noqa: SLF001

            # The policy prepares the inference request internally; only the
            # metadata is needed here, the tool calls come from the model below.
            _, meta = await policy.step(
                observation_text=obs_text,
                metadata={"raw_observation": observation_packet},
            )
            if "inference_request" not in meta:
                break

            episode_result.steps_taken += 1
            inference_request = meta["inference_request"]

            # Call synth-ai hosted inference
            response = await client.create_chat_completion(
                model=model,
                messages=inference_request["messages"],
                temperature=temperature,
                max_tokens=max_tokens,
                tools=inference_request.get("tools"),
            )

            # Parse tool calls from response
            assistant_tool_calls = CrafterPolicy.parse_response_to_tool_calls(
                response,
                use_tools=policy.use_tools,
            )
            if not assistant_tool_calls:
                print(
                    f"Seed {seed}: no tool calls returned by model; ending episode early at step {step_idx}."
                )
                break

            episode_result.tool_calls += len(assistant_tool_calls)

            # Extract assistant message; tolerate an empty/null "choices" list
            # or a null "message" instead of raising IndexError/TypeError.
            choices = response.get("choices") or [{}]
            assistant_message = choices[0].get("message") or {}
            assistant_text = assistant_message.get("content")

            # Execute action in environment
            env_response = await wrapper.step(assistant_tool_calls)
            if not isinstance(env_response, dict):
                raise RuntimeError(
                    f"Unexpected environment response type: {type(env_response)!r}"
                )
            episode_result.record_observation(env_response)

            # Update policy history
            policy._append_assistant_turn(  # noqa: SLF001
                assistant_text,
                assistant_tool_calls,
                env_response,
            )

            # Save frame (step_000 is the initial observation, so offset by one)
            frame_path = frames_root / f"step_{step_idx + 1:03d}.png"
            _decode_and_save_image(env_response, frame_path)

            if env_response.get("done"):
                break
            observation_packet = env_response
    finally:
        await wrapper.terminate()

    return episode_result
232
+
233
+
234
async def main() -> None:
    """Parse CLI options, play one episode per seed, and write a JSON summary.

    Seeds ``0 .. --seeds - 1`` are run one after another. A per-seed line is
    printed after each episode, and the aggregate summary is both written to
    ``<output-dir>/qwen_vl_summary.json`` and printed.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--model",
        default="Qwen/Qwen2-VL-7B-Instruct",
        help="Qwen-VL model name (e.g., Qwen/Qwen2-VL-7B-Instruct, Qwen/Qwen3-VL-8B)",
    )
    cli.add_argument("--seeds", type=int, default=10, help="Number of random seeds to evaluate")
    cli.add_argument("--steps", type=int, default=20, help="Max steps per seed")
    cli.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature")
    cli.add_argument(
        "--output-dir",
        type=Path,
        default=DEFAULT_OUTPUT,
        help=f"Directory for saved frames and summaries (default: {DEFAULT_OUTPUT})",
    )
    args = cli.parse_args()

    client = _ensure_synth_client()
    seeds = list(range(args.seeds))

    print(f"Running {len(seeds)} Crafter episodes with model={args.model}")
    print("Using synth-ai hosted inference\n")

    results: list[EpisodeResult] = []
    for seed in seeds:
        outcome = await _run_episode(
            seed=seed,
            client=client,
            model=args.model,
            max_steps=args.steps,
            output_dir=args.output_dir,
            temperature=args.temperature,
        )
        results.append(outcome)
        print(
            f"Seed {seed:02d}: steps={outcome.steps_taken}, "
            f"achievements={len(outcome.achievements)}, "
            f"tool_calls={outcome.tool_calls}, reward≈{outcome.total_reward:.3f}"
        )

    # Guard the averages against a run with zero episodes.
    episode_count = len(results)
    divisor = max(episode_count, 1)
    step_total = sum(item.steps_taken for item in results)
    achievement_total = sum(len(item.achievements) for item in results)

    summary = {
        "model": args.model,
        "provider": "synth-ai",
        "episodes": episode_count,
        "mean_steps": round(step_total / divisor, 2),
        "mean_achievements": round(achievement_total / divisor, 2),
        "total_tool_calls": sum(item.tool_calls for item in results),
        "output_dir": str(args.output_dir / FRAME_SUBDIR),
    }
    rendered = json.dumps(summary, indent=2)

    args.output_dir.mkdir(parents=True, exist_ok=True)
    (args.output_dir / "qwen_vl_summary.json").write_text(rendered, encoding="utf-8")

    print("\nSummary")
    print("-------")
    print(rendered)
    print(f"\nFrames saved in: {summary['output_dir']}")
298
+
299
+ if __name__ == "__main__":
300
+ asyncio.run(main())
@@ -0,0 +1,62 @@
1
#!/bin/bash
# Compare Qwen-VL (via synth) vs gpt-5-nano (via OpenAI) on Crafter
#
# Runs both agents with the same number of seeds and steps, then prints the
# JSON summary each agent wrote under $OUTPUT_DIR.
# Requires OPENAI_API_KEY and SYNTH_API_KEY in the environment.

set -euo pipefail

SEEDS=10
STEPS=20
OUTPUT_DIR="examples/qwen_vl/temp/comparison"

echo "======================================"
echo "Vision Model Comparison on Crafter"
echo "======================================"
echo ""
echo "Running $SEEDS episodes, $STEPS steps each"
echo ""

# Check API keys (":-" keeps the check working under `set -u` when unset)
if [ -z "${OPENAI_API_KEY:-}" ]; then
    echo "Error: OPENAI_API_KEY not set" >&2
    exit 1
fi

if [ -z "${SYNTH_API_KEY:-}" ]; then
    echo "Error: SYNTH_API_KEY not set" >&2
    exit 1
fi

# Run gpt-5-nano
echo "======================================"
echo "1. Running gpt-5-nano (OpenAI)"
echo "======================================"
uv run python examples/qwen_vl/crafter_gpt5nano_agent.py \
    --model gpt-5-nano \
    --seeds "$SEEDS" \
    --steps "$STEPS" \
    --output-dir "$OUTPUT_DIR/gpt5nano"

echo ""
echo "======================================"
echo "2. Running Qwen2-VL-7B (synth-ai)"
echo "======================================"
uv run python examples/qwen_vl/crafter_qwen_vl_agent.py \
    --model Qwen/Qwen2-VL-7B-Instruct \
    --seeds "$SEEDS" \
    --steps "$STEPS" \
    --output-dir "$OUTPUT_DIR/qwen2vl"

echo ""
echo "======================================"
echo "Results Summary"
echo "======================================"
echo ""
# json.tool reads the file directly; python3 matches run_vision_sft_pipeline.sh
# and does not depend on a bare `python` being on PATH.
echo "gpt-5-nano (OpenAI):"
python3 -m json.tool "$OUTPUT_DIR/gpt5nano/gpt5nano_summary.json"
echo ""
echo "Qwen2-VL-7B (synth-ai):"
python3 -m json.tool "$OUTPUT_DIR/qwen2vl/qwen_vl_summary.json"
echo ""
echo "Frames saved in:"
echo " - $OUTPUT_DIR/gpt5nano/gpt5nano_frames/"
echo " - $OUTPUT_DIR/qwen2vl/qwen_vl_frames/"
62
+
@@ -0,0 +1,175 @@
1
#!/bin/bash
# Complete pipeline: Collect vision traces → Filter → Train SFT
# Uses synth-ai CLI tools for data collection and processing
#
# Environment overrides:
#   SYNTH_DIR         synth-ai repository root (default: two levels above this script)
#   MONOREPO_DIR      monorepo checkout used by the optional training step
#   BACKEND_BASE_URL  backend API URL (a default is exported when unset)

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Allow callers to override root paths, otherwise derive them relative to this script.
SYNTH_DIR="${SYNTH_DIR:-$(cd "$SCRIPT_DIR/../.." && pwd)}"

# Validate SYNTH_DIR before it is used in a `cd` below: under `set -e` a failed
# cd inside the command substitution would abort before this message is shown.
if [ ! -d "$SYNTH_DIR" ]; then
    echo "Error: synth-ai repository not found at: $SYNTH_DIR" >&2
    exit 1
fi

DEFAULT_MONOREPO_DIR="$(cd "$SYNTH_DIR/.." && pwd)/monorepo"
MONOREPO_DIR="${MONOREPO_DIR:-$DEFAULT_MONOREPO_DIR}"

if [ ! -d "$MONOREPO_DIR" ]; then
    echo "Warning: MONOREPO_DIR not found at: $MONOREPO_DIR" >&2
    echo " Set MONOREPO_DIR to a valid path if you plan to run the optional training step." >&2
fi

# Configuration
MODEL="gpt-5-nano"
PROVIDER="openai"
NUM_EPISODES=100
OUTPUT_DIR="traces/gpt5nano_vision"
CONFIG_DIR="examples/qwen_vl/configs"

echo "======================================"
echo "Vision SFT Pipeline for Crafter"
echo "======================================"
echo ""
echo "Model: $MODEL"
echo "Provider: $PROVIDER"
echo "Episodes: $NUM_EPISODES"
echo "Output: $OUTPUT_DIR"
echo ""

# Check API keys
if [ "$PROVIDER" = "openai" ]; then
    if [ -z "$OPENAI_API_KEY" ]; then
        echo "Error: OPENAI_API_KEY not set" >&2
        exit 1
    fi
    echo "✓ OpenAI API key found"
elif [ "$PROVIDER" = "synth" ]; then
    if [ -z "$SYNTH_API_KEY" ]; then
        echo "Error: SYNTH_API_KEY not set" >&2
        exit 1
    fi
    echo "✓ Synth API key found"
fi

if [ -z "$BACKEND_BASE_URL" ]; then
    echo "Warning: BACKEND_BASE_URL not set, using default" >&2
    export BACKEND_BASE_URL="https://synth-backend-dev-docker.onrender.com/api"
fi

echo ""

# Step 1: Collect traces
echo "======================================"
echo "STEP 1: Collect Vision Traces"
echo "======================================"
echo ""
echo "Running $NUM_EPISODES episodes with $MODEL..."
echo "This will take ~30-60 minutes"
echo ""

cd "$SYNTH_DIR"

# Pick the config up front instead of using `eval ... || eval ...`: the old
# fallback re-ran the whole collection whenever the first run failed for ANY
# reason, not only when the provider/model-specific config was missing.
# `//` replaces every "/" in the model name, not just the first one.
EVAL_CONFIG="$CONFIG_DIR/eval_${PROVIDER}_${MODEL//\//_}_vision.toml"
if [ ! -f "$EVAL_CONFIG" ]; then
    # Fallback to gpt5nano config if custom config not found
    EVAL_CONFIG="$CONFIG_DIR/eval_gpt5nano_vision.toml"
fi
echo "Using eval config: $EVAL_CONFIG"

uvx synth-ai eval \
    --config "$EVAL_CONFIG" \
    --output-dir "$OUTPUT_DIR"

echo ""
echo "✅ Trace collection complete!"
echo ""

# Step 2: Filter and export to SFT format
echo "======================================"
echo "STEP 2: Filter & Export to SFT JSONL"
echo "======================================"
echo ""

uvx synth-ai filter \
    --config "$CONFIG_DIR/filter_vision_sft.toml" \
    --input-db "$OUTPUT_DIR/rollouts.db" \
    --output-dir "$OUTPUT_DIR/sft"

echo ""
echo "✅ Filtering complete!"
echo ""

# Show dataset stats
echo "======================================"
echo "Dataset Statistics"
echo "======================================"
echo ""

if [ -f "$OUTPUT_DIR/sft/filter_stats.json" ]; then
    python3 -m json.tool "$OUTPUT_DIR/sft/filter_stats.json"
else
    echo "Train samples: $(wc -l < "$OUTPUT_DIR/sft/train.jsonl")"
    echo "Val samples: $(wc -l < "$OUTPUT_DIR/sft/val.jsonl")"
fi

echo ""

# Step 3: Train SFT (optional - user can run this separately)
echo "======================================"
echo "STEP 3: Train Vision SFT (Optional)"
echo "======================================"
echo ""
echo "To train the model, run:"
echo ""
echo " cd $MONOREPO_DIR"
echo " uvx synth-ai train \\"
echo " --type sft \\"
echo " --config configs/vision_sft/crafter_qwen3vl_8b_gpt5nano.toml \\"
echo " --dataset $SYNTH_DIR/$OUTPUT_DIR/sft/train.jsonl \\"
echo " --eval-dataset $SYNTH_DIR/$OUTPUT_DIR/sft/val.jsonl \\"
echo " --env-file backend/.env.dev"
echo ""

# `read` returns non-zero at EOF (e.g. stdin is not a terminal); treat that as
# "no" rather than letting `set -e` abort the script at the prompt.
read -p "Run training now? (y/N) " -n 1 -r || REPLY=""
echo
if [[ $REPLY =~ ^[Yy]$ ]]; then
    echo ""
    echo "Starting SFT training..."
    echo ""

    if [ ! -d "$MONOREPO_DIR" ]; then
        echo "Error: MONOREPO_DIR not found. Set MONOREPO_DIR to your monorepo path before running training." >&2
        exit 1
    fi

    cd "$MONOREPO_DIR"

    uvx synth-ai train \
        --type sft \
        --config configs/vision_sft/crafter_qwen3vl_8b_gpt5nano.toml \
        --dataset "$SYNTH_DIR/$OUTPUT_DIR/sft/train.jsonl" \
        --eval-dataset "$SYNTH_DIR/$OUTPUT_DIR/sft/val.jsonl" \
        --env-file backend/.env.dev

    echo ""
    echo "✅ Training complete!"
else
    echo ""
    echo "Skipping training. You can run it later using the command above."
fi

echo ""
echo "======================================"
echo "Pipeline Complete!"
echo "======================================"
echo ""
echo "📂 Outputs:"
echo " - Raw traces: $OUTPUT_DIR/rollouts.db"
echo " - SFT train: $OUTPUT_DIR/sft/train.jsonl"
echo " - SFT val: $OUTPUT_DIR/sft/val.jsonl"
echo " - Stats: $OUTPUT_DIR/sft/filter_stats.json"
echo ""
echo "🚀 Next steps:"
echo " 1. Train SFT model (see command above)"
echo " 2. Evaluate trained model"
echo " 3. Fine-tune with RL"
echo ""