synth-ai 0.2.9.dev0__py3-none-any.whl → 0.2.23.dev3__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (890)
  1. examples/README.md +1 -0
  2. examples/__init__.py +16 -0
  3. examples/analyze_semantic_words.sh +17 -0
  4. examples/baseline/banking77_baseline.py +243 -0
  5. examples/baseline/banking77_pipeline_baseline.py +294 -0
  6. examples/baseline/crafter_baseline.py +407 -0
  7. examples/baseline/pokemon_red_baseline.py +326 -0
  8. examples/baseline/simple_baseline.py +56 -0
  9. examples/baseline/warming_up_to_rl_baseline.py +239 -0
  10. examples/blog_posts/gepa/README.md +355 -0
  11. examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
  12. examples/blog_posts/gepa/configs/banking77_gepa_test.toml +80 -0
  13. examples/blog_posts/gepa/configs/banking77_mipro_local.toml +50 -0
  14. examples/blog_posts/gepa/configs/banking77_pipeline_gepa_local.toml +101 -0
  15. examples/blog_posts/gepa/configs/banking77_pipeline_gepa_test.toml +96 -0
  16. examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +57 -0
  17. examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +35 -0
  18. examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +51 -0
  19. examples/blog_posts/gepa/configs/hover_gepa_local.toml +57 -0
  20. examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +35 -0
  21. examples/blog_posts/gepa/configs/hover_mipro_local.toml +51 -0
  22. examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +57 -0
  23. examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +35 -0
  24. examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +51 -0
  25. examples/blog_posts/gepa/configs/pupa_gepa_local.toml +58 -0
  26. examples/blog_posts/gepa/configs/pupa_mipro_local.toml +52 -0
  27. examples/blog_posts/gepa/deploy_banking77_task_app.sh +54 -0
  28. examples/blog_posts/gepa/gepa_baseline.py +204 -0
  29. examples/blog_posts/gepa/query_prompts_example.py +97 -0
  30. examples/blog_posts/gepa/run_gepa_banking77.sh +112 -0
  31. examples/blog_posts/gepa/run_gepa_banking77_pipeline.sh +163 -0
  32. examples/blog_posts/gepa/task_apps.py +105 -0
  33. examples/blog_posts/gepa/test_gepa_local.sh +67 -0
  34. examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
  35. examples/blog_posts/mipro/README.md +415 -0
  36. examples/blog_posts/mipro/configs/banking77_mipro_local.toml +91 -0
  37. examples/blog_posts/mipro/configs/banking77_mipro_test.toml +87 -0
  38. examples/blog_posts/mipro/configs/banking77_pipeline_mipro_gemini_flash_lite_local.toml +98 -0
  39. examples/blog_posts/mipro/configs/banking77_pipeline_mipro_gpt41mini_local.toml +96 -0
  40. examples/blog_posts/mipro/configs/banking77_pipeline_mipro_local.toml +94 -0
  41. examples/blog_posts/mipro/configs/banking77_pipeline_mipro_test.toml +170 -0
  42. examples/blog_posts/mipro/deploy_banking77_pipeline_task_app.sh +59 -0
  43. examples/blog_posts/mipro/deploy_banking77_task_app.sh +41 -0
  44. examples/blog_posts/mipro/multi_step.md +79 -0
  45. examples/blog_posts/mipro/run_mipro_banking77.sh +191 -0
  46. examples/blog_posts/mipro/run_mipro_banking77_pipeline.sh +171 -0
  47. examples/blog_posts/mipro/run_mipro_banking77_pipeline_gemini_flash_lite.sh +177 -0
  48. examples/blog_posts/mipro/run_mipro_banking77_pipeline_gpt41mini.sh +173 -0
  49. examples/blog_posts/mipro/verify_banking77_setup.sh +117 -0
  50. examples/blog_posts/pokemon_vl/README.md +98 -0
  51. examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
  52. examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +27 -0
  53. examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
  54. examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
  55. examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +43 -0
  56. examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
  57. examples/blog_posts/pokemon_vl/extract_images.py +239 -0
  58. examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
  59. examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
  60. examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
  61. examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
  62. examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
  63. examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
  64. examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
  65. examples/blog_posts/warming_up_to_rl/README.md +158 -0
  66. examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
  67. examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
  68. examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
  69. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
  70. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
  71. examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
  72. examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
  73. examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
  74. examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
  75. examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +91 -0
  76. examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
  77. examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
  78. examples/crafter_debug_render.py +186 -0
  79. examples/dev/qwen3_32b_qlora_4xh100.toml +45 -0
  80. examples/gepa/banking77_pipeline_gepa.toml +96 -0
  81. examples/gepa/multi_stage_gepa_example.toml +84 -0
  82. examples/gepa/run_gepa_banking77_pipeline.sh +157 -0
  83. examples/multi_step/SFT_README.md +147 -0
  84. examples/multi_step/configs/README_verilog_rl.md +77 -0
  85. examples/multi_step/configs/VERILOG_REWARDS.md +103 -0
  86. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +196 -0
  87. examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
  88. examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
  89. examples/multi_step/configs/crafter_rl_outcome.toml +75 -0
  90. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +145 -0
  91. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +84 -0
  92. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +79 -0
  93. examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
  94. examples/multi_step/configs/crafter_sft_qwen30b_lora.toml +62 -0
  95. examples/multi_step/configs/crafter_synth_backend.md +40 -0
  96. examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
  97. examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
  98. examples/multi_step/configs/verilog_rl_lora.toml +147 -0
  99. examples/multi_step/convert_traces_to_sft.py +84 -0
  100. examples/multi_step/crafter_rl_lora.md +70 -0
  101. examples/multi_step/judges/crafter_backend_judge.py +220 -0
  102. examples/multi_step/judges/verilog_backend_judge.py +234 -0
  103. examples/multi_step/readme.md +48 -0
  104. examples/multi_step/run_sft_qwen30b.sh +45 -0
  105. examples/multi_step/sse_metrics_streaming_notes.md +357 -0
  106. examples/multi_step/task_app_config_notes.md +494 -0
  107. examples/multi_step/verilog_rl_lora.md +218 -0
  108. examples/qwen_coder/README.md +102 -0
  109. examples/qwen_coder/_shared.py +113 -0
  110. examples/qwen_coder/configs/coder_lora_30b.toml +60 -0
  111. examples/qwen_coder/configs/coder_lora_4b.toml +61 -0
  112. examples/qwen_coder/configs/coder_lora_small.toml +57 -0
  113. examples/qwen_coder/generate_dataset.py +98 -0
  114. examples/qwen_coder/infer_ft_smoke.py +65 -0
  115. examples/qwen_coder/infer_prod_proxy.py +73 -0
  116. examples/qwen_coder/infer_via_synth.py +87 -0
  117. examples/qwen_coder/scripts/infer_coder.sh +19 -0
  118. examples/qwen_coder/scripts/train_coder_30b.sh +22 -0
  119. examples/qwen_coder/sft_full_17b.py +103 -0
  120. examples/qwen_coder/sft_lora_30b.py +110 -0
  121. examples/qwen_coder/subset_jsonl.py +39 -0
  122. examples/qwen_coder/todos.md +38 -0
  123. examples/qwen_coder/validate_jsonl.py +60 -0
  124. examples/qwen_vl/BUGS_AND_FIXES.md +232 -0
  125. examples/qwen_vl/IMAGE_VALIDATION_COMPLETE.md +271 -0
  126. examples/qwen_vl/IMAGE_VALIDATION_SUMMARY.md +260 -0
  127. examples/qwen_vl/INFERENCE_SFT_TESTS.md +412 -0
  128. examples/qwen_vl/NEXT_STEPS_2B.md +325 -0
  129. examples/qwen_vl/QUICKSTART.md +327 -0
  130. examples/qwen_vl/QUICKSTART_RL_VISION.md +110 -0
  131. examples/qwen_vl/README.md +152 -0
  132. examples/qwen_vl/RL_VISION_COMPLETE.md +475 -0
  133. examples/qwen_vl/RL_VISION_TESTING.md +333 -0
  134. examples/qwen_vl/SDK_VISION_INTEGRATION.md +328 -0
  135. examples/qwen_vl/SETUP_COMPLETE.md +274 -0
  136. examples/qwen_vl/VISION_TESTS_COMPLETE.md +489 -0
  137. examples/qwen_vl/VLM_PIPELINE_COMPLETE.md +242 -0
  138. examples/qwen_vl/__init__.py +2 -0
  139. examples/qwen_vl/collect_data_via_cli.md +415 -0
  140. examples/qwen_vl/collect_vision_traces.py +368 -0
  141. examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +110 -0
  142. examples/qwen_vl/configs/crafter_vlm_sft_example.toml +59 -0
  143. examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +26 -0
  144. examples/qwen_vl/configs/eval_gpt4o_vision_proper.toml +29 -0
  145. examples/qwen_vl/configs/eval_gpt5nano_vision.toml +26 -0
  146. examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
  147. examples/qwen_vl/configs/filter_qwen3vl_sft.toml +49 -0
  148. examples/qwen_vl/configs/filter_vision_sft.toml +52 -0
  149. examples/qwen_vl/configs/filter_vision_test.toml +8 -0
  150. examples/qwen_vl/configs/sft_qwen3_vl_2b_test.toml +54 -0
  151. examples/qwen_vl/crafter_gpt5nano_agent.py +308 -0
  152. examples/qwen_vl/crafter_qwen_vl_agent.py +300 -0
  153. examples/qwen_vl/run_vision_comparison.sh +61 -0
  154. examples/qwen_vl/run_vision_sft_pipeline.sh +175 -0
  155. examples/qwen_vl/test_image_validation.py +201 -0
  156. examples/qwen_vl/test_sft_vision_data.py +110 -0
  157. examples/rl/README.md +169 -0
  158. examples/rl/configs/eval_base_qwen.toml +17 -0
  159. examples/rl/configs/eval_rl_qwen.toml +13 -0
  160. examples/rl/configs/rl_from_base_qwen.toml +62 -0
  161. examples/rl/configs/rl_from_base_qwen17.toml +80 -0
  162. examples/rl/configs/rl_from_ft_qwen.toml +37 -0
  163. examples/rl/download_dataset.py +80 -0
  164. examples/rl/run_eval.py +436 -0
  165. examples/rl/run_rl_and_save.py +111 -0
  166. examples/rl/task_app/README.md +21 -0
  167. {synth_ai/task/apps → examples/rl/task_app}/math_single_step.py +188 -50
  168. examples/rl/task_app/math_task_app.py +111 -0
  169. examples/run_crafter_demo.sh +10 -0
  170. examples/sdk_prompt_learning_example.py +55 -0
  171. examples/sft/README.md +139 -0
  172. examples/sft/configs/crafter_fft_qwen0p6b.toml +49 -0
  173. examples/sft/configs/crafter_lora_qwen0p6b.toml +49 -0
  174. examples/sft/evaluate.py +117 -0
  175. examples/sft/export_dataset.py +120 -0
  176. examples/sft/generate_traces.py +164 -0
  177. examples/swe/__init__.py +12 -0
  178. examples/swe/task_app/README.md +135 -0
  179. examples/swe/task_app/__init__.py +2 -0
  180. examples/swe/task_app/grpo_swe_mini.py +604 -0
  181. examples/swe/task_app/grpo_swe_mini_task_app.py +124 -0
  182. examples/swe/task_app/hosted/README.md +173 -0
  183. examples/swe/task_app/hosted/__init__.py +5 -0
  184. examples/swe/task_app/hosted/branching.py +143 -0
  185. examples/swe/task_app/hosted/environment_routes.py +1289 -0
  186. examples/swe/task_app/hosted/envs/__init__.py +1 -0
  187. examples/swe/task_app/hosted/envs/crafter/__init__.py +6 -0
  188. examples/swe/task_app/hosted/envs/crafter/app.py +1 -0
  189. examples/swe/task_app/hosted/envs/crafter/environment.py +522 -0
  190. examples/swe/task_app/hosted/envs/crafter/policy.py +478 -0
  191. examples/swe/task_app/hosted/envs/crafter/react_agent.py +108 -0
  192. examples/swe/task_app/hosted/envs/crafter/shared.py +305 -0
  193. examples/swe/task_app/hosted/envs/crafter/tools.py +47 -0
  194. examples/swe/task_app/hosted/envs/mini_swe/__init__.py +8 -0
  195. examples/swe/task_app/hosted/envs/mini_swe/environment.py +1191 -0
  196. examples/swe/task_app/hosted/envs/mini_swe/policy.py +355 -0
  197. examples/swe/task_app/hosted/envs/mini_swe/shared.py +83 -0
  198. examples/swe/task_app/hosted/envs/mini_swe/tools.py +96 -0
  199. examples/swe/task_app/hosted/hosted_app.py +204 -0
  200. examples/swe/task_app/hosted/inference/__init__.py +5 -0
  201. examples/swe/task_app/hosted/inference/openai_client.py +584 -0
  202. examples/swe/task_app/hosted/main.py +100 -0
  203. examples/swe/task_app/hosted/policy_routes.py +1094 -0
  204. examples/swe/task_app/hosted/registry.py +195 -0
  205. examples/swe/task_app/hosted/rollout.py +1905 -0
  206. examples/swe/task_app/hosted/storage/__init__.py +5 -0
  207. examples/swe/task_app/hosted/storage/volume.py +211 -0
  208. examples/swe/task_app/hosted/test_agents.py +161 -0
  209. examples/swe/task_app/hosted/test_service.py +136 -0
  210. examples/swe/task_app/hosted/utils.py +62 -0
  211. examples/swe/task_app/morph_backend.py +178 -0
  212. examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
  213. examples/task_apps/TESTING.md +275 -0
  214. examples/task_apps/banking77/__init__.py +6 -0
  215. examples/task_apps/banking77/banking77_task_app.py +912 -0
  216. examples/task_apps/banking77/deploy_wrapper.py +46 -0
  217. examples/task_apps/banking77_pipeline/__init__.py +6 -0
  218. examples/task_apps/banking77_pipeline/banking77_pipeline_task_app.py +489 -0
  219. examples/task_apps/banking77_pipeline/deploy_wrapper.py +50 -0
  220. examples/task_apps/crafter/CREATE_SFT_DATASET.md +286 -0
  221. examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
  222. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +187 -0
  223. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +281 -0
  224. examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
  225. examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
  226. examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
  227. examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
  228. examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
  229. examples/task_apps/crafter/task_app/README.md +42 -0
  230. examples/task_apps/crafter/task_app/__init__.py +5 -0
  231. examples/task_apps/crafter/task_app/grpo_crafter.py +1055 -0
  232. examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +146 -0
  233. examples/task_apps/crafter/task_app/synth_envs_hosted/README.md +173 -0
  234. examples/task_apps/crafter/task_app/synth_envs_hosted/__init__.py +5 -0
  235. examples/task_apps/crafter/task_app/synth_envs_hosted/branching.py +143 -0
  236. examples/task_apps/crafter/task_app/synth_envs_hosted/environment_routes.py +1226 -0
  237. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/__init__.py +1 -0
  238. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
  239. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
  240. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +532 -0
  241. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +583 -0
  242. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +122 -0
  243. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
  244. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
  245. examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +253 -0
  246. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/__init__.py +5 -0
  247. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +999 -0
  248. examples/task_apps/crafter/task_app/synth_envs_hosted/main.py +100 -0
  249. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +1252 -0
  250. examples/task_apps/crafter/task_app/synth_envs_hosted/registry.py +195 -0
  251. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +2233 -0
  252. examples/task_apps/crafter/task_app/synth_envs_hosted/storage/__init__.py +5 -0
  253. examples/task_apps/crafter/task_app/synth_envs_hosted/storage/volume.py +211 -0
  254. examples/task_apps/crafter/task_app/synth_envs_hosted/test_agents.py +161 -0
  255. examples/task_apps/crafter/task_app/synth_envs_hosted/test_service.py +136 -0
  256. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +411 -0
  257. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  258. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  259. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  260. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  261. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  262. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  263. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  264. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  265. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  266. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  267. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  268. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  269. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  270. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  271. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  272. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  273. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  274. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  275. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  276. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  277. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  278. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  279. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  280. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  281. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  282. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  283. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  284. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  285. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  286. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  287. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  288. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  289. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  290. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  291. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  292. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  293. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  294. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  295. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  296. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  297. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  298. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  299. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  300. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  301. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  302. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  303. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  304. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  305. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  306. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  307. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  308. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  309. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  310. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  311. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  312. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  313. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  314. examples/task_apps/enron/__init__.py +2 -0
  315. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  316. examples/task_apps/enron/filter_sft.toml +5 -0
  317. examples/task_apps/enron/task_app/README.md +14 -0
  318. examples/task_apps/enron/task_app/__init__.py +1 -0
  319. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  320. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  321. examples/task_apps/enron/tests/__init__.py +4 -0
  322. examples/task_apps/enron/tests/conftest.py +115 -0
  323. examples/task_apps/enron/tests/integration/__init__.py +4 -0
  324. examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
  325. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  326. examples/task_apps/enron/tests/unit/__init__.py +4 -0
  327. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  328. examples/task_apps/gepa_benchmarks/__init__.py +7 -0
  329. examples/task_apps/gepa_benchmarks/common.py +260 -0
  330. examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
  331. examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
  332. examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
  333. examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
  334. examples/task_apps/math/README.md +21 -0
  335. examples/task_apps/math/math_single_step.py +1000 -0
  336. examples/task_apps/math/math_task_app.py +115 -0
  337. examples/task_apps/pokemon_battle/__init__.py +2 -0
  338. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  339. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  340. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  341. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  342. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
  343. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
  344. examples/task_apps/pokemon_red/README.md +356 -0
  345. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +428 -0
  346. examples/task_apps/pokemon_red/__init__.py +3 -0
  347. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +30 -0
  348. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +224 -0
  349. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
  350. examples/task_apps/pokemon_red/task_app.py +1048 -0
  351. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
  352. examples/task_apps/sokoban/README.md +306 -0
  353. examples/task_apps/sokoban/__init__.py +3 -0
  354. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  355. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  356. examples/task_apps/sokoban/filter_sft.toml +5 -0
  357. examples/task_apps/sokoban/task_app.py +1058 -0
  358. examples/task_apps/sokoban/tests/__init__.py +4 -0
  359. examples/task_apps/sokoban/tests/conftest.py +113 -0
  360. examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
  361. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  362. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  363. examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
  364. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  365. examples/task_apps/verilog/__init__.py +1 -0
  366. examples/task_apps/verilog/eval_groq_qwen32b.toml +22 -0
  367. examples/task_apps/verilog/filter_sft.toml +5 -0
  368. examples/task_apps/verilog/task_app/README.md +12 -0
  369. examples/task_apps/verilog/task_app/__init__.py +1 -0
  370. examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
  371. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  372. examples/task_apps/verilog/tests/__init__.py +4 -0
  373. examples/task_apps/verilog/tests/conftest.py +115 -0
  374. examples/task_apps/verilog/tests/integration/__init__.py +4 -0
  375. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
  376. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  377. examples/task_apps/verilog/tests/unit/__init__.py +4 -0
  378. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  379. examples/tunnel_gepa_banking77/README.md +106 -0
  380. examples/tunnel_gepa_banking77/banking77_gepa_tunnel.toml +95 -0
  381. examples/tunnel_gepa_banking77/keep_tunnel_running.py +60 -0
  382. examples/tunnel_gepa_banking77/run_gepa_with_tunnel.sh +226 -0
  383. examples/vlm/PROPOSAL.md +53 -0
  384. examples/vlm/README.md +68 -0
  385. examples/vlm/configs/crafter_vlm_gpt4o.toml +49 -0
  386. examples/vlm/crafter_image_only_agent.py +207 -0
  387. examples/vlm/crafter_openai_vlm_agent.py +275 -0
  388. examples/vlm/filter_image_rows.py +63 -0
  389. examples/vlm/run_crafter_vlm_benchmark.py +316 -0
  390. examples/warming_up_to_rl/_utils.py +92 -0
  391. examples/warming_up_to_rl/analyze_trace_db.py +422 -0
  392. examples/warming_up_to_rl/configs/crafter_fft.toml +53 -0
  393. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +54 -0
  394. examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +22 -0
  395. examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +15 -0
  396. examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +24 -0
  397. examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +35 -0
  398. examples/warming_up_to_rl/configs/eval_stepwise_consistent.toml +26 -0
  399. examples/warming_up_to_rl/configs/eval_stepwise_per_achievement.toml +36 -0
  400. examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +32 -0
  401. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +85 -0
  402. examples/warming_up_to_rl/configs/rl_from_ft.toml +58 -0
  403. examples/warming_up_to_rl/export_trace_sft.py +837 -0
  404. examples/warming_up_to_rl/groq_test.py +97 -0
  405. examples/warming_up_to_rl/manage_secrets.py +131 -0
  406. examples/warming_up_to_rl/old/event_rewards.md +234 -0
  407. examples/warming_up_to_rl/old/notes.md +73 -0
  408. examples/warming_up_to_rl/readme.md +110 -0
  409. examples/warming_up_to_rl/run_eval.py +736 -0
  410. examples/warming_up_to_rl/run_fft_and_save.py +380 -0
  411. examples/warming_up_to_rl/run_local_rollout.py +239 -0
  412. examples/warming_up_to_rl/run_local_rollout_modal.py +248 -0
  413. examples/warming_up_to_rl/run_local_rollout_parallel.py +405 -0
  414. examples/warming_up_to_rl/run_local_rollout_traced.py +477 -0
  415. examples/warming_up_to_rl/run_rl_and_save.py +124 -0
  416. examples/warming_up_to_rl/run_rollout_remote.py +156 -0
  417. examples/warming_up_to_rl/task_app/README.md +42 -0
  418. examples/warming_up_to_rl/task_app/grpo_crafter.py +876 -0
  419. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +135 -0
  420. examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
  421. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
  422. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +143 -0
  423. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1226 -0
  424. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
  425. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
  426. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
  427. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +522 -0
  428. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +454 -0
  429. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +108 -0
  430. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
  431. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
  432. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +253 -0
  433. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
  434. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +729 -0
  435. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +100 -0
  436. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +1114 -0
  437. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +195 -0
  438. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1891 -0
  439. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
  440. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +211 -0
  441. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +161 -0
  442. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +137 -0
  443. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +129 -0
  444. examples/workflows/math_rl/configs/eval_base_qwen.toml +15 -0
  445. examples/workflows/math_rl/configs/eval_rl_qwen.toml +11 -0
  446. examples/workflows/math_rl/configs/rl_from_base_qwen.toml +62 -0
  447. examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +80 -0
  448. examples/workflows/math_rl/configs/rl_from_ft_qwen.toml +35 -0
  449. examples/workflows/math_rl/download_dataset.py +80 -0
  450. examples/workflows/math_rl/run_eval.py +436 -0
  451. examples/workflows/math_rl/run_rl_and_save.py +111 -0
  452. synth_ai/__init__.py +47 -23
  453. synth_ai/_utils/__init__.py +47 -0
  454. synth_ai/_utils/base_url.py +10 -0
  455. synth_ai/_utils/http.py +10 -0
  456. synth_ai/_utils/prompts.py +10 -0
  457. synth_ai/_utils/task_app_state.py +12 -0
  458. synth_ai/_utils/user_config.py +10 -0
  459. synth_ai/api/models/supported.py +514 -0
  460. synth_ai/api/train/__init__.py +60 -2
  461. synth_ai/api/train/builders.py +347 -39
  462. synth_ai/api/train/cli.py +895 -160
  463. synth_ai/api/train/config_finder.py +103 -25
  464. synth_ai/api/train/configs/__init__.py +65 -0
  465. synth_ai/api/train/configs/prompt_learning.py +496 -0
  466. synth_ai/api/train/configs/rl.py +188 -0
  467. synth_ai/api/train/configs/sft.py +99 -0
  468. synth_ai/api/train/configs/shared.py +81 -0
  469. synth_ai/api/train/env_resolver.py +70 -20
  470. synth_ai/api/train/pollers.py +29 -4
  471. synth_ai/api/train/prompt_learning.py +425 -0
  472. synth_ai/api/train/sft.py +390 -0
  473. synth_ai/api/train/supported_algos.py +147 -0
  474. synth_ai/api/train/task_app.py +6 -4
  475. synth_ai/api/train/utils.py +64 -52
  476. synth_ai/api/train/validators.py +1117 -0
  477. synth_ai/api/tunnel.py +49 -0
  478. synth_ai/auth/credentials.py +94 -0
  479. synth_ai/baseline/__init__.py +25 -0
  480. synth_ai/baseline/config.py +209 -0
  481. synth_ai/baseline/discovery.py +214 -0
  482. synth_ai/baseline/execution.py +146 -0
  483. synth_ai/cfgs.py +227 -0
  484. synth_ai/cli/__init__.py +85 -63
  485. synth_ai/cli/_modal_wrapper.py +31 -0
  486. synth_ai/cli/_storage.py +20 -0
  487. synth_ai/cli/_typer_patch.py +47 -0
  488. synth_ai/cli/_validate_task_app.py +29 -0
  489. synth_ai/cli/balance.py +16 -4
  490. synth_ai/cli/calc.py +36 -21
  491. synth_ai/cli/claude.py +70 -0
  492. synth_ai/cli/codex.py +267 -0
  493. synth_ai/cli/commands/__init__.py +18 -0
  494. synth_ai/cli/commands/baseline/__init__.py +12 -0
  495. synth_ai/cli/commands/baseline/core.py +637 -0
  496. synth_ai/cli/commands/baseline/list.py +93 -0
  497. synth_ai/cli/commands/demo/__init__.py +6 -0
  498. synth_ai/cli/commands/demo/core.py +163 -0
  499. synth_ai/cli/commands/eval/__init__.py +19 -0
  500. synth_ai/cli/commands/eval/core.py +1112 -0
  501. synth_ai/cli/commands/eval/errors.py +81 -0
  502. synth_ai/cli/commands/eval/validation.py +133 -0
  503. synth_ai/cli/commands/filter/__init__.py +12 -0
  504. synth_ai/cli/commands/filter/core.py +424 -0
  505. synth_ai/cli/commands/filter/errors.py +55 -0
  506. synth_ai/cli/commands/filter/validation.py +77 -0
  507. synth_ai/cli/commands/help/__init__.py +185 -0
  508. synth_ai/cli/commands/help/core.py +72 -0
  509. synth_ai/cli/commands/smoke/__init__.py +7 -0
  510. synth_ai/cli/commands/smoke/core.py +1437 -0
  511. synth_ai/cli/commands/status/__init__.py +66 -0
  512. synth_ai/cli/commands/status/client.py +192 -0
  513. synth_ai/cli/commands/status/config.py +92 -0
  514. synth_ai/cli/commands/status/errors.py +20 -0
  515. synth_ai/cli/commands/status/formatters.py +164 -0
  516. synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
  517. synth_ai/cli/commands/status/subcommands/files.py +79 -0
  518. synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
  519. synth_ai/cli/commands/status/subcommands/models.py +79 -0
  520. synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
  521. synth_ai/cli/commands/status/subcommands/runs.py +81 -0
  522. synth_ai/cli/commands/status/subcommands/session.py +183 -0
  523. synth_ai/cli/commands/status/subcommands/summary.py +47 -0
  524. synth_ai/cli/commands/status/subcommands/usage.py +203 -0
  525. synth_ai/cli/commands/status/utils.py +114 -0
  526. synth_ai/cli/commands/train/__init__.py +53 -0
  527. synth_ai/cli/commands/train/core.py +21 -0
  528. synth_ai/cli/commands/train/errors.py +117 -0
  529. synth_ai/cli/commands/train/judge_schemas.py +200 -0
  530. synth_ai/cli/commands/train/judge_validation.py +305 -0
  531. synth_ai/cli/commands/train/validation.py +386 -0
  532. synth_ai/cli/demo.py +32 -140
  533. synth_ai/cli/deploy.py +233 -0
  534. synth_ai/cli/eval/__init__.py +36 -0
  535. synth_ai/cli/eval/core.py +5 -0
  536. synth_ai/cli/eval/errors.py +31 -0
  537. synth_ai/cli/eval/validation.py +5 -0
  538. synth_ai/cli/filter/__init__.py +28 -0
  539. synth_ai/cli/filter/core.py +5 -0
  540. synth_ai/cli/filter/errors.py +23 -0
  541. synth_ai/cli/filter/validation.py +5 -0
  542. synth_ai/cli/legacy_root_backup.py +28 -22
  543. synth_ai/cli/lib/__init__.py +10 -0
  544. synth_ai/cli/lib/task_app_discovery.py +7 -0
  545. synth_ai/cli/lib/task_app_env.py +518 -0
  546. synth_ai/cli/mcp.py +34 -0
  547. synth_ai/cli/modal_serve/__init__.py +12 -0
  548. synth_ai/cli/modal_serve/core.py +14 -0
  549. synth_ai/cli/modal_serve/errors.py +8 -0
  550. synth_ai/cli/modal_serve/validation.py +11 -0
  551. synth_ai/cli/opencode.py +256 -0
  552. synth_ai/cli/recent.py +13 -7
  553. synth_ai/cli/rl_demo.py +156 -116
  554. synth_ai/cli/root.py +131 -132
  555. synth_ai/cli/serve/__init__.py +12 -0
  556. synth_ai/cli/serve/core.py +14 -0
  557. synth_ai/cli/serve/errors.py +8 -0
  558. synth_ai/cli/serve/validation.py +11 -0
  559. synth_ai/cli/setup.py +49 -0
  560. synth_ai/cli/status.py +7 -125
  561. synth_ai/cli/task_app_deploy.py +7 -0
  562. synth_ai/cli/task_app_list.py +25 -0
  563. synth_ai/cli/task_app_modal_serve.py +11 -0
  564. synth_ai/cli/task_app_serve.py +11 -0
  565. synth_ai/cli/task_apps.py +2284 -257
  566. synth_ai/cli/traces.py +9 -5
  567. synth_ai/cli/train/__init__.py +12 -0
  568. synth_ai/cli/train/core.py +21 -0
  569. synth_ai/cli/train/errors.py +8 -0
  570. synth_ai/cli/train/validation.py +24 -0
  571. synth_ai/cli/train.py +5 -0
  572. synth_ai/cli/turso.py +73 -0
  573. synth_ai/cli/watch.py +13 -18
  574. synth_ai/demos/__init__.py +10 -0
  575. synth_ai/demos/core/__init__.py +28 -1
  576. synth_ai/demos/core/cli.py +579 -291
  577. synth_ai/demos/crafter/__init__.py +1 -0
  578. synth_ai/demos/crafter/crafter_fft_4b.toml +55 -0
  579. synth_ai/demos/crafter/grpo_crafter_task_app.py +185 -0
  580. synth_ai/demos/crafter/rl_from_base_qwen4b.toml +74 -0
  581. synth_ai/demos/demo_registry.py +176 -0
  582. synth_ai/demos/demo_task_apps/__init__.py +3 -3
  583. synth_ai/demos/demo_task_apps/core.py +64 -28
  584. synth_ai/demos/demo_task_apps/crafter/__init__.py +1 -0
  585. synth_ai/demos/demo_task_apps/crafter/configs/crafter_fft_4b.toml +53 -0
  586. synth_ai/demos/demo_task_apps/crafter/configs/rl_from_base_qwen4b.toml +73 -0
  587. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +184 -0
  588. synth_ai/demos/demo_task_apps/math/_common.py +1 -2
  589. synth_ai/demos/demo_task_apps/math/app.py +2 -1
  590. synth_ai/demos/demo_task_apps/math/deploy_modal.py +3 -6
  591. synth_ai/demos/demo_task_apps/math/modal_task_app.py +185 -83
  592. synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -2
  593. synth_ai/demos/math/__init__.py +1 -0
  594. synth_ai/demos/math/_common.py +16 -0
  595. synth_ai/demos/math/app.py +38 -0
  596. synth_ai/demos/math/config.toml +76 -0
  597. synth_ai/demos/math/deploy_modal.py +54 -0
  598. synth_ai/demos/math/modal_task_app.py +703 -0
  599. synth_ai/demos/math/task_app_entry.py +51 -0
  600. synth_ai/environments/environment/core.py +7 -1
  601. synth_ai/environments/examples/bandit/engine.py +12 -5
  602. synth_ai/environments/examples/bandit/environment.py +0 -1
  603. synth_ai/environments/examples/bandit/taskset.py +4 -4
  604. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
  605. synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
  606. synth_ai/environments/examples/crafter_classic/environment.py +93 -2
  607. synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
  608. synth_ai/environments/examples/enron/engine.py +7 -2
  609. synth_ai/environments/examples/enron/environment.py +68 -0
  610. synth_ai/environments/examples/red/engine.py +60 -12
  611. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  612. synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
  613. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  614. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  615. synth_ai/environments/examples/red/environment.py +86 -0
  616. synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
  617. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  618. synth_ai/environments/examples/verilog/engine.py +104 -12
  619. synth_ai/environments/examples/wordle/environment.py +0 -1
  620. synth_ai/environments/reproducibility/tree.py +5 -6
  621. synth_ai/environments/service/app.py +11 -12
  622. synth_ai/environments/service/core_routes.py +10 -9
  623. synth_ai/environments/stateful/engine.py +1 -1
  624. synth_ai/environments/tasks/core.py +1 -0
  625. synth_ai/environments/tasks/filters.py +5 -6
  626. synth_ai/environments/tasks/utils.py +4 -5
  627. synth_ai/evals/__init__.py +15 -0
  628. synth_ai/evals/base.py +14 -5
  629. synth_ai/evals/client.py +82 -0
  630. synth_ai/evals/types.py +42 -0
  631. synth_ai/http.py +8 -22
  632. synth_ai/http_client.py +45 -12
  633. synth_ai/inference/__init__.py +0 -2
  634. synth_ai/inference/client.py +21 -7
  635. synth_ai/jobs/client.py +129 -80
  636. synth_ai/judge_schemas.py +127 -0
  637. synth_ai/learning/__init__.py +51 -6
  638. synth_ai/learning/algorithms.py +14 -0
  639. synth_ai/learning/client.py +122 -30
  640. synth_ai/learning/config.py +2 -40
  641. synth_ai/learning/constants.py +0 -2
  642. synth_ai/learning/ft_client.py +4 -56
  643. synth_ai/learning/health.py +14 -8
  644. synth_ai/learning/jobs.py +43 -47
  645. synth_ai/learning/prompt_learning_client.py +276 -0
  646. synth_ai/learning/prompt_learning_types.py +185 -0
  647. synth_ai/{rl → learning/rl}/__init__.py +14 -5
  648. synth_ai/learning/rl/client.py +269 -0
  649. synth_ai/learning/rl/config.py +31 -0
  650. synth_ai/{rl → learning/rl}/contracts.py +5 -10
  651. synth_ai/{rl → learning/rl}/env_keys.py +45 -16
  652. synth_ai/learning/rl/secrets.py +13 -0
  653. synth_ai/learning/rl_client.py +2 -253
  654. synth_ai/learning/sft/__init__.py +29 -0
  655. synth_ai/learning/sft/client.py +68 -0
  656. synth_ai/learning/sft/config.py +270 -0
  657. synth_ai/learning/sft/data.py +698 -0
  658. synth_ai/learning/sse.py +25 -26
  659. synth_ai/learning/validators.py +29 -25
  660. synth_ai/mcp/__init__.py +5 -0
  661. synth_ai/mcp/__main__.py +8 -0
  662. synth_ai/mcp/main.py +254 -0
  663. synth_ai/mcp/setup.py +100 -0
  664. synth_ai/modal.py +257 -0
  665. synth_ai/pricing/__init__.py +3 -0
  666. synth_ai/pricing/model_pricing.py +64 -0
  667. synth_ai/session/__init__.py +75 -0
  668. synth_ai/session/client.py +383 -0
  669. synth_ai/session/constants.py +63 -0
  670. synth_ai/session/exceptions.py +105 -0
  671. synth_ai/session/manager.py +139 -0
  672. synth_ai/session/models.py +89 -0
  673. synth_ai/session/query.py +110 -0
  674. synth_ai/spec/__init__.py +46 -0
  675. synth_ai/spec/dataclasses.py +149 -0
  676. synth_ai/spec/loader.py +144 -0
  677. synth_ai/spec/serializer.py +199 -0
  678. synth_ai/spec/validation.py +250 -0
  679. synth_ai/streaming/__init__.py +29 -0
  680. synth_ai/streaming/config.py +94 -0
  681. synth_ai/streaming/handlers.py +589 -0
  682. synth_ai/streaming/streamer.py +320 -0
  683. synth_ai/streaming/types.py +95 -0
  684. synth_ai/task/__init__.py +50 -30
  685. synth_ai/task/apps/__init__.py +63 -19
  686. synth_ai/task/auth.py +35 -23
  687. synth_ai/task/client.py +15 -13
  688. synth_ai/task/config.py +261 -0
  689. synth_ai/task/contracts.py +165 -64
  690. synth_ai/task/datasets.py +9 -6
  691. synth_ai/task/errors.py +11 -10
  692. synth_ai/task/health.py +17 -11
  693. synth_ai/task/inference_api.py +101 -0
  694. synth_ai/task/json.py +58 -24
  695. synth_ai/task/proxy.py +59 -66
  696. synth_ai/task/rubrics/__init__.py +55 -0
  697. synth_ai/task/rubrics/loaders.py +156 -0
  698. synth_ai/task/rubrics/models.py +57 -0
  699. synth_ai/task/rubrics/scoring.py +116 -0
  700. synth_ai/task/rubrics/strict.py +149 -0
  701. synth_ai/task/rubrics.py +22 -15
  702. synth_ai/task/server.py +65 -31
  703. synth_ai/task/trace_correlation_helpers.py +328 -0
  704. synth_ai/task/tracing_utils.py +44 -28
  705. synth_ai/task/validators.py +449 -6
  706. synth_ai/task/vendors.py +5 -7
  707. synth_ai/tracing_v3/__init__.py +4 -0
  708. synth_ai/tracing_v3/abstractions.py +21 -4
  709. synth_ai/tracing_v3/config.py +167 -22
  710. synth_ai/tracing_v3/constants.py +21 -0
  711. synth_ai/tracing_v3/db_config.py +42 -29
  712. synth_ai/tracing_v3/decorators.py +80 -45
  713. synth_ai/tracing_v3/examples/basic_usage.py +15 -9
  714. synth_ai/tracing_v3/hooks.py +6 -4
  715. synth_ai/tracing_v3/llm_call_record_helpers.py +161 -61
  716. synth_ai/tracing_v3/migration_helper.py +1 -2
  717. synth_ai/tracing_v3/replica_sync.py +12 -7
  718. synth_ai/tracing_v3/serialization.py +130 -0
  719. synth_ai/tracing_v3/session_tracer.py +73 -16
  720. synth_ai/tracing_v3/storage/base.py +89 -1
  721. synth_ai/tracing_v3/storage/config.py +63 -16
  722. synth_ai/tracing_v3/storage/factory.py +11 -9
  723. synth_ai/tracing_v3/storage/utils.py +15 -11
  724. synth_ai/tracing_v3/trace_utils.py +317 -0
  725. synth_ai/tracing_v3/turso/__init__.py +8 -21
  726. synth_ai/tracing_v3/turso/daemon.py +123 -15
  727. synth_ai/tracing_v3/turso/models.py +5 -2
  728. synth_ai/tracing_v3/turso/native_manager.py +1293 -0
  729. synth_ai/tracing_v3/utils.py +5 -4
  730. synth_ai/tunnel.py +143 -0
  731. synth_ai/tunnel_deploy.py +278 -0
  732. synth_ai/types.py +8 -0
  733. synth_ai/urls.py +11 -0
  734. synth_ai/utils/__init__.py +166 -0
  735. synth_ai/utils/agents.py +74 -0
  736. synth_ai/utils/apps.py +152 -0
  737. synth_ai/utils/base_url.py +94 -0
  738. synth_ai/utils/bin.py +39 -0
  739. synth_ai/utils/claude.py +36 -0
  740. synth_ai/utils/cli.py +284 -0
  741. synth_ai/utils/config.py +81 -0
  742. synth_ai/utils/env.py +346 -0
  743. synth_ai/utils/errors.py +85 -0
  744. synth_ai/utils/http.py +172 -0
  745. synth_ai/utils/json.py +72 -0
  746. synth_ai/utils/log_filter.py +99 -0
  747. synth_ai/utils/logging.py +198 -0
  748. synth_ai/utils/modal.py +299 -0
  749. synth_ai/utils/paths.py +95 -0
  750. synth_ai/utils/process.py +233 -0
  751. synth_ai/utils/prompts.py +39 -0
  752. synth_ai/utils/sqld.py +122 -0
  753. synth_ai/utils/ssl.py +25 -0
  754. synth_ai/utils/task_app_discovery.py +882 -0
  755. synth_ai/utils/task_app_env.py +186 -0
  756. synth_ai/utils/task_app_state.py +318 -0
  757. synth_ai/utils/tunnel/__init__.py +12 -0
  758. synth_ai/utils/tunnel/config.py +55 -0
  759. synth_ai/utils/user_config.py +137 -0
  760. synth_ai/uvicorn.py +77 -0
  761. synth_ai-0.2.23.dev3.dist-info/METADATA +357 -0
  762. synth_ai-0.2.23.dev3.dist-info/RECORD +983 -0
  763. {synth_ai-0.2.9.dev0.dist-info → synth_ai-0.2.23.dev3.dist-info}/entry_points.txt +0 -1
  764. {synth_ai-0.2.9.dev0.dist-info → synth_ai-0.2.23.dev3.dist-info}/top_level.txt +1 -0
  765. synth_ai/cli/man.py +0 -106
  766. synth_ai/core/experiment.py +0 -15
  767. synth_ai/core/system.py +0 -15
  768. synth_ai/demo_registry.py +0 -258
  769. synth_ai/environments/examples/sokoban/units/astar_common.py +0 -95
  770. synth_ai/experimental/synth_oss.py +0 -446
  771. synth_ai/handshake.py +0 -107
  772. synth_ai/install_sqld.sh +0 -40
  773. synth_ai/learning/offline/dpo.py +0 -0
  774. synth_ai/learning/offline/providers.py +0 -7
  775. synth_ai/learning/offline/sft.py +0 -0
  776. synth_ai/learning/offline/shared.py +0 -0
  777. synth_ai/learning/online/grpo.py +0 -0
  778. synth_ai/learning/online/irft.py +0 -0
  779. synth_ai/learning/prompts/banking77_injection_eval.py +0 -168
  780. synth_ai/learning/prompts/gepa.py +0 -0
  781. synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +0 -213
  782. synth_ai/learning/prompts/mipro.py +0 -289
  783. synth_ai/learning/prompts/random_search.py +0 -246
  784. synth_ai/learning/prompts/run_mipro_banking77.py +0 -172
  785. synth_ai/learning/prompts/run_random_search_banking77.py +0 -324
  786. synth_ai/lm/__init__.py +0 -51
  787. synth_ai/lm/caching/constants.py +0 -6
  788. synth_ai/lm/caching/dbs.py +0 -0
  789. synth_ai/lm/caching/ephemeral.py +0 -102
  790. synth_ai/lm/caching/handler.py +0 -137
  791. synth_ai/lm/caching/initialize.py +0 -11
  792. synth_ai/lm/caching/persistent.py +0 -114
  793. synth_ai/lm/config.py +0 -110
  794. synth_ai/lm/constants.py +0 -32
  795. synth_ai/lm/core/__init__.py +0 -8
  796. synth_ai/lm/core/all.py +0 -73
  797. synth_ai/lm/core/exceptions.py +0 -7
  798. synth_ai/lm/core/main.py +0 -319
  799. synth_ai/lm/core/main_v3.py +0 -594
  800. synth_ai/lm/core/synth_models.py +0 -48
  801. synth_ai/lm/core/vendor_clients.py +0 -188
  802. synth_ai/lm/cost/monitor.py +0 -1
  803. synth_ai/lm/cost/statefulness.py +0 -1
  804. synth_ai/lm/injection.py +0 -80
  805. synth_ai/lm/overrides.py +0 -206
  806. synth_ai/lm/provider_support/__init__.py +0 -8
  807. synth_ai/lm/provider_support/anthropic.py +0 -972
  808. synth_ai/lm/provider_support/openai.py +0 -1139
  809. synth_ai/lm/provider_support/suppress_logging.py +0 -31
  810. synth_ai/lm/structured_outputs/handler.py +0 -440
  811. synth_ai/lm/structured_outputs/inject.py +0 -297
  812. synth_ai/lm/structured_outputs/rehabilitate.py +0 -185
  813. synth_ai/lm/tools/__init__.py +0 -3
  814. synth_ai/lm/tools/base.py +0 -172
  815. synth_ai/lm/unified_interface.py +0 -202
  816. synth_ai/lm/vendors/base.py +0 -81
  817. synth_ai/lm/vendors/core/anthropic_api.py +0 -387
  818. synth_ai/lm/vendors/core/gemini_api.py +0 -292
  819. synth_ai/lm/vendors/core/mistral_api.py +0 -322
  820. synth_ai/lm/vendors/core/openai_api.py +0 -225
  821. synth_ai/lm/vendors/core/synth_dev_api.py +0 -0
  822. synth_ai/lm/vendors/local/ollama.py +0 -0
  823. synth_ai/lm/vendors/openai_standard.py +0 -780
  824. synth_ai/lm/vendors/openai_standard_responses.py +0 -256
  825. synth_ai/lm/vendors/retries.py +0 -22
  826. synth_ai/lm/vendors/supported/custom_endpoint.py +0 -417
  827. synth_ai/lm/vendors/supported/deepseek.py +0 -69
  828. synth_ai/lm/vendors/supported/grok.py +0 -75
  829. synth_ai/lm/vendors/supported/groq.py +0 -16
  830. synth_ai/lm/vendors/supported/ollama.py +0 -15
  831. synth_ai/lm/vendors/supported/openrouter.py +0 -74
  832. synth_ai/lm/vendors/supported/together.py +0 -11
  833. synth_ai/lm/vendors/synth_client.py +0 -808
  834. synth_ai/lm/warmup.py +0 -186
  835. synth_ai/rl/secrets.py +0 -19
  836. synth_ai/scripts/verify_rewards.py +0 -100
  837. synth_ai/task/apps/grpo_crafter.py +0 -438
  838. synth_ai/tracing/__init__.py +0 -30
  839. synth_ai/tracing_v1/__init__.py +0 -33
  840. synth_ai/tracing_v3/turso/manager.py +0 -774
  841. synth_ai/v0/tracing/abstractions.py +0 -224
  842. synth_ai/v0/tracing/base_client.py +0 -91
  843. synth_ai/v0/tracing/client_manager.py +0 -131
  844. synth_ai/v0/tracing/config.py +0 -142
  845. synth_ai/v0/tracing/context.py +0 -146
  846. synth_ai/v0/tracing/decorators.py +0 -682
  847. synth_ai/v0/tracing/events/__init__.py +0 -0
  848. synth_ai/v0/tracing/events/manage.py +0 -147
  849. synth_ai/v0/tracing/events/scope.py +0 -86
  850. synth_ai/v0/tracing/events/store.py +0 -228
  851. synth_ai/v0/tracing/immediate_client.py +0 -151
  852. synth_ai/v0/tracing/local.py +0 -18
  853. synth_ai/v0/tracing/log_client_base.py +0 -73
  854. synth_ai/v0/tracing/retry_queue.py +0 -186
  855. synth_ai/v0/tracing/trackers.py +0 -515
  856. synth_ai/v0/tracing/upload.py +0 -512
  857. synth_ai/v0/tracing/utils.py +0 -9
  858. synth_ai/v0/tracing_v1/__init__.py +0 -16
  859. synth_ai/v0/tracing_v1/abstractions.py +0 -224
  860. synth_ai/v0/tracing_v1/base_client.py +0 -91
  861. synth_ai/v0/tracing_v1/client_manager.py +0 -131
  862. synth_ai/v0/tracing_v1/config.py +0 -142
  863. synth_ai/v0/tracing_v1/context.py +0 -146
  864. synth_ai/v0/tracing_v1/decorators.py +0 -703
  865. synth_ai/v0/tracing_v1/events/__init__.py +0 -0
  866. synth_ai/v0/tracing_v1/events/manage.py +0 -147
  867. synth_ai/v0/tracing_v1/events/scope.py +0 -86
  868. synth_ai/v0/tracing_v1/events/store.py +0 -228
  869. synth_ai/v0/tracing_v1/immediate_client.py +0 -151
  870. synth_ai/v0/tracing_v1/local.py +0 -18
  871. synth_ai/v0/tracing_v1/log_client_base.py +0 -73
  872. synth_ai/v0/tracing_v1/retry_queue.py +0 -186
  873. synth_ai/v0/tracing_v1/trackers.py +0 -515
  874. synth_ai/v0/tracing_v1/upload.py +0 -527
  875. synth_ai/v0/tracing_v1/utils.py +0 -9
  876. synth_ai/zyk/__init__.py +0 -30
  877. synth_ai-0.2.9.dev0.dist-info/METADATA +0 -131
  878. synth_ai-0.2.9.dev0.dist-info/RECORD +0 -444
  879. {synth_ai/lm/caching → examples/task_apps}/__init__.py +0 -0
  880. {synth_ai/lm/cost → examples/task_apps/crafter}/__init__.py +0 -0
  881. {synth_ai/lm/structured_outputs → examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server}/__init__.py +0 -0
  882. {synth_ai/lm/vendors → examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests}/__init__.py +0 -0
  883. {synth_ai/lm/vendors/core → examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils}/__init__.py +0 -0
  884. {synth_ai/lm/vendors/local → examples/task_apps/math}/__init__.py +0 -0
  885. {synth_ai/lm/vendors/supported → examples/workflows}/__init__.py +0 -0
  886. {synth_ai/v0/tracing → examples/workflows/math_rl}/__init__.py +0 -0
  887. /synth_ai/{compound/cais.py → cli/__main__.py} +0 -0
  888. /synth_ai/{learning/filtering.py → py.typed} +0 -0
  889. {synth_ai-0.2.9.dev0.dist-info → synth_ai-0.2.23.dev3.dist-info}/WHEEL +0 -0
  890. {synth_ai-0.2.9.dev0.dist-info → synth_ai-0.2.23.dev3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1191 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import contextlib
5
+ import json
6
+ import logging
7
+ import os
8
+ import shlex
9
+ import shutil
10
+ import subprocess
11
+ import threading
12
+ import time
13
+ import uuid
14
+ from dataclasses import dataclass, field
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+ from minisweagent.environments import get_environment
19
+ from synth_ai.environments.environment.tools import EnvToolCall
20
+
21
+ from examples.swe.task_app.morph_backend import MorphSandboxBackend
22
+ from .shared import summarise_history
23
+ from .tools import TOOLS_SCHEMA
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
def _environment_type_from_config(config: dict[str, Any]) -> str:
    """Pick the mini-swe environment class name for this run.

    Precedence: ``config["environment_class"]``, then the
    ``SWE_MINI_ENVIRONMENT_CLASS`` environment variable, then ``"morph"`` when
    ``MORPH_API_KEY`` is set, otherwise ``"local"``.  A selection that is
    blank once surrounding whitespace is stripped resolves to ``"local"``.
    """
    configured = (config or {}).get("environment_class")
    if configured:
        chosen = configured
    else:
        fallback = "morph" if os.getenv("MORPH_API_KEY") else "local"
        chosen = os.getenv("SWE_MINI_ENVIRONMENT_CLASS", fallback)
    stripped = str(chosen).strip()
    return stripped if stripped else "local"
34
+
35
+
36
def _environment_kwargs_from_config(config: dict[str, Any]) -> dict[str, Any]:
    """Return the keyword arguments used to construct the environment.

    The value of ``config["environment_kwargs"]`` wins when it is non-empty;
    otherwise the ``SWE_MINI_ENVIRONMENT_KWARGS`` environment variable is
    parsed as JSON.  Malformed JSON or a non-mapping value is logged and
    replaced by an empty dict.

    Returns:
        A fresh dict.  The caller mutates the result (``setdefault``, key
        assignment), so a shallow copy is returned to keep those mutations
        from leaking back into the caller-supplied config.
    """
    kwargs = dict(config or {}).get("environment_kwargs") or {}
    if not kwargs and (raw := os.getenv("SWE_MINI_ENVIRONMENT_KWARGS")):
        try:
            kwargs = json.loads(raw)
        except Exception:  # pragma: no cover - environment var malformed
            logger.warning("Failed to parse SWE_MINI_ENVIRONMENT_KWARGS; ignoring")
            kwargs = {}
    if not isinstance(kwargs, dict):
        logger.warning("environment_kwargs must be a mapping, got %r", type(kwargs))
        return {}
    # Copy so the nested dict inside ``config`` is never aliased.
    return dict(kwargs)
48
+
49
+
50
def _default_submit_command() -> str:
    """Return the shell command used for submission when none is supplied.

    The command echoes a completion marker, stages every change and prints
    the staged diff.
    """
    command = "echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && git add -A && git diff --cached"
    return command
52
+
53
+
54
@dataclass
class MiniSweEnvironmentState:
    """Serializable environment state used for snapshots."""

    # Task payload; the only field without a default.
    task: dict[str, Any]
    # Per-instance list of dict entries; starts empty.
    history: list[dict[str, Any]] = field(default_factory=list)
    # Step counter; starts at zero.
    step_idx: int = field(default=0)
    # Submission flag; starts False.
    submitted: bool = field(default=False)
    # Submission outcome; None until a value is assigned.
    submission_success: bool | None = field(default=None)
63
+
64
+
65
+ class MiniSweEnvironmentWrapper:
66
+ """Wrapper around mini-swe-agent environments exposing Synth task-app semantics."""
67
+
68
+ name = "swe-mini"
69
+
70
def __init__(
    self,
    *,
    task: dict[str, Any],
    env_config: dict[str, Any] | None = None,
    submit_command: str | None = None,
) -> None:
    """Create the underlying mini-swe environment for one task instance.

    Args:
        task: Task payload; ``instance_id`` and ``metadata`` are read from it.
        env_config: Optional settings; ``environment_class``,
            ``environment_kwargs``, ``timeout`` and ``cwd`` are consulted.
        submit_command: Shell command used for submission; falls back to
            ``_default_submit_command()`` when empty.

    For the ``"local"`` environment type the workspace is prepared on this
    machine before the environment is built; every other type gets a remote
    workspace path and is bootstrapped after the environment exists.
    """
    self.task = dict(task)
    self.env_config = dict(env_config or {})
    self.submit_command = submit_command or _default_submit_command()
    self.environment_type = _environment_type_from_config(self.env_config)
    kwargs = _environment_kwargs_from_config(self.env_config)

    self.instance_id = str(
        self.task.get("instance_id") or f"swe-mini-{uuid.uuid4().hex[:8]}"
    )
    self.metadata = dict(self.task.get("metadata") or {})
    self.repo_url = self._resolve_repo_url(self.metadata)
    self.base_commit = (
        self.metadata.get("base_commit")
        or self.metadata.get("environment_setup_commit")
        or None
    )
    self._local_workspace_dir: Path | None = None
    self._remote_workspace: str | None = None
    self._cleanup_workspace = False
    self._using_morph_backend = False

    if self.environment_type == "local":
        workspace = self._prepare_local_workspace(kwargs)
        kwargs.setdefault("cwd", str(workspace))
        kwargs.setdefault("timeout", int(self.env_config.get("timeout", 60)))
        # Merge custom env vars with defaults expected by mini-swe
        merged_env = dict(kwargs.get("env") or {})
        merged_env.setdefault("PAGER", "cat")
        merged_env.setdefault("MANPAGER", "cat")
        merged_env.setdefault("LESS", "-R")
        merged_env.setdefault("PIP_PROGRESS_BAR", "off")
        merged_env.setdefault("TQDM_DISABLE", "1")
        merged_env.setdefault("GIT_TERMINAL_PROMPT", "0")
        kwargs["env"] = merged_env
        self._local_workspace_dir = workspace
        # Only a workspace cloned for this instance may be deleted on
        # teardown.  Without a repo URL the workspace is the caller's own
        # directory (configured cwd or the process cwd) and must survive.
        self._cleanup_workspace = bool(self.repo_url)
    else:
        remote_cwd = kwargs.get("cwd")
        if not remote_cwd:
            base_remote = os.getenv("SWE_MINI_REMOTE_WORKSPACE_BASE", "/workspace")
            remote_cwd = f"{base_remote.rstrip('/')}/{self.instance_id}"
            kwargs["cwd"] = remote_cwd
        self._remote_workspace = kwargs["cwd"]
        timeout = self.env_config.get("timeout")
        if timeout and "timeout" not in kwargs:
            kwargs["timeout"] = int(timeout)
        if (
            self.environment_type in {"docker", "bubblewrap"}
            and self.repo_url
            and "image" not in kwargs
        ):
            image = self.metadata.get("image_name") or os.getenv("SWE_MINI_DOCKER_IMAGE")
            if image:
                kwargs["image"] = image
        if self.environment_type in {"docker", "bubblewrap", "morph"}:
            remote_env = dict(kwargs.get("env") or {})
            remote_env.setdefault("GIT_TERMINAL_PROMPT", "0")
            kwargs["env"] = remote_env

    logger.info(
        "Initialising mini-swe environment: type=%s kwargs=%s",
        self.environment_type,
        kwargs,
    )
    if self.environment_type == "morph":
        # Translate the generic kwargs into the names the morph backend takes.
        morph_kwargs = dict(kwargs)
        image_value = morph_kwargs.pop("image", None)
        if image_value and "image_id" not in morph_kwargs:
            morph_kwargs["image_id"] = image_value
        timeout_value = morph_kwargs.pop("timeout", None)
        if timeout_value is not None and "startup_timeout" not in morph_kwargs:
            try:
                morph_kwargs["startup_timeout"] = int(timeout_value)
            except Exception:
                logger.warning("Invalid timeout value for morph backend: %r", timeout_value)
        metadata_override = morph_kwargs.pop("metadata", {}) or {}
        metadata_payload = {
            "app": "swe-mini",
            "instance_id": self.instance_id,
        }
        # Caller-supplied metadata is stringified and may override the defaults.
        metadata_payload.update({str(k): str(v) for k, v in dict(metadata_override).items()})
        morph_kwargs["metadata"] = metadata_payload
        self.env = MorphSandboxBackend(**morph_kwargs)
        self._using_morph_backend = True
    else:
        self.env = get_environment(
            {
                "environment_class": self.environment_type,
                **kwargs,
            },
            default_type="local",
        )

    if self.environment_type != "local":
        self._bootstrap_remote_workspace()

    self.state = MiniSweEnvironmentState(task=self.task)
    self.last_result: dict[str, Any] | None = None
    self.last_submission: dict[str, Any] | None = None
172
+
173
async def initialize(self) -> dict[str, Any]:
    """Log the task start and return the response for step 0.

    The observation is built with no previous command result.
    """
    instance = self.task.get("instance_id")
    logger.info("Mini-swe task initialised: instance=%s", instance)
    first_observation = self._build_observation(None)
    return self._build_response(observation=first_observation, step_idx=0)
180
+
181
async def terminate(self) -> dict[str, Any]:
    """Build the final response, then clean up workspaces.

    The response is assembled from the last command result and the current
    step index before cleanup runs.
    """
    instance = self.task.get("instance_id")
    logger.info(
        "Terminating mini-swe environment instance=%s submitted=%s",
        instance,
        self.state.submitted,
    )
    final_observation = self._build_observation(self.last_result)
    final_response = self._build_response(
        observation=final_observation,
        step_idx=self.state.step_idx,
    )
    self._cleanup_workspaces()
    return final_response
194
+
195
def _cleanup_workspaces(self) -> None:
    """Best-effort teardown of the local workspace, remote workspace and backend.

    Every step swallows exceptions so a failure in one does not stop the
    others.  Remote removal is skipped when
    ``SWE_MINI_CLEANUP_REMOTE_WORKSPACE`` is ``0``, ``false`` or ``False``.
    """
    local_dir = self._local_workspace_dir
    if self._cleanup_workspace and local_dir:
        with contextlib.suppress(Exception):
            shutil.rmtree(local_dir)
        self._local_workspace_dir = None
        self._cleanup_workspace = False

    remote_dir = self._remote_workspace
    if remote_dir:
        opt_out = {"0", "false", "False"}
        if os.getenv("SWE_MINI_CLEANUP_REMOTE_WORKSPACE", "1") not in opt_out:
            removal = f"rm -rf {shlex.quote(remote_dir)}"
            with contextlib.suppress(Exception):
                self.env.execute(removal)
            self._remote_workspace = None

    if not self._using_morph_backend:
        return
    if hasattr(self.env, "close"):
        with contextlib.suppress(Exception):
            self.env.close()
211
+
212
def _resolve_repo_url(self, metadata: dict[str, Any]) -> str | None:
    """Return a clone URL derived from task metadata, or ``None``.

    The keys ``repo_url``, ``repo`` and ``repository`` are tried in that
    order; the first non-blank value wins.

    * ``http(s)://`` URLs are kept, with ``.git`` appended when missing.
    * Other explicit remotes (anything containing ``://``, or scp-style
      ``user@host:path``) are returned unchanged rather than being treated
      as a GitHub shorthand.
    * Anything else is treated as an ``owner/name`` GitHub shorthand.

    Trailing slashes are dropped first so ``https://host/org/repo/`` does
    not become ``https://host/org/repo/.git``.
    """
    for key in ("repo_url", "repo", "repository"):
        value = metadata.get(key)
        if not value:
            continue
        repo = str(value).strip().rstrip("/")
        if not repo:
            continue
        if repo.startswith(("http://", "https://")):
            return repo if repo.endswith(".git") else f"{repo}.git"
        # scp-style remotes look like ``user@host:path``; the user part never
        # contains a slash, which separates them from shorthands like
        # ``org/repo``.
        user, at, rest = repo.partition("@")
        if "://" in repo or (at and "/" not in user and ":" in rest):
            return repo
        return f"https://github.com/{repo.removesuffix('.git')}.git"
    return None
233
+
234
def _prepare_local_workspace(self, kwargs: dict[str, Any]) -> Path:
    """Create the on-disk workspace for a local run and return its path.

    Without a repository URL, an existing directory is reused: ``kwargs["cwd"]``,
    then ``env_config["cwd"]``, then the process working directory.  With a
    URL, the repository is cloned afresh under
    ``SWE_MINI_LOCAL_WORKSPACE_ROOT`` (default
    ``~/.cache/synth-ai/swe-mini/workspaces``) in a directory named after
    the instance id, checked out at ``base_commit`` when one is set, then
    hard-reset and cleaned.
    """
    if self.repo_url:
        configured_root = os.getenv("SWE_MINI_LOCAL_WORKSPACE_ROOT")
        if configured_root:
            root = Path(configured_root)
        else:
            root = Path(Path.home() / ".cache" / "synth-ai" / "swe-mini" / "workspaces")
        workspace = root / self.instance_id
        # Start from a clean slate: drop any leftover directory for this instance.
        if workspace.exists():
            shutil.rmtree(workspace, ignore_errors=True)
        workspace.parent.mkdir(parents=True, exist_ok=True)

        clone_args = [
            "git",
            "clone",
            "--filter=blob:none",
            "--no-tags",
            self.repo_url,
            str(workspace),
        ]
        self._run_local_cmd(clone_args, description="clone repository")

        if self.base_commit:
            checkout_args = ["git", "-C", str(workspace), "checkout", self.base_commit]
            self._run_local_cmd(checkout_args, description="checkout base commit")

        reset_args = ["git", "-C", str(workspace), "reset", "--hard"]
        self._run_local_cmd(reset_args, description="reset working tree")

        clean_args = ["git", "-C", str(workspace), "clean", "-ffd"]
        self._run_local_cmd(clean_args, description="clean working tree")

        logger.info(
            "Prepared local workspace for %s at %s (repo=%s, commit=%s)",
            self.instance_id,
            workspace,
            self.repo_url,
            self.base_commit,
        )
        return workspace

    # No repository to clone: fall back to a directory the caller already has.
    chosen_dir = kwargs.get("cwd") or self.env_config.get("cwd") or os.getcwd()
    fallback = Path(chosen_dir)
    fallback.mkdir(parents=True, exist_ok=True)
    logger.warning(
        "No repo URL provided for swe-mini instance %s; using cwd=%s",
        self.instance_id,
        fallback,
    )
    return fallback
286
+
287
def _bootstrap_remote_workspace(self) -> None:
    """Clone the task repository into the remote workspace directory.

    Does nothing (beyond logging a warning) when either the repository URL or
    the remote workspace path is unset.  Otherwise the parent directory is
    created, any previous workspace is removed, the repository is cloned,
    ``base_commit`` is checked out when set, and the tree is hard-reset and
    cleaned.  Every path and ref is shell-quoted before being interpolated.

    Raises:
        RuntimeError: Propagated from ``_execute_bootstrap_command`` when a
            remote command reports a non-zero return code.
    """
    if not (self.repo_url and self._remote_workspace):
        logger.warning(
            "Skipping remote workspace bootstrap for instance %s (repo=%s workspace=%s)",
            self.instance_id,
            self.repo_url,
            self._remote_workspace,
        )
        return

    workspace = self._remote_workspace.rstrip("/")
    parent_dir = os.path.dirname(workspace) or "/"
    quoted_workspace = shlex.quote(workspace)
    run = self._execute_bootstrap_command

    run(f"mkdir -p {shlex.quote(parent_dir)}")
    run(f"rm -rf {quoted_workspace}")
    run(
        f"git clone --filter=blob:none --no-tags {shlex.quote(self.repo_url)} {quoted_workspace}",
        timeout=900,
        description="clone repository",
    )
    if self.base_commit:
        run(
            f"cd {quoted_workspace} && git checkout {shlex.quote(self.base_commit)}",
            timeout=300,
            description="checkout commit",
        )
    run(f"cd {quoted_workspace} && git reset --hard", description="reset working tree")
    run(f"cd {quoted_workspace} && git clean -ffd", description="clean working tree")

    logger.info(
        "Prepared remote workspace for %s at %s (repo=%s, commit=%s)",
        self.instance_id,
        workspace,
        self.repo_url,
        self.base_commit,
    )
325
+
326
def _run_local_cmd(
    self, args: list[str], *, cwd: Path | None = None, description: str | None = None
) -> None:
    """Run a local command and fail loudly on a non-zero exit status.

    Args:
        args: Argument vector handed to ``subprocess.run`` (no shell).
        cwd: Optional working directory for the child process.
        description: Short phrase used in the error message; defaults to
            ``"command"``.

    Raises:
        RuntimeError: When the process exits non-zero; the message carries
            the command line and the captured stdout/stderr.
    """
    rendered = " ".join(args)
    logger.debug(
        "Preparing workspace %s: running local command %s",
        self.instance_id,
        rendered,
    )
    workdir = str(cwd) if cwd else None
    completed = subprocess.run(args, cwd=workdir, text=True, capture_output=True)
    if completed.returncode == 0:
        return

    label = description or "command"
    raise RuntimeError(
        f"Failed to {label} (cmd={rendered}): {completed.stdout or ''}{completed.stderr or ''}"
    )
345
+
346
def _execute_bootstrap_command(
    self, command: str, *, timeout: int | None = None, description: str | None = None
) -> None:
    """Run a bootstrap command through ``self.env`` and check its result.

    Args:
        command: Shell command string passed to ``self.env.execute``.
        timeout: Optional timeout forwarded to ``self.env.execute``.
        description: Phrase used in the error message; the command itself is
            used when omitted.

    Raises:
        RuntimeError: When the result mapping carries a truthy
            ``returncode``.
    """
    logger.debug(
        "Preparing workspace %s: running remote command %s",
        self.instance_id,
        command,
    )
    outcome = self.env.execute(command, timeout=timeout)
    if not outcome.get("returncode"):
        # A missing, None or zero return code counts as success.
        return

    label = description or command
    raise RuntimeError(
        f"Failed to {label}: rc={outcome.get('returncode')} output={outcome.get('output')}"
    )
360
+
361
def _normalize_tool_call(self, tool_call: EnvToolCall | dict[str, Any]) -> EnvToolCall:
    """Coerce a raw tool-call mapping into an ``EnvToolCall``.

    ``EnvToolCall`` instances are returned untouched.  For mappings the tool
    name is read from ``tool`` or ``tool_name`` and the arguments from
    ``args`` or ``arguments``.  String arguments are decoded as JSON.

    Arguments that cannot be turned into a dict (undecodable JSON, or JSON
    that decodes to a scalar or a plain list) are replaced by an empty dict
    instead of raising ``TypeError``/``ValueError`` from ``dict(...)``.

    Raises:
        ValueError: When no tool name is present.
    """
    if isinstance(tool_call, EnvToolCall):
        return tool_call

    tool = tool_call.get("tool") or tool_call.get("tool_name")
    if not tool:
        raise ValueError(f"Tool call missing tool name: {tool_call}")

    args = tool_call.get("args") or tool_call.get("arguments") or {}
    if isinstance(args, str):
        try:
            args = json.loads(args)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError subclass.
            args = {}

    try:
        normalized = dict(args)
    except (TypeError, ValueError):
        # e.g. the JSON payload was a number, string or list of non-pairs.
        logger.warning("Ignoring non-mapping arguments for tool '%s': %r", tool, args)
        normalized = {}
    return EnvToolCall(tool=str(tool), args=normalized)
374
+
375
async def step(self, tool_calls: list[EnvToolCall] | list[dict[str, Any]]) -> dict[str, Any]:
    """Execute run_command or submit_patch tool calls.

    Each call is normalised and dispatched in order.  The response is built
    from the last call's result; ``done`` mirrors ``state.submitted`` and the
    reward is ``1.0`` only for a successful submission.

    Raises:
        ValueError: When ``tool_calls`` is empty or names an unsupported tool.
    """
    if not tool_calls:
        raise ValueError("MiniSweEnvironmentWrapper.step requires at least one tool call")

    responses: list[dict[str, Any]] = []
    for raw_call in tool_calls:
        call = self._normalize_tool_call(raw_call)
        tool = call.tool
        if tool == "run_command":
            handler = self._run_command
        elif tool == "submit_patch":
            handler = self._submit
        else:
            raise ValueError(f"Unsupported tool '{tool}' for swe-mini environment")
        responses.append(handler(call))

    last_result = None
    if responses:
        last_result = responses[-1]
    self.last_result = last_result
    observation = self._build_observation(last_result)

    done = bool(self.state.submitted)
    # Reward is only granted once a submission has happened and succeeded.
    reward = 1.0 if done and self.state.submission_success else 0.0
    return self._build_response(
        observation=observation,
        step_idx=self.state.step_idx,
        done=done,
        reward=reward,
        info={"responses": responses},
    )
405
+
406
def _run_command(self, call: EnvToolCall) -> dict[str, Any]:
    """Execute a ``run_command`` tool call and record it in the history.

    Args:
        call: Tool call whose ``args`` carry ``command`` and an optional
            ``timeout`` (converted with ``int``).

    Returns:
        The history record: command, return code, captured output, elapsed
        seconds and the wall-clock start timestamp.

    Raises:
        ValueError: When ``command`` is missing or blank.
    """
    command = str(call.args.get("command") or "").strip()
    if not command:
        raise ValueError("run_command requires a non-empty 'command' argument")
    timeout = call.args.get("timeout")
    timeout = int(timeout) if timeout is not None else None

    # The wall clock supplies the timestamp only; elapsed time comes from a
    # monotonic counter so clock adjustments cannot skew or negate it.
    started_at = time.time()
    tick = time.perf_counter()
    result = self.env.execute(command, timeout=timeout)
    duration = time.perf_counter() - tick

    record = {
        "command": command,
        "returncode": result.get("returncode"),
        "stdout": result.get("output") or "",
        "duration": duration,
        "timestamp": started_at,
    }
    self.state.history.append(record)
    self.state.step_idx += 1
    logger.info(
        "Executed command step=%s rc=%s",
        self.state.step_idx,
        record["returncode"],
    )
    return record
432
+
433
def _submit(self, call: EnvToolCall) -> dict[str, Any]:
    """Run the submission command once and evaluate the resulting diff.

    A repeated call after a submission is a no-op that reports the stored
    outcome.  Otherwise the command from ``call.args["command"]`` (or
    ``self.submit_command``) is executed, appended to the history, and its
    output is turned into a diff that is passed to ``_evaluate_submission``
    when the command returned ``0``.

    Returns:
        The history record extended with ``submitted``,
        ``submission_success``, ``diff`` and ``evaluation``.
    """
    if self.state.submitted:
        logger.info("Submit called again; ignoring additional submission.")
        return {
            "submitted": True,
            "command": None,
            "returncode": 0,
            "stdout": "",
            "submission_success": self.state.submission_success,
            "evaluation": self.last_submission,
        }

    command = str(call.args.get("command") or self.submit_command)
    outcome = self.env.execute(command)
    record = {
        "command": command,
        "returncode": outcome.get("returncode"),
        "stdout": outcome.get("output") or "",
        "duration": 0.0,
        "timestamp": time.time(),
    }
    self.state.history.append(record)
    self.state.step_idx += 1
    diff = self._extract_submission_diff(record["stdout"])

    evaluation: dict[str, Any] | None
    if record["returncode"] != 0 or diff is None:
        submission_success = False
        evaluation = {
            "completed": False,
            "resolved": False,
            "error": "submit command failed or diff unavailable",
            "returncode": record["returncode"],
        }
    else:
        evaluation = self._evaluate_submission(diff)
        submission_success = bool(evaluation.get("resolved")) if evaluation else False

    self.state.submitted = True
    self.state.submission_success = submission_success
    self.last_submission = evaluation

    logger.info(
        "Submission command executed rc=%s resolved=%s",
        record["returncode"],
        submission_success,
    )

    response = dict(record)
    response["submitted"] = True
    response["submission_success"] = submission_success
    response["diff"] = diff
    response["evaluation"] = evaluation
    return response
487
+
488
+ def _extract_submission_diff(self, stdout: str) -> str | None:
489
+ if stdout is None:
490
+ return None
491
+ lines = stdout.splitlines()
492
+ if not lines:
493
+ return ""
494
+ first = lines[0].strip()
495
+ sentinel = "COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT"
496
+ if first.startswith(sentinel):
497
+ lines = lines[1:]
498
+ diff = "\n".join(lines).strip("\n")
499
+ return diff
500
+
501
def _evaluate_submission(self, diff: str) -> dict[str, Any]:
    """Score a submitted diff with the configured SWE-bench backend.

    The SWE-bench instance is rebuilt from ``task["metadata"]["raw_instance"]``
    and validated, a prediction record is assembled, the backend chosen by
    ``_resolve_evaluation_backend`` is run, and its payload is merged with
    whatever ``_collect_evaluation_artifacts`` finds on disk.

    Changes relative to the previous behaviour:

    * ``eval_rm_image`` / ``eval_force_rebuild`` given explicitly in the
      metadata (including ``False``) now take precedence over the
      ``SWE_MINI_EVAL_*`` environment variables; before, a falsy metadata
      value fell through to the environment.
    * A missing ``instance_id`` is reported with the other missing fields
      instead of failing later during artifact collection.

    Args:
        diff: Patch text to evaluate.

    Returns:
        A mapping with at least ``completed`` and ``resolved``; ``error`` is
        set when the evaluation could not be carried out.
    """
    metadata = dict(self.task.get("metadata") or {})
    instance = dict(metadata.get("raw_instance") or {})
    instance_id = instance.setdefault("instance_id", self.task.get("instance_id"))

    required_fields = ["instance_id", "repo", "base_commit", "test_patch", "version"]
    missing = [field for field in required_fields if not instance.get(field)]
    if missing:
        msg = (
            "Cannot run SWE-bench evaluation; task metadata missing required fields "
            f"{missing}. Ensure the dataset preserves full SWE-bench records."
        )
        logger.error(msg)
        return {"completed": False, "resolved": False, "error": msg}

    try:
        from swebench.harness.constants import (
            KEY_INSTANCE_ID,
            KEY_MODEL,
            KEY_PREDICTION,
        )
    except Exception as exc:  # pragma: no cover - dependency missing
        msg = (
            "SWE-bench harness is required for official scoring. "
            "Install swebench with evaluation extras."
        )
        logger.exception("Failed to import swebench harness constants: %s", exc)
        return {"completed": False, "resolved": False, "error": f"{msg} ({exc})"}

    def _flag(meta_key: str, env_key: str) -> bool:
        # Metadata wins whenever it is actually set, even if it is False.
        explicit = metadata.get(meta_key)
        if explicit is None or explicit == "":
            explicit = os.getenv(env_key, "false")
        return self._to_bool(explicit)

    backend = self._resolve_evaluation_backend(metadata)

    image_name = str(metadata.get("image_name") or "")
    namespace = metadata.get("namespace") or self._namespace_from_image(image_name) or "swebench"
    instance_image_tag = (
        metadata.get("instance_image_tag") or self._image_tag_from_name(image_name) or "latest"
    )
    env_image_tag = metadata.get("env_image_tag") or "latest"

    model_name = (
        metadata.get("submission_model_name") or metadata.get("model_name") or "synth-ai-agent"
    )
    run_id = f"swe_mini_eval_{uuid.uuid4().hex[:12]}"
    eval_timeout = self._resolve_eval_timeout(metadata)
    rm_image = _flag("eval_rm_image", "SWE_MINI_EVAL_RM_IMAGE")
    force_rebuild = _flag("eval_force_rebuild", "SWE_MINI_EVAL_FORCE_REBUILD")

    prediction = {
        KEY_INSTANCE_ID: instance_id,
        KEY_MODEL: model_name,
        KEY_PREDICTION: diff or "",
    }

    # Ensure log root exists so downstream collection succeeds.
    with contextlib.suppress(Exception):
        from swebench.harness.constants import RUN_EVALUATION_LOG_DIR

        Path(RUN_EVALUATION_LOG_DIR).mkdir(parents=True, exist_ok=True)

    if backend == "modal_harness":
        evaluation_payload = self._run_modal_harness(
            instance=instance,
            prediction=prediction,
            run_id=run_id,
            eval_timeout=eval_timeout,
            model_name=model_name,
        )
    elif backend == "swe_rex":
        evaluation_payload = self._run_swe_rex(
            instance=instance,
            prediction=prediction,
            run_id=run_id,
            eval_timeout=eval_timeout,
            namespace=namespace,
            instance_image_tag=instance_image_tag,
            env_image_tag=env_image_tag,
            model_name=model_name,
        )
    else:
        evaluation_payload = self._run_local_harness(
            instance=instance,
            prediction=prediction,
            run_id=run_id,
            eval_timeout=eval_timeout,
            namespace=namespace,
            instance_image_tag=instance_image_tag,
            env_image_tag=env_image_tag,
            rm_image=rm_image,
            force_rebuild=force_rebuild,
            model_name=model_name,
        )

    evaluation_payload = dict(evaluation_payload or {})
    evaluation_payload.setdefault("backend", backend)
    evaluation_payload.setdefault("run_id", run_id)
    evaluation_payload.setdefault("model_name", model_name)
    evaluation_payload.setdefault("instance_id", instance_id)

    artifacts = self._collect_evaluation_artifacts(
        run_id=run_id,
        model_name=model_name,
        instance_id=instance_id,
    )
    # Merge artifact data without clobbering explicit error/resolution flags:
    # backend values override artifact values, but a positive flag found in
    # the on-disk report always wins.
    merged = {**artifacts, **evaluation_payload}
    for flag in ("completed", "resolved"):
        if artifacts.get(flag):
            merged[flag] = True
        else:
            merged.setdefault(flag, False)
    for key in ("log_dir", "report_path", "test_output_path"):
        merged.setdefault(key, artifacts.get(key))
    if artifacts.get("report") and not merged.get("report"):
        merged["report"] = artifacts["report"]
    if artifacts.get("error") and not merged.get("error"):
        merged["error"] = artifacts["error"]
    return merged
617
+
618
+ def _resolve_evaluation_backend(self, metadata: dict[str, Any]) -> str:
619
+ raw = (
620
+ metadata.get("evaluation_backend")
621
+ or self.env_config.get("evaluation_backend")
622
+ or os.getenv("SWE_MINI_EVALUATION_BACKEND")
623
+ or "local"
624
+ )
625
+ backend = str(raw).strip().lower()
626
+ mapping = {
627
+ "": "local",
628
+ "local": "local",
629
+ "docker": "local",
630
+ "modal": "modal_harness",
631
+ "modal_harness": "modal_harness",
632
+ "modal-harness": "modal_harness",
633
+ "modal-harnesses": "modal_harness",
634
+ "swe_rex": "swe_rex",
635
+ "swe-rex": "swe_rex",
636
+ "swerex": "swe_rex",
637
+ }
638
+ return mapping.get(backend, "local")
639
+
640
+ def _resolve_eval_timeout(self, metadata: dict[str, Any]) -> int:
641
+ raw = (
642
+ metadata.get("evaluation_timeout")
643
+ or self.env_config.get("evaluation_timeout")
644
+ or os.getenv("SWE_MINI_EVALUATION_TIMEOUT")
645
+ or 3600
646
+ )
647
+ try:
648
+ value = int(raw)
649
+ except (TypeError, ValueError):
650
+ return 3600
651
+ return max(1, value)
652
+
653
def _run_local_harness(
    self,
    *,
    instance: dict[str, Any],
    prediction: dict[str, Any],
    run_id: str,
    eval_timeout: int,
    namespace: str,
    instance_image_tag: str,
    env_image_tag: str,
    rm_image: bool,
    force_rebuild: bool,
    model_name: str,
) -> dict[str, Any]:
    """Run the SWE-bench harness locally through the Docker SDK.

    Import problems, test-spec construction failures and errors raised while
    running the instance are logged and reported as a failure payload rather
    than raised.  The Docker client is always closed.

    Returns:
        A mapping with ``completed``, ``resolved`` and ``backend`` (always
        ``"local"``), plus ``error`` on failure.
    """

    def failure(reason: str) -> dict[str, Any]:
        # Uniform shape for every failure path of this backend.
        return {"completed": False, "resolved": False, "error": reason, "backend": "local"}

    try:
        from swebench.harness.run_evaluation import run_instance
        from swebench.harness.test_spec.test_spec import make_test_spec
    except Exception as exc:  # pragma: no cover - dependency missing
        msg = (
            "SWE-bench harness is required for official scoring. "
            "Install swebench with evaluation extras."
        )
        logger.exception("Failed to import swebench harness: %s", exc)
        return failure(f"{msg} ({exc})")

    try:
        import docker
    except Exception as exc:  # pragma: no cover - dependency missing
        msg = "Docker SDK for Python is required to run local SWE-bench evaluation."
        logger.exception("Failed to import docker SDK: %s", exc)
        return failure(f"{msg} ({exc})")

    instance_id = str(instance["instance_id"])
    try:
        test_spec = make_test_spec(
            instance,
            namespace=namespace,
            instance_image_tag=instance_image_tag,
            env_image_tag=env_image_tag,
        )
    except Exception as exc:
        logger.exception("Failed to build SWE-bench test spec for %s: %s", instance_id, exc)
        return failure(f"Failed to build test spec: {exc}")

    client = None
    result: dict[str, Any] = {}
    try:
        client = docker.from_env()
        result = run_instance(
            test_spec,
            prediction,
            rm_image,
            force_rebuild,
            client,
            run_id,
            int(eval_timeout),
            rewrite_reports=False,
        )
    except Exception as exc:
        logger.exception("Error while running SWE-bench evaluation for %s: %s", instance_id, exc)
        return failure(f"Evaluation failed: {exc}")
    finally:
        if client is not None:
            # Closing is best effort; a failure here must not mask the result.
            with contextlib.suppress(Exception):
                client.close()

    return {
        "completed": bool(result.get("completed")),
        "resolved": bool(result.get("resolved")),
        "backend": "local",
    }
725
+
726
def _run_modal_harness(
    self,
    *,
    instance: dict[str, Any],
    prediction: dict[str, Any],
    run_id: str,
    eval_timeout: int,
    model_name: str,
) -> dict[str, Any]:
    """Run the evaluation through the SWE-bench Modal harness.

    Import and execution failures are logged and returned as a failure
    payload.  On success only the backend name is returned; per the inline
    note, results are expected to be picked up from the evaluation log
    directory by the caller's artifact collection.
    """
    backend = "modal_harness"
    try:
        from swebench.harness.modal_eval import run_instances_modal
    except Exception as exc:  # pragma: no cover - dependency missing
        msg = (
            "SWE-bench modal extras are required for the modal_harness backend. "
            "Install swebench[modal] inside the Modal deployment."
        )
        logger.exception("Failed to import swebench modal harness: %s", exc)
        return {"completed": False, "resolved": False, "error": f"{msg} ({exc})", "backend": backend}

    instance_id = str(instance["instance_id"])
    predictions = {instance_id: dict(prediction)}
    # The single instance serves as both the instance list and the dataset.
    dataset = [instance]
    try:
        run_instances_modal(predictions, dataset, dataset, run_id, int(eval_timeout))
    except Exception as exc:
        logger.exception("Modal SWE-bench evaluation failed for %s: %s", instance_id, exc)
        return {
            "completed": False,
            "resolved": False,
            "error": f"Modal evaluation failed: {exc}",
            "backend": backend,
        }

    # run_instances_modal writes reports to RUN_EVALUATION_LOG_DIR; we rely on artifact collection.
    return {"backend": backend}
762
+
763
def _run_swe_rex(
    self,
    *,
    instance: dict[str, Any],
    prediction: dict[str, Any],
    run_id: str,
    eval_timeout: int,
    namespace: str,
    instance_image_tag: str,
    env_image_tag: str,
    model_name: str,
) -> dict[str, Any]:
    """Run the evaluation inside a SWE-ReX Modal deployment.

    Settings are resolved, in order, from the instance record, then
    ``env_config``, then ``SWE_REX_*`` environment variables.  The
    asynchronous flow in ``_run_swe_rex_async`` is then driven to completion
    synchronously.

    Changes relative to the previous behaviour:

    * ``swe_rex_install_pipx`` set explicitly (including ``False``) in the
      instance or ``env_config`` is honoured; before, a falsy value fell
      through the ``or`` chain to the ``"true"`` default.
    * A dict/list ``swe_rex_modal_kwargs`` that cannot be converted to a
      dict is logged and ignored instead of raising.

    Returns:
        The payload produced by the async flow, or a failure payload with
        ``error`` set when imports or execution fail.
    """
    try:
        from swerex.deployment.config import ModalDeploymentConfig
        from swerex.runtime.abstract import Command, ReadFileRequest, WriteFileRequest
    except ModuleNotFoundError as exc:  # pragma: no cover - optional dependency
        msg = (
            "SWE-ReX backend requires the swe-rex package. "
            "Install swe-rex (pip install swe-rex[modal]) to enable this backend."
        )
        logger.exception("Failed to import swe-rex: %s", exc)
        return {"completed": False, "resolved": False, "error": f"{msg} ({exc})", "backend": "swe_rex"}
    except Exception as exc:  # pragma: no cover - defensive
        logger.exception("Unexpected swe-rex import failure: %s", exc)
        return {"completed": False, "resolved": False, "error": f"swe-rex import failed: {exc}", "backend": "swe_rex"}

    image_spec = (
        instance.get("swe_rex_image")
        or self.env_config.get("swe_rex_image")
        or os.getenv("SWE_REX_MODAL_IMAGE")
        or "ghcr.io/swe-agent/swe-rex-modal:latest"
    )

    # Use "is None" rather than truthiness so an explicit False is respected.
    install_pipx_raw = instance.get("swe_rex_install_pipx")
    if install_pipx_raw is None:
        install_pipx_raw = self.env_config.get("swe_rex_install_pipx")
    if install_pipx_raw is None:
        install_pipx_raw = os.getenv("SWE_REX_INSTALL_PIPX", "true")
    install_pipx = self._to_bool(install_pipx_raw)

    modal_kwargs_raw = (
        instance.get("swe_rex_modal_kwargs")
        or self.env_config.get("swe_rex_modal_kwargs")
        or os.getenv("SWE_REX_MODAL_SANDBOX_KWARGS")
    )
    modal_kwargs: dict[str, Any] = {}
    if isinstance(modal_kwargs_raw, dict | list):
        try:
            modal_kwargs = dict(modal_kwargs_raw or {})
        except (TypeError, ValueError) as exc:
            # e.g. a list that is not a sequence of key/value pairs.
            logger.warning("Failed to parse SWE_REX_MODAL_SANDBOX_KWARGS=%s: %s", modal_kwargs_raw, exc)
    elif isinstance(modal_kwargs_raw, str) and modal_kwargs_raw.strip():
        try:
            modal_kwargs = dict(json.loads(modal_kwargs_raw))
        except Exception as exc:  # pragma: no cover - user input parsing
            logger.warning("Failed to parse SWE_REX_MODAL_SANDBOX_KWARGS=%s: %s", modal_kwargs_raw, exc)

    deployment_config = ModalDeploymentConfig(
        image=image_spec,
        runtime_timeout=float(
            instance.get("swe_rex_runtime_timeout")
            or self.env_config.get("swe_rex_runtime_timeout")
            or os.getenv("SWE_REX_RUNTIME_TIMEOUT", 900)
        ),
        deployment_timeout=float(
            instance.get("swe_rex_deployment_timeout")
            or self.env_config.get("swe_rex_deployment_timeout")
            or os.getenv("SWE_REX_DEPLOYMENT_TIMEOUT", 3600)
        ),
        modal_sandbox_kwargs=modal_kwargs,
        install_pipx=bool(install_pipx),
    )

    remote_root = (
        instance.get("swe_rex_workdir")
        or self.env_config.get("swe_rex_workdir")
        or os.getenv("SWE_REX_REMOTE_WORKDIR")
        or "/root/swebench_eval"
    )
    remote_root = str(remote_root).rstrip("/")
    dataset_remote_path = f"{remote_root}/dataset.json"
    predictions_remote_path = f"{remote_root}/predictions.json"

    environment_forward_raw = (
        instance.get("swe_rex_forward_env")
        or self.env_config.get("swe_rex_forward_env")
        or os.getenv("SWE_REX_FORWARD_ENV")
    )
    forward_env: dict[str, str] | None = None
    if isinstance(environment_forward_raw, dict):
        forward_env = {str(k): str(v) for k, v in environment_forward_raw.items()}
    elif isinstance(environment_forward_raw, str) and environment_forward_raw.strip():
        try:
            parsed = json.loads(environment_forward_raw)
            if isinstance(parsed, dict):
                forward_env = {str(k): str(v) for k, v in parsed.items()}
        except Exception as exc:  # pragma: no cover - parsing failure
            logger.warning("Failed to parse SWE_REX_FORWARD_ENV=%s: %s", environment_forward_raw, exc)

    # Build coroutine for the async swe-rex flow.
    coro = self._run_swe_rex_async(
        deployment_config=deployment_config,
        remote_root=remote_root,
        dataset_remote_path=dataset_remote_path,
        predictions_remote_path=predictions_remote_path,
        forward_env=forward_env,
        instance=instance,
        prediction=prediction,
        run_id=run_id,
        eval_timeout=eval_timeout,
        namespace=namespace,
        instance_image_tag=instance_image_tag,
        env_image_tag=env_image_tag,
        model_name=model_name,
        command_cls=Command,
        write_file_request_cls=WriteFileRequest,
        read_file_request_cls=ReadFileRequest,
    )
    try:
        return self._run_coroutine_blocking(coro)
    except Exception as exc:  # pragma: no cover - remote execution failure
        logger.exception("SWE-ReX evaluation failed for %s: %s", instance.get("instance_id"), exc)
        return {"completed": False, "resolved": False, "error": f"SWE-ReX evaluation failed: {exc}", "backend": "swe_rex"}
880
+
881
async def _run_swe_rex_async(
    self,
    *,
    deployment_config,
    remote_root: str,
    dataset_remote_path: str,
    predictions_remote_path: str,
    forward_env: dict[str, str] | None,
    instance: dict[str, Any],
    prediction: dict[str, Any],
    run_id: str,
    eval_timeout: int,
    namespace: str,
    instance_image_tag: str,
    env_image_tag: str,
    model_name: str,
    command_cls,
    write_file_request_cls,
    read_file_request_cls,
) -> dict[str, Any]:
    """Drive one SWE-bench evaluation inside a started deployment.

    Steps: start the deployment, create the remote working directory, upload
    the single-instance dataset and predictions as JSON, invoke
    ``python -m swebench.harness.run_evaluation`` remotely, copy known log
    files back under the local evaluation log directory (best effort), and
    stop the deployment.

    Returns:
        A payload with ``backend``, ``command_exit_code``, the last 4000
        characters of ``command_output``, ``artifacts``, and ``completed``
        when the command exited with 0.
    """
    deployment = deployment_config.get_deployment()
    await deployment.start()
    try:
        runtime = deployment.runtime
        instance_id = str(instance["instance_id"])
        safe_model = prediction["model_name_or_path"].replace("/", "__")

        # Ensure working directory exists.
        mkdir_resp = await runtime.execute(
            command_cls(command=["mkdir", "-p", remote_root], timeout=60, shell=False)
        )
        if mkdir_resp.exit_code not in (0, None):
            logger.warning("Failed to ensure remote directory %s (exit=%s)", remote_root, mkdir_resp.exit_code)

        # Upload dataset & predictions.
        uploads = (
            (dataset_remote_path, json.dumps([instance], ensure_ascii=False)),
            (predictions_remote_path, json.dumps({instance_id: prediction}, ensure_ascii=False)),
        )
        for upload_path, blob in uploads:
            await runtime.write_file(write_file_request_cls(path=upload_path, content=blob))

        eval_cmd = ["python", "-m", "swebench.harness.run_evaluation"]
        for flag, value in (
            ("--dataset_name", dataset_remote_path),
            ("--split", "test"),
            ("--instance_ids", instance_id),
            ("--predictions_path", predictions_remote_path),
            ("-id", run_id),
            ("--modal", "true"),
            ("--timeout", str(eval_timeout)),
            ("--namespace", namespace),
            ("--instance_image_tag", instance_image_tag),
            ("--env_image_tag", env_image_tag),
            ("--max_workers", "1"),
        ):
            eval_cmd.extend((flag, value))

        # Allow the remote command extra headroom on top of the eval timeout.
        command_timeout = max(eval_timeout + 900, 1200)
        response = await runtime.execute(
            command_cls(
                command=eval_cmd,
                timeout=command_timeout,
                cwd=remote_root,
                env=forward_env,
                shell=False,
                merge_output_streams=True,
            )
        )
        command_output = (response.stdout or "") + (response.stderr or "")
        exit_code = -1 if response.exit_code is None else response.exit_code

        # Retrieve artifacts back to local disk.
        artifacts = {}
        try:
            from swebench.harness.constants import RUN_EVALUATION_LOG_DIR

            local_log_dir = Path(RUN_EVALUATION_LOG_DIR) / run_id / safe_model / instance_id
            local_log_dir.mkdir(parents=True, exist_ok=True)

            remote_log_dir = f"{remote_root}/logs/run_evaluation/{run_id}/{safe_model}/{instance_id}"
            for filename in ("report.json", "test_output.txt", "run_instance.log", "patch.diff"):
                try:
                    fetched = await runtime.read_file(
                        read_file_request_cls(path=f"{remote_log_dir}/{filename}")
                    )
                except Exception:
                    # A file that cannot be read is simply skipped.
                    continue
                if getattr(fetched, "content", None):
                    (local_log_dir / filename).write_text(fetched.content)

            artifacts = {"log_dir": str(local_log_dir)}
        except Exception as exc:  # pragma: no cover - best effort artifact copy
            logger.warning("Failed to copy SWE-ReX artifacts locally: %s", exc)

        payload = {
            "backend": "swe_rex",
            "command_exit_code": exit_code,
            "command_output": command_output[-4000:] if command_output else "",
            "artifacts": artifacts,
        }
        if exit_code == 0:
            payload["completed"] = True
        return payload
    finally:
        with contextlib.suppress(Exception):
            await deployment.stop()
999
+
1000
+ def _collect_evaluation_artifacts(
1001
+ self,
1002
+ *,
1003
+ run_id: str,
1004
+ model_name: str,
1005
+ instance_id: str,
1006
+ ) -> dict[str, Any]:
1007
+ try:
1008
+ from swebench.harness.constants import (
1009
+ LOG_REPORT,
1010
+ LOG_TEST_OUTPUT,
1011
+ RUN_EVALUATION_LOG_DIR,
1012
+ )
1013
+ except Exception: # pragma: no cover - dependency missing
1014
+ return {
1015
+ "completed": False,
1016
+ "resolved": False,
1017
+ "log_dir": None,
1018
+ "report_path": None,
1019
+ "test_output_path": None,
1020
+ }
1021
+
1022
+ log_model = model_name.replace("/", "__")
1023
+ log_dir = Path(RUN_EVALUATION_LOG_DIR) / run_id / log_model / instance_id
1024
+ payload: dict[str, Any] = {
1025
+ "log_dir": str(log_dir),
1026
+ "report_path": None,
1027
+ "test_output_path": None,
1028
+ "report": None,
1029
+ "completed": False,
1030
+ "resolved": False,
1031
+ }
1032
+
1033
+ if not log_dir.exists():
1034
+ return payload
1035
+
1036
+ report_path = log_dir / LOG_REPORT
1037
+ if report_path.exists():
1038
+ payload["report_path"] = str(report_path)
1039
+ try:
1040
+ report_blob = json.loads(report_path.read_text())
1041
+ per_instance = report_blob.get(instance_id)
1042
+ if per_instance is not None:
1043
+ payload["report"] = per_instance
1044
+ payload["completed"] = True
1045
+ payload["resolved"] = bool(per_instance.get("resolved"))
1046
+ except Exception as exc: # pragma: no cover - log parsing failure
1047
+ logger.exception("Failed to parse SWE-bench report for %s: %s", instance_id, exc)
1048
+ payload["error"] = f"Failed to parse report.json: {exc}"
1049
+
1050
+ test_output_path = log_dir / LOG_TEST_OUTPUT
1051
+ if test_output_path.exists():
1052
+ payload["test_output_path"] = str(test_output_path)
1053
+
1054
+ return payload
1055
+
1056
+ @staticmethod
1057
+ def _run_coroutine_blocking(coro):
1058
+ try:
1059
+ loop = asyncio.get_running_loop()
1060
+ except RuntimeError:
1061
+ loop = None
1062
+
1063
+ if loop and loop.is_running():
1064
+ result: dict[str, Any] = {}
1065
+ error: dict[str, Exception] = {}
1066
+
1067
+ def runner():
1068
+ try:
1069
+ result["value"] = asyncio.run(coro)
1070
+ except Exception as exc: # pragma: no cover - propagate to caller
1071
+ error["exc"] = exc
1072
+
1073
+ thread = threading.Thread(target=runner, daemon=True)
1074
+ thread.start()
1075
+ thread.join()
1076
+ if error:
1077
+ raise error["exc"]
1078
+ return result.get("value")
1079
+
1080
+ return asyncio.run(coro)
1081
+
1082
+ @staticmethod
1083
+ def _namespace_from_image(image_name: str) -> str | None:
1084
+ if not image_name:
1085
+ return None
1086
+ parts = image_name.split("/")
1087
+ if len(parts) >= 2:
1088
+ return parts[-2] if parts[0].endswith(".io") else parts[0]
1089
+ return None
1090
+
1091
+ @staticmethod
1092
+ def _image_tag_from_name(image_name: str) -> str | None:
1093
+ if not image_name or ":" not in image_name:
1094
+ return None
1095
+ return image_name.rsplit(":", 1)[-1] or None
1096
+
1097
+ @staticmethod
1098
+ def _to_bool(value: Any) -> bool:
1099
+ if isinstance(value, bool):
1100
+ return value
1101
+ if isinstance(value, str):
1102
+ return value.strip().lower() in {"1", "true", "yes", "on"}
1103
+ if isinstance(value, int | float):
1104
+ return bool(value)
1105
+ return False # pragma: no cover - defensive default
1106
+
1107
def _build_observation(self, last_result: dict[str, Any] | None) -> dict[str, Any]:
    """Assemble the observation mapping returned to the agent.

    The observation always contains the task, step index, summarised history,
    submission flags and the tool schema.  ``last`` is added when
    ``last_result`` is given and ``submission_result`` when a submission has
    been stored.
    """
    trimmed_history = summarise_history(self.state.history)
    observation = dict(
        task=self.task,
        step_idx=self.state.step_idx,
        history=trimmed_history,
        submitted=self.state.submitted,
        submission_success=self.state.submission_success,
        tools=TOOLS_SCHEMA,
    )
    optional_entries = (
        ("last", last_result),
        ("submission_result", self.last_submission),
    )
    for key, value in optional_entries:
        if value is not None:
            observation[key] = value
    return observation
1122
+
1123
+ def _build_response(
1124
+ self,
1125
+ *,
1126
+ observation: dict[str, Any],
1127
+ step_idx: int,
1128
+ done: bool = False,
1129
+ reward: float | None = None,
1130
+ info: dict[str, Any] | None = None,
1131
+ ) -> dict[str, Any]:
1132
+ response = {
1133
+ "observation": observation,
1134
+ "step_idx": step_idx,
1135
+ "done": bool(done),
1136
+ }
1137
+ if reward is not None:
1138
+ response["reward"] = reward
1139
+ if info is not None:
1140
+ response["info"] = info
1141
+ return response
1142
+
1143
def state_dict(self) -> dict[str, Any]:
    """Return a snapshot mapping of the wrapper's state.

    The mapping references the live objects (for example the history list);
    nothing is copied.
    """
    state = self.state
    snapshot: dict[str, Any] = {
        field: getattr(state, field)
        for field in ("task", "history", "step_idx", "submitted", "submission_success")
    }
    snapshot["last_result"] = self.last_result
    snapshot["last_submission"] = self.last_submission
    snapshot["environment_type"] = self.environment_type
    snapshot["env_config"] = self.env_config
    return snapshot
1155
+
1156
def load_state_dict(self, payload: dict[str, Any]) -> None:
    """Restore the wrapper's state from a mapping shaped like ``state_dict()``.

    ``task`` is required.  Missing history, step index and submission flags
    fall back to empty or zero defaults, while a missing ``environment_type``
    or ``env_config`` leaves the current attribute untouched.

    Raises:
        KeyError: When ``payload`` has no ``task`` entry.
    """
    restored = {
        "task": payload["task"],
        "history": payload.get("history", []),
        "step_idx": int(payload.get("step_idx", 0)),
        "submitted": bool(payload.get("submitted", False)),
        "submission_success": payload.get("submission_success"),
    }
    self.state = MiniSweEnvironmentState(**restored)

    self.last_result = payload.get("last_result")
    self.last_submission = payload.get("last_submission")
    for attr in ("environment_type", "env_config"):
        # Keep the current value when the payload does not carry the key.
        setattr(self, attr, payload.get(attr, getattr(self, attr)))
1168
+
1169
async def serialize(self) -> dict[str, Any]:
    """Return the wrapper's name, construction config and state as a mapping."""
    config = {
        "env_config": self.env_config,
        "submit_command": self.submit_command,
    }
    return {"name": self.name, "config": config, "state": self.state_dict()}
1178
+
1179
@classmethod
async def deserialize(cls, payload: dict[str, Any]) -> MiniSweEnvironmentWrapper:
    """Rebuild a wrapper from a mapping shaped like ``serialize()`` output.

    The instance is constructed from the stored task and config, then its
    state is restored with ``load_state_dict``.

    Raises:
        KeyError: When ``payload`` lacks ``state`` or the state lacks ``task``.
    """
    config = payload.get("config", {}) or {}
    saved_state = payload["state"]
    wrapper = cls(
        task=saved_state["task"],
        env_config=config.get("env_config"),
        submit_command=config.get("submit_command"),
    )
    wrapper.load_state_dict(payload["state"])
    return wrapper
1189
+
1190
+
1191
+ __all__ = ["MiniSweEnvironmentWrapper"]