synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (291)
  1. examples/multi_step/configs/README_verilog_rl.md +77 -0
  2. examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
  3. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
  4. examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
  5. examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
  6. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +17 -5
  7. examples/multi_step/configs/crafter_synth_backend.md +40 -0
  8. examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
  9. examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
  10. examples/multi_step/configs/verilog_rl_lora.toml +190 -0
  11. examples/multi_step/judges/crafter_backend_judge.py +220 -0
  12. examples/multi_step/judges/verilog_backend_judge.py +234 -0
  13. examples/multi_step/readme.md +48 -0
  14. examples/multi_step/verilog_rl_lora.md +218 -0
  15. examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
  16. examples/sft/evaluate.py +2 -0
  17. examples/sft/generate_traces.py +2 -0
  18. examples/swe/task_app/grpo_swe_mini.py +56 -26
  19. examples/swe/task_app/hosted/rollout.py +42 -0
  20. examples/swe/task_app/hosted/test_service.py +5 -6
  21. examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
  22. examples/task_apps/TESTING.md +275 -0
  23. examples/task_apps/__init__.py +0 -0
  24. examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
  25. examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
  26. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
  27. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
  28. examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
  29. examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
  30. examples/task_apps/crafter/__init__.py +0 -0
  31. examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
  32. examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
  33. examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
  34. examples/task_apps/crafter/task_app/__init__.py +5 -0
  35. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +324 -21
  36. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  37. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
  38. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +76 -7
  39. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
  40. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
  41. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +77 -4
  42. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +117 -9
  43. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  44. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +218 -0
  45. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  71. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  72. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  73. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  74. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  75. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  76. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  77. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  78. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  79. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  80. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  81. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  82. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  83. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  84. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  85. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  86. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  87. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  88. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  89. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  90. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  91. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  92. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  93. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  94. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  95. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  96. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  97. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  98. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  99. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  100. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  101. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  102. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  103. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  104. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  105. examples/task_apps/enron/__init__.py +1 -0
  106. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  107. examples/task_apps/enron/filter_sft.toml +5 -0
  108. examples/task_apps/enron/task_app/README.md +14 -0
  109. examples/task_apps/enron/task_app/__init__.py +1 -0
  110. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  111. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  112. examples/task_apps/enron/tests/__init__.py +4 -0
  113. examples/task_apps/enron/tests/conftest.py +115 -0
  114. examples/task_apps/enron/tests/integration/__init__.py +4 -0
  115. examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
  116. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  117. examples/task_apps/enron/tests/unit/__init__.py +4 -0
  118. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  119. examples/task_apps/math/__init__.py +0 -0
  120. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  121. examples/task_apps/pokemon_battle/__init__.py +2 -0
  122. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  123. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  124. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  125. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  126. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
  127. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
  128. examples/task_apps/pokemon_red/README.md +357 -0
  129. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
  130. examples/task_apps/pokemon_red/__init__.py +3 -0
  131. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
  132. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  133. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
  134. examples/task_apps/pokemon_red/task_app.py +799 -0
  135. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
  136. examples/task_apps/sokoban/README.md +307 -0
  137. examples/task_apps/sokoban/__init__.py +3 -0
  138. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  139. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  140. examples/task_apps/sokoban/filter_sft.toml +5 -0
  141. examples/task_apps/sokoban/task_app.py +1058 -0
  142. examples/task_apps/sokoban/tests/__init__.py +4 -0
  143. examples/task_apps/sokoban/tests/conftest.py +113 -0
  144. examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
  145. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  146. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  147. examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
  148. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  149. examples/task_apps/verilog/__init__.py +1 -0
  150. examples/task_apps/verilog/eval_groq_qwen32b.toml +24 -0
  151. examples/task_apps/verilog/filter_sft.toml +5 -0
  152. examples/task_apps/verilog/task_app/README.md +12 -0
  153. examples/task_apps/verilog/task_app/__init__.py +1 -0
  154. examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
  155. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  156. examples/task_apps/verilog/tests/__init__.py +4 -0
  157. examples/task_apps/verilog/tests/conftest.py +115 -0
  158. examples/task_apps/verilog/tests/integration/__init__.py +4 -0
  159. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
  160. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  161. examples/task_apps/verilog/tests/unit/__init__.py +4 -0
  162. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  163. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  164. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  165. examples/warming_up_to_rl/groq_test.py +2 -0
  166. examples/warming_up_to_rl/run_local_rollout.py +2 -0
  167. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
  168. examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
  169. examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
  170. examples/warming_up_to_rl/run_rollout_remote.py +2 -0
  171. examples/workflows/__init__.py +0 -0
  172. examples/workflows/math_rl/__init__.py +0 -0
  173. examples/workflows/math_rl/download_dataset.py +80 -0
  174. synth_ai/__init__.py +2 -2
  175. synth_ai/api/models/supported.py +1 -0
  176. synth_ai/api/train/builders.py +25 -11
  177. synth_ai/api/train/cli.py +12 -6
  178. synth_ai/api/train/configs/__init__.py +10 -10
  179. synth_ai/api/train/configs/rl.py +5 -4
  180. synth_ai/api/train/configs/sft.py +4 -3
  181. synth_ai/api/train/env_resolver.py +5 -2
  182. synth_ai/api/train/supported_algos.py +10 -5
  183. synth_ai/api/train/utils.py +7 -4
  184. synth_ai/cli/__init__.py +48 -59
  185. synth_ai/cli/_modal_wrapper.py +3 -2
  186. synth_ai/cli/_storage.py +4 -3
  187. synth_ai/cli/_validate_task_app.py +11 -0
  188. synth_ai/cli/balance.py +4 -3
  189. synth_ai/cli/calc.py +2 -2
  190. synth_ai/cli/demo.py +14 -7
  191. synth_ai/cli/legacy_root_backup.py +1 -1
  192. synth_ai/cli/recent.py +1 -1
  193. synth_ai/cli/rl_demo.py +8 -7
  194. synth_ai/cli/root.py +0 -97
  195. synth_ai/cli/status.py +1 -1
  196. synth_ai/cli/task_apps.py +1922 -190
  197. synth_ai/cli/traces.py +1 -1
  198. synth_ai/cli/tui.py +57 -0
  199. synth_ai/cli/turso.py +1 -1
  200. synth_ai/cli/watch.py +1 -1
  201. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +29 -17
  202. synth_ai/environments/examples/crafter_classic/environment.py +1 -1
  203. synth_ai/environments/examples/enron/engine.py +7 -2
  204. synth_ai/environments/examples/enron/environment.py +68 -0
  205. synth_ai/environments/examples/red/engine.py +27 -0
  206. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  207. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  208. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  209. synth_ai/environments/examples/red/environment.py +60 -0
  210. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  211. synth_ai/environments/examples/verilog/engine.py +104 -12
  212. synth_ai/evals/client.py +58 -61
  213. synth_ai/jobs/client.py +16 -4
  214. synth_ai/judge_schemas.py +9 -9
  215. synth_ai/py.typed +0 -0
  216. synth_ai/task/__init__.py +24 -5
  217. synth_ai/task/apps/__init__.py +1 -0
  218. synth_ai/task/config.py +257 -0
  219. synth_ai/task/contracts.py +138 -39
  220. synth_ai/task/proxy.py +48 -56
  221. synth_ai/task/rubrics/__init__.py +56 -0
  222. synth_ai/task/rubrics/loaders.py +152 -0
  223. synth_ai/task/rubrics/models.py +57 -0
  224. synth_ai/task/rubrics/scoring.py +116 -0
  225. synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
  226. synth_ai/task/server.py +8 -7
  227. synth_ai/task/trace_correlation_helpers.py +315 -0
  228. synth_ai/task/validators.py +413 -6
  229. synth_ai/tracing_v3/abstractions.py +3 -3
  230. synth_ai/tracing_v3/decorators.py +7 -3
  231. synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
  232. synth_ai/tracing_v3/replica_sync.py +4 -4
  233. synth_ai/tracing_v3/serialization.py +5 -5
  234. synth_ai/tracing_v3/session_tracer.py +16 -6
  235. synth_ai/tracing_v3/storage/base.py +29 -29
  236. synth_ai/tracing_v3/storage/config.py +3 -3
  237. synth_ai/tracing_v3/trace_utils.py +317 -0
  238. synth_ai/tracing_v3/turso/daemon.py +8 -7
  239. synth_ai/tracing_v3/turso/native_manager.py +66 -43
  240. synth_ai/tracing_v3/utils.py +3 -3
  241. synth_ai/tui/__init__.py +5 -0
  242. synth_ai/tui/__main__.py +13 -0
  243. synth_ai/tui/cli/__init__.py +1 -0
  244. synth_ai/tui/cli/query_experiments.py +164 -0
  245. synth_ai/tui/cli/query_experiments_v3.py +164 -0
  246. synth_ai/tui/dashboard.py +906 -0
  247. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/METADATA +4 -1
  248. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/RECORD +278 -126
  249. examples/agora_ex/README_MoE.md +0 -224
  250. examples/agora_ex/__init__.py +0 -7
  251. examples/agora_ex/agora_ex.py +0 -65
  252. examples/agora_ex/agora_ex_task_app.py +0 -590
  253. examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
  254. examples/agora_ex/reward_fn_grpo-human.py +0 -129
  255. examples/agora_ex/system_prompt_CURRENT.md +0 -63
  256. examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
  257. examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
  258. examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
  259. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +0 -62
  260. synth_ai/rubrics/__init__.py +0 -22
  261. synth_ai/task/rubrics.py +0 -219
  262. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  263. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  264. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  265. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  266. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  267. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  268. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  269. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  270. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  271. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  272. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  273. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  274. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  275. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  276. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  277. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  278. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  279. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  280. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  281. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  282. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  283. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  284. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  285. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  286. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  287. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  288. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
  289. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
  290. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
  291. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,257 @@
1
+ """Configuration dataclasses for task app CLI commands (eval, filter)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from typing import Any, Literal
8
+
9
+
10
+ @dataclass(slots=True)
11
+ class EvalConfig:
12
+ """Configuration for 'synth-ai eval' command.
13
+
14
+ Validates and provides defaults for evaluation runs against task apps.
15
+ """
16
+
17
+ # Required: Task app identifier
18
+ app_id: str
19
+
20
+ # Required: Model to evaluate
21
+ model: str
22
+
23
+ # Required: Seeds to run
24
+ seeds: list[int]
25
+
26
+ # Optional: Task app URL (None = spawn in-process)
27
+ task_app_url: str | None = None
28
+
29
+ # Optional: Data split to use
30
+ split: str = "train"
31
+
32
+ # Optional: Maximum turns/steps per episode
33
+ max_turns: int | None = None
34
+
35
+ # Optional: Maximum LLM calls per episode
36
+ max_llm_calls: int = 10
37
+
38
+ # Optional: Concurrency for parallel rollouts
39
+ concurrency: int = 1
40
+
41
+ # Optional: Environment name
42
+ env_name: str | None = None
43
+
44
+ # Optional: Policy name
45
+ policy_name: str | None = None
46
+
47
+ # Optional: Trace format ("compact", "full", "structured")
48
+ trace_format: Literal["compact", "full", "structured"] = "compact"
49
+
50
+ # Optional: Whether to return traces in response
51
+ return_trace: bool = False
52
+
53
+ # Optional: Operations sequence (if not provided, generates default)
54
+ ops: list[str] | None = None
55
+
56
+ # Optional: Environment config overrides
57
+ env_config: dict[str, Any] = field(default_factory=dict)
58
+
59
+ # Optional: Policy config overrides
60
+ policy_config: dict[str, Any] = field(default_factory=dict)
61
+
62
+ # Optional: Metadata for traces
63
+ metadata: dict[str, str] = field(default_factory=dict)
64
+
65
+ # Optional: SQL query for metadata filtering
66
+ metadata_sql: str | None = None
67
+
68
+ def __post_init__(self):
69
+ """Validate configuration after initialization."""
70
+ if not self.app_id:
71
+ raise ValueError("app_id is required")
72
+
73
+ if not self.model:
74
+ raise ValueError("model is required")
75
+
76
+ if not self.seeds:
77
+ raise ValueError("seeds list cannot be empty")
78
+
79
+ if not isinstance(self.seeds, list):
80
+ raise ValueError("seeds must be a list of integers")
81
+
82
+ if self.concurrency < 1:
83
+ raise ValueError("concurrency must be >= 1")
84
+
85
+ if self.max_llm_calls < 1:
86
+ raise ValueError("max_llm_calls must be >= 1")
87
+
88
+ if self.max_turns is not None and self.max_turns < 1:
89
+ raise ValueError("max_turns must be >= 1")
90
+
91
+ if self.trace_format not in ("compact", "full", "structured"):
92
+ raise ValueError(f"trace_format must be 'compact', 'full', or 'structured', got: {self.trace_format}")
93
+
94
+ @classmethod
95
+ def from_dict(cls, data: dict[str, Any]) -> EvalConfig:
96
+ """Create EvalConfig from a dictionary (e.g. from TOML).
97
+
98
+ Args:
99
+ data: Dictionary with eval configuration
100
+
101
+ Returns:
102
+ Validated EvalConfig instance
103
+ """
104
+ # Extract known fields
105
+ config_dict = {
106
+ "app_id": data.get("app_id"),
107
+ "model": data.get("model"),
108
+ "seeds": data.get("seeds", []),
109
+ "task_app_url": data.get("task_app_url"),
110
+ "split": data.get("split", "train"),
111
+ "max_turns": data.get("max_turns"),
112
+ "max_llm_calls": data.get("max_llm_calls", 10),
113
+ "concurrency": data.get("concurrency", 1),
114
+ "env_name": data.get("env_name"),
115
+ "policy_name": data.get("policy_name"),
116
+ "trace_format": data.get("trace_format", "compact"),
117
+ "return_trace": data.get("return_trace", False),
118
+ "ops": data.get("ops"),
119
+ "env_config": data.get("env_config", {}),
120
+ "policy_config": data.get("policy_config", {}),
121
+ "metadata": data.get("metadata", {}),
122
+ "metadata_sql": data.get("metadata_sql"),
123
+ }
124
+
125
+ return cls(**config_dict)
126
+
127
+
128
+ @dataclass(slots=True)
129
+ class FilterConfig:
130
+ """Configuration for 'synth-ai filter' command.
131
+
132
+ Validates and provides defaults for filtering traces into SFT datasets.
133
+ """
134
+
135
+ # Required: Database path or URL
136
+ db: str
137
+
138
+ # Required: Output JSONL path
139
+ output: str
140
+
141
+ # Optional: Filter by data splits
142
+ splits: list[str] = field(default_factory=list)
143
+
144
+ # Optional: Filter by task IDs
145
+ task_ids: list[str] = field(default_factory=list)
146
+
147
+ # Optional: Filter by models
148
+ models: list[str] = field(default_factory=list)
149
+
150
+ # Optional: Minimum official score threshold
151
+ min_official_score: float | None = None
152
+
153
+ # Optional: Maximum official score threshold
154
+ max_official_score: float | None = None
155
+
156
+ # Optional: Minimum judge scores (judge_name -> min_score)
157
+ min_judge_scores: dict[str, float] = field(default_factory=dict)
158
+
159
+ # Optional: Maximum judge scores (judge_name -> max_score)
160
+ max_judge_scores: dict[str, float] = field(default_factory=dict)
161
+
162
+ # Optional: Limit number of examples
163
+ limit: int | None = None
164
+
165
+ # Optional: Offset for pagination
166
+ offset: int | None = None
167
+
168
+ # Optional: Whether to shuffle results
169
+ shuffle: bool = False
170
+
171
+ # Optional: Random seed for shuffling
172
+ shuffle_seed: int | None = None
173
+
174
+ def __post_init__(self):
175
+ """Validate configuration after initialization."""
176
+ if not self.db:
177
+ raise ValueError("db (database path or URL) is required")
178
+
179
+ if not self.output:
180
+ raise ValueError("output (JSONL file path) is required")
181
+
182
+ # Validate output has .jsonl extension
183
+ output_path = Path(self.output)
184
+ if output_path.suffix.lower() not in (".jsonl", ".json"):
185
+ raise ValueError(f"output must be a .jsonl or .json file, got: {self.output}")
186
+
187
+ # Validate score thresholds
188
+ if self.min_official_score is not None and self.max_official_score is not None:
189
+ if self.min_official_score > self.max_official_score:
190
+ raise ValueError("min_official_score cannot be greater than max_official_score")
191
+
192
+ # Validate limit/offset
193
+ if self.limit is not None and self.limit < 1:
194
+ raise ValueError("limit must be >= 1")
195
+
196
+ if self.offset is not None and self.offset < 0:
197
+ raise ValueError("offset must be >= 0")
198
+
199
+ # Validate shuffle seed requires shuffle
200
+ if self.shuffle_seed is not None and not self.shuffle:
201
+ raise ValueError("shuffle_seed requires shuffle=true")
202
+
203
+ @classmethod
204
+ def from_dict(cls, data: dict[str, Any]) -> FilterConfig:
205
+ """Create FilterConfig from a dictionary (e.g. from TOML).
206
+
207
+ Args:
208
+ data: Dictionary with filter configuration
209
+
210
+ Returns:
211
+ Validated FilterConfig instance
212
+ """
213
+ # Extract known fields
214
+ config_dict = {
215
+ "db": data.get("db"),
216
+ "output": data.get("output"),
217
+ "splits": data.get("splits", []),
218
+ "task_ids": data.get("task_ids", []),
219
+ "models": data.get("models", []),
220
+ "min_official_score": data.get("min_official_score"),
221
+ "max_official_score": data.get("max_official_score"),
222
+ "min_judge_scores": data.get("min_judge_scores", {}),
223
+ "max_judge_scores": data.get("max_judge_scores", {}),
224
+ "limit": data.get("limit"),
225
+ "offset": data.get("offset"),
226
+ "shuffle": data.get("shuffle", False),
227
+ "shuffle_seed": data.get("shuffle_seed"),
228
+ }
229
+
230
+ return cls(**config_dict)
231
+
232
+ def get_db_url(self) -> str:
233
+ """Convert db path to proper SQLite URL if needed.
234
+
235
+ Returns:
236
+ Database URL suitable for SQLAlchemy/aiosqlite
237
+ """
238
+ db_value = self.db.strip()
239
+ if "://" in db_value:
240
+ return db_value
241
+ else:
242
+ db_path = Path(db_value).expanduser().resolve()
243
+ # Ensure parent directory exists
244
+ db_path.parent.mkdir(parents=True, exist_ok=True)
245
+ return f"sqlite+aiosqlite:///{db_path}"
246
+
247
+ def get_output_path(self) -> Path:
248
+ """Get resolved output path with parent directory created.
249
+
250
+ Returns:
251
+ Resolved Path object with parent directory created
252
+ """
253
+ output_path = Path(self.output).expanduser().resolve()
254
+ output_path.parent.mkdir(parents=True, exist_ok=True)
255
+ return output_path
256
+
257
+
@@ -1,19 +1,25 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from dataclasses import dataclass
4
+ from enum import Enum
4
5
  from typing import Any, Literal
5
6
 
6
- from pydantic import BaseModel, Field
7
+ from pydantic import BaseModel, ConfigDict, Field
8
+
9
+
10
+ class RolloutMode(str, Enum):
11
+ """Mode controls how rollout infrastructure processes inference URLs."""
12
+ RL = "rl"
13
+ EVAL = "eval"
7
14
 
8
15
 
9
16
  @dataclass(frozen=True)
10
17
  class TaskAppEndpoints:
11
- """Canonical Task App endpoint shapes used by RL trainers.
18
+ """Required Task App endpoints used by RL trainers and clients.
12
19
 
13
- Task Apps run as lightweight HTTP services (often on Modal) that expose a
14
- consistent set of endpoints for health, metadata, environment lifecycle,
15
- rollouts, and optional proxy access to vendor models. The endpoint strings
16
- defined here act as defaults and documentation for clients.
20
+ Task Apps run as lightweight HTTP services (often on Modal) that expose these
21
+ standard endpoints. Additional endpoints (proxies, debug routes) may be added
22
+ by individual task apps as needed.
17
23
  """
18
24
 
19
25
  root: str = "/"
@@ -21,28 +27,6 @@ class TaskAppEndpoints:
21
27
  info: str = "/info"
22
28
  task_info: str = "/task_info"
23
29
  rollout: str = "/rollout"
24
- proxy_chat_completions: str = "/proxy/v1/chat/completions"
25
- proxy_groq_chat_completions: str = "/proxy/groq/v1/chat/completions"
26
- env_initialize: str = "/env/{env_name}/initialize"
27
- env_step: str = "/env/{env_name}/step"
28
- env_terminate: str = "/env/{env_name}/terminate"
29
-
30
-
31
- @dataclass(frozen=True)
32
- class TaskAppContract:
33
- """Requirements and expectations for a Task App used by RL trainers.
34
-
35
- - Auth: ENVIRONMENT_API_KEY must be set in the Task App environment; requests include X-API-Key.
36
- - Health: /health returns 200 and JSON; may verify X-API-Key header.
37
- - Env API: initialize/step/terminate are present for the target env (e.g., CrafterClassic).
38
- - Rollout API: optional; provides a single-call rollout for convenience/testing.
39
- - Inference routing: policy config passes an inference_url (Synth backend or OpenAI proxy).
40
- - URL: base must be reachable via HTTPS and should be under .modal.run in production.
41
- """
42
-
43
- base_url: str
44
- env_name: str | None = None
45
- requires_api_key_header: bool = True
46
30
 
47
31
 
48
32
  # --- Unified rollout schema used by Task App services and SDK utilities ---
@@ -66,7 +50,7 @@ class RolloutRecordConfig(BaseModel):
66
50
  logprobs: bool = False
67
51
  value: bool = False
68
52
  return_trace: bool = False
69
- trace_format: Literal["compact", "full"] = "compact"
53
+ trace_format: Literal["compact", "full", "structured"] = "compact"
70
54
 
71
55
 
72
56
  class RolloutSafetyConfig(BaseModel):
@@ -84,9 +68,16 @@ class RolloutRequest(BaseModel):
84
68
  safety: RolloutSafetyConfig = RolloutSafetyConfig()
85
69
  training_session_id: str | None = None
86
70
  synth_base_url: str | None = None
71
+ mode: RolloutMode # Required: explicit RL vs EVAL mode
87
72
 
88
73
 
89
74
  class RolloutStep(BaseModel):
75
+ """Single step in a rollout trajectory.
76
+
77
+ DEPRECATED: This is part of the legacy trajectory format. New code should
78
+ consume v3 traces (RolloutResponse.trace) instead. See monorepo/trace_single_source.txt
79
+ for migration plan.
80
+ """
90
81
  obs: dict[str, Any]
91
82
  tool_calls: list[dict[str, Any]]
92
83
  reward: float | None = None
@@ -96,11 +87,40 @@ class RolloutStep(BaseModel):
96
87
 
97
88
 
98
89
  class RolloutTrajectory(BaseModel):
90
+ """Legacy trajectory format for rollout results.
91
+
92
+ DEPRECATED: This format duplicates data already present in v3 traces and will
93
+ be removed once training code migrates to consuming RolloutResponse.trace.
94
+
95
+ Current state:
96
+ - Task apps emit BOTH this format AND v3 traces (dual serialization)
97
+ - Training code (GSPO) reads from this format
98
+ - Eval/filter tools read from v3 traces
99
+
100
+ Migration plan:
101
+ - Phase 1: Training code learns to read from v3 traces (with fallback to this)
102
+ - Phase 2: Make this field optional once training is migrated
103
+ - Phase 3: Remove this field entirely and delete this class
104
+
105
+ See: monorepo/trace_single_source.txt for full migration plan and timeline.
106
+
107
+ Why v3 traces are better:
108
+ - Single source of truth (no duplication/drift)
109
+ - Richer data: token IDs, logprobs, reasoning, timing, images
110
+ - Built-in audit trail and replay capability
111
+ - Standard schema across all Synth AI tooling
112
+ """
99
113
  env_id: str
100
114
  policy_id: str
101
115
  steps: list[RolloutStep]
102
116
  final: dict[str, Any] | None = None
103
117
  length: int
118
+
119
+ # Required for trace correlation with inference mesh (optional initially for backward compat)
120
+ # See: monorepo/INFERENCE_URL_REQUIREMENT_PLAN.md and trace_creation_and_judgement.txt
121
+ inference_url: str
122
+
123
+ decision_samples: list[dict[str, Any]] | None = None
104
124
 
105
125
 
106
126
  class RolloutMetrics(BaseModel):
@@ -114,24 +134,103 @@ class RolloutMetrics(BaseModel):
114
134
 
115
135
 
116
136
  class RolloutResponse(BaseModel):
137
+ """Response from a rollout execution.
138
+
139
+ Contains both legacy trajectory format (for backward compatibility) and
140
+ modern v3 trace format (preferred going forward).
141
+ """
117
142
  run_id: str
143
+
144
+ # DEPRECATED: Legacy format maintained for training code compatibility.
145
+ # Will be removed once training migrates to reading from `trace` field.
146
+ # See: monorepo/trace_single_source.txt for migration plan.
118
147
  trajectories: list[RolloutTrajectory]
148
+
119
149
  branches: dict[str, list[str]] = Field(default_factory=dict)
120
150
  metrics: RolloutMetrics
121
151
  aborted: bool = False
122
152
  ops_executed: int = 0
153
+
154
+ # OPTIONAL: correlation ID for linking rollout to inference traces
155
+ # If not provided, trainer will infer it from trajectory.inference_url ?cid=... parameter
156
+ trace_correlation_id: str | None = None
157
+
158
+ # PREFERRED: v3 trace format (SessionTrace). This is the single source of truth
159
+ # for rollout data and should be used by all new code. Contains richer data than
160
+ # trajectories including token IDs, logprobs, timing, and multimodal content.
123
161
  trace: dict[str, Any] | None = None
162
+ pipeline_metadata: dict[str, Any] = Field(default_factory=dict)
163
+
164
+
165
+ class _ExtraAllowModel(BaseModel):
166
+ """Base helper that preserves unknown keys while still exposing typed attributes."""
167
+
168
+ model_config = ConfigDict(extra="allow")
169
+
170
+
171
+ class TaskDescriptor(_ExtraAllowModel):
172
+ """Human-readable task identifiers shown in UIs and logs."""
173
+
174
+ id: str
175
+ name: str
176
+ description: str | None = None
177
+ version: str | None = None
178
+
179
+
180
+ class DatasetInfo(_ExtraAllowModel):
181
+ """Metadata about the prompt/task dataset powering the environment."""
182
+
183
+ id: str | None = None
184
+ name: str | None = None
185
+ version: str | None = None
186
+ splits: list[str] | None = None
187
+ default_split: str | None = None
188
+ description: str | None = None
189
+
190
+
191
+ class RubricCriterion(_ExtraAllowModel):
192
+ id: str
193
+ description: str
194
+ weight: float | None = None
195
+
196
+
197
+ class RubricSection(_ExtraAllowModel):
198
+ name: str
199
+ criteria: list[RubricCriterion] = Field(default_factory=list)
200
+
201
+
202
+ class RubricInfo(_ExtraAllowModel):
203
+ """Outcome and event scoring definitions used by judges."""
204
+
205
+ outcome: RubricSection | None = None
206
+ events: RubricSection | None = None
207
+
208
+
209
+ class InferenceInfo(_ExtraAllowModel):
210
+ """Recommended defaults for policy model routing."""
211
+
212
+ model: str | None = None
213
+ inference_url: str | None = None
214
+
215
+
216
+ class LimitsInfo(_ExtraAllowModel):
217
+ """Operational limits the environment enforces."""
218
+
219
+ max_turns: int | None = None
220
+ max_response_tokens: int | None = None
221
+ timeout_seconds: int | None = None
124
222
 
125
223
 
126
- class TaskInfo(BaseModel):
224
+ class TaskInfo(_ExtraAllowModel):
127
225
  """Static metadata describing the capabilities of a Task App task."""
128
226
 
129
- task: dict[str, Any]
130
- environments: list[str]
131
- action_space: dict[str, Any]
132
- observation: dict[str, Any]
133
- dataset: dict[str, Any]
134
- rubric: dict[str, Any]
135
- inference: dict[str, Any]
136
- capabilities: dict[str, Any]
137
- limits: dict[str, Any]
227
+ task: TaskDescriptor
228
+ environment: str
229
+ dataset: DatasetInfo
230
+ rubric: RubricInfo
231
+ inference: InferenceInfo
232
+ limits: LimitsInfo
233
+ task_metadata: dict[str, Any] = Field(
234
+ default_factory=dict,
235
+ description="Task-specific extras (e.g. prompt version info, documentation links).",
236
+ )
synth_ai/task/proxy.py CHANGED
@@ -1,39 +1,15 @@
1
- """Shared helpers for Task App proxy endpoints (OpenAI, Groq, etc.)."""
1
+ """Shared helpers for Task App proxy endpoints (OpenAI, Groq, etc.).
2
+
3
+ The proxy is tool-agnostic - each task app provides its own tools schema.
4
+ """
2
5
 
3
6
  from __future__ import annotations
4
7
 
5
8
  import copy
6
9
  import json
7
10
  import re
8
- from collections.abc import Iterable
9
11
  from typing import Any
10
12
 
11
- INTERACT_TOOL_SCHEMA: list[dict[str, Any]] = [
12
- {
13
- "type": "function",
14
- "function": {
15
- "name": "interact",
16
- "description": "Perform one or more environment actions.",
17
- "parameters": {
18
- "type": "object",
19
- "properties": {
20
- "actions": {
21
- "type": "array",
22
- "items": {"type": "string"},
23
- "description": "List of environment actions to execute in order.",
24
- },
25
- "reasoning": {
26
- "type": "string",
27
- "description": "Optional reasoning for the chosen actions.",
28
- },
29
- },
30
- "required": ["actions"],
31
- "additionalProperties": False,
32
- },
33
- },
34
- }
35
- ]
36
-
37
13
  _REMOVE_FIELDS = {
38
14
  "stop_after_tool_calls",
39
15
  "thinking_mode",
@@ -44,14 +20,12 @@ _REMOVE_SAMPLING_FIELDS = {"temperature", "top_p"}
44
20
  _GPT5_MIN_COMPLETION_TOKENS = 16000
45
21
 
46
22
 
47
- def _ensure_tools(payload: dict[str, Any]) -> None:
48
- tools = payload.get("tools")
49
- if not isinstance(tools, list) or not tools:
50
- payload["tools"] = copy.deepcopy(INTERACT_TOOL_SCHEMA)
51
-
52
-
53
23
  def prepare_for_openai(model: str | None, payload: dict[str, Any]) -> dict[str, Any]:
54
- """Sanitise an OpenAI chat completions payload for Task App usage."""
24
+ """Sanitise an OpenAI chat completions payload for Task App usage.
25
+
26
+ The task app is responsible for providing tools in the payload.
27
+ This function only handles model-specific parameter normalization.
28
+ """
55
29
 
56
30
  sanitized = copy.deepcopy(payload)
57
31
  for field in _REMOVE_FIELDS:
@@ -68,10 +42,18 @@ def prepare_for_openai(model: str | None, payload: dict[str, Any]) -> dict[str,
68
42
  mct = sanitized.get("max_completion_tokens")
69
43
  if not isinstance(mct, int) or mct < _GPT5_MIN_COMPLETION_TOKENS:
70
44
  sanitized["max_completion_tokens"] = _GPT5_MIN_COMPLETION_TOKENS
71
- sanitized["tool_choice"] = {"type": "function", "function": {"name": "interact"}}
45
+
46
+ # Set tool_choice to first provided tool (task app must provide tools)
47
+ # If tool_choice not already set and tools are provided, use the first one
48
+ if "tool_choice" not in sanitized:
49
+ tools = sanitized.get("tools", [])
50
+ if isinstance(tools, list) and tools:
51
+ first_func = tools[0].get("function", {})
52
+ if isinstance(first_func, dict) and "name" in first_func:
53
+ sanitized["tool_choice"] = {"type": "function", "function": {"name": first_func["name"]}}
54
+
72
55
  sanitized["parallel_tool_calls"] = False
73
56
 
74
- _ensure_tools(sanitized)
75
57
  return sanitized
76
58
 
77
59
 
@@ -206,24 +188,18 @@ def parse_tool_call_from_text(text: str) -> tuple[list[str], str]:
206
188
  return [], text
207
189
 
208
190
 
209
- def _build_tool_call(actions: Iterable[str], reasoning: str) -> dict[str, Any]:
210
- payload = {
211
- "actions": [str(a).strip() for a in actions if str(a).strip()],
212
- }
213
- if reasoning.strip():
214
- payload["reasoning"] = reasoning.strip()
215
- return {
216
- "id": "tool_interact_fallback",
217
- "type": "function",
218
- "function": {
219
- "name": INTERACT_TOOL_SCHEMA[0]["function"]["name"],
220
- "arguments": json.dumps(payload, ensure_ascii=False),
221
- },
222
- }
223
-
224
-
225
- def synthesize_tool_call_if_missing(openai_response: dict[str, Any]) -> dict[str, Any]:
226
- """Ensure the first choice carries a tool_call derived from text if absent."""
191
+ def synthesize_tool_call_if_missing(
192
+ openai_response: dict[str, Any],
193
+ fallback_tool_name: str = "interact"
194
+ ) -> dict[str, Any]:
195
+ """Ensure the first choice carries a tool_call derived from text if absent.
196
+
197
+ This is a fallback for models that don't properly support tool calling.
198
+ Task apps can specify their preferred fallback tool name (e.g., "interact", "execute_sequence").
199
+
200
+ DEPRECATED: Task apps should prefer models with native tool calling support.
201
+ This function will be removed in a future version.
202
+ """
227
203
 
228
204
  if not isinstance(openai_response, dict):
229
205
  return openai_response
@@ -245,8 +221,24 @@ def synthesize_tool_call_if_missing(openai_response: dict[str, Any]) -> dict[str
245
221
  if not actions:
246
222
  return openai_response
247
223
 
224
+ # Build a fallback tool call using the provided tool name
225
+ payload = {
226
+ "actions": [str(a).strip() for a in actions if str(a).strip()],
227
+ }
228
+ if reasoning.strip():
229
+ payload["reasoning"] = reasoning.strip()
230
+
231
+ tool_call = {
232
+ "id": f"tool_{fallback_tool_name}_fallback",
233
+ "type": "function",
234
+ "function": {
235
+ "name": fallback_tool_name,
236
+ "arguments": json.dumps(payload, ensure_ascii=False),
237
+ },
238
+ }
239
+
248
240
  new_message = copy.deepcopy(message)
249
- new_message["tool_calls"] = [_build_tool_call(actions, reasoning)]
241
+ new_message["tool_calls"] = [tool_call]
250
242
  if "content" not in new_message:
251
243
  new_message["content"] = None
252
244