synth-ai 0.2.8.dev4__py3-none-any.whl → 0.2.23.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (889)
  1. examples/README.md +1 -0
  2. examples/__init__.py +16 -0
  3. examples/analyze_semantic_words.sh +17 -0
  4. examples/baseline/banking77_baseline.py +243 -0
  5. examples/baseline/banking77_pipeline_baseline.py +294 -0
  6. examples/baseline/crafter_baseline.py +407 -0
  7. examples/baseline/pokemon_red_baseline.py +326 -0
  8. examples/baseline/simple_baseline.py +56 -0
  9. examples/baseline/warming_up_to_rl_baseline.py +239 -0
  10. examples/blog_posts/gepa/README.md +355 -0
  11. examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
  12. examples/blog_posts/gepa/configs/banking77_gepa_test.toml +80 -0
  13. examples/blog_posts/gepa/configs/banking77_mipro_local.toml +50 -0
  14. examples/blog_posts/gepa/configs/banking77_pipeline_gepa_local.toml +101 -0
  15. examples/blog_posts/gepa/configs/banking77_pipeline_gepa_test.toml +96 -0
  16. examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +57 -0
  17. examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +35 -0
  18. examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +51 -0
  19. examples/blog_posts/gepa/configs/hover_gepa_local.toml +57 -0
  20. examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +35 -0
  21. examples/blog_posts/gepa/configs/hover_mipro_local.toml +51 -0
  22. examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +57 -0
  23. examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +35 -0
  24. examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +51 -0
  25. examples/blog_posts/gepa/configs/pupa_gepa_local.toml +58 -0
  26. examples/blog_posts/gepa/configs/pupa_mipro_local.toml +52 -0
  27. examples/blog_posts/gepa/deploy_banking77_task_app.sh +54 -0
  28. examples/blog_posts/gepa/gepa_baseline.py +204 -0
  29. examples/blog_posts/gepa/query_prompts_example.py +97 -0
  30. examples/blog_posts/gepa/run_gepa_banking77.sh +112 -0
  31. examples/blog_posts/gepa/run_gepa_banking77_pipeline.sh +163 -0
  32. examples/blog_posts/gepa/task_apps.py +105 -0
  33. examples/blog_posts/gepa/test_gepa_local.sh +67 -0
  34. examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
  35. examples/blog_posts/mipro/README.md +415 -0
  36. examples/blog_posts/mipro/configs/banking77_mipro_local.toml +91 -0
  37. examples/blog_posts/mipro/configs/banking77_mipro_test.toml +87 -0
  38. examples/blog_posts/mipro/configs/banking77_pipeline_mipro_gemini_flash_lite_local.toml +98 -0
  39. examples/blog_posts/mipro/configs/banking77_pipeline_mipro_gpt41mini_local.toml +96 -0
  40. examples/blog_posts/mipro/configs/banking77_pipeline_mipro_local.toml +94 -0
  41. examples/blog_posts/mipro/configs/banking77_pipeline_mipro_test.toml +170 -0
  42. examples/blog_posts/mipro/deploy_banking77_pipeline_task_app.sh +59 -0
  43. examples/blog_posts/mipro/deploy_banking77_task_app.sh +41 -0
  44. examples/blog_posts/mipro/multi_step.md +79 -0
  45. examples/blog_posts/mipro/run_mipro_banking77.sh +191 -0
  46. examples/blog_posts/mipro/run_mipro_banking77_pipeline.sh +171 -0
  47. examples/blog_posts/mipro/run_mipro_banking77_pipeline_gemini_flash_lite.sh +177 -0
  48. examples/blog_posts/mipro/run_mipro_banking77_pipeline_gpt41mini.sh +173 -0
  49. examples/blog_posts/mipro/verify_banking77_setup.sh +117 -0
  50. examples/blog_posts/pokemon_vl/README.md +98 -0
  51. examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
  52. examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +27 -0
  53. examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
  54. examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
  55. examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +43 -0
  56. examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
  57. examples/blog_posts/pokemon_vl/extract_images.py +239 -0
  58. examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
  59. examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
  60. examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
  61. examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
  62. examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
  63. examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
  64. examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
  65. examples/blog_posts/warming_up_to_rl/README.md +158 -0
  66. examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
  67. examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
  68. examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
  69. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
  70. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
  71. examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
  72. examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
  73. examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
  74. examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
  75. examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +91 -0
  76. examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
  77. examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
  78. examples/crafter_debug_render.py +186 -0
  79. examples/dev/qwen3_32b_qlora_4xh100.toml +45 -0
  80. examples/gepa/banking77_pipeline_gepa.toml +96 -0
  81. examples/gepa/multi_stage_gepa_example.toml +84 -0
  82. examples/gepa/run_gepa_banking77_pipeline.sh +157 -0
  83. examples/multi_step/SFT_README.md +147 -0
  84. examples/multi_step/configs/README_verilog_rl.md +77 -0
  85. examples/multi_step/configs/VERILOG_REWARDS.md +103 -0
  86. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +196 -0
  87. examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
  88. examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
  89. examples/multi_step/configs/crafter_rl_outcome.toml +75 -0
  90. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +145 -0
  91. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +84 -0
  92. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +79 -0
  93. examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
  94. examples/multi_step/configs/crafter_sft_qwen30b_lora.toml +62 -0
  95. examples/multi_step/configs/crafter_synth_backend.md +40 -0
  96. examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
  97. examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
  98. examples/multi_step/configs/verilog_rl_lora.toml +147 -0
  99. examples/multi_step/convert_traces_to_sft.py +84 -0
  100. examples/multi_step/crafter_rl_lora.md +70 -0
  101. examples/multi_step/judges/crafter_backend_judge.py +220 -0
  102. examples/multi_step/judges/verilog_backend_judge.py +234 -0
  103. examples/multi_step/readme.md +48 -0
  104. examples/multi_step/run_sft_qwen30b.sh +45 -0
  105. examples/multi_step/sse_metrics_streaming_notes.md +357 -0
  106. examples/multi_step/task_app_config_notes.md +494 -0
  107. examples/multi_step/verilog_rl_lora.md +218 -0
  108. examples/qwen_coder/README.md +102 -0
  109. examples/qwen_coder/_shared.py +113 -0
  110. examples/qwen_coder/configs/coder_lora_30b.toml +60 -0
  111. examples/qwen_coder/configs/coder_lora_4b.toml +61 -0
  112. examples/qwen_coder/configs/coder_lora_small.toml +57 -0
  113. examples/qwen_coder/generate_dataset.py +98 -0
  114. examples/qwen_coder/infer_ft_smoke.py +65 -0
  115. examples/qwen_coder/infer_prod_proxy.py +73 -0
  116. examples/qwen_coder/infer_via_synth.py +87 -0
  117. examples/qwen_coder/scripts/infer_coder.sh +19 -0
  118. examples/qwen_coder/scripts/train_coder_30b.sh +22 -0
  119. examples/qwen_coder/sft_full_17b.py +103 -0
  120. examples/qwen_coder/sft_lora_30b.py +110 -0
  121. examples/qwen_coder/subset_jsonl.py +39 -0
  122. examples/qwen_coder/todos.md +38 -0
  123. examples/qwen_coder/validate_jsonl.py +60 -0
  124. examples/qwen_vl/BUGS_AND_FIXES.md +232 -0
  125. examples/qwen_vl/IMAGE_VALIDATION_COMPLETE.md +271 -0
  126. examples/qwen_vl/IMAGE_VALIDATION_SUMMARY.md +260 -0
  127. examples/qwen_vl/INFERENCE_SFT_TESTS.md +412 -0
  128. examples/qwen_vl/NEXT_STEPS_2B.md +325 -0
  129. examples/qwen_vl/QUICKSTART.md +327 -0
  130. examples/qwen_vl/QUICKSTART_RL_VISION.md +110 -0
  131. examples/qwen_vl/README.md +152 -0
  132. examples/qwen_vl/RL_VISION_COMPLETE.md +475 -0
  133. examples/qwen_vl/RL_VISION_TESTING.md +333 -0
  134. examples/qwen_vl/SDK_VISION_INTEGRATION.md +328 -0
  135. examples/qwen_vl/SETUP_COMPLETE.md +274 -0
  136. examples/qwen_vl/VISION_TESTS_COMPLETE.md +489 -0
  137. examples/qwen_vl/VLM_PIPELINE_COMPLETE.md +242 -0
  138. examples/qwen_vl/__init__.py +2 -0
  139. examples/qwen_vl/collect_data_via_cli.md +415 -0
  140. examples/qwen_vl/collect_vision_traces.py +368 -0
  141. examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +110 -0
  142. examples/qwen_vl/configs/crafter_vlm_sft_example.toml +59 -0
  143. examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +26 -0
  144. examples/qwen_vl/configs/eval_gpt4o_vision_proper.toml +29 -0
  145. examples/qwen_vl/configs/eval_gpt5nano_vision.toml +26 -0
  146. examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
  147. examples/qwen_vl/configs/filter_qwen3vl_sft.toml +49 -0
  148. examples/qwen_vl/configs/filter_vision_sft.toml +52 -0
  149. examples/qwen_vl/configs/filter_vision_test.toml +8 -0
  150. examples/qwen_vl/configs/sft_qwen3_vl_2b_test.toml +54 -0
  151. examples/qwen_vl/crafter_gpt5nano_agent.py +308 -0
  152. examples/qwen_vl/crafter_qwen_vl_agent.py +300 -0
  153. examples/qwen_vl/run_vision_comparison.sh +61 -0
  154. examples/qwen_vl/run_vision_sft_pipeline.sh +175 -0
  155. examples/qwen_vl/test_image_validation.py +201 -0
  156. examples/qwen_vl/test_sft_vision_data.py +110 -0
  157. examples/rl/README.md +169 -0
  158. examples/rl/configs/eval_base_qwen.toml +17 -0
  159. examples/rl/configs/eval_rl_qwen.toml +13 -0
  160. examples/rl/configs/rl_from_base_qwen.toml +62 -0
  161. examples/rl/configs/rl_from_base_qwen17.toml +80 -0
  162. examples/rl/configs/rl_from_ft_qwen.toml +37 -0
  163. examples/rl/download_dataset.py +80 -0
  164. examples/rl/run_eval.py +436 -0
  165. examples/rl/run_rl_and_save.py +111 -0
  166. examples/rl/task_app/README.md +21 -0
  167. examples/rl/task_app/math_single_step.py +990 -0
  168. examples/rl/task_app/math_task_app.py +111 -0
  169. examples/run_crafter_demo.sh +10 -0
  170. examples/sdk_prompt_learning_example.py +55 -0
  171. examples/sft/README.md +139 -0
  172. examples/sft/configs/crafter_fft_qwen0p6b.toml +49 -0
  173. examples/sft/configs/crafter_lora_qwen0p6b.toml +49 -0
  174. examples/sft/evaluate.py +117 -0
  175. examples/sft/export_dataset.py +120 -0
  176. examples/sft/generate_traces.py +164 -0
  177. examples/swe/__init__.py +12 -0
  178. examples/swe/task_app/README.md +135 -0
  179. examples/swe/task_app/__init__.py +2 -0
  180. examples/swe/task_app/grpo_swe_mini.py +604 -0
  181. examples/swe/task_app/grpo_swe_mini_task_app.py +124 -0
  182. examples/swe/task_app/hosted/README.md +173 -0
  183. examples/swe/task_app/hosted/__init__.py +5 -0
  184. examples/swe/task_app/hosted/branching.py +143 -0
  185. examples/swe/task_app/hosted/environment_routes.py +1289 -0
  186. examples/swe/task_app/hosted/envs/__init__.py +1 -0
  187. examples/swe/task_app/hosted/envs/crafter/__init__.py +6 -0
  188. examples/swe/task_app/hosted/envs/crafter/app.py +1 -0
  189. examples/swe/task_app/hosted/envs/crafter/environment.py +522 -0
  190. examples/swe/task_app/hosted/envs/crafter/policy.py +478 -0
  191. examples/swe/task_app/hosted/envs/crafter/react_agent.py +108 -0
  192. examples/swe/task_app/hosted/envs/crafter/shared.py +305 -0
  193. examples/swe/task_app/hosted/envs/crafter/tools.py +47 -0
  194. examples/swe/task_app/hosted/envs/mini_swe/__init__.py +8 -0
  195. examples/swe/task_app/hosted/envs/mini_swe/environment.py +1191 -0
  196. examples/swe/task_app/hosted/envs/mini_swe/policy.py +355 -0
  197. examples/swe/task_app/hosted/envs/mini_swe/shared.py +83 -0
  198. examples/swe/task_app/hosted/envs/mini_swe/tools.py +96 -0
  199. examples/swe/task_app/hosted/hosted_app.py +204 -0
  200. examples/swe/task_app/hosted/inference/__init__.py +5 -0
  201. examples/swe/task_app/hosted/inference/openai_client.py +584 -0
  202. examples/swe/task_app/hosted/main.py +100 -0
  203. examples/swe/task_app/hosted/policy_routes.py +1094 -0
  204. examples/swe/task_app/hosted/registry.py +195 -0
  205. examples/swe/task_app/hosted/rollout.py +1905 -0
  206. examples/swe/task_app/hosted/storage/__init__.py +5 -0
  207. examples/swe/task_app/hosted/storage/volume.py +211 -0
  208. examples/swe/task_app/hosted/test_agents.py +161 -0
  209. examples/swe/task_app/hosted/test_service.py +136 -0
  210. examples/swe/task_app/hosted/utils.py +62 -0
  211. examples/swe/task_app/morph_backend.py +178 -0
  212. examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
  213. examples/task_apps/TESTING.md +275 -0
  214. examples/task_apps/banking77/__init__.py +6 -0
  215. examples/task_apps/banking77/banking77_task_app.py +912 -0
  216. examples/task_apps/banking77/deploy_wrapper.py +46 -0
  217. examples/task_apps/banking77_pipeline/__init__.py +6 -0
  218. examples/task_apps/banking77_pipeline/banking77_pipeline_task_app.py +489 -0
  219. examples/task_apps/banking77_pipeline/deploy_wrapper.py +50 -0
  220. examples/task_apps/crafter/CREATE_SFT_DATASET.md +286 -0
  221. examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
  222. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +187 -0
  223. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +281 -0
  224. examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
  225. examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
  226. examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
  227. examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
  228. examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
  229. examples/task_apps/crafter/task_app/README.md +42 -0
  230. examples/task_apps/crafter/task_app/__init__.py +5 -0
  231. examples/task_apps/crafter/task_app/grpo_crafter.py +1055 -0
  232. examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +146 -0
  233. examples/task_apps/crafter/task_app/synth_envs_hosted/README.md +173 -0
  234. examples/task_apps/crafter/task_app/synth_envs_hosted/__init__.py +5 -0
  235. examples/task_apps/crafter/task_app/synth_envs_hosted/branching.py +143 -0
  236. examples/task_apps/crafter/task_app/synth_envs_hosted/environment_routes.py +1226 -0
  237. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/__init__.py +1 -0
  238. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
  239. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
  240. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +532 -0
  241. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +583 -0
  242. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +122 -0
  243. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
  244. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
  245. examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +253 -0
  246. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/__init__.py +5 -0
  247. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +999 -0
  248. examples/task_apps/crafter/task_app/synth_envs_hosted/main.py +100 -0
  249. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +1252 -0
  250. examples/task_apps/crafter/task_app/synth_envs_hosted/registry.py +195 -0
  251. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +2233 -0
  252. examples/task_apps/crafter/task_app/synth_envs_hosted/storage/__init__.py +5 -0
  253. examples/task_apps/crafter/task_app/synth_envs_hosted/storage/volume.py +211 -0
  254. examples/task_apps/crafter/task_app/synth_envs_hosted/test_agents.py +161 -0
  255. examples/task_apps/crafter/task_app/synth_envs_hosted/test_service.py +136 -0
  256. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +411 -0
  257. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  258. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  259. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  260. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  261. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  262. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  263. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  264. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  265. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  266. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  267. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  268. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  269. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  270. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  271. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  272. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  273. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  274. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  275. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  276. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  277. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  278. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  279. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  280. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  281. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  282. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  283. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  284. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  285. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  286. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  287. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  288. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  289. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  290. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  291. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  292. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  293. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  294. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  295. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  296. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  297. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  298. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  299. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  300. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  301. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  302. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  303. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  304. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  305. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  306. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  307. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  308. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  309. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  310. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  311. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  312. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  313. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  314. examples/task_apps/enron/__init__.py +2 -0
  315. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  316. examples/task_apps/enron/filter_sft.toml +5 -0
  317. examples/task_apps/enron/task_app/README.md +14 -0
  318. examples/task_apps/enron/task_app/__init__.py +1 -0
  319. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  320. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  321. examples/task_apps/enron/tests/__init__.py +4 -0
  322. examples/task_apps/enron/tests/conftest.py +115 -0
  323. examples/task_apps/enron/tests/integration/__init__.py +4 -0
  324. examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
  325. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  326. examples/task_apps/enron/tests/unit/__init__.py +4 -0
  327. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  328. examples/task_apps/gepa_benchmarks/__init__.py +7 -0
  329. examples/task_apps/gepa_benchmarks/common.py +260 -0
  330. examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
  331. examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
  332. examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
  333. examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
  334. examples/task_apps/math/README.md +21 -0
  335. examples/task_apps/math/math_single_step.py +1000 -0
  336. examples/task_apps/math/math_task_app.py +115 -0
  337. examples/task_apps/pokemon_battle/__init__.py +2 -0
  338. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  339. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  340. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  341. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  342. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
  343. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
  344. examples/task_apps/pokemon_red/README.md +356 -0
  345. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +428 -0
  346. examples/task_apps/pokemon_red/__init__.py +3 -0
  347. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +30 -0
  348. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +224 -0
  349. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
  350. examples/task_apps/pokemon_red/task_app.py +1048 -0
  351. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
  352. examples/task_apps/sokoban/README.md +306 -0
  353. examples/task_apps/sokoban/__init__.py +3 -0
  354. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  355. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  356. examples/task_apps/sokoban/filter_sft.toml +5 -0
  357. examples/task_apps/sokoban/task_app.py +1058 -0
  358. examples/task_apps/sokoban/tests/__init__.py +4 -0
  359. examples/task_apps/sokoban/tests/conftest.py +113 -0
  360. examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
  361. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  362. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  363. examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
  364. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  365. examples/task_apps/verilog/__init__.py +1 -0
  366. examples/task_apps/verilog/eval_groq_qwen32b.toml +22 -0
  367. examples/task_apps/verilog/filter_sft.toml +5 -0
  368. examples/task_apps/verilog/task_app/README.md +12 -0
  369. examples/task_apps/verilog/task_app/__init__.py +1 -0
  370. examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
  371. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  372. examples/task_apps/verilog/tests/__init__.py +4 -0
  373. examples/task_apps/verilog/tests/conftest.py +115 -0
  374. examples/task_apps/verilog/tests/integration/__init__.py +4 -0
  375. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
  376. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  377. examples/task_apps/verilog/tests/unit/__init__.py +4 -0
  378. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  379. examples/tunnel_gepa_banking77/README.md +106 -0
  380. examples/tunnel_gepa_banking77/banking77_gepa_tunnel.toml +95 -0
  381. examples/tunnel_gepa_banking77/keep_tunnel_running.py +60 -0
  382. examples/tunnel_gepa_banking77/run_gepa_with_tunnel.sh +226 -0
  383. examples/vlm/PROPOSAL.md +53 -0
  384. examples/vlm/README.md +68 -0
  385. examples/vlm/configs/crafter_vlm_gpt4o.toml +49 -0
  386. examples/vlm/crafter_image_only_agent.py +207 -0
  387. examples/vlm/crafter_openai_vlm_agent.py +275 -0
  388. examples/vlm/filter_image_rows.py +63 -0
  389. examples/vlm/run_crafter_vlm_benchmark.py +316 -0
  390. examples/warming_up_to_rl/_utils.py +92 -0
  391. examples/warming_up_to_rl/analyze_trace_db.py +422 -0
  392. examples/warming_up_to_rl/configs/crafter_fft.toml +53 -0
  393. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +54 -0
  394. examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +22 -0
  395. examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +15 -0
  396. examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +24 -0
  397. examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +35 -0
  398. examples/warming_up_to_rl/configs/eval_stepwise_consistent.toml +26 -0
  399. examples/warming_up_to_rl/configs/eval_stepwise_per_achievement.toml +36 -0
  400. examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +32 -0
  401. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +85 -0
  402. examples/warming_up_to_rl/configs/rl_from_ft.toml +58 -0
  403. examples/warming_up_to_rl/export_trace_sft.py +837 -0
  404. examples/warming_up_to_rl/groq_test.py +97 -0
  405. examples/warming_up_to_rl/manage_secrets.py +131 -0
  406. examples/warming_up_to_rl/old/event_rewards.md +234 -0
  407. examples/warming_up_to_rl/old/notes.md +73 -0
  408. examples/warming_up_to_rl/readme.md +110 -0
  409. examples/warming_up_to_rl/run_eval.py +736 -0
  410. examples/warming_up_to_rl/run_fft_and_save.py +380 -0
  411. examples/warming_up_to_rl/run_local_rollout.py +239 -0
  412. examples/warming_up_to_rl/run_local_rollout_modal.py +248 -0
  413. examples/warming_up_to_rl/run_local_rollout_parallel.py +405 -0
  414. examples/warming_up_to_rl/run_local_rollout_traced.py +477 -0
  415. examples/warming_up_to_rl/run_rl_and_save.py +124 -0
  416. examples/warming_up_to_rl/run_rollout_remote.py +156 -0
  417. examples/warming_up_to_rl/task_app/README.md +42 -0
  418. examples/warming_up_to_rl/task_app/grpo_crafter.py +876 -0
  419. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +135 -0
  420. examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
  421. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
  422. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +143 -0
  423. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1226 -0
  424. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
  425. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
  426. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
  427. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +522 -0
  428. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +454 -0
  429. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +108 -0
  430. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
  431. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
  432. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +253 -0
  433. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
  434. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +729 -0
  435. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +100 -0
  436. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +1114 -0
  437. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +195 -0
  438. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1891 -0
  439. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
  440. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +211 -0
  441. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +161 -0
  442. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +137 -0
  443. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +129 -0
  444. examples/workflows/math_rl/configs/eval_base_qwen.toml +15 -0
  445. examples/workflows/math_rl/configs/eval_rl_qwen.toml +11 -0
  446. examples/workflows/math_rl/configs/rl_from_base_qwen.toml +62 -0
  447. examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +80 -0
  448. examples/workflows/math_rl/configs/rl_from_ft_qwen.toml +35 -0
  449. examples/workflows/math_rl/download_dataset.py +80 -0
  450. examples/workflows/math_rl/run_eval.py +436 -0
  451. examples/workflows/math_rl/run_rl_and_save.py +111 -0
  452. synth_ai/__init__.py +47 -23
  453. synth_ai/_utils/__init__.py +47 -0
  454. synth_ai/_utils/base_url.py +10 -0
  455. synth_ai/_utils/http.py +10 -0
  456. synth_ai/_utils/prompts.py +10 -0
  457. synth_ai/_utils/task_app_state.py +12 -0
  458. synth_ai/_utils/user_config.py +10 -0
  459. synth_ai/api/models/supported.py +514 -0
  460. synth_ai/api/train/__init__.py +63 -0
  461. synth_ai/api/train/builders.py +473 -0
  462. synth_ai/api/train/cli.py +1185 -0
  463. synth_ai/api/train/config_finder.py +246 -0
  464. synth_ai/api/train/configs/__init__.py +65 -0
  465. synth_ai/api/train/configs/prompt_learning.py +496 -0
  466. synth_ai/api/train/configs/rl.py +188 -0
  467. synth_ai/api/train/configs/sft.py +99 -0
  468. synth_ai/api/train/configs/shared.py +81 -0
  469. synth_ai/api/train/env_resolver.py +352 -0
  470. synth_ai/api/train/pollers.py +91 -0
  471. synth_ai/api/train/prompt_learning.py +425 -0
  472. synth_ai/api/train/sft.py +390 -0
  473. synth_ai/api/train/supported_algos.py +147 -0
  474. synth_ai/api/train/task_app.py +195 -0
  475. synth_ai/api/train/utils.py +244 -0
  476. synth_ai/api/train/validators.py +1117 -0
  477. synth_ai/api/tunnel.py +49 -0
  478. synth_ai/auth/credentials.py +94 -0
  479. synth_ai/baseline/__init__.py +25 -0
  480. synth_ai/baseline/config.py +209 -0
  481. synth_ai/baseline/discovery.py +214 -0
  482. synth_ai/baseline/execution.py +146 -0
  483. synth_ai/cfgs.py +227 -0
  484. synth_ai/cli/__init__.py +90 -45
  485. synth_ai/cli/_modal_wrapper.py +31 -0
  486. synth_ai/cli/_storage.py +20 -0
  487. synth_ai/cli/_typer_patch.py +47 -0
  488. synth_ai/cli/_validate_task_app.py +29 -0
  489. synth_ai/cli/balance.py +16 -4
  490. synth_ai/cli/calc.py +36 -21
  491. synth_ai/cli/claude.py +70 -0
  492. synth_ai/cli/codex.py +267 -0
  493. synth_ai/cli/commands/__init__.py +18 -0
  494. synth_ai/cli/commands/baseline/__init__.py +12 -0
  495. synth_ai/cli/commands/baseline/core.py +637 -0
  496. synth_ai/cli/commands/baseline/list.py +93 -0
  497. synth_ai/cli/commands/demo/__init__.py +6 -0
  498. synth_ai/cli/commands/demo/core.py +163 -0
  499. synth_ai/cli/commands/eval/__init__.py +19 -0
  500. synth_ai/cli/commands/eval/core.py +1112 -0
  501. synth_ai/cli/commands/eval/errors.py +81 -0
  502. synth_ai/cli/commands/eval/validation.py +133 -0
  503. synth_ai/cli/commands/filter/__init__.py +12 -0
  504. synth_ai/cli/commands/filter/core.py +424 -0
  505. synth_ai/cli/commands/filter/errors.py +55 -0
  506. synth_ai/cli/commands/filter/validation.py +77 -0
  507. synth_ai/cli/commands/help/__init__.py +185 -0
  508. synth_ai/cli/commands/help/core.py +72 -0
  509. synth_ai/cli/commands/smoke/__init__.py +7 -0
  510. synth_ai/cli/commands/smoke/core.py +1437 -0
  511. synth_ai/cli/commands/status/__init__.py +66 -0
  512. synth_ai/cli/commands/status/client.py +192 -0
  513. synth_ai/cli/commands/status/config.py +92 -0
  514. synth_ai/cli/commands/status/errors.py +20 -0
  515. synth_ai/cli/commands/status/formatters.py +164 -0
  516. synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
  517. synth_ai/cli/commands/status/subcommands/files.py +79 -0
  518. synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
  519. synth_ai/cli/commands/status/subcommands/models.py +79 -0
  520. synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
  521. synth_ai/cli/commands/status/subcommands/runs.py +81 -0
  522. synth_ai/cli/commands/status/subcommands/session.py +183 -0
  523. synth_ai/cli/commands/status/subcommands/summary.py +47 -0
  524. synth_ai/cli/commands/status/subcommands/usage.py +203 -0
  525. synth_ai/cli/commands/status/utils.py +114 -0
  526. synth_ai/cli/commands/train/__init__.py +53 -0
  527. synth_ai/cli/commands/train/core.py +21 -0
  528. synth_ai/cli/commands/train/errors.py +117 -0
  529. synth_ai/cli/commands/train/judge_schemas.py +200 -0
  530. synth_ai/cli/commands/train/judge_validation.py +305 -0
  531. synth_ai/cli/commands/train/validation.py +386 -0
  532. synth_ai/cli/demo.py +32 -140
  533. synth_ai/cli/deploy.py +233 -0
  534. synth_ai/cli/eval/__init__.py +36 -0
  535. synth_ai/cli/eval/core.py +5 -0
  536. synth_ai/cli/eval/errors.py +31 -0
  537. synth_ai/cli/eval/validation.py +5 -0
  538. synth_ai/cli/filter/__init__.py +28 -0
  539. synth_ai/cli/filter/core.py +5 -0
  540. synth_ai/cli/filter/errors.py +23 -0
  541. synth_ai/cli/filter/validation.py +5 -0
  542. synth_ai/cli/legacy_root_backup.py +28 -22
  543. synth_ai/cli/lib/__init__.py +10 -0
  544. synth_ai/cli/lib/task_app_discovery.py +7 -0
  545. synth_ai/cli/lib/task_app_env.py +518 -0
  546. synth_ai/cli/mcp.py +34 -0
  547. synth_ai/cli/modal_serve/__init__.py +12 -0
  548. synth_ai/cli/modal_serve/core.py +14 -0
  549. synth_ai/cli/modal_serve/errors.py +8 -0
  550. synth_ai/cli/modal_serve/validation.py +11 -0
  551. synth_ai/cli/opencode.py +256 -0
  552. synth_ai/cli/recent.py +13 -7
  553. synth_ai/cli/rl_demo.py +166 -114
  554. synth_ai/cli/root.py +143 -112
  555. synth_ai/cli/serve/__init__.py +12 -0
  556. synth_ai/cli/serve/core.py +14 -0
  557. synth_ai/cli/serve/errors.py +8 -0
  558. synth_ai/cli/serve/validation.py +11 -0
  559. synth_ai/cli/setup.py +49 -0
  560. synth_ai/cli/status.py +7 -125
  561. synth_ai/cli/task_app_deploy.py +7 -0
  562. synth_ai/cli/task_app_list.py +25 -0
  563. synth_ai/cli/task_app_modal_serve.py +11 -0
  564. synth_ai/cli/task_app_serve.py +11 -0
  565. synth_ai/cli/task_apps.py +3134 -0
  566. synth_ai/cli/traces.py +9 -5
  567. synth_ai/cli/train/__init__.py +12 -0
  568. synth_ai/cli/train/core.py +21 -0
  569. synth_ai/cli/train/errors.py +8 -0
  570. synth_ai/cli/train/validation.py +24 -0
  571. synth_ai/cli/train.py +5 -0
  572. synth_ai/cli/turso.py +73 -0
  573. synth_ai/cli/watch.py +13 -18
  574. synth_ai/demos/__init__.py +10 -0
  575. synth_ai/demos/core/__init__.py +28 -1
  576. synth_ai/demos/core/cli.py +745 -416
  577. synth_ai/demos/crafter/__init__.py +1 -0
  578. synth_ai/demos/crafter/crafter_fft_4b.toml +55 -0
  579. synth_ai/demos/crafter/grpo_crafter_task_app.py +185 -0
  580. synth_ai/demos/crafter/rl_from_base_qwen4b.toml +74 -0
  581. synth_ai/demos/demo_registry.py +176 -0
  582. synth_ai/demos/demo_task_apps/__init__.py +7 -1
  583. synth_ai/demos/demo_task_apps/core.py +75 -37
  584. synth_ai/demos/demo_task_apps/crafter/__init__.py +1 -0
  585. synth_ai/demos/demo_task_apps/crafter/configs/crafter_fft_4b.toml +53 -0
  586. synth_ai/demos/demo_task_apps/crafter/configs/rl_from_base_qwen4b.toml +73 -0
  587. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +184 -0
  588. synth_ai/demos/demo_task_apps/math/_common.py +1 -2
  589. synth_ai/demos/demo_task_apps/math/app.py +2 -1
  590. synth_ai/demos/demo_task_apps/math/config.toml +55 -110
  591. synth_ai/demos/demo_task_apps/math/deploy_modal.py +3 -6
  592. synth_ai/demos/demo_task_apps/math/modal_task_app.py +491 -166
  593. synth_ai/demos/demo_task_apps/math/task_app_entry.py +37 -0
  594. synth_ai/demos/math/__init__.py +1 -0
  595. synth_ai/demos/math/_common.py +16 -0
  596. synth_ai/demos/math/app.py +38 -0
  597. synth_ai/demos/math/config.toml +76 -0
  598. synth_ai/demos/math/deploy_modal.py +54 -0
  599. synth_ai/demos/math/modal_task_app.py +703 -0
  600. synth_ai/demos/math/task_app_entry.py +51 -0
  601. synth_ai/environments/environment/core.py +7 -1
  602. synth_ai/environments/examples/bandit/engine.py +12 -5
  603. synth_ai/environments/examples/bandit/environment.py +0 -1
  604. synth_ai/environments/examples/bandit/taskset.py +4 -4
  605. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
  606. synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
  607. synth_ai/environments/examples/crafter_classic/environment.py +93 -2
  608. synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
  609. synth_ai/environments/examples/enron/engine.py +7 -2
  610. synth_ai/environments/examples/enron/environment.py +68 -0
  611. synth_ai/environments/examples/red/engine.py +60 -12
  612. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  613. synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
  614. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  615. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  616. synth_ai/environments/examples/red/environment.py +86 -0
  617. synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
  618. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  619. synth_ai/environments/examples/verilog/engine.py +104 -12
  620. synth_ai/environments/examples/wordle/environment.py +0 -1
  621. synth_ai/environments/reproducibility/tree.py +5 -6
  622. synth_ai/environments/service/app.py +11 -12
  623. synth_ai/environments/service/core_routes.py +10 -9
  624. synth_ai/environments/stateful/engine.py +1 -1
  625. synth_ai/environments/tasks/core.py +1 -0
  626. synth_ai/environments/tasks/filters.py +5 -6
  627. synth_ai/environments/tasks/utils.py +4 -5
  628. synth_ai/evals/__init__.py +15 -0
  629. synth_ai/evals/base.py +14 -5
  630. synth_ai/evals/client.py +82 -0
  631. synth_ai/evals/types.py +42 -0
  632. synth_ai/http.py +8 -22
  633. synth_ai/http_client.py +45 -12
  634. synth_ai/inference/__init__.py +0 -2
  635. synth_ai/inference/client.py +21 -7
  636. synth_ai/jobs/client.py +129 -80
  637. synth_ai/judge_schemas.py +127 -0
  638. synth_ai/learning/__init__.py +51 -6
  639. synth_ai/learning/algorithms.py +14 -0
  640. synth_ai/learning/client.py +122 -30
  641. synth_ai/learning/config.py +2 -40
  642. synth_ai/learning/constants.py +0 -2
  643. synth_ai/learning/ft_client.py +4 -56
  644. synth_ai/learning/health.py +14 -8
  645. synth_ai/learning/jobs.py +43 -47
  646. synth_ai/learning/prompt_learning_client.py +276 -0
  647. synth_ai/learning/prompt_learning_types.py +185 -0
  648. synth_ai/{rl → learning/rl}/__init__.py +14 -5
  649. synth_ai/learning/rl/client.py +269 -0
  650. synth_ai/learning/rl/config.py +31 -0
  651. synth_ai/{rl → learning/rl}/contracts.py +5 -10
  652. synth_ai/{rl → learning/rl}/env_keys.py +45 -16
  653. synth_ai/learning/rl/secrets.py +13 -0
  654. synth_ai/learning/rl_client.py +2 -253
  655. synth_ai/learning/sft/__init__.py +29 -0
  656. synth_ai/learning/sft/client.py +68 -0
  657. synth_ai/learning/sft/config.py +270 -0
  658. synth_ai/learning/sft/data.py +698 -0
  659. synth_ai/learning/sse.py +25 -26
  660. synth_ai/learning/validators.py +29 -25
  661. synth_ai/mcp/__init__.py +5 -0
  662. synth_ai/mcp/__main__.py +8 -0
  663. synth_ai/mcp/main.py +254 -0
  664. synth_ai/mcp/setup.py +100 -0
  665. synth_ai/modal.py +257 -0
  666. synth_ai/pricing/__init__.py +3 -0
  667. synth_ai/pricing/model_pricing.py +64 -0
  668. synth_ai/session/__init__.py +75 -0
  669. synth_ai/session/client.py +383 -0
  670. synth_ai/session/constants.py +63 -0
  671. synth_ai/session/exceptions.py +105 -0
  672. synth_ai/session/manager.py +139 -0
  673. synth_ai/session/models.py +89 -0
  674. synth_ai/session/query.py +110 -0
  675. synth_ai/spec/__init__.py +46 -0
  676. synth_ai/spec/dataclasses.py +149 -0
  677. synth_ai/spec/loader.py +144 -0
  678. synth_ai/spec/serializer.py +199 -0
  679. synth_ai/spec/validation.py +250 -0
  680. synth_ai/streaming/__init__.py +29 -0
  681. synth_ai/streaming/config.py +94 -0
  682. synth_ai/streaming/handlers.py +589 -0
  683. synth_ai/streaming/streamer.py +320 -0
  684. synth_ai/streaming/types.py +95 -0
  685. synth_ai/task/__init__.py +116 -3
  686. synth_ai/task/apps/__init__.py +132 -0
  687. synth_ai/task/auth.py +165 -0
  688. synth_ai/task/client.py +167 -0
  689. synth_ai/task/config.py +261 -0
  690. synth_ai/task/contracts.py +173 -57
  691. synth_ai/task/datasets.py +108 -0
  692. synth_ai/task/errors.py +50 -0
  693. synth_ai/task/health.py +17 -11
  694. synth_ai/task/inference_api.py +101 -0
  695. synth_ai/task/json.py +111 -0
  696. synth_ai/task/proxy.py +251 -0
  697. synth_ai/task/rubrics/__init__.py +55 -0
  698. synth_ai/task/rubrics/loaders.py +156 -0
  699. synth_ai/task/rubrics/models.py +57 -0
  700. synth_ai/task/rubrics/scoring.py +116 -0
  701. synth_ai/task/rubrics/strict.py +149 -0
  702. synth_ai/task/rubrics.py +219 -0
  703. synth_ai/task/server.py +432 -0
  704. synth_ai/task/trace_correlation_helpers.py +328 -0
  705. synth_ai/task/tracing_utils.py +95 -0
  706. synth_ai/task/validators.py +449 -6
  707. synth_ai/task/vendors.py +59 -0
  708. synth_ai/tracing_v3/__init__.py +4 -0
  709. synth_ai/tracing_v3/abstractions.py +21 -4
  710. synth_ai/tracing_v3/config.py +167 -22
  711. synth_ai/tracing_v3/constants.py +21 -0
  712. synth_ai/tracing_v3/db_config.py +42 -29
  713. synth_ai/tracing_v3/decorators.py +80 -45
  714. synth_ai/tracing_v3/examples/basic_usage.py +15 -9
  715. synth_ai/tracing_v3/hooks.py +6 -4
  716. synth_ai/tracing_v3/llm_call_record_helpers.py +161 -61
  717. synth_ai/tracing_v3/migration_helper.py +1 -2
  718. synth_ai/tracing_v3/replica_sync.py +12 -7
  719. synth_ai/tracing_v3/serialization.py +130 -0
  720. synth_ai/tracing_v3/session_tracer.py +86 -21
  721. synth_ai/tracing_v3/storage/base.py +98 -12
  722. synth_ai/tracing_v3/storage/config.py +63 -16
  723. synth_ai/tracing_v3/storage/factory.py +11 -9
  724. synth_ai/tracing_v3/storage/utils.py +15 -11
  725. synth_ai/tracing_v3/trace_utils.py +317 -0
  726. synth_ai/tracing_v3/turso/__init__.py +8 -21
  727. synth_ai/tracing_v3/turso/daemon.py +123 -15
  728. synth_ai/tracing_v3/turso/models.py +5 -2
  729. synth_ai/tracing_v3/turso/native_manager.py +1293 -0
  730. synth_ai/tracing_v3/utils.py +5 -4
  731. synth_ai/tunnel.py +143 -0
  732. synth_ai/tunnel_deploy.py +278 -0
  733. synth_ai/types.py +8 -0
  734. synth_ai/urls.py +11 -0
  735. synth_ai/utils/__init__.py +166 -0
  736. synth_ai/utils/agents.py +74 -0
  737. synth_ai/utils/apps.py +152 -0
  738. synth_ai/utils/base_url.py +94 -0
  739. synth_ai/utils/bin.py +39 -0
  740. synth_ai/utils/claude.py +36 -0
  741. synth_ai/utils/cli.py +284 -0
  742. synth_ai/utils/config.py +81 -0
  743. synth_ai/utils/env.py +346 -0
  744. synth_ai/utils/errors.py +85 -0
  745. synth_ai/utils/http.py +172 -0
  746. synth_ai/utils/json.py +72 -0
  747. synth_ai/utils/log_filter.py +99 -0
  748. synth_ai/utils/logging.py +198 -0
  749. synth_ai/utils/modal.py +299 -0
  750. synth_ai/utils/paths.py +95 -0
  751. synth_ai/utils/process.py +233 -0
  752. synth_ai/utils/prompts.py +39 -0
  753. synth_ai/utils/sqld.py +122 -0
  754. synth_ai/utils/ssl.py +25 -0
  755. synth_ai/utils/task_app_discovery.py +882 -0
  756. synth_ai/utils/task_app_env.py +186 -0
  757. synth_ai/utils/task_app_state.py +318 -0
  758. synth_ai/utils/tunnel/__init__.py +12 -0
  759. synth_ai/utils/tunnel/config.py +55 -0
  760. synth_ai/utils/user_config.py +137 -0
  761. synth_ai/uvicorn.py +77 -0
  762. synth_ai-0.2.23.dev3.dist-info/METADATA +357 -0
  763. synth_ai-0.2.23.dev3.dist-info/RECORD +983 -0
  764. {synth_ai-0.2.8.dev4.dist-info → synth_ai-0.2.23.dev3.dist-info}/entry_points.txt +0 -1
  765. {synth_ai-0.2.8.dev4.dist-info → synth_ai-0.2.23.dev3.dist-info}/top_level.txt +1 -0
  766. synth_ai/cli/man.py +0 -106
  767. synth_ai/core/experiment.py +0 -15
  768. synth_ai/core/system.py +0 -15
  769. synth_ai/environments/examples/sokoban/units/astar_common.py +0 -95
  770. synth_ai/experimental/synth_oss.py +0 -446
  771. synth_ai/handshake.py +0 -63
  772. synth_ai/install_sqld.sh +0 -40
  773. synth_ai/learning/offline/dpo.py +0 -0
  774. synth_ai/learning/offline/providers.py +0 -7
  775. synth_ai/learning/offline/sft.py +0 -0
  776. synth_ai/learning/offline/shared.py +0 -0
  777. synth_ai/learning/online/grpo.py +0 -0
  778. synth_ai/learning/online/irft.py +0 -0
  779. synth_ai/learning/prompts/banking77_injection_eval.py +0 -168
  780. synth_ai/learning/prompts/gepa.py +0 -0
  781. synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +0 -213
  782. synth_ai/learning/prompts/mipro.py +0 -289
  783. synth_ai/learning/prompts/random_search.py +0 -246
  784. synth_ai/learning/prompts/run_mipro_banking77.py +0 -172
  785. synth_ai/learning/prompts/run_random_search_banking77.py +0 -324
  786. synth_ai/lm/__init__.py +0 -51
  787. synth_ai/lm/caching/constants.py +0 -6
  788. synth_ai/lm/caching/dbs.py +0 -0
  789. synth_ai/lm/caching/ephemeral.py +0 -102
  790. synth_ai/lm/caching/handler.py +0 -137
  791. synth_ai/lm/caching/initialize.py +0 -11
  792. synth_ai/lm/caching/persistent.py +0 -114
  793. synth_ai/lm/config.py +0 -110
  794. synth_ai/lm/constants.py +0 -32
  795. synth_ai/lm/core/__init__.py +0 -8
  796. synth_ai/lm/core/all.py +0 -73
  797. synth_ai/lm/core/exceptions.py +0 -7
  798. synth_ai/lm/core/main.py +0 -319
  799. synth_ai/lm/core/main_v3.py +0 -594
  800. synth_ai/lm/core/synth_models.py +0 -48
  801. synth_ai/lm/core/vendor_clients.py +0 -188
  802. synth_ai/lm/cost/monitor.py +0 -1
  803. synth_ai/lm/cost/statefulness.py +0 -1
  804. synth_ai/lm/injection.py +0 -80
  805. synth_ai/lm/overrides.py +0 -206
  806. synth_ai/lm/provider_support/__init__.py +0 -8
  807. synth_ai/lm/provider_support/anthropic.py +0 -972
  808. synth_ai/lm/provider_support/openai.py +0 -1139
  809. synth_ai/lm/provider_support/suppress_logging.py +0 -31
  810. synth_ai/lm/structured_outputs/handler.py +0 -440
  811. synth_ai/lm/structured_outputs/inject.py +0 -297
  812. synth_ai/lm/structured_outputs/rehabilitate.py +0 -185
  813. synth_ai/lm/tools/__init__.py +0 -3
  814. synth_ai/lm/tools/base.py +0 -172
  815. synth_ai/lm/unified_interface.py +0 -202
  816. synth_ai/lm/vendors/base.py +0 -81
  817. synth_ai/lm/vendors/core/anthropic_api.py +0 -387
  818. synth_ai/lm/vendors/core/gemini_api.py +0 -292
  819. synth_ai/lm/vendors/core/mistral_api.py +0 -322
  820. synth_ai/lm/vendors/core/openai_api.py +0 -225
  821. synth_ai/lm/vendors/core/synth_dev_api.py +0 -0
  822. synth_ai/lm/vendors/local/ollama.py +0 -0
  823. synth_ai/lm/vendors/openai_standard.py +0 -780
  824. synth_ai/lm/vendors/openai_standard_responses.py +0 -256
  825. synth_ai/lm/vendors/retries.py +0 -22
  826. synth_ai/lm/vendors/supported/custom_endpoint.py +0 -417
  827. synth_ai/lm/vendors/supported/deepseek.py +0 -69
  828. synth_ai/lm/vendors/supported/grok.py +0 -75
  829. synth_ai/lm/vendors/supported/groq.py +0 -16
  830. synth_ai/lm/vendors/supported/ollama.py +0 -15
  831. synth_ai/lm/vendors/supported/openrouter.py +0 -74
  832. synth_ai/lm/vendors/supported/together.py +0 -11
  833. synth_ai/lm/vendors/synth_client.py +0 -808
  834. synth_ai/lm/warmup.py +0 -186
  835. synth_ai/rl/secrets.py +0 -19
  836. synth_ai/scripts/verify_rewards.py +0 -100
  837. synth_ai/tracing/__init__.py +0 -30
  838. synth_ai/tracing_v1/__init__.py +0 -33
  839. synth_ai/tracing_v3/turso/manager.py +0 -760
  840. synth_ai/v0/tracing/abstractions.py +0 -224
  841. synth_ai/v0/tracing/base_client.py +0 -91
  842. synth_ai/v0/tracing/client_manager.py +0 -131
  843. synth_ai/v0/tracing/config.py +0 -142
  844. synth_ai/v0/tracing/context.py +0 -146
  845. synth_ai/v0/tracing/decorators.py +0 -682
  846. synth_ai/v0/tracing/events/__init__.py +0 -0
  847. synth_ai/v0/tracing/events/manage.py +0 -147
  848. synth_ai/v0/tracing/events/scope.py +0 -86
  849. synth_ai/v0/tracing/events/store.py +0 -228
  850. synth_ai/v0/tracing/immediate_client.py +0 -151
  851. synth_ai/v0/tracing/local.py +0 -18
  852. synth_ai/v0/tracing/log_client_base.py +0 -73
  853. synth_ai/v0/tracing/retry_queue.py +0 -186
  854. synth_ai/v0/tracing/trackers.py +0 -515
  855. synth_ai/v0/tracing/upload.py +0 -512
  856. synth_ai/v0/tracing/utils.py +0 -9
  857. synth_ai/v0/tracing_v1/__init__.py +0 -16
  858. synth_ai/v0/tracing_v1/abstractions.py +0 -224
  859. synth_ai/v0/tracing_v1/base_client.py +0 -91
  860. synth_ai/v0/tracing_v1/client_manager.py +0 -131
  861. synth_ai/v0/tracing_v1/config.py +0 -142
  862. synth_ai/v0/tracing_v1/context.py +0 -146
  863. synth_ai/v0/tracing_v1/decorators.py +0 -703
  864. synth_ai/v0/tracing_v1/events/__init__.py +0 -0
  865. synth_ai/v0/tracing_v1/events/manage.py +0 -147
  866. synth_ai/v0/tracing_v1/events/scope.py +0 -86
  867. synth_ai/v0/tracing_v1/events/store.py +0 -228
  868. synth_ai/v0/tracing_v1/immediate_client.py +0 -151
  869. synth_ai/v0/tracing_v1/local.py +0 -18
  870. synth_ai/v0/tracing_v1/log_client_base.py +0 -73
  871. synth_ai/v0/tracing_v1/retry_queue.py +0 -186
  872. synth_ai/v0/tracing_v1/trackers.py +0 -515
  873. synth_ai/v0/tracing_v1/upload.py +0 -527
  874. synth_ai/v0/tracing_v1/utils.py +0 -9
  875. synth_ai/zyk/__init__.py +0 -30
  876. synth_ai-0.2.8.dev4.dist-info/METADATA +0 -129
  877. synth_ai-0.2.8.dev4.dist-info/RECORD +0 -420
  878. {synth_ai/lm/caching → examples/task_apps}/__init__.py +0 -0
  879. {synth_ai/lm/cost → examples/task_apps/crafter}/__init__.py +0 -0
  880. {synth_ai/lm/structured_outputs → examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server}/__init__.py +0 -0
  881. {synth_ai/lm/vendors → examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests}/__init__.py +0 -0
  882. {synth_ai/lm/vendors/core → examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils}/__init__.py +0 -0
  883. {synth_ai/lm/vendors/local → examples/task_apps/math}/__init__.py +0 -0
  884. {synth_ai/lm/vendors/supported → examples/workflows}/__init__.py +0 -0
  885. {synth_ai/v0/tracing → examples/workflows/math_rl}/__init__.py +0 -0
  886. /synth_ai/{compound/cais.py → cli/__main__.py} +0 -0
  887. /synth_ai/{learning/filtering.py → py.typed} +0 -0
  888. {synth_ai-0.2.8.dev4.dist-info → synth_ai-0.2.23.dev3.dist-info}/WHEEL +0 -0
  889. {synth_ai-0.2.8.dev4.dist-info → synth_ai-0.2.23.dev3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,912 @@
1
+ """Banking77 intent classification task app for Synth prompt optimization benchmarks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import contextlib
6
+ import json
7
+ import os
8
+ import uuid
9
+ from collections.abc import Iterable, Sequence
10
+ from pathlib import Path
11
+ from typing import Any, Mapping, cast
12
+ import socket
13
+ from urllib.parse import urlparse
14
+
15
+ # removed top-level httpx and datasets import to allow modal deploy without local deps
16
+ from fastapi import APIRouter, HTTPException, Request
17
+ from pydantic import BaseModel
18
+ from dotenv import load_dotenv
19
+
20
+ from fastapi.exceptions import RequestValidationError
21
+ from fastapi.responses import JSONResponse
22
+ from starlette.requests import Request as StarletteRequest
23
+
24
+ from synth_ai.task.apps import ModalDeploymentConfig, TaskAppEntry, register_task_app
25
+ from synth_ai.task.auth import is_api_key_header_authorized, normalize_environment_api_key
26
+ from synth_ai.task.contracts import (
27
+ RolloutMetrics,
28
+ RolloutRequest,
29
+ RolloutResponse,
30
+ RolloutStep,
31
+ RolloutTrajectory,
32
+ TaskInfo,
33
+ )
34
+ from synth_ai.task.datasets import TaskDatasetRegistry, TaskDatasetSpec
35
+ from synth_ai.task.rubrics import Rubric, load_rubric
36
+ from synth_ai.task.server import ProxyConfig, RubricBundle, TaskAppConfig, create_task_app, run_task_app
37
+ from synth_ai.task.vendors import normalize_vendor_keys
38
+
39
+ def _compute_repo_root() -> Path:
40
+ p = Path(__file__).resolve()
41
+ parents = list(p.parents)
42
+ if len(parents) >= 4:
43
+ # parents[3] exists when file is within repo (e.g., examples/task_apps/…)
44
+ return parents[3]
45
+ # Modal inline deploy: code may be at /root/*.py, but we mount synth_ai at /opt/synth_ai_repo/synth_ai
46
+ if "/opt/synth_ai_repo" in os.getenv("PYTHONPATH", "") or Path("/opt/synth_ai_repo/synth_ai").exists():
47
+ return Path("/opt/synth_ai_repo")
48
+ # Fallback to current working directory
49
+ return Path.cwd()
50
+
51
# Repository root, resolved once at import time by _compute_repo_root().
REPO_ROOT = _compute_repo_root()

# Dataset configuration
# Dataset identifier handed to the Hugging Face loader; can be overridden
# through the BANKING77_DATASET_NAME environment variable.
DATASET_NAME = os.getenv("BANKING77_DATASET_NAME", "banking77")
# Split used as the dataset default and when label names must be looked up.
DEFAULT_SPLIT = "train"
# The only split names Banking77Dataset will agree to load.
AVAILABLE_SPLITS: tuple[str, ...] = ("train", "test")
# Name of the function tool the model is forced to call for classification.
TOOL_NAME = "banking77_classify"
58
+
59
+
60
class Banking77Dataset:
    """Lazy Hugging Face dataset loader for Banking77.

    Splits are loaded on first use and cached per instance. Label names are
    captured from the ``label`` feature of the first loaded split that exposes
    a ``names`` attribute.
    """

    def __init__(self) -> None:
        # Maps split name -> loaded dataset object.
        self._cache: dict[str, Any] = {}
        # Stays None until a loaded split provides label names.
        self._label_names: list[str] | None = None

    def _load_split(self, split: str):
        """Return the dataset for *split*, loading and caching it on first use.

        Raises:
            ValueError: *split* is not one of ``AVAILABLE_SPLITS``.
            RuntimeError: the ``datasets`` package could not be imported, or
                loading the split failed. The underlying error is chained.
        """
        if split not in AVAILABLE_SPLITS:
            raise ValueError(f"Unknown split: {split}. Available: {AVAILABLE_SPLITS}")
        if split not in self._cache:
            try:
                from datasets import load_dataset as _load_dataset  # lazy import
                ds = _load_dataset(DATASET_NAME, split=split, trust_remote_code=False)
                self._cache[split] = ds
                if self._label_names is None and hasattr(ds.features.get("label"), "names"):
                    self._label_names = ds.features["label"].names
            except ImportError as exc:
                # The import is deliberately lazy so the module can be loaded
                # without `datasets`; report that case as what it is rather
                # than as a download failure.
                raise RuntimeError(
                    f"Dataset preparation failed: {split}: A required package could not be imported "
                    f"({exc}); is the Hugging Face `datasets` library installed? "
                    f"Dataset: {DATASET_NAME} | Split: {split}"
                ) from exc
            except Exception as exc:
                raise RuntimeError(
                    f"Dataset preparation failed: {split}: Failed to download Banking77 dataset from Hugging Face. "
                    f"Dataset: {DATASET_NAME} | Split: {split}"
                ) from exc
        return self._cache[split]

    def ensure_ready(self, splits: Sequence[str]) -> None:
        """Eagerly load every split in *splits* so later access is served from cache."""
        for split in splits:
            self._load_split(split)

    def size(self, split: str) -> int:
        """Return the number of rows in *split*."""
        dataset = self._load_split(split)
        return len(dataset)

    def sample(self, *, split: str, index: int) -> dict[str, Any]:
        """Return one example from *split* as a plain dict.

        *index* wraps around the split size, so any integer selects a row.
        The result has keys ``index`` (the wrapped index), ``split``, ``text``,
        ``label`` (label name) and ``label_idx``.

        Raises:
            RuntimeError: the split contains no rows.
        """
        dataset = self._load_split(split)
        size = len(dataset)
        if size == 0:
            raise RuntimeError(f"Banking77 split '{split}' is empty")
        idx = int(index) % size
        row = dataset[int(idx)]

        # A row without a "label" field is treated as label 0.
        label_idx = int(row.get("label", 0))
        label_text = self.get_label_name(label_idx)

        return {
            "index": idx,
            "split": split,
            "text": str(row.get("text", "")),
            "label": label_text,
            "label_idx": label_idx,
        }

    def get_label_name(self, label_idx: int) -> str:
        """Return the label name for *label_idx*.

        Falls back to ``"label_<idx>"`` when label names are unavailable or
        the index is out of range.
        """
        if self._label_names is None:
            # Loading the default split is what populates the label names.
            self._load_split(DEFAULT_SPLIT)
        if self._label_names and 0 <= label_idx < len(self._label_names):
            return self._label_names[label_idx]
        return f"label_{label_idx}"

    @property
    def label_names(self) -> list[str]:
        """Return the known label names, or an empty list if none were found.

        A fresh list is returned on each access so callers cannot mutate the
        cached names that ``get_label_name`` relies on.
        """
        if self._label_names is None:
            self._load_split(DEFAULT_SPLIT)
        return list(self._label_names or [])
123
+
124
+
125
# Router that the Banking77-specific endpoints in this module are attached to.
banking77_router = APIRouter()


# Static metadata describing the Banking77 dataset: identifier, display name,
# version, the loadable splits and which split is the default.
BANKING77_DATASET_SPEC = TaskDatasetSpec(
    id="banking77",
    name="Banking77 Intent Classification",
    version="1.0.0",
    splits=list(AVAILABLE_SPLITS),
    default_split=DEFAULT_SPLIT,
    description="Banking customer query intent classification with 77 intent categories.",
)
136
+
137
+
138
class ClassifyRequest(BaseModel):
    """Request body accepted by the ``/classify`` endpoint."""

    # Text to classify (presumably a customer banking query; confirm with callers).
    query: str
140
+
141
+
142
class ClassifyResponse(BaseModel):
    """Response body returned by the ``/classify`` endpoint."""

    # Predicted intent label.
    intent: str
    # Optional confidence for the prediction; None when not supplied.
    confidence: float | None = None
145
+
146
+
147
@banking77_router.post("/classify", response_model=ClassifyResponse)
async def classify_endpoint(req: ClassifyRequest, request: Request):
    """Handle ``POST /classify`` and return a ``ClassifyResponse``.

    NOTE(review): the response is a fixed intent (``"activate_my_card"``) with
    no confidence, and ``req.query`` is never read. This looks like a
    placeholder implementation; confirm before relying on its output.
    """
    # Fetched from application state but not used below.
    dataset: Banking77Dataset = request.app.state.banking77_dataset
    return ClassifyResponse(intent="activate_my_card", confidence=None)
151
+
152
+
153
+ async def call_chat_completion(
154
+ policy_config: dict[str, Any],
155
+ placeholders: dict[str, Any],
156
+ default_messages: list[dict[str, str]],
157
+ api_key: str | None = None,
158
+ ) -> tuple[str, dict[str, Any] | None, list[dict[str, Any]]]:
159
+ # STRICT: require all policy fields to come from TOML (no defaults)
160
+ missing_fields: list[str] = []
161
+ # Always require model; provider optional when routing via proxy
162
+ model_val = policy_config.get("model")
163
+ if not isinstance(model_val, str) or not model_val.strip():
164
+ missing_fields.append("model")
165
+ # Resolve routing base - ALWAYS prioritize inference_url if provided (trainer-provided interceptor URL)
166
+ # If inference_url is set, use it exclusively and ignore api_base/base_url
167
+ inference_url_raw = policy_config.get("inference_url")
168
+ api_base_raw = policy_config.get("api_base")
169
+ base_url_raw = policy_config.get("base_url")
170
+
171
+ if inference_url_raw:
172
+ # Trainer provided inference_url (interceptor URL) - use it exclusively
173
+ route_base = str(inference_url_raw).strip()
174
+ if api_base_raw or base_url_raw:
175
+ # Log warning if api_base/base_url are also present (they'll be ignored)
176
+ with contextlib.suppress(Exception):
177
+ print(
178
+ f"[TASK_APP] ⚠️ inference_url is set ({route_base}), ignoring api_base/base_url",
179
+ flush=True,
180
+ )
181
+ else:
182
+ # Fallback: use api_base or base_url if inference_url not provided
183
+ route_base = (
184
+ (api_base_raw or "").strip()
185
+ or (base_url_raw or "").strip()
186
+ )
187
+ if not route_base:
188
+ missing_fields.append("inference_url")
189
+ if missing_fields:
190
+ raise HTTPException(
191
+ status_code=400,
192
+ detail=(
193
+ "Missing policy fields in TOML [prompt_learning.policy]: " + ", ".join(missing_fields)
194
+ ),
195
+ )
196
+ model = policy_config["model"].strip()
197
+ provider = str(policy_config.get("provider", "")).strip() or "groq"
198
+ lowered = route_base.lower()
199
+ is_provider_host = ("api.openai.com" in lowered) or ("api.groq.com" in lowered)
200
+ # Normalize inference URL: allow bases like .../v1 and auto-append /chat/completions
201
+ def _normalize_chat_url(url: str) -> str:
202
+ u = (url or "").rstrip("/")
203
+ if u.endswith("/chat/completions"):
204
+ return u
205
+ if u.endswith("/v1"):
206
+ return u + "/chat/completions"
207
+ if u.endswith("/completions"):
208
+ return u.rsplit("/", 1)[0] + "/chat/completions"
209
+ return u + "/chat/completions"
210
+ inference_url = _normalize_chat_url(str(route_base))
211
+ temperature = policy_config.get("temperature", 0.7)
212
+ max_tokens = policy_config.get("max_completion_tokens", 100)
213
+
214
+ # Loud route log
215
+ with contextlib.suppress(Exception):
216
+ print(f"[TASK_APP] POLICY ROUTE → {inference_url}", flush=True)
217
+
218
+ messages = []
219
+ for msg_template in default_messages:
220
+ role = msg_template.get("role", "user")
221
+ pattern = msg_template.get("pattern", "")
222
+ content = pattern.format(**placeholders)
223
+ messages.append({"role": role, "content": content})
224
+
225
+ # Loud logging of rendered messages (trim for safety)
226
+ preview = [
227
+ {"role": m.get("role"), "len": len(m.get("content", "")), "head": (m.get("content", "")[:160])}
228
+ for m in messages
229
+ ]
230
+ print(f"[TASK_APP] MESSAGES: {preview}", flush=True)
231
+
232
+ # Assert we are NOT hitting a provider host directly for policy
233
+ if is_provider_host:
234
+ # Print full policy config for forensics
235
+ with contextlib.suppress(Exception):
236
+ print(
237
+ f"[TASK_APP] POLICY_CONFIG: {json.dumps(policy_config, ensure_ascii=False)}",
238
+ flush=True,
239
+ )
240
+ raise HTTPException(status_code=502, detail=f"Direct provider URL not allowed for policy: {route_base}")
241
+
242
+ # If routing to proxy/interceptor, include task app API key if provided
243
+ headers: dict[str, str]
244
+ headers = {"Content-Type": "application/json"}
245
+ if api_key:
246
+ headers["X-API-Key"] = api_key
247
+ with contextlib.suppress(Exception):
248
+ print(f"[TASK_APP] 🔐 PROXY ROUTING with API key: {api_key[:12]}...{api_key[-4:]} (len={len(api_key)})", flush=True)
249
+ print(f"[TASK_APP] 🔐 Headers being sent to proxy: {list(headers.keys())}", flush=True)
250
+ # Verify the key is actually in the headers
251
+ assert "X-API-Key" in headers, "X-API-Key missing from headers!"
252
+ assert headers["X-API-Key"] == api_key, "X-API-Key value mismatch!"
253
+ print(f"[TASK_APP] ✅ Header validation passed: X-API-Key present", flush=True)
254
+ else:
255
+ with contextlib.suppress(Exception):
256
+ print("[TASK_APP] ⚠️ PROXY ROUTING (NO API KEY PROVIDED!)", flush=True)
257
+ print(f"[TASK_APP] ⚠️ This will likely fail auth at the proxy endpoint", flush=True)
258
+
259
+ # Define tool schema for banking77 classification (no enum to keep payload small)
260
+ classify_tool = {
261
+ "type": "function",
262
+ "function": {
263
+ "name": TOOL_NAME,
264
+ "description": "Return the predicted banking77 intent label in the `intent` field.",
265
+ "parameters": {
266
+ "type": "object",
267
+ "properties": {"intent": {"type": "string"}},
268
+ "required": ["intent"],
269
+ },
270
+ },
271
+ }
272
+
273
+ payload = {
274
+ "model": model,
275
+ "messages": messages,
276
+ "temperature": temperature,
277
+ "max_tokens": max_tokens,
278
+ "tools": [classify_tool],
279
+ "tool_choice": {"type": "function", "function": {"name": TOOL_NAME}},
280
+ }
281
+
282
+ print(
283
+ f"[TASK_APP] OUTBOUND: model={model} temp={temperature} max={max_tokens} tools=1 choice={TOOL_NAME}",
284
+ flush=True,
285
+ )
286
+
287
+ # Lazy import httpx to avoid top-level import during modal code gen
288
+ try:
289
+ import httpx # type: ignore
290
+ except Exception as _exc: # pragma: no cover
291
+ raise HTTPException(status_code=500, detail=f"httpx unavailable: {_exc}")
292
+
293
+ # Proxy target diagnostics (no preflight health; we go straight to POST)
294
+ try:
295
+ parsed = urlparse(inference_url)
296
+ host = parsed.hostname or ""
297
+ port = parsed.port or (443 if parsed.scheme == "https" else 80)
298
+ print(f"[TASK_APP] PROXY_TARGET: scheme={parsed.scheme} host={host} port={port} path={parsed.path}", flush=True)
299
+ addrinfo = socket.getaddrinfo(host, None)
300
+ ips = sorted({ai[4][0] for ai in addrinfo})
301
+ print(f"[TASK_APP] PROXY_DNS: ips={ips}", flush=True)
302
+ except Exception as e:
303
+ print(f"[TASK_APP] PROXY_DNS_ERROR: {e}", flush=True)
304
+
305
+ async with httpx.AsyncClient(timeout=30.0) as client:
306
+ # Log the actual request about to be sent
307
+ with contextlib.suppress(Exception):
308
+ headers_log = {k: (f"{v[:15]}..." if k == "X-API-Key" and len(v) > 15 else v) for k, v in headers.items()}
309
+ print(f"[TASK_APP] 📤 Sending POST to: {inference_url}", flush=True)
310
+ print(f"[TASK_APP] 📤 With headers: {headers_log}", flush=True)
311
+ print(f"[TASK_APP] 📤 Payload keys: {list(payload.keys())}", flush=True)
312
+ # Final assertion before sending
313
+ if "X-API-Key" in headers:
314
+ print(f"[TASK_APP] ✅ X-API-Key IS in headers (len={len(headers['X-API-Key'])})", flush=True)
315
+ else:
316
+ print(f"[TASK_APP] ❌ X-API-Key NOT in headers!", flush=True)
317
+
318
+ try:
319
+ response = await client.post(inference_url, json=payload, headers=headers)
320
+ except Exception as e:
321
+ print(f"[TASK_APP] POST_EXCEPTION: {type(e).__name__}: {e}", flush=True)
322
+ raise HTTPException(status_code=502, detail=f"Proxy POST failed: {e}")
323
+
324
+ # Always print status/headers/body BEFORE any error is raised
325
+ print(f"[TASK_APP] RESPONSE_STATUS: {response.status_code}", flush=True)
326
+ print(f"[TASK_APP] RESPONSE_HEADERS: {dict(response.headers)}", flush=True)
327
+
328
+ # Handle error responses from interceptor/provider
329
+ if response.status_code != 200:
330
+ try:
331
+ error_json = response.json()
332
+ error_msg = str(error_json.get("error", {}).get("message", error_json.get("error", "Unknown error")))
333
+ print(f"[TASK_APP] ❌ Error response from interceptor: {error_msg}", flush=True)
334
+ raise HTTPException(
335
+ status_code=response.status_code,
336
+ detail=f"Interceptor/provider error: {error_msg}"
337
+ )
338
+ except HTTPException:
339
+ raise
340
+ except Exception:
341
+ error_text = response.text[:500]
342
+ print(f"[TASK_APP] ❌ Non-JSON error response: {error_text}", flush=True)
343
+ raise HTTPException(
344
+ status_code=response.status_code,
345
+ detail=f"Interceptor/provider returned error: {error_text}"
346
+ )
347
+
348
+ # Try JSON, fallback to text
349
+ try:
350
+ response_json = response.json()
351
+ raw = json.dumps(response_json, ensure_ascii=False)
352
+ print(f"[TASK_APP] RESPONSE_JSON ({len(raw)} bytes): {raw}", flush=True)
353
+ except Exception:
354
+ response_text = response.text
355
+ print(f"[TASK_APP] RESPONSE_TEXT ({len(response_text)} bytes): {response_text}", flush=True)
356
+ response.raise_for_status()
357
+ # If we got here, raise_for_status didn't throw; keep an empty JSON
358
+ response_json = {}
359
+ # After logging, surface HTTP errors (shouldn't reach here if status != 200)
360
+ response.raise_for_status()
361
+
362
+ with contextlib.suppress(Exception):
363
+ usage = response_json.get("usage", {}) if isinstance(response_json, dict) else {}
364
+ ch = (response_json.get("choices") or [{}])[0]
365
+ txt = (ch.get("message", {}) or {}).get("content", "")
366
+ tc = (ch.get("message", {}) or {}).get("tool_calls", [])
367
+ print(
368
+ f"[TASK_APP] RESPONSE: usage={usage} choices={len(response_json.get('choices', []))} first_len={len(txt)} tool_calls={len(tc)}",
369
+ flush=True,
370
+ )
371
+
372
+ # Hard assertions: require either tool_calls or non-empty content
373
+ try:
374
+ choices = response_json.get("choices") or []
375
+ first_msg = (choices[0] or {}).get("message", {}) if choices else {}
376
+ tool_calls = first_msg.get("tool_calls", []) or []
377
+ content_text = str(first_msg.get("content", ""))
378
+ if not tool_calls and not content_text.strip():
379
+ raise HTTPException(status_code=502, detail="Empty model output: no tool_calls and no content")
380
+ # If tool_calls present, validate schema
381
+ if tool_calls:
382
+ for call in tool_calls:
383
+ fn = (call or {}).get("function", {}) or {}
384
+ if fn.get("name") != TOOL_NAME:
385
+ raise HTTPException(status_code=502, detail=f"Unexpected tool name: {fn.get('name')}")
386
+ args_raw = fn.get("arguments", "{}")
387
+ try:
388
+ args = json.loads(args_raw)
389
+ except Exception:
390
+ raise HTTPException(status_code=502, detail="Tool call arguments not valid JSON")
391
+ if not str(args.get("intent", "")).strip():
392
+ raise HTTPException(status_code=502, detail="Tool call missing 'intent'")
393
+ except HTTPException:
394
+ raise
395
+ except Exception as exc:
396
+ # Convert unexpected errors to HTTP for visibility
397
+ raise HTTPException(status_code=500, detail=f"Response validation failed: {exc}")
398
+
399
+ response_text = ""
400
+ tool_calls = []
401
+
402
+ if "choices" in response_json and len(response_json["choices"]) > 0:
403
+ choice = response_json["choices"][0]
404
+ message = choice.get("message", {})
405
+ response_text = message.get("content", "")
406
+
407
+ if "tool_calls" in message and message["tool_calls"]:
408
+ for tc in message["tool_calls"]:
409
+ tool_calls.append({
410
+ "id": tc.get("id", ""),
411
+ "type": tc.get("type", "function"),
412
+ "function": {
413
+ "name": tc.get("function", {}).get("name", ""),
414
+ "arguments": tc.get("function", {}).get("arguments", "{}"),
415
+ }
416
+ })
417
+
418
+ return response_text, response_json, tool_calls
419
+
420
+
421
async def rollout_executor(request: RolloutRequest, fastapi_request: Request) -> RolloutResponse:
    """Run one single-step Banking77 rollout and score the predicted intent.

    The sample is chosen by ``request.env.seed`` (used as the dataset index,
    0 when unset) from the split named in ``request.env.config``
    (``DEFAULT_SPLIT`` when absent). The prompt templates are forwarded to
    ``call_chat_completion`` and the predicted intent is compared to the
    sample label, ignoring case and treating underscores as spaces.

    Args:
        request: Rollout request carrying the run id, env seed/config and the
            policy config that is forwarded to ``call_chat_completion``.
        fastapi_request: Incoming HTTP request; supplies the dataset stored on
            ``app.state.banking77_dataset`` and the API-key headers that are
            forwarded to the proxy.

    Returns:
        A ``RolloutResponse`` holding one trajectory with one step whose
        reward is 1.0 for a correct intent and 0.0 otherwise.

    Raises:
        RuntimeError: If the upstream JSON is missing/empty, has no choices,
            has a malformed message, has neither tool calls nor content, or
            yields no prediction. Exceptions raised by
            ``call_chat_completion`` propagate unchanged.
    """
    dataset: Banking77Dataset = fastapi_request.app.state.banking77_dataset
    # Inbound snapshot from GEPA (best-effort logging; never fails the rollout)
    with contextlib.suppress(Exception):
        cfg = request.policy.config or {}
        print(
            f"[TASK_APP] INBOUND_ROLLOUT: run_id={request.run_id} seed={request.env.seed} env={request.env.env_name} "
            f"policy.model={cfg.get('model')} provider={cfg.get('provider')} api_base={cfg.get('inference_url') or cfg.get('api_base') or cfg.get('base_url')}",
            flush=True,
        )

    split = str(((request.env.config or {}).get("split")) or DEFAULT_SPLIT)
    seed = request.env.seed or 0

    sample = dataset.sample(split=split, index=seed)
    observation = {
        "query": sample["text"],
        "index": sample["index"],
        "split": sample["split"],
        "available_intents": dataset.label_names,
    }

    # Format available intents as a numbered list for the prompt
    intents_list = "\n".join(f"{i+1}. {label}" for i, label in enumerate(dataset.label_names))
    placeholders = {
        "query": sample["text"],
        "available_intents": intents_list,
    }

    default_messages = [
        {
            "role": "system",
            "pattern": (
                "You are an expert banking assistant that classifies customer queries into banking intents. "
                "Given a customer message, respond with exactly one intent label from the provided list using the `banking77_classify` tool."
            ),
        },
        {
            "role": "user",
            "pattern": "Customer Query: {query}\n\nAvailable Intents:\n{available_intents}\n\nClassify this query into one of the above banking intents using the tool call.",
        },
    ]

    # Render baseline messages for validation/introspection
    rendered_messages: list[dict[str, str]] = [
        {
            "role": msg_template.get("role", "user"),
            "content": msg_template.get("pattern", "").format(**placeholders),
        }
        for msg_template in default_messages
    ]

    # Extract API key from request headers for forwarding to proxy
    api_key = (
        fastapi_request.headers.get("X-API-Key")
        or fastapi_request.headers.get("x-api-key")
        or (fastapi_request.headers.get("Authorization", "").replace("Bearer ", "").strip() if fastapi_request.headers.get("Authorization") else None)
        or None
    )

    # Call proxy - HARD FAILS on any invalid/empty responses. No soft handling.
    response_text, response_json, tool_calls = await call_chat_completion(
        request.policy.config or {},
        placeholders,
        default_messages,
        api_key=api_key,
    )
    # Full upstream JSON must be present and non-empty
    try:
        raw_upstream = json.dumps(response_json, ensure_ascii=False)
    except Exception:
        raw_upstream = str(response_json)
    print(f"[TASK_APP] UPSTREAM_RESPONSE_JSON ({len(raw_upstream)} bytes): {raw_upstream}", flush=True)
    if not isinstance(response_json, dict) or not response_json:
        raise RuntimeError("Proxy returned missing/empty JSON")
    # Must have choices
    choices = response_json.get("choices") or []
    if not isinstance(choices, list) or len(choices) == 0:
        raise RuntimeError("Proxy JSON missing choices")
    first_msg = (choices[0] or {}).get("message", {}) if choices else {}
    if not isinstance(first_msg, dict):
        raise RuntimeError("Proxy JSON message malformed")
    tc_list = first_msg.get("tool_calls") or []
    # A JSON null content must count as empty; str(None) would yield "None"
    # and let an empty response slip past the guard below.
    content_text = str(first_msg.get("content") or "")
    if not tc_list and not content_text.strip():
        raise RuntimeError("Proxy JSON has neither tool_calls nor content")
    print(f"[TASK_APP] RAW_TOOL_CALLS: {tool_calls}", flush=True)

    predicted_intent = ""
    if tool_calls:
        for tc in tool_calls:
            if tc.get("function", {}).get("name") == TOOL_NAME:
                args_str = tc.get("function", {}).get("arguments", "{}")
                try:
                    args = json.loads(args_str)
                    raw_intent = args.get("intent", "")
                    # Coerce to str so a non-string intent (e.g. a number)
                    # cannot crash the .lower() comparison further down.
                    predicted_intent = "" if raw_intent is None else str(raw_intent)
                    print(f"[TASK_APP] PARSED_TOOL_INTENT: {predicted_intent}", flush=True)
                except Exception:
                    print(f"[TASK_APP] TOOL_PARSE_ERROR: {args_str}", flush=True)
    elif response_text:
        # No tool call: fall back to the first whitespace-delimited token.
        predicted_intent = response_text.strip().split()[0] if response_text.strip() else ""
        print(f"[TASK_APP] CONTENT_FALLBACK_INTENT: {predicted_intent} text_len={len(response_text or '')}", flush=True)

    # Hard-crash if no prediction produced at this point
    if not str(predicted_intent or "").strip():
        raise RuntimeError("No prediction produced from proxy response")

    expected_intent = sample["label"]
    is_correct = (predicted_intent.lower().replace("_", " ") == expected_intent.lower().replace("_", " "))
    reward = 1.0 if is_correct else 0.0

    print(
        f"[TASK_APP] PREDICTION: expected={expected_intent} predicted={predicted_intent} correct={is_correct}",
        flush=True,
    )

    info_payload = {
        "expected_intent": expected_intent,
        "predicted_intent": predicted_intent,
        "response_json": response_json,
        "tool_calls": tool_calls,
        "correct": is_correct,
        # Provide messages so pattern validation can extract them reliably
        "messages": rendered_messages,
    }

    with contextlib.suppress(Exception):
        print(
            f"[BANKING77_ROLLOUT] run_id={request.run_id} split={sample['split']} "
            f"index={sample['index']} expected={expected_intent} predicted={predicted_intent} "
            f"reward={reward}",
            flush=True,
        )

    step = RolloutStep(
        obs=observation,
        tool_calls=tool_calls,
        reward=reward,
        done=True,
        info=info_payload,
    )

    inference_url = (request.policy.config or {}).get("inference_url")
    trajectory = RolloutTrajectory(
        env_id=f"banking77::{sample['split']}::{sample['index']}",
        policy_id=request.policy.policy_id or request.policy.policy_name or "policy",
        steps=[step],
        final={"observation": observation, "reward": reward},
        length=1,
        inference_url=str(inference_url or ""),
    )

    metrics = RolloutMetrics(
        episode_returns=[reward],
        mean_return=reward,
        num_steps=1,
        num_episodes=1,
        outcome_score=reward,
        events_score=reward,
        details={"correct": is_correct},
    )

    trace_payload = None
    # NOTE(review): any non-empty TASKAPP_TRACING_ENABLED value (including
    # "0" or "false") enables tracing here - confirm that is intended.
    include_trace = bool(
        (request.record and getattr(request.record, "return_trace", False))
        or os.getenv("TASKAPP_TRACING_ENABLED")
    )
    if include_trace:
        trace_payload = {
            "session_id": str(uuid.uuid4()),
            "events_count": 1,
            "decision_rewards": [reward],
            "metadata": {
                "env": "banking77",
                "split": sample["split"],
                "index": sample["index"],
                "correct": is_correct,
            },
        }

    return RolloutResponse(
        run_id=request.run_id,
        trajectories=[trajectory],
        branches={},
        metrics=metrics,
        aborted=False,
        ops_executed=2,
        trace=trace_payload,
    )
614
+
615
+
616
def build_dataset() -> tuple[TaskDatasetRegistry, Banking77Dataset]:
    """Create the dataset registry and the Banking77 dataset it serves.

    Returns:
        The registry (with ``BANKING77_DATASET_SPEC`` registered) and the
        single ``Banking77Dataset`` instance the registry hands out.
    """
    banking77 = Banking77Dataset()

    def _provide(_spec):
        # Always hand back the one shared instance created above.
        return banking77

    # Lazy load dataset on first use to avoid cold-start latency/timeouts
    dataset_registry = TaskDatasetRegistry()
    dataset_registry.register(BANKING77_DATASET_SPEC, _provide, cache=True)
    return dataset_registry, banking77
622
+
623
+
624
def _base_task_info() -> TaskInfo:
    """Build the static ``TaskInfo`` describing the Banking77 task."""
    action_space = {
        "type": "tool_call",
        "tool_name": TOOL_NAME,
        "description": "Classify banking queries into one of 77 intent categories.",
    }
    task_section = {
        "id": "banking77",
        "name": "Banking77 Intent Classification",
        "version": "1.0.0",
        "action_space": action_space,
    }
    # Dataset spec fields first, then the Hugging Face dataset name.
    dataset_section = {
        **BANKING77_DATASET_SPEC.model_dump(),
        "hf_dataset": DATASET_NAME,
    }
    rubric_section = {
        "version": "1",
        "criteria_count": 1,
        "source": "inline",
    }
    inference_section = {
        "supports_proxy": True,
        "tool": TOOL_NAME,
    }
    return TaskInfo(
        task=task_section,
        environment="banking77",
        dataset=dataset_section,
        rubric=rubric_section,
        inference=inference_section,
        limits={"max_turns": 1},
        task_metadata={"format": "tool_call"},
    )
653
+
654
+
655
def describe_taskset(dataset: Banking77Dataset) -> Mapping[str, Any]:
    """Summarise the task set: spec fields, label count and per-split sizes.

    Args:
        dataset: Dataset queried for its label names and split sizes.

    Returns:
        A mapping with the dataset spec fields plus ``hf_dataset``,
        ``num_labels`` and ``sizes`` (one entry per ``AVAILABLE_SPLITS`` item).
    """
    spec_fields = BANKING77_DATASET_SPEC.model_dump()
    label_count = len(dataset.label_names)
    split_sizes = {}
    for split_name in AVAILABLE_SPLITS:
        split_sizes[split_name] = dataset.size(split_name)
    return {
        **spec_fields,
        "hf_dataset": DATASET_NAME,
        "num_labels": label_count,
        "sizes": split_sizes,
    }
662
+
663
+
664
def provide_task_instances(dataset: Banking77Dataset, seeds: Sequence[int]) -> Iterable[TaskInfo]:
    """Yield one ``TaskInfo`` per seed, specialised to that seed's sample.

    Each seed is used as the sample index within ``DEFAULT_SPLIT``. The
    yielded info reuses the base task description and adds the sample's
    split/index to ``dataset`` and its text to ``task_metadata["query"]``.

    Args:
        dataset: Dataset the samples are drawn from.
        seeds: Sample indices to materialise.
    """
    template = _base_task_info()
    for seed in seeds:
        row = dataset.sample(split=DEFAULT_SPLIT, index=seed)
        dataset_section = {
            **template.dataset,
            "split": row["split"],
            "index": row["index"],
        }
        metadata_section = {
            **template.task_metadata,
            "query": row["text"],
        }
        yield TaskInfo(
            task=template.task,
            environment=template.environment,
            dataset=dataset_section,
            rubric=template.rubric,
            inference=template.inference,
            limits=template.limits,
            task_metadata=metadata_section,
        )
684
+
685
+
686
def _single_criterion_rubric(goal_text: str, criterion_id: str, description: str) -> Rubric:
    """Load an inline version-"1" rubric with one weight-1.0 criterion."""
    return cast(
        Rubric,
        load_rubric(
            {
                "version": "1",
                "goal_text": goal_text,
                "aggregation": "weighted_sum",
                "criteria": [
                    {
                        "id": criterion_id,
                        "description": description,
                        "weight": 1.0,
                    }
                ],
            }
        ),
    )


# Outcome rubric: did the rollout pick the right intent?
OUTCOME_RUBRIC: Rubric = _single_criterion_rubric(
    "Classify banking customer queries into the correct intent category.",
    "intent_accuracy",
    "Correctly classify the customer query into the appropriate banking intent.",
)

# Events rubric: was the classification tool invoked properly?
EVENTS_RUBRIC: Rubric = _single_criterion_rubric(
    "Use the banking77_classify tool correctly.",
    "tool_usage",
    "Properly invoke the banking77_classify tool with the correct format.",
)
721
+
722
+
723
def build_config() -> TaskAppConfig:
    """Assemble the ``TaskAppConfig`` for the Banking77 task app.

    Builds the dataset and registry, enables the OpenAI/Groq proxies only
    when ``normalize_vendor_keys()`` reports the corresponding key, and wires
    the rollout executor, rubrics, router and shared dataset into the config.
    """
    dataset_registry, banking77 = build_dataset()
    task_info = _base_task_info()

    vendor_keys = normalize_vendor_keys()
    has_openai = vendor_keys.get("OPENAI_API_KEY") is not None
    has_groq = vendor_keys.get("GROQ_API_KEY") is not None
    proxy_settings = ProxyConfig(
        enable_openai=has_openai,
        enable_groq=has_groq,
        system_hint="Use the banking77_classify tool to classify the customer query.",
    )

    def _describe():
        return describe_taskset(banking77)

    def _instances(seeds):
        return provide_task_instances(banking77, seeds)

    return TaskAppConfig(
        app_id="banking77",
        name="Banking77 Intent Classification Task",
        description="Banking77 dataset task app for classifying customer queries into banking intents.",
        base_task_info=task_info,
        describe_taskset=_describe,
        provide_task_instances=_instances,
        rollout=rollout_executor,
        dataset_registry=dataset_registry,
        rubrics=RubricBundle(outcome=OUTCOME_RUBRIC, events=EVENTS_RUBRIC),
        proxy=proxy_settings,
        routers=(banking77_router,),
        # The same dataset instance is exposed to request handlers via app state.
        app_state={"banking77_dataset": banking77},
        cors_origins=["*"],
    )
750
+
751
+
752
# Modal deployment settings used when the app is deployed through the registry.
_MODAL_CONFIG = ModalDeploymentConfig(
    app_name="synth-banking77",
    pip_packages=(
        "datasets>=2.14.0",
        "fastapi>=0.115.0",
        "pydantic>=2.0.0",
        "httpx>=0.26.0",
    ),
    # Ship the local synth_ai checkout alongside the app.
    extra_local_dirs=((str(REPO_ROOT / "synth_ai"), "/opt/synth_ai_repo/synth_ai"),),
)

# Register the app under "banking77" (alias "banking-intents").
register_task_app(
    entry=TaskAppEntry(
        app_id="banking77",
        description="Banking77 intent classification task app using the banking77 dataset.",
        config_factory=build_config,
        aliases=("banking-intents",),
        modal=_MODAL_CONFIG,
    )
)
770
+
771
# Modal deployment (skipped entirely when the modal package is unavailable)
try:
    import modal

    # For direct Modal deployment (modal deploy banking77_task_app.py)
    app = modal.App("synth-banking77")

    # Build the container image step by step: base OS, Python deps, then the
    # local synth_ai checkout made importable through PYTHONPATH.
    _image = modal.Image.debian_slim(python_version="3.11")
    _image = _image.pip_install(
        "synth-ai",
        "datasets>=2.14.0",
        "fastapi>=0.115.0",
        "pydantic>=2.0.0",
        "httpx>=0.26.0",
        "python-dotenv>=1.0.0",
    )
    _image = _image.env({"PYTHONPATH": "/opt/synth_ai_repo"})
    _image = _image.add_local_dir(str(REPO_ROOT / "synth_ai"), "/opt/synth_ai_repo/synth_ai", copy=True)

    # Ship the repo-level .env into the image only when one exists locally.
    _env_file = REPO_ROOT / ".env"
    if _env_file.exists():
        _image = _image.add_local_file(str(_env_file), "/opt/synth_ai_repo/.env")

    @app.function(image=_image, timeout=600)
    @modal.asgi_app()
    def web():
        # fastapi_app is defined later in the module; it is resolved at call time.
        return fastapi_app()

except ImportError:
    pass
805
+
806
+
807
def fastapi_app():
    """Return the FastAPI application for Modal or other ASGI hosts.

    The app is created from ``build_config()``; its default ``GET /health``
    and ``GET /health/rollout`` routes are replaced with handlers that answer
    200 even when the caller's API key is not authorized, and a handler for
    request-validation errors is installed that logs a header snapshot.
    """

    # Load environment from .env if present (works in Modal via added local file)
    with contextlib.suppress(Exception):
        load_dotenv(str(REPO_ROOT / ".env"), override=False)

    app = create_task_app(build_config())

    # Replace default health endpoints with auth-tolerant handlers
    health_paths = {"/health", "/health/rollout"}

    def _is_default_health_route(route) -> bool:
        verbs = getattr(route, "methods", set()) or set()
        return getattr(route, "path", None) in health_paths and "GET" in verbs

    app.router.routes = [
        route for route in app.router.routes if not _is_default_health_route(route)
    ]

    def _log_env_key_prefix(source: str, env_key: str | None) -> str | None:
        """Print and return the first half (at least one char) of ``env_key``."""
        if not env_key:
            return None
        visible_len = max(1, len(env_key) // 2)
        prefix = env_key[:visible_len]
        print(f"[{source}] expected ENVIRONMENT_API_KEY prefix: {prefix}")
        return prefix

    def _health_reply(request: StarletteRequest, source: str, authorized_body: dict):
        """Shared body of both health endpoints; only the success payload differs."""
        env_key = normalize_environment_api_key()
        if not env_key:
            return JSONResponse(
                status_code=503,
                content={"status": "unhealthy", "detail": "Missing ENVIRONMENT_API_KEY"},
            )
        if is_api_key_header_authorized(request):
            return authorized_body
        # NOTE(review): this returns half of ENVIRONMENT_API_KEY to callers
        # that failed authorization - confirm this disclosure is acceptable.
        prefix = _log_env_key_prefix(source, env_key)
        content = {"status": "healthy", "authorized": False}
        if prefix:
            content["expected_api_key_prefix"] = prefix
        return JSONResponse(status_code=200, content=content)

    @app.get("/health")
    async def health(request: StarletteRequest):
        return _health_reply(request, "health", {"status": "healthy", "authorized": True})

    @app.get("/health/rollout")
    async def health_rollout(request: StarletteRequest):
        return _health_reply(request, "health/rollout", {"ok": True, "authorized": True})

    @app.exception_handler(RequestValidationError)
    async def _on_validation_error(request: StarletteRequest, exc: RequestValidationError):
        # Best-effort diagnostics: a logging failure must not mask the 422.
        with contextlib.suppress(Exception):
            hdr = request.headers
            snapshot = {
                "path": str(request.url.path),
                "have_x_api_key": bool(hdr.get("x-api-key")),
                "have_x_api_keys": bool(hdr.get("x-api-keys")),
                "have_authorization": bool(hdr.get("authorization")),
                "errors": exc.errors()[:5],
            }
            print("[422] validation", snapshot, flush=True)
        return JSONResponse(
            status_code=422,
            content={"status": "invalid", "detail": exc.errors()[:5]},
        )

    return app
885
+
886
+
887
if __name__ == "__main__":
    # Local entry point: parse CLI flags, collect .env files, start the app.
    import argparse

    cli = argparse.ArgumentParser(description="Run the Banking77 task app locally")
    cli.add_argument("--host", default="0.0.0.0")
    cli.add_argument("--port", type=int, default=8102)
    cli.add_argument("--reload", action="store_true", help="Enable uvicorn autoreload")
    cli.add_argument(
        "--env-file",
        action="append",
        default=[],
        help="Additional .env files to load before startup",
    )
    options = cli.parse_args()

    # The default .env sits two directory levels above this file's directory;
    # it is listed first, followed by any files given via --env-file.
    repo_env = Path(__file__).resolve().parents[2] / ".env"
    dotenv_paths = []
    if repo_env.exists():
        dotenv_paths.append(str(repo_env))
    dotenv_paths += list(options.env_file or [])

    run_task_app(
        build_config,
        host=options.host,
        port=options.port,
        reload=options.reload,
        env_files=dotenv_paths,
    )