synth-ai 0.2.16__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic.

Files changed (299)
  1. examples/analyze_semantic_words.sh +2 -2
  2. examples/baseline/banking77_baseline.py +204 -0
  3. examples/baseline/crafter_baseline.py +407 -0
  4. examples/baseline/pokemon_red_baseline.py +326 -0
  5. examples/baseline/simple_baseline.py +56 -0
  6. examples/baseline/warming_up_to_rl_baseline.py +239 -0
  7. examples/blog_posts/gepa/README.md +355 -0
  8. examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
  9. examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
  10. examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
  11. examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
  12. examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
  13. examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
  14. examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
  15. examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
  16. examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
  17. examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
  18. examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
  19. examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
  20. examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
  21. examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
  22. examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
  23. examples/blog_posts/gepa/gepa_baseline.py +204 -0
  24. examples/blog_posts/gepa/query_prompts_example.py +97 -0
  25. examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
  26. examples/blog_posts/gepa/task_apps.py +105 -0
  27. examples/blog_posts/gepa/test_gepa_local.sh +67 -0
  28. examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
  29. examples/blog_posts/pokemon_vl/README.md +98 -0
  30. examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
  31. examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +27 -0
  32. examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
  33. examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
  34. examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +43 -0
  35. examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
  36. examples/blog_posts/pokemon_vl/extract_images.py +239 -0
  37. examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
  38. examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
  39. examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
  40. examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
  41. examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
  42. examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
  43. examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
  44. examples/blog_posts/warming_up_to_rl/README.md +158 -0
  45. examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
  46. examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
  47. examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
  48. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
  49. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
  50. examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
  51. examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
  52. examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
  53. examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
  54. examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +91 -0
  55. examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
  56. examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
  57. examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
  58. examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
  59. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
  60. examples/multi_step/configs/crafter_rl_outcome.toml +2 -1
  61. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +65 -107
  62. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +2 -1
  63. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +2 -1
  64. examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
  65. examples/multi_step/configs/verilog_rl_lora.toml +80 -123
  66. examples/qwen_coder/configs/coder_lora_30b.toml +1 -3
  67. examples/qwen_coder/configs/coder_lora_4b.toml +4 -1
  68. examples/qwen_coder/configs/coder_lora_small.toml +1 -3
  69. examples/qwen_vl/README.md +10 -12
  70. examples/qwen_vl/SETUP_COMPLETE.md +7 -8
  71. examples/qwen_vl/VISION_TESTS_COMPLETE.md +2 -3
  72. examples/qwen_vl/collect_data_via_cli.md +76 -84
  73. examples/qwen_vl/collect_vision_traces.py +4 -4
  74. examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +40 -57
  75. examples/qwen_vl/configs/crafter_vlm_sft_example.toml +1 -2
  76. examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +20 -37
  77. examples/qwen_vl/configs/eval_gpt5nano_vision.toml +21 -40
  78. examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
  79. examples/qwen_vl/configs/{filter_qwen2vl_sft.toml → filter_qwen3vl_sft.toml} +4 -5
  80. examples/qwen_vl/configs/filter_vision_sft.toml +2 -3
  81. examples/qwen_vl/crafter_qwen_vl_agent.py +5 -5
  82. examples/qwen_vl/run_vision_comparison.sh +6 -7
  83. examples/rl/README.md +5 -5
  84. examples/rl/configs/rl_from_base_qwen.toml +26 -1
  85. examples/rl/configs/rl_from_base_qwen17.toml +6 -2
  86. examples/rl/task_app/README.md +1 -2
  87. examples/rl/task_app/math_single_step.py +2 -2
  88. examples/run_crafter_demo.sh +2 -2
  89. examples/sft/README.md +1 -1
  90. examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -1
  91. examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -1
  92. examples/swe/task_app/README.md +32 -2
  93. examples/swe/task_app/grpo_swe_mini.py +4 -0
  94. examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
  95. examples/swe/task_app/hosted/envs/mini_swe/environment.py +37 -10
  96. examples/swe/task_app/hosted/inference/openai_client.py +4 -38
  97. examples/swe/task_app/hosted/policy_routes.py +17 -0
  98. examples/swe/task_app/hosted/rollout.py +4 -2
  99. examples/swe/task_app/morph_backend.py +178 -0
  100. examples/task_apps/banking77/__init__.py +6 -0
  101. examples/task_apps/banking77/banking77_task_app.py +841 -0
  102. examples/task_apps/banking77/deploy_wrapper.py +46 -0
  103. examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
  104. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
  105. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
  106. examples/task_apps/crafter/task_app/README.md +1 -1
  107. examples/task_apps/crafter/task_app/grpo_crafter.py +90 -5
  108. examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
  109. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +4 -26
  110. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
  111. examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
  112. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +372 -107
  113. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +81 -12
  114. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +82 -11
  115. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
  116. examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
  117. examples/task_apps/gepa_benchmarks/__init__.py +7 -0
  118. examples/task_apps/gepa_benchmarks/common.py +260 -0
  119. examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
  120. examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
  121. examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
  122. examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
  123. examples/task_apps/math/README.md +1 -2
  124. examples/task_apps/pokemon_red/README.md +3 -4
  125. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
  126. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
  127. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
  128. examples/task_apps/pokemon_red/task_app.py +288 -39
  129. examples/task_apps/sokoban/README.md +2 -3
  130. examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
  131. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
  132. examples/vlm/configs/crafter_vlm_gpt4o.toml +4 -1
  133. examples/warming_up_to_rl/configs/crafter_fft.toml +4 -1
  134. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -2
  135. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +3 -2
  136. examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
  137. examples/warming_up_to_rl/task_app/README.md +1 -1
  138. examples/warming_up_to_rl/task_app/grpo_crafter.py +185 -5
  139. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +1 -1
  140. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +3 -27
  141. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -1
  142. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
  143. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +156 -45
  144. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +37 -4
  145. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
  146. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
  147. examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
  148. examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +6 -0
  149. synth_ai/api/train/builders.py +99 -4
  150. synth_ai/api/train/cli.py +516 -26
  151. synth_ai/api/train/config_finder.py +13 -2
  152. synth_ai/api/train/configs/__init__.py +23 -2
  153. synth_ai/api/train/configs/prompt_learning.py +442 -0
  154. synth_ai/api/train/configs/rl.py +61 -7
  155. synth_ai/api/train/configs/sft.py +6 -2
  156. synth_ai/api/train/configs/shared.py +59 -2
  157. synth_ai/api/train/task_app.py +1 -1
  158. synth_ai/api/train/validators.py +277 -0
  159. synth_ai/auth/credentials.py +119 -0
  160. synth_ai/baseline/__init__.py +25 -0
  161. synth_ai/baseline/config.py +209 -0
  162. synth_ai/baseline/discovery.py +214 -0
  163. synth_ai/baseline/execution.py +146 -0
  164. synth_ai/cli/__init__.py +94 -18
  165. synth_ai/cli/__main__.py +0 -0
  166. synth_ai/cli/claude.py +70 -0
  167. synth_ai/cli/codex.py +84 -0
  168. synth_ai/cli/commands/__init__.py +18 -0
  169. synth_ai/cli/commands/baseline/__init__.py +12 -0
  170. synth_ai/cli/commands/baseline/core.py +637 -0
  171. synth_ai/cli/commands/baseline/list.py +93 -0
  172. synth_ai/cli/commands/demo/__init__.py +6 -0
  173. synth_ai/cli/commands/demo/core.py +163 -0
  174. synth_ai/cli/commands/eval/__init__.py +19 -0
  175. synth_ai/cli/commands/eval/core.py +1112 -0
  176. synth_ai/cli/commands/eval/errors.py +81 -0
  177. synth_ai/cli/commands/eval/validation.py +133 -0
  178. synth_ai/cli/commands/filter/__init__.py +12 -0
  179. synth_ai/cli/commands/filter/core.py +424 -0
  180. synth_ai/cli/commands/filter/errors.py +55 -0
  181. synth_ai/cli/commands/filter/validation.py +77 -0
  182. synth_ai/cli/commands/help/__init__.py +177 -0
  183. synth_ai/cli/commands/help/core.py +72 -0
  184. synth_ai/cli/commands/smoke/__init__.py +7 -0
  185. synth_ai/cli/commands/smoke/core.py +1436 -0
  186. synth_ai/cli/commands/status/__init__.py +64 -0
  187. synth_ai/cli/commands/status/client.py +192 -0
  188. synth_ai/cli/commands/status/config.py +92 -0
  189. synth_ai/cli/commands/status/errors.py +20 -0
  190. synth_ai/cli/commands/status/formatters.py +164 -0
  191. synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
  192. synth_ai/cli/commands/status/subcommands/files.py +79 -0
  193. synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
  194. synth_ai/cli/commands/status/subcommands/models.py +79 -0
  195. synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
  196. synth_ai/cli/commands/status/subcommands/runs.py +81 -0
  197. synth_ai/cli/commands/status/subcommands/summary.py +47 -0
  198. synth_ai/cli/commands/status/subcommands/usage.py +203 -0
  199. synth_ai/cli/commands/status/utils.py +114 -0
  200. synth_ai/cli/commands/train/__init__.py +53 -0
  201. synth_ai/cli/commands/train/core.py +21 -0
  202. synth_ai/cli/commands/train/errors.py +117 -0
  203. synth_ai/cli/commands/train/judge_schemas.py +200 -0
  204. synth_ai/cli/commands/train/judge_validation.py +305 -0
  205. synth_ai/cli/commands/train/validation.py +386 -0
  206. synth_ai/cli/demo.py +30 -158
  207. synth_ai/cli/deploy/__init__.py +43 -0
  208. synth_ai/cli/deploy.py +162 -0
  209. synth_ai/cli/eval/__init__.py +36 -0
  210. synth_ai/cli/eval/core.py +5 -0
  211. synth_ai/cli/eval/errors.py +31 -0
  212. synth_ai/cli/eval/validation.py +5 -0
  213. synth_ai/cli/filter/__init__.py +28 -0
  214. synth_ai/cli/filter/core.py +5 -0
  215. synth_ai/cli/filter/errors.py +23 -0
  216. synth_ai/cli/filter/validation.py +5 -0
  217. synth_ai/cli/legacy_root_backup.py +14 -8
  218. synth_ai/cli/modal_serve/__init__.py +12 -0
  219. synth_ai/cli/modal_serve/core.py +14 -0
  220. synth_ai/cli/modal_serve/errors.py +8 -0
  221. synth_ai/cli/modal_serve/validation.py +11 -0
  222. synth_ai/cli/opencode.py +107 -0
  223. synth_ai/cli/root.py +9 -5
  224. synth_ai/cli/serve/__init__.py +12 -0
  225. synth_ai/cli/serve/core.py +14 -0
  226. synth_ai/cli/serve/errors.py +8 -0
  227. synth_ai/cli/serve/validation.py +11 -0
  228. synth_ai/cli/setup.py +20 -265
  229. synth_ai/cli/status.py +7 -126
  230. synth_ai/cli/task_app_deploy.py +1 -10
  231. synth_ai/cli/task_app_modal_serve.py +4 -9
  232. synth_ai/cli/task_app_serve.py +4 -11
  233. synth_ai/cli/task_apps.py +51 -1480
  234. synth_ai/cli/train/__init__.py +12 -0
  235. synth_ai/cli/train/core.py +21 -0
  236. synth_ai/cli/train/errors.py +8 -0
  237. synth_ai/cli/train/validation.py +24 -0
  238. synth_ai/cli/train.py +1 -14
  239. synth_ai/demos/crafter/grpo_crafter_task_app.py +1 -1
  240. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
  241. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
  242. synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
  243. synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
  244. synth_ai/environments/examples/red/engine.py +33 -12
  245. synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
  246. synth_ai/environments/examples/red/environment.py +26 -0
  247. synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
  248. synth_ai/http.py +12 -0
  249. synth_ai/judge_schemas.py +10 -10
  250. synth_ai/learning/__init__.py +10 -0
  251. synth_ai/learning/prompt_learning_client.py +276 -0
  252. synth_ai/learning/prompt_learning_types.py +184 -0
  253. synth_ai/learning/rl/client.py +3 -1
  254. synth_ai/pricing/__init__.py +2 -0
  255. synth_ai/pricing/model_pricing.py +57 -0
  256. synth_ai/streaming/__init__.py +29 -0
  257. synth_ai/streaming/config.py +94 -0
  258. synth_ai/streaming/handlers.py +518 -0
  259. synth_ai/streaming/streamer.py +320 -0
  260. synth_ai/streaming/types.py +95 -0
  261. synth_ai/task/apps/__init__.py +1 -0
  262. synth_ai/task/config.py +2 -0
  263. synth_ai/task/tracing_utils.py +25 -25
  264. synth_ai/task/validators.py +45 -9
  265. synth_ai/task_app_cfgs.py +21 -0
  266. synth_ai/tracing_v3/config.py +162 -19
  267. synth_ai/tracing_v3/constants.py +1 -1
  268. synth_ai/tracing_v3/db_config.py +24 -38
  269. synth_ai/tracing_v3/migration_helper.py +1 -2
  270. synth_ai/tracing_v3/storage/config.py +47 -13
  271. synth_ai/tracing_v3/storage/factory.py +3 -3
  272. synth_ai/tracing_v3/turso/daemon.py +113 -11
  273. synth_ai/tracing_v3/turso/native_manager.py +92 -16
  274. synth_ai/types.py +8 -0
  275. synth_ai/urls.py +11 -0
  276. synth_ai/utils/__init__.py +30 -1
  277. synth_ai/utils/agents.py +74 -0
  278. synth_ai/utils/bin.py +39 -0
  279. synth_ai/utils/cli.py +149 -5
  280. synth_ai/utils/env.py +40 -33
  281. synth_ai/utils/http.py +4 -1
  282. synth_ai/utils/json.py +72 -0
  283. synth_ai/utils/modal.py +285 -3
  284. synth_ai/utils/paths.py +48 -0
  285. synth_ai/utils/uvicorn.py +113 -0
  286. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/METADATA +109 -6
  287. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/RECORD +291 -142
  288. examples/qwen_vl/configs/eval_qwen2vl_vision.toml +0 -44
  289. synth_ai/cli/tui.py +0 -62
  290. synth_ai/tui/__init__.py +0 -5
  291. synth_ai/tui/__main__.py +0 -13
  292. synth_ai/tui/cli/__init__.py +0 -1
  293. synth_ai/tui/cli/query_experiments.py +0 -164
  294. synth_ai/tui/cli/query_experiments_v3.py +0 -164
  295. synth_ai/tui/dashboard.py +0 -911
  296. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
  297. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
  298. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
  299. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml
@@ -0,0 +1,53 @@
+ # MIPROv2 Prompt Learning for HotpotQA
+ # Local backend configuration targeting the HotpotQA task app.
+
+ [prompt_learning]
+ algorithm = "mipro"
+ task_app_url = "http://127.0.0.1:8110"
+ task_app_id = "hotpotqa"
+
+ # Seeds used during online optimisation
+ evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ # Held-out seeds for the final sweep
+ test_pool = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+
+ [prompt_learning.initial_prompt]
+ id = "hotpotqa_chain"
+ name = "HotpotQA Multi-Hop Reasoning"
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "system"
+ pattern = "You are a research assistant that answers multi-hop questions. Read the supporting passages carefully and articulate the final answer plus a short justification. Use the format:\nAnswer: ...\nSupport: ..."
+ order = 0
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Question: {question}\n\nPassages:\n{context}\n\nProvide the final answer and cite the supporting facts."
+ order = 1
+
+ [prompt_learning.initial_prompt.wildcards]
+ question = "REQUIRED"
+ context = "REQUIRED"
+
+ [prompt_learning.policy]
+ model = "openai/gpt-oss-20b"
+ provider = "groq"
+ inference_url = "https://api.groq.com/openai/v1"
+ temperature = 0.0
+ max_completion_tokens = 512
+ policy_name = "hotpotqa-mipro"
+
+ [prompt_learning.mipro]
+ env_name = "hotpotqa"
+ num_iterations = 20
+ num_evaluations_per_iteration = 8
+ batch_size = 8
+ max_concurrent = 16
+ meta_model = "gpt-4.1-mini"
+ meta_model_provider = "openai"
+ meta_model_inference_url = "https://api.openai.com/v1"
+ few_shot_score_threshold = 0.75
+ test_pool = [20, 21, 22, 23, 24]
+ bootstrap_train_seeds = [0, 1, 2, 3, 4]
+ online_pool = [5, 6, 7, 8, 9]
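
These prompt-learning files are plain TOML, so they can be inspected outside the package before a run. A minimal sketch (not part of synth-ai) that loads the MIPRO config above and prints a few key fields; it assumes Python 3.11+ for tomllib and that the file sits at the path shown in the changed-files list:

import tomllib
from pathlib import Path

# Assumed path, taken from the changed-files list above.
config_path = Path("examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml")

with config_path.open("rb") as fh:
    cfg = tomllib.load(fh)

pl = cfg["prompt_learning"]
print("algorithm:", pl["algorithm"])                 # "mipro"
print("task app:", pl["task_app_url"], pl["task_app_id"])
print("policy model:", pl["policy"]["model"])        # "openai/gpt-oss-20b"
print("meta model:", pl["mipro"]["meta_model"])      # "gpt-4.1-mini"
print("eval seeds:", pl["evaluation_seeds"])
print("held-out pool:", pl["test_pool"])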
examples/blog_posts/gepa/configs/hover_gepa_local.toml
@@ -0,0 +1,59 @@
+ # GEPA Prompt Learning for HoVer
+ # Local backend configuration targeting the HoVer task app.
+
+ [prompt_learning]
+ algorithm = "gepa"
+ task_app_url = "http://127.0.0.1:8112"
+ task_app_id = "hover"
+
+ # Seeds for online evaluation during optimisation
+ evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ # Held-out seeds for final reporting
+ test_pool = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+
+ [prompt_learning.initial_prompt]
+ id = "hover_verification"
+ name = "HoVer Claim Verification"
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "system"
+ pattern = "You verify Wikipedia claims. For each example decide whether the claim is SUPPORTED, REFUTED, or INSUFFICIENT and cite the sentences that justify the label."
+ order = 0
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Claim: {claim}\n\nEvidence:\n{evidence}\n\nRespond with the format:\nLabel: <SUPPORTED|REFUTED|INSUFFICIENT>\nRationale: <brief explanation citing evidence lines>."
+ order = 1
+
+ [prompt_learning.initial_prompt.wildcards]
+ claim = "REQUIRED"
+ evidence = "REQUIRED"
+
+ [prompt_learning.policy]
+ model = "openai/gpt-oss-20b"
+ provider = "groq"
+ inference_url = "https://api.groq.com/openai/v1"
+ temperature = 0.0
+ max_completion_tokens = 512
+ policy_name = "hover-gepa"
+
+ [prompt_learning.gepa]
+ env_name = "hover"
+ initial_population_size = 24
+ num_generations = 15
+ mutation_rate = 0.33
+ crossover_rate = 0.55
+ selection_pressure = 1.0
+ minibatch_size = 8
+ pareto_set_size = 24
+ feedback_fraction = 0.5
+ children_per_generation = 12
+ patience_generations = 5
+ rollout_budget = 540
+ archive_size = 36
+ pareto_eps = 1e-6
+ max_concurrent_rollouts = 20
+ mutation_llm_model = "openai/gpt-oss-20b"
+ mutation_llm_provider = "groq"
+ mutation_llm_inference_url = "https://api.groq.com/openai/v1"
examples/blog_posts/gepa/configs/hover_gepa_qwen.toml
@@ -0,0 +1,36 @@
+ [prompt_learning]
+ algorithm = "gepa"
+ task_app_url = "https://synth-laboratories-dev--synth-banking77-web-web.modal.run" # TODO: replace with HotpotQA task app URL
+ task_app_id = "hotpotqa"
+
+ # Seeds
+ evaluation_seeds = [0,1,2,3,4,5,6,7,8,9]
+
+ # Held-out validation
+ validation_seeds = [10,11,12,13,14,15,16,17,18,19]
+ validation_pool = "validation"
+ validation_top_k = 3
+
+ # Train split configuration
+ [prompt_learning.env_config]
+ pool = "train"
+
+ # Policy model (synth Qwen via backend inference proxy)
+ [prompt_learning.policy]
+ provider = "synth"
+ model = "Qwen/Qwen3-8B"
+ # inference_url will be mapped to backend /api/inference/v1 by the optimizer
+
+ # GEPA parameters (tune as needed)
+ [prompt_learning.gepa]
+ env_name = "hover"
+ initial_population_size = 24
+ num_generations = 6
+ children_per_generation = 12
+ minibatch_size = 10
+ pareto_set_size = 32
+ rollout_budget = 600
+ max_concurrent_rollouts = 16
+ mutation_llm_model = "openai/gpt-oss-120b"
+ mutation_llm_provider = "groq"
+ proposer_type = "dspy"
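
The "_local" and "_qwen" variants of the same benchmark share the [prompt_learning] layout but differ in policy and budget. A rough sketch (standard library only, not part of the package, file paths assumed from the changed-files list) that reports which GEPA keys differ between the two HoVer configs shown above:

import tomllib
from pathlib import Path

configs = Path("examples/blog_posts/gepa/configs")  # assumed location

def gepa_section(name: str) -> dict:
    # Return the [prompt_learning.gepa] table from one config file.
    with (configs / name).open("rb") as fh:
        return tomllib.load(fh)["prompt_learning"]["gepa"]

local = gepa_section("hover_gepa_local.toml")
qwen = gepa_section("hover_gepa_qwen.toml")

# Print every key whose value differs (or that only one file defines).
for key in sorted(local.keys() | qwen.keys()):
    if local.get(key) != qwen.get(key):
        print(f"{key}: local={local.get(key)!r} qwen={qwen.get(key)!r}")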
examples/blog_posts/gepa/configs/hover_mipro_local.toml
@@ -0,0 +1,53 @@
+ # MIPROv2 Prompt Learning for HoVer
+ # Local backend configuration targeting the HoVer task app.
+
+ [prompt_learning]
+ algorithm = "mipro"
+ task_app_url = "http://127.0.0.1:8112"
+ task_app_id = "hover"
+
+ # Seeds explored during optimisation
+ evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ # Held-out seeds used for the final sweep
+ test_pool = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+
+ [prompt_learning.initial_prompt]
+ id = "hover_verification"
+ name = "HoVer Claim Verification"
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "system"
+ pattern = "You are a fact-checking assistant. Review the evidence carefully and respond with SUPPORTED, REFUTED, or INSUFFICIENT along with a concise justification."
+ order = 0
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Claim: {claim}\n\nEvidence:\n{evidence}\n\nRespond with the format:\nLabel: <SUPPORTED|REFUTED|INSUFFICIENT>\nRationale: <brief explanation>."
+ order = 1
+
+ [prompt_learning.initial_prompt.wildcards]
+ claim = "REQUIRED"
+ evidence = "REQUIRED"
+
+ [prompt_learning.policy]
+ model = "openai/gpt-oss-20b"
+ provider = "groq"
+ inference_url = "https://api.groq.com/openai/v1"
+ temperature = 0.0
+ max_completion_tokens = 512
+ policy_name = "hover-mipro"
+
+ [prompt_learning.mipro]
+ env_name = "hover"
+ num_iterations = 20
+ num_evaluations_per_iteration = 6
+ batch_size = 6
+ max_concurrent = 16
+ meta_model = "gpt-4.1-mini"
+ meta_model_provider = "openai"
+ meta_model_inference_url = "https://api.openai.com/v1"
+ few_shot_score_threshold = 0.8
+ test_pool = [20, 21, 22, 23, 24]
+ bootstrap_train_seeds = [0, 1, 2, 3, 4]
+ online_pool = [5, 6, 7, 8, 9]
examples/blog_posts/gepa/configs/ifbench_gepa_local.toml
@@ -0,0 +1,59 @@
+ # GEPA Prompt Learning for IFBench
+ # Local backend configuration targeting the IFBench task app.
+
+ [prompt_learning]
+ algorithm = "gepa"
+ task_app_url = "http://127.0.0.1:8111"
+ task_app_id = "ifbench"
+
+ # Candidate evaluation seeds during optimisation
+ evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ # Held-out pool used for the final comparison sweep
+ test_pool = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+
+ [prompt_learning.initial_prompt]
+ id = "ifbench_following"
+ name = "IFBench Instruction Following"
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "system"
+ pattern = "You are an obedient assistant that must follow instructions exactly. Ensure that every requirement is satisfied, avoid unsolicited commentary, and be explicit when information is missing."
+ order = 0
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Instruction: {instruction}\n\nInput: {input}\n\nProvide the response that best follows the instruction."
+ order = 1
+
+ [prompt_learning.initial_prompt.wildcards]
+ instruction = "REQUIRED"
+ input = "OPTIONAL"
+
+ [prompt_learning.policy]
+ model = "openai/gpt-oss-20b"
+ provider = "groq"
+ inference_url = "https://api.groq.com/openai/v1"
+ temperature = 0.0
+ max_completion_tokens = 512
+ policy_name = "ifbench-gepa"
+
+ [prompt_learning.gepa]
+ env_name = "ifbench"
+ initial_population_size = 24
+ num_generations = 12
+ mutation_rate = 0.3
+ crossover_rate = 0.6
+ selection_pressure = 1.0
+ minibatch_size = 8
+ pareto_set_size = 24
+ feedback_fraction = 0.5
+ children_per_generation = 12
+ patience_generations = 4
+ rollout_budget = 480
+ archive_size = 32
+ pareto_eps = 1e-6
+ max_concurrent_rollouts = 20
+ mutation_llm_model = "openai/gpt-oss-20b"
+ mutation_llm_provider = "groq"
+ mutation_llm_inference_url = "https://api.groq.com/openai/v1"
examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml
@@ -0,0 +1,36 @@
+ [prompt_learning]
+ algorithm = "gepa"
+ task_app_url = "https://synth-laboratories-dev--synth-banking77-web-web.modal.run" # TODO: replace with HotpotQA task app URL
+ task_app_id = "hotpotqa"
+
+ # Seeds
+ evaluation_seeds = [0,1,2,3,4,5,6,7,8,9]
+
+ # Held-out validation
+ validation_seeds = [10,11,12,13,14,15,16,17,18,19]
+ validation_pool = "validation"
+ validation_top_k = 3
+
+ # Train split configuration
+ [prompt_learning.env_config]
+ pool = "train"
+
+ # Policy model (synth Qwen via backend inference proxy)
+ [prompt_learning.policy]
+ provider = "synth"
+ model = "Qwen/Qwen3-8B"
+ # inference_url will be mapped to backend /api/inference/v1 by the optimizer
+
+ # GEPA parameters (tune as needed)
+ [prompt_learning.gepa]
+ env_name = "ifbench"
+ initial_population_size = 24
+ num_generations = 6
+ children_per_generation = 12
+ minibatch_size = 10
+ pareto_set_size = 32
+ rollout_budget = 600
+ max_concurrent_rollouts = 16
+ mutation_llm_model = "openai/gpt-oss-120b"
+ mutation_llm_provider = "groq"
+ proposer_type = "dspy"
examples/blog_posts/gepa/configs/ifbench_mipro_local.toml
@@ -0,0 +1,53 @@
+ # MIPROv2 Prompt Learning for IFBench
+ # Local backend configuration targeting the IFBench task app.
+
+ [prompt_learning]
+ algorithm = "mipro"
+ task_app_url = "http://127.0.0.1:8111"
+ task_app_id = "ifbench"
+
+ # Seeds evaluated during optimisation
+ evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ # Held-out seeds for the final comparison
+ test_pool = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+
+ [prompt_learning.initial_prompt]
+ id = "ifbench_following"
+ name = "IFBench Instruction Following"
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "system"
+ pattern = "You are an obedient assistant that follows instructions exactly. Ensure that every constraint is satisfied and mention explicitly if something cannot be completed."
+ order = 0
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Instruction: {instruction}\n\nInput: {input}\n\nReturn the response that best follows the instruction."
+ order = 1
+
+ [prompt_learning.initial_prompt.wildcards]
+ instruction = "REQUIRED"
+ input = "OPTIONAL"
+
+ [prompt_learning.policy]
+ model = "openai/gpt-oss-20b"
+ provider = "groq"
+ inference_url = "https://api.groq.com/openai/v1"
+ temperature = 0.0
+ max_completion_tokens = 512
+ policy_name = "ifbench-mipro"
+
+ [prompt_learning.mipro]
+ env_name = "ifbench"
+ num_iterations = 16
+ num_evaluations_per_iteration = 6
+ batch_size = 6
+ max_concurrent = 16
+ meta_model = "gpt-4.1-mini"
+ meta_model_provider = "openai"
+ meta_model_inference_url = "https://api.openai.com/v1"
+ few_shot_score_threshold = 0.8
+ test_pool = [20, 21, 22, 23, 24]
+ bootstrap_train_seeds = [0, 1, 2, 3, 4]
+ online_pool = [5, 6, 7, 8, 9]
examples/blog_posts/gepa/configs/pupa_gepa_local.toml
@@ -0,0 +1,60 @@
+ # GEPA Prompt Learning for PUPA
+ # Local backend configuration targeting the PUPA privacy-aware delegation task app.
+
+ [prompt_learning]
+ algorithm = "gepa"
+ task_app_url = "http://127.0.0.1:8113"
+ task_app_id = "pupa"
+
+ # Seeds explored during optimisation
+ evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ # Held-out seeds for the final evaluation
+ test_pool = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+
+ [prompt_learning.initial_prompt]
+ id = "pupa_privacy"
+ name = "PUPA Privacy-Constrained Delegation"
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "system"
+ pattern = "You are an assistant that must complete the task while honouring every privacy rule. Never reveal disallowed fields, always justify decisions, and explicitly state when data cannot be shared."
+ order = 0
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Task: {task}\n\nPrivacy Policy: {policy}\n\nRecords:\n{records}\n\nProduce the delegated plan or response while respecting the privacy policy."
+ order = 1
+
+ [prompt_learning.initial_prompt.wildcards]
+ task = "REQUIRED"
+ policy = "REQUIRED"
+ records = "REQUIRED"
+
+ [prompt_learning.policy]
+ model = "openai/gpt-oss-20b"
+ provider = "groq"
+ inference_url = "https://api.groq.com/openai/v1"
+ temperature = 0.0
+ max_completion_tokens = 512
+ policy_name = "pupa-gepa"
+
+ [prompt_learning.gepa]
+ env_name = "pupa"
+ initial_population_size = 24
+ num_generations = 15
+ mutation_rate = 0.3
+ crossover_rate = 0.6
+ selection_pressure = 1.0
+ minibatch_size = 8
+ pareto_set_size = 24
+ feedback_fraction = 0.6
+ children_per_generation = 12
+ patience_generations = 5
+ rollout_budget = 540
+ archive_size = 36
+ pareto_eps = 1e-6
+ max_concurrent_rollouts = 20
+ mutation_llm_model = "openai/gpt-oss-20b"
+ mutation_llm_provider = "groq"
+ mutation_llm_inference_url = "https://api.groq.com/openai/v1"
examples/blog_posts/gepa/configs/pupa_mipro_local.toml
@@ -0,0 +1,54 @@
+ # MIPROv2 Prompt Learning for PUPA
+ # Local backend configuration targeting the PUPA privacy-aware delegation task app.
+
+ [prompt_learning]
+ algorithm = "mipro"
+ task_app_url = "http://127.0.0.1:8113"
+ task_app_id = "pupa"
+
+ # Seeds evaluated during optimisation
+ evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ # Held-out seeds for the final sweep
+ test_pool = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+
+ [prompt_learning.initial_prompt]
+ id = "pupa_privacy"
+ name = "PUPA Privacy-Constrained Delegation"
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "system"
+ pattern = "You are an assistant that must complete tasks without violating the privacy policy. Redact any forbidden attributes and justify refusals."
+ order = 0
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Task: {task}\n\nPrivacy Policy: {policy}\n\nRecords:\n{records}\n\nProvide the delegated plan or answer, ensuring compliance with the privacy policy."
+ order = 1
+
+ [prompt_learning.initial_prompt.wildcards]
+ task = "REQUIRED"
+ policy = "REQUIRED"
+ records = "REQUIRED"
+
+ [prompt_learning.policy]
+ model = "openai/gpt-oss-20b"
+ provider = "groq"
+ inference_url = "https://api.groq.com/openai/v1"
+ temperature = 0.0
+ max_completion_tokens = 512
+ policy_name = "pupa-mipro"
+
+ [prompt_learning.mipro]
+ env_name = "pupa"
+ num_iterations = 20
+ num_evaluations_per_iteration = 6
+ batch_size = 6
+ max_concurrent = 16
+ meta_model = "gpt-4.1-mini"
+ meta_model_provider = "openai"
+ meta_model_inference_url = "https://api.openai.com/v1"
+ few_shot_score_threshold = 0.85
+ test_pool = [20, 21, 22, 23, 24]
+ bootstrap_train_seeds = [0, 1, 2, 3, 4]
+ online_pool = [5, 6, 7, 8, 9]
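
Each of these configs uses {placeholders} in the message patterns and declares the same names under [prompt_learning.initial_prompt.wildcards]. A small sketch (standard library only, not from the package, paths assumed from the changed-files list) that cross-checks the two for the PUPA files shown above:

import tomllib
from pathlib import Path
from string import Formatter

def check_wildcards(path: Path) -> None:
    # Compare {placeholders} used in message patterns against declared wildcards.
    with path.open("rb") as fh:
        prompt = tomllib.load(fh)["prompt_learning"]["initial_prompt"]
    declared = set(prompt.get("wildcards", {}))
    used = {
        field
        for msg in prompt.get("messages", [])
        for _, field, _, _ in Formatter().parse(msg["pattern"])
        if field
    }
    missing = used - declared
    print(f"{path.name}: used={sorted(used)} declared={sorted(declared)} missing={sorted(missing) or 'none'}")

# Example over the local PUPA configs (assumed directory).
for name in ("pupa_gepa_local.toml", "pupa_mipro_local.toml"):
    check_wildcards(Path("examples/blog_posts/gepa/configs") / name)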
examples/blog_posts/gepa/deploy_banking77_task_app.sh
@@ -0,0 +1,41 @@
+ #!/bin/bash
+ # Deploy Banking77 task app locally for GEPA optimization
+
+ set -e
+
+ echo "🚀 Deploying Banking77 Task App..."
+ echo "=================================="
+
+ # Set up environment variables
+ export ENVIRONMENT_API_KEY="${ENVIRONMENT_API_KEY:-$(python -c 'import secrets; print(secrets.token_urlsafe(32))')}"
+ export GROQ_API_KEY="${GROQ_API_KEY}"
+
+ # Check for required env vars
+ if [ -z "$GROQ_API_KEY" ]; then
+ echo "❌ Error: GROQ_API_KEY not set"
+ echo "Please set it: export GROQ_API_KEY=your_key"
+ exit 1
+ fi
+
+ echo "✅ ENVIRONMENT_API_KEY: ${ENVIRONMENT_API_KEY:0:20}..."
+ echo "✅ GROQ_API_KEY: ${GROQ_API_KEY:0:20}..."
+
+ # Navigate to repo root
+ cd "$(dirname "$0")/../../.."
+
+ echo ""
+ echo "📦 Installing dependencies..."
+ uv pip install -e . --quiet || true
+
+ echo ""
+ echo "🏃 Starting Banking77 task app on http://127.0.0.1:8102"
+ echo "Press Ctrl+C to stop"
+ echo ""
+
+ # Run the task app
+ python -m examples.task_apps.banking77.banking77_task_app \
+ --host 0.0.0.0 \
+ --port 8102 \
+ --reload
+
+
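
The deploy script leaves the Banking77 task app listening on port 8102. Before pointing a GEPA run at it, a quick way to confirm the port is accepting connections is a plain socket probe; this sketch is not part of the package and assumes only that the app is bound locally on port 8102 as in the script above:

import socket

# Host and port used by deploy_banking77_task_app.sh above.
HOST, PORT = "127.0.0.1", 8102

try:
    # Attempt a TCP connection with a short timeout.
    with socket.create_connection((HOST, PORT), timeout=2.0):
        print(f"Task app is accepting connections on {HOST}:{PORT}")
except OSError as exc:
    print(f"Task app not reachable on {HOST}:{PORT}: {exc}")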