synth-ai 0.2.16__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of synth-ai might be problematic.

Files changed (299)
  1. examples/analyze_semantic_words.sh +2 -2
  2. examples/baseline/banking77_baseline.py +204 -0
  3. examples/baseline/crafter_baseline.py +407 -0
  4. examples/baseline/pokemon_red_baseline.py +326 -0
  5. examples/baseline/simple_baseline.py +56 -0
  6. examples/baseline/warming_up_to_rl_baseline.py +239 -0
  7. examples/blog_posts/gepa/README.md +355 -0
  8. examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
  9. examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
  10. examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
  11. examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
  12. examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
  13. examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
  14. examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
  15. examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
  16. examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
  17. examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
  18. examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
  19. examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
  20. examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
  21. examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
  22. examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
  23. examples/blog_posts/gepa/gepa_baseline.py +204 -0
  24. examples/blog_posts/gepa/query_prompts_example.py +97 -0
  25. examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
  26. examples/blog_posts/gepa/task_apps.py +105 -0
  27. examples/blog_posts/gepa/test_gepa_local.sh +67 -0
  28. examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
  29. examples/blog_posts/pokemon_vl/README.md +98 -0
  30. examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
  31. examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +27 -0
  32. examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
  33. examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
  34. examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +43 -0
  35. examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
  36. examples/blog_posts/pokemon_vl/extract_images.py +239 -0
  37. examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
  38. examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
  39. examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
  40. examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
  41. examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
  42. examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
  43. examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
  44. examples/blog_posts/warming_up_to_rl/README.md +158 -0
  45. examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
  46. examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
  47. examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
  48. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
  49. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
  50. examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
  51. examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
  52. examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
  53. examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
  54. examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +91 -0
  55. examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
  56. examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
  57. examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
  58. examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
  59. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
  60. examples/multi_step/configs/crafter_rl_outcome.toml +2 -1
  61. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +65 -107
  62. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +2 -1
  63. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +2 -1
  64. examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
  65. examples/multi_step/configs/verilog_rl_lora.toml +80 -123
  66. examples/qwen_coder/configs/coder_lora_30b.toml +1 -3
  67. examples/qwen_coder/configs/coder_lora_4b.toml +4 -1
  68. examples/qwen_coder/configs/coder_lora_small.toml +1 -3
  69. examples/qwen_vl/README.md +10 -12
  70. examples/qwen_vl/SETUP_COMPLETE.md +7 -8
  71. examples/qwen_vl/VISION_TESTS_COMPLETE.md +2 -3
  72. examples/qwen_vl/collect_data_via_cli.md +76 -84
  73. examples/qwen_vl/collect_vision_traces.py +4 -4
  74. examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +40 -57
  75. examples/qwen_vl/configs/crafter_vlm_sft_example.toml +1 -2
  76. examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +20 -37
  77. examples/qwen_vl/configs/eval_gpt5nano_vision.toml +21 -40
  78. examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
  79. examples/qwen_vl/configs/{filter_qwen2vl_sft.toml → filter_qwen3vl_sft.toml} +4 -5
  80. examples/qwen_vl/configs/filter_vision_sft.toml +2 -3
  81. examples/qwen_vl/crafter_qwen_vl_agent.py +5 -5
  82. examples/qwen_vl/run_vision_comparison.sh +6 -7
  83. examples/rl/README.md +5 -5
  84. examples/rl/configs/rl_from_base_qwen.toml +26 -1
  85. examples/rl/configs/rl_from_base_qwen17.toml +6 -2
  86. examples/rl/task_app/README.md +1 -2
  87. examples/rl/task_app/math_single_step.py +2 -2
  88. examples/run_crafter_demo.sh +2 -2
  89. examples/sft/README.md +1 -1
  90. examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -1
  91. examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -1
  92. examples/swe/task_app/README.md +32 -2
  93. examples/swe/task_app/grpo_swe_mini.py +4 -0
  94. examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
  95. examples/swe/task_app/hosted/envs/mini_swe/environment.py +37 -10
  96. examples/swe/task_app/hosted/inference/openai_client.py +4 -38
  97. examples/swe/task_app/hosted/policy_routes.py +17 -0
  98. examples/swe/task_app/hosted/rollout.py +4 -2
  99. examples/swe/task_app/morph_backend.py +178 -0
  100. examples/task_apps/banking77/__init__.py +6 -0
  101. examples/task_apps/banking77/banking77_task_app.py +841 -0
  102. examples/task_apps/banking77/deploy_wrapper.py +46 -0
  103. examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
  104. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
  105. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
  106. examples/task_apps/crafter/task_app/README.md +1 -1
  107. examples/task_apps/crafter/task_app/grpo_crafter.py +90 -5
  108. examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
  109. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +4 -26
  110. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
  111. examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
  112. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +372 -107
  113. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +81 -12
  114. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +82 -11
  115. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
  116. examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
  117. examples/task_apps/gepa_benchmarks/__init__.py +7 -0
  118. examples/task_apps/gepa_benchmarks/common.py +260 -0
  119. examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
  120. examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
  121. examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
  122. examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
  123. examples/task_apps/math/README.md +1 -2
  124. examples/task_apps/pokemon_red/README.md +3 -4
  125. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
  126. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
  127. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
  128. examples/task_apps/pokemon_red/task_app.py +288 -39
  129. examples/task_apps/sokoban/README.md +2 -3
  130. examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
  131. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
  132. examples/vlm/configs/crafter_vlm_gpt4o.toml +4 -1
  133. examples/warming_up_to_rl/configs/crafter_fft.toml +4 -1
  134. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -2
  135. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +3 -2
  136. examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
  137. examples/warming_up_to_rl/task_app/README.md +1 -1
  138. examples/warming_up_to_rl/task_app/grpo_crafter.py +185 -5
  139. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +1 -1
  140. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +3 -27
  141. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -1
  142. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
  143. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +156 -45
  144. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +37 -4
  145. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
  146. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
  147. examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
  148. examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +6 -0
  149. synth_ai/api/train/builders.py +99 -4
  150. synth_ai/api/train/cli.py +516 -26
  151. synth_ai/api/train/config_finder.py +13 -2
  152. synth_ai/api/train/configs/__init__.py +23 -2
  153. synth_ai/api/train/configs/prompt_learning.py +442 -0
  154. synth_ai/api/train/configs/rl.py +61 -7
  155. synth_ai/api/train/configs/sft.py +6 -2
  156. synth_ai/api/train/configs/shared.py +59 -2
  157. synth_ai/api/train/task_app.py +1 -1
  158. synth_ai/api/train/validators.py +277 -0
  159. synth_ai/auth/credentials.py +119 -0
  160. synth_ai/baseline/__init__.py +25 -0
  161. synth_ai/baseline/config.py +209 -0
  162. synth_ai/baseline/discovery.py +214 -0
  163. synth_ai/baseline/execution.py +146 -0
  164. synth_ai/cli/__init__.py +94 -18
  165. synth_ai/cli/__main__.py +0 -0
  166. synth_ai/cli/claude.py +70 -0
  167. synth_ai/cli/codex.py +84 -0
  168. synth_ai/cli/commands/__init__.py +18 -0
  169. synth_ai/cli/commands/baseline/__init__.py +12 -0
  170. synth_ai/cli/commands/baseline/core.py +637 -0
  171. synth_ai/cli/commands/baseline/list.py +93 -0
  172. synth_ai/cli/commands/demo/__init__.py +6 -0
  173. synth_ai/cli/commands/demo/core.py +163 -0
  174. synth_ai/cli/commands/eval/__init__.py +19 -0
  175. synth_ai/cli/commands/eval/core.py +1112 -0
  176. synth_ai/cli/commands/eval/errors.py +81 -0
  177. synth_ai/cli/commands/eval/validation.py +133 -0
  178. synth_ai/cli/commands/filter/__init__.py +12 -0
  179. synth_ai/cli/commands/filter/core.py +424 -0
  180. synth_ai/cli/commands/filter/errors.py +55 -0
  181. synth_ai/cli/commands/filter/validation.py +77 -0
  182. synth_ai/cli/commands/help/__init__.py +177 -0
  183. synth_ai/cli/commands/help/core.py +72 -0
  184. synth_ai/cli/commands/smoke/__init__.py +7 -0
  185. synth_ai/cli/commands/smoke/core.py +1436 -0
  186. synth_ai/cli/commands/status/__init__.py +64 -0
  187. synth_ai/cli/commands/status/client.py +192 -0
  188. synth_ai/cli/commands/status/config.py +92 -0
  189. synth_ai/cli/commands/status/errors.py +20 -0
  190. synth_ai/cli/commands/status/formatters.py +164 -0
  191. synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
  192. synth_ai/cli/commands/status/subcommands/files.py +79 -0
  193. synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
  194. synth_ai/cli/commands/status/subcommands/models.py +79 -0
  195. synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
  196. synth_ai/cli/commands/status/subcommands/runs.py +81 -0
  197. synth_ai/cli/commands/status/subcommands/summary.py +47 -0
  198. synth_ai/cli/commands/status/subcommands/usage.py +203 -0
  199. synth_ai/cli/commands/status/utils.py +114 -0
  200. synth_ai/cli/commands/train/__init__.py +53 -0
  201. synth_ai/cli/commands/train/core.py +21 -0
  202. synth_ai/cli/commands/train/errors.py +117 -0
  203. synth_ai/cli/commands/train/judge_schemas.py +200 -0
  204. synth_ai/cli/commands/train/judge_validation.py +305 -0
  205. synth_ai/cli/commands/train/validation.py +386 -0
  206. synth_ai/cli/demo.py +30 -158
  207. synth_ai/cli/deploy/__init__.py +43 -0
  208. synth_ai/cli/deploy.py +162 -0
  209. synth_ai/cli/eval/__init__.py +36 -0
  210. synth_ai/cli/eval/core.py +5 -0
  211. synth_ai/cli/eval/errors.py +31 -0
  212. synth_ai/cli/eval/validation.py +5 -0
  213. synth_ai/cli/filter/__init__.py +28 -0
  214. synth_ai/cli/filter/core.py +5 -0
  215. synth_ai/cli/filter/errors.py +23 -0
  216. synth_ai/cli/filter/validation.py +5 -0
  217. synth_ai/cli/legacy_root_backup.py +14 -8
  218. synth_ai/cli/modal_serve/__init__.py +12 -0
  219. synth_ai/cli/modal_serve/core.py +14 -0
  220. synth_ai/cli/modal_serve/errors.py +8 -0
  221. synth_ai/cli/modal_serve/validation.py +11 -0
  222. synth_ai/cli/opencode.py +107 -0
  223. synth_ai/cli/root.py +9 -5
  224. synth_ai/cli/serve/__init__.py +12 -0
  225. synth_ai/cli/serve/core.py +14 -0
  226. synth_ai/cli/serve/errors.py +8 -0
  227. synth_ai/cli/serve/validation.py +11 -0
  228. synth_ai/cli/setup.py +20 -265
  229. synth_ai/cli/status.py +7 -126
  230. synth_ai/cli/task_app_deploy.py +1 -10
  231. synth_ai/cli/task_app_modal_serve.py +4 -9
  232. synth_ai/cli/task_app_serve.py +4 -11
  233. synth_ai/cli/task_apps.py +51 -1480
  234. synth_ai/cli/train/__init__.py +12 -0
  235. synth_ai/cli/train/core.py +21 -0
  236. synth_ai/cli/train/errors.py +8 -0
  237. synth_ai/cli/train/validation.py +24 -0
  238. synth_ai/cli/train.py +1 -14
  239. synth_ai/demos/crafter/grpo_crafter_task_app.py +1 -1
  240. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
  241. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
  242. synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
  243. synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
  244. synth_ai/environments/examples/red/engine.py +33 -12
  245. synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
  246. synth_ai/environments/examples/red/environment.py +26 -0
  247. synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
  248. synth_ai/http.py +12 -0
  249. synth_ai/judge_schemas.py +10 -10
  250. synth_ai/learning/__init__.py +10 -0
  251. synth_ai/learning/prompt_learning_client.py +276 -0
  252. synth_ai/learning/prompt_learning_types.py +184 -0
  253. synth_ai/learning/rl/client.py +3 -1
  254. synth_ai/pricing/__init__.py +2 -0
  255. synth_ai/pricing/model_pricing.py +57 -0
  256. synth_ai/streaming/__init__.py +29 -0
  257. synth_ai/streaming/config.py +94 -0
  258. synth_ai/streaming/handlers.py +518 -0
  259. synth_ai/streaming/streamer.py +320 -0
  260. synth_ai/streaming/types.py +95 -0
  261. synth_ai/task/apps/__init__.py +1 -0
  262. synth_ai/task/config.py +2 -0
  263. synth_ai/task/tracing_utils.py +25 -25
  264. synth_ai/task/validators.py +45 -9
  265. synth_ai/task_app_cfgs.py +21 -0
  266. synth_ai/tracing_v3/config.py +162 -19
  267. synth_ai/tracing_v3/constants.py +1 -1
  268. synth_ai/tracing_v3/db_config.py +24 -38
  269. synth_ai/tracing_v3/migration_helper.py +1 -2
  270. synth_ai/tracing_v3/storage/config.py +47 -13
  271. synth_ai/tracing_v3/storage/factory.py +3 -3
  272. synth_ai/tracing_v3/turso/daemon.py +113 -11
  273. synth_ai/tracing_v3/turso/native_manager.py +92 -16
  274. synth_ai/types.py +8 -0
  275. synth_ai/urls.py +11 -0
  276. synth_ai/utils/__init__.py +30 -1
  277. synth_ai/utils/agents.py +74 -0
  278. synth_ai/utils/bin.py +39 -0
  279. synth_ai/utils/cli.py +149 -5
  280. synth_ai/utils/env.py +40 -33
  281. synth_ai/utils/http.py +4 -1
  282. synth_ai/utils/json.py +72 -0
  283. synth_ai/utils/modal.py +285 -3
  284. synth_ai/utils/paths.py +48 -0
  285. synth_ai/utils/uvicorn.py +113 -0
  286. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/METADATA +109 -6
  287. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/RECORD +291 -142
  288. examples/qwen_vl/configs/eval_qwen2vl_vision.toml +0 -44
  289. synth_ai/cli/tui.py +0 -62
  290. synth_ai/tui/__init__.py +0 -5
  291. synth_ai/tui/__main__.py +0 -13
  292. synth_ai/tui/cli/__init__.py +0 -1
  293. synth_ai/tui/cli/query_experiments.py +0 -164
  294. synth_ai/tui/cli/query_experiments_v3.py +0 -164
  295. synth_ai/tui/dashboard.py +0 -911
  296. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
  297. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
  298. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
  299. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,355 @@
+ # GEPA: Genetic Evolution for Prompt Optimization
+
+ This directory contains examples and configurations for using GEPA (Genetic Evolution for Prompt Optimization) to optimize prompts for various classification and reasoning tasks.
+
+ ## Overview
+
+ **GEPA** is an evolutionary algorithm that optimizes prompts through genetic operations (mutation, crossover, selection) across multiple generations. It's particularly effective for:
+ - Intent classification (Banking77)
+ - Multi-hop QA (HotpotQA)
+ - Instruction following (IFBench)
+ - Claim verification (HoVer)
+ - Privacy-aware delegation (PUPA)
+
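+ The loop below is only a conceptual sketch of that evolutionary process (all names are illustrative; it is not the synth-ai backend implementation), showing how population, selection, crossover, mutation, and a rollout budget fit together:
+
+ ```python
+ import random
+
+ def gepa_sketch(seed_prompt, evaluate, mutate, crossover,
+                 population_size=20, generations=15, rollout_budget=1000):
+     """Toy GEPA-style loop: evolve prompt variants under a rollout budget."""
+     population = [seed_prompt] + [mutate(seed_prompt) for _ in range(population_size - 1)]
+     best_prompt, best_score, rollouts = seed_prompt, float("-inf"), 0
+     for _ in range(generations):
+         scored = []
+         for prompt in population:
+             if rollouts >= rollout_budget:
+                 return best_prompt, best_score
+             score = evaluate(prompt)  # e.g. accuracy over a minibatch of seeds
+             rollouts += 1
+             scored.append((score, prompt))
+             if score > best_score:
+                 best_prompt, best_score = prompt, score
+         scored.sort(key=lambda pair: pair[0], reverse=True)
+         parents = [p for _, p in scored[: max(2, len(scored) // 2)]]  # selection
+         children = []
+         while len(parents) >= 2 and len(children) < population_size - len(parents):
+             a, b = random.sample(parents, 2)
+             child = crossover(a, b) if random.random() < 0.5 else a   # crossover
+             if random.random() < 0.3:                                 # mutation
+                 child = mutate(child)
+             children.append(child)
+         population = parents + children
+     return best_prompt, best_score
+ ```
+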
+ ## Supported Tasks
+
+ Configuration files live under `configs/`:
+
+ | Task | Description | Config Files |
+ |------|-------------|--------------|
+ | **Banking77** | Intent classification (77 banking intents) | `banking77_gepa_local.toml`, `banking77_mipro_local.toml` |
+ | **HotpotQA** | Multi-hop question answering | `hotpotqa_gepa_local.toml`, `hotpotqa_mipro_local.toml` |
+ | **IFBench** | Instruction following benchmark | `ifbench_gepa_local.toml`, `ifbench_mipro_local.toml` |
+ | **HoVer** | Claim verification against Wikipedia | `hover_gepa_local.toml`, `hover_mipro_local.toml` |
+ | **PUPA** | Privacy-aware task delegation | `pupa_gepa_local.toml`, `pupa_mipro_local.toml` |
+
+ Each template targets a different default port (8110–8113) so you can run multiple task apps side-by-side.
+
+ ---
+
+ ## Quick Start (Banking77 Example)
+
+ ### Prerequisites
+
+ ```bash
+ # 1. Install dependencies
+ uv pip install -e .
+
+ # 2. Set environment variables
+ export SYNTH_API_KEY="your-backend-api-key"
+ export GROQ_API_KEY="gsk_your_groq_key"
+ export ENVIRONMENT_API_KEY="$(python -c 'import secrets; print(secrets.token_urlsafe(32))')"
+ ```
+
+ **Where to get API keys:**
+ - **GROQ_API_KEY**: Get from https://console.groq.com/keys
+ - **SYNTH_API_KEY**: Get from your backend admin or `.env.dev` file
+ - **ENVIRONMENT_API_KEY**: Generate a random secure token (command above)
+
+ ### Step 1: Start the Backend
+
+ ```bash
+ # Make sure your backend is running
+ curl http://localhost:8000/api/health
+ # Should return: {"status":"ok"}
+ ```
+
+ ### Step 2: Deploy Task App
+
+ **Option A: Using helper script (recommended)**
+ ```bash
+ # Terminal 1
+ ./examples/blog_posts/gepa/deploy_banking77_task_app.sh
+ ```
+
+ **Option B: Using CLI**
+ ```bash
+ uvx synth-ai deploy banking77 --runtime uvicorn --port 8102
+ ```
+
+ **Option C: Deploy to Modal**
+ ```bash
+ uvx synth-ai deploy banking77 --runtime modal --name banking77-gepa --env-file .env
+ ```
+
+ ### Step 3: Run GEPA Optimization
+
+ **Option A: Using helper script (recommended)**
+ ```bash
+ # Terminal 2
+ ./examples/blog_posts/gepa/run_gepa_banking77.sh
+ ```
+
+ **Option B: Using CLI directly**
+ ```bash
+ uvx synth-ai train \
+   --config examples/blog_posts/gepa/configs/banking77_gepa_local.toml \
+   --backend http://localhost:8000 \
+   --poll
+ ```
+
+ ### Step 4: Monitor Progress
+
+ You'll see real-time output like:
+ ```
+ 🧬 Running GEPA on Banking77
+ =============================
+ ✅ Backend URL: http://localhost:8000
+ ✅ Task app is healthy
+
+ 🚀 Starting GEPA training...
+
+ proposal[0] train_accuracy=0.65 len=120 tool_rate=0.95 N=30
+ 🔄 TRANSFORMATION:
+ [SYSTEM]: Classify customer banking queries into intents...
+
+ Generation 1/15: Best reward=0.75 (75% accuracy)
+ Generation 2/15: Best reward=0.82 (82% accuracy)
+ ...
+ ✅ GEPA training complete!
+ ```
+
+ Results are automatically saved to `configs/results/gepa_results_<job_id>_<timestamp>.txt`.
+
+ ---
+
+ ## Configuration
+
+ ### Example: Banking77 GEPA Configuration
+
+ ```toml
+ [prompt_learning]
+ algorithm = "gepa"
+ task_app_url = "http://127.0.0.1:8102"
+ task_app_id = "banking77"
+
+ # Training seeds (30 seeds from train pool)
+ evaluation_seeds = [50, 51, 52, ..., 79]
+
+ # Validation seeds (50 seeds from validation pool - not in training)
+ validation_seeds = [0, 1, 2, ..., 49]
+
+ [prompt_learning.gepa]
+ initial_population_size = 20   # Starting population of prompts
+ num_generations = 15           # Number of evolutionary cycles
+ mutation_rate = 0.3            # Probability of mutation
+ crossover_rate = 0.5           # Probability of crossover
+ rollout_budget = 1000          # Total rollouts across all generations
+ max_concurrent_rollouts = 20   # Parallel rollout limit
+ pareto_set_size = 20           # Size of Pareto front
+ ```
+
+ ### Key Parameters
+
+ | Parameter | Description | Typical Range |
+ |-----------|-------------|---------------|
+ | `initial_population_size` | Starting number of prompt variants | 10-50 |
+ | `num_generations` | Evolutionary cycles to run | 5-30 |
+ | `mutation_rate` | Probability of mutating a prompt | 0.1-0.5 |
+ | `crossover_rate` | Probability of combining two prompts | 0.3-0.7 |
+ | `rollout_budget` | Total task evaluations allowed | 200-2000 |
+ | `max_concurrent_rollouts` | Parallel rollout limit | 10-50 |
+ | `pareto_set_size` | Multi-objective optimization frontier size | 10-30 |
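+
+ A quick way to sanity-check these values together: under the rough assumption that every candidate prompt is scored on one minibatch of seeds per generation, the implied number of task evaluations should stay near `rollout_budget` (the exact accounting is defined by the backend, so treat this only as a back-of-envelope estimate):
+
+ ```python
+ # Back-of-envelope rollout estimate (assumption: one minibatch per candidate per generation).
+ num_generations = 15
+ candidates_per_generation = 20   # e.g. initial_population_size
+ minibatch_size = 10              # seeds scored per candidate (see minibatch_size in configs/)
+ rollout_budget = 1000
+
+ estimated = num_generations * candidates_per_generation * minibatch_size
+ print(f"estimated evaluations: {estimated}, budget: {rollout_budget}")
+ # If the estimate far exceeds the budget, the run stops early once the budget
+ # is exhausted and later generations never execute.
+ ```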
+
+ ---
+
+ ## Querying Results
+
+ After GEPA completes, you can query job results programmatically:
+
+ ### Python API
+
+ ```python
+ from synth_ai.learning import get_prompts, get_prompt_text, get_scoring_summary
+
+ # Get all results
+ results = get_prompts(
+     job_id="pl_abc123",
+     base_url="http://localhost:8000",
+     api_key="sk_..."
+ )
+
+ # Access best prompt
+ best_prompt = results["best_prompt"]
+ best_score = results["best_score"]
+ print(f"Best Score: {best_score:.3f}")
+
+ # Get top-K prompts
+ for prompt_info in results["top_prompts"]:
+     print(f"Rank {prompt_info['rank']}: {prompt_info['train_accuracy']:.3f}")
+     print(prompt_info["full_text"])
+
+ # Quick access to best prompt text only
+ best_text = get_prompt_text(
+     job_id="pl_abc123",
+     base_url="http://localhost:8000",
+     api_key="sk_...",
+     rank=1  # 1 = best, 2 = second best, etc.
+ )
+
+ # Get scoring statistics
+ summary = get_scoring_summary(
+     job_id="pl_abc123",
+     base_url="http://localhost:8000",
+     api_key="sk_..."
+ )
+ print(f"Best: {summary['best_train_accuracy']:.3f}")
+ print(f"Mean: {summary['mean_train_accuracy']:.3f}")
+ print(f"Tried: {summary['num_candidates_tried']}")
+ ```
+
+ ### Command Line
+
+ ```bash
+ # Set environment variables
+ export BACKEND_BASE_URL="http://localhost:8000"
+ export SYNTH_API_KEY="sk_..."
+
+ # Run the example script
+ python examples/blog_posts/gepa/query_prompts_example.py pl_abc123
+ ```
+
+ ### REST API
+
+ ```bash
+ # Get job status
+ curl -H "Authorization: Bearer $SYNTH_API_KEY" \
+   http://localhost:8000/api/prompt-learning/online/jobs/JOB_ID
+
+ # Stream events
+ curl -H "Authorization: Bearer $SYNTH_API_KEY" \
+   http://localhost:8000/api/prompt-learning/online/jobs/JOB_ID/events/stream
+
+ # Get metrics
+ curl -H "Authorization: Bearer $SYNTH_API_KEY" \
+   http://localhost:8000/api/prompt-learning/online/jobs/JOB_ID/metrics
+ ```
+
+ ---
+
+ ## Expected Results
+
+ GEPA typically improves accuracy over generations:
+
+ | Generation | Typical Accuracy | Notes |
+ |------------|------------------|-------|
+ | 1 (baseline) | 60-75% | Initial random/baseline prompts |
+ | 5 | 75-80% | Early optimization gains |
+ | 10 | 80-85% | Convergence begins |
+ | 15 (final) | 85-90%+ | Optimized prompts on Pareto front |
+
+ The Pareto front contains multiple prompt variants balancing:
+ - **Accuracy** (primary objective)
+ - **Token count** (efficiency objective)
+ - **Tool call rate** (task-specific objective)
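+
+ A candidate stays on the Pareto front only if no other candidate is at least as good on every objective and strictly better on at least one. A minimal, self-contained illustration (the objective tuples below are made up for this example):
+
+ ```python
+ # Minimal Pareto-front filter; every objective is oriented so that higher is better.
+ def dominates(a, b):
+     """a dominates b if a >= b on every objective and > b on at least one."""
+     return all(x >= y for x, y in zip(a, b)) and any(x > y for x, y in zip(a, b))
+
+ def pareto_front(candidates):
+     """candidates: dict of name -> tuple of objective values."""
+     return {
+         name: objs
+         for name, objs in candidates.items()
+         if not any(dominates(other, objs)
+                    for other_name, other in candidates.items()
+                    if other_name != name)
+     }
+
+ # Example objectives: accuracy, negated token count (shorter is better), tool-call rate.
+ prompts = {
+     "A": (0.86, -140, 0.97),
+     "B": (0.84, -90, 0.98),
+     "C": (0.80, -150, 0.90),  # dominated by A
+ }
+ print(pareto_front(prompts))  # keeps A and B, drops C
+ ```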
+
+ ---
+
+ ## Helper Scripts
+
+ | Script | Purpose |
+ |--------|---------|
+ | `deploy_banking77_task_app.sh` | Start Banking77 task app locally |
+ | `run_gepa_banking77.sh` | Run GEPA optimization with validation checks |
+ | `test_gepa_local.sh` | Quick test script for local setup |
+ | `verify_banking77_setup.sh` | Comprehensive setup verification |
+ | `query_prompts_example.py` | Example script for querying results |
+
+ ---
+
+ ## Troubleshooting
+
+ ### ❌ "Banking77 task app is not running"
+
+ **Solution:** Start the task app first
+ ```bash
+ ./examples/blog_posts/gepa/deploy_banking77_task_app.sh
+ ```
+
+ ### ❌ "Cannot connect to backend"
+
+ **Solution:** Verify backend is running
+ ```bash
+ curl http://localhost:8000/api/health
+ ```
+
+ If not running, start your backend service.
+
+ ### ❌ "GROQ_API_KEY environment variable is required"
+
+ **Solution:** Export your Groq API key
+ ```bash
+ export GROQ_API_KEY="gsk_your_key_here"
+ ```
+
+ ### ❌ "Failed to download dataset"
+
+ **Solution:** Check internet connection. The task app downloads from Hugging Face.
+
+ If you have the dataset locally:
+ ```bash
+ export BANKING77_DATASET_NAME="/path/to/local/banking77"
+ ```
+
+ ### ❌ Pattern validation failed
+
+ **Solution:** Ensure your config's `initial_prompt.messages` uses the `{query}` wildcard:
+ ```toml
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Customer Query: {query}\n\nClassify this query."
+ ```
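+
+ At runtime the task app supplies the wildcard values, so the pattern behaves roughly like a `str.format` template (illustration only; the actual substitution happens inside the optimizer and task app):
+
+ ```python
+ # Illustration of pattern-based prompts: the {query} wildcard is filled per rollout.
+ pattern = "Customer Query: {query}\n\nClassify this query."
+ print(pattern.format(query="How do I activate my new card?"))
+ # Customer Query: How do I activate my new card?
+ #
+ # Classify this query.
+ ```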
+
+ ### ⚠️ Metrics not streaming
+
+ **Solution:**
+ 1. Verify backend `/metrics` endpoint exists
+ 2. Check SDK `StreamConfig` enables `StreamType.METRICS`
+ 3. Restart local backend to pick up latest code
+
+ ---
+
+ ## Files in This Directory
+
+ ```
+ examples/blog_posts/gepa/
+ ├── README.md                        # This file - comprehensive guide
+ ├── configs/                         # Configuration files
+ │   ├── banking77_gepa_local.toml    # Banking77 GEPA config
+ │   ├── banking77_mipro_local.toml   # Banking77 MIPRO config
+ │   ├── hotpotqa_gepa_local.toml     # HotpotQA configs
+ │   ├── ifbench_gepa_local.toml      # IFBench configs
+ │   ├── hover_gepa_local.toml        # HoVer configs
+ │   └── pupa_gepa_local.toml         # PUPA configs
+ ├── deploy_banking77_task_app.sh     # Helper: Start task app
+ ├── run_gepa_banking77.sh            # Helper: Run GEPA
+ ├── test_gepa_local.sh               # Helper: Quick test
+ ├── verify_banking77_setup.sh        # Helper: Verify setup
+ ├── (baseline: examples/baseline/banking77_baseline.py)
+ ├── query_prompts_example.py         # Query results example
+ └── task_apps.py                     # Task app registry
+ ```
+
+ ---
+
+ ## Next Steps
+
+ 1. **Evaluate optimized prompts**: Test best prompts on held-out validation split
+ 2. **Compare with baseline**: Run `uvx synth-ai baseline banking77` to measure improvement
+ 3. **Experiment with parameters**: Adjust mutation/crossover rates, population size
+ 4. **Try MIPRO**: Compare GEPA with MIPROv2 optimization
+ 5. **Benchmark across tasks**: Test on HotpotQA, IFBench, HoVer, PUPA
+
+ ---
+
+ ## Support
+
+ For issues or questions:
+
+ 1. Verify all API keys are set correctly
+ 2. Check task app: `curl -H "X-API-Key: $ENVIRONMENT_API_KEY" http://127.0.0.1:8102/health`
+ 3. Check backend: `curl http://localhost:8000/api/health`
+ 4. Review logs in both terminals for error messages
+ 5. Run verification script: `./verify_banking77_setup.sh`
+
+ Happy optimizing! 🧬🚀
@@ -0,0 +1,95 @@
+ [prompt_learning]
+ algorithm = "gepa"
+ task_app_url = "https://synth-laboratories-dev--synth-banking77-web-web.modal.run"
+ task_app_id = "banking77"
+
+ # Initial prompt pattern (pattern-based mode)
+ [prompt_learning.initial_prompt]
+ id = "banking77_pattern"
+ name = "Banking77 Classification Pattern"
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "system"
+ pattern = "You are an expert banking assistant that classifies customer queries into banking intents. Given a customer message, respond with exactly one intent label from the provided list using the `banking77_classify` tool."
+ order = 0
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Customer Query: {query}\n\nClassify this query into one of the banking intents using the tool call."
+ order = 1
+
+ [prompt_learning.initial_prompt.wildcards]
+ query = "REQUIRED" # Will be provided by task app at runtime
+
+ # Policy configuration (model, provider, etc.)
+ [prompt_learning.policy]
+ inference_mode = "synth_hosted"
+ model = "openai/gpt-oss-20b"
+ provider = "groq"
+ inference_url = "https://api.groq.com/openai/v1"
+ temperature = 0.0
+ max_completion_tokens = 512
+ policy_name = "banking77-classifier" # Required for Banking77 task app
+
+ # Training split config
+ [prompt_learning.env_config]
+ pool = "train"
+
+ # GEPA-specific configuration with nested subsections (mirrors RL structure)
+ [prompt_learning.gepa]
+ env_name = "banking77"
+ proposer_type = "dspy"
+
+ # Rollout configuration (mirrors RL [rollout] section)
+ [prompt_learning.gepa.rollout]
+ budget = 1000
+ max_concurrent = 20
+ minibatch_size = 10
+
+ # Evaluation configuration (mirrors RL [evaluation] section)
+ [prompt_learning.gepa.evaluation]
+ seeds = [
+     50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+     60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+     70, 71, 72, 73, 74, 75, 76, 77, 78, 79
+ ] # Training seeds (30 seeds from train pool)
+ validation_seeds = [
+     0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+     10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+     20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+     30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+     40, 41, 42, 43, 44, 45, 46, 47, 48, 49
+ ] # Held-out validation seeds (50 seeds from validation pool - not in training)
+ validation_pool = "validation"
+ validation_top_k = 3
+ test_pool = [2, 3] # Test pool for final evaluation (small held-out set)
+
+ # Mutation configuration (LLM-guided mutation settings)
+ [prompt_learning.gepa.mutation]
+ rate = 0.3
+ llm_model = "openai/gpt-oss-120b"
+ llm_provider = "groq"
+ llm_inference_url = "https://api.groq.com/openai/v1"
+
+ # Population configuration (evolution parameters)
+ [prompt_learning.gepa.population]
+ initial_size = 10
+ num_generations = 3
+ children_per_generation = 12
+ crossover_rate = 0.5
+ selection_pressure = 1.0
+ patience_generations = 3
+
+ # Archive configuration (Pareto archive settings)
+ [prompt_learning.gepa.archive]
+ size = 40
+ pareto_set_size = 32
+ pareto_eps = 1e-6
+ feedback_fraction = 0.5
+
+ # Token and budget configuration
+ [prompt_learning.gepa.token]
+ # max_limit = 1000 # Uncomment to set a token limit
+ counting_model = "gpt-4"
+ enforce_pattern_limit = true
+ # max_spend_usd = 100.0 # Uncomment to set a budget cap
@@ -0,0 +1,82 @@
+ # GEPA Prompt Learning for Banking77
+ # Local backend configuration (localhost:8000)
+
+ [prompt_learning]
+ algorithm = "gepa"
+ task_app_url = "https://synth-laboratories-dev--synth-banking77-web-web.modal.run"
+ task_app_id = "banking77"
+ evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
+
+ # Held-out validation config
+ validation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+ validation_pool = "validation"
+ validation_top_k = 3
+
+ # Training split config
+ [prompt_learning.env_config]
+ pool = "train"
+
+ # Seeds for evaluation (increase to score prompts with more rollouts)
+ evaluation_seeds = [
+     0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+     10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+     20, 21, 22, 23, 24, 25, 26, 27, 28, 29
+ ]
+
+ # Test pool for final evaluation (held-out episodes)
+ test_pool = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+
+ # Initial prompt pattern (pattern-based mode)
+ [prompt_learning.initial_prompt]
+ id = "banking77_pattern"
+ name = "Banking77 Classification Pattern"
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "system"
+ pattern = "You are an expert banking assistant that classifies customer queries into banking intents. Given a customer message, respond with exactly one intent label from the provided list using the `banking77_classify` tool."
+ order = 0
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Customer Query: {query}\n\nClassify this query into one of the banking intents using the tool call."
+ order = 1
+
+ [prompt_learning.initial_prompt.wildcards]
+ query = "REQUIRED" # Will be provided by task app at runtime
+
+ # Policy configuration (model, provider, etc.)
+ [prompt_learning.policy]
+ inference_mode = "synth_hosted"
+ model = "openai/gpt-oss-120b"
+ provider = "groq"
+ inference_url = "https://api.groq.com/openai/v1"
+ temperature = 0.0
+ max_completion_tokens = 512
+ policy_name = "banking77-classifier" # Required for Banking77 task app
+
+ # GEPA-specific configuration
+ [prompt_learning.gepa]
+ env_name = "banking77"
+ initial_population_size = 40
+ num_generations = 10
+ mutation_rate = 0.3
+ crossover_rate = 0.5
+ selection_pressure = 1.0
+ minibatch_size = 12
+ pareto_set_size = 40
+ feedback_fraction = 0.5
+ children_per_generation = 16
+ patience_generations = 5
+ rollout_budget = 1500
+ archive_size = 30
+ pareto_eps = 1e-6
+ max_concurrent_rollouts = 20 # Maximum concurrent rollouts across all transformations
+
+ # Instruction proposer selection
+ proposer_type = "dspy"
+
+ # LLM-guided mutation configuration
+ mutation_llm_model = "openai/gpt-oss-20b"
+ mutation_llm_provider = "groq"
+ mutation_llm_inference_url = "https://api.groq.com/openai/v1"
+
@@ -0,0 +1,52 @@
+ # MIPROv2 Prompt Learning for Banking77
+ # Local backend configuration targeting the Banking77 intent classification task app.
+
+ [prompt_learning]
+ algorithm = "mipro"
+ task_app_url = "http://127.0.0.1:8102"
+ task_app_id = "banking77"
+
+ # Seeds evaluated during optimisation
+ evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ # Held-out seeds for final scoring
+ test_pool = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+
+ [prompt_learning.initial_prompt]
+ id = "banking77_pattern"
+ name = "Banking77 Classification Pattern"
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "system"
+ pattern = "You are an expert banking assistant that classifies customer queries into banking intents. Return only the intent label using the `banking77_classify` tool."
+ order = 0
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Customer Query: {query}\n\nClassify this query into one of the banking intents using the tool call."
+ order = 1
+
+ [prompt_learning.initial_prompt.wildcards]
+ query = "REQUIRED"
+
+ [prompt_learning.policy]
+ model = "openai/gpt-oss-20b"
+ provider = "groq"
+ inference_url = "https://api.groq.com/openai/v1"
+ temperature = 0.0
+ max_completion_tokens = 128
+ policy_name = "banking77-mipro"
+
+ [prompt_learning.mipro]
+ env_name = "banking77"
+ num_iterations = 16
+ num_evaluations_per_iteration = 6
+ batch_size = 6
+ max_concurrent = 16
+ meta_model = "gpt-4.1-mini"
+ meta_model_provider = "openai"
+ meta_model_inference_url = "https://api.openai.com/v1"
+ few_shot_score_threshold = 0.85
+ test_pool = [20, 21, 22, 23, 24]
+ bootstrap_train_seeds = [0, 1, 2, 3, 4]
+ online_pool = [5, 6, 7, 8, 9]
@@ -0,0 +1,59 @@
+ # GEPA Prompt Learning for HotpotQA
+ # Local backend configuration targeting the HotpotQA task app.
+
+ [prompt_learning]
+ algorithm = "gepa"
+ task_app_url = "http://127.0.0.1:8110"
+ task_app_id = "hotpotqa"
+
+ # Seeds for online evaluation (episode IDs)
+ evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ # Held-out pool used for final evaluation
+ test_pool = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+
+ [prompt_learning.initial_prompt]
+ id = "hotpotqa_chain"
+ name = "HotpotQA Multi-Hop Reasoning"
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "system"
+ pattern = "You are a research assistant that answers multi-hop questions. Use the provided supporting passages to reason out the final answer. Reply with the format:\nAnswer: <short answer>\nSupport: <brief justification referencing the passages>."
+ order = 0
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Question: {question}\n\nPassages:\n{context}\n\nProvide the final answer and cite the relevant supporting facts."
+ order = 1
+
+ [prompt_learning.initial_prompt.wildcards]
+ question = "REQUIRED"
+ context = "REQUIRED"
+
+ [prompt_learning.policy]
+ model = "openai/gpt-oss-20b"
+ provider = "groq"
+ inference_url = "https://api.groq.com/openai/v1"
+ temperature = 0.0
+ max_completion_tokens = 512
+ policy_name = "hotpotqa-gepa"
+
+ [prompt_learning.gepa]
+ env_name = "hotpotqa"
+ initial_population_size = 24
+ num_generations = 15
+ mutation_rate = 0.35
+ crossover_rate = 0.55
+ selection_pressure = 1.0
+ minibatch_size = 8
+ pareto_set_size = 24
+ feedback_fraction = 0.5
+ children_per_generation = 12
+ patience_generations = 5
+ rollout_budget = 600
+ archive_size = 36
+ pareto_eps = 1e-6
+ max_concurrent_rollouts = 24
+ mutation_llm_model = "openai/gpt-oss-20b"
+ mutation_llm_provider = "groq"
+ mutation_llm_inference_url = "https://api.groq.com/openai/v1"
@@ -0,0 +1,36 @@
+ [prompt_learning]
+ algorithm = "gepa"
+ task_app_url = "https://synth-laboratories-dev--synth-banking77-web-web.modal.run" # TODO: replace with HotpotQA task app URL
+ task_app_id = "hotpotqa"
+
+ # Seeds
+ evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ # Held-out validation
+ validation_seeds = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+ validation_pool = "validation"
+ validation_top_k = 3
+
+ # Train split configuration
+ [prompt_learning.env_config]
+ pool = "train"
+
+ # Policy model (synth Qwen via backend inference proxy)
+ [prompt_learning.policy]
+ provider = "synth"
+ model = "Qwen/Qwen3-8B"
+ # inference_url will be mapped to backend /api/inference/v1 by the optimizer
+
+ # GEPA parameters (tune as needed)
+ [prompt_learning.gepa]
+ env_name = "hotpotqa"
+ initial_population_size = 24
+ num_generations = 6
+ children_per_generation = 12
+ minibatch_size = 10
+ pareto_set_size = 32
+ rollout_budget = 600
+ max_concurrent_rollouts = 16
+ mutation_llm_model = "openai/gpt-oss-120b"
+ mutation_llm_provider = "groq"
+ proposer_type = "dspy"