synth-ai 0.2.9.dev0__py3-none-any.whl → 0.2.23.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/README.md +1 -0
- examples/__init__.py +16 -0
- examples/analyze_semantic_words.sh +17 -0
- examples/baseline/banking77_baseline.py +243 -0
- examples/baseline/banking77_pipeline_baseline.py +294 -0
- examples/baseline/crafter_baseline.py +407 -0
- examples/baseline/pokemon_red_baseline.py +326 -0
- examples/baseline/simple_baseline.py +56 -0
- examples/baseline/warming_up_to_rl_baseline.py +239 -0
- examples/blog_posts/gepa/README.md +355 -0
- examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
- examples/blog_posts/gepa/configs/banking77_gepa_test.toml +80 -0
- examples/blog_posts/gepa/configs/banking77_mipro_local.toml +50 -0
- examples/blog_posts/gepa/configs/banking77_pipeline_gepa_local.toml +101 -0
- examples/blog_posts/gepa/configs/banking77_pipeline_gepa_test.toml +96 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +57 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +35 -0
- examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +51 -0
- examples/blog_posts/gepa/configs/hover_gepa_local.toml +57 -0
- examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +35 -0
- examples/blog_posts/gepa/configs/hover_mipro_local.toml +51 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +57 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +35 -0
- examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +51 -0
- examples/blog_posts/gepa/configs/pupa_gepa_local.toml +58 -0
- examples/blog_posts/gepa/configs/pupa_mipro_local.toml +52 -0
- examples/blog_posts/gepa/deploy_banking77_task_app.sh +54 -0
- examples/blog_posts/gepa/gepa_baseline.py +204 -0
- examples/blog_posts/gepa/query_prompts_example.py +97 -0
- examples/blog_posts/gepa/run_gepa_banking77.sh +112 -0
- examples/blog_posts/gepa/run_gepa_banking77_pipeline.sh +163 -0
- examples/blog_posts/gepa/task_apps.py +105 -0
- examples/blog_posts/gepa/test_gepa_local.sh +67 -0
- examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
- examples/blog_posts/mipro/README.md +415 -0
- examples/blog_posts/mipro/configs/banking77_mipro_local.toml +91 -0
- examples/blog_posts/mipro/configs/banking77_mipro_test.toml +87 -0
- examples/blog_posts/mipro/configs/banking77_pipeline_mipro_gemini_flash_lite_local.toml +98 -0
- examples/blog_posts/mipro/configs/banking77_pipeline_mipro_gpt41mini_local.toml +96 -0
- examples/blog_posts/mipro/configs/banking77_pipeline_mipro_local.toml +94 -0
- examples/blog_posts/mipro/configs/banking77_pipeline_mipro_test.toml +170 -0
- examples/blog_posts/mipro/deploy_banking77_pipeline_task_app.sh +59 -0
- examples/blog_posts/mipro/deploy_banking77_task_app.sh +41 -0
- examples/blog_posts/mipro/multi_step.md +79 -0
- examples/blog_posts/mipro/run_mipro_banking77.sh +191 -0
- examples/blog_posts/mipro/run_mipro_banking77_pipeline.sh +171 -0
- examples/blog_posts/mipro/run_mipro_banking77_pipeline_gemini_flash_lite.sh +177 -0
- examples/blog_posts/mipro/run_mipro_banking77_pipeline_gpt41mini.sh +173 -0
- examples/blog_posts/mipro/verify_banking77_setup.sh +117 -0
- examples/blog_posts/pokemon_vl/README.md +98 -0
- examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
- examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +27 -0
- examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
- examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
- examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +43 -0
- examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
- examples/blog_posts/pokemon_vl/extract_images.py +239 -0
- examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
- examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
- examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
- examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
- examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
- examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
- examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
- examples/blog_posts/warming_up_to_rl/README.md +158 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
- examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
- examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
- examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +91 -0
- examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
- examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
- examples/crafter_debug_render.py +186 -0
- examples/dev/qwen3_32b_qlora_4xh100.toml +45 -0
- examples/gepa/banking77_pipeline_gepa.toml +96 -0
- examples/gepa/multi_stage_gepa_example.toml +84 -0
- examples/gepa/run_gepa_banking77_pipeline.sh +157 -0
- examples/multi_step/SFT_README.md +147 -0
- examples/multi_step/configs/README_verilog_rl.md +77 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +103 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +196 -0
- examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
- examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +75 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +145 -0
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +84 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +79 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
- examples/multi_step/configs/crafter_sft_qwen30b_lora.toml +62 -0
- examples/multi_step/configs/crafter_synth_backend.md +40 -0
- examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
- examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
- examples/multi_step/configs/verilog_rl_lora.toml +147 -0
- examples/multi_step/convert_traces_to_sft.py +84 -0
- examples/multi_step/crafter_rl_lora.md +70 -0
- examples/multi_step/judges/crafter_backend_judge.py +220 -0
- examples/multi_step/judges/verilog_backend_judge.py +234 -0
- examples/multi_step/readme.md +48 -0
- examples/multi_step/run_sft_qwen30b.sh +45 -0
- examples/multi_step/sse_metrics_streaming_notes.md +357 -0
- examples/multi_step/task_app_config_notes.md +494 -0
- examples/multi_step/verilog_rl_lora.md +218 -0
- examples/qwen_coder/README.md +102 -0
- examples/qwen_coder/_shared.py +113 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +60 -0
- examples/qwen_coder/configs/coder_lora_4b.toml +61 -0
- examples/qwen_coder/configs/coder_lora_small.toml +57 -0
- examples/qwen_coder/generate_dataset.py +98 -0
- examples/qwen_coder/infer_ft_smoke.py +65 -0
- examples/qwen_coder/infer_prod_proxy.py +73 -0
- examples/qwen_coder/infer_via_synth.py +87 -0
- examples/qwen_coder/scripts/infer_coder.sh +19 -0
- examples/qwen_coder/scripts/train_coder_30b.sh +22 -0
- examples/qwen_coder/sft_full_17b.py +103 -0
- examples/qwen_coder/sft_lora_30b.py +110 -0
- examples/qwen_coder/subset_jsonl.py +39 -0
- examples/qwen_coder/todos.md +38 -0
- examples/qwen_coder/validate_jsonl.py +60 -0
- examples/qwen_vl/BUGS_AND_FIXES.md +232 -0
- examples/qwen_vl/IMAGE_VALIDATION_COMPLETE.md +271 -0
- examples/qwen_vl/IMAGE_VALIDATION_SUMMARY.md +260 -0
- examples/qwen_vl/INFERENCE_SFT_TESTS.md +412 -0
- examples/qwen_vl/NEXT_STEPS_2B.md +325 -0
- examples/qwen_vl/QUICKSTART.md +327 -0
- examples/qwen_vl/QUICKSTART_RL_VISION.md +110 -0
- examples/qwen_vl/README.md +152 -0
- examples/qwen_vl/RL_VISION_COMPLETE.md +475 -0
- examples/qwen_vl/RL_VISION_TESTING.md +333 -0
- examples/qwen_vl/SDK_VISION_INTEGRATION.md +328 -0
- examples/qwen_vl/SETUP_COMPLETE.md +274 -0
- examples/qwen_vl/VISION_TESTS_COMPLETE.md +489 -0
- examples/qwen_vl/VLM_PIPELINE_COMPLETE.md +242 -0
- examples/qwen_vl/__init__.py +2 -0
- examples/qwen_vl/collect_data_via_cli.md +415 -0
- examples/qwen_vl/collect_vision_traces.py +368 -0
- examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +110 -0
- examples/qwen_vl/configs/crafter_vlm_sft_example.toml +59 -0
- examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +26 -0
- examples/qwen_vl/configs/eval_gpt4o_vision_proper.toml +29 -0
- examples/qwen_vl/configs/eval_gpt5nano_vision.toml +26 -0
- examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
- examples/qwen_vl/configs/filter_qwen3vl_sft.toml +49 -0
- examples/qwen_vl/configs/filter_vision_sft.toml +52 -0
- examples/qwen_vl/configs/filter_vision_test.toml +8 -0
- examples/qwen_vl/configs/sft_qwen3_vl_2b_test.toml +54 -0
- examples/qwen_vl/crafter_gpt5nano_agent.py +308 -0
- examples/qwen_vl/crafter_qwen_vl_agent.py +300 -0
- examples/qwen_vl/run_vision_comparison.sh +61 -0
- examples/qwen_vl/run_vision_sft_pipeline.sh +175 -0
- examples/qwen_vl/test_image_validation.py +201 -0
- examples/qwen_vl/test_sft_vision_data.py +110 -0
- examples/rl/README.md +169 -0
- examples/rl/configs/eval_base_qwen.toml +17 -0
- examples/rl/configs/eval_rl_qwen.toml +13 -0
- examples/rl/configs/rl_from_base_qwen.toml +62 -0
- examples/rl/configs/rl_from_base_qwen17.toml +80 -0
- examples/rl/configs/rl_from_ft_qwen.toml +37 -0
- examples/rl/download_dataset.py +80 -0
- examples/rl/run_eval.py +436 -0
- examples/rl/run_rl_and_save.py +111 -0
- examples/rl/task_app/README.md +21 -0
- {synth_ai/task/apps → examples/rl/task_app}/math_single_step.py +188 -50
- examples/rl/task_app/math_task_app.py +111 -0
- examples/run_crafter_demo.sh +10 -0
- examples/sdk_prompt_learning_example.py +55 -0
- examples/sft/README.md +139 -0
- examples/sft/configs/crafter_fft_qwen0p6b.toml +49 -0
- examples/sft/configs/crafter_lora_qwen0p6b.toml +49 -0
- examples/sft/evaluate.py +117 -0
- examples/sft/export_dataset.py +120 -0
- examples/sft/generate_traces.py +164 -0
- examples/swe/__init__.py +12 -0
- examples/swe/task_app/README.md +135 -0
- examples/swe/task_app/__init__.py +2 -0
- examples/swe/task_app/grpo_swe_mini.py +604 -0
- examples/swe/task_app/grpo_swe_mini_task_app.py +124 -0
- examples/swe/task_app/hosted/README.md +173 -0
- examples/swe/task_app/hosted/__init__.py +5 -0
- examples/swe/task_app/hosted/branching.py +143 -0
- examples/swe/task_app/hosted/environment_routes.py +1289 -0
- examples/swe/task_app/hosted/envs/__init__.py +1 -0
- examples/swe/task_app/hosted/envs/crafter/__init__.py +6 -0
- examples/swe/task_app/hosted/envs/crafter/app.py +1 -0
- examples/swe/task_app/hosted/envs/crafter/environment.py +522 -0
- examples/swe/task_app/hosted/envs/crafter/policy.py +478 -0
- examples/swe/task_app/hosted/envs/crafter/react_agent.py +108 -0
- examples/swe/task_app/hosted/envs/crafter/shared.py +305 -0
- examples/swe/task_app/hosted/envs/crafter/tools.py +47 -0
- examples/swe/task_app/hosted/envs/mini_swe/__init__.py +8 -0
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +1191 -0
- examples/swe/task_app/hosted/envs/mini_swe/policy.py +355 -0
- examples/swe/task_app/hosted/envs/mini_swe/shared.py +83 -0
- examples/swe/task_app/hosted/envs/mini_swe/tools.py +96 -0
- examples/swe/task_app/hosted/hosted_app.py +204 -0
- examples/swe/task_app/hosted/inference/__init__.py +5 -0
- examples/swe/task_app/hosted/inference/openai_client.py +584 -0
- examples/swe/task_app/hosted/main.py +100 -0
- examples/swe/task_app/hosted/policy_routes.py +1094 -0
- examples/swe/task_app/hosted/registry.py +195 -0
- examples/swe/task_app/hosted/rollout.py +1905 -0
- examples/swe/task_app/hosted/storage/__init__.py +5 -0
- examples/swe/task_app/hosted/storage/volume.py +211 -0
- examples/swe/task_app/hosted/test_agents.py +161 -0
- examples/swe/task_app/hosted/test_service.py +136 -0
- examples/swe/task_app/hosted/utils.py +62 -0
- examples/swe/task_app/morph_backend.py +178 -0
- examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
- examples/task_apps/TESTING.md +275 -0
- examples/task_apps/banking77/__init__.py +6 -0
- examples/task_apps/banking77/banking77_task_app.py +912 -0
- examples/task_apps/banking77/deploy_wrapper.py +46 -0
- examples/task_apps/banking77_pipeline/__init__.py +6 -0
- examples/task_apps/banking77_pipeline/banking77_pipeline_task_app.py +489 -0
- examples/task_apps/banking77_pipeline/deploy_wrapper.py +50 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +286 -0
- examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +187 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +281 -0
- examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
- examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
- examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
- examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
- examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
- examples/task_apps/crafter/task_app/README.md +42 -0
- examples/task_apps/crafter/task_app/__init__.py +5 -0
- examples/task_apps/crafter/task_app/grpo_crafter.py +1055 -0
- examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +146 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/README.md +173 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/__init__.py +5 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/branching.py +143 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/environment_routes.py +1226 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/__init__.py +1 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +532 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +583 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +122 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +253 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/__init__.py +5 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +999 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/main.py +100 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +1252 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/registry.py +195 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +2233 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/storage/__init__.py +5 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/storage/volume.py +211 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/test_agents.py +161 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/test_service.py +136 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +411 -0
- examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
- examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
- examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
- examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
- examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
- examples/task_apps/enron/__init__.py +2 -0
- examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
- examples/task_apps/enron/filter_sft.toml +5 -0
- examples/task_apps/enron/task_app/README.md +14 -0
- examples/task_apps/enron/task_app/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron.py +906 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
- examples/task_apps/enron/tests/__init__.py +4 -0
- examples/task_apps/enron/tests/conftest.py +115 -0
- examples/task_apps/enron/tests/integration/__init__.py +4 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
- examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
- examples/task_apps/enron/tests/unit/__init__.py +4 -0
- examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
- examples/task_apps/gepa_benchmarks/__init__.py +7 -0
- examples/task_apps/gepa_benchmarks/common.py +260 -0
- examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
- examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
- examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
- examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
- examples/task_apps/math/README.md +21 -0
- examples/task_apps/math/math_single_step.py +1000 -0
- examples/task_apps/math/math_task_app.py +115 -0
- examples/task_apps/pokemon_battle/__init__.py +2 -0
- examples/task_apps/pokemon_battle/modal_app.py +104 -0
- examples/task_apps/pokemon_battle/task_app/README.md +68 -0
- examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
- examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
- examples/task_apps/pokemon_red/README.md +356 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +428 -0
- examples/task_apps/pokemon_red/__init__.py +3 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +30 -0
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +224 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
- examples/task_apps/pokemon_red/task_app.py +1048 -0
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
- examples/task_apps/sokoban/README.md +306 -0
- examples/task_apps/sokoban/__init__.py +3 -0
- examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
- examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
- examples/task_apps/sokoban/filter_sft.toml +5 -0
- examples/task_apps/sokoban/task_app.py +1058 -0
- examples/task_apps/sokoban/tests/__init__.py +4 -0
- examples/task_apps/sokoban/tests/conftest.py +113 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
- examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
- examples/task_apps/verilog/__init__.py +1 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +22 -0
- examples/task_apps/verilog/filter_sft.toml +5 -0
- examples/task_apps/verilog/task_app/README.md +12 -0
- examples/task_apps/verilog/task_app/__init__.py +1 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
- examples/task_apps/verilog/tests/__init__.py +4 -0
- examples/task_apps/verilog/tests/conftest.py +115 -0
- examples/task_apps/verilog/tests/integration/__init__.py +4 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
- examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
- examples/task_apps/verilog/tests/unit/__init__.py +4 -0
- examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
- examples/tunnel_gepa_banking77/README.md +106 -0
- examples/tunnel_gepa_banking77/banking77_gepa_tunnel.toml +95 -0
- examples/tunnel_gepa_banking77/keep_tunnel_running.py +60 -0
- examples/tunnel_gepa_banking77/run_gepa_with_tunnel.sh +226 -0
- examples/vlm/PROPOSAL.md +53 -0
- examples/vlm/README.md +68 -0
- examples/vlm/configs/crafter_vlm_gpt4o.toml +49 -0
- examples/vlm/crafter_image_only_agent.py +207 -0
- examples/vlm/crafter_openai_vlm_agent.py +275 -0
- examples/vlm/filter_image_rows.py +63 -0
- examples/vlm/run_crafter_vlm_benchmark.py +316 -0
- examples/warming_up_to_rl/_utils.py +92 -0
- examples/warming_up_to_rl/analyze_trace_db.py +422 -0
- examples/warming_up_to_rl/configs/crafter_fft.toml +53 -0
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +54 -0
- examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +22 -0
- examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +15 -0
- examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +24 -0
- examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +35 -0
- examples/warming_up_to_rl/configs/eval_stepwise_consistent.toml +26 -0
- examples/warming_up_to_rl/configs/eval_stepwise_per_achievement.toml +36 -0
- examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +32 -0
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +85 -0
- examples/warming_up_to_rl/configs/rl_from_ft.toml +58 -0
- examples/warming_up_to_rl/export_trace_sft.py +837 -0
- examples/warming_up_to_rl/groq_test.py +97 -0
- examples/warming_up_to_rl/manage_secrets.py +131 -0
- examples/warming_up_to_rl/old/event_rewards.md +234 -0
- examples/warming_up_to_rl/old/notes.md +73 -0
- examples/warming_up_to_rl/readme.md +110 -0
- examples/warming_up_to_rl/run_eval.py +736 -0
- examples/warming_up_to_rl/run_fft_and_save.py +380 -0
- examples/warming_up_to_rl/run_local_rollout.py +239 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +248 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +405 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +477 -0
- examples/warming_up_to_rl/run_rl_and_save.py +124 -0
- examples/warming_up_to_rl/run_rollout_remote.py +156 -0
- examples/warming_up_to_rl/task_app/README.md +42 -0
- examples/warming_up_to_rl/task_app/grpo_crafter.py +876 -0
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +135 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +143 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1226 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +522 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +454 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +108 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +253 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +729 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +100 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +1114 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +195 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1891 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +211 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +161 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +137 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +129 -0
- examples/workflows/math_rl/configs/eval_base_qwen.toml +15 -0
- examples/workflows/math_rl/configs/eval_rl_qwen.toml +11 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen.toml +62 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +80 -0
- examples/workflows/math_rl/configs/rl_from_ft_qwen.toml +35 -0
- examples/workflows/math_rl/download_dataset.py +80 -0
- examples/workflows/math_rl/run_eval.py +436 -0
- examples/workflows/math_rl/run_rl_and_save.py +111 -0
- synth_ai/__init__.py +47 -23
- synth_ai/_utils/__init__.py +47 -0
- synth_ai/_utils/base_url.py +10 -0
- synth_ai/_utils/http.py +10 -0
- synth_ai/_utils/prompts.py +10 -0
- synth_ai/_utils/task_app_state.py +12 -0
- synth_ai/_utils/user_config.py +10 -0
- synth_ai/api/models/supported.py +514 -0
- synth_ai/api/train/__init__.py +60 -2
- synth_ai/api/train/builders.py +347 -39
- synth_ai/api/train/cli.py +895 -160
- synth_ai/api/train/config_finder.py +103 -25
- synth_ai/api/train/configs/__init__.py +65 -0
- synth_ai/api/train/configs/prompt_learning.py +496 -0
- synth_ai/api/train/configs/rl.py +188 -0
- synth_ai/api/train/configs/sft.py +99 -0
- synth_ai/api/train/configs/shared.py +81 -0
- synth_ai/api/train/env_resolver.py +70 -20
- synth_ai/api/train/pollers.py +29 -4
- synth_ai/api/train/prompt_learning.py +425 -0
- synth_ai/api/train/sft.py +390 -0
- synth_ai/api/train/supported_algos.py +147 -0
- synth_ai/api/train/task_app.py +6 -4
- synth_ai/api/train/utils.py +64 -52
- synth_ai/api/train/validators.py +1117 -0
- synth_ai/api/tunnel.py +49 -0
- synth_ai/auth/credentials.py +94 -0
- synth_ai/baseline/__init__.py +25 -0
- synth_ai/baseline/config.py +209 -0
- synth_ai/baseline/discovery.py +214 -0
- synth_ai/baseline/execution.py +146 -0
- synth_ai/cfgs.py +227 -0
- synth_ai/cli/__init__.py +85 -63
- synth_ai/cli/_modal_wrapper.py +31 -0
- synth_ai/cli/_storage.py +20 -0
- synth_ai/cli/_typer_patch.py +47 -0
- synth_ai/cli/_validate_task_app.py +29 -0
- synth_ai/cli/balance.py +16 -4
- synth_ai/cli/calc.py +36 -21
- synth_ai/cli/claude.py +70 -0
- synth_ai/cli/codex.py +267 -0
- synth_ai/cli/commands/__init__.py +18 -0
- synth_ai/cli/commands/baseline/__init__.py +12 -0
- synth_ai/cli/commands/baseline/core.py +637 -0
- synth_ai/cli/commands/baseline/list.py +93 -0
- synth_ai/cli/commands/demo/__init__.py +6 -0
- synth_ai/cli/commands/demo/core.py +163 -0
- synth_ai/cli/commands/eval/__init__.py +19 -0
- synth_ai/cli/commands/eval/core.py +1112 -0
- synth_ai/cli/commands/eval/errors.py +81 -0
- synth_ai/cli/commands/eval/validation.py +133 -0
- synth_ai/cli/commands/filter/__init__.py +12 -0
- synth_ai/cli/commands/filter/core.py +424 -0
- synth_ai/cli/commands/filter/errors.py +55 -0
- synth_ai/cli/commands/filter/validation.py +77 -0
- synth_ai/cli/commands/help/__init__.py +185 -0
- synth_ai/cli/commands/help/core.py +72 -0
- synth_ai/cli/commands/smoke/__init__.py +7 -0
- synth_ai/cli/commands/smoke/core.py +1437 -0
- synth_ai/cli/commands/status/__init__.py +66 -0
- synth_ai/cli/commands/status/client.py +192 -0
- synth_ai/cli/commands/status/config.py +92 -0
- synth_ai/cli/commands/status/errors.py +20 -0
- synth_ai/cli/commands/status/formatters.py +164 -0
- synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
- synth_ai/cli/commands/status/subcommands/files.py +79 -0
- synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
- synth_ai/cli/commands/status/subcommands/models.py +79 -0
- synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
- synth_ai/cli/commands/status/subcommands/runs.py +81 -0
- synth_ai/cli/commands/status/subcommands/session.py +183 -0
- synth_ai/cli/commands/status/subcommands/summary.py +47 -0
- synth_ai/cli/commands/status/subcommands/usage.py +203 -0
- synth_ai/cli/commands/status/utils.py +114 -0
- synth_ai/cli/commands/train/__init__.py +53 -0
- synth_ai/cli/commands/train/core.py +21 -0
- synth_ai/cli/commands/train/errors.py +117 -0
- synth_ai/cli/commands/train/judge_schemas.py +200 -0
- synth_ai/cli/commands/train/judge_validation.py +305 -0
- synth_ai/cli/commands/train/validation.py +386 -0
- synth_ai/cli/demo.py +32 -140
- synth_ai/cli/deploy.py +233 -0
- synth_ai/cli/eval/__init__.py +36 -0
- synth_ai/cli/eval/core.py +5 -0
- synth_ai/cli/eval/errors.py +31 -0
- synth_ai/cli/eval/validation.py +5 -0
- synth_ai/cli/filter/__init__.py +28 -0
- synth_ai/cli/filter/core.py +5 -0
- synth_ai/cli/filter/errors.py +23 -0
- synth_ai/cli/filter/validation.py +5 -0
- synth_ai/cli/legacy_root_backup.py +28 -22
- synth_ai/cli/lib/__init__.py +10 -0
- synth_ai/cli/lib/task_app_discovery.py +7 -0
- synth_ai/cli/lib/task_app_env.py +518 -0
- synth_ai/cli/mcp.py +34 -0
- synth_ai/cli/modal_serve/__init__.py +12 -0
- synth_ai/cli/modal_serve/core.py +14 -0
- synth_ai/cli/modal_serve/errors.py +8 -0
- synth_ai/cli/modal_serve/validation.py +11 -0
- synth_ai/cli/opencode.py +256 -0
- synth_ai/cli/recent.py +13 -7
- synth_ai/cli/rl_demo.py +156 -116
- synth_ai/cli/root.py +131 -132
- synth_ai/cli/serve/__init__.py +12 -0
- synth_ai/cli/serve/core.py +14 -0
- synth_ai/cli/serve/errors.py +8 -0
- synth_ai/cli/serve/validation.py +11 -0
- synth_ai/cli/setup.py +49 -0
- synth_ai/cli/status.py +7 -125
- synth_ai/cli/task_app_deploy.py +7 -0
- synth_ai/cli/task_app_list.py +25 -0
- synth_ai/cli/task_app_modal_serve.py +11 -0
- synth_ai/cli/task_app_serve.py +11 -0
- synth_ai/cli/task_apps.py +2284 -257
- synth_ai/cli/traces.py +9 -5
- synth_ai/cli/train/__init__.py +12 -0
- synth_ai/cli/train/core.py +21 -0
- synth_ai/cli/train/errors.py +8 -0
- synth_ai/cli/train/validation.py +24 -0
- synth_ai/cli/train.py +5 -0
- synth_ai/cli/turso.py +73 -0
- synth_ai/cli/watch.py +13 -18
- synth_ai/demos/__init__.py +10 -0
- synth_ai/demos/core/__init__.py +28 -1
- synth_ai/demos/core/cli.py +579 -291
- synth_ai/demos/crafter/__init__.py +1 -0
- synth_ai/demos/crafter/crafter_fft_4b.toml +55 -0
- synth_ai/demos/crafter/grpo_crafter_task_app.py +185 -0
- synth_ai/demos/crafter/rl_from_base_qwen4b.toml +74 -0
- synth_ai/demos/demo_registry.py +176 -0
- synth_ai/demos/demo_task_apps/__init__.py +3 -3
- synth_ai/demos/demo_task_apps/core.py +64 -28
- synth_ai/demos/demo_task_apps/crafter/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/crafter/configs/crafter_fft_4b.toml +53 -0
- synth_ai/demos/demo_task_apps/crafter/configs/rl_from_base_qwen4b.toml +73 -0
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +184 -0
- synth_ai/demos/demo_task_apps/math/_common.py +1 -2
- synth_ai/demos/demo_task_apps/math/app.py +2 -1
- synth_ai/demos/demo_task_apps/math/deploy_modal.py +3 -6
- synth_ai/demos/demo_task_apps/math/modal_task_app.py +185 -83
- synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -2
- synth_ai/demos/math/__init__.py +1 -0
- synth_ai/demos/math/_common.py +16 -0
- synth_ai/demos/math/app.py +38 -0
- synth_ai/demos/math/config.toml +76 -0
- synth_ai/demos/math/deploy_modal.py +54 -0
- synth_ai/demos/math/modal_task_app.py +703 -0
- synth_ai/demos/math/task_app_entry.py +51 -0
- synth_ai/environments/environment/core.py +7 -1
- synth_ai/environments/examples/bandit/engine.py +12 -5
- synth_ai/environments/examples/bandit/environment.py +0 -1
- synth_ai/environments/examples/bandit/taskset.py +4 -4
- synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
- synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
- synth_ai/environments/examples/crafter_classic/environment.py +93 -2
- synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
- synth_ai/environments/examples/enron/engine.py +7 -2
- synth_ai/environments/examples/enron/environment.py +68 -0
- synth_ai/environments/examples/red/engine.py +60 -12
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
- synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
- synth_ai/environments/examples/red/environment.py +86 -0
- synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
- synth_ai/environments/examples/sokoban/taskset.py +116 -0
- synth_ai/environments/examples/verilog/engine.py +104 -12
- synth_ai/environments/examples/wordle/environment.py +0 -1
- synth_ai/environments/reproducibility/tree.py +5 -6
- synth_ai/environments/service/app.py +11 -12
- synth_ai/environments/service/core_routes.py +10 -9
- synth_ai/environments/stateful/engine.py +1 -1
- synth_ai/environments/tasks/core.py +1 -0
- synth_ai/environments/tasks/filters.py +5 -6
- synth_ai/environments/tasks/utils.py +4 -5
- synth_ai/evals/__init__.py +15 -0
- synth_ai/evals/base.py +14 -5
- synth_ai/evals/client.py +82 -0
- synth_ai/evals/types.py +42 -0
- synth_ai/http.py +8 -22
- synth_ai/http_client.py +45 -12
- synth_ai/inference/__init__.py +0 -2
- synth_ai/inference/client.py +21 -7
- synth_ai/jobs/client.py +129 -80
- synth_ai/judge_schemas.py +127 -0
- synth_ai/learning/__init__.py +51 -6
- synth_ai/learning/algorithms.py +14 -0
- synth_ai/learning/client.py +122 -30
- synth_ai/learning/config.py +2 -40
- synth_ai/learning/constants.py +0 -2
- synth_ai/learning/ft_client.py +4 -56
- synth_ai/learning/health.py +14 -8
- synth_ai/learning/jobs.py +43 -47
- synth_ai/learning/prompt_learning_client.py +276 -0
- synth_ai/learning/prompt_learning_types.py +185 -0
- synth_ai/{rl → learning/rl}/__init__.py +14 -5
- synth_ai/learning/rl/client.py +269 -0
- synth_ai/learning/rl/config.py +31 -0
- synth_ai/{rl → learning/rl}/contracts.py +5 -10
- synth_ai/{rl → learning/rl}/env_keys.py +45 -16
- synth_ai/learning/rl/secrets.py +13 -0
- synth_ai/learning/rl_client.py +2 -253
- synth_ai/learning/sft/__init__.py +29 -0
- synth_ai/learning/sft/client.py +68 -0
- synth_ai/learning/sft/config.py +270 -0
- synth_ai/learning/sft/data.py +698 -0
- synth_ai/learning/sse.py +25 -26
- synth_ai/learning/validators.py +29 -25
- synth_ai/mcp/__init__.py +5 -0
- synth_ai/mcp/__main__.py +8 -0
- synth_ai/mcp/main.py +254 -0
- synth_ai/mcp/setup.py +100 -0
- synth_ai/modal.py +257 -0
- synth_ai/pricing/__init__.py +3 -0
- synth_ai/pricing/model_pricing.py +64 -0
- synth_ai/session/__init__.py +75 -0
- synth_ai/session/client.py +383 -0
- synth_ai/session/constants.py +63 -0
- synth_ai/session/exceptions.py +105 -0
- synth_ai/session/manager.py +139 -0
- synth_ai/session/models.py +89 -0
- synth_ai/session/query.py +110 -0
- synth_ai/spec/__init__.py +46 -0
- synth_ai/spec/dataclasses.py +149 -0
- synth_ai/spec/loader.py +144 -0
- synth_ai/spec/serializer.py +199 -0
- synth_ai/spec/validation.py +250 -0
- synth_ai/streaming/__init__.py +29 -0
- synth_ai/streaming/config.py +94 -0
- synth_ai/streaming/handlers.py +589 -0
- synth_ai/streaming/streamer.py +320 -0
- synth_ai/streaming/types.py +95 -0
- synth_ai/task/__init__.py +50 -30
- synth_ai/task/apps/__init__.py +63 -19
- synth_ai/task/auth.py +35 -23
- synth_ai/task/client.py +15 -13
- synth_ai/task/config.py +261 -0
- synth_ai/task/contracts.py +165 -64
- synth_ai/task/datasets.py +9 -6
- synth_ai/task/errors.py +11 -10
- synth_ai/task/health.py +17 -11
- synth_ai/task/inference_api.py +101 -0
- synth_ai/task/json.py +58 -24
- synth_ai/task/proxy.py +59 -66
- synth_ai/task/rubrics/__init__.py +55 -0
- synth_ai/task/rubrics/loaders.py +156 -0
- synth_ai/task/rubrics/models.py +57 -0
- synth_ai/task/rubrics/scoring.py +116 -0
- synth_ai/task/rubrics/strict.py +149 -0
- synth_ai/task/rubrics.py +22 -15
- synth_ai/task/server.py +65 -31
- synth_ai/task/trace_correlation_helpers.py +328 -0
- synth_ai/task/tracing_utils.py +44 -28
- synth_ai/task/validators.py +449 -6
- synth_ai/task/vendors.py +5 -7
- synth_ai/tracing_v3/__init__.py +4 -0
- synth_ai/tracing_v3/abstractions.py +21 -4
- synth_ai/tracing_v3/config.py +167 -22
- synth_ai/tracing_v3/constants.py +21 -0
- synth_ai/tracing_v3/db_config.py +42 -29
- synth_ai/tracing_v3/decorators.py +80 -45
- synth_ai/tracing_v3/examples/basic_usage.py +15 -9
- synth_ai/tracing_v3/hooks.py +6 -4
- synth_ai/tracing_v3/llm_call_record_helpers.py +161 -61
- synth_ai/tracing_v3/migration_helper.py +1 -2
- synth_ai/tracing_v3/replica_sync.py +12 -7
- synth_ai/tracing_v3/serialization.py +130 -0
- synth_ai/tracing_v3/session_tracer.py +73 -16
- synth_ai/tracing_v3/storage/base.py +89 -1
- synth_ai/tracing_v3/storage/config.py +63 -16
- synth_ai/tracing_v3/storage/factory.py +11 -9
- synth_ai/tracing_v3/storage/utils.py +15 -11
- synth_ai/tracing_v3/trace_utils.py +317 -0
- synth_ai/tracing_v3/turso/__init__.py +8 -21
- synth_ai/tracing_v3/turso/daemon.py +123 -15
- synth_ai/tracing_v3/turso/models.py +5 -2
- synth_ai/tracing_v3/turso/native_manager.py +1293 -0
- synth_ai/tracing_v3/utils.py +5 -4
- synth_ai/tunnel.py +143 -0
- synth_ai/tunnel_deploy.py +278 -0
- synth_ai/types.py +8 -0
- synth_ai/urls.py +11 -0
- synth_ai/utils/__init__.py +166 -0
- synth_ai/utils/agents.py +74 -0
- synth_ai/utils/apps.py +152 -0
- synth_ai/utils/base_url.py +94 -0
- synth_ai/utils/bin.py +39 -0
- synth_ai/utils/claude.py +36 -0
- synth_ai/utils/cli.py +284 -0
- synth_ai/utils/config.py +81 -0
- synth_ai/utils/env.py +346 -0
- synth_ai/utils/errors.py +85 -0
- synth_ai/utils/http.py +172 -0
- synth_ai/utils/json.py +72 -0
- synth_ai/utils/log_filter.py +99 -0
- synth_ai/utils/logging.py +198 -0
- synth_ai/utils/modal.py +299 -0
- synth_ai/utils/paths.py +95 -0
- synth_ai/utils/process.py +233 -0
- synth_ai/utils/prompts.py +39 -0
- synth_ai/utils/sqld.py +122 -0
- synth_ai/utils/ssl.py +25 -0
- synth_ai/utils/task_app_discovery.py +882 -0
- synth_ai/utils/task_app_env.py +186 -0
- synth_ai/utils/task_app_state.py +318 -0
- synth_ai/utils/tunnel/__init__.py +12 -0
- synth_ai/utils/tunnel/config.py +55 -0
- synth_ai/utils/user_config.py +137 -0
- synth_ai/uvicorn.py +77 -0
- synth_ai-0.2.23.dev3.dist-info/METADATA +357 -0
- synth_ai-0.2.23.dev3.dist-info/RECORD +983 -0
- {synth_ai-0.2.9.dev0.dist-info → synth_ai-0.2.23.dev3.dist-info}/entry_points.txt +0 -1
- {synth_ai-0.2.9.dev0.dist-info → synth_ai-0.2.23.dev3.dist-info}/top_level.txt +1 -0
- synth_ai/cli/man.py +0 -106
- synth_ai/core/experiment.py +0 -15
- synth_ai/core/system.py +0 -15
- synth_ai/demo_registry.py +0 -258
- synth_ai/environments/examples/sokoban/units/astar_common.py +0 -95
- synth_ai/experimental/synth_oss.py +0 -446
- synth_ai/handshake.py +0 -107
- synth_ai/install_sqld.sh +0 -40
- synth_ai/learning/offline/dpo.py +0 -0
- synth_ai/learning/offline/providers.py +0 -7
- synth_ai/learning/offline/sft.py +0 -0
- synth_ai/learning/offline/shared.py +0 -0
- synth_ai/learning/online/grpo.py +0 -0
- synth_ai/learning/online/irft.py +0 -0
- synth_ai/learning/prompts/banking77_injection_eval.py +0 -168
- synth_ai/learning/prompts/gepa.py +0 -0
- synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +0 -213
- synth_ai/learning/prompts/mipro.py +0 -289
- synth_ai/learning/prompts/random_search.py +0 -246
- synth_ai/learning/prompts/run_mipro_banking77.py +0 -172
- synth_ai/learning/prompts/run_random_search_banking77.py +0 -324
- synth_ai/lm/__init__.py +0 -51
- synth_ai/lm/caching/constants.py +0 -6
- synth_ai/lm/caching/dbs.py +0 -0
- synth_ai/lm/caching/ephemeral.py +0 -102
- synth_ai/lm/caching/handler.py +0 -137
- synth_ai/lm/caching/initialize.py +0 -11
- synth_ai/lm/caching/persistent.py +0 -114
- synth_ai/lm/config.py +0 -110
- synth_ai/lm/constants.py +0 -32
- synth_ai/lm/core/__init__.py +0 -8
- synth_ai/lm/core/all.py +0 -73
- synth_ai/lm/core/exceptions.py +0 -7
- synth_ai/lm/core/main.py +0 -319
- synth_ai/lm/core/main_v3.py +0 -594
- synth_ai/lm/core/synth_models.py +0 -48
- synth_ai/lm/core/vendor_clients.py +0 -188
- synth_ai/lm/cost/monitor.py +0 -1
- synth_ai/lm/cost/statefulness.py +0 -1
- synth_ai/lm/injection.py +0 -80
- synth_ai/lm/overrides.py +0 -206
- synth_ai/lm/provider_support/__init__.py +0 -8
- synth_ai/lm/provider_support/anthropic.py +0 -972
- synth_ai/lm/provider_support/openai.py +0 -1139
- synth_ai/lm/provider_support/suppress_logging.py +0 -31
- synth_ai/lm/structured_outputs/handler.py +0 -440
- synth_ai/lm/structured_outputs/inject.py +0 -297
- synth_ai/lm/structured_outputs/rehabilitate.py +0 -185
- synth_ai/lm/tools/__init__.py +0 -3
- synth_ai/lm/tools/base.py +0 -172
- synth_ai/lm/unified_interface.py +0 -202
- synth_ai/lm/vendors/base.py +0 -81
- synth_ai/lm/vendors/core/anthropic_api.py +0 -387
- synth_ai/lm/vendors/core/gemini_api.py +0 -292
- synth_ai/lm/vendors/core/mistral_api.py +0 -322
- synth_ai/lm/vendors/core/openai_api.py +0 -225
- synth_ai/lm/vendors/core/synth_dev_api.py +0 -0
- synth_ai/lm/vendors/local/ollama.py +0 -0
- synth_ai/lm/vendors/openai_standard.py +0 -780
- synth_ai/lm/vendors/openai_standard_responses.py +0 -256
- synth_ai/lm/vendors/retries.py +0 -22
- synth_ai/lm/vendors/supported/custom_endpoint.py +0 -417
- synth_ai/lm/vendors/supported/deepseek.py +0 -69
- synth_ai/lm/vendors/supported/grok.py +0 -75
- synth_ai/lm/vendors/supported/groq.py +0 -16
- synth_ai/lm/vendors/supported/ollama.py +0 -15
- synth_ai/lm/vendors/supported/openrouter.py +0 -74
- synth_ai/lm/vendors/supported/together.py +0 -11
- synth_ai/lm/vendors/synth_client.py +0 -808
- synth_ai/lm/warmup.py +0 -186
- synth_ai/rl/secrets.py +0 -19
- synth_ai/scripts/verify_rewards.py +0 -100
- synth_ai/task/apps/grpo_crafter.py +0 -438
- synth_ai/tracing/__init__.py +0 -30
- synth_ai/tracing_v1/__init__.py +0 -33
- synth_ai/tracing_v3/turso/manager.py +0 -774
- synth_ai/v0/tracing/abstractions.py +0 -224
- synth_ai/v0/tracing/base_client.py +0 -91
- synth_ai/v0/tracing/client_manager.py +0 -131
- synth_ai/v0/tracing/config.py +0 -142
- synth_ai/v0/tracing/context.py +0 -146
- synth_ai/v0/tracing/decorators.py +0 -682
- synth_ai/v0/tracing/events/__init__.py +0 -0
- synth_ai/v0/tracing/events/manage.py +0 -147
- synth_ai/v0/tracing/events/scope.py +0 -86
- synth_ai/v0/tracing/events/store.py +0 -228
- synth_ai/v0/tracing/immediate_client.py +0 -151
- synth_ai/v0/tracing/local.py +0 -18
- synth_ai/v0/tracing/log_client_base.py +0 -73
- synth_ai/v0/tracing/retry_queue.py +0 -186
- synth_ai/v0/tracing/trackers.py +0 -515
- synth_ai/v0/tracing/upload.py +0 -512
- synth_ai/v0/tracing/utils.py +0 -9
- synth_ai/v0/tracing_v1/__init__.py +0 -16
- synth_ai/v0/tracing_v1/abstractions.py +0 -224
- synth_ai/v0/tracing_v1/base_client.py +0 -91
- synth_ai/v0/tracing_v1/client_manager.py +0 -131
- synth_ai/v0/tracing_v1/config.py +0 -142
- synth_ai/v0/tracing_v1/context.py +0 -146
- synth_ai/v0/tracing_v1/decorators.py +0 -703
- synth_ai/v0/tracing_v1/events/__init__.py +0 -0
- synth_ai/v0/tracing_v1/events/manage.py +0 -147
- synth_ai/v0/tracing_v1/events/scope.py +0 -86
- synth_ai/v0/tracing_v1/events/store.py +0 -228
- synth_ai/v0/tracing_v1/immediate_client.py +0 -151
- synth_ai/v0/tracing_v1/local.py +0 -18
- synth_ai/v0/tracing_v1/log_client_base.py +0 -73
- synth_ai/v0/tracing_v1/retry_queue.py +0 -186
- synth_ai/v0/tracing_v1/trackers.py +0 -515
- synth_ai/v0/tracing_v1/upload.py +0 -527
- synth_ai/v0/tracing_v1/utils.py +0 -9
- synth_ai/zyk/__init__.py +0 -30
- synth_ai-0.2.9.dev0.dist-info/METADATA +0 -131
- synth_ai-0.2.9.dev0.dist-info/RECORD +0 -444
- {synth_ai/lm/caching → examples/task_apps}/__init__.py +0 -0
- {synth_ai/lm/cost → examples/task_apps/crafter}/__init__.py +0 -0
- {synth_ai/lm/structured_outputs → examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server}/__init__.py +0 -0
- {synth_ai/lm/vendors → examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests}/__init__.py +0 -0
- {synth_ai/lm/vendors/core → examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils}/__init__.py +0 -0
- {synth_ai/lm/vendors/local → examples/task_apps/math}/__init__.py +0 -0
- {synth_ai/lm/vendors/supported → examples/workflows}/__init__.py +0 -0
- {synth_ai/v0/tracing → examples/workflows/math_rl}/__init__.py +0 -0
- /synth_ai/{compound/cais.py → cli/__main__.py} +0 -0
- /synth_ai/{learning/filtering.py → py.typed} +0 -0
- {synth_ai-0.2.9.dev0.dist-info → synth_ai-0.2.23.dev3.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.9.dev0.dist-info → synth_ai-0.2.23.dev3.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
# GEPA: Genetic Evolution for Prompt Optimization
|
|
2
|
+
|
|
3
|
+
This directory contains examples and configurations for using GEPA (Genetic Evolution for Prompt Optimization) to optimize prompts for various classification and reasoning tasks.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
**GEPA** is an evolutionary algorithm that optimizes prompts through genetic operations (mutation, crossover, selection) across multiple generations. It's particularly effective for:
|
|
8
|
+
- Intent classification (Banking77)
|
|
9
|
+
- Multi-hop QA (HotpotQA)
|
|
10
|
+
- Instruction following (IFBench)
|
|
11
|
+
- Claim verification (HoVer)
|
|
12
|
+
- Privacy-aware delegation (PUPA)
|
|
13
|
+
|
|
14
|
+
## Supported Tasks
|
|
15
|
+
|
|
16
|
+
Configuration files live under `configs/`:
|
|
17
|
+
|
|
18
|
+
| Task | Description | Config Files |
|
|
19
|
+
|------|-------------|--------------|
|
|
20
|
+
| **Banking77** | Intent classification (77 banking intents) | `banking77_gepa_local.toml`, `banking77_mipro_local.toml` |
|
|
21
|
+
| **HotpotQA** | Multi-hop question answering | `hotpotqa_gepa_local.toml`, `hotpotqa_mipro_local.toml` |
|
|
22
|
+
| **IFBench** | Instruction following benchmark | `ifbench_gepa_local.toml`, `ifbench_mipro_local.toml` |
|
|
23
|
+
| **HoVer** | Claim verification against Wikipedia | `hover_gepa_local.toml`, `hover_mipro_local.toml` |
|
|
24
|
+
| **PUPA** | Privacy-aware task delegation | `pupa_gepa_local.toml`, `pupa_mipro_local.toml` |
|
|
25
|
+
|
|
26
|
+
Each template targets a different default port (8110–8113) so you can run multiple task apps side-by-side. Note that the Banking77 quick start below uses port 8102.
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## Quick Start (Banking77 Example)
|
|
31
|
+
|
|
32
|
+
### Prerequisites
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
# 1. Install dependencies
|
|
36
|
+
uv pip install -e .
|
|
37
|
+
|
|
38
|
+
# 2. Set environment variables
|
|
39
|
+
export SYNTH_API_KEY="your-backend-api-key"
|
|
40
|
+
export GROQ_API_KEY="gsk_your_groq_key"
|
|
41
|
+
export ENVIRONMENT_API_KEY="$(python -c 'import secrets; print(secrets.token_urlsafe(32))')"
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
**Where to get API keys:**
|
|
45
|
+
- **GROQ_API_KEY**: Get from https://console.groq.com/keys
|
|
46
|
+
- **SYNTH_API_KEY**: Get from your backend admin or `.env.dev` file
|
|
47
|
+
- **ENVIRONMENT_API_KEY**: Generate a random secure token (command above)
|
|
48
|
+
|
|
49
|
+
### Step 1: Start the Backend
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
# Make sure your backend is running
|
|
53
|
+
curl http://localhost:8000/api/health
|
|
54
|
+
# Should return: {"status":"ok"}
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Step 2: Deploy Task App
|
|
58
|
+
|
|
59
|
+
**Option A: Using helper script (recommended)**
|
|
60
|
+
```bash
|
|
61
|
+
# Terminal 1
|
|
62
|
+
./examples/blog_posts/gepa/deploy_banking77_task_app.sh
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
**Option B: Using CLI**
|
|
66
|
+
```bash
|
|
67
|
+
uvx synth-ai deploy banking77 --runtime uvicorn --port 8102
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
**Option C: Deploy to Modal**
|
|
71
|
+
```bash
|
|
72
|
+
uvx synth-ai deploy banking77 --runtime modal --name banking77-gepa --env-file .env
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Step 3: Run GEPA Optimization
|
|
76
|
+
|
|
77
|
+
**Option A: Using helper script (recommended)**
|
|
78
|
+
```bash
|
|
79
|
+
# Terminal 2
|
|
80
|
+
./examples/blog_posts/gepa/run_gepa_banking77.sh
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
**Option B: Using CLI directly**
|
|
84
|
+
```bash
|
|
85
|
+
uvx synth-ai train \
|
|
86
|
+
--config examples/blog_posts/gepa/configs/banking77_gepa_local.toml \
|
|
87
|
+
--backend http://localhost:8000 \
|
|
88
|
+
--poll
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Step 4: Monitor Progress
|
|
92
|
+
|
|
93
|
+
You'll see real-time output like:
|
|
94
|
+
```
|
|
95
|
+
🧬 Running GEPA on Banking77
|
|
96
|
+
=============================
|
|
97
|
+
✅ Backend URL: http://localhost:8000
|
|
98
|
+
✅ Task app is healthy
|
|
99
|
+
|
|
100
|
+
🚀 Starting GEPA training...
|
|
101
|
+
|
|
102
|
+
proposal[0] train_accuracy=0.65 len=120 tool_rate=0.95 N=30
|
|
103
|
+
🔄 TRANSFORMATION:
|
|
104
|
+
[SYSTEM]: Classify customer banking queries into intents...
|
|
105
|
+
|
|
106
|
+
Generation 1/15: Best reward=0.75 (75% accuracy)
|
|
107
|
+
Generation 2/15: Best reward=0.82 (82% accuracy)
|
|
108
|
+
...
|
|
109
|
+
✅ GEPA training complete!
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Results are automatically saved to `configs/results/gepa_results_<job_id>_<timestamp>.txt`.
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## Configuration
|
|
117
|
+
|
|
118
|
+
### Example: Banking77 GEPA Configuration
|
|
119
|
+
|
|
120
|
+
```toml
|
|
121
|
+
[prompt_learning]
|
|
122
|
+
algorithm = "gepa"
|
|
123
|
+
task_app_url = "http://127.0.0.1:8102"
|
|
124
|
+
task_app_id = "banking77"
|
|
125
|
+
|
|
126
|
+
# Training seeds (30 seeds from train pool; the list is abbreviated with `...` here — write out every seed in a real TOML file)
|
|
127
|
+
evaluation_seeds = [50, 51, 52, ..., 79]
|
|
128
|
+
|
|
129
|
+
# Validation seeds (50 seeds from validation pool - not in training; the list is abbreviated with `...` here — write out every seed in a real TOML file)
|
|
130
|
+
validation_seeds = [0, 1, 2, ..., 49]
|
|
131
|
+
|
|
132
|
+
[prompt_learning.gepa]
|
|
133
|
+
initial_population_size = 20 # Starting population of prompts
|
|
134
|
+
num_generations = 15 # Number of evolutionary cycles
|
|
135
|
+
mutation_rate = 0.3 # Probability of mutation
|
|
136
|
+
crossover_rate = 0.5 # Probability of crossover
|
|
137
|
+
rollout_budget = 1000 # Total rollouts across all generations
|
|
138
|
+
max_concurrent_rollouts = 20 # Parallel rollout limit
|
|
139
|
+
pareto_set_size = 20 # Size of Pareto front
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### Key Parameters
|
|
143
|
+
|
|
144
|
+
| Parameter | Description | Typical Range |
|
|
145
|
+
|-----------|-------------|---------------|
|
|
146
|
+
| `initial_population_size` | Starting number of prompt variants | 10-50 |
|
|
147
|
+
| `num_generations` | Evolutionary cycles to run | 5-30 |
|
|
148
|
+
| `mutation_rate` | Probability of mutating a prompt | 0.1-0.5 |
|
|
149
|
+
| `crossover_rate` | Probability of combining two prompts | 0.3-0.7 |
|
|
150
|
+
| `rollout_budget` | Total task evaluations allowed | 200-2000 |
|
|
151
|
+
| `max_concurrent_rollouts` | Parallel rollout limit | 10-50 |
|
|
152
|
+
| `pareto_set_size` | Multi-objective optimization frontier size | 10-30 |
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## Querying Results
|
|
157
|
+
|
|
158
|
+
After GEPA completes, you can query job results programmatically:
|
|
159
|
+
|
|
160
|
+
### Python API
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
from synth_ai.learning import get_prompts, get_prompt_text, get_scoring_summary
|
|
164
|
+
|
|
165
|
+
# Get all results
|
|
166
|
+
results = get_prompts(
|
|
167
|
+
job_id="pl_abc123",
|
|
168
|
+
base_url="http://localhost:8000",
|
|
169
|
+
api_key="sk_..."
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
# Access best prompt
|
|
173
|
+
best_prompt = results["best_prompt"]
|
|
174
|
+
best_score = results["best_score"]
|
|
175
|
+
print(f"Best Score: {best_score:.3f}")
|
|
176
|
+
|
|
177
|
+
# Get top-K prompts
|
|
178
|
+
for prompt_info in results["top_prompts"]:
|
|
179
|
+
print(f"Rank {prompt_info['rank']}: {prompt_info['train_accuracy']:.3f}")
|
|
180
|
+
print(prompt_info["full_text"])
|
|
181
|
+
|
|
182
|
+
# Quick access to best prompt text only
|
|
183
|
+
best_text = get_prompt_text(
|
|
184
|
+
job_id="pl_abc123",
|
|
185
|
+
base_url="http://localhost:8000",
|
|
186
|
+
api_key="sk_...",
|
|
187
|
+
rank=1 # 1 = best, 2 = second best, etc.
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
# Get scoring statistics
|
|
191
|
+
summary = get_scoring_summary(
|
|
192
|
+
job_id="pl_abc123",
|
|
193
|
+
base_url="http://localhost:8000",
|
|
194
|
+
api_key="sk_..."
|
|
195
|
+
)
|
|
196
|
+
print(f"Best: {summary['best_train_accuracy']:.3f}")
|
|
197
|
+
print(f"Mean: {summary['mean_train_accuracy']:.3f}")
|
|
198
|
+
print(f"Tried: {summary['num_candidates_tried']}")
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### Command Line
|
|
202
|
+
|
|
203
|
+
```bash
|
|
204
|
+
# Set environment variables
|
|
205
|
+
export BACKEND_BASE_URL="http://localhost:8000"
|
|
206
|
+
export SYNTH_API_KEY="sk_..."
|
|
207
|
+
|
|
208
|
+
# Run the example script
|
|
209
|
+
python examples/blog_posts/gepa/query_prompts_example.py pl_abc123
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### REST API
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
# Get job status
|
|
216
|
+
curl -H "Authorization: Bearer $SYNTH_API_KEY" \
|
|
217
|
+
http://localhost:8000/api/prompt-learning/online/jobs/JOB_ID
|
|
218
|
+
|
|
219
|
+
# Stream events
|
|
220
|
+
curl -H "Authorization: Bearer $SYNTH_API_KEY" \
|
|
221
|
+
http://localhost:8000/api/prompt-learning/online/jobs/JOB_ID/events/stream
|
|
222
|
+
|
|
223
|
+
# Get metrics
|
|
224
|
+
curl -H "Authorization: Bearer $SYNTH_API_KEY" \
|
|
225
|
+
http://localhost:8000/api/prompt-learning/online/jobs/JOB_ID/metrics
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## Expected Results
|
|
231
|
+
|
|
232
|
+
GEPA typically improves accuracy over generations:
|
|
233
|
+
|
|
234
|
+
| Generation | Typical Accuracy | Notes |
|
|
235
|
+
|------------|------------------|-------|
|
|
236
|
+
| 1 (baseline) | 60-75% | Initial random/baseline prompts |
|
|
237
|
+
| 5 | 75-80% | Early optimization gains |
|
|
238
|
+
| 10 | 80-85% | Convergence begins |
|
|
239
|
+
| 15 (final) | 85-90%+ | Optimized prompts on Pareto front |
|
|
240
|
+
|
|
241
|
+
The Pareto front contains multiple prompt variants balancing:
|
|
242
|
+
- **Accuracy** (primary objective)
|
|
243
|
+
- **Token count** (efficiency objective)
|
|
244
|
+
- **Tool call rate** (task-specific objective)
|
|
245
|
+
|
|
246
|
+
---
|
|
247
|
+
|
|
248
|
+
## Helper Scripts
|
|
249
|
+
|
|
250
|
+
| Script | Purpose |
|
|
251
|
+
|--------|---------|
|
|
252
|
+
| `deploy_banking77_task_app.sh` | Start Banking77 task app locally |
|
|
253
|
+
| `run_gepa_banking77.sh` | Run GEPA optimization with validation checks |
|
|
254
|
+
| `test_gepa_local.sh` | Quick test script for local setup |
|
|
255
|
+
| `verify_banking77_setup.sh` | Comprehensive setup verification |
|
|
256
|
+
| `query_prompts_example.py` | Example script for querying results |
|
|
257
|
+
|
|
258
|
+
---
|
|
259
|
+
|
|
260
|
+
## Troubleshooting
|
|
261
|
+
|
|
262
|
+
### ❌ "Banking77 task app is not running"
|
|
263
|
+
|
|
264
|
+
**Solution:** Start the task app first
|
|
265
|
+
```bash
|
|
266
|
+
./examples/blog_posts/gepa/deploy_banking77_task_app.sh
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
### ❌ "Cannot connect to backend"
|
|
270
|
+
|
|
271
|
+
**Solution:** Verify backend is running
|
|
272
|
+
```bash
|
|
273
|
+
curl http://localhost:8000/api/health
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
If not running, start your backend service.
|
|
277
|
+
|
|
278
|
+
### ❌ "GROQ_API_KEY environment variable is required"
|
|
279
|
+
|
|
280
|
+
**Solution:** Export your Groq API key
|
|
281
|
+
```bash
|
|
282
|
+
export GROQ_API_KEY="gsk_your_key_here"
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
### ❌ "Failed to download dataset"
|
|
286
|
+
|
|
287
|
+
**Solution:** Check internet connection. The task app downloads from Hugging Face.
|
|
288
|
+
|
|
289
|
+
If you have the dataset locally:
|
|
290
|
+
```bash
|
|
291
|
+
export BANKING77_DATASET_NAME="/path/to/local/banking77"
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
### ❌ Pattern validation failed
|
|
295
|
+
|
|
296
|
+
**Solution:** Ensure your config's `initial_prompt.messages` uses the `{query}` wildcard:
|
|
297
|
+
```toml
|
|
298
|
+
[[prompt_learning.initial_prompt.messages]]
|
|
299
|
+
role = "user"
|
|
300
|
+
pattern = "Customer Query: {query}\n\nClassify this query."
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
### ⚠️ Metrics not streaming
|
|
304
|
+
|
|
305
|
+
**Solution:**
|
|
306
|
+
1. Verify backend `/metrics` endpoint exists
|
|
307
|
+
2. Check that the SDK `StreamConfig` enables `StreamType.METRICS`
|
|
308
|
+
3. Restart local backend to pick up latest code
|
|
309
|
+
|
|
310
|
+
---
|
|
311
|
+
|
|
312
|
+
## Files in This Directory
|
|
313
|
+
|
|
314
|
+
```
|
|
315
|
+
examples/blog_posts/gepa/
|
|
316
|
+
├── README.md # This file - comprehensive guide
|
|
317
|
+
├── configs/ # Configuration files
|
|
318
|
+
│ ├── banking77_gepa_local.toml # Banking77 GEPA config
|
|
319
|
+
│ ├── banking77_mipro_local.toml # Banking77 MIPRO config
|
|
320
|
+
│ ├── hotpotqa_gepa_local.toml # HotpotQA configs
|
|
321
|
+
│ ├── ifbench_gepa_local.toml # IFBench configs
|
|
322
|
+
│ ├── hover_gepa_local.toml # HoVer configs
|
|
323
|
+
│ └── pupa_gepa_local.toml # PUPA configs
|
|
324
|
+
├── deploy_banking77_task_app.sh # Helper: Start task app
|
|
325
|
+
├── run_gepa_banking77.sh # Helper: Run GEPA
|
|
326
|
+
├── test_gepa_local.sh # Helper: Quick test
|
|
327
|
+
├── verify_banking77_setup.sh # Helper: Verify setup
|
|
328
|
+
├── (baseline script lives outside this directory: examples/baseline/banking77_baseline.py)
|
|
329
|
+
├── query_prompts_example.py # Query results example
|
|
330
|
+
└── task_apps.py # Task app registry
|
|
331
|
+
```
|
|
332
|
+
|
|
333
|
+
---
|
|
334
|
+
|
|
335
|
+
## Next Steps
|
|
336
|
+
|
|
337
|
+
1. **Evaluate optimized prompts**: Test the best prompts on the held-out validation split
|
|
338
|
+
2. **Compare with baseline**: Run `uvx synth-ai baseline banking77` to measure improvement
|
|
339
|
+
3. **Experiment with parameters**: Adjust mutation/crossover rates, population size
|
|
340
|
+
4. **Try MIPRO**: Compare GEPA with MIPROv2 optimization
|
|
341
|
+
5. **Benchmark across tasks**: Test on HotpotQA, IFBench, HoVer, PUPA
|
|
342
|
+
|
|
343
|
+
---
|
|
344
|
+
|
|
345
|
+
## Support
|
|
346
|
+
|
|
347
|
+
For issues or questions:
|
|
348
|
+
|
|
349
|
+
1. Verify all API keys are set correctly
|
|
350
|
+
2. Check task app: `curl -H "X-API-Key: $ENVIRONMENT_API_KEY" http://127.0.0.1:8102/health`
|
|
351
|
+
3. Check backend: `curl http://localhost:8000/api/health`
|
|
352
|
+
4. Review logs in both terminals for error messages
|
|
353
|
+
5. Run verification script (from the repo root): `./examples/blog_posts/gepa/verify_banking77_setup.sh`
|
|
354
|
+
|
|
355
|
+
Happy optimizing! 🧬🚀
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
[prompt_learning]
|
|
2
|
+
algorithm = "gepa"
|
|
3
|
+
task_app_url = "https://synth-laboratories-dev--synth-banking77-web-web.modal.run"
|
|
4
|
+
task_app_id = "banking77"
|
|
5
|
+
|
|
6
|
+
# Initial prompt pattern (pattern-based mode)
|
|
7
|
+
[prompt_learning.initial_prompt]
|
|
8
|
+
id = "banking77_pattern"
|
|
9
|
+
name = "Banking77 Classification Pattern"
|
|
10
|
+
|
|
11
|
+
[[prompt_learning.initial_prompt.messages]]
|
|
12
|
+
role = "system"
|
|
13
|
+
pattern = "You are an expert banking assistant. \n\n**Available Banking Intents:**\n{available_intents}\n\n**Task:**\nCall the `banking77_classify` tool with the `intent` parameter set to ONE of the intent labels listed above that best matches the customer query. The intent must be an exact match from the list."
|
|
14
|
+
order = 0
|
|
15
|
+
|
|
16
|
+
[[prompt_learning.initial_prompt.messages]]
|
|
17
|
+
role = "user"
|
|
18
|
+
pattern = "Customer Query: {query}\n\nClassify this query by calling the tool with the correct intent label from the list above."
|
|
19
|
+
order = 1
|
|
20
|
+
|
|
21
|
+
[prompt_learning.initial_prompt.wildcards]
|
|
22
|
+
query = "REQUIRED" # Will be provided by task app at runtime
|
|
23
|
+
available_intents = "OPTIONAL" # Intent list (numbered 1-77) will be provided by task app
|
|
24
|
+
|
|
25
|
+
# Policy configuration (model, provider, etc.)
|
|
26
|
+
[prompt_learning.policy]
|
|
27
|
+
inference_mode = "synth_hosted"
|
|
28
|
+
model = "openai/gpt-oss-20b"
|
|
29
|
+
provider = "groq"
|
|
30
|
+
temperature = 0.0
|
|
31
|
+
max_completion_tokens = 512
|
|
32
|
+
policy_name = "banking77-classifier" # Required for Banking77 task app
|
|
33
|
+
|
|
34
|
+
# Training split config
|
|
35
|
+
[prompt_learning.env_config]
|
|
36
|
+
pool = "train"
|
|
37
|
+
|
|
38
|
+
# GEPA-specific configuration with nested subsections (mirrors RL structure)
|
|
39
|
+
[prompt_learning.gepa]
|
|
40
|
+
env_name = "banking77"
|
|
41
|
+
proposer_type = "dspy"
|
|
42
|
+
|
|
43
|
+
# Rollout configuration (mirrors RL [rollout] section)
|
|
44
|
+
[prompt_learning.gepa.rollout]
|
|
45
|
+
budget = 100
|
|
46
|
+
max_concurrent = 20
|
|
47
|
+
minibatch_size = 10
|
|
48
|
+
|
|
49
|
+
# Evaluation configuration (mirrors RL [evaluation] section)
|
|
50
|
+
[prompt_learning.gepa.evaluation]
|
|
51
|
+
seeds = [
|
|
52
|
+
50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
|
|
53
|
+
60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
|
|
54
|
+
70, 71, 72, 73, 74, 75, 76, 77, 78, 79
|
|
55
|
+
] # Training seeds (30 seeds from train pool)
|
|
56
|
+
validation_seeds = [
|
|
57
|
+
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
|
|
58
|
+
10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
|
|
59
|
+
20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
|
|
60
|
+
30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
|
|
61
|
+
40, 41, 42, 43, 44, 45, 46, 47, 48, 49
|
|
62
|
+
] # Held-out validation seeds (50 seeds from validation pool - not in training)
|
|
63
|
+
validation_pool = "validation"
|
|
64
|
+
validation_top_k = 3
|
|
65
|
+
test_pool = [2, 3] # Test pool for final evaluation (small held-out set)
|
|
66
|
+
|
|
67
|
+
# Mutation configuration (LLM-guided mutation settings)
|
|
68
|
+
[prompt_learning.gepa.mutation]
|
|
69
|
+
rate = 0.3
|
|
70
|
+
llm_model = "openai/gpt-oss-120b"
|
|
71
|
+
llm_provider = "groq"
|
|
72
|
+
llm_inference_url = "https://api.groq.com/openai/v1"
|
|
73
|
+
|
|
74
|
+
# Population configuration (evolution parameters)
|
|
75
|
+
[prompt_learning.gepa.population]
|
|
76
|
+
initial_size = 10
|
|
77
|
+
num_generations = 3
|
|
78
|
+
children_per_generation = 12
|
|
79
|
+
crossover_rate = 0.5
|
|
80
|
+
selection_pressure = 1.0
|
|
81
|
+
patience_generations = 3
|
|
82
|
+
|
|
83
|
+
# Archive configuration (Pareto archive settings)
|
|
84
|
+
[prompt_learning.gepa.archive]
|
|
85
|
+
size = 40
|
|
86
|
+
pareto_set_size = 32
|
|
87
|
+
pareto_eps = 1e-6
|
|
88
|
+
feedback_fraction = 0.5
|
|
89
|
+
|
|
90
|
+
# Token and budget configuration
|
|
91
|
+
[prompt_learning.gepa.token]
|
|
92
|
+
# max_limit = 1000 # Uncomment to set a token limit
|
|
93
|
+
counting_model = "gpt-4"
|
|
94
|
+
enforce_pattern_limit = true
|
|
95
|
+
# max_spend_usd = 100.0 # Uncomment to set a budget cap
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# GEPA Prompt Learning for Banking77
|
|
2
|
+
# Local backend configuration (localhost:8000)
|
|
3
|
+
|
|
4
|
+
[prompt_learning]
|
|
5
|
+
algorithm = "gepa"
|
|
6
|
+
task_app_url = "https://synth-laboratories-dev--synth-banking77-web-web.modal.run"
|
|
7
|
+
task_app_id = "banking77"
|
|
8
|
+
evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
|
|
9
|
+
|
|
10
|
+
# Held-out validation config
|
|
11
|
+
validation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
|
|
12
|
+
validation_pool = "validation"
|
|
13
|
+
validation_top_k = 3
|
|
14
|
+
|
|
15
|
+
# Training split config
|
|
16
|
+
[prompt_learning.env_config]
|
|
17
|
+
pool = "train"
|
|
18
|
+
|
|
19
|
+
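# Seeds for evaluation (increase to score prompts with more rollouts). NOTE: this key is parsed under [prompt_learning.env_config] and repeats the evaluation_seeds list already set under [prompt_learning].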
|
|
20
|
+
evaluation_seeds = [
|
|
21
|
+
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
|
|
22
|
+
10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
|
|
23
|
+
20, 21, 22, 23, 24, 25, 26, 27, 28, 29
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
# Test pool for final evaluation (held-out episodes)
|
|
27
|
+
test_pool = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
|
|
28
|
+
|
|
29
|
+
# Initial prompt pattern (pattern-based mode)
|
|
30
|
+
[prompt_learning.initial_prompt]
|
|
31
|
+
id = "banking77_pattern"
|
|
32
|
+
name = "Banking77 Classification Pattern"
|
|
33
|
+
|
|
34
|
+
[[prompt_learning.initial_prompt.messages]]
|
|
35
|
+
role = "system"
|
|
36
|
+
pattern = "You are an expert banking assistant that classifies customer queries into banking intents. Given a customer message, respond with exactly one intent label from the provided list using the `banking77_classify` tool."
|
|
37
|
+
order = 0
|
|
38
|
+
|
|
39
|
+
[[prompt_learning.initial_prompt.messages]]
|
|
40
|
+
role = "user"
|
|
41
|
+
pattern = "Customer Query: {query}\n\nClassify this query into one of the banking intents using the tool call."
|
|
42
|
+
order = 1
|
|
43
|
+
|
|
44
|
+
[prompt_learning.initial_prompt.wildcards]
|
|
45
|
+
query = "REQUIRED" # Will be provided by task app at runtime
|
|
46
|
+
|
|
47
|
+
# Policy configuration (model, provider, etc.)
|
|
48
|
+
[prompt_learning.policy]
|
|
49
|
+
inference_mode = "synth_hosted"
|
|
50
|
+
model = "openai/gpt-oss-120b"
|
|
51
|
+
provider = "groq"
|
|
52
|
+
temperature = 0.0
|
|
53
|
+
max_completion_tokens = 512
|
|
54
|
+
policy_name = "banking77-classifier" # Required for Banking77 task app
|
|
55
|
+
|
|
56
|
+
# GEPA-specific configuration
|
|
57
|
+
[prompt_learning.gepa]
|
|
58
|
+
env_name = "banking77"
|
|
59
|
+
initial_population_size = 40
|
|
60
|
+
num_generations = 10
|
|
61
|
+
mutation_rate = 0.3
|
|
62
|
+
crossover_rate = 0.5
|
|
63
|
+
selection_pressure = 1.0
|
|
64
|
+
minibatch_size = 12
|
|
65
|
+
pareto_set_size = 40
|
|
66
|
+
feedback_fraction = 0.5
|
|
67
|
+
children_per_generation = 16
|
|
68
|
+
patience_generations = 5
|
|
69
|
+
rollout_budget = 1500
|
|
70
|
+
archive_size = 30
|
|
71
|
+
pareto_eps = 1e-6
|
|
72
|
+
max_concurrent_rollouts = 20 # Maximum concurrent rollouts across all transformations
|
|
73
|
+
|
|
74
|
+
# Instruction proposer selection
|
|
75
|
+
proposer_type = "dspy"
|
|
76
|
+
|
|
77
|
+
# LLM-guided mutation configuration
|
|
78
|
+
mutation_llm_model = "openai/gpt-oss-20b"
|
|
79
|
+
mutation_llm_provider = "groq"
|
|
80
|
+
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# MIPROv2 Prompt Learning for Banking77
|
|
2
|
+
# Local backend configuration targeting the Banking77 intent classification task app.
|
|
3
|
+
|
|
4
|
+
[prompt_learning]
|
|
5
|
+
algorithm = "mipro"
|
|
6
|
+
task_app_url = "http://127.0.0.1:8102"
|
|
7
|
+
task_app_id = "banking77"
|
|
8
|
+
|
|
9
|
+
# Seeds evaluated during optimization
|
|
10
|
+
evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
|
|
11
|
+
|
|
12
|
+
# Held-out seeds for final scoring
|
|
13
|
+
test_pool = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
|
|
14
|
+
|
|
15
|
+
[prompt_learning.initial_prompt]
|
|
16
|
+
id = "banking77_pattern"
|
|
17
|
+
name = "Banking77 Classification Pattern"
|
|
18
|
+
|
|
19
|
+
[[prompt_learning.initial_prompt.messages]]
|
|
20
|
+
role = "system"
|
|
21
|
+
pattern = "You are an expert banking assistant that classifies customer queries into banking intents. Return only the intent label using the `banking77_classify` tool."
|
|
22
|
+
order = 0
|
|
23
|
+
|
|
24
|
+
[[prompt_learning.initial_prompt.messages]]
|
|
25
|
+
role = "user"
|
|
26
|
+
pattern = "Customer Query: {query}\n\nClassify this query into one of the banking intents using the tool call."
|
|
27
|
+
order = 1
|
|
28
|
+
|
|
29
|
+
[prompt_learning.initial_prompt.wildcards]
|
|
30
|
+
query = "REQUIRED"
|
|
31
|
+
|
|
32
|
+
[prompt_learning.policy]
|
|
33
|
+
model = "openai/gpt-oss-20b"
|
|
34
|
+
provider = "groq"
|
|
35
|
+
temperature = 0.0
|
|
36
|
+
max_completion_tokens = 128
|
|
37
|
+
policy_name = "banking77-mipro"
|
|
38
|
+
|
|
39
|
+
[prompt_learning.mipro]
|
|
40
|
+
env_name = "banking77"
|
|
41
|
+
num_iterations = 16
|
|
42
|
+
num_evaluations_per_iteration = 6
|
|
43
|
+
batch_size = 6
|
|
44
|
+
max_concurrent = 16
|
|
45
|
+
meta_model = "gpt-4.1-mini"
|
|
46
|
+
meta_model_provider = "openai"
|
|
47
|
+
few_shot_score_threshold = 0.85
|
|
48
|
+
test_pool = [20, 21, 22, 23, 24]
|
|
49
|
+
bootstrap_train_seeds = [0, 1, 2, 3, 4]
|
|
50
|
+
online_pool = [5, 6, 7, 8, 9]
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# GEPA Prompt Learning for the Banking77 Two-Stage Pipeline
|
|
2
|
+
# This config optimizes both classifier and calibrator stages using genetic evolution
|
|
3
|
+
|
|
4
|
+
[prompt_learning]
|
|
5
|
+
algorithm = "gepa"
|
|
6
|
+
task_app_url = "https://synth-laboratories-dev--synth-banking77-pipeline-web-web.modal.run"
|
|
7
|
+
task_app_id = "banking77-pipeline"
|
|
8
|
+
|
|
9
|
+
[prompt_learning.initial_prompt]
|
|
10
|
+
id = "banking77_pipeline_baseline"
|
|
11
|
+
name = "Banking77 Pipeline Baseline"
|
|
12
|
+
|
|
13
|
+
[[prompt_learning.initial_prompt.messages]]
|
|
14
|
+
role = "system"
|
|
15
|
+
pattern = "Pipeline placeholder message. Actual stage instructions are provided via metadata.pipeline_modules."
|
|
16
|
+
order = 0
|
|
17
|
+
|
|
18
|
+
[prompt_learning.initial_prompt.metadata]
|
|
19
|
+
# Define the stages in the pipeline
|
|
20
|
+
pipeline_modules = [
|
|
21
|
+
{ name = "classifier", instruction_text = "You are an expert banking assistant. Classify the customer query into one of the known Banking77 intents. Always return the label using the `banking77_classify` tool.", few_shots = [] },
|
|
22
|
+
{ name = "calibrator", instruction_text = "You refine intent predictions from an upstream classifier. Review the suggested intent alongside the original query. If the suggestion is valid, confirm it. Otherwise, choose the closest Banking77 intent. Always respond via the `banking77_classify` tool with the final label.", few_shots = [] }
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[prompt_learning.policy]
|
|
26
|
+
inference_mode = "synth_hosted"
|
|
27
|
+
model = "openai/gpt-oss-20b"
|
|
28
|
+
provider = "groq"
|
|
29
|
+
temperature = 0.0
|
|
30
|
+
max_completion_tokens = 128
|
|
31
|
+
policy_name = "banking77-pipeline"
|
|
32
|
+
|
|
33
|
+
[prompt_learning.env_config]
|
|
34
|
+
pool = "train"
|
|
35
|
+
|
|
36
|
+
# GEPA-specific configuration
|
|
37
|
+
[prompt_learning.gepa]
|
|
38
|
+
env_name = "banking77_pipeline"
|
|
39
|
+
rng_seed = 42
|
|
40
|
+
proposer_type = "spec" # Use spec mode to include system specification in mutation prompts
|
|
41
|
+
spec_path = "examples/task_apps/banking77_pipeline/banking77_pipeline_spec.json"
|
|
42
|
+
spec_max_tokens = 5000
|
|
43
|
+
spec_include_examples = true
|
|
44
|
+
spec_priority_threshold = 8
|
|
45
|
+
|
|
46
|
+
# Multi-stage module configuration (instruction-only for GEPA)
|
|
47
|
+
# Each module defines constraints for its stage
|
|
48
|
+
[[prompt_learning.gepa.modules]]
|
|
49
|
+
module_id = "classifier"
|
|
50
|
+
max_instruction_slots = 3
|
|
51
|
+
max_tokens = 1024
|
|
52
|
+
allowed_tools = ["banking77_classify"] # Classifier must use this tool
|
|
53
|
+
|
|
54
|
+
[[prompt_learning.gepa.modules]]
|
|
55
|
+
module_id = "calibrator"
|
|
56
|
+
max_instruction_slots = 3
|
|
57
|
+
max_tokens = 1024
|
|
58
|
+
allowed_tools = ["banking77_classify"] # Calibrator must use this tool
|
|
59
|
+
|
|
60
|
+
# Rollout configuration
|
|
61
|
+
[prompt_learning.gepa.rollout]
|
|
62
|
+
budget = 500 # Reduced for local testing
|
|
63
|
+
max_concurrent = 10
|
|
64
|
+
minibatch_size = 4
|
|
65
|
+
|
|
66
|
+
# Evaluation configuration
|
|
67
|
+
[prompt_learning.gepa.evaluation]
|
|
68
|
+
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
|
|
69
|
+
validation_seeds = [15, 16, 17, 18, 19]
|
|
70
|
+
test_pool = [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
|
|
71
|
+
|
|
72
|
+
# Mutation configuration (LLM-guided)
|
|
73
|
+
[prompt_learning.gepa.mutation]
|
|
74
|
+
rate = 0.3
|
|
75
|
+
llm_model = "llama3-groq-70b-8192-tool-use-preview"
|
|
76
|
+
llm_provider = "groq"
|
|
77
|
+
llm_inference_url = "https://api.groq.com/openai/v1"
|
|
78
|
+
|
|
79
|
+
# Population/evolution configuration
|
|
80
|
+
[prompt_learning.gepa.population]
|
|
81
|
+
initial_size = 10 # Reduced for local testing
|
|
82
|
+
num_generations = 5 # Reduced for local testing
|
|
83
|
+
children_per_generation = 3
|
|
84
|
+
crossover_rate = 0.5
|
|
85
|
+
selection_pressure = 1.0
|
|
86
|
+
patience_generations = 2
|
|
87
|
+
|
|
88
|
+
# Archive/Pareto configuration
|
|
89
|
+
[prompt_learning.gepa.archive]
|
|
90
|
+
size = 32 # Reduced for local testing
|
|
91
|
+
pareto_set_size = 32
|
|
92
|
+
pareto_eps = 1e-6
|
|
93
|
+
feedback_fraction = 0.5
|
|
94
|
+
|
|
95
|
+
# Token/budget configuration
|
|
96
|
+
[prompt_learning.gepa.token]
|
|
97
|
+
max_limit = 50000
|
|
98
|
+
counting_model = "gpt-4"
|
|
99
|
+
enforce_pattern_limit = true
|
|
100
|
+
max_spend_usd = 50.0 # Reduced for local testing
|
|
101
|
+
|