synth-ai 0.2.16__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of synth-ai might be problematic.

Files changed (299)
  1. examples/analyze_semantic_words.sh +2 -2
  2. examples/baseline/banking77_baseline.py +204 -0
  3. examples/baseline/crafter_baseline.py +407 -0
  4. examples/baseline/pokemon_red_baseline.py +326 -0
  5. examples/baseline/simple_baseline.py +56 -0
  6. examples/baseline/warming_up_to_rl_baseline.py +239 -0
  7. examples/blog_posts/gepa/README.md +355 -0
  8. examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
  9. examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
  10. examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
  11. examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
  12. examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
  13. examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
  14. examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
  15. examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
  16. examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
  17. examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
  18. examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
  19. examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
  20. examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
  21. examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
  22. examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
  23. examples/blog_posts/gepa/gepa_baseline.py +204 -0
  24. examples/blog_posts/gepa/query_prompts_example.py +97 -0
  25. examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
  26. examples/blog_posts/gepa/task_apps.py +105 -0
  27. examples/blog_posts/gepa/test_gepa_local.sh +67 -0
  28. examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
  29. examples/blog_posts/pokemon_vl/README.md +98 -0
  30. examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
  31. examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +27 -0
  32. examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
  33. examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
  34. examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +43 -0
  35. examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
  36. examples/blog_posts/pokemon_vl/extract_images.py +239 -0
  37. examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
  38. examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
  39. examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
  40. examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
  41. examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
  42. examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
  43. examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
  44. examples/blog_posts/warming_up_to_rl/README.md +158 -0
  45. examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
  46. examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
  47. examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
  48. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
  49. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
  50. examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
  51. examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
  52. examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
  53. examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
  54. examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +91 -0
  55. examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
  56. examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
  57. examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
  58. examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
  59. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
  60. examples/multi_step/configs/crafter_rl_outcome.toml +2 -1
  61. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +65 -107
  62. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +2 -1
  63. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +2 -1
  64. examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
  65. examples/multi_step/configs/verilog_rl_lora.toml +80 -123
  66. examples/qwen_coder/configs/coder_lora_30b.toml +1 -3
  67. examples/qwen_coder/configs/coder_lora_4b.toml +4 -1
  68. examples/qwen_coder/configs/coder_lora_small.toml +1 -3
  69. examples/qwen_vl/README.md +10 -12
  70. examples/qwen_vl/SETUP_COMPLETE.md +7 -8
  71. examples/qwen_vl/VISION_TESTS_COMPLETE.md +2 -3
  72. examples/qwen_vl/collect_data_via_cli.md +76 -84
  73. examples/qwen_vl/collect_vision_traces.py +4 -4
  74. examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +40 -57
  75. examples/qwen_vl/configs/crafter_vlm_sft_example.toml +1 -2
  76. examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +20 -37
  77. examples/qwen_vl/configs/eval_gpt5nano_vision.toml +21 -40
  78. examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
  79. examples/qwen_vl/configs/{filter_qwen2vl_sft.toml → filter_qwen3vl_sft.toml} +4 -5
  80. examples/qwen_vl/configs/filter_vision_sft.toml +2 -3
  81. examples/qwen_vl/crafter_qwen_vl_agent.py +5 -5
  82. examples/qwen_vl/run_vision_comparison.sh +6 -7
  83. examples/rl/README.md +5 -5
  84. examples/rl/configs/rl_from_base_qwen.toml +26 -1
  85. examples/rl/configs/rl_from_base_qwen17.toml +6 -2
  86. examples/rl/task_app/README.md +1 -2
  87. examples/rl/task_app/math_single_step.py +2 -2
  88. examples/run_crafter_demo.sh +2 -2
  89. examples/sft/README.md +1 -1
  90. examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -1
  91. examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -1
  92. examples/swe/task_app/README.md +32 -2
  93. examples/swe/task_app/grpo_swe_mini.py +4 -0
  94. examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
  95. examples/swe/task_app/hosted/envs/mini_swe/environment.py +37 -10
  96. examples/swe/task_app/hosted/inference/openai_client.py +4 -38
  97. examples/swe/task_app/hosted/policy_routes.py +17 -0
  98. examples/swe/task_app/hosted/rollout.py +4 -2
  99. examples/swe/task_app/morph_backend.py +178 -0
  100. examples/task_apps/banking77/__init__.py +6 -0
  101. examples/task_apps/banking77/banking77_task_app.py +841 -0
  102. examples/task_apps/banking77/deploy_wrapper.py +46 -0
  103. examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
  104. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
  105. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
  106. examples/task_apps/crafter/task_app/README.md +1 -1
  107. examples/task_apps/crafter/task_app/grpo_crafter.py +90 -5
  108. examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
  109. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +4 -26
  110. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
  111. examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
  112. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +372 -107
  113. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +81 -12
  114. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +82 -11
  115. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
  116. examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
  117. examples/task_apps/gepa_benchmarks/__init__.py +7 -0
  118. examples/task_apps/gepa_benchmarks/common.py +260 -0
  119. examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
  120. examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
  121. examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
  122. examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
  123. examples/task_apps/math/README.md +1 -2
  124. examples/task_apps/pokemon_red/README.md +3 -4
  125. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
  126. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
  127. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
  128. examples/task_apps/pokemon_red/task_app.py +288 -39
  129. examples/task_apps/sokoban/README.md +2 -3
  130. examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
  131. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
  132. examples/vlm/configs/crafter_vlm_gpt4o.toml +4 -1
  133. examples/warming_up_to_rl/configs/crafter_fft.toml +4 -1
  134. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -2
  135. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +3 -2
  136. examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
  137. examples/warming_up_to_rl/task_app/README.md +1 -1
  138. examples/warming_up_to_rl/task_app/grpo_crafter.py +185 -5
  139. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +1 -1
  140. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +3 -27
  141. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -1
  142. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
  143. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +156 -45
  144. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +37 -4
  145. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
  146. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
  147. examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
  148. examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +6 -0
  149. synth_ai/api/train/builders.py +99 -4
  150. synth_ai/api/train/cli.py +516 -26
  151. synth_ai/api/train/config_finder.py +13 -2
  152. synth_ai/api/train/configs/__init__.py +23 -2
  153. synth_ai/api/train/configs/prompt_learning.py +442 -0
  154. synth_ai/api/train/configs/rl.py +61 -7
  155. synth_ai/api/train/configs/sft.py +6 -2
  156. synth_ai/api/train/configs/shared.py +59 -2
  157. synth_ai/api/train/task_app.py +1 -1
  158. synth_ai/api/train/validators.py +277 -0
  159. synth_ai/auth/credentials.py +119 -0
  160. synth_ai/baseline/__init__.py +25 -0
  161. synth_ai/baseline/config.py +209 -0
  162. synth_ai/baseline/discovery.py +214 -0
  163. synth_ai/baseline/execution.py +146 -0
  164. synth_ai/cli/__init__.py +94 -18
  165. synth_ai/cli/__main__.py +0 -0
  166. synth_ai/cli/claude.py +70 -0
  167. synth_ai/cli/codex.py +84 -0
  168. synth_ai/cli/commands/__init__.py +18 -0
  169. synth_ai/cli/commands/baseline/__init__.py +12 -0
  170. synth_ai/cli/commands/baseline/core.py +637 -0
  171. synth_ai/cli/commands/baseline/list.py +93 -0
  172. synth_ai/cli/commands/demo/__init__.py +6 -0
  173. synth_ai/cli/commands/demo/core.py +163 -0
  174. synth_ai/cli/commands/eval/__init__.py +19 -0
  175. synth_ai/cli/commands/eval/core.py +1112 -0
  176. synth_ai/cli/commands/eval/errors.py +81 -0
  177. synth_ai/cli/commands/eval/validation.py +133 -0
  178. synth_ai/cli/commands/filter/__init__.py +12 -0
  179. synth_ai/cli/commands/filter/core.py +424 -0
  180. synth_ai/cli/commands/filter/errors.py +55 -0
  181. synth_ai/cli/commands/filter/validation.py +77 -0
  182. synth_ai/cli/commands/help/__init__.py +177 -0
  183. synth_ai/cli/commands/help/core.py +72 -0
  184. synth_ai/cli/commands/smoke/__init__.py +7 -0
  185. synth_ai/cli/commands/smoke/core.py +1436 -0
  186. synth_ai/cli/commands/status/__init__.py +64 -0
  187. synth_ai/cli/commands/status/client.py +192 -0
  188. synth_ai/cli/commands/status/config.py +92 -0
  189. synth_ai/cli/commands/status/errors.py +20 -0
  190. synth_ai/cli/commands/status/formatters.py +164 -0
  191. synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
  192. synth_ai/cli/commands/status/subcommands/files.py +79 -0
  193. synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
  194. synth_ai/cli/commands/status/subcommands/models.py +79 -0
  195. synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
  196. synth_ai/cli/commands/status/subcommands/runs.py +81 -0
  197. synth_ai/cli/commands/status/subcommands/summary.py +47 -0
  198. synth_ai/cli/commands/status/subcommands/usage.py +203 -0
  199. synth_ai/cli/commands/status/utils.py +114 -0
  200. synth_ai/cli/commands/train/__init__.py +53 -0
  201. synth_ai/cli/commands/train/core.py +21 -0
  202. synth_ai/cli/commands/train/errors.py +117 -0
  203. synth_ai/cli/commands/train/judge_schemas.py +200 -0
  204. synth_ai/cli/commands/train/judge_validation.py +305 -0
  205. synth_ai/cli/commands/train/validation.py +386 -0
  206. synth_ai/cli/demo.py +30 -158
  207. synth_ai/cli/deploy/__init__.py +43 -0
  208. synth_ai/cli/deploy.py +162 -0
  209. synth_ai/cli/eval/__init__.py +36 -0
  210. synth_ai/cli/eval/core.py +5 -0
  211. synth_ai/cli/eval/errors.py +31 -0
  212. synth_ai/cli/eval/validation.py +5 -0
  213. synth_ai/cli/filter/__init__.py +28 -0
  214. synth_ai/cli/filter/core.py +5 -0
  215. synth_ai/cli/filter/errors.py +23 -0
  216. synth_ai/cli/filter/validation.py +5 -0
  217. synth_ai/cli/legacy_root_backup.py +14 -8
  218. synth_ai/cli/modal_serve/__init__.py +12 -0
  219. synth_ai/cli/modal_serve/core.py +14 -0
  220. synth_ai/cli/modal_serve/errors.py +8 -0
  221. synth_ai/cli/modal_serve/validation.py +11 -0
  222. synth_ai/cli/opencode.py +107 -0
  223. synth_ai/cli/root.py +9 -5
  224. synth_ai/cli/serve/__init__.py +12 -0
  225. synth_ai/cli/serve/core.py +14 -0
  226. synth_ai/cli/serve/errors.py +8 -0
  227. synth_ai/cli/serve/validation.py +11 -0
  228. synth_ai/cli/setup.py +20 -265
  229. synth_ai/cli/status.py +7 -126
  230. synth_ai/cli/task_app_deploy.py +1 -10
  231. synth_ai/cli/task_app_modal_serve.py +4 -9
  232. synth_ai/cli/task_app_serve.py +4 -11
  233. synth_ai/cli/task_apps.py +51 -1480
  234. synth_ai/cli/train/__init__.py +12 -0
  235. synth_ai/cli/train/core.py +21 -0
  236. synth_ai/cli/train/errors.py +8 -0
  237. synth_ai/cli/train/validation.py +24 -0
  238. synth_ai/cli/train.py +1 -14
  239. synth_ai/demos/crafter/grpo_crafter_task_app.py +1 -1
  240. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
  241. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
  242. synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
  243. synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
  244. synth_ai/environments/examples/red/engine.py +33 -12
  245. synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
  246. synth_ai/environments/examples/red/environment.py +26 -0
  247. synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
  248. synth_ai/http.py +12 -0
  249. synth_ai/judge_schemas.py +10 -10
  250. synth_ai/learning/__init__.py +10 -0
  251. synth_ai/learning/prompt_learning_client.py +276 -0
  252. synth_ai/learning/prompt_learning_types.py +184 -0
  253. synth_ai/learning/rl/client.py +3 -1
  254. synth_ai/pricing/__init__.py +2 -0
  255. synth_ai/pricing/model_pricing.py +57 -0
  256. synth_ai/streaming/__init__.py +29 -0
  257. synth_ai/streaming/config.py +94 -0
  258. synth_ai/streaming/handlers.py +518 -0
  259. synth_ai/streaming/streamer.py +320 -0
  260. synth_ai/streaming/types.py +95 -0
  261. synth_ai/task/apps/__init__.py +1 -0
  262. synth_ai/task/config.py +2 -0
  263. synth_ai/task/tracing_utils.py +25 -25
  264. synth_ai/task/validators.py +45 -9
  265. synth_ai/task_app_cfgs.py +21 -0
  266. synth_ai/tracing_v3/config.py +162 -19
  267. synth_ai/tracing_v3/constants.py +1 -1
  268. synth_ai/tracing_v3/db_config.py +24 -38
  269. synth_ai/tracing_v3/migration_helper.py +1 -2
  270. synth_ai/tracing_v3/storage/config.py +47 -13
  271. synth_ai/tracing_v3/storage/factory.py +3 -3
  272. synth_ai/tracing_v3/turso/daemon.py +113 -11
  273. synth_ai/tracing_v3/turso/native_manager.py +92 -16
  274. synth_ai/types.py +8 -0
  275. synth_ai/urls.py +11 -0
  276. synth_ai/utils/__init__.py +30 -1
  277. synth_ai/utils/agents.py +74 -0
  278. synth_ai/utils/bin.py +39 -0
  279. synth_ai/utils/cli.py +149 -5
  280. synth_ai/utils/env.py +40 -33
  281. synth_ai/utils/http.py +4 -1
  282. synth_ai/utils/json.py +72 -0
  283. synth_ai/utils/modal.py +285 -3
  284. synth_ai/utils/paths.py +48 -0
  285. synth_ai/utils/uvicorn.py +113 -0
  286. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/METADATA +109 -6
  287. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/RECORD +291 -142
  288. examples/qwen_vl/configs/eval_qwen2vl_vision.toml +0 -44
  289. synth_ai/cli/tui.py +0 -62
  290. synth_ai/tui/__init__.py +0 -5
  291. synth_ai/tui/__main__.py +0 -13
  292. synth_ai/tui/cli/__init__.py +0 -1
  293. synth_ai/tui/cli/query_experiments.py +0 -164
  294. synth_ai/tui/cli/query_experiments_v3.py +0 -164
  295. synth_ai/tui/dashboard.py +0 -911
  296. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
  297. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
  298. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
  299. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,355 @@
+ # GEPA: Genetic Evolution for Prompt Optimization
+
+ This directory contains examples and configurations for using GEPA (Genetic Evolution for Prompt Optimization) to optimize prompts for various classification and reasoning tasks.
+
+ ## Overview
+
+ **GEPA** is an evolutionary algorithm that optimizes prompts through genetic operations (mutation, crossover, selection) across multiple generations. It's particularly effective for:
+ - Intent classification (Banking77)
+ - Multi-hop QA (HotpotQA)
+ - Instruction following (IFBench)
+ - Claim verification (HoVer)
+ - Privacy-aware delegation (PUPA)
+
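+ The loop below is only a conceptual sketch of that evolutionary process (all names are illustrative; it is not the synth-ai backend implementation), showing how population, selection, crossover, mutation, and a rollout budget fit together:
+
+ ```python
+ import random
+
+ def gepa_sketch(seed_prompt, evaluate, mutate, crossover,
+                 population_size=20, generations=15, rollout_budget=1000):
+     """Toy GEPA-style loop: evolve prompt variants under a rollout budget."""
+     population = [seed_prompt] + [mutate(seed_prompt) for _ in range(population_size - 1)]
+     best_prompt, best_score, rollouts = seed_prompt, float("-inf"), 0
+     for _ in range(generations):
+         scored = []
+         for prompt in population:
+             if rollouts >= rollout_budget:
+                 return best_prompt, best_score
+             score = evaluate(prompt)  # e.g. accuracy over a minibatch of seeds
+             rollouts += 1
+             scored.append((score, prompt))
+             if score > best_score:
+                 best_prompt, best_score = prompt, score
+         scored.sort(key=lambda pair: pair[0], reverse=True)
+         parents = [p for _, p in scored[: max(2, len(scored) // 2)]]  # selection
+         children = []
+         while len(parents) >= 2 and len(children) < population_size - len(parents):
+             a, b = random.sample(parents, 2)
+             child = crossover(a, b) if random.random() < 0.5 else a   # crossover
+             if random.random() < 0.3:                                 # mutation
+                 child = mutate(child)
+             children.append(child)
+         population = parents + children
+     return best_prompt, best_score
+ ```
+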
+ ## Supported Tasks
+
+ Configuration files live under `configs/`:
+
+ | Task | Description | Config Files |
+ |------|-------------|--------------|
+ | **Banking77** | Intent classification (77 banking intents) | `banking77_gepa_local.toml`, `banking77_mipro_local.toml` |
+ | **HotpotQA** | Multi-hop question answering | `hotpotqa_gepa_local.toml`, `hotpotqa_mipro_local.toml` |
+ | **IFBench** | Instruction following benchmark | `ifbench_gepa_local.toml`, `ifbench_mipro_local.toml` |
+ | **HoVer** | Claim verification against Wikipedia | `hover_gepa_local.toml`, `hover_mipro_local.toml` |
+ | **PUPA** | Privacy-aware task delegation | `pupa_gepa_local.toml`, `pupa_mipro_local.toml` |
+
+ Each template targets a different default port (8110–8113) so you can run multiple task apps side-by-side.
+
+ ---
+
+ ## Quick Start (Banking77 Example)
+
+ ### Prerequisites
+
+ ```bash
+ # 1. Install dependencies
+ uv pip install -e .
+
+ # 2. Set environment variables
+ export SYNTH_API_KEY="your-backend-api-key"
+ export GROQ_API_KEY="gsk_your_groq_key"
+ export ENVIRONMENT_API_KEY="$(python -c 'import secrets; print(secrets.token_urlsafe(32))')"
+ ```
+
+ **Where to get API keys:**
+ - **GROQ_API_KEY**: Get from https://console.groq.com/keys
+ - **SYNTH_API_KEY**: Get from your backend admin or `.env.dev` file
+ - **ENVIRONMENT_API_KEY**: Generate a random secure token (command above)
+
+ ### Step 1: Start the Backend
+
+ ```bash
+ # Make sure your backend is running
+ curl http://localhost:8000/api/health
+ # Should return: {"status":"ok"}
+ ```
+
+ ### Step 2: Deploy Task App
+
+ **Option A: Using helper script (recommended)**
+ ```bash
+ # Terminal 1
+ ./examples/blog_posts/gepa/deploy_banking77_task_app.sh
+ ```
+
+ **Option B: Using CLI**
+ ```bash
+ uvx synth-ai deploy banking77 --runtime uvicorn --port 8102
+ ```
+
+ **Option C: Deploy to Modal**
+ ```bash
+ uvx synth-ai deploy banking77 --runtime modal --name banking77-gepa --env-file .env
+ ```
+
+ ### Step 3: Run GEPA Optimization
+
+ **Option A: Using helper script (recommended)**
+ ```bash
+ # Terminal 2
+ ./examples/blog_posts/gepa/run_gepa_banking77.sh
+ ```
+
+ **Option B: Using CLI directly**
+ ```bash
+ uvx synth-ai train \
+   --config examples/blog_posts/gepa/configs/banking77_gepa_local.toml \
+   --backend http://localhost:8000 \
+   --poll
+ ```
+
+ ### Step 4: Monitor Progress
+
+ You'll see real-time output like:
+ ```
+ 🧬 Running GEPA on Banking77
+ =============================
+ ✅ Backend URL: http://localhost:8000
+ ✅ Task app is healthy
+
+ 🚀 Starting GEPA training...
+
+ proposal[0] train_accuracy=0.65 len=120 tool_rate=0.95 N=30
+ 🔄 TRANSFORMATION:
+ [SYSTEM]: Classify customer banking queries into intents...
+
+ Generation 1/15: Best reward=0.75 (75% accuracy)
+ Generation 2/15: Best reward=0.82 (82% accuracy)
+ ...
+ ✅ GEPA training complete!
+ ```
+
+ Results are automatically saved to `configs/results/gepa_results_<job_id>_<timestamp>.txt`.
+
+ ---
+
+ ## Configuration
+
+ ### Example: Banking77 GEPA Configuration
+
+ ```toml
+ [prompt_learning]
+ algorithm = "gepa"
+ task_app_url = "http://127.0.0.1:8102"
+ task_app_id = "banking77"
+
+ # Training seeds (30 seeds from train pool)
+ evaluation_seeds = [50, 51, 52, ..., 79]
+
+ # Validation seeds (50 seeds from validation pool - not in training)
+ validation_seeds = [0, 1, 2, ..., 49]
+
+ [prompt_learning.gepa]
+ initial_population_size = 20   # Starting population of prompts
+ num_generations = 15           # Number of evolutionary cycles
+ mutation_rate = 0.3            # Probability of mutation
+ crossover_rate = 0.5           # Probability of crossover
+ rollout_budget = 1000          # Total rollouts across all generations
+ max_concurrent_rollouts = 20   # Parallel rollout limit
+ pareto_set_size = 20           # Size of Pareto front
+ ```
+
+ ### Key Parameters
+
+ | Parameter | Description | Typical Range |
+ |-----------|-------------|---------------|
+ | `initial_population_size` | Starting number of prompt variants | 10-50 |
+ | `num_generations` | Evolutionary cycles to run | 5-30 |
+ | `mutation_rate` | Probability of mutating a prompt | 0.1-0.5 |
+ | `crossover_rate` | Probability of combining two prompts | 0.3-0.7 |
+ | `rollout_budget` | Total task evaluations allowed | 200-2000 |
+ | `max_concurrent_rollouts` | Parallel rollout limit | 10-50 |
+ | `pareto_set_size` | Multi-objective optimization frontier size | 10-30 |
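+
+ A quick way to sanity-check these values together: under the rough assumption that every candidate prompt is scored on one minibatch of seeds per generation, the implied number of task evaluations should stay near `rollout_budget` (the exact accounting is defined by the backend, so treat this only as a back-of-envelope estimate):
+
+ ```python
+ # Back-of-envelope rollout estimate (assumption: one minibatch per candidate per generation).
+ num_generations = 15
+ candidates_per_generation = 20   # e.g. initial_population_size
+ minibatch_size = 10              # seeds scored per candidate (see minibatch_size in configs/)
+ rollout_budget = 1000
+
+ estimated = num_generations * candidates_per_generation * minibatch_size
+ print(f"estimated evaluations: {estimated}, budget: {rollout_budget}")
+ # If the estimate far exceeds the budget, the run stops early once the budget
+ # is exhausted and later generations never execute.
+ ```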
+
+ ---
+
+ ## Querying Results
+
+ After GEPA completes, you can query job results programmatically:
+
+ ### Python API
+
+ ```python
+ from synth_ai.learning import get_prompts, get_prompt_text, get_scoring_summary
+
+ # Get all results
+ results = get_prompts(
+     job_id="pl_abc123",
+     base_url="http://localhost:8000",
+     api_key="sk_..."
+ )
+
+ # Access best prompt
+ best_prompt = results["best_prompt"]
+ best_score = results["best_score"]
+ print(f"Best Score: {best_score:.3f}")
+
+ # Get top-K prompts
+ for prompt_info in results["top_prompts"]:
+     print(f"Rank {prompt_info['rank']}: {prompt_info['train_accuracy']:.3f}")
+     print(prompt_info["full_text"])
+
+ # Quick access to best prompt text only
+ best_text = get_prompt_text(
+     job_id="pl_abc123",
+     base_url="http://localhost:8000",
+     api_key="sk_...",
+     rank=1  # 1 = best, 2 = second best, etc.
+ )
+
+ # Get scoring statistics
+ summary = get_scoring_summary(
+     job_id="pl_abc123",
+     base_url="http://localhost:8000",
+     api_key="sk_..."
+ )
+ print(f"Best: {summary['best_train_accuracy']:.3f}")
+ print(f"Mean: {summary['mean_train_accuracy']:.3f}")
+ print(f"Tried: {summary['num_candidates_tried']}")
+ ```
+
+ ### Command Line
+
+ ```bash
+ # Set environment variables
+ export BACKEND_BASE_URL="http://localhost:8000"
+ export SYNTH_API_KEY="sk_..."
+
+ # Run the example script
+ python examples/blog_posts/gepa/query_prompts_example.py pl_abc123
+ ```
+
+ ### REST API
+
+ ```bash
+ # Get job status
+ curl -H "Authorization: Bearer $SYNTH_API_KEY" \
+   http://localhost:8000/api/prompt-learning/online/jobs/JOB_ID
+
+ # Stream events
+ curl -H "Authorization: Bearer $SYNTH_API_KEY" \
+   http://localhost:8000/api/prompt-learning/online/jobs/JOB_ID/events/stream
+
+ # Get metrics
+ curl -H "Authorization: Bearer $SYNTH_API_KEY" \
+   http://localhost:8000/api/prompt-learning/online/jobs/JOB_ID/metrics
+ ```
+
+ ---
+
+ ## Expected Results
+
+ GEPA typically improves accuracy over generations:
+
+ | Generation | Typical Accuracy | Notes |
+ |------------|------------------|-------|
+ | 1 (baseline) | 60-75% | Initial random/baseline prompts |
+ | 5 | 75-80% | Early optimization gains |
+ | 10 | 80-85% | Convergence begins |
+ | 15 (final) | 85-90%+ | Optimized prompts on Pareto front |
+
+ The Pareto front contains multiple prompt variants balancing:
+ - **Accuracy** (primary objective)
+ - **Token count** (efficiency objective)
+ - **Tool call rate** (task-specific objective)
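+
+ A candidate stays on the Pareto front only if no other candidate is at least as good on every objective and strictly better on at least one. A minimal, self-contained illustration (the objective tuples below are made up for this example):
+
+ ```python
+ # Minimal Pareto-front filter; every objective is oriented so that higher is better.
+ def dominates(a, b):
+     """a dominates b if a >= b on every objective and > b on at least one."""
+     return all(x >= y for x, y in zip(a, b)) and any(x > y for x, y in zip(a, b))
+
+ def pareto_front(candidates):
+     """candidates: dict of name -> tuple of objective values."""
+     return {
+         name: objs
+         for name, objs in candidates.items()
+         if not any(dominates(other, objs)
+                    for other_name, other in candidates.items()
+                    if other_name != name)
+     }
+
+ # Example objectives: accuracy, negated token count (shorter is better), tool-call rate.
+ prompts = {
+     "A": (0.86, -140, 0.97),
+     "B": (0.84, -90, 0.98),
+     "C": (0.80, -150, 0.90),  # dominated by A
+ }
+ print(pareto_front(prompts))  # keeps A and B, drops C
+ ```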
+
+ ---
+
+ ## Helper Scripts
+
+ | Script | Purpose |
+ |--------|---------|
+ | `deploy_banking77_task_app.sh` | Start Banking77 task app locally |
+ | `run_gepa_banking77.sh` | Run GEPA optimization with validation checks |
+ | `test_gepa_local.sh` | Quick test script for local setup |
+ | `verify_banking77_setup.sh` | Comprehensive setup verification |
+ | `query_prompts_example.py` | Example script for querying results |
+
+ ---
+
+ ## Troubleshooting
+
+ ### ❌ "Banking77 task app is not running"
+
+ **Solution:** Start the task app first
+ ```bash
+ ./examples/blog_posts/gepa/deploy_banking77_task_app.sh
+ ```
+
+ ### ❌ "Cannot connect to backend"
+
+ **Solution:** Verify backend is running
+ ```bash
+ curl http://localhost:8000/api/health
+ ```
+
+ If not running, start your backend service.
+
+ ### ❌ "GROQ_API_KEY environment variable is required"
+
+ **Solution:** Export your Groq API key
+ ```bash
+ export GROQ_API_KEY="gsk_your_key_here"
+ ```
+
+ ### ❌ "Failed to download dataset"
+
+ **Solution:** Check internet connection. The task app downloads from Hugging Face.
+
+ If you have the dataset locally:
+ ```bash
+ export BANKING77_DATASET_NAME="/path/to/local/banking77"
+ ```
+
+ ### ❌ Pattern validation failed
+
+ **Solution:** Ensure your config's `initial_prompt.messages` uses the `{query}` wildcard:
+ ```toml
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Customer Query: {query}\n\nClassify this query."
+ ```
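+
+ At runtime the task app supplies the wildcard values, so the pattern behaves roughly like a `str.format` template (illustration only; the actual substitution happens inside the optimizer and task app):
+
+ ```python
+ # Illustration of pattern-based prompts: the {query} wildcard is filled per rollout.
+ pattern = "Customer Query: {query}\n\nClassify this query."
+ print(pattern.format(query="How do I activate my new card?"))
+ # Customer Query: How do I activate my new card?
+ #
+ # Classify this query.
+ ```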
+
+ ### ⚠️ Metrics not streaming
+
+ **Solution:**
+ 1. Verify backend `/metrics` endpoint exists
+ 2. Check SDK `StreamConfig` enables `StreamType.METRICS`
+ 3. Restart local backend to pick up latest code
+
+ ---
+
+ ## Files in This Directory
+
+ ```
+ examples/blog_posts/gepa/
+ ├── README.md                        # This file - comprehensive guide
+ ├── configs/                         # Configuration files
+ │   ├── banking77_gepa_local.toml    # Banking77 GEPA config
+ │   ├── banking77_mipro_local.toml   # Banking77 MIPRO config
+ │   ├── hotpotqa_gepa_local.toml     # HotpotQA configs
+ │   ├── ifbench_gepa_local.toml      # IFBench configs
+ │   ├── hover_gepa_local.toml        # HoVer configs
+ │   └── pupa_gepa_local.toml         # PUPA configs
+ ├── deploy_banking77_task_app.sh     # Helper: Start task app
+ ├── run_gepa_banking77.sh            # Helper: Run GEPA
+ ├── test_gepa_local.sh               # Helper: Quick test
+ ├── verify_banking77_setup.sh        # Helper: Verify setup
+ ├── (baseline: examples/baseline/banking77_baseline.py)
+ ├── query_prompts_example.py         # Query results example
+ └── task_apps.py                     # Task app registry
+ ```
+
+ ---
+
+ ## Next Steps
+
+ 1. **Evaluate optimized prompts**: Test best prompts on held-out validation split
+ 2. **Compare with baseline**: Run `uvx synth-ai baseline banking77` to measure improvement
+ 3. **Experiment with parameters**: Adjust mutation/crossover rates, population size
+ 4. **Try MIPRO**: Compare GEPA with MIPROv2 optimization
+ 5. **Benchmark across tasks**: Test on HotpotQA, IFBench, HoVer, PUPA
+
+ ---
+
+ ## Support
+
+ For issues or questions:
+
+ 1. Verify all API keys are set correctly
+ 2. Check task app: `curl -H "X-API-Key: $ENVIRONMENT_API_KEY" http://127.0.0.1:8102/health`
+ 3. Check backend: `curl http://localhost:8000/api/health`
+ 4. Review logs in both terminals for error messages
+ 5. Run verification script: `./verify_banking77_setup.sh`
+
+ Happy optimizing! 🧬🚀
@@ -0,0 +1,95 @@
+ [prompt_learning]
+ algorithm = "gepa"
+ task_app_url = "https://synth-laboratories-dev--synth-banking77-web-web.modal.run"
+ task_app_id = "banking77"
+
+ # Initial prompt pattern (pattern-based mode)
+ [prompt_learning.initial_prompt]
+ id = "banking77_pattern"
+ name = "Banking77 Classification Pattern"
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "system"
+ pattern = "You are an expert banking assistant that classifies customer queries into banking intents. Given a customer message, respond with exactly one intent label from the provided list using the `banking77_classify` tool."
+ order = 0
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Customer Query: {query}\n\nClassify this query into one of the banking intents using the tool call."
+ order = 1
+
+ [prompt_learning.initial_prompt.wildcards]
+ query = "REQUIRED" # Will be provided by task app at runtime
+
+ # Policy configuration (model, provider, etc.)
+ [prompt_learning.policy]
+ inference_mode = "synth_hosted"
+ model = "openai/gpt-oss-20b"
+ provider = "groq"
+ inference_url = "https://api.groq.com/openai/v1"
+ temperature = 0.0
+ max_completion_tokens = 512
+ policy_name = "banking77-classifier" # Required for Banking77 task app
+
+ # Training split config
+ [prompt_learning.env_config]
+ pool = "train"
+
+ # GEPA-specific configuration with nested subsections (mirrors RL structure)
+ [prompt_learning.gepa]
+ env_name = "banking77"
+ proposer_type = "dspy"
+
+ # Rollout configuration (mirrors RL [rollout] section)
+ [prompt_learning.gepa.rollout]
+ budget = 1000
+ max_concurrent = 20
+ minibatch_size = 10
+
+ # Evaluation configuration (mirrors RL [evaluation] section)
+ [prompt_learning.gepa.evaluation]
+ seeds = [
+     50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+     60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+     70, 71, 72, 73, 74, 75, 76, 77, 78, 79
+ ] # Training seeds (30 seeds from train pool)
+ validation_seeds = [
+     0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+     10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+     20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+     30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+     40, 41, 42, 43, 44, 45, 46, 47, 48, 49
+ ] # Held-out validation seeds (50 seeds from validation pool - not in training)
+ validation_pool = "validation"
+ validation_top_k = 3
+ test_pool = [2, 3] # Test pool for final evaluation (small held-out set)
+
+ # Mutation configuration (LLM-guided mutation settings)
+ [prompt_learning.gepa.mutation]
+ rate = 0.3
+ llm_model = "openai/gpt-oss-120b"
+ llm_provider = "groq"
+ llm_inference_url = "https://api.groq.com/openai/v1"
+
+ # Population configuration (evolution parameters)
+ [prompt_learning.gepa.population]
+ initial_size = 10
+ num_generations = 3
+ children_per_generation = 12
+ crossover_rate = 0.5
+ selection_pressure = 1.0
+ patience_generations = 3
+
+ # Archive configuration (Pareto archive settings)
+ [prompt_learning.gepa.archive]
+ size = 40
+ pareto_set_size = 32
+ pareto_eps = 1e-6
+ feedback_fraction = 0.5
+
+ # Token and budget configuration
+ [prompt_learning.gepa.token]
+ # max_limit = 1000 # Uncomment to set a token limit
+ counting_model = "gpt-4"
+ enforce_pattern_limit = true
+ # max_spend_usd = 100.0 # Uncomment to set a budget cap
@@ -0,0 +1,82 @@
+ # GEPA Prompt Learning for Banking77
+ # Local backend configuration (localhost:8000)
+
+ [prompt_learning]
+ algorithm = "gepa"
+ task_app_url = "https://synth-laboratories-dev--synth-banking77-web-web.modal.run"
+ task_app_id = "banking77"
+ evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
+
+ # Held-out validation config
+ validation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+ validation_pool = "validation"
+ validation_top_k = 3
+
+ # Training split config
+ [prompt_learning.env_config]
+ pool = "train"
+
+ # Seeds for evaluation (increase to score prompts with more rollouts)
+ evaluation_seeds = [
+     0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+     10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+     20, 21, 22, 23, 24, 25, 26, 27, 28, 29
+ ]
+
+ # Test pool for final evaluation (held-out episodes)
+ test_pool = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+
+ # Initial prompt pattern (pattern-based mode)
+ [prompt_learning.initial_prompt]
+ id = "banking77_pattern"
+ name = "Banking77 Classification Pattern"
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "system"
+ pattern = "You are an expert banking assistant that classifies customer queries into banking intents. Given a customer message, respond with exactly one intent label from the provided list using the `banking77_classify` tool."
+ order = 0
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Customer Query: {query}\n\nClassify this query into one of the banking intents using the tool call."
+ order = 1
+
+ [prompt_learning.initial_prompt.wildcards]
+ query = "REQUIRED" # Will be provided by task app at runtime
+
+ # Policy configuration (model, provider, etc.)
+ [prompt_learning.policy]
+ inference_mode = "synth_hosted"
+ model = "openai/gpt-oss-120b"
+ provider = "groq"
+ inference_url = "https://api.groq.com/openai/v1"
+ temperature = 0.0
+ max_completion_tokens = 512
+ policy_name = "banking77-classifier" # Required for Banking77 task app
+
+ # GEPA-specific configuration
+ [prompt_learning.gepa]
+ env_name = "banking77"
+ initial_population_size = 40
+ num_generations = 10
+ mutation_rate = 0.3
+ crossover_rate = 0.5
+ selection_pressure = 1.0
+ minibatch_size = 12
+ pareto_set_size = 40
+ feedback_fraction = 0.5
+ children_per_generation = 16
+ patience_generations = 5
+ rollout_budget = 1500
+ archive_size = 30
+ pareto_eps = 1e-6
+ max_concurrent_rollouts = 20 # Maximum concurrent rollouts across all transformations
+
+ # Instruction proposer selection
+ proposer_type = "dspy"
+
+ # LLM-guided mutation configuration
+ mutation_llm_model = "openai/gpt-oss-20b"
+ mutation_llm_provider = "groq"
+ mutation_llm_inference_url = "https://api.groq.com/openai/v1"
+
@@ -0,0 +1,52 @@
+ # MIPROv2 Prompt Learning for Banking77
+ # Local backend configuration targeting the Banking77 intent classification task app.
+
+ [prompt_learning]
+ algorithm = "mipro"
+ task_app_url = "http://127.0.0.1:8102"
+ task_app_id = "banking77"
+
+ # Seeds evaluated during optimisation
+ evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ # Held-out seeds for final scoring
+ test_pool = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+
+ [prompt_learning.initial_prompt]
+ id = "banking77_pattern"
+ name = "Banking77 Classification Pattern"
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "system"
+ pattern = "You are an expert banking assistant that classifies customer queries into banking intents. Return only the intent label using the `banking77_classify` tool."
+ order = 0
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Customer Query: {query}\n\nClassify this query into one of the banking intents using the tool call."
+ order = 1
+
+ [prompt_learning.initial_prompt.wildcards]
+ query = "REQUIRED"
+
+ [prompt_learning.policy]
+ model = "openai/gpt-oss-20b"
+ provider = "groq"
+ inference_url = "https://api.groq.com/openai/v1"
+ temperature = 0.0
+ max_completion_tokens = 128
+ policy_name = "banking77-mipro"
+
+ [prompt_learning.mipro]
+ env_name = "banking77"
+ num_iterations = 16
+ num_evaluations_per_iteration = 6
+ batch_size = 6
+ max_concurrent = 16
+ meta_model = "gpt-4.1-mini"
+ meta_model_provider = "openai"
+ meta_model_inference_url = "https://api.openai.com/v1"
+ few_shot_score_threshold = 0.85
+ test_pool = [20, 21, 22, 23, 24]
+ bootstrap_train_seeds = [0, 1, 2, 3, 4]
+ online_pool = [5, 6, 7, 8, 9]
@@ -0,0 +1,59 @@
+ # GEPA Prompt Learning for HotpotQA
+ # Local backend configuration targeting the HotpotQA task app.
+
+ [prompt_learning]
+ algorithm = "gepa"
+ task_app_url = "http://127.0.0.1:8110"
+ task_app_id = "hotpotqa"
+
+ # Seeds for online evaluation (episode IDs)
+ evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ # Held-out pool used for final evaluation
+ test_pool = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+
+ [prompt_learning.initial_prompt]
+ id = "hotpotqa_chain"
+ name = "HotpotQA Multi-Hop Reasoning"
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "system"
+ pattern = "You are a research assistant that answers multi-hop questions. Use the provided supporting passages to reason out the final answer. Reply with the format:\nAnswer: <short answer>\nSupport: <brief justification referencing the passages>."
+ order = 0
+
+ [[prompt_learning.initial_prompt.messages]]
+ role = "user"
+ pattern = "Question: {question}\n\nPassages:\n{context}\n\nProvide the final answer and cite the relevant supporting facts."
+ order = 1
+
+ [prompt_learning.initial_prompt.wildcards]
+ question = "REQUIRED"
+ context = "REQUIRED"
+
+ [prompt_learning.policy]
+ model = "openai/gpt-oss-20b"
+ provider = "groq"
+ inference_url = "https://api.groq.com/openai/v1"
+ temperature = 0.0
+ max_completion_tokens = 512
+ policy_name = "hotpotqa-gepa"
+
+ [prompt_learning.gepa]
+ env_name = "hotpotqa"
+ initial_population_size = 24
+ num_generations = 15
+ mutation_rate = 0.35
+ crossover_rate = 0.55
+ selection_pressure = 1.0
+ minibatch_size = 8
+ pareto_set_size = 24
+ feedback_fraction = 0.5
+ children_per_generation = 12
+ patience_generations = 5
+ rollout_budget = 600
+ archive_size = 36
+ pareto_eps = 1e-6
+ max_concurrent_rollouts = 24
+ mutation_llm_model = "openai/gpt-oss-20b"
+ mutation_llm_provider = "groq"
+ mutation_llm_inference_url = "https://api.groq.com/openai/v1"
@@ -0,0 +1,36 @@
+ [prompt_learning]
+ algorithm = "gepa"
+ task_app_url = "https://synth-laboratories-dev--synth-banking77-web-web.modal.run" # TODO: replace with HotpotQA task app URL
+ task_app_id = "hotpotqa"
+
+ # Seeds
+ evaluation_seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ # Held-out validation
+ validation_seeds = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+ validation_pool = "validation"
+ validation_top_k = 3
+
+ # Train split configuration
+ [prompt_learning.env_config]
+ pool = "train"
+
+ # Policy model (synth Qwen via backend inference proxy)
+ [prompt_learning.policy]
+ provider = "synth"
+ model = "Qwen/Qwen3-8B"
+ # inference_url will be mapped to backend /api/inference/v1 by the optimizer
+
+ # GEPA parameters (tune as needed)
+ [prompt_learning.gepa]
+ env_name = "hotpotqa"
+ initial_population_size = 24
+ num_generations = 6
+ children_per_generation = 12
+ minibatch_size = 10
+ pareto_set_size = 32
+ rollout_budget = 600
+ max_concurrent_rollouts = 16
+ mutation_llm_model = "openai/gpt-oss-120b"
+ mutation_llm_provider = "groq"
+ proposer_type = "dspy"