synth-ai 0.2.16__py3-none-any.whl → 0.2.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/analyze_semantic_words.sh +2 -2
- examples/baseline/banking77_baseline.py +204 -0
- examples/baseline/crafter_baseline.py +407 -0
- examples/baseline/pokemon_red_baseline.py +326 -0
- examples/baseline/simple_baseline.py +56 -0
- examples/baseline/warming_up_to_rl_baseline.py +239 -0
- examples/blog_posts/gepa/README.md +355 -0
- examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
- examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
- examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
- examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
- examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
- examples/blog_posts/gepa/gepa_baseline.py +204 -0
- examples/blog_posts/gepa/query_prompts_example.py +97 -0
- examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
- examples/blog_posts/gepa/task_apps.py +105 -0
- examples/blog_posts/gepa/test_gepa_local.sh +67 -0
- examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
- examples/blog_posts/pokemon_vl/README.md +98 -0
- examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
- examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +27 -0
- examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
- examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
- examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +43 -0
- examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
- examples/blog_posts/pokemon_vl/extract_images.py +239 -0
- examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
- examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
- examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
- examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
- examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
- examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
- examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
- examples/blog_posts/warming_up_to_rl/README.md +158 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
- examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
- examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
- examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +91 -0
- examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
- examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
- examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +2 -1
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +65 -107
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +2 -1
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +2 -1
- examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
- examples/multi_step/configs/verilog_rl_lora.toml +80 -123
- examples/qwen_coder/configs/coder_lora_30b.toml +1 -3
- examples/qwen_coder/configs/coder_lora_4b.toml +4 -1
- examples/qwen_coder/configs/coder_lora_small.toml +1 -3
- examples/qwen_vl/README.md +10 -12
- examples/qwen_vl/SETUP_COMPLETE.md +7 -8
- examples/qwen_vl/VISION_TESTS_COMPLETE.md +2 -3
- examples/qwen_vl/collect_data_via_cli.md +76 -84
- examples/qwen_vl/collect_vision_traces.py +4 -4
- examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +40 -57
- examples/qwen_vl/configs/crafter_vlm_sft_example.toml +1 -2
- examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +20 -37
- examples/qwen_vl/configs/eval_gpt5nano_vision.toml +21 -40
- examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
- examples/qwen_vl/configs/{filter_qwen2vl_sft.toml → filter_qwen3vl_sft.toml} +4 -5
- examples/qwen_vl/configs/filter_vision_sft.toml +2 -3
- examples/qwen_vl/crafter_qwen_vl_agent.py +5 -5
- examples/qwen_vl/run_vision_comparison.sh +6 -7
- examples/rl/README.md +5 -5
- examples/rl/configs/rl_from_base_qwen.toml +26 -1
- examples/rl/configs/rl_from_base_qwen17.toml +6 -2
- examples/rl/task_app/README.md +1 -2
- examples/rl/task_app/math_single_step.py +2 -2
- examples/run_crafter_demo.sh +2 -2
- examples/sft/README.md +1 -1
- examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -1
- examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -1
- examples/swe/task_app/README.md +32 -2
- examples/swe/task_app/grpo_swe_mini.py +4 -0
- examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +37 -10
- examples/swe/task_app/hosted/inference/openai_client.py +4 -38
- examples/swe/task_app/hosted/policy_routes.py +17 -0
- examples/swe/task_app/hosted/rollout.py +4 -2
- examples/swe/task_app/morph_backend.py +178 -0
- examples/task_apps/banking77/__init__.py +6 -0
- examples/task_apps/banking77/banking77_task_app.py +841 -0
- examples/task_apps/banking77/deploy_wrapper.py +46 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
- examples/task_apps/crafter/task_app/README.md +1 -1
- examples/task_apps/crafter/task_app/grpo_crafter.py +90 -5
- examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +4 -26
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +372 -107
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +81 -12
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +82 -11
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
- examples/task_apps/gepa_benchmarks/__init__.py +7 -0
- examples/task_apps/gepa_benchmarks/common.py +260 -0
- examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
- examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
- examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
- examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
- examples/task_apps/math/README.md +1 -2
- examples/task_apps/pokemon_red/README.md +3 -4
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
- examples/task_apps/pokemon_red/task_app.py +288 -39
- examples/task_apps/sokoban/README.md +2 -3
- examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
- examples/vlm/configs/crafter_vlm_gpt4o.toml +4 -1
- examples/warming_up_to_rl/configs/crafter_fft.toml +4 -1
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -2
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +3 -2
- examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
- examples/warming_up_to_rl/task_app/README.md +1 -1
- examples/warming_up_to_rl/task_app/grpo_crafter.py +185 -5
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +3 -27
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +156 -45
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +37 -4
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +6 -0
- synth_ai/api/train/builders.py +99 -4
- synth_ai/api/train/cli.py +516 -26
- synth_ai/api/train/config_finder.py +13 -2
- synth_ai/api/train/configs/__init__.py +23 -2
- synth_ai/api/train/configs/prompt_learning.py +442 -0
- synth_ai/api/train/configs/rl.py +61 -7
- synth_ai/api/train/configs/sft.py +6 -2
- synth_ai/api/train/configs/shared.py +59 -2
- synth_ai/api/train/task_app.py +1 -1
- synth_ai/api/train/validators.py +277 -0
- synth_ai/auth/credentials.py +119 -0
- synth_ai/baseline/__init__.py +25 -0
- synth_ai/baseline/config.py +209 -0
- synth_ai/baseline/discovery.py +214 -0
- synth_ai/baseline/execution.py +146 -0
- synth_ai/cli/__init__.py +94 -18
- synth_ai/cli/__main__.py +0 -0
- synth_ai/cli/claude.py +70 -0
- synth_ai/cli/codex.py +84 -0
- synth_ai/cli/commands/__init__.py +18 -0
- synth_ai/cli/commands/baseline/__init__.py +12 -0
- synth_ai/cli/commands/baseline/core.py +637 -0
- synth_ai/cli/commands/baseline/list.py +93 -0
- synth_ai/cli/commands/demo/__init__.py +6 -0
- synth_ai/cli/commands/demo/core.py +163 -0
- synth_ai/cli/commands/eval/__init__.py +19 -0
- synth_ai/cli/commands/eval/core.py +1112 -0
- synth_ai/cli/commands/eval/errors.py +81 -0
- synth_ai/cli/commands/eval/validation.py +133 -0
- synth_ai/cli/commands/filter/__init__.py +12 -0
- synth_ai/cli/commands/filter/core.py +424 -0
- synth_ai/cli/commands/filter/errors.py +55 -0
- synth_ai/cli/commands/filter/validation.py +77 -0
- synth_ai/cli/commands/help/__init__.py +177 -0
- synth_ai/cli/commands/help/core.py +72 -0
- synth_ai/cli/commands/smoke/__init__.py +7 -0
- synth_ai/cli/commands/smoke/core.py +1436 -0
- synth_ai/cli/commands/status/__init__.py +64 -0
- synth_ai/cli/commands/status/client.py +192 -0
- synth_ai/cli/commands/status/config.py +92 -0
- synth_ai/cli/commands/status/errors.py +20 -0
- synth_ai/cli/commands/status/formatters.py +164 -0
- synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
- synth_ai/cli/commands/status/subcommands/files.py +79 -0
- synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
- synth_ai/cli/commands/status/subcommands/models.py +79 -0
- synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
- synth_ai/cli/commands/status/subcommands/runs.py +81 -0
- synth_ai/cli/commands/status/subcommands/summary.py +47 -0
- synth_ai/cli/commands/status/subcommands/usage.py +203 -0
- synth_ai/cli/commands/status/utils.py +114 -0
- synth_ai/cli/commands/train/__init__.py +53 -0
- synth_ai/cli/commands/train/core.py +21 -0
- synth_ai/cli/commands/train/errors.py +117 -0
- synth_ai/cli/commands/train/judge_schemas.py +200 -0
- synth_ai/cli/commands/train/judge_validation.py +305 -0
- synth_ai/cli/commands/train/validation.py +386 -0
- synth_ai/cli/demo.py +30 -158
- synth_ai/cli/deploy/__init__.py +43 -0
- synth_ai/cli/deploy.py +162 -0
- synth_ai/cli/eval/__init__.py +36 -0
- synth_ai/cli/eval/core.py +5 -0
- synth_ai/cli/eval/errors.py +31 -0
- synth_ai/cli/eval/validation.py +5 -0
- synth_ai/cli/filter/__init__.py +28 -0
- synth_ai/cli/filter/core.py +5 -0
- synth_ai/cli/filter/errors.py +23 -0
- synth_ai/cli/filter/validation.py +5 -0
- synth_ai/cli/legacy_root_backup.py +14 -8
- synth_ai/cli/modal_serve/__init__.py +12 -0
- synth_ai/cli/modal_serve/core.py +14 -0
- synth_ai/cli/modal_serve/errors.py +8 -0
- synth_ai/cli/modal_serve/validation.py +11 -0
- synth_ai/cli/opencode.py +107 -0
- synth_ai/cli/root.py +9 -5
- synth_ai/cli/serve/__init__.py +12 -0
- synth_ai/cli/serve/core.py +14 -0
- synth_ai/cli/serve/errors.py +8 -0
- synth_ai/cli/serve/validation.py +11 -0
- synth_ai/cli/setup.py +20 -265
- synth_ai/cli/status.py +7 -126
- synth_ai/cli/task_app_deploy.py +1 -10
- synth_ai/cli/task_app_modal_serve.py +4 -9
- synth_ai/cli/task_app_serve.py +4 -11
- synth_ai/cli/task_apps.py +51 -1480
- synth_ai/cli/train/__init__.py +12 -0
- synth_ai/cli/train/core.py +21 -0
- synth_ai/cli/train/errors.py +8 -0
- synth_ai/cli/train/validation.py +24 -0
- synth_ai/cli/train.py +1 -14
- synth_ai/demos/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
- synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
- synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
- synth_ai/environments/examples/red/engine.py +33 -12
- synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
- synth_ai/environments/examples/red/environment.py +26 -0
- synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
- synth_ai/http.py +12 -0
- synth_ai/judge_schemas.py +10 -10
- synth_ai/learning/__init__.py +10 -0
- synth_ai/learning/prompt_learning_client.py +276 -0
- synth_ai/learning/prompt_learning_types.py +184 -0
- synth_ai/learning/rl/client.py +3 -1
- synth_ai/pricing/__init__.py +2 -0
- synth_ai/pricing/model_pricing.py +57 -0
- synth_ai/streaming/__init__.py +29 -0
- synth_ai/streaming/config.py +94 -0
- synth_ai/streaming/handlers.py +518 -0
- synth_ai/streaming/streamer.py +320 -0
- synth_ai/streaming/types.py +95 -0
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +2 -0
- synth_ai/task/tracing_utils.py +25 -25
- synth_ai/task/validators.py +45 -9
- synth_ai/task_app_cfgs.py +21 -0
- synth_ai/tracing_v3/config.py +162 -19
- synth_ai/tracing_v3/constants.py +1 -1
- synth_ai/tracing_v3/db_config.py +24 -38
- synth_ai/tracing_v3/migration_helper.py +1 -2
- synth_ai/tracing_v3/storage/config.py +47 -13
- synth_ai/tracing_v3/storage/factory.py +3 -3
- synth_ai/tracing_v3/turso/daemon.py +113 -11
- synth_ai/tracing_v3/turso/native_manager.py +92 -16
- synth_ai/types.py +8 -0
- synth_ai/urls.py +11 -0
- synth_ai/utils/__init__.py +30 -1
- synth_ai/utils/agents.py +74 -0
- synth_ai/utils/bin.py +39 -0
- synth_ai/utils/cli.py +149 -5
- synth_ai/utils/env.py +40 -33
- synth_ai/utils/http.py +4 -1
- synth_ai/utils/json.py +72 -0
- synth_ai/utils/modal.py +285 -3
- synth_ai/utils/paths.py +48 -0
- synth_ai/utils/uvicorn.py +113 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/METADATA +109 -6
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/RECORD +291 -142
- examples/qwen_vl/configs/eval_qwen2vl_vision.toml +0 -44
- synth_ai/cli/tui.py +0 -62
- synth_ai/tui/__init__.py +0 -5
- synth_ai/tui/__main__.py +0 -13
- synth_ai/tui/cli/__init__.py +0 -1
- synth_ai/tui/cli/query_experiments.py +0 -164
- synth_ai/tui/cli/query_experiments_v3.py +0 -164
- synth_ai/tui/dashboard.py +0 -911
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""Banking77 baseline file for intent classification evaluation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict
|
|
6
|
+
|
|
7
|
+
from datasets import load_dataset
|
|
8
|
+
|
|
9
|
+
from synth_ai.baseline import BaselineConfig, BaselineTaskRunner, DataSplit, TaskResult
|
|
10
|
+
from synth_ai.inference import InferenceClient
|
|
11
|
+
import os
|
|
12
|
+
import httpx
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Load dataset once at module level
|
|
16
|
+
_dataset = None
|
|
17
|
+
_label_names = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _load_dataset():
    """Return the cached Banking77 dataset and label names, loading on first use.

    The first call tries the ``PolyAI/banking77`` dataset id and, if that
    raises, retries with the bare ``banking77`` id. The loaded dataset and the
    label names read from its ``train`` split are stored in the module globals
    ``_dataset`` and ``_label_names`` so later calls return immediately.

    Returns:
        A ``(dataset, label_names)`` tuple.
    """
    global _dataset, _label_names

    # Already populated by an earlier call: nothing to do.
    if _dataset is not None:
        return _dataset, _label_names

    try:
        loaded = load_dataset("PolyAI/banking77")
    except Exception:
        # Fallback: try without org prefix
        loaded = load_dataset("banking77")

    _dataset = loaded
    _label_names = _dataset["train"].features["label"].names
    return _dataset, _label_names
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class Banking77TaskRunner(BaselineTaskRunner):
    """Task runner for Banking77 intent classification.

    Each task picks one dataset example by ``seed``, asks the configured model
    to classify it through a forced ``banking77_classify`` tool call, and
    scores an exact match against the example's label.
    """

    def __init__(self, policy_config: Dict[str, Any], env_config: Dict[str, Any]):
        """Load the dataset and capture inference settings.

        Args:
            policy_config: Must contain ``"model"``. Optional keys read here:
                ``"temperature"`` (default 0.0), ``"max_tokens"`` (default 128)
                and ``"inference_url"``.
            env_config: Passed to the base class; ``run_task`` reads its
                ``"split"`` key.
        """
        super().__init__(policy_config, env_config)

        # Load dataset
        self.dataset, self.label_names = _load_dataset()

        # Store config for inference
        self.model = policy_config["model"]
        self.temperature = policy_config.get("temperature", 0.0)
        self.max_tokens = policy_config.get("max_tokens", 128)
        self.inference_url = policy_config.get("inference_url")

        # Tool definition: a single string argument restricted to the known labels.
        self.tool = {
            "type": "function",
            "function": {
                "name": "banking77_classify",
                "description": "Classify a banking query into an intent",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "label": {
                            "type": "string",
                            "enum": self.label_names,
                            "description": "The intent label",
                        }
                    },
                    "required": ["label"],
                },
            },
        }

    def _resolve_split(self, split: str) -> str:
        """Map a requested split name onto a split present in the dataset.

        The baseline config in this file sizes its "val" seeds from the "test"
        split, so "val" is served from "test" whenever the dataset itself has
        no "val" split. Any other name is returned unchanged.
        """
        if split == "val" and split not in self.dataset and "test" in self.dataset:
            return "test"
        return split

    @staticmethod
    def _extract_label(response: Any) -> str:
        """Pull the predicted ``label`` out of a chat-completion style response.

        Handles both a ``choices[0].message.tool_calls`` layout and a top-level
        ``tool_calls`` list, and tool arguments given either as a JSON string or
        as a dict. Returns ``""`` when no usable label is found, including when
        the arguments are not valid JSON.
        """
        import json

        tool_calls = []
        if "choices" in response and len(response["choices"]) > 0:
            # ``message`` may be present but null; treat that like a missing message.
            message = response["choices"][0].get("message") or {}
            tool_calls = message.get("tool_calls", [])
        elif "tool_calls" in response:
            tool_calls = response["tool_calls"]

        if not tool_calls:
            return ""

        # Handle both string and dict arguments
        args = tool_calls[0]["function"].get("arguments", "")
        if isinstance(args, str):
            try:
                args = json.loads(args)
            except json.JSONDecodeError:
                # Malformed tool arguments count as "no prediction", not a crash.
                return ""
        return args.get("label", "") if isinstance(args, dict) else ""

    async def run_task(self, seed: int) -> TaskResult:
        """Run a single Banking77 classification task.

        Args:
            seed: Index of the example within the selected split.

        Returns:
            A ``TaskResult`` with ``outcome_reward`` 1.0 for an exact label
            match and 0.0 otherwise; metadata records the query, the expected
            and predicted labels, and the requested split.
        """
        # Get split
        split = self.env_config.get("split", "train")

        # Get example from dataset
        example = self.dataset[self._resolve_split(split)][seed]

        # Build prompt
        system_prompt = f"""You are an expert banking assistant that classifies customer queries.
Given a customer message, respond with exactly one intent label using the tool call.

Valid intents: {', '.join(self.label_names)}"""

        user_prompt = f"Customer Query: {example['text']}\n\nClassify this query."

        # Run inference
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        tool_choice = {"type": "function", "function": {"name": "banking77_classify"}}

        # Use InferenceClient if URL provided, otherwise use OpenAI-compatible API
        if self.inference_url and self.inference_url.startswith("http"):
            api_key = os.getenv("SYNTH_API_KEY") or os.getenv("OPENAI_API_KEY") or ""
            base_url = self.inference_url.rstrip("/")
            # Append "/api" only when the URL does not already contain it.
            if "/api" not in base_url:
                base_url = f"{base_url}/api"
            client = InferenceClient(base_url=base_url, api_key=api_key)
            response = await client.create_chat_completion(
                model=self.model,
                messages=messages,
                tools=[self.tool],
                tool_choice=tool_choice,
                temperature=self.temperature,
                max_tokens=self.max_tokens,
            )
        else:
            # Use OpenAI/Groq directly; a "groq:" model prefix selects Groq.
            model_name = self.model
            use_groq = model_name.startswith("groq:")
            if use_groq:
                model_name = model_name[5:]  # Remove "groq:" prefix

            api_key = os.getenv("GROQ_API_KEY" if use_groq else "OPENAI_API_KEY") or ""
            base_url = "https://api.groq.com/openai/v1" if use_groq else "https://api.openai.com/v1"
            async with httpx.AsyncClient() as http_client:
                resp = await http_client.post(
                    f"{base_url}/chat/completions",
                    json={
                        "model": model_name,
                        "messages": messages,
                        "tools": [self.tool],
                        "tool_choice": tool_choice,
                        "temperature": self.temperature,
                        "max_tokens": self.max_tokens,
                    },
                    headers={"Authorization": f"Bearer {api_key}"} if api_key else {},
                )
                # NOTE(review): the HTTP status is not checked, so an error
                # payload is scored as a wrong prediction -- confirm intended.
                response = resp.json()

        # Extract prediction
        predicted_label = self._extract_label(response)

        # Evaluate
        expected_label = self.label_names[example["label"]]
        correct = predicted_label == expected_label

        return TaskResult(
            seed=seed,
            success=True,
            outcome_reward=1.0 if correct else 0.0,
            total_steps=1,
            metadata={
                "query": example["text"],
                "expected": expected_label,
                "predicted": predicted_label,
                "correct": correct,
                "split": split,
            },
        )
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
# Define baseline config
|
|
168
|
+
# Note: We need to load the dataset first to get the label names
|
|
169
|
+
# Populate the module-level dataset cache so the seed lists below can be sized.
_load_dataset()

gepa_baseline = BaselineConfig(
    baseline_id="gepa",
    name="GEPA - Banking77 Intent Classification",
    description="Banking77 intent classification baseline for prompt optimization experiments",
    task_runner=Banking77TaskRunner,
    # Each entry is (split name, dataset split used for sizing, seed cap).
    # Seeds are 0..min(cap, split size)-1, or 0..cap-1 if the dataset is empty/unset.
    splits={
        split_name: DataSplit(
            name=split_name,
            seeds=list(range(min(cap, len(_dataset[source_split])) if _dataset else cap)),
        )
        for split_name, source_split, cap in (
            ("train", "train", 10000),
            ("val", "test", 1000),
            ("test", "test", 3000),
        )
    },
    default_policy_config={
        "model": "groq:llama-3.1-70b-versatile",
        "temperature": 0.0,
        "max_tokens": 128,
    },
    default_env_config={
        "split": "train",
    },
    metadata={
        "dataset": "PolyAI/banking77",
        "num_classes": 77,
        "task_type": "classification",
    },
    tags=["classification", "nlp", "intent", "blog-post"],
)
|
|
204
|
+
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Example script showing how to query prompt learning job results.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python query_prompts_example.py pl_9c58b711c2644083
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import sys
|
|
10
|
+
from pprint import pprint
|
|
11
|
+
|
|
12
|
+
from synth_ai.learning import get_prompts, get_prompt_text, get_scoring_summary
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def main():
    """Print the prompt-learning results for the job id given on the command line.

    Reads the job id from ``sys.argv[1]``, the backend URL from
    ``BACKEND_BASE_URL`` (default ``http://localhost:8000``) and the API key
    from ``SYNTH_API_KEY``. Exits with status 1 when the job id or the API key
    is missing. Otherwise prints the best score, the top prompts, a scoring
    summary and the best prompt text.
    """
    if len(sys.argv) < 2:
        print("Usage: python query_prompts_example.py <job_id>")
        print("Example: python query_prompts_example.py pl_9c58b711c2644083")
        sys.exit(1)

    job_id = sys.argv[1]

    # Get credentials from environment
    base_url = os.getenv("BACKEND_BASE_URL", "http://localhost:8000")
    api_key = os.getenv("SYNTH_API_KEY")

    if not api_key:
        print("Error: SYNTH_API_KEY environment variable not set")
        sys.exit(1)

    print(f"Querying job: {job_id}")
    print(f"Backend: {base_url}")
    print("=" * 80)

    # Get all prompts and metadata
    print("\n📊 Fetching prompt results...")
    results = get_prompts(job_id, base_url, api_key)

    # Print best score
    if results.best_score is not None:
        print(f"\n🏆 Best Score: {results.best_score:.3f} ({results.best_score * 100:.1f}%)")

    # Print top-K prompts with scores
    top_prompts = results.top_prompts
    if top_prompts:
        print(f"\n📝 Top {len(top_prompts)} Prompts:")
        print("=" * 80)
        for prompt_info in sorted(top_prompts, key=lambda p: p.get("rank", 999)):
            # The sort key tolerates a missing rank, so the display must too.
            rank = prompt_info.get("rank", "?")
            train_accuracy = prompt_info.get("train_accuracy")
            val_accuracy = prompt_info.get("val_accuracy")

            print(f"\nRank #{rank}:")
            if train_accuracy is not None:
                print(f" Train Accuracy: {train_accuracy:.3f} ({train_accuracy * 100:.1f}%)")
            if val_accuracy is not None:
                print(f" Val Accuracy: {val_accuracy:.3f} ({val_accuracy * 100:.1f}%)")
            print(" Prompt Text:")
            print(" " + "-" * 76)
            # Treat an explicit null text the same as a missing one.
            full_text = prompt_info.get("full_text") or ""
            for line in full_text.split("\n"):
                print(f" {line}")
            print(" " + "-" * 76)

    # Get scoring summary
    print("\n📈 Scoring Summary:")
    print("=" * 80)
    summary = get_scoring_summary(job_id, base_url, api_key)

    print(f"Best Train Accuracy: {summary['best_train_accuracy']:.3f} ({summary['best_train_accuracy'] * 100:.1f}%)")
    # Compare against None so a legitimate 0.0 validation score is still shown.
    best_val_accuracy = summary['best_val_accuracy']
    if best_val_accuracy is not None:
        print(f"Best Val Accuracy: {best_val_accuracy:.3f} ({best_val_accuracy * 100:.1f}%)")
    print(f"Mean Train Accuracy: {summary['mean_train_accuracy']:.3f} ({summary['mean_train_accuracy'] * 100:.1f}%)")
    print(f"Candidates Tried: {summary['num_candidates_tried']}")
    print(f"Frontier Candidates: {summary['num_frontier_candidates']}")

    print("\nScore Distribution:")
    for bin_range, count in summary['score_distribution'].items():
        bar = "█" * count
        print(f" {bin_range}: {count:3d} {bar}")

    # Quick access to best prompt text only
    print("\n💡 Quick access to best prompt:")
    print("=" * 80)
    best_text = get_prompt_text(job_id, base_url, api_key, rank=1)
    if best_text:
        print(best_text)
    else:
        print("Best prompt text not available yet (job may still be running)")

    print("\n" + "=" * 80)
    print("✅ Query complete!")


if __name__ == "__main__":
    main()
|
|
97
|
+
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# Run GEPA optimization for Banking77 against the backend
|
|
3
|
+
|
|
4
|
+
# Abort on command failure, on use of an unset variable, and on pipeline errors.
set -euo pipefail

# Local task app endpoint and training config used throughout this script.
readonly TASK_APP_URL="http://127.0.0.1:8102"
readonly CONFIG_PATH="examples/blog_posts/gepa/configs/banking77_gepa_local.toml"

#######################################
# Exit with an error unless the named environment variable is non-empty.
# Arguments: $1 - variable name
#            $2 - one-line hint telling the user where the value comes from
# Outputs:   error text on stderr
#######################################
require_env() {
  local name=$1
  local hint=$2
  if [[ -z "${!name:-}" ]]; then
    echo "❌ Error: ${name} not set" >&2
    echo "${hint}" >&2
    echo " export ${name}=your_key" >&2
    exit 1
  fi
}

#######################################
# Print a short, non-sensitive preview of a secret (first 4 characters).
# Arguments: $1 - secret value
#######################################
mask_secret() {
  printf '%s...' "${1:0:4}"
}

echo "🧬 Running GEPA on Banking77"
echo "============================="

# Check for required environment variables
require_env SYNTH_API_KEY "Please get your API key from the backend and set it:"
require_env ENVIRONMENT_API_KEY "Please set the same key used when deploying the task app:"
require_env GROQ_API_KEY "Please set your Groq API key:"

# Default to localhost backend if not specified
BACKEND_URL="${BACKEND_BASE_URL:-http://localhost:8000}"

# Show only a short prefix of each key so secrets do not end up in logs.
echo "✅ SYNTH_API_KEY: $(mask_secret "$SYNTH_API_KEY")"
echo "✅ ENVIRONMENT_API_KEY: $(mask_secret "$ENVIRONMENT_API_KEY")"
echo "✅ GROQ_API_KEY: $(mask_secret "$GROQ_API_KEY")"
echo "✅ Backend URL: $BACKEND_URL"
echo ""

# Navigate to repo root
cd -- "$(dirname -- "$0")/../../.." || {
  echo "❌ Error: cannot change to repository root" >&2
  exit 1
}

# Check if task app is running
echo "🔍 Checking if Banking77 task app is running on ${TASK_APP_URL}..."
if ! curl -s -f -H "X-API-Key: $ENVIRONMENT_API_KEY" "${TASK_APP_URL}/health" > /dev/null 2>&1; then
  echo "❌ Error: Banking77 task app is not running on ${TASK_APP_URL}" >&2
  echo "" >&2
  echo "Please start it first:" >&2
  echo " ./examples/blog_posts/gepa/deploy_banking77_task_app.sh" >&2
  echo "" >&2
  echo "Or in another terminal:" >&2
  echo " cd $(pwd)" >&2
  echo " uvx synth-ai deploy banking77 --runtime uvicorn --port 8102" >&2
  exit 1
fi
echo "✅ Task app is healthy"
echo ""

# Check backend connection
echo "🔍 Checking backend connection to $BACKEND_URL..."
if ! curl -s -f "$BACKEND_URL/api/health" > /dev/null 2>&1; then
  echo "⚠️ Warning: Cannot connect to backend at $BACKEND_URL"
  echo "Make sure the backend is running."
  # With no interactive stdin, read fails; treat that as "no" instead of
  # letting set -e abort without explanation.
  reply=""
  read -r -n 1 -p "Continue anyway? (y/N) " reply || reply=""
  echo
  if [[ ! "$reply" =~ ^[Yy]$ ]]; then
    exit 1
  fi
else
  echo "✅ Backend is healthy"
fi
echo ""

echo "🚀 Starting GEPA training..."
echo "Config: ${CONFIG_PATH}"
echo ""

# Run the training
uvx synth-ai train \
  --type prompt_learning \
  --config "$CONFIG_PATH" \
  --backend "$BACKEND_URL" \
  --poll

echo ""
echo "✅ GEPA training complete!"
|
|
87
|
+
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Metadata for GEPA blog task app coverage.
|
|
2
|
+
|
|
3
|
+
This module centralises the set of task apps that the GEPA blog post
|
|
4
|
+
references so that configuration files and documentation can import the
|
|
5
|
+
same canonical definitions. Each entry mirrors a task app that is
|
|
6
|
+
available via Synth's prompt-learning backend, making it easier to keep
|
|
7
|
+
configs, docs, and evaluation notebooks in sync.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from typing import Iterable, Sequence
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True, slots=True)
|
|
17
|
+
class TaskAppSupport:
|
|
18
|
+
"""Describes a task app that the GEPA blog supports."""
|
|
19
|
+
|
|
20
|
+
app_id: str
|
|
21
|
+
display_name: str
|
|
22
|
+
dataset_id: str
|
|
23
|
+
description: str
|
|
24
|
+
default_port: int
|
|
25
|
+
tags: Sequence[str]
|
|
26
|
+
metrics: Sequence[str]
|
|
27
|
+
sources: Sequence[str]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Canonical registry of the task apps referenced by the GEPA blog post.
# One TaskAppSupport entry per app; every entry uses a distinct default_port
# (8102 for banking77, 8110-8113 for the remaining apps).
# NOTE(review): dataset_id values look like dataset-hub identifiers — confirm
# against the task app implementations before relying on them.
SUPPORTED_TASK_APPS: tuple[TaskAppSupport, ...] = (
    # Banking77: single-label intent classification, scored by accuracy.
    TaskAppSupport(
        app_id="banking77",
        display_name="Banking77 Intent Classification",
        dataset_id="PolyAI/banking77",
        description="Classify banking customer support queries into 77 intents.",
        default_port=8102,
        tags=("classification", "intent", "nlp"),
        metrics=("accuracy",),
        sources=(
            "GEPA blog quickstart",
            "PolyAI Banking77 dataset card",
        ),
    ),
    # HotpotQA: multi-hop question answering with supporting facts.
    TaskAppSupport(
        app_id="hotpotqa",
        display_name="HotpotQA Multi-Hop QA",
        dataset_id="hotpot_qa",
        description="Answer multi-hop questions with supporting facts sourced from Wikipedia passages.",
        default_port=8110,
        tags=("qa", "multi-hop", "reasoning"),
        metrics=("answer_em", "supporting_fact_f1"),
        sources=(
            "GEPA Table 1",
            "HotpotQA (Yang et al., 2018)",
        ),
    ),
    # IFBench: instruction following.
    TaskAppSupport(
        app_id="ifbench",
        display_name="IFBench Instruction Following",
        dataset_id="Muennighoff/IFBench",
        description="Follow natural language instructions focusing on faithful adherence.",
        default_port=8111,
        tags=("instruction-following", "nlp"),
        metrics=("compliance", "accuracy"),
        sources=(
            "GEPA Table 1",
            "IFBench benchmark release",
        ),
    ),
    # HoVer: claim verification against retrieved evidence.
    TaskAppSupport(
        app_id="hover",
        display_name="HoVer Claim Verification",
        dataset_id="hover",
        description="Determine whether Wikipedia claims are supported, refuted, or not enough info given retrieved evidence.",
        default_port=8112,
        tags=("fact-checking", "classification"),
        metrics=("label_accuracy", "evidence_f1"),
        sources=(
            "GEPA Table 1",
            "HoVer benchmark (Jiang et al., 2020)",
        ),
    ),
    # PUPA: privacy-aware delegation with structured output.
    TaskAppSupport(
        app_id="pupa",
        display_name="PUPA Privacy-Aware Delegation",
        dataset_id="microsoft/PUPA",
        description="Delegate actions while respecting privacy policies and extracting structured responses.",
        default_port=8113,
        tags=("delegation", "privacy", "structured-output"),
        metrics=("privacy_compliance", "task_success"),
        sources=(
            "GEPA Table 1",
            "PUPA benchmark release",
        ),
    ),
)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def list_supported_task_apps() -> Iterable[TaskAppSupport]:
    """Expose the module-level registry of supported task apps.

    Returns:
        The ``SUPPORTED_TASK_APPS`` tuple itself (the same object, not a copy).
    """
    registry = SUPPORTED_TASK_APPS
    return registry
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
__all__ = ["TaskAppSupport", "SUPPORTED_TASK_APPS", "list_supported_task_apps"]
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
#!/bin/bash
# Quick test script for GEPA Banking77 prompt learning.
#
# Checks prerequisites, then runs `uvx synth-ai train --type prompt_learning`
# with the Banking77 GEPA config and polls for up to 3600 seconds.
#
# Required environment:
#   SYNTH_API_KEY         - must be non-empty
#   ENVIRONMENT_API_KEY   - must be non-empty; sent as X-API-Key to the task app
# Optional environment:
#   BACKEND_BASE_URL      - backend to test against (default http://localhost:8000)
#   TASK_APP_URL          - task app to probe (default http://127.0.0.1:8102)

set -e

echo "🚀 Testing GEPA Prompt Learning for Banking77"
echo "=============================================="

# Check required environment variables (diagnostics go to stderr)
if [ -z "$SYNTH_API_KEY" ]; then
    echo "❌ ERROR: SYNTH_API_KEY not set" >&2
    exit 1
fi

if [ -z "$ENVIRONMENT_API_KEY" ]; then
    echo "❌ ERROR: ENVIRONMENT_API_KEY not set" >&2
    exit 1
fi

# Set backend URL (default to localhost:8000)
BACKEND_URL="${BACKEND_BASE_URL:-http://localhost:8000}"
echo "📍 Backend URL: $BACKEND_URL"

# Check backend is accessible; an unreachable backend is fatal
echo "🔍 Checking backend health..."
if curl -s -f "$BACKEND_URL/api/health" > /dev/null 2>&1; then
    echo "✅ Backend is accessible"
else
    echo "❌ ERROR: Backend not accessible at $BACKEND_URL" >&2
    # Report the URL actually probed rather than a fixed port, since
    # BACKEND_BASE_URL may point somewhere other than localhost:8000.
    echo " Make sure the backend is running at $BACKEND_URL" >&2
    exit 1
fi

# Check task app is accessible; an unreachable task app is only a warning
TASK_APP_URL="${TASK_APP_URL:-http://127.0.0.1:8102}"
echo "🔍 Checking task app health..."
if curl -s -f -H "X-API-Key: $ENVIRONMENT_API_KEY" "$TASK_APP_URL/health" > /dev/null 2>&1; then
    echo "✅ Task app is accessible"
else
    echo "⚠️ WARNING: Task app not accessible at $TASK_APP_URL" >&2
    echo " You may need to deploy it first:" >&2
    echo " uvx synth-ai deploy banking77 --runtime uvicorn --port 8102" >&2
fi

# Run GEPA training
echo ""
echo "🎯 Starting GEPA prompt optimization..."
echo ""

CONFIG_FILE="examples/blog_posts/gepa/configs/banking77_gepa_local.toml"

if [ ! -f "$CONFIG_FILE" ]; then
    # The config path is relative to the repository root. If it is not found
    # from the current directory, fall back to the repo root derived from this
    # script's location, using the same "$(dirname "$0")/../../.." convention
    # as the sibling GEPA scripts.
    # NOTE(review): assumes this script lives three levels below the repo
    # root (e.g. examples/blog_posts/gepa/) — confirm.
    repo_root="$(cd "$(dirname "$0")/../../.." 2> /dev/null && pwd)" || repo_root=""
    if [ -n "$repo_root" ] && [ -f "$repo_root/$CONFIG_FILE" ]; then
        cd "$repo_root"
    else
        echo "❌ ERROR: Config file not found: $CONFIG_FILE" >&2
        exit 1
    fi
fi

# Under `set -e`, a failing train command aborts before the success message.
uvx synth-ai train \
    --type prompt_learning \
    --config "$CONFIG_FILE" \
    --backend "$BACKEND_URL" \
    --poll \
    --poll-timeout 3600

echo ""
echo "✅ GEPA training completed!"
|
|
67
|
+
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
#!/bin/bash
# Verify Banking77 setup is working.
#
# Runs six checks (Python import, CLI registration, helper scripts, config
# file, environment variables, running services) and prints a summary of the
# steps needed to run Banking77 GEPA. Only the Python import check is fatal;
# the remaining checks just report their status.
#
# Optional environment:
#   BACKEND_BASE_URL      - backend to probe (default http://localhost:8000)
#   TASK_APP_URL          - task app to probe (default http://127.0.0.1:8102)
#   ENVIRONMENT_API_KEY   - when set, sent as X-API-Key to the task app

set -e

# Service endpoints. The defaults are the addresses this script originally
# hard-coded; the variable names match those read by the GEPA test script.
BACKEND_URL="${BACKEND_BASE_URL:-http://localhost:8000}"
TASK_APP_URL="${TASK_APP_URL:-http://127.0.0.1:8102}"

echo "🔍 Verifying Banking77 Setup"
echo "============================="
echo ""

# All relative paths below are resolved from the repository root.
cd "$(dirname "$0")/../../.."

echo "1️⃣ Checking Python import..."
# With `set -e`, a non-zero exit from this Python check stops the script here.
python3 -c "
try:
    from examples.task_apps.banking77.banking77_task_app import build_config
    print(' ✅ Task app imports successfully')
    config = build_config()
    print(f' ✅ Config built: app_id={config.app_id}')
    print(f' ✅ Task name: {config.name}')
except ImportError as e:
    print(f' ❌ Import error: {e}')
    print(' 💡 Run: uv pip install -e .')
    exit(1)
except Exception as e:
    print(f' ❌ Error: {e}')
    exit(1)
"

echo ""
echo "2️⃣ Checking CLI registration..."
if uvx synth-ai task-app list 2>/dev/null | grep -q "banking77"; then
    echo " ✅ Banking77 registered with CLI"
else
    echo " ⚠️ Banking77 not found in task-app list"
    echo " 💡 This is OK if you haven't run 'uv pip install -e .' yet"
fi

echo ""
echo "3️⃣ Checking helper scripts..."
if [ -x "./examples/blog_posts/gepa/deploy_banking77_task_app.sh" ]; then
    echo " ✅ deploy_banking77_task_app.sh is executable"
else
    echo " ❌ deploy_banking77_task_app.sh is not executable"
    echo " 💡 Run: chmod +x ./examples/blog_posts/gepa/deploy_banking77_task_app.sh"
fi

if [ -x "./examples/blog_posts/gepa/run_gepa_banking77.sh" ]; then
    echo " ✅ run_gepa_banking77.sh is executable"
else
    echo " ❌ run_gepa_banking77.sh is not executable"
    echo " 💡 Run: chmod +x ./examples/blog_posts/gepa/run_gepa_banking77.sh"
fi

echo ""
echo "4️⃣ Checking configuration files..."
if [ -f "./examples/blog_posts/gepa/configs/banking77_gepa_local.toml" ]; then
    echo " ✅ banking77_gepa_local.toml exists"
else
    echo " ❌ banking77_gepa_local.toml not found"
fi

echo ""
echo "5️⃣ Checking environment variables..."
# Only the first 10 characters of each key are echoed.
if [ -n "$GROQ_API_KEY" ]; then
    echo " ✅ GROQ_API_KEY is set (${GROQ_API_KEY:0:10}...)"
else
    echo " ⚠️ GROQ_API_KEY not set"
    echo " 💡 Run: export GROQ_API_KEY='gsk_...'"
fi

if [ -n "$ENVIRONMENT_API_KEY" ]; then
    echo " ✅ ENVIRONMENT_API_KEY is set (${ENVIRONMENT_API_KEY:0:10}...)"
else
    echo " ⚠️ ENVIRONMENT_API_KEY not set"
    echo " 💡 Run: export ENVIRONMENT_API_KEY=\$(python3 -c 'import secrets; print(secrets.token_urlsafe(32))')"
fi

if [ -n "$SYNTH_API_KEY" ]; then
    echo " ✅ SYNTH_API_KEY is set (${SYNTH_API_KEY:0:10}...)"
else
    echo " ⚠️ SYNTH_API_KEY not set"
    echo " 💡 Get from backend admin or .env.dev file"
fi

echo ""
echo "6️⃣ Checking services..."
if curl -s -f "$BACKEND_URL/api/health" > /dev/null 2>&1; then
    echo " ✅ Backend is running on $BACKEND_URL"
else
    echo " ⚠️ Backend not reachable at $BACKEND_URL"
    echo " 💡 Start the backend before running GEPA"
fi

# Send the same X-API-Key header the run/test scripts use for the task app's
# /health endpoint. Because of `curl -f`, an HTTP error status from an
# unauthenticated probe would otherwise be reported as "not running".
# The header is only added when a key is available.
task_app_auth=()
if [ -n "$ENVIRONMENT_API_KEY" ]; then
    task_app_auth=(-H "X-API-Key: $ENVIRONMENT_API_KEY")
fi

if curl -s -f "${task_app_auth[@]}" "$TASK_APP_URL/health" > /dev/null 2>&1; then
    echo " ✅ Task app is running on $TASK_APP_URL"
else
    echo " ⚠️ Task app not running on $TASK_APP_URL"
    echo " 💡 Run: ./examples/blog_posts/gepa/deploy_banking77_task_app.sh"
fi

echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Summary"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "To run Banking77 GEPA:"
echo ""
echo " 1. Install dependencies:"
echo " uv pip install -e ."
echo ""
echo " 2. Set environment variables:"
echo " export GROQ_API_KEY='gsk_...'"
echo " export SYNTH_API_KEY='your-backend-key'"
echo " export ENVIRONMENT_API_KEY=\$(python3 -c 'import secrets; print(secrets.token_urlsafe(32))')"
echo ""
echo " 3. Start task app (Terminal 1):"
echo " ./examples/blog_posts/gepa/deploy_banking77_task_app.sh"
echo ""
echo " 4. Run GEPA (Terminal 2):"
echo " ./examples/blog_posts/gepa/run_gepa_banking77.sh"
echo ""
echo "✅ Setup verification complete!"
|
|
123
|
+
|