synth-ai 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/baseline/banking77_baseline.py +204 -0
- examples/baseline/crafter_baseline.py +407 -0
- examples/baseline/pokemon_red_baseline.py +326 -0
- examples/baseline/simple_baseline.py +56 -0
- examples/baseline/warming_up_to_rl_baseline.py +239 -0
- examples/blog_posts/gepa/README.md +355 -0
- examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
- examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
- examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
- examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
- examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
- examples/blog_posts/gepa/gepa_baseline.py +204 -0
- examples/blog_posts/gepa/query_prompts_example.py +97 -0
- examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
- examples/blog_posts/gepa/task_apps.py +105 -0
- examples/blog_posts/gepa/test_gepa_local.sh +67 -0
- examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
- examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
- examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +12 -10
- examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +1 -0
- examples/blog_posts/pokemon_vl/extract_images.py +239 -0
- examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
- examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
- examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
- examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
- examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
- examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
- examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
- examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +1 -1
- examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
- examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +60 -10
- examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +1 -1
- examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +1 -0
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +1 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +1 -0
- examples/rl/configs/rl_from_base_qwen17.toml +1 -0
- examples/swe/task_app/hosted/inference/openai_client.py +0 -34
- examples/swe/task_app/hosted/policy_routes.py +17 -0
- examples/swe/task_app/hosted/rollout.py +4 -2
- examples/task_apps/banking77/__init__.py +6 -0
- examples/task_apps/banking77/banking77_task_app.py +841 -0
- examples/task_apps/banking77/deploy_wrapper.py +46 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
- examples/task_apps/crafter/task_app/grpo_crafter.py +24 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +355 -58
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +68 -7
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +78 -21
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
- examples/task_apps/gepa_benchmarks/__init__.py +7 -0
- examples/task_apps/gepa_benchmarks/common.py +260 -0
- examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
- examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
- examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
- examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
- examples/task_apps/pokemon_red/task_app.py +254 -36
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +1 -0
- examples/warming_up_to_rl/task_app/grpo_crafter.py +53 -4
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +152 -41
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +31 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +1 -0
- synth_ai/api/train/builders.py +90 -1
- synth_ai/api/train/cli.py +396 -21
- synth_ai/api/train/config_finder.py +13 -2
- synth_ai/api/train/configs/__init__.py +15 -1
- synth_ai/api/train/configs/prompt_learning.py +442 -0
- synth_ai/api/train/configs/rl.py +29 -0
- synth_ai/api/train/task_app.py +1 -1
- synth_ai/api/train/validators.py +277 -0
- synth_ai/baseline/__init__.py +25 -0
- synth_ai/baseline/config.py +209 -0
- synth_ai/baseline/discovery.py +214 -0
- synth_ai/baseline/execution.py +146 -0
- synth_ai/cli/__init__.py +85 -17
- synth_ai/cli/__main__.py +0 -0
- synth_ai/cli/claude.py +70 -0
- synth_ai/cli/codex.py +84 -0
- synth_ai/cli/commands/__init__.py +1 -0
- synth_ai/cli/commands/baseline/__init__.py +12 -0
- synth_ai/cli/commands/baseline/core.py +637 -0
- synth_ai/cli/commands/baseline/list.py +93 -0
- synth_ai/cli/commands/eval/core.py +13 -10
- synth_ai/cli/commands/filter/core.py +53 -17
- synth_ai/cli/commands/help/core.py +0 -1
- synth_ai/cli/commands/smoke/__init__.py +7 -0
- synth_ai/cli/commands/smoke/core.py +1436 -0
- synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
- synth_ai/cli/commands/status/subcommands/usage.py +203 -0
- synth_ai/cli/commands/train/judge_schemas.py +1 -0
- synth_ai/cli/commands/train/judge_validation.py +1 -0
- synth_ai/cli/commands/train/validation.py +0 -57
- synth_ai/cli/demo.py +35 -3
- synth_ai/cli/deploy/__init__.py +40 -25
- synth_ai/cli/deploy.py +162 -0
- synth_ai/cli/legacy_root_backup.py +14 -8
- synth_ai/cli/opencode.py +107 -0
- synth_ai/cli/root.py +9 -5
- synth_ai/cli/task_app_deploy.py +1 -1
- synth_ai/cli/task_apps.py +53 -53
- synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
- synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
- synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
- synth_ai/judge_schemas.py +1 -0
- synth_ai/learning/__init__.py +10 -0
- synth_ai/learning/prompt_learning_client.py +276 -0
- synth_ai/learning/prompt_learning_types.py +184 -0
- synth_ai/pricing/__init__.py +2 -0
- synth_ai/pricing/model_pricing.py +57 -0
- synth_ai/streaming/handlers.py +53 -4
- synth_ai/streaming/streamer.py +19 -0
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +2 -0
- synth_ai/task/tracing_utils.py +25 -25
- synth_ai/task/validators.py +44 -8
- synth_ai/task_app_cfgs.py +21 -0
- synth_ai/tracing_v3/config.py +162 -19
- synth_ai/tracing_v3/constants.py +1 -1
- synth_ai/tracing_v3/db_config.py +24 -38
- synth_ai/tracing_v3/storage/config.py +47 -13
- synth_ai/tracing_v3/storage/factory.py +3 -3
- synth_ai/tracing_v3/turso/daemon.py +113 -11
- synth_ai/tracing_v3/turso/native_manager.py +92 -16
- synth_ai/types.py +8 -0
- synth_ai/urls.py +11 -0
- synth_ai/utils/__init__.py +30 -1
- synth_ai/utils/agents.py +74 -0
- synth_ai/utils/bin.py +39 -0
- synth_ai/utils/cli.py +149 -5
- synth_ai/utils/env.py +17 -17
- synth_ai/utils/json.py +72 -0
- synth_ai/utils/modal.py +283 -1
- synth_ai/utils/paths.py +48 -0
- synth_ai/utils/uvicorn.py +113 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/METADATA +102 -4
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/RECORD +162 -88
- synth_ai/cli/commands/deploy/__init__.py +0 -23
- synth_ai/cli/commands/deploy/core.py +0 -614
- synth_ai/cli/commands/deploy/errors.py +0 -72
- synth_ai/cli/commands/deploy/validation.py +0 -11
- synth_ai/cli/deploy/core.py +0 -5
- synth_ai/cli/deploy/errors.py +0 -23
- synth_ai/cli/deploy/validation.py +0 -5
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""AST-based discovery mechanism for baseline files."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import ast
|
|
6
|
+
import importlib.util
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import List, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
from synth_ai.baseline.config import BaselineConfig
|
|
12
|
+
|
|
13
|
+
# Glob patterns, evaluated relative to each search root, that select
# candidate baseline files.
BASELINE_FILE_PATTERNS = [
    "**/baseline/*.py",
    "**/baselines/*.py",
    "**/*_baseline.py",
]

# Path components that exclude a file from discovery (caches, virtual
# environments, VCS metadata and build output).
IGNORE_PATTERNS = {
    "__pycache__",
    ".git",
    ".venv",
    "venv",
    "node_modules",
    "build",
    "dist",
    ".mypy_cache",
    ".pytest_cache",
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class BaselineChoice:
    """One baseline definition that a caller can choose to run.

    Instances carry where the definition lives (file and line) and how it was
    found; the loaded configuration is optional and defaults to ``None``.
    """

    # Identifier taken from the ``baseline_id`` of the BaselineConfig.
    baseline_id: str
    # File that contains the definition.
    path: Path
    # Line number of the defining statement inside ``path``.
    lineno: int
    # Origin tag: "discovered" or "registered".
    source: str
    # Loaded configuration, when available.
    config: Optional[BaselineConfig] = None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class BaselineConfigVisitor(ast.NodeVisitor):
    """AST visitor that records assignments of ``BaselineConfig(...)`` calls.

    Both plain assignments (``x = BaselineConfig(...)``) and annotated ones
    (``x: BaselineConfig = BaselineConfig(...)``) are recognised, and the
    constructor may be referenced by bare name or through an attribute
    (``module.BaselineConfig(...)``).  Only calls that pass ``baseline_id`` as
    a keyword argument with a non-empty string literal are recorded.

    Attributes:
        matches: ``(baseline_id, lineno)`` pairs in visiting order.
    """

    def __init__(self):
        self.matches: List[Tuple[str, int]] = []  # (baseline_id, lineno)

    def visit_Assign(self, node: ast.Assign) -> None:
        """Record ``name = BaselineConfig(...)`` and keep descending."""
        self._record_if_baseline_config(node)
        self.generic_visit(node)

    def visit_AnnAssign(self, node: ast.AnnAssign) -> None:
        """Record ``name: T = BaselineConfig(...)`` and keep descending."""
        self._record_if_baseline_config(node)
        self.generic_visit(node)

    def _record_if_baseline_config(self, node) -> None:
        """Append a match when the assigned value is a BaselineConfig call."""
        call = node.value  # None for a bare annotation such as ``x: int``
        if not isinstance(call, ast.Call):
            return

        func = call.func
        if isinstance(func, ast.Name):
            callee = func.id
        elif isinstance(func, ast.Attribute):
            # e.g. ``config.BaselineConfig(...)`` after ``import ... as config``
            callee = func.attr
        else:
            return
        if callee != "BaselineConfig":
            return

        baseline_id = self._extract_baseline_id(call)
        if baseline_id:
            self.matches.append((baseline_id, node.lineno))

    def _extract_baseline_id(self, call_node: ast.Call) -> Optional[str]:
        """Return the literal ``baseline_id=`` keyword value, or None.

        Non-literal values (names, f-strings, calls) and non-string constants
        cannot be resolved statically and are therefore ignored.
        """
        for keyword in call_node.keywords:
            if (
                keyword.arg == "baseline_id"
                and isinstance(keyword.value, ast.Constant)
                and isinstance(keyword.value.value, str)
            ):
                return keyword.value.value
        return None
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def should_ignore_path(path: Path) -> bool:
    """Return True when any component of *path* is listed in IGNORE_PATTERNS."""
    for component in path.parts:
        if component in IGNORE_PATTERNS:
            return True
    return False
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def discover_baseline_files(search_roots: List[Path]) -> List[BaselineChoice]:
    """Discover baseline files via AST scanning.

    Every existing root is globbed with BASELINE_FILE_PATTERNS.  Matching
    files are parsed with ``ast`` (they are not executed) and each
    ``BaselineConfig(...)`` assignment reported by BaselineConfigVisitor
    becomes one BaselineChoice.  Files that cannot be read, decoded or parsed
    are skipped silently so one bad file does not abort discovery.

    Args:
        search_roots: List of root directories to search in

    Returns:
        List of BaselineChoice objects representing discovered baselines,
        de-duplicated on ``(baseline_id, resolved path)``.
    """
    results: List[BaselineChoice] = []
    seen = set()
    scanned = set()  # resolved files already parsed (patterns and roots can overlap)

    for root in search_roots:
        if not root.exists():
            continue

        for pattern in BASELINE_FILE_PATTERNS:
            # Sorted so the result order does not depend on filesystem order.
            for path in sorted(root.glob(pattern)):
                # Only components below the root decide whether to skip; a
                # root that itself lives under e.g. ".../build/..." must not
                # hide everything inside it.
                if should_ignore_path(path.relative_to(root)):
                    continue

                resolved = path.resolve()
                if resolved in scanned:
                    continue
                scanned.add(resolved)

                try:
                    source = path.read_text(encoding="utf-8")
                    tree = ast.parse(source, filename=str(path))
                except (OSError, SyntaxError, ValueError):
                    # ValueError covers UnicodeDecodeError for non-UTF-8
                    # files and sources that ast.parse rejects outright.
                    continue

                visitor = BaselineConfigVisitor()
                visitor.visit(tree)

                for baseline_id, lineno in visitor.matches:
                    key = (baseline_id, resolved)
                    if key in seen:
                        continue
                    seen.add(key)

                    results.append(
                        BaselineChoice(
                            baseline_id=baseline_id,
                            path=resolved,
                            lineno=lineno,
                            source="discovered",
                        )
                    )

    return results
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def load_baseline_config_from_file(
    baseline_id: str,
    path: Path,
) -> BaselineConfig:
    """Load a BaselineConfig from a Python file.

    The file is imported (executed) as a throw-away module named
    ``baseline_module``; its public module-level attributes are then searched
    for a BaselineConfig whose ``baseline_id`` matches.  The matching instance
    gets ``_source_path`` set to *path* before it is returned.

    Args:
        baseline_id: The baseline_id to look for
        path: Path to the Python file

    Returns:
        BaselineConfig instance

    Raises:
        ImportError: If executing the file fails because a module is missing
        ValueError: If the file cannot be loaded for any other reason, or no
            BaselineConfig with the requested baseline_id is defined in it
    """
    # Load the module
    spec = importlib.util.spec_from_file_location("baseline_module", path)
    if spec is None or spec.loader is None:
        raise ValueError(f"Cannot load baseline file: {path}")

    module = importlib.util.module_from_spec(spec)
    try:
        spec.loader.exec_module(module)
    except ModuleNotFoundError as e:
        # ``name`` is the structured form of the missing module; fall back to
        # the message when the exception was raised without it.
        missing_module = e.name or str(e)
        raise ImportError(
            f"❌ Missing dependency for baseline '{baseline_id}'\n"
            f" File: {path}\n"
            f" Missing module: {missing_module}\n"
            f" Fix: pip install {missing_module} (or 'uv add {missing_module}')"
        ) from e
    except SyntaxError as e:
        raise ValueError(
            f"❌ Syntax error in baseline file '{baseline_id}'\n"
            f" File: {path}\n"
            f" Error at line {e.lineno}: {e.msg}\n"
            f" Text: {e.text.strip() if e.text else 'N/A'}\n"
            f" Fix: Check the Python syntax in the baseline file"
        ) from e
    except Exception as e:
        error_type = type(e).__name__
        raise ValueError(
            f"❌ Failed to load baseline '{baseline_id}'\n"
            f" File: {path}\n"
            f" Error type: {error_type}\n"
            f" Message: {str(e)}\n"
            f" This may be due to:\n"
            f" - Missing dependencies (check imports)\n"
            f" - Configuration errors in the baseline file\n"
            f" - Environment variables not set\n"
            f" Tip: Run with --verbose for more details"
        ) from e

    # Single pass: return the match, otherwise remember which ids do exist so
    # the error message can list them.
    found_configs = []
    for attr_name in dir(module):
        if attr_name.startswith("_"):
            continue

        attr = getattr(module, attr_name)
        if not isinstance(attr, BaselineConfig):
            continue
        if attr.baseline_id == baseline_id:
            # Set source path for reference
            attr._source_path = path
            return attr
        found_configs.append(attr.baseline_id)

    # Provide helpful error message
    if found_configs:
        found_list = ", ".join(str(found) for found in found_configs)
        raise ValueError(
            f"❌ Baseline '{baseline_id}' not found in {path}\n"
            f" Found baselines in this file: {found_list}\n"
            f" Fix: Use one of the above baseline IDs or check the baseline_id parameter"
        )
    raise ValueError(
        f"❌ No BaselineConfig instances found in {path}\n"
        f" Expected to find a BaselineConfig with baseline_id='{baseline_id}'\n"
        f" Fix: Ensure the file defines a BaselineConfig instance with baseline_id='{baseline_id}'"
    )
|
|
214
|
+
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Execution engine for baseline evaluations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
from synth_ai.baseline.config import (
|
|
9
|
+
BaselineConfig,
|
|
10
|
+
BaselineTaskRunner,
|
|
11
|
+
TaskResult,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def default_aggregator(results: List[TaskResult]) -> Dict[str, Any]:
    """Summarise task results into basic aggregate metrics.

    Reward statistics (mean, population standard deviation, min, max) are
    computed over the results flagged ``success`` only; the success rate and
    task counts are computed over all results.

    Args:
        results: List of TaskResult objects from all seeds

    Returns:
        Dict with aggregate metrics; every metric is zero when no result
        succeeded (including when ``results`` is empty)
    """
    total = len(results)
    succeeded = [item for item in results if item.success]
    rewards = [item.outcome_reward for item in succeeded]

    if not rewards:
        return {
            "mean_outcome_reward": 0.0,
            "std_outcome_reward": 0.0,
            "min_outcome_reward": 0.0,
            "max_outcome_reward": 0.0,
            "success_rate": 0.0,
            "total_tasks": total,
            "successful_tasks": 0,
            "failed_tasks": total,
        }

    count = len(rewards)
    mean_reward = sum(rewards) / count
    # Population variance (divides by N, not N - 1).
    variance = sum((value - mean_reward) ** 2 for value in rewards) / count

    return {
        "mean_outcome_reward": mean_reward,
        "std_outcome_reward": variance ** 0.5,
        "min_outcome_reward": min(rewards),
        "max_outcome_reward": max(rewards),
        "success_rate": len(succeeded) / total,
        "total_tasks": total,
        "successful_tasks": len(succeeded),
        "failed_tasks": total - len(succeeded),
    }
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _is_class_based_runner(task_runner: Any) -> bool:
|
|
60
|
+
"""Check if task_runner is a class (not a function)."""
|
|
61
|
+
return (
|
|
62
|
+
isinstance(task_runner, type)
|
|
63
|
+
and issubclass(task_runner, BaselineTaskRunner)
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
async def run_baseline_evaluation(
    config: BaselineConfig,
    seeds: List[int],
    policy_config: Dict[str, Any],
    env_config: Dict[str, Any],
    concurrency: int = 4,
) -> List[TaskResult]:
    """Run baseline evaluation for given seeds.

    A class-based runner (a BaselineTaskRunner subclass) is instantiated once
    with ``(policy_config, env_config)`` and its ``run_task(seed)`` is awaited
    per seed.  Any other ``config.task_runner`` is awaited directly as
    ``task_runner(seed, policy_config, env_config)``.  An exception raised by
    a task is converted into a failed TaskResult instead of propagating.

    Args:
        config: BaselineConfig instance
        seeds: List of seeds to evaluate
        policy_config: Policy configuration (merged from defaults + overrides)
        env_config: Environment configuration (merged from defaults + overrides)
        concurrency: Maximum concurrent task executions; values below 1 are
            treated as 1

    Returns:
        List of TaskResult objects, one per seed, in the order of ``seeds``
    """
    # Determine if we're using class-based or function-based runner
    is_class_based = _is_class_based_runner(config.task_runner)

    # Instantiate runner if class-based
    runner_instance: Optional[BaselineTaskRunner] = None
    if is_class_based:
        runner_instance = config.task_runner(policy_config, env_config)

    # Semaphore(0) would block every task forever and a negative value raises
    # ValueError, so always allow at least one task at a time.
    semaphore = asyncio.Semaphore(max(1, concurrency))

    async def run_task(seed: int) -> TaskResult:
        """Execute a single task with error handling."""
        async with semaphore:
            try:
                # Compare with None rather than truthiness: a runner object
                # that happens to be falsy must still be used.
                if runner_instance is not None:
                    # Class-based: call run_task method
                    return await runner_instance.run_task(seed)
                # Function-based: call function directly
                return await config.task_runner(seed, policy_config, env_config)
            except Exception as exc:
                # Return error result; fall back to the exception type when
                # the exception carries no message.
                return TaskResult(
                    seed=seed,
                    success=False,
                    outcome_reward=0.0,
                    error=str(exc) or type(exc).__name__,
                )

    # Execute all tasks concurrently; gather preserves the order of ``seeds``.
    results = await asyncio.gather(*(run_task(seed) for seed in seeds))
    return list(results)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def aggregate_results(
    config: BaselineConfig,
    results: List[TaskResult],
) -> Dict[str, Any]:
    """Aggregate task results with the configured aggregator.

    ``config.result_aggregator`` may be ``None`` (the default aggregator is
    used), a class (instantiated without arguments, then ``aggregate(results)``
    is called) or any other callable (called with ``results``).

    Args:
        config: BaselineConfig instance
        results: List of TaskResult objects

    Returns:
        Dict with aggregate metrics
    """
    aggregator = config.result_aggregator

    if aggregator is None:
        return default_aggregator(results)

    if isinstance(aggregator, type):
        # Class-based aggregator: build an instance and delegate to it.
        return aggregator().aggregate(results)

    # Function-based aggregator.
    return aggregator(results)
|
|
146
|
+
|
synth_ai/cli/__init__.py
CHANGED
|
@@ -53,12 +53,72 @@ if not _cli_module:
|
|
|
53
53
|
cli = _cli_module.cli # type: ignore[attr-defined]
|
|
54
54
|
|
|
55
55
|
# Register core commands implemented as standalone modules
|
|
56
|
+
try:
|
|
57
|
+
from synth_ai.cli.demo import demo_cmd
|
|
58
|
+
cli.add_command(demo_cmd, name="demo")
|
|
59
|
+
except Exception as e:
|
|
60
|
+
import sys
|
|
61
|
+
print(f"[DEBUG] Failed to register demo command: {e}", file=sys.stderr)
|
|
62
|
+
import traceback
|
|
63
|
+
traceback.print_exc()
|
|
56
64
|
try:
|
|
57
65
|
from synth_ai.cli.setup import setup_cmd
|
|
58
|
-
|
|
59
66
|
cli.add_command(setup_cmd, name="setup")
|
|
60
|
-
except Exception:
|
|
61
|
-
|
|
67
|
+
except Exception as e:
|
|
68
|
+
import sys
|
|
69
|
+
print(f"[DEBUG] Failed to register setup command: {e}", file=sys.stderr)
|
|
70
|
+
import traceback
|
|
71
|
+
traceback.print_exc()
|
|
72
|
+
try:
|
|
73
|
+
from synth_ai.cli.deploy import deploy_cmd # type: ignore[attr-defined]
|
|
74
|
+
cli.add_command(deploy_cmd, name="deploy")
|
|
75
|
+
except Exception as e:
|
|
76
|
+
import sys
|
|
77
|
+
print(f"[DEBUG] Failed to register deploy command: {e}", file=sys.stderr)
|
|
78
|
+
import traceback
|
|
79
|
+
traceback.print_exc()
|
|
80
|
+
try:
|
|
81
|
+
from synth_ai.cli.opencode import opencode_cmd
|
|
82
|
+
cli.add_command(opencode_cmd, name="opencode")
|
|
83
|
+
except Exception as e:
|
|
84
|
+
import sys
|
|
85
|
+
print(f"[DEBUG] Failed to register opencode command: {e}", file=sys.stderr)
|
|
86
|
+
import traceback
|
|
87
|
+
traceback.print_exc()
|
|
88
|
+
try:
|
|
89
|
+
from synth_ai.cli.codex import codex_cmd
|
|
90
|
+
cli.add_command(codex_cmd, name="codex")
|
|
91
|
+
except Exception as e:
|
|
92
|
+
import sys
|
|
93
|
+
print(f"[DEBUG] Failed to register codex command: {e}", file=sys.stderr)
|
|
94
|
+
import traceback
|
|
95
|
+
traceback.print_exc()
|
|
96
|
+
try:
|
|
97
|
+
from synth_ai.cli.eval import command as eval_cmd
|
|
98
|
+
cli.add_command(eval_cmd, name="eval")
|
|
99
|
+
except Exception as e:
|
|
100
|
+
import sys
|
|
101
|
+
print(f"[DEBUG] Failed to register eval command: {e}", file=sys.stderr)
|
|
102
|
+
import traceback
|
|
103
|
+
traceback.print_exc()
|
|
104
|
+
try:
|
|
105
|
+
from synth_ai.cli.claude import claude_cmd
|
|
106
|
+
cli.add_command(claude_cmd, name="claude")
|
|
107
|
+
except Exception as e:
|
|
108
|
+
import sys
|
|
109
|
+
print(f"[DEBUG] Failed to register claude command: {e}", file=sys.stderr)
|
|
110
|
+
import traceback
|
|
111
|
+
traceback.print_exc()
|
|
112
|
+
try:
|
|
113
|
+
from synth_ai.cli.commands.baseline import command as baseline_cmd
|
|
114
|
+
from synth_ai.cli.commands.baseline.list import list_command as baseline_list_cmd
|
|
115
|
+
cli.add_command(baseline_cmd, name="baseline")
|
|
116
|
+
baseline_cmd.add_command(baseline_list_cmd, name="list")
|
|
117
|
+
except Exception as e:
|
|
118
|
+
import sys
|
|
119
|
+
print(f"[DEBUG] Failed to register baseline command: {e}", file=sys.stderr)
|
|
120
|
+
import traceback
|
|
121
|
+
traceback.print_exc()
|
|
62
122
|
|
|
63
123
|
|
|
64
124
|
# Register optional subcommands packaged under synth_ai.cli.*
|
|
@@ -72,6 +132,14 @@ for _module_path in ("synth_ai.cli.commands.demo", "synth_ai.cli.commands.status
|
|
|
72
132
|
if fn:
|
|
73
133
|
fn(cli)
|
|
74
134
|
|
|
135
|
+
# Smoke command registration (CLI-only helper)
|
|
136
|
+
try:
|
|
137
|
+
from synth_ai.cli.commands.smoke import register as register_smoke
|
|
138
|
+
|
|
139
|
+
register_smoke(cli)
|
|
140
|
+
except Exception:
|
|
141
|
+
pass
|
|
142
|
+
|
|
75
143
|
# Register help command
|
|
76
144
|
_maybe_call("synth_ai.cli.commands.help.core", "register", cli)
|
|
77
145
|
|
|
@@ -80,19 +148,19 @@ _maybe_call("synth_ai.api.train", "register", cli)
|
|
|
80
148
|
|
|
81
149
|
# Task app group/commands are optional and have richer API surface
|
|
82
150
|
_task_apps_module = _maybe_import("synth_ai.cli.task_apps")
|
|
83
|
-
if _task_apps_module:
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
151
|
+
#if _task_apps_module:
|
|
152
|
+
task_app_group = getattr(_task_apps_module, "task_app_group", None)
|
|
153
|
+
if task_app_group is not None:
|
|
154
|
+
cli.add_command(task_app_group, name="task-app")
|
|
155
|
+
# Expose common aliases when present
|
|
156
|
+
commands = getattr(task_app_group, "commands", None)
|
|
157
|
+
if isinstance(commands, dict):
|
|
158
|
+
for alias, name in (("serve", "serve"), ("deploy", "deploy"), ("modal-serve", "modal-serve")):
|
|
159
|
+
command = commands.get(name)
|
|
160
|
+
if command is not None:
|
|
161
|
+
cli.add_command(command, name=alias)
|
|
162
|
+
register_task_apps = _callable_from(_task_apps_module, "register")
|
|
163
|
+
if register_task_apps:
|
|
164
|
+
register_task_apps(cli)
|
|
97
165
|
|
|
98
166
|
# Top-level 'info' alias removed; use `synth-ai task-app info` instead
|
synth_ai/cli/__main__.py
ADDED
|
File without changes
|
synth_ai/cli/claude.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import subprocess
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
from synth_ai.types import MODEL_NAMES, ModelName
|
|
6
|
+
from synth_ai.urls import BACKEND_URL_SYNTH_RESEARCH_ANTHROPIC
|
|
7
|
+
from synth_ai.utils import find_bin_path, install_bin, resolve_env_var, verify_bin, write_agents_md
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@click.command("claude")
@click.option(
    "--model",
    "model_name",
    type=str,
    default=None
)
@click.option(
    "--force",
    is_flag=True,
    help="Prompt for API keys even if cached values exist."
)
@click.option(
    "--url",
    "override_url",
    type=str,
    default=None,
)
def claude_cmd(
    model_name: ModelName | None = None,
    force: bool = False,
    override_url: str | None = None
) -> None:
    """Launch Claude Code, optionally routed to a Synth-hosted model.

    The ``claude`` binary is located (an install is offered when it is
    missing), verified, and then run with a copy of the current environment.
    When ``--model`` is given, ``ANTHROPIC_BASE_URL`` is pointed at
    ``<base>/<model>`` (the ``--url`` value or the Synth backend) and the
    resolved ``SYNTH_API_KEY`` is exported as ``ANTHROPIC_AUTH_TOKEN``.
    ``--url`` has no effect without ``--model``.

    Raises:
        ValueError: If ``--model`` is not one of MODEL_NAMES.
    """
    # Look the binary up again after every successful install attempt.
    while True:
        bin_path = find_bin_path("claude")
        if bin_path:
            break
        if not install_bin(
            "Claude Code",
            ["curl -fsSL https://claude.ai/install.sh | bash"]
        ):
            print("Failed to find your installed Claude Code")
            print("Please install from: https://claude.com/claude-code")
            return
    print(f"Using Claude at {bin_path}")

    if not verify_bin(bin_path):
        print("Failed to verify Claude Code is runnable")
        return

    write_agents_md()
    env = os.environ.copy()

    if model_name is not None:
        if model_name not in MODEL_NAMES:
            raise ValueError(f"model_name={model_name} is invalid. Valid values for model_name: {MODEL_NAMES}")
        if override_url:
            url = f"{override_url.rstrip('/')}/{model_name}"
            print(f"Using override URL with model: {url}")
        else:
            url = f"{BACKEND_URL_SYNTH_RESEARCH_ANTHROPIC}/{model_name}"
        env["ANTHROPIC_BASE_URL"] = url
        api_key = resolve_env_var("SYNTH_API_KEY", override_process_env=force)
        env["ANTHROPIC_AUTH_TOKEN"] = api_key
        env["SYNTH_API_KEY"] = api_key

    try:
        # Run the binary that was just located and verified instead of
        # relying on a second PATH lookup of the bare name.
        subprocess.run([str(bin_path)], check=True, env=env)
    except subprocess.CalledProcessError:
        print("Failed to launch Claude Code")
    except OSError as exc:
        # e.g. the binary disappeared or is not executable
        print(f"Failed to launch Claude Code: {exc}")
|
synth_ai/cli/codex.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import subprocess
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
from synth_ai.types import MODEL_NAMES, ModelName
|
|
6
|
+
from synth_ai.urls import BACKEND_URL_SYNTH_RESEARCH_OPENAI
|
|
7
|
+
from synth_ai.utils import find_bin_path, install_bin, resolve_env_var, verify_bin, write_agents_md
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@click.command("codex")
@click.option(
    "--model",
    "model_name",
    type=str,
    default=None
)
@click.option(
    "--force",
    is_flag=True,
    help="Prompt for API keys even if cached values exist."
)
@click.option(
    "--url",
    "override_url",
    type=str,
    default=None,
)
def codex_cmd(
    model_name: ModelName | None = None,
    force: bool = False,
    override_url: str | None = None
) -> None:
    """Launch Codex, optionally configured to use a Synth-hosted model.

    The ``codex`` binary is located (an install is offered when it is
    missing), verified, and then run with a copy of the current environment.
    When ``--model`` is given, a ``synth`` model provider pointing at the
    ``--url`` value or the Synth backend is passed through ``-c`` overrides
    and the resolved ``SYNTH_API_KEY`` is exported as ``OPENAI_API_KEY``.
    ``--url`` has no effect without ``--model``.

    Raises:
        ValueError: If ``--model`` is not one of MODEL_NAMES.
    """
    # Look the binary up again after every successful install attempt.
    while True:
        bin_path = find_bin_path("codex")
        if bin_path:
            break
        if not install_bin(
            "Codex",
            [
                "brew install codex",
                "npm install -g @openai/codex"
            ]
        ):
            print("Failed to find your installed Codex")
            print("Please install from: https://developers.openai.com/codex/cli/")
            return
    print(f"Using Codex at {bin_path}")

    if not verify_bin(bin_path):
        print("Failed to verify Codex is runnable")
        return

    write_agents_md()
    env = os.environ.copy()
    override_args = []

    if model_name is not None:
        if model_name not in MODEL_NAMES:
            raise ValueError(f"model_name={model_name} is invalid. Valid values for model_name: {MODEL_NAMES}")
        if override_url:
            url = override_url
            print("Using override URL:", url)
        else:
            url = BACKEND_URL_SYNTH_RESEARCH_OPENAI
        provider_config = f'{{name="Synth",base_url="{url}",env_key="OPENAI_API_KEY"}}'
        config_overrides = [
            f"model_providers.synth={provider_config}",
            'model_provider="synth"',
            f'default_model="{model_name}"'
        ]
        # Each override is passed as its own "-c <key=value>" pair.
        override_args = [arg for override in config_overrides for arg in ("-c", override)]
        env["OPENAI_API_KEY"] = resolve_env_var("SYNTH_API_KEY", override_process_env=force)
        env["SYNTH_API_KEY"] = env["OPENAI_API_KEY"]

    # Run the binary that was just located and verified instead of relying on
    # a second PATH lookup of the bare name.
    cmd = [str(bin_path)]
    if model_name is not None:
        cmd.extend(["-m", model_name])
    cmd.extend(override_args)
    print(" ".join(cmd))

    try:
        subprocess.run(cmd, check=True, env=env)
    except subprocess.CalledProcessError:
        print("Failed to run Codex")
    except OSError as exc:
        # e.g. the binary disappeared or is not executable
        print(f"Failed to run Codex: {exc}")
|