synth-ai 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (169) hide show
  1. examples/baseline/banking77_baseline.py +204 -0
  2. examples/baseline/crafter_baseline.py +407 -0
  3. examples/baseline/pokemon_red_baseline.py +326 -0
  4. examples/baseline/simple_baseline.py +56 -0
  5. examples/baseline/warming_up_to_rl_baseline.py +239 -0
  6. examples/blog_posts/gepa/README.md +355 -0
  7. examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
  8. examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
  9. examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
  10. examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
  11. examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
  12. examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
  13. examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
  14. examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
  15. examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
  16. examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
  17. examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
  18. examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
  19. examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
  20. examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
  21. examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
  22. examples/blog_posts/gepa/gepa_baseline.py +204 -0
  23. examples/blog_posts/gepa/query_prompts_example.py +97 -0
  24. examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
  25. examples/blog_posts/gepa/task_apps.py +105 -0
  26. examples/blog_posts/gepa/test_gepa_local.sh +67 -0
  27. examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
  28. examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
  29. examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +12 -10
  30. examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +1 -0
  31. examples/blog_posts/pokemon_vl/extract_images.py +239 -0
  32. examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
  33. examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
  34. examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
  35. examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
  36. examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
  37. examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
  38. examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
  39. examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
  40. examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
  41. examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
  42. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
  43. examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +1 -1
  44. examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
  45. examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +60 -10
  46. examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +1 -1
  47. examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
  48. examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
  49. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
  50. examples/multi_step/configs/crafter_rl_outcome.toml +1 -0
  51. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +1 -0
  52. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +1 -0
  53. examples/rl/configs/rl_from_base_qwen17.toml +1 -0
  54. examples/swe/task_app/hosted/inference/openai_client.py +0 -34
  55. examples/swe/task_app/hosted/policy_routes.py +17 -0
  56. examples/swe/task_app/hosted/rollout.py +4 -2
  57. examples/task_apps/banking77/__init__.py +6 -0
  58. examples/task_apps/banking77/banking77_task_app.py +841 -0
  59. examples/task_apps/banking77/deploy_wrapper.py +46 -0
  60. examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
  61. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
  62. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
  63. examples/task_apps/crafter/task_app/grpo_crafter.py +24 -2
  64. examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
  65. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +355 -58
  66. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +68 -7
  67. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +78 -21
  68. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
  69. examples/task_apps/gepa_benchmarks/__init__.py +7 -0
  70. examples/task_apps/gepa_benchmarks/common.py +260 -0
  71. examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
  72. examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
  73. examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
  74. examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
  75. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
  76. examples/task_apps/pokemon_red/task_app.py +254 -36
  77. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +1 -0
  78. examples/warming_up_to_rl/task_app/grpo_crafter.py +53 -4
  79. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
  80. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +152 -41
  81. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +31 -1
  82. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
  83. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
  84. examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +1 -0
  85. synth_ai/api/train/builders.py +90 -1
  86. synth_ai/api/train/cli.py +396 -21
  87. synth_ai/api/train/config_finder.py +13 -2
  88. synth_ai/api/train/configs/__init__.py +15 -1
  89. synth_ai/api/train/configs/prompt_learning.py +442 -0
  90. synth_ai/api/train/configs/rl.py +29 -0
  91. synth_ai/api/train/task_app.py +1 -1
  92. synth_ai/api/train/validators.py +277 -0
  93. synth_ai/baseline/__init__.py +25 -0
  94. synth_ai/baseline/config.py +209 -0
  95. synth_ai/baseline/discovery.py +214 -0
  96. synth_ai/baseline/execution.py +146 -0
  97. synth_ai/cli/__init__.py +85 -17
  98. synth_ai/cli/__main__.py +0 -0
  99. synth_ai/cli/claude.py +70 -0
  100. synth_ai/cli/codex.py +84 -0
  101. synth_ai/cli/commands/__init__.py +1 -0
  102. synth_ai/cli/commands/baseline/__init__.py +12 -0
  103. synth_ai/cli/commands/baseline/core.py +637 -0
  104. synth_ai/cli/commands/baseline/list.py +93 -0
  105. synth_ai/cli/commands/eval/core.py +13 -10
  106. synth_ai/cli/commands/filter/core.py +53 -17
  107. synth_ai/cli/commands/help/core.py +0 -1
  108. synth_ai/cli/commands/smoke/__init__.py +7 -0
  109. synth_ai/cli/commands/smoke/core.py +1436 -0
  110. synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
  111. synth_ai/cli/commands/status/subcommands/usage.py +203 -0
  112. synth_ai/cli/commands/train/judge_schemas.py +1 -0
  113. synth_ai/cli/commands/train/judge_validation.py +1 -0
  114. synth_ai/cli/commands/train/validation.py +0 -57
  115. synth_ai/cli/demo.py +35 -3
  116. synth_ai/cli/deploy/__init__.py +40 -25
  117. synth_ai/cli/deploy.py +162 -0
  118. synth_ai/cli/legacy_root_backup.py +14 -8
  119. synth_ai/cli/opencode.py +107 -0
  120. synth_ai/cli/root.py +9 -5
  121. synth_ai/cli/task_app_deploy.py +1 -1
  122. synth_ai/cli/task_apps.py +53 -53
  123. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
  124. synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
  125. synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
  126. synth_ai/judge_schemas.py +1 -0
  127. synth_ai/learning/__init__.py +10 -0
  128. synth_ai/learning/prompt_learning_client.py +276 -0
  129. synth_ai/learning/prompt_learning_types.py +184 -0
  130. synth_ai/pricing/__init__.py +2 -0
  131. synth_ai/pricing/model_pricing.py +57 -0
  132. synth_ai/streaming/handlers.py +53 -4
  133. synth_ai/streaming/streamer.py +19 -0
  134. synth_ai/task/apps/__init__.py +1 -0
  135. synth_ai/task/config.py +2 -0
  136. synth_ai/task/tracing_utils.py +25 -25
  137. synth_ai/task/validators.py +44 -8
  138. synth_ai/task_app_cfgs.py +21 -0
  139. synth_ai/tracing_v3/config.py +162 -19
  140. synth_ai/tracing_v3/constants.py +1 -1
  141. synth_ai/tracing_v3/db_config.py +24 -38
  142. synth_ai/tracing_v3/storage/config.py +47 -13
  143. synth_ai/tracing_v3/storage/factory.py +3 -3
  144. synth_ai/tracing_v3/turso/daemon.py +113 -11
  145. synth_ai/tracing_v3/turso/native_manager.py +92 -16
  146. synth_ai/types.py +8 -0
  147. synth_ai/urls.py +11 -0
  148. synth_ai/utils/__init__.py +30 -1
  149. synth_ai/utils/agents.py +74 -0
  150. synth_ai/utils/bin.py +39 -0
  151. synth_ai/utils/cli.py +149 -5
  152. synth_ai/utils/env.py +17 -17
  153. synth_ai/utils/json.py +72 -0
  154. synth_ai/utils/modal.py +283 -1
  155. synth_ai/utils/paths.py +48 -0
  156. synth_ai/utils/uvicorn.py +113 -0
  157. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/METADATA +102 -4
  158. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/RECORD +162 -88
  159. synth_ai/cli/commands/deploy/__init__.py +0 -23
  160. synth_ai/cli/commands/deploy/core.py +0 -614
  161. synth_ai/cli/commands/deploy/errors.py +0 -72
  162. synth_ai/cli/commands/deploy/validation.py +0 -11
  163. synth_ai/cli/deploy/core.py +0 -5
  164. synth_ai/cli/deploy/errors.py +0 -23
  165. synth_ai/cli/deploy/validation.py +0 -5
  166. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
  167. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
  168. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
  169. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,214 @@
1
+ """AST-based discovery mechanism for baseline files."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import ast
6
+ import importlib.util
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import List, Optional, Tuple
10
+
11
+ from synth_ai.baseline.config import BaselineConfig
12
+
13
# Glob patterns used to locate candidate baseline files. Each pattern is
# passed to ``root.glob(...)`` for every search root, so ``**`` matches at
# any depth below that root.
BASELINE_FILE_PATTERNS = [
    "**/baseline/*.py",
    "**/baselines/*.py",
    "**/*_baseline.py",
]

# Directory names that exclude a path from discovery. A path is skipped when
# any single component of it equals one of these names exactly (no globbing).
IGNORE_PATTERNS = {
    "__pycache__",
    ".git",
    ".venv",
    "venv",
    "node_modules",
    "build",
    "dist",
    ".mypy_cache",
    ".pytest_cache",
}
32
+
33
+
34
@dataclass
class BaselineChoice:
    """One baseline located on disk: its ID plus where it is defined."""

    # Identifier taken from the ``baseline_id`` of the BaselineConfig.
    baseline_id: str
    # File that defines the baseline.
    path: Path
    # Line number of the defining statement inside ``path``.
    lineno: int
    # Origin tag: "discovered" or "registered".
    source: str
    # Loaded configuration, when one has been attached; None otherwise.
    config: Optional["BaselineConfig"] = None
43
+
44
+
45
class BaselineConfigVisitor(ast.NodeVisitor):
    """AST visitor that collects ``BaselineConfig(...)`` assignments.

    After ``visit(tree)``, ``matches`` holds one ``(baseline_id, lineno)``
    tuple for every assignment whose right-hand side is a call to
    ``BaselineConfig`` carrying a string-literal ``baseline_id=`` keyword.

    Both plain (``cfg = BaselineConfig(...)``) and annotated
    (``cfg: BaselineConfig = BaselineConfig(...)``) assignments are
    recognised, and the callee may be a bare name or an attribute access
    such as ``config.BaselineConfig``.
    """

    def __init__(self) -> None:
        self.matches: List[Tuple[str, int]] = []  # (baseline_id, lineno)

    def visit_Assign(self, node: ast.Assign) -> None:
        """Record ``name = BaselineConfig(...)`` and keep walking."""
        self._record_if_baseline_config(node.value, node.lineno)
        self.generic_visit(node)

    def visit_AnnAssign(self, node: ast.AnnAssign) -> None:
        """Record ``name: T = BaselineConfig(...)`` and keep walking."""
        # ``node.value`` is None for a bare annotation (``name: T``); the
        # helper ignores anything that is not a call.
        self._record_if_baseline_config(node.value, node.lineno)
        self.generic_visit(node)

    def _record_if_baseline_config(
        self, value: Optional[ast.AST], lineno: int
    ) -> None:
        """Append a match when ``value`` is a usable BaselineConfig call."""
        if not isinstance(value, ast.Call):
            return

        func = value.func
        if isinstance(func, ast.Name):
            callee = func.id
        elif isinstance(func, ast.Attribute):
            # Qualified use, e.g. ``baseline.BaselineConfig(...)``.
            callee = func.attr
        else:
            return
        if callee != "BaselineConfig":
            return

        baseline_id = self._extract_baseline_id(value)
        if baseline_id:
            self.matches.append((baseline_id, lineno))

    def _extract_baseline_id(self, call_node: ast.Call) -> Optional[str]:
        """Return the string literal passed as ``baseline_id=``, else None.

        Only keyword arguments are inspected. Non-literal values (names,
        f-strings, calls) and non-string constants cannot be resolved
        statically to an ID and yield None.
        """
        for keyword in call_node.keywords:
            if keyword.arg != "baseline_id":
                continue
            value = keyword.value
            if isinstance(value, ast.Constant) and isinstance(value.value, str):
                return value.value
        return None
73
+
74
+
75
def should_ignore_path(path: Path) -> bool:
    """Return True when any component of ``path`` is named in IGNORE_PATTERNS."""
    # The two collections share an element exactly when some path part is an
    # ignored directory name.
    return not IGNORE_PATTERNS.isdisjoint(path.parts)
78
+
79
+
80
def discover_baseline_files(search_roots: List[Path]) -> List[BaselineChoice]:
    """Discover baseline files via AST scanning.

    Every file under a search root that matches one of
    ``BASELINE_FILE_PATTERNS`` is parsed (never imported or executed) and
    scanned for ``BaselineConfig`` assignments.

    Args:
        search_roots: List of root directories to search in. Roots that do
            not exist are skipped.

    Returns:
        List of BaselineChoice objects representing discovered baselines,
        one per unique ``(baseline_id, resolved file path)`` pair.
    """
    results: List[BaselineChoice] = []
    seen = set()  # (baseline_id, resolved path) pairs already reported
    scanned = set()  # resolved files already parsed

    for root in search_roots:
        if not root.exists():
            continue

        for pattern in BASELINE_FILE_PATTERNS:
            for path in root.glob(pattern):
                # Judge only the part of the path below the search root, so a
                # checkout that itself lives under e.g. "build/" or "venv/"
                # is not ignored wholesale.
                if should_ignore_path(path.relative_to(root)):
                    continue

                # A file can match several patterns (or be reachable from
                # several roots); parse it only once.
                resolved = path.resolve()
                if resolved in scanned:
                    continue
                scanned.add(resolved)

                try:
                    source = path.read_text(encoding="utf-8")
                    tree = ast.parse(source, filename=str(path))
                except (OSError, SyntaxError, ValueError):
                    # Unreadable, non-UTF-8 (UnicodeDecodeError is a
                    # ValueError) or unparsable files are not baselines.
                    continue

                visitor = BaselineConfigVisitor()
                visitor.visit(tree)

                for baseline_id, lineno in visitor.matches:
                    key = (baseline_id, resolved)
                    if key in seen:
                        continue
                    seen.add(key)

                    results.append(
                        BaselineChoice(
                            baseline_id=baseline_id,
                            path=resolved,
                            lineno=lineno,
                            source="discovered",
                        )
                    )

    return results
126
+
127
+
128
def load_baseline_config_from_file(
    baseline_id: str,
    path: Path,
) -> "BaselineConfig":
    """Load a BaselineConfig from a Python file.

    The file is executed as a module, then its public (non-underscore)
    attributes are searched for a ``BaselineConfig`` instance whose
    ``baseline_id`` equals the requested one. The matching instance gets a
    ``_source_path`` attribute pointing at ``path`` before it is returned.

    Args:
        baseline_id: The baseline_id to look for
        path: Path to the Python file

    Returns:
        BaselineConfig instance

    Raises:
        ImportError: If executing the file fails because a module it imports
            is not installed.
        ValueError: If the file cannot be loaded for any other reason, or if
            it defines no BaselineConfig with the requested baseline_id.
    """
    # Load the module
    spec = importlib.util.spec_from_file_location("baseline_module", path)
    if spec is None or spec.loader is None:
        raise ValueError(f"Cannot load baseline file: {path}")

    module = importlib.util.module_from_spec(spec)
    try:
        spec.loader.exec_module(module)
    except ModuleNotFoundError as e:
        # Prefer the structured ``name`` attribute; parse the message only
        # when the exception was raised without one.
        missing_module = e.name or (
            str(e).split("'")[1] if "'" in str(e) else str(e)
        )
        raise ImportError(
            f"❌ Missing dependency for baseline '{baseline_id}'\n"
            f" File: {path}\n"
            f" Missing module: {missing_module}\n"
            f" Fix: pip install {missing_module} (or 'uv add {missing_module}')"
        ) from e
    except SyntaxError as e:
        raise ValueError(
            f"❌ Syntax error in baseline file '{baseline_id}'\n"
            f" File: {path}\n"
            f" Error at line {e.lineno}: {e.msg}\n"
            f" Text: {e.text.strip() if e.text else 'N/A'}\n"
            f" Fix: Check the Python syntax in the baseline file"
        ) from e
    except Exception as e:
        error_type = type(e).__name__
        raise ValueError(
            f"❌ Failed to load baseline '{baseline_id}'\n"
            f" File: {path}\n"
            f" Error type: {error_type}\n"
            f" Message: {str(e)}\n"
            f" This may be due to:\n"
            f" - Missing dependencies (check imports)\n"
            f" - Configuration errors in the baseline file\n"
            f" - Environment variables not set\n"
            f" Tip: Run with --verbose for more details"
        ) from e

    # Single pass over the module: return on the first match, and remember
    # the IDs of the other configs for the error message.
    found_configs = []
    for attr_name in dir(module):
        if attr_name.startswith("_"):
            continue

        attr = getattr(module, attr_name)
        if not isinstance(attr, BaselineConfig):
            continue
        if attr.baseline_id == baseline_id:
            # Set source path for reference
            attr._source_path = path
            return attr
        found_configs.append(str(attr.baseline_id))

    if found_configs:
        raise ValueError(
            f"❌ Baseline '{baseline_id}' not found in {path}\n"
            f" Found baselines in this file: {', '.join(found_configs)}\n"
            f" Fix: Use one of the above baseline IDs or check the baseline_id parameter"
        )
    raise ValueError(
        f"❌ No BaselineConfig instances found in {path}\n"
        f" Expected to find a BaselineConfig with baseline_id='{baseline_id}'\n"
        f" Fix: Ensure the file defines a BaselineConfig instance with baseline_id='{baseline_id}'"
    )
214
+
@@ -0,0 +1,146 @@
1
+ """Execution engine for baseline evaluations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ from synth_ai.baseline.config import (
9
+ BaselineConfig,
10
+ BaselineTaskRunner,
11
+ TaskResult,
12
+ )
13
+
14
+
15
def default_aggregator(results: List["TaskResult"]) -> Dict[str, Any]:
    """Summarise task results into basic reward statistics.

    Reward statistics (mean, population standard deviation, min, max) are
    computed over the successful results only; counts and the success rate
    are computed over all results.

    Args:
        results: List of TaskResult objects from all seeds

    Returns:
        Dict with aggregate metrics. When no result succeeded, every reward
        statistic and the success rate are reported as 0.0.
    """
    total = len(results)

    rewards = []
    for result in results:
        if result.success:
            rewards.append(result.outcome_reward)
    succeeded = len(rewards)

    if succeeded:
        mean_reward = sum(rewards) / succeeded
        # Population variance: divide by N, not N - 1.
        variance = sum((x - mean_reward) ** 2 for x in rewards) / succeeded
        std_reward = variance ** 0.5
        lowest = min(rewards)
        highest = max(rewards)
        success_rate = succeeded / total
    else:
        mean_reward = std_reward = lowest = highest = success_rate = 0.0

    return {
        "mean_outcome_reward": mean_reward,
        "std_outcome_reward": std_reward,
        "min_outcome_reward": lowest,
        "max_outcome_reward": highest,
        "success_rate": success_rate,
        "total_tasks": total,
        "successful_tasks": succeeded,
        "failed_tasks": total - succeeded,
    }
57
+
58
+
59
+ def _is_class_based_runner(task_runner: Any) -> bool:
60
+ """Check if task_runner is a class (not a function)."""
61
+ return (
62
+ isinstance(task_runner, type)
63
+ and issubclass(task_runner, BaselineTaskRunner)
64
+ )
65
+
66
+
67
async def run_baseline_evaluation(
    config: BaselineConfig,
    seeds: List[int],
    policy_config: Dict[str, Any],
    env_config: Dict[str, Any],
    concurrency: int = 4,
) -> List[TaskResult]:
    """Run baseline evaluation for given seeds.

    ``config.task_runner`` may be either a ``BaselineTaskRunner`` subclass
    (instantiated once with ``policy_config`` and ``env_config``, then
    ``run_task(seed)`` is awaited per seed) or an async callable awaited as
    ``task_runner(seed, policy_config, env_config)``.

    An exception raised by a task does not abort the run; it is converted
    into a failed ``TaskResult`` carrying the error text.

    Args:
        config: BaselineConfig instance
        seeds: List of seeds to evaluate
        policy_config: Policy configuration (merged from defaults + overrides)
        env_config: Environment configuration (merged from defaults + overrides)
        concurrency: Maximum concurrent task executions. Values below 1 are
            treated as 1.

    Returns:
        List of TaskResult objects, one per seed, in the order of ``seeds``
    """
    # Instantiate runner if class-based
    runner_instance: Optional[BaselineTaskRunner] = None
    if _is_class_based_runner(config.task_runner):
        runner_instance = config.task_runner(policy_config, env_config)

    # A semaphore of 0 would never admit a task (the gather below would hang
    # forever) and a negative value raises, so clamp to at least one slot.
    semaphore = asyncio.Semaphore(max(1, concurrency))

    async def run_task(seed: int) -> TaskResult:
        """Execute a single task with error handling."""
        async with semaphore:
            try:
                # Compare against None explicitly: a runner object that
                # happens to be falsy must still be used.
                if runner_instance is not None:
                    return await runner_instance.run_task(seed)
                return await config.task_runner(seed, policy_config, env_config)
            except Exception as exc:
                # Some exceptions stringify to ""; fall back to the type name
                # so a failed result always carries a usable error.
                return TaskResult(
                    seed=seed,
                    success=False,
                    outcome_reward=0.0,
                    error=str(exc) or type(exc).__name__,
                )

    # Execute all tasks concurrently; gather preserves input order.
    results = await asyncio.gather(*(run_task(seed) for seed in seeds))
    return list(results)
120
+
121
+
122
def aggregate_results(
    config: "BaselineConfig",
    results: List["TaskResult"],
) -> Dict[str, Any]:
    """Aggregate results with the config's aggregator, or the default one.

    Args:
        config: BaselineConfig instance
        results: List of TaskResult objects

    Returns:
        Dict with aggregate metrics
    """
    aggregator = config.result_aggregator

    # No custom aggregator configured: use the built-in statistics.
    if aggregator is None:
        return default_aggregator(results)

    # A class is instantiated without arguments and its aggregate() is used.
    if isinstance(aggregator, type):
        return aggregator().aggregate(results)

    # Anything else is called directly with the results.
    return aggregator(results)
146
+
synth_ai/cli/__init__.py CHANGED
@@ -53,12 +53,72 @@ if not _cli_module:
53
53
  cli = _cli_module.cli # type: ignore[attr-defined]
54
54
 
55
55
  # Register core commands implemented as standalone modules
56
+ try:
57
+ from synth_ai.cli.demo import demo_cmd
58
+ cli.add_command(demo_cmd, name="demo")
59
+ except Exception as e:
60
+ import sys
61
+ print(f"[DEBUG] Failed to register demo command: {e}", file=sys.stderr)
62
+ import traceback
63
+ traceback.print_exc()
56
64
  try:
57
65
  from synth_ai.cli.setup import setup_cmd
58
-
59
66
  cli.add_command(setup_cmd, name="setup")
60
- except Exception:
61
- pass
67
+ except Exception as e:
68
+ import sys
69
+ print(f"[DEBUG] Failed to register setup command: {e}", file=sys.stderr)
70
+ import traceback
71
+ traceback.print_exc()
72
+ try:
73
+ from synth_ai.cli.deploy import deploy_cmd # type: ignore[attr-defined]
74
+ cli.add_command(deploy_cmd, name="deploy")
75
+ except Exception as e:
76
+ import sys
77
+ print(f"[DEBUG] Failed to register deploy command: {e}", file=sys.stderr)
78
+ import traceback
79
+ traceback.print_exc()
80
+ try:
81
+ from synth_ai.cli.opencode import opencode_cmd
82
+ cli.add_command(opencode_cmd, name="opencode")
83
+ except Exception as e:
84
+ import sys
85
+ print(f"[DEBUG] Failed to register opencode command: {e}", file=sys.stderr)
86
+ import traceback
87
+ traceback.print_exc()
88
+ try:
89
+ from synth_ai.cli.codex import codex_cmd
90
+ cli.add_command(codex_cmd, name="codex")
91
+ except Exception as e:
92
+ import sys
93
+ print(f"[DEBUG] Failed to register codex command: {e}", file=sys.stderr)
94
+ import traceback
95
+ traceback.print_exc()
96
+ try:
97
+ from synth_ai.cli.eval import command as eval_cmd
98
+ cli.add_command(eval_cmd, name="eval")
99
+ except Exception as e:
100
+ import sys
101
+ print(f"[DEBUG] Failed to register eval command: {e}", file=sys.stderr)
102
+ import traceback
103
+ traceback.print_exc()
104
+ try:
105
+ from synth_ai.cli.claude import claude_cmd
106
+ cli.add_command(claude_cmd, name="claude")
107
+ except Exception as e:
108
+ import sys
109
+ print(f"[DEBUG] Failed to register claude command: {e}", file=sys.stderr)
110
+ import traceback
111
+ traceback.print_exc()
112
+ try:
113
+ from synth_ai.cli.commands.baseline import command as baseline_cmd
114
+ from synth_ai.cli.commands.baseline.list import list_command as baseline_list_cmd
115
+ cli.add_command(baseline_cmd, name="baseline")
116
+ baseline_cmd.add_command(baseline_list_cmd, name="list")
117
+ except Exception as e:
118
+ import sys
119
+ print(f"[DEBUG] Failed to register baseline command: {e}", file=sys.stderr)
120
+ import traceback
121
+ traceback.print_exc()
62
122
 
63
123
 
64
124
  # Register optional subcommands packaged under synth_ai.cli.*
@@ -72,6 +132,14 @@ for _module_path in ("synth_ai.cli.commands.demo", "synth_ai.cli.commands.status
72
132
  if fn:
73
133
  fn(cli)
74
134
 
135
+ # Smoke command registration (CLI-only helper)
136
+ try:
137
+ from synth_ai.cli.commands.smoke import register as register_smoke
138
+
139
+ register_smoke(cli)
140
+ except Exception:
141
+ pass
142
+
75
143
  # Register help command
76
144
  _maybe_call("synth_ai.cli.commands.help.core", "register", cli)
77
145
 
@@ -80,19 +148,19 @@ _maybe_call("synth_ai.api.train", "register", cli)
80
148
 
81
149
  # Task app group/commands are optional and have richer API surface
82
150
  _task_apps_module = _maybe_import("synth_ai.cli.task_apps")
83
- if _task_apps_module:
84
- task_app_group = getattr(_task_apps_module, "task_app_group", None)
85
- if task_app_group is not None:
86
- cli.add_command(task_app_group, name="task-app")
87
- # Expose common aliases when present
88
- commands = getattr(task_app_group, "commands", None)
89
- if isinstance(commands, dict):
90
- for alias, name in (("serve", "serve"), ("deploy", "deploy"), ("modal-serve", "modal-serve")):
91
- command = commands.get(name)
92
- if command is not None:
93
- cli.add_command(command, name=alias)
94
- register_task_apps = _callable_from(_task_apps_module, "register")
95
- if register_task_apps:
96
- register_task_apps(cli)
151
+ #if _task_apps_module:
152
+ task_app_group = getattr(_task_apps_module, "task_app_group", None)
153
+ if task_app_group is not None:
154
+ cli.add_command(task_app_group, name="task-app")
155
+ # Expose common aliases when present
156
+ commands = getattr(task_app_group, "commands", None)
157
+ if isinstance(commands, dict):
158
+ for alias, name in (("serve", "serve"), ("deploy", "deploy"), ("modal-serve", "modal-serve")):
159
+ command = commands.get(name)
160
+ if command is not None:
161
+ cli.add_command(command, name=alias)
162
+ register_task_apps = _callable_from(_task_apps_module, "register")
163
+ if register_task_apps:
164
+ register_task_apps(cli)
97
165
 
98
166
  # Top-level 'info' alias removed; use `synth-ai task-app info` instead
File without changes
synth_ai/cli/claude.py ADDED
@@ -0,0 +1,70 @@
1
+ import os
2
+ import subprocess
3
+
4
+ import click
5
+ from synth_ai.types import MODEL_NAMES, ModelName
6
+ from synth_ai.urls import BACKEND_URL_SYNTH_RESEARCH_ANTHROPIC
7
+ from synth_ai.utils import find_bin_path, install_bin, resolve_env_var, verify_bin, write_agents_md
8
+
9
+
10
@click.command("claude")
@click.option(
    "--model",
    "model_name",
    type=str,
    default=None
)
@click.option(
    "--force",
    is_flag=True,
    help="Prompt for API keys even if cached values exist."
)
@click.option(
    "--url",
    "override_url",
    type=str,
    default=None,
)
def claude_cmd(
    model_name: ModelName | None = None,
    force: bool = False,
    override_url: str | None = None
) -> None:
    """Launch Claude Code, optionally routed through a Synth-hosted model."""
    # Locate the binary, offering installation until it is found or the
    # install attempt is declined / fails.
    while True:
        bin_path = find_bin_path("claude")
        if bin_path:
            break
        if not install_bin(
            "Claude Code",
            ["curl -fsSL https://claude.ai/install.sh | bash"]
        ):
            print("Failed to find your installed Claude Code")
            print("Please install from: https://claude.com/claude-code")
            return
    print(f"Using Claude at {bin_path}")

    if not verify_bin(bin_path):
        print("Failed to verify Claude Code is runnable")
        return

    write_agents_md()
    env = os.environ.copy()

    # The base URL and credentials are only overridden when a model is
    # requested; --url on its own has no effect.
    if model_name is not None:
        if model_name not in MODEL_NAMES:
            raise ValueError(f"model_name={model_name} is invalid. Valid values for model_name: {MODEL_NAMES}")
        if override_url:
            url = f"{override_url.rstrip('/')}/{model_name}"
            print(f"Using override URL with model: {url}")
        else:
            url = f"{BACKEND_URL_SYNTH_RESEARCH_ANTHROPIC}/{model_name}"
        env["ANTHROPIC_BASE_URL"] = url
        api_key = resolve_env_var("SYNTH_API_KEY", override_process_env=force)
        env["ANTHROPIC_AUTH_TOKEN"] = api_key
        env["SYNTH_API_KEY"] = api_key

    try:
        # Run the exact binary that was located and verified above rather
        # than whatever "claude" resolves to on PATH (which may be missing
        # or a different install). str() covers a str or Path return value.
        subprocess.run([str(bin_path)], check=True, env=env)
    except (subprocess.CalledProcessError, OSError):
        print("Failed to launch Claude Code")
synth_ai/cli/codex.py ADDED
@@ -0,0 +1,84 @@
1
+ import os
2
+ import subprocess
3
+
4
+ import click
5
+ from synth_ai.types import MODEL_NAMES, ModelName
6
+ from synth_ai.urls import BACKEND_URL_SYNTH_RESEARCH_OPENAI
7
+ from synth_ai.utils import find_bin_path, install_bin, resolve_env_var, verify_bin, write_agents_md
8
+
9
+
10
@click.command("codex")
@click.option(
    "--model",
    "model_name",
    type=str,
    default=None
)
@click.option(
    "--force",
    is_flag=True,
    help="Prompt for API keys even if cached values exist."
)
@click.option(
    "--url",
    "override_url",
    type=str,
    default=None,
)
def codex_cmd(
    model_name: ModelName | None = None,
    force: bool = False,
    override_url: str | None = None
) -> None:
    """Launch the Codex CLI, optionally routed through a Synth-hosted model."""
    # Locate the binary, offering installation until it is found or the
    # install attempt is declined / fails.
    while True:
        bin_path = find_bin_path("codex")
        if bin_path:
            break
        if not install_bin(
            "Codex",
            [
                "brew install codex",
                "npm install -g @openai/codex"
            ]
        ):
            print("Failed to find your installed Codex")
            print("Please install from: https://developers.openai.com/codex/cli/")
            return
    print(f"Using Codex at {bin_path}")

    if not verify_bin(bin_path):
        print("Failed to verify Codex is runnable")
        return

    write_agents_md()
    env = os.environ.copy()

    # Invoke the exact binary that was located and verified above rather than
    # whatever "codex" resolves to on PATH. str() covers a str or Path value.
    cmd = [str(bin_path)]

    # Provider overrides and credentials are only applied when a model is
    # requested; --url on its own has no effect.
    if model_name is not None:
        if model_name not in MODEL_NAMES:
            raise ValueError(f"model_name={model_name} is invalid. Valid values for model_name: {MODEL_NAMES}")
        if override_url:
            url = override_url
            print("Using override URL:", url)
        else:
            url = BACKEND_URL_SYNTH_RESEARCH_OPENAI
        provider_config = f'{{name="Synth",base_url="{url}",env_key="OPENAI_API_KEY"}}'
        config_overrides = [
            f"model_providers.synth={provider_config}",
            'model_provider="synth"',
            f'default_model="{model_name}"'
        ]
        env["OPENAI_API_KEY"] = resolve_env_var("SYNTH_API_KEY", override_process_env=force)
        env["SYNTH_API_KEY"] = env["OPENAI_API_KEY"]

        cmd.extend(["-m", model_name])
        # Each override is passed as its own "-c <key=value>" pair.
        for override in config_overrides:
            cmd.extend(["-c", override])

    print(" ".join(cmd))
    try:
        subprocess.run(cmd, check=True, env=env)
    except (subprocess.CalledProcessError, OSError):
        print("Failed to run Codex")
@@ -14,4 +14,5 @@ __all__ = [
14
14
  "filter",
15
15
  "deploy",
16
16
  "status",
17
+ "smoke",
17
18
  ]
@@ -0,0 +1,12 @@
1
+ """CLI command for baseline evaluation."""
2
+
3
+ from __future__ import annotations
4
+
5
from .core import command
from .list import list_command

# Only ``command`` is exported; ``list_command`` is reached through it.
__all__ = ["command"]

# Register list subcommand: runs at import time, so importing this package
# yields ``command`` with ``list_command`` already attached under the name
# "list".
command.add_command(list_command, name="list")
12
+