synth-ai 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/baseline/banking77_baseline.py +204 -0
- examples/baseline/crafter_baseline.py +407 -0
- examples/baseline/pokemon_red_baseline.py +326 -0
- examples/baseline/simple_baseline.py +56 -0
- examples/baseline/warming_up_to_rl_baseline.py +239 -0
- examples/blog_posts/gepa/README.md +355 -0
- examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
- examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
- examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
- examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
- examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
- examples/blog_posts/gepa/gepa_baseline.py +204 -0
- examples/blog_posts/gepa/query_prompts_example.py +97 -0
- examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
- examples/blog_posts/gepa/task_apps.py +105 -0
- examples/blog_posts/gepa/test_gepa_local.sh +67 -0
- examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
- examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
- examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +12 -10
- examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +1 -0
- examples/blog_posts/pokemon_vl/extract_images.py +239 -0
- examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
- examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
- examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
- examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
- examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
- examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
- examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
- examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +1 -1
- examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
- examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +60 -10
- examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +1 -1
- examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +1 -0
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +1 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +1 -0
- examples/rl/configs/rl_from_base_qwen17.toml +1 -0
- examples/swe/task_app/hosted/inference/openai_client.py +0 -34
- examples/swe/task_app/hosted/policy_routes.py +17 -0
- examples/swe/task_app/hosted/rollout.py +4 -2
- examples/task_apps/banking77/__init__.py +6 -0
- examples/task_apps/banking77/banking77_task_app.py +841 -0
- examples/task_apps/banking77/deploy_wrapper.py +46 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
- examples/task_apps/crafter/task_app/grpo_crafter.py +24 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +355 -58
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +68 -7
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +78 -21
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
- examples/task_apps/gepa_benchmarks/__init__.py +7 -0
- examples/task_apps/gepa_benchmarks/common.py +260 -0
- examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
- examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
- examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
- examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
- examples/task_apps/pokemon_red/task_app.py +254 -36
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +1 -0
- examples/warming_up_to_rl/task_app/grpo_crafter.py +53 -4
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +152 -41
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +31 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +1 -0
- synth_ai/api/train/builders.py +90 -1
- synth_ai/api/train/cli.py +396 -21
- synth_ai/api/train/config_finder.py +13 -2
- synth_ai/api/train/configs/__init__.py +15 -1
- synth_ai/api/train/configs/prompt_learning.py +442 -0
- synth_ai/api/train/configs/rl.py +29 -0
- synth_ai/api/train/task_app.py +1 -1
- synth_ai/api/train/validators.py +277 -0
- synth_ai/baseline/__init__.py +25 -0
- synth_ai/baseline/config.py +209 -0
- synth_ai/baseline/discovery.py +214 -0
- synth_ai/baseline/execution.py +146 -0
- synth_ai/cli/__init__.py +85 -17
- synth_ai/cli/__main__.py +0 -0
- synth_ai/cli/claude.py +70 -0
- synth_ai/cli/codex.py +84 -0
- synth_ai/cli/commands/__init__.py +1 -0
- synth_ai/cli/commands/baseline/__init__.py +12 -0
- synth_ai/cli/commands/baseline/core.py +637 -0
- synth_ai/cli/commands/baseline/list.py +93 -0
- synth_ai/cli/commands/eval/core.py +13 -10
- synth_ai/cli/commands/filter/core.py +53 -17
- synth_ai/cli/commands/help/core.py +0 -1
- synth_ai/cli/commands/smoke/__init__.py +7 -0
- synth_ai/cli/commands/smoke/core.py +1436 -0
- synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
- synth_ai/cli/commands/status/subcommands/usage.py +203 -0
- synth_ai/cli/commands/train/judge_schemas.py +1 -0
- synth_ai/cli/commands/train/judge_validation.py +1 -0
- synth_ai/cli/commands/train/validation.py +0 -57
- synth_ai/cli/demo.py +35 -3
- synth_ai/cli/deploy/__init__.py +40 -25
- synth_ai/cli/deploy.py +162 -0
- synth_ai/cli/legacy_root_backup.py +14 -8
- synth_ai/cli/opencode.py +107 -0
- synth_ai/cli/root.py +9 -5
- synth_ai/cli/task_app_deploy.py +1 -1
- synth_ai/cli/task_apps.py +53 -53
- synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
- synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
- synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
- synth_ai/judge_schemas.py +1 -0
- synth_ai/learning/__init__.py +10 -0
- synth_ai/learning/prompt_learning_client.py +276 -0
- synth_ai/learning/prompt_learning_types.py +184 -0
- synth_ai/pricing/__init__.py +2 -0
- synth_ai/pricing/model_pricing.py +57 -0
- synth_ai/streaming/handlers.py +53 -4
- synth_ai/streaming/streamer.py +19 -0
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +2 -0
- synth_ai/task/tracing_utils.py +25 -25
- synth_ai/task/validators.py +44 -8
- synth_ai/task_app_cfgs.py +21 -0
- synth_ai/tracing_v3/config.py +162 -19
- synth_ai/tracing_v3/constants.py +1 -1
- synth_ai/tracing_v3/db_config.py +24 -38
- synth_ai/tracing_v3/storage/config.py +47 -13
- synth_ai/tracing_v3/storage/factory.py +3 -3
- synth_ai/tracing_v3/turso/daemon.py +113 -11
- synth_ai/tracing_v3/turso/native_manager.py +92 -16
- synth_ai/types.py +8 -0
- synth_ai/urls.py +11 -0
- synth_ai/utils/__init__.py +30 -1
- synth_ai/utils/agents.py +74 -0
- synth_ai/utils/bin.py +39 -0
- synth_ai/utils/cli.py +149 -5
- synth_ai/utils/env.py +17 -17
- synth_ai/utils/json.py +72 -0
- synth_ai/utils/modal.py +283 -1
- synth_ai/utils/paths.py +48 -0
- synth_ai/utils/uvicorn.py +113 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/METADATA +102 -4
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/RECORD +162 -88
- synth_ai/cli/commands/deploy/__init__.py +0 -23
- synth_ai/cli/commands/deploy/core.py +0 -614
- synth_ai/cli/commands/deploy/errors.py +0 -72
- synth_ai/cli/commands/deploy/validation.py +0 -11
- synth_ai/cli/deploy/core.py +0 -5
- synth_ai/cli/deploy/errors.py +0 -23
- synth_ai/cli/deploy/validation.py +0 -5
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,637 @@
|
|
|
1
|
+
"""CLI command for baseline evaluation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import time
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any, Optional, Sequence
|
|
11
|
+
|
|
12
|
+
import click
|
|
13
|
+
from synth_ai.baseline.config import BaselineResults
|
|
14
|
+
from synth_ai.baseline.discovery import (
|
|
15
|
+
BASELINE_FILE_PATTERNS,
|
|
16
|
+
BaselineChoice,
|
|
17
|
+
discover_baseline_files,
|
|
18
|
+
load_baseline_config_from_file,
|
|
19
|
+
)
|
|
20
|
+
from synth_ai.baseline.execution import aggregate_results, run_baseline_evaluation
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class BaselineGroup(click.Group):
    """Custom group that allows positional arguments (baseline_id) even when subcommands exist.

    The group overrides three Click hooks:

    * ``make_context`` records the argument list in ``ctx.meta['_original_args']``.
    * ``resolve_command`` decides whether the first argument names a registered
      subcommand or should be treated as a baseline id.
    * ``invoke`` dispatches to the group callback or to the resolved subcommand.

    NOTE: ``resolve_command`` here returns ``(cmd, name, args)``, whereas the
    values unpacked from ``super().resolve_command`` are in ``(name, cmd, args)``
    order. ``invoke`` in this class unpacks the reordered form.
    """

    def make_context(
        self,
        info_name: str | None,
        args: list[str],
        parent: click.Context | None = None,
        **extra,
    ) -> click.Context:
        """Build the context via the parent class and remember the argument list.

        The list is stored under ``ctx.meta['_original_args']`` so that
        ``invoke`` can re-resolve the first argument later.
        """
        # NOTE(review): the copy is taken *after* super().make_context() has
        # processed ``args``. If Click's parser consumes items from the list in
        # place, what is stored is the post-parse remainder rather than the raw
        # command line - confirm this is the intended behaviour.
        ctx = super().make_context(info_name, args, parent, **extra)
        ctx.meta['_original_args'] = args.copy() if isinstance(args, list) else list(args)
        return ctx

    def resolve_command(self, ctx: click.Context, args: list[str]) -> tuple[click.Command | None, str, list[str]]:
        """Resolve command, checking if first arg is a subcommand or baseline_id.

        Returns:
            A ``(command, name, remaining_args)`` tuple. When the first argument
            is not a registered subcommand and does not start with ``--``, it is
            recorded in ``ctx.meta['baseline_id']`` and a synthetic
            ``_baseline_wrapper`` command is returned.
        """

        # Check if first arg is a known subcommand
        if args and not args[0].startswith('--'):
            first_arg = args[0]
            if first_arg in self.commands:
                # It's a known subcommand, let Click handle it normally
                cmd_name, cmd, remaining = super().resolve_command(ctx, args)
                # The parent's result is unpacked as (name, cmd, args); this method
                # hands it back reordered as (cmd, name, args).
                return cmd, cmd_name or "", remaining

        # Not a subcommand - this means baseline_id is a positional argument
        # Store baseline_id in ctx for the callback to access
        if args and not args[0].startswith('--'):
            baseline_id = args[0]
            ctx.meta['baseline_id'] = baseline_id
            # Remove baseline_id from args so Click doesn't try to parse it
            remaining_args = args[1:]

            # Create a wrapper function that injects baseline_id into the callback
            original_callback = self.callback
            if original_callback is None:
                raise click.ClickException("Command callback is None")
            def wrapper_callback(ctx, **kwargs):
                # Inject baseline_id into kwargs
                kwargs['baseline_id'] = baseline_id
                return original_callback(ctx, **kwargs)

            # Create a wrapper command with the modified callback
            # Filter out baseline_id from params since we're injecting it manually
            filtered_params = [p for p in self.params if getattr(p, 'name', None) != 'baseline_id']
            wrapper_cmd = click.Command(
                name="_baseline_wrapper",  # Use a different name to avoid confusion
                callback=wrapper_callback,
                params=filtered_params,
                context_settings=self.context_settings,
            )
            # NOTE: invoke() below returns from its ``'baseline_id' in ctx.meta``
            # branch before it ever uses this wrapper command.
            return wrapper_cmd, "_baseline_wrapper", remaining_args

        # No args or args start with --, so no baseline_id
        # Let Click handle it normally (will invoke main callback if invoke_without_command=True)
        cmd_name, cmd, remaining = super().resolve_command(ctx, args)
        # Same reordering as above: (name, cmd, args) in, (cmd, name, args) out.
        return cmd, cmd_name or "", remaining

    def invoke(self, ctx: click.Context) -> Any:
        """Invoke command, handling baseline_id as positional arg.

        Dispatch order: a ``baseline_id`` already present in ``ctx.params``;
        an empty stored argument list (group callback with ``ctx.params``);
        a baseline id detected by ``resolve_command``; a resolved subcommand;
        and finally the bare group callback.

        Raises:
            click.ClickException: if the group has no callback where one is needed.
        """
        # Check if baseline_id is in ctx.params (Click might have parsed it)
        if 'baseline_id' in ctx.params and ctx.params['baseline_id']:
            baseline_id = ctx.params['baseline_id']
            # Invoke callback with baseline_id from params
            if self.callback is None:
                raise click.ClickException("Command callback is None")
            # NOTE(review): the callback is called directly with ``ctx`` as a
            # positional argument here, while other branches go through
            # ctx.invoke(). The group callback in this file is wrapped with
            # @click.pass_context - confirm the extra positional ctx is intended.
            return self.callback(ctx, **ctx.params)

        # Manually call resolve_command with full args (including baseline_id if present)
        # Try to get the original args from ctx.meta (stored in make_context())
        full_args = ctx.meta.get('_original_args', ctx.args)

        # If no args, invoke callback directly (invoke_without_command=True behavior)
        if not full_args:
            if self.callback is None:
                raise click.ClickException("Command callback is None")
            return ctx.invoke(self.callback, **ctx.params)

        # Unpacked in the (cmd, name, args) order that resolve_command() above returns.
        cmd, cmd_name, resolved_args = self.resolve_command(ctx, full_args)

        # Check if baseline_id was detected
        if 'baseline_id' in ctx.meta:
            baseline_id = ctx.meta['baseline_id']
            # Parse options from resolved_args - don't use OptionParser, just use Click's make_context
            # Create a temporary context to parse the options
            # The callback parameters come from this temporary context only;
            # ``ctx.params`` is not merged in on this path.
            temp_ctx = self.make_context(self.name, resolved_args, parent=ctx.parent, allow_extra_args=True, allow_interspersed_args=False)
            params = temp_ctx.params.copy()
            params['baseline_id'] = baseline_id
            # Don't pass ctx explicitly - Click's @click.pass_context decorator injects it
            # Use ctx.invoke to properly call the callback with the right context
            if self.callback is None:
                raise click.ClickException("Command callback is None")
            return ctx.invoke(self.callback, **params)

        # Normal flow - if it's a subcommand, invoke it
        if cmd and cmd is not self and isinstance(cmd, click.Command):
            with cmd.make_context(cmd_name, resolved_args, parent=ctx) as sub_ctx:
                return cmd.invoke(sub_ctx)

        # No baseline_id and no subcommand - invoke callback if invoke_without_command=True
        if self.callback is None:
            raise click.ClickException("Command callback is None")
        # NOTE(review): direct call with a positional ctx, as in the first branch - confirm.
        return self.callback(ctx)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
__all__ = ["command"]
|
|
133
|
+
|
|
134
|
+
def _select_baseline_interactive(choices: list[BaselineChoice]) -> Optional[str]:
|
|
135
|
+
"""Prompt user to select a baseline interactively."""
|
|
136
|
+
if not choices:
|
|
137
|
+
return None
|
|
138
|
+
|
|
139
|
+
if len(choices) == 1:
|
|
140
|
+
return choices[0].baseline_id
|
|
141
|
+
|
|
142
|
+
click.echo("\nFound multiple baseline files:")
|
|
143
|
+
for i, choice in enumerate(choices, 1):
|
|
144
|
+
click.echo(f" {i}. {choice.baseline_id} ({choice.path})")
|
|
145
|
+
|
|
146
|
+
while True:
|
|
147
|
+
try:
|
|
148
|
+
selection = click.prompt("Select baseline", type=int)
|
|
149
|
+
if 1 <= selection <= len(choices):
|
|
150
|
+
return choices[selection - 1].baseline_id
|
|
151
|
+
click.echo(f"Please enter a number between 1 and {len(choices)}")
|
|
152
|
+
except (click.Abort, KeyboardInterrupt):
|
|
153
|
+
return None
|
|
154
|
+
|
|
155
|
+
def _parse_seeds(seeds_str: Optional[str]) -> Optional[list[int]]:
|
|
156
|
+
"""Parse comma-separated seeds string."""
|
|
157
|
+
if not seeds_str:
|
|
158
|
+
return None
|
|
159
|
+
|
|
160
|
+
try:
|
|
161
|
+
return [int(s.strip()) for s in seeds_str.split(",") if s.strip()]
|
|
162
|
+
except ValueError as e:
|
|
163
|
+
raise click.ClickException(f"Invalid seeds format: {seeds_str}. Expected comma-separated integers.") from e
|
|
164
|
+
|
|
165
|
+
def _parse_splits(splits_str: str) -> list[str]:
|
|
166
|
+
"""Parse comma-separated splits string."""
|
|
167
|
+
return [s.strip() for s in splits_str.split(",") if s.strip()]
|
|
168
|
+
|
|
169
|
+
def _baseline_options(func: Any) -> Any:
    """Attach the evaluation options shared by ``baseline`` and ``baseline run``.

    Both entry points accept exactly the same flags. Declaring them once keeps
    their names, defaults and help texts from drifting apart.
    """
    shared_options = (
        click.option(
            "--split",
            default="train",
            help="Data split(s) to evaluate (comma-separated). Default: train",
        ),
        click.option(
            "--seeds",
            default=None,
            help="Comma-separated seeds to evaluate (overrides split defaults)",
        ),
        click.option(
            "--model",
            default=None,
            help="Model identifier (overrides default_policy_config)",
        ),
        click.option(
            "--temperature",
            type=float,
            default=None,
            help="Sampling temperature (overrides default_policy_config)",
        ),
        click.option(
            "--policy-config",
            type=str,
            default=None,
            help="JSON string with policy config overrides",
        ),
        click.option(
            "--env-config",
            type=str,
            default=None,
            help="JSON string with env config overrides",
        ),
        click.option(
            "--output",
            type=click.Path(),
            default=None,
            help="Save results to JSON file",
        ),
        click.option(
            "--trace-db",
            default=None,
            help="SQLite/Turso URL for storing traces (set to 'none' to disable)",
        ),
        click.option(
            "--concurrency",
            type=int,
            default=4,
            help="Maximum concurrent task executions",
        ),
        click.option(
            "--env-file",
            multiple=True,
            type=click.Path(),
            help="Environment file(s) to load (for API keys, etc.)",
        ),
        click.option(
            "--verbose",
            is_flag=True,
            help="Enable verbose output",
        ),
    )
    # Stacked decorators apply bottom-up, so attach in reverse to keep the
    # declaration order above identical to a hand-written decorator stack.
    for decorate in reversed(shared_options):
        func = decorate(func)
    return func


@click.group(
    "baseline",
    help="Run self-contained task evaluation using a baseline file.",
    invoke_without_command=True,
    cls=BaselineGroup,
)
@click.pass_context
# DON'T define baseline_id as an argument here - it will be consumed before resolve_command()
@_baseline_options
def command(
    ctx: click.Context,
    baseline_id: str | None = None,
    split: str = "train",
    seeds: str | None = None,
    model: str | None = None,
    temperature: float | None = None,
    policy_config: str | None = None,
    env_config: str | None = None,
    output: str | None = None,
    trace_db: str | None = None,
    concurrency: int = 4,
    env_file: Sequence[str] = (),
    verbose: bool = False,
) -> None:
    """Run baseline evaluation.

    ``baseline_id`` is not declared as a Click argument on the group; it is
    supplied as a keyword by ``BaselineGroup`` when the first positional
    argument is not a registered subcommand.
    """
    # If a subcommand was invoked, don't run the default command
    if ctx.invoked_subcommand is not None:
        return

    # Check if baseline_id is actually a subcommand (shouldn't happen, but handle gracefully)
    if baseline_id and isinstance(ctx.command, click.Group) and baseline_id in ctx.command.commands:
        # It's a subcommand, re-invoke with that subcommand
        subcmd = ctx.command.get_command(ctx, baseline_id)
        if subcmd:
            return ctx.invoke(subcmd, **ctx.params)

    # Run the evaluation
    asyncio.run(
        _baseline_command_impl(
            baseline_id=baseline_id,
            split=split,
            seeds=seeds,
            model=model,
            temperature=temperature,
            policy_config_json=policy_config,
            env_config_json=env_config,
            output_path=Path(output) if output else None,
            trace_db_url=trace_db,
            concurrency=concurrency,
            env_files=env_file,
            verbose=verbose,
        )
    )


@command.command("run")
@click.argument("baseline_id", type=str, required=False)
@_baseline_options
def run_command(
    baseline_id: str | None,
    split: str,
    seeds: str | None,
    model: str | None,
    temperature: float | None,
    policy_config: str | None,
    env_config: str | None,
    output: str | None,
    trace_db: str | None,
    concurrency: int,
    env_file: Sequence[str],
    verbose: bool,
) -> None:
    """Run baseline evaluation."""
    # The docstring above doubles as the subcommand's --help text, so it is
    # kept short; this explicit form mirrors the group-level entry point.
    asyncio.run(
        _baseline_command_impl(
            baseline_id=baseline_id,
            split=split,
            seeds=seeds,
            model=model,
            temperature=temperature,
            policy_config_json=policy_config,
            env_config_json=env_config,
            output_path=Path(output) if output else None,
            trace_db_url=trace_db,
            concurrency=concurrency,
            env_files=env_file,
            verbose=verbose,
        )
    )
|
|
381
|
+
|
|
382
|
+
def _apply_json_overrides(
    target: dict[str, Any],
    raw_json: str,
    option_name: str,
    example: str,
) -> None:
    """Merge a JSON-object CLI override into ``target`` in place.

    Args:
        target: Config dict to update.
        raw_json: The raw option value as typed by the user.
        option_name: Flag name used in error messages (e.g. ``--env-config``).
        example: A sample JSON object shown in error messages.

    Raises:
        click.ClickException: if ``raw_json`` is not valid JSON, or is valid
            JSON that does not decode to an object.
    """
    try:
        overrides = json.loads(raw_json)
    except json.JSONDecodeError as e:
        raise click.ClickException(
            f"❌ Invalid {option_name} JSON\n"
            f" Provided: {raw_json[:100]}...\n"
            f" Error: {str(e)}\n"
            f" Expected: Valid JSON object (e.g., '{example}')"
        ) from e

    # Valid JSON such as a list or number would otherwise reach dict.update()
    # and escape as a raw TypeError/ValueError.
    if not isinstance(overrides, dict):
        raise click.ClickException(
            f"❌ Invalid {option_name} JSON\n"
            f" Provided: {raw_json[:100]}...\n"
            f" Error: expected a JSON object, got {type(overrides).__name__}\n"
            f" Expected: Valid JSON object (e.g., '{example}')"
        )
    target.update(overrides)


async def _baseline_command_impl(
    baseline_id: str | None,
    split: str,
    seeds: str | None,
    model: str | None,
    temperature: float | None,
    policy_config_json: str | None,
    env_config_json: str | None,
    output_path: Path | None,
    trace_db_url: str | None,
    concurrency: int,
    env_files: Sequence[str],
    verbose: bool,
) -> None:
    """Implementation of baseline command.

    Discovers baseline files under the current working directory, loads the
    selected baseline's config, resolves splits/seeds and config overrides,
    runs the evaluation, prints a summary and optionally writes a JSON report.

    Args:
        baseline_id: Baseline to run; when ``None`` the user is prompted.
        split: Comma-separated split names.
        seeds: Comma-separated seeds; overrides the seeds of the splits.
        model: Overrides ``model`` in the policy config.
        temperature: Overrides ``temperature`` in the policy config.
        policy_config_json: JSON object merged into the policy config.
        env_config_json: JSON object merged into the env config.
        output_path: Where to write the JSON results, if anywhere.
        trace_db_url: Trace database URL; ``none``/``off`` disables tracing.
        concurrency: Passed through to ``run_baseline_evaluation``.
        env_files: dotenv files loaded without overriding existing variables.
        verbose: When true, tracebacks of unexpected errors are printed.

    Raises:
        click.ClickException: for every user-facing failure (nothing
            discovered, unknown baseline/split, bad seeds or JSON, load or
            execution errors).
    """

    # Load environment files if provided
    if env_files:
        try:
            from dotenv import load_dotenv
        except ImportError:
            click.echo("Warning: python-dotenv not installed, skipping --env-file", err=True)
        else:
            for env_file in env_files:
                load_dotenv(env_file, override=False)

    # 1. Discovery
    search_roots = [Path.cwd()]
    choices = discover_baseline_files(search_roots)

    if not choices:
        search_dirs = [str(root) for root in search_roots]
        raise click.ClickException(
            f"❌ No baseline files found\n"
            f" Searched in: {', '.join(search_dirs)}\n"
            f" Patterns: {', '.join(BASELINE_FILE_PATTERNS)}\n"
            f" Create baseline files in:\n"
            f" - examples/baseline/*.py\n"
            f" - **/*_baseline.py (anywhere in the tree)\n"
            f" Example: Create examples/baseline/my_task_baseline.py\n"
            f" See: https://docs.usesynth.ai/baseline for more info"
        )

    if baseline_id is None:
        selected_id = _select_baseline_interactive(choices)
        if selected_id is None:
            raise click.ClickException(
                "❌ No baseline selected\n"
                " Run with a baseline ID: synth-ai baseline <baseline_id>\n"
                " Or use: synth-ai baseline list to see available baselines"
            )
        baseline_id = selected_id

    # Find matching baseline
    matching = [c for c in choices if c.baseline_id == baseline_id]
    if not matching:
        available = sorted({c.baseline_id for c in choices})
        # Find close matches (case-insensitive substring match in either direction)
        wanted = baseline_id.lower()
        close_matches = [
            bid for bid in available
            if wanted in bid.lower() or bid.lower() in wanted
        ]

        error_msg = (
            f"❌ Baseline '{baseline_id}' not found\n"
            f" Available baselines ({len(available)}): {', '.join(available)}"
        )

        if close_matches:
            error_msg += f"\n Did you mean: {', '.join(close_matches[:3])}?"

        error_msg += "\n Use 'synth-ai baseline list' to see all baselines with details"

        raise click.ClickException(error_msg)

    choice = matching[0]

    # 2. Load config
    try:
        config = load_baseline_config_from_file(baseline_id, choice.path)
    except (ImportError, ValueError) as e:
        # Surfaced verbatim; per the original comments these messages are
        # already formatted by the discovery module.
        raise click.ClickException(str(e)) from e
    except Exception as e:
        if verbose:
            # Honour the "--verbose" tip printed in the message below.
            import traceback

            click.echo(traceback.format_exc(), err=True)
        error_type = type(e).__name__
        raise click.ClickException(
            f"❌ Unexpected error loading baseline '{baseline_id}'\n"
            f" File: {choice.path}\n"
            f" Error: {error_type}: {str(e)}\n"
            f" Tip: Run with --verbose for more details"
        ) from e

    # 3. Validate split
    split_names = _parse_splits(split)
    for split_name in split_names:
        if split_name not in config.splits:
            available_splits = sorted(config.splits.keys())
            if not available_splits:
                # Avoid an IndexError on available_splits[0] below.
                raise click.ClickException(
                    f"❌ Invalid split '{split_name}' for baseline '{baseline_id}'\n"
                    f" Baseline defines no splits"
                )
            raise click.ClickException(
                f"❌ Invalid split '{split_name}' for baseline '{baseline_id}'\n"
                f" Available splits: {', '.join(available_splits)}\n"
                f" Use: --split {available_splits[0]} (or comma-separated: --split {','.join(available_splits)})"
            )

    # 4. Determine seeds
    if seeds:
        # _parse_seeds reports malformed input as a ClickException itself,
        # so no ValueError handling is needed here.
        seed_list = _parse_seeds(seeds)
        if not seed_list:
            raise click.ClickException(
                f"❌ No valid seeds provided\n"
                f" Provided: '{seeds}'\n"
                f" Expected: comma-separated integers (e.g., '0,1,2')"
            )
    else:
        # Use all seeds from specified splits
        seed_list = []
        for split_name in split_names:
            seed_list.extend(config.splits[split_name].seeds)

        if not seed_list:
            split_info = [
                f"{split_name} ({len(config.splits[split_name].seeds)} seeds)"
                for split_name in split_names
            ]
            raise click.ClickException(
                f"❌ No seeds found for split(s): {', '.join(split_names)}\n"
                f" Split details: {', '.join(split_info)}\n"
                f" This may indicate an empty split configuration\n"
                f" Fix: Use --seeds to specify seeds manually (e.g., --seeds 0,1,2)"
            )

    # 5. Merge configs
    policy_config = {**config.default_policy_config}
    if model:
        policy_config["model"] = model
    if temperature is not None:
        policy_config["temperature"] = temperature
    if policy_config_json:
        _apply_json_overrides(
            policy_config,
            policy_config_json,
            "--policy-config",
            '{"model": "gpt-4o", "temperature": 0.7}',
        )

    env_config = {**config.default_env_config}
    if env_config_json:
        _apply_json_overrides(
            env_config,
            env_config_json,
            "--env-config",
            '{"max_steps": 1000}',
        )

    # Handle split-specific env config
    # NOTE(review): split metadata is applied after --env-config, so it wins
    # on key collisions - confirm this precedence is intended.
    for split_name in split_names:
        split_config = config.splits[split_name]
        if split_config.metadata:
            env_config.update(split_config.metadata)

    # 6. Setup trace storage (if requested)
    tracer = None
    if trace_db_url and trace_db_url.lower() not in {"none", "off"}:
        from synth_ai.tracing_v3.session_tracer import SessionTracer

        tracer = SessionTracer(db_url=trace_db_url, auto_save=True)
        await tracer.initialize()

    # 7. Execute tasks
    click.echo(f"Running {len(seed_list)} tasks across {len(split_names)} split(s)...")
    click.echo(f"Model: {policy_config.get('model', 'default')}")
    click.echo(f"Concurrency: {concurrency}")

    start_time = time.perf_counter()
    try:
        results = await run_baseline_evaluation(
            config=config,
            seeds=seed_list,
            policy_config=policy_config,
            env_config=env_config,
            concurrency=concurrency,
        )
    except Exception as e:
        if verbose:
            # Honour the "--verbose" tip printed in the message below.
            import traceback

            click.echo(traceback.format_exc(), err=True)
        error_type = type(e).__name__
        raise click.ClickException(
            f"❌ Error running baseline evaluation\n"
            f" Baseline: {baseline_id}\n"
            f" Tasks: {len(seed_list)} seeds\n"
            f" Error: {error_type}: {str(e)}\n"
            f" Common causes:\n"
            f" - Missing dependencies (check baseline file imports)\n"
            f" - API key not set (check environment variables)\n"
            f" - Model/inference configuration issues\n"
            f" Tip: Run with --verbose for detailed error output"
        ) from e

    elapsed = time.perf_counter() - start_time

    # Store traces if requested
    # NOTE(review): placeholder - the tracer is initialised above but no trace
    # is written here and the tracer is never closed in this function.
    if tracer:
        for result in results:
            if result.trace:
                # Store trace (simplified - would need proper trace storage logic)
                pass

    # 8. Aggregate results
    aggregate_metrics = aggregate_results(config, results)

    # 9. Create output
    baseline_results = BaselineResults(
        config=config,
        split_name=",".join(split_names),
        results=results,
        aggregate_metrics=aggregate_metrics,
        execution_time_seconds=elapsed,
        model_name=policy_config.get("model", "unknown"),
        timestamp=datetime.now().isoformat(),
    )

    # 10. Display summary
    click.echo("\n" + "=" * 60)
    click.echo(f"Baseline Evaluation: {config.name}")
    click.echo("=" * 60)
    click.echo(f"Split(s): {baseline_results.split_name}")
    click.echo(f"Tasks: {len(results)}")
    click.echo(f"Success: {sum(1 for r in results if r.success)}/{len(results)}")
    click.echo(f"Execution time: {elapsed:.2f}s")
    click.echo("\nAggregate Metrics:")
    for key, value in aggregate_metrics.items():
        if isinstance(value, float):
            click.echo(f" {key}: {value:.4f}")
        else:
            click.echo(f" {key}: {value}")

    # 11. Save output if requested
    if output_path:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(json.dumps(baseline_results.to_dict(), indent=2))
        click.echo(f"\nResults saved to: {output_path}")
|