synth-ai 0.4.1__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (153) hide show
  1. synth_ai/__init__.py +13 -13
  2. synth_ai/cli/__init__.py +6 -15
  3. synth_ai/cli/commands/eval/__init__.py +6 -15
  4. synth_ai/cli/commands/eval/config.py +338 -0
  5. synth_ai/cli/commands/eval/core.py +236 -1091
  6. synth_ai/cli/commands/eval/runner.py +704 -0
  7. synth_ai/cli/commands/eval/validation.py +44 -117
  8. synth_ai/cli/commands/filter/core.py +7 -7
  9. synth_ai/cli/commands/filter/validation.py +2 -2
  10. synth_ai/cli/commands/smoke/core.py +7 -17
  11. synth_ai/cli/commands/status/__init__.py +1 -64
  12. synth_ai/cli/commands/status/client.py +50 -151
  13. synth_ai/cli/commands/status/config.py +3 -83
  14. synth_ai/cli/commands/status/errors.py +4 -13
  15. synth_ai/cli/commands/status/subcommands/__init__.py +2 -8
  16. synth_ai/cli/commands/status/subcommands/config.py +13 -0
  17. synth_ai/cli/commands/status/subcommands/files.py +18 -63
  18. synth_ai/cli/commands/status/subcommands/jobs.py +28 -311
  19. synth_ai/cli/commands/status/subcommands/models.py +18 -62
  20. synth_ai/cli/commands/status/subcommands/runs.py +16 -63
  21. synth_ai/cli/commands/status/subcommands/session.py +67 -172
  22. synth_ai/cli/commands/status/subcommands/summary.py +24 -32
  23. synth_ai/cli/commands/status/subcommands/utils.py +41 -0
  24. synth_ai/cli/commands/status/utils.py +16 -107
  25. synth_ai/cli/commands/train/__init__.py +18 -20
  26. synth_ai/cli/commands/train/errors.py +3 -3
  27. synth_ai/cli/commands/train/prompt_learning_validation.py +15 -16
  28. synth_ai/cli/commands/train/validation.py +7 -7
  29. synth_ai/cli/commands/train/{judge_schemas.py → verifier_schemas.py} +33 -34
  30. synth_ai/cli/commands/train/verifier_validation.py +235 -0
  31. synth_ai/cli/demo_apps/demo_task_apps/math/config.toml +0 -1
  32. synth_ai/cli/demo_apps/demo_task_apps/math/modal_task_app.py +2 -6
  33. synth_ai/cli/demo_apps/math/config.toml +0 -1
  34. synth_ai/cli/demo_apps/math/modal_task_app.py +2 -6
  35. synth_ai/cli/demo_apps/mipro/task_app.py +25 -47
  36. synth_ai/cli/lib/apps/task_app.py +12 -13
  37. synth_ai/cli/lib/task_app_discovery.py +6 -6
  38. synth_ai/cli/lib/train_cfgs.py +10 -10
  39. synth_ai/cli/task_apps/__init__.py +11 -0
  40. synth_ai/cli/task_apps/commands.py +7 -15
  41. synth_ai/core/env.py +12 -1
  42. synth_ai/core/errors.py +1 -2
  43. synth_ai/core/integrations/cloudflare.py +209 -33
  44. synth_ai/core/tracing_v3/abstractions.py +46 -0
  45. synth_ai/data/__init__.py +3 -30
  46. synth_ai/data/enums.py +1 -20
  47. synth_ai/data/rewards.py +100 -3
  48. synth_ai/products/graph_evolve/__init__.py +1 -2
  49. synth_ai/products/graph_evolve/config.py +16 -16
  50. synth_ai/products/graph_evolve/converters/__init__.py +3 -3
  51. synth_ai/products/graph_evolve/converters/openai_sft.py +7 -7
  52. synth_ai/products/graph_evolve/examples/hotpotqa/config.toml +1 -1
  53. synth_ai/products/graph_gepa/__init__.py +23 -0
  54. synth_ai/products/graph_gepa/converters/__init__.py +19 -0
  55. synth_ai/products/graph_gepa/converters/openai_sft.py +29 -0
  56. synth_ai/sdk/__init__.py +45 -35
  57. synth_ai/sdk/api/eval/__init__.py +33 -0
  58. synth_ai/sdk/api/eval/job.py +732 -0
  59. synth_ai/sdk/api/research_agent/__init__.py +276 -66
  60. synth_ai/sdk/api/train/builders.py +181 -0
  61. synth_ai/sdk/api/train/cli.py +41 -33
  62. synth_ai/sdk/api/train/configs/__init__.py +6 -4
  63. synth_ai/sdk/api/train/configs/prompt_learning.py +127 -33
  64. synth_ai/sdk/api/train/configs/rl.py +264 -16
  65. synth_ai/sdk/api/train/configs/sft.py +165 -1
  66. synth_ai/sdk/api/train/graph_validators.py +12 -12
  67. synth_ai/sdk/api/train/graphgen.py +169 -51
  68. synth_ai/sdk/api/train/graphgen_models.py +95 -45
  69. synth_ai/sdk/api/train/local_api.py +10 -0
  70. synth_ai/sdk/api/train/pollers.py +36 -0
  71. synth_ai/sdk/api/train/prompt_learning.py +390 -60
  72. synth_ai/sdk/api/train/rl.py +41 -5
  73. synth_ai/sdk/api/train/sft.py +2 -0
  74. synth_ai/sdk/api/train/task_app.py +20 -0
  75. synth_ai/sdk/api/train/validators.py +17 -17
  76. synth_ai/sdk/graphs/completions.py +239 -33
  77. synth_ai/sdk/{judging/schemas.py → graphs/verifier_schemas.py} +23 -23
  78. synth_ai/sdk/learning/__init__.py +35 -5
  79. synth_ai/sdk/learning/context_learning_client.py +531 -0
  80. synth_ai/sdk/learning/context_learning_types.py +294 -0
  81. synth_ai/sdk/learning/prompt_learning_client.py +1 -1
  82. synth_ai/sdk/learning/prompt_learning_types.py +2 -1
  83. synth_ai/sdk/learning/rl/__init__.py +0 -4
  84. synth_ai/sdk/learning/rl/contracts.py +0 -4
  85. synth_ai/sdk/localapi/__init__.py +40 -0
  86. synth_ai/sdk/localapi/apps/__init__.py +28 -0
  87. synth_ai/sdk/localapi/client.py +10 -0
  88. synth_ai/sdk/localapi/contracts.py +10 -0
  89. synth_ai/sdk/localapi/helpers.py +519 -0
  90. synth_ai/sdk/localapi/rollouts.py +93 -0
  91. synth_ai/sdk/localapi/server.py +29 -0
  92. synth_ai/sdk/localapi/template.py +49 -0
  93. synth_ai/sdk/streaming/handlers.py +6 -6
  94. synth_ai/sdk/streaming/streamer.py +10 -6
  95. synth_ai/sdk/task/__init__.py +18 -5
  96. synth_ai/sdk/task/apps/__init__.py +37 -1
  97. synth_ai/sdk/task/client.py +9 -1
  98. synth_ai/sdk/task/config.py +6 -11
  99. synth_ai/sdk/task/contracts.py +137 -95
  100. synth_ai/sdk/task/in_process.py +32 -22
  101. synth_ai/sdk/task/in_process_runner.py +9 -4
  102. synth_ai/sdk/task/rubrics/__init__.py +2 -3
  103. synth_ai/sdk/task/rubrics/loaders.py +4 -4
  104. synth_ai/sdk/task/rubrics/strict.py +3 -4
  105. synth_ai/sdk/task/server.py +76 -16
  106. synth_ai/sdk/task/trace_correlation_helpers.py +190 -139
  107. synth_ai/sdk/task/validators.py +34 -49
  108. synth_ai/sdk/training/__init__.py +7 -16
  109. synth_ai/sdk/tunnels/__init__.py +118 -0
  110. synth_ai/sdk/tunnels/cleanup.py +83 -0
  111. synth_ai/sdk/tunnels/ports.py +120 -0
  112. synth_ai/sdk/tunnels/tunneled_api.py +363 -0
  113. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/METADATA +71 -4
  114. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/RECORD +118 -128
  115. synth_ai/cli/commands/baseline/__init__.py +0 -12
  116. synth_ai/cli/commands/baseline/core.py +0 -636
  117. synth_ai/cli/commands/baseline/list.py +0 -94
  118. synth_ai/cli/commands/eval/errors.py +0 -81
  119. synth_ai/cli/commands/status/formatters.py +0 -164
  120. synth_ai/cli/commands/status/subcommands/pricing.py +0 -23
  121. synth_ai/cli/commands/status/subcommands/usage.py +0 -203
  122. synth_ai/cli/commands/train/judge_validation.py +0 -305
  123. synth_ai/cli/usage.py +0 -159
  124. synth_ai/data/specs.py +0 -36
  125. synth_ai/sdk/api/research_agent/cli.py +0 -428
  126. synth_ai/sdk/api/research_agent/config.py +0 -357
  127. synth_ai/sdk/api/research_agent/job.py +0 -717
  128. synth_ai/sdk/baseline/__init__.py +0 -25
  129. synth_ai/sdk/baseline/config.py +0 -209
  130. synth_ai/sdk/baseline/discovery.py +0 -216
  131. synth_ai/sdk/baseline/execution.py +0 -154
  132. synth_ai/sdk/judging/__init__.py +0 -15
  133. synth_ai/sdk/judging/base.py +0 -24
  134. synth_ai/sdk/judging/client.py +0 -191
  135. synth_ai/sdk/judging/types.py +0 -42
  136. synth_ai/sdk/research_agent/__init__.py +0 -34
  137. synth_ai/sdk/research_agent/container_builder.py +0 -328
  138. synth_ai/sdk/research_agent/container_spec.py +0 -198
  139. synth_ai/sdk/research_agent/defaults.py +0 -34
  140. synth_ai/sdk/research_agent/results_collector.py +0 -69
  141. synth_ai/sdk/specs/__init__.py +0 -46
  142. synth_ai/sdk/specs/dataclasses.py +0 -149
  143. synth_ai/sdk/specs/loader.py +0 -144
  144. synth_ai/sdk/specs/serializer.py +0 -199
  145. synth_ai/sdk/specs/validation.py +0 -250
  146. synth_ai/sdk/tracing/__init__.py +0 -39
  147. synth_ai/sdk/usage/__init__.py +0 -37
  148. synth_ai/sdk/usage/client.py +0 -171
  149. synth_ai/sdk/usage/models.py +0 -261
  150. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/WHEEL +0 -0
  151. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/entry_points.txt +0 -0
  152. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/licenses/LICENSE +0 -0
  153. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/top_level.txt +0 -0
@@ -1,636 +0,0 @@
1
- """CLI command for baseline evaluation."""
2
-
3
- from __future__ import annotations
4
-
5
- import asyncio
6
- import json
7
- import time
8
- from datetime import datetime
9
- from pathlib import Path
10
- from typing import Any, Optional, Sequence
11
-
12
- import click
13
-
14
- from synth_ai.sdk.baseline.config import BaselineResults
15
- from synth_ai.sdk.baseline.discovery import (
16
- BASELINE_FILE_PATTERNS,
17
- BaselineChoice,
18
- discover_baseline_files,
19
- load_baseline_config_from_file,
20
- )
21
- from synth_ai.sdk.baseline.execution import aggregate_results, run_baseline_evaluation
22
-
23
-
24
class BaselineGroup(click.Group):
    """Custom group that allows positional arguments (baseline_id) even when subcommands exist.

    The group overrides three Click hooks:

    * ``make_context`` records the argument list in ``ctx.meta['_original_args']``.
    * ``resolve_command`` treats a first argument that is not a registered
      subcommand and does not start with ``--`` as a ``baseline_id``.
    * ``invoke`` dispatches to the group callback (with ``baseline_id``
      injected) or to a real subcommand.
    """

    def make_context(
        self,
        info_name: str | None,
        args: list[str],
        parent: click.Context | None = None,
        **extra,
    ) -> click.Context:
        """Override make_context to store original args before Click parses them.

        Returns the context built by ``click.Group.make_context`` with a copy
        of ``args`` stored under ``ctx.meta['_original_args']``.
        """
        # Store original args in the context's meta
        # NOTE(review): the copy below is taken AFTER super().make_context()
        # has returned. If Click consumes ``args`` in place while parsing,
        # this snapshot is not the original argument list -- confirm.
        ctx = super().make_context(info_name, args, parent, **extra)
        ctx.meta['_original_args'] = args.copy() if isinstance(args, list) else list(args)
        return ctx

    def resolve_command(self, ctx: click.Context, args: list[str]) -> tuple[str | None, click.Command | None, list[str]]:
        """Resolve command, checking if first arg is a subcommand or baseline_id.

        Returns a ``(name, command, remaining_args)`` triple. When the first
        argument is treated as a baseline id, the returned command is a
        synthetic ``_baseline_wrapper`` command whose callback forwards to the
        group callback with ``baseline_id`` injected.

        Raises:
            click.ClickException: if a baseline id is detected but the group
                has no callback.
        """

        # Check if first arg is a known subcommand
        if args and not args[0].startswith('--'):
            first_arg = args[0]
            if first_arg in self.commands:
                # It's a known subcommand, let Click handle it normally
                cmd_name, cmd, remaining = super().resolve_command(ctx, args)
                return cmd_name, cmd, remaining

        # Not a subcommand - this means baseline_id is a positional argument
        # Store baseline_id in ctx for the callback to access
        if args and not args[0].startswith('--'):
            baseline_id = args[0]
            ctx.meta['baseline_id'] = baseline_id
            # Remove baseline_id from args so Click doesn't try to parse it
            remaining_args = args[1:]

            # Create a wrapper function that injects baseline_id into the callback
            original_callback = self.callback
            if original_callback is None:
                raise click.ClickException("Command callback is None")
            def wrapper_callback(ctx, **kwargs):
                # Inject baseline_id into kwargs
                kwargs['baseline_id'] = baseline_id
                return original_callback(ctx, **kwargs)

            # Create a wrapper command with the modified callback
            # Filter out baseline_id from params since we're injecting it manually
            filtered_params = [p for p in self.params if getattr(p, 'name', None) != 'baseline_id']
            wrapper_cmd = click.Command(
                name="_baseline_wrapper",  # Use a different name to avoid confusion
                callback=wrapper_callback,
                params=filtered_params,
                context_settings=self.context_settings,
            )
            return "_baseline_wrapper", wrapper_cmd, remaining_args

        # No args or args start with --, so no baseline_id
        # Let Click handle it normally (will invoke main callback if invoke_without_command=True)
        cmd_name, cmd, remaining = super().resolve_command(ctx, args)
        return cmd_name, cmd, remaining

    def invoke(self, ctx: click.Context) -> Any:
        """Invoke command, handling baseline_id as positional arg.

        Dispatch order, first match wins:

        1. ``ctx.params`` already holds a truthy ``baseline_id``.
        2. No recorded arguments: invoke the group callback.
        3. ``resolve_command`` flagged a baseline id in ``ctx.meta``.
        4. ``resolve_command`` returned a subcommand: invoke it.
        5. Otherwise call the group callback with the context only.

        Raises:
            click.ClickException: whenever the group callback is needed but is
                ``None``.
        """
        # Check if baseline_id is in ctx.params (Click might have parsed it)
        if 'baseline_id' in ctx.params and ctx.params['baseline_id']:
            # NOTE(review): this local is assigned but not read in this branch.
            baseline_id = ctx.params['baseline_id']
            # Invoke callback with baseline_id from params
            if self.callback is None:
                raise click.ClickException("Command callback is None")
            # The callback is called directly here, not through ctx.invoke.
            return self.callback(ctx, **ctx.params)

        # Manually call resolve_command with full args (including baseline_id if present)
        # Try to get the original args from ctx.meta (stored in make_context())
        full_args = ctx.meta.get('_original_args', ctx.args)

        # If no args, invoke callback directly (invoke_without_command=True behavior)
        if not full_args:
            if self.callback is None:
                raise click.ClickException("Command callback is None")
            return ctx.invoke(self.callback, **ctx.params)

        cmd_name, cmd, resolved_args = self.resolve_command(ctx, full_args)

        # Check if baseline_id was detected
        if 'baseline_id' in ctx.meta:
            baseline_id = ctx.meta['baseline_id']
            # Parse options from resolved_args - don't use OptionParser, just use Click's make_context
            # Create a temporary context to parse the options
            # NOTE(review): this goes through the overridden make_context above,
            # which writes '_original_args' into the new context's meta again --
            # confirm that this does not disturb the outer context.
            temp_ctx = self.make_context(self.name, resolved_args, parent=ctx.parent, allow_extra_args=True, allow_interspersed_args=False)
            params = temp_ctx.params.copy()
            params['baseline_id'] = baseline_id
            # Don't pass ctx explicitly - Click's @click.pass_context decorator injects it
            # Use ctx.invoke to properly call the callback with the right context
            if self.callback is None:
                raise click.ClickException("Command callback is None")
            return ctx.invoke(self.callback, **params)

        # Normal flow - if it's a subcommand, invoke it
        if cmd and cmd is not self and isinstance(cmd, click.Command):
            with cmd.make_context(cmd_name, resolved_args, parent=ctx) as sub_ctx:
                return cmd.invoke(sub_ctx)

        # No baseline_id and no subcommand - invoke callback if invoke_without_command=True
        if self.callback is None:
            raise click.ClickException("Command callback is None")
        return self.callback(ctx)
129
-
130
-
131
- __all__ = ["command"]
132
-
133
def _select_baseline_interactive(choices: list[BaselineChoice]) -> Optional[str]:
    """Pick one baseline id out of the discovered candidates.

    An empty list yields ``None`` and a single candidate is returned without
    prompting. Otherwise the candidates are listed and the user is asked for
    a 1-based index until a valid one is entered. Aborting the prompt
    (``click.Abort`` or ``KeyboardInterrupt``) yields ``None``.
    """
    if not choices:
        return None

    total = len(choices)
    if total == 1:
        return choices[0].baseline_id

    click.echo("\nFound multiple baseline files:")
    for number, candidate in enumerate(choices, start=1):
        click.echo(f" {number}. {candidate.baseline_id} ({candidate.path})")

    while True:
        try:
            picked = click.prompt("Select baseline", type=int)
            if not 1 <= picked <= total:
                # Out of range: tell the user and ask again.
                click.echo(f"Please enter a number between 1 and {total}")
                continue
            return choices[picked - 1].baseline_id
        except (click.Abort, KeyboardInterrupt):
            return None
153
-
154
def _parse_seeds(seeds_str: Optional[str]) -> Optional[list[int]]:
    """Turn a comma-separated seed string into a list of ints.

    ``None`` or an empty string yields ``None``. Blank items between commas
    are skipped, so the result may be an empty list.

    Raises:
        click.ClickException: if any non-blank item is not an integer.
    """
    if not seeds_str:
        return None

    parsed: list[int] = []
    try:
        for piece in seeds_str.split(","):
            token = piece.strip()
            if token:
                parsed.append(int(token))
    except ValueError as exc:
        raise click.ClickException(f"Invalid seeds format: {seeds_str}. Expected comma-separated integers.") from exc
    return parsed
163
-
164
def _parse_splits(splits_str: str) -> list[str]:
    """Split a comma-separated list of split names, dropping blank entries."""
    trimmed = (part.strip() for part in splits_str.split(","))
    return [name for name in trimmed if name]
167
-
168
@click.group(
    "baseline",
    help="Run self-contained task evaluation using a baseline file.",
    invoke_without_command=True,
    cls=BaselineGroup,
)
@click.pass_context
# baseline_id is deliberately NOT declared as a click.argument here: it would
# be consumed before BaselineGroup.resolve_command() gets to see it.
@click.option("--split", default="train", help="Data split(s) to evaluate (comma-separated). Default: train")
@click.option("--seeds", default=None, help="Comma-separated seeds to evaluate (overrides split defaults)")
@click.option("--model", default=None, help="Model identifier (overrides default_policy_config)")
@click.option("--temperature", type=float, default=None, help="Sampling temperature (overrides default_policy_config)")
@click.option("--policy-config", type=str, default=None, help="JSON string with policy config overrides")
@click.option("--env-config", type=str, default=None, help="JSON string with env config overrides")
@click.option("--output", type=click.Path(), default=None, help="Save results to JSON file")
@click.option("--trace-db", default=None, help="SQLite/Turso URL for storing traces (set to 'none' to disable)")
@click.option("--concurrency", type=int, default=4, help="Maximum concurrent task executions")
@click.option("--env-file", multiple=True, type=click.Path(), help="Environment file(s) to load (for API keys, etc.)")
@click.option("--verbose", is_flag=True, help="Enable verbose output")
def command(
    ctx: click.Context,
    baseline_id: str | None = None,
    split: str = "train",
    seeds: str | None = None,
    model: str | None = None,
    temperature: float | None = None,
    policy_config: str | None = None,
    env_config: str | None = None,
    output: str | None = None,
    trace_db: str | None = None,
    concurrency: int = 4,
    env_file: Sequence[str] = (),
    verbose: bool = False,
) -> None:
    """Run baseline evaluation."""
    # Group callback: does nothing when Click is about to run a subcommand,
    # otherwise runs the evaluation for ``baseline_id`` (which is injected by
    # BaselineGroup rather than parsed as a declared argument).
    if ctx.invoked_subcommand is not None:
        return

    # Defensive: a baseline_id that is really a subcommand name is forwarded
    # to that subcommand with the already-parsed parameters.
    group = ctx.command
    if baseline_id and isinstance(group, click.Group) and baseline_id in group.commands:
        target = group.get_command(ctx, baseline_id)
        if target:
            return ctx.invoke(target, **ctx.params)

    destination = Path(output) if output else None
    evaluation = _baseline_command_impl(
        baseline_id=baseline_id,
        split=split,
        seeds=seeds,
        model=model,
        temperature=temperature,
        policy_config_json=policy_config,
        env_config_json=env_config,
        output_path=destination,
        trace_db_url=trace_db,
        concurrency=concurrency,
        env_files=env_file,
        verbose=verbose,
    )
    asyncio.run(evaluation)
285
-
286
@command.command("run")
@click.argument("baseline_id", type=str, required=False)
@click.option("--split", default="train", help="Data split(s) to evaluate (comma-separated). Default: train")
@click.option("--seeds", default=None, help="Comma-separated seeds to evaluate (overrides split defaults)")
@click.option("--model", default=None, help="Model identifier (overrides default_policy_config)")
@click.option("--temperature", type=float, default=None, help="Sampling temperature (overrides default_policy_config)")
@click.option("--policy-config", type=str, default=None, help="JSON string with policy config overrides")
@click.option("--env-config", type=str, default=None, help="JSON string with env config overrides")
@click.option("--output", type=click.Path(), default=None, help="Save results to JSON file")
@click.option("--trace-db", default=None, help="SQLite/Turso URL for storing traces (set to 'none' to disable)")
@click.option("--concurrency", type=int, default=4, help="Maximum concurrent task executions")
@click.option("--env-file", multiple=True, type=click.Path(), help="Environment file(s) to load (for API keys, etc.)")
@click.option("--verbose", is_flag=True, help="Enable verbose output")
def run_command(
    baseline_id: str | None,
    split: str,
    seeds: str | None,
    model: str | None,
    temperature: float | None,
    policy_config: str | None,
    env_config: str | None,
    output: str | None,
    trace_db: str | None,
    concurrency: int,
    env_file: Sequence[str],
    verbose: bool,
) -> None:
    """Run baseline evaluation."""
    # Explicit ``baseline run [BASELINE_ID]`` subcommand: forwards the parsed
    # options to the shared async implementation and blocks until it finishes.
    destination = Path(output) if output else None
    evaluation = _baseline_command_impl(
        baseline_id=baseline_id,
        split=split,
        seeds=seeds,
        model=model,
        temperature=temperature,
        policy_config_json=policy_config,
        env_config_json=env_config,
        output_path=destination,
        trace_db_url=trace_db,
        concurrency=concurrency,
        env_files=env_file,
        verbose=verbose,
    )
    asyncio.run(evaluation)
380
-
381
def _apply_json_overrides(
    target: dict[str, Any],
    raw_json: str | None,
    option_name: str,
    example: str,
) -> None:
    """Merge a JSON-object string from a CLI option into ``target`` in place.

    Args:
        target: config dict that receives the overrides.
        raw_json: raw option value; ``None`` or empty leaves ``target`` untouched.
        option_name: option spelling used in error messages (e.g. ``--env-config``).
        example: sample JSON object shown in error messages.

    Raises:
        click.ClickException: if the value is not valid JSON, or is valid JSON
            that is not an object (``dict.update`` would otherwise fail with
            an uncaught TypeError/ValueError).
    """
    if not raw_json:
        return
    try:
        overrides = json.loads(raw_json)
    except json.JSONDecodeError as e:
        raise click.ClickException(
            f"❌ Invalid {option_name} JSON\n"
            f" Provided: {raw_json[:100]}...\n"
            f" Error: {str(e)}\n"
            f" Expected: Valid JSON object (e.g., '{example}')"
        ) from e
    if not isinstance(overrides, dict):
        raise click.ClickException(
            f"❌ Invalid {option_name} JSON\n"
            f" Provided: {raw_json[:100]}...\n"
            f" Error: expected a JSON object, got {type(overrides).__name__}\n"
            f" Expected: Valid JSON object (e.g., '{example}')"
        )
    target.update(overrides)


async def _baseline_command_impl(
    baseline_id: str | None,
    split: str,
    seeds: str | None,
    model: str | None,
    temperature: float | None,
    policy_config_json: str | None,
    env_config_json: str | None,
    output_path: Path | None,
    trace_db_url: str | None,
    concurrency: int,
    env_files: Sequence[str],
    verbose: bool,
) -> None:
    """Implementation of baseline command.

    Steps: load env files, discover baseline files under the current working
    directory, resolve the requested baseline (prompting when no id is
    given), load its config, validate splits and seeds, merge policy/env
    overrides, run the evaluation, print a summary and optionally write the
    results as JSON.

    Args:
        baseline_id: id of the baseline to run; ``None`` triggers interactive
            selection.
        split: comma-separated split names.
        seeds: comma-separated integer seeds; when empty, all seeds of the
            selected splits are used.
        model: overrides ``model`` in the default policy config.
        temperature: overrides ``temperature`` in the default policy config.
        policy_config_json: JSON object merged into the policy config last.
        env_config_json: JSON object merged into the env config.
        output_path: file that receives the results JSON, if given.
        trace_db_url: trace database URL; ``none``/``off`` disables tracing.
        concurrency: passed through to ``run_baseline_evaluation``.
        env_files: dotenv files loaded without overriding existing variables.
        verbose: when true, the traceback of an unexpected failure is printed
            to stderr before the summarised error is raised.

    Raises:
        click.ClickException: for every user-facing failure.
    """
    import traceback

    # Load environment files if provided
    if env_files:
        try:
            from dotenv import load_dotenv
        except ImportError:
            click.echo("Warning: python-dotenv not installed, skipping --env-file", err=True)
        else:
            for env_file in env_files:
                load_dotenv(env_file, override=False)

    # 1. Discovery
    search_roots = [Path.cwd()]
    choices = discover_baseline_files(search_roots)

    if not choices:
        search_dirs = [str(root) for root in search_roots]
        raise click.ClickException(
            f"❌ No baseline files found\n"
            f" Searched in: {', '.join(search_dirs)}\n"
            f" Patterns: {', '.join(BASELINE_FILE_PATTERNS)}\n"
            f" Create baseline files in:\n"
            f" - examples/baseline/*.py\n"
            f" - **/*_baseline.py (anywhere in the tree)\n"
            f" Example: Create examples/baseline/my_task_baseline.py\n"
            f" See: https://docs.usesynth.ai/baseline for more info"
        )

    if baseline_id is None:
        selected_id = _select_baseline_interactive(choices)
        if selected_id is None:
            raise click.ClickException(
                "❌ No baseline selected\n"
                " Run with a baseline ID: synth-ai baseline <baseline_id>\n"
                " Or use: synth-ai baseline list to see available baselines"
            )
        baseline_id = selected_id

    # Find matching baseline
    matching = [c for c in choices if c.baseline_id == baseline_id]
    if not matching:
        available = sorted({c.baseline_id for c in choices})
        # Suggest ids that contain, or are contained in, the requested id
        # (case-insensitive substring match).
        wanted = baseline_id.lower()
        close_matches = [
            bid for bid in available
            if wanted in bid.lower() or bid.lower() in wanted
        ]

        error_msg = (
            f"❌ Baseline '{baseline_id}' not found\n"
            f" Available baselines ({len(available)}): {', '.join(available)}"
        )

        if close_matches:
            error_msg += f"\n Did you mean: {', '.join(close_matches[:3])}?"

        error_msg += "\n Use 'synth-ai baseline list' to see all baselines with details"

        raise click.ClickException(error_msg)

    choice = matching[0]

    # 2. Load config
    try:
        config = load_baseline_config_from_file(baseline_id, choice.path)
    except (ImportError, ValueError) as e:
        # The message is passed through unchanged for these two error types.
        raise click.ClickException(str(e)) from e
    except Exception as e:
        if verbose:
            # The tip below promises more detail under --verbose; provide it.
            click.echo(traceback.format_exc(), err=True)
        error_type = type(e).__name__
        raise click.ClickException(
            f"❌ Unexpected error loading baseline '{baseline_id}'\n"
            f" File: {choice.path}\n"
            f" Error: {error_type}: {str(e)}\n"
            f" Tip: Run with --verbose for more details"
        ) from e

    # 3. Validate split
    split_names = _parse_splits(split)
    for split_name in split_names:
        if split_name not in config.splits:
            available_splits = sorted(config.splits.keys())
            if not available_splits:
                # Avoid an IndexError on available_splits[0] when the
                # baseline defines no splits at all.
                raise click.ClickException(
                    f"❌ Invalid split '{split_name}' for baseline '{baseline_id}'\n"
                    f" Available splits: (none defined)"
                )
            raise click.ClickException(
                f"❌ Invalid split '{split_name}' for baseline '{baseline_id}'\n"
                f" Available splits: {', '.join(available_splits)}\n"
                f" Use: --split {available_splits[0]} (or comma-separated: --split {','.join(available_splits)})"
            )

    # 4. Determine seeds
    if seeds:
        try:
            seed_list = _parse_seeds(seeds)
        except ValueError as e:
            raise click.ClickException(
                f"❌ Invalid seeds format\n"
                f" Provided: '{seeds}'\n"
                f" Expected: comma-separated integers (e.g., '0,1,2' or '10,20,30')\n"
                f" Error: {str(e)}"
            ) from e
        if not seed_list:
            raise click.ClickException(
                f"❌ No valid seeds provided\n"
                f" Provided: '{seeds}'\n"
                f" Expected: comma-separated integers (e.g., '0,1,2')"
            )
    else:
        # Use all seeds from specified splits
        seed_list = []
        for split_name in split_names:
            seed_list.extend(config.splits[split_name].seeds)

        if not seed_list:
            split_info = [
                f"{split_name} ({len(config.splits[split_name].seeds)} seeds)"
                for split_name in split_names
            ]
            raise click.ClickException(
                f"❌ No seeds found for split(s): {', '.join(split_names)}\n"
                f" Split details: {', '.join(split_info)}\n"
                f" This may indicate an empty split configuration\n"
                f" Fix: Use --seeds to specify seeds manually (e.g., --seeds 0,1,2)"
            )

    # 5. Merge configs
    # Precedence for the policy config, lowest to highest: baseline defaults,
    # --model / --temperature, then --policy-config JSON.
    policy_config = {**config.default_policy_config}
    if model:
        policy_config["model"] = model
    if temperature is not None:
        policy_config["temperature"] = temperature
    _apply_json_overrides(
        policy_config,
        policy_config_json,
        "--policy-config",
        '{"model": "gpt-4o", "temperature": 0.7}',
    )

    env_config = {**config.default_env_config}
    _apply_json_overrides(
        env_config,
        env_config_json,
        "--env-config",
        '{"max_steps": 1000}',
    )

    # Handle split-specific env config
    # (applied after --env-config, so split metadata wins on key conflicts).
    for split_name in split_names:
        split_config = config.splits[split_name]
        if split_config.metadata:
            env_config.update(split_config.metadata)

    # 6. Setup trace storage (if requested)
    # NOTE(review): the tracer is initialized but never closed or written to
    # in this function -- confirm whether SessionTracer needs explicit cleanup.
    tracer = None
    if trace_db_url and trace_db_url.lower() not in {"none", "off"}:
        from synth_ai.core.tracing_v3.session_tracer import SessionTracer
        tracer = SessionTracer(db_url=trace_db_url, auto_save=True)
        await tracer.initialize()

    # 7. Execute tasks
    click.echo(f"Running {len(seed_list)} tasks across {len(split_names)} split(s)...")
    click.echo(f"Model: {policy_config.get('model', 'default')}")
    click.echo(f"Concurrency: {concurrency}")

    start_time = time.perf_counter()
    try:
        results = await run_baseline_evaluation(
            config=config,
            seeds=seed_list,
            policy_config=policy_config,
            env_config=env_config,
            concurrency=concurrency,
        )
    except Exception as e:
        if verbose:
            # The tip below promises detailed output under --verbose.
            click.echo(traceback.format_exc(), err=True)
        error_type = type(e).__name__
        raise click.ClickException(
            f"❌ Error running baseline evaluation\n"
            f" Baseline: {baseline_id}\n"
            f" Tasks: {len(seed_list)} seeds\n"
            f" Error: {error_type}: {str(e)}\n"
            f" Common causes:\n"
            f" - Missing dependencies (check baseline file imports)\n"
            f" - API key not set (check environment variables)\n"
            f" - Model/inference configuration issues\n"
            f" Tip: Run with --verbose for detailed error output"
        ) from e

    elapsed = time.perf_counter() - start_time

    # Store traces if requested
    if tracer:
        for result in results:
            if result.trace:
                # Store trace (simplified - would need proper trace storage logic)
                pass

    # 8. Aggregate results
    aggregate_metrics = aggregate_results(config, results)

    # 9. Create output
    baseline_results = BaselineResults(
        config=config,
        split_name=",".join(split_names),
        results=results,
        aggregate_metrics=aggregate_metrics,
        execution_time_seconds=elapsed,
        model_name=policy_config.get("model", "unknown"),
        timestamp=datetime.now().isoformat(),
    )

    # 10. Display summary
    click.echo("\n" + "=" * 60)
    click.echo(f"Baseline Evaluation: {config.name}")
    click.echo("=" * 60)
    click.echo(f"Split(s): {baseline_results.split_name}")
    click.echo(f"Tasks: {len(results)}")
    click.echo(f"Success: {sum(1 for r in results if r.success)}/{len(results)}")
    click.echo(f"Execution time: {elapsed:.2f}s")
    click.echo("\nAggregate Metrics:")
    for key, value in aggregate_metrics.items():
        if isinstance(value, float):
            click.echo(f" {key}: {value:.4f}")
        else:
            click.echo(f" {key}: {value}")

    # 11. Save output if requested
    if output_path:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(json.dumps(baseline_results.to_dict(), indent=2))
        click.echo(f"\nResults saved to: {output_path}")