synth-ai 0.4.1__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (153) hide show
  1. synth_ai/__init__.py +13 -13
  2. synth_ai/cli/__init__.py +6 -15
  3. synth_ai/cli/commands/eval/__init__.py +6 -15
  4. synth_ai/cli/commands/eval/config.py +338 -0
  5. synth_ai/cli/commands/eval/core.py +236 -1091
  6. synth_ai/cli/commands/eval/runner.py +704 -0
  7. synth_ai/cli/commands/eval/validation.py +44 -117
  8. synth_ai/cli/commands/filter/core.py +7 -7
  9. synth_ai/cli/commands/filter/validation.py +2 -2
  10. synth_ai/cli/commands/smoke/core.py +7 -17
  11. synth_ai/cli/commands/status/__init__.py +1 -64
  12. synth_ai/cli/commands/status/client.py +50 -151
  13. synth_ai/cli/commands/status/config.py +3 -83
  14. synth_ai/cli/commands/status/errors.py +4 -13
  15. synth_ai/cli/commands/status/subcommands/__init__.py +2 -8
  16. synth_ai/cli/commands/status/subcommands/config.py +13 -0
  17. synth_ai/cli/commands/status/subcommands/files.py +18 -63
  18. synth_ai/cli/commands/status/subcommands/jobs.py +28 -311
  19. synth_ai/cli/commands/status/subcommands/models.py +18 -62
  20. synth_ai/cli/commands/status/subcommands/runs.py +16 -63
  21. synth_ai/cli/commands/status/subcommands/session.py +67 -172
  22. synth_ai/cli/commands/status/subcommands/summary.py +24 -32
  23. synth_ai/cli/commands/status/subcommands/utils.py +41 -0
  24. synth_ai/cli/commands/status/utils.py +16 -107
  25. synth_ai/cli/commands/train/__init__.py +18 -20
  26. synth_ai/cli/commands/train/errors.py +3 -3
  27. synth_ai/cli/commands/train/prompt_learning_validation.py +15 -16
  28. synth_ai/cli/commands/train/validation.py +7 -7
  29. synth_ai/cli/commands/train/{judge_schemas.py → verifier_schemas.py} +33 -34
  30. synth_ai/cli/commands/train/verifier_validation.py +235 -0
  31. synth_ai/cli/demo_apps/demo_task_apps/math/config.toml +0 -1
  32. synth_ai/cli/demo_apps/demo_task_apps/math/modal_task_app.py +2 -6
  33. synth_ai/cli/demo_apps/math/config.toml +0 -1
  34. synth_ai/cli/demo_apps/math/modal_task_app.py +2 -6
  35. synth_ai/cli/demo_apps/mipro/task_app.py +25 -47
  36. synth_ai/cli/lib/apps/task_app.py +12 -13
  37. synth_ai/cli/lib/task_app_discovery.py +6 -6
  38. synth_ai/cli/lib/train_cfgs.py +10 -10
  39. synth_ai/cli/task_apps/__init__.py +11 -0
  40. synth_ai/cli/task_apps/commands.py +7 -15
  41. synth_ai/core/env.py +12 -1
  42. synth_ai/core/errors.py +1 -2
  43. synth_ai/core/integrations/cloudflare.py +209 -33
  44. synth_ai/core/tracing_v3/abstractions.py +46 -0
  45. synth_ai/data/__init__.py +3 -30
  46. synth_ai/data/enums.py +1 -20
  47. synth_ai/data/rewards.py +100 -3
  48. synth_ai/products/graph_evolve/__init__.py +1 -2
  49. synth_ai/products/graph_evolve/config.py +16 -16
  50. synth_ai/products/graph_evolve/converters/__init__.py +3 -3
  51. synth_ai/products/graph_evolve/converters/openai_sft.py +7 -7
  52. synth_ai/products/graph_evolve/examples/hotpotqa/config.toml +1 -1
  53. synth_ai/products/graph_gepa/__init__.py +23 -0
  54. synth_ai/products/graph_gepa/converters/__init__.py +19 -0
  55. synth_ai/products/graph_gepa/converters/openai_sft.py +29 -0
  56. synth_ai/sdk/__init__.py +45 -35
  57. synth_ai/sdk/api/eval/__init__.py +33 -0
  58. synth_ai/sdk/api/eval/job.py +732 -0
  59. synth_ai/sdk/api/research_agent/__init__.py +276 -66
  60. synth_ai/sdk/api/train/builders.py +181 -0
  61. synth_ai/sdk/api/train/cli.py +41 -33
  62. synth_ai/sdk/api/train/configs/__init__.py +6 -4
  63. synth_ai/sdk/api/train/configs/prompt_learning.py +127 -33
  64. synth_ai/sdk/api/train/configs/rl.py +264 -16
  65. synth_ai/sdk/api/train/configs/sft.py +165 -1
  66. synth_ai/sdk/api/train/graph_validators.py +12 -12
  67. synth_ai/sdk/api/train/graphgen.py +169 -51
  68. synth_ai/sdk/api/train/graphgen_models.py +95 -45
  69. synth_ai/sdk/api/train/local_api.py +10 -0
  70. synth_ai/sdk/api/train/pollers.py +36 -0
  71. synth_ai/sdk/api/train/prompt_learning.py +390 -60
  72. synth_ai/sdk/api/train/rl.py +41 -5
  73. synth_ai/sdk/api/train/sft.py +2 -0
  74. synth_ai/sdk/api/train/task_app.py +20 -0
  75. synth_ai/sdk/api/train/validators.py +17 -17
  76. synth_ai/sdk/graphs/completions.py +239 -33
  77. synth_ai/sdk/{judging/schemas.py → graphs/verifier_schemas.py} +23 -23
  78. synth_ai/sdk/learning/__init__.py +35 -5
  79. synth_ai/sdk/learning/context_learning_client.py +531 -0
  80. synth_ai/sdk/learning/context_learning_types.py +294 -0
  81. synth_ai/sdk/learning/prompt_learning_client.py +1 -1
  82. synth_ai/sdk/learning/prompt_learning_types.py +2 -1
  83. synth_ai/sdk/learning/rl/__init__.py +0 -4
  84. synth_ai/sdk/learning/rl/contracts.py +0 -4
  85. synth_ai/sdk/localapi/__init__.py +40 -0
  86. synth_ai/sdk/localapi/apps/__init__.py +28 -0
  87. synth_ai/sdk/localapi/client.py +10 -0
  88. synth_ai/sdk/localapi/contracts.py +10 -0
  89. synth_ai/sdk/localapi/helpers.py +519 -0
  90. synth_ai/sdk/localapi/rollouts.py +93 -0
  91. synth_ai/sdk/localapi/server.py +29 -0
  92. synth_ai/sdk/localapi/template.py +49 -0
  93. synth_ai/sdk/streaming/handlers.py +6 -6
  94. synth_ai/sdk/streaming/streamer.py +10 -6
  95. synth_ai/sdk/task/__init__.py +18 -5
  96. synth_ai/sdk/task/apps/__init__.py +37 -1
  97. synth_ai/sdk/task/client.py +9 -1
  98. synth_ai/sdk/task/config.py +6 -11
  99. synth_ai/sdk/task/contracts.py +137 -95
  100. synth_ai/sdk/task/in_process.py +32 -22
  101. synth_ai/sdk/task/in_process_runner.py +9 -4
  102. synth_ai/sdk/task/rubrics/__init__.py +2 -3
  103. synth_ai/sdk/task/rubrics/loaders.py +4 -4
  104. synth_ai/sdk/task/rubrics/strict.py +3 -4
  105. synth_ai/sdk/task/server.py +76 -16
  106. synth_ai/sdk/task/trace_correlation_helpers.py +190 -139
  107. synth_ai/sdk/task/validators.py +34 -49
  108. synth_ai/sdk/training/__init__.py +7 -16
  109. synth_ai/sdk/tunnels/__init__.py +118 -0
  110. synth_ai/sdk/tunnels/cleanup.py +83 -0
  111. synth_ai/sdk/tunnels/ports.py +120 -0
  112. synth_ai/sdk/tunnels/tunneled_api.py +363 -0
  113. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/METADATA +71 -4
  114. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/RECORD +118 -128
  115. synth_ai/cli/commands/baseline/__init__.py +0 -12
  116. synth_ai/cli/commands/baseline/core.py +0 -636
  117. synth_ai/cli/commands/baseline/list.py +0 -94
  118. synth_ai/cli/commands/eval/errors.py +0 -81
  119. synth_ai/cli/commands/status/formatters.py +0 -164
  120. synth_ai/cli/commands/status/subcommands/pricing.py +0 -23
  121. synth_ai/cli/commands/status/subcommands/usage.py +0 -203
  122. synth_ai/cli/commands/train/judge_validation.py +0 -305
  123. synth_ai/cli/usage.py +0 -159
  124. synth_ai/data/specs.py +0 -36
  125. synth_ai/sdk/api/research_agent/cli.py +0 -428
  126. synth_ai/sdk/api/research_agent/config.py +0 -357
  127. synth_ai/sdk/api/research_agent/job.py +0 -717
  128. synth_ai/sdk/baseline/__init__.py +0 -25
  129. synth_ai/sdk/baseline/config.py +0 -209
  130. synth_ai/sdk/baseline/discovery.py +0 -216
  131. synth_ai/sdk/baseline/execution.py +0 -154
  132. synth_ai/sdk/judging/__init__.py +0 -15
  133. synth_ai/sdk/judging/base.py +0 -24
  134. synth_ai/sdk/judging/client.py +0 -191
  135. synth_ai/sdk/judging/types.py +0 -42
  136. synth_ai/sdk/research_agent/__init__.py +0 -34
  137. synth_ai/sdk/research_agent/container_builder.py +0 -328
  138. synth_ai/sdk/research_agent/container_spec.py +0 -198
  139. synth_ai/sdk/research_agent/defaults.py +0 -34
  140. synth_ai/sdk/research_agent/results_collector.py +0 -69
  141. synth_ai/sdk/specs/__init__.py +0 -46
  142. synth_ai/sdk/specs/dataclasses.py +0 -149
  143. synth_ai/sdk/specs/loader.py +0 -144
  144. synth_ai/sdk/specs/serializer.py +0 -199
  145. synth_ai/sdk/specs/validation.py +0 -250
  146. synth_ai/sdk/tracing/__init__.py +0 -39
  147. synth_ai/sdk/usage/__init__.py +0 -37
  148. synth_ai/sdk/usage/client.py +0 -171
  149. synth_ai/sdk/usage/models.py +0 -261
  150. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/WHEEL +0 -0
  151. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/entry_points.txt +0 -0
  152. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/licenses/LICENSE +0 -0
  153. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/top_level.txt +0 -0
@@ -1,25 +0,0 @@
1
- """Baseline file system for self-contained task evaluation.
2
-
3
- This package provides abstractions for defining and executing baseline evaluations
4
- without requiring deployed task apps. Supports both class-based and function-based
5
- task runners with first-class train/val/test split support.
6
- """
7
-
8
- from __future__ import annotations
9
-
10
- from synth_ai.sdk.baseline.config import (
11
- BaselineConfig,
12
- BaselineResults,
13
- BaselineTaskRunner,
14
- DataSplit,
15
- TaskResult,
16
- )
17
-
18
- __all__ = [
19
- "BaselineConfig",
20
- "BaselineTaskRunner",
21
- "DataSplit",
22
- "TaskResult",
23
- "BaselineResults",
24
- ]
25
-
@@ -1,209 +0,0 @@
1
- """Core dataclasses for baseline configuration and results."""
2
-
3
- from __future__ import annotations
4
-
5
- from dataclasses import dataclass, field
6
- from pathlib import Path
7
- from typing import Any, Callable, Dict, List, Optional
8
-
9
-
10
class BaselineTaskRunner:
    """Parent class for class-based baseline task runners.

    Derive from this class and override ``run_task`` to use the class-based
    approach. The function-based alternative is a standalone async function
    and does not involve this class.
    """

    def __init__(
        self,
        policy_config: Dict[str, Any],
        env_config: Dict[str, Any],
    ):
        """Keep both configuration mappings on the instance.

        Args:
            policy_config: Policy configuration (model, temperature, etc.)
            env_config: Environment configuration (max_steps, difficulty, etc.)
        """
        self.policy_config, self.env_config = policy_config, env_config

    async def run_task(self, seed: int) -> "TaskResult":
        """Run one task instance; invoked once per seed of the selected split.

        Args:
            seed: The seed/index identifying the task instance

        Returns:
            TaskResult: Structured result containing success, rewards, metadata, trace

        Raises:
            NotImplementedError: Always, in this base class; subclasses override.
        """
        raise NotImplementedError("Subclasses must implement run_task method")
46
-
47
-
48
@dataclass
class DataSplit:
    """One named data split (train/val/test) and the seeds it contains.

    Attributes:
        name: Split label, e.g. "train", "val" or "test".
        seeds: Seed/index values that make up the split.
        metadata: Optional metadata; a fresh empty dict when omitted.
    """

    name: str
    seeds: List[int]
    metadata: Dict[str, Any] = field(default_factory=dict)
55
-
56
-
57
@dataclass
class TaskResult:
    """Outcome of executing one task instance.

    Attributes:
        seed: Required. Seed/index that was evaluated.
        success: Required. Whether the task completed successfully.
        outcome_reward: Required. Outcome reward for the episode.
        event_rewards: Optional step-level event rewards.
        total_steps: Optional count of steps/turns taken.
        metadata: Optional metadata (achievements, completion info, etc.).
        error: Optional error information when ``success`` is False.
        trace: Optional v3 trace (SessionTrace dict).
    """

    # Required fields
    seed: int
    success: bool
    outcome_reward: float

    # Optional fields; mutable defaults are created per instance
    event_rewards: List[Dict[str, Any]] = field(default_factory=list)
    total_steps: int = 0
    metadata: Dict[str, Any] = field(default_factory=dict)
    error: Optional[str] = None
    trace: Optional[Dict[str, Any]] = None
84
-
85
-
86
# Task runner alias: either a BaselineTaskRunner subclass, or a function called
# with (seed, policy_config, env_config).
TaskRunnerType = type[BaselineTaskRunner] | Callable[[int, dict[str, Any], dict[str, Any]], Any]

# Result aggregator alias: either a class with an aggregate() method, or a
# function taking the list of TaskResult objects and returning a metrics dict.
AggregatorType = type[Any] | Callable[[list[TaskResult]], dict[str, Any]]
97
-
98
-
99
@dataclass
class BaselineConfig:
    """Configuration for a baseline file.

    A baseline file describes how to evaluate a task without a deployed task
    app: the evaluation logic is self-contained and train/val/test splits are
    first-class.

    The task runner may be either:
    - Class-based: a class that inherits from BaselineTaskRunner. It is
      instantiated with policy_config and env_config, and run_task(seed) is
      called for each seed.
    - Function-based: an async function with signature:
        async def task_runner(seed: int, policy_config: Dict[str, Any],
                              env_config: Dict[str, Any]) -> TaskResult

    The result aggregator, when given, may be either:
    - Class-based: a class with an aggregate(results: List[TaskResult]) method;
      it is instantiated and aggregate() is called.
    - Function-based: a function with signature:
        def aggregate_results(results: List[TaskResult]) -> Dict[str, Any]
    """

    # --- Required ---------------------------------------------------------
    # Unique identifier for this baseline config
    baseline_id: str
    # Human-readable name
    name: str
    # Task runner (class or function, see class docstring)
    task_runner: "TaskRunnerType"
    # Data splits (train/val/test), keyed by split name
    splits: "Dict[str, DataSplit]"

    # --- Optional ---------------------------------------------------------
    # Description for documentation
    description: str = ""
    # Default policy configuration
    default_policy_config: Dict[str, Any] = field(default_factory=dict)
    # Default environment configuration
    default_env_config: Dict[str, Any] = field(default_factory=dict)
    # Metadata for filtering/organization
    metadata: Dict[str, Any] = field(default_factory=dict)
    # Tags for filtering and discovery
    tags: List[str] = field(default_factory=list)
    # Custom result aggregator (class or function, see class docstring)
    result_aggregator: "Optional[AggregatorType]" = None
    # Path to this baseline file (set by discovery)
    _source_path: Optional[Path] = None

    def matches_tag(self, tag: str) -> bool:
        """Return True when ``tag`` equals one of ``self.tags``, ignoring case."""
        wanted = tag.lower()
        return any(candidate.lower() == wanted for candidate in self.tags)

    def matches_metadata(self, key: str, value: Any) -> bool:
        """Return True when ``self.metadata`` holds ``value`` under ``key``.

        A missing key is read as ``None``, so it matches ``value=None``.
        """
        stored = self.metadata.get(key)
        return stored == value
164
-
165
-
166
@dataclass
class BaselineResults:
    """Aggregate results from one baseline evaluation run.

    Attributes:
        config: Configuration that was used.
        split_name: Split that was evaluated.
        results: Per-seed results.
        aggregate_metrics: Aggregate metrics.
        execution_time_seconds: Execution metadata: run time in seconds.
        model_name: Execution metadata: model name.
        timestamp: Execution metadata: timestamp string.
    """

    config: "BaselineConfig"
    split_name: str
    results: "List[TaskResult]"
    aggregate_metrics: Dict[str, Any]
    execution_time_seconds: float
    model_name: str
    timestamp: str

    def to_dict(self) -> Dict[str, Any]:
        """Build a plain dictionary suitable for JSON output.

        Each per-seed entry carries seed, success, outcome_reward, total_steps,
        metadata and error; event rewards and traces are not included.
        """
        per_seed = []
        for item in self.results:
            per_seed.append(
                {
                    "seed": item.seed,
                    "success": item.success,
                    "outcome_reward": item.outcome_reward,
                    "total_steps": item.total_steps,
                    "metadata": item.metadata,
                    "error": item.error,
                }
            )

        payload: Dict[str, Any] = {
            "baseline_id": self.config.baseline_id,
            "name": self.config.name,
            "split": self.split_name,
            "model": self.model_name,
            "timestamp": self.timestamp,
            "execution_time_seconds": self.execution_time_seconds,
            "aggregate_metrics": self.aggregate_metrics,
            "results": per_seed,
        }
        return payload
209
-
@@ -1,216 +0,0 @@
1
- """AST-based discovery mechanism for baseline files."""
2
-
3
- from __future__ import annotations
4
-
5
- import ast
6
- import importlib.util
7
- from dataclasses import dataclass
8
- from pathlib import Path
9
- from typing import List, Optional, Tuple
10
-
11
- from synth_ai.sdk.baseline.config import BaselineConfig
12
-
13
# Glob patterns, applied under each search root, that locate baseline files
BASELINE_FILE_PATTERNS = ["**/baseline/*.py", "**/baselines/*.py", "**/*_baseline.py"]

# Path components that cause a file to be skipped during discovery
IGNORE_PATTERNS = {
    # version control and virtual environments
    ".git",
    ".venv",
    "venv",
    # third-party packages and build output
    "node_modules",
    "build",
    "dist",
    # caches
    "__pycache__",
    ".mypy_cache",
    ".pytest_cache",
}
32
-
33
-
34
@dataclass
class BaselineChoice:
    """A baseline configuration found by discovery.

    Attributes:
        baseline_id: Identifier of the baseline.
        path: File in which the baseline was found.
        lineno: Line number associated with the baseline in that file.
        source: Origin label, "discovered" or "registered".
        config: The loaded configuration, or None when not loaded.
    """

    baseline_id: str
    path: Path
    lineno: int
    source: str
    config: "Optional[BaselineConfig]" = None
43
-
44
-
45
- class BaselineConfigVisitor(ast.NodeVisitor):
46
- """AST visitor to find BaselineConfig instances."""
47
-
48
- def __init__(self):
49
- self.matches: List[Tuple[str, int]] = [] # (baseline_id, lineno)
50
-
51
- def visit_Assign(self, node: ast.Assign) -> None:
52
- """Visit assignment statements looking for BaselineConfig."""
53
- if not isinstance(node.value, ast.Call):
54
- self.generic_visit(node)
55
- return
56
-
57
- # Check if right-hand side is BaselineConfig(...)
58
- func = node.value.func
59
- if isinstance(func, ast.Name) and func.id == "BaselineConfig":
60
- # Extract baseline_id from constructor args
61
- baseline_id = self._extract_baseline_id(node.value)
62
- if baseline_id:
63
- self.matches.append((baseline_id, node.lineno))
64
-
65
- self.generic_visit(node)
66
-
67
- def _extract_baseline_id(self, call_node: ast.Call) -> Optional[str]:
68
- """Extract baseline_id from BaselineConfig constructor."""
69
- for keyword in call_node.keywords:
70
- if keyword.arg == "baseline_id" and isinstance(keyword.value, ast.Constant):
71
- val = keyword.value.value
72
- if isinstance(val, str):
73
- return val
74
- return None
75
-
76
-
77
def should_ignore_path(path: Path) -> bool:
    """Tell whether discovery should skip ``path``.

    A path is skipped when any one of its components (``path.parts``) is listed
    in ``IGNORE_PATTERNS``.
    """
    for component in path.parts:
        if component in IGNORE_PATTERNS:
            return True
    return False
80
-
81
-
82
def discover_baseline_files(search_roots: List[Path]) -> "List[BaselineChoice]":
    """Discover baseline files via AST scanning.

    Each existing root is globbed with every pattern in
    ``BASELINE_FILE_PATTERNS``. Candidate files are parsed (never imported) and
    scanned with ``BaselineConfigVisitor``. Files that cannot be read, decoded
    as UTF-8 or parsed are skipped silently, so one bad file does not abort
    discovery.

    Args:
        search_roots: List of root directories to search in

    Returns:
        List of BaselineChoice objects representing discovered baselines, with
        ``source="discovered"`` and resolved paths. Each
        ``(baseline_id, resolved path)`` pair appears at most once.
    """
    results: "List[BaselineChoice]" = []
    seen = set()

    for root in search_roots:
        if not root.exists():
            continue

        for pattern in BASELINE_FILE_PATTERNS:
            for path in root.glob(pattern):
                if should_ignore_path(path):
                    continue

                try:
                    source = path.read_text(encoding="utf-8")
                    tree = ast.parse(source, filename=str(path))
                except (OSError, SyntaxError, ValueError):
                    # ValueError includes UnicodeDecodeError, raised by
                    # read_text() for a file that is not valid UTF-8.
                    continue

                visitor = BaselineConfigVisitor()
                visitor.visit(tree)
                if not visitor.matches:
                    continue

                # Resolve once per file; the same file can match several patterns
                resolved = path.resolve()
                for baseline_id, lineno in visitor.matches:
                    key = (baseline_id, resolved)
                    if key in seen:
                        continue
                    seen.add(key)

                    results.append(
                        BaselineChoice(
                            baseline_id=baseline_id,
                            path=resolved,
                            lineno=lineno,
                            source="discovered",
                        )
                    )

    return results
128
-
129
-
130
def load_baseline_config_from_file(
    baseline_id: str,
    path: Path,
) -> "BaselineConfig":
    """Load a BaselineConfig from a Python file.

    The file is executed as a module named ``baseline_module``, so its
    top-level code runs; only load files you trust. The module's public
    attributes are then searched for a ``BaselineConfig`` instance whose
    ``baseline_id`` matches. The matching instance gets its ``_source_path``
    set to ``path`` before being returned.

    Args:
        baseline_id: The baseline_id to look for
        path: Path to the Python file

    Returns:
        BaselineConfig instance

    Raises:
        ImportError: If executing the file fails because a module is missing
        ValueError: If baseline_id not found or file cannot be loaded
    """
    # Load the module
    spec = importlib.util.spec_from_file_location("baseline_module", path)
    if spec is None or spec.loader is None:
        raise ValueError(f"Cannot load baseline file: {path}")

    module = importlib.util.module_from_spec(spec)
    try:
        spec.loader.exec_module(module)
    except ModuleNotFoundError as e:
        # Prefer the structured attribute over parsing the message text: a
        # message containing an apostrophe (e.g. "can't ...") would otherwise
        # be split in the wrong place.
        missing_module = e.name or str(e)
        raise ImportError(
            f"❌ Missing dependency for baseline '{baseline_id}'\n"
            f" File: {path}\n"
            f" Missing module: {missing_module}\n"
            f" Fix: pip install {missing_module} (or 'uv add {missing_module}')"
        ) from e
    except SyntaxError as e:
        raise ValueError(
            f"❌ Syntax error in baseline file '{baseline_id}'\n"
            f" File: {path}\n"
            f" Error at line {e.lineno}: {e.msg}\n"
            f" Text: {e.text.strip() if e.text else 'N/A'}\n"
            f" Fix: Check the Python syntax in the baseline file"
        ) from e
    except Exception as e:
        error_type = type(e).__name__
        raise ValueError(
            f"❌ Failed to load baseline '{baseline_id}'\n"
            f" File: {path}\n"
            f" Error type: {error_type}\n"
            f" Message: {str(e)}\n"
            f" This may be due to:\n"
            f" - Missing dependencies (check imports)\n"
            f" - Configuration errors in the baseline file\n"
            f" - Environment variables not set\n"
            f" Tip: Run with --verbose for more details"
        ) from e

    # Single pass over public attributes: return on the first match, otherwise
    # remember every baseline id seen so the error message can list them.
    found_configs = []
    for attr_name in dir(module):
        if attr_name.startswith("_"):
            continue

        attr = getattr(module, attr_name)
        if not isinstance(attr, BaselineConfig):
            continue
        if attr.baseline_id == baseline_id:
            # Set source path for reference
            attr._source_path = path
            return attr
        found_configs.append(attr.baseline_id)

    if found_configs:
        raise ValueError(
            f"❌ Baseline '{baseline_id}' not found in {path}\n"
            f" Found baselines in this file: {', '.join(found_configs)}\n"
            f" Fix: Use one of the above baseline IDs or check the baseline_id parameter"
        )
    raise ValueError(
        f"❌ No BaselineConfig instances found in {path}\n"
        f" Expected to find a BaselineConfig with baseline_id='{baseline_id}'\n"
        f" Fix: Ensure the file defines a BaselineConfig instance with baseline_id='{baseline_id}'"
    )
216
-
@@ -1,154 +0,0 @@
1
- """Execution engine for baseline evaluations."""
2
-
3
- from __future__ import annotations
4
-
5
- import asyncio
6
- from typing import Any, Dict, List, Optional
7
-
8
- from synth_ai.sdk.baseline.config import (
9
- BaselineConfig,
10
- BaselineTaskRunner,
11
- TaskResult,
12
- )
13
-
14
-
15
def default_aggregator(results: "List[TaskResult]") -> Dict[str, Any]:
    """Summarise task results with the default metrics.

    Reward statistics (mean, population standard deviation, min, max) are
    computed over successful results only. Counts and the success rate are
    computed over all results.

    Args:
        results: List of TaskResult objects from all seeds

    Returns:
        Dict with aggregate metrics. When no result succeeded (including an
        empty input), all reward statistics and the success rate are 0.0.
    """
    total = len(results)
    succeeded = [item for item in results if item.success]
    rewards = [item.outcome_reward for item in succeeded]

    if not rewards:
        # Nothing succeeded: every task counts as failed
        return {
            "mean_outcome_reward": 0.0,
            "std_outcome_reward": 0.0,
            "min_outcome_reward": 0.0,
            "max_outcome_reward": 0.0,
            "success_rate": 0.0,
            "total_tasks": total,
            "successful_tasks": 0,
            "failed_tasks": total,
        }

    mean = sum(rewards) / len(rewards)
    # Population variance (divide by N, not N - 1)
    spread = sum((reward - mean) ** 2 for reward in rewards) / len(rewards)

    summary: Dict[str, Any] = {
        "mean_outcome_reward": mean,
        "std_outcome_reward": spread ** 0.5,
        "min_outcome_reward": min(rewards),
        "max_outcome_reward": max(rewards),
        "success_rate": len(succeeded) / len(results),
        "total_tasks": total,
        "successful_tasks": len(succeeded),
        "failed_tasks": total - len(succeeded),
    }
    return summary
57
-
58
-
59
def _is_class_based_runner(task_runner: Any) -> bool:
    """Tell whether ``task_runner`` is a BaselineTaskRunner subclass.

    Anything that is not a class (for example a plain function) yields False.
    """
    if not isinstance(task_runner, type):
        return False
    return issubclass(task_runner, BaselineTaskRunner)
65
-
66
-
67
async def run_baseline_evaluation(
    config: "BaselineConfig",
    seeds: List[int],
    policy_config: Dict[str, Any],
    env_config: Dict[str, Any],
    concurrency: int = 4,
) -> "List[TaskResult]":
    """Run baseline evaluation for given seeds.

    A class-based runner is instantiated once with ``policy_config`` and
    ``env_config``, and its ``run_task(seed)`` is awaited per seed. A
    function-based runner is called as ``fn(seed, policy_config, env_config)``
    and may be sync or async. An exception raised while running a seed is
    turned into a failed ``TaskResult`` for that seed instead of propagating.

    Args:
        config: BaselineConfig instance
        seeds: List of seeds to evaluate
        policy_config: Policy configuration (merged from defaults + overrides)
        env_config: Environment configuration (merged from defaults + overrides)
        concurrency: Maximum concurrent task executions; must be at least 1

    Returns:
        List of TaskResult objects, one per seed, in the order of ``seeds``

    Raises:
        ValueError: If ``concurrency`` is less than 1
    """
    # A zero-permit semaphore would make every task wait forever
    if concurrency < 1:
        raise ValueError(f"concurrency must be >= 1, got {concurrency}")

    # Instantiate the runner once if class-based, as documented in
    # BaselineConfig and BaselineTaskRunner
    runner_instance: "Optional[BaselineTaskRunner]" = None
    if _is_class_based_runner(config.task_runner):
        runner_instance = config.task_runner(policy_config, env_config)  # type: ignore[call-arg]

    # Create semaphore for concurrency control
    semaphore = asyncio.Semaphore(concurrency)

    async def run_task(seed: int) -> "TaskResult":
        """Execute a single task with error handling."""
        async with semaphore:
            try:
                # Identity check, not truthiness: a runner defining __bool__ or
                # __len__ could be falsy and must still use the class path.
                if runner_instance is not None:
                    return await runner_instance.run_task(seed)

                # Function-based: call function directly
                task_runner_fn = config.task_runner
                if not callable(task_runner_fn):
                    raise RuntimeError("task_runner is not callable")
                result = task_runner_fn(seed, policy_config, env_config)  # type: ignore[call-arg]
                # Handle both sync and async functions
                if hasattr(result, "__await__"):
                    return await result
                return result
            except Exception as exc:
                # Deliberate best-effort: report the failure as a result
                return TaskResult(
                    seed=seed,
                    success=False,
                    outcome_reward=0.0,
                    error=str(exc),
                )

    # Execute all tasks concurrently; gather preserves the order of seeds
    results = await asyncio.gather(*[run_task(seed) for seed in seeds])
    return list(results)
128
-
129
-
130
def aggregate_results(
    config: "BaselineConfig",
    results: "List[TaskResult]",
) -> Dict[str, Any]:
    """Aggregate results with the configured aggregator, or the default one.

    Args:
        config: BaselineConfig instance
        results: List of TaskResult objects

    Returns:
        Dict with aggregate metrics
    """
    aggregator = config.result_aggregator

    # No custom aggregator configured
    if aggregator is None:
        return default_aggregator(results)

    # Class-based: instantiate without arguments and call aggregate()
    if isinstance(aggregator, type):
        return aggregator().aggregate(results)

    # Function-based: call directly
    return aggregator(results)
154
-
@@ -1,15 +0,0 @@
1
- from .client import JudgeClient, JudgeOptions, JudgeScoreResponse, VerifierClient
2
- from .types import Judgement, RewardJudgement, RewardMetadata, Track, TrackAggregate
3
-
4
- __all__ = [
5
- "JudgeClient",
6
- "VerifierClient",
7
- "JudgeOptions",
8
- "JudgeScoreResponse",
9
- "Judgement",
10
- "RewardJudgement",
11
- "RewardMetadata",
12
- "Track",
13
- "TrackAggregate",
14
- ]
15
-
@@ -1,24 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from abc import ABC, abstractmethod
4
- from typing import Any
5
-
6
-
7
- class Judgement:
8
- def __init__(
9
- self,
10
- criteria: str,
11
- score: float,
12
- reasoning: str = "",
13
- evidence: list[str] | None = None,
14
- ) -> None:
15
- self.criteria = criteria
16
- self.score = score
17
- self.reasoning = reasoning
18
- self.evidence = evidence or []
19
-
20
-
21
class BaseEval(ABC):
    """Abstract interface for evaluations that produce judgements.

    Concrete subclasses must override ``run``; the class cannot be instantiated
    until they do.
    """

    @abstractmethod
    async def run(self, data: Any) -> "list[Judgement]":
        """Execute the evaluation and return a list of judgements.

        Args:
            data: The input to evaluate; its shape is up to the subclass.
        """