synth-ai 0.4.1__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic.
- synth_ai/__init__.py +13 -13
- synth_ai/cli/__init__.py +6 -15
- synth_ai/cli/commands/eval/__init__.py +6 -15
- synth_ai/cli/commands/eval/config.py +338 -0
- synth_ai/cli/commands/eval/core.py +236 -1091
- synth_ai/cli/commands/eval/runner.py +704 -0
- synth_ai/cli/commands/eval/validation.py +44 -117
- synth_ai/cli/commands/filter/core.py +7 -7
- synth_ai/cli/commands/filter/validation.py +2 -2
- synth_ai/cli/commands/smoke/core.py +7 -17
- synth_ai/cli/commands/status/__init__.py +1 -64
- synth_ai/cli/commands/status/client.py +50 -151
- synth_ai/cli/commands/status/config.py +3 -83
- synth_ai/cli/commands/status/errors.py +4 -13
- synth_ai/cli/commands/status/subcommands/__init__.py +2 -8
- synth_ai/cli/commands/status/subcommands/config.py +13 -0
- synth_ai/cli/commands/status/subcommands/files.py +18 -63
- synth_ai/cli/commands/status/subcommands/jobs.py +28 -311
- synth_ai/cli/commands/status/subcommands/models.py +18 -62
- synth_ai/cli/commands/status/subcommands/runs.py +16 -63
- synth_ai/cli/commands/status/subcommands/session.py +67 -172
- synth_ai/cli/commands/status/subcommands/summary.py +24 -32
- synth_ai/cli/commands/status/subcommands/utils.py +41 -0
- synth_ai/cli/commands/status/utils.py +16 -107
- synth_ai/cli/commands/train/__init__.py +18 -20
- synth_ai/cli/commands/train/errors.py +3 -3
- synth_ai/cli/commands/train/prompt_learning_validation.py +15 -16
- synth_ai/cli/commands/train/validation.py +7 -7
- synth_ai/cli/commands/train/{judge_schemas.py → verifier_schemas.py} +33 -34
- synth_ai/cli/commands/train/verifier_validation.py +235 -0
- synth_ai/cli/demo_apps/demo_task_apps/math/config.toml +0 -1
- synth_ai/cli/demo_apps/demo_task_apps/math/modal_task_app.py +2 -6
- synth_ai/cli/demo_apps/math/config.toml +0 -1
- synth_ai/cli/demo_apps/math/modal_task_app.py +2 -6
- synth_ai/cli/demo_apps/mipro/task_app.py +25 -47
- synth_ai/cli/lib/apps/task_app.py +12 -13
- synth_ai/cli/lib/task_app_discovery.py +6 -6
- synth_ai/cli/lib/train_cfgs.py +10 -10
- synth_ai/cli/task_apps/__init__.py +11 -0
- synth_ai/cli/task_apps/commands.py +7 -15
- synth_ai/core/env.py +12 -1
- synth_ai/core/errors.py +1 -2
- synth_ai/core/integrations/cloudflare.py +209 -33
- synth_ai/core/tracing_v3/abstractions.py +46 -0
- synth_ai/data/__init__.py +3 -30
- synth_ai/data/enums.py +1 -20
- synth_ai/data/rewards.py +100 -3
- synth_ai/products/graph_evolve/__init__.py +1 -2
- synth_ai/products/graph_evolve/config.py +16 -16
- synth_ai/products/graph_evolve/converters/__init__.py +3 -3
- synth_ai/products/graph_evolve/converters/openai_sft.py +7 -7
- synth_ai/products/graph_evolve/examples/hotpotqa/config.toml +1 -1
- synth_ai/products/graph_gepa/__init__.py +23 -0
- synth_ai/products/graph_gepa/converters/__init__.py +19 -0
- synth_ai/products/graph_gepa/converters/openai_sft.py +29 -0
- synth_ai/sdk/__init__.py +45 -35
- synth_ai/sdk/api/eval/__init__.py +33 -0
- synth_ai/sdk/api/eval/job.py +732 -0
- synth_ai/sdk/api/research_agent/__init__.py +276 -66
- synth_ai/sdk/api/train/builders.py +181 -0
- synth_ai/sdk/api/train/cli.py +41 -33
- synth_ai/sdk/api/train/configs/__init__.py +6 -4
- synth_ai/sdk/api/train/configs/prompt_learning.py +127 -33
- synth_ai/sdk/api/train/configs/rl.py +264 -16
- synth_ai/sdk/api/train/configs/sft.py +165 -1
- synth_ai/sdk/api/train/graph_validators.py +12 -12
- synth_ai/sdk/api/train/graphgen.py +169 -51
- synth_ai/sdk/api/train/graphgen_models.py +95 -45
- synth_ai/sdk/api/train/local_api.py +10 -0
- synth_ai/sdk/api/train/pollers.py +36 -0
- synth_ai/sdk/api/train/prompt_learning.py +390 -60
- synth_ai/sdk/api/train/rl.py +41 -5
- synth_ai/sdk/api/train/sft.py +2 -0
- synth_ai/sdk/api/train/task_app.py +20 -0
- synth_ai/sdk/api/train/validators.py +17 -17
- synth_ai/sdk/graphs/completions.py +239 -33
- synth_ai/sdk/{judging/schemas.py → graphs/verifier_schemas.py} +23 -23
- synth_ai/sdk/learning/__init__.py +35 -5
- synth_ai/sdk/learning/context_learning_client.py +531 -0
- synth_ai/sdk/learning/context_learning_types.py +294 -0
- synth_ai/sdk/learning/prompt_learning_client.py +1 -1
- synth_ai/sdk/learning/prompt_learning_types.py +2 -1
- synth_ai/sdk/learning/rl/__init__.py +0 -4
- synth_ai/sdk/learning/rl/contracts.py +0 -4
- synth_ai/sdk/localapi/__init__.py +40 -0
- synth_ai/sdk/localapi/apps/__init__.py +28 -0
- synth_ai/sdk/localapi/client.py +10 -0
- synth_ai/sdk/localapi/contracts.py +10 -0
- synth_ai/sdk/localapi/helpers.py +519 -0
- synth_ai/sdk/localapi/rollouts.py +93 -0
- synth_ai/sdk/localapi/server.py +29 -0
- synth_ai/sdk/localapi/template.py +49 -0
- synth_ai/sdk/streaming/handlers.py +6 -6
- synth_ai/sdk/streaming/streamer.py +10 -6
- synth_ai/sdk/task/__init__.py +18 -5
- synth_ai/sdk/task/apps/__init__.py +37 -1
- synth_ai/sdk/task/client.py +9 -1
- synth_ai/sdk/task/config.py +6 -11
- synth_ai/sdk/task/contracts.py +137 -95
- synth_ai/sdk/task/in_process.py +32 -22
- synth_ai/sdk/task/in_process_runner.py +9 -4
- synth_ai/sdk/task/rubrics/__init__.py +2 -3
- synth_ai/sdk/task/rubrics/loaders.py +4 -4
- synth_ai/sdk/task/rubrics/strict.py +3 -4
- synth_ai/sdk/task/server.py +76 -16
- synth_ai/sdk/task/trace_correlation_helpers.py +190 -139
- synth_ai/sdk/task/validators.py +34 -49
- synth_ai/sdk/training/__init__.py +7 -16
- synth_ai/sdk/tunnels/__init__.py +118 -0
- synth_ai/sdk/tunnels/cleanup.py +83 -0
- synth_ai/sdk/tunnels/ports.py +120 -0
- synth_ai/sdk/tunnels/tunneled_api.py +363 -0
- {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/METADATA +71 -4
- {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/RECORD +118 -128
- synth_ai/cli/commands/baseline/__init__.py +0 -12
- synth_ai/cli/commands/baseline/core.py +0 -636
- synth_ai/cli/commands/baseline/list.py +0 -94
- synth_ai/cli/commands/eval/errors.py +0 -81
- synth_ai/cli/commands/status/formatters.py +0 -164
- synth_ai/cli/commands/status/subcommands/pricing.py +0 -23
- synth_ai/cli/commands/status/subcommands/usage.py +0 -203
- synth_ai/cli/commands/train/judge_validation.py +0 -305
- synth_ai/cli/usage.py +0 -159
- synth_ai/data/specs.py +0 -36
- synth_ai/sdk/api/research_agent/cli.py +0 -428
- synth_ai/sdk/api/research_agent/config.py +0 -357
- synth_ai/sdk/api/research_agent/job.py +0 -717
- synth_ai/sdk/baseline/__init__.py +0 -25
- synth_ai/sdk/baseline/config.py +0 -209
- synth_ai/sdk/baseline/discovery.py +0 -216
- synth_ai/sdk/baseline/execution.py +0 -154
- synth_ai/sdk/judging/__init__.py +0 -15
- synth_ai/sdk/judging/base.py +0 -24
- synth_ai/sdk/judging/client.py +0 -191
- synth_ai/sdk/judging/types.py +0 -42
- synth_ai/sdk/research_agent/__init__.py +0 -34
- synth_ai/sdk/research_agent/container_builder.py +0 -328
- synth_ai/sdk/research_agent/container_spec.py +0 -198
- synth_ai/sdk/research_agent/defaults.py +0 -34
- synth_ai/sdk/research_agent/results_collector.py +0 -69
- synth_ai/sdk/specs/__init__.py +0 -46
- synth_ai/sdk/specs/dataclasses.py +0 -149
- synth_ai/sdk/specs/loader.py +0 -144
- synth_ai/sdk/specs/serializer.py +0 -199
- synth_ai/sdk/specs/validation.py +0 -250
- synth_ai/sdk/tracing/__init__.py +0 -39
- synth_ai/sdk/usage/__init__.py +0 -37
- synth_ai/sdk/usage/client.py +0 -171
- synth_ai/sdk/usage/models.py +0 -261
- {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/WHEEL +0 -0
- {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/top_level.txt +0 -0
synth_ai/cli/commands/eval/runner.py (new file)
@@ -0,0 +1,704 @@
+"""Eval runner for executing rollouts against task apps.
+
+This module provides two execution modes:
+
+1. **Backend Mode (Default)**: Routes through backend interceptor for trace/usage capture
+   - Creates eval job via POST /api/eval/jobs
+   - Polls job status until completion
+   - Fetches detailed results with token costs and traces
+   - Requires backend_url and backend_api_key (or SYNTH_BASE_URL/SYNTH_API_KEY env vars)
+
+2. **Direct Mode**: Calls task apps directly (legacy, no usage tracking)
+   - Makes direct HTTP requests to task app /rollout endpoint
+   - No trace capture or usage tracking
+   - Simpler but limited functionality
+
+Usage:
+    ```python
+    from synth_ai.cli.commands.eval.runner import run_eval
+    from synth_ai.cli.commands.eval.config import EvalRunConfig
+
+    config = EvalRunConfig(
+        app_id="banking77",
+        task_app_url="http://localhost:8103",
+        env_name="banking77",
+        seeds=[0, 1, 2],
+        policy_config={"model": "gpt-4"},
+    )
+
+    results = await run_eval(config)
+    ```
+
+CLI Usage:
+    ```bash
+    # Direct mode (no backend)
+    python -m synth_ai.cli eval \
+        --config banking77_eval.toml \
+        --url http://localhost:8103
+
+    # Backend mode (with trace capture)
+    python -m synth_ai.cli eval \
+        --config banking77_eval.toml \
+        --url http://localhost:8103 \
+        --backend http://localhost:8000
+    ```
+
+See Also:
+    - `synth_ai.cli.commands.eval.config`: Configuration loading
+    - `monorepo/backend/app/routes/eval/job_service.py`: Backend eval job service
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import time
+import uuid
+from dataclasses import dataclass
+from typing import Any
+
+import httpx
+
+from synth_ai.sdk.task.client import TaskAppClient
+from synth_ai.sdk.task.contracts import (
+    RolloutEnvSpec,
+    RolloutPolicySpec,
+    RolloutRecordConfig,
+    RolloutRequest,
+    RolloutMode,
+)
+
+from .config import EvalRunConfig
+
+# Default poll interval for backend job status
+_POLL_INTERVAL_S = 2.0
+_MAX_POLL_ATTEMPTS = 600  # 20 minutes max
+
+
+@dataclass(slots=True)
+class EvalResult:
+    seed: int
+    score: float | None
+    reward_mean: float | None
+    outcome_score: float | None
+    events_score: float | None
+    latency_ms: float | None
+    verifier_score: float | None
+    tokens: int | None
+    cost_usd: float | None
+    error: str | None = None
+    trace: dict[str, Any] | None = None
+
+
+def _count_tokens_from_trace(trace: dict[str, Any] | None) -> int:
+    """Extract total token count from trace.
+
+    Checks multiple locations:
+    1. trace.usage.total_tokens (task app returns usage directly)
+    2. trace.event_history[].usage (v3 trace format)
+    3. trace.event_history[].response.usage (nested response)
+    """
+    if not trace:
+        return 0
+
+    # First check for direct usage in trace (task app format)
+    usage = trace.get("usage")
+    if isinstance(usage, dict):
+        total = usage.get("total_tokens", 0)
+        if total > 0:
+            return total
+
+    # Fall back to event_history (v3 trace format)
+    total = 0
+    event_history = trace.get("event_history") or []
+    for event in event_history:
+        if not isinstance(event, dict):
+            continue
+        # Check for usage in LM call events
+        evt_usage = event.get("usage") or {}
+        if isinstance(evt_usage, dict):
+            total += evt_usage.get("total_tokens", 0)
+        # Also check nested response usage
+        response = event.get("response") or {}
+        if isinstance(response, dict):
+            resp_usage = response.get("usage") or {}
+            if isinstance(resp_usage, dict):
+                total += resp_usage.get("total_tokens", 0)
+    return total
+
+
+def _count_tokens_from_trajectories(trajectories: list[Any]) -> int:
+    """Extract token count from trajectory steps."""
+    total = 0
+    for traj in trajectories:
+        if not hasattr(traj, "steps"):
+            continue
+        for step in traj.steps:
+            if not hasattr(step, "info") or not isinstance(step.info, dict):
+                continue
+            # Check for tokens in step info
+            tokens = step.info.get("tokens")
+            if isinstance(tokens, int):
+                total += tokens
+            # Check nested usage
+            usage = step.info.get("usage") or {}
+            if isinstance(usage, dict):
+                total += usage.get("total_tokens", 0)
+    return total
+
+
+def _build_run_id(config: EvalRunConfig, seed: int) -> str:
+    base = config.app_id or config.env_name or "eval"
+    suffix = uuid.uuid4().hex[:8]
+    return f"{base}-seed-{seed}-{suffix}"
+
+
+def _build_rollout_request(config: EvalRunConfig, seed: int) -> RolloutRequest:
+    env_config = dict(config.env_config or {})
+    policy_config = dict(config.policy_config or {})
+
+    output_mode = policy_config.pop("output_mode", None)
+    structured_config = policy_config.pop("structured_config", None)
+
+    policy_kwargs: dict[str, Any] = {
+        "policy_name": config.policy_name,
+        "config": policy_config,
+    }
+    if output_mode is not None:
+        policy_kwargs["output_mode"] = output_mode
+    if structured_config is not None:
+        policy_kwargs["structured_config"] = structured_config
+
+    # Cast trace_format to expected literal type
+    trace_fmt: Any = config.trace_format
+    record = RolloutRecordConfig(
+        trajectories=True,
+        logprobs=False,
+        value=False,
+        return_trace=config.return_trace,
+        trace_format=trace_fmt,
+    )
+
+    synth_base = os.getenv("SYNTH_API_BASE") or os.getenv("SYNTH_BASE_URL")
+
+    return RolloutRequest(
+        run_id=_build_run_id(config, seed),
+        env=RolloutEnvSpec(env_name=config.env_name, config=env_config, seed=seed),
+        policy=RolloutPolicySpec(**policy_kwargs),
+        record=record,
+        on_done="reset",
+        training_session_id=None,
+        synth_base_url=synth_base,
+        mode=config.mode or RolloutMode.EVAL,
+    )
+
+
+async def _eval_seed(
+    client: TaskAppClient,
+    config: EvalRunConfig,
+    seed: int,
+    semaphore: asyncio.Semaphore,
+) -> EvalResult:
+    """Execute a single rollout for one seed (used in direct mode).
+
+    Args:
+        client: TaskAppClient instance for making HTTP requests.
+        config: Evaluation configuration.
+        seed: Seed/index to evaluate.
+        semaphore: Semaphore for concurrency control.
+
+    Returns:
+        EvalResult with score, metrics, tokens, cost, and optional trace.
+
+    Note:
+        This function is only used in direct mode. Backend mode uses the
+        backend job service which handles rollouts internally.
+    """
+    async with semaphore:
+        start = time.perf_counter()
+        try:
+            request = _build_rollout_request(config, seed)
+            response = await client.rollout(request)
+            latency_ms = (time.perf_counter() - start) * 1000.0
+
+            metrics = response.metrics
+            reward_mean = metrics.reward_mean
+            outcome_score = metrics.outcome_score
+            events_score = metrics.events_score
+
+            score = outcome_score if outcome_score is not None else reward_mean
+            verifier_score = None
+            tokens = None
+            cost_usd = None
+
+            if isinstance(metrics.details, dict):
+                verifier_score = metrics.details.get("verifier_score")
+                tokens = metrics.details.get("tokens")
+                cost_usd = metrics.details.get("cost_usd")
+
+            # Extract trace if return_trace was requested
+            trace = response.trace if config.return_trace else None
+
+            # Count tokens from trace or trajectories if not in metrics
+            if tokens is None:
+                if trace:
+                    tokens = _count_tokens_from_trace(trace)
+                else:
+                    trajectories = getattr(response, "trajectories", None)
+                    if trajectories:
+                        tokens = _count_tokens_from_trajectories(trajectories)
+                if tokens == 0:
+                    tokens = None
+
+            return EvalResult(
+                seed=seed,
+                score=score,
+                reward_mean=reward_mean,
+                outcome_score=outcome_score,
+                events_score=events_score,
+                latency_ms=latency_ms,
+                verifier_score=verifier_score,
+                tokens=tokens,
+                cost_usd=cost_usd,
+                error=None,
+                trace=trace,
+            )
+        except Exception as exc:
+            latency_ms = (time.perf_counter() - start) * 1000.0
+            return EvalResult(
+                seed=seed,
+                score=None,
+                reward_mean=None,
+                outcome_score=None,
+                events_score=None,
+                latency_ms=latency_ms,
+                verifier_score=None,
+                tokens=None,
+                cost_usd=None,
+                error=str(exc),
+                trace=None,
+            )
+
+
+async def run_eval(config: EvalRunConfig) -> list[EvalResult]:
+    """Run evaluation against a task app.
+
+    Automatically selects execution mode based on configuration:
+    - **Backend mode**: Used if `backend_url` and `backend_api_key` are provided
+      (or SYNTH_BASE_URL/SYNTH_API_KEY env vars are set)
+    - **Direct mode**: Used otherwise (calls task app directly)
+
+    Args:
+        config: Evaluation configuration including task app URL, seeds, policy config, etc.
+
+    Returns:
+        List of EvalResult objects, one per seed, sorted by seed number.
+
+    Raises:
+        ValueError: If required configuration is missing (task_app_url, seeds, etc.)
+        RuntimeError: If backend job creation or polling fails
+
+    Example:
+        ```python
+        config = EvalRunConfig(
+            app_id="banking77",
+            task_app_url="http://localhost:8103",
+            backend_url="http://localhost:8000",  # Enables backend mode
+            backend_api_key="sk-...",
+            env_name="banking77",
+            seeds=[0, 1, 2],
+            policy_config={"model": "gpt-4"},
+        )
+        results = await run_eval(config)
+        ```
+
+    See Also:
+        - `run_eval_direct()`: Direct mode implementation
+        - `run_eval_via_backend()`: Backend mode implementation
+    """
+    backend_url = config.backend_url or os.getenv("SYNTH_BASE_URL") or os.getenv("BACKEND_OVERRIDE")
+    api_key = config.backend_api_key or os.getenv("SYNTH_API_KEY")
+
+    # Use backend mode if we have both backend URL and API key
+    if backend_url and api_key:
+        return await run_eval_via_backend(config, backend_url, api_key)
+
+    # Fall back to direct mode
+    return await run_eval_direct(config)
+
+
+async def run_eval_direct(config: EvalRunConfig) -> list[EvalResult]:
+    """Direct mode: Call task apps directly without backend.
+
+    Makes direct HTTP requests to the task app's `/rollout` endpoint.
+    This mode does NOT capture traces or track token usage via the backend interceptor.
+
+    **Use Cases:**
+    - Quick local testing without backend setup
+    - Legacy workflows that don't need trace capture
+    - Simple evaluations without cost tracking
+
+    **Limitations:**
+    - No trace capture (traces must be returned by task app if needed)
+    - No token cost calculation (unless task app provides it)
+    - No backend interceptor for LLM call tracking
+
+    Args:
+        config: Evaluation configuration. Must include `task_app_url` and `seeds`.
+
+    Returns:
+        List of EvalResult objects, one per seed.
+
+    Raises:
+        ValueError: If `task_app_url` or `seeds` are missing.
+
+    Example:
+        ```python
+        config = EvalRunConfig(
+            app_id="banking77",
+            task_app_url="http://localhost:8103",
+            env_name="banking77",
+            seeds=[0, 1, 2],
+            policy_config={"model": "gpt-4"},
+        )
+        results = await run_eval_direct(config)
+        ```
+    """
+    if not config.task_app_url:
+        raise ValueError("task_app_url is required for eval runs")
+    if not config.seeds:
+        raise ValueError("No seeds provided for evaluation")
+
+    api_key = config.task_app_api_key or os.getenv("ENVIRONMENT_API_KEY")
+    semaphore = asyncio.Semaphore(max(1, int(config.concurrency or 1)))
+
+    async with TaskAppClient(base_url=config.task_app_url, api_key=api_key) as client:
+        tasks = [
+            _eval_seed(client, config, seed, semaphore)
+            for seed in config.seeds
+        ]
+        results = await asyncio.gather(*tasks)
+
+    results.sort(key=lambda item: item.seed)
+    return results
+
+
+async def run_eval_via_backend(
+    config: EvalRunConfig,
+    backend_url: str,
+    api_key: str,
+) -> list[EvalResult]:
+    """Backend mode: Route through backend interceptor for trace/usage capture.
+
+    This mode creates an eval job on the backend, which:
+    1. Routes LLM calls through the inference interceptor
+    2. Captures traces and token usage automatically
+    3. Calculates costs based on model pricing
+    4. Provides detailed results with timing and metrics
+
+    **Flow:**
+    1. POST `/api/eval/jobs` - Create eval job
+    2. Poll GET `/api/eval/jobs/{job_id}` - Check job status until completed
+    3. GET `/api/eval/jobs/{job_id}/results` - Fetch detailed results
+
+    **Benefits:**
+    - Automatic trace capture via interceptor
+    - Token usage tracking and cost calculation
+    - Centralized job management and monitoring
+    - Support for async job execution
+
+    Args:
+        config: Evaluation configuration including task app URL, seeds, policy config.
+        backend_url: Backend API base URL (e.g., "http://localhost:8000")
+        api_key: Backend API key for authentication (Bearer token)
+
+    Returns:
+        List of EvalResult objects with detailed metrics including tokens, costs, traces.
+
+    Raises:
+        ValueError: If required configuration is missing.
+        RuntimeError: If job creation, polling, or result fetching fails.
+
+    Example:
+        ```python
+        config = EvalRunConfig(
+            app_id="banking77",
+            task_app_url="http://localhost:8103",
+            env_name="banking77",
+            seeds=[0, 1, 2],
+            policy_config={"model": "gpt-4"},
+        )
+        results = await run_eval_via_backend(
+            config,
+            backend_url="http://localhost:8000",
+            api_key="sk-...",
+        )
+        ```
+
+    See Also:
+        - `monorepo/backend/app/routes/eval/job_service.py`: Backend job service implementation
+        - `monorepo/backend/app/routes/eval/routes.py`: Backend API routes
+    """
+    if not config.task_app_url:
+        raise ValueError("task_app_url is required for eval runs")
+    if not config.seeds:
+        raise ValueError("No seeds provided for evaluation")
+
+    base = backend_url.rstrip("/")
+    if not base.endswith("/api"):
+        base = f"{base}/api"
+
+    headers = {"Authorization": f"Bearer {api_key}"}
+
+    # Build policy config for backend
+    policy = dict(config.policy_config or {})
+    policy["policy_name"] = config.policy_name
+
+    # Create eval job request
+    job_request = {
+        "task_app_url": config.task_app_url,
+        "task_app_api_key": config.task_app_api_key or os.getenv("ENVIRONMENT_API_KEY"),
+        "app_id": config.app_id,
+        "env_name": config.env_name,
+        "seeds": list(config.seeds),
+        "policy": policy,
+        "env_config": config.env_config,
+        "mode": config.mode.value if hasattr(config.mode, "value") else str(config.mode or "eval"),
+        "max_concurrent": config.concurrency,
+        "timeout": config.timeout,
+    }
+
+    async with httpx.AsyncClient(timeout=httpx.Timeout(30.0)) as client:
+        # 1. Create the eval job
+        print(f"[eval] Creating eval job via backend: {base}/eval/jobs", flush=True)
+        resp = await client.post(f"{base}/eval/jobs", json=job_request, headers=headers)
+
+        if resp.status_code not in (200, 201):
+            raise RuntimeError(f"Failed to create eval job: {resp.status_code} {resp.text}")
+
+        job_data = resp.json()
+        job_id = job_data.get("job_id")
+        if not job_id:
+            raise RuntimeError(f"No job_id in response: {job_data}")
+
+        print(f"[eval] Job created: {job_id}", flush=True)
+
+        # 2. Poll for job completion
+        for attempt in range(_MAX_POLL_ATTEMPTS):
+            await asyncio.sleep(_POLL_INTERVAL_S)
+
+            status_resp = await client.get(f"{base}/eval/jobs/{job_id}", headers=headers)
+            if status_resp.status_code != 200:
+                print(f"[eval] Warning: status check failed: {status_resp.status_code}", flush=True)
+                continue
+
+            status_data = status_resp.json()
+            status = status_data.get("status", "")
+
+            if status in ("completed", "failed"):
+                break
+
+            if attempt % 10 == 0:
+                print(f"[eval] Job {job_id} status: {status} (attempt {attempt})", flush=True)
+        else:
+            raise RuntimeError(f"Eval job {job_id} timed out after {_MAX_POLL_ATTEMPTS * _POLL_INTERVAL_S}s")
+
+        if status == "failed":
+            error = status_data.get("error", "Unknown error")
+            raise RuntimeError(f"Eval job {job_id} failed: {error}")
+
+        # 3. Get detailed results
+        results_resp = await client.get(f"{base}/eval/jobs/{job_id}/results", headers=headers)
+        if results_resp.status_code != 200:
+            raise RuntimeError(f"Failed to get results: {results_resp.status_code} {results_resp.text}")
+
+        results_data = results_resp.json()
+        result_rows = results_data.get("results", [])
+
+        # Convert to EvalResult objects
+        results: list[EvalResult] = []
+        for row in result_rows:
+            results.append(EvalResult(
+                seed=int(row.get("seed", 0)),
+                score=row.get("score"),
+                reward_mean=row.get("reward_mean"),
+                outcome_score=row.get("outcome_score"),
+                events_score=row.get("events_score"),
+                latency_ms=row.get("latency_ms"),
+                verifier_score=row.get("verifier_score"),
+                tokens=row.get("tokens"),
+                cost_usd=row.get("cost_usd"),
+                error=row.get("error"),
+                trace=None,  # Traces fetched separately if needed
+            ))
+
+        results.sort(key=lambda item: item.seed)
+
+        # Print summary from backend
+        summary = results_data.get("summary", {})
+        if summary:
+            print(f"[eval] Backend summary: {summary}", flush=True)
+
+        return results
+
+
+async def fetch_traces_from_backend(
+    job_id: str,
+    backend_url: str,
+    api_key: str,
+    output_dir: str,
+) -> str:
+    """Download traces zip from backend and extract to output_dir.
+
+    Returns path to the extracted traces directory.
+    """
+    import zipfile
+    import io
+    from pathlib import Path
+
+    base = backend_url.rstrip("/")
+    if not base.endswith("/api"):
+        base = f"{base}/api"
+
+    headers = {"Authorization": f"Bearer {api_key}"}
+
+    async with httpx.AsyncClient(timeout=httpx.Timeout(60.0)) as client:
+        resp = await client.get(f"{base}/eval/jobs/{job_id}/traces", headers=headers)
+
+        if resp.status_code != 200:
+            raise RuntimeError(f"Failed to download traces: {resp.status_code} {resp.text}")
+
+        # Extract zip contents
+        path = Path(output_dir)
+        path.mkdir(parents=True, exist_ok=True)
+
+        with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
+            zf.extractall(path)
+
+        return str(path)
+
+
+def format_eval_table(results: list[EvalResult]) -> str:
+    headers = [
+        "seed",
+        "score",
+        "reward_mean",
+        "outcome",
+        "events",
+        "latency_ms",
+        "verifier",
+        "tokens",
+        "cost_usd",
+        "error",
+    ]
+
+    def _fmt(value: Any) -> str:
+        if value is None:
+            return "-"
+        if isinstance(value, float):
+            return f"{value:.4f}".rstrip("0").rstrip(".")
+        return str(value)
+
+    rows = [
+        [
+            r.seed,
+            _fmt(r.score),
+            _fmt(r.reward_mean),
+            _fmt(r.outcome_score),
+            _fmt(r.events_score),
+            _fmt(r.latency_ms),
+            _fmt(r.verifier_score),
+            _fmt(r.tokens),
+            _fmt(r.cost_usd),
+            r.error or "-",
+        ]
+        for r in results
+    ]
+
+    def _avg(values: list[float | int]) -> float | None:
+        return sum(values) / len(values) if values else None
+
+    scores = [r.score for r in results if isinstance(r.score, (int, float))]
+    reward_means = [r.reward_mean for r in results if isinstance(r.reward_mean, (int, float))]
+    outcomes = [r.outcome_score for r in results if isinstance(r.outcome_score, (int, float))]
+    events = [r.events_score for r in results if isinstance(r.events_score, (int, float))]
+    latencies = [r.latency_ms for r in results if isinstance(r.latency_ms, (int, float))]
+    verifier_scores = [r.verifier_score for r in results if isinstance(r.verifier_score, (int, float))]
+    tokens = [r.tokens for r in results if isinstance(r.tokens, int)]
+    costs = [r.cost_usd for r in results if isinstance(r.cost_usd, (int, float))]
+
+    rows.append(
+        [
+            "avg",
+            _fmt(_avg(scores)),
+            _fmt(_avg(reward_means)),
+            _fmt(_avg(outcomes)),
+            _fmt(_avg(events)),
+            _fmt(_avg(latencies)),
+            _fmt(_avg(verifier_scores)),
+            _fmt(int(sum(tokens) / len(tokens)) if tokens else None),
+            _fmt(_avg(costs)),
+            "-",
+        ]
+    )
+
+    widths = [len(h) for h in headers]
+    for row in rows:
+        for idx, cell in enumerate(row):
+            widths[idx] = max(widths[idx], len(str(cell)))
+
+    def _render_row(row: list[Any]) -> str:
+        return " | ".join(str(cell).ljust(widths[idx]) for idx, cell in enumerate(row))
+
+    sep = "-+-".join("-" * width for width in widths)
+    lines = [_render_row(headers), sep]
+    lines.extend(_render_row(row) for row in rows)
+    return "\n".join(lines)
+
+
+def format_eval_report(config: EvalRunConfig, results: list[EvalResult]) -> str:
+    payload = {
+        "app_id": config.app_id,
+        "task_app_url": config.task_app_url,
+        "env_name": config.env_name,
+        "policy_name": config.policy_name,
+        "policy_config": config.policy_config,
+        "seeds": config.seeds,
+        "concurrency": config.concurrency,
+    }
+    header = json.dumps(payload, indent=2, default=str)
+    table = format_eval_table(results)
+    return f"Eval config\n{header}\n\nResults\n{table}\n"
+
+
+def save_traces(results: list[EvalResult], traces_dir: str) -> int:
+    """Save traces to individual JSON files in the given directory.
+
+    Returns the number of traces saved.
+    """
+    from pathlib import Path
+
+    path = Path(traces_dir)
+    path.mkdir(parents=True, exist_ok=True)
+
+    saved = 0
+    for result in results:
+        if result.trace is not None:
+            trace_file = path / f"seed_{result.seed}_trace.json"
+            trace_file.write_text(json.dumps(result.trace, indent=2, default=str))
+            saved += 1
+
+    return saved
+
+
+__all__ = [
+    "run_eval",
+    "run_eval_direct",
+    "run_eval_via_backend",
+    "fetch_traces_from_backend",
+    "format_eval_table",
+    "format_eval_report",
+    "save_traces",
+    "EvalResult",
+]
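
The docstrings above demonstrate run_eval, run_eval_direct, and run_eval_via_backend, but not the reporting and trace helpers the module also exports. The following is a minimal sketch of how those helpers might compose in direct mode. It is hedged: the EvalRunConfig keyword fields mirror the docstring examples and the attributes runner.py reads, but the full constructor lives in synth_ai/cli/commands/eval/config.py (not shown in this diff), and the URLs, model name, and return_trace field are illustrative assumptions.

# Hypothetical driver script; field names follow the docstring examples above.
import asyncio

from synth_ai.cli.commands.eval.config import EvalRunConfig
from synth_ai.cli.commands.eval.runner import (
    format_eval_report,
    run_eval_direct,
    save_traces,
)


async def main() -> None:
    config = EvalRunConfig(
        app_id="banking77",                   # illustrative values, as in the docstrings
        task_app_url="http://localhost:8103",
        env_name="banking77",
        seeds=[0, 1, 2],
        policy_config={"model": "gpt-4"},
        return_trace=True,                    # assumed field; runner.py reads config.return_trace
    )
    # Direct mode: hits the task app's /rollout endpoint; the task app key comes from
    # config.task_app_api_key or the ENVIRONMENT_API_KEY env var.
    results = await run_eval_direct(config)
    print(format_eval_report(config, results))  # config header plus per-seed table with an "avg" row
    saved = save_traces(results, "traces")      # writes seed_<n>_trace.json for results that carry a trace
    print(f"saved {saved} trace file(s)")


if __name__ == "__main__":
    asyncio.run(main())

Swapping run_eval_direct for run_eval would pick backend mode automatically whenever SYNTH_BASE_URL and SYNTH_API_KEY are set (or backend_url/backend_api_key are passed on the config), per the mode selection shown in run_eval above.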