synth-ai 0.4.1__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (153)
  1. synth_ai/__init__.py +13 -13
  2. synth_ai/cli/__init__.py +6 -15
  3. synth_ai/cli/commands/eval/__init__.py +6 -15
  4. synth_ai/cli/commands/eval/config.py +338 -0
  5. synth_ai/cli/commands/eval/core.py +236 -1091
  6. synth_ai/cli/commands/eval/runner.py +704 -0
  7. synth_ai/cli/commands/eval/validation.py +44 -117
  8. synth_ai/cli/commands/filter/core.py +7 -7
  9. synth_ai/cli/commands/filter/validation.py +2 -2
  10. synth_ai/cli/commands/smoke/core.py +7 -17
  11. synth_ai/cli/commands/status/__init__.py +1 -64
  12. synth_ai/cli/commands/status/client.py +50 -151
  13. synth_ai/cli/commands/status/config.py +3 -83
  14. synth_ai/cli/commands/status/errors.py +4 -13
  15. synth_ai/cli/commands/status/subcommands/__init__.py +2 -8
  16. synth_ai/cli/commands/status/subcommands/config.py +13 -0
  17. synth_ai/cli/commands/status/subcommands/files.py +18 -63
  18. synth_ai/cli/commands/status/subcommands/jobs.py +28 -311
  19. synth_ai/cli/commands/status/subcommands/models.py +18 -62
  20. synth_ai/cli/commands/status/subcommands/runs.py +16 -63
  21. synth_ai/cli/commands/status/subcommands/session.py +67 -172
  22. synth_ai/cli/commands/status/subcommands/summary.py +24 -32
  23. synth_ai/cli/commands/status/subcommands/utils.py +41 -0
  24. synth_ai/cli/commands/status/utils.py +16 -107
  25. synth_ai/cli/commands/train/__init__.py +18 -20
  26. synth_ai/cli/commands/train/errors.py +3 -3
  27. synth_ai/cli/commands/train/prompt_learning_validation.py +15 -16
  28. synth_ai/cli/commands/train/validation.py +7 -7
  29. synth_ai/cli/commands/train/{judge_schemas.py → verifier_schemas.py} +33 -34
  30. synth_ai/cli/commands/train/verifier_validation.py +235 -0
  31. synth_ai/cli/demo_apps/demo_task_apps/math/config.toml +0 -1
  32. synth_ai/cli/demo_apps/demo_task_apps/math/modal_task_app.py +2 -6
  33. synth_ai/cli/demo_apps/math/config.toml +0 -1
  34. synth_ai/cli/demo_apps/math/modal_task_app.py +2 -6
  35. synth_ai/cli/demo_apps/mipro/task_app.py +25 -47
  36. synth_ai/cli/lib/apps/task_app.py +12 -13
  37. synth_ai/cli/lib/task_app_discovery.py +6 -6
  38. synth_ai/cli/lib/train_cfgs.py +10 -10
  39. synth_ai/cli/task_apps/__init__.py +11 -0
  40. synth_ai/cli/task_apps/commands.py +7 -15
  41. synth_ai/core/env.py +12 -1
  42. synth_ai/core/errors.py +1 -2
  43. synth_ai/core/integrations/cloudflare.py +209 -33
  44. synth_ai/core/tracing_v3/abstractions.py +46 -0
  45. synth_ai/data/__init__.py +3 -30
  46. synth_ai/data/enums.py +1 -20
  47. synth_ai/data/rewards.py +100 -3
  48. synth_ai/products/graph_evolve/__init__.py +1 -2
  49. synth_ai/products/graph_evolve/config.py +16 -16
  50. synth_ai/products/graph_evolve/converters/__init__.py +3 -3
  51. synth_ai/products/graph_evolve/converters/openai_sft.py +7 -7
  52. synth_ai/products/graph_evolve/examples/hotpotqa/config.toml +1 -1
  53. synth_ai/products/graph_gepa/__init__.py +23 -0
  54. synth_ai/products/graph_gepa/converters/__init__.py +19 -0
  55. synth_ai/products/graph_gepa/converters/openai_sft.py +29 -0
  56. synth_ai/sdk/__init__.py +45 -35
  57. synth_ai/sdk/api/eval/__init__.py +33 -0
  58. synth_ai/sdk/api/eval/job.py +732 -0
  59. synth_ai/sdk/api/research_agent/__init__.py +276 -66
  60. synth_ai/sdk/api/train/builders.py +181 -0
  61. synth_ai/sdk/api/train/cli.py +41 -33
  62. synth_ai/sdk/api/train/configs/__init__.py +6 -4
  63. synth_ai/sdk/api/train/configs/prompt_learning.py +127 -33
  64. synth_ai/sdk/api/train/configs/rl.py +264 -16
  65. synth_ai/sdk/api/train/configs/sft.py +165 -1
  66. synth_ai/sdk/api/train/graph_validators.py +12 -12
  67. synth_ai/sdk/api/train/graphgen.py +169 -51
  68. synth_ai/sdk/api/train/graphgen_models.py +95 -45
  69. synth_ai/sdk/api/train/local_api.py +10 -0
  70. synth_ai/sdk/api/train/pollers.py +36 -0
  71. synth_ai/sdk/api/train/prompt_learning.py +390 -60
  72. synth_ai/sdk/api/train/rl.py +41 -5
  73. synth_ai/sdk/api/train/sft.py +2 -0
  74. synth_ai/sdk/api/train/task_app.py +20 -0
  75. synth_ai/sdk/api/train/validators.py +17 -17
  76. synth_ai/sdk/graphs/completions.py +239 -33
  77. synth_ai/sdk/{judging/schemas.py → graphs/verifier_schemas.py} +23 -23
  78. synth_ai/sdk/learning/__init__.py +35 -5
  79. synth_ai/sdk/learning/context_learning_client.py +531 -0
  80. synth_ai/sdk/learning/context_learning_types.py +294 -0
  81. synth_ai/sdk/learning/prompt_learning_client.py +1 -1
  82. synth_ai/sdk/learning/prompt_learning_types.py +2 -1
  83. synth_ai/sdk/learning/rl/__init__.py +0 -4
  84. synth_ai/sdk/learning/rl/contracts.py +0 -4
  85. synth_ai/sdk/localapi/__init__.py +40 -0
  86. synth_ai/sdk/localapi/apps/__init__.py +28 -0
  87. synth_ai/sdk/localapi/client.py +10 -0
  88. synth_ai/sdk/localapi/contracts.py +10 -0
  89. synth_ai/sdk/localapi/helpers.py +519 -0
  90. synth_ai/sdk/localapi/rollouts.py +93 -0
  91. synth_ai/sdk/localapi/server.py +29 -0
  92. synth_ai/sdk/localapi/template.py +49 -0
  93. synth_ai/sdk/streaming/handlers.py +6 -6
  94. synth_ai/sdk/streaming/streamer.py +10 -6
  95. synth_ai/sdk/task/__init__.py +18 -5
  96. synth_ai/sdk/task/apps/__init__.py +37 -1
  97. synth_ai/sdk/task/client.py +9 -1
  98. synth_ai/sdk/task/config.py +6 -11
  99. synth_ai/sdk/task/contracts.py +137 -95
  100. synth_ai/sdk/task/in_process.py +32 -22
  101. synth_ai/sdk/task/in_process_runner.py +9 -4
  102. synth_ai/sdk/task/rubrics/__init__.py +2 -3
  103. synth_ai/sdk/task/rubrics/loaders.py +4 -4
  104. synth_ai/sdk/task/rubrics/strict.py +3 -4
  105. synth_ai/sdk/task/server.py +76 -16
  106. synth_ai/sdk/task/trace_correlation_helpers.py +190 -139
  107. synth_ai/sdk/task/validators.py +34 -49
  108. synth_ai/sdk/training/__init__.py +7 -16
  109. synth_ai/sdk/tunnels/__init__.py +118 -0
  110. synth_ai/sdk/tunnels/cleanup.py +83 -0
  111. synth_ai/sdk/tunnels/ports.py +120 -0
  112. synth_ai/sdk/tunnels/tunneled_api.py +363 -0
  113. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/METADATA +71 -4
  114. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/RECORD +118 -128
  115. synth_ai/cli/commands/baseline/__init__.py +0 -12
  116. synth_ai/cli/commands/baseline/core.py +0 -636
  117. synth_ai/cli/commands/baseline/list.py +0 -94
  118. synth_ai/cli/commands/eval/errors.py +0 -81
  119. synth_ai/cli/commands/status/formatters.py +0 -164
  120. synth_ai/cli/commands/status/subcommands/pricing.py +0 -23
  121. synth_ai/cli/commands/status/subcommands/usage.py +0 -203
  122. synth_ai/cli/commands/train/judge_validation.py +0 -305
  123. synth_ai/cli/usage.py +0 -159
  124. synth_ai/data/specs.py +0 -36
  125. synth_ai/sdk/api/research_agent/cli.py +0 -428
  126. synth_ai/sdk/api/research_agent/config.py +0 -357
  127. synth_ai/sdk/api/research_agent/job.py +0 -717
  128. synth_ai/sdk/baseline/__init__.py +0 -25
  129. synth_ai/sdk/baseline/config.py +0 -209
  130. synth_ai/sdk/baseline/discovery.py +0 -216
  131. synth_ai/sdk/baseline/execution.py +0 -154
  132. synth_ai/sdk/judging/__init__.py +0 -15
  133. synth_ai/sdk/judging/base.py +0 -24
  134. synth_ai/sdk/judging/client.py +0 -191
  135. synth_ai/sdk/judging/types.py +0 -42
  136. synth_ai/sdk/research_agent/__init__.py +0 -34
  137. synth_ai/sdk/research_agent/container_builder.py +0 -328
  138. synth_ai/sdk/research_agent/container_spec.py +0 -198
  139. synth_ai/sdk/research_agent/defaults.py +0 -34
  140. synth_ai/sdk/research_agent/results_collector.py +0 -69
  141. synth_ai/sdk/specs/__init__.py +0 -46
  142. synth_ai/sdk/specs/dataclasses.py +0 -149
  143. synth_ai/sdk/specs/loader.py +0 -144
  144. synth_ai/sdk/specs/serializer.py +0 -199
  145. synth_ai/sdk/specs/validation.py +0 -250
  146. synth_ai/sdk/tracing/__init__.py +0 -39
  147. synth_ai/sdk/usage/__init__.py +0 -37
  148. synth_ai/sdk/usage/client.py +0 -171
  149. synth_ai/sdk/usage/models.py +0 -261
  150. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/WHEEL +0 -0
  151. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/entry_points.txt +0 -0
  152. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/licenses/LICENSE +0 -0
  153. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/top_level.txt +0 -0
synth_ai/cli/commands/eval/runner.py (new file)
@@ -0,0 +1,704 @@
+"""Eval runner for executing rollouts against task apps.
+
+This module provides two execution modes:
+
+1. **Backend Mode (Default)**: Routes through backend interceptor for trace/usage capture
+   - Creates eval job via POST /api/eval/jobs
+   - Polls job status until completion
+   - Fetches detailed results with token costs and traces
+   - Requires backend_url and backend_api_key (or SYNTH_BASE_URL/SYNTH_API_KEY env vars)
+
+2. **Direct Mode**: Calls task apps directly (legacy, no usage tracking)
+   - Makes direct HTTP requests to task app /rollout endpoint
+   - No trace capture or usage tracking
+   - Simpler but limited functionality
+
+Usage:
+    ```python
+    from synth_ai.cli.commands.eval.runner import run_eval
+    from synth_ai.cli.commands.eval.config import EvalRunConfig
+
+    config = EvalRunConfig(
+        app_id="banking77",
+        task_app_url="http://localhost:8103",
+        env_name="banking77",
+        seeds=[0, 1, 2],
+        policy_config={"model": "gpt-4"},
+    )
+
+    results = await run_eval(config)
+    ```
+
+CLI Usage:
+    ```bash
+    # Direct mode (no backend)
+    python -m synth_ai.cli eval \
+        --config banking77_eval.toml \
+        --url http://localhost:8103
+
+    # Backend mode (with trace capture)
+    python -m synth_ai.cli eval \
+        --config banking77_eval.toml \
+        --url http://localhost:8103 \
+        --backend http://localhost:8000
+    ```
+
+See Also:
+    - `synth_ai.cli.commands.eval.config`: Configuration loading
+    - `monorepo/backend/app/routes/eval/job_service.py`: Backend eval job service
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import time
+import uuid
+from dataclasses import dataclass
+from typing import Any
+
+import httpx
+
+from synth_ai.sdk.task.client import TaskAppClient
+from synth_ai.sdk.task.contracts import (
+    RolloutEnvSpec,
+    RolloutPolicySpec,
+    RolloutRecordConfig,
+    RolloutRequest,
+    RolloutMode,
+)
+
+from .config import EvalRunConfig
+
+# Default poll interval for backend job status
+_POLL_INTERVAL_S = 2.0
+_MAX_POLL_ATTEMPTS = 600  # 20 minutes max
+
+
+@dataclass(slots=True)
+class EvalResult:
+    seed: int
+    score: float | None
+    reward_mean: float | None
+    outcome_score: float | None
+    events_score: float | None
+    latency_ms: float | None
+    verifier_score: float | None
+    tokens: int | None
+    cost_usd: float | None
+    error: str | None = None
+    trace: dict[str, Any] | None = None
+
+
+def _count_tokens_from_trace(trace: dict[str, Any] | None) -> int:
+    """Extract total token count from trace.
+
+    Checks multiple locations:
+    1. trace.usage.total_tokens (task app returns usage directly)
+    2. trace.event_history[].usage (v3 trace format)
+    3. trace.event_history[].response.usage (nested response)
+    """
+    if not trace:
+        return 0
+
+    # First check for direct usage in trace (task app format)
+    usage = trace.get("usage")
+    if isinstance(usage, dict):
+        total = usage.get("total_tokens", 0)
+        if total > 0:
+            return total
+
+    # Fall back to event_history (v3 trace format)
+    total = 0
+    event_history = trace.get("event_history") or []
+    for event in event_history:
+        if not isinstance(event, dict):
+            continue
+        # Check for usage in LM call events
+        evt_usage = event.get("usage") or {}
+        if isinstance(evt_usage, dict):
+            total += evt_usage.get("total_tokens", 0)
+        # Also check nested response usage
+        response = event.get("response") or {}
+        if isinstance(response, dict):
+            resp_usage = response.get("usage") or {}
+            if isinstance(resp_usage, dict):
+                total += resp_usage.get("total_tokens", 0)
+    return total
+
+
+def _count_tokens_from_trajectories(trajectories: list[Any]) -> int:
+    """Extract token count from trajectory steps."""
+    total = 0
+    for traj in trajectories:
+        if not hasattr(traj, "steps"):
+            continue
+        for step in traj.steps:
+            if not hasattr(step, "info") or not isinstance(step.info, dict):
+                continue
+            # Check for tokens in step info
+            tokens = step.info.get("tokens")
+            if isinstance(tokens, int):
+                total += tokens
+            # Check nested usage
+            usage = step.info.get("usage") or {}
+            if isinstance(usage, dict):
+                total += usage.get("total_tokens", 0)
+    return total
+
+
+def _build_run_id(config: EvalRunConfig, seed: int) -> str:
+    base = config.app_id or config.env_name or "eval"
+    suffix = uuid.uuid4().hex[:8]
+    return f"{base}-seed-{seed}-{suffix}"
+
+
+def _build_rollout_request(config: EvalRunConfig, seed: int) -> RolloutRequest:
+    env_config = dict(config.env_config or {})
+    policy_config = dict(config.policy_config or {})
+
+    output_mode = policy_config.pop("output_mode", None)
+    structured_config = policy_config.pop("structured_config", None)
+
+    policy_kwargs: dict[str, Any] = {
+        "policy_name": config.policy_name,
+        "config": policy_config,
+    }
+    if output_mode is not None:
+        policy_kwargs["output_mode"] = output_mode
+    if structured_config is not None:
+        policy_kwargs["structured_config"] = structured_config
+
+    # Cast trace_format to expected literal type
+    trace_fmt: Any = config.trace_format
+    record = RolloutRecordConfig(
+        trajectories=True,
+        logprobs=False,
+        value=False,
+        return_trace=config.return_trace,
+        trace_format=trace_fmt,
+    )
+
+    synth_base = os.getenv("SYNTH_API_BASE") or os.getenv("SYNTH_BASE_URL")
+
+    return RolloutRequest(
+        run_id=_build_run_id(config, seed),
+        env=RolloutEnvSpec(env_name=config.env_name, config=env_config, seed=seed),
+        policy=RolloutPolicySpec(**policy_kwargs),
+        record=record,
+        on_done="reset",
+        training_session_id=None,
+        synth_base_url=synth_base,
+        mode=config.mode or RolloutMode.EVAL,
+    )
+
+
+async def _eval_seed(
+    client: TaskAppClient,
+    config: EvalRunConfig,
+    seed: int,
+    semaphore: asyncio.Semaphore,
+) -> EvalResult:
+    """Execute a single rollout for one seed (used in direct mode).
+
+    Args:
+        client: TaskAppClient instance for making HTTP requests.
+        config: Evaluation configuration.
+        seed: Seed/index to evaluate.
+        semaphore: Semaphore for concurrency control.
+
+    Returns:
+        EvalResult with score, metrics, tokens, cost, and optional trace.
+
+    Note:
+        This function is only used in direct mode. Backend mode uses the
+        backend job service which handles rollouts internally.
+    """
+    async with semaphore:
+        start = time.perf_counter()
+        try:
+            request = _build_rollout_request(config, seed)
+            response = await client.rollout(request)
+            latency_ms = (time.perf_counter() - start) * 1000.0
+
+            metrics = response.metrics
+            reward_mean = metrics.reward_mean
+            outcome_score = metrics.outcome_score
+            events_score = metrics.events_score
+
+            score = outcome_score if outcome_score is not None else reward_mean
+            verifier_score = None
+            tokens = None
+            cost_usd = None
+
+            if isinstance(metrics.details, dict):
+                verifier_score = metrics.details.get("verifier_score")
+                tokens = metrics.details.get("tokens")
+                cost_usd = metrics.details.get("cost_usd")
+
+            # Extract trace if return_trace was requested
+            trace = response.trace if config.return_trace else None
+
+            # Count tokens from trace or trajectories if not in metrics
+            if tokens is None:
+                if trace:
+                    tokens = _count_tokens_from_trace(trace)
+                else:
+                    trajectories = getattr(response, "trajectories", None)
+                    if trajectories:
+                        tokens = _count_tokens_from_trajectories(trajectories)
+                if tokens == 0:
+                    tokens = None
+
+            return EvalResult(
+                seed=seed,
+                score=score,
+                reward_mean=reward_mean,
+                outcome_score=outcome_score,
+                events_score=events_score,
+                latency_ms=latency_ms,
+                verifier_score=verifier_score,
+                tokens=tokens,
+                cost_usd=cost_usd,
+                error=None,
+                trace=trace,
+            )
+        except Exception as exc:
+            latency_ms = (time.perf_counter() - start) * 1000.0
+            return EvalResult(
+                seed=seed,
+                score=None,
+                reward_mean=None,
+                outcome_score=None,
+                events_score=None,
+                latency_ms=latency_ms,
+                verifier_score=None,
+                tokens=None,
+                cost_usd=None,
+                error=str(exc),
+                trace=None,
+            )
+
+
+async def run_eval(config: EvalRunConfig) -> list[EvalResult]:
+    """Run evaluation against a task app.
+
+    Automatically selects execution mode based on configuration:
+    - **Backend mode**: Used if `backend_url` and `backend_api_key` are provided
+      (or SYNTH_BASE_URL/SYNTH_API_KEY env vars are set)
+    - **Direct mode**: Used otherwise (calls task app directly)
+
+    Args:
+        config: Evaluation configuration including task app URL, seeds, policy config, etc.
+
+    Returns:
+        List of EvalResult objects, one per seed, sorted by seed number.
+
+    Raises:
+        ValueError: If required configuration is missing (task_app_url, seeds, etc.)
+        RuntimeError: If backend job creation or polling fails
+
+    Example:
+        ```python
+        config = EvalRunConfig(
+            app_id="banking77",
+            task_app_url="http://localhost:8103",
+            backend_url="http://localhost:8000",  # Enables backend mode
+            backend_api_key="sk-...",
+            env_name="banking77",
+            seeds=[0, 1, 2],
+            policy_config={"model": "gpt-4"},
+        )
+        results = await run_eval(config)
+        ```
+
+    See Also:
+        - `run_eval_direct()`: Direct mode implementation
+        - `run_eval_via_backend()`: Backend mode implementation
+    """
+    backend_url = config.backend_url or os.getenv("SYNTH_BASE_URL") or os.getenv("BACKEND_OVERRIDE")
+    api_key = config.backend_api_key or os.getenv("SYNTH_API_KEY")
+
+    # Use backend mode if we have both backend URL and API key
+    if backend_url and api_key:
+        return await run_eval_via_backend(config, backend_url, api_key)
+
+    # Fall back to direct mode
+    return await run_eval_direct(config)
+
+
+async def run_eval_direct(config: EvalRunConfig) -> list[EvalResult]:
+    """Direct mode: Call task apps directly without backend.
+
+    Makes direct HTTP requests to the task app's `/rollout` endpoint.
+    This mode does NOT capture traces or track token usage via the backend interceptor.
+
+    **Use Cases:**
+    - Quick local testing without backend setup
+    - Legacy workflows that don't need trace capture
+    - Simple evaluations without cost tracking
+
+    **Limitations:**
+    - No trace capture (traces must be returned by task app if needed)
+    - No token cost calculation (unless task app provides it)
+    - No backend interceptor for LLM call tracking
+
+    Args:
+        config: Evaluation configuration. Must include `task_app_url` and `seeds`.
+
+    Returns:
+        List of EvalResult objects, one per seed.
+
+    Raises:
+        ValueError: If `task_app_url` or `seeds` are missing.
+
+    Example:
+        ```python
+        config = EvalRunConfig(
+            app_id="banking77",
+            task_app_url="http://localhost:8103",
+            env_name="banking77",
+            seeds=[0, 1, 2],
+            policy_config={"model": "gpt-4"},
+        )
+        results = await run_eval_direct(config)
+        ```
+    """
+    if not config.task_app_url:
+        raise ValueError("task_app_url is required for eval runs")
+    if not config.seeds:
+        raise ValueError("No seeds provided for evaluation")
+
+    api_key = config.task_app_api_key or os.getenv("ENVIRONMENT_API_KEY")
+    semaphore = asyncio.Semaphore(max(1, int(config.concurrency or 1)))
+
+    async with TaskAppClient(base_url=config.task_app_url, api_key=api_key) as client:
+        tasks = [
+            _eval_seed(client, config, seed, semaphore)
+            for seed in config.seeds
+        ]
+        results = await asyncio.gather(*tasks)
+
+    results.sort(key=lambda item: item.seed)
+    return results
+
+
+async def run_eval_via_backend(
+    config: EvalRunConfig,
+    backend_url: str,
+    api_key: str,
+) -> list[EvalResult]:
+    """Backend mode: Route through backend interceptor for trace/usage capture.
+
+    This mode creates an eval job on the backend, which:
+    1. Routes LLM calls through the inference interceptor
+    2. Captures traces and token usage automatically
+    3. Calculates costs based on model pricing
+    4. Provides detailed results with timing and metrics
+
+    **Flow:**
+    1. POST `/api/eval/jobs` - Create eval job
+    2. Poll GET `/api/eval/jobs/{job_id}` - Check job status until completed
+    3. GET `/api/eval/jobs/{job_id}/results` - Fetch detailed results
+
+    **Benefits:**
+    - Automatic trace capture via interceptor
+    - Token usage tracking and cost calculation
+    - Centralized job management and monitoring
+    - Support for async job execution
+
+    Args:
+        config: Evaluation configuration including task app URL, seeds, policy config.
+        backend_url: Backend API base URL (e.g., "http://localhost:8000")
+        api_key: Backend API key for authentication (Bearer token)
+
+    Returns:
+        List of EvalResult objects with detailed metrics including tokens, costs, traces.
+
+    Raises:
+        ValueError: If required configuration is missing.
+        RuntimeError: If job creation, polling, or result fetching fails.
+
+    Example:
+        ```python
+        config = EvalRunConfig(
+            app_id="banking77",
+            task_app_url="http://localhost:8103",
+            env_name="banking77",
+            seeds=[0, 1, 2],
+            policy_config={"model": "gpt-4"},
+        )
+        results = await run_eval_via_backend(
+            config,
+            backend_url="http://localhost:8000",
+            api_key="sk-...",
+        )
+        ```
+
+    See Also:
+        - `monorepo/backend/app/routes/eval/job_service.py`: Backend job service implementation
+        - `monorepo/backend/app/routes/eval/routes.py`: Backend API routes
+    """
+    if not config.task_app_url:
+        raise ValueError("task_app_url is required for eval runs")
+    if not config.seeds:
+        raise ValueError("No seeds provided for evaluation")
+
+    base = backend_url.rstrip("/")
+    if not base.endswith("/api"):
+        base = f"{base}/api"
+
+    headers = {"Authorization": f"Bearer {api_key}"}
+
+    # Build policy config for backend
+    policy = dict(config.policy_config or {})
+    policy["policy_name"] = config.policy_name
+
+    # Create eval job request
+    job_request = {
+        "task_app_url": config.task_app_url,
+        "task_app_api_key": config.task_app_api_key or os.getenv("ENVIRONMENT_API_KEY"),
+        "app_id": config.app_id,
+        "env_name": config.env_name,
+        "seeds": list(config.seeds),
+        "policy": policy,
+        "env_config": config.env_config,
+        "mode": config.mode.value if hasattr(config.mode, "value") else str(config.mode or "eval"),
+        "max_concurrent": config.concurrency,
+        "timeout": config.timeout,
+    }
+
+    async with httpx.AsyncClient(timeout=httpx.Timeout(30.0)) as client:
+        # 1. Create the eval job
+        print(f"[eval] Creating eval job via backend: {base}/eval/jobs", flush=True)
+        resp = await client.post(f"{base}/eval/jobs", json=job_request, headers=headers)
+
+        if resp.status_code not in (200, 201):
+            raise RuntimeError(f"Failed to create eval job: {resp.status_code} {resp.text}")
+
+        job_data = resp.json()
+        job_id = job_data.get("job_id")
+        if not job_id:
+            raise RuntimeError(f"No job_id in response: {job_data}")
+
+        print(f"[eval] Job created: {job_id}", flush=True)
+
+        # 2. Poll for job completion
+        for attempt in range(_MAX_POLL_ATTEMPTS):
+            await asyncio.sleep(_POLL_INTERVAL_S)
+
+            status_resp = await client.get(f"{base}/eval/jobs/{job_id}", headers=headers)
+            if status_resp.status_code != 200:
+                print(f"[eval] Warning: status check failed: {status_resp.status_code}", flush=True)
+                continue
+
+            status_data = status_resp.json()
+            status = status_data.get("status", "")
+
+            if status in ("completed", "failed"):
+                break
+
+            if attempt % 10 == 0:
+                print(f"[eval] Job {job_id} status: {status} (attempt {attempt})", flush=True)
+        else:
+            raise RuntimeError(f"Eval job {job_id} timed out after {_MAX_POLL_ATTEMPTS * _POLL_INTERVAL_S}s")
+
+        if status == "failed":
+            error = status_data.get("error", "Unknown error")
+            raise RuntimeError(f"Eval job {job_id} failed: {error}")
+
+        # 3. Get detailed results
+        results_resp = await client.get(f"{base}/eval/jobs/{job_id}/results", headers=headers)
+        if results_resp.status_code != 200:
+            raise RuntimeError(f"Failed to get results: {results_resp.status_code} {results_resp.text}")
+
+        results_data = results_resp.json()
+        result_rows = results_data.get("results", [])
+
+    # Convert to EvalResult objects
+    results: list[EvalResult] = []
+    for row in result_rows:
+        results.append(EvalResult(
+            seed=int(row.get("seed", 0)),
+            score=row.get("score"),
+            reward_mean=row.get("reward_mean"),
+            outcome_score=row.get("outcome_score"),
+            events_score=row.get("events_score"),
+            latency_ms=row.get("latency_ms"),
+            verifier_score=row.get("verifier_score"),
+            tokens=row.get("tokens"),
+            cost_usd=row.get("cost_usd"),
+            error=row.get("error"),
+            trace=None,  # Traces fetched separately if needed
+        ))
+
+    results.sort(key=lambda item: item.seed)
+
+    # Print summary from backend
+    summary = results_data.get("summary", {})
+    if summary:
+        print(f"[eval] Backend summary: {summary}", flush=True)
+
+    return results
+
+
+async def fetch_traces_from_backend(
+    job_id: str,
+    backend_url: str,
+    api_key: str,
+    output_dir: str,
+) -> str:
+    """Download traces zip from backend and extract to output_dir.
+
+    Returns path to the extracted traces directory.
+    """
+    import zipfile
+    import io
+    from pathlib import Path
+
+    base = backend_url.rstrip("/")
+    if not base.endswith("/api"):
+        base = f"{base}/api"
+
+    headers = {"Authorization": f"Bearer {api_key}"}
+
+    async with httpx.AsyncClient(timeout=httpx.Timeout(60.0)) as client:
+        resp = await client.get(f"{base}/eval/jobs/{job_id}/traces", headers=headers)
+
+        if resp.status_code != 200:
+            raise RuntimeError(f"Failed to download traces: {resp.status_code} {resp.text}")
+
+    # Extract zip contents
+    path = Path(output_dir)
+    path.mkdir(parents=True, exist_ok=True)
+
+    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
+        zf.extractall(path)
+
+    return str(path)
+
+
+def format_eval_table(results: list[EvalResult]) -> str:
+    headers = [
+        "seed",
+        "score",
+        "reward_mean",
+        "outcome",
+        "events",
+        "latency_ms",
+        "verifier",
+        "tokens",
+        "cost_usd",
+        "error",
+    ]
+
+    def _fmt(value: Any) -> str:
+        if value is None:
+            return "-"
+        if isinstance(value, float):
+            return f"{value:.4f}".rstrip("0").rstrip(".")
+        return str(value)
+
+    rows = [
+        [
+            r.seed,
+            _fmt(r.score),
+            _fmt(r.reward_mean),
+            _fmt(r.outcome_score),
+            _fmt(r.events_score),
+            _fmt(r.latency_ms),
+            _fmt(r.verifier_score),
+            _fmt(r.tokens),
+            _fmt(r.cost_usd),
+            r.error or "-",
+        ]
+        for r in results
+    ]
+
+    def _avg(values: list[float | int]) -> float | None:
+        return sum(values) / len(values) if values else None
+
+    scores = [r.score for r in results if isinstance(r.score, (int, float))]
+    reward_means = [r.reward_mean for r in results if isinstance(r.reward_mean, (int, float))]
+    outcomes = [r.outcome_score for r in results if isinstance(r.outcome_score, (int, float))]
+    events = [r.events_score for r in results if isinstance(r.events_score, (int, float))]
+    latencies = [r.latency_ms for r in results if isinstance(r.latency_ms, (int, float))]
+    verifier_scores = [r.verifier_score for r in results if isinstance(r.verifier_score, (int, float))]
+    tokens = [r.tokens for r in results if isinstance(r.tokens, int)]
+    costs = [r.cost_usd for r in results if isinstance(r.cost_usd, (int, float))]
+
+    rows.append(
+        [
+            "avg",
+            _fmt(_avg(scores)),
+            _fmt(_avg(reward_means)),
+            _fmt(_avg(outcomes)),
+            _fmt(_avg(events)),
+            _fmt(_avg(latencies)),
+            _fmt(_avg(verifier_scores)),
+            _fmt(int(sum(tokens) / len(tokens)) if tokens else None),
+            _fmt(_avg(costs)),
+            "-",
+        ]
+    )
+
+    widths = [len(h) for h in headers]
+    for row in rows:
+        for idx, cell in enumerate(row):
+            widths[idx] = max(widths[idx], len(str(cell)))
+
+    def _render_row(row: list[Any]) -> str:
+        return " | ".join(str(cell).ljust(widths[idx]) for idx, cell in enumerate(row))
+
+    sep = "-+-".join("-" * width for width in widths)
+    lines = [_render_row(headers), sep]
+    lines.extend(_render_row(row) for row in rows)
+    return "\n".join(lines)
+
+
+def format_eval_report(config: EvalRunConfig, results: list[EvalResult]) -> str:
+    payload = {
+        "app_id": config.app_id,
+        "task_app_url": config.task_app_url,
+        "env_name": config.env_name,
+        "policy_name": config.policy_name,
+        "policy_config": config.policy_config,
+        "seeds": config.seeds,
+        "concurrency": config.concurrency,
+    }
+    header = json.dumps(payload, indent=2, default=str)
+    table = format_eval_table(results)
+    return f"Eval config\n{header}\n\nResults\n{table}\n"
+
+
+def save_traces(results: list[EvalResult], traces_dir: str) -> int:
+    """Save traces to individual JSON files in the given directory.
+
+    Returns the number of traces saved.
+    """
+    from pathlib import Path
+
+    path = Path(traces_dir)
+    path.mkdir(parents=True, exist_ok=True)
+
+    saved = 0
+    for result in results:
+        if result.trace is not None:
+            trace_file = path / f"seed_{result.seed}_trace.json"
+            trace_file.write_text(json.dumps(result.trace, indent=2, default=str))
+            saved += 1
+
+    return saved
+
+
+__all__ = [
+    "run_eval",
+    "run_eval_direct",
+    "run_eval_via_backend",
+    "fetch_traces_from_backend",
+    "format_eval_table",
+    "format_eval_report",
+    "save_traces",
+    "EvalResult",
+]
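
For readers skimming the new module, the following is a minimal sketch of how its public helpers might be chained together in direct mode. It mirrors the fields used in the module docstring's own example; the `return_trace=True` argument is an assumption (the runner reads `config.return_trace`, but the `EvalRunConfig` constructor is defined in the separate config module not shown here).

```python
import asyncio

from synth_ai.cli.commands.eval.config import EvalRunConfig
from synth_ai.cli.commands.eval.runner import format_eval_report, run_eval, save_traces


async def main() -> None:
    # Fields mirror the module docstring example; return_trace is assumed to be
    # an EvalRunConfig field because the runner reads config.return_trace.
    config = EvalRunConfig(
        app_id="banking77",
        task_app_url="http://localhost:8103",
        env_name="banking77",
        seeds=[0, 1, 2],
        policy_config={"model": "gpt-4"},
        return_trace=True,
    )
    # Direct mode unless SYNTH_BASE_URL and SYNTH_API_KEY are set, in which case
    # run_eval routes the job through the backend instead.
    results = await run_eval(config)
    # Config summary plus the per-seed table with an "avg" row appended.
    print(format_eval_report(config, results))
    # Writes seed_<n>_trace.json for every result that carried a trace.
    saved = save_traces(results, "traces")
    print(f"saved {saved} trace file(s)")


asyncio.run(main())
```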