synth-ai 0.4.1__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (153) hide show
  1. synth_ai/__init__.py +13 -13
  2. synth_ai/cli/__init__.py +6 -15
  3. synth_ai/cli/commands/eval/__init__.py +6 -15
  4. synth_ai/cli/commands/eval/config.py +338 -0
  5. synth_ai/cli/commands/eval/core.py +236 -1091
  6. synth_ai/cli/commands/eval/runner.py +704 -0
  7. synth_ai/cli/commands/eval/validation.py +44 -117
  8. synth_ai/cli/commands/filter/core.py +7 -7
  9. synth_ai/cli/commands/filter/validation.py +2 -2
  10. synth_ai/cli/commands/smoke/core.py +7 -17
  11. synth_ai/cli/commands/status/__init__.py +1 -64
  12. synth_ai/cli/commands/status/client.py +50 -151
  13. synth_ai/cli/commands/status/config.py +3 -83
  14. synth_ai/cli/commands/status/errors.py +4 -13
  15. synth_ai/cli/commands/status/subcommands/__init__.py +2 -8
  16. synth_ai/cli/commands/status/subcommands/config.py +13 -0
  17. synth_ai/cli/commands/status/subcommands/files.py +18 -63
  18. synth_ai/cli/commands/status/subcommands/jobs.py +28 -311
  19. synth_ai/cli/commands/status/subcommands/models.py +18 -62
  20. synth_ai/cli/commands/status/subcommands/runs.py +16 -63
  21. synth_ai/cli/commands/status/subcommands/session.py +67 -172
  22. synth_ai/cli/commands/status/subcommands/summary.py +24 -32
  23. synth_ai/cli/commands/status/subcommands/utils.py +41 -0
  24. synth_ai/cli/commands/status/utils.py +16 -107
  25. synth_ai/cli/commands/train/__init__.py +18 -20
  26. synth_ai/cli/commands/train/errors.py +3 -3
  27. synth_ai/cli/commands/train/prompt_learning_validation.py +15 -16
  28. synth_ai/cli/commands/train/validation.py +7 -7
  29. synth_ai/cli/commands/train/{judge_schemas.py → verifier_schemas.py} +33 -34
  30. synth_ai/cli/commands/train/verifier_validation.py +235 -0
  31. synth_ai/cli/demo_apps/demo_task_apps/math/config.toml +0 -1
  32. synth_ai/cli/demo_apps/demo_task_apps/math/modal_task_app.py +2 -6
  33. synth_ai/cli/demo_apps/math/config.toml +0 -1
  34. synth_ai/cli/demo_apps/math/modal_task_app.py +2 -6
  35. synth_ai/cli/demo_apps/mipro/task_app.py +25 -47
  36. synth_ai/cli/lib/apps/task_app.py +12 -13
  37. synth_ai/cli/lib/task_app_discovery.py +6 -6
  38. synth_ai/cli/lib/train_cfgs.py +10 -10
  39. synth_ai/cli/task_apps/__init__.py +11 -0
  40. synth_ai/cli/task_apps/commands.py +7 -15
  41. synth_ai/core/env.py +12 -1
  42. synth_ai/core/errors.py +1 -2
  43. synth_ai/core/integrations/cloudflare.py +209 -33
  44. synth_ai/core/tracing_v3/abstractions.py +46 -0
  45. synth_ai/data/__init__.py +3 -30
  46. synth_ai/data/enums.py +1 -20
  47. synth_ai/data/rewards.py +100 -3
  48. synth_ai/products/graph_evolve/__init__.py +1 -2
  49. synth_ai/products/graph_evolve/config.py +16 -16
  50. synth_ai/products/graph_evolve/converters/__init__.py +3 -3
  51. synth_ai/products/graph_evolve/converters/openai_sft.py +7 -7
  52. synth_ai/products/graph_evolve/examples/hotpotqa/config.toml +1 -1
  53. synth_ai/products/graph_gepa/__init__.py +23 -0
  54. synth_ai/products/graph_gepa/converters/__init__.py +19 -0
  55. synth_ai/products/graph_gepa/converters/openai_sft.py +29 -0
  56. synth_ai/sdk/__init__.py +45 -35
  57. synth_ai/sdk/api/eval/__init__.py +33 -0
  58. synth_ai/sdk/api/eval/job.py +732 -0
  59. synth_ai/sdk/api/research_agent/__init__.py +276 -66
  60. synth_ai/sdk/api/train/builders.py +181 -0
  61. synth_ai/sdk/api/train/cli.py +41 -33
  62. synth_ai/sdk/api/train/configs/__init__.py +6 -4
  63. synth_ai/sdk/api/train/configs/prompt_learning.py +127 -33
  64. synth_ai/sdk/api/train/configs/rl.py +264 -16
  65. synth_ai/sdk/api/train/configs/sft.py +165 -1
  66. synth_ai/sdk/api/train/graph_validators.py +12 -12
  67. synth_ai/sdk/api/train/graphgen.py +169 -51
  68. synth_ai/sdk/api/train/graphgen_models.py +95 -45
  69. synth_ai/sdk/api/train/local_api.py +10 -0
  70. synth_ai/sdk/api/train/pollers.py +36 -0
  71. synth_ai/sdk/api/train/prompt_learning.py +390 -60
  72. synth_ai/sdk/api/train/rl.py +41 -5
  73. synth_ai/sdk/api/train/sft.py +2 -0
  74. synth_ai/sdk/api/train/task_app.py +20 -0
  75. synth_ai/sdk/api/train/validators.py +17 -17
  76. synth_ai/sdk/graphs/completions.py +239 -33
  77. synth_ai/sdk/{judging/schemas.py → graphs/verifier_schemas.py} +23 -23
  78. synth_ai/sdk/learning/__init__.py +35 -5
  79. synth_ai/sdk/learning/context_learning_client.py +531 -0
  80. synth_ai/sdk/learning/context_learning_types.py +294 -0
  81. synth_ai/sdk/learning/prompt_learning_client.py +1 -1
  82. synth_ai/sdk/learning/prompt_learning_types.py +2 -1
  83. synth_ai/sdk/learning/rl/__init__.py +0 -4
  84. synth_ai/sdk/learning/rl/contracts.py +0 -4
  85. synth_ai/sdk/localapi/__init__.py +40 -0
  86. synth_ai/sdk/localapi/apps/__init__.py +28 -0
  87. synth_ai/sdk/localapi/client.py +10 -0
  88. synth_ai/sdk/localapi/contracts.py +10 -0
  89. synth_ai/sdk/localapi/helpers.py +519 -0
  90. synth_ai/sdk/localapi/rollouts.py +93 -0
  91. synth_ai/sdk/localapi/server.py +29 -0
  92. synth_ai/sdk/localapi/template.py +49 -0
  93. synth_ai/sdk/streaming/handlers.py +6 -6
  94. synth_ai/sdk/streaming/streamer.py +10 -6
  95. synth_ai/sdk/task/__init__.py +18 -5
  96. synth_ai/sdk/task/apps/__init__.py +37 -1
  97. synth_ai/sdk/task/client.py +9 -1
  98. synth_ai/sdk/task/config.py +6 -11
  99. synth_ai/sdk/task/contracts.py +137 -95
  100. synth_ai/sdk/task/in_process.py +32 -22
  101. synth_ai/sdk/task/in_process_runner.py +9 -4
  102. synth_ai/sdk/task/rubrics/__init__.py +2 -3
  103. synth_ai/sdk/task/rubrics/loaders.py +4 -4
  104. synth_ai/sdk/task/rubrics/strict.py +3 -4
  105. synth_ai/sdk/task/server.py +76 -16
  106. synth_ai/sdk/task/trace_correlation_helpers.py +190 -139
  107. synth_ai/sdk/task/validators.py +34 -49
  108. synth_ai/sdk/training/__init__.py +7 -16
  109. synth_ai/sdk/tunnels/__init__.py +118 -0
  110. synth_ai/sdk/tunnels/cleanup.py +83 -0
  111. synth_ai/sdk/tunnels/ports.py +120 -0
  112. synth_ai/sdk/tunnels/tunneled_api.py +363 -0
  113. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/METADATA +71 -4
  114. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/RECORD +118 -128
  115. synth_ai/cli/commands/baseline/__init__.py +0 -12
  116. synth_ai/cli/commands/baseline/core.py +0 -636
  117. synth_ai/cli/commands/baseline/list.py +0 -94
  118. synth_ai/cli/commands/eval/errors.py +0 -81
  119. synth_ai/cli/commands/status/formatters.py +0 -164
  120. synth_ai/cli/commands/status/subcommands/pricing.py +0 -23
  121. synth_ai/cli/commands/status/subcommands/usage.py +0 -203
  122. synth_ai/cli/commands/train/judge_validation.py +0 -305
  123. synth_ai/cli/usage.py +0 -159
  124. synth_ai/data/specs.py +0 -36
  125. synth_ai/sdk/api/research_agent/cli.py +0 -428
  126. synth_ai/sdk/api/research_agent/config.py +0 -357
  127. synth_ai/sdk/api/research_agent/job.py +0 -717
  128. synth_ai/sdk/baseline/__init__.py +0 -25
  129. synth_ai/sdk/baseline/config.py +0 -209
  130. synth_ai/sdk/baseline/discovery.py +0 -216
  131. synth_ai/sdk/baseline/execution.py +0 -154
  132. synth_ai/sdk/judging/__init__.py +0 -15
  133. synth_ai/sdk/judging/base.py +0 -24
  134. synth_ai/sdk/judging/client.py +0 -191
  135. synth_ai/sdk/judging/types.py +0 -42
  136. synth_ai/sdk/research_agent/__init__.py +0 -34
  137. synth_ai/sdk/research_agent/container_builder.py +0 -328
  138. synth_ai/sdk/research_agent/container_spec.py +0 -198
  139. synth_ai/sdk/research_agent/defaults.py +0 -34
  140. synth_ai/sdk/research_agent/results_collector.py +0 -69
  141. synth_ai/sdk/specs/__init__.py +0 -46
  142. synth_ai/sdk/specs/dataclasses.py +0 -149
  143. synth_ai/sdk/specs/loader.py +0 -144
  144. synth_ai/sdk/specs/serializer.py +0 -199
  145. synth_ai/sdk/specs/validation.py +0 -250
  146. synth_ai/sdk/tracing/__init__.py +0 -39
  147. synth_ai/sdk/usage/__init__.py +0 -37
  148. synth_ai/sdk/usage/client.py +0 -171
  149. synth_ai/sdk/usage/models.py +0 -261
  150. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/WHEEL +0 -0
  151. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/entry_points.txt +0 -0
  152. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/licenses/LICENSE +0 -0
  153. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,732 @@
1
+ """First-class SDK API for evaluation jobs.
2
+
3
+ This module provides high-level abstractions for running evaluation jobs
4
+ that route through the backend for trace capture and cost tracking.
5
+
6
+ Example:
7
+ from synth_ai.sdk.api.eval import EvalJob, EvalResult
8
+
9
+ job = EvalJob(config)
10
+ job.submit()
11
+
12
+ # progress=True provides built-in status printing:
13
+ # [00:05] running | 3/10 completed
14
+ # [00:10] running | 7/10 completed
15
+ # [00:15] completed | mean_score: 0.85
16
+ result = job.poll_until_complete(progress=True)
17
+
18
+ # Typed result access (not raw dict)
19
+ if result.succeeded:
20
+ print(f"Mean score: {result.mean_score}")
21
+ print(f"Total cost: ${result.total_cost_usd:.4f}")
22
+ for seed_result in result.seed_results:
23
+ print(f" Seed {seed_result['seed']}: {seed_result['score']}")
24
+ elif result.failed:
25
+ print(f"Error: {result.error}")
26
+
27
+ See Also:
28
+ - `synth_ai.cli.commands.eval`: CLI implementation
29
+ - `synth_ai.sdk.api.train.prompt_learning`: Similar pattern for training
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import asyncio
35
+ import os
36
+ import time
37
+ from dataclasses import dataclass, field
38
+ from enum import Enum
39
+ from pathlib import Path
40
+ from typing import Any, Callable, Dict, List, Optional
41
+
42
+ import httpx
43
+
44
+ from synth_ai.core.telemetry import log_info
45
+
46
+
47
class EvalStatus(str, Enum):
    """Lifecycle states reported by the backend for an evaluation job."""

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"

    @classmethod
    def from_string(cls, status: str) -> "EvalStatus":
        """Parse *status* case-insensitively; unknown values map to PENDING."""
        try:
            return cls(status.lower())
        except ValueError:
            return cls.PENDING

    @property
    def is_terminal(self) -> bool:
        """Whether this status is terminal (job won't change further)."""
        return self in (EvalStatus.COMPLETED, EvalStatus.FAILED, EvalStatus.CANCELLED)

    @property
    def is_success(self) -> bool:
        """Whether this status indicates success."""
        return self == EvalStatus.COMPLETED


@dataclass
class EvalResult:
    """Typed result from an evaluation job.

    Provides clean accessors for common fields instead of raw dict access.

    Example:
        >>> result = job.poll_until_complete(progress=True)
        >>> if result.succeeded:
        ...     print(f"Mean score: {result.mean_score:.2%}")
        ...     print(f"Total cost: ${result.total_cost_usd:.4f}")
        >>> else:
        ...     print(f"Failed: {result.error}")
    """

    job_id: str
    status: EvalStatus
    mean_score: Optional[float] = None
    total_tokens: Optional[int] = None
    total_cost_usd: Optional[float] = None
    num_completed: int = 0
    num_total: int = 0
    seed_results: List[Dict[str, Any]] = field(default_factory=list)
    error: Optional[str] = None
    # Full backend payload, kept for fields without a typed accessor.
    raw: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_response(cls, job_id: str, data: Dict[str, Any]) -> "EvalResult":
        """Create a result from an API response dict.

        Summary metrics may live either in a nested ``summary`` dict or inline
        on the top-level payload; the summary value wins when present.
        """
        status_str = data.get("status", "pending")
        status = EvalStatus.from_string(status_str)

        # ``or {}`` also covers an explicit ``"summary": None`` in the payload.
        summary = data.get("summary") or {}
        results_info = data.get("results", {})

        def _metric(key: str) -> Any:
            # Explicit None checks: ``summary.get(key) or data.get(key)`` would
            # discard a legitimate falsy metric (mean score 0.0, 0 tokens).
            value = summary.get(key)
            return value if value is not None else data.get(key)

        mean_score = _metric("mean_score")
        total_tokens = _metric("total_tokens")
        total_cost_usd = _metric("total_cost_usd")

        # Completion progress is only available when "results" is a dict.
        num_completed = results_info.get("completed", 0) if isinstance(results_info, dict) else 0
        num_total = results_info.get("total", 0) if isinstance(results_info, dict) else 0

        # Per-seed results: either a top-level list or nested under "items".
        seed_results = data.get("results", [])
        if isinstance(seed_results, dict):
            seed_results = seed_results.get("items", [])

        return cls(
            job_id=job_id,
            status=status,
            mean_score=mean_score,
            total_tokens=total_tokens,
            total_cost_usd=total_cost_usd,
            num_completed=num_completed,
            num_total=num_total,
            seed_results=list(seed_results) if isinstance(seed_results, list) else [],
            error=data.get("error"),
            raw=data,
        )

    @property
    def succeeded(self) -> bool:
        """Whether the job completed successfully."""
        return self.status.is_success

    @property
    def failed(self) -> bool:
        """Whether the job failed."""
        return self.status == EvalStatus.FAILED

    @property
    def is_terminal(self) -> bool:
        """Whether the job has reached a terminal state."""
        return self.status.is_terminal
152
+
153
+
154
@dataclass
class EvalJobConfig:
    """Configuration for an evaluation job.

    Holds everything needed to submit and run an evaluation job via the
    backend. Validation happens in ``__post_init__``.

    Attributes:
        task_app_url: URL of the task app to evaluate (e.g., "http://localhost:8103").
            Required for job submission. Alias: local_api_url
        backend_url: Base URL of the Synth API backend (e.g., "https://api.usesynth.ai").
            Can also be set via SYNTH_BASE_URL or BACKEND_BASE_URL environment variables.
        api_key: Synth API key for authentication with the backend.
            Can also be set via SYNTH_API_KEY environment variable.
        task_app_api_key: API key for authenticating with the task app.
            Defaults to ENVIRONMENT_API_KEY env var if not provided. Alias: local_api_key
        app_id: Task app identifier (optional, for logging/tracking).
        env_name: Environment name within the task app.
        seeds: List of seeds/indices to evaluate.
        policy_config: Model and provider configuration for the policy.
        env_config: Additional environment configuration.
        concurrency: Maximum number of parallel rollouts (default: 5).
        timeout: Maximum seconds per rollout (default: 600.0).

    Example:
        >>> config = EvalJobConfig(
        ...     task_app_url="http://localhost:8103",
        ...     backend_url="https://api.usesynth.ai",
        ...     api_key="sk_live_...",
        ...     env_name="banking77",
        ...     seeds=[0, 1, 2, 3, 4],
        ...     policy_config={"model": "gpt-4", "provider": "openai"},
        ... )
    """

    task_app_url: str = field(default="")
    backend_url: str = field(default="")
    api_key: str = field(default="")
    task_app_api_key: Optional[str] = None
    app_id: Optional[str] = None
    env_name: Optional[str] = None
    seeds: List[int] = field(default_factory=list)
    policy_config: Dict[str, Any] = field(default_factory=dict)
    env_config: Dict[str, Any] = field(default_factory=dict)
    concurrency: int = 5
    timeout: float = 600.0
    # Aliases for backwards compatibility (not stored, just used in __init__)
    local_api_url: str = field(default="", repr=False)
    local_api_key: Optional[str] = field(default=None, repr=False)

    def __post_init__(self) -> None:
        """Resolve legacy aliases, validate required fields, apply env fallback."""
        # Backwards-compatible aliases: local_api_* fill the canonical
        # task_app_* fields only when those were left empty.
        if self.local_api_url and not self.task_app_url:
            self.task_app_url = self.local_api_url
        if self.local_api_key and not self.task_app_api_key:
            self.task_app_api_key = self.local_api_key

        # Required-field guard table; checked in declaration order so the
        # first missing field wins.
        checks = (
            (self.task_app_url, "task_app_url (or local_api_url) is required"),
            (self.backend_url, "backend_url is required"),
            (self.api_key, "api_key is required"),
            (self.seeds, "seeds list is required and cannot be empty"),
        )
        for value, message in checks:
            if not value:
                raise ValueError(message)

        # Last-resort fallback: task app key from the environment.
        if not self.task_app_api_key:
            self.task_app_api_key = os.environ.get("ENVIRONMENT_API_KEY")
224
+
225
+
226
class EvalJob:
    """High-level SDK class for running evaluation jobs via the backend.

    This class provides a clean API for:
    1. Submitting evaluation jobs to the backend
    2. Polling job status until completion
    3. Retrieving detailed results with metrics, tokens, and costs
    4. Downloading traces for analysis

    The backend routes LLM calls through the inference interceptor, which:
    - Captures traces automatically
    - Tracks token usage
    - Calculates costs based on model pricing

    Example:
        >>> from synth_ai.sdk.api.eval import EvalJob
        >>>
        >>> # Create job from config file
        >>> job = EvalJob.from_config(
        ...     config_path="banking77_eval.toml",
        ...     backend_url="https://api.usesynth.ai",
        ...     api_key=os.environ["SYNTH_API_KEY"],
        ... )
        >>>
        >>> # Submit job
        >>> job_id = job.submit()
        >>> print(f"Job submitted: {job_id}")
        >>>
        >>> # Poll until complete (returns a typed EvalResult, not a dict)
        >>> result = job.poll_until_complete(timeout=1200.0)
        >>> print(f"Mean score: {result.mean_score}")
        >>>
        >>> # Download traces
        >>> job.download_traces("./traces")

    See Also:
        - `PromptLearningJob`: Similar pattern for prompt learning jobs
        - Backend API: POST /api/eval/jobs, GET /api/eval/jobs/{job_id}
    """

    # Default poll settings.
    # NOTE(review): neither constant is read by poll_until_complete, which
    # takes explicit ``timeout``/``interval`` arguments — confirm whether
    # these are dead or used by external callers.
    _POLL_INTERVAL_S = 2.0
    _MAX_POLL_ATTEMPTS = 600  # 20 minutes max

    def __init__(
        self,
        config: EvalJobConfig,
        job_id: Optional[str] = None,
    ) -> None:
        """Initialize an evaluation job.

        Args:
            config: Job configuration with task app URL, seeds, policy, etc.
            job_id: Existing job ID (if resuming a previous job)
        """
        self.config = config
        # None until submit() succeeds (or a job_id was supplied for resume).
        self._job_id = job_id

    @classmethod
    def from_config(
        cls,
        config_path: str | Path,
        backend_url: Optional[str] = None,
        api_key: Optional[str] = None,
        task_app_api_key: Optional[str] = None,
        task_app_url: Optional[str] = None,
        seeds: Optional[List[int]] = None,
    ) -> EvalJob:
        """Create a job from a TOML config file.

        Loads evaluation configuration from a TOML file and allows
        overriding specific values via arguments. Supports both an ``[eval]``
        table and, as a fallback, a ``[prompt_learning]`` table whose GEPA
        sub-tables are mapped onto the eval fields.

        Args:
            config_path: Path to TOML config file
            backend_url: Backend API URL (defaults to SYNTH_BASE_URL /
                BACKEND_BASE_URL env vars, then the production URL)
            api_key: API key (defaults to SYNTH_API_KEY env var)
            task_app_api_key: Task app API key (defaults to ENVIRONMENT_API_KEY)
            task_app_url: Override task app URL from config
            seeds: Override seeds list from config

        Returns:
            EvalJob instance ready for submission

        Raises:
            ValueError: If required config is missing
            FileNotFoundError: If config file doesn't exist

        Example:
            >>> job = EvalJob.from_config(
            ...     "banking77_eval.toml",
            ...     backend_url="https://api.usesynth.ai",
            ...     api_key="sk_live_...",
            ...     seeds=[0, 1, 2],  # Override seeds
            ... )
        """
        import tomllib

        config_path_obj = Path(config_path)
        if not config_path_obj.exists():
            raise FileNotFoundError(f"Config file not found: {config_path}")

        # tomllib requires binary mode.
        with open(config_path_obj, "rb") as f:
            toml_data = tomllib.load(f)

        # Extract eval section (supports both [eval] and [prompt_learning] formats)
        eval_config = toml_data.get("eval", {})
        if not eval_config:
            pl_config = toml_data.get("prompt_learning", {})
            if pl_config:
                # Map the prompt-learning layout onto the flat eval layout.
                eval_config = {
                    "app_id": pl_config.get("task_app_id"),
                    "url": pl_config.get("task_app_url"),
                    "env_name": pl_config.get("gepa", {}).get("env_name"),
                    "seeds": pl_config.get("gepa", {}).get("evaluation", {}).get("seeds", []),
                    "policy_config": pl_config.get("gepa", {}).get("policy", {}),
                }

        # Resolve backend URL: explicit arg > env vars > production default.
        if not backend_url:
            backend_url = os.environ.get("SYNTH_BASE_URL") or os.environ.get("BACKEND_BASE_URL")
        if not backend_url:
            backend_url = "https://api.usesynth.ai"

        # Resolve API key: explicit arg > env var; required.
        if not api_key:
            api_key = os.environ.get("SYNTH_API_KEY")
        if not api_key:
            raise ValueError("api_key is required (provide explicitly or set SYNTH_API_KEY env var)")

        # Build config with overrides; "url" and "task_app_url" keys both accepted.
        final_task_app_url = task_app_url or eval_config.get("url") or eval_config.get("task_app_url")
        if not final_task_app_url:
            raise ValueError("task_app_url is required (in config or as argument)")

        final_seeds = seeds or eval_config.get("seeds", [])
        if not final_seeds:
            raise ValueError("seeds list is required (in config or as argument)")

        config = EvalJobConfig(
            task_app_url=final_task_app_url,
            backend_url=backend_url,
            api_key=api_key,
            task_app_api_key=task_app_api_key,
            app_id=eval_config.get("app_id"),
            env_name=eval_config.get("env_name"),
            seeds=list(final_seeds),
            policy_config=eval_config.get("policy_config", {}),
            env_config=eval_config.get("env_config", {}),
            concurrency=eval_config.get("concurrency", 5),
            timeout=eval_config.get("timeout", 600.0),
        )

        return cls(config)

    @classmethod
    def from_job_id(
        cls,
        job_id: str,
        backend_url: Optional[str] = None,
        api_key: Optional[str] = None,
    ) -> EvalJob:
        """Resume an existing job by ID.

        Use this to check status or get results of a previously submitted job.
        The returned instance carries a placeholder config (task_app_url and
        seeds are dummies), so it supports status/results/traces calls but
        must not be submit()-ed again.

        Args:
            job_id: Existing job ID (e.g., "eval-abc123")
            backend_url: Backend API URL (defaults to env or production)
            api_key: API key (defaults to SYNTH_API_KEY env var)

        Returns:
            EvalJob instance for the existing job

        Example:
            >>> job = EvalJob.from_job_id("eval-abc123")
            >>> status = job.get_status()
            >>> if status["status"] == "completed":
            ...     results = job.get_results()
        """
        # Resolve backend URL: explicit arg > env vars > production default.
        if not backend_url:
            backend_url = os.environ.get("SYNTH_BASE_URL") or os.environ.get("BACKEND_BASE_URL")
        if not backend_url:
            backend_url = "https://api.usesynth.ai"

        # Resolve API key: explicit arg > env var; required.
        if not api_key:
            api_key = os.environ.get("SYNTH_API_KEY")
        if not api_key:
            raise ValueError("api_key is required (provide explicitly or set SYNTH_API_KEY env var)")

        # Create minimal config for resumed job
        config = EvalJobConfig(
            task_app_url="resumed",  # Placeholder - not needed for status/results
            backend_url=backend_url,
            api_key=api_key,
            seeds=[0],  # Placeholder
        )

        return cls(config, job_id=job_id)

    def _base_url(self) -> str:
        """Get normalized base URL for API calls (always ends in ``/api``)."""
        base = self.config.backend_url.rstrip("/")
        if not base.endswith("/api"):
            base = f"{base}/api"
        return base

    def _headers(self) -> Dict[str, str]:
        """Get bearer-auth JSON headers for backend API calls."""
        return {
            "Authorization": f"Bearer {self.config.api_key}",
            "Content-Type": "application/json",
        }

    def submit(self) -> str:
        """Submit the job to the backend.

        Creates an eval job on the backend which will:
        1. Route LLM calls through the inference interceptor
        2. Capture traces and token usage
        3. Calculate costs based on model pricing

        Returns:
            Job ID (e.g., "eval-abc123")

        Raises:
            RuntimeError: If job submission fails or job already submitted
            ValueError: If configuration is invalid

        Example:
            >>> job = EvalJob.from_config("eval.toml")
            >>> job_id = job.submit()
            >>> print(f"Submitted: {job_id}")
        """
        ctx: Dict[str, Any] = {"task_app_url": self.config.task_app_url}
        log_info("EvalJob.submit invoked", ctx=ctx)

        # Guard against double-submission (including resumed jobs).
        if self._job_id:
            raise RuntimeError(f"Job already submitted: {self._job_id}")

        # Build job request payload
        policy = dict(self.config.policy_config)

        job_request = {
            "task_app_url": self.config.task_app_url,
            "task_app_api_key": self.config.task_app_api_key,
            "app_id": self.config.app_id,
            "env_name": self.config.env_name,
            "seeds": self.config.seeds,
            "policy": policy,
            "env_config": self.config.env_config,
            "max_concurrent": self.config.concurrency,
            "timeout": self.config.timeout,
        }

        # Submit synchronously using httpx
        url = f"{self._base_url()}/eval/jobs"

        with httpx.Client(timeout=httpx.Timeout(30.0)) as client:
            resp = client.post(url, json=job_request, headers=self._headers())

        if resp.status_code not in (200, 201):
            # Body is truncated to keep the exception message bounded.
            raise RuntimeError(
                f"Job submission failed with status {resp.status_code}: {resp.text[:500]}"
            )

        job_data = resp.json()
        job_id = job_data.get("job_id")
        if not job_id:
            raise RuntimeError(f"No job_id in response: {job_data}")

        self._job_id = job_id
        ctx["job_id"] = job_id
        log_info("EvalJob.submit completed", ctx=ctx)
        return job_id

    @property
    def job_id(self) -> Optional[str]:
        """Get the job ID (None if not yet submitted)."""
        return self._job_id

    def get_status(self) -> Dict[str, Any]:
        """Get current job status.

        Returns:
            Job status dictionary with keys:
            - job_id: Job identifier
            - status: "running", "completed", or "failed"
            - error: Error message if failed
            - created_at, started_at, completed_at: Timestamps
            - config: Original job configuration
            - results: Summary results if completed
            (keys as documented by the backend API — not validated here)

        Raises:
            RuntimeError: If job hasn't been submitted yet, or the backend
                returns a non-200 response

        Example:
            >>> status = job.get_status()
            >>> print(f"Status: {status['status']}")
            >>> if status["status"] == "completed":
            ...     print(f"Mean score: {status['results']['mean_score']}")
        """
        if not self._job_id:
            raise RuntimeError("Job not yet submitted. Call submit() first.")

        url = f"{self._base_url()}/eval/jobs/{self._job_id}"

        with httpx.Client(timeout=httpx.Timeout(30.0)) as client:
            resp = client.get(url, headers=self._headers())

        if resp.status_code != 200:
            raise RuntimeError(f"Failed to get status: {resp.status_code} {resp.text}")

        return resp.json()

    def poll_until_complete(
        self,
        *,
        timeout: float = 1200.0,
        interval: float = 2.0,
        progress: bool = False,
        on_status: Optional[Callable[[Dict[str, Any]], None]] = None,
    ) -> EvalResult:
        """Poll job until it reaches a terminal state, then return results.

        Polls the backend until the job completes or fails, then fetches
        and returns the detailed results. Transient polling errors (including
        non-200 responses from get_status) are logged and retried until the
        timeout elapses.

        Args:
            timeout: Maximum seconds to wait (default: 1200 = 20 minutes)
            interval: Seconds between poll attempts (default: 2)
            progress: If True, print status updates during polling (useful for notebooks)
            on_status: Optional callback called on each status update (for custom progress handling)

        Returns:
            EvalResult with typed status, mean_score, seed_results, etc.
            If the timeout elapses first, an EvalResult built from the last
            status snapshot is returned (its status may be non-terminal) —
            no TimeoutError is raised.

        Raises:
            RuntimeError: If job hasn't been submitted yet

        Example:
            >>> result = job.poll_until_complete(progress=True)
            [00:05] running | 3/10 completed
            [00:10] running | 7/10 completed
            [00:15] completed | mean_score: 0.85
            >>> result.succeeded
            True
            >>> result.mean_score
            0.85
        """
        if not self._job_id:
            raise RuntimeError("Job not yet submitted. Call submit() first.")

        job_id = self._job_id
        start_time = time.time()
        # Last successful status payload; used for the timeout return path.
        last_data: Dict[str, Any] = {}

        while True:
            elapsed = time.time() - start_time
            if elapsed >= timeout:
                if progress:
                    print(f"[poll] timeout after {timeout:.0f}s")
                # Return with whatever data we have
                return EvalResult.from_response(job_id, last_data)

            try:
                status_data = self.get_status()
                last_data = status_data

                status = EvalStatus.from_string(status_data.get("status", "pending"))

                # Extract progress info
                results_info = status_data.get("results", {})
                completed = results_info.get("completed", 0) if isinstance(results_info, dict) else 0
                total = results_info.get("total", len(self.config.seeds)) if isinstance(results_info, dict) else len(self.config.seeds)

                # Progress output
                if progress:
                    mins, secs = divmod(int(elapsed), 60)
                    if status.is_terminal:
                        # Get final results for mean_score
                        try:
                            final_results = self.get_results()
                            mean_score = final_results.get("summary", {}).get("mean_score")
                            score_str = f"mean_score: {mean_score:.2f}" if mean_score is not None else ""
                            print(f"[{mins:02d}:{secs:02d}] {status.value} | {score_str}")
                            # Use final results for the return value
                            last_data = final_results
                        except Exception:
                            # Results endpoint may not be ready; fall back to status line.
                            print(f"[{mins:02d}:{secs:02d}] {status.value}")
                    else:
                        print(f"[{mins:02d}:{secs:02d}] {status.value} | {completed}/{total} completed")

                # Callback for custom handling
                if on_status:
                    on_status(status_data)

                # Check terminal state
                if status.is_terminal:
                    # Fetch full results if completed
                    if status == EvalStatus.COMPLETED:
                        try:
                            final_results = self.get_results()
                            return EvalResult.from_response(job_id, final_results)
                        except Exception:
                            # Best effort: fall back to the status snapshot below.
                            pass
                    return EvalResult.from_response(job_id, last_data)

            except Exception as exc:
                # Broad by design: any polling hiccup is retried until timeout.
                if progress:
                    print(f"[poll] error: {exc}")
                log_info("poll request failed", ctx={"error": str(exc), "job_id": job_id})

            time.sleep(interval)

    def get_results(self) -> Dict[str, Any]:
        """Get detailed job results.

        Fetches the full results including per-seed scores, tokens, and costs.

        Returns:
            Results dictionary with:
            - job_id: Job identifier
            - status: Job status
            - summary: Aggregate metrics
                - mean_score: Average score across seeds
                - total_tokens: Total token usage
                - total_cost_usd: Total cost
                - num_seeds: Number of seeds evaluated
                - num_successful: Seeds that completed
                - num_failed: Seeds that failed
            - results: List of per-seed results
                - seed: Seed number
                - score: Evaluation score
                - tokens: Token count
                - cost_usd: Cost for this seed
                - latency_ms: Execution time
                - error: Error message if failed
            (keys as documented by the backend API — not validated here)

        Raises:
            RuntimeError: If job hasn't been submitted yet, or the backend
                returns a non-200 response

        Example:
            >>> results = job.get_results()
            >>> for r in results["results"]:
            ...     print(f"Seed {r['seed']}: score={r['score']}, tokens={r['tokens']}")
        """
        if not self._job_id:
            raise RuntimeError("Job not yet submitted. Call submit() first.")

        url = f"{self._base_url()}/eval/jobs/{self._job_id}/results"

        with httpx.Client(timeout=httpx.Timeout(30.0)) as client:
            resp = client.get(url, headers=self._headers())

        if resp.status_code != 200:
            raise RuntimeError(f"Failed to get results: {resp.status_code} {resp.text}")

        return resp.json()

    def download_traces(self, output_dir: str | Path) -> Path:
        """Download traces for the job to a directory.

        Downloads the traces ZIP file from the backend and extracts
        it to the specified directory (created if missing).

        Args:
            output_dir: Directory to extract traces to

        Returns:
            Path to the output directory

        Raises:
            RuntimeError: If job hasn't been submitted or download fails

        Example:
            >>> traces_dir = job.download_traces("./traces")
            >>> for trace_file in traces_dir.glob("*.json"):
            ...     print(f"Trace: {trace_file}")
        """
        import io
        import zipfile

        if not self._job_id:
            raise RuntimeError("Job not yet submitted. Call submit() first.")

        url = f"{self._base_url()}/eval/jobs/{self._job_id}/traces"
        output_path = Path(output_dir)

        # Longer timeout than status calls: the ZIP payload can be large.
        with httpx.Client(timeout=httpx.Timeout(60.0)) as client:
            resp = client.get(url, headers=self._headers())

        if resp.status_code != 200:
            raise RuntimeError(f"Failed to download traces: {resp.status_code} {resp.text}")

        output_path.mkdir(parents=True, exist_ok=True)

        # Extract in-memory: the ZIP bytes never touch disk.
        with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
            zf.extractall(output_path)

        return output_path
730
+
731
+
732
# Explicit public API of this module (consumed by ``import *`` and docs tooling).
__all__ = ["EvalJob", "EvalJobConfig", "EvalResult", "EvalStatus"]