synth-ai 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (169)
  1. examples/baseline/banking77_baseline.py +204 -0
  2. examples/baseline/crafter_baseline.py +407 -0
  3. examples/baseline/pokemon_red_baseline.py +326 -0
  4. examples/baseline/simple_baseline.py +56 -0
  5. examples/baseline/warming_up_to_rl_baseline.py +239 -0
  6. examples/blog_posts/gepa/README.md +355 -0
  7. examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
  8. examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
  9. examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
  10. examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
  11. examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
  12. examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
  13. examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
  14. examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
  15. examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
  16. examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
  17. examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
  18. examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
  19. examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
  20. examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
  21. examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
  22. examples/blog_posts/gepa/gepa_baseline.py +204 -0
  23. examples/blog_posts/gepa/query_prompts_example.py +97 -0
  24. examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
  25. examples/blog_posts/gepa/task_apps.py +105 -0
  26. examples/blog_posts/gepa/test_gepa_local.sh +67 -0
  27. examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
  28. examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
  29. examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +12 -10
  30. examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +1 -0
  31. examples/blog_posts/pokemon_vl/extract_images.py +239 -0
  32. examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
  33. examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
  34. examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
  35. examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
  36. examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
  37. examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
  38. examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
  39. examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
  40. examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
  41. examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
  42. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
  43. examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +1 -1
  44. examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
  45. examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +60 -10
  46. examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +1 -1
  47. examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
  48. examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
  49. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
  50. examples/multi_step/configs/crafter_rl_outcome.toml +1 -0
  51. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +1 -0
  52. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +1 -0
  53. examples/rl/configs/rl_from_base_qwen17.toml +1 -0
  54. examples/swe/task_app/hosted/inference/openai_client.py +0 -34
  55. examples/swe/task_app/hosted/policy_routes.py +17 -0
  56. examples/swe/task_app/hosted/rollout.py +4 -2
  57. examples/task_apps/banking77/__init__.py +6 -0
  58. examples/task_apps/banking77/banking77_task_app.py +841 -0
  59. examples/task_apps/banking77/deploy_wrapper.py +46 -0
  60. examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
  61. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
  62. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
  63. examples/task_apps/crafter/task_app/grpo_crafter.py +24 -2
  64. examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
  65. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +355 -58
  66. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +68 -7
  67. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +78 -21
  68. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
  69. examples/task_apps/gepa_benchmarks/__init__.py +7 -0
  70. examples/task_apps/gepa_benchmarks/common.py +260 -0
  71. examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
  72. examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
  73. examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
  74. examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
  75. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
  76. examples/task_apps/pokemon_red/task_app.py +254 -36
  77. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +1 -0
  78. examples/warming_up_to_rl/task_app/grpo_crafter.py +53 -4
  79. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
  80. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +152 -41
  81. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +31 -1
  82. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
  83. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
  84. examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +1 -0
  85. synth_ai/api/train/builders.py +90 -1
  86. synth_ai/api/train/cli.py +396 -21
  87. synth_ai/api/train/config_finder.py +13 -2
  88. synth_ai/api/train/configs/__init__.py +15 -1
  89. synth_ai/api/train/configs/prompt_learning.py +442 -0
  90. synth_ai/api/train/configs/rl.py +29 -0
  91. synth_ai/api/train/task_app.py +1 -1
  92. synth_ai/api/train/validators.py +277 -0
  93. synth_ai/baseline/__init__.py +25 -0
  94. synth_ai/baseline/config.py +209 -0
  95. synth_ai/baseline/discovery.py +214 -0
  96. synth_ai/baseline/execution.py +146 -0
  97. synth_ai/cli/__init__.py +85 -17
  98. synth_ai/cli/__main__.py +0 -0
  99. synth_ai/cli/claude.py +70 -0
  100. synth_ai/cli/codex.py +84 -0
  101. synth_ai/cli/commands/__init__.py +1 -0
  102. synth_ai/cli/commands/baseline/__init__.py +12 -0
  103. synth_ai/cli/commands/baseline/core.py +637 -0
  104. synth_ai/cli/commands/baseline/list.py +93 -0
  105. synth_ai/cli/commands/eval/core.py +13 -10
  106. synth_ai/cli/commands/filter/core.py +53 -17
  107. synth_ai/cli/commands/help/core.py +0 -1
  108. synth_ai/cli/commands/smoke/__init__.py +7 -0
  109. synth_ai/cli/commands/smoke/core.py +1436 -0
  110. synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
  111. synth_ai/cli/commands/status/subcommands/usage.py +203 -0
  112. synth_ai/cli/commands/train/judge_schemas.py +1 -0
  113. synth_ai/cli/commands/train/judge_validation.py +1 -0
  114. synth_ai/cli/commands/train/validation.py +0 -57
  115. synth_ai/cli/demo.py +35 -3
  116. synth_ai/cli/deploy/__init__.py +40 -25
  117. synth_ai/cli/deploy.py +162 -0
  118. synth_ai/cli/legacy_root_backup.py +14 -8
  119. synth_ai/cli/opencode.py +107 -0
  120. synth_ai/cli/root.py +9 -5
  121. synth_ai/cli/task_app_deploy.py +1 -1
  122. synth_ai/cli/task_apps.py +53 -53
  123. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
  124. synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
  125. synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
  126. synth_ai/judge_schemas.py +1 -0
  127. synth_ai/learning/__init__.py +10 -0
  128. synth_ai/learning/prompt_learning_client.py +276 -0
  129. synth_ai/learning/prompt_learning_types.py +184 -0
  130. synth_ai/pricing/__init__.py +2 -0
  131. synth_ai/pricing/model_pricing.py +57 -0
  132. synth_ai/streaming/handlers.py +53 -4
  133. synth_ai/streaming/streamer.py +19 -0
  134. synth_ai/task/apps/__init__.py +1 -0
  135. synth_ai/task/config.py +2 -0
  136. synth_ai/task/tracing_utils.py +25 -25
  137. synth_ai/task/validators.py +44 -8
  138. synth_ai/task_app_cfgs.py +21 -0
  139. synth_ai/tracing_v3/config.py +162 -19
  140. synth_ai/tracing_v3/constants.py +1 -1
  141. synth_ai/tracing_v3/db_config.py +24 -38
  142. synth_ai/tracing_v3/storage/config.py +47 -13
  143. synth_ai/tracing_v3/storage/factory.py +3 -3
  144. synth_ai/tracing_v3/turso/daemon.py +113 -11
  145. synth_ai/tracing_v3/turso/native_manager.py +92 -16
  146. synth_ai/types.py +8 -0
  147. synth_ai/urls.py +11 -0
  148. synth_ai/utils/__init__.py +30 -1
  149. synth_ai/utils/agents.py +74 -0
  150. synth_ai/utils/bin.py +39 -0
  151. synth_ai/utils/cli.py +149 -5
  152. synth_ai/utils/env.py +17 -17
  153. synth_ai/utils/json.py +72 -0
  154. synth_ai/utils/modal.py +283 -1
  155. synth_ai/utils/paths.py +48 -0
  156. synth_ai/utils/uvicorn.py +113 -0
  157. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/METADATA +102 -4
  158. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/RECORD +162 -88
  159. synth_ai/cli/commands/deploy/__init__.py +0 -23
  160. synth_ai/cli/commands/deploy/core.py +0 -614
  161. synth_ai/cli/commands/deploy/errors.py +0 -72
  162. synth_ai/cli/commands/deploy/validation.py +0 -11
  163. synth_ai/cli/deploy/core.py +0 -5
  164. synth_ai/cli/deploy/errors.py +0 -23
  165. synth_ai/cli/deploy/validation.py +0 -5
  166. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
  167. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
  168. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
  169. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,276 @@
1
+ """Client utilities for querying prompt learning job results."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from .._utils.http import AsyncHttpClient
8
+ from .prompt_learning_types import PromptResults
9
+
10
+
11
+ def _validate_job_id(job_id: str) -> None:
12
+ """Validate that job_id has the expected prompt learning format.
13
+
14
+ Args:
15
+ job_id: Job ID to validate
16
+
17
+ Raises:
18
+ ValueError: If job_id doesn't start with 'pl_'
19
+ """
20
+ if not job_id.startswith("pl_"):
21
+ raise ValueError(
22
+ f"Invalid prompt learning job ID format: {job_id!r}. "
23
+ f"Expected format: 'pl_<identifier>' (e.g., 'pl_9c58b711c2644083')"
24
+ )
25
+
26
+
27
class PromptLearningClient:
    """Client for interacting with prompt learning jobs and retrieving results.

    The client only stores connection settings; every request opens its own
    ``AsyncHttpClient`` as an async context manager.
    """

    # Bin edges for the train-accuracy histogram built by get_scoring_summary.
    _SCORE_BIN_EDGES = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)

    def __init__(self, base_url: str, api_key: str, *, timeout: float = 30.0) -> None:
        """Initialize the prompt learning client.

        Args:
            base_url: Base URL of the backend API (e.g., "http://localhost:8000").
                Trailing slashes are stripped.
            api_key: API key for authentication
            timeout: Request timeout in seconds
        """
        self._base_url = base_url.rstrip("/")
        self._api_key = api_key
        self._timeout = timeout

    async def get_job(self, job_id: str) -> Dict[str, Any]:
        """Get job metadata and status.

        Args:
            job_id: Job ID (e.g., "pl_9c58b711c2644083")

        Returns:
            Job metadata including status, best_score, created_at, etc.

        Raises:
            ValueError: If job_id format is invalid
        """
        _validate_job_id(job_id)
        async with AsyncHttpClient(self._base_url, self._api_key, timeout=self._timeout) as http:
            return await http.get(f"/api/prompt-learning/online/jobs/{job_id}")

    async def get_events(
        self, job_id: str, *, since_seq: int = 0, limit: int = 5000
    ) -> List[Dict[str, Any]]:
        """Get events for a prompt learning job.

        Args:
            job_id: Job ID
            since_seq: Return events after this sequence number
            limit: Maximum number of events to return

        Returns:
            List of event dictionaries with type, message, data, etc.

        Raises:
            ValueError: If job_id format is invalid or response structure is unexpected
        """
        _validate_job_id(job_id)
        params = {"since_seq": since_seq, "limit": limit}
        async with AsyncHttpClient(self._base_url, self._api_key, timeout=self._timeout) as http:
            js = await http.get(
                f"/api/prompt-learning/online/jobs/{job_id}/events",
                params=params,
            )
        if isinstance(js, dict) and isinstance(js.get("events"), list):
            return js["events"]
        # Unexpected response structure - raise instead of silently returning empty list
        raise ValueError(
            f"Unexpected response structure from events endpoint. "
            f"Expected dict with 'events' list, got: {type(js).__name__}"
        )

    async def get_prompts(self, job_id: str) -> PromptResults:
        """Get the best prompts and scoring metadata from a completed job.

        Results are reconstructed from the job's event stream. Events that are
        not dictionaries are skipped, and an event whose ``data`` is missing,
        null, or not a dictionary is treated as having empty data.

        Args:
            job_id: Job ID

        Returns:
            PromptResults dataclass containing:
                - best_prompt: The top-performing prompt with sections and metadata
                - best_score: The best accuracy score achieved
                - top_prompts: List of top-K prompts with train/val scores
                - optimized_candidates: All frontier/Pareto-optimal candidates
                - attempted_candidates: All candidates tried during optimization

        Raises:
            ValueError: If job_id format is invalid
        """
        _validate_job_id(job_id)
        # NOTE: a single request for up to 10000 events; there is no pagination
        # here, so events beyond that limit are not inspected.
        events = await self.get_events(job_id, limit=10000)

        result = PromptResults()

        # Extract results from events
        for event in events:
            if not isinstance(event, dict):
                continue
            event_type = event.get("type", "")
            event_data = event.get("data")
            if not isinstance(event_data, dict):
                # A JSON null (or any non-object) payload would otherwise crash
                # on the .get() calls below.
                event_data = {}

            # Best prompt event
            if event_type == "prompt.learning.best.prompt":
                result.best_prompt = event_data.get("best_prompt")
                result.best_score = event_data.get("best_score")

            # Top-K prompt content events
            elif event_type == "prompt.learning.top.prompt.content":
                result.top_prompts.append({
                    "rank": event_data.get("rank"),
                    "train_accuracy": event_data.get("train_accuracy"),
                    "val_accuracy": event_data.get("val_accuracy"),
                    "template": event_data.get("template"),
                    "full_text": event_data.get("full_text"),
                })

            # Final results event (contains all candidates)
            elif event_type == "prompt.learning.final.results":
                # "or []" keeps the result fields as lists even when the
                # backend sends an explicit null.
                result.optimized_candidates = event_data.get("optimized_candidates") or []
                result.attempted_candidates = event_data.get("attempted_candidates") or []

            # Validation results
            elif event_type == "prompt.learning.validation.scored":
                result.validation_results.append(event_data)

            # Completion event (fallback for best_score)
            elif event_type == "prompt.learning.gepa.complete":
                if result.best_score is None:
                    result.best_score = event_data.get("best_score")

        return result

    async def get_prompt_text(self, job_id: str, rank: int = 1) -> Optional[str]:
        """Get the full text of a specific prompt by rank.

        Args:
            job_id: Job ID
            rank: Prompt rank (1 = best, 2 = second best, etc.)

        Returns:
            Full prompt text or None if not found

        Raises:
            ValueError: If job_id format is invalid or rank < 1
        """
        _validate_job_id(job_id)
        if rank < 1:
            raise ValueError(f"Rank must be >= 1, got: {rank}")
        prompts_data = await self.get_prompts(job_id)

        for prompt_info in prompts_data.top_prompts:
            if prompt_info.get("rank") == rank:
                return prompt_info.get("full_text")

        return None

    async def get_scoring_summary(self, job_id: str) -> Dict[str, Any]:
        """Get a summary of scoring metrics for all candidates.

        Only numeric ``accuracy`` values contribute to the statistics; records
        without one (or with a null / non-numeric value) are ignored.

        Args:
            job_id: Job ID

        Returns:
            Dictionary with scoring statistics:
                - best_train_accuracy: Best training accuracy (None if no scores)
                - best_val_accuracy: Best validation accuracy (if available)
                - num_candidates_tried: Total candidates evaluated
                - num_frontier_candidates: Number in Pareto frontier
                - score_distribution: Histogram of train accuracy scores
                - mean_train_accuracy: Mean training accuracy (None if no scores)

        Raises:
            ValueError: If job_id format is invalid
        """
        _validate_job_id(job_id)
        prompts_data = await self.get_prompts(job_id)

        attempted = prompts_data.attempted_candidates or []
        optimized = prompts_data.optimized_candidates or []

        train_accuracies = self._numeric_accuracies(attempted)
        val_accuracies = self._numeric_accuracies(prompts_data.validation_results)

        return {
            "best_train_accuracy": max(train_accuracies) if train_accuracies else None,
            "best_val_accuracy": max(val_accuracies) if val_accuracies else None,
            "num_candidates_tried": len(attempted),
            "num_frontier_candidates": len(optimized),
            "score_distribution": self._score_distribution(train_accuracies),
            "mean_train_accuracy": (
                sum(train_accuracies) / len(train_accuracies) if train_accuracies else None
            ),
        }

    @staticmethod
    def _numeric_accuracies(records: Any) -> List[float]:
        """Return the numeric ``accuracy`` values found in a list of dict records."""
        values: List[float] = []
        for record in records or []:
            if not isinstance(record, dict):
                continue
            accuracy = record.get("accuracy")
            # bool is an int subclass; a True/False "accuracy" is not a score.
            if isinstance(accuracy, bool) or not isinstance(accuracy, (int, float)):
                continue
            values.append(accuracy)
        return values

    @classmethod
    def _score_distribution(cls, accuracies: List[float]) -> Dict[str, int]:
        """Count accuracies per bin; bins are half-open except the last, which includes 1.0."""
        edges = cls._SCORE_BIN_EDGES
        bounds = list(zip(edges, edges[1:]))
        labels = [f"{low:.1f}-{high:.1f}" for low, high in bounds]
        histogram = dict.fromkeys(labels, 0)
        last_index = len(bounds) - 1
        for acc in accuracies:
            for index, (low, high) in enumerate(bounds):
                if low <= acc < high or (index == last_index and acc == high):
                    histogram[labels[index]] += 1
                    break
            # Values outside [0.0, 1.0] fall in no bin and are not counted.
        return histogram
223
+
224
+
225
+ # Synchronous wrapper for convenience
226
def get_prompts(job_id: str, base_url: str, api_key: str) -> PromptResults:
    """Blocking convenience wrapper around ``PromptLearningClient.get_prompts``.

    Creates a client with the default timeout and drives the coroutine to
    completion with ``asyncio.run``.

    Args:
        job_id: Job ID (e.g., "pl_9c58b711c2644083")
        base_url: Backend API base URL
        api_key: API key for authentication

    Returns:
        PromptResults dataclass with prompt results
    """
    import asyncio

    pending = PromptLearningClient(base_url, api_key).get_prompts(job_id)
    return asyncio.run(pending)
241
+
242
+
243
def get_prompt_text(job_id: str, base_url: str, api_key: str, rank: int = 1) -> Optional[str]:
    """Blocking convenience wrapper around ``PromptLearningClient.get_prompt_text``.

    Creates a client with the default timeout and drives the coroutine to
    completion with ``asyncio.run``.

    Args:
        job_id: Job ID
        base_url: Backend API base URL
        api_key: API key for authentication
        rank: Prompt rank (1 = best, 2 = second best, etc.)

    Returns:
        Full prompt text or None if not found
    """
    import asyncio

    pending = PromptLearningClient(base_url, api_key).get_prompt_text(job_id, rank)
    return asyncio.run(pending)
259
+
260
+
261
def get_scoring_summary(job_id: str, base_url: str, api_key: str) -> Dict[str, Any]:
    """Blocking convenience wrapper around ``PromptLearningClient.get_scoring_summary``.

    Creates a client with the default timeout and drives the coroutine to
    completion with ``asyncio.run``.

    Args:
        job_id: Job ID
        base_url: Backend API base URL
        api_key: API key for authentication

    Returns:
        Dictionary with scoring statistics
    """
    import asyncio

    pending = PromptLearningClient(base_url, api_key).get_scoring_summary(job_id)
    return asyncio.run(pending)
276
+
@@ -0,0 +1,184 @@
1
+ """Type definitions for prompt learning data structures."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, Dict, List, Optional
7
+
8
+
9
@dataclass
class TextReplacement:
    """A text replacement in a prompt transformation.

    Only ``new_text`` is required; every other field has a default.
    """

    # Replacement text (the only required field).
    new_text: str
    # Message role the replacement applies to; defaults to "system".
    apply_to_role: str = "system"
    # Text being replaced, when given — presumably None means no specific
    # text is targeted; TODO confirm against the producer of these records.
    old_text: Optional[str] = None
    # Optional position — presumably an index within the prompt; TODO confirm.
    position: Optional[int] = None
17
+
18
+
19
@dataclass
class CandidateScore:
    """Scoring information for a candidate prompt.

    Only ``accuracy`` is required; the remaining metrics default to zero / empty.
    """

    # Accuracy achieved by the candidate (required).
    accuracy: float
    # Prompt length; defaults to 0 (unit — characters vs tokens — not shown here; TODO confirm).
    prompt_length: int = 0
    # Tool call rate; defaults to 0.0.
    tool_call_rate: float = 0.0
    # Per-instance scores; each instance gets its own fresh list.
    instance_scores: List[float] = field(default_factory=list)
27
+
28
+
29
@dataclass
class PromptSection:
    """A section of a prompt (e.g., system, user, assistant)."""

    # Role of the section, e.g. "system", "user" or "assistant".
    role: str
    # Text content of the section.
    content: str
35
+
36
+
37
@dataclass
class Candidate:
    """One prompt candidate produced by the optimization process, with its metrics."""

    accuracy: float
    prompt_length: int = 0
    tool_call_rate: float = 0.0
    instance_scores: List[float] = field(default_factory=list)
    object: Optional[Dict[str, Any]] = None

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> Candidate:
        """Build a Candidate from a mapping, substituting defaults for absent keys."""
        lookup = data.get
        accuracy = lookup("accuracy", 0.0)
        length = lookup("prompt_length", 0)
        call_rate = lookup("tool_call_rate", 0.0)
        per_instance = lookup("instance_scores", [])
        payload = lookup("object")
        return cls(
            accuracy=accuracy,
            prompt_length=length,
            tool_call_rate=call_rate,
            instance_scores=per_instance,
            object=payload,
        )
57
+
58
+
59
@dataclass
class OptimizedCandidate:
    """A candidate taken from the Pareto frontier of the optimization."""

    score: CandidateScore
    payload_kind: str  # "transformation" or "template"
    object: Optional[Dict[str, Any]] = None
    instance_scores: Optional[List[float]] = None

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> OptimizedCandidate:
        """Build an OptimizedCandidate from a mapping.

        A ``score`` entry that is not a dictionary is replaced by a
        zero-accuracy ``CandidateScore``.
        """
        raw_score = data.get("score", {})
        if not isinstance(raw_score, dict):
            parsed_score = CandidateScore(accuracy=0.0)
        else:
            parsed_score = CandidateScore(
                accuracy=raw_score.get("accuracy", 0.0),
                prompt_length=raw_score.get("prompt_length", 0),
                tool_call_rate=raw_score.get("tool_call_rate", 0.0),
                instance_scores=raw_score.get("instance_scores", []),
            )

        kind = data.get("payload_kind", "unknown")
        return cls(
            score=parsed_score,
            payload_kind=kind,
            object=data.get("object"),
            instance_scores=data.get("instance_scores"),
        )
88
+
89
+
90
@dataclass
class PromptLearningEvent:
    """Generic envelope for a single prompt learning event."""

    type: str
    message: str
    data: Dict[str, Any]
    seq: int
    created_at: Optional[str] = None

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> PromptLearningEvent:
        """Build an event from a mapping, substituting defaults for absent keys."""
        event_type = data.get("type", "")
        text = data.get("message", "")
        payload = data.get("data", {})
        sequence = data.get("seq", 0)
        timestamp = data.get("created_at")
        return cls(
            type=event_type,
            message=text,
            data=payload,
            seq=sequence,
            created_at=timestamp,
        )
110
+
111
+
112
@dataclass
class BestPromptEventData:
    """Payload of a ``prompt.learning.best.prompt`` event."""

    best_score: float
    best_prompt: Dict[str, Any]

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> BestPromptEventData:
        """Build the payload from a mapping; missing keys become 0.0 / an empty dict."""
        score = data.get("best_score", 0.0)
        prompt = data.get("best_prompt", {})
        return cls(best_score=score, best_prompt=prompt)
126
+
127
+
128
@dataclass
class FinalResultsEventData:
    """Payload of a ``prompt.learning.final.results`` event."""

    attempted_candidates: List[Dict[str, Any]]
    optimized_candidates: List[Dict[str, Any]]

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> FinalResultsEventData:
        """Build the payload from a mapping; missing candidate lists become empty lists."""
        attempted = data.get("attempted_candidates", [])
        optimized = data.get("optimized_candidates", [])
        return cls(attempted_candidates=attempted, optimized_candidates=optimized)
142
+
143
+
144
@dataclass
class ValidationScoredEventData:
    """Payload of a ``prompt.learning.validation.scored`` event."""

    accuracy: float
    instance_scores: List[float] = field(default_factory=list)
    is_baseline: bool = False

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> ValidationScoredEventData:
        """Build the payload from a mapping, substituting defaults for absent keys."""
        lookup = data.get
        return cls(
            lookup("accuracy", 0.0),
            instance_scores=lookup("instance_scores", []),
            is_baseline=lookup("is_baseline", False),
        )
160
+
161
+
162
@dataclass
class PromptResults:
    """Aggregated results of a completed prompt learning job."""

    best_prompt: Optional[Dict[str, Any]] = None
    best_score: Optional[float] = None
    top_prompts: List[Dict[str, Any]] = field(default_factory=list)
    optimized_candidates: List[Dict[str, Any]] = field(default_factory=list)
    attempted_candidates: List[Dict[str, Any]] = field(default_factory=list)
    validation_results: List[Dict[str, Any]] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> PromptResults:
        """Build PromptResults from a mapping; absent list fields become empty lists."""
        list_fields = (
            "top_prompts",
            "optimized_candidates",
            "attempted_candidates",
            "validation_results",
        )
        # Each lookup gets its own fresh [] default, so instances never share lists.
        collections = {name: data.get(name, []) for name in list_fields}
        return cls(
            best_prompt=data.get("best_prompt"),
            best_score=data.get("best_score"),
            **collections,
        )
184
+
@@ -0,0 +1,2 @@
1
+ """Pricing module for SDK."""
2
+
@@ -0,0 +1,57 @@
1
+ """Static pricing table for supported models.
2
+
3
+ This module provides per-token pricing used by the SDK status commands.
4
+ Rates are expressed in USD per token and split into input/output prices.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from dataclasses import dataclass
9
+ from typing import Dict
10
+
11
+
12
@dataclass(frozen=True)
class TokenRates:
    """Immutable pair of per-token prices, in USD, for one model."""

    # Price in USD for a single input token.
    input_usd: float
    # Price in USD for a single output token.
    output_usd: float
16
+
17
+
18
# Default per-token prices (USD), sourced Nov 3, 2025 — update as contracts change
# Layout: provider name -> model identifier -> TokenRates. Each trailing comment
# restates the same price per 1M tokens (per-token value x 1e6).
# NOTE(review): model identifiers are lookup keys; confirm they match the ids
# the providers actually report before relying on a lookup hit.
MODEL_PRICES: Dict[str, Dict[str, TokenRates]] = {
    # OpenAI official pricing
    "openai": {
        # GPT-5 family
        "gpt-5": TokenRates(input_usd=0.00000125, output_usd=0.00001000),  # $1.25 / $10 per 1M
        "gpt-5-mini": TokenRates(input_usd=0.00000025, output_usd=0.00000200),  # $0.25 / $2.00 per 1M
        "gpt-5-nano": TokenRates(input_usd=0.00000005, output_usd=0.00000040),  # $0.05 / $0.40 per 1M

        "gpt-4o-mini": TokenRates(input_usd=0.00000015, output_usd=0.00000060),  # $0.15 / $0.60 per 1M
        "gpt-4o": TokenRates(input_usd=0.00000250, output_usd=0.00001000),  # $2.50 / $10.00 per 1M
    },
    # Groq OSS via OpenAI-compatible path (latest Groq docs)
    "groq": {
        "openai/gpt-oss-20b": TokenRates(input_usd=0.000000075, output_usd=0.000000300),  # $0.075 / $0.30 per 1M

        "openai/gpt-oss-120b": TokenRates(input_usd=0.000000150, output_usd=0.000000600),  # $0.15 / $0.60 per 1M

        # Additional Groq on-demand models
        "moonshotai/kimi-k2-0905": TokenRates(input_usd=0.000001000, output_usd=0.000003000),  # $1.00 / $3.00 per 1M

        "meta/llama-guard-4-12b": TokenRates(input_usd=0.000000200, output_usd=0.000000200),  # $0.20 / $0.20 per 1M
        "qwen/qwen3-32b": TokenRates(input_usd=0.000000290, output_usd=0.000000590),  # $0.29 / $0.59 per 1M
        "meta/llama-3.3-70b-versatile": TokenRates(input_usd=0.000000590, output_usd=0.000000790),  # $0.59 / $0.79 per 1M
        "meta/llama-3.1-8b-instant": TokenRates(input_usd=0.000000050, output_usd=0.000000080),  # $0.05 / $0.08 per 1M
    },
    # Google Gemini pricing — per-token USD (per 1M ÷ 1e6), Nov 3, 2025
    "google": {
        # Gemini 2.5 Pro (two tiers by prompt size)
        # The ">200k" tier is stored under its own "-gt200k" key; callers must
        # pick the key themselves — nothing in this table switches tiers.
        "gemini-2.5-pro": TokenRates(input_usd=0.00000125, output_usd=0.00001000),  # <=200k tokens
        "gemini-2.5-pro-gt200k": TokenRates(input_usd=0.00000250, output_usd=0.00001500),  # >200k tokens

        # Gemini 2.5 Flash (hybrid reasoning)
        "gemini-2.5-flash": TokenRates(input_usd=0.00000030, output_usd=0.00000250),  # $0.30 / $2.50 per 1M

        # Gemini 2.5 Flash-Lite (cheapest)
        "gemini-2.5-flash-lite": TokenRates(input_usd=0.00000010, output_usd=0.00000040),  # $0.10 / $0.40 per 1M
    },
}
57
+
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import contextlib
4
4
  import json
5
+ import re
5
6
  import time
6
7
  from abc import ABC, abstractmethod
7
8
  from collections import deque
@@ -14,6 +15,37 @@ import click
14
15
  from .types import StreamMessage, StreamType
15
16
 
16
17
 
18
+ def _mask_sensitive_urls(text: str) -> str:
19
+ """Mask S3/Wasabi URLs and sensitive paths in log messages.
20
+
21
+ Replaces full S3/Wasabi URLs with masked versions to prevent leaking
22
+ bucket names, paths, and infrastructure details in public SDK logs.
23
+
24
+ Examples:
25
+ s3://synth-artifacts/models/... -> s3://***/***/[masked]
26
+ Wasabi s3://bucket/path/file.tar.gz -> Wasabi s3://***/***/[masked]
27
+ """
28
+ if not text:
29
+ return text
30
+
31
+ # Pattern matches:
32
+ # - Optional "Wasabi " prefix
33
+ # - s3:// or http(s):// scheme
34
+ # - Any bucket/host
35
+ # - Any path
36
+ # - Common model file extensions
37
+ pattern = r'(Wasabi\s+)?((s3|https?)://[^\s]+\.(tar\.gz|zip|pt|pth|safetensors|ckpt|bin))'
38
+
39
+ def replace_url(match: re.Match) -> str:
40
+ prefix = match.group(1) or "" # "Wasabi " or empty
41
+ url = match.group(2)
42
+ # Extract just the filename
43
+ filename = url.split("/")[-1] if "/" in url else "file"
44
+ return f'{prefix}s3://***/***/[{filename}]'
45
+
46
+ return re.sub(pattern, replace_url, text, flags=re.IGNORECASE)
47
+
48
+
17
49
  class StreamHandler(ABC):
18
50
  """Base class for log handlers that consume ``StreamMessage`` objects."""
19
51
 
@@ -72,14 +104,29 @@ class CLIHandler(StreamHandler):
72
104
  prefix = f"[{timestamp}] [{message.seq}] {event_type}"
73
105
  if level:
74
106
  prefix += f" ({level})"
75
- click.echo(f"{prefix}: {msg}".rstrip(": "))
107
+ # Mask sensitive URLs before displaying
108
+ sanitized_msg = _mask_sensitive_urls(msg)
109
+ click.echo(f"{prefix}: {sanitized_msg}".rstrip(": "))
76
110
  return
77
111
 
78
112
  if message.stream_type is StreamType.METRICS:
79
- name = message.data.get("name", "metric")
113
+ name = message.data.get("name")
80
114
  value = message.data.get("value")
81
115
  step = message.data.get("step")
82
- click.echo(f"[{timestamp}] {name}={value} (step={step})")
116
+ data = message.data.get("data", {})
117
+
118
+ # Format metric display
119
+ metric_str = f"[{timestamp}] [metric] {name}={value:.4f}" if isinstance(value, (int, float)) else f"[{timestamp}] [metric] {name}={value}"
120
+ if step is not None:
121
+ metric_str += f" (step={step})"
122
+
123
+ # Add any additional context from data field
124
+ if isinstance(data, dict):
125
+ n = data.get("n")
126
+ if n is not None:
127
+ metric_str += f" n={n}"
128
+
129
+ click.echo(metric_str)
83
130
  return
84
131
 
85
132
  if message.stream_type is StreamType.TIMELINE:
@@ -387,7 +434,9 @@ class RichHandler(StreamHandler):
387
434
  event_type = message.data.get("type", "event")
388
435
  summary = message.data.get("message") or ""
389
436
  level = message.data.get("level")
390
- formatted = f"[{event_type}] {summary}".strip()
437
+ # Mask sensitive URLs before displaying
438
+ sanitized_summary = _mask_sensitive_urls(summary)
439
+ formatted = f"[{event_type}] {sanitized_summary}".strip()
391
440
  if level:
392
441
  formatted = f"{formatted} ({level})"
393
442
  self._event_log.append(formatted)
@@ -51,6 +51,25 @@ class StreamEndpoints:
51
51
  timeline=f"{base}/timeline",
52
52
  )
53
53
 
54
@classmethod
def prompt_learning(cls, job_id: str) -> StreamEndpoints:
    """Build the endpoint set for a prompt learning job (MIPRO/GEPA).

    Status, events and metrics are served under
    ``/prompt-learning/online/jobs/<job_id>``. The ``/learning`` and
    ``/orchestration`` job routes are supplied as status and event fallbacks,
    and no timeline endpoint is configured.
    """
    primary = f"/prompt-learning/online/jobs/{job_id}"
    fallback_status = (
        f"/learning/jobs/{job_id}",
        f"/orchestration/jobs/{job_id}",
    )
    # Event fallbacks are the status fallbacks with "/events" appended.
    fallback_events = tuple(f"{path}/events" for path in fallback_status)
    return cls(
        status=primary,
        events=f"{primary}/events",
        metrics=f"{primary}/metrics",
        timeline=None,
        status_fallbacks=fallback_status,
        event_fallbacks=fallback_events,
    )
72
+
54
73
  @classmethod
55
74
  def rl(cls, job_id: str) -> StreamEndpoints:
56
75
  base = f"/rl/jobs/{job_id}"
@@ -22,6 +22,7 @@ class ModalDeploymentConfig:
22
22
  extra_local_dirs: Sequence[tuple[str, str]] = field(default_factory=tuple)
23
23
  secret_names: Sequence[str] = field(default_factory=tuple)
24
24
  volume_mounts: Sequence[tuple[str, str]] = field(default_factory=tuple)
25
+ env_vars: dict[str, str] = field(default_factory=dict)
25
26
  timeout: int = 600
26
27
  memory: int = 4096
27
28
  cpu: float = 2.0
synth_ai/task/config.py CHANGED
@@ -257,3 +257,5 @@ class FilterConfig:
257
257
  output_path.parent.mkdir(parents=True, exist_ok=True)
258
258
  return output_path
259
259
 
260
+
261
+