synth-ai 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of synth-ai has been flagged as potentially problematic in its registry.
- examples/baseline/banking77_baseline.py +204 -0
- examples/baseline/crafter_baseline.py +407 -0
- examples/baseline/pokemon_red_baseline.py +326 -0
- examples/baseline/simple_baseline.py +56 -0
- examples/baseline/warming_up_to_rl_baseline.py +239 -0
- examples/blog_posts/gepa/README.md +355 -0
- examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
- examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
- examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
- examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
- examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
- examples/blog_posts/gepa/gepa_baseline.py +204 -0
- examples/blog_posts/gepa/query_prompts_example.py +97 -0
- examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
- examples/blog_posts/gepa/task_apps.py +105 -0
- examples/blog_posts/gepa/test_gepa_local.sh +67 -0
- examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
- examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
- examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +12 -10
- examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +1 -0
- examples/blog_posts/pokemon_vl/extract_images.py +239 -0
- examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
- examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
- examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
- examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
- examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
- examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
- examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
- examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +1 -1
- examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
- examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +60 -10
- examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +1 -1
- examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +1 -0
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +1 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +1 -0
- examples/rl/configs/rl_from_base_qwen17.toml +1 -0
- examples/swe/task_app/hosted/inference/openai_client.py +0 -34
- examples/swe/task_app/hosted/policy_routes.py +17 -0
- examples/swe/task_app/hosted/rollout.py +4 -2
- examples/task_apps/banking77/__init__.py +6 -0
- examples/task_apps/banking77/banking77_task_app.py +841 -0
- examples/task_apps/banking77/deploy_wrapper.py +46 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
- examples/task_apps/crafter/task_app/grpo_crafter.py +24 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +355 -58
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +68 -7
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +78 -21
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
- examples/task_apps/gepa_benchmarks/__init__.py +7 -0
- examples/task_apps/gepa_benchmarks/common.py +260 -0
- examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
- examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
- examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
- examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
- examples/task_apps/pokemon_red/task_app.py +254 -36
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +1 -0
- examples/warming_up_to_rl/task_app/grpo_crafter.py +53 -4
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +152 -41
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +31 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +1 -0
- synth_ai/api/train/builders.py +90 -1
- synth_ai/api/train/cli.py +396 -21
- synth_ai/api/train/config_finder.py +13 -2
- synth_ai/api/train/configs/__init__.py +15 -1
- synth_ai/api/train/configs/prompt_learning.py +442 -0
- synth_ai/api/train/configs/rl.py +29 -0
- synth_ai/api/train/task_app.py +1 -1
- synth_ai/api/train/validators.py +277 -0
- synth_ai/baseline/__init__.py +25 -0
- synth_ai/baseline/config.py +209 -0
- synth_ai/baseline/discovery.py +214 -0
- synth_ai/baseline/execution.py +146 -0
- synth_ai/cli/__init__.py +85 -17
- synth_ai/cli/__main__.py +0 -0
- synth_ai/cli/claude.py +70 -0
- synth_ai/cli/codex.py +84 -0
- synth_ai/cli/commands/__init__.py +1 -0
- synth_ai/cli/commands/baseline/__init__.py +12 -0
- synth_ai/cli/commands/baseline/core.py +637 -0
- synth_ai/cli/commands/baseline/list.py +93 -0
- synth_ai/cli/commands/eval/core.py +13 -10
- synth_ai/cli/commands/filter/core.py +53 -17
- synth_ai/cli/commands/help/core.py +0 -1
- synth_ai/cli/commands/smoke/__init__.py +7 -0
- synth_ai/cli/commands/smoke/core.py +1436 -0
- synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
- synth_ai/cli/commands/status/subcommands/usage.py +203 -0
- synth_ai/cli/commands/train/judge_schemas.py +1 -0
- synth_ai/cli/commands/train/judge_validation.py +1 -0
- synth_ai/cli/commands/train/validation.py +0 -57
- synth_ai/cli/demo.py +35 -3
- synth_ai/cli/deploy/__init__.py +40 -25
- synth_ai/cli/deploy.py +162 -0
- synth_ai/cli/legacy_root_backup.py +14 -8
- synth_ai/cli/opencode.py +107 -0
- synth_ai/cli/root.py +9 -5
- synth_ai/cli/task_app_deploy.py +1 -1
- synth_ai/cli/task_apps.py +53 -53
- synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
- synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
- synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
- synth_ai/judge_schemas.py +1 -0
- synth_ai/learning/__init__.py +10 -0
- synth_ai/learning/prompt_learning_client.py +276 -0
- synth_ai/learning/prompt_learning_types.py +184 -0
- synth_ai/pricing/__init__.py +2 -0
- synth_ai/pricing/model_pricing.py +57 -0
- synth_ai/streaming/handlers.py +53 -4
- synth_ai/streaming/streamer.py +19 -0
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +2 -0
- synth_ai/task/tracing_utils.py +25 -25
- synth_ai/task/validators.py +44 -8
- synth_ai/task_app_cfgs.py +21 -0
- synth_ai/tracing_v3/config.py +162 -19
- synth_ai/tracing_v3/constants.py +1 -1
- synth_ai/tracing_v3/db_config.py +24 -38
- synth_ai/tracing_v3/storage/config.py +47 -13
- synth_ai/tracing_v3/storage/factory.py +3 -3
- synth_ai/tracing_v3/turso/daemon.py +113 -11
- synth_ai/tracing_v3/turso/native_manager.py +92 -16
- synth_ai/types.py +8 -0
- synth_ai/urls.py +11 -0
- synth_ai/utils/__init__.py +30 -1
- synth_ai/utils/agents.py +74 -0
- synth_ai/utils/bin.py +39 -0
- synth_ai/utils/cli.py +149 -5
- synth_ai/utils/env.py +17 -17
- synth_ai/utils/json.py +72 -0
- synth_ai/utils/modal.py +283 -1
- synth_ai/utils/paths.py +48 -0
- synth_ai/utils/uvicorn.py +113 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/METADATA +102 -4
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/RECORD +162 -88
- synth_ai/cli/commands/deploy/__init__.py +0 -23
- synth_ai/cli/commands/deploy/core.py +0 -614
- synth_ai/cli/commands/deploy/errors.py +0 -72
- synth_ai/cli/commands/deploy/validation.py +0 -11
- synth_ai/cli/deploy/core.py +0 -5
- synth_ai/cli/deploy/errors.py +0 -23
- synth_ai/cli/deploy/validation.py +0 -5
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
synth_ai/learning/prompt_learning_client.py ADDED
@@ -0,0 +1,276 @@
+"""Client utilities for querying prompt learning job results."""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional
+
+from .._utils.http import AsyncHttpClient
+from .prompt_learning_types import PromptResults
+
+
+def _validate_job_id(job_id: str) -> None:
+    """Validate that job_id has the expected prompt learning format.
+
+    Args:
+        job_id: Job ID to validate
+
+    Raises:
+        ValueError: If job_id doesn't start with 'pl_'
+    """
+    if not job_id.startswith("pl_"):
+        raise ValueError(
+            f"Invalid prompt learning job ID format: {job_id!r}. "
+            f"Expected format: 'pl_<identifier>' (e.g., 'pl_9c58b711c2644083')"
+        )
+
+
+class PromptLearningClient:
+    """Client for interacting with prompt learning jobs and retrieving results."""
+
+    def __init__(self, base_url: str, api_key: str, *, timeout: float = 30.0) -> None:
+        """Initialize the prompt learning client.
+
+        Args:
+            base_url: Base URL of the backend API (e.g., "http://localhost:8000")
+            api_key: API key for authentication
+            timeout: Request timeout in seconds
+        """
+        self._base_url = base_url.rstrip("/")
+        self._api_key = api_key
+        self._timeout = timeout
+
+    async def get_job(self, job_id: str) -> Dict[str, Any]:
+        """Get job metadata and status.
+
+        Args:
+            job_id: Job ID (e.g., "pl_9c58b711c2644083")
+
+        Returns:
+            Job metadata including status, best_score, created_at, etc.
+
+        Raises:
+            ValueError: If job_id format is invalid
+        """
+        _validate_job_id(job_id)
+        async with AsyncHttpClient(self._base_url, self._api_key, timeout=self._timeout) as http:
+            return await http.get(f"/api/prompt-learning/online/jobs/{job_id}")
+
+    async def get_events(
+        self, job_id: str, *, since_seq: int = 0, limit: int = 5000
+    ) -> List[Dict[str, Any]]:
+        """Get events for a prompt learning job.
+
+        Args:
+            job_id: Job ID
+            since_seq: Return events after this sequence number
+            limit: Maximum number of events to return
+
+        Returns:
+            List of event dictionaries with type, message, data, etc.
+
+        Raises:
+            ValueError: If job_id format is invalid or response structure is unexpected
+        """
+        _validate_job_id(job_id)
+        params = {"since_seq": since_seq, "limit": limit}
+        async with AsyncHttpClient(self._base_url, self._api_key, timeout=self._timeout) as http:
+            js = await http.get(
+                f"/api/prompt-learning/online/jobs/{job_id}/events",
+                params=params
+            )
+            if isinstance(js, dict) and isinstance(js.get("events"), list):
+                return js["events"]
+            # Unexpected response structure - raise instead of silently returning empty list
+            raise ValueError(
+                f"Unexpected response structure from events endpoint. "
+                f"Expected dict with 'events' list, got: {type(js).__name__}"
+            )
+
+    async def get_prompts(self, job_id: str) -> PromptResults:
+        """Get the best prompts and scoring metadata from a completed job.
+
+        Args:
+            job_id: Job ID
+
+        Returns:
+            PromptResults dataclass containing:
+                - best_prompt: The top-performing prompt with sections and metadata
+                - best_score: The best accuracy score achieved
+                - top_prompts: List of top-K prompts with train/val scores
+                - optimized_candidates: All frontier/Pareto-optimal candidates
+                - attempted_candidates: All candidates tried during optimization
+
+        Raises:
+            ValueError: If job_id format is invalid
+        """
+        _validate_job_id(job_id)
+        events = await self.get_events(job_id, limit=10000)
+
+        result = PromptResults()
+
+        # Extract results from events
+        for event in events:
+            event_type = event.get("type", "")
+            event_data = event.get("data", {})
+
+            # Best prompt event
+            if event_type == "prompt.learning.best.prompt":
+                result.best_prompt = event_data.get("best_prompt")
+                result.best_score = event_data.get("best_score")
+
+            # Top-K prompt content events
+            elif event_type == "prompt.learning.top.prompt.content":
+                result.top_prompts.append({
+                    "rank": event_data.get("rank"),
+                    "train_accuracy": event_data.get("train_accuracy"),
+                    "val_accuracy": event_data.get("val_accuracy"),
+                    "template": event_data.get("template"),
+                    "full_text": event_data.get("full_text"),
+                })
+
+            # Final results event (contains all candidates)
+            elif event_type == "prompt.learning.final.results":
+                result.optimized_candidates = event_data.get("optimized_candidates", [])
+                result.attempted_candidates = event_data.get("attempted_candidates", [])
+
+            # Validation results
+            elif event_type == "prompt.learning.validation.scored":
+                result.validation_results.append(event_data)
+
+            # Completion event (fallback for best_score)
+            elif event_type == "prompt.learning.gepa.complete":
+                if result.best_score is None:
+                    result.best_score = event_data.get("best_score")
+
+        return result
+
+    async def get_prompt_text(self, job_id: str, rank: int = 1) -> Optional[str]:
+        """Get the full text of a specific prompt by rank.
+
+        Args:
+            job_id: Job ID
+            rank: Prompt rank (1 = best, 2 = second best, etc.)
+
+        Returns:
+            Full prompt text or None if not found
+
+        Raises:
+            ValueError: If job_id format is invalid or rank < 1
+        """
+        _validate_job_id(job_id)
+        if rank < 1:
+            raise ValueError(f"Rank must be >= 1, got: {rank}")
+        prompts_data = await self.get_prompts(job_id)
+        top_prompts = prompts_data.top_prompts
+
+        for prompt_info in top_prompts:
+            if prompt_info.get("rank") == rank:
+                return prompt_info.get("full_text")
+
+        return None
+
+    async def get_scoring_summary(self, job_id: str) -> Dict[str, Any]:
+        """Get a summary of scoring metrics for all candidates.
+
+        Args:
+            job_id: Job ID
+
+        Returns:
+            Dictionary with scoring statistics:
+                - best_train_accuracy: Best training accuracy
+                - best_val_accuracy: Best validation accuracy (if available)
+                - num_candidates_tried: Total candidates evaluated
+                - num_frontier_candidates: Number in Pareto frontier
+                - score_distribution: Histogram of accuracy scores
+
+        Raises:
+            ValueError: If job_id format is invalid
+        """
+        _validate_job_id(job_id)
+        prompts_data = await self.get_prompts(job_id)
+
+        attempted = prompts_data.attempted_candidates
+        optimized = prompts_data.optimized_candidates
+        validation = prompts_data.validation_results
+
+        # Extract train accuracies (only from candidates that have accuracy field)
+        train_accuracies = [
+            c["accuracy"] for c in attempted if "accuracy" in c
+        ]
+
+        # Extract val accuracies (only from validations that have accuracy field)
+        val_accuracies = [
+            v["accuracy"] for v in validation if "accuracy" in v
+        ]
+
+        # Score distribution (bins)
+        bins = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
+        distribution = {f"{bins[i]:.1f}-{bins[i+1]:.1f}": 0 for i in range(len(bins) - 1)}
+        for acc in train_accuracies:
+            for i in range(len(bins) - 1):
+                if bins[i] <= acc < bins[i+1] or (i == len(bins) - 2 and acc == bins[i+1]):
+                    distribution[f"{bins[i]:.1f}-{bins[i+1]:.1f}"] += 1
+                    break
+
+        return {
+            "best_train_accuracy": max(train_accuracies) if train_accuracies else None,
+            "best_val_accuracy": max(val_accuracies) if val_accuracies else None,
+            "num_candidates_tried": len(attempted),
+            "num_frontier_candidates": len(optimized),
+            "score_distribution": distribution,
+            "mean_train_accuracy": sum(train_accuracies) / len(train_accuracies) if train_accuracies else None,
+        }
+
+
+# Synchronous wrappers for convenience
+def get_prompts(job_id: str, base_url: str, api_key: str) -> PromptResults:
+    """Synchronous wrapper to get prompts from a job.
+
+    Args:
+        job_id: Job ID (e.g., "pl_9c58b711c2644083")
+        base_url: Backend API base URL
+        api_key: API key for authentication
+
+    Returns:
+        PromptResults dataclass with prompt results
+    """
+    import asyncio
+
+    client = PromptLearningClient(base_url, api_key)
+    return asyncio.run(client.get_prompts(job_id))
+
+
+def get_prompt_text(job_id: str, base_url: str, api_key: str, rank: int = 1) -> Optional[str]:
+    """Synchronous wrapper to get prompt text by rank.
+
+    Args:
+        job_id: Job ID
+        base_url: Backend API base URL
+        api_key: API key for authentication
+        rank: Prompt rank (1 = best, 2 = second best, etc.)
+
+    Returns:
+        Full prompt text or None if not found
+    """
+    import asyncio
+
+    client = PromptLearningClient(base_url, api_key)
+    return asyncio.run(client.get_prompt_text(job_id, rank))
+
+
+def get_scoring_summary(job_id: str, base_url: str, api_key: str) -> Dict[str, Any]:
+    """Synchronous wrapper to get scoring summary.
+
+    Args:
+        job_id: Job ID
+        base_url: Backend API base URL
+        api_key: API key for authentication
+
+    Returns:
+        Dictionary with scoring statistics
+    """
+    import asyncio
+
+    client = PromptLearningClient(base_url, api_key)
+    return asyncio.run(client.get_scoring_summary(job_id))
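
For reference, a minimal usage sketch of the new client (not part of the diff; it assumes a reachable backend, and the base URL, API key, and job ID are placeholders):

    import asyncio

    from synth_ai.learning.prompt_learning_client import PromptLearningClient

    async def main() -> None:
        client = PromptLearningClient("http://localhost:8000", api_key="YOUR_API_KEY")
        results = await client.get_prompts("pl_9c58b711c2644083")
        print("best score:", results.best_score)
        for prompt in results.top_prompts:
            print(prompt.get("rank"), prompt.get("val_accuracy"))

    asyncio.run(main())

The module-level get_prompts / get_prompt_text / get_scoring_summary wrappers cover the same calls without an event loop.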
synth_ai/learning/prompt_learning_types.py ADDED
@@ -0,0 +1,184 @@
+"""Type definitions for prompt learning data structures."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+
+@dataclass
+class TextReplacement:
+    """A text replacement in a prompt transformation."""
+
+    new_text: str
+    apply_to_role: str = "system"
+    old_text: Optional[str] = None
+    position: Optional[int] = None
+
+
+@dataclass
+class CandidateScore:
+    """Scoring information for a candidate prompt."""
+
+    accuracy: float
+    prompt_length: int = 0
+    tool_call_rate: float = 0.0
+    instance_scores: List[float] = field(default_factory=list)
+
+
+@dataclass
+class PromptSection:
+    """A section of a prompt (e.g., system, user, assistant)."""
+
+    role: str
+    content: str
+
+
+@dataclass
+class Candidate:
+    """A candidate prompt from the optimization process."""
+
+    accuracy: float
+    prompt_length: int = 0
+    tool_call_rate: float = 0.0
+    instance_scores: List[float] = field(default_factory=list)
+    object: Optional[Dict[str, Any]] = None
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> Candidate:
+        """Create a Candidate from a dictionary."""
+        return cls(
+            accuracy=data.get("accuracy", 0.0),
+            prompt_length=data.get("prompt_length", 0),
+            tool_call_rate=data.get("tool_call_rate", 0.0),
+            instance_scores=data.get("instance_scores", []),
+            object=data.get("object"),
+        )
+
+
+@dataclass
+class OptimizedCandidate:
+    """An optimized candidate from the Pareto frontier."""
+
+    score: CandidateScore
+    payload_kind: str  # "transformation" or "template"
+    object: Optional[Dict[str, Any]] = None
+    instance_scores: Optional[List[float]] = None
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> OptimizedCandidate:
+        """Create an OptimizedCandidate from a dictionary."""
+        score_data = data.get("score", {})
+        if isinstance(score_data, dict):
+            score = CandidateScore(
+                accuracy=score_data.get("accuracy", 0.0),
+                prompt_length=score_data.get("prompt_length", 0),
+                tool_call_rate=score_data.get("tool_call_rate", 0.0),
+                instance_scores=score_data.get("instance_scores", []),
+            )
+        else:
+            score = CandidateScore(accuracy=0.0)
+
+        return cls(
+            score=score,
+            payload_kind=data.get("payload_kind", "unknown"),
+            object=data.get("object"),
+            instance_scores=data.get("instance_scores"),
+        )
+
+
+@dataclass
+class PromptLearningEvent:
+    """A generic prompt learning event."""
+
+    type: str
+    message: str
+    data: Dict[str, Any]
+    seq: int
+    created_at: Optional[str] = None
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> PromptLearningEvent:
+        """Create a PromptLearningEvent from a dictionary."""
+        return cls(
+            type=data.get("type", ""),
+            message=data.get("message", ""),
+            data=data.get("data", {}),
+            seq=data.get("seq", 0),
+            created_at=data.get("created_at"),
+        )
+
+
+@dataclass
+class BestPromptEventData:
+    """Data for prompt.learning.best.prompt event."""
+
+    best_score: float
+    best_prompt: Dict[str, Any]
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> BestPromptEventData:
+        """Create BestPromptEventData from a dictionary."""
+        return cls(
+            best_score=data.get("best_score", 0.0),
+            best_prompt=data.get("best_prompt", {}),
+        )
+
+
+@dataclass
+class FinalResultsEventData:
+    """Data for prompt.learning.final.results event."""
+
+    attempted_candidates: List[Dict[str, Any]]
+    optimized_candidates: List[Dict[str, Any]]
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> FinalResultsEventData:
+        """Create FinalResultsEventData from a dictionary."""
+        return cls(
+            attempted_candidates=data.get("attempted_candidates", []),
+            optimized_candidates=data.get("optimized_candidates", []),
+        )
+
+
+@dataclass
+class ValidationScoredEventData:
+    """Data for prompt.learning.validation.scored event."""
+
+    accuracy: float
+    instance_scores: List[float] = field(default_factory=list)
+    is_baseline: bool = False
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> ValidationScoredEventData:
+        """Create ValidationScoredEventData from a dictionary."""
+        return cls(
+            accuracy=data.get("accuracy", 0.0),
+            instance_scores=data.get("instance_scores", []),
+            is_baseline=data.get("is_baseline", False),
+        )
+
+
+@dataclass
+class PromptResults:
+    """Results from a completed prompt learning job."""
+
+    best_prompt: Optional[Dict[str, Any]] = None
+    best_score: Optional[float] = None
+    top_prompts: List[Dict[str, Any]] = field(default_factory=list)
+    optimized_candidates: List[Dict[str, Any]] = field(default_factory=list)
+    attempted_candidates: List[Dict[str, Any]] = field(default_factory=list)
+    validation_results: List[Dict[str, Any]] = field(default_factory=list)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> PromptResults:
+        """Create PromptResults from a dictionary."""
+        return cls(
+            best_prompt=data.get("best_prompt"),
+            best_score=data.get("best_score"),
+            top_prompts=data.get("top_prompts", []),
+            optimized_candidates=data.get("optimized_candidates", []),
+            attempted_candidates=data.get("attempted_candidates", []),
+            validation_results=data.get("validation_results", []),
+        )
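
For reference, a sketch of hydrating one of these dataclasses from an event payload (the payload dict below is hypothetical, shaped to match the from_dict defaults above):

    from synth_ai.learning.prompt_learning_types import OptimizedCandidate

    payload = {
        "score": {"accuracy": 0.82, "prompt_length": 1240, "tool_call_rate": 1.0},
        "payload_kind": "template",
    }
    candidate = OptimizedCandidate.from_dict(payload)
    print(candidate.score.accuracy, candidate.payload_kind)  # 0.82 template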
synth_ai/pricing/model_pricing.py ADDED
@@ -0,0 +1,57 @@
+"""Static pricing table for supported models.
+
+This module provides per-token pricing used by the SDK status commands.
+Rates are expressed in USD per token and split into input/output prices.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Dict
+
+
+@dataclass(frozen=True)
+class TokenRates:
+    input_usd: float
+    output_usd: float
+
+
+# Default per-token prices (USD), sourced Nov 3, 2025 — update as contracts change
+MODEL_PRICES: Dict[str, Dict[str, TokenRates]] = {
+    # OpenAI official pricing
+    "openai": {
+        # GPT-5 family
+        "gpt-5": TokenRates(input_usd=0.00000125, output_usd=0.00001000),  # $1.25 / $10 per 1M
+        "gpt-5-mini": TokenRates(input_usd=0.00000025, output_usd=0.00000200),  # $0.25 / $2.00 per 1M
+        "gpt-5-nano": TokenRates(input_usd=0.00000005, output_usd=0.00000040),  # $0.05 / $0.40 per 1M
+
+        "gpt-4o-mini": TokenRates(input_usd=0.00000015, output_usd=0.00000060),  # $0.15 / $0.60 per 1M
+        "gpt-4o": TokenRates(input_usd=0.00000250, output_usd=0.00001000),  # $2.50 / $10.00 per 1M
+    },
+    # Groq OSS via OpenAI-compatible path (latest Groq docs)
+    "groq": {
+        "openai/gpt-oss-20b": TokenRates(input_usd=0.000000075, output_usd=0.000000300),  # $0.075 / $0.30 per 1M
+
+        "openai/gpt-oss-120b": TokenRates(input_usd=0.000000150, output_usd=0.000000600),  # $0.15 / $0.60 per 1M
+
+        # Additional Groq on-demand models
+        "moonshotai/kimi-k2-0905": TokenRates(input_usd=0.000001000, output_usd=0.000003000),  # $1.00 / $3.00 per 1M
+
+        "meta/llama-guard-4-12b": TokenRates(input_usd=0.000000200, output_usd=0.000000200),  # $0.20 / $0.20 per 1M
+        "qwen/qwen3-32b": TokenRates(input_usd=0.000000290, output_usd=0.000000590),  # $0.29 / $0.59 per 1M
+        "meta/llama-3.3-70b-versatile": TokenRates(input_usd=0.000000590, output_usd=0.000000790),  # $0.59 / $0.79 per 1M
+        "meta/llama-3.1-8b-instant": TokenRates(input_usd=0.000000050, output_usd=0.000000080),  # $0.05 / $0.08 per 1M
+    },
+    # Google Gemini pricing — per-token USD (per 1M ÷ 1e6), Nov 3, 2025
+    "google": {
+        # Gemini 2.5 Pro (two tiers by prompt size)
+        "gemini-2.5-pro": TokenRates(input_usd=0.00000125, output_usd=0.00001000),  # <=200k tokens
+        "gemini-2.5-pro-gt200k": TokenRates(input_usd=0.00000250, output_usd=0.00001500),  # >200k tokens
+
+        # Gemini 2.5 Flash (hybrid reasoning)
+        "gemini-2.5-flash": TokenRates(input_usd=0.00000030, output_usd=0.00000250),
+
+        # Gemini 2.5 Flash-Lite (cheapest)
+        "gemini-2.5-flash-lite": TokenRates(input_usd=0.00000010, output_usd=0.00000040),
+    },
+}
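
For reference, a sketch of estimating a request's cost from these per-token rates (the token counts are illustrative):

    from synth_ai.pricing.model_pricing import MODEL_PRICES

    rates = MODEL_PRICES["openai"]["gpt-5-nano"]
    cost = 12_000 * rates.input_usd + 3_000 * rates.output_usd  # prompt + completion tokens
    print(f"${cost:.6f}")  # $0.001800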
synth_ai/streaming/handlers.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import contextlib
 import json
+import re
 import time
 from abc import ABC, abstractmethod
 from collections import deque
@@ -14,6 +15,37 @@ import click
 from .types import StreamMessage, StreamType
 
 
+def _mask_sensitive_urls(text: str) -> str:
+    """Mask S3/Wasabi URLs and sensitive paths in log messages.
+
+    Replaces full S3/Wasabi URLs with masked versions to prevent leaking
+    bucket names, paths, and infrastructure details in public SDK logs.
+
+    Examples:
+        s3://synth-artifacts/models/... -> s3://***/***/[masked]
+        Wasabi s3://bucket/path/file.tar.gz -> Wasabi s3://***/***/[masked]
+    """
+    if not text:
+        return text
+
+    # Pattern matches:
+    # - Optional "Wasabi " prefix
+    # - s3:// or http(s):// scheme
+    # - Any bucket/host
+    # - Any path
+    # - Common model file extensions
+    pattern = r'(Wasabi\s+)?((s3|https?)://[^\s]+\.(tar\.gz|zip|pt|pth|safetensors|ckpt|bin))'
+
+    def replace_url(match: re.Match) -> str:
+        prefix = match.group(1) or ""  # "Wasabi " or empty
+        url = match.group(2)
+        # Extract just the filename
+        filename = url.split("/")[-1] if "/" in url else "file"
+        return f'{prefix}s3://***/***/[{filename}]'
+
+    return re.sub(pattern, replace_url, text, flags=re.IGNORECASE)
+
+
 class StreamHandler(ABC):
     """Base class for log handlers that consume ``StreamMessage`` objects."""
 
@@ -72,14 +104,29 @@ class CLIHandler(StreamHandler):
             prefix = f"[{timestamp}] [{message.seq}] {event_type}"
             if level:
                 prefix += f" ({level})"
-
+            # Mask sensitive URLs before displaying
+            sanitized_msg = _mask_sensitive_urls(msg)
+            click.echo(f"{prefix}: {sanitized_msg}".rstrip(": "))
             return
 
         if message.stream_type is StreamType.METRICS:
-            name = message.data.get("name"
+            name = message.data.get("name")
             value = message.data.get("value")
             step = message.data.get("step")
-
+            data = message.data.get("data", {})
+
+            # Format metric display
+            metric_str = f"[{timestamp}] [metric] {name}={value:.4f}" if isinstance(value, (int, float)) else f"[{timestamp}] [metric] {name}={value}"
+            if step is not None:
+                metric_str += f" (step={step})"
+
+            # Add any additional context from data field
+            if isinstance(data, dict):
+                n = data.get("n")
+                if n is not None:
+                    metric_str += f" n={n}"
+
+            click.echo(metric_str)
            return
 
         if message.stream_type is StreamType.TIMELINE:
@@ -387,7 +434,9 @@ class RichHandler(StreamHandler):
         event_type = message.data.get("type", "event")
         summary = message.data.get("message") or ""
         level = message.data.get("level")
-
+        # Mask sensitive URLs before displaying
+        sanitized_summary = _mask_sensitive_urls(summary)
+        formatted = f"[{event_type}] {sanitized_summary}".strip()
         if level:
            formatted = f"{formatted} ({level})"
        self._event_log.append(formatted)
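
For reference, a sketch of the masking behavior, calling the private helper added above (the message string is made up):

    from synth_ai.streaming.handlers import _mask_sensitive_urls

    msg = "uploaded Wasabi s3://synth-artifacts/models/run1/model.tar.gz"
    print(_mask_sensitive_urls(msg))
    # uploaded Wasabi s3://***/***/[model.tar.gz]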
synth_ai/streaming/streamer.py CHANGED
@@ -51,6 +51,25 @@ class StreamEndpoints:
             timeline=f"{base}/timeline",
         )
 
+    @classmethod
+    def prompt_learning(cls, job_id: str) -> StreamEndpoints:
+        """Endpoints for prompt learning jobs (MIPRO/GEPA)."""
+        base = f"/prompt-learning/online/jobs/{job_id}"
+        return cls(
+            status=base,
+            events=f"{base}/events",
+            metrics=f"{base}/metrics",
+            timeline=None,
+            status_fallbacks=(
+                f"/learning/jobs/{job_id}",
+                f"/orchestration/jobs/{job_id}",
+            ),
+            event_fallbacks=(
+                f"/learning/jobs/{job_id}/events",
+                f"/orchestration/jobs/{job_id}/events",
+            ),
+        )
+
     @classmethod
     def rl(cls, job_id: str) -> StreamEndpoints:
         base = f"/rl/jobs/{job_id}"
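
For reference, the paths the new constructor yields (derived from the diff above; the job ID is a placeholder):

    from synth_ai.streaming.streamer import StreamEndpoints

    eps = StreamEndpoints.prompt_learning("pl_123")
    print(eps.status)               # /prompt-learning/online/jobs/pl_123
    print(eps.events)               # /prompt-learning/online/jobs/pl_123/events
    print(eps.status_fallbacks[0])  # /learning/jobs/pl_123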
synth_ai/task/apps/__init__.py CHANGED
@@ -22,6 +22,7 @@ class ModalDeploymentConfig:
     extra_local_dirs: Sequence[tuple[str, str]] = field(default_factory=tuple)
     secret_names: Sequence[str] = field(default_factory=tuple)
     volume_mounts: Sequence[tuple[str, str]] = field(default_factory=tuple)
+    env_vars: dict[str, str] = field(default_factory=dict)
     timeout: int = 600
     memory: int = 4096
     cpu: float = 2.0
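
For reference, a sketch of how the new env_vars mapping might be consumed at deploy time (apply_env is a hypothetical helper, not part of this release; cfg is assumed to be an already-constructed ModalDeploymentConfig):

    import os

    def apply_env(cfg) -> None:
        # Seed the process environment from the config without clobbering existing values.
        for key, value in cfg.env_vars.items():
            os.environ.setdefault(key, value)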