synth-ai 0.2.9.dev4__py3-none-any.whl → 0.2.9.dev7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/common_old/backend.py +0 -1
- examples/crafter_debug_render.py +15 -6
- examples/evals_old/compare_models.py +1 -0
- examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +6 -2
- examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +4 -4
- examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +4 -3
- examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +6 -2
- examples/finetuning_old/synth_qwen_v1/finetune.py +1 -1
- examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +4 -4
- examples/finetuning_old/synth_qwen_v1/infer.py +1 -2
- examples/finetuning_old/synth_qwen_v1/poll.py +4 -2
- examples/finetuning_old/synth_qwen_v1/prepare_data.py +8 -8
- examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +5 -4
- examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +11 -8
- examples/finetuning_old/synth_qwen_v1/run_ft_job.py +17 -12
- examples/finetuning_old/synth_qwen_v1/upload_data.py +1 -1
- examples/finetuning_old/synth_qwen_v1/util.py +7 -2
- examples/rl/configs/eval_base_qwen.toml +1 -1
- examples/rl/configs/rl_from_base_qwen17.toml +1 -1
- examples/rl/download_dataset.py +26 -10
- examples/rl/run_eval.py +17 -15
- examples/rl/run_rl_and_save.py +24 -7
- examples/rl/task_app/math_single_step.py +128 -11
- examples/rl/task_app/math_task_app.py +11 -3
- examples/rl_old/task_app.py +222 -53
- examples/warming_up_to_rl/analyze_trace_db.py +7 -5
- examples/warming_up_to_rl/export_trace_sft.py +141 -16
- examples/warming_up_to_rl/groq_test.py +11 -4
- examples/warming_up_to_rl/manage_secrets.py +15 -6
- examples/warming_up_to_rl/readme.md +9 -2
- examples/warming_up_to_rl/run_eval.py +108 -30
- examples/warming_up_to_rl/run_fft_and_save.py +128 -52
- examples/warming_up_to_rl/run_local_rollout.py +87 -36
- examples/warming_up_to_rl/run_local_rollout_modal.py +113 -25
- examples/warming_up_to_rl/run_local_rollout_parallel.py +80 -16
- examples/warming_up_to_rl/run_local_rollout_traced.py +125 -20
- examples/warming_up_to_rl/run_rl_and_save.py +31 -7
- examples/warming_up_to_rl/run_rollout_remote.py +37 -10
- examples/warming_up_to_rl/task_app/grpo_crafter.py +90 -27
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +9 -27
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +46 -108
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +50 -17
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +35 -21
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +8 -4
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +29 -26
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +17 -13
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +106 -63
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +82 -84
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +76 -59
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +43 -49
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +5 -15
- synth_ai/__init__.py +1 -0
- synth_ai/api/train/builders.py +34 -10
- synth_ai/api/train/cli.py +172 -32
- synth_ai/api/train/config_finder.py +59 -4
- synth_ai/api/train/env_resolver.py +32 -14
- synth_ai/api/train/pollers.py +11 -3
- synth_ai/api/train/task_app.py +4 -1
- synth_ai/api/train/utils.py +20 -4
- synth_ai/cli/__init__.py +11 -4
- synth_ai/cli/balance.py +1 -1
- synth_ai/cli/demo.py +19 -5
- synth_ai/cli/rl_demo.py +75 -16
- synth_ai/cli/root.py +116 -37
- synth_ai/cli/task_apps.py +1286 -170
- synth_ai/cli/traces.py +1 -0
- synth_ai/cli/turso.py +73 -0
- synth_ai/core/experiment.py +0 -2
- synth_ai/demo_registry.py +67 -30
- synth_ai/demos/core/cli.py +493 -164
- synth_ai/demos/demo_task_apps/core.py +50 -6
- synth_ai/demos/demo_task_apps/crafter/configs/crafter_fft_4b.toml +2 -3
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +36 -28
- synth_ai/demos/demo_task_apps/math/_common.py +1 -2
- synth_ai/demos/demo_task_apps/math/deploy_modal.py +0 -2
- synth_ai/demos/demo_task_apps/math/modal_task_app.py +168 -65
- synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -1
- synth_ai/environments/examples/bandit/engine.py +12 -4
- synth_ai/environments/examples/bandit/taskset.py +4 -4
- synth_ai/environments/reproducibility/tree.py +3 -1
- synth_ai/environments/service/core_routes.py +6 -2
- synth_ai/evals/base.py +0 -2
- synth_ai/experimental/synth_oss.py +11 -12
- synth_ai/handshake.py +3 -1
- synth_ai/http_client.py +31 -7
- synth_ai/inference/__init__.py +0 -2
- synth_ai/inference/client.py +8 -4
- synth_ai/jobs/client.py +40 -10
- synth_ai/learning/client.py +33 -8
- synth_ai/learning/config.py +0 -2
- synth_ai/learning/constants.py +0 -2
- synth_ai/learning/ft_client.py +6 -3
- synth_ai/learning/health.py +9 -2
- synth_ai/learning/jobs.py +17 -5
- synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +1 -3
- synth_ai/learning/prompts/random_search.py +4 -1
- synth_ai/learning/prompts/run_random_search_banking77.py +6 -1
- synth_ai/learning/rl_client.py +42 -14
- synth_ai/learning/sse.py +0 -2
- synth_ai/learning/validators.py +6 -2
- synth_ai/lm/caching/ephemeral.py +1 -3
- synth_ai/lm/core/exceptions.py +0 -2
- synth_ai/lm/core/main.py +13 -1
- synth_ai/lm/core/synth_models.py +0 -1
- synth_ai/lm/core/vendor_clients.py +4 -2
- synth_ai/lm/overrides.py +2 -2
- synth_ai/lm/vendors/core/anthropic_api.py +7 -7
- synth_ai/lm/vendors/core/openai_api.py +2 -0
- synth_ai/lm/vendors/openai_standard.py +3 -1
- synth_ai/lm/vendors/openai_standard_responses.py +6 -3
- synth_ai/lm/vendors/supported/custom_endpoint.py +1 -3
- synth_ai/lm/vendors/synth_client.py +37 -10
- synth_ai/rl/__init__.py +0 -1
- synth_ai/rl/contracts.py +0 -2
- synth_ai/rl/env_keys.py +6 -1
- synth_ai/task/__init__.py +1 -0
- synth_ai/task/apps/__init__.py +11 -11
- synth_ai/task/auth.py +29 -17
- synth_ai/task/client.py +3 -1
- synth_ai/task/contracts.py +1 -0
- synth_ai/task/datasets.py +3 -1
- synth_ai/task/errors.py +3 -2
- synth_ai/task/health.py +0 -2
- synth_ai/task/json.py +0 -1
- synth_ai/task/proxy.py +2 -5
- synth_ai/task/rubrics.py +9 -3
- synth_ai/task/server.py +31 -5
- synth_ai/task/tracing_utils.py +8 -3
- synth_ai/task/validators.py +0 -1
- synth_ai/task/vendors.py +0 -1
- synth_ai/tracing_v3/db_config.py +26 -1
- synth_ai/tracing_v3/decorators.py +1 -0
- synth_ai/tracing_v3/examples/basic_usage.py +3 -2
- synth_ai/tracing_v3/hooks.py +2 -0
- synth_ai/tracing_v3/replica_sync.py +1 -0
- synth_ai/tracing_v3/session_tracer.py +24 -3
- synth_ai/tracing_v3/storage/base.py +4 -1
- synth_ai/tracing_v3/storage/factory.py +0 -1
- synth_ai/tracing_v3/turso/manager.py +102 -38
- synth_ai/tracing_v3/turso/models.py +4 -1
- synth_ai/tracing_v3/utils.py +1 -0
- synth_ai/v0/tracing/upload.py +32 -135
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev7.dist-info}/METADATA +1 -1
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev7.dist-info}/RECORD +154 -156
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_stepwise_rewards.py +0 -58
- synth_ai/environments/examples/sokoban/units/astar_common.py +0 -95
- synth_ai/install_sqld.sh +0 -40
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev7.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev7.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev7.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev7.dist-info}/top_level.txt +0 -0
synth_ai/api/train/builders.py
CHANGED
|
@@ -34,9 +34,13 @@ def build_rl_payload(
|
|
|
34
34
|
services = data.get("services") if isinstance(data.get("services"), dict) else {}
|
|
35
35
|
model_cfg = data.get("model") if isinstance(data.get("model"), dict) else {}
|
|
36
36
|
|
|
37
|
-
final_task_url = (
|
|
37
|
+
final_task_url = (
|
|
38
|
+
overrides.get("task_url") or task_url or services.get("task_url") or ""
|
|
39
|
+
).strip()
|
|
38
40
|
if not final_task_url:
|
|
39
|
-
raise click.ClickException(
|
|
41
|
+
raise click.ClickException(
|
|
42
|
+
"Task app URL required (provide --task-url or set services.task_url in TOML)"
|
|
43
|
+
)
|
|
40
44
|
|
|
41
45
|
model_source = (model_cfg.get("source") or "").strip()
|
|
42
46
|
model_base = (model_cfg.get("base") or "").strip()
|
|
@@ -45,7 +49,9 @@ def build_rl_payload(
|
|
|
45
49
|
model_source = override_model
|
|
46
50
|
model_base = ""
|
|
47
51
|
if bool(model_source) == bool(model_base):
|
|
48
|
-
raise click.ClickException(
|
|
52
|
+
raise click.ClickException(
|
|
53
|
+
"Model section must specify exactly one of [model].source or [model].base"
|
|
54
|
+
)
|
|
49
55
|
|
|
50
56
|
# Force TOML services.task_url to the effective endpoint to avoid split URLs
|
|
51
57
|
try:
|
|
@@ -93,15 +99,23 @@ def build_sft_payload(
|
|
|
93
99
|
if not raw_dataset:
|
|
94
100
|
raise TrainError("Dataset not specified; pass --dataset or set [job].data")
|
|
95
101
|
dataset_path = Path(raw_dataset)
|
|
96
|
-
|
|
102
|
+
# Resolve relative paths from current working directory, not config directory
|
|
103
|
+
dataset_path = (
|
|
104
|
+
dataset_path if dataset_path.is_absolute() else (Path.cwd() / dataset_path)
|
|
105
|
+
).resolve()
|
|
97
106
|
if not dataset_path.exists():
|
|
98
107
|
raise TrainError(f"Dataset not found: {dataset_path}")
|
|
99
108
|
|
|
100
|
-
validation_path =
|
|
109
|
+
validation_path = (
|
|
110
|
+
data_cfg.get("validation_path")
|
|
111
|
+
if isinstance(data_cfg.get("validation_path"), str)
|
|
112
|
+
else None
|
|
113
|
+
)
|
|
101
114
|
validation_file = None
|
|
102
115
|
if validation_path:
|
|
103
116
|
vpath = Path(validation_path)
|
|
104
|
-
|
|
117
|
+
# Resolve relative paths from current working directory, not config directory
|
|
118
|
+
vpath = (vpath if vpath.is_absolute() else (Path.cwd() / vpath)).resolve()
|
|
105
119
|
if not vpath.exists():
|
|
106
120
|
click.echo(f"[WARN] Validation dataset {vpath} missing; continuing without validation")
|
|
107
121
|
else:
|
|
@@ -125,15 +139,23 @@ def build_sft_payload(
|
|
|
125
139
|
if isinstance(hp_cfg.get("parallelism"), dict):
|
|
126
140
|
hp_block["parallelism"] = hp_cfg["parallelism"]
|
|
127
141
|
|
|
128
|
-
compute_block = {
|
|
142
|
+
compute_block = {
|
|
143
|
+
k: compute_cfg[k] for k in ("gpu_type", "gpu_count", "nodes") if k in compute_cfg
|
|
144
|
+
}
|
|
129
145
|
|
|
130
146
|
effective = {
|
|
131
147
|
"compute": compute_block,
|
|
132
|
-
"data": {
|
|
148
|
+
"data": {
|
|
149
|
+
"topology": data_cfg.get("topology", {})
|
|
150
|
+
if isinstance(data_cfg.get("topology"), dict)
|
|
151
|
+
else {}
|
|
152
|
+
},
|
|
133
153
|
"training": {k: v for k, v in train_cfg.items() if k in ("mode", "use_qlora")},
|
|
134
154
|
}
|
|
135
155
|
|
|
136
|
-
validation_cfg =
|
|
156
|
+
validation_cfg = (
|
|
157
|
+
train_cfg.get("validation") if isinstance(train_cfg.get("validation"), dict) else None
|
|
158
|
+
)
|
|
137
159
|
if isinstance(validation_cfg, dict):
|
|
138
160
|
hp_block.update(
|
|
139
161
|
{
|
|
@@ -144,7 +166,9 @@ def build_sft_payload(
|
|
|
144
166
|
"greater_is_better": bool(validation_cfg.get("greater_is_better", False)),
|
|
145
167
|
}
|
|
146
168
|
)
|
|
147
|
-
effective.setdefault("training", {})["validation"] = {
|
|
169
|
+
effective.setdefault("training", {})["validation"] = {
|
|
170
|
+
"enabled": bool(validation_cfg.get("enabled", True))
|
|
171
|
+
}
|
|
148
172
|
|
|
149
173
|
payload = {
|
|
150
174
|
"model": job_cfg.get("model") or data.get("model"),
|
synth_ai/api/train/cli.py
CHANGED
|
@@ -24,6 +24,7 @@ from .utils import (
|
|
|
24
24
|
sleep,
|
|
25
25
|
validate_sft_jsonl,
|
|
26
26
|
)
|
|
27
|
+
from synth_ai.config.base_url import get_backend_from_env
|
|
27
28
|
|
|
28
29
|
|
|
29
30
|
def _discover_dataset_candidates(config_path: Path, limit: int = 50) -> list[Path]:
|
|
@@ -92,20 +93,57 @@ def _prompt_manual_dataset() -> Path:
|
|
|
92
93
|
return Path(manual).expanduser()
|
|
93
94
|
|
|
94
95
|
|
|
96
|
+
def _default_backend() -> str:
|
|
97
|
+
"""Resolve backend URL with proper production default."""
|
|
98
|
+
# Check explicit override first
|
|
99
|
+
explicit = os.getenv("BACKEND_BASE_URL", "").strip()
|
|
100
|
+
if explicit:
|
|
101
|
+
return explicit
|
|
102
|
+
# Use standard resolution logic
|
|
103
|
+
base, _ = get_backend_from_env()
|
|
104
|
+
return f"{base}/api" if not base.endswith("/api") else base
|
|
105
|
+
|
|
106
|
+
|
|
95
107
|
@click.command("train")
|
|
96
|
-
@click.option(
|
|
108
|
+
@click.option(
|
|
109
|
+
"--config",
|
|
110
|
+
"config_paths",
|
|
111
|
+
multiple=True,
|
|
112
|
+
type=click.Path(),
|
|
113
|
+
help="Path to training TOML (repeatable)",
|
|
114
|
+
)
|
|
97
115
|
@click.option("--type", "train_type", type=click.Choice(["auto", "rl", "sft"]), default="auto")
|
|
98
|
-
@click.option(
|
|
116
|
+
@click.option(
|
|
117
|
+
"--env-file",
|
|
118
|
+
"env_files",
|
|
119
|
+
multiple=True,
|
|
120
|
+
type=click.Path(),
|
|
121
|
+
help=".env file(s) to preload (skips selection prompt)",
|
|
122
|
+
)
|
|
99
123
|
@click.option("--task-url", default=None, help="Override task app base URL (RL only)")
|
|
100
|
-
@click.option(
|
|
101
|
-
|
|
124
|
+
@click.option(
|
|
125
|
+
"--dataset",
|
|
126
|
+
"dataset_path",
|
|
127
|
+
type=click.Path(),
|
|
128
|
+
default=None,
|
|
129
|
+
help="Override dataset JSONL path (SFT)",
|
|
130
|
+
)
|
|
131
|
+
@click.option("--backend", default=_default_backend, help="Backend base URL")
|
|
102
132
|
@click.option("--model", default=None, help="Override model identifier")
|
|
103
133
|
@click.option("--idempotency", default=None, help="Idempotency-Key header for job creation")
|
|
104
134
|
@click.option("--dry-run", is_flag=True, help="Preview payload without submitting")
|
|
105
135
|
@click.option("--poll/--no-poll", default=True, help="Poll job status until terminal state")
|
|
106
|
-
@click.option(
|
|
136
|
+
@click.option(
|
|
137
|
+
"--poll-timeout", default=3600.0, type=float, help="Maximum seconds to poll before timing out"
|
|
138
|
+
)
|
|
107
139
|
@click.option("--poll-interval", default=5.0, type=float, help="Seconds between poll attempts")
|
|
108
|
-
@click.option(
|
|
140
|
+
@click.option(
|
|
141
|
+
"--examples",
|
|
142
|
+
"examples_limit",
|
|
143
|
+
type=int,
|
|
144
|
+
default=None,
|
|
145
|
+
help="Limit SFT training to the first N examples",
|
|
146
|
+
)
|
|
109
147
|
def train_command(
|
|
110
148
|
config_paths: tuple[str, ...],
|
|
111
149
|
train_type: str,
|
|
@@ -123,12 +161,18 @@ def train_command(
|
|
|
123
161
|
) -> None:
|
|
124
162
|
"""Interactive launcher for RL / SFT jobs."""
|
|
125
163
|
|
|
126
|
-
candidates = discover_configs(
|
|
127
|
-
|
|
164
|
+
candidates = discover_configs(
|
|
165
|
+
list(config_paths), requested_type=train_type if train_type != "auto" else None
|
|
166
|
+
)
|
|
167
|
+
selection = prompt_for_config(
|
|
168
|
+
candidates, requested_type=train_type if train_type != "auto" else None
|
|
169
|
+
)
|
|
128
170
|
|
|
129
171
|
effective_type = train_type if train_type != "auto" else selection.train_type
|
|
130
172
|
if effective_type not in {"rl", "sft"}:
|
|
131
|
-
effective_type = click.prompt(
|
|
173
|
+
effective_type = click.prompt(
|
|
174
|
+
"Detected config type is ambiguous. Enter type", type=click.Choice(["rl", "sft"])
|
|
175
|
+
)
|
|
132
176
|
|
|
133
177
|
cfg_path = selection.path
|
|
134
178
|
click.echo(f"Using config: {cfg_path} ({effective_type})")
|
|
@@ -219,11 +263,14 @@ def train_command(
|
|
|
219
263
|
)
|
|
220
264
|
|
|
221
265
|
|
|
222
|
-
def _wait_for_training_file(
|
|
266
|
+
def _wait_for_training_file(
|
|
267
|
+
backend_base: str, api_key: str, file_id: str, *, timeout: float = 120.0
|
|
268
|
+
) -> None:
|
|
223
269
|
url = f"{backend_base}/learning/files/{file_id}"
|
|
224
270
|
headers = {"Authorization": f"Bearer {api_key}"}
|
|
225
271
|
elapsed = 0.0
|
|
226
272
|
interval = 2.0
|
|
273
|
+
first_check = True
|
|
227
274
|
while True:
|
|
228
275
|
resp = http_get(url, headers=headers, timeout=30.0)
|
|
229
276
|
if resp.status_code == 200:
|
|
@@ -231,17 +278,55 @@ def _wait_for_training_file(backend_base: str, api_key: str, file_id: str, *, ti
|
|
|
231
278
|
data = resp.json()
|
|
232
279
|
except Exception:
|
|
233
280
|
data = {}
|
|
234
|
-
status = str(
|
|
281
|
+
status = str(
|
|
282
|
+
data.get("status") or data.get("state") or data.get("storage_state") or "ready"
|
|
283
|
+
).lower()
|
|
284
|
+
if first_check:
|
|
285
|
+
click.echo(f"File uploaded successfully (id={file_id}, status={status})")
|
|
286
|
+
first_check = False
|
|
235
287
|
if status in {"ready", "uploaded", "stored", "complete"}:
|
|
288
|
+
click.echo(f"✓ Training file ready (status={status})")
|
|
236
289
|
return
|
|
290
|
+
# Show progress for processing states
|
|
291
|
+
if status in {"processing", "pending", "validating"}:
|
|
292
|
+
click.echo(
|
|
293
|
+
f" Waiting for file processing... (status={status}, {elapsed:.0f}s elapsed)"
|
|
294
|
+
)
|
|
237
295
|
elif resp.status_code == 404:
|
|
238
296
|
# Keep polling; object may not be visible yet
|
|
239
|
-
|
|
297
|
+
if first_check:
|
|
298
|
+
click.echo(f"Waiting for file {file_id} to become visible...")
|
|
299
|
+
first_check = False
|
|
300
|
+
elif resp.status_code in {401, 403}:
|
|
301
|
+
# Auth errors won't resolve by polling - fail immediately
|
|
302
|
+
try:
|
|
303
|
+
error_body = resp.json()
|
|
304
|
+
except Exception:
|
|
305
|
+
error_body = resp.text[:400]
|
|
306
|
+
click.echo(f"\n[ERROR] Authentication failed when checking training file:")
|
|
307
|
+
click.echo(f" URL: {url}")
|
|
308
|
+
click.echo(f" Status: {resp.status_code}")
|
|
309
|
+
click.echo(f" Response: {error_body}")
|
|
310
|
+
click.echo(f" API key: {mask_value(api_key)}")
|
|
311
|
+
raise click.ClickException(
|
|
312
|
+
f"Authentication error ({resp.status_code}). "
|
|
313
|
+
"Check that your SYNTH_API_KEY is valid and has permission to access this organization's files."
|
|
314
|
+
)
|
|
240
315
|
else:
|
|
241
|
-
|
|
316
|
+
# Other errors - show details but keep polling
|
|
317
|
+
try:
|
|
318
|
+
error_body = resp.json()
|
|
319
|
+
except Exception:
|
|
320
|
+
error_body = resp.text[:400]
|
|
321
|
+
click.echo(f"[WARN] Unexpected response checking file {file_id}:")
|
|
322
|
+
click.echo(f" URL: {url}")
|
|
323
|
+
click.echo(f" Status: {resp.status_code}")
|
|
324
|
+
click.echo(f" Response: {error_body}")
|
|
242
325
|
|
|
243
326
|
if elapsed >= timeout:
|
|
244
|
-
raise click.ClickException(
|
|
327
|
+
raise click.ClickException(
|
|
328
|
+
f"Training file {file_id} not ready after {timeout:.0f}s (last status: {resp.status_code})"
|
|
329
|
+
)
|
|
245
330
|
sleep(interval)
|
|
246
331
|
elapsed += interval
|
|
247
332
|
|
|
@@ -259,7 +344,11 @@ def handle_rl(
|
|
|
259
344
|
poll_timeout: float,
|
|
260
345
|
poll_interval: float,
|
|
261
346
|
) -> None:
|
|
262
|
-
overrides: Dict[str, Any] = {
|
|
347
|
+
overrides: Dict[str, Any] = {
|
|
348
|
+
"backend": backend_base,
|
|
349
|
+
"task_url": task_url_override,
|
|
350
|
+
"model": model_override,
|
|
351
|
+
}
|
|
263
352
|
build = build_rl_payload(
|
|
264
353
|
config_path=cfg_path,
|
|
265
354
|
task_url=task_url_override or os.environ.get("TASK_APP_URL", ""),
|
|
@@ -271,13 +360,17 @@ def handle_rl(
|
|
|
271
360
|
verify_url = f"{backend_base}/rl/verify_task_app"
|
|
272
361
|
verify_headers = {"Authorization": f"Bearer {synth_key}", "Content-Type": "application/json"}
|
|
273
362
|
try:
|
|
274
|
-
vresp = http_post(
|
|
363
|
+
vresp = http_post(
|
|
364
|
+
verify_url, headers=verify_headers, json_body={"endpoint_base_url": build.task_url}
|
|
365
|
+
)
|
|
275
366
|
try:
|
|
276
367
|
vjs = vresp.json()
|
|
277
368
|
except Exception:
|
|
278
369
|
vjs = {"status": vresp.status_code, "text": (vresp.text or "")[:400]}
|
|
279
370
|
except Exception as _ve:
|
|
280
|
-
raise click.ClickException(
|
|
371
|
+
raise click.ClickException(
|
|
372
|
+
f"Task app verification call failed: {type(_ve).__name__}: {_ve}"
|
|
373
|
+
) from _ve
|
|
281
374
|
if vresp.status_code >= 400:
|
|
282
375
|
click.echo("Task app verification error:\n" + preview_json(vjs, limit=800))
|
|
283
376
|
raise click.ClickException(f"Verification failed with status {vresp.status_code}")
|
|
@@ -379,55 +472,102 @@ def handle_sft(
|
|
|
379
472
|
validate_sft_jsonl(build.validation_file)
|
|
380
473
|
|
|
381
474
|
upload_url = f"{backend_base}/learning/files"
|
|
382
|
-
click.echo(f"Uploading
|
|
475
|
+
click.echo(f"\n=== Uploading Training Data ===")
|
|
476
|
+
click.echo(f"Dataset: {build.train_file}")
|
|
477
|
+
click.echo(f"Destination: {upload_url}")
|
|
383
478
|
if dry_run:
|
|
384
479
|
click.echo("Dry run: skipping upload")
|
|
385
480
|
train_file_id = "dry-run-train"
|
|
386
481
|
val_file_id = None
|
|
387
482
|
else:
|
|
388
|
-
resp = post_multipart(
|
|
389
|
-
|
|
483
|
+
resp = post_multipart(
|
|
484
|
+
upload_url, api_key=synth_key, file_field="file", file_path=build.train_file
|
|
485
|
+
)
|
|
486
|
+
js = (
|
|
487
|
+
resp.json()
|
|
488
|
+
if resp.headers.get("content-type", "").startswith("application/json")
|
|
489
|
+
else {}
|
|
490
|
+
)
|
|
390
491
|
if resp.status_code >= 400 or "id" not in js:
|
|
391
|
-
|
|
492
|
+
click.echo(f"\n[ERROR] Training file upload failed:")
|
|
493
|
+
click.echo(f" URL: {upload_url}")
|
|
494
|
+
click.echo(f" Status: {resp.status_code}")
|
|
495
|
+
click.echo(f" Response: {js or resp.text[:400]}")
|
|
496
|
+
click.echo(f" File: {build.train_file}")
|
|
497
|
+
raise click.ClickException(
|
|
498
|
+
f"Training file upload failed with status {resp.status_code}"
|
|
499
|
+
)
|
|
392
500
|
train_file_id = js["id"]
|
|
501
|
+
click.echo(f"✓ Training file uploaded (id={train_file_id})")
|
|
393
502
|
val_file_id = None
|
|
394
503
|
if build.validation_file:
|
|
395
|
-
click.echo(f"Uploading validation dataset {build.validation_file}")
|
|
396
|
-
vresp = post_multipart(
|
|
397
|
-
|
|
504
|
+
click.echo(f"Uploading validation dataset: {build.validation_file}")
|
|
505
|
+
vresp = post_multipart(
|
|
506
|
+
upload_url,
|
|
507
|
+
api_key=synth_key,
|
|
508
|
+
file_field="file",
|
|
509
|
+
file_path=build.validation_file,
|
|
510
|
+
)
|
|
511
|
+
vjs = (
|
|
512
|
+
vresp.json()
|
|
513
|
+
if vresp.headers.get("content-type", "").startswith("application/json")
|
|
514
|
+
else {}
|
|
515
|
+
)
|
|
398
516
|
if vresp.status_code < 400 and "id" in vjs:
|
|
399
517
|
val_file_id = vjs["id"]
|
|
518
|
+
click.echo(f"✓ Validation file uploaded (id={val_file_id})")
|
|
400
519
|
else:
|
|
401
|
-
click.echo(
|
|
520
|
+
click.echo(
|
|
521
|
+
f"[WARN] Validation upload failed ({vresp.status_code}): {vjs or vresp.text[:200]}"
|
|
522
|
+
)
|
|
402
523
|
payload = dict(build.payload)
|
|
403
524
|
payload["training_file_id"] = train_file_id
|
|
404
525
|
if val_file_id:
|
|
405
|
-
payload.setdefault("metadata", {}).setdefault("effective_config", {}).setdefault(
|
|
526
|
+
payload.setdefault("metadata", {}).setdefault("effective_config", {}).setdefault(
|
|
527
|
+
"data", {}
|
|
528
|
+
)["validation_files"] = [val_file_id]
|
|
406
529
|
|
|
530
|
+
click.echo(f"\n=== Checking File Processing Status ===")
|
|
407
531
|
try:
|
|
408
532
|
_wait_for_training_file(backend_base, synth_key, train_file_id)
|
|
409
533
|
except click.ClickException as exc:
|
|
410
534
|
raise click.ClickException(f"Training file {train_file_id} not ready: {exc}") from exc
|
|
411
535
|
|
|
412
|
-
click.echo("
|
|
536
|
+
click.echo(f"\n=== Creating Training Job ===")
|
|
537
|
+
click.echo("Job payload preview:")
|
|
538
|
+
click.echo(preview_json(payload, limit=800))
|
|
413
539
|
if dry_run:
|
|
414
540
|
click.echo("Dry run: skipping job submission")
|
|
415
541
|
return
|
|
416
542
|
|
|
417
543
|
create_url = f"{backend_base}/learning/jobs"
|
|
418
544
|
headers = {"Authorization": f"Bearer {synth_key}", "Content-Type": "application/json"}
|
|
545
|
+
click.echo(f"\nPOST {create_url}")
|
|
419
546
|
resp = http_post(create_url, headers=headers, json_body=payload)
|
|
420
|
-
js =
|
|
421
|
-
|
|
547
|
+
js = (
|
|
548
|
+
resp.json()
|
|
549
|
+
if resp.headers.get("content-type", "").startswith("application/json")
|
|
550
|
+
else {}
|
|
551
|
+
)
|
|
422
552
|
if resp.status_code not in (200, 201):
|
|
423
|
-
|
|
553
|
+
click.echo(f"\n[ERROR] Job creation failed:")
|
|
554
|
+
click.echo(f" URL: {create_url}")
|
|
555
|
+
click.echo(f" Status: {resp.status_code}")
|
|
556
|
+
click.echo(f" Response: {preview_json(js, limit=600)}")
|
|
557
|
+
raise click.ClickException(f"Job creation failed with status {resp.status_code}")
|
|
424
558
|
job_id = js.get("job_id") or js.get("id")
|
|
425
559
|
if not job_id:
|
|
426
560
|
raise click.ClickException("Response missing job id")
|
|
561
|
+
click.echo(f"✓ Job created (id={job_id})")
|
|
427
562
|
|
|
563
|
+
click.echo(f"\n=== Starting Training Job ===")
|
|
428
564
|
start_url = f"{backend_base}/learning/jobs/{job_id}/start"
|
|
429
|
-
click.echo(f"POST {start_url}
|
|
430
|
-
|
|
565
|
+
click.echo(f"POST {start_url}")
|
|
566
|
+
start_resp = http_post(start_url, headers=headers, json_body={})
|
|
567
|
+
if start_resp.status_code not in (200, 201):
|
|
568
|
+
click.echo(f"[WARN] Job start returned status {start_resp.status_code}")
|
|
569
|
+
else:
|
|
570
|
+
click.echo(f"✓ Job started")
|
|
431
571
|
|
|
432
572
|
if not poll:
|
|
433
573
|
click.echo(f"Started job {job_id} (polling disabled)")
|
synth_ai/api/train/config_finder.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
3
5
|
from dataclasses import dataclass
|
|
4
6
|
from pathlib import Path
|
|
5
7
|
from typing import Iterable
|
|
@@ -9,6 +11,7 @@ import click
|
|
|
9
11
|
from .utils import REPO_ROOT, load_toml, preview_json
|
|
10
12
|
|
|
11
13
|
_SKIP_DIRS = {".git", "__pycache__", ".venv", "node_modules", "dist", "build"}
|
|
14
|
+
_STATE_FILE = os.path.expanduser("~/.synth-ai/demo.json")
|
|
12
15
|
|
|
13
16
|
|
|
14
17
|
@dataclass(slots=True)
|
|
@@ -17,9 +20,43 @@ class ConfigCandidate:
|
|
|
17
20
|
train_type: str # "rl", "sft", or "unknown"
|
|
18
21
|
|
|
19
22
|
|
|
23
|
+
def _load_last_config() -> Path | None:
|
|
24
|
+
"""Load the last used training config path from state file."""
|
|
25
|
+
try:
|
|
26
|
+
if os.path.isfile(_STATE_FILE):
|
|
27
|
+
with open(_STATE_FILE) as fh:
|
|
28
|
+
data = json.load(fh)
|
|
29
|
+
if isinstance(data, dict):
|
|
30
|
+
last_config = data.get("LAST_CONFIG")
|
|
31
|
+
if last_config:
|
|
32
|
+
path = Path(last_config).resolve()
|
|
33
|
+
if path.exists():
|
|
34
|
+
return path
|
|
35
|
+
except Exception:
|
|
36
|
+
pass
|
|
37
|
+
return None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _save_last_config(config_path: Path) -> None:
|
|
41
|
+
"""Save the last used training config path to state file."""
|
|
42
|
+
try:
|
|
43
|
+
data = {}
|
|
44
|
+
if os.path.isfile(_STATE_FILE):
|
|
45
|
+
with open(_STATE_FILE) as fh:
|
|
46
|
+
data = json.load(fh) or {}
|
|
47
|
+
if not isinstance(data, dict):
|
|
48
|
+
data = {}
|
|
49
|
+
data["LAST_CONFIG"] = str(config_path.resolve())
|
|
50
|
+
os.makedirs(os.path.dirname(_STATE_FILE), exist_ok=True)
|
|
51
|
+
with open(_STATE_FILE, "w") as fh:
|
|
52
|
+
json.dump(data, fh)
|
|
53
|
+
except Exception:
|
|
54
|
+
pass
|
|
55
|
+
|
|
56
|
+
|
|
20
57
|
def _iter_candidate_paths() -> Iterable[Path]:
|
|
21
58
|
seen: set[Path] = set()
|
|
22
|
-
|
|
59
|
+
|
|
23
60
|
# Prioritize current working directory first
|
|
24
61
|
try:
|
|
25
62
|
cwd = Path.cwd().resolve()
|
|
@@ -135,23 +172,41 @@ def discover_configs(explicit: list[str], *, requested_type: str | None) -> list
|
|
|
135
172
|
return candidates
|
|
136
173
|
|
|
137
174
|
|
|
138
|
-
def prompt_for_config(
|
|
175
|
+
def prompt_for_config(
|
|
176
|
+
candidates: list[ConfigCandidate], *, requested_type: str | None
|
|
177
|
+
) -> ConfigCandidate:
|
|
139
178
|
if not candidates:
|
|
140
179
|
raise click.ClickException("No training configs found. Pass --config explicitly.")
|
|
141
180
|
|
|
181
|
+
# Check for last used config and move it to the top if found
|
|
182
|
+
last_config = _load_last_config()
|
|
183
|
+
default_idx = 1
|
|
184
|
+
|
|
185
|
+
if last_config:
|
|
186
|
+
for idx, cand in enumerate(candidates):
|
|
187
|
+
if cand.path.resolve() == last_config:
|
|
188
|
+
# Move last used config to the front
|
|
189
|
+
candidates.insert(0, candidates.pop(idx))
|
|
190
|
+
break
|
|
191
|
+
|
|
142
192
|
click.echo("Select a training config:")
|
|
143
193
|
for idx, cand in enumerate(candidates, start=1):
|
|
144
194
|
label = cand.train_type if cand.train_type != "unknown" else "?"
|
|
145
|
-
|
|
195
|
+
last_marker = " (last used)" if last_config and cand.path.resolve() == last_config else ""
|
|
196
|
+
click.echo(f" {idx}) [{label}] {cand.path}{last_marker}")
|
|
146
197
|
click.echo(" 0) Abort")
|
|
147
198
|
|
|
148
|
-
choice = click.prompt("Enter choice", type=int)
|
|
199
|
+
choice = click.prompt("Enter choice", type=int, default=default_idx)
|
|
149
200
|
if choice == 0:
|
|
150
201
|
raise click.ClickException("Aborted by user")
|
|
151
202
|
if choice < 0 or choice > len(candidates):
|
|
152
203
|
raise click.ClickException("Invalid selection")
|
|
153
204
|
|
|
154
205
|
selection = candidates[choice - 1]
|
|
206
|
+
|
|
207
|
+
# Save this config as the last used
|
|
208
|
+
_save_last_config(selection.path)
|
|
209
|
+
|
|
155
210
|
try:
|
|
156
211
|
data = load_toml(selection.path)
|
|
157
212
|
preview = preview_json({k: data.get(k) for k in list(data.keys())[:4]}, limit=320)
|
synth_ai/api/train/env_resolver.py
CHANGED
|
@@ -56,12 +56,12 @@ class EnvResolver:
|
|
|
56
56
|
def _collect_default_candidates(config_path: Path | None) -> list[Path]:
|
|
57
57
|
candidates: list[Path] = []
|
|
58
58
|
cwd = Path.cwd()
|
|
59
|
-
|
|
59
|
+
|
|
60
60
|
# Prioritize CWD env files
|
|
61
61
|
cwd_env = cwd / ".env"
|
|
62
62
|
if cwd_env.exists():
|
|
63
63
|
candidates.append(cwd_env.resolve())
|
|
64
|
-
|
|
64
|
+
|
|
65
65
|
# Search for additional .env files in CWD subdirectories
|
|
66
66
|
for sub in cwd.glob("**/.env"):
|
|
67
67
|
try:
|
|
@@ -76,13 +76,13 @@ def _collect_default_candidates(config_path: Path | None) -> list[Path]:
|
|
|
76
76
|
if len(candidates) >= 20:
|
|
77
77
|
break
|
|
78
78
|
candidates.append(resolved)
|
|
79
|
-
|
|
79
|
+
|
|
80
80
|
# Then config path env file
|
|
81
81
|
if config_path:
|
|
82
82
|
cfg_env = config_path.parent / ".env"
|
|
83
83
|
if cfg_env.exists():
|
|
84
84
|
candidates.append(cfg_env.resolve())
|
|
85
|
-
|
|
85
|
+
|
|
86
86
|
# Then repo env files
|
|
87
87
|
repo_env = REPO_ROOT / ".env"
|
|
88
88
|
if repo_env.exists():
|
|
@@ -90,7 +90,7 @@ def _collect_default_candidates(config_path: Path | None) -> list[Path]:
|
|
|
90
90
|
examples_env = REPO_ROOT / "examples" / ".env"
|
|
91
91
|
if examples_env.exists():
|
|
92
92
|
candidates.append(examples_env.resolve())
|
|
93
|
-
|
|
93
|
+
|
|
94
94
|
# Search shallow depth for additional .env files in examples
|
|
95
95
|
for sub in (REPO_ROOT / "examples").glob("**/.env"):
|
|
96
96
|
try:
|
|
@@ -105,7 +105,7 @@ def _collect_default_candidates(config_path: Path | None) -> list[Path]:
|
|
|
105
105
|
if len(candidates) >= 20:
|
|
106
106
|
break
|
|
107
107
|
candidates.append(resolved)
|
|
108
|
-
|
|
108
|
+
|
|
109
109
|
deduped: list[Path] = []
|
|
110
110
|
for path in candidates:
|
|
111
111
|
if path not in deduped:
|
|
@@ -156,8 +156,27 @@ def resolve_env(
|
|
|
156
156
|
raise click.ClickException(f"Env file not found: {path}")
|
|
157
157
|
resolver = EnvResolver(provided)
|
|
158
158
|
else:
|
|
159
|
-
|
|
160
|
-
|
|
159
|
+
# Check for saved .env path from demo command
|
|
160
|
+
try:
|
|
161
|
+
from synth_ai.demos.demo_task_apps.core import load_env_file_path
|
|
162
|
+
|
|
163
|
+
saved_env_path = load_env_file_path()
|
|
164
|
+
if saved_env_path:
|
|
165
|
+
saved_path = Path(saved_env_path)
|
|
166
|
+
if saved_path.exists():
|
|
167
|
+
click.echo(f"Using .env file: {saved_path}")
|
|
168
|
+
resolver = EnvResolver([saved_path])
|
|
169
|
+
else:
|
|
170
|
+
# Saved path no longer exists, fall back to prompt
|
|
171
|
+
resolver = EnvResolver(_collect_default_candidates(config_path))
|
|
172
|
+
resolver.select_new_env()
|
|
173
|
+
else:
|
|
174
|
+
resolver = EnvResolver(_collect_default_candidates(config_path))
|
|
175
|
+
resolver.select_new_env()
|
|
176
|
+
except Exception:
|
|
177
|
+
# If import fails or any error, fall back to original behavior
|
|
178
|
+
resolver = EnvResolver(_collect_default_candidates(config_path))
|
|
179
|
+
resolver.select_new_env()
|
|
161
180
|
|
|
162
181
|
# Preload selected .env keys into process env so downstream lookups succeed
|
|
163
182
|
try:
|
|
@@ -207,10 +226,10 @@ def _resolve_key(resolver: EnvResolver, spec: KeySpec) -> str:
|
|
|
207
226
|
break
|
|
208
227
|
if env_val:
|
|
209
228
|
click.echo(f"Found {spec.name} in current sources: {mask_value(env_val)}")
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
229
|
+
# Automatically use and persist the value (no prompt)
|
|
230
|
+
_maybe_persist(resolver, spec, env_val)
|
|
231
|
+
os.environ[spec.name] = env_val
|
|
232
|
+
return env_val
|
|
214
233
|
options: list[tuple[str, Callable[[], str | None]]] = []
|
|
215
234
|
|
|
216
235
|
def _enter_manual() -> str:
|
|
@@ -254,8 +273,7 @@ def _resolve_key(resolver: EnvResolver, spec: KeySpec) -> str:
|
|
|
254
273
|
|
|
255
274
|
|
|
256
275
|
def _maybe_persist(resolver: EnvResolver, spec: KeySpec, value: str) -> None:
|
|
257
|
-
|
|
258
|
-
return
|
|
276
|
+
# Automatically save (no prompt)
|
|
259
277
|
resolver.set_value(spec.name, value)
|
|
260
278
|
click.echo(f"Saved {spec.name} to {resolver.current_path}")
|
|
261
279
|
|
synth_ai/api/train/pollers.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from dataclasses import dataclass
|
|
4
|
+
from datetime import datetime
|
|
4
5
|
from typing import Any, Mapping
|
|
5
6
|
|
|
6
7
|
import click
|
|
@@ -15,7 +16,9 @@ class PollOutcome:
|
|
|
15
16
|
|
|
16
17
|
|
|
17
18
|
class JobPoller:
|
|
18
|
-
def __init__(
|
|
19
|
+
def __init__(
|
|
20
|
+
self, base_url: str, api_key: str, *, interval: float = 5.0, timeout: float = 3600.0
|
|
21
|
+
) -> None:
|
|
19
22
|
self.base_url = ensure_api_base(base_url)
|
|
20
23
|
self.api_key = api_key
|
|
21
24
|
self.interval = interval
|
|
@@ -35,9 +38,14 @@ class JobPoller:
|
|
|
35
38
|
while elapsed <= self.timeout:
|
|
36
39
|
try:
|
|
37
40
|
resp = http_get(f"{self.base_url}{path}", headers=self._headers())
|
|
38
|
-
info =
|
|
41
|
+
info = (
|
|
42
|
+
resp.json()
|
|
43
|
+
if resp.headers.get("content-type", "").startswith("application/json")
|
|
44
|
+
else {}
|
|
45
|
+
)
|
|
39
46
|
status = (info.get("status") or info.get("state") or "").lower()
|
|
40
|
-
|
|
47
|
+
timestamp = datetime.now().strftime("%H:%M:%S")
|
|
48
|
+
click.echo(f"[poll] {timestamp} {elapsed:.0f}s status={status}")
|
|
41
49
|
if status in {"succeeded", "failed", "cancelled", "canceled", "completed"}:
|
|
42
50
|
break
|
|
43
51
|
except Exception as exc: # pragma: no cover - network failures
|