synth-ai 0.2.9.dev5__py3-none-any.whl → 0.2.9.dev7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (155)
  1. examples/common_old/backend.py +0 -1
  2. examples/crafter_debug_render.py +15 -6
  3. examples/evals_old/compare_models.py +1 -0
  4. examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +6 -2
  5. examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +4 -4
  6. examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +4 -3
  7. examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +6 -2
  8. examples/finetuning_old/synth_qwen_v1/finetune.py +1 -1
  9. examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +4 -4
  10. examples/finetuning_old/synth_qwen_v1/infer.py +1 -2
  11. examples/finetuning_old/synth_qwen_v1/poll.py +4 -2
  12. examples/finetuning_old/synth_qwen_v1/prepare_data.py +8 -8
  13. examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +5 -4
  14. examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +11 -8
  15. examples/finetuning_old/synth_qwen_v1/run_ft_job.py +17 -12
  16. examples/finetuning_old/synth_qwen_v1/upload_data.py +1 -1
  17. examples/finetuning_old/synth_qwen_v1/util.py +7 -2
  18. examples/rl/configs/eval_base_qwen.toml +1 -1
  19. examples/rl/configs/rl_from_base_qwen17.toml +1 -1
  20. examples/rl/download_dataset.py +26 -10
  21. examples/rl/run_eval.py +17 -15
  22. examples/rl/run_rl_and_save.py +24 -7
  23. examples/rl/task_app/math_single_step.py +128 -11
  24. examples/rl/task_app/math_task_app.py +11 -3
  25. examples/rl_old/task_app.py +222 -53
  26. examples/warming_up_to_rl/analyze_trace_db.py +7 -5
  27. examples/warming_up_to_rl/export_trace_sft.py +141 -16
  28. examples/warming_up_to_rl/groq_test.py +11 -4
  29. examples/warming_up_to_rl/manage_secrets.py +15 -6
  30. examples/warming_up_to_rl/readme.md +9 -2
  31. examples/warming_up_to_rl/run_eval.py +108 -30
  32. examples/warming_up_to_rl/run_fft_and_save.py +128 -52
  33. examples/warming_up_to_rl/run_local_rollout.py +87 -36
  34. examples/warming_up_to_rl/run_local_rollout_modal.py +113 -25
  35. examples/warming_up_to_rl/run_local_rollout_parallel.py +80 -16
  36. examples/warming_up_to_rl/run_local_rollout_traced.py +125 -20
  37. examples/warming_up_to_rl/run_rl_and_save.py +31 -7
  38. examples/warming_up_to_rl/run_rollout_remote.py +37 -10
  39. examples/warming_up_to_rl/task_app/grpo_crafter.py +90 -27
  40. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +9 -27
  41. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +46 -108
  42. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -1
  43. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +1 -1
  44. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -1
  45. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +50 -17
  46. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +35 -21
  47. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +8 -4
  48. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +29 -26
  49. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +1 -1
  50. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +17 -13
  51. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +1 -1
  52. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +106 -63
  53. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +82 -84
  54. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +76 -59
  55. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +1 -1
  56. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +43 -49
  57. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +5 -15
  58. synth_ai/__init__.py +1 -0
  59. synth_ai/api/train/builders.py +34 -10
  60. synth_ai/api/train/cli.py +172 -32
  61. synth_ai/api/train/config_finder.py +59 -4
  62. synth_ai/api/train/env_resolver.py +32 -14
  63. synth_ai/api/train/pollers.py +11 -3
  64. synth_ai/api/train/task_app.py +4 -1
  65. synth_ai/api/train/utils.py +20 -4
  66. synth_ai/cli/__init__.py +11 -4
  67. synth_ai/cli/balance.py +1 -1
  68. synth_ai/cli/demo.py +19 -5
  69. synth_ai/cli/rl_demo.py +75 -16
  70. synth_ai/cli/root.py +116 -37
  71. synth_ai/cli/task_apps.py +1276 -186
  72. synth_ai/cli/traces.py +1 -0
  73. synth_ai/cli/turso.py +73 -0
  74. synth_ai/core/experiment.py +0 -2
  75. synth_ai/demo_registry.py +67 -30
  76. synth_ai/demos/core/cli.py +493 -164
  77. synth_ai/demos/demo_task_apps/core.py +50 -6
  78. synth_ai/demos/demo_task_apps/crafter/configs/crafter_fft_4b.toml +2 -3
  79. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +36 -28
  80. synth_ai/demos/demo_task_apps/math/_common.py +1 -2
  81. synth_ai/demos/demo_task_apps/math/deploy_modal.py +0 -2
  82. synth_ai/demos/demo_task_apps/math/modal_task_app.py +168 -65
  83. synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -1
  84. synth_ai/environments/examples/bandit/engine.py +12 -4
  85. synth_ai/environments/examples/bandit/taskset.py +4 -4
  86. synth_ai/environments/reproducibility/tree.py +3 -1
  87. synth_ai/environments/service/core_routes.py +6 -2
  88. synth_ai/evals/base.py +0 -2
  89. synth_ai/experimental/synth_oss.py +11 -12
  90. synth_ai/handshake.py +3 -1
  91. synth_ai/http_client.py +31 -7
  92. synth_ai/inference/__init__.py +0 -2
  93. synth_ai/inference/client.py +8 -4
  94. synth_ai/jobs/client.py +40 -10
  95. synth_ai/learning/client.py +33 -8
  96. synth_ai/learning/config.py +0 -2
  97. synth_ai/learning/constants.py +0 -2
  98. synth_ai/learning/ft_client.py +6 -3
  99. synth_ai/learning/health.py +9 -2
  100. synth_ai/learning/jobs.py +17 -5
  101. synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +1 -3
  102. synth_ai/learning/prompts/random_search.py +4 -1
  103. synth_ai/learning/prompts/run_random_search_banking77.py +6 -1
  104. synth_ai/learning/rl_client.py +42 -14
  105. synth_ai/learning/sse.py +0 -2
  106. synth_ai/learning/validators.py +6 -2
  107. synth_ai/lm/caching/ephemeral.py +1 -3
  108. synth_ai/lm/core/exceptions.py +0 -2
  109. synth_ai/lm/core/main.py +13 -1
  110. synth_ai/lm/core/synth_models.py +0 -1
  111. synth_ai/lm/core/vendor_clients.py +4 -2
  112. synth_ai/lm/overrides.py +2 -2
  113. synth_ai/lm/vendors/core/anthropic_api.py +7 -7
  114. synth_ai/lm/vendors/core/openai_api.py +2 -0
  115. synth_ai/lm/vendors/openai_standard.py +3 -1
  116. synth_ai/lm/vendors/openai_standard_responses.py +6 -3
  117. synth_ai/lm/vendors/supported/custom_endpoint.py +1 -3
  118. synth_ai/lm/vendors/synth_client.py +37 -10
  119. synth_ai/rl/__init__.py +0 -1
  120. synth_ai/rl/contracts.py +0 -2
  121. synth_ai/rl/env_keys.py +6 -1
  122. synth_ai/task/__init__.py +1 -0
  123. synth_ai/task/apps/__init__.py +11 -11
  124. synth_ai/task/auth.py +29 -17
  125. synth_ai/task/client.py +3 -1
  126. synth_ai/task/contracts.py +1 -0
  127. synth_ai/task/datasets.py +3 -1
  128. synth_ai/task/errors.py +3 -2
  129. synth_ai/task/health.py +0 -2
  130. synth_ai/task/json.py +0 -1
  131. synth_ai/task/proxy.py +2 -5
  132. synth_ai/task/rubrics.py +9 -3
  133. synth_ai/task/server.py +31 -5
  134. synth_ai/task/tracing_utils.py +8 -3
  135. synth_ai/task/validators.py +0 -1
  136. synth_ai/task/vendors.py +0 -1
  137. synth_ai/tracing_v3/db_config.py +26 -1
  138. synth_ai/tracing_v3/decorators.py +1 -0
  139. synth_ai/tracing_v3/examples/basic_usage.py +3 -2
  140. synth_ai/tracing_v3/hooks.py +2 -0
  141. synth_ai/tracing_v3/replica_sync.py +1 -0
  142. synth_ai/tracing_v3/session_tracer.py +24 -3
  143. synth_ai/tracing_v3/storage/base.py +4 -1
  144. synth_ai/tracing_v3/storage/factory.py +0 -1
  145. synth_ai/tracing_v3/turso/manager.py +102 -38
  146. synth_ai/tracing_v3/turso/models.py +4 -1
  147. synth_ai/tracing_v3/utils.py +1 -0
  148. synth_ai/v0/tracing/upload.py +32 -135
  149. {synth_ai-0.2.9.dev5.dist-info → synth_ai-0.2.9.dev7.dist-info}/METADATA +1 -1
  150. {synth_ai-0.2.9.dev5.dist-info → synth_ai-0.2.9.dev7.dist-info}/RECORD +154 -154
  151. synth_ai/install_sqld.sh +0 -40
  152. {synth_ai-0.2.9.dev5.dist-info → synth_ai-0.2.9.dev7.dist-info}/WHEEL +0 -0
  153. {synth_ai-0.2.9.dev5.dist-info → synth_ai-0.2.9.dev7.dist-info}/entry_points.txt +0 -0
  154. {synth_ai-0.2.9.dev5.dist-info → synth_ai-0.2.9.dev7.dist-info}/licenses/LICENSE +0 -0
  155. {synth_ai-0.2.9.dev5.dist-info → synth_ai-0.2.9.dev7.dist-info}/top_level.txt +0 -0
@@ -34,9 +34,13 @@ def build_rl_payload(
34
34
  services = data.get("services") if isinstance(data.get("services"), dict) else {}
35
35
  model_cfg = data.get("model") if isinstance(data.get("model"), dict) else {}
36
36
 
37
- final_task_url = (overrides.get("task_url") or task_url or services.get("task_url") or "").strip()
37
+ final_task_url = (
38
+ overrides.get("task_url") or task_url or services.get("task_url") or ""
39
+ ).strip()
38
40
  if not final_task_url:
39
- raise click.ClickException("Task app URL required (provide --task-url or set services.task_url in TOML)")
41
+ raise click.ClickException(
42
+ "Task app URL required (provide --task-url or set services.task_url in TOML)"
43
+ )
40
44
 
41
45
  model_source = (model_cfg.get("source") or "").strip()
42
46
  model_base = (model_cfg.get("base") or "").strip()
@@ -45,7 +49,9 @@ def build_rl_payload(
45
49
  model_source = override_model
46
50
  model_base = ""
47
51
  if bool(model_source) == bool(model_base):
48
- raise click.ClickException("Model section must specify exactly one of [model].source or [model].base")
52
+ raise click.ClickException(
53
+ "Model section must specify exactly one of [model].source or [model].base"
54
+ )
49
55
 
50
56
  # Force TOML services.task_url to the effective endpoint to avoid split URLs
51
57
  try:
@@ -93,15 +99,23 @@ def build_sft_payload(
93
99
  if not raw_dataset:
94
100
  raise TrainError("Dataset not specified; pass --dataset or set [job].data")
95
101
  dataset_path = Path(raw_dataset)
96
- dataset_path = (dataset_path if dataset_path.is_absolute() else (config_path.parent / dataset_path)).resolve()
102
+ # Resolve relative paths from current working directory, not config directory
103
+ dataset_path = (
104
+ dataset_path if dataset_path.is_absolute() else (Path.cwd() / dataset_path)
105
+ ).resolve()
97
106
  if not dataset_path.exists():
98
107
  raise TrainError(f"Dataset not found: {dataset_path}")
99
108
 
100
- validation_path = data_cfg.get("validation_path") if isinstance(data_cfg.get("validation_path"), str) else None
109
+ validation_path = (
110
+ data_cfg.get("validation_path")
111
+ if isinstance(data_cfg.get("validation_path"), str)
112
+ else None
113
+ )
101
114
  validation_file = None
102
115
  if validation_path:
103
116
  vpath = Path(validation_path)
104
- vpath = (vpath if vpath.is_absolute() else (config_path.parent / vpath)).resolve()
117
+ # Resolve relative paths from current working directory, not config directory
118
+ vpath = (vpath if vpath.is_absolute() else (Path.cwd() / vpath)).resolve()
105
119
  if not vpath.exists():
106
120
  click.echo(f"[WARN] Validation dataset {vpath} missing; continuing without validation")
107
121
  else:
@@ -125,15 +139,23 @@ def build_sft_payload(
125
139
  if isinstance(hp_cfg.get("parallelism"), dict):
126
140
  hp_block["parallelism"] = hp_cfg["parallelism"]
127
141
 
128
- compute_block = {k: compute_cfg[k] for k in ("gpu_type", "gpu_count", "nodes") if k in compute_cfg}
142
+ compute_block = {
143
+ k: compute_cfg[k] for k in ("gpu_type", "gpu_count", "nodes") if k in compute_cfg
144
+ }
129
145
 
130
146
  effective = {
131
147
  "compute": compute_block,
132
- "data": {"topology": data_cfg.get("topology", {}) if isinstance(data_cfg.get("topology"), dict) else {}},
148
+ "data": {
149
+ "topology": data_cfg.get("topology", {})
150
+ if isinstance(data_cfg.get("topology"), dict)
151
+ else {}
152
+ },
133
153
  "training": {k: v for k, v in train_cfg.items() if k in ("mode", "use_qlora")},
134
154
  }
135
155
 
136
- validation_cfg = train_cfg.get("validation") if isinstance(train_cfg.get("validation"), dict) else None
156
+ validation_cfg = (
157
+ train_cfg.get("validation") if isinstance(train_cfg.get("validation"), dict) else None
158
+ )
137
159
  if isinstance(validation_cfg, dict):
138
160
  hp_block.update(
139
161
  {
@@ -144,7 +166,9 @@ def build_sft_payload(
144
166
  "greater_is_better": bool(validation_cfg.get("greater_is_better", False)),
145
167
  }
146
168
  )
147
- effective.setdefault("training", {})["validation"] = {"enabled": bool(validation_cfg.get("enabled", True))}
169
+ effective.setdefault("training", {})["validation"] = {
170
+ "enabled": bool(validation_cfg.get("enabled", True))
171
+ }
148
172
 
149
173
  payload = {
150
174
  "model": job_cfg.get("model") or data.get("model"),
synth_ai/api/train/cli.py CHANGED
@@ -24,6 +24,7 @@ from .utils import (
24
24
  sleep,
25
25
  validate_sft_jsonl,
26
26
  )
27
+ from synth_ai.config.base_url import get_backend_from_env
27
28
 
28
29
 
29
30
  def _discover_dataset_candidates(config_path: Path, limit: int = 50) -> list[Path]:
@@ -92,20 +93,57 @@ def _prompt_manual_dataset() -> Path:
92
93
  return Path(manual).expanduser()
93
94
 
94
95
 
96
+ def _default_backend() -> str:
97
+ """Resolve backend URL with proper production default."""
98
+ # Check explicit override first
99
+ explicit = os.getenv("BACKEND_BASE_URL", "").strip()
100
+ if explicit:
101
+ return explicit
102
+ # Use standard resolution logic
103
+ base, _ = get_backend_from_env()
104
+ return f"{base}/api" if not base.endswith("/api") else base
105
+
106
+
95
107
  @click.command("train")
96
- @click.option("--config", "config_paths", multiple=True, type=click.Path(), help="Path to training TOML (repeatable)")
108
+ @click.option(
109
+ "--config",
110
+ "config_paths",
111
+ multiple=True,
112
+ type=click.Path(),
113
+ help="Path to training TOML (repeatable)",
114
+ )
97
115
  @click.option("--type", "train_type", type=click.Choice(["auto", "rl", "sft"]), default="auto")
98
- @click.option("--env-file", "env_files", multiple=True, type=click.Path(), help=".env file(s) to preload (skips selection prompt)")
116
+ @click.option(
117
+ "--env-file",
118
+ "env_files",
119
+ multiple=True,
120
+ type=click.Path(),
121
+ help=".env file(s) to preload (skips selection prompt)",
122
+ )
99
123
  @click.option("--task-url", default=None, help="Override task app base URL (RL only)")
100
- @click.option("--dataset", "dataset_path", type=click.Path(), default=None, help="Override dataset JSONL path (SFT)")
101
- @click.option("--backend", default=lambda: os.getenv("BACKEND_BASE_URL", "http://localhost:8000/api"), help="Backend base URL")
124
+ @click.option(
125
+ "--dataset",
126
+ "dataset_path",
127
+ type=click.Path(),
128
+ default=None,
129
+ help="Override dataset JSONL path (SFT)",
130
+ )
131
+ @click.option("--backend", default=_default_backend, help="Backend base URL")
102
132
  @click.option("--model", default=None, help="Override model identifier")
103
133
  @click.option("--idempotency", default=None, help="Idempotency-Key header for job creation")
104
134
  @click.option("--dry-run", is_flag=True, help="Preview payload without submitting")
105
135
  @click.option("--poll/--no-poll", default=True, help="Poll job status until terminal state")
106
- @click.option("--poll-timeout", default=3600.0, type=float, help="Maximum seconds to poll before timing out")
136
+ @click.option(
137
+ "--poll-timeout", default=3600.0, type=float, help="Maximum seconds to poll before timing out"
138
+ )
107
139
  @click.option("--poll-interval", default=5.0, type=float, help="Seconds between poll attempts")
108
- @click.option("--examples", "examples_limit", type=int, default=None, help="Limit SFT training to the first N examples")
140
+ @click.option(
141
+ "--examples",
142
+ "examples_limit",
143
+ type=int,
144
+ default=None,
145
+ help="Limit SFT training to the first N examples",
146
+ )
109
147
  def train_command(
110
148
  config_paths: tuple[str, ...],
111
149
  train_type: str,
@@ -123,12 +161,18 @@ def train_command(
123
161
  ) -> None:
124
162
  """Interactive launcher for RL / SFT jobs."""
125
163
 
126
- candidates = discover_configs(list(config_paths), requested_type=train_type if train_type != "auto" else None)
127
- selection = prompt_for_config(candidates, requested_type=train_type if train_type != "auto" else None)
164
+ candidates = discover_configs(
165
+ list(config_paths), requested_type=train_type if train_type != "auto" else None
166
+ )
167
+ selection = prompt_for_config(
168
+ candidates, requested_type=train_type if train_type != "auto" else None
169
+ )
128
170
 
129
171
  effective_type = train_type if train_type != "auto" else selection.train_type
130
172
  if effective_type not in {"rl", "sft"}:
131
- effective_type = click.prompt("Detected config type is ambiguous. Enter type", type=click.Choice(["rl", "sft"]))
173
+ effective_type = click.prompt(
174
+ "Detected config type is ambiguous. Enter type", type=click.Choice(["rl", "sft"])
175
+ )
132
176
 
133
177
  cfg_path = selection.path
134
178
  click.echo(f"Using config: {cfg_path} ({effective_type})")
@@ -219,11 +263,14 @@ def train_command(
219
263
  )
220
264
 
221
265
 
222
- def _wait_for_training_file(backend_base: str, api_key: str, file_id: str, *, timeout: float = 120.0) -> None:
266
+ def _wait_for_training_file(
267
+ backend_base: str, api_key: str, file_id: str, *, timeout: float = 120.0
268
+ ) -> None:
223
269
  url = f"{backend_base}/learning/files/{file_id}"
224
270
  headers = {"Authorization": f"Bearer {api_key}"}
225
271
  elapsed = 0.0
226
272
  interval = 2.0
273
+ first_check = True
227
274
  while True:
228
275
  resp = http_get(url, headers=headers, timeout=30.0)
229
276
  if resp.status_code == 200:
@@ -231,17 +278,55 @@ def _wait_for_training_file(backend_base: str, api_key: str, file_id: str, *, ti
231
278
  data = resp.json()
232
279
  except Exception:
233
280
  data = {}
234
- status = str(data.get("status") or data.get("state") or data.get("storage_state") or "ready").lower()
281
+ status = str(
282
+ data.get("status") or data.get("state") or data.get("storage_state") or "ready"
283
+ ).lower()
284
+ if first_check:
285
+ click.echo(f"File uploaded successfully (id={file_id}, status={status})")
286
+ first_check = False
235
287
  if status in {"ready", "uploaded", "stored", "complete"}:
288
+ click.echo(f"✓ Training file ready (status={status})")
236
289
  return
290
+ # Show progress for processing states
291
+ if status in {"processing", "pending", "validating"}:
292
+ click.echo(
293
+ f" Waiting for file processing... (status={status}, {elapsed:.0f}s elapsed)"
294
+ )
237
295
  elif resp.status_code == 404:
238
296
  # Keep polling; object may not be visible yet
239
- pass
297
+ if first_check:
298
+ click.echo(f"Waiting for file {file_id} to become visible...")
299
+ first_check = False
300
+ elif resp.status_code in {401, 403}:
301
+ # Auth errors won't resolve by polling - fail immediately
302
+ try:
303
+ error_body = resp.json()
304
+ except Exception:
305
+ error_body = resp.text[:400]
306
+ click.echo(f"\n[ERROR] Authentication failed when checking training file:")
307
+ click.echo(f" URL: {url}")
308
+ click.echo(f" Status: {resp.status_code}")
309
+ click.echo(f" Response: {error_body}")
310
+ click.echo(f" API key: {mask_value(api_key)}")
311
+ raise click.ClickException(
312
+ f"Authentication error ({resp.status_code}). "
313
+ "Check that your SYNTH_API_KEY is valid and has permission to access this organization's files."
314
+ )
240
315
  else:
241
- click.echo(f"[WARN] Unexpected response while checking training file {file_id}: {resp.status_code}")
316
+ # Other errors - show details but keep polling
317
+ try:
318
+ error_body = resp.json()
319
+ except Exception:
320
+ error_body = resp.text[:400]
321
+ click.echo(f"[WARN] Unexpected response checking file {file_id}:")
322
+ click.echo(f" URL: {url}")
323
+ click.echo(f" Status: {resp.status_code}")
324
+ click.echo(f" Response: {error_body}")
242
325
 
243
326
  if elapsed >= timeout:
244
- raise click.ClickException(f"Training file {file_id} not ready after {timeout:.0f}s")
327
+ raise click.ClickException(
328
+ f"Training file {file_id} not ready after {timeout:.0f}s (last status: {resp.status_code})"
329
+ )
245
330
  sleep(interval)
246
331
  elapsed += interval
247
332
 
@@ -259,7 +344,11 @@ def handle_rl(
259
344
  poll_timeout: float,
260
345
  poll_interval: float,
261
346
  ) -> None:
262
- overrides: Dict[str, Any] = {"backend": backend_base, "task_url": task_url_override, "model": model_override}
347
+ overrides: Dict[str, Any] = {
348
+ "backend": backend_base,
349
+ "task_url": task_url_override,
350
+ "model": model_override,
351
+ }
263
352
  build = build_rl_payload(
264
353
  config_path=cfg_path,
265
354
  task_url=task_url_override or os.environ.get("TASK_APP_URL", ""),
@@ -271,13 +360,17 @@ def handle_rl(
271
360
  verify_url = f"{backend_base}/rl/verify_task_app"
272
361
  verify_headers = {"Authorization": f"Bearer {synth_key}", "Content-Type": "application/json"}
273
362
  try:
274
- vresp = http_post(verify_url, headers=verify_headers, json_body={"endpoint_base_url": build.task_url})
363
+ vresp = http_post(
364
+ verify_url, headers=verify_headers, json_body={"endpoint_base_url": build.task_url}
365
+ )
275
366
  try:
276
367
  vjs = vresp.json()
277
368
  except Exception:
278
369
  vjs = {"status": vresp.status_code, "text": (vresp.text or "")[:400]}
279
370
  except Exception as _ve:
280
- raise click.ClickException(f"Task app verification call failed: {type(_ve).__name__}: {_ve}") from _ve
371
+ raise click.ClickException(
372
+ f"Task app verification call failed: {type(_ve).__name__}: {_ve}"
373
+ ) from _ve
281
374
  if vresp.status_code >= 400:
282
375
  click.echo("Task app verification error:\n" + preview_json(vjs, limit=800))
283
376
  raise click.ClickException(f"Verification failed with status {vresp.status_code}")
@@ -379,55 +472,102 @@ def handle_sft(
379
472
  validate_sft_jsonl(build.validation_file)
380
473
 
381
474
  upload_url = f"{backend_base}/learning/files"
382
- click.echo(f"Uploading dataset {build.train_file}")
475
+ click.echo(f"\n=== Uploading Training Data ===")
476
+ click.echo(f"Dataset: {build.train_file}")
477
+ click.echo(f"Destination: {upload_url}")
383
478
  if dry_run:
384
479
  click.echo("Dry run: skipping upload")
385
480
  train_file_id = "dry-run-train"
386
481
  val_file_id = None
387
482
  else:
388
- resp = post_multipart(upload_url, api_key=synth_key, file_field="file", file_path=build.train_file)
389
- js = resp.json() if resp.headers.get("content-type", "").startswith("application/json") else {}
483
+ resp = post_multipart(
484
+ upload_url, api_key=synth_key, file_field="file", file_path=build.train_file
485
+ )
486
+ js = (
487
+ resp.json()
488
+ if resp.headers.get("content-type", "").startswith("application/json")
489
+ else {}
490
+ )
390
491
  if resp.status_code >= 400 or "id" not in js:
391
- raise click.ClickException(f"Training file upload failed ({resp.status_code}): {js or resp.text[:200]}")
492
+ click.echo(f"\n[ERROR] Training file upload failed:")
493
+ click.echo(f" URL: {upload_url}")
494
+ click.echo(f" Status: {resp.status_code}")
495
+ click.echo(f" Response: {js or resp.text[:400]}")
496
+ click.echo(f" File: {build.train_file}")
497
+ raise click.ClickException(
498
+ f"Training file upload failed with status {resp.status_code}"
499
+ )
392
500
  train_file_id = js["id"]
501
+ click.echo(f"✓ Training file uploaded (id={train_file_id})")
393
502
  val_file_id = None
394
503
  if build.validation_file:
395
- click.echo(f"Uploading validation dataset {build.validation_file}")
396
- vresp = post_multipart(upload_url, api_key=synth_key, file_field="file", file_path=build.validation_file)
397
- vjs = vresp.json() if vresp.headers.get("content-type", "").startswith("application/json") else {}
504
+ click.echo(f"Uploading validation dataset: {build.validation_file}")
505
+ vresp = post_multipart(
506
+ upload_url,
507
+ api_key=synth_key,
508
+ file_field="file",
509
+ file_path=build.validation_file,
510
+ )
511
+ vjs = (
512
+ vresp.json()
513
+ if vresp.headers.get("content-type", "").startswith("application/json")
514
+ else {}
515
+ )
398
516
  if vresp.status_code < 400 and "id" in vjs:
399
517
  val_file_id = vjs["id"]
518
+ click.echo(f"✓ Validation file uploaded (id={val_file_id})")
400
519
  else:
401
- click.echo(f"[WARN] Validation upload failed: {vresp.status_code} {vjs or vresp.text[:200]}")
520
+ click.echo(
521
+ f"[WARN] Validation upload failed ({vresp.status_code}): {vjs or vresp.text[:200]}"
522
+ )
402
523
  payload = dict(build.payload)
403
524
  payload["training_file_id"] = train_file_id
404
525
  if val_file_id:
405
- payload.setdefault("metadata", {}).setdefault("effective_config", {}).setdefault("data", {})["validation_files"] = [val_file_id]
526
+ payload.setdefault("metadata", {}).setdefault("effective_config", {}).setdefault(
527
+ "data", {}
528
+ )["validation_files"] = [val_file_id]
406
529
 
530
+ click.echo(f"\n=== Checking File Processing Status ===")
407
531
  try:
408
532
  _wait_for_training_file(backend_base, synth_key, train_file_id)
409
533
  except click.ClickException as exc:
410
534
  raise click.ClickException(f"Training file {train_file_id} not ready: {exc}") from exc
411
535
 
412
- click.echo("FFT job payload:\n" + preview_json(payload, limit=800))
536
+ click.echo(f"\n=== Creating Training Job ===")
537
+ click.echo("Job payload preview:")
538
+ click.echo(preview_json(payload, limit=800))
413
539
  if dry_run:
414
540
  click.echo("Dry run: skipping job submission")
415
541
  return
416
542
 
417
543
  create_url = f"{backend_base}/learning/jobs"
418
544
  headers = {"Authorization": f"Bearer {synth_key}", "Content-Type": "application/json"}
545
+ click.echo(f"\nPOST {create_url}")
419
546
  resp = http_post(create_url, headers=headers, json_body=payload)
420
- js = resp.json() if resp.headers.get("content-type", "").startswith("application/json") else {}
421
- click.echo(f"Response {resp.status_code}: {preview_json(js, limit=400)}")
547
+ js = (
548
+ resp.json()
549
+ if resp.headers.get("content-type", "").startswith("application/json")
550
+ else {}
551
+ )
422
552
  if resp.status_code not in (200, 201):
423
- raise click.ClickException("Failed to create learning job")
553
+ click.echo(f"\n[ERROR] Job creation failed:")
554
+ click.echo(f" URL: {create_url}")
555
+ click.echo(f" Status: {resp.status_code}")
556
+ click.echo(f" Response: {preview_json(js, limit=600)}")
557
+ raise click.ClickException(f"Job creation failed with status {resp.status_code}")
424
558
  job_id = js.get("job_id") or js.get("id")
425
559
  if not job_id:
426
560
  raise click.ClickException("Response missing job id")
561
+ click.echo(f"✓ Job created (id={job_id})")
427
562
 
563
+ click.echo(f"\n=== Starting Training Job ===")
428
564
  start_url = f"{backend_base}/learning/jobs/{job_id}/start"
429
- click.echo(f"POST {start_url} (start)")
430
- _ = http_post(start_url, headers=headers, json_body={})
565
+ click.echo(f"POST {start_url}")
566
+ start_resp = http_post(start_url, headers=headers, json_body={})
567
+ if start_resp.status_code not in (200, 201):
568
+ click.echo(f"[WARN] Job start returned status {start_resp.status_code}")
569
+ else:
570
+ click.echo(f"✓ Job started")
431
571
 
432
572
  if not poll:
433
573
  click.echo(f"Started job {job_id} (polling disabled)")
@@ -1,5 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import json
4
+ import os
3
5
  from dataclasses import dataclass
4
6
  from pathlib import Path
5
7
  from typing import Iterable
@@ -9,6 +11,7 @@ import click
9
11
  from .utils import REPO_ROOT, load_toml, preview_json
10
12
 
11
13
  _SKIP_DIRS = {".git", "__pycache__", ".venv", "node_modules", "dist", "build"}
14
+ _STATE_FILE = os.path.expanduser("~/.synth-ai/demo.json")
12
15
 
13
16
 
14
17
  @dataclass(slots=True)
@@ -17,9 +20,43 @@ class ConfigCandidate:
17
20
  train_type: str # "rl", "sft", or "unknown"
18
21
 
19
22
 
23
+ def _load_last_config() -> Path | None:
24
+ """Load the last used training config path from state file."""
25
+ try:
26
+ if os.path.isfile(_STATE_FILE):
27
+ with open(_STATE_FILE) as fh:
28
+ data = json.load(fh)
29
+ if isinstance(data, dict):
30
+ last_config = data.get("LAST_CONFIG")
31
+ if last_config:
32
+ path = Path(last_config).resolve()
33
+ if path.exists():
34
+ return path
35
+ except Exception:
36
+ pass
37
+ return None
38
+
39
+
40
+ def _save_last_config(config_path: Path) -> None:
41
+ """Save the last used training config path to state file."""
42
+ try:
43
+ data = {}
44
+ if os.path.isfile(_STATE_FILE):
45
+ with open(_STATE_FILE) as fh:
46
+ data = json.load(fh) or {}
47
+ if not isinstance(data, dict):
48
+ data = {}
49
+ data["LAST_CONFIG"] = str(config_path.resolve())
50
+ os.makedirs(os.path.dirname(_STATE_FILE), exist_ok=True)
51
+ with open(_STATE_FILE, "w") as fh:
52
+ json.dump(data, fh)
53
+ except Exception:
54
+ pass
55
+
56
+
20
57
  def _iter_candidate_paths() -> Iterable[Path]:
21
58
  seen: set[Path] = set()
22
-
59
+
23
60
  # Prioritize current working directory first
24
61
  try:
25
62
  cwd = Path.cwd().resolve()
@@ -135,23 +172,41 @@ def discover_configs(explicit: list[str], *, requested_type: str | None) -> list
135
172
  return candidates
136
173
 
137
174
 
138
- def prompt_for_config(candidates: list[ConfigCandidate], *, requested_type: str | None) -> ConfigCandidate:
175
+ def prompt_for_config(
176
+ candidates: list[ConfigCandidate], *, requested_type: str | None
177
+ ) -> ConfigCandidate:
139
178
  if not candidates:
140
179
  raise click.ClickException("No training configs found. Pass --config explicitly.")
141
180
 
181
+ # Check for last used config and move it to the top if found
182
+ last_config = _load_last_config()
183
+ default_idx = 1
184
+
185
+ if last_config:
186
+ for idx, cand in enumerate(candidates):
187
+ if cand.path.resolve() == last_config:
188
+ # Move last used config to the front
189
+ candidates.insert(0, candidates.pop(idx))
190
+ break
191
+
142
192
  click.echo("Select a training config:")
143
193
  for idx, cand in enumerate(candidates, start=1):
144
194
  label = cand.train_type if cand.train_type != "unknown" else "?"
145
- click.echo(f" {idx}) [{label}] {cand.path}")
195
+ last_marker = " (last used)" if last_config and cand.path.resolve() == last_config else ""
196
+ click.echo(f" {idx}) [{label}] {cand.path}{last_marker}")
146
197
  click.echo(" 0) Abort")
147
198
 
148
- choice = click.prompt("Enter choice", type=int)
199
+ choice = click.prompt("Enter choice", type=int, default=default_idx)
149
200
  if choice == 0:
150
201
  raise click.ClickException("Aborted by user")
151
202
  if choice < 0 or choice > len(candidates):
152
203
  raise click.ClickException("Invalid selection")
153
204
 
154
205
  selection = candidates[choice - 1]
206
+
207
+ # Save this config as the last used
208
+ _save_last_config(selection.path)
209
+
155
210
  try:
156
211
  data = load_toml(selection.path)
157
212
  preview = preview_json({k: data.get(k) for k in list(data.keys())[:4]}, limit=320)
@@ -56,12 +56,12 @@ class EnvResolver:
56
56
  def _collect_default_candidates(config_path: Path | None) -> list[Path]:
57
57
  candidates: list[Path] = []
58
58
  cwd = Path.cwd()
59
-
59
+
60
60
  # Prioritize CWD env files
61
61
  cwd_env = cwd / ".env"
62
62
  if cwd_env.exists():
63
63
  candidates.append(cwd_env.resolve())
64
-
64
+
65
65
  # Search for additional .env files in CWD subdirectories
66
66
  for sub in cwd.glob("**/.env"):
67
67
  try:
@@ -76,13 +76,13 @@ def _collect_default_candidates(config_path: Path | None) -> list[Path]:
76
76
  if len(candidates) >= 20:
77
77
  break
78
78
  candidates.append(resolved)
79
-
79
+
80
80
  # Then config path env file
81
81
  if config_path:
82
82
  cfg_env = config_path.parent / ".env"
83
83
  if cfg_env.exists():
84
84
  candidates.append(cfg_env.resolve())
85
-
85
+
86
86
  # Then repo env files
87
87
  repo_env = REPO_ROOT / ".env"
88
88
  if repo_env.exists():
@@ -90,7 +90,7 @@ def _collect_default_candidates(config_path: Path | None) -> list[Path]:
90
90
  examples_env = REPO_ROOT / "examples" / ".env"
91
91
  if examples_env.exists():
92
92
  candidates.append(examples_env.resolve())
93
-
93
+
94
94
  # Search shallow depth for additional .env files in examples
95
95
  for sub in (REPO_ROOT / "examples").glob("**/.env"):
96
96
  try:
@@ -105,7 +105,7 @@ def _collect_default_candidates(config_path: Path | None) -> list[Path]:
105
105
  if len(candidates) >= 20:
106
106
  break
107
107
  candidates.append(resolved)
108
-
108
+
109
109
  deduped: list[Path] = []
110
110
  for path in candidates:
111
111
  if path not in deduped:
@@ -156,8 +156,27 @@ def resolve_env(
156
156
  raise click.ClickException(f"Env file not found: {path}")
157
157
  resolver = EnvResolver(provided)
158
158
  else:
159
- resolver = EnvResolver(_collect_default_candidates(config_path))
160
- resolver.select_new_env() # force user selection even if one candidate
159
+ # Check for saved .env path from demo command
160
+ try:
161
+ from synth_ai.demos.demo_task_apps.core import load_env_file_path
162
+
163
+ saved_env_path = load_env_file_path()
164
+ if saved_env_path:
165
+ saved_path = Path(saved_env_path)
166
+ if saved_path.exists():
167
+ click.echo(f"Using .env file: {saved_path}")
168
+ resolver = EnvResolver([saved_path])
169
+ else:
170
+ # Saved path no longer exists, fall back to prompt
171
+ resolver = EnvResolver(_collect_default_candidates(config_path))
172
+ resolver.select_new_env()
173
+ else:
174
+ resolver = EnvResolver(_collect_default_candidates(config_path))
175
+ resolver.select_new_env()
176
+ except Exception:
177
+ # If import fails or any error, fall back to original behavior
178
+ resolver = EnvResolver(_collect_default_candidates(config_path))
179
+ resolver.select_new_env()
161
180
 
162
181
  # Preload selected .env keys into process env so downstream lookups succeed
163
182
  try:
@@ -207,10 +226,10 @@ def _resolve_key(resolver: EnvResolver, spec: KeySpec) -> str:
207
226
  break
208
227
  if env_val:
209
228
  click.echo(f"Found {spec.name} in current sources: {mask_value(env_val)}")
210
- if _prompt_yes_no(f"Use this value for {spec.name}?", default=True):
211
- _maybe_persist(resolver, spec, env_val)
212
- os.environ[spec.name] = env_val
213
- return env_val
229
+ # Automatically use and persist the value (no prompt)
230
+ _maybe_persist(resolver, spec, env_val)
231
+ os.environ[spec.name] = env_val
232
+ return env_val
214
233
  options: list[tuple[str, Callable[[], str | None]]] = []
215
234
 
216
235
  def _enter_manual() -> str:
@@ -254,8 +273,7 @@ def _resolve_key(resolver: EnvResolver, spec: KeySpec) -> str:
254
273
 
255
274
 
256
275
  def _maybe_persist(resolver: EnvResolver, spec: KeySpec, value: str) -> None:
257
- if not _prompt_yes_no(f"Save {spec.name} to {resolver.current_path}?", default=True):
258
- return
276
+ # Automatically save (no prompt)
259
277
  resolver.set_value(spec.name, value)
260
278
  click.echo(f"Saved {spec.name} to {resolver.current_path}")
261
279
 
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from dataclasses import dataclass
4
+ from datetime import datetime
4
5
  from typing import Any, Mapping
5
6
 
6
7
  import click
@@ -15,7 +16,9 @@ class PollOutcome:
15
16
 
16
17
 
17
18
  class JobPoller:
18
- def __init__(self, base_url: str, api_key: str, *, interval: float = 5.0, timeout: float = 3600.0) -> None:
19
+ def __init__(
20
+ self, base_url: str, api_key: str, *, interval: float = 5.0, timeout: float = 3600.0
21
+ ) -> None:
19
22
  self.base_url = ensure_api_base(base_url)
20
23
  self.api_key = api_key
21
24
  self.interval = interval
@@ -35,9 +38,14 @@ class JobPoller:
35
38
  while elapsed <= self.timeout:
36
39
  try:
37
40
  resp = http_get(f"{self.base_url}{path}", headers=self._headers())
38
- info = resp.json() if resp.headers.get("content-type", "").startswith("application/json") else {}
41
+ info = (
42
+ resp.json()
43
+ if resp.headers.get("content-type", "").startswith("application/json")
44
+ else {}
45
+ )
39
46
  status = (info.get("status") or info.get("state") or "").lower()
40
- click.echo(f"[poll] {elapsed:.0f}s status={status}")
47
+ timestamp = datetime.now().strftime("%H:%M:%S")
48
+ click.echo(f"[poll] {timestamp} {elapsed:.0f}s status={status}")
41
49
  if status in {"succeeded", "failed", "cancelled", "canceled", "completed"}:
42
50
  break
43
51
  except Exception as exc: # pragma: no cover - network failures