synth-ai 0.2.9.dev17__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (56)
  1. examples/dev/qwen3_32b_qlora_4xh100.toml +40 -0
  2. examples/multi_step/crafter_rl_lora.md +29 -0
  3. examples/multi_step/task_app_config_notes.md +488 -0
  4. examples/qwen_coder/infer_ft_smoke.py +1 -0
  5. examples/qwen_coder/scripts/infer_coder.sh +1 -0
  6. examples/qwen_coder/scripts/train_coder_30b.sh +1 -0
  7. examples/qwen_coder/subset_jsonl.py +1 -0
  8. examples/qwen_coder/todos.md +38 -0
  9. examples/qwen_coder/validate_jsonl.py +1 -0
  10. examples/vlm/PROPOSAL.md +53 -0
  11. examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +33 -0
  12. examples/warming_up_to_rl/configs/eval_stepwise_consistent.toml +26 -0
  13. examples/warming_up_to_rl/configs/eval_stepwise_per_achievement.toml +36 -0
  14. examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +30 -0
  15. examples/warming_up_to_rl/old/event_rewards.md +234 -0
  16. examples/warming_up_to_rl/old/notes.md +73 -0
  17. examples/warming_up_to_rl/run_eval.py +142 -25
  18. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +146 -2
  19. synth_ai/__init__.py +5 -20
  20. synth_ai/api/train/builders.py +25 -14
  21. synth_ai/api/train/cli.py +29 -6
  22. synth_ai/api/train/env_resolver.py +18 -19
  23. synth_ai/api/train/supported_algos.py +8 -5
  24. synth_ai/api/train/utils.py +6 -1
  25. synth_ai/cli/__init__.py +4 -2
  26. synth_ai/cli/_storage.py +19 -0
  27. synth_ai/cli/balance.py +14 -2
  28. synth_ai/cli/calc.py +37 -22
  29. synth_ai/cli/legacy_root_backup.py +12 -14
  30. synth_ai/cli/recent.py +12 -7
  31. synth_ai/cli/root.py +1 -23
  32. synth_ai/cli/status.py +4 -3
  33. synth_ai/cli/task_apps.py +143 -137
  34. synth_ai/cli/traces.py +4 -3
  35. synth_ai/cli/watch.py +3 -2
  36. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +738 -0
  37. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
  38. synth_ai/jobs/client.py +15 -3
  39. synth_ai/task/server.py +14 -7
  40. synth_ai/tracing_v3/decorators.py +51 -26
  41. synth_ai/tracing_v3/examples/basic_usage.py +12 -7
  42. synth_ai/tracing_v3/llm_call_record_helpers.py +107 -53
  43. synth_ai/tracing_v3/replica_sync.py +8 -4
  44. synth_ai/tracing_v3/storage/utils.py +11 -9
  45. synth_ai/tracing_v3/turso/__init__.py +12 -0
  46. synth_ai/tracing_v3/turso/daemon.py +2 -1
  47. synth_ai/tracing_v3/turso/native_manager.py +28 -15
  48. {synth_ai-0.2.9.dev17.dist-info → synth_ai-0.2.12.dist-info}/METADATA +33 -88
  49. {synth_ai-0.2.9.dev17.dist-info → synth_ai-0.2.12.dist-info}/RECORD +53 -41
  50. {synth_ai-0.2.9.dev17.dist-info → synth_ai-0.2.12.dist-info}/top_level.txt +0 -1
  51. synth/__init__.py +0 -14
  52. synth_ai/_docs_message.py +0 -10
  53. synth_ai/main.py +0 -5
  54. {synth_ai-0.2.9.dev17.dist-info → synth_ai-0.2.12.dist-info}/WHEEL +0 -0
  55. {synth_ai-0.2.9.dev17.dist-info → synth_ai-0.2.12.dist-info}/entry_points.txt +0 -0
  56. {synth_ai-0.2.9.dev17.dist-info → synth_ai-0.2.12.dist-info}/licenses/LICENSE +0 -0
examples/dev/qwen3_32b_qlora_4xh100.toml
@@ -0,0 +1,40 @@
+ [job]
+ model = "Qwen/Qwen3-32B"
+ # Optionally set here; you can also pass --dataset
+ # data = "/abs/path/to/train.jsonl"
+
+ [compute]
+ gpu_type = "H100"
+ gpu_count = 4
+ nodes = 1
+
+ [data]
+ # Optional; forwarded into metadata.effective_config.data.topology
+ topology = { container_count = 4 }
+
+ [training]
+ mode = "sft_offline"
+ use_qlora = true
+
+ [training.validation]
+ enabled = true
+ evaluation_strategy = "steps"
+ eval_steps = 20
+ save_best_model_at_end = true
+ metric_for_best_model = "val.loss"
+ greater_is_better = false
+
+ [hyperparameters]
+ n_epochs = 1
+ per_device_batch = 1
+ gradient_accumulation_steps = 64
+ sequence_length = 4096
+ learning_rate = 5e-6
+ warmup_ratio = 0.03
+
+ [hyperparameters.parallelism]
+ use_deepspeed = true
+ deepspeed_stage = 2
+ bf16 = true
+ fp16 = false
+ fsdp = false
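For context, a config like this is submitted through the `synth-ai train` CLI; a minimal sketch, assuming the SFT flow used elsewhere in this release (the dataset path and env file are placeholders, not part of the diffed file):

```bash
# Illustrative invocation only; adjust paths to your dataset and .env file.
uvx synth-ai train --type sft \
  --config examples/dev/qwen3_32b_qlora_4xh100.toml \
  --dataset /abs/path/to/train.jsonl \
  --env-file .env
```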
examples/multi_step/crafter_rl_lora.md
@@ -0,0 +1,29 @@
+ # Crafter RL LoRA (10-step runs)
+
+ This walkthrough shows how to fine-tune the Crafter task app with our 10-step RL LoRA config.
+
+ 1. **Start the Crafter task app on Modal (with tracing + text-only prompts)**
+
+ ```bash
+ BACKEND_BASE_URL=https://agent-learning.onrender.com/api \
+ uvx synth-ai modal-serve grpo-crafter \
+   --env-file examples/warming_up_to_rl/.env \
+   --name grpo-crafter-task-app
+ ```
+
+ *Deploys the Modal task app with the tracing/text-only fixes baked in.*
+
+ 2. **Launch the RL job using the updated LoRA config**
+
+ ```bash
+ uvx synth-ai train --type rl \
+   --config tests/artifacts/configs/rl.lora.small.toml \
+   --backend https://agent-learning.onrender.com/api \
+   --env-file .env \
+   --no-poll
+ ```
+
+ *This config forces 10 agent turns per rollout, reduces batch size to avoid OOMs, and enforces Crafter-specific defaults.*
+
+ INFO - 🎉 Training completed successfully!
+ INFO - All batch rewards: [0.0625, 0.0625, 0.125, 0.0625, 0.0625, 0.3125, 0.375, 0.4375, 0.5, 0.9375]
examples/multi_step/task_app_config_notes.md
@@ -0,0 +1,488 @@
+ # Task App Config for Crafter RL: Dense Stepwise Rewards
+
+ Goal: Allow configuring the Crafter task app to enable/disable dense (decision-stepwise) event rewards and pass that choice from the RL config, through the backend, into the task app’s /rollout execution. This should be broader than just policy config – a `task_app_config` concept – but we can implement with the existing `env.config` today and optionally add a top-level alias later.
+
+ ## Findings (current behaviour)
+
+ - Rollout request contract already supports two config payloads:
+   - `env.config: dict` and `policy.config: dict`
+ - The hosted Crafter rollout implementation already supports decision-stepwise rewards, controlled via a `step_rewards` block in either `policy.config` or `env.config`.
+ - When active, it computes per-decision “unique achievement” deltas and attaches per-turn metadata; it also returns `decision_samples` when enabled.
+
+ Key locations and behaviour:
+
+ - Rollout schema (env/policy config):
+ ```51:87:synth-ai/synth_ai/task/contracts.py
+ class RolloutEnvSpec(BaseModel):
+     env_id: str | None = None
+     env_name: str | None = None
+     config: dict[str, Any] = Field(default_factory=dict)
+     seed: int | None = None
+
+ class RolloutPolicySpec(BaseModel):
+     policy_id: str | None = None
+     policy_name: str | None = None
+     config: dict[str, Any] = Field(default_factory=dict)
+ ```
+
+ - Crafter hosted rollout reads step-reward config from policy, then env; gates on `enabled` and `mode == "decision_stepwise"`:
+ ```1041:1067:synth-ai/examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py
+ # Stepwise reward configuration (Crafter shaping; gate on explicit enable)
+ step_rewards_cfg_raw: dict[str, Any] = {}
+ ...
+ if not step_rewards_cfg_raw:
+     if isinstance(request.env.config, dict):
+         step_rewards_cfg_raw = dict(request.env.config.get("step_rewards") or {})
+
+ step_rewards_enabled = bool(step_rewards_cfg_raw.get("enabled", False))
+ step_rewards_mode = str(step_rewards_cfg_raw.get("mode") or "off").lower()
+ ...
+ step_rewards_active = step_rewards_enabled and step_rewards_mode == "decision_stepwise"
+ ```
+
+ - When active, it computes decision-level indicators and metadata, and adds to each step’s `info.meta.decision_rewards`; also accumulates `decision_samples`:
+ ```1554:1596:synth-ai/examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py
+ if step_rewards_active:
+     decision_actions = _summarize_tool_calls(pending_tool_calls)
+     stepwise_info, decision_record, stats = compute_stepwise_reward(
+         prev_achievements or {},
+         new_achievement_state,
+         decision_index,
+         decision_actions,
+         step_rewards_indicator_lambda,
+     )
+     ...
+     # Compute decision-level rewards (absolute vs unique) and attach to metadata
+     turned_true = set(stepwise_info.get("new_achievements") or [])
+     seen_before = set(episode_seen_achievements)
+     new_unique = sorted(turned_true - seen_before)
+     ach_delta = int(len(turned_true))
+     unique_delta = int(len(new_unique))
+     meta_block = (_info.get("meta") if isinstance(_info.get("meta"), dict) else {})
+     decision_rewards = {"turn": int(decision_index), "ach_delta": ach_delta, "unique_delta": unique_delta, "all": all_list, "unique": new_unique}
+     decision_rewards_meta = decision_rewards
+     meta_block["decision_rewards"] = decision_rewards
+     _info["meta"] = meta_block
+     episode_seen_achievements.update(turned_true)
+     decision_samples.append(decision_record)
+ ```
+
+ - The simpler published Crafter app (`examples/warming_up_to_rl/task_app/grpo_crafter.py`) sets sane defaults for `step_rewards` in both env and policy when it aliases math → crafter, but the hosted rollout above is the one actually used in production paths.
+ ```479:490:synth-ai/examples/warming_up_to_rl/task_app/grpo_crafter.py
+ env_cfg.setdefault("step_rewards", dict(DEFAULT_ALIAS_STEP_REWARDS))
+ ...
+ policy_cfg.setdefault("step_rewards", dict(DEFAULT_ALIAS_STEP_REWARDS))
+ ```
+
+ - Backend RPC: The backend constructs the rollout HTTP payload with both env_config and policy_config; these are forwarded to the task app `/rollout`:
+ ```456:470:monorepo/backend/app/routes/clustered_training/core/algorithms/gspo/evaluation/evaluator.py
+ payload = {
+     "run_id": run_id,
+     "env": {"env_name": env_name, "config": env_config, "seed": seed},
+     "policy": {"policy_name": policy_name, "config": policy_config},
+     "ops": ops,
+     "record": {"trajectories": True, "logprobs": False, "value": False},
+     "on_done": on_done,
+ }
+ ```
+
+ - RL config ingestion: The CLI forwards the full TOML in the job payload. The backend trainer flattens some rollout options and (optionally) picks up `rollout.env_config`:
+ ```364:393:monorepo/backend/app/routes/clustered_training/core/algorithms/gspo/training/clustered_trainer.py
+ # Rollout config
+ if "rollout" in config_dict:
+     flat_config["env_name"] = config_dict["rollout"].get("env_name", "crafter")
+     ...
+     if "env_config" in config_dict["rollout"]:
+         flat_config["env_config"] = config_dict["rollout"]["env_config"]
+ ```
+
+ Implication: We can carry a broader "task app config" today via `rollout.env_config` without changing wire contracts. The task app already reads `env.config.step_rewards`.
+
+ ## Proposed configuration shape (TOML)
+
+ Recommended: use `rollout.env_config.step_rewards`, since the backend passes it through to the task app:
+
+ ```toml
+ [rollout]
+ env_name = "crafter"
+ policy_name = "crafter-react"
+ max_turns = 10
+ ops = ["agent", "env"]
+
+ [rollout.env_config.step_rewards]
+ # Toggle dense per-decision rewards
+ enabled = true
+ # Supported: "off" | "decision_stepwise" | (future) "env_sparse"
+ mode = "decision_stepwise"
+ # Reward = indicator_lambda * I(unique_achievements_delta > 0)
+ indicator_lambda = 1.0
+ # Reserved for shaped/intermediate signals (currently unused)
+ step_beta = 0.0
+ ```
+
+ Optional (policy sampling, still supported via `policy.config` in the task app runner):
+
+ ```toml
+ [rollout.policy_config]
+ temperature = 0.2
+ top_p = 0.95
+ max_tokens = 512
+ ```
+
+ Notes:
+ - The hosted Crafter rollout checks `policy.config.step_rewards` first, then falls back to `env.config.step_rewards`. Prefer `env_config` as the canonical place for app-level settings.
+ - If you want the app to disable stepwise rewards entirely, set `enabled=false` or `mode="off"`.
+
+ ## Wire and data flow
+
+ 1) CLI → Backend: CLI includes the entire TOML in the job payload (`build_rl_payload`).
+ 2) Backend → Trainer: Trainer flattens rollout properties and can include `env_config`.
+ 3) Trainer → Task App: Rollout HTTP payload embeds `env.config` and `policy.config`.
+ 4) Task App: Hosted rollout computes decision-level metadata and returns:
+    - `RolloutStep.info.meta.decision_rewards` with `{turn, ach_delta, unique_delta, ...}`
+    - `trajectory.decision_samples` summarizing per-turn reward inputs
+
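To make steps 1) through 3) concrete, a minimal sketch (editor's illustration, not part of the diffed file) of how the proposed `[rollout.env_config]` table ends up as the rollout request's `env.config`; the config filename and seed are placeholders, and the real flattening lives in the backend trainer shown above:

```python
import tomllib  # Python 3.11+

with open("rl.lora.small.toml", "rb") as fh:  # illustrative config path
    cfg = tomllib.load(fh)

rollout = cfg.get("rollout", {})
env_spec = {
    "env_name": rollout.get("env_name", "crafter"),
    "config": rollout.get("env_config", {}),  # step_rewards travels through unchanged
    "seed": 0,
}
# env_spec["config"]["step_rewards"]["mode"] -> "decision_stepwise"
```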
+ ## Minimal code changes to adopt
+
+ - synth-ai (optional):
+   - Add example configs under `examples/warming_up_to_rl/configs/*.toml` using `[rollout.env_config.step_rewards]`.
+   - Document this block in docs and the multi_step walkthrough.
+
+ - monorepo backend:
+   - Verify trainer always passes `rollout.env_config` (ClusteredTrainerConfig appears to support it; ensure it flows into the runtime’s rollout request builder in the trainer where the payload is assembled).
+   - No contract changes needed: task app already reads from `env.config`.
+
+ - Task App:
+   - Already supports the block; no changes needed for the hosted Crafter rollout.
+   - If you want a first-class `task_app_config` top-level, we can add an alias resolver that copies `config["task_app_config"]` → `env.config` inside the rollout executor.
+
+ ## Open questions / follow-ups
+
+ - Does the current trainer consume `decision_samples` or `step.info.meta.decision_rewards` for credit assignment? If not, wire this into the per-step reward/advantage pipeline.
+ - Decide whether to disable the default enabling of stepwise rewards in `grpo_crafter.py` aliases (`DEFAULT_ALIAS_STEP_REWARDS`) so the TOML fully drives behaviour.
+ - Standardize on `env_config.step_rewards` for app-level settings across environments.
+
+ ## Reference: Crafter RL LoRA example (expected first 10 rewards)
+ These are the first ten batch rewards printed at RL start:
+ ```
+ - INFO - All batch rewards: [0.0625, 0.0625, 0.125, 0.0625, 0.0625, 0.3125, 0.375, 0.4375, 0.5, 0.9375]
+ ```
+
+ ---
+
+ ## Enable stepwise rewards during evals and compare vs final
+
+ We can enable stepwise shaping for evaluation-only runs and compare “stepwise” vs “final (outcome)” returns.
+
+ Two evaluation paths exist today:
+
+ - Backend evaluator endpoint (preferred for hosted):
+ ```1114:1136:monorepo/backend/app/routes/clustered_training/core/routes.py
+ class RlEvaluateRequest(BaseModel):
+     model: str
+     seeds: list[int]
+     rollouts_per_seed: int = 1
+     env_name: str
+     env_config: Dict[str, Any] = Field(default_factory=dict)
+     policy_name: str
+     thinking_mode: str
+     thinking_budget: int | None = None
+     max_steps_per_episode: int = 100
+     max_concurrent_rollouts: int = 8
+     on_done: str = "terminate"
+     task_service_url: str | None = None
+     vllm_url: str | None = None
+     vllm_public_url: str | None = None
+ ```
+
+ Pass `env_config.step_rewards` here to turn on stepwise shaping during evals (no trainer changes needed). The evaluator will forward `env_config` into each rollout:
+ ```383:396:monorepo/backend/app/routes/clustered_training/core/algorithms/gspo/evaluation/evaluator.py
+ payload = {
+     "env": {"env_name": env_name, "config": env_config, ...},
+     "policy": {"policy_name": policy_name, "config": policy_config},
+     ...
+ }
+ ```
+
+ The task app already computes and attaches:
+ - Per-decision metadata at `step.info.meta.decision_rewards`
+ - Aggregates we can expose (see below) for stepwise vs final
+
+ Recommended enhancement (small change in task app): include a summary under `response.metrics.details.stepwise` so eval clients don’t need to parse per-step:
+ ```python
+ metrics.details["stepwise"] = {
+     "indicator_sum": stepwise_indicator_sum,
+     "reward_sum": stepwise_reward_sum,
+     "new_achievements_total": stepwise_new_achievements_total,
+ }
+ ```
+
+ For local SDK evals (without backend), call the `/rollout` endpoint directly with the same `env.config.step_rewards` block.
+
+ Example payload fragment:
+ ```json
+ {
+   "env": {
+     "env_name": "crafter",
+     "config": {
+       "step_rewards": { "enabled": true, "mode": "decision_stepwise", "indicator_lambda": 1.0 }
+     },
+     "seed": 0
+   },
+   "policy": { "policy_name": "crafter-react", "config": {"temperature": 0.2} },
+   "ops": ["agent", "env"]
+ }
+ ```
+
+ ---
+
+ ## Simple vs Complex stepwise modes (proposal)
+
+ Add a `strategy` under the existing `step_rewards` block:
+
+ ```toml
+ [rollout.env_config.step_rewards]
+ enabled = true
+ mode = "decision_stepwise"   # gate remains the same
+ strategy = "simple"          # "simple" | "complex"
+ indicator_lambda = 1.0
+
+ # Complex-only (optional)
+ weights = { collect_sapling = 0.1, craft_wood_pickaxe = 0.3, collect_diamond = 1.0 }
+ k_limits = { collect_sapling = 1, craft_wood_pickaxe = 2, collect_diamond = 3 }
+ ```
+
+ Behaviour:
+ - strategy="simple": reward 1.0×indicator_lambda if any new achievement unlocked at that decision, else 0. (Current logic already does this; just make it explicit.)
+ - strategy="complex":
+   - Maintain per-episode `achieve_count[name]`.
+   - For each achievement newly unlocked at the decision, if `achieve_count[name] < k_limits.get(name, 1)`, add `weights.get(name, 1.0)` to the stepwise reward and increment the count.
+   - The uniqueness baseline should be the “turned true this decision” set; combining with episode-level uniqueness is optional if we intend multiple rewards up to K.
+
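A minimal sketch of the complex branch described in the list above (editor's illustration, not part of the diffed file; the helper name and signature are hypothetical, and the real change would extend `compute_stepwise_reward` in the hosted rollout):

```python
from typing import Dict, List

def complex_stepwise_reward(
    new_achievements: List[str],   # achievements that turned true this decision
    counts: Dict[str, int],        # per-episode achieve_count, mutated in place
    weights: Dict[str, float],
    k_limits: Dict[str, int],
) -> float:
    """Weighted, K-limited stepwise reward for a single decision."""
    reward = 0.0
    for name in new_achievements:
        if counts.get(name, 0) < k_limits.get(name, 1):
            reward += weights.get(name, 1.0)
            counts[name] = counts.get(name, 0) + 1
    return reward
```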
+ Minimal code touch points:
+ - synth-ai task app (hosted Crafter rollout):
+   - Extend `compute_stepwise_reward(prev_achievements, new_achievements, decision_index, actions_summary, indicator_lambda)` to optionally take `strategy`, `weights`, `k_limits`, and a `counts` dict.
+   - Thread an `episode_ach_counts: Dict[str, int]` through the rollout loop (similar to `episode_seen_achievements`).
+   - Build `reward_stepwise` as per strategy; keep existing `decision_rewards` metadata (ach/unique deltas) unchanged.
+   - Add `metrics.details["stepwise"]` summary (indicator_sum, reward_sum, new_achievements_total).
+
+ - monorepo backend (evals):
+   - No contract change: pass the same `env_config.step_rewards` in `RlEvaluateRequest.env_config`.
+   - For convenience, surface stepwise summary in any eval aggregation/CSV if present under `metrics.details.stepwise`.
+
+ Open choice:
+ - Either keep `mode="decision_stepwise"` and add `strategy`, or introduce `mode` values `{ "simple_stepwise", "complex_stepwise" }`. The former is backward compatible and clearer.
+
+ Testing plan:
+ - Unit-test `compute_stepwise_reward` for both strategies with synthetic prev/new achievement maps.
+ - Smoke eval over a few seeds with `strategy=simple` and `strategy=complex` to compare `metrics.details.stepwise.reward_sum` vs `metrics.mean_return`.
+
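A matching pytest-style test sketch against the hypothetical helper above, using synthetic achievement lists (again an editor's illustration, not part of the diffed file):

```python
def test_complex_rewards_respect_k_limits():
    counts = {}  # per-episode achievement counts
    weights = {"collect_diamond": 1.0, "collect_sapling": 0.1}
    k_limits = {"collect_diamond": 2}  # sapling falls back to the default K=1
    assert complex_stepwise_reward(["collect_diamond"], counts, weights, k_limits) == 1.0
    assert complex_stepwise_reward(["collect_diamond", "collect_sapling"], counts, weights, k_limits) == 1.1
    assert complex_stepwise_reward(["collect_diamond", "collect_sapling"], counts, weights, k_limits) == 0.0
```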
+ ---
+
+ ## Eval script scope: Groq Qwen/Qwen3-32B stepwise vs outcome
+
+ Objective: run many Crafter rollouts against Groq `Qwen/Qwen3-32B` and compare distributions and correlations between stepwise rewards and final (outcome) rewards, for both simple and complex stepwise strategies.
+
+ Inputs/flags:
+ - `--task-url` Task app base URL (Modal deployment)
+ - `--env-key` ENVIRONMENT_API_KEY (or from `.env`)
+ - `--model` default `Qwen/Qwen3-32B`
+ - `--seeds` list or `--num-seeds` N (use 0..N-1)
+ - `--rollouts-per-seed` default 3
+ - `--max-turns` default 10
+ - `--strategy` `simple|complex|both` (default both)
+ - `--weights-json` optional JSON path for complex weighting
+ - `--k-limits-json` optional JSON path for complex K-limits
+ - `--out` output directory for CSV/plots
+
+ What it does:
+ 1) Builds rollout payloads for each seed and strategy variant.
+ 2) For each rollout, passes `env.config.step_rewards` with:
+    - common: `{ enabled: true, mode: "decision_stepwise" }`
+    - simple: `strategy: "simple", indicator_lambda: 1.0`
+    - complex: `strategy: "complex", weights, k_limits`
+ 3) Uses policy config to route inference to Groq with the requested model.
+ 4) Collects per-rollout summary:
+    - `final_return = response.metrics.mean_return`
+    - `step_indicator_sum`, `step_reward_sum`, `new_achievements_total` from `metrics.details.stepwise` (or compute from steps if absent)
+    - counts like `num_steps`, `tool_calls_total`
+ 5) Writes a wide CSV with one row per rollout, including seed, strategy, and the above fields.
+ 6) Visualizes:
+    - Histogram of `step_reward_sum` by strategy
+    - Scatter: `step_reward_sum` vs `final_return`, per strategy (with Pearson/Spearman r)
+    - Optional ECDFs for indicator_sum
+
+ Data schema (CSV):
+ ```
+ seed,int | rollout_idx,int | strategy,str | final_return,float | step_reward_sum,float |
+ step_indicator_sum,float | new_achievements_total,int | num_steps,int | tool_calls_total,int |
+ model,str | max_turns,int | timestamp,iso
+ ```
+
+ Pseudocode (Python):
+ ```python
+ import os, json, csv, time, math, statistics
+ import httpx
+
+ TASK_URL = os.environ.get("TASK_APP_URL")
+ ENV_KEY = os.environ.get("ENVIRONMENT_API_KEY")
+
+ def build_step_cfg(strategy, weights=None, k_limits=None):
+     cfg = {"enabled": True, "mode": "decision_stepwise", "strategy": strategy, "indicator_lambda": 1.0}
+     if strategy == "complex":
+         if weights: cfg["weights"] = weights
+         if k_limits: cfg["k_limits"] = k_limits
+     return cfg
+
+ async def run_rollout(seed, strategy, model, max_turns, weights, k_limits):
+     step_cfg = build_step_cfg(strategy, weights, k_limits)
+     payload = {
+         "run_id": f"eval-{seed}-{strategy}-{int(time.time())}",
+         "env": {"env_name": "crafter", "seed": seed, "config": {"step_rewards": step_cfg, "env_params": {"max_steps_per_episode": max_turns}}},
+         "policy": {"policy_name": "crafter-react", "config": {"inference_url": "https://groq.synth-ai.internal/proxy", "model": model, "temperature": 0.2, "top_p": 0.95, "max_tokens": 512}},
+         "ops": ["agent", "env"] * max_turns,
+         "record": {"trajectories": True},
+         "on_done": "terminate",
+     }
+     async with httpx.AsyncClient(timeout=300.0) as client:
+         r = await client.post(f"{TASK_URL}/rollout", headers={"X-API-Key": ENV_KEY}, json=payload)
+         r.raise_for_status()
+         resp = r.json()
+     met = resp.get("metrics", {})
+     details = met.get("details", {})
+     step = details.get("stepwise", {})
+     final_return = float(met.get("mean_return") or 0.0)
+     step_reward_sum = float(step.get("reward_sum") or 0.0)
+     step_indicator_sum = float(step.get("indicator_sum") or 0.0)
+     new_ach_total = int(step.get("new_achievements_total") or 0)
+     num_steps = int(met.get("num_steps") or 0)
+     tool_calls_total = sum(len(s.get("tool_calls", [])) for s in (resp.get("trajectories", [{}])[0].get("steps", []))) if resp.get("trajectories") else 0
+     return {
+         "seed": seed, "strategy": strategy, "final_return": final_return,
+         "step_reward_sum": step_reward_sum, "step_indicator_sum": step_indicator_sum,
+         "new_achievements_total": new_ach_total, "num_steps": num_steps,
+         "tool_calls_total": tool_calls_total,
+     }
+ ```
+
+ CLI example:
+ ```bash
+ uv run python tools/eval_stepwise_vs_final.py \
+   --task-url $TASK_APP_URL \
+   --env-key $ENVIRONMENT_API_KEY \
+   --model "Qwen/Qwen3-32B" \
+   --num-seeds 100 --rollouts-per-seed 3 --max-turns 10 \
+   --strategy both --out results/qwen32b
+ ```
+
+ Notes:
+ - The correlation/plots can be produced with `matplotlib` or `plotly`; write PNG + HTML.
+ - If `metrics.details.stepwise` is not yet populated by the task app, compute `indicator_sum` and `reward_sum` on the client by scanning `steps[].info.meta.decision_rewards`.
+
+ ### Output artifacts (JSON + Markdown)
+
+ Directory layout under `--out` (example: `results/qwen32b`):
+
+ - `runs/` — one JSONL file per strategy with one record per rollout
+   - `runs/simple.jsonl`
+   - `runs/complex.jsonl`
+ - `summary/`
+   - `summary.json` — aggregates per strategy (mean/median/std, correlations, counts)
+   - `stats_by_seed.json` — per-seed aggregates
+   - `config_snapshot.json` — CLI args, weights, k-limits, timestamp, git SHA
+ - `plots/`
+   - `hist_step_reward_simple.png`, `hist_step_reward_complex.png`
+   - `scatter_step_reward_vs_final_simple.png`, `scatter_step_reward_vs_final_complex.png`
+   - `ecdf_indicator_simple.png`, `ecdf_indicator_complex.png`
+ - `report.md` — human-friendly Markdown summary linking to plots
+
+ Record schema (per line in `runs/*.jsonl`):
+
+ ```json
+ {
+   "seed": 0,
+   "rollout_idx": 1,
+   "strategy": "simple", // or "complex"
+   "final_return": 0.9375,
+   "step_reward_sum": 1.2,
+   "step_indicator_sum": 3.0,
+   "new_achievements_total": 3,
+   "num_steps": 10,
+   "tool_calls_total": 12,
+   "model": "Qwen/Qwen3-32B",
+   "max_turns": 10,
+   "timestamp": "2025-10-17T22:14:05Z",
+   "meta": {
+     "task_url": "...",
+     "weights": {"collect_diamond": 1.0}, // only for complex
+     "k_limits": {"collect_diamond": 3}   // only for complex
+   }
+ }
+ ```
+
+ Markdown report (`report.md`) outline:
+
+ - Title, timestamp, model, config summary
+ - Table of global aggregates per strategy (mean, median, std of `final_return`, `step_reward_sum`, correlations r/Pearson, ρ/Spearman)
+ - Small tables per-seed (top-N seeds by return)
+ - Inline images linking to `plots/*.png`
+ - Pointers to raw data files and `config_snapshot.json`
+
+ I scanned both repos to map how stepwise rewards already work in the Crafter task app and how rollout configs flow from the RL TOML into the task app.
+
+ - I researched Crafter RL and documented how to toggle dense stepwise rewards and flow the config end-to-end.
+ - Notes added at `synth-ai/examples/multi_step/task_app_config_notes.md` with exact code references and a TOML example.
+
+ Key points:
+ - Use `rollout.env_config.step_rewards` in the RL TOML; task app already reads `policy.config.step_rewards` first, then `env.config.step_rewards`.
+ - Enable with:
+ ```toml
+ [rollout.env_config.step_rewards]
+ enabled = true
+ mode = "decision_stepwise"
+ indicator_lambda = 1.0
+ step_beta = 0.0
+ ```
+ - Backend already forwards `env_config` and `policy_config` in rollout requests; no wire change needed. Ensure trainer populates `env_config` when present.
+ - Task app attaches per-turn `decision_rewards` and returns `decision_samples`; consider consuming these in credit assignment.
+
+ References:
+ - Task contracts and rollout schema:
+ ```51:87:synth-ai/synth_ai/task/contracts.py
+ class RolloutEnvSpec(BaseModel):
+     ...
+     config: dict[str, Any] = Field(default_factory=dict)
+     ...
+ class RolloutPolicySpec(BaseModel):
+     ...
+     config: dict[str, Any] = Field(default_factory=dict)
+ ```
+ - Crafter stepwise config reading and reward metadata:
+ ```1041:1067:synth-ai/examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py
+ # Stepwise reward configuration ...
+ ```
+ ```1554:1596:synth-ai/examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py
+ # Compute decision-level rewards ...
+ ```
+ - Backend rollout payload shape:
+ ```456:470:monorepo/backend/app/routes/clustered_training/core/algorithms/gspo/evaluation/evaluator.py
+ payload = {
+     "run_id": run_id,
+     "env": {"env_name": env_name, "config": env_config, "seed": seed},
+     "policy": {"policy_name": policy_name, "config": policy_config},
+     ...
+ }
+ ```
+
+ Status: Notes committed on branch `friday-cleanup` and pushed.
examples/qwen_coder/infer_ft_smoke.py
@@ -62,3 +62,4 @@ async def main() -> None:
  if __name__ == "__main__":
      asyncio.run(main())
 
+
examples/qwen_coder/scripts/infer_coder.sh
@@ -16,3 +16,4 @@ fi
  uv run python examples/qwen_coder/infer_prod_proxy.py
 
 
+
examples/qwen_coder/scripts/train_coder_30b.sh
@@ -19,3 +19,4 @@ uvx synth-ai train \
    --env-file "${ENV_FILE:-}"
 
 
+
examples/qwen_coder/subset_jsonl.py
@@ -36,3 +36,4 @@ if __name__ == "__main__":
      main()
 
 
+
examples/qwen_coder/todos.md
@@ -0,0 +1,38 @@
+ # Qwen Coder – Remaining TODOs
+
+ - [ ] Add small-base LoRA config for quick iteration
+   - Create `configs/coder_lora_4b.toml` (base=`Qwen/Qwen3-4B`, 1x H100, LoRA all-linear, same hyperparameters structure as 30B).
+
+ - [ ] Improve SFT submission script (sft_lora_30b.py)
+   - Include `metadata.effective_config.compute` in job payload (gpu_type, gpu_count, nodes) so API doesn’t 400 without TOML.
+   - Write resulting `ft:<id>` to `examples/qwen_coder/ft_data/ft_model_id.txt` and print it clearly.
+   - Add optional validation file support when present.
+
+ - [ ] Add post‑SFT inference script
+   - Read `ft_data/ft_model_id.txt` and call the prod proxy (or SDK InferenceClient) to verify the finetuned adapter returns.
+   - Save a short transcript to `ft_data/ft_infer_smoke.txt`.
+
+ - [ ] Add inference smoke tests (local opt‑in)
+   - `tests/qwen_coder/test_infer_prod_proxy.py` (skips unless `SYNTH_API_KEY` set). Hits `/api/inference/v1/chat/completions` with `Qwen/Qwen3-Coder-30B-A3B-Instruct` and asserts 200/choices.
+   - Optional: same test for an `ft:<id>` if `FT_MODEL_ID` env is provided.
+
+ - [ ] Document end‑to‑end flow in README
+   - Expand README with explicit env section (`SYNTH_API_KEY`, `BACKEND_BASE_URL`).
+   - Show: generate dataset → run LoRA (4B or 30B) → poll → infer with `ft:<id>`.
+   - Mention cost/time caveats for 30B.
+
+ - [ ] Dataset utilities
+   - Add `validate_jsonl.py` to check first N lines parse and contain `messages`/`assistant` fields required by SFT.
+   - Add `subset_jsonl.py` to create capped training sets for quick runs.
+
+ - [ ] Optional: CLI convenience wrappers
+   - `scripts/train_coder_30b.sh` to invoke `uvx synth-ai train --type sft --config configs/coder_lora_30b.toml --dataset ft_data/coder_sft.small.jsonl` with `.env` preload.
+   - `scripts/infer_coder.sh` to run `infer_prod_proxy.py` against base or `ft:<id>`.
+
+ - [ ] Optional CI (requires secrets)
+   - GitHub workflow job (smoke) that runs `infer_prod_proxy.py` with `SYNTH_API_KEY` secret and prints the first 200 chars of assistant output.
+
+ - [ ] (If needed) Add coder variants
+   - If backend supports additional coder SKUs, append to `synth_ai/api/models/supported.py:QWEN3_CODER_MODELS` so SDK validation passes (SFT/inference).
+
+
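As context for the `validate_jsonl.py` TODO above, a minimal sketch of the kind of check it describes (editor's illustration, not the shipped script); it assumes the usual chat-format `messages` list containing an `assistant` turn:

```python
import json
import sys

def count_bad_records(path: str, limit: int = 100) -> int:
    """Count records among the first `limit` JSONL lines that fail to parse
    or lack an assistant message."""
    bad = 0
    with open(path, "r", encoding="utf-8") as fh:
        for i, line in enumerate(fh):
            if i >= limit:
                break
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                bad += 1
                continue
            messages = record.get("messages") or []
            if not any(isinstance(m, dict) and m.get("role") == "assistant" for m in messages):
                bad += 1
    return bad

if __name__ == "__main__":
    sys.exit(1 if count_bad_records(sys.argv[1]) else 0)
```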
examples/qwen_coder/validate_jsonl.py
@@ -57,3 +57,4 @@ if __name__ == "__main__":
      main()
 
 
+