synth-ai 0.2.9.dev17__py3-none-any.whl → 0.2.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of synth-ai has been flagged as potentially problematic.
- examples/dev/qwen3_32b_qlora_4xh100.toml +40 -0
- examples/multi_step/crafter_rl_lora.md +29 -0
- examples/multi_step/task_app_config_notes.md +488 -0
- examples/qwen_coder/infer_ft_smoke.py +1 -0
- examples/qwen_coder/scripts/infer_coder.sh +1 -0
- examples/qwen_coder/scripts/train_coder_30b.sh +1 -0
- examples/qwen_coder/subset_jsonl.py +1 -0
- examples/qwen_coder/todos.md +38 -0
- examples/qwen_coder/validate_jsonl.py +1 -0
- examples/vlm/PROPOSAL.md +53 -0
- examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +33 -0
- examples/warming_up_to_rl/configs/eval_stepwise_consistent.toml +26 -0
- examples/warming_up_to_rl/configs/eval_stepwise_per_achievement.toml +36 -0
- examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +30 -0
- examples/warming_up_to_rl/old/event_rewards.md +234 -0
- examples/warming_up_to_rl/old/notes.md +73 -0
- examples/warming_up_to_rl/run_eval.py +142 -25
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +146 -2
- synth_ai/__init__.py +5 -20
- synth_ai/api/train/builders.py +25 -14
- synth_ai/api/train/cli.py +29 -6
- synth_ai/api/train/env_resolver.py +18 -19
- synth_ai/api/train/supported_algos.py +8 -5
- synth_ai/api/train/utils.py +6 -1
- synth_ai/cli/__init__.py +4 -2
- synth_ai/cli/_storage.py +19 -0
- synth_ai/cli/balance.py +14 -2
- synth_ai/cli/calc.py +37 -22
- synth_ai/cli/legacy_root_backup.py +12 -14
- synth_ai/cli/recent.py +12 -7
- synth_ai/cli/root.py +1 -23
- synth_ai/cli/status.py +4 -3
- synth_ai/cli/task_apps.py +143 -137
- synth_ai/cli/traces.py +4 -3
- synth_ai/cli/watch.py +3 -2
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +738 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
- synth_ai/jobs/client.py +15 -3
- synth_ai/task/server.py +14 -7
- synth_ai/tracing_v3/decorators.py +51 -26
- synth_ai/tracing_v3/examples/basic_usage.py +12 -7
- synth_ai/tracing_v3/llm_call_record_helpers.py +107 -53
- synth_ai/tracing_v3/replica_sync.py +8 -4
- synth_ai/tracing_v3/storage/utils.py +11 -9
- synth_ai/tracing_v3/turso/__init__.py +12 -0
- synth_ai/tracing_v3/turso/daemon.py +2 -1
- synth_ai/tracing_v3/turso/native_manager.py +28 -15
- {synth_ai-0.2.9.dev17.dist-info → synth_ai-0.2.12.dist-info}/METADATA +33 -88
- {synth_ai-0.2.9.dev17.dist-info → synth_ai-0.2.12.dist-info}/RECORD +53 -41
- {synth_ai-0.2.9.dev17.dist-info → synth_ai-0.2.12.dist-info}/top_level.txt +0 -1
- synth/__init__.py +0 -14
- synth_ai/_docs_message.py +0 -10
- synth_ai/main.py +0 -5
- {synth_ai-0.2.9.dev17.dist-info → synth_ai-0.2.12.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.9.dev17.dist-info → synth_ai-0.2.12.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.9.dev17.dist-info → synth_ai-0.2.12.dist-info}/licenses/LICENSE +0 -0
examples/dev/qwen3_32b_qlora_4xh100.toml
@@ -0,0 +1,40 @@
[job]
model = "Qwen/Qwen3-32B"
# Optionally set here; you can also pass --dataset
# data = "/abs/path/to/train.jsonl"

[compute]
gpu_type = "H100"
gpu_count = 4
nodes = 1

[data]
# Optional; forwarded into metadata.effective_config.data.topology
topology = { container_count = 4 }

[training]
mode = "sft_offline"
use_qlora = true

[training.validation]
enabled = true
evaluation_strategy = "steps"
eval_steps = 20
save_best_model_at_end = true
metric_for_best_model = "val.loss"
greater_is_better = false

[hyperparameters]
n_epochs = 1
per_device_batch = 1
gradient_accumulation_steps = 64
sequence_length = 4096
learning_rate = 5e-6
warmup_ratio = 0.03

[hyperparameters.parallelism]
use_deepspeed = true
deepspeed_stage = 2
bf16 = true
fp16 = false
fsdp = false
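As a quick sanity check on the hyperparameters above, a minimal sketch (not part of the package) that loads this TOML and computes the implied global batch, assuming the usual data-parallel interpretation of per_device_batch × gradient_accumulation_steps × gpu_count; whether the backend trainer applies exactly this formula is an assumption:

```python
import tomllib  # Python 3.11+ standard library

# Assumes you run from the repo root where the example config lives.
with open("examples/dev/qwen3_32b_qlora_4xh100.toml", "rb") as fh:
    cfg = tomllib.load(fh)

hp = cfg["hyperparameters"]
gpus = cfg["compute"]["gpu_count"]

# Assumed data-parallel arithmetic: sequences per optimizer step across all GPUs.
global_batch = hp["per_device_batch"] * hp["gradient_accumulation_steps"] * gpus
tokens_per_step = global_batch * hp["sequence_length"]

print(f"sequences per optimizer step: {global_batch}")        # 1 * 64 * 4 = 256
print(f"max tokens per optimizer step: {tokens_per_step:,}")  # 256 * 4096 = 1,048,576
```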
examples/multi_step/crafter_rl_lora.md
@@ -0,0 +1,29 @@
# Crafter RL LoRA (10-step runs)

This walkthrough shows how to fine-tune the Crafter task app with our 10-step RL LoRA config.

1. **Start the Crafter task app on Modal (with tracing + text-only prompts)**

   ```bash
   BACKEND_BASE_URL=https://agent-learning.onrender.com/api \
   uvx synth-ai modal-serve grpo-crafter \
     --env-file examples/warming_up_to_rl/.env \
     --name grpo-crafter-task-app
   ```

   *Deploys the Modal task app with the tracing/text-only fixes baked in.*

2. **Launch the RL job using the updated LoRA config**

   ```bash
   uvx synth-ai train --type rl \
     --config tests/artifacts/configs/rl.lora.small.toml \
     --backend https://agent-learning.onrender.com/api \
     --env-file .env \
     --no-poll
   ```

   *This config forces 10 agent turns per rollout, reduces batch size to avoid OOMs, and enforces Crafter-specific defaults.*

   INFO - 🎉 Training completed successfully!
   INFO - All batch rewards: [0.0625, 0.0625, 0.125, 0.0625, 0.0625, 0.3125, 0.375, 0.4375, 0.5, 0.9375]
examples/multi_step/task_app_config_notes.md
@@ -0,0 +1,488 @@
# Task App Config for Crafter RL: Dense Stepwise Rewards

Goal: Allow configuring the Crafter task app to enable/disable dense (decision-stepwise) event rewards and pass that choice from the RL config, through the backend, into the task app’s `/rollout` execution. This should be broader than just policy config – a `task_app_config` concept – but we can implement with the existing `env.config` today and optionally add a top-level alias later.

## Findings (current behaviour)

- Rollout request contract already supports two config payloads:
  - `env.config: dict` and `policy.config: dict`
- The hosted Crafter rollout implementation already supports decision-stepwise rewards, controlled via a `step_rewards` block in either `policy.config` or `env.config`.
- When active, it computes per-decision “unique achievement” deltas and attaches per-turn metadata; it also returns `decision_samples` when enabled.

Key locations and behaviour:

- Rollout schema (env/policy config):
```51:87:synth-ai/synth_ai/task/contracts.py
class RolloutEnvSpec(BaseModel):
    env_id: str | None = None
    env_name: str | None = None
    config: dict[str, Any] = Field(default_factory=dict)
    seed: int | None = None

class RolloutPolicySpec(BaseModel):
    policy_id: str | None = None
    policy_name: str | None = None
    config: dict[str, Any] = Field(default_factory=dict)
```

- Crafter hosted rollout reads step-reward config from policy, then env; gates on `enabled` and `mode == "decision_stepwise"`:
```1041:1067:synth-ai/examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py
# Stepwise reward configuration (Crafter shaping; gate on explicit enable)
step_rewards_cfg_raw: dict[str, Any] = {}
...
if not step_rewards_cfg_raw:
    if isinstance(request.env.config, dict):
        step_rewards_cfg_raw = dict(request.env.config.get("step_rewards") or {})

step_rewards_enabled = bool(step_rewards_cfg_raw.get("enabled", False))
step_rewards_mode = str(step_rewards_cfg_raw.get("mode") or "off").lower()
...
step_rewards_active = step_rewards_enabled and step_rewards_mode == "decision_stepwise"
```

- When active, it computes decision-level indicators and metadata, and adds to each step’s `info.meta.decision_rewards`; also accumulates `decision_samples`:
```1554:1596:synth-ai/examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py
if step_rewards_active:
    decision_actions = _summarize_tool_calls(pending_tool_calls)
    stepwise_info, decision_record, stats = compute_stepwise_reward(
        prev_achievements or {},
        new_achievement_state,
        decision_index,
        decision_actions,
        step_rewards_indicator_lambda,
    )
    ...
    # Compute decision-level rewards (absolute vs unique) and attach to metadata
    turned_true = set(stepwise_info.get("new_achievements") or [])
    seen_before = set(episode_seen_achievements)
    new_unique = sorted(turned_true - seen_before)
    ach_delta = int(len(turned_true))
    unique_delta = int(len(new_unique))
    meta_block = (_info.get("meta") if isinstance(_info.get("meta"), dict) else {})
    decision_rewards = {"turn": int(decision_index), "ach_delta": ach_delta, "unique_delta": unique_delta, "all": all_list, "unique": new_unique}
    decision_rewards_meta = decision_rewards
    meta_block["decision_rewards"] = decision_rewards
    _info["meta"] = meta_block
    episode_seen_achievements.update(turned_true)
    decision_samples.append(decision_record)
```

- The simpler published Crafter app (`examples/warming_up_to_rl/task_app/grpo_crafter.py`) sets sane defaults for `step_rewards` in both env and policy when it aliases math → crafter, but the hosted rollout above is the one actually used in production paths.
```479:490:synth-ai/examples/warming_up_to_rl/task_app/grpo_crafter.py
env_cfg.setdefault("step_rewards", dict(DEFAULT_ALIAS_STEP_REWARDS))
...
policy_cfg.setdefault("step_rewards", dict(DEFAULT_ALIAS_STEP_REWARDS))
```

- Backend RPC: The backend constructs the rollout HTTP payload with both `env_config` and `policy_config`; these are forwarded to the task app `/rollout`:
```456:470:monorepo/backend/app/routes/clustered_training/core/algorithms/gspo/evaluation/evaluator.py
payload = {
    "run_id": run_id,
    "env": {"env_name": env_name, "config": env_config, "seed": seed},
    "policy": {"policy_name": policy_name, "config": policy_config},
    "ops": ops,
    "record": {"trajectories": True, "logprobs": False, "value": False},
    "on_done": on_done,
}
```

- RL config ingestion: The CLI forwards the full TOML in the job payload. The backend trainer flattens some rollout options and (optionally) picks up `rollout.env_config`:
```364:393:monorepo/backend/app/routes/clustered_training/core/algorithms/gspo/training/clustered_trainer.py
# Rollout config
if "rollout" in config_dict:
    flat_config["env_name"] = config_dict["rollout"].get("env_name", "crafter")
    ...
    if "env_config" in config_dict["rollout"]:
        flat_config["env_config"] = config_dict["rollout"]["env_config"]
```

Implication: We can carry a broader "task app config" today via `rollout.env_config` without changing wire contracts. The task app already reads `env.config.step_rewards`.

## Proposed configuration shape (TOML)

Recommended to use `rollout.env_config.step_rewards` so the backend passes it through to the task app:

```toml
[rollout]
env_name = "crafter"
policy_name = "crafter-react"
max_turns = 10
ops = ["agent", "env"]

[rollout.env_config.step_rewards]
# Toggle dense per-decision rewards
enabled = true
# Supported: "off" | "decision_stepwise" | (future) "env_sparse"
mode = "decision_stepwise"
# Reward = indicator_lambda * I(unique_achievements_delta > 0)
indicator_lambda = 1.0
# Reserved for shaped/intermediate signals (currently unused)
step_beta = 0.0
```

Optional (policy sampling, still supported via `policy.config` in the task app runner):

```toml
[rollout.policy_config]
temperature = 0.2
top_p = 0.95
max_tokens = 512
```

Notes:
- The hosted Crafter rollout checks `policy.config.step_rewards` first, then falls back to `env.config.step_rewards`. Prefer `env_config` as the canonical place for app-level settings.
- If you want the app to disable stepwise rewards entirely, set `enabled=false` or `mode="off"`.

## Wire and data flow

1) CLI → Backend: CLI includes the entire TOML in the job payload (`build_rl_payload`).
2) Backend → Trainer: Trainer flattens rollout properties and can include `env_config`.
3) Trainer → Task App: Rollout HTTP payload embeds `env.config` and `policy.config`.
4) Task App: Hosted rollout computes decision-level metadata and returns:
   - `RolloutStep.info.meta.decision_rewards` with `{turn, ach_delta, unique_delta, ...}`
   - `trajectory.decision_samples` summarizing per-turn reward inputs
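To make steps 1–3 concrete, a minimal sketch follows; it assumes the TOML shape proposed above and mirrors the payload field names from the evaluator excerpt, while the exact trainer plumbing in between is not shown and is an assumption:

```python
import tomllib

# Hypothetical path to an RL TOML using the shape proposed above.
with open("crafter_rl_config.toml", "rb") as fh:
    cfg = tomllib.load(fh)

rollout = cfg.get("rollout", {})

# Shape of the /rollout payload the task app receives (mirrors the evaluator excerpt):
# env.config carries rollout.env_config (including step_rewards);
# policy.config carries rollout.policy_config.
payload = {
    "run_id": "example-run",
    "env": {
        "env_name": rollout.get("env_name", "crafter"),
        "config": rollout.get("env_config", {}),
        "seed": 0,
    },
    "policy": {
        "policy_name": rollout.get("policy_name", "crafter-react"),
        "config": rollout.get("policy_config", {}),
    },
    "ops": rollout.get("ops", ["agent", "env"]),
}

print(payload["env"]["config"].get("step_rewards"))
# e.g. {'enabled': True, 'mode': 'decision_stepwise', 'indicator_lambda': 1.0, 'step_beta': 0.0}
```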

## Minimal code changes to adopt

- synth-ai (optional):
  - Add example configs under `examples/warming_up_to_rl/configs/*.toml` using `[rollout.env_config.step_rewards]`.
  - Document this block in docs and the multi_step walkthrough.

- monorepo backend:
  - Verify trainer always passes `rollout.env_config` (ClusteredTrainerConfig appears to support it; ensure it flows into the runtime’s rollout request builder in the trainer where the payload is assembled).
  - No contract changes needed: task app already reads from `env.config`.

- Task App:
  - Already supports the block; no changes needed for the hosted Crafter rollout.
  - If you want a first-class `task_app_config` top-level, we can add an alias resolver that copies `config["task_app_config"]` → `env.config` inside the rollout executor.
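For that last bullet, a minimal sketch of what such an alias resolver could look like, assuming a plain dict-shaped rollout request; the `resolve_task_app_config` name and its placement are hypothetical:

```python
from typing import Any


def resolve_task_app_config(request: dict[str, Any]) -> dict[str, Any]:
    """Hypothetical helper: fold a top-level `task_app_config` block into env.config.

    Existing keys in env.config win, so explicit per-rollout settings are not clobbered.
    """
    task_app_cfg = request.get("task_app_config")
    if isinstance(task_app_cfg, dict):
        env = request.setdefault("env", {})
        env_cfg = env.setdefault("config", {})
        for key, value in task_app_cfg.items():
            env_cfg.setdefault(key, value)
    return request


# Example: a request carrying step_rewards under task_app_config.
req = {
    "task_app_config": {"step_rewards": {"enabled": True, "mode": "decision_stepwise"}},
    "env": {"env_name": "crafter", "config": {}},
}
assert resolve_task_app_config(req)["env"]["config"]["step_rewards"]["enabled"] is True
```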

## Open questions / follow-ups

- Does the current trainer consume `decision_samples` or `step.info.meta.decision_rewards` for credit assignment? If not, wire this into the per-step reward/advantage pipeline.
- Decide whether to disable the default enabling of stepwise rewards in `grpo_crafter.py` aliases (`DEFAULT_ALIAS_STEP_REWARDS`) so the TOML fully drives behaviour.
- Standardize on `env_config.step_rewards` for app-level settings across environments.

## Reference: Crafter RL LoRA example (expected first 10 rewards)
These are the first ten batch rewards printed at RL start:
```
- INFO - All batch rewards: [0.0625, 0.0625, 0.125, 0.0625, 0.0625, 0.3125, 0.375, 0.4375, 0.5, 0.9375]
```

---

## Enable stepwise during EVALS and compare vs final

We can enable stepwise shaping for evaluation-only runs and compare “stepwise” vs “final (outcome)” returns.

Two evaluation paths exist today:

- Backend evaluator endpoint (preferred for hosted):
```1114:1136:monorepo/backend/app/routes/clustered_training/core/routes.py
class RlEvaluateRequest(BaseModel):
    model: str
    seeds: list[int]
    rollouts_per_seed: int = 1
    env_name: str
    env_config: Dict[str, Any] = Field(default_factory=dict)
    policy_name: str
    thinking_mode: str
    thinking_budget: int | None = None
    max_steps_per_episode: int = 100
    max_concurrent_rollouts: int = 8
    on_done: str = "terminate"
    task_service_url: str | None = None
    vllm_url: str | None = None
    vllm_public_url: str | None = None
```

Pass `env_config.step_rewards` here to turn on stepwise shaping during evals (no trainer changes needed). The evaluator will forward `env_config` into each rollout:
```383:396:monorepo/backend/app/routes/clustered_training/core/algorithms/gspo/evaluation/evaluator.py
payload = {
    "env": {"env_name": env_name, "config": env_config, ...},
    "policy": {"policy_name": policy_name, "config": policy_config},
    ...
}
```

Task app already computes and attaches:
- Per-decision metadata at `step.info.meta.decision_rewards`
- Aggregates we can expose (see below) for stepwise vs final

Recommended enhancement (small change in task app): include a summary under `response.metrics.details.stepwise` so eval clients don’t need to parse per-step:
```python
metrics.details["stepwise"] = {
    "indicator_sum": stepwise_indicator_sum,
    "reward_sum": stepwise_reward_sum,
    "new_achievements_total": stepwise_new_achievements_total,
}
```

For local SDK evals (without backend), call the `/rollout` endpoint directly with the same `env.config.step_rewards` block.

Example payload fragment:
```json
{
  "env": {
    "env_name": "crafter",
    "config": {
      "step_rewards": { "enabled": true, "mode": "decision_stepwise", "indicator_lambda": 1.0 }
    },
    "seed": 0
  },
  "policy": { "policy_name": "crafter-react", "config": {"temperature": 0.2} },
  "ops": ["agent", "env"]
}
```

---

## Simple vs Complex stepwise modes (proposal)

Add a `strategy` under the existing `step_rewards` block:

```toml
[rollout.env_config.step_rewards]
enabled = true
mode = "decision_stepwise"  # gate remains the same
strategy = "simple"         # "simple" | "complex"
indicator_lambda = 1.0

# Complex-only (optional)
weights = { collect_sapling = 0.1, craft_wood_pickaxe = 0.3, collect_diamond = 1.0 }
k_limits = { collect_sapling = 1, craft_wood_pickaxe = 2, collect_diamond = 3 }
```

Behaviour:
- strategy="simple": reward 1.0 × indicator_lambda if any new achievement is unlocked at that decision, else 0. (Current logic already does this; just make it explicit.)
- strategy="complex":
  - Maintain per-episode `achieve_count[name]`.
  - For each achievement newly unlocked at the decision, if `achieve_count[name] < k_limits.get(name, 1)`, add `weights.get(name, 1.0)` to the stepwise reward and increment the count.
  - The uniqueness baseline should be the “turned true this decision” set; combining with episode-level uniqueness is optional if we intend multiple rewards up to K.
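A minimal sketch of the complex strategy as described above, assuming per-episode counts are threaded through the rollout loop; `compute_complex_stepwise_reward` is an illustrative name, not the existing helper:

```python
from typing import Dict


def compute_complex_stepwise_reward(
    prev_achievements: Dict[str, bool],
    new_achievements: Dict[str, bool],
    weights: Dict[str, float],
    k_limits: Dict[str, int],
    episode_ach_counts: Dict[str, int],
) -> float:
    """Illustrative sketch: weighted, K-capped reward for achievements newly unlocked this decision."""
    turned_true = [
        name for name, unlocked in new_achievements.items()
        if unlocked and not prev_achievements.get(name, False)
    ]
    reward = 0.0
    for name in turned_true:
        if episode_ach_counts.get(name, 0) < k_limits.get(name, 1):
            reward += weights.get(name, 1.0)
            episode_ach_counts[name] = episode_ach_counts.get(name, 0) + 1
    return reward


# Example: the first diamond is worth 1.0; a sapling beyond its K-limit adds nothing.
counts: Dict[str, int] = {"collect_sapling": 1}
r = compute_complex_stepwise_reward(
    {"collect_sapling": True},
    {"collect_sapling": True, "collect_diamond": True},
    weights={"collect_sapling": 0.1, "collect_diamond": 1.0},
    k_limits={"collect_sapling": 1, "collect_diamond": 3},
    episode_ach_counts=counts,
)
assert r == 1.0
```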

Minimal code touch points:
- synth-ai task app (hosted Crafter rollout):
  - Extend `compute_stepwise_reward(prev_achievements, new_achievements, decision_index, actions_summary, indicator_lambda)` to optionally take `strategy`, `weights`, `k_limits`, and a `counts` dict.
  - Thread an `episode_ach_counts: Dict[str, int]` through the rollout loop (similar to `episode_seen_achievements`).
  - Build `reward_stepwise` as per strategy; keep existing `decision_rewards` metadata (ach/unique deltas) unchanged.
  - Add `metrics.details["stepwise"]` summary (indicator_sum, reward_sum, new_achievements_total).

- monorepo backend (evals):
  - No contract change: pass the same `env_config.step_rewards` in `RlEvaluateRequest.env_config`.
  - For convenience, surface stepwise summary in any eval aggregation/CSV if present under `metrics.details.stepwise`.

Open choice:
- Either keep `mode="decision_stepwise"` and add `strategy`, or introduce `mode` values `{ "simple_stepwise", "complex_stepwise" }`. The former is backward compatible and clearer.

Testing plan:
- Unit-test `compute_stepwise_reward` for both strategies with synthetic prev/new achievement maps.
- Smoke eval over a few seeds with `strategy=simple` and `strategy=complex` to compare `metrics.details.stepwise.reward_sum` vs `metrics.mean_return`.

---

## Eval script scope: Groq Qwen/Qwen3-32B stepwise vs outcome

Objective: run many Crafter rollouts against Groq `Qwen/Qwen3-32B` and compare distributions and correlations between stepwise rewards and final (outcome) rewards, for both simple and complex stepwise strategies.

Inputs/flags:
- `--task-url` Task app base URL (Modal deployment)
- `--env-key` ENVIRONMENT_API_KEY (or from `.env`)
- `--model` default `Qwen/Qwen3-32B`
- `--seeds` list or `--num-seeds` N (use 0..N-1)
- `--rollouts-per-seed` default 3
- `--max-turns` default 10
- `--strategy` `simple|complex|both` (default both)
- `--weights-json` optional JSON path for complex weighting
- `--k-limits-json` optional JSON path for complex K-limits
- `--out` output directory for CSV/plots

What it does:
1) Builds rollout payloads for each seed and strategy variant.
2) For each rollout, passes `env.config.step_rewards` with:
   - common: `{ enabled: true, mode: "decision_stepwise" }`
   - simple: `strategy: "simple", indicator_lambda: 1.0`
   - complex: `strategy: "complex", weights, k_limits`
3) Uses policy config to route inference to Groq with the requested model.
4) Collects per-rollout summary:
   - `final_return = response.metrics.mean_return`
   - `step_indicator_sum`, `step_reward_sum`, `new_achievements_total` from `metrics.details.stepwise` (or compute from steps if absent)
   - counts like `num_steps`, `tool_calls_total`
5) Writes a wide CSV with one row per rollout, including seed, strategy, and the above fields.
6) Visualizes:
   - Histogram of `step_reward_sum` by strategy
   - Scatter: `step_reward_sum` vs `final_return`, per strategy (with Pearson/Spearman r)
   - Optional ECDFs for indicator_sum

Data schema (CSV):
```
seed,int | rollout_idx,int | strategy,str | final_return,float | step_reward_sum,float |
step_indicator_sum,float | new_achievements_total,int | num_steps,int | tool_calls_total,int |
model,str | max_turns,int | timestamp,iso
```

Pseudocode (Python):
```python
import os, json, csv, time, math, statistics
import httpx

TASK_URL = os.environ.get("TASK_APP_URL")
ENV_KEY = os.environ.get("ENVIRONMENT_API_KEY")

def build_step_cfg(strategy, weights=None, k_limits=None):
    cfg = {"enabled": True, "mode": "decision_stepwise", "strategy": strategy, "indicator_lambda": 1.0}
    if strategy == "complex":
        if weights: cfg["weights"] = weights
        if k_limits: cfg["k_limits"] = k_limits
    return cfg

async def run_rollout(seed, strategy, model, max_turns, weights, k_limits):
    step_cfg = build_step_cfg(strategy, weights, k_limits)
    payload = {
        "run_id": f"eval-{seed}-{strategy}-{int(time.time())}",
        "env": {"env_name": "crafter", "seed": seed, "config": {"step_rewards": step_cfg, "env_params": {"max_steps_per_episode": max_turns}}},
        "policy": {"policy_name": "crafter-react", "config": {"inference_url": "https://groq.synth-ai.internal/proxy", "model": model, "temperature": 0.2, "top_p": 0.95, "max_tokens": 512}},
        "ops": ["agent", "env"] * max_turns,
        "record": {"trajectories": True},
        "on_done": "terminate",
    }
    async with httpx.AsyncClient(timeout=300.0) as client:
        r = await client.post(f"{TASK_URL}/rollout", headers={"X-API-Key": ENV_KEY}, json=payload)
        r.raise_for_status()
        resp = r.json()
    met = resp.get("metrics", {})
    details = met.get("details", {})
    step = details.get("stepwise", {})
    final_return = float(met.get("mean_return") or 0.0)
    step_reward_sum = float(step.get("reward_sum") or 0.0)
    step_indicator_sum = float(step.get("indicator_sum") or 0.0)
    new_ach_total = int(step.get("new_achievements_total") or 0)
    num_steps = int(met.get("num_steps") or 0)
    tool_calls_total = sum(len(s.get("tool_calls", [])) for s in (resp.get("trajectories", [{}])[0].get("steps", []))) if resp.get("trajectories") else 0
    return {
        "seed": seed, "strategy": strategy, "final_return": final_return,
        "step_reward_sum": step_reward_sum, "step_indicator_sum": step_indicator_sum,
        "new_achievements_total": new_ach_total, "num_steps": num_steps,
        "tool_calls_total": tool_calls_total,
    }
```

CLI example:
```bash
uv run python tools/eval_stepwise_vs_final.py \
  --task-url $TASK_APP_URL \
  --env-key $ENVIRONMENT_API_KEY \
  --model "Qwen/Qwen3-32B" \
  --num-seeds 100 --rollouts-per-seed 3 --max-turns 10 \
  --strategy both --out results/qwen32b
```

Notes:
- The correlation/plots can be produced with `matplotlib` or `plotly`; write PNG + HTML.
- If `metrics.details.stepwise` is not yet populated by the task app, compute `indicator_sum` and `reward_sum` on the client by scanning `steps[].info.meta.decision_rewards`.
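For that fallback, a minimal client-side sketch, assuming the response shape used in the pseudocode above (`trajectories[].steps[].info.meta.decision_rewards`); note that for the complex strategy the true per-step reward is not recoverable from the deltas alone, so this reconstructs the simple-strategy sums:

```python
from typing import Any, Dict


def stepwise_sums_from_steps(resp: Dict[str, Any], indicator_lambda: float = 1.0) -> Dict[str, float]:
    """Fallback aggregation when metrics.details.stepwise is absent:
    scan steps[].info.meta.decision_rewards and rebuild the sums client-side."""
    indicator_sum = 0.0
    reward_sum = 0.0
    new_achievements_total = 0
    for traj in resp.get("trajectories") or []:
        for step in traj.get("steps") or []:
            meta = (step.get("info") or {}).get("meta") or {}
            dr = meta.get("decision_rewards") or {}
            unique_delta = int(dr.get("unique_delta") or 0)
            if unique_delta > 0:
                indicator_sum += 1.0
                reward_sum += indicator_lambda  # simple strategy only; complex rewards are not in the deltas
            new_achievements_total += int(dr.get("ach_delta") or 0)
    return {
        "indicator_sum": indicator_sum,
        "reward_sum": reward_sum,
        "new_achievements_total": new_achievements_total,
    }
```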

### Output artifacts (JSON + Markdown)

Directory layout under `--out` (example: `results/qwen32b`):

- `runs/` — one JSONL file per strategy with one record per rollout
  - `runs/simple.jsonl`
  - `runs/complex.jsonl`
- `summary/`
  - `summary.json` — aggregates per strategy (mean/median/std, correlations, counts)
  - `stats_by_seed.json` — per-seed aggregates
  - `config_snapshot.json` — CLI args, weights, k-limits, timestamp, git SHA
- `plots/`
  - `hist_step_reward_simple.png`, `hist_step_reward_complex.png`
  - `scatter_step_reward_vs_final_simple.png`, `scatter_step_reward_vs_final_complex.png`
  - `ecdf_indicator_simple.png`, `ecdf_indicator_complex.png`
- `report.md` — human-friendly Markdown summary linking to plots

Record schema (per line in `runs/*.jsonl`):

```json
{
  "seed": 0,
  "rollout_idx": 1,
  "strategy": "simple", // or "complex"
  "final_return": 0.9375,
  "step_reward_sum": 1.2,
  "step_indicator_sum": 3.0,
  "new_achievements_total": 3,
  "num_steps": 10,
  "tool_calls_total": 12,
  "model": "Qwen/Qwen3-32B",
  "max_turns": 10,
  "timestamp": "2025-10-17T22:14:05Z",
  "meta": {
    "task_url": "...",
    "weights": {"collect_diamond": 1.0}, // only for complex
    "k_limits": {"collect_diamond": 3} // only for complex
  }
}
```

Markdown report (`report.md`) outline:

- Title, timestamp, model, config summary
- Table of global aggregates per strategy (mean, median, std of `final_return`, `step_reward_sum`, correlations r/Pearson, ρ/Spearman); see the sketch below
- Small tables per-seed (top-N seeds by return)
- Inline images linking to `plots/*.png`
- Pointers to raw data files and `config_snapshot.json`
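A minimal sketch of the `summary.json` aggregates referenced above, using only the standard library (`statistics.correlation` needs Python 3.10+); the record fields follow the JSONL schema above:

```python
import statistics
from typing import Dict, List, Sequence


def _ranks(values: Sequence[float]) -> List[float]:
    """1-based average ranks, ties share their mean rank (used for Spearman)."""
    order = sorted(range(len(values)), key=lambda i: values[i])
    ranks = [0.0] * len(values)
    i = 0
    while i < len(order):
        j = i
        while j + 1 < len(order) and values[order[j + 1]] == values[order[i]]:
            j += 1
        avg_rank = (i + j) / 2 + 1
        for k in range(i, j + 1):
            ranks[order[k]] = avg_rank
        i = j + 1
    return ranks


def summarize_strategy(rows: List[Dict]) -> Dict:
    """Per-strategy aggregates: mean/median/std plus Pearson r and Spearman rho
    between step_reward_sum and final_return. `rows` come from runs/*.jsonl."""
    step = [float(r["step_reward_sum"]) for r in rows]
    final = [float(r["final_return"]) for r in rows]
    summary = {
        "count": len(rows),
        "final_return": {"mean": statistics.fmean(final), "median": statistics.median(final), "std": statistics.pstdev(final)},
        "step_reward_sum": {"mean": statistics.fmean(step), "median": statistics.median(step), "std": statistics.pstdev(step)},
    }
    # statistics.correlation raises StatisticsError if either series is constant.
    if len(rows) >= 2 and len(set(step)) > 1 and len(set(final)) > 1:
        summary["pearson_r"] = statistics.correlation(step, final)
        summary["spearman_rho"] = statistics.correlation(_ranks(step), _ranks(final))
    return summary

# Usage sketch: rows = [json.loads(line) for line in open("results/qwen32b/runs/simple.jsonl")]
```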

---

I'll scan both repos for Crafter RL, policy/task app config, rollout calls, and backend RL endpoints, then draft notes under `examples/multi_step/`.

[5 tools called]

I'll open the Crafter task app implementation to inspect rubrics and where stepwise reward configuration is applied, then check the RL evaluator to see how env/policy config is passed to the task app.

[11 tools called]

I scanned both repos to map how stepwise rewards already work in the Crafter task app and how rollout configs flow from the RL TOML into the task app. I'll add a detailed notes document under `examples/multi_step/` with findings, code references, and a concrete config proposal.

[6 tools called]

- I researched Crafter RL and documented how to toggle dense stepwise rewards and flow the config end-to-end.
- Notes added at `synth-ai/examples/multi_step/task_app_config_notes.md` with exact code references and a TOML example.

Key points:
- Use `rollout.env_config.step_rewards` in the RL TOML; the task app already reads `policy.config.step_rewards` first, then `env.config.step_rewards`.
- Enable with:
```toml
[rollout.env_config.step_rewards]
enabled = true
mode = "decision_stepwise"
indicator_lambda = 1.0
step_beta = 0.0
```
- Backend already forwards `env_config` and `policy_config` in rollout requests; no wire change needed. Ensure trainer populates `env_config` when present.
- Task app attaches per-turn `decision_rewards` and returns `decision_samples`; consider consuming these in credit assignment.

References:
- Task contracts and rollout schema:
```51:87:synth-ai/synth_ai/task/contracts.py
class RolloutEnvSpec(BaseModel):
    ...
    config: dict[str, Any] = Field(default_factory=dict)
    ...
class RolloutPolicySpec(BaseModel):
    ...
    config: dict[str, Any] = Field(default_factory=dict)
```
- Crafter stepwise config reading and reward metadata:
```1041:1067:synth-ai/examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py
# Stepwise reward configuration ...
```
```1554:1596:synth-ai/examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py
# Compute decision-level rewards ...
```
- Backend rollout payload shape:
```456:470:monorepo/backend/app/routes/clustered_training/core/algorithms/gspo/evaluation/evaluator.py
payload = {
    "run_id": run_id,
    "env": {"env_name": env_name, "config": env_config, "seed": seed},
    "policy": {"policy_name": policy_name, "config": policy_config},
    ...
}
```

Status: Notes committed on branch `friday-cleanup` and pushed.
examples/qwen_coder/todos.md
@@ -0,0 +1,38 @@
# Qwen Coder – Remaining TODOs

- [ ] Add small-base LoRA config for quick iteration
  - Create `configs/coder_lora_4b.toml` (base=`Qwen/Qwen3-4B`, 1x H100, LoRA all-linear, same hyperparameters structure as 30B).

- [ ] Improve SFT submission script (sft_lora_30b.py)
  - Include `metadata.effective_config.compute` in job payload (gpu_type, gpu_count, nodes) so the API doesn’t 400 without a TOML.
  - Write the resulting `ft:<id>` to `examples/qwen_coder/ft_data/ft_model_id.txt` and print it clearly.
  - Add optional validation file support when present.

- [ ] Add post-SFT inference script
  - Read `ft_data/ft_model_id.txt` and call the prod proxy (or SDK InferenceClient) to verify the finetuned adapter returns.
  - Save a short transcript to `ft_data/ft_infer_smoke.txt`.

- [ ] Add inference smoke tests (local opt-in)
  - `tests/qwen_coder/test_infer_prod_proxy.py` (skips unless `SYNTH_API_KEY` is set). Hits `/api/inference/v1/chat/completions` with `Qwen/Qwen3-Coder-30B-A3B-Instruct` and asserts 200/choices.
  - Optional: same test for an `ft:<id>` if `FT_MODEL_ID` env is provided.

- [ ] Document end-to-end flow in README
  - Expand README with an explicit env section (`SYNTH_API_KEY`, `BACKEND_BASE_URL`).
  - Show: generate dataset → run LoRA (4B or 30B) → poll → infer with `ft:<id>`.
  - Mention cost/time caveats for 30B.

- [ ] Dataset utilities
  - Add `validate_jsonl.py` to check that the first N lines parse and contain the `messages`/`assistant` fields required by SFT (see the sketch after this list).
  - Add `subset_jsonl.py` to create capped training sets for quick runs.

- [ ] Optional: CLI convenience wrappers
  - `scripts/train_coder_30b.sh` to invoke `uvx synth-ai train --type sft --config configs/coder_lora_30b.toml --dataset ft_data/coder_sft.small.jsonl` with `.env` preload.
  - `scripts/infer_coder.sh` to run `infer_prod_proxy.py` against base or `ft:<id>`.

- [ ] Optional CI (requires secrets)
  - GitHub workflow job (smoke) that runs `infer_prod_proxy.py` with the `SYNTH_API_KEY` secret and prints the first 200 chars of assistant output.

- [ ] (If needed) Add coder variants
  - If the backend supports additional coder SKUs, append to `synth_ai/api/models/supported.py:QWEN3_CODER_MODELS` so SDK validation passes (SFT/inference).
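For the `validate_jsonl.py` item in the dataset-utilities TODO above, a minimal sketch, assuming the SFT format the TODO describes (each JSONL line holds a `messages` list with at least one assistant turn); the CLI shape is illustrative:

```python
#!/usr/bin/env python3
"""Illustrative sketch of validate_jsonl.py: check that the first N lines of an SFT
JSONL file parse and contain a `messages` list with at least one assistant turn."""
import argparse
import json
import sys


def validate(path: str, limit: int) -> int:
    errors = 0
    with open(path, "r", encoding="utf-8") as fh:
        for lineno, line in enumerate(fh, start=1):
            if lineno > limit:
                break
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as exc:
                print(f"line {lineno}: invalid JSON ({exc})")
                errors += 1
                continue
            messages = record.get("messages")
            if not isinstance(messages, list) or not messages:
                print(f"line {lineno}: missing or empty 'messages'")
                errors += 1
            elif not any(m.get("role") == "assistant" for m in messages if isinstance(m, dict)):
                print(f"line {lineno}: no assistant message")
                errors += 1
    return errors


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Validate the first N lines of an SFT JSONL file.")
    parser.add_argument("path")
    parser.add_argument("--limit", type=int, default=100)
    args = parser.parse_args()
    sys.exit(1 if validate(args.path, args.limit) else 0)
```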