synth-ai 0.2.10__py3-none-any.whl → 0.2.13.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of synth-ai might be problematic; see the registry listing for details.
- examples/agora_ex/README_MoE.md +224 -0
- examples/agora_ex/__init__.py +7 -0
- examples/agora_ex/agora_ex.py +65 -0
- examples/agora_ex/agora_ex_task_app.py +590 -0
- examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +121 -0
- examples/agora_ex/reward_fn_grpo-human.py +129 -0
- examples/agora_ex/system_prompt_CURRENT.md +63 -0
- examples/agora_ex/task_app/agora_ex_task_app.py +590 -0
- examples/agora_ex/task_app/reward_fn_grpo-human.py +129 -0
- examples/agora_ex/task_app/system_prompt_CURRENT.md +63 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +74 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +175 -0
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +83 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +78 -0
- examples/multi_step/crafter_rl_lora.md +51 -10
- examples/multi_step/sse_metrics_streaming_notes.md +357 -0
- examples/multi_step/task_app_config_notes.md +494 -0
- examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +35 -0
- examples/warming_up_to_rl/configs/eval_stepwise_consistent.toml +26 -0
- examples/warming_up_to_rl/configs/eval_stepwise_per_achievement.toml +36 -0
- examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +32 -0
- examples/warming_up_to_rl/run_eval.py +267 -41
- examples/warming_up_to_rl/task_app/grpo_crafter.py +3 -33
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +109 -45
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +42 -46
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +376 -193
- synth_ai/__init__.py +41 -1
- synth_ai/api/train/builders.py +74 -33
- synth_ai/api/train/cli.py +29 -6
- synth_ai/api/train/configs/__init__.py +44 -0
- synth_ai/api/train/configs/rl.py +133 -0
- synth_ai/api/train/configs/sft.py +94 -0
- synth_ai/api/train/configs/shared.py +24 -0
- synth_ai/api/train/env_resolver.py +18 -19
- synth_ai/api/train/supported_algos.py +8 -5
- synth_ai/api/train/utils.py +6 -1
- synth_ai/cli/__init__.py +4 -2
- synth_ai/cli/_storage.py +19 -0
- synth_ai/cli/balance.py +14 -2
- synth_ai/cli/calc.py +37 -22
- synth_ai/cli/demo.py +38 -39
- synth_ai/cli/legacy_root_backup.py +12 -14
- synth_ai/cli/recent.py +12 -7
- synth_ai/cli/rl_demo.py +81 -102
- synth_ai/cli/status.py +4 -3
- synth_ai/cli/task_apps.py +146 -137
- synth_ai/cli/traces.py +4 -3
- synth_ai/cli/watch.py +3 -2
- synth_ai/demos/core/cli.py +121 -159
- synth_ai/environments/examples/crafter_classic/environment.py +16 -0
- synth_ai/evals/__init__.py +15 -0
- synth_ai/evals/client.py +85 -0
- synth_ai/evals/types.py +42 -0
- synth_ai/jobs/client.py +15 -3
- synth_ai/judge_schemas.py +127 -0
- synth_ai/rubrics/__init__.py +22 -0
- synth_ai/rubrics/validators.py +126 -0
- synth_ai/task/server.py +14 -7
- synth_ai/tracing_v3/decorators.py +51 -26
- synth_ai/tracing_v3/examples/basic_usage.py +12 -7
- synth_ai/tracing_v3/llm_call_record_helpers.py +107 -53
- synth_ai/tracing_v3/replica_sync.py +8 -4
- synth_ai/tracing_v3/serialization.py +130 -0
- synth_ai/tracing_v3/storage/utils.py +11 -9
- synth_ai/tracing_v3/turso/__init__.py +12 -0
- synth_ai/tracing_v3/turso/daemon.py +2 -1
- synth_ai/tracing_v3/turso/native_manager.py +28 -15
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/METADATA +4 -2
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/RECORD +73 -40
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/entry_points.txt +0 -1
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/top_level.txt +0 -0
examples/multi_step/task_app_config_notes.md
@@ -0,0 +1,494 @@

# Task App Config for Crafter RL: Dense Stepwise Rewards

Goal: Allow configuring the Crafter task app to enable/disable dense (decision-stepwise) event rewards and pass that choice from the RL config, through the backend, into the task app's `/rollout` execution. This should be broader than just policy config – a `task_app_config` concept – but we can implement it with the existing `env.config` today and optionally add a top-level alias later.

## Findings (current behaviour)

- Rollout request contract already supports two config payloads:
  - `env.config: dict` and `policy.config: dict`
- The hosted Crafter rollout implementation already supports decision-stepwise rewards, controlled via a `step_rewards` block in either `policy.config` or `env.config`.
- When active, it computes per-decision "unique achievement" deltas and attaches per-turn metadata; it also returns `decision_samples` when enabled.

Key locations and behaviour:

- Rollout schema (env/policy config):

```51:87:synth-ai/synth_ai/task/contracts.py
class RolloutEnvSpec(BaseModel):
    env_id: str | None = None
    env_name: str | None = None
    config: dict[str, Any] = Field(default_factory=dict)
    seed: int | None = None

class RolloutPolicySpec(BaseModel):
    policy_id: str | None = None
    policy_name: str | None = None
    config: dict[str, Any] = Field(default_factory=dict)
```

- Crafter hosted rollout reads step-reward config from policy, then env; gates on `enabled` and `mode == "decision_stepwise"`:

```1041:1067:synth-ai/examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py
# Stepwise reward configuration (Crafter shaping; gate on explicit enable)
step_rewards_cfg_raw: dict[str, Any] = {}
...
if not step_rewards_cfg_raw:
    if isinstance(request.env.config, dict):
        step_rewards_cfg_raw = dict(request.env.config.get("step_rewards") or {})

step_rewards_enabled = bool(step_rewards_cfg_raw.get("enabled", False))
step_rewards_mode = str(step_rewards_cfg_raw.get("mode") or "off").lower()
...
step_rewards_active = step_rewards_enabled and step_rewards_mode == "decision_stepwise"
```

- When active, it computes decision-level indicators and metadata, adds them to each step's `info.meta.decision_rewards`, and also accumulates `decision_samples`:

```1554:1596:synth-ai/examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py
if step_rewards_active:
    decision_actions = _summarize_tool_calls(pending_tool_calls)
    stepwise_info, decision_record, stats = compute_stepwise_reward(
        prev_achievements or {},
        new_achievement_state,
        decision_index,
        decision_actions,
        step_rewards_indicator_lambda,
    )
    ...
    # Compute decision-level rewards (absolute vs unique) and attach to metadata
    turned_true = set(stepwise_info.get("new_achievements") or [])
    seen_before = set(episode_seen_achievements)
    new_unique = sorted(turned_true - seen_before)
    ach_delta = int(len(turned_true))
    unique_delta = int(len(new_unique))
    meta_block = (_info.get("meta") if isinstance(_info.get("meta"), dict) else {})
    decision_rewards = {"turn": int(decision_index), "ach_delta": ach_delta, "unique_delta": unique_delta, "all": all_list, "unique": new_unique}
    decision_rewards_meta = decision_rewards
    meta_block["decision_rewards"] = decision_rewards
    _info["meta"] = meta_block
    episode_seen_achievements.update(turned_true)
    decision_samples.append(decision_record)
```

- The simpler published Crafter app (`examples/warming_up_to_rl/task_app/grpo_crafter.py`) sets sane defaults for `step_rewards` in both env and policy when it aliases math → crafter, but the hosted rollout above is the one actually used in production paths.

```479:490:synth-ai/examples/warming_up_to_rl/task_app/grpo_crafter.py
env_cfg.setdefault("step_rewards", dict(DEFAULT_ALIAS_STEP_REWARDS))
...
policy_cfg.setdefault("step_rewards", dict(DEFAULT_ALIAS_STEP_REWARDS))
```

- Backend RPC: the backend constructs the rollout HTTP payload with both `env_config` and `policy_config`; these are forwarded to the task app `/rollout`:

```456:470:monorepo/backend/app/routes/clustered_training/core/algorithms/gspo/evaluation/evaluator.py
payload = {
    "run_id": run_id,
    "env": {"env_name": env_name, "config": env_config, "seed": seed},
    "policy": {"policy_name": policy_name, "config": policy_config},
    "ops": ops,
    "record": {"trajectories": True, "logprobs": False, "value": False},
    "on_done": on_done,
}
```

- RL config ingestion: the CLI forwards the full TOML in the job payload. The backend trainer flattens some rollout options and (optionally) picks up `rollout.env_config`:

```364:393:monorepo/backend/app/routes/clustered_training/core/algorithms/gspo/training/clustered_trainer.py
# Rollout config
if "rollout" in config_dict:
    flat_config["env_name"] = config_dict["rollout"].get("env_name", "crafter")
    ...
    if "env_config" in config_dict["rollout"]:
        flat_config["env_config"] = config_dict["rollout"]["env_config"]
```

Implication: we can carry a broader "task app config" today via `rollout.env_config` without changing wire contracts. The task app already reads `env.config.step_rewards`.

## Proposed configuration shape (TOML)

Recommended: use `rollout.env_config.step_rewards` so the backend passes it through to the task app:

```toml
[rollout]
env_name = "crafter"
policy_name = "crafter-react"
max_turns = 10
ops = ["agent", "env"]

[rollout.env_config.step_rewards]
# Toggle dense per-decision rewards
enabled = true
# Supported: "off" | "decision_stepwise" | (future) "env_sparse"
mode = "decision_stepwise"
# Reward = indicator_lambda * I(unique_achievements_delta > 0)
indicator_lambda = 1.0
# Reserved for shaped/intermediate signals (currently unused)
step_beta = 0.0
```

Optional (policy sampling, still supported via `policy.config` in the task app runner):

```toml
[rollout.policy_config]
temperature = 0.2
top_p = 0.95
max_tokens = 512
```

Notes:
- The hosted Crafter rollout checks `policy.config.step_rewards` first, then falls back to `env.config.step_rewards`. Prefer `env_config` as the canonical place for app-level settings.
- To disable stepwise rewards entirely, set `enabled = false` or `mode = "off"`.
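
To make the precedence concrete, here is a minimal sketch (hypothetical helper name; the actual logic lives in the hosted rollout excerpted above) of how the `step_rewards` block is resolved, policy config first and env config as fallback:

```python
from typing import Any

def resolve_step_rewards_cfg(policy_cfg: dict[str, Any], env_cfg: dict[str, Any]) -> dict[str, Any]:
    """Illustrative only: read step_rewards from policy.config first, then env.config."""
    cfg: dict[str, Any] = {}
    if isinstance(policy_cfg, dict):
        cfg = dict(policy_cfg.get("step_rewards") or {})
    if not cfg and isinstance(env_cfg, dict):
        cfg = dict(env_cfg.get("step_rewards") or {})
    enabled = bool(cfg.get("enabled", False))
    mode = str(cfg.get("mode") or "off").lower()
    # "active" mirrors the gate used by the hosted rollout
    return {"active": enabled and mode == "decision_stepwise", **cfg}
```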
## Wire and data flow

1) CLI → Backend: the CLI includes the entire TOML in the job payload (`build_rl_payload`).
2) Backend → Trainer: the trainer flattens rollout properties and can include `env_config`.
3) Trainer → Task App: the rollout HTTP payload embeds `env.config` and `policy.config` (sketched below).
4) Task App: the hosted rollout computes decision-level metadata and returns:
   - `RolloutStep.info.meta.decision_rewards` with `{turn, ach_delta, unique_delta, ...}`
   - `trajectory.decision_samples` summarizing per-turn reward inputs
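
A hedged sketch of steps 1-3, showing how the `[rollout.env_config]` table in the TOML rides through into the rollout payload; the helper name is hypothetical, and the payload shape mirrors the evaluator excerpt above:

```python
import tomllib  # Python 3.11+

def rollout_payload_from_toml(path: str, run_id: str, seed: int) -> dict:
    """Illustrative only: build a /rollout payload from the [rollout] TOML table."""
    with open(path, "rb") as fh:
        cfg = tomllib.load(fh)
    rollout = cfg.get("rollout", {})
    return {
        "run_id": run_id,
        "env": {
            "env_name": rollout.get("env_name", "crafter"),
            # env_config (including step_rewards) is carried through unchanged
            "config": rollout.get("env_config", {}),
            "seed": seed,
        },
        "policy": {
            "policy_name": rollout.get("policy_name", "crafter-react"),
            "config": rollout.get("policy_config", {}),
        },
        "ops": rollout.get("ops", ["agent", "env"]),
        "record": {"trajectories": True, "logprobs": False, "value": False},
        "on_done": "terminate",
    }
```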
## Minimal code changes to adopt

- synth-ai (optional):
  - Add example configs under `examples/warming_up_to_rl/configs/*.toml` using `[rollout.env_config.step_rewards]`.
  - Document this block in the docs and the multi_step walkthrough.

- monorepo backend:
  - Verify the trainer always passes `rollout.env_config` (ClusteredTrainerConfig appears to support it; ensure it flows into the runtime's rollout request builder where the payload is assembled).
  - No contract changes needed: the task app already reads from `env.config`.

- Task app:
  - Already supports the block; no changes needed for the hosted Crafter rollout.
  - If you want a first-class `task_app_config` top level, add an alias resolver that copies `config["task_app_config"]` → `env.config` inside the rollout executor (sketched below).
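
A minimal sketch of such an alias resolver (hypothetical function; where exactly it hooks into the rollout executor is an open choice):

```python
from typing import Any

def apply_task_app_config_alias(env_config: dict[str, Any], raw_config: dict[str, Any]) -> dict[str, Any]:
    """Illustrative: copy a top-level task_app_config block into env.config without clobbering explicit keys."""
    alias = raw_config.get("task_app_config")
    if isinstance(alias, dict):
        merged = dict(alias)
        merged.update(env_config)  # explicit env.config values win over the alias
        return merged
    return env_config
```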
## Open questions / follow-ups

- Does the current trainer consume `decision_samples` or `step.info.meta.decision_rewards` for credit assignment? If not, wire these into the per-step reward/advantage pipeline.
- Decide whether to drop the default enabling of stepwise rewards in the `grpo_crafter.py` aliases (`DEFAULT_ALIAS_STEP_REWARDS`) so the TOML fully drives behaviour.
- Standardize on `env_config.step_rewards` for app-level settings across environments.

## Reference: Crafter RL LoRA example (expected first 10 rewards)

These are the first ten batch rewards printed at RL start:

```
- INFO - All batch rewards: [0.0625, 0.0625, 0.125, 0.0625, 0.0625, 0.3125, 0.375, 0.4375, 0.5, 0.9375]
```

---

## Enable stepwise during evals and compare vs final

We can enable stepwise shaping for evaluation-only runs and compare "stepwise" vs "final (outcome)" returns.

Two evaluation paths exist today:

- Backend evaluator endpoint (preferred for hosted):

```1114:1136:monorepo/backend/app/routes/clustered_training/core/routes.py
class RlEvaluateRequest(BaseModel):
    model: str
    seeds: list[int]
    rollouts_per_seed: int = 1
    env_name: str
    env_config: Dict[str, Any] = Field(default_factory=dict)
    policy_name: str
    thinking_mode: str
    thinking_budget: int | None = None
    max_steps_per_episode: int = 100
    max_concurrent_rollouts: int = 8
    on_done: str = "terminate"
    task_service_url: str | None = None
    vllm_url: str | None = None
    vllm_public_url: str | None = None
```

Pass `env_config.step_rewards` here to turn on stepwise shaping during evals (no trainer changes needed). The evaluator forwards `env_config` into each rollout:

```383:396:monorepo/backend/app/routes/clustered_training/core/algorithms/gspo/evaluation/evaluator.py
payload = {
    "env": {"env_name": env_name, "config": env_config, ...},
    "policy": {"policy_name": policy_name, "config": policy_config},
    ...
}
```
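
For illustration, an eval request with stepwise shaping enabled could look like the following (a sketch against the `RlEvaluateRequest` fields shown above; concrete values, including `thinking_mode`, are placeholders):

```python
# Illustrative request body for the backend eval endpoint (field names from RlEvaluateRequest).
rl_evaluate_request = {
    "model": "Qwen/Qwen3-32B",
    "seeds": [0, 1, 2, 3],
    "rollouts_per_seed": 2,
    "env_name": "crafter",
    "env_config": {
        "step_rewards": {
            "enabled": True,
            "mode": "decision_stepwise",
            "indicator_lambda": 1.0,
        }
    },
    "policy_name": "crafter-react",
    "thinking_mode": "none",  # placeholder; use whatever modes the backend accepts
    "max_steps_per_episode": 100,
    "max_concurrent_rollouts": 8,
    "on_done": "terminate",
}
```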
The task app already computes and attaches:
- Per-decision metadata at `step.info.meta.decision_rewards`
- Aggregates we can expose (see below) for stepwise vs final

Recommended enhancement (small change in the task app): include a summary under `response.metrics.details.stepwise` so eval clients don't need to parse per-step metadata:

```python
metrics.details["stepwise"] = {
    "indicator_sum": stepwise_indicator_sum,
    "reward_sum": stepwise_reward_sum,
    "new_achievements_total": stepwise_new_achievements_total,
}
```

For local SDK evals (without the backend), call the `/rollout` endpoint directly with the same `env.config.step_rewards` block.

Example payload fragment:
```json
{
  "env": {
    "env_name": "crafter",
    "config": {
      "step_rewards": { "enabled": true, "mode": "decision_stepwise", "indicator_lambda": 1.0 }
    },
    "seed": 0
  },
  "policy": { "policy_name": "crafter-react", "config": {"temperature": 0.2} },
  "ops": ["agent", "env"]
}
```

---

## Simple vs Complex stepwise modes (proposal)

Add a `strategy` key under the existing `step_rewards` block:

```toml
[rollout.env_config.step_rewards]
enabled = true
mode = "decision_stepwise"   # gate remains the same
strategy = "simple"          # "simple" | "complex"
indicator_lambda = 1.0

# Complex-only (optional)
weights = { collect_sapling = 0.1, craft_wood_pickaxe = 0.3, collect_diamond = 1.0 }
k_limits = { collect_sapling = 1, craft_wood_pickaxe = 2, collect_diamond = 3 }
```

Behaviour:
- `strategy = "simple"`: reward `1.0 × indicator_lambda` if any new achievement is unlocked at that decision, else 0. (The current logic already does this; this just makes it explicit.)
- `strategy = "complex"` (see the sketch after this list):
  - Maintain a per-episode `achieve_count[name]`.
  - For each achievement newly unlocked at the decision, if `achieve_count[name] < k_limits.get(name, 1)`, add `weights.get(name, 1.0)` to the stepwise reward and increment the count.
  - The uniqueness baseline should be the "turned true this decision" set; combining with episode-level uniqueness is optional if we intend to reward an achievement multiple times up to K.
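
A minimal sketch of the complex strategy, assuming it is implemented as a small helper alongside `compute_stepwise_reward` (the function name and signature here are assumptions):

```python
from typing import Dict, Iterable

def complex_stepwise_reward(
    newly_unlocked: Iterable[str],      # achievements that turned true at this decision
    achieve_counts: Dict[str, int],     # per-episode counts, mutated in place
    weights: Dict[str, float],
    k_limits: Dict[str, int],
) -> float:
    """Illustrative: weighted, K-capped per-achievement stepwise reward."""
    reward = 0.0
    for name in newly_unlocked:
        if achieve_counts.get(name, 0) < k_limits.get(name, 1):
            reward += weights.get(name, 1.0)
            achieve_counts[name] = achieve_counts.get(name, 0) + 1
    return reward
```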
Minimal code touch points:
- synth-ai task app (hosted Crafter rollout):
  - Extend `compute_stepwise_reward(prev_achievements, new_achievements, decision_index, actions_summary, indicator_lambda)` to optionally take `strategy`, `weights`, `k_limits`, and a `counts` dict.
  - Thread an `episode_ach_counts: Dict[str, int]` through the rollout loop (similar to `episode_seen_achievements`).
  - Build `reward_stepwise` per the chosen strategy; keep the existing `decision_rewards` metadata (ach/unique deltas) unchanged.
  - Add a `metrics.details["stepwise"]` summary (indicator_sum, reward_sum, new_achievements_total).

- monorepo backend (evals):
  - No contract change: pass the same `env_config.step_rewards` in `RlEvaluateRequest.env_config`.
  - For convenience, surface the stepwise summary in any eval aggregation/CSV when present under `metrics.details.stepwise`.

Open choice:
- Either keep `mode = "decision_stepwise"` and add `strategy`, or introduce `mode` values `{ "simple_stepwise", "complex_stepwise" }`. The former is backward compatible and clearer.

Testing plan:
- Unit-test `compute_stepwise_reward` for both strategies with synthetic prev/new achievement maps.
- Smoke-eval a few seeds with `strategy = "simple"` and `strategy = "complex"` to compare `metrics.details.stepwise.reward_sum` vs `metrics.mean_return`.
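
A hedged sketch of such tests; the simple case checks the indicator behaviour directly on synthetic achievement maps, and the complex case exercises the `complex_stepwise_reward` sketch above rather than the real helper:

```python
def test_simple_strategy_indicator():
    # Simple: indicator_lambda * I(any new achievement this decision)
    prev = {"collect_wood": True}
    new = {"collect_wood": True, "collect_sapling": True}
    newly = [k for k, v in new.items() if v and not prev.get(k)]
    assert newly == ["collect_sapling"]
    assert (1.0 if newly else 0.0) * 1.0 == 1.0

def test_complex_strategy_respects_k_limits():
    counts: dict[str, int] = {}
    weights = {"collect_sapling": 0.1}
    k_limits = {"collect_sapling": 1}
    first = complex_stepwise_reward(["collect_sapling"], counts, weights, k_limits)
    second = complex_stepwise_reward(["collect_sapling"], counts, weights, k_limits)
    assert first == 0.1 and second == 0.0  # rewarded once, then capped by K
```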
---

## Eval script scope: Groq Qwen/Qwen3-32B stepwise vs outcome

Objective: run many Crafter rollouts against Groq `Qwen/Qwen3-32B` and compare the distributions and correlations of stepwise rewards vs final (outcome) rewards, for both the simple and complex stepwise strategies.

Inputs/flags:
- `--task-url` Task app base URL (Modal deployment)
- `--env-key` ENVIRONMENT_API_KEY (or from `.env`)
- `--model` default `Qwen/Qwen3-32B`
- `--seeds` list, or `--num-seeds` N (use 0..N-1)
- `--rollouts-per-seed` default 3
- `--max-turns` default 10
- `--strategy` `simple|complex|both` (default both)
- `--weights-json` optional JSON path for complex weighting
- `--k-limits-json` optional JSON path for complex K-limits
- `--out` output directory for CSV/plots

What it does:
1) Builds rollout payloads for each seed and strategy variant.
2) For each rollout, passes `env.config.step_rewards` with:
   - common: `{ enabled: true, mode: "decision_stepwise" }`
   - simple: `strategy: "simple", indicator_lambda: 1.0`
   - complex: `strategy: "complex", weights, k_limits`
3) Uses the policy config to route inference to Groq with the requested model.
4) Collects a per-rollout summary:
   - `final_return = response.metrics.mean_return`
   - `step_indicator_sum`, `step_reward_sum`, `new_achievements_total` from `metrics.details.stepwise` (or computed from steps if absent)
   - counts such as `num_steps`, `tool_calls_total`
5) Writes a wide CSV with one row per rollout, including seed, strategy, and the above fields.
6) Visualizes:
   - Histogram of `step_reward_sum` by strategy
   - Scatter: `step_reward_sum` vs `final_return`, per strategy (with Pearson/Spearman r)
   - Optional ECDFs for `indicator_sum`

Data schema (CSV):
```
seed,int | rollout_idx,int | strategy,str | final_return,float | step_reward_sum,float |
step_indicator_sum,float | new_achievements_total,int | num_steps,int | tool_calls_total,int |
model,str | max_turns,int | timestamp,iso
```

Pseudocode (Python):
```python
import os, json, csv, time, math, statistics
import httpx

TASK_URL = os.environ.get("TASK_APP_URL")
ENV_KEY = os.environ.get("ENVIRONMENT_API_KEY")

def build_step_cfg(strategy, weights=None, k_limits=None):
    # Common gate plus per-strategy extras
    cfg = {"enabled": True, "mode": "decision_stepwise", "strategy": strategy, "indicator_lambda": 1.0}
    if strategy == "complex":
        if weights:
            cfg["weights"] = weights
        if k_limits:
            cfg["k_limits"] = k_limits
    return cfg

async def run_rollout(seed, strategy, model, max_turns, weights, k_limits):
    step_cfg = build_step_cfg(strategy, weights, k_limits)
    payload = {
        "run_id": f"eval-{seed}-{strategy}-{int(time.time())}",
        "env": {"env_name": "crafter", "seed": seed, "config": {"step_rewards": step_cfg, "env_params": {"max_steps_per_episode": max_turns}}},
        "policy": {"policy_name": "crafter-react", "config": {"inference_url": "https://groq.synth-ai.internal/proxy", "model": model, "temperature": 0.2, "top_p": 0.95, "max_tokens": 512}},
        "ops": ["agent", "env"] * max_turns,
        "record": {"trajectories": True},
        "on_done": "terminate",
    }
    async with httpx.AsyncClient(timeout=300.0) as client:
        r = await client.post(f"{TASK_URL}/rollout", headers={"X-API-Key": ENV_KEY}, json=payload)
        r.raise_for_status()
        resp = r.json()
    # Summarize stepwise vs final metrics for this rollout
    met = resp.get("metrics", {})
    details = met.get("details", {})
    step = details.get("stepwise", {})
    final_return = float(met.get("mean_return") or 0.0)
    step_reward_sum = float(step.get("reward_sum") or 0.0)
    step_indicator_sum = float(step.get("indicator_sum") or 0.0)
    new_ach_total = int(step.get("new_achievements_total") or 0)
    num_steps = int(met.get("num_steps") or 0)
    tool_calls_total = sum(len(s.get("tool_calls", [])) for s in (resp.get("trajectories", [{}])[0].get("steps", []))) if resp.get("trajectories") else 0
    return {
        "seed": seed, "strategy": strategy, "final_return": final_return,
        "step_reward_sum": step_reward_sum, "step_indicator_sum": step_indicator_sum,
        "new_achievements_total": new_ach_total, "num_steps": num_steps,
        "tool_calls_total": tool_calls_total,
    }
```

CLI example:
```bash
uv run python tools/eval_stepwise_vs_final.py \
  --task-url $TASK_APP_URL \
  --env-key $ENVIRONMENT_API_KEY \
  --model "Qwen/Qwen3-32B" \
  --num-seeds 100 --rollouts-per-seed 3 --max-turns 10 \
  --strategy both --out results/qwen32b
```

Notes:
- The correlations/plots can be produced with `matplotlib` or `plotly`; write PNG + HTML.
- If `metrics.details.stepwise` is not yet populated by the task app, compute `indicator_sum` and `reward_sum` on the client by scanning `steps[].info.meta.decision_rewards` (see the sketch below).
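
A minimal sketch of that client-side fallback, assuming the per-step metadata shape documented earlier (`decision_rewards` with `unique_delta`):

```python
def stepwise_sums_from_steps(steps: list[dict], indicator_lambda: float = 1.0) -> dict:
    """Illustrative: recompute stepwise aggregates from steps[].info.meta.decision_rewards."""
    indicator_sum = 0.0
    reward_sum = 0.0
    new_achievements_total = 0
    for step in steps:
        meta = (step.get("info") or {}).get("meta") or {}
        dr = meta.get("decision_rewards") or {}
        unique_delta = int(dr.get("unique_delta") or 0)
        indicator = 1.0 if unique_delta > 0 else 0.0  # matches the simple indicator rule
        indicator_sum += indicator
        reward_sum += indicator * indicator_lambda
        new_achievements_total += unique_delta
    return {
        "indicator_sum": indicator_sum,
        "reward_sum": reward_sum,
        "new_achievements_total": new_achievements_total,
    }
```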
### Output artifacts (JSON + Markdown)

Directory layout under `--out` (example: `results/qwen32b`):

- `runs/` — one JSONL file per strategy, with one record per rollout
  - `runs/simple.jsonl`
  - `runs/complex.jsonl`
- `summary/`
  - `summary.json` — aggregates per strategy (mean/median/std, correlations, counts)
  - `stats_by_seed.json` — per-seed aggregates
  - `config_snapshot.json` — CLI args, weights, k-limits, timestamp, git SHA
- `plots/`
  - `hist_step_reward_simple.png`, `hist_step_reward_complex.png`
  - `scatter_step_reward_vs_final_simple.png`, `scatter_step_reward_vs_final_complex.png`
  - `ecdf_indicator_simple.png`, `ecdf_indicator_complex.png`
- `report.md` — human-friendly Markdown summary linking to the plots

Record schema (per line in `runs/*.jsonl`):

```json
{
  "seed": 0,
  "rollout_idx": 1,
  "strategy": "simple",            // or "complex"
  "final_return": 0.9375,
  "step_reward_sum": 1.2,
  "step_indicator_sum": 3.0,
  "new_achievements_total": 3,
  "num_steps": 10,
  "tool_calls_total": 12,
  "model": "Qwen/Qwen3-32B",
  "max_turns": 10,
  "timestamp": "2025-10-17T22:14:05Z",
  "meta": {
    "task_url": "...",
    "weights": {"collect_diamond": 1.0},   // only for complex
    "k_limits": {"collect_diamond": 3}     // only for complex
  }
}
```

Markdown report (`report.md`) outline:

- Title, timestamp, model, config summary
- Table of global aggregates per strategy (mean, median, std of `final_return` and `step_reward_sum`; Pearson r and Spearman ρ correlations)
- Small per-seed tables (top-N seeds by return)
- Inline images linking to `plots/*.png`
- Pointers to the raw data files and `config_snapshot.json`

---

I'll scan both repos for Crafter RL, policy/task app config, rollout calls, and backend RL endpoints, then draft notes under `examples/multi_step/`.

[5 tools called]

I'll open the Crafter task app implementation to inspect rubrics and where stepwise reward configuration is applied, then check the RL evaluator to see how env/policy config is passed to the task app.

[11 tools called]

I scanned both repos to map how stepwise rewards already work in the Crafter task app and how rollout configs flow from the RL TOML into the task app. I'll add a detailed notes document under `examples/multi_step/` with findings, code references, and a concrete config proposal.

[6 tools called]

- I researched Crafter RL and documented how to toggle dense stepwise rewards and flow the config end-to-end.
- Notes added at `synth-ai/examples/multi_step/task_app_config_notes.md` with exact code references and a TOML example.

Key points:
- Use `rollout.env_config.step_rewards` in the RL TOML; the task app reads `policy.config.step_rewards` first, then `env.config.step_rewards`.
- Enable with:
```toml
[rollout.env_config.step_rewards]
enabled = true
mode = "decision_stepwise"
indicator_lambda = 1.0
step_beta = 0.0
```
- The backend already forwards `env_config` and `policy_config` in rollout requests; no wire change needed. Ensure the trainer populates `env_config` when present.
- The task app attaches per-turn `decision_rewards` and returns `decision_samples`; consider consuming these in credit assignment.

References:
- Task contracts and rollout schema:
```51:87:synth-ai/synth_ai/task/contracts.py
class RolloutEnvSpec(BaseModel):
    ...
    config: dict[str, Any] = Field(default_factory=dict)
    ...
class RolloutPolicySpec(BaseModel):
    ...
    config: dict[str, Any] = Field(default_factory=dict)
```
- Crafter stepwise config reading and reward metadata:
```1041:1067:synth-ai/examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py
# Stepwise reward configuration ...
```
```1554:1596:synth-ai/examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py
# Compute decision-level rewards ...
```
- Backend rollout payload shape:
```456:470:monorepo/backend/app/routes/clustered_training/core/algorithms/gspo/evaluation/evaluator.py
payload = {
    "run_id": run_id,
    "env": {"env_name": env_name, "config": env_config, "seed": seed},
    "policy": {"policy_name": policy_name, "config": policy_config},
    ...
}
```

Status: Notes committed on branch `friday-cleanup` and pushed.
### Operational guardrails

- Treat avg_turns == 0 (or a high fraction of episodes with turns == 0) as a failure condition; exit non-zero.
- Fail fast when the first policy step returns a 4xx/5xx from the inference target; include the HTTP status and URL in the error message.
- CI hint: a tiny smoke run (2 seeds × 1 rollout) should see turns > 0 in healthy setups.
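
A minimal sketch of the first guardrail (illustrative; how per-episode turn counts are collected depends on the eval script):

```python
import sys

def check_turn_guardrail(per_episode_turns: list[int], max_zero_frac: float = 0.2) -> None:
    """Exit non-zero when rollouts are silently producing zero-turn episodes."""
    if not per_episode_turns:
        print("No episodes recorded", file=sys.stderr)
        sys.exit(2)
    zero_frac = sum(1 for t in per_episode_turns if t == 0) / len(per_episode_turns)
    avg_turns = sum(per_episode_turns) / len(per_episode_turns)
    if avg_turns == 0 or zero_frac > max_zero_frac:
        print(f"Guardrail tripped: avg_turns={avg_turns:.2f}, zero_frac={zero_frac:.2f}", file=sys.stderr)
        sys.exit(1)
```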
examples/warming_up_to_rl/configs/eval_stepwise_complex.toml
@@ -0,0 +1,35 @@

# Crafter eval (stepwise: complex)

# Replace at runtime by exporting TASK_APP_URL, or put the full https://<modal>.run URL here
# task_app_url = "https://YOUR-TASK-APP.modal.run"

model = "qwen/qwen3-32b"
# Route inference to the local task app Groq proxy
inference_url = "http://localhost:8001/proxy/groq"
num_episodes = 10
max_turns = 10
concurrency = 10
# difficulty = "easy"  # optional

[rollout]
env_name = "crafter"
policy_name = "crafter-react"
max_turns = 10

[rollout.env_config]

[rollout.env_config.step_rewards]
enabled = true
mode = "decision_stepwise"
strategy = "complex"
indicator_lambda = 0.0  # ignored by complex mode if weights are present

# Example weights (tune per need)
weights = { collect_sapling = 0.1, craft_wood_pickaxe = 0.3, collect_diamond = 1.0 }
# Allow rewarding an achievement up to K times per episode
k_limits = { collect_sapling = 1, craft_wood_pickaxe = 2, collect_diamond = 3 }

[rollout.policy_config]
temperature = 0.2
top_p = 0.95
max_tokens = 512
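
As a quick sanity check on the `weights`/`k_limits` tables above, a small illustrative calculation of the maximum stepwise reward a single episode could accumulate under this config (assuming only the listed achievements carry weights):

```python
weights = {"collect_sapling": 0.1, "craft_wood_pickaxe": 0.3, "collect_diamond": 1.0}
k_limits = {"collect_sapling": 1, "craft_wood_pickaxe": 2, "collect_diamond": 3}

# Each achievement can be rewarded at most K times, each time adding its weight.
max_stepwise = sum(w * k_limits.get(name, 1) for name, w in weights.items())
print(max_stepwise)  # 0.1*1 + 0.3*2 + 1.0*3 = 3.7
```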
examples/warming_up_to_rl/configs/eval_stepwise_consistent.toml
@@ -0,0 +1,26 @@

# Crafter evaluation with consistent stepwise shaping.
task_app_url = "https://synth-laboratories--grpo-crafter-task-app-final-warming--ceb5b2.modal.run"
model = "qwen/qwen3-32b"
inference_url = "https://api.groq.com/openai"
num_episodes = 20
max_turns = 10
concurrency = 10

[rollout]
env_name = "crafter"
policy_name = "crafter-react"
max_turns = 10

[rollout.env_config]
difficulty = "easy"

[rollout.env_config.step_rewards]
enabled = true
mode = "decision_stepwise"
strategy = "consistent"
indicator_lambda = 1.0

[rollout.policy_config]
temperature = 0.2
top_p = 0.95
max_tokens = 512
examples/warming_up_to_rl/configs/eval_stepwise_per_achievement.toml
@@ -0,0 +1,36 @@

# Crafter evaluation with achievement-weighted stepwise shaping.
task_app_url = "https://synth-laboratories--grpo-crafter-task-app-final-warming--ceb5b2.modal.run"
model = "qwen/qwen3-32b"
inference_url = "https://api.groq.com/openai"
num_episodes = 20
max_turns = 10
concurrency = 10

[rollout]
env_name = "crafter"
policy_name = "crafter-react"
max_turns = 10

[rollout.env_config]
difficulty = "easy"

[rollout.env_config.step_rewards]
enabled = true
mode = "decision_stepwise"
strategy = "per_achievement"
indicator_lambda = 1.0

[rollout.env_config.step_rewards.weights]
collect_sapling = 0.1
craft_wood_pickaxe = 0.3
collect_diamond = 1.0

[rollout.env_config.step_rewards.k_limits]
collect_sapling = 1
craft_wood_pickaxe = 2
collect_diamond = 3

[rollout.policy_config]
temperature = 0.2
top_p = 0.95
max_tokens = 512
examples/warming_up_to_rl/configs/eval_stepwise_simple.toml
@@ -0,0 +1,32 @@

# Crafter eval (stepwise: simple)

# Replace at runtime by exporting TASK_APP_URL, or put the full https://<modal>.run URL here
# task_app_url = "https://YOUR-TASK-APP.modal.run"

model = "qwen/qwen3-32b"
# Route inference to the local task app Groq proxy
inference_url = "http://localhost:8001/proxy/groq"
num_episodes = 10
max_turns = 10
concurrency = 10
# difficulty = "easy"  # optional

[rollout]
env_name = "crafter"
policy_name = "crafter-react"
max_turns = 10

[rollout.env_config]
# Additional env controls can go here

[rollout.env_config.step_rewards]
enabled = true
mode = "decision_stepwise"
strategy = "simple"   # 1.0 reward if any new achievement this decision
indicator_lambda = 1.0

[rollout.policy_config]
# Inference will be routed automatically when using --use-rollout
temperature = 0.2
top_p = 0.95
max_tokens = 512