synth-ai 0.2.9.dev3__py3-none-any.whl → 0.2.9.dev5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (107)
  1. examples/analyze_semantic_words.sh +17 -0
  2. examples/common_old/backend.py +21 -0
  3. examples/crafter_debug_render.py +180 -0
  4. examples/evals_old/README.md +98 -0
  5. examples/evals_old/__init__.py +6 -0
  6. examples/evals_old/compare_models.py +1037 -0
  7. examples/evals_old/example_log.md +145 -0
  8. examples/evals_old/run_demo.sh +126 -0
  9. examples/evals_old/trace_analysis.py +270 -0
  10. examples/finetuning_old/_backup_synth_qwen/config.toml +29 -0
  11. examples/finetuning_old/_backup_synth_qwen/example_log.md +324 -0
  12. examples/finetuning_old/_backup_synth_qwen/filter_traces.py +60 -0
  13. examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +239 -0
  14. examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +109 -0
  15. examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +1924 -0
  16. examples/finetuning_old/_backup_synth_qwen/readme.md +49 -0
  17. examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +114 -0
  18. examples/finetuning_old/_backup_synth_qwen/run_demo.sh +195 -0
  19. examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +118 -0
  20. examples/finetuning_old/synth_qwen_v1/README.md +68 -0
  21. examples/finetuning_old/synth_qwen_v1/filter_traces.py +60 -0
  22. examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +239 -0
  23. examples/finetuning_old/synth_qwen_v1/finetune.py +46 -0
  24. examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +71 -0
  25. examples/finetuning_old/synth_qwen_v1/infer.py +37 -0
  26. examples/finetuning_old/synth_qwen_v1/poll.py +44 -0
  27. examples/finetuning_old/synth_qwen_v1/prepare_data.py +35 -0
  28. examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +109 -0
  29. examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +1932 -0
  30. examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +207 -0
  31. examples/finetuning_old/synth_qwen_v1/run_ft_job.py +232 -0
  32. examples/finetuning_old/synth_qwen_v1/upload_data.py +34 -0
  33. examples/finetuning_old/synth_qwen_v1/util.py +147 -0
  34. examples/rl/README.md +169 -0
  35. examples/rl/configs/eval_base_qwen.toml +15 -0
  36. examples/rl/configs/eval_rl_qwen.toml +11 -0
  37. examples/rl/configs/rl_from_base_qwen.toml +35 -0
  38. examples/rl/configs/rl_from_base_qwen17.toml +74 -0
  39. examples/rl/configs/rl_from_ft_qwen.toml +35 -0
  40. examples/rl/download_dataset.py +64 -0
  41. examples/rl/run_eval.py +435 -0
  42. examples/rl/run_rl_and_save.py +94 -0
  43. examples/rl/task_app/README.md +22 -0
  44. {synth_ai/task/apps → examples/rl/task_app}/math_single_step.py +8 -8
  45. examples/rl/task_app/math_task_app.py +107 -0
  46. examples/rl_old/task_app.py +962 -0
  47. examples/run_crafter_demo.sh +10 -0
  48. examples/warming_up_to_rl/analyze_trace_db.py +420 -0
  49. examples/warming_up_to_rl/configs/crafter_fft.toml +48 -0
  50. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +54 -0
  51. examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +20 -0
  52. examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +13 -0
  53. examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +23 -0
  54. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +73 -0
  55. examples/warming_up_to_rl/configs/rl_from_ft.toml +56 -0
  56. examples/warming_up_to_rl/export_trace_sft.py +541 -0
  57. examples/warming_up_to_rl/groq_test.py +88 -0
  58. examples/warming_up_to_rl/manage_secrets.py +127 -0
  59. examples/warming_up_to_rl/old/event_rewards.md +234 -0
  60. examples/warming_up_to_rl/old/notes.md +73 -0
  61. examples/warming_up_to_rl/readme.md +172 -0
  62. examples/warming_up_to_rl/run_eval.py +434 -0
  63. examples/warming_up_to_rl/run_fft_and_save.py +309 -0
  64. examples/warming_up_to_rl/run_local_rollout.py +188 -0
  65. examples/warming_up_to_rl/run_local_rollout_modal.py +160 -0
  66. examples/warming_up_to_rl/run_local_rollout_parallel.py +342 -0
  67. examples/warming_up_to_rl/run_local_rollout_traced.py +372 -0
  68. examples/warming_up_to_rl/run_rl_and_save.py +101 -0
  69. examples/warming_up_to_rl/run_rollout_remote.py +129 -0
  70. examples/warming_up_to_rl/task_app/README.md +38 -0
  71. {synth_ai/task/apps → examples/warming_up_to_rl/task_app}/grpo_crafter.py +7 -7
  72. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +165 -0
  73. examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
  74. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
  75. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +145 -0
  76. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1271 -0
  77. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
  78. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
  79. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
  80. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +429 -0
  81. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +442 -0
  82. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +96 -0
  83. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +302 -0
  84. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
  85. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +202 -0
  86. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
  87. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +512 -0
  88. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +102 -0
  89. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +985 -0
  90. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +197 -0
  91. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1749 -0
  92. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
  93. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +217 -0
  94. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +160 -0
  95. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +146 -0
  96. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +61 -0
  97. synth_ai/api/train/config_finder.py +18 -18
  98. synth_ai/api/train/env_resolver.py +28 -1
  99. synth_ai/cli/task_apps.py +291 -56
  100. synth_ai/task/apps/__init__.py +54 -13
  101. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/METADATA +1 -1
  102. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/RECORD +106 -13
  103. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/top_level.txt +1 -0
  104. synth_ai/environments/examples/sokoban/units/astar_common.py +0 -95
  105. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/WHEEL +0 -0
  106. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/entry_points.txt +0 -0
  107. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/licenses/LICENSE +0 -0
examples/warming_up_to_rl/manage_secrets.py
@@ -0,0 +1,127 @@
+ #!/usr/bin/env python3
+ from __future__ import annotations
+
+ import argparse
+ import os
+ import shlex
+ import subprocess
+ import sys
+ import tempfile
+ from pathlib import Path
+ from typing import Dict, Tuple
+
+
+ def load_env_file(path: Path) -> Dict[str, str]:
+     env: Dict[str, str] = {}
+     if not path.exists():
+         raise FileNotFoundError(f".env not found at {path}")
+     for line in path.read_text(encoding="utf-8").splitlines():
+         line = line.strip()
+         if not line or line.startswith("#") or "=" not in line:
+             continue
+         k, v = line.split("=", 1)
+         env[k.strip()] = v.strip().strip("'").strip('"')
+     return env
+
+
+ def write_temp_env(kv: Dict[str, str]) -> Path:
+     fd, p = tempfile.mkstemp(prefix="modal_secret_", suffix=".env")
+     path = Path(p)
+     with os.fdopen(fd, "w", encoding="utf-8") as fh:
+         for k, v in kv.items():
+             fh.write(f"{k}={v}\n")
+     return path
+
+
+ def run(cmd: str) -> Tuple[int, str]:
+     proc = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+     return proc.returncode, proc.stdout
+
+
+ def ensure_secret(secret_name: str, kv: Dict[str, str]) -> None:
+     if not kv:
+         print(f"[skip] {secret_name}: no values provided")
+         return
+     # Prefer passing KEY=VALUE pairs to avoid Typer --env-file bug under some shells
+     kv_args = " ".join([f"{shlex.quote(k)}={shlex.quote(v)}" for k, v in kv.items()])
+     # Try plain modal first; fallback to uv run modal
+     def _create() -> Tuple[int, str]:
+         return run(f"modal secret create {shlex.quote(secret_name)} {kv_args}")
+     def _delete() -> Tuple[int, str]:
+         return run(f"printf 'y\n' | modal secret delete {shlex.quote(secret_name)}")
+     rc, out = _create()
+     if rc != 0:
+         # Fallback: use uv run modal
+         rc_uv, out_uv = run(f"uv run modal secret create {shlex.quote(secret_name)} {kv_args}")
+         if rc_uv == 0:
+             print(f"[ok] secret ready: {secret_name}")
+             return
+         # Try delete+create with both variants
+         print(f"[info] create failed for {secret_name}, attempting delete+create…")
+         _ = _delete()
+         rc2, out2 = _create()
+         if rc2 != 0:
+             _ = run(f"printf 'y\n' | uv run modal secret delete {shlex.quote(secret_name)}")
+             rc3, out3 = run(f"uv run modal secret create {shlex.quote(secret_name)} {kv_args}")
+             if rc3 != 0:
+                 print(out3 or out2 or out_uv or out)
+                 raise RuntimeError(f"failed to create secret {secret_name}")
+     print(f"[ok] secret ready: {secret_name}")
+
+
+ def main() -> None:
+     ap = argparse.ArgumentParser(description="Sync .env keys into Modal secret bundles for the task app")
+     ap.add_argument("--env-path", default=str(Path(__file__).parent / ".env"), help="Path to .env with keys")
+     args = ap.parse_args()
+
+     env = load_env_file(Path(args.env_path))
+
+     # Secrets used by the task app
+     env_secret = {
+         k: v
+         for k, v in {
+             "ENVIRONMENT_API_KEY": env.get("ENVIRONMENT_API_KEY", ""),
+             "dev_environment_api_key": env.get("ENVIRONMENT_API_KEY", ""),
+         }.items()
+         if v
+     }
+
+     groq_secret = {
+         k: v
+         for k, v in {
+             "GROQ_API_KEY": env.get("GROQ_API_KEY", ""),
+             "dev_groq_api_key": env.get("GROQ_API_KEY", ""),
+         }.items()
+         if v
+     }
+
+     openai_secret = {
+         k: v
+         for k, v in {
+             "OPENAI_API_KEY": env.get("OPENAI_API_KEY", ""),
+             "dev_openai_api_key": env.get("OPENAI_API_KEY", ""),
+         }.items()
+         if v
+     }
+
+     # Optional: backend key (not mounted by task app today, but useful to keep consistent)
+     synth_secret = {"SYNTH_API_KEY": env.get("SYNTH_API_KEY", "")} if env.get("SYNTH_API_KEY") else {}
+
+     ensure_secret("crafter-environment-sdk", env_secret)
+     ensure_secret("groq-api-key", groq_secret)
+     ensure_secret("openai-api-key", openai_secret)
+     if synth_secret:
+         ensure_secret("synth-api-key", synth_secret)
+
+     print("All requested secrets ensured. Redeploy the app if you updated any secrets:")
+     print(" uv run modal deploy examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py")
+
+
+ if __name__ == "__main__":
+     try:
+         main()
+     except Exception as e:
+         print(f"[error] {type(e).__name__}: {e}")
+         sys.exit(1)
+
+
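Assuming the script above lands at `examples/warming_up_to_rl/manage_secrets.py` (the path suggested by the file list), a typical invocation would look like the following; the `--env-path` flag is the one defined in `main()` above.

```bash
# Sync keys from the example's .env into Modal secret bundles
uv run python examples/warming_up_to_rl/manage_secrets.py \
  --env-path examples/warming_up_to_rl/.env
```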
examples/warming_up_to_rl/old/event_rewards.md
@@ -0,0 +1,234 @@
+ # Crafter Event-Level Rewards (NOTES)
+
+ This note outlines how to support event-level reward layering for Crafter across the warming_up_to_rl task app and the monorepo clustered_training RL pipeline.
+
+ ## Goals
+ - Attribute reward at decision/step level (per tool call) instead of only using a single trajectory outcome reward.
+ - Make this behavior controllable via TOML config flags (enable/disable and choose the source/kind of event reward).
+ - Keep compatibility with existing trajectory-outcome paths; when disabled, the system behaves exactly as before.
+
+ ## Definitions
+ - "Decision": one LM tool call (e.g., `interact_many`) and the sequence of environment steps it triggers.
+ - "Absolute achievement delta" (AchΔ): count of achievements that became true during a decision.
+ - "Unique achievement delta" (UniqueΔ): count of achievements first unlocked in the episode by a decision.
+ - "Env sparse reward": the environment’s own per-step reward (e.g., `reward_last_step`).
+
+ ## What to compute per decision
+ - From observation before and after the decision:
+   - `turned_true = achievements_after − achievements_before`
+   - `new_unique = episode_achievements_after − episode_achievements_before`
+ - Scalars:
+   - `ach_delta = len(turned_true)`
+   - `unique_delta = len(new_unique)`
+ - Optional: per-achievement markers for each `a ∈ new_unique` (reward 1.0) for fine-grained shaping.
+
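A minimal sketch of the delta computation above, assuming achievements are exposed as a `Dict[str, bool]` map (as in Crafter's `achievements_status`); the helper name and exact observation layout are illustrative, not the task app's actual API:

```python
from typing import Dict, Set, Tuple


def decision_deltas(
    ach_before: Dict[str, bool],
    ach_after: Dict[str, bool],
    episode_unlocked: Set[str],
) -> Tuple[int, int, Set[str], Set[str]]:
    """Compute per-decision achievement deltas.

    ach_before/ach_after: achievement -> bool maps observed around one tool call.
    episode_unlocked: achievements already unlocked earlier in the episode (mutated here).
    """
    turned_true = {a for a, done in ach_after.items() if done and not ach_before.get(a, False)}
    new_unique = turned_true - episode_unlocked
    episode_unlocked |= new_unique
    return len(turned_true), len(new_unique), turned_true, new_unique


# Example: wood was already collected earlier; this decision newly collects a sapling.
unlocked: Set[str] = {"collect_wood"}
ach_delta, unique_delta, all_flips, unique_flips = decision_deltas(
    {"collect_wood": True, "collect_sapling": False},
    {"collect_wood": True, "collect_sapling": True},
    unlocked,
)
print(ach_delta, unique_delta, sorted(all_flips), sorted(unique_flips))
# -> 1 1 ['collect_sapling'] ['collect_sapling']
```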
+ ## Switches/Flags in TOML
+ Prefer reusing existing RL trainer flags in clustered_training (already present in code):
+
+ ```
+ [training]
+ # Stepwise/event rewards
+ step_rewards_enabled = true              # master switch
+ step_rewards_mode = "decision_stepwise"  # "off" | "decision_stepwise" | "env_sparse"
+ step_rewards_beta = 0.0                  # optional coefficient for time weighting
+ step_rewards_indicator_lambda = 0.0      # optional coefficient for indicator-based flips
+
+ # Crafter-specific selection (proposed extension, optional)
+ # event_rewards_kind = "unique"          # "unique" | "absolute" (if omitted, default to "unique")
+ ```
+
+ - `step_rewards_enabled`: enables all event-level aggregation.
+ - `step_rewards_mode`:
+   - `off`: use only trajectory outcome reward (status quo).
+   - `decision_stepwise`: use per-decision computed deltas (from policy app or collector), aggregate as returns.
+   - `env_sparse`: use the environment’s `reward_last_step` per step.
+ - `event_rewards_kind` (optional): if present, selects `unique_delta` (default) vs `ach_delta` for `decision_stepwise`.
+
+ Warmup task TOML may place these under a `training` or `rollout` section; the launcher just forwards the full TOML blob to the backend, so the monorepo side should read the same keys.
+
+ ## Warming_up_to_rl task app – producing decision rewards
+ - In the Crafter policy (or rollout coordinator), for each decision:
+   - Compute `ach_delta` and `unique_delta` as above.
+   - Attach a compact record to the step metadata, e.g.:
+ ```json
+ {
+   "decision_rewards": {
+     "turn": 5,
+     "ach_delta": 1,
+     "unique_delta": 1,
+     "all": ["collect_wood"],
+     "unique": ["collect_wood"]
+   }
+ }
+ ```
+   - When `step_rewards_enabled=false`, omit this block.
+   - When `step_rewards_mode="env_sparse"`, rely on `reward_last_step` (no decision block required).
+
+ Notes:
+ - The app already records previous tool calls and environment results; this simply adds a small, structured payload per decision (turn).
+ - If per-step `reward_last_step` is unavailable, `decision_stepwise` remains effective as long as achievements maps are present.
+
+ ## Monorepo clustered_training – consuming event rewards
+ Integration points (based on existing config structure):
+ - `ClusteredTrainerConfig` already includes:
+   - `step_rewards_enabled: bool`
+   - `step_rewards_mode: str` (off | decision_stepwise)
+   - `step_rewards_beta: float`
+   - `step_rewards_indicator_lambda: float`
+
+ Collector changes (conceptual):
+ 1. During trajectory collection, build a vector `r_t` of per-time-step rewards:
+    - If `step_rewards_mode == "decision_stepwise"`:
+      - For time step `t` corresponding to a decision, set:
+        - `r_t = unique_delta` if `event_rewards_kind=="unique"` (default), else `r_t = ach_delta`.
+      - For non-decision steps, `r_t = 0.0` (unless you prefer to spread rewards over sub-steps; keep simple attribution by default).
+    - If `step_rewards_mode == "env_sparse"`:
+      - For each environment step, set `r_t = reward_last_step`.
+    - Else (`off`):
+      - Use a single scalar outcome reward at the end (status quo).
+
+ 2. Compute returns/advantages as usual, summing event rewards:
+    - For GRPO/GRPO-Ludic, the typical group-based advantage calculation remains unchanged; only the reward signal changes from a single scalar to a sequence `[r_1, …, r_T]`.
+    - Optional time weighting: `r_t ← r_t + beta * (T − t) * indicator_flip_t`, where `indicator_flip_t` is 1 if any unique achievement flipped at `t`, else 0. Use `step_rewards_indicator_lambda` as a coefficient if needed.
+
+ Pseudo-code (collector side):
+ ```python
+ r = [0.0] * T
+ if cfg.step_rewards_enabled and cfg.step_rewards_mode == "decision_stepwise":
+     for ev in decision_events:  # each with fields {turn, ach_delta, unique_delta}
+         idx = ev["turn"] - 1  # 0-based
+         base = ev["unique_delta"] if event_kind == "unique" else ev["ach_delta"]
+         r[idx] += float(base)
+         if cfg.step_rewards_indicator_lambda > 0 and ev["unique_delta"] > 0:
+             r[idx] += float(cfg.step_rewards_indicator_lambda)
+ elif cfg.step_rewards_enabled and cfg.step_rewards_mode == "env_sparse":
+     for t, step in enumerate(env_steps):
+         r[t] += float(step.get("reward_last_step", 0.0))
+ else:
+     # disabled or mode == "off": single outcome reward at the end (status quo)
+     r[-1] += float(trajectory_outcome_reward)
+ ```
+
+ ## Respecting the TOML switch
+ - warming_up_to_rl launcher (`run_rl_and_save.py`) forwards the entire TOML to the backend.
+ - clustered_training should read `[training].step_rewards_enabled` and `[training].step_rewards_mode` (and optionally `event_rewards_kind`) inside its config loader (already present fields in `ClusteredTrainerConfig`).
+ - When disabled, the collector must not attempt to parse or rely on any per-decision metadata.
+
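For reference, a minimal sketch of reading these keys out of a forwarded TOML blob with Python 3.11's `tomllib`; the function and its defaults are illustrative, not the actual `ClusteredTrainerConfig` loader:

```python
import tomllib
from typing import Any, Dict


def read_step_reward_flags(toml_text: str) -> Dict[str, Any]:
    """Extract the [training] step-reward switches, applying the defaults described above."""
    training = tomllib.loads(toml_text).get("training", {})
    return {
        "step_rewards_enabled": bool(training.get("step_rewards_enabled", False)),
        "step_rewards_mode": training.get("step_rewards_mode", "off"),
        "event_rewards_kind": training.get("event_rewards_kind", "unique"),
        "step_rewards_beta": float(training.get("step_rewards_beta", 0.0)),
        "step_rewards_indicator_lambda": float(training.get("step_rewards_indicator_lambda", 0.0)),
    }


print(read_step_reward_flags('[training]\nstep_rewards_enabled = true\nstep_rewards_mode = "decision_stepwise"\n'))
```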
+ ## Debugging & metrics
+ - Log per-trajectory aggregates: `ΣAchΔ`, `ΣUniqueΔ`, and a breakdown by decision turn (already added to the Groq rollout table in research). These can be mirrored in the backend logs for quick checks.
+ - Add simple counters to training logs:
+   - number of decisions with `unique_delta>0`
+   - sum of deltas per batch
+   - share of batches with nonzero event rewards
+
+ ## Backward compatibility
+ - When flags are off, the pipeline uses trajectory outcome rewards only.
+ - No schema migrations are required; event-level metadata is optional.
+
+ ## Recommended defaults
+ - `step_rewards_enabled = true`
+ - `step_rewards_mode = "decision_stepwise"`
+ - Prefer `unique` deltas for better credit assignment; set `event_rewards_kind = "unique"` (if adopted) or rely on the implicit default of unique deltas.
+
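Put together, the recommended defaults translate into a `[training]` block like this (using only the flag names defined earlier in this note):

```toml
[training]
step_rewards_enabled = true
step_rewards_mode = "decision_stepwise"
# unique deltas give tighter credit assignment than absolute deltas
event_rewards_kind = "unique"
```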
+ Here’s the exact file-by-file implementation checklist, scoped so another engineer can implement from this alone.
+
+ Warming_up_to_rl (task app) – record decision rewards and honor flags
+ - Config examples (ensure flags present and documented)
+   - `examples/warming_up_to_rl/configs/*.toml`
+   - Add under [training]:
+     - `step_rewards_enabled = true|false`
+     - `step_rewards_mode = "off" | "decision_stepwise" | "env_sparse"`
+     - Optional: `event_rewards_kind = "unique" | "absolute"`
+     - Optional shaping: `step_rewards_beta`, `step_rewards_indicator_lambda`
+
+ - Policy (compute ach/unique deltas per decision; emit into step metadata when enabled)
+   - `examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py`
+   - Before/after each tool call sequence, compute:
+     - `ach_delta = len(achievements_after − achievements_before)`
+     - `unique_delta = len((episode_achievements_after) − (episode_achievements_before))`
+   - When `[training].step_rewards_enabled` and `step_rewards_mode == "decision_stepwise"`:
+     - Attach to the step’s returned metadata:
+       - `decision_rewards = { turn, ach_delta, unique_delta, all: [...], unique: [...] }`
+   - If `step_rewards_mode == "env_sparse"`, do not emit `decision_rewards` (leave environment’s `reward_last_step` as the only per-step reward).
+   - Respect clipping for long “Previous tool calls” context (already added; keep).
+
+ - Policy routes (surface flags to policy; store on policy instance or in request metadata)
+   - `examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py`
+   - Accept training flags from create/init endpoints (if provided via config).
+   - Pass through/attach the flags into the policy or per-step metadata so `policy.step(...)` can read them.
+
+ - Rollout coordinator (guarantee metadata flows out with each step)
+   - `examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py`
+   - Ensure the step response returned to the caller includes `decision_rewards` when set by the policy.
+   - No compute here; just propagate metadata.
+
+ - Environment adapter (ensure observation has fields needed by the deltas)
+   - `examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py`
+   - Confirm each step response includes `observation.achievements_status` and `observation.reward_last_step`.
+   - No reward computation changes here; just guarantee the fields exist.
+
+ Monorepo (clustered training, GSPO/GRPO) – use decision/env-sparse rewards to build per-step returns
+ - Config loader (read flags; default behavior preserved)
+   - `backend/app/routes/clustered_training/core/algorithms/gspo/training/clustered_trainer.py`
+   - In `ClusteredTrainerConfig.from_dict(...)`:
+     - Already present: `step_rewards_enabled`, `step_rewards_mode`, `step_rewards_beta`, `step_rewards_indicator_lambda`.
+     - Add (optional) read: `event_rewards_kind` with default `"unique"` if not present.
+
+ - Collector/rollout trajectory builder (construct r_t per episode)
+   - The module that converts environment/policy step records into trajectories (collector). If it’s split, cover the point where step arrays are built just before advantage computation.
+   - New logic:
+     - Initialize `r = [0.0] * T`.
+     - If `step_rewards_enabled`:
+       - If `step_rewards_mode == "decision_stepwise"`:
+         - For each step metadata with `decision_rewards`:
+           - `idx = turn - 1`
+           - `base = unique_delta` if `event_rewards_kind == "unique"` else `ach_delta`
+           - `r[idx] += float(base)`
+           - If `step_rewards_indicator_lambda > 0` and `unique_delta > 0`, `r[idx] += step_rewards_indicator_lambda`
+       - Else if `step_rewards_mode == "env_sparse"`:
+         - For each step, `r[t] += float(observation.reward_last_step or 0.0)`
+       - Else (`off`): `r[-1] += float(outcome_reward)`
+     - Optional shaping: `r[t] += step_rewards_beta * (T - t) * indicator_flip_t` where `indicator_flip_t = 1` if the step had `unique_delta > 0`, else 0.
+     - Ensure this path does not run when flags are off; old outcome-only behavior remains.
+
+ - Advantage/returns computation (no API change; just consume r)
+   - The function/module that currently builds returns/advantages from rewards.
+   - No interface changes; ensure it takes `r` from the collector path above instead of a single scalar outcome reward when event rewards are enabled.
+
+ - Logging/metrics (help ops confirm it’s working)
+   - Add counters in the training loop logs:
+     - Sum of `r` per batch (stepwise mode).
+     - Count of decisions with `unique_delta > 0`.
+     - Mode/flags echoed on startup.
+
+ - RL configs (dev example TOMLs with flags)
+   - `backend/app/routes/clustered_training/dev/configs/crafter_online.toml`
+     - Add the `[training]` keys above with comments showing choices.
+   - Any job start scripts that inline TOML (e.g. `tests/applications/crafter/rl/start_qwen_full_clustered.py` if used)
+     - Ensure they don’t strip the new keys; no code change needed if they pass through the TOML.
+
+ Research (optional reference; not required for GSPO)
+ - Reference rollout script demonstrating decision-delta computation
+   - `research/testing/crafter/eval_rollout_table_groq.py`
+   - Already computes/prints per-decision deltas; use as validation aid (no further changes required for GSPO).
+
+ Docs/notes (keep implementers aligned)
+ - Warming up to RL notes
+   - `examples/warming_up_to_rl/event_rewards.md`
+   - Already describes flags and expectations; keep this in sync if any naming changes happen.
+
+ - Research spec
+   - `research/testing/crafter/event_rewards.txt`
+   - Already contains the full design and the “recording AND using stepwise rewards” plan.
+
+ Sanity checklist (engineer can validate with these)
+ - With `[training].step_rewards_enabled=false`: identical behavior to today (only outcome reward used).
+ - With `decision_stepwise`:
+   - The task app emits `decision_rewards` per decision (check one trajectory).
+   - The collector constructs `r_t` from `unique_delta` (or `ach_delta` if configured).
+   - Training logs show nonzero stepwise batch reward sums.
+ - With `env_sparse`:
+   - No decision payload; rewards come strictly from `reward_last_step`.
+ - Switching `event_rewards_kind` between `"unique"` and `"absolute"` changes which scalar lands in `r` at a decision turn.
+
+ If you want, I can generate minimal code diffs for each target file after you confirm these paths and flag names.
examples/warming_up_to_rl/old/notes.md
@@ -0,0 +1,73 @@
+ # Crafter Task App Ops Cheatsheet
+
+ ## Discover available task apps
+ - `uvx synth-ai task-app list`
+   - Lists the registered apps plus any aliases (e.g. `grpo-crafter`, `crafter`).
+
+ ## Run locally with uvicorn
+ - Launch the FastAPI server:
+   - `uvx synth-ai serve grpo-crafter --port 8010 --force`
+   - `--force` frees the port if a previous run is still bound.
+   - Add `--reload` while iterating on code.
+ - Enable tracing + SFT dumps while serving:
+   - `uvx synth-ai serve grpo-crafter --port 8010 --force --trace ./traces --trace-db ./traces/v3/synth_ai.db`
+   - `--trace` writes JSONL trajectories into the folder.
+   - `--trace-db` points to the sqlite/Turso-compatible tracing DB (defaults to `traces/v3/synth_ai.db`).
+
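Once the server is up, a quick sanity check is to hit the app directly with the environment key; the `/health` path and `X-API-Key` header shown here are assumptions based on how the CLI health-checks task apps, so adjust to whatever the app actually exposes:

```bash
# Hypothetical health probe against the locally served task app
curl -sS -H "X-API-Key: $ENVIRONMENT_API_KEY" http://localhost:8010/health
```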
+ ## Modal hot-reload (`modal serve`)
+ - Run the hosted app locally inside Modal’s hot-reload loop:
+   - `uvx synth-ai task-app modal-serve grpo-crafter --env-file .env`
+   - CLI will prompt for a `.env` file if not supplied; secrets are loaded via `Secret.from_dotenv`.
+   - Keeps watching the repo for changes and streams logs in your terminal.
+
+ ## Modal deploy (persistent endpoint)
+ - Build + deploy to the `modal deploy` target:
+   - `uvx synth-ai task-app deploy grpo-crafter --env-file .env`
+   - Use `--dry-run` first to inspect the generated `modal deploy …` command.
+   - `--modal-cli` lets you point at a non-default Modal binary if needed.
+
+ ## Collecting traces & rollouts
+ - Local rollouts against a running server with full trace payloads:
+   - `uv run python examples/warming_up_to_rl/run_local_rollout_traced.py --api-key "$ENVIRONMENT_API_KEY" --base-url http://localhost:8010 --model gpt-4o-mini --trace-format full --trace-path ./trace_full.json`
+   - This script prints a reward summary, dumps the trace JSON, and warns if episode returns don’t line up with event rewards.
+ - Remote rollouts against a deployed Modal endpoint:
+   - `uv run python examples/warming_up_to_rl/run_rollout_remote.py --base-url https://<modal-app-url> --api-key "$ENVIRONMENT_API_KEY" --model gpt-4o-mini --max-llm-calls 10`
+
+ ## Trace analytics
+ - Summarise model usage, reward breakdowns, and achievement histograms:
+   - `uv run python examples/warming_up_to_rl/analyze_trace_db.py --db traces/v3/synth_ai.db`
+   - Output includes per-model achievement tallies and episode reward stats.
+
+ ## Exporting behavioural-cloning datasets
+ - Filter sessions via model, achievements, rewards, etc., then export JSONL:
+   - `uv run python examples/warming_up_to_rl/export_trace_sft.py \`
+     ` --db traces/v3/synth_ai.db \`
+     ` --output traces/qwen32b_filtered.jsonl \`
+     ` --model qwen/qwen3-32b \`
+     ` --exclude-achievement collect_sapling \`
+     ` --exclude-achievement collect_drink \`
+     ` --min-unique 3 \`
+     ` --event-reward unique_achievement_delta:1.0 \`
+     ` --limit 100`
+   - `--exclude-achievement` makes it easy to ignore easier unlocks when enforcing `--min-unique`.
+   - Combine `--require-achievement`, `--min-outcome-reward`, or provider filters as needed.
+
+ ## Training jobs (RL + SFT)
+ - `uvx synth-ai train` is the consolidated entry point for RL or SFT launches.
+   - Omit `--config` to let the CLI enumerate candidate TOMLs (RL + FFT) and pick interactively.
+   - Omit `--env-file` to browse available `.env` files; the CLI never auto-selects.
+   - Missing secrets trigger an interactive loop: enter manually, switch `.env`, or fetch from Modal (secrets/apps) before proceeding.
+ - RL run (local backend + local task app):
+   - `uvx synth-ai train --type rl --config examples/warming_up_to_rl/configs/crafter_cluster.toml --backend http://localhost:8000/api --task-url http://localhost:8010`
+   - Performs task-app health checks using the resolved `ENVIRONMENT_API_KEY` before posting to `/rl/jobs`.
+   - Polls job status until terminal unless `--no-poll` is supplied.
+ - SFT run (FFT fine-tune):
+   - `uvx synth-ai train --type sft --config examples/warming_up_to_rl/configs/fft_crafter.toml --dataset traces/crafter_sft.jsonl`
+   - Uploads training/validation JSONL to `/learning/files` and starts the job.
+   - Poll output mirrors the legacy `run_fft_and_save.py` script.
+ - Common flags:
+   - `--dry-run` previews payloads/uploads without making requests.
+   - `--idempotency` sets the `Idempotency-Key` header for RL submissions.
+   - `--poll-timeout` / `--poll-interval` tune the backend polling cadence.
+
+ > Tip: all `uvx synth-ai …` subcommands accept `--help` if you need to inspect additional options on the fly.
examples/warming_up_to_rl/readme.md
@@ -0,0 +1,172 @@
+ # Warming Up to RL (Crafter)
+
+ The Crafter example demonstrates the full Synth AI workflow: task app serving, Groq rollouts, tracing, SFT dataset export, FFT training, evaluation of fine-tuned models, and RL training.
+
+ ## Quick Reference Commands
+
+ - Serve task app locally with tracing:
+   ```bash
+   uvx synth-ai serve --port 8001 --env-file examples/warming_up_to_rl/.env --trace traces/v3
+   ```
+ - Deploy to Modal:
+   ```bash
+   uvx synth-ai deploy grpo-crafter --name grpo-crafter-task-app
+   ```
+ - Groq rollout (server-side):
+   ```bash
+   uv run python examples/warming_up_to_rl/run_eval.py --toml examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml --use-rollout
+   ```
+ - Export SFT data from traced runs:
+   ```bash
+   python examples/warming_up_to_rl/export_trace_sft.py --db traces/v3/synth_ai.db --output ft_data/crafter_traces.jsonl
+   ```
+ - FFT via CLI:
+   ```bash
+   uvx synth-ai train --type sft --config examples/warming_up_to_rl/configs/crafter_fft.toml --dataset /absolute/path/to/data.jsonl
+   ```
+ - Evaluate FFT checkpoint:
+   ```bash
+   uv run python examples/warming_up_to_rl/run_eval.py --toml examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml --use-rollout
+   ```
+ - RL via CLI (FFT-first):
+   ```bash
+   uvx synth-ai train --type rl --config examples/warming_up_to_rl/configs/rl_from_ft.toml
+   ```
+
+ ---
+
+ ## 1. Prerequisites
+
+ - Python 3.11+
+ - `uv`/`uvx` available (or install Synth in a virtualenv)
+ - Modal CLI (`modal token new`) if you plan to deploy the task app
+ - `.env` in this directory with at least:
+   - `SYNTH_API_KEY`
+   - `ENVIRONMENT_API_KEY`
+   - `TASK_APP_URL` (when running against a hosted task app)
+   - Optional: `GROQ_API_KEY`, `OPENAI_API_KEY` for proxy endpoints
+
+ `uvx synth-ai setup` can populate the `.env` by guiding you through the dashboard handshake.
+
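A minimal `.env` sketch with the keys listed above (the values are placeholders; only the key names come from this guide):

```bash
# examples/warming_up_to_rl/.env
SYNTH_API_KEY=sk-synth-...
ENVIRONMENT_API_KEY=env-key-...
TASK_APP_URL=https://your-task-app.example.com
# Optional, for proxy endpoints
GROQ_API_KEY=gsk-...
OPENAI_API_KEY=sk-...
```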
+ > All commands below assume you are running from the repository root unless noted.
+
+ ## 2. Task App Operations
+
+ ### Local development
+
+ ```bash
+ uvx synth-ai serve --port 8001 --env-file examples/warming_up_to_rl/.env --trace traces/v3 --trace-db traces/v3/synth_ai.db
+ ```
+
+ - `--trace` and `--trace-db` enable tracing v3 and SFT JSONL dumps.
+ - Add `--reload` for uvicorn auto-reload while editing code.
+
+ ### Modal deploy / serve
+
+ ```bash
+ uvx synth-ai deploy grpo-crafter --name grpo-crafter-task-app --env-file examples/warming_up_to_rl/.env
+ uvx synth-ai modal-serve grpo-crafter --name grpo-crafter-task-app --env-file examples/warming_up_to_rl/.env
+ ```
+
+ Both commands preflight the environment key with the backend when `SYNTH_API_KEY` is present.
+
+ ## 3. Baseline Evaluations (Groq and Synth vLLM)
+
+ Evaluation scripts auto-load `.env` values. Update TOMLs under `configs/` with the correct `task_app_url` and provider-specific model names.
+
+ - Groq Qwen3-32B:
+   ```bash
+   uv run python examples/warming_up_to_rl/run_eval.py --toml examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml --use-rollout
+   ```
+ - Synth vLLM Qwen3-4B (Modal-hosted inference URL specified in TOML):
+   ```bash
+   uv run python examples/warming_up_to_rl/run_eval.py --toml examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml --use-rollout
+   ```
+
+ `--use-rollout` drives the task app’s `/rollout` endpoint so achievements and metrics are captured. Without it the script issues per-step `initialize/step/terminate` calls.
+
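The eval TOMLs referenced above live under `configs/`; their exact schema belongs to `run_eval.py`, so treat this as an illustrative sketch of just the two values the text says to update (`task_app_url` and the model name), not a complete config:

```toml
# Hypothetical excerpt; see configs/eval_groq_qwen32b.toml for the real layout
task_app_url = "https://your-task-app.example.com"
model = "qwen/qwen3-32b"
```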
+ ## 4. Tracing and SFT Dataset Export
+
+ 1. Serve the task app with tracing enabled (see Section 2) or run the traced rollout helper:
+    ```bash
+    uv run python examples/warming_up_to_rl/run_local_rollout_traced.py --episodes 10 --difficulty easy
+    ```
+ 2. Inspect local trace databases:
+    ```bash
+    uvx synth-ai traces --limit 10
+    ```
+ 3. Export JSONL suitable for SFT:
+    ```bash
+    python examples/warming_up_to_rl/export_trace_sft.py \
+      --db traces/v3/synth_ai.db \
+      --min-achievements 3 \
+      --output ft_data/crafter_traces.jsonl
+    ```
+
+ The exporter enriches each example with achievements unlocked, model metadata, and reward summaries.
+
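For orientation, one exported line might look roughly like the following; the field names are illustrative (the real schema is whatever `export_trace_sft.py` emits), with the usual chat-style `messages` payload plus the enrichment described above:

```json
{"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}],
 "metadata": {"model": "qwen/qwen3-32b", "achievements": ["collect_wood"], "outcome_reward": 2.0}}
```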
+ ## 5. SFT / FFT Training
+
+ ### Preferred: `uvx synth-ai train`
+
+ ```bash
+ uvx synth-ai train \
+   --type sft \
+   --config examples/warming_up_to_rl/configs/crafter_fft.toml \
+   --dataset /absolute/path/to/crafter_traces.jsonl
+ ```
+
+ The CLI will:
+ - Prompt for `.env` selection (or use `--env-file`).
+ - Upload training (and optional validation) data to `/learning/files`.
+ - Submit the job and poll until completion unless `--no-poll` is set.
+
+ ### Legacy script
+
+ ```bash
+ uv run python examples/warming_up_to_rl/run_fft_and_save.py \
+   --toml examples/warming_up_to_rl/configs/crafter_fft.toml \
+   --data /absolute/path/to/crafter_traces.jsonl \
+   --poll-seconds 1800
+ ```
+
+ The script writes the resulting model ID to `ft_model_id.txt`. Use that ID in evaluation and RL configs (e.g., `model = "ft:abc123"`).
+
+ ## 6. Evaluate the Fine-tuned Model
+
+ After FFT completes, update `configs/eval_fft_qwen4b.toml` so `model = "ft:<model_id>"`, then rerun the evaluation:
+
+ ```bash
+ uv run python examples/warming_up_to_rl/run_eval.py --toml examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml --use-rollout
+ ```
+
+ This reuses the same Groq/vLLM pipeline but exercises the fine-tuned checkpoint.
+
+ ## 7. RL Training
+
+ ### Preferred: `uvx synth-ai train --type rl`
+
+ ```bash
+ uvx synth-ai train \
+   --type rl \
+   --config examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml
+ ```
+
+ During the interactive setup the CLI ensures `SYNTH_API_KEY`, `ENVIRONMENT_API_KEY`, and `TASK_APP_URL` are present, health-checks the task app, and submits the RL job to `/rl/jobs`.
+
+ ### Legacy script
+
+ ```bash
+ uv run python examples/warming_up_to_rl/run_rl_and_save.py \
+   --config examples/warming_up_to_rl/configs/rl_from_ft.toml
+ ```
+
+ To start directly from a base model, switch the config to `rl_from_base_qwen4b.toml` and ensure `[model].base` is populated.
+
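A minimal sketch of the `[model]` table for the from-base case; the key name comes from the paragraph above, while the value is a placeholder to replace with whatever base model `rl_from_base_qwen4b.toml` actually uses:

```toml
[model]
# Base checkpoint to start RL from (placeholder value)
base = "Qwen/Qwen3-4B"
```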
+ ## 8. Additional Utilities
+
+ - `manage_secrets.py` – convenience helpers for Modal secret management.
+ - `run_local_rollout.py`, `run_local_rollout_parallel.py`, `run_rollout_remote.py` – alternative rollout launchers for benchmarking.
+ - `analyze_trace_db.py` – inspect trace quality/achievements before exporting.
+
+ Refer to `docs/workflows/` for end-to-end guidance that mirrors these commands.