synth-ai 0.2.9.dev3__py3-none-any.whl → 0.2.9.dev5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of synth-ai might be problematic.
- examples/analyze_semantic_words.sh +17 -0
- examples/common_old/backend.py +21 -0
- examples/crafter_debug_render.py +180 -0
- examples/evals_old/README.md +98 -0
- examples/evals_old/__init__.py +6 -0
- examples/evals_old/compare_models.py +1037 -0
- examples/evals_old/example_log.md +145 -0
- examples/evals_old/run_demo.sh +126 -0
- examples/evals_old/trace_analysis.py +270 -0
- examples/finetuning_old/_backup_synth_qwen/config.toml +29 -0
- examples/finetuning_old/_backup_synth_qwen/example_log.md +324 -0
- examples/finetuning_old/_backup_synth_qwen/filter_traces.py +60 -0
- examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +239 -0
- examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +109 -0
- examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +1924 -0
- examples/finetuning_old/_backup_synth_qwen/readme.md +49 -0
- examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +114 -0
- examples/finetuning_old/_backup_synth_qwen/run_demo.sh +195 -0
- examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +118 -0
- examples/finetuning_old/synth_qwen_v1/README.md +68 -0
- examples/finetuning_old/synth_qwen_v1/filter_traces.py +60 -0
- examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +239 -0
- examples/finetuning_old/synth_qwen_v1/finetune.py +46 -0
- examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +71 -0
- examples/finetuning_old/synth_qwen_v1/infer.py +37 -0
- examples/finetuning_old/synth_qwen_v1/poll.py +44 -0
- examples/finetuning_old/synth_qwen_v1/prepare_data.py +35 -0
- examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +109 -0
- examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +1932 -0
- examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +207 -0
- examples/finetuning_old/synth_qwen_v1/run_ft_job.py +232 -0
- examples/finetuning_old/synth_qwen_v1/upload_data.py +34 -0
- examples/finetuning_old/synth_qwen_v1/util.py +147 -0
- examples/rl/README.md +169 -0
- examples/rl/configs/eval_base_qwen.toml +15 -0
- examples/rl/configs/eval_rl_qwen.toml +11 -0
- examples/rl/configs/rl_from_base_qwen.toml +35 -0
- examples/rl/configs/rl_from_base_qwen17.toml +74 -0
- examples/rl/configs/rl_from_ft_qwen.toml +35 -0
- examples/rl/download_dataset.py +64 -0
- examples/rl/run_eval.py +435 -0
- examples/rl/run_rl_and_save.py +94 -0
- examples/rl/task_app/README.md +22 -0
- {synth_ai/task/apps → examples/rl/task_app}/math_single_step.py +8 -8
- examples/rl/task_app/math_task_app.py +107 -0
- examples/rl_old/task_app.py +962 -0
- examples/run_crafter_demo.sh +10 -0
- examples/warming_up_to_rl/analyze_trace_db.py +420 -0
- examples/warming_up_to_rl/configs/crafter_fft.toml +48 -0
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +54 -0
- examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +20 -0
- examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +13 -0
- examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +23 -0
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +73 -0
- examples/warming_up_to_rl/configs/rl_from_ft.toml +56 -0
- examples/warming_up_to_rl/export_trace_sft.py +541 -0
- examples/warming_up_to_rl/groq_test.py +88 -0
- examples/warming_up_to_rl/manage_secrets.py +127 -0
- examples/warming_up_to_rl/old/event_rewards.md +234 -0
- examples/warming_up_to_rl/old/notes.md +73 -0
- examples/warming_up_to_rl/readme.md +172 -0
- examples/warming_up_to_rl/run_eval.py +434 -0
- examples/warming_up_to_rl/run_fft_and_save.py +309 -0
- examples/warming_up_to_rl/run_local_rollout.py +188 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +160 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +342 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +372 -0
- examples/warming_up_to_rl/run_rl_and_save.py +101 -0
- examples/warming_up_to_rl/run_rollout_remote.py +129 -0
- examples/warming_up_to_rl/task_app/README.md +38 -0
- {synth_ai/task/apps → examples/warming_up_to_rl/task_app}/grpo_crafter.py +7 -7
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +165 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +145 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1271 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +429 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +442 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +96 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +302 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +202 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +512 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +102 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +985 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +197 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1749 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +217 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +160 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +146 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +61 -0
- synth_ai/api/train/config_finder.py +18 -18
- synth_ai/api/train/env_resolver.py +28 -1
- synth_ai/cli/task_apps.py +291 -56
- synth_ai/task/apps/__init__.py +54 -13
- {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/METADATA +1 -1
- {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/RECORD +106 -13
- {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/top_level.txt +1 -0
- synth_ai/environments/examples/sokoban/units/astar_common.py +0 -95
- {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/licenses/LICENSE +0 -0
examples/rl/README.md
ADDED
@@ -0,0 +1,169 @@
# Math RL Demo (Single Step)

This example trains a reinforcement learning policy on single-step math problems sourced from the [EleutherAI/math](https://huggingface.co/datasets/EleutherAI/math) dataset. Episodes consist of a single tool call: the model must emit a `math_submit` function call whose `answer` field contains the final solution. Missing or malformed tool calls receive negative reward; correct answers earn positive reward.

## Quick Commands

```bash
# Serve locally with tracing
uvx synth-ai serve math-single-step --port 8101 --env-file examples/rl/.env --trace traces/math

# Modal deployment
uvx synth-ai deploy --name synth-math-single-step --env-file examples/rl/.env

# Evaluate base Qwen policy (validation split)
uv run python examples/rl/run_eval.py --toml examples/rl/configs/eval_base_qwen.toml

# Launch RL job from base model
uvx synth-ai train --type rl --config examples/rl/configs/rl_from_base_qwen.toml

# Evaluate RL checkpoint on held-out test split
uv run python examples/rl/run_eval.py --toml examples/rl/configs/eval_rl_qwen.toml
```

## 1. Prerequisites

- Python 3.11+
- `uv`/`uvx`
- Modal CLI (`modal token new`) for deployment
- `.env` at `examples/rl/.env` containing at least:
  - `SYNTH_API_KEY`
  - `ENVIRONMENT_API_KEY`
  - Optional: `TASK_APP_URL` (Modal URL), `GROQ_API_KEY`, `OPENAI_API_KEY`

Run `uvx synth-ai setup` to populate the `.env` if you have not paired the SDK before.

## 2. Task App

The task app is defined in `synth_ai/task/apps/math_single_step.py` and registered as `math-single-step`. It loads problems from the Hugging Face dataset (configurable via `MATH_DATASET_*` env vars) and manages per-episode state with an in-memory environment manager.

- **Observation**: single math problem (string) plus dataset metadata.
- **Actions**: exactly one `math_submit` tool call with an `answer` string.
- **Rewards** (see the sketch below):
  - `+1.0` for correct answer
  - `0.0` for incorrect answer
  - `-0.5` if the tool call omits an answer or uses the wrong tool
  - `-1.0` when no tool call is provided
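
The reward rules above can be written down directly. A minimal sketch (illustrative only; the actual scoring lives in the task app and may differ in detail, and `ToolCall`/`score_episode` are hypothetical names):

```python
from dataclasses import dataclass


@dataclass
class ToolCall:
    name: str
    answer: str | None


def score_episode(call: ToolCall | None, expected: str) -> float:
    """Map a single (possibly missing) tool call to a scalar reward."""
    if call is None:
        return -1.0  # no tool call at all
    if call.name != "math_submit" or not (call.answer or "").strip():
        return -0.5  # wrong tool, or tool call without an answer
    return 1.0 if call.answer.strip() == expected.strip() else 0.0
```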

Serve locally with tracing to capture trajectories:

```bash
uvx synth-ai serve math-single-step \
  --port 8101 \
  --env-file examples/rl/.env \
  --trace traces/math \
  --trace-db traces/math/synth_ai.db
```

Deploy or serve on Modal using the same env file; the registration includes a `ModalDeploymentConfig` that installs the `datasets` package automatically.

## 3. Evaluation

`examples/rl/run_eval.py` evaluates a policy by sampling deterministic seeds from the dataset splits. TOML configuration controls the model, split, and number of episodes. Example config (`eval_base_qwen.toml`):

```toml
provider = "synth"
task_app_url = "http://localhost:8101"
model = "Qwen/Qwen3-4B"
split = "validation"
num_episodes = 50
seed_start = 0

[policy]
inference_url = "http://localhost:8000/api/inference"
max_tokens = 128
temperature = 0.0
# Optional: override headers for inference requests
# [policy.headers]
# Authorization = "Bearer ..."
```

The `[policy]` table maps directly to the inference payload; add `[policy.headers]` if you need to forward custom HTTP headers (e.g., `Authorization`). If `SYNTH_API_KEY` is present, the evaluator automatically sends `Authorization: Bearer <key>`.
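
A minimal sketch of that header-merging behavior, assuming the config is parsed with the standard-library `tomllib` (Python 3.11+); `build_inference_headers` is an illustrative helper, not the actual `run_eval.py` code:

```python
import os
import tomllib  # stdlib TOML parser, Python 3.11+


def build_inference_headers(config_path: str) -> dict[str, str]:
    """Merge [policy.headers] from the eval TOML with a SYNTH_API_KEY bearer token."""
    with open(config_path, "rb") as fh:
        cfg = tomllib.load(fh)
    headers = dict(cfg.get("policy", {}).get("headers", {}) or {})
    api_key = os.getenv("SYNTH_API_KEY")
    if api_key and "Authorization" not in headers:
        headers["Authorization"] = f"Bearer {api_key}"
    return headers
```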

Set `--use-rollout` to exercise the server-side rollout endpoint instead of the per-step API.

The script reports accuracy and a breakdown of failure modes (`missing_tool_call`, `blank_answer`, etc.).

## 4. RL Training

Example RL config (`configs/rl_from_base_qwen.toml`):

```toml
[services]
task_url = "https://your-app.modal.run"

[model]
base = "Qwen/Qwen3-4B"

[data]
split = "train"
seed_start = 0
episodes_per_iteration = 2048

[training]
max_turns = 1
ops = ["agent", "env"]
batch_size = 128
group_size = 1024
reward_positive = 1.0
reward_negative_no_tool = -1.0
reward_negative_no_answer = -0.5

[policy]
model = "Qwen/Qwen3-4B"
inference_url = "https://your-inference-host"
max_tokens = 128
temperature = 0.0

[tags]
experiment = "math_single_step"
```

Submit jobs interactively with:

```bash
uvx synth-ai train --type rl --config examples/rl/configs/rl_from_base_qwen.toml
```

The CLI ensures the task app is reachable (`/health`, `/task_info`), prompts for missing secrets, and polls job status until completion. For scripted automation, use `run_rl_and_save.py`:

```bash
uv run python examples/rl/run_rl_and_save.py \
  --config examples/rl/configs/rl_from_base_qwen.toml \
  --backend https://backend.synth.ai/api
```
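
You can approximate the CLI's reachability check yourself before submitting a job. A minimal sketch using only the standard library; the `/health` and `/task_info` paths come from the description above, while the `X-API-Key` header name and the use of `ENVIRONMENT_API_KEY` for auth are assumptions:

```python
import os
import urllib.request


def task_app_ready(base_url: str) -> bool:
    """Return True if the task app answers its health endpoints with HTTP 200."""
    headers = {"X-API-Key": os.environ.get("ENVIRONMENT_API_KEY", "")}  # assumed header name
    for path in ("/health", "/task_info"):
        req = urllib.request.Request(base_url.rstrip("/") + path, headers=headers)
        try:
            with urllib.request.urlopen(req, timeout=10) as resp:
                if resp.status != 200:
                    return False
        except OSError:
            return False
    return True
```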

## 5. Evaluating RL Outputs

After training completes, set `model = "rl:<job_or_model_id>"` in `configs/eval_rl_qwen.toml` (and update `split = "test"` for a held-out set). Re-run `run_eval.py` to compare:

```bash
uv run python examples/rl/run_eval.py --toml examples/rl/configs/eval_rl_qwen.toml
```

Record both validation (pre-RL) and test (post-RL) accuracy to quantify improvements.

## 6. Dataset Notes

- By default the task app loads the [Hendrycks MATH benchmark](https://huggingface.co/datasets/nlile/hendrycks-MATH-benchmark). Override via `MATH_DATASET_NAME` / `MATH_DATASET_CONFIG` env vars if you want a different variant. The dataset is public and automatically downloaded when the task app starts; the server will fail fast with a clear error if it cannot be fetched.
- For offline use, run `uv run python examples/rl/download_dataset.py --output-dir examples/rl/data --dataset nlile/hendrycks-MATH-benchmark --config algebra --limit 2000`. Then start the task app with `MATH_DATASET_LOCAL_DIR=examples/rl/data` (or set `MATH_DATASET_LOCAL_<SPLIT>_FILE`).
- Hugging Face downloads occur at runtime; pre-fetch locally or mount a Modal volume if you need offline access.
- Seeds map directly to dataset indices. Use `seed_start` to control determinism in configs and evaluations (see the sketch below).
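
A minimal sketch of how a split could be resolved and how a seed maps to a problem, assuming the JSONL layout written by `download_dataset.py`; `load_split` and `problem_for_seed` are illustrative helpers, not the task app's actual loader:

```python
import json
import os
from pathlib import Path


def load_split(split: str) -> list[dict[str, str]]:
    """Prefer a local JSONL written by download_dataset.py, else pull from the Hub."""
    local_dir = os.getenv("MATH_DATASET_LOCAL_DIR")
    if local_dir:
        path = Path(local_dir) / f"{split}.jsonl"
        return [json.loads(line) for line in path.read_text(encoding="utf-8").splitlines() if line]
    from datasets import load_dataset

    name = os.getenv("MATH_DATASET_NAME", "nlile/hendrycks-MATH-benchmark")
    config = os.getenv("MATH_DATASET_CONFIG")
    ds = load_dataset(name, config, split=split) if config else load_dataset(name, split=split)
    return list(ds)


def problem_for_seed(rows: list[dict[str, str]], seed: int) -> dict[str, str]:
    # Seeds index directly into the split; seed_start in the configs shifts the window.
    return rows[seed % len(rows)]
```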

## 7. Additional Utilities

- `examples/rl/task_app/math_task_app.py` – legacy runner (`python .../math_task_app.py --reload`).
- `examples/rl/run_eval.py` – CLI evaluation helper (supports proxying Groq or hitting arbitrary inference URLs).
- `examples/rl/run_rl_and_save.py` – thin wrapper around the Synth `/rl/jobs` API.

For broader background on Synth task apps, CLI commands, and tracing, see the new documentation under `docs/`.

uv run python examples/rl/run_eval.py --toml examples/rl/configs/eval_base_qwen.toml
uvx synth-ai serve math-single-step \
  --port 8101 \
  --env-file examples/rl/.env \
  --trace traces/math \
  --force

examples/rl/configs/eval_base_qwen.toml
ADDED
@@ -0,0 +1,15 @@
provider = "synth"
task_app_url = "http://localhost:8101"
model = "Qwen/Qwen3-1.7B"
split = "validation"
num_episodes = 50
seed_start = 0

[policy]
inference_url = "http://localhost:8000/api/inference"
max_tokens = 128
temperature = 0.0

# Optionally supply custom headers
# [policy.headers]
# Authorization = "Bearer ..."

examples/rl/configs/eval_rl_qwen.toml
ADDED
@@ -0,0 +1,11 @@
provider = "synth"
task_app_url = "https://your-math-task.modal.run"
model = "rl:REPLACE_WITH_JOB_ID"
split = "test"
num_episodes = 200
seed_start = 100000

[policy]
inference_url = "https://your-inference-host"
max_tokens = 128
temperature = 0.0

examples/rl/configs/rl_from_base_qwen.toml
ADDED
@@ -0,0 +1,35 @@
[services]
task_url = "https://your-math-task.modal.run"

[model]
base = "Qwen/Qwen3-4B"

[policy]
model = "Qwen/Qwen3-4B"
inference_url = "https://your-inference-host"
max_tokens = 128
temperature = 0.0

[data]
split = "train"
seed_start = 0
episodes_per_iteration = 2048
evaluation_split = "validation"
evaluation_episodes = 256

[training]
max_turns = 1
ops = ["agent", "env"]
batch_size = 128
group_size = 1024
reward_positive = 1.0
reward_negative_no_tool = -1.0
reward_negative_no_answer = -0.5
learning_rate = 5e-6

[compute]
gpu_type = "A10G"
gpu_count = 4

[tags]
experiment = "math_single_step"

examples/rl/configs/rl_from_base_qwen17.toml
ADDED
@@ -0,0 +1,74 @@
[algorithm]
type = "online"
method = "policy_gradient"
variety = "gspo"

[services]
task_url = "http://localhost:8101"

[model]
base = "Qwen/Qwen3-1.7B"

[policy]
model = "Qwen/Qwen3-1.7B"
inference_url = "http://localhost:8000/api/inference"
max_tokens = 1028
temperature = 0.2

[data]
split = "train"
seed_start = 0
episodes_per_iteration = 1280  # 8 per group * 4 groups per batch * 2 batches per step * 20 steps
evaluation_split = "validation"
evaluation_episodes = 50

[training]
max_turns = 1
ops = ["agent", "env"]
batch_size = 2
group_size = 16
reward_positive = 1.0
reward_negative_no_tool = -1.0
reward_negative_no_answer = -0.5
learning_rate = 5e-6
log_interval = 1
weight_sync_interval = 1

[training.weight_sync]
enable = true
targets = ["policy"]

[compute]
gpu_type = "H100"
gpu_count = 4

[topology]
type = "single_node_split"
gpus_for_vllm = 2
gpus_for_training = 1
gpus_for_ref = 1
tensor_parallel = 1

[vllm]
tensor_parallel_size = 1
max_model_len = 4096

[reference]
placement = "dedicated"
port = 8002
tp = 1
health_max_wait_s = 180
health_interval_ms = 300

[rollout]
policy_name = "math-single-step"
max_turns = 1
episodes_per_batch = 32  # group_size * batch_size

[evaluation]
instances = 32
every_n_iters = 10
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

[tags]
experiment = "math_single_step_qwen17"

examples/rl/configs/rl_from_ft_qwen.toml
ADDED
@@ -0,0 +1,35 @@
[services]
task_url = "https://your-math-task.modal.run"

[model]
source = "ft:REPLACE_WITH_MODEL_ID"

[policy]
model = "ft:REPLACE_WITH_MODEL_ID"
inference_url = "https://your-inference-host"
max_tokens = 128
temperature = 0.0

[data]
split = "train"
seed_start = 0
episodes_per_iteration = 2048
evaluation_split = "validation"
evaluation_episodes = 256

[training]
max_turns = 1
ops = ["agent", "env"]
batch_size = 128
group_size = 1024
reward_positive = 1.0
reward_negative_no_tool = -1.0
reward_negative_no_answer = -0.5
learning_rate = 5e-6

[compute]
gpu_type = "A10G"
gpu_count = 4

[tags]
experiment = "math_single_step_from_fft"

examples/rl/download_dataset.py
ADDED
@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""Download subsets of the MATH dataset to local JSONL files."""

from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import Any

from datasets import load_dataset


def extract_examples(dataset: Any, *, limit: int | None) -> list[dict[str, str]]:
    if limit is not None:
        dataset = dataset.select(range(min(limit, len(dataset))))
    examples: list[dict[str, str]] = []
    for item in dataset:
        problem = (item.get("problem") or "").strip()
        solution = item.get("solution") or ""
        if isinstance(solution, list):
            solution = "\n".join(str(part) for part in solution)
        examples.append({
            "problem": problem,
            "solution": solution,
        })
    return examples


def write_jsonl(path: Path, rows: list[dict[str, str]]) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as fh:
        for row in rows:
            fh.write(json.dumps(row, ensure_ascii=False) + "\n")


def main() -> None:
    parser = argparse.ArgumentParser(description="Download MATH dataset splits to JSONL for offline use")
    parser.add_argument("--output-dir", default="examples/rl/data", help="Directory to write <split>.jsonl files")
    parser.add_argument("--dataset", default="nlile/hendrycks-MATH-benchmark", help="Hugging Face dataset identifier")
    parser.add_argument("--config", default="algebra", help="Hugging Face dataset config (if required)")
    parser.add_argument("--splits", nargs="*", default=["train", "validation", "test"], help="Splits to download")
    parser.add_argument("--limit", type=int, default=None, help="Optional cap on examples per split")
    args = parser.parse_args()

    output_dir = Path(args.output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    for split in args.splits:
        print(f"[INFO] Downloading {args.dataset} ({args.config}) split={split}")
        if args.config:
            dataset = load_dataset(args.dataset, args.config, split=split)
        else:
            dataset = load_dataset(args.dataset, split=split)
        rows = extract_examples(dataset, limit=args.limit)
        out_path = output_dir / f"{split}.jsonl"
        write_jsonl(out_path, rows)
        print(f"[INFO] Wrote {len(rows)} examples to {out_path}")

    print("Done. Set MATH_DATASET_LOCAL_DIR to the output directory when serving the task app.")


if __name__ == "__main__":
    main()
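
A quick, assumed sanity check of the files written by the script above (the `examples/rl/data` path is the script's default `--output-dir`; adjust if you changed it):

```python
import json
from pathlib import Path

# Each <split>.jsonl row written by download_dataset.py is {"problem": ..., "solution": ...}.
for split_file in sorted(Path("examples/rl/data").glob("*.jsonl")):
    rows = [json.loads(line) for line in split_file.read_text(encoding="utf-8").splitlines() if line]
    preview = rows[0]["problem"][:60] if rows else ""
    print(f"{split_file.name}: {len(rows)} examples; first problem: {preview!r}")
```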