freesolo-flash-dev 0.2.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flash/__init__.py +29 -0
- flash/_channel.py +23 -0
- flash/_fileio.py +35 -0
- flash/_logging.py +49 -0
- flash/_update_check.py +266 -0
- flash/catalog.py +253 -0
- flash/cli/__init__.py +1 -0
- flash/cli/main/__init__.py +227 -0
- flash/cli/main/__main__.py +6 -0
- flash/cli/main/commands.py +636 -0
- flash/cli/main/envpush.py +317 -0
- flash/cli/main/render.py +599 -0
- flash/cli/main/training_doc.py +455 -0
- flash/client/__init__.py +14 -0
- flash/client/config.py +70 -0
- flash/client/http.py +372 -0
- flash/client/runtime_secrets.py +69 -0
- flash/client/specs.py +20 -0
- flash/cost/__init__.py +16 -0
- flash/cost/analytical.py +175 -0
- flash/cost/facts.py +114 -0
- flash/cost/spec.py +113 -0
- flash/cost/types.py +158 -0
- flash/engine/__init__.py +6 -0
- flash/engine/accounting.py +36 -0
- flash/engine/chalk_kernels.py +116 -0
- flash/engine/multiturn_rollout.py +780 -0
- flash/engine/recipe.py +86 -0
- flash/engine/vram.py +603 -0
- flash/engine/worker/__init__.py +2916 -0
- flash/engine/worker/__main__.py +4 -0
- flash/engine/worker/kernel_warmup.py +400 -0
- flash/engine/worker/lora.py +796 -0
- flash/engine/worker/packing.py +366 -0
- flash/engine/worker/perf.py +1048 -0
- flash/envs/__init__.py +10 -0
- flash/envs/adapter/__init__.py +883 -0
- flash/envs/adapter/rubric.py +222 -0
- flash/envs/base.py +52 -0
- flash/envs/registry.py +62 -0
- flash/mcp/__init__.py +1 -0
- flash/mcp/server.py +85 -0
- flash/providers/__init__.py +59 -0
- flash/providers/_auth.py +24 -0
- flash/providers/_http.py +230 -0
- flash/providers/_instance.py +416 -0
- flash/providers/_instance_bootstrap.py +517 -0
- flash/providers/_poll.py +311 -0
- flash/providers/allocator.py +193 -0
- flash/providers/base.py +431 -0
- flash/providers/hyperstack/__init__.py +127 -0
- flash/providers/hyperstack/api.py +522 -0
- flash/providers/hyperstack/auth.py +17 -0
- flash/providers/hyperstack/gpus.py +29 -0
- flash/providers/hyperstack/jobs/__init__.py +632 -0
- flash/providers/hyperstack/jobs/builders.py +122 -0
- flash/providers/hyperstack/preflight.py +23 -0
- flash/providers/hyperstack/pricing.py +26 -0
- flash/providers/hyperstack/train.py +25 -0
- flash/providers/lambdalabs/__init__.py +139 -0
- flash/providers/lambdalabs/api.py +261 -0
- flash/providers/lambdalabs/auth.py +18 -0
- flash/providers/lambdalabs/gpus.py +29 -0
- flash/providers/lambdalabs/jobs/__init__.py +724 -0
- flash/providers/lambdalabs/jobs/builders.py +118 -0
- flash/providers/lambdalabs/preflight.py +27 -0
- flash/providers/lambdalabs/pricing.py +51 -0
- flash/providers/lambdalabs/train.py +27 -0
- flash/providers/preflight.py +55 -0
- flash/providers/realized.py +80 -0
- flash/providers/runpod/__init__.py +130 -0
- flash/providers/runpod/api.py +186 -0
- flash/providers/runpod/auth.py +37 -0
- flash/providers/runpod/cost.py +57 -0
- flash/providers/runpod/gpus.py +46 -0
- flash/providers/runpod/jobs.py +956 -0
- flash/providers/runpod/keys.py +139 -0
- flash/providers/runpod/preflight.py +30 -0
- flash/providers/runpod/preload.py +915 -0
- flash/providers/runpod/pricing.py +18 -0
- flash/providers/runpod/slots.py +79 -0
- flash/providers/runpod/train/__init__.py +150 -0
- flash/providers/runpod/train/deps.py +395 -0
- flash/providers/runpod/train/endpoints.py +820 -0
- flash/py.typed +0 -0
- flash/runner/__init__.py +686 -0
- flash/runner/checkpoints.py +82 -0
- flash/runner/deploy.py +422 -0
- flash/runner/lifecycle.py +672 -0
- flash/schema/__init__.py +375 -0
- flash/schema/fields.py +331 -0
- flash/serve/__init__.py +1 -0
- flash/serve/deploy.py +326 -0
- flash/serve/pricing.py +60 -0
- flash/server/__init__.py +1 -0
- flash/server/__main__.py +20 -0
- flash/server/app.py +961 -0
- flash/server/auth.py +263 -0
- flash/server/billing.py +124 -0
- flash/server/checkpoints.py +110 -0
- flash/server/db.py +160 -0
- flash/server/environment_registry.py +102 -0
- flash/server/envs.py +360 -0
- flash/server/reconcile.py +163 -0
- flash/server/run_registry.py +150 -0
- flash/spec.py +333 -0
- freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
- freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
- freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
- freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
- freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,455 @@
|
|
|
1
|
+
"""The TRAINING.md playbook scaffolded by `flash env setup`.
|
|
2
|
+
|
|
3
|
+
This is the single source of truth for the TRAINING.md that lands in a user's
|
|
4
|
+
project folder. It is written for the AI coding agent a user points at their
|
|
5
|
+
environment: a finished run is not a run that worked, and most of the value is in
|
|
6
|
+
how you design the signal, what you read, and how you decide a run is good.
|
|
7
|
+
|
|
8
|
+
Keep it flash-accurate — every command, config field, and import below exists in
|
|
9
|
+
this codebase (`flash <cmd>`, `[train]` fields in ``flash/spec.py``, and
|
|
10
|
+
``freesolo.environments``). Update it here, not in a copy.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
TRAINING_MD = r"""# TRAINING.md — how to actually improve a model with Flash
|
|
16
|
+
|
|
17
|
+
> **If you are an AI agent asked to train a model here, read this first.**
|
|
18
|
+
> `flash env setup` dropped this file next to your `environment.py` and `configs/`.
|
|
19
|
+
> It is the playbook Freesolo's own training agents follow to turn a *finished*
|
|
20
|
+
> run into a model that *measurably improved*. The mechanics live in the hosted
|
|
21
|
+
> docs (https://freesolo.co/docs); this file is the judgment that sits on top of them.
|
|
22
|
+
|
|
23
|
+
A run that reaches `done` is **not** the same as a run that worked. Submitting a run
|
|
24
|
+
is not a result. The whole job is to design the learning signal, read what the run
|
|
25
|
+
actually produced, and decide — honestly — whether the model got better.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Using Flash
|
|
30
|
+
|
|
31
|
+
Flash is a **managed** training service with a thin CLI/client. You author an
|
|
32
|
+
environment (the task + its reward), publish it, and submit SFT or GRPO runs from a
|
|
33
|
+
TOML config. Flash allocates the cheapest fitting GPU across providers, runs the job,
|
|
34
|
+
streams logs back, and serves the result. You never handle provider credentials or a
|
|
35
|
+
GPU — you authenticate once with a freesolo API key, and everything below is a `flash`
|
|
36
|
+
CLI command.
|
|
37
|
+
|
|
38
|
+
### Install & authenticate
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install freesolo-flash # installs the `flash` CLI (import name is also `flash`)
|
|
42
|
+
flash login --api-key fslo_... # or: export FREESOLO_API_KEY=fslo_... (create a key at https://freesolo.co)
|
|
43
|
+
flash whoami # confirm the identity behind your key
|
|
44
|
+
flash models # supported base models (and which support `thinking`)
|
|
45
|
+
flash gpus # managed GPU classes with live $/hr
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### The project layout (`flash env setup` created this)
|
|
49
|
+
|
|
50
|
+
```text
|
|
51
|
+
environment.py # the task: how to prompt the model and how to score it
|
|
52
|
+
datasets/train.jsonl # training rows, one JSON object per line: {"input": ..., "output": ...}
|
|
53
|
+
configs/rl.toml # a GRPO (RL) run config
|
|
54
|
+
configs/sft.toml # an SFT run config
|
|
55
|
+
TRAINING.md # this file
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### 1. Author the environment
|
|
59
|
+
|
|
60
|
+
`environment.py` defines the task. A single-turn env subclasses
|
|
61
|
+
`EnvironmentSingleTurn`, turns a row into a prompt, and scores the model's response
|
|
62
|
+
with a `RewardResult` (see *Reward design* below). `load_environment()` is the entry
|
|
63
|
+
point Flash calls:
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from freesolo.datasets.types import TaskExample
|
|
67
|
+
from freesolo.environments import EnvironmentSingleTurn, RewardResult
|
|
68
|
+
|
|
69
|
+
class MyEnv(EnvironmentSingleTurn):
|
|
70
|
+
dataset = load_jsonl("datasets/train.jsonl") # rows -> TaskExample(input=..., output=...)
|
|
71
|
+
|
|
72
|
+
def build_prompt_messages(self, example: TaskExample, prompt_text: str):
|
|
73
|
+
return [{"role": "user", "content": example.input}]
|
|
74
|
+
|
|
75
|
+
def score_response(self, example: TaskExample, response_text: str) -> RewardResult:
|
|
76
|
+
expected = str(example.output or "").strip()
|
|
77
|
+
score = 1.0 if expected and expected in response_text else 0.0
|
|
78
|
+
return RewardResult(score=score, threshold=1.0)
|
|
79
|
+
|
|
80
|
+
def load_environment(**kwargs) -> MyEnv:
|
|
81
|
+
return MyEnv()
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
For tool use, dialogue, or games, subclass `EnvironmentMultiTurn` instead and drive the
|
|
85
|
+
conversation across turns. The reward is the same `RewardResult` contract either way.
|
|
86
|
+
|
|
87
|
+
### 2. Publish the environment
|
|
88
|
+
|
|
89
|
+
A managed run references a **published** environment by id — so push your folder first:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
flash env push --name my-env . # uploads this project; prints an env id like "your-org/my-env"
|
|
93
|
+
flash env list # installed envs + local sources you can push
|
|
94
|
+
flash env install your-org/their-env # record an env someone else published, to train against it
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Paste the returned id into `[environment] id` in **both** configs. Re-push after any
|
|
98
|
+
edit to `environment.py` or `datasets/` so the managed run uses your change.
|
|
99
|
+
|
|
100
|
+
### 3. Configure the run (TOML)
|
|
101
|
+
|
|
102
|
+
```toml
|
|
103
|
+
model = "Qwen/Qwen3.5-4B" # see `flash models`
|
|
104
|
+
algorithm = "grpo" # "grpo" (RL) or "sft"
|
|
105
|
+
# thinking = true # opt-in reasoning mode, for models that support it
|
|
106
|
+
|
|
107
|
+
[environment]
|
|
108
|
+
id = "your-org/my-env" # the id printed by `flash env push`
|
|
109
|
+
# secrets = ["SERPAPI_API_KEY"] # only the NAMES of env vars your environment reads;
|
|
110
|
+
# values are pulled from your shell/.env at submit time,
|
|
111
|
+
# never stored in the spec
|
|
112
|
+
|
|
113
|
+
[train]
|
|
114
|
+
steps = 150 # GRPO is step-driven; SFT is epoch-driven (epochs = N)
|
|
115
|
+
lora_rank = 32
|
|
116
|
+
lora_alpha = 64
|
|
117
|
+
seeds = [0]
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
GPU and the HF artifact repo are **fully managed** — there is no GPU knob; the allocator
|
|
121
|
+
picks the cheapest class that fits, and each run gets its own artifact repo. Compose or
|
|
122
|
+
tweak configs without editing files: `--config extra.toml` (deep-merge) and
|
|
123
|
+
`--set key=value` (e.g. `--set train.steps=300`).
|
|
124
|
+
|
|
125
|
+
### 4. Submit
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
flash train configs/rl.toml --dry-run # validate the config locally — no GPU, no charge
|
|
129
|
+
flash train configs/rl.toml --cost # pre-flight USD estimate, then exit
|
|
130
|
+
flash train configs/rl.toml # submit and follow logs (Ctrl-C detaches)
|
|
131
|
+
flash train configs/rl.toml --background # submit and return immediately
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### 5. Monitor
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
flash status <run-id> # state + accrued cost
|
|
138
|
+
flash status <run-id> --logs # reward/loss trend + worker console/error logs + any traceback
|
|
139
|
+
flash status <run-id> --follow # stream a live run to completion
|
|
140
|
+
flash runs # all your runs and their state/cost
|
|
141
|
+
flash cancel <run-id> # stop a run
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### 6. Deploy & chat
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
flash checkpoints <run-id> # deployable per-step RL checkpoints
|
|
148
|
+
flash deploy <run-id> # serve the trained adapter (--step N for an intermediate checkpoint)
|
|
149
|
+
flash chat <run-id> -m "hello" # chat with the deployed adapter
|
|
150
|
+
flash deployments # active serving endpoints
|
|
151
|
+
flash undeploy <run-id> # tear the endpoint down
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
> Flash also ships an **MCP bridge** (`flash` as an MCP server) so a coding agent can
|
|
155
|
+
> drive these same commands as tools.
|
|
156
|
+
|
|
157
|
+
The rest of this file is about doing the above *well* — designing a reward that teaches,
|
|
158
|
+
and deciding honestly whether a run improved.
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## The loop
|
|
163
|
+
|
|
164
|
+
Work in tight, attributable iterations. Each one is a hypothesis:
|
|
165
|
+
|
|
166
|
+
```
|
|
167
|
+
1. Reconstruct state — what's the best run so far, and what have you already tried?
|
|
168
|
+
2. Form a hypothesis — pick ONE lever and say WHY it will move the metric.
|
|
169
|
+
3. Change that ONE lever.
|
|
170
|
+
4. Validate locally — `flash train configs/rl.toml --dry-run` (catches config errors
|
|
171
|
+
for free; a paid run on a broken config or an all-zero reward is wasted budget).
|
|
172
|
+
5. Submit — `flash train configs/rl.toml`.
|
|
173
|
+
6. Judge — read the metric trend AND a sample of real rollouts (see below).
|
|
174
|
+
7. Keep the best run; revert the change if it didn't beat the noise band. Repeat.
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
**Lever priority (highest impact first):** reward design → data / curriculum →
|
|
178
|
+
training knobs. The reward is the teacher; spend your effort there before touching
|
|
179
|
+
hyperparameters.
|
|
180
|
+
|
|
181
|
+
**One controlled change at a time.** Bundling changes makes the effect
|
|
182
|
+
unattributable. Never re-run a setting that already failed at a negligibly
|
|
183
|
+
different value.
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## Before you trust a run — the checklist
|
|
188
|
+
|
|
189
|
+
A run is only evidence of improvement when **all** of these hold:
|
|
190
|
+
|
|
191
|
+
- [ ] The run reached `done` (confirmed via `flash status <run-id>`), not merely submitted.
|
|
192
|
+
- [ ] The reward trend rose (GRPO `reward_mean`) or the SFT loss fell — **beyond the noise band**, not within it.
|
|
193
|
+
- [ ] You **probed the trained adapter on real inputs** (`flash deploy` + `flash chat`), including cases it should fail — not just the metrics.
|
|
194
|
+
- [ ] The score is real behavior, not empty/truncated/templated outputs, skipped rows, leakage, a swallowed exception, or a format-only win.
|
|
195
|
+
- [ ] If you track a clean success signal separately from the shaped reward (an explicit `RewardMetric`), *that* moved too.
|
|
196
|
+
|
|
197
|
+
If any box is unchecked, the run is not done improving — keep training, don't declare success.
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Judge the run, don't just finish it
|
|
202
|
+
|
|
203
|
+
- **Judge the trend, not a single number.** The proof of training is the curve:
|
|
204
|
+
`reward_mean` rising over steps (GRPO) or loss falling (SFT). Record the base/early
|
|
205
|
+
value and the final value. A flat or noisy trend with no improvement is not success.
|
|
206
|
+
- **Read the model's outputs, not just the metrics.** A rising reward can come from
|
|
207
|
+
reward-hacking or a degenerate output the reward still credits — metrics alone never
|
|
208
|
+
establish that the model got better. Flash does not expose training-time rollouts
|
|
209
|
+
through the CLI (`--logs` gives you the metric trend and the worker's console/error
|
|
210
|
+
logs, not the sampled generations), so to read real outputs **deploy the adapter and
|
|
211
|
+
probe it**: `flash deploy <run-id>` then `flash chat <run-id> -m "..."` on at least a
|
|
212
|
+
few real inputs, including ones it should get wrong.
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
flash status <run-id> # state + accrued cost
|
|
216
|
+
flash status <run-id> --logs # metric trend + worker console/error logs (+ traceback)
|
|
217
|
+
flash status <run-id> --follow # stream a live run until completion
|
|
218
|
+
flash deploy <run-id> # serve the adapter, then `flash chat` it to read real outputs
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
- **Decide with the noise band.** When comparing two runs or two checkpoints, record
|
|
222
|
+
the eval-split size `N` and the metric's approximate sampling noise — about
|
|
223
|
+
`1.96·√(p(1-p)/N)` for a rate metric `p`. Treat a difference *inside* that band as
|
|
224
|
+
**no change** — neither improvement nor regression. A within-noise gain is not a win.
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
## Reward design (GRPO) — your highest-impact lever
|
|
229
|
+
|
|
230
|
+
The reward defines what the model learns; its quality sets the ceiling on what GRPO
|
|
231
|
+
can reach. Rewards are rubric / `score_response` functions in your `environment.py`.
|
|
232
|
+
|
|
233
|
+
### Make it graded and dense — avoid the all-zero cold start
|
|
234
|
+
|
|
235
|
+
If `reward_mean` is flat at ~0.000, every rollout in the group scored the same, the
|
|
236
|
+
advantage is zero, and the policy gets **no gradient**. That is a reward-design bug,
|
|
237
|
+
not a model to keep training. Reshape the reward to credit **ordered partial
|
|
238
|
+
progress** so even an untrained base model earns a small nonzero score and better
|
|
239
|
+
attempts score strictly higher:
|
|
240
|
+
|
|
241
|
+
```text
|
|
242
|
+
well-formed / parses → schema- & safety-valid → executes / runs → correct / relevant
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
Gate only the **top** tiers against gaming; keep the lower tiers dense. GRPO needs
|
|
246
|
+
*within-group variance* to learn — if every rollout in a group scores identically,
|
|
247
|
+
there is nothing to optimize.
|
|
248
|
+
|
|
249
|
+
### Separate the shaped reward from a clean success signal
|
|
250
|
+
|
|
251
|
+
A good GRPO reward is usually **shaped** — partial credit so the model always has a
|
|
252
|
+
gradient to climb. But a shaped score is the wrong thing to judge *final quality* on:
|
|
253
|
+
it can rise from reward-hacking while the outcome you care about stays flat. Report the
|
|
254
|
+
shaped value as `score`, and surface the clean pass/fail as an **explicit
|
|
255
|
+
`RewardMetric`** so it shows up in the run's metric breakdown — a bare `threshold` is
|
|
256
|
+
used for grading but is *not* logged on its own, so it gives you nothing to judge:
|
|
257
|
+
|
|
258
|
+
```python
|
|
259
|
+
from freesolo.environments import RewardResult, RewardMetric
|
|
260
|
+
|
|
261
|
+
def score_response(self, example, response_text) -> RewardResult:
|
|
262
|
+
score = graded_score(example, response_text) # shaped 0-1 — what GRPO optimizes
|
|
263
|
+
return RewardResult(
|
|
264
|
+
score=score,
|
|
265
|
+
threshold=1.0, # success = score >= threshold
|
|
266
|
+
metrics=(RewardMetric(name="success", score=float(score >= 1.0)),), # logged: judge on this
|
|
267
|
+
)
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
`score` is what GRPO optimizes (it becomes the run's `total`). Each `RewardMetric` you
|
|
271
|
+
attach is logged by name in the per-scorer breakdown — that is how the clean success
|
|
272
|
+
rate becomes visible. Use the shaped `score` to confirm the model is learning *at all*,
|
|
273
|
+
and judge the run on the explicit `success` metric.
|
|
274
|
+
|
|
275
|
+
### Reward rules that prevent silent failure
|
|
276
|
+
|
|
277
|
+
- **Return `0.0` explicitly — never let scoring raise.** An uncaught exception in
|
|
278
|
+
scoring fails the whole run. Guard every parse and lookup and return
|
|
279
|
+
`RewardResult(score=0.0, error=...)` for missing evidence, a parse failure, or an
|
|
280
|
+
unsafe/unsupported output.
|
|
281
|
+
- **Gate LLM judges behind the hard checks.** Run deterministic validity checks first
|
|
282
|
+
and return `score=0.0` on any parse/schema/safety failure, so the policy can't
|
|
283
|
+
reward-hack a lenient judge with malformed-but-plausible text.
|
|
284
|
+
- **Judge the realistic outcome, not the raw string.** Give a judge the runtime
|
|
285
|
+
output, tool result, or executed-query records. For database / search / retrieval
|
|
286
|
+
tasks, grade the *returned records*, not the query text — the query is only
|
|
287
|
+
secondary validity evidence.
|
|
288
|
+
- **A small format penalty beats a hard zero for shaping.** A useful trick:
|
|
289
|
+
`reward = format_coef * (correct_format - 1) + correct_answer` with `format_coef≈0.1`
|
|
290
|
+
— a tiny penalty for bad formatting, full credit for a correct, well-formatted answer.
|
|
291
|
+
- **Anti-patterns.** Don't reward length or verbosity. Don't ship a reward that is
|
|
292
|
+
always 0 or always 1 (no signal). Simpler rewards usually beat clever ones — a
|
|
293
|
+
mediocre *stable* reward beats a "perfect" reward you keep tweaking. Changing the
|
|
294
|
+
reward resets progress, so keep the best checkpoint before you do.
|
|
295
|
+
|
|
296
|
+
---
|
|
297
|
+
|
|
298
|
+
## SFT conventions
|
|
299
|
+
|
|
300
|
+
Pick SFT when you already have good answers and want the model to imitate them.
|
|
301
|
+
|
|
302
|
+
- **Data quality is the ceiling.** SFT can only be as good as the answers you show it.
|
|
303
|
+
A small set of high-quality examples beats a large mediocre one. Keep response format
|
|
304
|
+
consistent (if you want JSON, *every* example is JSON) and keep the prompt format the
|
|
305
|
+
same as inference time.
|
|
306
|
+
- **Watch the loss fall — and check overfitting yourself.** Flash SFT logs **training
|
|
307
|
+
loss only**; it runs no mid-training held-out eval (evaluation is deferred to the
|
|
308
|
+
deploy/serving side). A falling train loss alone can be memorization, so keep an eval
|
|
309
|
+
split the run never trains on, then **deploy the adapter and score it on that split**
|
|
310
|
+
(`flash deploy` + `flash chat`). If held-out quality stalls or drops while train loss
|
|
311
|
+
keeps falling, reduce `epochs` or add more data — not more passes.
|
|
312
|
+
- **Start `max_length` small and grow it on evidence.** Begin from the smallest
|
|
313
|
+
`max_length` that plausibly fits prompt + completion, and only raise it when you see
|
|
314
|
+
truncation (outputs cut off mid-thought, degraded loss). A bigger context just costs
|
|
315
|
+
more.
|
|
316
|
+
- **SFT is a great warm start for GRPO.** SFT first to teach the format and a competent
|
|
317
|
+
baseline, then GRPO to optimize past it. Across that lineage keep the **same base
|
|
318
|
+
model and the same `lora_rank` / `lora_alpha`** — `init_from_adapter` loads a LoRA
|
|
319
|
+
adapter specific to one base model and one adapter shape, so mixing sizes is an
|
|
320
|
+
invalid shape mismatch.
|
|
321
|
+
|
|
322
|
+
```toml
|
|
323
|
+
# configs/rl.toml — warm-start GRPO from the SFT run's adapter
|
|
324
|
+
algorithm = "grpo"
|
|
325
|
+
|
|
326
|
+
[train]
|
|
327
|
+
# paste the full adapter_ref `flash status <sft-run-id>` prints, verbatim
|
|
328
|
+
# (shape: <owner>/<repo>:sft/<run-id>/seed0 — the owner/repo prefix is required)
|
|
329
|
+
init_from_adapter = "your-org/your-repo:sft/<sft-run-id>/seed0"
|
|
330
|
+
lora_rank = 32 # must match the SFT run
|
|
331
|
+
lora_alpha = 64 # must match the SFT run
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
SFT is **epoch-driven** (`epochs`); GRPO is **step-driven** (`steps`).
|
|
335
|
+
|
|
336
|
+
---
|
|
337
|
+
|
|
338
|
+
## GRPO knobs that matter
|
|
339
|
+
|
|
340
|
+
Set these in `[train]`. Each is `None` by default — the worker's tuned recipe fills
|
|
341
|
+
in a sensible value, so only override with a reason.
|
|
342
|
+
|
|
343
|
+
| Knob | Convention |
|
|
344
|
+
| --- | --- |
|
|
345
|
+
| `group_size` | Completions sampled per prompt (default 8). More = more signal and more cost; drop to 4 to trim cost. The group needs *within-group variance* for an advantage to exist. |
|
|
346
|
+
| `max_tokens` | Completion budget per rollout. Size it to the expected output length — too small silently truncates good answers and poisons the reward; too large just costs more. |
|
|
347
|
+
| `temperature` | Rollout sampling temperature. Keep it near 1.0 for GRPO — too low collapses diversity (and the model can collapse within a few steps); raise it to widen exploration against uniform-reward groups. |
|
|
348
|
+
| `kl_penalty_coef` | Keeps the trained model from drifting too far from the base. Raise it to anchor against entropy collapse; lower it for more freedom to move. |
|
|
349
|
+
| `thinking_length_penalty_coef` | Per-reasoning-token reward deduction — curb overthinking, but watch it doesn't push the model into terse degeneracy. |
|
|
350
|
+
| `learning_rate` | Change it in small steps. Too high destabilizes RL and degrades output quality; if the model is collapsing, lower it. |
|
|
351
|
+
| `batch_size` | The effective prompts-per-step. Too small and the reward trend is pure noise; size it so the trend is readable. |
|
|
352
|
+
|
|
353
|
+
> **The reward-hacking signature:** a smoothed reward rising while mean generated
|
|
354
|
+
> length collapses. Whenever any shortness or format pressure is active, verify the
|
|
355
|
+
> gate by scoring a few truncated or opener-only probe responses — they should score low.
|
|
356
|
+
|
|
357
|
+
---
|
|
358
|
+
|
|
359
|
+
## Curriculum — start easy, scale up
|
|
360
|
+
|
|
361
|
+
Starting too hard produces zero learning signal; the model never succeeds, the reward
|
|
362
|
+
stays at 0, and there is nothing to climb. Start where the base model can *partially*
|
|
363
|
+
succeed, then raise difficulty as it improves. The "Goldilocks zone" — where most
|
|
364
|
+
rollouts score somewhere between all-fail and all-pass — is where GRPO has the most
|
|
365
|
+
signal.
|
|
366
|
+
|
|
367
|
+
- If nearly every prompt is solved (most groups score ~1.0): **increase difficulty** —
|
|
368
|
+
harder prompts, tighter format/reward, more steps.
|
|
369
|
+
- If nearly nothing is solved (most groups score ~0.0): **decrease difficulty** —
|
|
370
|
+
easier or few-shot prompts, a more lenient (denser) reward, or warm-start with SFT.
|
|
371
|
+
- In between: good signal — keep iterating at this difficulty.
|
|
372
|
+
|
|
373
|
+
---
|
|
374
|
+
|
|
375
|
+
## Diagnose before you re-run
|
|
376
|
+
|
|
377
|
+
When the reward stalls, a chunk of outputs fail, or the checkpoint underperforms,
|
|
378
|
+
don't treat failures as one bucket. Read a sample of the **actual failing
|
|
379
|
+
generations** (raw outputs, not just scores), classify the dominant mode, and apply a
|
|
380
|
+
targeted fix rather than leaning on the reward gate to slowly select against it. Then
|
|
381
|
+
**re-measure that mode** to confirm it dropped.
|
|
382
|
+
|
|
383
|
+
| Failure mode | What you see | Targeted fix |
|
|
384
|
+
| --- | --- | --- |
|
|
385
|
+
| Repetition / looping collapse | the same phrase repeats until truncation | repetition or length penalty; lower `temperature` |
|
|
386
|
+
| Overthinking / verbose reasoning | reasoning eats the whole token budget | `thinking_length_penalty_coef`; tighten the prompt |
|
|
387
|
+
| Max-token truncation | answers cut off mid-thought | raise `max_tokens` / `max_length` |
|
|
388
|
+
| Unparsed / over-escaped output | reward can't read the answer | robust parser; return `0.0` on parse fail; format gate |
|
|
389
|
+
| Wrapper / markdown around structured output | prose around the JSON/answer | a format gate; `stop_sequences` |
|
|
390
|
+
| Uniform-reward groups | every rollout in a group scores the same → no gradient | shape the reward for partial credit; raise `temperature` |
|
|
391
|
+
| Too-hard prompts | the base never succeeds, reward stays at 0 | curriculum / easier prompts; warm-start with SFT |
|
|
392
|
+
| Judge-rewarded degenerate output | short, templated answers a judge still rates well | a minimum-substance zero-gate ahead of the judge |
|
|
393
|
+
|
|
394
|
+
---
|
|
395
|
+
|
|
396
|
+
## When a run stalls
|
|
397
|
+
|
|
398
|
+
A plateau is not automatically a capability ceiling. Before you call it one:
|
|
399
|
+
|
|
400
|
+
1. **Probe with best-of-N.** Run a best-of-N / pass@k probe at a coverage temperature
|
|
401
|
+
(well above greedy) on a less-fitted checkpoint.
|
|
402
|
+
2. **Read the result.** High best-of-N but a collapsed greedy output and low sample
|
|
403
|
+
diversity is **entropy collapse**, not a ceiling — and it's fixable: anchor harder
|
|
404
|
+
with `kl_penalty_coef`, lower the `learning_rate`, or widen exploration. Only if the
|
|
405
|
+
probe shows no headroom is it a genuine ceiling.
|
|
406
|
+
3. **Change a different lever.** If there's real headroom, try a *different* lever from
|
|
407
|
+
the one that just failed — a different knob, reward shape, or data family — one
|
|
408
|
+
controlled change at a time.
|
|
409
|
+
|
|
410
|
+
Actively research established GRPO/SFT techniques (exploration / entropy control, KL
|
|
411
|
+
scheduling, reward shaping, curriculum / difficulty filtering, rejection-sampling SFT
|
|
412
|
+
on high-reward rollouts) rather than guessing — and count a technique as helpful only
|
|
413
|
+
on a beyond-noise improvement.
|
|
414
|
+
|
|
415
|
+
---
|
|
416
|
+
|
|
417
|
+
## Scale the evidence
|
|
418
|
+
|
|
419
|
+
- **A smoke test is not proof.** A single-digit `steps`, a tiny dataset, or a handful
|
|
420
|
+
of rollouts only validates the wiring. Scale `steps` / `epochs`, the dataset size,
|
|
421
|
+
and `group_size` to the model and the data you actually have before you trust a
|
|
422
|
+
result. Don't cite budget alone as the reason for an underpowered run.
|
|
423
|
+
- **Use the data you have.** Deliberately assign every usable row to training or to a
|
|
424
|
+
held-out eval split; if a planned holdout is so small that one example swings the
|
|
425
|
+
metric by several points, enlarge it during split design rather than gating on noise.
|
|
426
|
+
|
|
427
|
+
---
|
|
428
|
+
|
|
429
|
+
## Treat crashes as infra, not model size
|
|
430
|
+
|
|
431
|
+
> A CUDA / OOM / vLLM / kernel / provider error is an **infrastructure** problem, not a
|
|
432
|
+
> sign the model is too big. Lower `max_length`, `max_tokens`, or `group_size` to shrink
|
|
433
|
+
> the run's footprint and let the allocator retry onto the next fitting GPU class — do
|
|
434
|
+
> **not** switch to a smaller model to make a crash disappear. That silently destroys
|
|
435
|
+
> quality.
|
|
436
|
+
|
|
437
|
+
---
|
|
438
|
+
|
|
439
|
+
## Command reference
|
|
440
|
+
|
|
441
|
+
```bash
|
|
442
|
+
flash env setup # scaffold environment.py, datasets/, configs/, this file
|
|
443
|
+
flash env push --name my-env . # publish the environment; paste the returned id into [environment]
|
|
444
|
+
flash train configs/rl.toml --dry-run # validate the config locally (no GPU, no charge)
|
|
445
|
+
flash train configs/rl.toml --cost # pre-flight USD estimate, then exit
|
|
446
|
+
flash train configs/rl.toml # submit and follow logs (Ctrl-C detaches; --background to skip following)
|
|
447
|
+
flash status <run-id> # state + accrued cost
|
|
448
|
+
flash status <run-id> --logs # reward/loss trend + worker console/error logs
|
|
449
|
+
flash status <run-id> --follow # stream a live run to completion
|
|
450
|
+
flash runs # list your runs and their state/cost
|
|
451
|
+
flash deploy <run-id> # serve the trained adapter
|
|
452
|
+
```
|
|
453
|
+
|
|
454
|
+
See the full reference at https://freesolo.co/docs.
|
|
455
|
+
"""
|
flash/client/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""HTTP client for the managed Flash control plane (used by the CLI and MCP bridge)."""
|
|
2
|
+
|
|
3
|
+
from .config import load_credentials, save_credentials
|
|
4
|
+
from .http import ApiClient, ApiError, ClientError, client_from_config, verify_freesolo_key
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"ApiClient",
|
|
8
|
+
"ApiError",
|
|
9
|
+
"ClientError",
|
|
10
|
+
"client_from_config",
|
|
11
|
+
"load_credentials",
|
|
12
|
+
"save_credentials",
|
|
13
|
+
"verify_freesolo_key",
|
|
14
|
+
]
|
flash/client/config.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Client-side credential storage: the Flash API key + control-plane URL.
|
|
2
|
+
|
|
3
|
+
Stored in ``~/.flash/config.json`` (dir 0700, file 0600 — it holds a secret).
|
|
4
|
+
Environment variables take precedence so CI/agents can inject credentials without
|
|
5
|
+
touching the file: ``FREESOLO_API_KEY`` for the key, ``FLASH_API_URL`` for the URL.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from .._channel import CHANNEL
|
|
14
|
+
from .._fileio import read_json_or_empty, secure_json_write
|
|
15
|
+
|
|
16
|
+
# The default control plane follows the installed channel (see flash/_channel.py): the prod
|
|
17
|
+
# package (freesolo-flash / `flash`) targets the production plane, the dev-channel package
|
|
18
|
+
# (freesolo-flash-dev / `flash-dev`) targets the staging plane. Either is overridable via
|
|
19
|
+
# FLASH_API_URL or `flash login --api-url`.
|
|
20
|
+
# Base URLs of the two hosted control planes (no trailing slash).
PROD_API_URL = "https://flash.freesolo.co"
DEV_API_URL = "https://flash-dev.freesolo.co"


def default_api_url(channel: str = CHANNEL) -> str:
    """Return the control-plane base URL a release channel targets by default.

    Args:
        channel: Release channel name. Defaults to the installed package's ``CHANNEL``.

    Returns:
        ``DEV_API_URL`` when ``channel`` is exactly ``"dev"``; ``PROD_API_URL`` for
        every other value.
    """
    if channel == "dev":
        return DEV_API_URL
    return PROD_API_URL
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Default control plane for the installed channel, resolved once at import time.
DEFAULT_API_URL = default_api_url()

# Per-user credential store: ``~/.flash/config.json``.
CONFIG_DIR = Path.home().joinpath(".flash")
CONFIG_PATH = CONFIG_DIR.joinpath("config.json")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _read_config() -> dict:
    """Load the stored client config from ``CONFIG_PATH``.

    Delegates entirely to ``read_json_or_empty``.
    NOTE(review): presumably that helper yields an empty mapping when the file is
    missing or unreadable — confirm in ``flash/_fileio.py``.
    """
    stored = read_json_or_empty(CONFIG_PATH)
    return stored
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def load_credentials_with_source() -> tuple[str, str | None, str | None]:
    """Resolve ``(api_url, api_key, key_source)`` from the environment and config file.

    The URL is the first non-empty value among ``FLASH_API_URL``, the stored
    ``api_url`` and ``DEFAULT_API_URL``, with trailing slashes stripped.

    The key comes from ``FREESOLO_API_KEY`` when that variable is non-empty, otherwise
    from the stored ``api_key``. ``key_source`` names where the key was found: the
    literal ``"FREESOLO_API_KEY"`` or the config file path. Both the key and its source
    are ``None`` when no key is available.
    """
    stored = _read_config()

    # Every return path needs the same normalised URL, so strip the slash once here.
    base_url = (
        os.environ.get("FLASH_API_URL") or stored.get("api_url") or DEFAULT_API_URL
    ).rstrip("/")

    # The environment wins over the file so CI/agents can inject a key without
    # touching the config on disk.
    key_from_env = os.environ.get("FREESOLO_API_KEY")
    if key_from_env:
        return base_url, key_from_env, "FREESOLO_API_KEY"

    if not stored.get("api_key"):
        return base_url, None, None
    return base_url, stored["api_key"], str(CONFIG_PATH)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def load_credentials() -> tuple[str, str | None]:
    """Resolve ``(api_url, api_key)``, discarding where the key came from.

    The key is ``None`` when no API key is available from the environment or the
    config file.
    """
    resolved_url, resolved_key, _ = load_credentials_with_source()
    return (resolved_url, resolved_key)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def save_credentials(api_key: str, api_url: str | None = None) -> Path:
    """Store the API key, and optionally the control-plane URL, in the config file.

    Args:
        api_key: Key to persist under ``"api_key"``.
        api_url: Control plane that was authenticated against. When falsy, any
            previously stored ``"api_url"`` is left untouched.

    Returns:
        ``CONFIG_PATH``, the file that was written via ``secure_json_write``.
    """
    stored = _read_config()
    stored["api_key"] = api_key

    if api_url:
        trimmed = api_url.rstrip("/")
        # Record the plane actually authenticated against. Only a non-default URL is
        # pinned; for the default we drop any stored url instead — this also clears a
        # stale custom url left by an earlier custom FLASH_API_URL login, so later
        # commands don't keep hitting the old host.
        if trimmed != DEFAULT_API_URL.rstrip("/"):
            stored["api_url"] = trimmed
        else:
            stored.pop("api_url", None)

    secure_json_write(CONFIG_PATH, stored)
    return CONFIG_PATH
|