freesolo-flash-dev 0.2.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. flash/__init__.py +29 -0
  2. flash/_channel.py +23 -0
  3. flash/_fileio.py +35 -0
  4. flash/_logging.py +49 -0
  5. flash/_update_check.py +266 -0
  6. flash/catalog.py +253 -0
  7. flash/cli/__init__.py +1 -0
  8. flash/cli/main/__init__.py +227 -0
  9. flash/cli/main/__main__.py +6 -0
  10. flash/cli/main/commands.py +636 -0
  11. flash/cli/main/envpush.py +317 -0
  12. flash/cli/main/render.py +599 -0
  13. flash/cli/main/training_doc.py +455 -0
  14. flash/client/__init__.py +14 -0
  15. flash/client/config.py +70 -0
  16. flash/client/http.py +372 -0
  17. flash/client/runtime_secrets.py +69 -0
  18. flash/client/specs.py +20 -0
  19. flash/cost/__init__.py +16 -0
  20. flash/cost/analytical.py +175 -0
  21. flash/cost/facts.py +114 -0
  22. flash/cost/spec.py +113 -0
  23. flash/cost/types.py +158 -0
  24. flash/engine/__init__.py +6 -0
  25. flash/engine/accounting.py +36 -0
  26. flash/engine/chalk_kernels.py +116 -0
  27. flash/engine/multiturn_rollout.py +780 -0
  28. flash/engine/recipe.py +86 -0
  29. flash/engine/vram.py +603 -0
  30. flash/engine/worker/__init__.py +2916 -0
  31. flash/engine/worker/__main__.py +4 -0
  32. flash/engine/worker/kernel_warmup.py +400 -0
  33. flash/engine/worker/lora.py +796 -0
  34. flash/engine/worker/packing.py +366 -0
  35. flash/engine/worker/perf.py +1048 -0
  36. flash/envs/__init__.py +10 -0
  37. flash/envs/adapter/__init__.py +883 -0
  38. flash/envs/adapter/rubric.py +222 -0
  39. flash/envs/base.py +52 -0
  40. flash/envs/registry.py +62 -0
  41. flash/mcp/__init__.py +1 -0
  42. flash/mcp/server.py +85 -0
  43. flash/providers/__init__.py +59 -0
  44. flash/providers/_auth.py +24 -0
  45. flash/providers/_http.py +230 -0
  46. flash/providers/_instance.py +416 -0
  47. flash/providers/_instance_bootstrap.py +517 -0
  48. flash/providers/_poll.py +311 -0
  49. flash/providers/allocator.py +193 -0
  50. flash/providers/base.py +431 -0
  51. flash/providers/hyperstack/__init__.py +127 -0
  52. flash/providers/hyperstack/api.py +522 -0
  53. flash/providers/hyperstack/auth.py +17 -0
  54. flash/providers/hyperstack/gpus.py +29 -0
  55. flash/providers/hyperstack/jobs/__init__.py +632 -0
  56. flash/providers/hyperstack/jobs/builders.py +122 -0
  57. flash/providers/hyperstack/preflight.py +23 -0
  58. flash/providers/hyperstack/pricing.py +26 -0
  59. flash/providers/hyperstack/train.py +25 -0
  60. flash/providers/lambdalabs/__init__.py +139 -0
  61. flash/providers/lambdalabs/api.py +261 -0
  62. flash/providers/lambdalabs/auth.py +18 -0
  63. flash/providers/lambdalabs/gpus.py +29 -0
  64. flash/providers/lambdalabs/jobs/__init__.py +724 -0
  65. flash/providers/lambdalabs/jobs/builders.py +118 -0
  66. flash/providers/lambdalabs/preflight.py +27 -0
  67. flash/providers/lambdalabs/pricing.py +51 -0
  68. flash/providers/lambdalabs/train.py +27 -0
  69. flash/providers/preflight.py +55 -0
  70. flash/providers/realized.py +80 -0
  71. flash/providers/runpod/__init__.py +130 -0
  72. flash/providers/runpod/api.py +186 -0
  73. flash/providers/runpod/auth.py +37 -0
  74. flash/providers/runpod/cost.py +57 -0
  75. flash/providers/runpod/gpus.py +46 -0
  76. flash/providers/runpod/jobs.py +956 -0
  77. flash/providers/runpod/keys.py +139 -0
  78. flash/providers/runpod/preflight.py +30 -0
  79. flash/providers/runpod/preload.py +915 -0
  80. flash/providers/runpod/pricing.py +18 -0
  81. flash/providers/runpod/slots.py +79 -0
  82. flash/providers/runpod/train/__init__.py +150 -0
  83. flash/providers/runpod/train/deps.py +395 -0
  84. flash/providers/runpod/train/endpoints.py +820 -0
  85. flash/py.typed +0 -0
  86. flash/runner/__init__.py +686 -0
  87. flash/runner/checkpoints.py +82 -0
  88. flash/runner/deploy.py +422 -0
  89. flash/runner/lifecycle.py +672 -0
  90. flash/schema/__init__.py +375 -0
  91. flash/schema/fields.py +331 -0
  92. flash/serve/__init__.py +1 -0
  93. flash/serve/deploy.py +326 -0
  94. flash/serve/pricing.py +60 -0
  95. flash/server/__init__.py +1 -0
  96. flash/server/__main__.py +20 -0
  97. flash/server/app.py +961 -0
  98. flash/server/auth.py +263 -0
  99. flash/server/billing.py +124 -0
  100. flash/server/checkpoints.py +110 -0
  101. flash/server/db.py +160 -0
  102. flash/server/environment_registry.py +102 -0
  103. flash/server/envs.py +360 -0
  104. flash/server/reconcile.py +163 -0
  105. flash/server/run_registry.py +150 -0
  106. flash/spec.py +333 -0
  107. freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
  108. freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
  109. freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
  110. freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
  111. freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0
flash/cli/main/training_doc.py ADDED
@@ -0,0 +1,455 @@
1
+ """The TRAINING.md playbook scaffolded by `flash env setup`.
2
+
3
+ This is the single source of truth for the TRAINING.md that lands in a user's
4
+ project folder. It is written for the AI coding agent a user points at their
5
+ environment: a finished run is not a run that worked, and most of the value is in
6
+ how you design the signal, what you read, and how you decide a run is good.
7
+
8
+ Keep it flash-accurate — every command, config field, and import below exists in
9
+ this codebase (`flash <cmd>`, `[train]` fields in ``flash/spec.py``, and
10
+ ``freesolo.environments``). Update it here, not in a copy.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ TRAINING_MD = r"""# TRAINING.md — how to actually improve a model with Flash
16
+
17
+ > **If you are an AI agent asked to train a model here, read this first.**
18
+ > `flash env setup` dropped this file next to your `environment.py` and `configs/`.
19
+ > It is the playbook Freesolo's own training agents follow to turn a *finished*
20
+ > run into a model that *measurably improved*. The mechanics live in the hosted
21
+ > docs (https://freesolo.co/docs); this file is the judgment that sits on top of them.
22
+
23
+ A run that reaches `done` is **not** the same as a run that worked. Submitting a run
24
+ is not a result. The whole job is to design the learning signal, read what the run
25
+ actually produced, and decide — honestly — whether the model got better.
26
+
27
+ ---
28
+
29
+ ## Using Flash
30
+
31
+ Flash is a **managed** training service with a thin CLI/client. You author an
32
+ environment (the task + its reward), publish it, and submit SFT or GRPO runs from a
33
+ TOML config. Flash allocates the cheapest fitting GPU across providers, runs the job,
34
+ streams logs back, and serves the result. You never handle provider credentials or a
35
+ GPU — you authenticate once with a freesolo API key, and everything below is a `flash`
36
+ CLI command.
37
+
38
+ ### Install & authenticate
39
+
40
+ ```bash
41
+ pip install freesolo-flash # installs the `flash` CLI (import name is also `flash`)
42
+ flash login --api-key fslo_... # or: export FREESOLO_API_KEY=fslo_... (create a key at https://freesolo.co)
43
+ flash whoami # confirm the identity behind your key
44
+ flash models # supported base models (and which support `thinking`)
45
+ flash gpus # managed GPU classes with live $/hr
46
+ ```
47
+
48
+ ### The project layout (`flash env setup` created this)
49
+
50
+ ```text
51
+ environment.py # the task: how to prompt the model and how to score it
52
+ datasets/train.jsonl # training rows, one JSON object per line: {"input": ..., "output": ...}
53
+ configs/rl.toml # a GRPO (RL) run config
54
+ configs/sft.toml # an SFT run config
55
+ TRAINING.md # this file
56
+ ```
57
+
58
+ ### 1. Author the environment
59
+
60
+ `environment.py` defines the task. A single-turn env subclasses
61
+ `EnvironmentSingleTurn`, turns a row into a prompt, and scores the model's response
62
+ with a `RewardResult` (see *Reward design* below). `load_environment()` is the entry
63
+ point Flash calls:
64
+
65
+ ```python
66
+ from freesolo.datasets.types import TaskExample
67
+ from freesolo.environments import EnvironmentSingleTurn, RewardResult
68
+
69
+ class MyEnv(EnvironmentSingleTurn):
70
+ dataset = load_jsonl("datasets/train.jsonl") # rows -> TaskExample(input=..., output=...)
71
+
72
+ def build_prompt_messages(self, example: TaskExample, prompt_text: str):
73
+ return [{"role": "user", "content": example.input}]
74
+
75
+ def score_response(self, example: TaskExample, response_text: str) -> RewardResult:
76
+ expected = str(example.output or "").strip()
77
+ score = 1.0 if expected and expected in response_text else 0.0
78
+ return RewardResult(score=score, threshold=1.0)
79
+
80
+ def load_environment(**kwargs) -> MyEnv:
81
+ return MyEnv()
82
+ ```
83
+
84
+ For tool use, dialogue, or games, subclass `EnvironmentMultiTurn` instead and drive the
85
+ conversation across turns. The reward is the same `RewardResult` contract either way.
86
+
87
+ ### 2. Publish the environment
88
+
89
+ A managed run references a **published** environment by id — so push your folder first:
90
+
91
+ ```bash
92
+ flash env push --name my-env . # uploads this project; prints an env id like "your-org/my-env"
93
+ flash env list # installed envs + local sources you can push
94
+ flash env install your-org/their-env # record an env someone else published, to train against it
95
+ ```
96
+
97
+ Paste the returned id into `[environment] id` in **both** configs. Re-push after any
98
+ edit to `environment.py` or `datasets/` so the managed run uses your change.
99
+
100
+ ### 3. Configure the run (TOML)
101
+
102
+ ```toml
103
+ model = "Qwen/Qwen3.5-4B" # see `flash models`
104
+ algorithm = "grpo" # "grpo" (RL) or "sft"
105
+ # thinking = true # opt-in reasoning mode, for models that support it
106
+
107
+ [environment]
108
+ id = "your-org/my-env" # the id printed by `flash env push`
109
+ # secrets = ["SERPAPI_API_KEY"] # only the NAMES of env vars your environment reads;
110
+ # values are pulled from your shell/.env at submit time,
111
+ # never stored in the spec
112
+
113
+ [train]
114
+ steps = 150 # GRPO is step-driven; SFT is epoch-driven (epochs = N)
115
+ lora_rank = 32
116
+ lora_alpha = 64
117
+ seeds = [0]
118
+ ```
119
+
120
+ GPU and the HF artifact repo are **fully managed** — there is no GPU knob; the allocator
121
+ picks the cheapest class that fits, and each run gets its own artifact repo. Compose or
122
+ tweak configs without editing files: `--config extra.toml` (deep-merge) and
123
+ `--set key=value` (e.g. `--set train.steps=300`).
124
+
125
+ ### 4. Submit
126
+
127
+ ```bash
128
+ flash train configs/rl.toml --dry-run # validate the config locally — no GPU, no charge
129
+ flash train configs/rl.toml --cost # pre-flight USD estimate, then exit
130
+ flash train configs/rl.toml # submit and follow logs (Ctrl-C detaches)
131
+ flash train configs/rl.toml --background # submit and return immediately
132
+ ```
133
+
134
+ ### 5. Monitor
135
+
136
+ ```bash
137
+ flash status <run-id> # state + accrued cost
138
+ flash status <run-id> --logs # reward/loss trend + worker console/error logs + any traceback
139
+ flash status <run-id> --follow # stream a live run to completion
140
+ flash runs # all your runs and their state/cost
141
+ flash cancel <run-id> # stop a run
142
+ ```
143
+
144
+ ### 6. Deploy & chat
145
+
146
+ ```bash
147
+ flash checkpoints <run-id> # deployable per-step RL checkpoints
148
+ flash deploy <run-id> # serve the trained adapter (--step N for an intermediate checkpoint)
149
+ flash chat <run-id> -m "hello" # chat with the deployed adapter
150
+ flash deployments # active serving endpoints
151
+ flash undeploy <run-id> # tear the endpoint down
152
+ ```
153
+
154
+ > Flash also ships an **MCP bridge** (`flash` as an MCP server) so a coding agent can
155
+ > drive these same commands as tools.
156
+
157
+ The rest of this file is about doing the above *well* — designing a reward that teaches,
158
+ and deciding honestly whether a run improved.
159
+
160
+ ---
161
+
162
+ ## The loop
163
+
164
+ Work in tight, attributable iterations. Each one is a hypothesis:
165
+
166
+ ```
167
+ 1. Reconstruct state — what's the best run so far, and what have you already tried?
168
+ 2. Form a hypothesis — pick ONE lever and say WHY it will move the metric.
169
+ 3. Change that ONE lever.
170
+ 4. Validate locally — `flash train configs/rl.toml --dry-run` (catches config errors
171
+ for free; a paid run on a broken config or an all-zero reward is wasted budget).
172
+ 5. Submit — `flash train configs/rl.toml`.
173
+ 6. Judge — read the metric trend AND a sample of real rollouts (see below).
174
+ 7. Keep the best run; revert the change if it didn't beat the noise band. Repeat.
175
+ ```
176
+
177
+ **Lever priority (highest impact first):** reward design → data / curriculum →
178
+ training knobs. The reward is the teacher; spend your effort there before touching
179
+ hyperparameters.
180
+
181
+ **One controlled change at a time.** Bundling changes makes the effect
182
+ unattributable. Never re-run a setting that already failed at a negligibly
183
+ different value.
184
+
185
+ ---
186
+
187
+ ## Before you trust a run — the checklist
188
+
189
+ A run is only evidence of improvement when **all** of these hold:
190
+
191
+ - [ ] The run reached `done` (confirmed via `flash status <run-id>`), not merely submitted.
192
+ - [ ] The reward trend rose (GRPO `reward_mean`) or the SFT loss fell — **beyond the noise band**, not within it.
193
+ - [ ] You **probed the trained adapter on real inputs** (`flash deploy` + `flash chat`), including cases it should fail — not just the metrics.
194
+ - [ ] The score is real behavior, not empty/truncated/templated outputs, skipped rows, leakage, a swallowed exception, or a format-only win.
195
+ - [ ] If you track a clean success signal separately from the shaped reward (an explicit `RewardMetric`), *that* moved too.
196
+
197
+ If any box is unchecked, the run is not done improving — keep training, don't declare success.
198
+
199
+ ---
200
+
201
+ ## Judge the run, don't just finish it
202
+
203
+ - **Judge the trend, not a single number.** The proof of training is the curve:
204
+ `reward_mean` rising over steps (GRPO) or loss falling (SFT). Record the base/early
205
+ value and the final value. A flat or noisy trend with no improvement is not success.
206
+ - **Read the model's outputs, not just the metrics.** A rising reward can come from
207
+ reward-hacking or a degenerate output the reward still credits — metrics alone never
208
+ establish that the model got better. Flash does not expose training-time rollouts
209
+ through the CLI (`--logs` gives you the metric trend and the worker's console/error
210
+ logs, not the sampled generations), so to read real outputs **deploy the adapter and
211
+ probe it**: `flash deploy <run-id>` then `flash chat <run-id> -m "..."` on at least a
212
+ few real inputs, including ones it should get wrong.
213
+
214
+ ```bash
215
+ flash status <run-id> # state + accrued cost
216
+ flash status <run-id> --logs # metric trend + worker console/error logs (+ traceback)
217
+ flash status <run-id> --follow # stream a live run until completion
218
+ flash deploy <run-id> # serve the adapter, then `flash chat` it to read real outputs
219
+ ```
220
+
221
+ - **Decide with the noise band.** When comparing two runs or two checkpoints, record
222
+ the eval-split size `N` and the metric's approximate sampling noise — about
223
+ `1.96·√(p(1-p)/N)` (the half-width of a 95% confidence interval) for a rate metric `p`. Treat a difference *inside* that band as
224
+ **no change** — neither improvement nor regression. A within-noise gain is not a win.
225
+
226
+ ---
227
+
228
+ ## Reward design (GRPO) — your highest-impact lever
229
+
230
+ The reward defines what the model learns; its quality sets the ceiling on what GRPO
231
+ can reach. Rewards are rubric / `score_response` functions in your `environment.py`.
232
+
233
+ ### Make it graded and dense — avoid the all-zero cold start
234
+
235
+ If `reward_mean` is flat at ~0.000, every rollout in the group scored the same, the
236
+ advantage is zero, and the policy gets **no gradient**. That is a reward-design bug,
237
+ not a model to keep training. Reshape the reward to credit **ordered partial
238
+ progress** so even an untrained base model earns a small nonzero score and better
239
+ attempts score strictly higher:
240
+
241
+ ```text
242
+ well-formed / parses → schema- & safety-valid → executes / runs → correct / relevant
243
+ ```
244
+
245
+ Gate only the **top** tiers against gaming; keep the lower tiers dense. GRPO needs
246
+ *within-group variance* to learn — if every rollout in a group scores identically,
247
+ there is nothing to optimize.
248
+
249
+ ### Separate the shaped reward from a clean success signal
250
+
251
+ A good GRPO reward is usually **shaped** — partial credit so the model always has a
252
+ gradient to climb. But a shaped score is the wrong thing to judge *final quality* on:
253
+ it can rise from reward-hacking while the outcome you care about stays flat. Report the
254
+ shaped value as `score`, and surface the clean pass/fail as an **explicit
255
+ `RewardMetric`** so it shows up in the run's metric breakdown — a bare `threshold` is
256
+ used for grading but is *not* logged on its own, so it gives you nothing to judge:
257
+
258
+ ```python
259
+ from freesolo.environments import RewardResult, RewardMetric
260
+
261
+ def score_response(self, example, response_text) -> RewardResult:
262
+ score = graded_score(example, response_text) # shaped 0-1 — what GRPO optimizes
263
+ return RewardResult(
264
+ score=score,
265
+ threshold=1.0, # success = score >= threshold
266
+ metrics=(RewardMetric(name="success", score=float(score >= 1.0)),), # logged: judge on this
267
+ )
268
+ ```
269
+
270
+ `score` is what GRPO optimizes (it becomes the run's `total`). Each `RewardMetric` you
271
+ attach is logged by name in the per-scorer breakdown — that is how the clean success
272
+ rate becomes visible. Use the shaped `score` to confirm the model is learning *at all*,
273
+ and judge the run on the explicit `success` metric.
274
+
275
+ ### Reward rules that prevent silent failure
276
+
277
+ - **Return `0.0` explicitly — never let scoring raise.** An uncaught exception in
278
+ scoring fails the whole run. Guard every parse and lookup and return
279
+ `RewardResult(score=0.0, error=...)` for missing evidence, a parse failure, or an
280
+ unsafe/unsupported output.
281
+ - **Gate LLM judges behind the hard checks.** Run deterministic validity checks first
282
+ and return `score=0.0` on any parse/schema/safety failure, so the policy can't
283
+ reward-hack a lenient judge with malformed-but-plausible text.
284
+ - **Judge the realistic outcome, not the raw string.** Give a judge the runtime
285
+ output, tool result, or executed-query records. For database / search / retrieval
286
+ tasks, grade the *returned records*, not the query text — the query is only
287
+ secondary validity evidence.
288
+ - **A small format penalty beats a hard zero for shaping.** A useful trick:
289
+ `reward = format_coef * (correct_format - 1) + correct_answer` with `format_coef≈0.1`
290
+ — a tiny penalty for bad formatting, full credit for a correct, well-formatted answer.
291
+ - **Anti-patterns.** Don't reward length or verbosity. Don't ship a reward that is
292
+ always 0 or always 1 (no signal). Simpler rewards usually beat clever ones — a
293
+ mediocre *stable* reward beats a "perfect" reward you keep tweaking. Changing the
294
+ reward resets progress, so keep the best checkpoint before you do.
295
+
296
+ ---
297
+
298
+ ## SFT conventions
299
+
300
+ Pick SFT when you already have good answers and want the model to imitate them.
301
+
302
+ - **Data quality is the ceiling.** SFT can only be as good as the answers you show it.
303
+ A small set of high-quality examples beats a large mediocre one. Keep response format
304
+ consistent (if you want JSON, *every* example is JSON) and keep the prompt format the
305
+ same as inference time.
306
+ - **Watch the loss fall — and check overfitting yourself.** Flash SFT logs **training
307
+ loss only**; it runs no mid-training held-out eval (evaluation is deferred to the
308
+ deploy/serving side). A falling train loss alone can be memorization, so keep an eval
309
+ split the run never trains on, then **deploy the adapter and score it on that split**
310
+ (`flash deploy` + `flash chat`). If held-out quality stalls or drops while train loss
311
+ keeps falling, reduce `epochs` or add more data — not more passes.
312
+ - **Start `max_length` small and grow it on evidence.** Begin from the smallest
313
+ `max_length` that plausibly fits prompt + completion, and only raise it when you see
314
+ truncation (outputs cut off mid-thought, degraded loss). A bigger context just costs
315
+ more.
316
+ - **SFT is a great warm start for GRPO.** SFT first to teach the format and a competent
317
+ baseline, then GRPO to optimize past it. Across that lineage keep the **same base
318
+ model and the same `lora_rank` / `lora_alpha`** — `init_from_adapter` loads a LoRA
319
+ adapter specific to one base model and one adapter shape, so mixing sizes is an
320
+ invalid shape mismatch.
321
+
322
+ ```toml
323
+ # configs/rl.toml — warm-start GRPO from the SFT run's adapter
324
+ algorithm = "grpo"
325
+
326
+ [train]
327
+ # paste the full adapter_ref `flash status <sft-run-id>` prints, verbatim
328
+ # (shape: <owner>/<repo>:sft/<run-id>/seed0 — the owner/repo prefix is required)
329
+ init_from_adapter = "your-org/your-repo:sft/<sft-run-id>/seed0"
330
+ lora_rank = 32 # must match the SFT run
331
+ lora_alpha = 64 # must match the SFT run
332
+ ```
333
+
334
+ SFT is **epoch-driven** (`epochs`); GRPO is **step-driven** (`steps`).
335
+
336
+ ---
337
+
338
+ ## GRPO knobs that matter
339
+
340
+ Set these in `[train]`. Each is `None` by default — the worker's tuned recipe fills
341
+ in a sensible value, so only override with a reason.
342
+
343
+ | Knob | Convention |
344
+ | --- | --- |
345
+ | `group_size` | Completions sampled per prompt (default 8). More = more signal and more cost; drop to 4 to trim cost. The group needs *within-group variance* for an advantage to exist. |
346
+ | `max_tokens` | Completion budget per rollout. Size it to the expected output length — too small silently truncates good answers and poisons the reward; too large just costs more. |
347
+ | `temperature` | Rollout sampling temperature. Keep it near 1.0 for GRPO — too low collapses diversity (and the model can collapse within a few steps); raise it to widen exploration against uniform-reward groups. |
348
+ | `kl_penalty_coef` | Keeps the trained model from drifting too far from the base. Raise it to anchor against entropy collapse; lower it for more freedom to move. |
349
+ | `thinking_length_penalty_coef` | Per-reasoning-token reward deduction — curb overthinking, but watch it doesn't push the model into terse degeneracy. |
350
+ | `learning_rate` | Change it in small steps. Too high destabilizes RL and degrades output quality; if the model is collapsing, lower it. |
351
+ | `batch_size` | The effective prompts-per-step. Too small and the reward trend is pure noise; size it so the trend is readable. |
352
+
353
+ > **The reward-hacking signature:** a smoothed reward rising while mean generated
354
+ > length collapses. Whenever any shortness or format pressure is active, verify the
355
+ > gate by scoring a few truncated or opener-only probe responses — they should score low.
356
+
357
+ ---
358
+
359
+ ## Curriculum — start easy, scale up
360
+
361
+ Starting too hard produces zero learning signal; the model never succeeds, the reward
362
+ stays at 0, and there is nothing to climb. Start where the base model can *partially*
363
+ succeed, then raise difficulty as it improves. The "Goldilocks zone" — where most
364
+ rollouts score somewhere between all-fail and all-pass — is where GRPO has the most
365
+ signal.
366
+
367
+ - If nearly every prompt is solved (most groups score ~1.0): **increase difficulty** —
368
+ harder prompts, tighter format/reward, more steps.
369
+ - If nearly nothing is solved (most groups score ~0.0): **decrease difficulty** —
370
+ easier or few-shot prompts, a more lenient (denser) reward, or warm-start with SFT.
371
+ - In between: good signal — keep iterating at this difficulty.
372
+
373
+ ---
374
+
375
+ ## Diagnose before you re-run
376
+
377
+ When the reward stalls, a chunk of outputs fail, or the checkpoint underperforms,
378
+ don't treat failures as one bucket. Read a sample of the **actual failing
379
+ generations** (raw outputs, not just scores), classify the dominant mode, and apply a
380
+ targeted fix rather than leaning on the reward gate to slowly select against it. Then
381
+ **re-measure that mode** to confirm it dropped.
382
+
383
+ | Failure mode | What you see | Targeted fix |
384
+ | --- | --- | --- |
385
+ | Repetition / looping collapse | the same phrase repeats until truncation | repetition or length penalty; lower `temperature` |
386
+ | Overthinking / verbose reasoning | reasoning eats the whole token budget | `thinking_length_penalty_coef`; tighten the prompt |
387
+ | Max-token truncation | answers cut off mid-thought | raise `max_tokens` / `max_length` |
388
+ | Unparsed / over-escaped output | reward can't read the answer | robust parser; return `0.0` on parse fail; format gate |
389
+ | Wrapper / markdown around structured output | prose around the JSON/answer | a format gate; `stop_sequences` |
390
+ | Uniform-reward groups | every rollout in a group scores the same → no gradient | shape the reward for partial credit; raise `temperature` |
391
+ | Too-hard prompts | the base never succeeds, reward stays at 0 | curriculum / easier prompts; warm-start with SFT |
392
+ | Judge-rewarded degenerate output | short, templated answers a judge still rates well | a minimum-substance zero-gate ahead of the judge |
393
+
394
+ ---
395
+
396
+ ## When a run stalls
397
+
398
+ A plateau is not automatically a capability ceiling. Before you call it one:
399
+
400
+ 1. **Probe with best-of-N.** Run a best-of-N / pass@k probe at a coverage temperature
401
+ (well above greedy) on a less-fitted checkpoint.
402
+ 2. **Read the result.** High best-of-N but a collapsed greedy output and low sample
403
+ diversity is **entropy collapse**, not a ceiling — and it's fixable: anchor harder
404
+ with `kl_penalty_coef`, lower the `learning_rate`, or widen exploration. Only if the
405
+ probe shows no headroom is it a genuine ceiling.
406
+ 3. **Change a different lever.** If there's real headroom, try a *different* lever from
407
+ the one that just failed — a different knob, reward shape, or data family — one
408
+ controlled change at a time.
409
+
410
+ Actively research established GRPO/SFT techniques (exploration / entropy control, KL
411
+ scheduling, reward shaping, curriculum / difficulty filtering, rejection-sampling SFT
412
+ on high-reward rollouts) rather than guessing — and count a technique as helpful only
413
+ on a beyond-noise improvement.
414
+
415
+ ---
416
+
417
+ ## Scale the evidence
418
+
419
+ - **A smoke test is not proof.** A single-digit `steps`, a tiny dataset, or a handful
420
+ of rollouts only validates the wiring. Scale `steps` / `epochs`, the dataset size,
421
+ and `group_size` to the model and the data you actually have before you trust a
422
+ result. Don't cite budget alone as the reason for an underpowered run.
423
+ - **Use the data you have.** Deliberately assign every usable row to training or to a
424
+ held-out eval split; if a planned holdout is so small that one example swings the
425
+ metric by several points, enlarge it during split design rather than gating on noise.
426
+
427
+ ---
428
+
429
+ ## Treat crashes as infra, not model size
430
+
431
+ > A CUDA / OOM / vLLM / kernel / provider error is an **infrastructure** problem, not a
432
+ > sign the model is too big. Lower `max_length`, `max_tokens`, or `group_size` to shrink
433
+ > the run's footprint and let the allocator retry onto the next fitting GPU class — do
434
+ > **not** switch to a smaller model to make a crash disappear. That silently destroys
435
+ > quality.
436
+
437
+ ---
438
+
439
+ ## Command reference
440
+
441
+ ```bash
442
+ flash env setup # scaffold environment.py, datasets/, configs/, this file
443
+ flash env push --name my-env . # publish the environment; paste the returned id into [environment]
444
+ flash train configs/rl.toml --dry-run # validate the config locally (no GPU, no charge)
445
+ flash train configs/rl.toml --cost # pre-flight USD estimate, then exit
446
+ flash train configs/rl.toml # submit and follow logs (Ctrl-C detaches; --background to skip following)
447
+ flash status <run-id> # state + accrued cost
448
+ flash status <run-id> --logs # reward/loss trend + worker console/error logs
449
+ flash status <run-id> --follow # stream a live run to completion
450
+ flash runs # list your runs and their state/cost
451
+ flash deploy <run-id> # serve the trained adapter
452
+ ```
453
+
454
+ See the full reference at https://freesolo.co/docs.
455
+ """
flash/client/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ """HTTP client for the managed Flash control plane (used by the CLI and MCP bridge)."""
2
+
3
+ from .config import load_credentials, save_credentials
4
+ from .http import ApiClient, ApiError, ClientError, client_from_config, verify_freesolo_key
5
+
6
+ __all__ = [
7
+ "ApiClient",
8
+ "ApiError",
9
+ "ClientError",
10
+ "client_from_config",
11
+ "load_credentials",
12
+ "save_credentials",
13
+ "verify_freesolo_key",
14
+ ]
flash/client/config.py ADDED
@@ -0,0 +1,70 @@
1
+ """Client-side credential storage: the Flash API key + control-plane URL.
2
+
3
+ Stored in ``~/.flash/config.json`` (dir 0700, file 0600 — it holds a secret).
4
+ Environment variables take precedence so CI/agents can inject credentials without
5
+ touching the file: ``FREESOLO_API_KEY`` for the key, ``FLASH_API_URL`` for the URL.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ from pathlib import Path
12
+
13
+ from .._channel import CHANNEL
14
+ from .._fileio import read_json_or_empty, secure_json_write
15
+
16
+ # The default control plane follows the installed channel (see flash/_channel.py): the prod
17
+ # package (freesolo-flash / `flash`) targets the production plane, the dev-channel package
18
+ # (freesolo-flash-dev / `flash-dev`) targets the staging plane. Either is overridable via
19
+ # FLASH_API_URL or `flash login --api-url`.
20
+ PROD_API_URL = "https://flash.freesolo.co"
21
+ DEV_API_URL = "https://flash-dev.freesolo.co"
22
+
23
+
24
def default_api_url(channel: str = CHANNEL) -> str:
    """Return the control-plane URL a release channel targets by default.

    Args:
        channel: Release channel name; defaults to the installed ``CHANNEL``.

    Returns:
        ``DEV_API_URL`` when ``channel`` is exactly ``"dev"``, otherwise
        ``PROD_API_URL``.
    """
    if channel == "dev":
        return DEV_API_URL
    # Any value other than "dev" falls back to the production plane.
    return PROD_API_URL
27
+
28
+
29
+ DEFAULT_API_URL = default_api_url()
30
+
31
+ CONFIG_DIR = Path.home() / ".flash"
32
+ CONFIG_PATH = CONFIG_DIR / "config.json"
33
+
34
+
35
def _read_config() -> dict:
    """Load the stored client config by passing ``CONFIG_PATH`` to ``read_json_or_empty``."""
    stored = read_json_or_empty(CONFIG_PATH)
    return stored
37
+
38
+
39
def load_credentials_with_source() -> tuple[str, str | None, str | None]:
    """Resolve ``(api_url, api_key, key_source)`` from the environment and stored config.

    The URL is the first non-empty of the ``FLASH_API_URL`` environment variable,
    the stored ``api_url`` and ``DEFAULT_API_URL``, with trailing slashes removed.
    The key is taken from ``FREESOLO_API_KEY`` when that is set and non-empty,
    otherwise from the stored ``api_key``. ``key_source`` is the environment
    variable name or the config file path, matching where the key was found.
    Both the key and its source are ``None`` when no key is available.
    """
    stored = _read_config()
    raw_url = os.environ.get("FLASH_API_URL") or stored.get("api_url") or DEFAULT_API_URL
    base_url = raw_url.rstrip("/")

    # The environment wins over the config file so CI/agents can inject a key.
    key_from_env = os.environ.get("FREESOLO_API_KEY")
    if key_from_env:
        return base_url, key_from_env, "FREESOLO_API_KEY"

    key_from_file = stored.get("api_key")
    if key_from_file:
        return base_url, key_from_file, str(CONFIG_PATH)

    return base_url, None, None
49
+
50
+
51
def load_credentials() -> tuple[str, str | None]:
    """Resolve ``(api_url, api_key)``, dropping the key's source.

    Delegates to ``load_credentials_with_source``; the key is ``None`` when no
    key was found there.
    """
    resolved = load_credentials_with_source()
    return resolved[0], resolved[1]
55
+
56
+
57
def save_credentials(api_key: str, api_url: str | None = None) -> Path:
    """Store the API key, and optionally the control-plane URL, in the config file.

    Args:
        api_key: Key written under ``"api_key"``.
        api_url: URL that was authenticated against. When falsy, any stored
            ``"api_url"`` is left as it is.

    Returns:
        ``CONFIG_PATH``, the file the config was written to via
        ``secure_json_write``.
    """
    stored = _read_config()
    stored["api_key"] = api_key
    if api_url:
        normalized = api_url.rstrip("/")
        if normalized != DEFAULT_API_URL.rstrip("/"):
            # A non-default plane is pinned so later commands keep using it.
            stored["api_url"] = normalized
        else:
            # Default plane: drop any stored url rather than pinning it, which also
            # clears a stale custom url left behind by an earlier custom login.
            stored.pop("api_url", None)
    secure_json_write(CONFIG_PATH, stored)
    return CONFIG_PATH