PyPI - freesolo-flash-dev - Versions diffs - 0.2.25__py3-none-any.whl - Mend

freesolo-flash-dev 0.2.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (111) hide show

flash/__init__.py +29 -0
flash/_channel.py +23 -0
flash/_fileio.py +35 -0
flash/_logging.py +49 -0
flash/_update_check.py +266 -0
flash/catalog.py +253 -0
flash/cli/__init__.py +1 -0
flash/cli/main/__init__.py +227 -0
flash/cli/main/__main__.py +6 -0
flash/cli/main/commands.py +636 -0
flash/cli/main/envpush.py +317 -0
flash/cli/main/render.py +599 -0
flash/cli/main/training_doc.py +455 -0
flash/client/__init__.py +14 -0
flash/client/config.py +70 -0
flash/client/http.py +372 -0
flash/client/runtime_secrets.py +69 -0
flash/client/specs.py +20 -0
flash/cost/__init__.py +16 -0
flash/cost/analytical.py +175 -0
flash/cost/facts.py +114 -0
flash/cost/spec.py +113 -0
flash/cost/types.py +158 -0
flash/engine/__init__.py +6 -0
flash/engine/accounting.py +36 -0
flash/engine/chalk_kernels.py +116 -0
flash/engine/multiturn_rollout.py +780 -0
flash/engine/recipe.py +86 -0
flash/engine/vram.py +603 -0
flash/engine/worker/__init__.py +2916 -0
flash/engine/worker/__main__.py +4 -0
flash/engine/worker/kernel_warmup.py +400 -0
flash/engine/worker/lora.py +796 -0
flash/engine/worker/packing.py +366 -0
flash/engine/worker/perf.py +1048 -0
flash/envs/__init__.py +10 -0
flash/envs/adapter/__init__.py +883 -0
flash/envs/adapter/rubric.py +222 -0
flash/envs/base.py +52 -0
flash/envs/registry.py +62 -0
flash/mcp/__init__.py +1 -0
flash/mcp/server.py +85 -0
flash/providers/__init__.py +59 -0
flash/providers/_auth.py +24 -0
flash/providers/_http.py +230 -0
flash/providers/_instance.py +416 -0
flash/providers/_instance_bootstrap.py +517 -0
flash/providers/_poll.py +311 -0
flash/providers/allocator.py +193 -0
flash/providers/base.py +431 -0
flash/providers/hyperstack/__init__.py +127 -0
flash/providers/hyperstack/api.py +522 -0
flash/providers/hyperstack/auth.py +17 -0
flash/providers/hyperstack/gpus.py +29 -0
flash/providers/hyperstack/jobs/__init__.py +632 -0
flash/providers/hyperstack/jobs/builders.py +122 -0
flash/providers/hyperstack/preflight.py +23 -0
flash/providers/hyperstack/pricing.py +26 -0
flash/providers/hyperstack/train.py +25 -0
flash/providers/lambdalabs/__init__.py +139 -0
flash/providers/lambdalabs/api.py +261 -0
flash/providers/lambdalabs/auth.py +18 -0
flash/providers/lambdalabs/gpus.py +29 -0
flash/providers/lambdalabs/jobs/__init__.py +724 -0
flash/providers/lambdalabs/jobs/builders.py +118 -0
flash/providers/lambdalabs/preflight.py +27 -0
flash/providers/lambdalabs/pricing.py +51 -0
flash/providers/lambdalabs/train.py +27 -0
flash/providers/preflight.py +55 -0
flash/providers/realized.py +80 -0
flash/providers/runpod/__init__.py +130 -0
flash/providers/runpod/api.py +186 -0
flash/providers/runpod/auth.py +37 -0
flash/providers/runpod/cost.py +57 -0
flash/providers/runpod/gpus.py +46 -0
flash/providers/runpod/jobs.py +956 -0
flash/providers/runpod/keys.py +139 -0
flash/providers/runpod/preflight.py +30 -0
flash/providers/runpod/preload.py +915 -0
flash/providers/runpod/pricing.py +18 -0
flash/providers/runpod/slots.py +79 -0
flash/providers/runpod/train/__init__.py +150 -0
flash/providers/runpod/train/deps.py +395 -0
flash/providers/runpod/train/endpoints.py +820 -0
flash/py.typed +0 -0
flash/runner/__init__.py +686 -0
flash/runner/checkpoints.py +82 -0
flash/runner/deploy.py +422 -0
flash/runner/lifecycle.py +672 -0
flash/schema/__init__.py +375 -0
flash/schema/fields.py +331 -0
flash/serve/__init__.py +1 -0
flash/serve/deploy.py +326 -0
flash/serve/pricing.py +60 -0
flash/server/__init__.py +1 -0
flash/server/__main__.py +20 -0
flash/server/app.py +961 -0
flash/server/auth.py +263 -0
flash/server/billing.py +124 -0
flash/server/checkpoints.py +110 -0
flash/server/db.py +160 -0
flash/server/environment_registry.py +102 -0
flash/server/envs.py +360 -0
flash/server/reconcile.py +163 -0
flash/server/run_registry.py +150 -0
flash/spec.py +333 -0
freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0

flash/engine/worker/__init__.py ADDED Viewed

@@ -0,0 +1,2916 @@
+"""On-GPU fine-tuning worker (RunPod). Modes: sft | rl.
+This module runs on the provisioned RunPod GPU. It uses the shared recipe
+(``flash.engine.recipe``) so SFT targets and RL rewards are rendered and scored
+consistently.
+Artifacts (adapter, metrics.json, heartbeat.json, checkpoints) are streamed to a
+Hugging Face dataset repo. HF checkpoints give preemption resilience: if a worker is
+recycled mid-run we resume from the latest uploaded checkpoint. Metrics are also
+returned directly to the caller by the launching provider.
+Core environment variables (set by the launching provider / runner):
+  RUN_MODE      sft|rl
+  SEED          int
+  HF_REPO       Hugging Face dataset repo for artifacts, populated per-run from the
+                JobSpec's [train] hf_repo by whichever provider launches the worker
+  HF_TOKEN
+  RUN_ID        unique id for this run (namespacing in the repo)
+The FLASH_*/RL_*/SFT_* env vars are A/B overrides documented at their use sites; the
+JobSpec [train] table is the source of truth for per-run knobs.
+"""
+from __future__ import annotations
+import contextlib
+import faulthandler
+import json
+import math
+import os
+import random
+import re
+import sys
+import tempfile
+import threading
+import time
+import traceback
+from flash.engine.accounting import RunMetrics
+# Shared, substrate-neutral fine-tuning internals (live in this same package).
+from flash.engine.chalk_kernels import active_kernels, install_chalk_kernels
+from flash.engine.recipe import RECIPE
+# Re-export the pure helpers split into the leaf submodules ``.perf`` and ``.lora``.
+# CRITICAL: the readers below (run_sft / run_rl / make_lora / _init_adapter_model / ...) call
+# these by their bare name, which resolves through THIS module's namespace — so a test's
+# ``monkeypatch.setattr(worker, "<name>", ...)`` still reaches the readers. Names actually used
+# by the retained readers are imported plainly; names re-exported only for API / test access
+# (no retained reader uses them) are marked unused for the linter.
+from flash.engine.worker.lora import (
+    _LM_SYNC_REMAP_ON,
+    _VL_EXCLUDE_SEGMENTS,  # noqa: F401
+    _patch_peft_weight_converter_compat,  # noqa: F401
+    _remap_vl_sync_weights,  # noqa: F401
+    assert_adapter_delta_nonzero,
+    assert_adapter_load_clean,
+    assert_lora_applied,
+    disable_liger_grpo_torch_compile,
+    is_vl_checkpoint,
+    lora_exclude_modules,
+    model_quant,  # noqa: F401
+    patch_grpo_mask_aware_lm_head,
+    patch_vllm_language_model_only,
+    patch_vllm_lm_weight_sync,
+    remap_adapter_keys,  # noqa: F401
+    remap_vl_adapter_dir,
+    strip_language_model_infix,  # noqa: F401
+    vllm_language_model_only_kwargs,  # noqa: F401
+)
+from flash.engine.worker.packing import (
+    BlockDiagonalCollator,
+    gdn_packing_available,
+    model_is_gdn_hybrid,
+    model_is_pure_attention,
+    pack_token_ids,
+    packing_efficiency,
+    tokenize_for_packing,
+)
+from flash.engine.worker.perf import (
+    RetriableInfraError,
+    _attn_impl_for_capability,  # noqa: F401
+    _ensure_fla_fastpath_on_hopper,
+    _estimate_params,  # noqa: F401
+    _flash_attn_3_available,  # noqa: F401
+    _flash_attn_available,
+    _GpuPeakSampler,
+    _liger_default_for_model,  # noqa: F401
+    _memory_mode,
+    _metric_curve,
+    _neutralize_tilelang_cudart_stub,
+    _peak_gpu_gb,
+    _remove_fla_from_disk,  # noqa: F401
+    _reset_peak_gpu,
+    _sdpa_cudnn_ctx,
+    free_gpu,
+    fused_optim_name,
+    gpu_diagnostics,
+    grad_checkpointing_on,
+    grpo_sleep_mode,
+    liger_on,
+    loraplus_optimizer_cls,
+    optimal_attn_impl,
+    setup_perf_backends,
+    wait_for_gpu,
+)
+from flash.envs.adapter import GitHubRateLimitError
+from flash.envs.registry import load_environment
+from flash.spec import load_job_spec_from_env
+HF_REPO = os.environ.get("HF_REPO", "")
+RUN_ID = os.environ.get("RUN_ID", "local")
+SEED = int(os.environ.get("SEED", "0"))
+RUN_MODE = os.environ.get("RUN_MODE", "sft")
+ATTEMPT = os.environ.get("ATTEMPT", "")
+JOB_SPEC = load_job_spec_from_env()
+# PHASE is the stable artifact namespace (sft|rl) and matches RUN_MODE for a train run.
+PHASE = os.environ.get(
+    "PHASE",
+    JOB_SPEC.phase if JOB_SPEC else (RUN_MODE if RUN_MODE in ("sft", "rl") else "sft"),
+)
+def _load_active_env():
+    """Load the run's Freesolo environment from the JobSpec; require an explicit env.
+    There is no default/builtin environment: a run MUST name a published Freesolo
+    environment id. Failing here prevents a paid worker from training/evaluating the
+    wrong task.
+    """
+    if JOB_SPEC is None:
+        # No JobSpec at all (e.g. the module imported for a non-run path / a unit test). There
+        # is nothing to select; defer the hard requirement to the JobSpec-present branch so the
+        # module stays importable. A real run always carries a JobSpec.
+        return None
+    env_id = JOB_SPEC.environment.id
+    if not env_id:
+        # Every supported algorithm (sft/grpo) trains/evaluates against a Freesolo env, so a
+        # missing env is always a misconfigured spec. Fail loudly rather than fall back to a
+        # default and burn a paid worker on the wrong task.
+        raise RuntimeError(
+            "JobSpec sets no environment: provide [environment] id "
+            "(a Freesolo environment id like 'your-name/your-env', returned by "
+            "`flash env push --name <name>`)."
+        )
+    # Pass the control-plane-pinned commit sha (resolve-once hook) when present so the adapter
+    # skips the GitHub ref->sha resolve; "" (the default) keeps the worker resolving it itself.
+    return load_environment(
+        env_id, JOB_SPEC.environment.params, resolved_sha=JOB_SPEC.environment.resolved_sha
+    )
+ACTIVE_ENV = None
+def require_active_env():
+    """Return the run's loaded environment, or raise a CLEAR error when there is none.
+    ``ACTIVE_ENV`` is None on the no-JobSpec path (the module is imported with no
+    FLASH_JOB_SPEC_JSON/PATH, e.g. a misconfigured worker launch). Every train/eval consumer
+    needs a real env; without this guard the first ``ACTIVE_ENV.<attr>`` access dies with an
+    opaque ``AttributeError: 'NoneType' object has no attribute ...``. Fail loudly with an
+    actionable message instead — mirrors the explicit RuntimeError raised when a JobSpec is
+    present but names no environment.
+    """
+    global ACTIVE_ENV
+    if ACTIVE_ENV is None:
+        ACTIVE_ENV = _load_active_env()
+    if ACTIVE_ENV is None:
+        raise RuntimeError(
+            "no environment is loaded: this worker was started without a JobSpec "
+            "(FLASH_JOB_SPEC_JSON / FLASH_JOB_SPEC_PATH is unset). A train/eval run must "
+            "carry a JobSpec naming [environment] id "
+            "(a Freesolo environment id like 'your-name/your-env', returned by "
+            "`flash env push --name <name>`)."
+        )
+    return ACTIVE_ENV
+# Thinking/reasoning mode: one flag per run from the run config (TOML `thinking`), consumed
+# identically by SFT rendering, RL rollouts, and serving. Defaults off without a JobSpec.
+# Within this module it is forwarded as ``enable_thinking`` by ``render_prompt`` and decides
+# whether ``graded_text`` strips <think> blocks before grading.
+THINKING = JOB_SPEC.thinking if JOB_SPEC else False
+# ---------------------------------------------------------------------------
+# HF helpers (code-delivery + artifact channel; works without inbound network)
+# ---------------------------------------------------------------------------
+def error_artifact_name(mode: str) -> str:
+    """Per-mode error filename (e.g. error_sft.txt) so a run's traceback is uploaded
+    under a stable name even though heartbeat.json is single-file/overwritten."""
+    return f"error_{mode}.txt"
+def hf_api():
+    from huggingface_hub import HfApi
+    return HfApi(token=os.environ.get("HF_TOKEN"))
+def hf_prefix() -> str:
+    return f"{PHASE}/{RUN_ID}/seed{SEED}"
+def _hf_upload(do_upload, repo_subpath: str, required: bool, label: str) -> None:
+    """Shared HF upload loop for files/folders: HF_REPO guard + retry/raise-or-warn.
+    ``required=True`` (completion artifacts DONE/metrics.json, the trained adapter) retries
+    and finally raises: a swallowed upload failure would make the control plane mark a
+    finished run failed/retried, or mark the run done while deployment can never download
+    the missing adapter. Optional artifacts (generations, logs) only warn.
+    """
+    if not HF_REPO:
+        return
+    attempts = 3 if required else 1
+    for attempt in range(attempts):
+        try:
+            do_upload()
+            return
+        except Exception as e:
+            if required and attempt + 1 < attempts:
+                print(f"{label} retry {attempt + 1}/{attempts}: {e}")
+                time.sleep(5 * (attempt + 1))
+                continue
+            if required:
+                # Already retried 3x -> the host/network is bad, not the run. Infra-shaped.
+                raise RetriableInfraError(f"required upload of {repo_subpath!r} failed: {e}") from e
+            print(f"{label} warn:", e)
+            return
+def hf_upload_file(local_path: str, repo_subpath: str, required: bool = False):
+    """Upload one file to the run's HF prefix."""
+    _hf_upload(
+        lambda: hf_api().upload_file(
+            path_or_fileobj=local_path,
+            path_in_repo=f"{hf_prefix()}/{repo_subpath}",
+            repo_id=HF_REPO,
+            repo_type="dataset",
+        ),
+        repo_subpath,
+        required,
+        "hf_upload_file",
+    )
+_DEBUG_UPLOAD_LOCK = threading.Lock()
+def upload_debug_jsonl(name: str, rows: list[dict], *, keep_last: int = 200) -> None:
+    """Append bounded JSONL debug rows and upload them as an optional artifact.
+    This is intentionally best-effort: debug visibility must not fail a paid run.
+    """
+    if not rows or not HF_REPO:
+        return
+    repo_name = os.path.basename(name if name.endswith(".jsonl") else f"{name}.jsonl")
+    path = os.path.join("/tmp", repo_name)
+    try:
+        with _DEBUG_UPLOAD_LOCK:
+            existing: list[str] = []
+            with contextlib.suppress(FileNotFoundError), open(path) as f:
+                existing = f.readlines()[-keep_last:]
+            with open(path, "w") as f:
+                f.writelines(existing)
+                for row in rows:
+                    f.write(json.dumps(row, default=str, ensure_ascii=True, sort_keys=True) + "\n")
+            hf_upload_file(path, repo_name)
+    except Exception as e:
+        print(f"debug upload warn ({repo_name}): {e}")
+def hf_upload_folder(local_dir: str, repo_subpath: str, required: bool = False):
+    """Upload a folder to the run's HF prefix."""
+    _hf_upload(
+        lambda: hf_api().upload_folder(
+            folder_path=local_dir,
+            path_in_repo=f"{hf_prefix()}/{repo_subpath}",
+            repo_id=HF_REPO,
+            repo_type="dataset",
+        ),
+        repo_subpath,
+        required,
+        "hf_upload_folder",
+    )
+def hf_resume_checkpoint() -> str | None:
+    """Latest streamed trainer checkpoint for this run (or None).
+    Checkpoints are uploaded DURING the run by ``make_checkpoint_upload_callback`` as
+    ``<prefix>/checkpoint/checkpoint-<step>/``; a replacement worker downloads the
+    newest one so a mid-run preemption costs at most one save interval.
+    """
+    if not HF_REPO:
+        return None
+    try:
+        from huggingface_hub import snapshot_download
+        snapshot_download(
+            repo_id=HF_REPO,
+            repo_type="dataset",
+            allow_patterns=[f"{hf_prefix()}/checkpoint/**"],
+            local_dir="/tmp/resume",
+            token=os.environ.get("HF_TOKEN"),
+        )
+        base = os.path.join("/tmp/resume", hf_prefix(), "checkpoint")
+        if not os.path.isdir(base):
+            return None
+        cands = [d for d in os.listdir(base) if d.startswith("checkpoint-")]
+        if not cands:
+            return None
+        latest = max(cands, key=lambda d: int(d.split("-")[-1]))
+        path = os.path.join(base, latest)
+        print(f"[resume] found streamed checkpoint: {path}")
+        return path
+    except Exception as e:
+        print("hf_resume_checkpoint warn:", e)
+        return None
+def prefetch_model(model_id: str) -> float:
+    """Pull the model weights into the local HF cache up front; return seconds spent.
+    The trainer/vLLM would download lazily anyway — doing it explicitly (a) makes the
+    download a first-class, timed stage in the heartbeat stream (the cold-start metric
+    the speed work optimizes), and (b) fails fast with a clear disk/network error
+    instead of dying inside trainer construction. Idempotent: a warm cache costs ~0 s.
+    """
+    from huggingface_hub import snapshot_download
+    t0 = time.time()
+    try:
+        snapshot_download(
+            repo_id=model_id,
+            # weights + tokenizer/config only (same exclusions as the image bake)
+            ignore_patterns=["*.pth", "*.gguf", "original/*", "*.onnx", "*.msgpack", "*.h5"],
+        )
+    except Exception as e:
+        # Surface but don't fail here: gated/local-only models still load fine through
+        # the normal from_pretrained path the trainer uses next.
+        print("prefetch_model warn:", e)
+    secs = round(time.time() - t0, 1)
+    heartbeat(
+        "model_prefetched",
+        model=model_id,
+        download_seconds=secs,
+        hf_transfer=os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", ""),
+        gpu=gpu_diagnostics(),
+    )
+    return secs
+# Trainer-state files a serving engine never needs: optimizer/scheduler/rng/loss-curve
+# state. Excluded when publishing the deployable per-step adapter so each step's snapshot is
+# just the LoRA weights + config (a few MB), small enough to KEEP every step (no pruning).
+# These are glob patterns: ``publish_deployable_checkpoint`` passes them as
+# ``ignore_patterns`` to ``upload_folder``.
+_CHECKPOINT_TRAINER_STATE = (
+    "optimizer.pt",
+    "optimizer.bin",
+    "scheduler.pt",
+    "scaler.pt",
+    "rng_state*.pth",
+    "trainer_state.json",
+    "training_args.bin",
+    "*.distcp",
+    "global_step*/**",
+    "latest",
+    "zero_to_fp32.py",
+)
+# The PEFT adapter weights file a checkpoint must carry to be loadable/servable (safetensors is
+# the default; .bin is the legacy fallback). A step with adapter_config.json but no weights is
+# NOT deployable, so it's never published/listed. Either filename satisfies the check.
+_ADAPTER_WEIGHT_FILES = ("adapter_model.safetensors", "adapter_model.bin")
+def publish_deployable_checkpoint(ckpt_dir: str, step: int) -> str | None:
+    """Mirror a trainer checkpoint's LoRA adapter to a stable, NON-pruned per-step path so a
+    run cancelled mid-RL is still one-command-deployable from its last good step.
+    The trainer's checkpoint folder already contains the PEFT adapter (``adapter_config.json``
+    + ``adapter_model.safetensors``) that ``deploy_adapter`` serves; we re-upload just those
+    (dropping optimizer/scheduler/rng state) to ``<prefix>/checkpoints/step-<step>/adapter``.
+    Unlike the resume checkpoint (``checkpoint/**``, kept latest-only), these accumulate, so
+    EVERY step stays deployable. Returns the deployable adapter subfolder, or ``None`` when
+    there's no adapter to publish. Best-effort: a failure here never fails a paid run.
+    """
+    if not HF_REPO:
+        return None
+    # Only publish a checkpoint that actually carries a loadable adapter (config AND weights) —
+    # never advertise a non-deployable step.
+    has_config = os.path.isfile(os.path.join(ckpt_dir, "adapter_config.json"))
+    has_weights = any(os.path.isfile(os.path.join(ckpt_dir, w)) for w in _ADAPTER_WEIGHT_FILES)
+    if not (has_config and has_weights):
+        return None
+    subfolder = f"{hf_prefix()}/checkpoints/step-{step}/adapter"
+    try:
+        hf_api().upload_folder(
+            folder_path=ckpt_dir,
+            path_in_repo=subfolder,
+            repo_id=HF_REPO,
+            repo_type="dataset",
+            ignore_patterns=list(_CHECKPOINT_TRAINER_STATE),
+        )
+        heartbeat("checkpoint_deployable", step=step, subfolder=subfolder)
+        return subfolder
+    except Exception as e:
+        print(f"[ckpt] deployable publish warn (step {step}):", e)
+        return None
+def make_checkpoint_upload_callback():
+    """Stream each trainer save to HF so preemption loses <= one save interval.
+    Uploads run in a background thread (the train loop never blocks on the network);
+    older checkpoints are deleted in the same commit. If an upload is still in flight
+    when the next save fires, the new save is skipped (the following one catches up).
+    Each save also publishes a deployable per-step adapter snapshot
+    (``publish_deployable_checkpoint``) so a run cancelled mid-RL can still be deployed from
+    its latest step.
+    Returns a ``transformers.TrainerCallback`` instance; all of its saves share one lock, so
+    at most one upload is in flight per callback.
+    """
+    from transformers import TrainerCallback
+    # Held from the moment a save is accepted until its background upload finishes.
+    lock = threading.Lock()
+    class _CheckpointUpload(TrainerCallback):
+        def on_save(self, args, state, control, **kwargs):
+            if not HF_REPO:
+                return
+            step = int(state.global_step)
+            ckpt_dir = os.path.join(args.output_dir, f"checkpoint-{step}")
+            if not os.path.isdir(ckpt_dir):
+                return
+            # Non-blocking acquire: never stall the training loop behind a slow upload.
+            if not lock.acquire(blocking=False):
+                print(f"[ckpt] upload busy; skipping step {step}")
+                return
+            def _upload():
+                try:
+                    # NOTE(review): huggingface_hub appears to match ``delete_patterns``
+                    # against paths RELATIVE to ``path_in_repo``; if so, this prefix-qualified
+                    # pattern matches nothing under checkpoint-<step>/ and older checkpoints
+                    # are never pruned. Confirm against the upload_folder documentation.
+                    hf_api().upload_folder(
+                        folder_path=ckpt_dir,
+                        path_in_repo=f"{hf_prefix()}/checkpoint/checkpoint-{step}",
+                        repo_id=HF_REPO,
+                        repo_type="dataset",
+                        delete_patterns=[f"{hf_prefix()}/checkpoint/**"],
+                    )
+                    heartbeat("checkpoint_uploaded", step=step)
+                    # Mirror this step's adapter to its own kept-forever path so the run
+                    # stays deployable even if it never reaches "done".
+                    publish_deployable_checkpoint(ckpt_dir, step)
+                except Exception as e:
+                    print("ckpt upload warn:", e)
+                finally:
+                    # Always free the slot, success or failure, so later saves can upload.
+                    lock.release()
+            # Daemon thread: an in-flight upload does not keep the process alive at exit.
+            threading.Thread(target=_upload, daemon=True).start()
+    return _CheckpointUpload()
+# Heartbeat HF-commit throttle. Each heartbeat() commits heartbeat.json to the HF artifact
+# repo; committing every training step (the reward callback fires per step) blows HuggingFace's
+# per-repo commit rate limit (128/hour), especially when several runs share one HF_REPO. Only
+# the per-step "rl_step" stage is high-frequency, so throttle JUST that one to once per
+# 60s; every other stage — including milestones and the terminal done/already_done — always
+# commits so the control plane never misses a transition.
+# The local file + stdout line are always written regardless.
+# Timestamp (time.time()) of the last claimed upload slot; 0.0 means "none yet".
+_HB_LAST_UPLOAD = 0.0
+# The rl_step heartbeat-upload throttle, in seconds (fixed 60s) — keeps GRPO under HF's
+# 128 commits/hour-per-repo limit when concurrent runs share one HF_REPO.
+_HB_MIN_INTERVAL_S = 60.0
+_HB_THROTTLED_STAGES = frozenset({"rl_step"})
+# Terminal transitions the control plane must never miss — always committed.
+_HB_TERMINAL_STAGES = frozenset({"done", "already_done"})
+# When True, heartbeat() uploads only terminal/"error_*" stages plus one slow keep-alive per
+# _HB_TERMINAL_ONLY_INTERVAL_S. Off by default; nothing in the reviewed part of this module
+# turns it on.
+_HB_TERMINAL_ONLY = False
+# Even in terminal-only mode, emit a SLOW heartbeat at this cadence so the control plane's
+# stall detector keeps seeing progress through a long training phase and doesn't false-stall
+# the run. 600s -> ~6 commits/hr, far under the 128/hr cap.
+_HB_TERMINAL_ONLY_INTERVAL_S = 600.0
+# Serializes heartbeat.json writes and _HB_LAST_UPLOAD reads/updates. During GRPO,
+# heartbeat() is called concurrently from the trainer thread (reward callback) and the
+# checkpoint-upload daemon thread; without this lock two writers can interleave and
+# truncate/garble heartbeat.json (and race _HB_LAST_UPLOAD).
+_HB_LOCK = threading.Lock()
+# Serializes the actual HF upload (a slow network commit) SEPARATELY from _HB_LOCK so the
+# trainer's frequent local writes never block on the network. Without it, two heartbeat
+# threads can upload heartbeat.json concurrently: a slower upload could land AFTER a newer
+# one on HF (reorder), so this lock makes uploads strictly ordered.
+_HB_UPLOAD_LOCK = threading.Lock()
+# Stall diagnostics: when FLASH_STALL_FAULTHANDLER_S > 0, arm a faulthandler watchdog that dumps
+# every thread's Python stack (then exits, so the run FAILS instead of hanging until the
+# control-plane stall watchdog kills it ~25 min later, and the dump is uploaded with
+# console_<phase>.txt). The timer is re-armed on every heartbeat, so it only fires when NO progress
+# heartbeat lands for the whole window -- i.e. a real hang. OFF by default (0); opt-in per run via
+# [worker_env]. Used to localize the GRPO sleep-mode rollout hang.
+_STALL_FAULTHANDLER_S = 0
+# An unset, empty, or unparsable value leaves the watchdog disabled (0) instead of failing
+# the import.
+with contextlib.suppress(Exception):
+    _STALL_FAULTHANDLER_S = int(os.environ.get("FLASH_STALL_FAULTHANDLER_S", "0") or 0)
+def _rearm_stall_faulthandler() -> None:
+    if _STALL_FAULTHANDLER_S <= 0:
+        return
+    with contextlib.suppress(Exception):
+        faulthandler.cancel_dump_traceback_later()
+        faulthandler.dump_traceback_later(_STALL_FAULTHANDLER_S, exit=True)
+def heartbeat(stage: str, **kw) -> None:
+    """Record a progress heartbeat locally, on stdout, and (subject to throttling) on HF.
+    The payload carries ``stage``, a ``time.time()`` timestamp, the run identity
+    (run_id/mode/seed/attempt) and every extra keyword in ``kw`` (a keyword with the same name
+    as a built-in field overrides it). It is always written to ``/tmp/hb/heartbeat.json`` and
+    printed as a ``HEARTBEAT`` line.
+    Upload policy: terminal stages and stages starting with ``error_`` always upload; in
+    terminal-only mode other stages upload at most once per ``_HB_TERMINAL_ONLY_INTERVAL_S``;
+    otherwise only the stages in ``_HB_THROTTLED_STAGES`` are rate-limited to one upload per
+    ``_HB_MIN_INTERVAL_S``. The upload goes through ``hf_upload_file`` without
+    ``required=True``. The payload must be JSON-serializable: ``json.dumps`` is called without
+    a ``default`` hook.
+    """
+    global _HB_LAST_UPLOAD
+    payload = {
+        "stage": stage,
+        "ts": time.time(),
+        "run_id": RUN_ID,
+        "mode": RUN_MODE,
+        "seed": SEED,
+        "attempt": ATTEMPT,
+        **kw,
+    }
+    # The datacenter the worker actually landed in (RunPod serverless sets RUNPOD_DC_ID) — a
+    # diagnostic so the control plane / logs show which region a run hit (the eager weight-cache fleet
+    # already has a volume in every storage DC). Empty/absent on non-RunPod (instance) workers and
+    # harmless; only emitted when present. setdefault: a caller-supplied "dc" keyword wins.
+    _dc = os.environ.get("RUNPOD_DC_ID") or ""
+    if _dc:
+        payload.setdefault("dc", _dc)
+    os.makedirs("/tmp/hb", exist_ok=True)
+    p = "/tmp/hb/heartbeat.json"
+    # _HB_LOCK guards ONLY the fast local work (atomic write + _HB_LAST_UPLOAD + snapshot capture);
+    # the slow HF commit runs OUTSIDE it so the trainer's per-step reward callback never blocks on
+    # the network behind the checkpoint daemon's commit (a GRPO perf regression).
+    with _HB_LOCK:
+        # Atomic write: temp file + os.replace() so a concurrent reader never sees a partial file.
+        # The pid/thread-id suffix keeps concurrent writers off each other's temp file.
+        tmp = p + f".{os.getpid()}.{threading.get_ident()}.tmp"
+        snapshot = json.dumps(payload)
+        with open(tmp, "w") as f:
+            f.write(snapshot)
+        os.replace(tmp, p)
+        now = time.time()
+        if stage in _HB_TERMINAL_STAGES or stage.startswith("error_"):
+            upload_due = True  # never miss a terminal transition
+        elif _HB_TERMINAL_ONLY:
+            # Benchmark fan-out: keep commits far under the 128/hour cap, but still emit a SLOW
+            # heartbeat (~every _HB_TERMINAL_ONLY_INTERVAL_S) so the control-plane stall detector
+            # sees progress during a long training phase and doesn't false-stall the run.
+            upload_due = (
+                _HB_LAST_UPLOAD == 0.0 or (now - _HB_LAST_UPLOAD) >= _HB_TERMINAL_ONLY_INTERVAL_S
+            )
+        else:
+            # Unthrottled stages always upload; throttled ones only once the interval elapsed.
+            throttled = stage in _HB_THROTTLED_STAGES
+            upload_due = not throttled or (now - _HB_LAST_UPLOAD) >= _HB_MIN_INTERVAL_S
+        if upload_due:
+            _HB_LAST_UPLOAD = now  # claim the slot under the lock (throttle stays atomic)
+    if upload_due:
+        # Serialize the network commit under a SEPARATE lock so uploads can't reorder, and
+        # upload the captured snapshot (via a private temp file, since hf_upload_file takes
+        # a path) rather than re-reading p — which a newer heartbeat may already have
+        # overwritten between our slot-claim and this upload.
+        with _HB_UPLOAD_LOCK:
+            up = p + f".{os.getpid()}.{threading.get_ident()}.upload.tmp"
+            with open(up, "w") as f:
+                f.write(snapshot)
+            try:
+                hf_upload_file(up, "heartbeat.json")
+            finally:
+                # Always remove the private upload copy, even if the upload raised.
+                with contextlib.suppress(OSError):
+                    os.remove(up)
+    # Re-arm the stall watchdog: progress landed, so reset the no-heartbeat timer.
+    _rearm_stall_faulthandler()
+    print("HEARTBEAT", json.dumps(payload))
+# ---------------------------------------------------------------------------
+# Decoding parity: render with the model's own chat template and one run-wide thinking
+# flag (off by default), so SFT targets and RL rollouts use identical prompt
+# formatting within a run.
+# ---------------------------------------------------------------------------
+def render_prompt(tokenizer, item) -> str:
+    item = item if isinstance(item, dict) else {"question": item}
+    msgs = require_active_env().prompt_messages(item)
+    return tokenizer.apply_chat_template(
+        msgs, tokenize=False, add_generation_prompt=True, enable_thinking=THINKING
+    )
+def strip_think(completion: str | None) -> str | None:
+    """Drop <think>...</think> reasoning before the environment grades/rewards a
+    thinking-mode completion.
+    - closed block(s): keep only the text after the LAST </think>. This also covers
+      always-thinking templates that pre-open <think> inside the generation prompt,
+      whose completions contain </think> with no opening tag.
+    - unclosed <think> (completion budget exhausted): keep only the pre-think text
+      (usually empty), so answer extraction fails and the completion scores 0 —
+      deliberate reward pressure to close thinking within budget, and it keeps a
+      last-number fallback from matching numbers inside the reasoning.
+    - no tags: unchanged.
+    """
+    if completion is None:
+        return None
+    if "</think>" in completion:
+        return completion.rsplit("</think>", 1)[1]
+    if "<think>" in completion:
+        return completion.split("<think>", 1)[0]
+    return completion
+def graded_text(completion: str | None) -> str | None:
+    """What the env grader/reward sees: thinking runs strip <think> blocks first (a
+    completion whose reasoning never closes grades 0 — see strip_think). Applied once
+    here, before ACTIVE_ENV.grade/reward, so it works for every environment."""
+    return strip_think(completion) if THINKING else completion
+# ---------------------------------------------------------------------------
+# SFT
+# ---------------------------------------------------------------------------
+def force_vllm_backend_for_sm120() -> str | None:
+    """Pin vLLM's attention backend to FLASHINFER on sm120 (RTX 5090 / consumer Blackwell).
+    Why: vLLM's default rollout backend is flash-attn, whose PRE-BUILT PTX needs a newer driver
+    JIT than many 5090 RunPod hosts have — when the JIT fails the colocated rollout silently
+    produces NO completions (empty reward_history, ~1.4 s "done"; a whole 22-run sweep hit this
+    on every 5090). FLASHINFER is vLLM's Blackwell-native backend (no flash-attn PTX dependency)
+    and trains on a 5090 (measured: FLASHINFER/TORCH_SDPA/TRITON_ATTN all train, ~116 s). This
+    mirrors the trainer's cuDNN-SDPA forcing on sm120 (``_attn_impl_for_capability``); the GRPO
+    no-op guard remains the backstop.
+    Returns the backend name that was set, or None when device 0 is not compute-capability
+    major 12, CUDA is unavailable, or the probe itself failed. Fixed — no operator override."""
+    backend = "FLASHINFER"
+    try:
+        import torch
+        # Short-circuits exactly like the original probe: capability is only queried with CUDA up.
+        on_sm120 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] == 12
+    except Exception as err:
+        # Best-effort probe: a missing/broken torch must not abort the run.
+        print("[rl] sm120 vLLM backend probe skipped:", err)
+        return None
+    if not on_sm120:
+        return None
+    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+    print(
+        "[rl] sm120 (RTX 5090): VLLM_ATTENTION_BACKEND=FLASHINFER (flash-attn PTX is unreliable "
+        "on consumer Blackwell hosts -> empty-rollout failures)"
+    )
+    return backend
+def finalize_alloc_conf_for_sleep() -> None:
+    """Sync the CUDA allocator conf with the worker's RESOLVED vLLM sleep default (RL runs only).
+    Background: the launcher (providers/*/train.py build_worker_env) picks the sleep-SAFE
+    non-expandable PYTORCH_ALLOC_CONF for RL before this process starts, but it can't know the
+    GRPO sleep decision: for a small model the worker resolves sleep OFF (the speed default), so
+    the non-expandable conf is safe but fragments a long colocate run. Here (we have the model
+    config + GPU) we resolve the SAME deterministic sleep default (``_memory_mode``, exactly
+    run_rl's gate) and, if sleep is OFF, switch to expandable_segments — which only crashes WITH
+    sleep on, a case we've just ruled out.
+    Ordering: PYTORCH_ALLOC_CONF is read lazily at the first CUDA allocation, so this must run
+    before any allocation (it does — called at boot).
+    Effects: no-op unless PHASE == "rl". When sleep resolves OFF, sets both PYTORCH_ALLOC_CONF
+    and PYTORCH_CUDA_ALLOC_CONF to "expandable_segments:True"; otherwise leaves the environment
+    alone. Prints the decision either way. Never raises: any failure is printed and swallowed."""
+    if PHASE != "rl":
+        return
+    try:
+        model_id = JOB_SPEC.model if JOB_SPEC else ""
+        # Resolve the sleep decision EXACTLY as run_rl does (grpo_sleep_mode: the size/context gate
+        # PLUS the resident-fit check against the live card), so the alloc conf matches the sleep
+        # mode the trainer will actually use.
+        _t = JOB_SPEC.train if JOB_SPEC else None
+        # Context length handed to the gate; stays 0 when [train] max_length is unset or unreadable.
+        ctx = 0
+        try:
+            if _t and _t.max_length:
+                ctx = int(_t.max_length)
+        except Exception:
+            ctx = 0
+        # Card VRAM in GiB; stays 0.0 when CUDA is unavailable or the probe fails.
+        card_gb = 0.0
+        try:
+            import torch as _torch_card
+            if _torch_card.cuda.is_available():
+                # Binary GiB to match grpo_fits_resident (see run_rl); /1e9 over-reports ~7%.
+                card_gb = _torch_card.cuda.get_device_properties(0).total_memory / (1024**3)
+        except Exception:
+            card_gb = 0.0
+        # Resolve group_size EXACTLY as run_rl does (gcfg override, else the recipe default), not a
+        # flat 8: if the recipe's rl.group_size differs from 8 the alloc-conf sleep decision here
+        # would diverge from the trainer's, picking the wrong expandable/non-expandable conf.
+        from flash.engine.recipe import RECIPE as _RECIPE
+        _gcfg = grpo_overrides()
+        _group_size = int(_gcfg.get("group_size") or _RECIPE.rl.group_size)
+        # NOTE(review): lora_rank falls back to a flat 32 here, whereas make_lora falls back to
+        # RECIPE.lora.rank — confirm this matches what run_rl passes, for the same reason as
+        # group_size above.
+        sleep_on = grpo_sleep_mode(
+            model_id,
+            max_length=ctx,
+            group_size=_group_size,
+            max_tokens=(_t.max_tokens if _t else None),
+            lora_rank=int(_t.lora_rank) if _t and _t.lora_rank else 32,
+            thinking=THINKING,
+            card_vram_gb=card_gb,
+        )
+        if not sleep_on:  # sleep resolves OFF -> expandable is safe + better
+            conf = "expandable_segments:True"
+            # Both spellings of the variable are set to the same value.
+            os.environ["PYTORCH_ALLOC_CONF"] = conf
+            os.environ["PYTORCH_CUDA_ALLOC_CONF"] = conf
+            print(f"[alloc] sleep resolves OFF -> {conf} (anti-fragmentation, matches worker gate)")
+        else:
+            print("[alloc] sleep resolves ON -> keeping launcher's non-expandable conf")
+    except Exception as e:
+        # Best-effort: a failure anywhere above is reported and the function returns normally.
+        print("[alloc] auto-conf skipped:", e)
+def wandb_report_to() -> list[str]:
+    """Resolve the TRL/HF ``report_to`` targets: ``["wandb"]`` when W&B is usable, else ``[]``.
+    This restores the W&B logging the legacy freesolo training path had but the flash migration
+    dropped. W&B is reported to whenever WANDB_API_KEY is present; with no key the result is []
+    (silent — the metrics.json artifact is still the source of truth).
+    Project + run name come ONLY from the typed ``[wandb]`` config (``JOB_SPEC.wandb``) — there
+    is NO WANDB_PROJECT / WANDB_NAME environment variable. HF's WandbCallback has no project
+    argument and would read WANDB_PROJECT from the env, so the run is initialized directly via
+    the wandb SDK here (``wandb.init(project=..., name=...)``) and the Trainer's callback then
+    reuses that run. ``entity=`` is not passed, so the run's entity is the API key's default
+    account/team and the only W&B env var is the WANDB_API_KEY credential."""
+    if not os.environ.get("WANDB_API_KEY"):
+        return []
+    from importlib.util import find_spec
+    if find_spec("wandb") is None:
+        print("[wandb] WANDB_API_KEY set but the wandb package is missing; skipping W&B logging")
+        return []
+    # W&B logging is optional: a partial/broken wandb install or an init failure (auth, network,
+    # runtime import error) must NOT abort training. Any failure here means no W&B logging ([]).
+    try:
+        import wandb
+        if wandb.run is None:
+            # Initialize from the spec so the project never has to come from a WANDB_PROJECT env.
+            spec_project = JOB_SPEC.wandb.project if JOB_SPEC else None
+            wandb.init(project=spec_project or "flash", name=wandb_run_name())
+    except Exception as err:
+        print(
+            f"[wandb] W&B init failed ({err}); skipping W&B logging (metrics.json is still written)"
+        )
+        return []
+    return ["wandb"]
+def wandb_run_name() -> str:
+    """Return the W&B run name for this job.
+    The only source is the typed ``[wandb] run_name`` config (``JOB_SPEC.wandb.run_name``) —
+    no WANDB_NAME environment variable. A non-blank configured name is used as given, minus
+    surrounding whitespace (the user owns the naming). Otherwise the name is a stable id tying
+    the dashboard run to the Flash run: ``flash-<phase>-<run_id>-seed<N>``. The result is passed
+    to the Trainer via ``TrainingArguments.run_name`` and to ``wandb.init``."""
+    requested = JOB_SPEC.wandb.run_name if JOB_SPEC else None
+    if requested:
+        trimmed = requested.strip()
+        if trimmed:
+            return trimmed
+    return f"flash-{PHASE}-{RUN_ID}-seed{SEED}"
+def wandb_run_info() -> dict:
+    """Describe the live W&B run as ``{wandb_url, wandb_id, wandb_project}``, or ``{}`` if none.
+    The result is recorded in metrics.json so the W&B run is verifiable and the freesolo agent's
+    `wandb_runs` / the SDK's link_wandb can point at the real dashboard URL — the link the flash
+    migration otherwise dropped. Attributes missing on the run object come back as None.
+    Never raises: a missing wandb package or any other error yields ``{}``."""
+    info: dict = {}
+    try:
+        import wandb
+        active = getattr(wandb, "run", None)
+        if active is not None:
+            info = {
+                f"wandb_{field}": getattr(active, field, None)
+                for field in ("url", "id", "project")
+            }
+    except Exception:
+        info = {}
+    return info
+def make_lora(model_id: str | None = None):
+    """Build the PEFT ``LoraConfig`` used for training.
+    We target 'all-linear' (every nn.Linear) rather than a hardcoded q/k/v/o list: it is
+    architecture-agnostic, so the same recipe works for the dense default
+    (Qwen3-4B-Instruct-2507) and for newer models with extra projection types (e.g. the
+    Qwen3.5 hybrid Gated-DeltaNet) without missing any adapters. For natively-multimodal
+    checkpoints the vision tower is excluded (see ``lora_exclude_modules``).
+    Args:
+        model_id: optional model id; when given, ``lora_exclude_modules(model_id)`` decides
+            which modules (if any) are passed as ``exclude_modules``.
+    Returns:
+        A ``peft.LoraConfig`` with standard zero-B init and standard alpha/r scaling.
+    Rank and alpha come from the run's ``[train]`` knobs. Each one falls back to the recipe
+    default when there is no JobSpec, no train table, or the knob itself is None — so an unset
+    knob never reaches ``LoraConfig`` as None."""
+    from peft import LoraConfig
+    # Adapt every linear projection. "all-linear" is a PEFT SPECIAL string (not a module name)
+    # that PEFT expands to all linear layers — the right managed default across the catalog.
+    targets = "all-linear"
+    # [train] knobs are optional elsewhere in this file (recipe fallback on None); do the same
+    # here instead of passing an unset value straight through.
+    train = JOB_SPEC.train if JOB_SPEC else None
+    rank = train.lora_rank if train and train.lora_rank is not None else RECIPE.lora.rank
+    alpha = train.lora_alpha if train and train.lora_alpha is not None else RECIPE.lora.alpha
+    kwargs = {
+        "r": rank,
+        "lora_alpha": alpha,
+        "lora_dropout": RECIPE.lora.dropout,
+        "target_modules": targets,
+        "task_type": "CAUSAL_LM",
+        # Adapter initialization: standard zero-B init (the LoRA delta starts at zero, so the
+        # saved adapter is a plain residual that loads correctly onto the ORIGINAL base).
+        # PiSSA was removed: it mutates the effective base during training, so its saved adapter
+        # only reconstructs against the PiSSA-residual base. Loading that adapter onto the
+        # unmodified base at SERVING or GRPO WARM-START (which is exactly our flow) corrupts the
+        # model -> the served model emits only whitespace and warm-start GRPO hangs. peft can
+        # convert PiSSA->standard on save, but the simpler, robust choice is the default init
+        # (the convergence gain isn't worth silently breaking serve + warm-start).
+        "init_lora_weights": True,
+        # Standard LoRA scaling (alpha/r). rsLoRA was removed: it scales by alpha/sqrt(r) (~5.6x
+        # larger for r=32/alpha=64), so with the usual LoRA LR (e.g. 2e-4) the effective update
+        # is ~5.6x too large -> SFT diverges to a degenerate adapter (served model repeats a
+        # single token / emits whitespace) and the adapter is also fragile under vLLM's rsLoRA
+        # handling at serve time. Standard scaling keeps the catalog LRs sane and the saved
+        # adapter serve-safe.
+        "use_rslora": False,
+    }
+    print(
+        "[lora] init_lora_weights=True (standard zero-B; PiSSA removed for serve/warm-start safety)"
+    )
+    # Exclusions only make sense with the "all-linear" target, which is the only target used here.
+    if model_id:
+        exclude = lora_exclude_modules(model_id)
+        if exclude:
+            kwargs["exclude_modules"] = exclude
+            print(f"[lora] excluding modules for {model_id}: {exclude}")
+    return LoraConfig(**kwargs)
+def require_vllm_for_rollout_func(use_rollout_func: bool, use_vllm: bool, model_id: str) -> None:
+    """Raise early if a multi-turn GRPO run would start without its colocated vLLM engine.
+    The multi-turn rollout closure (``multiturn_rollout.build_rollout_func``) generates through
+    ``trainer.vllm_generation.llm``, and TRL only creates that engine when ``use_vllm`` is True —
+    so with vLLM disabled the rollout would AttributeError at the first turn. GRPO now always
+    colocates vLLM (``use_vllm`` is unconditionally True), which makes this guard defensive; it
+    stays so a future tier that disables the rollout engine fails fast with an actionable message.
+    Raises:
+        RuntimeError: when ``use_rollout_func`` is set but ``use_vllm`` is not.
+    """
+    # Nothing to enforce when vLLM is on, or when no rollout function is in play.
+    if use_vllm or not use_rollout_func:
+        return
+    message = (
+        f"multi-turn GRPO needs colocated vLLM, which is disabled for {model_id}. "
+        "Use a single-turn environment for this model, or a model tier that keeps "
+        "vLLM enabled for rollouts."
+    )
+    raise RuntimeError(message)
+def run_sft():
+    from datasets import Dataset
+    from transformers import AutoTokenizer
+    from trl import SFTConfig as TRLSFTConfig
+    from trl import SFTTrainer
+    env = require_active_env()  # fail loudly (not AttributeError: NoneType) on the no-JobSpec path
+    t_start = time.time()
+    heartbeat("sft_start", gpu=gpu_diagnostics())
+    # SFT on a multi-turn env: rows whose target completion is a full trajectory train on the whole
+    # transcript (proper multi-turn SFT, handled below); rows with a single-turn target completion
+    # collapse to one assistant turn. Warn only for the collapsing case (computed during the
+    # dataset build below), not unconditionally.
+    wait_for_gpu()
+    setup_perf_backends()
+    model_id = JOB_SPEC.model if JOB_SPEC else RECIPE.hf_model_id
+    download_seconds = prefetch_model(model_id)
+    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    if tok.pad_token is None:
+        tok.pad_token = tok.eos_token
+    # Build SFT text dataset (seeded shuffle for reproducibility)
+    train = env.dataset()
+    rng = random.Random(SEED)
+    rng.shuffle(train)
+    max_examples = int(
+        JOB_SPEC.train.max_examples or 0
+        if JOB_SPEC and JOB_SPEC.train and JOB_SPEC.train.max_examples is not None
+        else 0
+    )
+    if max_examples > 0:
+        train = train[:max_examples]
+    texts = []
+    multiturn_targets = 0
+    for ex in train:
+        # The env (via the freesolo-sdk Environment.sft_completion) owns the target completion: the
+        # full multi-turn target trajectory (assistant turns + tool calls + tool results + replies)
+        # when the row ships one, else a single target assistant turn. Training on the whole
+        # transcript is what makes SFT actually multi-turn (the tool-call protocol + replies) — the
+        # warm start the GRPO recipe expects. A >1-message completion is a multi-turn trajectory.
+        completion = env.sft_completion(ex)
+        if len(completion) > 1:  # a multi-turn target trajectory (vs a single assistant turn)
+            multiturn_targets += 1
+        msgs = [*env.prompt_messages(ex), *completion]
+        texts.append(
+            {
+                "text": tok.apply_chat_template(
+                    msgs, tokenize=False, add_generation_prompt=False, enable_thinking=THINKING
+                )
+            }
+        )
+    if multiturn_targets:
+        print(f"[sft] multi-turn SFT: {multiturn_targets}/{len(train)} rows train on a full target transcript")
+    elif getattr(env, "multi_turn", False):
+        print(
+            "[sft][warn] this is a multi-turn Freesolo environment but no row ships a multi-turn "
+            "target completion; SFT collapses to a single assistant turn per row (tool/env turns "
+            "ignored). Provide target transcripts (output={\"messages\": [...]}) for proper multi-turn SFT."
+        )
+    if THINKING and not any("<think>" in t["text"] for t in texts[:256]):
+        print(
+            "WARN: thinking mode is ON but no sampled SFT target contains a <think> "
+            "trace — training on non-reasoning targets teaches the model to SKIP "
+            "thinking. Use a dataset with reasoning traces, or set thinking = false."
+        )
+    ds = Dataset.from_list(texts)
+    setup_seconds = time.time() - t_start
+    heartbeat("sft_model_load", setup_seconds=setup_seconds, gpu=gpu_diagnostics())
+    # Epochs come from the run's [train] epochs (already in JOB_SPEC), else the recipe default.
+    epochs = int(
+        JOB_SPEC.train.epochs
+        if JOB_SPEC and JOB_SPEC.train.epochs is not None
+        else RECIPE.sft.num_epochs
+    )
+    # SDK [train] knobs override the recipe default.
+    from flash.catalog import vocab_size_for
+    from flash.engine.vram import resolve_params_b, sft_grad_accum, sft_logits_fused
+    _t = JOB_SPEC.train if JOB_SPEC else None
+    sft_lr = _t.learning_rate if _t and _t.learning_rate is not None else RECIPE.sft.learning_rate
+    sft_max_len = (
+        _t.max_length
+        if _t and _t.max_length is not None
+        else (RECIPE.sft.max_seq_len_thinking if THINKING else RECIPE.sft.max_seq_len)
+    )
+    # batch_size is the GLOBAL/effective batch; sft_grad_accum sizes the per-device micro-batch +
+    # grad-accum to realize it (shared with the cost estimator's step count, see engine.vram).
+    effective_batch = (
+        _t.batch_size if _t and _t.batch_size is not None else RECIPE.sft.effective_batch
+    )
+    # Large-vocab OOM guard: when the fused CE (Liger) is OFF, the SFTTrainer materializes the full
+    # [per_device, seq, vocab] fp32 logits + grad — at Qwen3.5's ~248k vocab a 0.8B SFT OOM'd a
+    # 24 GB card in backward. Cap the per-device micro-batch by the real model vocab + seq so those
+    # logits stay within the logits budget; grad-accum rises to keep the effective batch unchanged
+    # (the SFT mirror of rl_per_device_comps' GRPO cap). fused mirrors liger_on(_memory_mode(...))
+    # below, so the cap binds exactly when the worker won't fuse the CE.
+    _sft_params_b = resolve_params_b(model_id)  # catalog stat else HF safetensors (open models)
+    _sft_vocab = vocab_size_for(model_id)
+    # Actual fused-CE decision == what `use_liger_kernel` is set from below (line ~879). sft_logits_fused
+    # is the offline size/ctx mirror; liger_on(...) adds the runtime CUDA + liger_kernel-importable
+    # check, so the cap binds exactly when the fused CE is NOT really taken.
+    _sft_fused = sft_logits_fused(_sft_params_b, sft_max_len) and liger_on(
+        _memory_mode(model_id, sft_max_len)
+    )
+    per_device_bs, grad_accum = sft_grad_accum(
+        effective_batch, seq_len=sft_max_len, vocab=_sft_vocab, fused=_sft_fused
+    )
+    if not _sft_fused and per_device_bs < min(effective_batch, 4):
+        print(
+            f"[sft] large-vocab logits cap: per_device={per_device_bs} grad_accum={grad_accum} "
+            f"(seq={sft_max_len}, vocab={_sft_vocab}; realized batch "
+            f"{per_device_bs * grad_accum} >= requested {effective_batch})"
+        )
+    sft_save_default = _t.save_every if _t and _t.save_every is not None else 50
+    out_dir = f"/tmp/sft_seed{SEED}"
+    resume_ckpt = hf_resume_checkpoint()
+    # [train].max_steps>0 caps optimizer steps (used by the cheap pre-flight smoke).
+    max_steps = int(_t.max_steps or 0 if _t and _t.max_steps is not None else 0)
+    cfg_kwargs = {
+        "output_dir": out_dir,
+        "num_train_epochs": epochs,
+        "per_device_train_batch_size": per_device_bs,
+        "gradient_accumulation_steps": grad_accum,
+        "learning_rate": sft_lr,
+        "warmup_ratio": RECIPE.sft.warmup_frac,
+        "logging_steps": 10,
+        "save_steps": sft_save_default,
+        "save_total_limit": 1,
+        # Resumable checkpoints: save the optimizer / scheduler / RNG state alongside the (small)
+        # LoRA adapter. We DO resume mid-run — make_checkpoint_upload_callback streams each save to
+        # HF and a replacement worker calls resume_from_checkpoint(hf_resume_checkpoint()) after a
+        # preemption — so without this the resumed run would re-initialize the optimizer (Adam
+        # moments) and LR schedule instead of truly continuing. For LoRA the optimizer state is tiny
+        # (it covers only the trainable adapter params), so the save spike is negligible. The
+        # deployable per-step snapshot (publish_deployable_checkpoint) strips this trainer state
+        # separately, so serving still gets adapter-only files.
+        "save_only_model": False,
+        "max_length": sft_max_len,
+        "bf16": True,
+        "report_to": wandb_report_to(),  # W&B when WANDB_API_KEY present (restored post-flash-migration)
+        "run_name": wandb_run_name(),
+        # Dataloader parallelism: overlap host-side collation/tokenization with GPU compute so a
+        # real (large) training set isn't dataloader-bound. Pure throughput, zero quality change.
+        # Negligible on the tiny benchmark (pre-tokenized, in-memory); a real win at production
+        # dataset sizes.
+        "dataloader_num_workers": 4,
+        "dataloader_pin_memory": True,
+        "dataloader_persistent_workers": True,
+        "seed": SEED,
+        "gradient_checkpointing": grad_checkpointing_on(model_id, sft_max_len),
+        # Non-reentrant checkpointing: composes cleanly with autograd hooks (verl #3629) and is
+        # required by TRL for correct grad flow through the LoRA adapters.
+        "gradient_checkpointing_kwargs": {"use_reentrant": False},
+        "completion_only_loss": False,
+        # Optimizer: 8-bit paged AdamW (int8 state paged to host RAM -> fits a smaller GPU).
+        "optim": fused_optim_name(),
+    }
+    if max_steps > 0:
+        cfg_kwargs["max_steps"] = max_steps
+    # Example packing: concatenate short examples into full max_length sequences so a batch isn't
+    # mostly pad tokens — PR #174 measured a 4.4-10.7x SFT speedup (h100 8.2x, 4090 10.7x) because
+    # instruction targets are far shorter than max_seq_len; unpacked batches waste most of their
+    # FLOPs on padding. TRL's 'bfd' strategy makes padding-free batches whose example boundaries are
+    # honored ONLY by an attention impl that reads them — under plain SDPA packed examples
+    # cross-contaminate (silent quality loss). The boundary-correct backend is FlashAttention-2
+    # varlen (reads position_ids), which the worker image bakes in best-effort: Dockerfile.worker
+    # installs FLASH_ATTN_SPEC (a community cu128/torch2.10/cp312 wheel preferred, source build as a
+    # fallback) and tolerates a build failure -> SDPA. So _fa_ok is True whenever that install landed;
+    # packing is ON then (varlen keeps 'bfd' example boundaries correct). If the best-effort install
+    # failed, _fa_ok is False and we SKIP packing — without a boundary-correct attn backend examples
+    # would cross-contaminate under SDPA.
+    # Pure full-attention vs GatedDeltaNet hybrid (Qwen3.5/3.6) — probed ONCE here and reused across
+    # the whole packing decision (each probe reads the cached HF config). TRL 'bfd' packing keeps
+    # example boundaries via position_ids that a varlen attn honors, but it provides NO seq_idx, so it
+    # can't reset a GDN hybrid's causal conv -> bfd-packing a GDN model silently cross-contaminates its
+    # linear-attention layers. So bfd is enabled for PURE full-attention models only; GDN hybrids pack
+    # via the cu_seqlens/seq_idx varlen collator branch below (when their kernels are present).
+    _pure_attn = model_is_pure_attention(model_id)
+    _gdn = model_is_gdn_hybrid(model_id)
+    _fa_ok = _flash_attn_available()
+    if _fa_ok and _pure_attn:
+        cfg_kwargs["packing"] = True
+        print("[sft] example packing enabled (FA2 varlen)")
+    elif _fa_ok and _gdn:
+        print(
+            "[sft] TRL bfd packing NOT used for the GatedDeltaNet hybrid (bfd can't reset the conv); "
+            "the cu_seqlens/seq_idx varlen collator handles its packing when both kernels are present."
+        )
+    else:
+        # FA2 bfd packing not enabled here — either flash_attn isn't importable, or it is but the arch
+        # isn't bfd-safe (e.g. sliding-window). This is NOT the final word: the SDPA block-diagonal /
+        # GDN-varlen block below may still turn packing on for a pure-attention or GDN-hybrid model.
+        _bfd_why = "flash_attn not importable" if not _fa_ok else "arch not bfd-safe under FA2 varlen"
+        print(f"[sft] TRL bfd (FA2) packing not used ({_bfd_why}); the SDPA-mask path decides packing below.")
+    # Liger fused CE/RMSNorm/RoPE kernels, gated by model size (_memory_mode). The fused linear
+    # cross-entropy is the big large-vocab (Qwen3.5 ~248k) memory/throughput win.
+    if liger_on(_memory_mode(model_id, sft_max_len)):
+        cfg_kwargs["use_liger_kernel"] = True
+        print("[sft] liger fused kernels enabled")
+    _attn = optimal_attn_impl()  # arch-best FlashAttention (FA3 Hopper / FA2 Ampere·Ada) or SDPA
+    # Packing correctness: 'bfd' packed batches are boundary-correct ONLY under a varlen-capable attn
+    # (FA2 and FA3 both expose flash_attn_varlen_func; plain SDPA cross-contaminates packed examples).
+    # Use the ARCH-BEST flash impl optimal_attn_impl already picked (so Hopper packs under FA3, not
+    # FA2). Cases when it did NOT pick a flash impl:
+    #   * _attn == "sdpa" (sm120, the deliberate no-flash exception): DISABLE packing — consumer
+    #     Blackwell stays plain SDPA; do NOT force FA2 (its sm120 kernel coverage is unverified).
+    #   * _attn is None (Hopper without FA3): force FA2 for boundary-correct varlen IF the wheel is
+    #     importable; else drop packing rather than silently cross-contaminate.
+    if cfg_kwargs.get("packing"):
+        if _attn in ("flash_attention_2", "flash_attention_3"):
+            print(f"[sft] attn_implementation={_attn} (packing boundary-correct varlen)")
+        elif _attn == "sdpa":
+            cfg_kwargs["packing"] = False
+            print("[sft] packing disabled: selected attn_implementation=sdpa (no varlen flash backend)")
+        elif _fa_ok:
+            _attn = "flash_attention_2"
+            print("[sft] attn_implementation=flash_attention_2 (packing boundary-correct varlen)")
+        else:
+            cfg_kwargs["packing"] = False
+            print("[sft] packing disabled: no varlen flash backend (FA2/FA3) available -> plain SDPA")
+    # --- True token packing via a 4D block-diagonal SDPA mask (no flash-attn / no flex) ---------
+    # When the run lands on plain SDPA (no varlen flash backend) the block above left packing OFF —
+    # notably on sm120 (RTX 5090, flash's DEFAULT GPU), and anywhere the best-effort flash-attn
+    # build didn't land. For a PURE full-attention model we can still pack: concatenate examples
+    # into max_length blocks and feed a 4D block-diagonal causal mask SDPA honors natively, so
+    # packed examples never attend across boundaries (boundary-correct, numerically identical to
+    # unpacked — verified on a tiny Qwen3/Llama: |packed-separate| logits ~1e-7). This reclaims the packing
+    # throughput win on the default GPU with neither flash-attn nor flex_attention. GatedDeltaNet
+    # hybrids (Qwen3.5/3.6) take the NEXT branch instead — a mask alone can't reset their linear-
+    # attention state, so they also need the cu_seqlens/seq_idx varlen kwargs.
+    _collator = None
+    # The mask paths materialize a dense [B, 1, T, T] mask — O(T^2) memory. At very long context that
+    # tax (hundreds of MB to >1 GB) can OOM a run that previously fit under memory-efficient SDPA, and
+    # packing buys little there anyway (long rows already fill a block). Above this cap, leave packing
+    # off (train unpacked, as today). 16384: the dense bf16/bool mask stays <=~256 MB at bsz=1.
+    _PACK_MASK_MAX_LEN = 16384
+    _mask_pack_ok = sft_max_len <= _PACK_MASK_MAX_LEN
+    _sdpa_pack = bool(not cfg_kwargs.get("packing") and _pure_attn and _mask_pack_ok)
+    if _sdpa_pack:
+        # The 4D mask requires a MASK-READING attn (SDPA). DOWNGRADE any flash impl optimal_attn_impl
+        # picked — e.g. FA3 on a Hopper worker whose FA2 wheel didn't build — to SDPA: a flash varlen
+        # kernel SILENTLY IGNORES the 4D mask, so packed examples would attend across boundaries. (A
+        # bare ``_attn or "sdpa"`` would leave the truthy flash string in place — the bug this avoids.)
+        if _attn in ("flash_attention_2", "flash_attention_3"):
+            print(f"[sft] packing under SDPA: downgrading {_attn} -> sdpa (a flash kernel ignores the 4D mask)")
+        _attn = "sdpa"
+        cfg_kwargs["packing"] = False  # we own the packing; TRL must not also pack
+        # Hand TRL pre-tokenized, pre-packed rows + our collator: skip its dataset prep and stop the
+        # signature-based column pruning from dropping our seq_lengths column before collation.
+        _dk = dict(cfg_kwargs.get("dataset_kwargs") or {})
+        _dk["skip_prepare_dataset"] = True
+        cfg_kwargs["dataset_kwargs"] = _dk
+        cfg_kwargs["remove_unused_columns"] = False
+        # Tokenize EXACTLY like TRL's non-packed prep (EOS-append parity so the model still learns to
+        # stop; batched; truncate to max_length) then bin-pack into <= max_length blocks.
+        _tokenized = tokenize_for_packing([t["text"] for t in texts], tok, sft_max_len)
+        _packed_rows = pack_token_ids(_tokenized, sft_max_len)
+        ds = Dataset.from_list(_packed_rows)
+        _collator = BlockDiagonalCollator(pad_token_id=tok.pad_token_id)
+        # Memory: re-size the per-device micro-batch (in BLOCKS) for the full-block [pd, max_length,
+        # vocab] fp32 logits budget — a no-op under Liger's fused CE. Quality: each block holds
+        # ~ex_per_block examples, so KEEP the effective batch in EXAMPLES at the configured value by
+        # re-deriving grad_accum from the block count. Without this, packing balloons the effective
+        # batch ~ex_per_block-fold (fewer, larger updates -> mild undertraining at the same epochs:
+        # an A/B measured +5.2% held-out loss vs unpacked, closed to +0.1% once matched).
+        _pd_pack, _ = sft_grad_accum(
+            effective_batch, seq_len=sft_max_len, vocab=vocab_size_for(model_id),
+            fused=bool(cfg_kwargs.get("use_liger_kernel")),
+        )
+        # The dense [pd, 1, T, T] bool mask is pd*T^2 bytes — under Liger the logits cap doesn't bind
+        # so pd can be 4, and at long context that mask alone is GBs. Cap pd so the mask stays <=512MB
+        # (a no-op at short ctx: at T=2048 it allows pd up to ~125; it only bites past ~12k tokens).
+        _pd_pack = max(1, min(_pd_pack, (512 * 1024 * 1024) // (sft_max_len * sft_max_len)))
+        _ex_per_block = len(_tokenized) / max(1, len(_packed_rows))
+        cfg_kwargs["per_device_train_batch_size"] = _pd_pack
+        cfg_kwargs["gradient_accumulation_steps"] = max(
+            1, math.ceil(effective_batch / max(1.0, _pd_pack * _ex_per_block))
+        )
+        print(
+            "[sft] true token packing ENABLED (4D block-diagonal SDPA mask): "
+            f"{len(_tokenized)} examples -> {len(_packed_rows)} blocks (~{_ex_per_block:.1f} ex/block, "
+            f"{packing_efficiency(_packed_rows, sft_max_len):.0%} dense) of <= {sft_max_len} tok; "
+            f"pd={_pd_pack} ga={cfg_kwargs['gradient_accumulation_steps']} (effective batch kept "
+            f"~{effective_batch} ex); no flash-attn / no flex_attention"
+        )
+    elif not cfg_kwargs.get("packing") and _gdn and gdn_packing_available(model_id) and _mask_pack_ok:
+        # GatedDeltaNet hybrid (Qwen3.5/3.6, flash's flagship tier): the 4D block-diagonal mask makes
+        # the FULL-attention layers boundary-correct, and the linear-attention (DeltaNet) layers reset
+        # their recurrence + causal conv at example boundaries via cu_seq_lens_q (fla kernel) + seq_idx
+        # (causal_conv1d). GPU-validated on Qwen3.5-0.8B (RTX 5090): a packed example's output is
+        # byte-identical regardless of its neighbors' content (ZERO cross-example leakage); the only
+        # diff vs unpacked is benign bf16 GDN-kernel tiling numerics (~0.3 on logits). Gated on BOTH
+        # kernels being importable (gdn_packing_available) so a worker without them stays unpacked.
+        # Pin SDPA for the full-attn layers (downgrade any flash impl, e.g. FA3 on Hopper — it would
+        # ignore the 4D mask); the DeltaNet layers are unaffected (they use cu_seqlens/seq_idx).
+        if _attn in ("flash_attention_2", "flash_attention_3"):
+            print(f"[sft] GDN packing under SDPA: downgrading {_attn} -> sdpa for the full-attn layers")
+        _attn = "sdpa"
+        cfg_kwargs["packing"] = False
+        _dk = dict(cfg_kwargs.get("dataset_kwargs") or {})
+        _dk["skip_prepare_dataset"] = True
+        cfg_kwargs["dataset_kwargs"] = _dk
+        cfg_kwargs["remove_unused_columns"] = False
+        # EOS-append parity + batched + truncated tokenization (same as the unpacked path), then pack.
+        _tokenized = tokenize_for_packing([t["text"] for t in texts], tok, sft_max_len)
+        _packed_rows = pack_token_ids(_tokenized, sft_max_len)
+        ds = Dataset.from_list(_packed_rows)
+        _collator = BlockDiagonalCollator(pad_token_id=tok.pad_token_id, emit_varlen=True)
+        # cu_seqlens spans ONE packed block, so per-device is a single block; keep the effective batch
+        # in EXAMPLES at the configured value via grad-accum (each block holds ~ex_per_block examples —
+        # without this the effective batch would balloon ~ex_per_block-fold -> undertraining).
+        _ex_per_block = len(_tokenized) / max(1, len(_packed_rows))
+        cfg_kwargs["per_device_train_batch_size"] = 1
+        cfg_kwargs["gradient_accumulation_steps"] = max(1, math.ceil(effective_batch / max(1.0, _ex_per_block)))
+        print(
+            "[sft] true token packing ENABLED for GatedDeltaNet hybrid (4D mask + cu_seqlens/seq_idx "
+            f"varlen): {len(_tokenized)} examples -> {len(_packed_rows)} blocks (~{_ex_per_block:.1f} "
+            f"ex/block, {packing_efficiency(_packed_rows, sft_max_len):.0%} dense) of <= {sft_max_len} "
+            f"tok; pd=1 ga={cfg_kwargs['gradient_accumulation_steps']} (effective batch kept ~{effective_batch} ex)"
+        )
+    elif not cfg_kwargs.get("packing") and (_pure_attn or _gdn) and not _mask_pack_ok:
+        print(
+            f"[sft] packing stays OFF: max_length {sft_max_len} > {_PACK_MASK_MAX_LEN} — the dense "
+            "O(T^2) block-diagonal mask gets too large at long context (unpacked is more memory-"
+            "efficient there, and long rows already fill a block)."
+        )
+    elif not cfg_kwargs.get("packing") and not _pure_attn:
+        _why = (
+            "hybrid GatedDeltaNet but the fla/causal_conv1d varlen kernels aren't both importable"
+            if _gdn
+            else "non-full-attention arch (e.g. sliding-window) a block-diagonal mask can't pack"
+        )
+        print(f"[sft] packing stays OFF: {_why}. (Pure full-attention models pack via the SDPA mask.)")
+    # Explicit bf16 + no auto device-map: TRL/transformers-5 string loading can
+    # otherwise fall back to fp32 (2x VRAM; observed 18.6 GB for a 4.66B model) or
+    # accelerate-offload large models to meta ("expected device meta but got
+    # cuda:0" in backward on the 9B).
+    mik = {"dtype": "bfloat16", "device_map": None}
+    if _attn:
+        mik["attn_implementation"] = _attn
+    cfg_kwargs["model_init_kwargs"] = mik
+    cfg = TRLSFTConfig(**cfg_kwargs)
+    # LoRA+ (convergence lever, arXiv 2402.12354; always-on: measured -52% train loss in A/B
+    # (gpu-bench)): give the LoRA B matrices a higher LR than A (ratio 16). Reported ~2x fewer steps
+    # to target at identical per-step FLOPs. TRL builds the model from a string inside __init__, so
+    # the optimizer (which needs the instantiated params) can't be pre-built — override
+    # create_optimizer to construct it from self.model once it exists.
+    _lp_ratio = 16
+    _SFT = SFTTrainer
+    if _lp_ratio > 1:
+        class _SFT(SFTTrainer):  # local LoRA+ subclass
+            _loraplus_applied = False  # True only once the LoRA+ grouping actually installs
+            def create_optimizer(self):
+                if self.optimizer is None:
+                    try:
+                        from peft.optimizers import create_loraplus_optimizer
+                        # Mirror the configured `optim` so LoRA+ and the 8-bit paged optimizer state
+                        # coexist (instead of silently forcing fp32 AdamW); see loraplus_optimizer_cls.
+                        # .value (not str()): self.args.optim is a TRL OptimizerNames enum whose
+                        # str() is "OptimizerNames.PAGED_ADAMW_8BIT"; pass the raw value
+                        # ("paged_adamw_8bit") so the 8-bit match works.
+                        opt_cls, extra = loraplus_optimizer_cls(
+                            getattr(self.args.optim, "value", self.args.optim)
+                        )
+                        # Forward the TrainingArguments optimizer config that the default HF
+                        # create_optimizer path would have applied. Building the optimizer
+                        # ourselves means we must replicate it explicitly, or LoRA+ runs would
+                        # silently use the optimizer class's own defaults instead of the
+                        # configured betas/eps/weight_decay. betas/eps go straight to the optimizer
+                        # constructor (alongside any `extra` from loraplus_optimizer_cls);
+                        # weight_decay is handled separately below.
+                        fwd = dict(extra)
+                        _betas = (
+                            getattr(self.args, "adam_beta1", None),
+                            getattr(self.args, "adam_beta2", None),
+                        )
+                        if None not in _betas:
+                            fwd.setdefault("betas", _betas)
+                        _eps = getattr(self.args, "adam_epsilon", None)
+                        if _eps is not None:
+                            fwd.setdefault("eps", _eps)
+                        # PEFT does NOT read args.weight_decay; it applies decay via its own LoRA+
+                        # param groups, keyed off the loraplus_weight_decay kwarg (which it pops
+                        # before constructing the optimizer). Pass it as a top-level kwarg so it
+                        # isn't forwarded into the optimizer constructor.
+                        lp_extra: dict[str, object] = {}
+                        _wd = getattr(self.args, "weight_decay", None)
+                        if _wd is not None:
+                            lp_extra["loraplus_weight_decay"] = _wd
+                        # PEFT's create_loraplus_optimizer forwards extra kwargs to the optimizer;
+                        # the lr keyword name has shifted across PEFT versions, so pass it via
+                        # optimizer_kwargs (the stable form) and fall back to a top-level lr=.
+                        try:
+                            self.optimizer = create_loraplus_optimizer(
+                                model=self.model,
+                                optimizer_cls=opt_cls,
+                                optimizer_kwargs={"lr": self.args.learning_rate, **fwd},
+                                loraplus_lr_ratio=_lp_ratio,
+                                **lp_extra,
+                            )
+                        except TypeError:
+                            self.optimizer = create_loraplus_optimizer(
+                                model=self.model,
+                                optimizer_cls=opt_cls,
+                                lr=self.args.learning_rate,
+                                loraplus_lr_ratio=_lp_ratio,
+                                **fwd,
+                                **lp_extra,
+                            )
+                        self._loraplus_applied = True
+                        print(
+                            f"[lora+] optimizer enabled (B-matrix LR ratio={_lp_ratio}, "
+                            f"cls={opt_cls.__name__})"
+                        )
+                        return self.optimizer
+                    except Exception as e:  # never block training on the LoRA+ wiring
+                        print("[lora+] setup failed, falling back to default optimizer:", e)
+                return super().create_optimizer()
+    # Pass model as a string id + tokenizer as processing_class so TRL takes the
+    # text/causal-LM path (not the VLM processor path) for this multimodal checkpoint.
+    # SFTTrainer.__init__ blocks for 10-15 min on first use (FA2 CUDA kernel JIT compilation);
+    # without a heartbeat the control plane can't distinguish this from a real hang and may
+    # recycle the worker. A daemon thread pings every 30s so the stall detector stays quiet.
+    _sft_init_done = threading.Event()
+    def _sft_init_heartbeat() -> None:
+        while not _sft_init_done.wait(30.0):
+            heartbeat("sft_initializing", gpu=gpu_diagnostics())
+    _sft_init_hb = threading.Thread(target=_sft_init_heartbeat, daemon=True)
+    _sft_init_hb.start()
+    try:
+        trainer = _SFT(
+            model=model_id,
+            args=cfg,
+            train_dataset=ds,
+            peft_config=make_lora(model_id),
+            processing_class=tok,
+            # Our block-diagonal collator on the SDPA-packing path; None elsewhere == TRL default.
+            data_collator=_collator,
+            callbacks=[make_sft_heartbeat_callback(), make_checkpoint_upload_callback()],
+        )
+    finally:
+        _sft_init_done.set()
+    # Apply chalk's gap-filling kernels (RoPE/LoRA-delta/embedding, like Liger) on the materialized
+    # SFT trainer.model — chalk's apply patches the LIVE module, so it must run AFTER TRL builds the
+    # model (chalk composes on top of TRL's Liger). No-op unless a FLASH_* kernel flag selects it and
+    # freesolo-chalk is installed.
+    _chalk_report = install_chalk_kernels(getattr(trainer, "model", None))
+    _reset_peak_gpu()  # so peak_gpu_gb reflects the train loop (optimizer-state A/B is measurable)
+    _gpu_sampler = _GpuPeakSampler().start()  # true device peak incl. bnb managed optimizer pages
+    t_train = time.time()
+    with _sdpa_cudnn_ctx(_attn):  # force cuDNN SDPA on sm120 (no-op otherwise)
+        trainer.train(resume_from_checkpoint=resume_ckpt)
+    train_wall = time.time() - t_train
+    sft_peak_gpu_gb = _peak_gpu_gb()
+    sft_device_peak_gpu_gb = _gpu_sampler.stop_gb()
+    adapter_dir = f"{out_dir}/adapter"
+    trainer.model.save_pretrained(adapter_dir)
+    tok.save_pretrained(adapter_dir)
+    hf_upload_folder(adapter_dir, "adapter", required=True)
+    heartbeat("sft_trained", train_wall=train_wall, gpu=gpu_diagnostics())
+    # count train tokens
+    train_tokens = int(sum(len(tok(t["text"])["input_ids"]) for t in texts) * epochs)
+    # Write train metadata + the completion sentinel (metrics.json/DONE) for this phase.
+    write_train_meta(
+        phase="sft",
+        adapter_dir=adapter_dir,
+        model_id=model_id,
+        train_wall=train_wall,
+        setup_seconds=setup_seconds,
+        train_tokens=train_tokens,
+        generated_tokens=0,
+        notes={
+            "epochs": epochs,
+            "resumed": bool(resume_ckpt),
+            "download_seconds": download_seconds,
+            "hf_transfer": os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", ""),
+            "thinking": THINKING,
+            # Persist the loss curve so a CONVERGENCE A/B (PiSSA / LoRA+ init, etc.) is measurable
+            # without a checkpoint: trainer_state.json is only written on a save_step, and the
+            # console is only uploaded on failure, so a short successful run otherwise drops its
+            # loss history entirely.
+            "loss_curve": _metric_curve(trainer, "loss"),
+            # Peak torch-allocated GPU memory during the train loop (excludes bnb managed pages, so
+            # it overstates the 8-bit saving — use device_peak_gpu_gb for the true footprint).
+            "peak_gpu_gb": sft_peak_gpu_gb,
+            # True peak device memory (total-free, incl. bnb managed optimizer pages): the honest
+            # headline for the fp32-vs-8-bit LoRA+ optimizer A/B.
+            "device_peak_gpu_gb": sft_device_peak_gpu_gb,
+            # Report the optimizer ACTUALLY built on the trainer, not the planned class: if the
+            # LoRA+ create_optimizer override failed, training falls back to TRL's configured
+            # optimizer without LoRA+ grouping. loraplus_applied records which path actually ran.
+            # Accelerate wraps the optimizer (AcceleratedOptimizer) under transformers 5.x, so unwrap
+            # via `.optimizer` to record the underlying PagedAdamW8bit/AdamW the A/B cares about, not
+            # the wrapper name.
+            "loraplus_optim": (
+                type(getattr(trainer.optimizer, "optimizer", trainer.optimizer)).__name__
+                if getattr(trainer, "optimizer", None) is not None
+                else loraplus_optimizer_cls(fused_optim_name())[0].__name__
+            ),
+            "loraplus_applied": getattr(trainer, "_loraplus_applied", False),
+            # Which chalk gap-filling kernels actually ENGAGED (empty/None = chalk not installed or
+            # every kernel fell back) — verifies the chalk stack without the console.
+            "chalk_kernels": active_kernels(_chalk_report) or None,
+            **wandb_run_info(),
+        },
+    )
+    free_gpu(trainer)
+# ---------------------------------------------------------------------------
+# RL (GRPO) with TRL + colocated vLLM
+# ---------------------------------------------------------------------------
+def compute_grpo_batching(prompts_per_step: int, group_size: int, per_device_comps: int) -> dict:
+    """Translate an intended ``prompts_per_step`` into a TRL GRPO batch configuration.
+    TRL's GRPO batch sizing is denominated in **completions (prompt-completion pairs), not
+    prompts**. The number of *unique prompts* optimized per step is
+        (per_device_train_batch_size * gradient_accumulation_steps * num_processes)
+        / num_generations
+    So to actually optimize ``prompts_per_step`` prompts per step, the global *completion*
+    batch must equal ``prompts_per_step * group_size``. We keep ``per_device`` small (it,
+    not grad-accum, sets peak VRAM) and put the rest in gradient accumulation.
+    The bug this fixes: ``grad_accum = prompts_per_step // per_device`` treated
+    ``per_device_train_batch_size`` as a *prompt* count, omitting the ``* group_size``
+    factor, so a run intended as 64 prompts/step actually optimized only
+    ``64 / group_size = 8`` prompts/step (an 8x smaller effective batch).
+    """
+    group_size = max(1, int(group_size))
+    prompts_per_step = max(1, int(prompts_per_step))
+    per_device = max(1, int(per_device_comps))
+    target_comps = prompts_per_step * group_size  # total completions / optimizer step
+    # Never let the per-device completion micro-batch exceed the target completion batch:
+    # a small prompts_per_step would otherwise overshoot it (mirrors run_sft's
+    # `min(per_device_bs, effective_batch)`). No-op at the default (prompts_per_step=64).
+    per_device = max(1, min(per_device, target_comps))
+    # per_device is the fixed VRAM knob, but when it does NOT divide target_comps neither floor
+    # nor ceil of grad_accum is right: floor (the old bug) silently optimizes FEWER prompts than
+    # requested, while ceil over-shoots and asks TRL for MORE unique prompts than the (already
+    # dataset-capped) prompts_per_step -- which, on a small retained dataset, yields no batches
+    # after the paid worker is provisioned. Instead shrink per_device to the largest divisor of
+    # target_comps that is <= the requested per_device: that lowers (never raises) peak VRAM and
+    # makes per_device * grad_accum == target_comps EXACTLY, so unique prompts == prompts_per_step
+    # with no over/under-shoot. (per_device=16, target_comps=40 -> 10 -> grad_accum=4 -> 40 comps
+    # = exactly 5 prompts. A divisor always exists since 1 divides everything.)
+    while target_comps % per_device != 0:
+        per_device -= 1
+    grad_accum = max(1, target_comps // per_device)
+    # The global completion batch (per_device * grad_accum == target_comps) is divisible by
+    # num_generations (= group_size) by construction, since target_comps = prompts_per_step *
+    # group_size; TRL's divisibility requirement is satisfied with no further rounding.
+    generations_per_step = per_device * grad_accum
+    unique_prompts_per_step = generations_per_step // group_size
+    return {
+        "per_device_train_batch_size": per_device,
+        "gradient_accumulation_steps": grad_accum,
+        "generations_per_step": generations_per_step,
+        "unique_prompts_per_step": unique_prompts_per_step,
+        # TRL requires the global completion batch be divisible by num_generations.
+        "divisible_by_group": (generations_per_step % group_size == 0),
+    }
+def resolve_grpo_prompts_per_step(requested: int, available_prompts: int) -> int:
+    """Cap GRPO's prompt batch to the retained dataset size.
+    TRL's GRPO dataloader can yield zero batches when the configured prompt batch is larger
+    than the dataset that remains after prompt-budget filtering. That surfaces late as
+    "There seems not to be a single sample in your epoch_iterator" and then our no-reward guard
+    reports the wrong cause. Small smoke envs should still train; use every retained prompt per
+    step instead of asking TRL for an impossible larger batch.
+    """
+    requested = max(1, int(requested))
+    available_prompts = int(available_prompts)
+    if available_prompts <= 0:
+        raise ValueError("GRPO needs at least one retained training prompt")
+    return min(requested, available_prompts)
+def build_grpo_prompt_dataset(prompts: list[dict]) -> tuple[list[dict], list]:
+    """Arrow-safe GRPO rollout rows + the parallel example lookup ``reward_fn`` maps back through.
+    ``Dataset.from_list`` lets PyArrow infer ONE column type per (nested) field across ALL rows, so
+    embedding the rich per-example record makes a *valid* env whose per-row ``info``/``metadata``
+    legitimately mixes types crash dataset construction with ``ArrowInvalid`` — and the whole RL
+    phase dies at startup, AFTER the paid GPU is provisioned, on input that passed offline
+    single-example validation. (Observed with ifeval-lite: ``metadata.param`` is an int target word
+    count for some rows and a required-word string ``'gentle'`` for others; Arrow infers ``int64``
+    from the leading rows then fails on the first string.)
+    Fix: keep the dataset columns trivially typed — the TRL-required ``prompt`` plus a stable integer
+    ``example_idx`` — and return the original example objects in a parallel list. ``reward_fn`` maps
+    the index back, so the env still sees its EXACT record (no JSON/Arrow round-trip, no type
+    coercion). ``rows[i]["example_idx"] == i`` and ``examples[i]`` is that row's record.
+    """
+    examples = [p["example"] for p in prompts]
+    rows = [{"prompt": p["prompt"], "example_idx": i} for i, p in enumerate(prompts)]
+    return rows, examples
+# Hard ceiling on the per-device completion micro-batch when growing on a SHORT-seq run. MEASURED
+# (RunPod, Qwen3.5-0.8B GRPO, group8, gsm8k, seq1024, 6 steps): trainer throughput rises from
+# per_device 4 -> 8 (~+12%) and plateaus 8..16 (A100 80GB: 375/407/411 tok/s at pd 4/8/16), then
+# REGRESSES at pd 32 (326 tok/s, -20%) as the larger forward stops buying MFU. So we never grow
+# past the top of that plateau, even on a card with VRAM to spare. (Reward histories at pd 4 and
+# 16 were identical -> per_device is a pure speed/VRAM knob, not an optimization change.)
+_RL_PER_DEVICE_MAX: int = 16
+# Reference sequence length the activation/VRAM divisor is calibrated at. The colocate activation
+# peak grows with the training sequence length; the cap is scaled by seq_len/_RL_ACT_SEQ_REF so a
+# short-seq run (the underfed regime) is allowed a proportionally bigger micro-batch.
+_RL_ACT_SEQ_REF: float = 2048.0
+# VRAM-per-(micro-batch element) divisor at the reference seq, normalized to ~2B width (1.41).
+# MEASURED: Qwen3.5-2B group8 seq2048 OOMs a 32 GB card at per_device=8 but trains at 4 ->
+# 32 / (7.5 * 1.0 * 1.0) = 4. (Unchanged from the historical colocate cap, so at/above the
+# reference seq the value is byte-for-byte the old one — no regression.)
+_RL_ACT_DIVISOR: float = 7.5
+# Floor on the seq scale: caps how far a short sequence may grow the micro-batch. Set so the
+# underfed case that motivated this — Qwen3.5-0.8B GRPO on a 24 GB card at seq<=1024 — lands on
+# the MEASURED-SAFE per_device 8 (RunPod RTX 4090 24 GB: pd8 fits at 19.0 GB and is +12.6% over
+# pd4, while the old seq-independent cap under-fed it at ~5; pd16 there would need ~27 GB -> OOM).
+# 24 / (7.5 * (0.894/1.41) * 0.63) = 8.0. Bounds short-seq growth to ~1.6x the reference cap.
+_RL_ACT_SEQ_SCALE_FLOOR: float = 0.63
+# Clamp the seq scale at 1.0 (never ABOVE the reference). Combined with the short_seq growth gate,
+# this makes a seq>=reference run byte-for-byte the old value: seq_scale==1.0 -> vram_cap == the
+# old colocate cap, and the ceiling falls back to the historical default, so min(default, ...) is
+# exactly what the old code returned. We deliberately do NOT tighten long-seq below the historical
+# value (grad checkpointing makes activations sub-linear in seq there, so the linear model would
+# over-cap), nor grow above it (unvalidated — the regression is in tokens-in-flight = pd x seq).
+_RL_ACT_SEQ_SCALE_CEIL: float = 1.0
+def rl_per_device_comps(
+    completion_len: int = 0,
+    vocab: int = 248_320,
+    *,
+    use_vllm: bool = True,
+    params_b: float | None = None,
+    seq_len: int = 0,
+) -> int:
+    """Per-device *completion* micro-batch for GRPO (TRL counts completions, not prompts).
+    This, not grad-accum, sets peak trainer VRAM AND the trainer step's MFU: a bigger
+    micro-batch means bigger, fewer GEMMs (less launch overhead, fuller tensor cores) at the
+    same effective batch (compute_grpo_batching pushes the remainder into grad-accum, so the
+    optimization is identical — only speed/VRAM change). MEASURED on RunPod (Qwen3.5-0.8B GRPO,
+    group8, seq1024): the old seq-independent colocate cap under-fed a 24 GB card at per_device ~5,
+    while per_device 8 fits (19.0 GB) and is +12.6% throughput; on an 80 GB card throughput
+    plateaus at per_device 8..16 and regresses by per_device 32. So on a SHORT-seq run we grow the
+    micro-batch into the card's measured VRAM headroom up to the plateau ceiling.
+    Growth is GATED to short sequences (seq < the reference). At/above the reference seq the value
+    is byte-for-byte the historical one — bigger per_device at long context is unvalidated and the
+    regression is driven by tokens-in-flight (per_device x seq), which a fixed-per_device ceiling
+    would not catch.
+    Two upper bounds cap the growth:
+    * **logits budget (6 GB)** — a HARD correctness cap. The logprob pass can materialize fp32
+      logits of shape [per_device, completion_len, vocab]; at Qwen3.5's ~248k vocab a long
+      completion is enormous (per_device 8 x 4096 tok x 248k x 4 B = ~30 GiB -> OOMs a small
+      card). Liger normally fuses these away, but this stays a safety net for the fallback path.
+    * **activation/VRAM cap** — the per-device forward holds the model's attention/activation
+      memory (the Qwen3.5 GDN/FLA kernels peak per micro-batch even with grad checkpointing),
+      which the logits term can't see and which Liger does NOT touch. Calibrated against the live
+      card's VRAM, model width (~sqrt(params)), and — unlike the old seq-independent cap — the
+      training sequence length: activations scale ~linearly with seq, so a SHORT-seq run gets a
+      proportionally bigger cap. MEASURED at seq_ref=2048: Qwen3.5-2B (width ~1.41) group8 OOMs a
+      32 GB card at per_device=8 but trains at 4 -> 32 / 7.5 = 4.
+    Off a live card (allocator / unit tests) there is no VRAM signal, so we fall back to the
+    conservative historical default (8, or 2 with thinking) bounded by the logits budget — the
+    allocator already provisions for that floor, and the worker only ever grows INTO the spare
+    VRAM the chosen card actually reports, so it cannot over-fill the card it was routed to.
+    """
+    default = 2 if THINKING else 8
+    # Logits budget: hard upper bound on the fp32 [per_device, completion, vocab] logprob tensor.
+    logits_cap = _RL_PER_DEVICE_MAX
+    if completion_len > 0:
+        logits_cap = max(1, int(6.0e9 / (max(1, completion_len) * vocab * 4)))
+    # Growth is gated to SHORT sequences (seq < the reference). At/above the reference seq the
+    # micro-batch is left exactly as the historical code computed it: bigger per_device at long
+    # context is unvalidated and risky — the measured throughput regression is driven by
+    # tokens-in-flight (per_device x seq), so per_device 16 at seq 2048 (~the regression-zone
+    # per_device 32 at seq 1024) could regress, and a fixed-per_device ceiling would not catch it.
+    short_seq = (seq_len or _RL_ACT_SEQ_REF) < _RL_ACT_SEQ_REF
+    # Activation/VRAM cap — only computable on a live card. It both caps DOWN (big model / small
+    # card / long seq) and, on a SHORT-seq run, lets the micro-batch GROW into spare VRAM.
+    vram_cap = None
+    if use_vllm:
+        try:
+            import torch
+            if torch.cuda.is_available():
+                vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
+                width = (max(float(params_b), 0.1) ** 0.5) if params_b else 1.41
+                seq_scale = min(
+                    _RL_ACT_SEQ_SCALE_CEIL,
+                    max(_RL_ACT_SEQ_SCALE_FLOOR, (seq_len or _RL_ACT_SEQ_REF) / _RL_ACT_SEQ_REF),
+                )
+                vram_cap = max(
+                    1, int(vram_gb / (_RL_ACT_DIVISOR * (width / 1.41) * seq_scale))
+                )
+        except Exception as e:
+            print("rl_per_device_comps colocate cap probe failed (keeping logits cap):", e)
+    if vram_cap is None:
+        # No live card (allocator / offline / unit tests): conservative default, logits-bounded.
+        return max(1, min(default, logits_cap))
+    # Short seq -> grow into measured VRAM headroom up to the plateau ceiling. At/above the
+    # reference seq the ceiling is the historical default, and seq_scale is clamped to 1.0 so
+    # vram_cap == the old colocate cap -> the result is byte-for-byte the old value (no regression,
+    # no unvalidated long-seq growth).
+    #
+    # THINKING runs are EXCLUDED from the growth path: they emit long completions whose
+    # activation/logprob cost the prompt-only `seq_len` gate cannot see, so letting short-seq
+    # growth raise the ceiling to _RL_PER_DEVICE_MAX would silently override the conservative
+    # thinking default (2) and risk OOM / unstable training. They keep `default` as the ceiling,
+    # i.e. byte-for-byte the historical value.
+    ceiling = _RL_PER_DEVICE_MAX if (short_seq and not THINKING) else default
+    return max(1, min(ceiling, logits_cap, vram_cap))
+# Minimum seconds between GPU-diagnostics snapshots attached to per-step heartbeats; the RL and
+# SFT log callbacks in this module both gate their `gpu_diagnostics()` call on it.
+_STEP_GPU_DIAG_INTERVAL_S: float = 300.0
+# Minimum seconds between consecutive "sft_step" heartbeats sent by the SFT log callback.
+_SFT_HEARTBEAT_INTERVAL_S: float = 60.0
+def make_reward_heartbeat_callback():
+    """A TRL/transformers callback that streams the per-step mean reward to the HF heartbeat
+    channel, giving the worker a live RL signal (no pod log API) and recording a
+    ``reward_history``. Built lazily so the module imports without transformers installed."""
+    from transformers import TrainerCallback
+    class _RewardHeartbeat(TrainerCallback):
+        def __init__(self):
+            self.reward_history = []
+            self.last_gpu_diag_at = 0.0
+        def on_log(self, args, state, control, logs=None, **kwargs):
+            if not logs:
+                return
+            r = logs.get("reward")
+            if r is None:
+                return
+            try:
+                r = float(r)
+            except (TypeError, ValueError):
+                return
+            self.reward_history.append(r)
+            step = int(getattr(state, "global_step", len(self.reward_history)))
+            payload = {
+                "step": step,
+                "reward": r,
+                "reward_last": self.reward_history[-8:],
+            }
+            now = time.monotonic()
+            if (
+                self.last_gpu_diag_at == 0.0
+                or now - self.last_gpu_diag_at >= _STEP_GPU_DIAG_INTERVAL_S
+            ):
+                payload["gpu"] = gpu_diagnostics()
+                self.last_gpu_diag_at = now
+            heartbeat("rl_step", **payload)
+    return _RewardHeartbeat()
+def make_sft_heartbeat_callback():
+    """Stream SFT trainer logs so a run is not silent between model load and completion."""
+    from transformers import TrainerCallback
+    class _SFTHeartbeat(TrainerCallback):
+        def __init__(self):
+            self.last_heartbeat_at = 0.0
+            self.last_gpu_diag_at = 0.0
+        def on_log(self, args, state, control, logs=None, **kwargs):
+            if not logs:
+                return
+            now = time.monotonic()
+            if self.last_heartbeat_at and now - self.last_heartbeat_at < _SFT_HEARTBEAT_INTERVAL_S:
+                return
+            self.last_heartbeat_at = now
+            payload = {
+                "step": int(getattr(state, "global_step", 0) or 0),
+                "epoch": logs.get("epoch"),
+                "loss": logs.get("loss"),
+                "grad_norm": logs.get("grad_norm"),
+                "learning_rate": logs.get("learning_rate"),
+            }
+            if (
+                self.last_gpu_diag_at == 0.0
+                or now - self.last_gpu_diag_at >= _STEP_GPU_DIAG_INTERVAL_S
+            ):
+                payload["gpu"] = gpu_diagnostics()
+                self.last_gpu_diag_at = now
+            heartbeat("sft_step", **{k: v for k, v in payload.items() if v is not None})
+    return _SFTHeartbeat()
+def grpo_overrides() -> dict:
+    """The GRPO recipe knobs, read off the job spec's ``[train]`` table (``TrainSpec``).
+    A field left unset (None) is omitted here so the recipe default applies downstream.
+    Knobs: group_size, temperature, max_tokens (completion budget), kl_penalty_coef (the KL
+    beta), advantage_clip (centered-advantage clip), and thinking_length_penalty_coef
+    (a per-<think>-token reward deduction). These live in ``[train]`` — NOT in
+    ``[environment.params]``, which is forwarded verbatim to the Freesolo env loader."""
+    if not JOB_SPEC:
+        return {}
+    train = JOB_SPEC.train
+    cfg = {
+        "group_size": train.group_size,
+        "temperature": train.temperature,
+        "max_tokens": train.max_tokens,
+        "kl_penalty_coef": train.kl_penalty_coef,
+        "advantage_clip": train.advantage_clip,
+        "thinking_length_penalty_coef": train.thinking_length_penalty_coef,
+    }
+    return {k: v for k, v in cfg.items() if v is not None}
+def think_token_count(completion: str | None, tokenizer) -> int:
+    """Number of tokens inside the completion's <think>...</think> span (0 if none).
+    Used for the thinking-length reward deduction: long reasoning is penalized in
+    proportion to the tokens it spent, mirroring the SDK's thinking_length_penalty_coef.
+    """
+    if not completion or "<think>" not in completion:
+        return 0
+    after = completion.split("<think>", 1)[1]
+    think_text = after.split("</think>", 1)[0] if "</think>" in after else after
+    if not think_text:
+        return 0
+    return len(tokenizer(think_text, add_special_tokens=False)["input_ids"])
+def _init_adapter_model(model_id: str):
+    """Base model + the ``train.init_from_adapter`` adapter loaded as a trainable
+    PeftModel, or the plain ``model_id`` string + a fresh LoRA when it is unset.
+    GRPO continuing an SFT adapter: TRL trains the LOADED adapter (peft_config=None)
+    instead of attaching a fresh one.
+    Returns a 2-tuple ``(model, peft_config)``:
+    * ``(model_id, make_lora(model_id))`` when no adapter prefix is configured (or there is no
+      job spec);
+    * ``(PeftModel, None)`` when the adapter was downloaded and loaded.
+    Raises ``RuntimeError`` when an adapter prefix is configured but ``_download_adapter``
+    returns a falsy result. The ``assert_*`` helpers called at the end are defined elsewhere;
+    presumably they raise when the adapter did not apply cleanly (confirm in
+    flash/engine/worker/lora.py)."""
+    prefix = JOB_SPEC.train.init_from_adapter if JOB_SPEC else ""
+    if not prefix:
+        return model_id, make_lora(model_id)
+    adir = _download_adapter(prefix)
+    if not adir:
+        # The user explicitly asked GRPO to continue from this adapter; silently
+        # falling back to a fresh base-model LoRA would spend a full paid run
+        # optimizing the wrong starting point. Fail hard instead.
+        raise RuntimeError(
+            f"train.init_from_adapter={prefix!r} could not be downloaded from the artifact "
+            "store (wrong/missing prefix or no access); refusing to silently start GRPO from "
+            "the base model. Fix the adapter prefix / HF credentials, or omit "
+            "init_from_adapter to train a fresh LoRA."
+        )
+    # Imported lazily, only on the adapter-continuation path.
+    from peft import PeftModel
+    from transformers import AutoModelForCausalLM
+    print(f"[init-adapter] initializing LoRA from {prefix}")
+    # VL checkpoints (Qwen3.5/3.6): the SFT step saved the adapter against the FULL multimodal model
+    # (keys under ``base_model.model.model.language_model.layers.*``), but we load the base here via
+    # AutoModelForCausalLM (text-only tree, ``base_model.model.model.layers.*``). Strip the
+    # ``.language_model.`` infix on disk so PeftModel.from_pretrained matches the SFT keys —
+    # otherwise peft only WARNS about missing keys and silently trains a fresh LoRA, discarding the
+    # SFT. No-op for non-VL checkpoints. See flash/engine/worker/lora.py.
+    remap_vl_adapter_dir(adir, model_id)
+    # Only forward attn_implementation when optimal_attn_impl() returns a truthy value.
+    _attn = optimal_attn_impl()
+    base = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        dtype="bfloat16",
+        # NOTE(review): trust_remote_code=True executes code shipped with the checkpoint;
+        # presumably model_id is restricted to a vetted catalog upstream — confirm.
+        trust_remote_code=True,
+        **({"attn_implementation": _attn} if _attn else {}),
+    )
+    model = PeftModel.from_pretrained(base, adir, is_trainable=True)
+    # Fail loudly if the adapter didn't actually apply (a key mismatch would otherwise silently start
+    # GRPO from the base model again). from_pretrained loads with load_state_dict(strict=False) and
+    # only WARNS on a mismatch, discarding the load result — so re-run load_adapter to CAPTURE which
+    # keys matched and assert matched==saved (peft injects the LoRA modules from target_modules BEFORE
+    # loading weights, so the module-count check alone can't see a silent weight discard). The reload
+    # is idempotent: same weights into the same "default" adapter. See flash/engine/worker/lora.py.
+    # Mirror from_pretrained's key_mapping: for transformers models that define a
+    # ``_checkpoint_conversion_mapping`` (renamed-arch checkpoints), from_pretrained remaps the adapter
+    # keys before loading; the reload must apply the SAME mapping or it would reinterpret valid keys as
+    # mismatched and falsely abort. peft reads it off the base model (peft_model.py from_pretrained).
+    key_mapping = getattr(base, "_checkpoint_conversion_mapping", None)
+    load_result = model.load_adapter(
+        adir, adapter_name="default", is_trainable=True, key_mapping=key_mapping
+    )
+    # Three independent checks on the loaded adapter before handing it to the trainer.
+    assert_adapter_load_clean(load_result, model_id)
+    assert_lora_applied(model, model_id)
+    assert_adapter_delta_nonzero(model, model_id)
+    # peft_config is None so the caller trains the loaded adapter instead of attaching a new one.
+    return model, None
+def _grpo_resume_already_complete(resume_ckpt, target_steps: int, steps_run: int) -> bool:
+    """True when this worker resumed a checkpoint that already reached the target step count.
+    Such a resume legitimately performs ZERO new optimizer steps (so the fresh hb_cb has an empty
+    reward_history) yet the policy IS fully trained — it must NOT be flagged as a no-op failure.
+    """
+    return bool(resume_ckpt) and target_steps > 0 and steps_run >= target_steps
+def _grpo_is_no_op_failure(reward_history, resume_ckpt, target_steps: int, steps_run: int) -> bool:
+    """True when a GRPO run trained NOTHING and must fail loudly instead of reporting as done.
+    An empty ``reward_history`` means the reward callback never fired — the rollout scored nothing
+    (e.g. vLLM silently returning no completions), so no real training happened. The sole exception
+    is a resume that already reached the target steps (see ``_grpo_resume_already_complete``): that
+    has an empty fresh history but a fully-trained policy, so it is NOT a failure.
+    """
+    if reward_history:
+        return False
+    return not _grpo_resume_already_complete(resume_ckpt, target_steps, steps_run)
+def run_rl():
+    from datasets import Dataset
+    from transformers import AutoTokenizer
+    from trl import GRPOConfig, GRPOTrainer
+    env = require_active_env()  # fail loudly (not AttributeError: NoneType) on the no-JobSpec path
+    t_start = time.time()
+    heartbeat("rl_start", gpu=gpu_diagnostics())
+    # GRPO rollout strategy by env shape (trl 1.6 adds the hooks these need):
+    #   * single-turn          -> TRL single-shot generation + per-completion reward (below);
+    #   * tool (ToolEnv & subs:
+    #     Stateful/Sandbox/Python) -> TRL drives the tool-call loop natively via
+    #     GRPOTrainer(tools=...) (it parses tool calls, executes the tools, and masks the
+    #     tool-result tokens itself); the reward scores the full transcript;
+    #   * pure multi-turn      -> a custom rollout_func (flash.engine.multiturn_rollout)
+    #     drives THIS env's turn loop on the colocate engine and returns the interleaved
+    #     token sequence with an env_mask so only the model's tokens are trained.
+    is_tool_env = getattr(env, "is_tool_env", False)
+    is_multi_turn = getattr(env, "multi_turn", False)
+    conversational = is_multi_turn  # message-list prompts (tool + pure multi-turn) vs strings
+    if is_multi_turn:
+        # The Liger fused GRPO loss (use_liger_kernel, kept ON to avoid the 248k-vocab fp32-logits
+        # OOM) torch.compiles, and on the VARIABLE-length multi-turn completions its dynamo guard
+        # build trips a torch 2.10 bug (symbol_to_source IndexError) that crashes the first
+        # training step. Let dynamo FALL BACK TO EAGER for the offending function instead of
+        # raising. This is NOT `TORCHDYNAMO_DISABLE` (which would also break the colocate vLLM
+        # engine's required compilation) — dynamo stays enabled; only erroring graphs run eager.
+        try:
+            import torch._dynamo
+            torch._dynamo.config.suppress_errors = True
+            print("[rl] multi-turn: torch._dynamo suppress_errors=True (Liger loss falls back to eager on dynamic shapes)")
+        except Exception as exc:  # never let a torch internals change block the run
+            print(f"[rl] could not set torch._dynamo.suppress_errors: {exc!r}")
+    wait_for_gpu()
+    setup_perf_backends()
+    model_id = JOB_SPEC.model if JOB_SPEC else RECIPE.hf_model_id
+    download_seconds = prefetch_model(model_id)
+    rl = RECIPE.rl
+    # Steps come from the run's [train] steps (already in JOB_SPEC), else the recipe default.
+    steps = int(
+        JOB_SPEC.train.steps if JOB_SPEC and JOB_SPEC.train.steps is not None else rl.num_steps
+    )
+    # Throughput/quality knobs: the number of prompts optimized per step, completions per
+    # prompt, and whether vLLM offloads weights between steps. Sleep mode frees memory for the
+    # optimizer but reloads ~weights each step (a large per-step cost); it's gated OFF by model
+    # size when both the policy and rollout engine fit resident.
+    gcfg = grpo_overrides()
+    _t = JOB_SPEC.train if JOB_SPEC else None
+    # batch_size = prompts per optimizer step for GRPO.
+    # prompts per optimizer step = the run config's [train].batch_size (recipe default otherwise).
+    prompts_per_step = int(
+        _t.batch_size if _t and _t.batch_size is not None else rl.prompts_per_step
+    )
+    group_size = int(gcfg.get("group_size") or rl.group_size)
+    # temperature: explicit None check, NOT `or` — a configured 0.0 (greedy/deterministic
+    # rollouts) must be honored, not fall back to the recipe sampling temperature.
+    _gcfg_temp = gcfg.get("temperature")
+    _temperature = float(_gcfg_temp if _gcfg_temp is not None else rl.sampling_temperature)
+    _kl_beta = float(gcfg.get("kl_penalty_coef") or 0.0)
+    _adv_clip = float(gcfg.get("advantage_clip") or 0.0)
+    _think_penalty = float(gcfg.get("thinking_length_penalty_coef") or 0.0)
+    # vLLM sleep mode offloads the rollout engine's weights between steps to free memory for the
+    # optimizer, but reloading each step is a large per-step cost (PR #174 measured ~2-2.6x faster
+    # GRPO with it OFF on models that fit) AND on the large-model GRPO path the sleep/wake cycle
+    # STALLS the colocated rollout (the rollout emits unparseable completions, then the worker
+    # hangs mid-training). So enable sleep only when the run genuinely can't fit RESIDENT on THIS
+    # card: large/long-context AND the policy + colocated rollout engine + training peak don't fit
+    # on the live GPU. When they fit (the common allocator-sized case), skip sleep entirely.
+    _grpo_ctx = int(_t.max_length if _t and _t.max_length else 0)
+    _card_vram_gb = 0.0
+    try:
+        import torch as _torch_card
+        if _torch_card.cuda.is_available():
+            # Binary GiB (/(1024**3)), NOT decimal GB (/1e9 over-reports ~7%): grpo_fits_resident's
+            # VRAM estimate is in GiB, so a decimal card size would make a marginal card look big
+            # enough to fit resident and wrongly disable sleep, risking OOM.
+            _card_vram_gb = _torch_card.cuda.get_device_properties(0).total_memory / (1024**3)
+    except Exception as _e:
+        print("[rl] card VRAM probe failed (sleep-mode gate falls back to size/context):", _e)
+    _lora_rank = int(_t.lora_rank) if _t and _t.lora_rank else 32
+    sleep_mode = grpo_sleep_mode(
+        model_id,
+        max_length=_grpo_ctx,
+        group_size=group_size,
+        max_tokens=gcfg.get("max_tokens"),
+        lora_rank=_lora_rank,
+        thinking=THINKING,
+        card_vram_gb=_card_vram_gb,
+    )
+    print(
+        f"[rl] vLLM sleep mode = {sleep_mode} "
+        f"(model={model_id}, ctx={_grpo_ctx}, card={_card_vram_gb:.0f}GB)"
+    )
+    # Rollout backend: always colocated vLLM (fast). The whole supported catalog runs GRPO with
+    # colocated vLLM; there is no transformers-generation fallback.
+    use_vllm = True
+    print("[rl] rollout backend: colocated vLLM")
+    from flash.catalog import MODELS as _CATALOG
+    _info = _CATALOG.get(model_id)
+    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    if tok.pad_token is None:
+        tok.pad_token = tok.eos_token
+    train = env.dataset()
+    rng = random.Random(SEED)
+    rng.shuffle(train)
+    if conversational:
+        # Message-list prompts so the chat template applies roles + (for tool envs) the tool
+        # schemas; per-turn length is managed by the tool loop / rollout_func, not a flat budget.
+        prompts = [{"prompt": env.prompt_messages(ex), "example": ex} for ex in train]
+    else:
+        prompts = [{"prompt": render_prompt(tok, ex), "example": ex} for ex in train]
+    # The colocated vLLM engine's model length is the hard cap on prompt+completion at
+    # rollout. Size it from [train].max_length and derive the prompt budget from it so a
+    # bigger engine or a smaller completion automatically admits longer prompts (rather than
+    # a fixed rl.max_prompt_len that no env override could lift).
+    _max_completion = int(
+        gcfg.get("max_tokens")
+        or (rl.max_completion_len_thinking if THINKING else rl.max_completion_len)
+    )
+    # Engine context = the run's [train].max_length (so a long-context GRPO config sized/paid for
+    # by the allocator actually RUNS at that length), else the recipe default. Without the
+    # train.max_length fallback the allocator provisions a big GPU for the long context but the
+    # engine runs short — paying for headroom we never use.
+    _train_ctx = _t.max_length if (_t and _t.max_length) else 0
+    vllm_max_len = int(_train_ctx or max(1024, rl.max_prompt_len + _max_completion))
+    # The engine must fit completion + at least some prompt. If [train].max_length is below the
+    # completion budget, no prompt can ever fit — fail fast here rather than passing a 1-token
+    # budget that lets prompts through and then OOMs/overflows mid-rollout.
+    if vllm_max_len <= _max_completion:
+        raise ValueError(
+            f"engine length {vllm_max_len} leaves no room for the {_max_completion}-token "
+            "completion; raise [train].max_length or lower [train].max_tokens"
+        )
+    prompt_budget = vllm_max_len - _max_completion
+    # TRL 1.5's GRPOConfig has no max_prompt_length and does NOT truncate prompts, so a prompt
+    # that leaves no room for the completion within the engine length would fail mid-rollout
+    # AFTER the paid worker is provisioned. Drop prompts that don't fit the budget up front.
+    # render_prompt returns an apply_chat_template(tokenize=False) string that already carries
+    # the special tokens, so tokenize with add_special_tokens=False (the default re-adds
+    # BOS/EOS and over-counts).
+    # Drop prompts that leave no room for the completion within the engine length — applies to
+    # BOTH single-turn (string prompts) and conversational (message-list) prompts, so a tool /
+    # multi-turn rollout can't overflow the colocate engine mid-generation. Conversational
+    # prompts are length-checked via the chat template (with the generation prompt).
+    # Tool schemas TRL injects into the prompt for native tools= GRPO — include them in the
+    # budget for a tool env so a prompt isn't undercounted at filter time vs. rollout time.
+    _oai_tools = (
+        getattr(getattr(env, "_env", None), "oai_tools", None) if is_tool_env else None
+    )
+    def _prompt_tokens(p) -> int:
+        if conversational:
+            # Render to text then tokenize — the SAME path the rollout uses — so the filter
+            # count matches the rollout's count (avoids a tokenize=True vs text mismatch).
+            kw = {"tools": _oai_tools} if _oai_tools else {}
+            try:
+                text = tok.apply_chat_template(
+                    p["prompt"],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                    enable_thinking=THINKING,
+                    **kw,
+                )
+            except Exception as exc:
+                # Fail fast WITH context: a tokenizer/template incompatibility would render every
+                # prompt uncountable and otherwise surface as a misleading "all prompts exceed
+                # budget" — raise so the model/template can be fixed before a paid run trains on
+                # a degenerate dataset.
+                raise RuntimeError(
+                    "failed to render a conversational prompt with this model's chat template "
+                    f"(fix the model/template or the env's prompts): {exc}"
+                ) from exc
+            return len(tok(text, add_special_tokens=False).input_ids)
+        return len(tok(p["prompt"], add_special_tokens=False).input_ids)
+    kept = [p for p in prompts if 0 < _prompt_tokens(p) <= prompt_budget]
+    if len(kept) < len(prompts):
+        print(
+            f"[rl] dropped {len(prompts) - len(kept)} prompts over the {prompt_budget}-token "
+            f"prompt budget (engine {vllm_max_len} - completion {_max_completion})"
+        )
+    if not kept:
+        raise ValueError(
+            f"every training prompt exceeds the {prompt_budget}-token prompt budget (engine "
+            f"{vllm_max_len} - completion {_max_completion}); raise [train].max_length, lower "
+            "[train].max_tokens, or shorten the environment's prompts"
+        )
+    prompts = kept
+    resolved_prompts_per_step = resolve_grpo_prompts_per_step(prompts_per_step, len(prompts))
+    if resolved_prompts_per_step != prompts_per_step:
+        print(
+            f"[rl] lowering prompts_per_step from {prompts_per_step} to "
+            f"{resolved_prompts_per_step}: only {len(prompts)} prompt(s) fit after filtering"
+        )
+        prompts_per_step = resolved_prompts_per_step
+    # Carry a stable integer index instead of the rich record so PyArrow can't crash on an env whose
+    # per-row info/metadata legitimately mixes types (see build_grpo_prompt_dataset). reward_fn maps
+    # the index back to the original example object below.
+    ds_rows, rollout_examples = build_grpo_prompt_dataset(prompts)
+    ds = Dataset.from_list(ds_rows)
+    def reward_fn(completions, **kwargs):
+        # rollout_func (pure multi-turn) path: the per-rollout reward is computed by the env
+        # during the rollout and forwarded as the "reward" extra field — pass it through.
+        if kwargs.get("reward") is not None:
+            return [float(r) for r in kwargs["reward"]]
+        # Score the <think>-stripped text (graded_text), then — datums parity — deduct
+        # the thinking-length penalty computed from the RAW completion's <think> span.
+        # The dataset carries example_idx (not the record); map each back to its original object.
+        # Fail LOUD if TRL stops forwarding example_idx (column pruning / a TRL change): defaulting to
+        # [] would zip to ZERO examples -> empty rewards -> silent no-op / broken training (issues
+        # #206 / #210). A reward over the wrong/empty examples is far worse than crashing the run.
+        example_idx = kwargs.get("example_idx")
+        if example_idx is None:
+            raise RuntimeError(
+                "GRPO reward_fn received no 'example_idx' column from TRL — the reward cannot be "
+                "mapped back to its training example, so every reward would be empty/misaligned "
+                f"(got kwargs keys {sorted(kwargs)}). This usually means TRL dropped the dataset "
+                "column (remove_unused_columns / a TRL version change); the run is aborted rather "
+                "than silently training on no signal."
+            )
+        if len(example_idx) != len(completions):
+            raise RuntimeError(
+                f"GRPO reward_fn example_idx/completions length mismatch "
+                f"({len(example_idx)} vs {len(completions)}) — rewards would be misaligned with "
+                "the sampled completions; aborting rather than training on a shifted reward signal."
+            )
+        examples = [rollout_examples[int(i)] for i in example_idx]
+        rewards = []
+        debug_rows = []
+        for idx, (comp, ex) in enumerate(zip(completions, examples, strict=False)):
+            if isinstance(comp, list):
+                # Tool / conversational transcript (TRL passes a list of messages): score the
+                # whole transcript via the environment reward (no <think> stripping —
+                # multi-turn content).
+                r = env.reward_from_messages(comp, ex)
+                rewards.append(r)
+                continue
+            graded = graded_text(comp)
+            breakdown = None
+            if hasattr(env, "scores_breakdown"):
+                breakdown = env.scores_breakdown(graded, ex)
+                r = float(breakdown.get("total", 0.0))
+            else:
+                r = env.reward(graded, ex)
+            if _think_penalty > 0 and THINKING:
+                r -= _think_penalty * think_token_count(comp, tok)
+            rewards.append(r)
+            if idx < 8:
+                debug_rows.append(
+                    {
+                        "ts": time.time(),
+                        "attempt": ATTEMPT,
+                        "run_id": RUN_ID,
+                        "mode": RUN_MODE,
+                        "seed": SEED,
+                        "reward": r,
+                        "breakdown": breakdown,
+                        "completion_prefix": str(comp or "")[:1000],
+                        "graded_prefix": str(graded or "")[:1000],
+                        "example_id": (ex or {}).get("id") if isinstance(ex, dict) else None,
+                        "example_input": (ex or {}).get("input") if isinstance(ex, dict) else None,
+                    }
+                )
+        upload_debug_jsonl("reward_debug.jsonl", debug_rows)
+        return rewards
+    # TRL's per_device_train_batch_size counts COMPLETIONS, not prompts. Size grad-accum so
+    # the global completion batch = prompts_per_step * group_size, i.e. each optimizer step
+    # actually optimizes `prompts_per_step` prompts. The per-device *completion* micro-batch
+    # is the VRAM knob (thinking-aware; see rl_per_device_comps).
+    from flash.engine.vram import resolve_params_b
+    # Open-model (uncataloged) GRPO: size the colocate activation cap from the catalog stat, else
+    # the HF safetensors metadata (no download). Without a real count a large open model falls back
+    # to the ~2B-width default in rl_per_device_comps and gets too LOOSE a per-device cap ->
+    # colocate OOM. Best-effort: stays None offline, keeping prior behavior.
+    _params_b = resolve_params_b(model_id)
+    from flash.catalog import vocab_size_for
+    # Per-device completion-logits cap: a multi-turn rollout accumulates a FULL transcript (model
+    # turns + masked env tokens) up to the engine context — far longer than the single-turn per-turn
+    # budget `_max_completion` — and the trainer's logprob forward processes that whole completion.
+    # So size the fp32 [per_device, completion, vocab] cap against the WORST-CASE multi-turn
+    # completion length (the engine context) instead of `_max_completion`, or a long multi-turn run
+    # OOMs the trainer forward. Single-turn keeps `_max_completion` (its true completion length).
+    _cap_completion_len = vllm_max_len if is_multi_turn else _max_completion
+    per_device_comps = rl_per_device_comps(
+        _cap_completion_len,
+        vocab=vocab_size_for(model_id),
+        use_vllm=use_vllm,
+        params_b=_params_b,
+        # The trainer forward processes prompt+completion up to the engine context, so the
+        # activation/VRAM cap is sized against the worst-case training sequence length.
+        seq_len=vllm_max_len,
+    )
+    if is_multi_turn and _cap_completion_len != _max_completion:
+        print(
+            f"[rl] multi-turn: sizing the per-device logits cap against the full transcript length "
+            f"{_cap_completion_len} (engine context), not the per-turn budget {_max_completion}"
+        )
+    batching = compute_grpo_batching(prompts_per_step, group_size, per_device_comps)
+    if not batching["divisible_by_group"]:
+        print(
+            "WARN: generation batch not divisible by group size; check prompts_per_step/group_size"
+        )
+    print(
+        f"[rl] GRPO batching: per_device={batching['per_device_train_batch_size']} "
+        f"grad_accum={batching['gradient_accumulation_steps']} "
+        f"generations/step={batching['generations_per_step']} "
+        f"unique_prompts/step={batching['unique_prompts_per_step']} "
+        f"(target prompts/step={prompts_per_step}, group={group_size}, sleep={sleep_mode})"
+    )
+    out_dir = f"/tmp/rl_seed{SEED}"
+    resume_ckpt = hf_resume_checkpoint()
+    grpo_kwargs = {
+        "output_dir": out_dir,
+        "learning_rate": (
+            _t.learning_rate if _t and _t.learning_rate is not None else rl.learning_rate
+        ),
+        "per_device_train_batch_size": batching["per_device_train_batch_size"],
+        "gradient_accumulation_steps": batching["gradient_accumulation_steps"],
+        "num_generations": group_size,
+        # NB: GRPOConfig has no max_prompt_length field (TRL 1.5) and does not truncate
+        # prompts; the dataset is pre-filtered above to prompts that fit prompt_budget
+        # (vllm_max_len - completion), so every prompt fits the engine sized here.
+        "max_completion_length": _max_completion,
+        "max_steps": steps,
+        "temperature": _temperature,
+        "top_p": rl.sampling_top_p,
+        "use_vllm": use_vllm,
+        "logging_steps": 1,
+        "save_steps": _t.save_every if _t and _t.save_every is not None else 20,
+        "save_total_limit": 1,
+        # Resumable checkpoints: keep the optimizer/scheduler/RNG state with the LoRA adapter so a
+        # preempted GRPO run resumed via resume_from_checkpoint(hf_resume_checkpoint()) continues
+        # with intact optimizer state + step instead of a fresh optimizer. For LoRA this state is
+        # small (trainable adapter params only). The deployable per-step snapshot strips it
+        # separately, so serving still gets adapter-only files.
+        "save_only_model": False,
+        "bf16": True,
+        "report_to": wandb_report_to(),  # W&B when WANDB_API_KEY present (restored post-flash-migration)
+        "run_name": wandb_run_name(),
+        "seed": SEED,
+        "gradient_checkpointing": grad_checkpointing_on(model_id, vllm_max_len),
+        # Non-reentrant checkpointing: the modern path that composes correctly with autograd
+        # saved-tensor hooks and avoids the reentrant path's extra graph retention. (verl #3629.)
+        "gradient_checkpointing_kwargs": {"use_reentrant": False},
+        # Pin a stable, well-conditioned GRPO recipe instead of inheriting TRL's defaults
+        # (which on a short run suppress the lift): constant LR (TRL default 'linear' decays
+        # to 0 over the run), advantages centered by group mean only (no std scaling, which
+        # biases by difficulty/length — matches datums.centered_advantages), and no
+        # length-normalized loss. beta is the KL-to-reference coef (datums kl_masks ->
+        # kl_penalty_coef).
+        "lr_scheduler_type": "constant",
+        "warmup_ratio": 0.0,
+        "beta": _kl_beta,
+        "scale_rewards": "none",
+        "loss_type": "dr_grpo",
+        # Optimizer: 8-bit paged AdamW (int8 state paged to host RAM -> fits a smaller GPU);
+        # colocated GRPO (trainer + vLLM on one GPU) is memory-tight, so this is the right default.
+        "optim": fused_optim_name(),
+    }
+    # Liger fused GRPO loss: fuses the lm_head + per-token logprob so the fp32
+    # [batch, seq, ~248k vocab] logits never materialize — the documented GRPO OOM driver.
+    # TRL 1.6's GRPOConfig flag is `use_liger_kernel` (NOT `use_liger_loss`, which doesn't
+    # exist in 1.6). DEFAULT ON for the GRPO path regardless of model size: MEASURED that
+    # WITHOUT it even Qwen3.5-0.8B GRPO OOMs a 24 GB (and 32 GB) card because the per-completion
+    # logits over the 248k vocab dominate — the small-scale JIT cost is far cheaper than the OOM.
+    # (This differs from SFT, where Liger is gated by size since 1B-class SFT can be net-negative.)
+    if liger_on(True):
+        grpo_kwargs["use_liger_kernel"] = True
+        print("[rl] liger fused GRPO loss enabled")
+    if use_vllm:
+        # RTX 5090 / sm120: pin a PTX-independent vLLM attention backend (FLASHINFER) BEFORE TRL
+        # builds the colocated engine — else the rollout can silently produce no completions on
+        # old-driver Blackwell hosts (flash-attn PTX JIT failure). No-op off sm120 / if pinned.
+        force_vllm_backend_for_sm120()
+        # Colocate shares one GPU between the policy model and the vLLM rollout engine.
+        # vllm_max_model_length bounds the KV cache to what GRPO needs (else vLLM sizes for
+        # the model's FULL context and won't start on a consumer GPU).
+        # vllm_gpu_memory_utilization sizes vLLM's KV pool. The blanket sleep-path 0.45 was a
+        # misjudgement: on an 80 GB A100 it reserves 0.45 x 80 = 36 GB of KV, but a GRPO rollout only
+        # holds ~num_generations x context tokens. MEASURED (Qwen3.5-4B colocate): that 36 GB
+        # reservation is the dominant resident allocation and sets the step peak (~46 GB) — exactly why
+        # trainer-side optimisations (mask-aware lm_head, fused layers) moved nothing. colocate_kv_util
+        # sizes both paths from flash's per-model KV estimate instead (vram.py); MEASURED 4B/80 GB peak
+        # 46 -> 26 GB, reward byte-identical, train_wall neutral.
+        try:
+            import torch as _torch_vram
+            from flash.engine.vram import colocate_kv_util
+            _total_vram_gb = _torch_vram.cuda.get_device_properties(0).total_memory / 1e9
+            _vllm_gpu_mem_util = colocate_kv_util(
+                _params_b, vllm_max_len, _total_vram_gb, sleep_mode, num_generations=group_size
+            )
+        except Exception:
+            _vllm_gpu_mem_util = 0.45 if sleep_mode else 0.10  # safe fallback to the old constants
+        grpo_kwargs.update(
+            vllm_mode="colocate",
+            vllm_max_model_length=vllm_max_len,
+            vllm_gpu_memory_utilization=_vllm_gpu_mem_util,
+            vllm_enable_sleep_mode=sleep_mode,
+        )
+        # Rollout-memory + throughput knobs, applied ONLY if this TRL exposes the field (so an
+        # older TRL never crashes on an unknown kwarg). All verl-validated for GRPO colocate (#174).
+        _grpo_fields = set(getattr(GRPOConfig, "__dataclass_fields__", {}))
+        def _set_vllm_field(names, value, label):
+            for _f in names:
+                if _f in _grpo_fields:
+                    grpo_kwargs[_f] = value
+                    print(f"[rl] {label} ({_f}={value})")
+                    return True
+            return False
+        # fp8 KV cache only where the silicon has native fp8 (compute capability >= 8.9: Ada /
+        # Hopper / Blackwell) — ~halves the rollout KV pool. Ampere (A100/A6000/3090) lacks
+        # fp8, so it stays fp16 there (forcing it on would error / silently emulate).
+        try:
+            import torch as _torch
+            _want_fp8 = _torch.cuda.get_device_capability() >= (8, 9)
+        except Exception:
+            _want_fp8 = False
+        if _want_fp8:
+            _set_vllm_field(("vllm_kv_cache_dtype", "kv_cache_dtype"), "fp8", "fp8 KV cache")
+        # PREFIX CACHING: every GRPO group of `num_generations` rollouts shares the SAME prompt
+        # prefix, so caching the prompt KV computes it once and reuses it — the dominant rollout win
+        # on one GPU. CHUNKED PREFILL interleaves prefill with decode so a long prompt doesn't stall
+        # the batch. CUDAGRAPH MODE sets verl's full-graph-decode + piecewise-fallback rollout mode.
+        _set_vllm_field(
+            ("vllm_enable_prefix_caching", "enable_prefix_caching"),
+            True,
+            "vLLM prefix caching (shared GRPO prompt KV reuse)",
+        )
+        _set_vllm_field(
+            ("vllm_enable_chunked_prefill", "enable_chunked_prefill"),
+            True,
+            "vLLM chunked prefill",
+        )
+        # vLLM 0.19.1 regressed the Triton _compute_slot_mapping_kernel: it launches
+        # (num_reqs + 1) thread blocks but the block table only has num_reqs rows, so the
+        # extra block causes an illegal memory access (cudaErrorIllegalAddress) on the first
+        # generation step. CUDA graph compilation triggers this path. Skip FULL_AND_PIECEWISE
+        # for vLLM versions outside TRL's supported range (0.12.0-0.19.0) until a fix lands.
+        _cudagraph_safe = True
+        try:
+            import vllm as _vllm_mod
+            _ver_base = _vllm_mod.__version__.split("+")[0]  # strip PEP440 local (e.g. +cu121)
+            _vllm_ver = tuple(int(x) for x in _ver_base.split(".")[:3])
+            if _vllm_ver > (0, 19, 0):
+                _cudagraph_safe = False
+                print(
+                    f"[rl][warn] vLLM {_vllm_mod.__version__} > 0.19.0: skipping "
+                    "FULL_AND_PIECEWISE CUDA graph compilation (Triton slot-mapping "
+                    "crash workaround; update vLLM to a TRL-supported version to re-enable)"
+                )
+                # vLLM 0.19.1 ALSO hits `RuntimeError: aot_compile is not supported by the
+                # current configuration` through its DEFAULT torch.compile path on some GPU
+                # arches (Ampere sm_86: A6000, A100) — it fires from vllm/compilation/wrapper.py
+                # when torch._dynamo.is_compiling() is False inside the CUDA-graph capture path.
+                # Skipping FULL_AND_PIECEWISE above is not enough (the vllm_compilation_config
+                # GRPOConfig field doesn't exist in this TRL, so that _set_vllm_field is a no-op).
+                # VLLM_TORCH_COMPILE_LEVEL=0 (NO_COMPILATION) forces vLLM to execute the model
+                # eagerly, preventing the AOT path entirely. Official vLLM env var (vllm/envs.py);
+                # a no-op on a vLLM that doesn't define it. Don't override an operator-set value.
+                if "VLLM_TORCH_COMPILE_LEVEL" not in os.environ:
+                    os.environ["VLLM_TORCH_COMPILE_LEVEL"] = "0"
+                    print("[rl][warn] VLLM_TORCH_COMPILE_LEVEL=0 (prevent aot_compile on vLLM 0.19.1)")
+        except Exception:
+            pass
+        if _cudagraph_safe:
+            _set_vllm_field(
+                ("vllm_compilation_config", "compilation_config"),
+                {"cudagraph_mode": "FULL_AND_PIECEWISE"},
+                "vLLM cudagraph_mode (verl rollout default)",
+            )
+    # Adapter init: continue training the SFT adapter (peft_config=None, model is the
+    # loaded PeftModel) when train.init_from_adapter is set, else a fresh LoRA on the
+    # string model id (model_init_kwargs forces bf16 — TRL string-loading can fall back
+    # to fp32 and double VRAM).
+    init_model, init_peft = _init_adapter_model(model_id)
+    # chalk's kernels are applied AFTER construction (below) against trainer.model: chalk's apply
+    # patches the LIVE nn.Module, so there is nothing to install pre-build. On the fresh-LoRA path
+    # init_model is just the model-id string (TRL builds the module), and even on the
+    # continue-adapter path TRL may rebuild/wrap the PeftModel, so trainer.model is the
+    # authoritative target.
+    if init_peft is not None:
+        # Fresh LoRA: TRL loads the string model id with these kwargs, then attaches the
+        # adapter. Force bf16 (TRL string-loading can fall back to fp32 and double VRAM).
+        _attn = optimal_attn_impl()  # arch-aware FlashAttention (Kernels Hub) / SDPA
+        grpo_kwargs["model_init_kwargs"] = {"dtype": "bfloat16"}
+        if _attn:
+            grpo_kwargs["model_init_kwargs"]["attn_implementation"] = _attn
+    else:
+        _attn = optimal_attn_impl()
+    # stop_sequences: TRL forwards generation_kwargs to the (vLLM) sampler, whose
+    # SamplingParams.stop truncates each rollout at the requested delimiter — so the reward
+    # sees the same completion the config intends, instead of generating to max_completion.
+    if _t and _t.stop_sequences:
+        grpo_kwargs["generation_kwargs"] = {"stop": list(_t.stop_sequences)}
+    # advantage_clip>0 is the datums centered-advantage clamp; TRL has no advantage-value
+    # clip knob (it clips the importance ratio), so honor the default (clip off ==
+    # centered) and surface a note when a config asks for an explicit clamp.
+    if _adv_clip > 0:
+        print(f"[rl] advantage_clip={_adv_clip} recorded; TRL centers advantages (no value clip)")
+    # num_iterations (the one promoted GRPO speed lever, measured 1.38x faster) is feature-detected
+    # so an older TRL that lacks the field is simply skipped (GRPOConfig rejects unknown kwargs).
+    # Generation dominates GRPO wall-clock, so reusing each rollout batch for 2 optimizer steps is
+    # the cheapest large speedup; mu=2 is the standard GRPO config and TRL's importance-sampling
+    # correction (on by default) keeps the step stable. (The GSPO/DAPO A/B levers were dropped: the
+    # framework-scan in gpu-bench/RESEARCH_FINDINGS.md measured no robust win over baseline.)
+    import dataclasses as _dc
+    try:
+        _grpo_fields = {f.name for f in _dc.fields(GRPOConfig)}
+    except TypeError:
+        _grpo_fields = set()  # not a dataclass on this TRL -> skip the feature-detected knob
+    if "num_iterations" in _grpo_fields:
+        grpo_kwargs["num_iterations"] = 2
+        print("[rl] rollout amortization: num_iterations=2 (reuse each generation batch)")
+    # truncated importance sampling (tis): trl's grpo applies an importance-sampling correction by
+    # default, but with mode="sequence_mask" and clip_max=3.0. the verl/openrlhf recipe for the
+    # rollout(vllm)-vs-training token-distribution mismatch is TOKEN-LEVEL truncated is with the
+    # per-token ratio clipped at c=2 (verl rollout_is_threshold=2.0). adopt that recipe here:
+    # token_truncate + c_max=2.0. feature-detected against this trl's GRPOConfig fields (canonical
+    # clip field first, then the pre-2.0 deprecated alias), so a trl that lacks a field is skipped.
+    # note: this deliberately changes trl's defaults (sequence_mask / 3.0) to the recipe values.
+    if "vllm_importance_sampling_mode" in _grpo_fields:
+        grpo_kwargs["vllm_importance_sampling_mode"] = "token_truncate"
+        print("[rl] tis mode=token_truncate (token-level truncated importance sampling)")
+    _tis_c = 2.0
+    _tis_clip_field = next(
+        (
+            f
+            for f in ("vllm_importance_sampling_clip_max", "vllm_importance_sampling_cap")
+            if f in _grpo_fields
+        ),
+        None,
+    )
+    if _tis_clip_field:
+        grpo_kwargs[_tis_clip_field] = _tis_c
+        print(f"[rl] tis clip c_max={_tis_c} ({_tis_clip_field})")
+    else:
+        print("[rl] tis: trl default importance-sampling correction in effect; no clip field on this trl")
+    cfg = GRPOConfig(**grpo_kwargs)
+    setup_seconds = time.time() - t_start
+    heartbeat("rl_train_start", setup_seconds=setup_seconds, gpu=gpu_diagnostics())
+    # VL checkpoints (Qwen3.5/3.6) train text-only: make TRL's colocated rollout
+    # engine skip the vision tower (VRAM + 5090 PTX-compat; see the patch docstring).
+    # Only relevant when vLLM drives rollouts; transformers generation uses the trainer
+    # model (already text-only via the LoRA target/exclude config).
+    if use_vllm:
+        patch_vllm_language_model_only(model_id)
+        # Install (but do NOT yet activate) the TRL->vLLM weight-sync name remap for Qwen3.5/3.6:
+        # the trainer pushes ``model.*`` names but the VL engine's LM params live under
+        # ``language_model.*``, so the first sync_weights() would raise without this. Activated
+        # below, after the trainer + its initial checkpoint load are built.
+        patch_vllm_lm_weight_sync(model_id)
+    hb_cb = make_reward_heartbeat_callback()
+    # Multi-turn / tool wiring (trl 1.6): tool envs hand TRL the tool callables so it runs the
+    # tool-call loop natively; pure multi-turn envs hand TRL a rollout_func that drives the
+    # env's own turn loop on the colocate engine (env_mask masks the non-model tokens).
+    extra_trainer_kwargs: dict = {}
+    tools = env.tools() if is_tool_env else []
+    # A tool env exposing NO tools would silently degrade to single-shot under tools=[]; drive
+    # it through the rollout_func turn loop instead so it isn't mis-trained as single-turn.
+    if is_tool_env and not tools:
+        print("[rl][warn] tool env exposes no tools — using the multi-turn rollout_func path")
+    use_rollout_func = is_multi_turn and not (is_tool_env and tools)
+    require_vllm_for_rollout_func(use_rollout_func, use_vllm, model_id)
+    if is_tool_env and tools:
+        extra_trainer_kwargs["tools"] = tools
+        print(f"[rl] tool env: handing {len(tools)} tool(s) to TRL's native tool loop")
+    if use_rollout_func:
+        from flash.engine.multiturn_rollout import (
+            build_examples_index,
+            build_rollout_func,
+            index_collisions,
+        )
+        examples_by_key = build_examples_index(train, env.prompt_messages)
+        ncol = index_collisions(train, env.prompt_messages)
+        if ncol:
+            print(
+                f"[rl][warn] {ncol} duplicate prompt(s) collide in the reward index; the shared "
+                "prompt scores against the last example's answer/info"
+            )
+        extra_trainer_kwargs["rollout_func"] = build_rollout_func(
+            active_env=env,
+            tok=tok,
+            examples_by_key=examples_by_key,
+            max_completion=_max_completion,
+            max_turns=getattr(env, "max_turns", 10),
+            temperature=_temperature,
+            top_p=rl.sampling_top_p,
+            stop=(list(_t.stop_sequences) if _t and _t.stop_sequences else None),
+            thinking=THINKING,
+            engine_max_len=vllm_max_len,
+        )
+        print("[rl] multi-turn env: driving the turn loop via rollout_func")
+    # GRPOTrainer.__init__ blocks during model/vLLM init + FA2 kernel compilation (can be
+    # 10-20 min on first use). Background heartbeats keep the stall detector quiet.
+    _rl_init_done = threading.Event()
+    def _rl_init_heartbeat() -> None:
+        while not _rl_init_done.wait(30.0):
+            heartbeat("rl_initializing", gpu=gpu_diagnostics())
+    _rl_init_hb = threading.Thread(target=_rl_init_heartbeat, daemon=True)
+    _rl_init_hb.start()
+    try:
+        trainer = GRPOTrainer(
+            model=init_model,
+            args=cfg,
+            train_dataset=ds,
+            reward_funcs=reward_fn,
+            peft_config=init_peft,
+            processing_class=tok,
+            callbacks=[hb_cb, make_checkpoint_upload_callback()],
+            **extra_trainer_kwargs,
+        )
+    finally:
+        _rl_init_done.set()
+    # Apply chalk's gap-filling kernels (RoPE/LoRA-delta/embedding, like Liger) on the module
+    # GRPOTrainer actually optimizes (trainer.model) — the fresh-LoRA path only passes the model-id
+    # string to TRL, so trainer.model is the authoritative target. chalk composes on top of Liger.
+    # Capture the install report so the engaged kernels land in metrics (active_kernels below).
+    _chalk_report = install_chalk_kernels(getattr(trainer, "model", None))
+    # Liger fused-loss chunk_size: TRL leaves it at the default 1, so the fused GRPO loss runs its
+    # whole detach -> chunk_forward -> compiled-loss -> autograd.grad cycle ONCE PER SEQUENCE
+    # (per_device_train_batch_size times) — Python/kernel-launch/compile-guard overhead that
+    # dominates at small-model scale where the GEMMs are tiny. Collapse it to ONE invocation over the
+    # whole per-device micro-batch. Numerically identical (every loss_type normalizes by the GLOBAL
+    # token count, not the chunk-local size, and chunk losses are summed). Must run BEFORE the
+    # mask-aware wrap below, which replaces trainer.liger_grpo_loss with a closure that has no
+    # chunk_size attribute.
+    _liger_loss = getattr(trainer, "liger_grpo_loss", None)
+    if _liger_loss is not None and hasattr(_liger_loss, "chunk_size"):
+        _cs = max(1, int(getattr(trainer.args, "per_device_train_batch_size", 1)))
+        if _cs > int(getattr(_liger_loss, "chunk_size", 1)):
+            _liger_loss.chunk_size = _cs
+            print(f"[rl] liger fused-loss chunk_size -> {_cs} (one invocation, not one per sequence)")
+    # Run liger's fused GRPO loss EAGER: drop ONLY its torch.compile (BROKEN on torch 2.10 — its
+    # dynamo guard-gen trips a symbol_to_source IndexError that crashes the first GRPO step on every
+    # path), keep the chunked memory path that prevents the 248k-vocab fp32-logit OOM. Must run BEFORE
+    # the mask-aware wrap below, which replaces trainer.liger_grpo_loss with a closure. See the helper.
+    if disable_liger_grpo_torch_compile(trainer):
+        print(
+            "[rl] liger GRPO loss: torch.compile DISABLED (eager loss math; chunked memory path "
+            "retained) — dodges the torch 2.10 dynamo guard-gen crash (symbol_to_source IndexError)"
+        )
+    # Mask-aware lm_head: skip the 248k-vocab projection at MASKED completion positions in the GRPO
+    # loss — its most expensive op, and the trainer step dominates train_wall. For MULTI-TURN that
+    # masked set is the ~half-to-most of the transcript that is env/tool text; for SINGLE-TURN it is
+    # the right-PADDING (GRPO samples variable-length completions, padded to the batch max). Either
+    # way those positions add zero loss/gradient but pay full FLOPs. Loss-preserving; applies to ALL
+    # GRPO with the Liger fused loss; no-op when nothing is masked (uniform-length single-turn).
+    if grpo_kwargs.get("use_liger_kernel") and patch_grpo_mask_aware_lm_head(trainer):
+        _masked_kind = "env + padding" if use_rollout_func else "padding"
+        print(f"[rl] mask-aware lm_head: skipping masked ({_masked_kind}) positions in the GRPO loss")
+    # The trainer (and its colocated vLLM engine + initial checkpoint load) is now built. Activate
+    # the TRL->vLLM weight-sync name remap ONLY now (see patch_vllm_lm_weight_sync) so the initial
+    # checkpoint load stayed untouched while the train-time syncs get remapped. No-op unless the VL
+    # patch above was installed.
+    if use_vllm:
+        _LM_SYNC_REMAP_ON["on"] = True
+        if is_vl_checkpoint(model_id):
+            print("[vllm] LM weight-sync remap activated for training syncs")
+    # Mid-run eval is intentionally NOT run during training: held-out evaluation happens on the
+    # deploy/serving side (against the trained adapter), keeping training pure (no eval-phase cost
+    # or eval-boundary stalls). Training streams only the per-step reward heartbeat.
+    _reset_peak_gpu()  # peak_gpu_gb reflects the train loop (verifies the micro-batch headroom)
+    _gpu_sampler = _GpuPeakSampler().start()  # true device peak incl. vLLM colocate + bnb pages
+    t_train = time.time()
+    with _sdpa_cudnn_ctx(_attn):  # force cuDNN SDPA on sm120 (no-op otherwise)
+        trainer.train(resume_from_checkpoint=resume_ckpt)
+    train_wall = time.time() - t_train
+    rl_peak_gpu_gb = _peak_gpu_gb()
+    rl_device_peak_gpu_gb = _gpu_sampler.stop_gb()
+    reward_history = list(getattr(hb_cb, "reward_history", []))
+    # A GRPO run that finishes WITHOUT the reward callback ever firing (empty reward_history)
+    # produced NO real training — the rollout scored nothing (e.g. vLLM generation silently
+    # returning no completions, observed on RTX 5090 / sm120: ~1.4 s wall, empty reward + loss
+    # curves, but the run otherwise "succeeds"). That is a FAILURE, not a success: a no-op run with
+    # an unchanged adapter must not be reported as done — fail loudly so the operator/agent doesn't
+    # trust it. (An env returning all-zero rewards still appends 0.0s, so an EMPTY history uniquely
+    # means the reward path never ran.)
+    _steps_run = int(getattr(trainer.state, "global_step", 0) or 0)
+    # A resume that already reached the target steps legitimately performs ZERO new optimizer
+    # steps: the previous worker uploaded the final checkpoint (and scored its rewards) but died
+    # before writing metrics/DONE, so this worker's fresh hb_cb has an empty reward_history even
+    # though the policy IS fully trained. Don't fail those — finalize from the resumed state. The
+    # no-op guard below is only for a run that genuinely trained nothing (no resume, or the resume
+    # didn't reach the target steps).
+    _resumed_complete = _grpo_resume_already_complete(resume_ckpt, steps, _steps_run)
+    if _grpo_is_no_op_failure(reward_history, resume_ckpt, steps, _steps_run):
+        if _steps_run == 0:
+            raise RuntimeError(
+                "GRPO trainer completed zero optimizer steps before any reward was scored. "
+                f"retained_prompts={len(prompts)}, prompts_per_step={prompts_per_step}, "
+                f"generations_per_step={batching['generations_per_step']}. This usually means "
+                "TRL built an empty dataloader; add training examples, lower [train].batch_size, "
+                "or reduce prompt length/max_tokens so more examples fit."
+            )
+        raise RuntimeError(
+            f"GRPO scored no reward in {train_wall:.1f}s over {_steps_run} step(s) — the rollout "
+            "produced no completions, so the policy was never actually trained. Failing loudly "
+            "instead of reporting a no-op run as done (seen on RTX 5090/sm120 vLLM rollout)."
+        )
+    if not reward_history and _resumed_complete:
+        print(
+            f"[resume] no new reward in this worker but resumed checkpoint already reached "
+            f"{_steps_run}/{steps} step(s) — finalizing the completed policy instead of failing."
+        )
+    adapter_dir = f"{out_dir}/adapter"
+    trainer.model.save_pretrained(adapter_dir)
+    tok.save_pretrained(adapter_dir)
+    hf_upload_folder(adapter_dir, "adapter", required=True)
+    heartbeat("rl_trained", train_wall=train_wall, gpu=gpu_diagnostics())
+    # Upper bound on generated tokens: completions actually optimized (the intended
+    # prompts_per_step after the batch fix) x the max completion length. Over-counts (most
+    # completions are shorter); reported as an upper bound, used only for a rough throughput.
+    gen_tokens = steps * batching["unique_prompts_per_step"] * group_size * _max_completion
+    write_train_meta(
+        phase="rl",
+        adapter_dir=adapter_dir,
+        model_id=model_id,
+        train_wall=train_wall,
+        setup_seconds=setup_seconds,
+        train_tokens=0,
+        generated_tokens=gen_tokens,
+        notes={
+            "steps": steps,
+            "resumed": bool(resume_ckpt),
+            "download_seconds": download_seconds,
+            "hf_transfer": os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", ""),
+            "reward_history": reward_history,
+            "loss_curve": _metric_curve(trainer, "loss"),
+            # Peak torch-allocated GPU memory during the GRPO train loop (excludes bnb managed
+            # pages). device_peak_gpu_gb is the TRUE device footprint (total-free, incl. the vLLM
+            # colocate engine + bnb pages): the headline for verifying the per-device micro-batch
+            # left the card with headroom (no OOM) at the sized batch.
+            "peak_gpu_gb": rl_peak_gpu_gb,
+            "device_peak_gpu_gb": rl_device_peak_gpu_gb,
+            # Which chalk gap-filling kernels actually ENGAGED (None = chalk not installed or every
+            # kernel fell back) — verifies the chalk stack on a GRPO run without the console.
+            "chalk_kernels": active_kernels(_chalk_report) or None,
+            **wandb_run_info(),
+            "gen_tokens_is_upper_bound": True,
+            "thinking": THINKING,
+            "max_completion_len": _max_completion,
+            "prompts_per_step": batching["unique_prompts_per_step"],
+            "generations_per_step": batching["generations_per_step"],
+            "group_size": group_size,
+            "per_device_train_batch_size": batching["per_device_train_batch_size"],
+            "gradient_accumulation_steps": batching["gradient_accumulation_steps"],
+            "grpo_recipe": {
+                "lr_scheduler": "constant",
+                "beta": _kl_beta,
+                "scale_rewards": "none",
+                "loss_type": "dr_grpo",
+                "temperature": _temperature,
+                "advantage_clip": _adv_clip,
+                "thinking_length_penalty_coef": _think_penalty,
+                "init_from_adapter": JOB_SPEC.train.init_from_adapter if JOB_SPEC else "",
+            },
+        },
+    )
+    free_gpu(trainer)
+# ---------------------------------------------------------------------------
+# Completion: train phase writes metrics.json + the DONE sentinel (see _finalize).
+# ---------------------------------------------------------------------------
+def write_train_meta(
+    phase, adapter_dir, model_id, train_wall, setup_seconds, train_tokens, generated_tokens, notes
+):
+    """Record the training summary and finalize the run straight from the train phase.
+
+    Dumps the summary to ``/tmp/train_meta.json``, uploads it as ``train_meta.json``, emits the
+    ``<phase>_train_done`` heartbeat, then builds a ``RunMetrics`` record and passes it to
+    ``_finalize``. A falsy ``notes`` is recorded as an empty mapping.
+    """
+    active_env = require_active_env()
+    meta_path = "/tmp/train_meta.json"
+    summary = dict(
+        phase=phase,
+        adapter_dir=adapter_dir,
+        model_id=model_id,
+        train_wall=train_wall,
+        setup_seconds=setup_seconds,
+        train_tokens=train_tokens,
+        generated_tokens=generated_tokens,
+        notes=notes or {},
+    )
+    with open(meta_path, "w") as handle:
+        json.dump(summary, handle)
+    hf_upload_file(meta_path, "train_meta.json")
+    heartbeat(
+        f"{phase}_train_done",
+        train_wall=train_wall,
+        train_tokens=train_tokens,
+        generated_tokens=generated_tokens,
+        gpu=gpu_diagnostics(),
+    )
+    # There is no separate eval phase: the run-metrics record is built here (training metrics
+    # only -- loss/reward are streamed by the trainer; reward_history travels in notes) and the
+    # completion sentinel is written by _finalize.
+    token_count = generated_tokens or train_tokens
+    throughput = token_count / train_wall if train_wall else 0.0
+    run_metrics = RunMetrics(
+        # Compute backend the worker actually ran on. The RunPod launcher sets FLASH_ARM; fall
+        # back to "runpod" when it is unset so persisted metrics still name the backend.
+        arm=os.environ.get("FLASH_ARM", "runpod"),
+        phase=phase,
+        seed=SEED,
+        model_id=model_id,
+        wall_seconds=train_wall,
+        setup_seconds=setup_seconds,
+        train_throughput_toks_per_s=throughput,
+        train_tokens=train_tokens,
+        generated_tokens=generated_tokens,
+        notes={
+            **(notes or {}),
+            "renderer": "flash_env",
+            "thinking": THINKING,
+            "train_wall": train_wall,
+            "model_id": model_id,
+            "environment": active_env.id,
+            "job_spec": JOB_SPEC.to_dict() if JOB_SPEC else None,
+        },
+    )
+    _finalize(run_metrics)
+def _resolve_adapter_ref(adapter_ref: str) -> tuple[str, str] | None:
+    """Resolve init_from_adapter into (repo, prefix).
+
+    The only public form is the exact adapter_ref emitted by ``flash status``:
+    ``<owner>/<repo>:<phase>/<run_id>/seed<N>``. Surrounding whitespace is ignored.
+
+    Returns ``(repo, "<phase>/<run_id>/seed<N>")``, or ``None`` when the string does not
+    have that form.
+    """
+    # The seed uses [0-9] rather than \d: on str patterns \d also matches non-ASCII Unicode
+    # decimal digits, which would let a malformed ref through into the download prefix.
+    match = re.fullmatch(
+        r"(?P<repo>[A-Za-z0-9][A-Za-z0-9._-]*/[A-Za-z0-9][A-Za-z0-9._-]*):"
+        r"(?P<phase>sft|rl)/(?P<run_id>[A-Za-z0-9][A-Za-z0-9._-]{0,127})/seed(?P<seed>[0-9]+)",
+        adapter_ref.strip(),
+    )
+    if match is None:
+        return None
+    prefix = f"{match['phase']}/{match['run_id']}/seed{match['seed']}"
+    return match["repo"], prefix
+def _download_adapter(adapter_prefix: str | None) -> str | None:
+    """Fetch an init_from_adapter LoRA into /tmp/evdl/<prefix>/adapter and return that dir.
+
+    ``adapter_prefix`` must be the full ``adapter_ref`` string emitted by ``flash status``:
+    ``<owner>/<repo>:<phase>/<run_id>/seed<N>``. Returns ``None`` when the argument is empty,
+    when it does not resolve, or when the adapter directory is absent after the download.
+    """
+    resolved = _resolve_adapter_ref(adapter_prefix) if adapter_prefix else None
+    if not resolved:
+        return None
+    repo, prefix = resolved
+    from huggingface_hub import snapshot_download
+    download_root = "/tmp/evdl"
+    # Only the adapter files under the resolved prefix are requested from the dataset repo.
+    snapshot_download(
+        repo_id=repo,
+        repo_type="dataset",
+        allow_patterns=[f"{prefix}/adapter/*"],
+        local_dir=download_root,
+        token=os.environ.get("HF_TOKEN"),
+    )
+    adapter_dir = os.path.join(download_root, prefix, "adapter")
+    if os.path.isdir(adapter_dir):
+        return adapter_dir
+    return None
+def _finalize(metrics: RunMetrics):
+    """Persist the final metrics and the DONE sentinel, then announce completion.
+
+    Saves ``metrics`` to ``/tmp/metrics.json`` and uploads it, writes the current timestamp to
+    ``/tmp/DONE`` and uploads that too, emits the ``done`` heartbeat and prints the metrics.
+    """
+    metrics_path = "/tmp/metrics.json"
+    done_path = "/tmp/DONE"
+    metrics.save(metrics_path)
+    # Upload is required: a swallowed failure would make the control plane fail/retry a
+    # run that actually finished.
+    hf_upload_file(metrics_path, "metrics.json", required=True)
+    # The DONE sentinel tells the controller it is safe to tear the worker down.
+    with open(done_path, "w") as sentinel:
+        sentinel.write(f"{time.time()}")
+    hf_upload_file(done_path, "DONE", required=True)
+    heartbeat("done", gpu=gpu_diagnostics())
+    print("NODE DONE:", metrics.to_json())
+# How long to wait for wandb.finish() to flush. On SUCCESS the full run must sync (a slow network /
+# large run can exceed the old 5s and leave the run "crashed"), so give it a generous-but-bounded
+# window; on FAILURE abort fast (the run is failing regardless and the worker is hard-exiting).
+# Both values are in seconds and are used as the thread-join timeout in wandb_finish().
+_WANDB_FINISH_WAIT_S = 120.0  # applied when exit_code == 0
+_WANDB_FINISH_FAIL_WAIT_S = 5.0  # applied when exit_code != 0
+# Baked compiled-kernel cache (opt-in; see Dockerfile.worker + flash/engine/worker/kernel_warmup.py).
+# The Dockerfile points TRITON_CACHE_DIR/TORCHINDUCTOR_CACHE_DIR here and, when built with
+# --build-arg BUILD_KERNEL_CACHE=true, bakes a portable mega-cache produced on a real GPU. These
+# names are kept in lockstep with kernel_warmup.DEFAULT_CACHE_DIR / MEGA_CACHE_FILENAME.
+_KERNEL_CACHE_DIR = "/opt/flash/kernelcache"
+# Cache blob that _load_kernel_cache_if_present() reads and passes to
+# torch.compiler.load_cache_artifacts().
+_KERNEL_CACHE_FILE = os.path.join(_KERNEL_CACHE_DIR, "mega_cache.bin")
+# JSON sidecar; its "sm" entry is compared with the worker's GPU arch before the blob is loaded.
+_KERNEL_CACHE_META_FILE = os.path.join(_KERNEL_CACHE_DIR, "mega_cache.json")
+def _current_cuda_sm(torch) -> str | None:
+    """Return device 0's CUDA compute capability as ``sm<major><minor>``, else ``None``.
+
+    ``torch`` is the already-imported torch module. ``None`` is returned both when CUDA is
+    reported unavailable and when any exception is raised while probing.
+    """
+    try:
+        cuda = torch.cuda
+        if cuda.is_available():
+            capability = cuda.get_device_capability(0)
+            return "sm{}{}".format(capability[0], capability[1])
+        return None
+    except Exception:
+        return None
+def _load_kernel_cache_if_present() -> bool:
+    """Best-effort: if a baked mega-cache blob exists, load it so the worker skips first-run JIT.
+
+    Loads the portable cache that kernel_warmup.py wrote on a GPU builder via
+    ``torch.compiler.load_cache_artifacts()`` — measured cold compile ~124s -> warm load ~0.2s.
+    OPT-IN: when no baked cache is present (the default image build), this is a no-op and the worker
+    JITs on first use exactly as before (#163's init heartbeat covers that stall). Never raises:
+    a missing torch / missing file / unusable blob just logs and leaves the JIT path intact.
+
+    Returns ``True`` only when the blob was loaded; every other outcome returns ``False``.
+    """
+    def _reject(reason: str) -> bool:
+        # a baked cache is present but unusable (no/garbled metadata or wrong arch): repoint
+        # triton/inductor OFF the baked trees (Dockerfile points them at /opt/flash/kernelcache)
+        # so the JIT fallback compiles fresh into scratch instead of reusing wrong-arch baked
+        # entries that would collide with this worker's arch.
+        print(f"[kernel-cache] {reason} -> first-run JIT fallback")
+        try:
+            scratch = os.path.join(tempfile.gettempdir(), "flash-kernelcache-jit")
+            for sub, var in (("triton", "TRITON_CACHE_DIR"), ("inductor", "TORCHINDUCTOR_CACHE_DIR")):
+                d = os.path.join(scratch, sub)
+                os.makedirs(d, exist_ok=True)
+                os.environ[var] = d
+        except OSError as e:
+            # keep the "never raises" promise: an unusable temp dir must not fail worker boot
+            # over an optional cache. Variables not yet repointed keep their previous value.
+            print(f"[kernel-cache] could not repoint compile caches to scratch ({e}); leaving them as-is")
+        return False
+    if not os.path.isfile(_KERNEL_CACHE_FILE):
+        print(f"[kernel-cache] no baked cache at {_KERNEL_CACHE_FILE} -> first-run JIT (expected default)")
+        return False
+    try:
+        import torch
+        current_sm = _current_cuda_sm(torch)
+        try:
+            with open(_KERNEL_CACHE_META_FILE) as f:
+                meta = json.load(f)
+        except FileNotFoundError:
+            return _reject("baked cache has no metadata")
+        except Exception as e:
+            return _reject(f"metadata unreadable ({e})")
+        cached_sm = str(meta.get("sm") or "")
+        if not current_sm:
+            # can't verify the worker's GPU arch -> don't risk loading a wrong-arch blob; JIT instead.
+            return _reject("worker GPU arch undetermined")
+        if cached_sm != current_sm:
+            return _reject(
+                f"baked cache arch {cached_sm or 'unknown'} does not match worker arch {current_sm}"
+            )
+        with open(_KERNEL_CACHE_FILE, "rb") as f:
+            blob = f.read()
+        torch.compiler.load_cache_artifacts(blob)
+        # cached_sm equals the non-empty current_sm here, so no 'unknown' placeholder is needed.
+        print(
+            f"[kernel-cache] loaded baked mega-cache for {cached_sm} "
+            f"({len(blob)} bytes) -> skipping first-run JIT"
+        )
+        return True
+    except Exception as e:
+        # never block boot on a bad/absent cache: fall back to the normal JIT path. repoint off the
+        # baked trees too — if the mega blob was present + arch-matched but load raised, the on-disk
+        # triton/inductor entries may be partial/corrupt, so JIT fresh into scratch.
+        return _reject(f"load skipped ({e})")
+def wandb_finish(exit_code: int = 0) -> None:
+    """Close out the W&B run ahead of the worker's hard ``os._exit()``.
+
+    The worker hard-exits to dodge the colocated-vLLM teardown deadlock (see main), which
+    skips wandb's atexit sync — so a *successfully completed* run was left dangling and W&B
+    eventually marked it ``crashed`` even though all metrics were logged. The run is ours
+    (``wandb.init`` was called in ``wandb_report_to``), so finish it explicitly and it shows
+    ``finished``. Best-effort; never raises (W&B is optional, metrics.json is the source of
+    truth).
+    """
+    if not os.environ.get("WANDB_API_KEY"):
+        return
+    import importlib.util
+    # find_spec can RAISE (not just return None) when wandb already sits in sys.modules with an
+    # absent/partial __spec__ (e.g. a namespace package or a partially-initialized import). Letting
+    # that escape would skip the hard exit, so an ambiguous probe counts as "present enough to
+    # try"; only a definitive None (probe succeeded, module truly absent) returns early.
+    try:
+        wandb_absent = importlib.util.find_spec("wandb") is None
+    except Exception:
+        wandb_absent = False
+    if wandb_absent:
+        return
+    try:
+        import wandb
+        if getattr(wandb, "run", None) is None:
+            return
+        failures: list[Exception] = []
+        def _run_finish() -> None:
+            try:
+                wandb.finish(exit_code=exit_code)
+            except Exception as exc:
+                failures.append(exc)
+        finisher = threading.Thread(target=_run_finish, daemon=True)
+        finisher.start()
+        # SUCCESS (exit_code == 0): finish() has to flush the whole run, which can take well over
+        # 5s on a slow network / large run, so allow the longer bounded wait. FAILURE: keep the
+        # short cut-off — the run is failing anyway and the worker should abort fast.
+        if exit_code == 0:
+            wait_s = _WANDB_FINISH_WAIT_S
+        else:
+            wait_s = _WANDB_FINISH_FAIL_WAIT_S
+        finisher.join(timeout=wait_s)
+        if finisher.is_alive():
+            print(f"[wandb] finish() did not complete within {wait_s}s; continuing with hard exit")
+        elif failures:
+            print(f"[wandb] finish() warning: {failures[0]}")
+    except Exception as exc:  # pragma: no cover - logging-only path
+        print(f"[wandb] finish() warning: {exc}")
+def main():
+    # Idempotency: if DONE was already uploaded, a re-delivered job re-fetches the final
+    # metrics from HF and returns them immediately. (The previous behavior — sleeping in
+    # an infinite loop — kept a billable GPU worker alive until the execution timeout.)
+    try:
+        # Idempotency FIRST — before any env-mutating pip install / package removal: a re-delivered
+        # job whose DONE already exists must return the persisted metrics and exit WITHOUT running
+        # _ensure_fla_fastpath_on_hopper() (mutates the env: pip-installs tilelang/fla) — that wasted
+        # a worker mutating its env on an already-complete run. It runs after the DONE check below.
+        if HF_REPO:
+            from huggingface_hub import hf_hub_download
+            try:
+                hf_hub_download(
+                    repo_id=HF_REPO,
+                    repo_type="dataset",
+                    filename=f"{hf_prefix()}/DONE",
+                    token=os.environ.get("HF_TOKEN"),
+                )
+                done = True
+            except Exception:
+                done = False
+            if done:
+                print("Run already complete (DONE present); returning persisted metrics.")
+                heartbeat("already_done", gpu=gpu_diagnostics(include_torch=False))
+                try:
+                    got = hf_hub_download(
+                        repo_id=HF_REPO,
+                        repo_type="dataset",
+                        filename=f"{hf_prefix()}/metrics.json",
+                        token=os.environ.get("HF_TOKEN"),
+                    )
+                    import shutil
+                    shutil.copy(got, "/tmp/metrics.json")
+                    sys.stdout.flush()
+                    os._exit(0)
+                except Exception as e:
+                    raise SystemExit(f"DONE present but metrics.json unavailable: {e}") from e
+        # Not a DONE re-delivery -> this worker will train. These must run before any model import:
+        _ensure_fla_fastpath_on_hopper()  # Hopper: enable fla+tilelang GDN fast path (see perf.py)
+        # Repoint tilelang's libcudart_stub.so at the real CUDA runtime so it can't shadow libcudart
+        # in vLLM's CudaRTLibrary (intermittent `undefined symbol: cudaDeviceReset` on GRPO vLLM
+        # init, any model size/arch). AFTER the fla fast path (a tilelang reinstall there rewrites
+        # the stub) and BEFORE the model/vLLM import. See perf.py / flash #184.
+        _neutralize_tilelang_cudart_stub()
+        heartbeat("boot", gpu=gpu_diagnostics(include_torch=False))
+        finalize_alloc_conf_for_sleep()  # sync CUDA alloc conf to resolved sleep (before first CUDA alloc)
+        # Opt-in: load a baked compiled-kernel mega-cache (if the image shipped one) so the worker
+        # skips the ~10-15 min first-run JIT. Best-effort + no-op when absent (the default), so the
+        # normal JIT path is untouched. Runs AFTER finalize_alloc_conf_for_sleep: _load probes CUDA
+        # (_current_cuda_sm -> get_device_capability triggers CUDA init), so the allocator conf must be
+        # resolved first; still before any model/kernel import that would otherwise trigger compilation.
+        _load_kernel_cache_if_present()
+        # Dispatch table — register new algorithms (e.g. ppo) here as they land.
+        modes = {
+            "sft": run_sft,  # SFT (TRL SFTTrainer)
+            "rl": run_rl,  # GRPO (TRL GRPOTrainer + colocated vLLM)
+        }
+        handler = modes.get(RUN_MODE)
+        if handler is None:
+            raise SystemExit(f"unknown RUN_MODE {RUN_MODE}; known: {sorted(modes)}")
+        handler()
+        # All artifacts (adapter, train_meta, metrics, DONE) are uploaded to HF *inside* the
+        # handler. The RL trainer's colocated vLLM can DEADLOCK at interpreter shutdown
+        # during NCCL/IPC/CUDA teardown — not segfault-and-exit (which `check=False` on the
+        # train subprocess already tolerates), but hang forever. That would block the Flash
+        # handler's *blocking* `subprocess.run` (heartbeat frozen at "rl_train_done") and the
+        # whole run stalls until the wall-clock cap. Hard-exit to bypass the hanging teardown now that
+        # every output is safely persisted.
+        wandb_finish(exit_code=0)  # mark the W&B run finished BEFORE os._exit (which skips wandb's atexit sync)
+        sys.stdout.flush()
+        sys.stderr.flush()
+        os._exit(0)
+    except Exception as e:
+        # Structured retry signal both pollers read: an infra failure -> retry on a fresh worker.
+        # GitHubRateLimitError (env ref resolution hit a persistent GitHub rate limit) is retriable:
+        # reschedule on a fresh worker once the limit window resets rather than hard-failing. Env
+        # resolution runs lazily inside this try (require_active_env, called by the handlers above),
+        # never at import, so a rate-limit raise reaches here and is classified correctly.
+        retriable = isinstance(e, (RetriableInfraError, GitHubRateLimitError))
+        tb = traceback.format_exc()
+        traceback.print_exc()
+        try:
+            err_name = error_artifact_name(RUN_MODE)
+            err_path = f"/tmp/{err_name}"
+            with open(err_path, "w") as f:
+                f.write(tb)
+            hf_upload_file(err_path, err_name)
+        except Exception as up_err:
+            print("error-upload warn:", up_err)
+        hb_flags = {"retriable": retriable}
+        try:
+            heartbeat(f"error_{RUN_MODE}", error=str(e)[:500], **hb_flags, diag=gpu_diagnostics())
+        except Exception:
+            heartbeat(f"error_{RUN_MODE}", error=str(e)[:500], **hb_flags)
+        # keep container alive briefly so logs flush, then exit non-zero -> restart
+        wandb_finish(exit_code=1)  # finalize the W&B run as failed (don't leave it dangling -> "crashed")
+        time.sleep(10)
+        raise
+if __name__ == "__main__":  # script entry point: call main() only when run directly, not when imported
+    main()