PyPI - freesolo-flash-dev - Versions diffs - 0.2.25__py3-none-any.whl - Mend

freesolo-flash-dev 0.2.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (111) hide show

flash/__init__.py +29 -0
flash/_channel.py +23 -0
flash/_fileio.py +35 -0
flash/_logging.py +49 -0
flash/_update_check.py +266 -0
flash/catalog.py +253 -0
flash/cli/__init__.py +1 -0
flash/cli/main/__init__.py +227 -0
flash/cli/main/__main__.py +6 -0
flash/cli/main/commands.py +636 -0
flash/cli/main/envpush.py +317 -0
flash/cli/main/render.py +599 -0
flash/cli/main/training_doc.py +455 -0
flash/client/__init__.py +14 -0
flash/client/config.py +70 -0
flash/client/http.py +372 -0
flash/client/runtime_secrets.py +69 -0
flash/client/specs.py +20 -0
flash/cost/__init__.py +16 -0
flash/cost/analytical.py +175 -0
flash/cost/facts.py +114 -0
flash/cost/spec.py +113 -0
flash/cost/types.py +158 -0
flash/engine/__init__.py +6 -0
flash/engine/accounting.py +36 -0
flash/engine/chalk_kernels.py +116 -0
flash/engine/multiturn_rollout.py +780 -0
flash/engine/recipe.py +86 -0
flash/engine/vram.py +603 -0
flash/engine/worker/__init__.py +2916 -0
flash/engine/worker/__main__.py +4 -0
flash/engine/worker/kernel_warmup.py +400 -0
flash/engine/worker/lora.py +796 -0
flash/engine/worker/packing.py +366 -0
flash/engine/worker/perf.py +1048 -0
flash/envs/__init__.py +10 -0
flash/envs/adapter/__init__.py +883 -0
flash/envs/adapter/rubric.py +222 -0
flash/envs/base.py +52 -0
flash/envs/registry.py +62 -0
flash/mcp/__init__.py +1 -0
flash/mcp/server.py +85 -0
flash/providers/__init__.py +59 -0
flash/providers/_auth.py +24 -0
flash/providers/_http.py +230 -0
flash/providers/_instance.py +416 -0
flash/providers/_instance_bootstrap.py +517 -0
flash/providers/_poll.py +311 -0
flash/providers/allocator.py +193 -0
flash/providers/base.py +431 -0
flash/providers/hyperstack/__init__.py +127 -0
flash/providers/hyperstack/api.py +522 -0
flash/providers/hyperstack/auth.py +17 -0
flash/providers/hyperstack/gpus.py +29 -0
flash/providers/hyperstack/jobs/__init__.py +632 -0
flash/providers/hyperstack/jobs/builders.py +122 -0
flash/providers/hyperstack/preflight.py +23 -0
flash/providers/hyperstack/pricing.py +26 -0
flash/providers/hyperstack/train.py +25 -0
flash/providers/lambdalabs/__init__.py +139 -0
flash/providers/lambdalabs/api.py +261 -0
flash/providers/lambdalabs/auth.py +18 -0
flash/providers/lambdalabs/gpus.py +29 -0
flash/providers/lambdalabs/jobs/__init__.py +724 -0
flash/providers/lambdalabs/jobs/builders.py +118 -0
flash/providers/lambdalabs/preflight.py +27 -0
flash/providers/lambdalabs/pricing.py +51 -0
flash/providers/lambdalabs/train.py +27 -0
flash/providers/preflight.py +55 -0
flash/providers/realized.py +80 -0
flash/providers/runpod/__init__.py +130 -0
flash/providers/runpod/api.py +186 -0
flash/providers/runpod/auth.py +37 -0
flash/providers/runpod/cost.py +57 -0
flash/providers/runpod/gpus.py +46 -0
flash/providers/runpod/jobs.py +956 -0
flash/providers/runpod/keys.py +139 -0
flash/providers/runpod/preflight.py +30 -0
flash/providers/runpod/preload.py +915 -0
flash/providers/runpod/pricing.py +18 -0
flash/providers/runpod/slots.py +79 -0
flash/providers/runpod/train/__init__.py +150 -0
flash/providers/runpod/train/deps.py +395 -0
flash/providers/runpod/train/endpoints.py +820 -0
flash/py.typed +0 -0
flash/runner/__init__.py +686 -0
flash/runner/checkpoints.py +82 -0
flash/runner/deploy.py +422 -0
flash/runner/lifecycle.py +672 -0
flash/schema/__init__.py +375 -0
flash/schema/fields.py +331 -0
flash/serve/__init__.py +1 -0
flash/serve/deploy.py +326 -0
flash/serve/pricing.py +60 -0
flash/server/__init__.py +1 -0
flash/server/__main__.py +20 -0
flash/server/app.py +961 -0
flash/server/auth.py +263 -0
flash/server/billing.py +124 -0
flash/server/checkpoints.py +110 -0
flash/server/db.py +160 -0
flash/server/environment_registry.py +102 -0
flash/server/envs.py +360 -0
flash/server/reconcile.py +163 -0
flash/server/run_registry.py +150 -0
flash/spec.py +333 -0
freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0

flash/cost/facts.py ADDED Viewed

@@ -0,0 +1,114 @@
+"""Static lookup facts for the cost model: GPU price/VRAM/compute + cheapest-fit
+selection, model size/quant, and reward-grader latency. Pure tables + accessors."""
+from __future__ import annotations
+from flash.catalog import MODELS
+from flash.providers.base import GPU_INFO, GpuClass, providers_for
+# ===== GPU facts =====
+# Peak bf16 tensor TFLOPS per managed GPU class, keyed by class name.
+GPU_COMPUTE_TFLOPS: dict[str, float] = {
+    "L4": 60.0,
+    "RTX 4090": 165.0,
+    "RTX 5090": 210.0,
+    "RTX A6000": 155.0,
+    "A40": 150.0,
+    "RTX 6000 Ada": 182.0,
+    "A100 PCIe": 312.0,
+    "A100 SXM": 312.0,
+    "H100": 990.0,
+    "RTX Pro 6000": 250.0,
+}
+# Value reported for a class name that has no entry in the table.
+_DEFAULT_TFLOPS = 100.0
+def gpu_tflops(name: str) -> float:
+    """Look up the peak bf16 tensor TFLOPS of GPU class ``name``.
+    A name with no entry in ``GPU_COMPUTE_TFLOPS`` yields ``_DEFAULT_TFLOPS`` instead of raising."""
+    try:
+        return GPU_COMPUTE_TFLOPS[name]
+    except KeyError:
+        return _DEFAULT_TFLOPS
+def gpu_hourly_usd(name: str, provider: str | None = None) -> float:
+    """Representative $/hr for GPU class ``name``, priced on ``provider`` when one is given.
+    ``GpuClass.hourly_usd`` is the nominal RunPod rate and is WRONG for a provider-specific quote
+    (e.g. a Lambda RTX A6000 is $1.09/hr, not RunPod's $0.49). For ``lambda``/``hyperstack`` --
+    matched case-insensitively, ignoring surrounding whitespace -- a class that provider offers is
+    priced through that provider's pricing module (live with a static fallback). Every other case
+    (runpod/auto/None, or a class the provider does not offer) returns the nominal rate.
+    Raises ``KeyError`` when ``name`` is not a known GPU class.
+    """
+    gpu = GPU_INFO.get(name)
+    if gpu is None:
+        raise KeyError(f"unknown GPU class {name!r}")
+    wanted = provider.strip().lower() if provider else ""
+    if wanted == "lambda":
+        if gpu.lambda_name:
+            # Imported on demand, only when a Lambda quote is actually requested.
+            from flash.providers.lambdalabs.pricing import hourly_rate as lambda_rate
+            return lambda_rate(name)
+    elif wanted == "hyperstack":
+        if gpu.hyperstack_name:
+            # Imported on demand, only when a Hyperstack quote is actually requested.
+            from flash.providers.hyperstack.pricing import hourly_rate as hyperstack_rate
+            return hyperstack_rate(name)
+    return gpu.hourly_usd
+def gpu_vram_gb(name: str) -> int:
+    """The ``vram_gb`` recorded for GPU class ``name``; ``KeyError`` for an unknown class."""
+    gpu = GPU_INFO.get(name)
+    if gpu is not None:
+        return gpu.vram_gb
+    raise KeyError(f"unknown GPU class {name!r}")
+def pick_gpu(required_vram_gb: int, *, provider: str | None = None) -> str:
+    """Cheapest GPU class that fits ``required_vram_gb``, ranked by static $/hr.
+    No pin; every fitting class is eligible, validated or not. NOTE this is intentionally
+    gate-free: the submit-time allocator restricts to the validated pool, so the
+    actually-provisioned class can be pricier than the one priced here. ``provider`` restricts
+    candidates to what it can provision; it is matched case-insensitively with surrounding
+    whitespace ignored, and ``None``/empty/``"auto"`` all mean "no restriction".
+    Returns the chosen class name. Raises ``ValueError`` when no eligible class is large enough.
+    """
+    # Normalize exactly like gpu_hourly_usd / RunConfig so "Lambda" or " lambda " selects the same
+    # pool as "lambda" instead of silently filtering out every candidate.
+    wanted = (provider or "").strip().lower() or "auto"
+    unrestricted = wanted == "auto"
+    def _selectable(g: GpuClass) -> bool:
+        return unrestricted or wanted in providers_for(g.name)
+    candidates = [g for g in GPU_INFO.values() if g.vram_gb >= required_vram_gb and _selectable(g)]
+    if not candidates:
+        raise ValueError(f"no GPU class fits >= {required_vram_gb} GB")
+    # Rank by the rate on the REQUESTED provider so a provider-specific quote picks that provider's
+    # cheapest fit (not the cheapest by the RunPod nominal rate). VRAM then name break ties so the
+    # choice is deterministic.
+    best = min(candidates, key=lambda g: (gpu_hourly_usd(g.name, provider=wanted), g.vram_gb, g.name))
+    return best.name
+# ===== Model-size facts (catalog-only; five dense text models, no MoE/open-model sizing) =====
+def total_params_b(model_id: str) -> float:
+    """The curated ``params_b`` stat (total parameters, in billions) of a catalog model.
+    Raises ``ValueError`` listing the catalog ids when ``model_id`` is not in the catalog."""
+    entry = MODELS.get(model_id)
+    if entry is not None:
+        return entry.params_b
+    known = ", ".join(MODELS)
+    raise ValueError(
+        f"unknown model {model_id!r}; cost estimation supports catalog models only "
+        f"({known})"
+    )
+def model_quant(model_id: str) -> str:
+    """Quantization recorded on the catalog entry, defaulting to ``"bf16"``.
+    The default applies both to a model missing from the catalog and to an entry whose ``quant``
+    is unset/empty."""
+    entry = MODELS.get(model_id)
+    if entry is None:
+        return "bf16"
+    return entry.quant or "bf16"
+def download_weight_gb(model_id: str) -> float:
+    """GB fetched from the HF hub at cold start: the full bf16 checkpoint at 2 bytes per parameter."""
+    bytes_per_param = 2.0
+    # params are in billions, so billions-of-params x bytes-per-param is already GB.
+    return total_params_b(model_id) * bytes_per_param
+# ===== Reward-grader latency (GRPO) =====
+# A single average grader latency (s/completion) for every env. Graders span ~0.01s (regex/math)
+# to ~3s (LLM judge/code); ~1s is a middle-of-the-road default (a run can override it).
+AVG_REWARD_SECONDS_PER_COMPLETION = 1.0
+def reward_seconds_per_completion(override: float | None = None) -> float:
+    """Seconds spent scoring one completion.
+    With no ``override`` this is the single average above; an explicit ``override`` is used as
+    given, except that anything not strictly above zero is clamped to ``0.0``."""
+    if override is None:
+        return AVG_REWARD_SECONDS_PER_COMPLETION
+    return override if override > 0.0 else 0.0

flash/cost/spec.py ADDED Viewed

@@ -0,0 +1,113 @@
+"""Map a parsed training ``JobSpec`` to a cost ``RunConfig`` / step count / estimate.
+Used by ``flash train --cost`` for a pre-flight quote. The control plane bills completed runs
+from their final recorded ``cost_usd`` instead of charging this estimate at submit time."""
+from __future__ import annotations
+from flash.cost.analytical import estimate_cost
+from flash.cost.types import CostEstimate, RunConfig
+# Fallback SFT dataset size when an uncapped run's env can't be counted locally. Most Freesolo
+# training datasets land in the low-thousands of rows; this is a representative middle estimate
+# so the quote is in the right ballpark rather than hard-failing.
+DEFAULT_UNCOUNTED_SFT_EXAMPLES = 1000
+def count_env_examples(env_id: str, params: dict | None = None) -> int | None:
+    """Training rows in ``env_id``'s dataset (the worker's train split), or ``None`` if it can't
+    be loaded or counted. Best-effort -- prices an uncapped SFT run on the real dataset size, not
+    a guess.
+    Loading may need network access for managed Freesolo environments. If the environment
+    cannot be loaded in this interpreter, or its dataset has no usable length, this returns
+    ``None`` and the caller falls back to a default count instead of hard-failing.
+    ``params`` is forwarded to ``load_environment`` (``None`` is sent as an empty dict)."""
+    if not env_id:
+        return None
+    try:
+        from flash.envs import load_environment
+        rows = load_environment(env_id, params or {}).dataset()
+        # len() stays inside the guard: a dataset object without ``__len__`` raises TypeError,
+        # which must degrade to "uncountable" like any other load failure, not crash the quote.
+        return len(rows) if rows is not None else None
+    except Exception:
+        # Deliberate catch-all: this is a pre-flight estimate, so any failure means "unknown size".
+        return None
+def spec_steps(spec) -> int:
+    """Per-seed optimizer steps implied by a train spec (mirrors the worker). GRPO: ``train.steps``
+    (else recipe default). SFT: ``epochs x ceil(num_examples / realized_batch)`` capped by
+    ``max_steps``, where ``num_examples`` is ``max_examples`` if pinned else the real env size.
+    An explicit GRPO ``train.steps`` and the SFT count are both floored at 1; the GRPO recipe
+    default (``RECIPE.rl.num_steps``) is returned as-is."""
+    from flash.catalog import vocab_size_for
+    from flash.engine.recipe import RECIPE
+    from flash.engine.vram import resolve_params_b, sft_logits_fused, sft_realized_batch
+    t = spec.train
+    if spec.algorithm == "grpo":
+        if t.steps is not None:
+            return max(1, int(t.steps))
+        return RECIPE.rl.num_steps
+    # --- SFT ---
+    cap = int(t.max_steps) if t.max_steps else 0  # SFT-only optimizer-step cap (0 = uncapped)
+    epochs = int(t.epochs) if t.epochs is not None else RECIPE.sft.num_epochs
+    requested_batch = int(t.batch_size) if t.batch_size is not None else RECIPE.sft.effective_batch
+    # Mirror the worker's per-device micro-batch EXACTLY, incl. the big-vocab logits cap: when the
+    # fused CE is OFF the worker vocab-sizes the micro-batch (engine.worker), which (with CEIL'd
+    # grad-accum) can change the realized global batch and thus the step count. Feed the same
+    # seq/vocab/fused so the priced step count matches what actually runs.
+    sft_seq = (
+        int(t.max_length)
+        if t.max_length is not None
+        else (RECIPE.sft.max_seq_len_thinking if spec.thinking else RECIPE.sft.max_seq_len)
+    )
+    # Resolve params_b via the shared helper (catalog stat else HF safetensors for an open model) —
+    # the SAME resolution the worker's run_sft uses. The fused-CE decision (and thus the big-vocab
+    # micro-batch cap) hinges on the >=3B threshold, so an uncataloged >=3B model must not be priced
+    # as <3B (which would flip fused off, change the realized batch via the cap, and misprice the
+    # step count). Best-effort: no network -> None -> the prior <3B (cap-on) behavior.
+    sft_fused = sft_logits_fused(resolve_params_b(spec.model), sft_seq)
+    batch = sft_realized_batch(
+        requested_batch, seq_len=sft_seq, vocab=vocab_size_for(spec.model), fused=sft_fused
+    )
+    # max_examples is a CAP; 0 (like None) means "no cap" (worker trains the full dataset), so
+    # don't let max_examples=0 price a single step.
+    pinned_examples = int(t.max_examples) if t.max_examples else 0
+    if pinned_examples > 0:
+        examples = pinned_examples
+    else:
+        # No cap: the worker trains the FULL env dataset, so price its real size when we can
+        # count it. A managed Freesolo environment may not be reachable in this interpreter, so
+        # counting can return None. Fall back to a representative default instead of hard-failing.
+        examples = count_env_examples(spec.environment.id, spec.environment.params)
+        if examples is None:
+            examples = DEFAULT_UNCOUNTED_SFT_EXAMPLES
+    # -(-a // b) is integer ceil-division; max(1, ...) keeps at least one step even when the
+    # product is 0 (an empty dataset or epochs == 0).
+    n = max(1, -(-examples // batch) * epochs)  # epochs x ceil(examples / realized_batch)
+    # A positive max_steps cap bounds the count; cap == 0 means uncapped.
+    return min(n, cap) if cap > 0 else n
+def runconfig_from_spec(spec) -> RunConfig:
+    """Build the cost ``RunConfig`` for a parsed ``JobSpec``.
+    Every seed runs as its own job and pays the cold start again, so both the step total and the
+    setup repeat count are multiplied by the number of seeds (at least one). No GPU is pinned:
+    the estimate does its own cheapest-fit selection with provider="auto". The GRPO-only knobs
+    (completion length, group size) are forwarded only for a GRPO spec."""
+    train, gpu = spec.train, spec.gpu
+    seed_count = max(1, len(train.seeds or (0,)))
+    if spec.algorithm == "grpo":
+        completion_len, group_size = train.max_tokens, train.group_size
+    else:
+        completion_len = group_size = None
+    return RunConfig(
+        model_id=spec.model,
+        method=spec.algorithm,
+        steps=spec_steps(spec) * seed_count,
+        setup_repeats=seed_count,
+        seq_len=train.max_length,
+        completion_len=completion_len,
+        batch_size=train.batch_size,
+        group_size=group_size,
+        lora_rank=train.lora_rank,
+        thinking=spec.thinking,
+        provider="auto",
+        max_wall_seconds=gpu.max_wall_seconds,
+        environment=spec.environment.id or None,
+    )
+def estimate_for_spec(spec) -> CostEstimate:
+    """Pre-flight ``CostEstimate`` for a parsed training ``JobSpec``: map it to a ``RunConfig``,
+    then price that config."""
+    config = runconfig_from_spec(spec)
+    return estimate_cost(config)

flash/cost/types.py ADDED Viewed

@@ -0,0 +1,158 @@
+"""The estimator's I/O types: ``RunConfig`` (input) and ``CostEstimate`` (result)."""
+from __future__ import annotations
+from dataclasses import dataclass, replace
+from flash.catalog import normalize_algorithm
+from flash.engine.recipe import RECIPE
+from flash.providers import PROVIDER_NAMES
+@dataclass(frozen=True)
+class RunConfig:
+    """Estimator input: a single training run to price.
+    Knobs left as ``None`` are filled from the recipe defaults by ``normalized()``."""
+    model_id: str
+    method: str  # "sft" | "grpo"
+    steps: int
+    # Number of cold starts the bill covers. Every seed of a multi-seed run reprovisions (and pays
+    # boot again), so this equals the seed count.
+    setup_repeats: int = 1
+    # Engine context length -- sent as [train].max_length, NOT the prompt length. Left unset, GRPO
+    # sizes it like the worker: max(1024, max_prompt_len + completion); see normalized().
+    seq_len: int | None = None
+    completion_len: int | None = None  # GRPO only (max_tokens)
+    batch_size: int | None = None  # SFT effective batch / GRPO prompts_per_step
+    group_size: int | None = None  # GRPO completions per prompt (G)
+    lora_rank: int | None = None
+    thinking: bool = False
+    # GRPO only: seconds to score one completion. None -> the single average grader latency.
+    reward_seconds_per_completion: float | None = None
+    max_wall_seconds: int | None = None  # per-seed wall cap (spec gpu.max_wall_seconds); None = 24h
+    provider: str = "auto"
+    environment: str | None = None  # Freesolo environment id; descriptive only
+    def __post_init__(self) -> None:
+        """Canonicalize ``method``/``provider`` and reject values that would yield a bogus quote."""
+        # The dataclass is frozen, so canonical values are written via object.__setattr__.
+        object.__setattr__(self, "method", normalize_algorithm(self.method))
+        # Same normalization as the allocator (case/whitespace, empty -> "auto"). An unknown
+        # substrate is refused here; otherwise it would filter out every candidate and surface
+        # as a confusing "no GPU fits".
+        provider = (self.provider or "auto").strip().lower() or "auto"
+        known = ("auto", *PROVIDER_NAMES)
+        if provider not in known:
+            raise ValueError(f"unknown provider {self.provider!r} (auto, {', '.join(PROVIDER_NAMES)})")
+        object.__setattr__(self, "provider", provider)
+        for attr in ("steps", "setup_repeats"):
+            value = getattr(self, attr)
+            if value < 1:
+                raise ValueError(f"{attr} must be >= 1, got {value}")
+        # Steps are divided evenly among seeds; a remainder would mean pricing fractional steps
+        # per seed, which no real run has.
+        if self.steps % self.setup_repeats:
+            raise ValueError(
+                f"steps ({self.steps}) must be a multiple of setup_repeats ({self.setup_repeats})"
+            )
+        # Positive-only knobs: 0 or a negative value gives a bogus quote. max_wall_seconds is
+        # deliberately absent -- the runner floors it to max(60, ...) and estimate_cost mirrors
+        # that, so a non-positive cap is accepted (floored to 60s) rather than rejected.
+        positive_only = ("seq_len", "batch_size", "group_size", "completion_len", "lora_rank")
+        for attr in positive_only:
+            value = getattr(self, attr)
+            if value is not None and value < 1:
+                raise ValueError(f"{attr} must be >= 1, got {value}")
+    @property
+    def is_grpo(self) -> bool:
+        """True when the (normalized) method is GRPO."""
+        return self.method == "grpo"
+    def normalized(self) -> RunConfig:
+        """Return a copy in which each ``None`` knob takes this method's recipe default.
+        For SFT the GRPO-only knobs (``completion_len``, ``group_size``) are set to ``None``."""
+        lora = RECIPE.lora.rank if self.lora_rank is None else self.lora_rank
+        if not self.is_grpo:
+            if self.seq_len is not None:
+                seq = self.seq_len
+            elif self.thinking:
+                seq = RECIPE.sft.max_seq_len_thinking
+            else:
+                seq = RECIPE.sft.max_seq_len
+            batch = RECIPE.sft.effective_batch if self.batch_size is None else self.batch_size
+            return replace(self, seq_len=seq, completion_len=None, batch_size=batch, group_size=None, lora_rank=lora)
+        comp = self.completion_len
+        if comp is None:
+            if self.thinking:
+                comp = RECIPE.rl.max_completion_len_thinking
+            else:
+                comp = RECIPE.rl.max_completion_len
+        # An explicit pin wins. Otherwise follow the allocator's GRPO sizing of an unset
+        # max_length: max(1024, max_prompt_len + completion) -- bare max_prompt_len under-sizes.
+        if self.seq_len is not None:
+            seq = self.seq_len
+        else:
+            seq = max(1024, RECIPE.rl.max_prompt_len + int(comp))
+        batch = RECIPE.rl.prompts_per_step if self.batch_size is None else self.batch_size
+        group = RECIPE.rl.group_size if self.group_size is None else self.group_size
+        return replace(self, seq_len=seq, completion_len=comp, batch_size=batch, group_size=group, lora_rank=lora)
+    def train_knobs(self) -> dict[str, int]:
+        """Knob dict in the shape ``model_required_vram_gb`` consumes.
+        ``batch_size`` is forwarded only when it was given EXPLICITLY: an omitted SFT batch is
+        sized as the worker's micro-batch (4), not the recipe's effective batch (32), which
+        would over-provision. The other knobs come from the normalized config."""
+        resolved = self.normalized()
+        optional = (
+            ("batch_size", self.batch_size),
+            ("max_length", resolved.seq_len),
+            ("max_tokens", resolved.completion_len),
+            ("group_size", resolved.group_size),
+        )
+        knobs: dict[str, int] = {"lora_rank": resolved.lora_rank}
+        knobs.update((key, value) for key, value in optional if value is not None)
+        return knobs
+@dataclass(frozen=True)
+class CostEstimate:
+    """Result of a pre-flight estimate. ``total_usd`` = ``wall_clock_hours * gpu_hourly_usd``, with
+    no multiplier applied."""
+    model_id: str
+    method: str
+    steps: int
+    gpu: str
+    provider: str
+    gpu_vram_gb: int
+    required_vram_gb: int
+    gpu_hourly_usd: float
+    setup_seconds: float  # cold start: boot + deps + model load (+ vLLM init for GRPO)
+    seconds_per_step: float
+    train_seconds: float  # steps * seconds_per_step (post wall-clock cap)
+    wall_clock_seconds: float
+    wall_capped: bool
+    total_usd: float
+    notes: tuple[str, ...] = ()
+    @property
+    def wall_clock_hours(self) -> float:
+        """``wall_clock_seconds`` expressed in hours."""
+        return self.wall_clock_seconds / 3600.0
+    def breakdown(self) -> str:
+        """Render the estimate as an itemized, newline-joined block for CLI output.
+        One row per figure, followed by a ``Notes`` section only when there are notes."""
+        vllm_suffix = " + vLLM init" if self.method == "grpo" else ""
+        cap_suffix = "  [CAPPED at the wall-clock limit]" if self.wall_capped else ""
+        setup_min = self.setup_seconds / 60
+        train_min = self.train_seconds / 60
+        rows = []
+        rows.append(f"Run        : {self.model_id}  [{self.method.upper()}, {self.steps} steps]")
+        rows.append(
+            f"GPU        : {self.gpu} on {self.provider} "
+            f"({self.gpu_vram_gb} GB; run needs >= {self.required_vram_gb} GB) "
+            f"@ ${self.gpu_hourly_usd:.2f}/hr"
+        )
+        rows.append(f"Setup      : {setup_min:.1f} min (cold start: boot + deps + model load{vllm_suffix})")
+        rows.append(f"Per step   : {self.seconds_per_step:.2f} s")
+        rows.append(f"Train      : {train_min:.1f} min{cap_suffix}")
+        rows.append(f"Wall clock : {self.wall_clock_hours:.2f} h")
+        rows.append(f"TOTAL      : ${self.total_usd:.2f}")
+        if self.notes:
+            rows.append("Notes      :")
+            for note in self.notes:
+                rows.append(f"  - {note}")
+        return "\n".join(rows)

flash/engine/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Fine-tuning internals for the Flash package.
+This subpackage holds the shared recipe, data loaders, graders, run accounting,
+and the on-GPU worker entrypoint. The RunPod provider invokes ``flash.engine.worker``
+on the provisioned GPU.
+"""

flash/engine/accounting.py ADDED Viewed

@@ -0,0 +1,36 @@
+"""Cost accounting + the standard run-metrics record for Flash runs.
+GPU cost = gpu_hours * hourly_rate (per-second billing on RunPod; artifacts go via HF).
+"""
+from __future__ import annotations
+import json
+from dataclasses import asdict, dataclass, field
+@dataclass
+class RunMetrics:
+    """Standard metrics record written per phase/seed.
+    Serializes to JSON via ``to_json`` and to disk via ``save``."""
+    arm: str = "runpod"  # compute substrate
+    phase: str = ""  # "sft" | "rl"
+    seed: int = 0
+    model_id: str = ""
+    # Speed
+    wall_seconds: float = 0.0
+    setup_seconds: float = 0.0  # cold start / provisioning + model load
+    train_throughput_toks_per_s: float = 0.0
+    # Token accounting
+    train_tokens: int = 0
+    generated_tokens: int = 0  # RL: total sampled completion tokens
+    # Misc / friction. cost_usd is computed/stamped downstream by the runner from the
+    # provider's $/hr (see runner._persist_metrics), not by the worker.
+    notes: dict = field(default_factory=dict)
+    def to_json(self) -> str:
+        """The record as a 2-space-indented JSON object (fields in declaration order)."""
+        return json.dumps(asdict(self), indent=2)
+    def save(self, path: str) -> None:
+        """Write ``to_json()`` to ``path``, replacing any existing file."""
+        # Explicit encoding: the default is the platform locale, which makes the bytes on disk
+        # depend on where the worker happens to run.
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(self.to_json())

flash/engine/chalk_kernels.py ADDED Viewed

@@ -0,0 +1,116 @@
+"""Optional chalk GPU kernels (the ``freesolo-chalk`` package).
+Chalk holds Freesolo's hand-written Triton/CUDA kernels that complement Liger: the RoPE the
+qwen3.5 hybrid arch needs (Liger refuses it), the LoRA-delta matmul, fused MLP, the QKV
+norm+RoPE attention epilogue, embedding gather, and FP8 frozen-base GEMMs.
+Chalk ships a Liger-style one-call entry point, ``apply_chalk_kernel_to_qwen35(model, ...)``,
+mirroring ``apply_liger_kernel_to_qwen3``: enablement is the call itself (no env flag), each kernel
+is a boolean keyword, and it NEVER raises on a kernel failure (every installer self-tests +
+arch-gates and falls back to the eager/Liger path; a no-op off-GPU). flash applies it
+AUTOMATICALLY — like Liger — after the trainer builds the model, with the **gap-filling** kernels
+Liger leaves on the eager path ON BY DEFAULT: RoPE, the LoRA-delta matmul, and embedding gather.
+The kernels that OVERLAP Liger (fused MLP / SwiGLU — Liger owns MLP) or are situational (the
+eval-only QKV epilogue, the Hopper-only FP8 frozen base) stay OPT-IN.
+Liger is applied by TRL (``use_liger_kernel``); chalk composes ON TOP of the live Liger modules,
+so flash calls chalk with ``liger=False``. Kernel selection is FIXED (deterministic): the
+gap-fillers run and the overlapping/situational kernels stay off — there is no env override. If
+``freesolo-chalk`` isn't installed (no ``FLASH_CHALK_SPEC``, or on the control plane) the whole
+module degrades to a no-op.
+"""
+from __future__ import annotations
+from collections.abc import Mapping
+from flash._logging import get_logger
+log = get_logger(__name__)
+# Chalk kernel table: (apply_chalk_kernel_to_qwen35 keyword, enabled). Selection is FIXED — there
+# is no env override; the values here are exactly what runs on every supported run.
+# The GAP-FILLERS that complement Liger are ON — applied automatically like apply_liger_kernel —
+# because each chalk installer self-tests on install and falls back to the eager/Liger path on any
+# failure, so always-applying them is safe:
+#   * rope             — the RoPE Liger REFUSES on the qwen3.5 hybrid arch (its only real gap)
+#   * fused_lora_delta — the LoRA-delta matmul on the trainable path (Liger doesn't touch adapters)
+#   * fused_embedding  — the embedding gather (Liger doesn't touch it)
+# The OVERLAPPING / situational kernels stay OFF: the fused MLP overlaps Liger's SwiGLU (Liger owns
+# MLP), the attn epilogue is eval-only (needs q/k/v out of LORA_TARGETS), and the FP8 frozen base is
+# Hopper sm_90+ only. The keyword is exactly chalk's apply_chalk_kernel_to_qwen35 kwarg.
+_KERNELS: list[tuple[str, bool]] = [
+    ("rope", True),
+    ("fused_lora_delta", True),
+    ("fused_embedding", True),
+    ("fused_mlp", False),  # off (Liger owns MLP/SwiGLU)
+    ("attn_epilogue", False),  # off (eval-only; needs q/k/v out of LoRA)
+    ("fp8_frozen_base", False),  # off (Hopper sm_90+ only)
+]
+def _enabled_kwargs() -> dict[str, bool]:
+    """Fresh ``{keyword: enabled}`` dict of the fixed ``apply_chalk_kernel_to_qwen35`` booleans,
+    built from ``_KERNELS`` in table order (gap-fillers on, the rest off)."""
+    return {keyword: enabled for keyword, enabled in _KERNELS}
+def active_kernels(report: Mapping[str, object] | None) -> list[str]:
+    """Sorted names of the chalk kernels that ENGAGED according to an apply report.
+    An entry counts as engaged when its value is not equal to ``False`` or ``None`` and is not a
+    dict carrying an ``"error"`` key. The ``liger`` entry is always skipped (TRL applies Liger;
+    chalk's report carries it as False here). A ``None`` or empty report gives ``[]``.
+    Meant for a metrics note recording which kernels ran, so chalk engagement can be verified
+    without the console.
+    """
+    if not report:
+        return []
+    engaged = []
+    for kernel, outcome in report.items():
+        if kernel == "liger":
+            continue
+        if outcome in (False, None):
+            continue
+        if isinstance(outcome, dict) and "error" in outcome:
+            continue
+        engaged.append(kernel)
+    engaged.sort()
+    return engaged
+def install_chalk_kernels(model=None) -> dict:
+    """Apply chalk's gap-filling kernels to ``model`` — ON by default (like Liger).
+    Uses chalk's Liger-style entry point ``apply_chalk_kernel_to_qwen35(model, liger=False, ...)``:
+    Liger is already applied by TRL (``use_liger_kernel``), so chalk composes on top of the live
+    Liger modules. Each kernel is a fixed boolean (gap-fillers on, the rest off). Returns chalk's
+    per-kernel report, or ``{}`` when there is no model yet, freesolo-chalk isn't installed, the
+    import/apply fails, or chalk hands back something that is not a mapping.
+    chalk's apply patches the LIVE module, so the worker calls this AFTER TRL builds the trainer
+    (``model=trainer.model``); ``model is None`` is a safe no-op kept for defensive callers.
+    This hook never raises: every failure is logged and turned into ``{}``.
+    """
+    if model is None:
+        # chalk's apply patches the materialized module -> nothing to do before the model is built.
+        return {}
+    kwargs = _enabled_kwargs()
+    try:
+        from chalk.transformers import apply_chalk_kernel_to_qwen35
+    except ImportError:
+        # chalk is installed by default (PyPI; chalk_extra_pip), so this only fires if an install
+        # was disabled/failed. Always safe: the kernels degrade to the eager/Liger path. Only the
+        # post-build call reaches this import (the pre-build pass returns early), so it logs at most
+        # once per run — no per-process dedup needed.
+        log.info(
+            "freesolo-chalk is not installed on this worker (set FLASH_CHALK_SPEC to an installable "
+            "spec, or check the default PyPI install); chalk kernels off, using eager/Liger."
+        )
+        return {}
+    except Exception as e:
+        # A partially-installed / version-incompatible chalk can raise non-ImportError errors at
+        # import time (e.g. a Triton/torch mismatch). This hook must never abort training.
+        log.warning("chalk import failed (ignored, kernels disabled): %s", e)
+        return {}
+    try:
+        # liger=False: TRL already applied Liger (use_liger_kernel); chalk composes on the live
+        # Liger modules. apply_chalk_kernel_to_qwen35 never raises on a per-kernel failure, but
+        # guard the call itself so a chalk API/version skew can never abort training.
+        report = apply_chalk_kernel_to_qwen35(model, liger=False, **kwargs)
+    except Exception as e:  # never block training on the optional kernel stack
+        log.warning("chalk apply failed (ignored, kernels disabled): %s", e)
+        return {}
+    if not report:
+        return {}
+    if not isinstance(report, Mapping):
+        # Same version-skew concern as above, on the RETURN side: a report that is not a mapping
+        # would blow up in active_kernels() (.items()) and abort training from a logging helper.
+        log.warning("chalk returned an unexpected report type (ignored): %s", type(report).__name__)
+        return {}
+    active = active_kernels(report)
+    if active:
+        log.info("chalk kernels active: %s", ", ".join(active))
+    return report