PyPI - freesolo-flash - Versions diffs - 0.2.3__tar.gz → 0.2.4__tar.gz - Mend

freesolo-flash 0.2.3__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (165) hide show

{freesolo_flash-0.2.3 → freesolo_flash-0.2.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: freesolo-flash
-Version: 0.2.3
+Version: 0.2.4
 Summary: Flash — managed LoRA post-training (SFT/GRPO) for verifiers environments, driven by the `flash` CLI
 Project-URL: Homepage, https://github.com/freesolo-co/flash
 Project-URL: Repository, https://github.com/freesolo-co/flash
@@ -27,12 +27,14 @@ Requires-Dist: trl<1.7,>=1.6; extra == 'gpu'
 Requires-Dist: verifiers>=0.1.10; extra == 'gpu'
 Requires-Dist: vllm==0.19.1; extra == 'gpu'
 Provides-Extra: server
+Requires-Dist: datasets>=2.19; extra == 'server'
 Requires-Dist: fastapi; extra == 'server'
 Requires-Dist: httpx>=0.27; extra == 'server'
 Requires-Dist: huggingface-hub>=0.34; extra == 'server'
 Requires-Dist: prime>=0.6.3; extra == 'server'
 Requires-Dist: runpod-flash; extra == 'server'
 Requires-Dist: uvicorn; extra == 'server'
+Requires-Dist: verifiers>=0.1.10; extra == 'server'
 Description-Content-Type: text/markdown
 # Flash

{freesolo_flash-0.2.3 → freesolo_flash-0.2.4}/flash/__init__.py RENAMED Viewed

@@ -8,4 +8,4 @@ GPU (RunPod or Vast.ai) behind the scenes.
 __all__ = ["__version__"]
-__version__ = "0.2.3"
+__version__ = "0.2.4"

{freesolo_flash-0.2.3 → freesolo_flash-0.2.4}/flash/catalog.py RENAMED Viewed

@@ -64,6 +64,9 @@ class ModelInfo:
     # the raw tokenizer count). Drives the GRPO fp32-logits memory term and the per-device
     # completion cap. Curated per model below; defaults to the open-model fallback.
     vocab_size: int = _DEFAULT_VOCAB_SIZE
+    # Total parameters in billions — the numeric model size the cost estimator reads directly
+    # (no parsing of the ``params`` display string). Curated per catalog model below.
+    params_b: float = 0.0
     def to_dict(self) -> dict[str, Any]:
         return asdict(self)
@@ -79,6 +82,7 @@ MODELS: dict[str, ModelInfo] = {
         id="openbmb/MiniCPM5-1B",
         display_name="MiniCPM5 1B",
         params="1.2B dense (Llama arch)",
+        params_b=1.2,
         vocab_size=130_560,
         algos=("sft", "grpo"),
         min_vram_gb=12,
@@ -95,6 +99,7 @@ MODELS: dict[str, ModelInfo] = {
         id="Qwen/Qwen3.5-0.8B",
         display_name="Qwen3.5 0.8B",
         params="0.9B (text-only fine-tune)",
+        params_b=0.9,
         vocab_size=248_320,
         algos=("sft", "grpo"),
         min_vram_gb=12,
@@ -106,6 +111,7 @@ MODELS: dict[str, ModelInfo] = {
         id="Qwen/Qwen3.5-2B",
         display_name="Qwen3.5 2B",
         params="2.3B (text-only fine-tune)",
+        params_b=2.3,
         vocab_size=248_320,
         algos=("sft", "grpo"),
         min_vram_gb=16,
@@ -116,6 +122,7 @@ MODELS: dict[str, ModelInfo] = {
         id="Qwen/Qwen3.5-4B",
         display_name="Qwen3.5 4B",
         params="4.7B (text-only fine-tune)",
+        params_b=4.7,
         vocab_size=248_320,
         algos=("sft", "grpo"),
         min_vram_gb=32,
@@ -128,6 +135,7 @@ MODELS: dict[str, ModelInfo] = {
         id="Qwen/Qwen3.5-9B",
         display_name="Qwen3.5 9B",
         params="9.7B (text-only fine-tune)",
+        params_b=9.7,
         vocab_size=248_320,
         algos=("sft", "grpo"),
         min_vram_gb=16,

{freesolo_flash-0.2.3 → freesolo_flash-0.2.4}/flash/cli/main/__init__.py RENAMED Viewed

@@ -137,6 +137,11 @@ def main(argv: list[str] | None = None) -> int:
         help="override a config value; repeatable",
     )
     train.add_argument("--dry-run", action="store_true")
+    train.add_argument(
+        "--cost",
+        action="store_true",
+        help="print the pre-flight USD cost for the config and exit (no submit)",
+    )
     train.add_argument(
         "--background",
         action="store_true",

{freesolo_flash-0.2.3 → freesolo_flash-0.2.4}/flash/cli/main/commands.py RENAMED Viewed

@@ -26,6 +26,7 @@ from flash.client import (
 )
 from flash.client.config import load_credentials
 from flash.client.specs import spec_payload
+from flash.cost.spec import runconfig_from_spec
 from flash.runner import TERMINAL_STATES, new_run_id
 from flash.schema import ConfigError, spec_from_file
@@ -262,12 +263,30 @@ def cmd_env_list(args) -> int:
     return 0
+def _cmd_train_cost(args) -> int:
+    """Handle ``flash train --cost``: print the pre-flight USD cost for the config, then exit.
+
+    Nothing is submitted. The estimate is catalog-only and deterministic; an uncapped SFT run
+    loads the env to count its train split. Always returns exit code 0.
+    """
+    from flash.cost import estimate_cost
+    parse_kwargs = {
+        "run_id": None,
+        "overrides": args.overrides,
+        "extra_configs": args.extra_configs,
+    }
+    job_spec = spec_from_file(args.config, **parse_kwargs)
+    run_config = runconfig_from_spec(job_spec)
+    estimate = estimate_cost(run_config)
+    print(estimate.breakdown())
+    return 0
 def cmd_train(args) -> int:
+    if getattr(args, "cost", False):
+        return _cmd_train_cost(args)
     spec = spec_from_file(
         args.config,
         run_id=new_run_id() if args.dry_run else None,
-        overrides=getattr(args, "overrides", None),
-        extra_configs=getattr(args, "extra_configs", None),
+        overrides=args.overrides,
+        extra_configs=args.extra_configs,
     )
     if args.dry_run:
         # Fully local: validate the id-based config without credentials, a server, or a GPU.

freesolo_flash-0.2.4/flash/cost/__init__.py ADDED Viewed

@@ -0,0 +1,16 @@
+"""Flash training-cost estimator: a deterministic, equation-based pre-flight estimate
+(``estimate_cost``) of cost = wall-clock hours x market $/hr. No output multiplier."""
+from __future__ import annotations
+from .analytical import estimate_cost
+from .spec import estimate_for_spec, runconfig_from_spec
+from .types import CostEstimate, RunConfig
+__all__ = [
+    "CostEstimate",
+    "RunConfig",
+    "estimate_cost",
+    "estimate_for_spec",
+    "runconfig_from_spec",
+]

freesolo_flash-0.2.4/flash/cost/analytical.py ADDED Viewed

@@ -0,0 +1,160 @@
+"""The analytical cost model: total = wall-clock hours x GPU $/hr, where wall = cold-start
+setup + steps x per-step time (a FLOPs/MFU estimate). GRPO splits each step into a vLLM
+rollout + reward grading + policy/reference update."""
+from __future__ import annotations
+import math
+from flash.providers.allocator import required_vram_gb, vram_headroom
+from .facts import (
+    download_weight_gb,
+    gpu_tflops,
+    gpu_vram_gb,
+    model_quant,
+    pick_gpu,
+    realized_hourly_usd,
+    reward_seconds_per_completion,
+    total_params_b,
+)
+from .types import CostEstimate, RunConfig
+# FLOPs per token per active-parameter.
+SFT_FLOPS_PER_TOKEN_PER_PARAM = 6.0  # forward (2) + backward (4)
+GRPO_GEN_FLOPS_PER_TOKEN_PER_PARAM = 2.0  # autoregressive rollout forward
+GRPO_UPDATE_FLOPS_PER_TOKEN_PER_PARAM = 8.0  # policy fwd+bwd (6) + frozen-ref fwd (2)
+# Model-FLOPs utilization (fraction of peak sustained), calibrated against real RunPod/Vast
+# wall clock. LoRA + small batches sit well below dense-pretraining MFU.
+MFU_TRAIN = 0.35  # GRPO policy/reference update
+MFU_SFT_TRAIN = 0.25  # SFT fwd/bwd (smaller effective batch, long sequences)
+MFU_DECODE = 0.12  # batched vLLM rollout (decode is memory-bandwidth-bound)
+# Reward grading is CONCURRENT: a step's completions score in parallel slots, so the reward
+# wall is ceil(completions / slots) waves x latency, not completions x latency.
+REWARD_CONCURRENCY = 16.0
+# Cold-start overhead (seconds): container boot + deps + model download (+ vLLM init for GRPO).
+WORKER_BOOT_S = 180.0
+DEPS_INSTALL_S = 120.0
+VLLM_INIT_S = 120.0
+# NOTE: despite the "GBPS" name this is gigaBYTES per second -- ``setup_seconds`` divides a
+# download size in GB by it to get seconds.
+DOWNLOAD_RATE_GBPS = 0.4  # effective HF snapshot download (hf_transfer)
+# Wall cap (seconds) that ``estimate_cost`` uses when the run config sets no max_wall_seconds.
+DEFAULT_WALL_CAP_S = 24 * 3600  # spec gpu.max_wall_seconds default
+def _fmt_duration(seconds: float) -> str:
+    """Human duration for notes: seconds < 1m, minutes < 1h, else whole/1-decimal hours.
+
+    The unit is chosen on the ROUNDED value, so a duration never prints as "60s" or "60m":
+    59.6 s reads "1m" and 3599 s reads "1.0h".
+
+    Args:
+        seconds: Duration in seconds.
+
+    Returns:
+        A short label such as "45s", "12m", "24h" or "1.5h".
+    """
+    # round() and the ".0f" format both round half-to-even, so the unit test and the
+    # printed number always agree.
+    if round(seconds) < 60:
+        return f"{seconds:.0f}s"
+    minutes = seconds / 60
+    if round(minutes) < 60:
+        return f"{minutes:.0f}m"
+    hours = seconds / 3600
+    # Whole hours print without a decimal ("24h"); anything else keeps one decimal.
+    return f"{hours:.0f}h" if abs(hours - round(hours)) < 1e-9 else f"{hours:.1f}h"
+def setup_seconds(config: RunConfig) -> float:
+    """Seconds of cold-start wall time billed before the first optimizer step.
+
+    Sums worker boot, dependency install and the model download (size / download rate);
+    GRPO runs additionally pay the vLLM init time.
+    """
+    download_s = download_weight_gb(config.model_id) / DOWNLOAD_RATE_GBPS
+    cold_start = WORKER_BOOT_S + DEPS_INSTALL_S + download_s
+    return cold_start + VLLM_INIT_S if config.is_grpo else cold_start
+def seconds_per_step(config: RunConfig, gpu: str) -> float:
+    """Estimate the steady-state wall seconds of one optimizer step on ``gpu``.
+
+    Compute time is FLOPs / (peak FLOP/s x MFU). SFT is a single fwd+bwd pass over
+    ``batch_size x seq_len`` tokens. A GRPO step is the sum of the rollout, the concurrent
+    reward grading, and the policy/reference update.
+    """
+    cfg = config.normalized()
+    param_count = total_params_b(cfg.model_id) * 1e9
+    peak_flops = gpu_tflops(gpu) * 1e12  # TFLOPS -> FLOP/s
+    if cfg.is_grpo:
+        # Rollout produces group_size completions for each of batch_size prompts.
+        n_completions = cfg.batch_size * cfg.group_size
+        rollout_tokens = n_completions * cfg.completion_len
+        rollout_s = (GRPO_GEN_FLOPS_PER_TOKEN_PER_PARAM * param_count * rollout_tokens) / (
+            peak_flops * MFU_DECODE
+        )
+        policy_s = (GRPO_UPDATE_FLOPS_PER_TOKEN_PER_PARAM * param_count * rollout_tokens) / (
+            peak_flops * MFU_TRAIN
+        )
+        per_completion_s = reward_seconds_per_completion(cfg.reward_seconds_per_completion)
+        # ceil: a partially filled grading wave still costs one full latency.
+        waves = math.ceil(n_completions / REWARD_CONCURRENCY)
+        grading_s = waves * per_completion_s
+        return rollout_s + grading_s + policy_s
+    sft_flops = SFT_FLOPS_PER_TOKEN_PER_PARAM * param_count * (cfg.batch_size * cfg.seq_len)
+    return sft_flops / (peak_flops * MFU_SFT_TRAIN)
+def select_gpu(config: RunConfig) -> tuple[str, int]:
+    """Return ``(gpu_class, required_vram_gb)`` for the run: the cheapest class that fits.
+
+    Like the allocator, but with no pin and no validation gate. Catalog sizing is
+    offline/deterministic.
+    """
+    # Catalog-only gate: raises for a non-catalog model before any (HF) sizing happens.
+    total_params_b(config.model_id)
+    vram_needed = required_vram_gb(
+        config.model_id,
+        config.method,
+        train=config.train_knobs(),
+        thinking=config.thinking,
+    )
+    return pick_gpu(vram_needed, provider=config.provider), vram_needed
+def _notes(config: RunConfig, raw_train_s: float, wall_capped: bool, cap_s: float) -> tuple[str, ...]:
+    """Build the human-readable notes attached to a cost estimate.
+
+    Args:
+        config: The run being priced; read through ``config.normalized()`` except for
+            ``setup_repeats``.
+        raw_train_s: Training seconds before any wall-cap clamp (reported as "uncapped").
+        wall_capped: Whether training was clamped to the wall cap.
+        cap_s: The wall cap in seconds.
+
+    Returns:
+        The notes as a tuple, in the order they are appended below.
+    """
+    n = config.normalized()
+    notes: list[str] = []
+    # Only mention quantization when the catalog entry is not plain bf16.
+    if (quant := model_quant(n.model_id)) != "bf16":
+        notes.append(f"{quant}: smaller VRAM footprint -> cheaper GPU class fits")
+    if n.is_grpo:
+        comps = n.batch_size * n.group_size
+        rsec = reward_seconds_per_completion(n.reward_seconds_per_completion)
+        notes.append(
+            f"GRPO step = vLLM rollout of {n.batch_size}x{n.group_size}={comps} completions "
+            f"@ {n.completion_len} tok + reward ({rsec:.2f}s/completion"
+            + (f", env {n.environment}" if n.environment else "")
+            + ") + policy+reference update"
+        )
+    # presumably vram_headroom() is a >= 1 multiplier, so "- 1" is the extra fraction -- verify.
+    notes.append(f"GPU sized with {vram_headroom() - 1:.0%} VRAM headroom; market (spot/queue) $/hr")
+    if wall_capped:
+        # With more than one seed the cap applies to each seed's job, so say "per-seed".
+        per_seed = "" if config.setup_repeats == 1 else "per-seed "
+        notes.append(
+            f"training clamped to fit the {_fmt_duration(cap_s)} {per_seed}wall cap "
+            f"(after setup; uncapped: {_fmt_duration(raw_train_s)})"
+        )
+    return tuple(notes)
+def estimate_cost(config: RunConfig, *, wall_cap_s: float = DEFAULT_WALL_CAP_S) -> CostEstimate:
+    """Deterministic pre-flight cost estimate -- the analytical ground truth.
+
+    Prices one seed (cold-start setup + steps x seconds/step), clamps it to the wall cap,
+    multiplies by the seed count, and converts the wall-clock total to USD at the chosen
+    GPU's realized $/hr.
+
+    Args:
+        config: The run to price. ``config.steps`` is the total across seeds and
+            ``config.setup_repeats`` is the seed count.
+        wall_cap_s: Per-seed wall cap (seconds) used only when ``config.max_wall_seconds``
+            is None. The 60 s floor below applies to ``config.max_wall_seconds``, not to
+            this value.
+
+    Returns:
+        A ``CostEstimate`` whose setup/train/wall seconds are totals across all seeds.
+    """
+    gpu, need = select_gpu(config)
+    hourly = realized_hourly_usd(gpu)
+    # Mirror the runner's max(60, max_wall_seconds) floor so a sub-60s cap isn't underpriced.
+    cap_s = max(60.0, float(config.max_wall_seconds)) if config.max_wall_seconds is not None else wall_cap_s
+    # Each seed is its own job (own cold start + own wall cap): price one seed, clamp, x seeds.
+    # NOTE(review): assumes config.setup_repeats >= 1 (it is a divisor below) -- confirm that
+    # RunConfig enforces this.
+    seeds = config.setup_repeats
+    setup_per_seed = setup_seconds(config)
+    sps = seconds_per_step(config, gpu)
+    raw_train_per_seed = (config.steps / seeds) * sps
+    # The cap is on total per-seed wall; setup is billed too, so clamp training to fit it.
+    wall_capped = (setup_per_seed + raw_train_per_seed) > cap_s
+    # Setup alone can exceed the cap; then setup is clamped to the cap and training gets 0.
+    setup_per_seed = min(setup_per_seed, cap_s)
+    train_per_seed = max(0.0, cap_s - setup_per_seed) if wall_capped else raw_train_per_seed
+    setup, train = setup_per_seed * seeds, train_per_seed * seeds
+    wall = setup + train
+    return CostEstimate(
+        model_id=config.model_id,
+        method=config.method,
+        steps=config.steps,
+        gpu=gpu,
+        provider=config.provider,
+        gpu_vram_gb=gpu_vram_gb(gpu),
+        required_vram_gb=need,
+        gpu_hourly_usd=hourly,
+        setup_seconds=setup,
+        seconds_per_step=sps,
+        train_seconds=train,
+        wall_clock_seconds=wall,
+        wall_capped=wall_capped,
+        total_usd=wall / 3600.0 * hourly,
+        # The note reports the per-seed uncapped training time against the per-seed cap.
+        notes=_notes(config, raw_train_per_seed, wall_capped, cap_s),
+    )

freesolo_flash-0.2.4/flash/cost/facts.py ADDED Viewed

@@ -0,0 +1,126 @@
+"""Static lookup facts for the cost model: GPU price/VRAM/compute + cheapest-fit
+selection, model size/quant, and reward-grader latency. Pure tables + accessors."""
+from __future__ import annotations
+from flash.catalog import MODELS
+from flash.providers.base import GPU_INFO, GpuClass, providers_for
+# ===== GPU facts =====
+# Compute throughput in TFLOPS per GPU class name. Read through ``gpu_tflops``, which
+# documents the figures as peak bf16 tensor TFLOPS.
+GPU_COMPUTE_TFLOPS: dict[str, float] = {
+    "RTX A4000": 77.0,
+    "RTX 2000 Ada": 89.0,
+    "RTX A4500": 89.0,
+    "RTX 4000 Ada": 90.0,
+    "RTX A5000": 89.0,
+    "RTX 3090": 71.0,
+    "L4": 60.0,
+    "RTX Pro 4000": 95.0,
+    "RTX 4090": 165.0,
+    "RTX 5090": 210.0,
+    "RTX A6000": 155.0,
+    "A40": 150.0,
+    "RTX 6000 Ada": 182.0,
+    "L40S": 181.0,
+    "A100 SXM 40GB": 312.0,
+    "A100 PCIe": 312.0,
+    "A100 SXM": 312.0,
+    "H100 NVL": 835.0,
+    "H100": 990.0,
+    "RTX Pro 6000": 250.0,
+    "RTX Pro 6000 WK": 250.0,
+}
+# Fallback TFLOPS that ``gpu_tflops`` returns for a class missing from the table above.
+_DEFAULT_TFLOPS = 100.0
+def gpu_tflops(name: str) -> float:
+    """Peak bf16 tensor TFLOPS for a managed GPU class.
+
+    A class absent from ``GPU_COMPUTE_TFLOPS`` gets ``_DEFAULT_TFLOPS`` instead of an error.
+    """
+    if name in GPU_COMPUTE_TFLOPS:
+        return GPU_COMPUTE_TFLOPS[name]
+    return _DEFAULT_TFLOPS
+def gpu_hourly_usd(name: str) -> float:
+    """Static fallback (on-demand list) $/hr for a class.
+
+    Raises:
+        KeyError: ``name`` is not a known GPU class.
+    """
+    if (gpu_class := GPU_INFO.get(name)) is None:
+        raise KeyError(f"unknown GPU class {name!r}")
+    return gpu_class.hourly_usd
+# Realized (spot/queue) $/hr per class -- the observed market rate, which sits below the
+# on-demand list price (RTX 5090 lists $0.99, bills ~$0.87). ``realized_hourly_usd`` CLAMPS
+# to the list price so it can never over-quote; a class with no clean observed rate (i.e. no
+# entry here) falls back to list.
+REALIZED_HOURLY_USD: dict[str, float] = {
+    "RTX 3090": 0.239,
+    "RTX 4090": 0.426,
+    "RTX 5090": 0.871,
+    "RTX A5000": 0.304,
+    "RTX 6000 Ada": 0.601,
+    "A100 PCIe": 1.035,
+    "A100 SXM": 1.133,
+}
+def realized_hourly_usd(name: str) -> float:
+    """Market (spot/queue) $/hr for a class, never above its list price.
+
+    Falls back to the list price when no realized rate is recorded for ``name``.
+    """
+    ceiling = gpu_hourly_usd(name)
+    observed = REALIZED_HOURLY_USD.get(name, ceiling)
+    return min(observed, ceiling)
+def gpu_vram_gb(name: str) -> int:
+    """VRAM (GB) recorded in ``GPU_INFO`` for a GPU class.
+
+    Raises:
+        KeyError: ``name`` is not a known GPU class.
+    """
+    if (gpu_class := GPU_INFO.get(name)) is None:
+        raise KeyError(f"unknown GPU class {name!r}")
+    return gpu_class.vram_gb
+def pick_gpu(required_vram_gb: int, *, provider: str | None = None) -> str:
+    """Cheapest GPU class that fits ``required_vram_gb``, ranked by the REALIZED (market) $/hr it
+    is BILLED at (ties: vram, name) -- so selection is consistent with the bill and approximates
+    the allocator, which provisions the cheapest live offer. No pin and no validation gate -- every
+    fitting class is eligible. ``provider`` restricts candidates to what it can provision.
+
+    Args:
+        required_vram_gb: Minimum VRAM (GB) the class must have.
+        provider: ``None`` or ``"auto"`` for no restriction; otherwise only classes that
+            ``providers_for`` lists for this provider are considered.
+
+    Returns:
+        The name of the chosen GPU class.
+
+    Raises:
+        ValueError: No class fits. When a provider restriction is active the message names
+            it, so a provider-caused miss is not mistaken for a VRAM problem.
+    """
+    unrestricted = provider in (None, "auto")
+    candidates = [
+        g
+        for g in GPU_INFO.values()
+        if g.vram_gb >= required_vram_gb and (unrestricted or provider in providers_for(g.name))
+    ]
+    if not candidates:
+        # Keep the original wording for the unrestricted case; append the provider otherwise.
+        scope = "" if unrestricted else f" on provider {provider!r}"
+        raise ValueError(f"no GPU class fits >= {required_vram_gb} GB{scope}")
+    best = min(candidates, key=lambda g: (realized_hourly_usd(g.name), g.vram_gb, g.name))
+    return best.name
+# ===== Model-size facts (catalog-only; five dense text models, no MoE/open-model sizing) =====
+def total_params_b(model_id: str) -> float:
+    """Total parameter count (billions) for a catalog model -- the curated ``params_b`` stat.
+
+    Raises:
+        ValueError: ``model_id`` is not in the catalog, or its entry has no positive
+            ``params_b``. ``ModelInfo.params_b`` defaults to 0.0, and pricing on that would
+            silently yield zero compute time and a zero-byte download.
+    """
+    info = MODELS.get(model_id)
+    if info is None:
+        raise ValueError(
+            f"unknown model {model_id!r}; cost estimation supports catalog models only "
+            f"({', '.join(MODELS)})"
+        )
+    if info.params_b <= 0:
+        raise ValueError(
+            f"catalog model {model_id!r} has no curated params_b; cannot estimate its cost"
+        )
+    return info.params_b
+def model_quant(model_id: str) -> str:
+    """Quantization of the catalog entry (``"bf16"`` or ``"4bit-qlora"``).
+
+    Returns ``"bf16"`` when the model is not in the catalog or its ``quant`` is unset/empty.
+    """
+    entry = MODELS.get(model_id)
+    if entry is None:
+        return "bf16"
+    return entry.quant or "bf16"
+def download_weight_gb(model_id: str) -> float:
+    """GB pulled from the HF hub at cold start (full bf16 checkpoint, 2 bytes/param)."""
+    bytes_per_param = 2.0
+    # Billions of params x bytes per param = gigabytes.
+    return total_params_b(model_id) * bytes_per_param
+# ===== Reward-grader latency (GRPO) =====
+# One average grader latency (s/completion) shared by every env. Graders span ~0.01s
+# (regex/math) to ~3s (LLM judge/code); ~1s is a middle-of-the-road default that a run can
+# override.
+AVG_REWARD_SECONDS_PER_COMPLETION = 1.0
+def reward_seconds_per_completion(override: float | None = None) -> float:
+    """Per-completion reward latency in seconds.
+
+    Returns the explicit ``override`` (floored at 0) when given, else the single average.
+    """
+    if override is None:
+        return AVG_REWARD_SECONDS_PER_COMPLETION
+    return max(0.0, override)

freesolo_flash-0.2.4/flash/cost/spec.py ADDED Viewed

@@ -0,0 +1,87 @@
+"""Map a parsed training ``JobSpec`` to a cost ``RunConfig`` / step count / estimate.
+Shared by ``flash train --cost`` and the control plane's submit-time charge, so both price the
+same work on the same catalog-only, cheapest-fit basis."""
+from __future__ import annotations
+from flash.cost.analytical import estimate_cost
+from flash.cost.types import CostEstimate, RunConfig
+def count_env_examples(env_id: str, params: dict | None = None) -> int | None:
+    """Training rows in ``env_id``'s dataset (the worker's train split), or ``None`` if it can't
+    be loaded or counted. Best-effort -- prices an uncapped SFT run on the real dataset size,
+    not a guess.
+
+    Args:
+        env_id: Environment id; an empty id returns ``None`` without loading anything.
+        params: Optional environment parameters (``None`` is treated as ``{}``).
+
+    Returns:
+        The row count, or ``None`` when the environment cannot be imported/loaded, has no
+        train split, or its train split has no length (e.g. a streaming dataset).
+    """
+    if not env_id:
+        return None
+    try:
+        from flash.envs import load_environment
+        rows = load_environment(env_id, params or {}).dataset("train")
+    except Exception:  # deliberate best-effort: any load failure means "unknown size"
+        return None
+    if rows is None:
+        return None
+    try:
+        return len(rows)
+    except TypeError:
+        # An unsized split can't be counted; report "unknown" like any other failure
+        # instead of leaking a TypeError to the caller.
+        return None
+def spec_steps(spec) -> int:
+    """Per-seed optimizer steps implied by a train spec (mirrors the worker). GRPO: ``train.steps``
+    (else recipe default). SFT: ``epochs x ceil(num_examples / realized_batch)`` capped by
+    ``max_steps``, where ``num_examples`` is ``max_examples`` if pinned else the real env size.
+
+    Raises:
+        ValueError: An SFT spec with no positive ``max_examples`` whose environment cannot be
+            loaded to count its training examples.
+    """
+    from flash.engine.recipe import RECIPE
+    from flash.engine.vram import sft_realized_batch
+    t = spec.train
+    if spec.algorithm == "grpo":
+        if t.steps is not None:
+            # An explicit step count is floored at 1.
+            return max(1, int(t.steps))
+        return RECIPE.rl.num_steps
+    # --- SFT ---
+    cap = int(t.max_steps) if t.max_steps else 0  # SFT-only optimizer-step cap (0 = uncapped)
+    epochs = int(t.epochs) if t.epochs is not None else RECIPE.sft.num_epochs
+    requested_batch = int(t.batch_size) if t.batch_size is not None else RECIPE.sft.effective_batch
+    # NOTE(review): ``batch`` is a divisor below; presumably sft_realized_batch never
+    # returns 0 -- confirm.
+    batch = sft_realized_batch(requested_batch)
+    # max_examples is a CAP; 0 (like None) means "no cap" (worker trains the full dataset), so
+    # don't let max_examples=0 price a single step.
+    pinned_examples = int(t.max_examples) if t.max_examples else 0
+    if pinned_examples > 0:
+        examples = pinned_examples
+    else:
+        # No cap: the worker trains the FULL env dataset, so price its real size.
+        examples = count_env_examples(spec.environment.id, spec.environment.params)
+        if examples is None:
+            # NOTE(review): the hint below names an `slm` command, while the package
+            # summary says the CLI is `flash` -- confirm the command name is current.
+            raise ValueError(
+                f"could not load environment {spec.environment.id!r} to count its training "
+                f"examples for the cost; install it (`slm env install {spec.environment.id}`) "
+                "or pin [train].max_examples"
+            )
+    # -(-a // b) is integer ceil(a / b); the result is floored at one step.
+    n = max(1, -(-examples // batch) * epochs)  # epochs x ceil(examples / realized_batch)
+    return min(n, cap) if cap > 0 else n
+def runconfig_from_spec(spec) -> RunConfig:
+    """Translate a parsed ``JobSpec`` into the cost model's ``RunConfig``.
+
+    Every seed runs as its own job and re-pays the cold start, so both the step total and the
+    setup repeats scale with the seed count. No GPU is pinned: the estimate does its own
+    cheapest-fit selection (provider="auto"). GRPO-only knobs are left ``None`` for SFT.
+    """
+    train_cfg, gpu_cfg = spec.train, spec.gpu
+    grpo = spec.algorithm == "grpo"
+    # No seeds configured counts as a single seed.
+    seed_list = train_cfg.seeds or (0,)
+    n_seeds = max(1, len(seed_list))
+    total_steps = spec_steps(spec) * n_seeds
+    completion_len = train_cfg.max_tokens if grpo else None
+    group_size = train_cfg.group_size if grpo else None
+    return RunConfig(
+        model_id=spec.model,
+        method=spec.algorithm,
+        steps=total_steps,
+        setup_repeats=n_seeds,
+        seq_len=train_cfg.max_length,
+        completion_len=completion_len,
+        batch_size=train_cfg.batch_size,
+        group_size=group_size,
+        lora_rank=train_cfg.lora_rank,
+        thinking=spec.thinking,
+        provider="auto",
+        max_wall_seconds=gpu_cfg.max_wall_seconds,
+        environment=spec.environment.id or None,
+    )
+def estimate_for_spec(spec) -> CostEstimate:
+    """Return the pre-flight ``CostEstimate`` for a parsed training ``JobSpec``."""
+    run_config = runconfig_from_spec(spec)
+    return estimate_cost(run_config)

freesolo-flash 0.2.3__tar.gz → 0.2.4__tar.gz

freesolo-flash 0.2.3__tar.gz → 0.2.4__tar.gz