PyPI - freesolo-flash-dev - Versions diffs - 0.2.25__py3-none-any.whl - Mend

freesolo-flash-dev 0.2.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (111) hide show

flash/__init__.py +29 -0
flash/_channel.py +23 -0
flash/_fileio.py +35 -0
flash/_logging.py +49 -0
flash/_update_check.py +266 -0
flash/catalog.py +253 -0
flash/cli/__init__.py +1 -0
flash/cli/main/__init__.py +227 -0
flash/cli/main/__main__.py +6 -0
flash/cli/main/commands.py +636 -0
flash/cli/main/envpush.py +317 -0
flash/cli/main/render.py +599 -0
flash/cli/main/training_doc.py +455 -0
flash/client/__init__.py +14 -0
flash/client/config.py +70 -0
flash/client/http.py +372 -0
flash/client/runtime_secrets.py +69 -0
flash/client/specs.py +20 -0
flash/cost/__init__.py +16 -0
flash/cost/analytical.py +175 -0
flash/cost/facts.py +114 -0
flash/cost/spec.py +113 -0
flash/cost/types.py +158 -0
flash/engine/__init__.py +6 -0
flash/engine/accounting.py +36 -0
flash/engine/chalk_kernels.py +116 -0
flash/engine/multiturn_rollout.py +780 -0
flash/engine/recipe.py +86 -0
flash/engine/vram.py +603 -0
flash/engine/worker/__init__.py +2916 -0
flash/engine/worker/__main__.py +4 -0
flash/engine/worker/kernel_warmup.py +400 -0
flash/engine/worker/lora.py +796 -0
flash/engine/worker/packing.py +366 -0
flash/engine/worker/perf.py +1048 -0
flash/envs/__init__.py +10 -0
flash/envs/adapter/__init__.py +883 -0
flash/envs/adapter/rubric.py +222 -0
flash/envs/base.py +52 -0
flash/envs/registry.py +62 -0
flash/mcp/__init__.py +1 -0
flash/mcp/server.py +85 -0
flash/providers/__init__.py +59 -0
flash/providers/_auth.py +24 -0
flash/providers/_http.py +230 -0
flash/providers/_instance.py +416 -0
flash/providers/_instance_bootstrap.py +517 -0
flash/providers/_poll.py +311 -0
flash/providers/allocator.py +193 -0
flash/providers/base.py +431 -0
flash/providers/hyperstack/__init__.py +127 -0
flash/providers/hyperstack/api.py +522 -0
flash/providers/hyperstack/auth.py +17 -0
flash/providers/hyperstack/gpus.py +29 -0
flash/providers/hyperstack/jobs/__init__.py +632 -0
flash/providers/hyperstack/jobs/builders.py +122 -0
flash/providers/hyperstack/preflight.py +23 -0
flash/providers/hyperstack/pricing.py +26 -0
flash/providers/hyperstack/train.py +25 -0
flash/providers/lambdalabs/__init__.py +139 -0
flash/providers/lambdalabs/api.py +261 -0
flash/providers/lambdalabs/auth.py +18 -0
flash/providers/lambdalabs/gpus.py +29 -0
flash/providers/lambdalabs/jobs/__init__.py +724 -0
flash/providers/lambdalabs/jobs/builders.py +118 -0
flash/providers/lambdalabs/preflight.py +27 -0
flash/providers/lambdalabs/pricing.py +51 -0
flash/providers/lambdalabs/train.py +27 -0
flash/providers/preflight.py +55 -0
flash/providers/realized.py +80 -0
flash/providers/runpod/__init__.py +130 -0
flash/providers/runpod/api.py +186 -0
flash/providers/runpod/auth.py +37 -0
flash/providers/runpod/cost.py +57 -0
flash/providers/runpod/gpus.py +46 -0
flash/providers/runpod/jobs.py +956 -0
flash/providers/runpod/keys.py +139 -0
flash/providers/runpod/preflight.py +30 -0
flash/providers/runpod/preload.py +915 -0
flash/providers/runpod/pricing.py +18 -0
flash/providers/runpod/slots.py +79 -0
flash/providers/runpod/train/__init__.py +150 -0
flash/providers/runpod/train/deps.py +395 -0
flash/providers/runpod/train/endpoints.py +820 -0
flash/py.typed +0 -0
flash/runner/__init__.py +686 -0
flash/runner/checkpoints.py +82 -0
flash/runner/deploy.py +422 -0
flash/runner/lifecycle.py +672 -0
flash/schema/__init__.py +375 -0
flash/schema/fields.py +331 -0
flash/serve/__init__.py +1 -0
flash/serve/deploy.py +326 -0
flash/serve/pricing.py +60 -0
flash/server/__init__.py +1 -0
flash/server/__main__.py +20 -0
flash/server/app.py +961 -0
flash/server/auth.py +263 -0
flash/server/billing.py +124 -0
flash/server/checkpoints.py +110 -0
flash/server/db.py +160 -0
flash/server/environment_registry.py +102 -0
flash/server/envs.py +360 -0
flash/server/reconcile.py +163 -0
flash/server/run_registry.py +150 -0
flash/spec.py +333 -0
freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0

flash/providers/_poll.py ADDED Viewed

@@ -0,0 +1,311 @@
+"""Shared poll-loop scaffolding for provider job pollers.
+Poll loops share a timestamped ``say()`` logger, a consecutive-poll-error retry/give-up
+counter, and the heartbeat progress-surfacing block (key on (stage, step, ts), log
+``worker: stage=… step=… reward=…``). Only those neutral pieces live here; each poller
+keeps its own status/terminal handling inline.
+"""
+from __future__ import annotations
+import os
+import re
+import time
+from collections.abc import Callable
+from typing import Any
+# Grace past a preload box's embedded wall deadline before an orphan sweep reaps it. A healthy warm
+# self-bounds at its wall cap (the in-box timer ``os._exit``s) and the driver's ``finally`` terminates
+# the instance; a box still alive THIS long past its deadline has lost its driver (the only thing that
+# tears instance providers down), so it is provably orphaned and safe to reap. Generous so clock skew /
+# a slow teardown / a near-deadline box mid-download is never reaped early.
+PRELOAD_REAP_GRACE_S = 1800.0
+# The ``-d<epoch>-`` deadline token written by ``preload_instance_run_id``. The 10+ digit guard keeps a
+# region segment like ``us-east-1`` from being mistaken for the token. Compiled once at import.
+_PRELOAD_DEADLINE_RE = re.compile(r"-d(\d{10,})-")
+def preload_instance_run_id(provider: str, region: str, reap_deadline_epoch: int, suffix: str) -> str:
+    """Build a ``flash-preload-*`` run id that embeds its wall-clock reap deadline (``-d<epoch>-``).
+    The epoch lets an orphan sweep reap a driver-lost warm box by NAME alone (no provider creation-time
+    field needed). ``reap_deadline_epoch`` is the box's wall-cap deadline in epoch seconds; it is
+    truncated to an int. ``region`` is lower-cased; ``provider`` and ``suffix`` are used verbatim.
+    Kept in sync with ``preload_box_reap_due``'s parser — change both together."""
+    return f"flash-preload-{provider}-{region.lower()}-d{int(reap_deadline_epoch)}-{suffix}"
+def preload_box_reap_due(name: str, now: float, grace_s: float = PRELOAD_REAP_GRACE_S) -> bool:
+    """True when a ``flash-preload-*`` instance name carries an embedded reap deadline (``-d<epoch>-``,
+    written by ``preload_instance_run_id``) that elapsed more than ``grace_s`` ago.
+    Used by the Lambda/Hyperstack orphan sweeps: warm boxes are normally driver-owned and exempt, but a
+    driver that died before its ``terminate_run_instances`` finally would leave one billing forever.
+    Reaping past deadline+grace bounds that leak. Names WITHOUT a parseable deadline (legacy launches)
+    return False — the unconditional driver-owned exemption still applies to them. A name that is not
+    a string at all (e.g. ``None`` from a provider listing) is treated the same way: it returns False
+    instead of raising, so one malformed entry cannot abort a sweep and is never reaped.
+    ``now`` is the current time in epoch seconds; the comparison is strict (exactly deadline+grace is
+    not yet due)."""
+    if not isinstance(name, str):
+        # re.search would raise TypeError here; "cannot parse" must mean "do not reap".
+        return False
+    m = _PRELOAD_DEADLINE_RE.search(name)
+    if not m:
+        return False
+    return float(m.group(1)) + grace_s < now
+def make_say(log) -> Callable[[str], None]:
+    """Build a ``say(msg)`` callable that prints ``[HH:MM:SS] msg`` lines to ``log``.
+    ``log`` is passed to ``print(file=...)``; when it is None the returned callable discards every
+    message. Each line is flushed as soon as it is written."""
+    def say(msg: str) -> None:
+        if log is None:
+            return
+        stamp = time.strftime("%H:%M:%S")
+        print(f"[{stamp}] {msg}", file=log, flush=True)
+    return say
+class PollErrorTracker:
+    """Tracks a streak of consecutive poll failures and tells the caller when to stop retrying.
+    Shared retry block for the pollers: each transient fetch error is logged through ``say``;
+    once ``max_errors`` failures have happened in a row the caller is told to give up, otherwise
+    the tracker sleeps a linear backoff (``interval_s * streak``, capped at 60 s) before the
+    caller retries. ``reset()`` clears the streak.
+    """
+    def __init__(self, say: Callable[[str], None], interval_s: float, max_errors: int = 8) -> None:
+        self._say, self._interval_s, self._max_errors = say, interval_s, max_errors
+        self._count = 0
+    def reset(self) -> None:
+        """Clear the consecutive-error streak."""
+        self._count = 0
+    def record(self, exc: Exception) -> bool:
+        """Log one poll error and either back off or signal give-up.
+        Returns True when the streak has reached ``max_errors`` (caller should give up, no sleep);
+        otherwise sleeps the backoff and returns False (caller should ``continue``)."""
+        self._count += 1
+        self._say(f"poll error ({self._count}): {exc}")
+        if self._count < self._max_errors:
+            backoff_s = min(60, self._interval_s * self._count)
+            time.sleep(backoff_s)
+            return False
+        return True
+def _num(value: Any) -> float | None:
+    """Coerce ``value`` to a float, or return None when it has no usable numeric form.
+    None, values ``float()`` rejects (``TypeError`` / ``ValueError``) and integers too large to fit
+    in a float (``OverflowError``, e.g. a huge int decoded from JSON) all yield None, so callers
+    formatting telemetry can treat "missing" and "malformed" alike instead of crashing."""
+    if value is None:
+        return None
+    try:
+        return float(value)
+    except (TypeError, ValueError, OverflowError):
+        return None
+def _fmt_float(value: Any, digits: int = 3) -> str | None:
+    """Render ``value`` as fixed-point text with ``digits`` decimals; None when ``_num`` yields None."""
+    parsed = _num(value)
+    return None if parsed is None else f"{parsed:.{digits}f}"
+def _fmt_gb(value: Any) -> str | None:
+    """Render ``value`` with one decimal and a ``GB`` suffix (e.g. ``12.3GB``); None when ``_num`` yields None."""
+    parsed = _num(value)
+    return None if parsed is None else f"{parsed:.1f}GB"
+def _fmt_pct(value: Any) -> str | None:
+    """Render ``value`` with no decimals and a ``%`` suffix (e.g. ``87%``); None when ``_num`` yields None."""
+    parsed = _num(value)
+    return None if parsed is None else f"{parsed:.0f}%"
+def _fmt_watts(value: Any) -> str | None:
+    """Render ``value`` with no decimals and a ``W`` suffix (e.g. ``250W``); None when ``_num`` yields None."""
+    parsed = _num(value)
+    return None if parsed is None else f"{parsed:.0f}W"
+def _short_process_name(name: str) -> str:
+    """Return the last path component of a process name, or ``"process"`` when nothing is left.
+    A falsy ``name`` is treated as empty; surrounding whitespace is stripped before the basename
+    is taken."""
+    cleaned = str(name or "").strip()
+    leaf = os.path.basename(cleaned)
+    if not leaf:
+        return "process"
+    return leaf
+def format_gpu_status(gpu: Any) -> str:
+    """One-line GPU telemetry summary appended to heartbeat log lines.
+    Returns ``" gpu[...]"`` (note the leading space) with space-separated fields, or ``""`` when
+    ``gpu`` is not a non-empty dict or yields no fields. Fields are emitted in a fixed order and
+    only when present: device name, driver / cuda versions, utilisation percentages, memory
+    (``mem=used/total``, else ``free=free/total``), torch allocated[/reserved], temperature,
+    power[/limit], pstate, and up to the first three process entries. When none of those produce
+    output, the raw ``nvidia_smi`` text (else ``nvidia_smi_err``) is used, truncated to 160 chars."""
+    if not (isinstance(gpu, dict) and gpu):
+        return ""
+    fields: list[str] = []
+    device = gpu.get("device_name") or gpu.get("name")
+    if device:
+        fields.append(str(device))
+    # Version strings are shown verbatim.
+    for label, key in (("driver", "driver_version"), ("cuda", "torch_cuda")):
+        raw = gpu.get(key)
+        if raw:
+            fields.append(f"{label}={raw}")
+    for label, key in (("util", "gpu_util_pct"), ("mem_util", "mem_util_pct")):
+        pct = _fmt_pct(gpu.get(key))
+        if pct:
+            fields.append(f"{label}={pct}")
+    # Device memory: both variants need the total; prefer used/total over free/total.
+    mem_used = _fmt_gb(gpu.get("memory_used_gb"))
+    mem_total = _fmt_gb(gpu.get("memory_total_gb"))
+    mem_free = _fmt_gb(gpu.get("memory_free_gb"))
+    if mem_total:
+        if mem_used:
+            fields.append(f"mem={mem_used}/{mem_total}")
+        elif mem_free:
+            fields.append(f"free={mem_free}/{mem_total}")
+    allocated = _fmt_gb(gpu.get("torch_memory_allocated_gb"))
+    reserved = _fmt_gb(gpu.get("torch_memory_reserved_gb"))
+    if allocated:
+        fields.append(f"torch={allocated}/{reserved}" if reserved else f"torch={allocated}")
+    celsius = _num(gpu.get("temperature_c"))
+    if celsius is not None:
+        fields.append(f"temp={celsius:.0f}C")
+    draw = _fmt_watts(gpu.get("power_w"))
+    cap = _fmt_watts(gpu.get("power_limit_w"))
+    if draw:
+        fields.append(f"power={draw}/{cap}" if cap else f"power={draw}")
+    pstate = gpu.get("pstate")
+    if pstate:
+        fields.append(f"pstate={pstate}")
+    procs = gpu.get("processes")
+    if isinstance(procs, list) and procs:
+        labels = []
+        # Only the first three entries are considered; non-dict entries are skipped.
+        for entry in procs[:3]:
+            if not isinstance(entry, dict):
+                continue
+            label = _short_process_name(str(entry.get("process_name") or ""))
+            pid = entry.get("pid")
+            if pid is not None:
+                label = f"{label}:{pid}"
+            proc_mem = _fmt_gb(entry.get("used_memory_gb"))
+            if proc_mem:
+                label = f"{label}:{proc_mem}"
+            labels.append(label)
+        if labels:
+            fields.append("procs=" + ",".join(labels))
+    if not fields:
+        # Nothing structured was available: fall back to the raw nvidia-smi text or its error.
+        for key in ("nvidia_smi", "nvidia_smi_err"):
+            if gpu.get(key):
+                fields.append(str(gpu[key])[:160])
+                break
+    if not fields:
+        return ""
+    return " gpu[" + " ".join(fields) + "]"
+def _format_heartbeat(hb: dict) -> str:
+    """Build the ``worker: stage=… step=… reward=…`` progress line for one heartbeat.
+    Starts with the heartbeat's ``stage`` and appends, in a fixed order, each known metric that is
+    present (not None): numbers are rendered with a per-key number of decimals (``step`` as an
+    integer), anything else verbatim. The GPU summary from ``format_gpu_status`` (fed ``hb["gpu"]``,
+    else ``hb["diag"]``) is appended last.
+    A number that cannot be rendered in its fixed format — a NaN/inf ``step`` (``int()`` raises) or
+    an int too large to format as a float — is shown verbatim instead of raising: this line is
+    diagnostic output and must not take down the poll loop that emits it."""
+    msg = f"worker: stage={hb.get('stage')}"
+    for key, digits in (
+        ("step", 0),
+        ("epoch", 3),
+        ("reward", 3),
+        ("loss", 4),
+        ("grad_norm", 3),
+        ("learning_rate", 8),
+        ("setup_seconds", 1),
+        ("train_wall", 1),
+    ):
+        value = hb.get(key)
+        if value is None:
+            continue
+        if isinstance(value, (int, float)):
+            try:
+                rendered = f"{int(value)}" if digits == 0 else f"{value:.{digits}f}"
+            except (ValueError, OverflowError):
+                # int(nan) -> ValueError, int(inf) / format(huge_int, "f") -> OverflowError.
+                rendered = f"{value}"
+        else:
+            rendered = f"{value}"
+        msg += f" {key}={rendered}"
+    msg += format_gpu_status(hb.get("gpu") or hb.get("diag"))
+    return msg
+def _record_heartbeat(hb: dict) -> None:
+    """Best-effort hand-off of a heartbeat to ``flash.runner.record_heartbeat``.
+    Does nothing when the heartbeat has no non-blank ``run_id``. Any failure while importing or
+    calling the recorder is swallowed on purpose."""
+    run_id = str(hb.get("run_id") or "").strip()
+    if run_id:
+        try:
+            from flash.runner import record_heartbeat
+            record_heartbeat(run_id, hb)
+        except Exception:
+            # Status persistence is diagnostic only; polling/liveness must not depend on it.
+            return
+def surface_heartbeat(
+    heartbeat_reader: Callable[[], Any] | None,
+    last_hb_key: tuple | None,
+    say: Callable[[str], None],
+) -> tuple[tuple | None, str | None]:
+    """Poll the heartbeat once and log worker progress if it moved.
+    Returns ``(hb_key, stage)``. When the heartbeat advanced, ``hb_key`` is its new
+    ``(stage, step, ts)`` key and ``stage`` is its stage; the heartbeat is also recorded and its
+    formatted line is passed to ``say``. In every other case — no reader, the reader raised, it
+    returned nothing, or the key equals ``last_hb_key`` — the result is ``(last_hb_key, None)`` and
+    nothing is logged. Callers use the returned ``stage`` for their own setup-vs-training stall
+    bookkeeping.
+    """
+    unchanged = (last_hb_key, None)
+    if heartbeat_reader is None:
+        return unchanged
+    try:
+        hb = heartbeat_reader()
+    except Exception:
+        # A failed read is treated exactly like "no heartbeat yet".
+        return unchanged
+    if not hb:
+        return unchanged
+    key = (hb.get("stage"), hb.get("step"), hb.get("ts"))
+    if key == last_hb_key:
+        return unchanged
+    _record_heartbeat(hb)
+    stage = hb.get("stage")
+    say(_format_heartbeat(hb))
+    return key, stage
+def heartbeat_progress_ts(hb_key: tuple | None, launch_ts: float | None) -> tuple[float, bool]:
+    """Wall-clock time to credit as "last worker progress" for a just-surfaced heartbeat, and
+    whether that heartbeat belongs to THIS attempt.
+    The credited time is the heartbeat's OWN ``ts`` (``hb_key[2]``, when the worker actually made
+    progress), not the poll time. On a delayed reattach after a control-plane restart, a heartbeat
+    that was already stale BEFORE the restart must not buy a fresh full stall window — crediting
+    the poll time would give a hung worker another grace period while the instance keeps billing.
+    The value is clamped to ``[launch, now]`` so worker/control-plane clock skew can neither make
+    a healthy worker look ancient (premature stall) nor place its progress in the future.
+    Returns ``(ts, fresh)``. ``fresh`` is False when the heartbeat's ts predates this attempt's
+    launch: that is a LEFTOVER heartbeat from a prior attempt (retries reuse the same seed
+    heartbeat path), so the caller must NOT treat it as current progress — otherwise a stale
+    training-stage heartbeat would arm the tighter training stall window and fail a healthy new
+    attempt mid-setup before it has overwritten the old file. When ``hb_key`` carries no usable
+    ts (not a 3+ tuple, or a ts that is not float-convertible) the result is ``(now, False)``.
+    ``launch_ts`` is tested by truthiness (not ``is not None``): the instance handles store
+    started_ts as a non-Optional float coerced to 0.0 when missing, so 0.0 means "unknown launch"
+    (a real launch is a large epoch ts). With launch UNKNOWN heartbeats cannot be dated relative
+    to it, so the clamp floor drops to 0.0 and every heartbeat counts as fresh (the safe default:
+    don't discard progress we can't date — flooring at ``now`` instead would mark every normal
+    heartbeat, timestamped before it is read, stale and stall a healthy recovered worker)."""
+    now = time.time()
+    has_ts = isinstance(hb_key, tuple) and len(hb_key) >= 3
+    raw_ts = hb_key[2] if has_ts else None
+    try:
+        hb_ts = float(raw_ts)
+    except (TypeError, ValueError):
+        return now, False
+    # Unknown launch (None or 0.0) -> floor 0.0, so every heartbeat counts as fresh.
+    floor = float(launch_ts) if launch_ts else 0.0
+    credited = min(now, max(floor, hb_ts))
+    return credited, hb_ts >= floor
+def surface_forced_heartbeat(
+    heartbeat_reader: Callable[..., Any] | None,
+    last_hb_key: tuple | None,
+    say: Callable[[str], None],
+) -> tuple[tuple | None, str | None]:
+    """Surface the latest heartbeat via a forced read (``heartbeat_reader(force=True)``).
+    Bypasses reader rate limits. Used on terminal provider statuses so a fast worker failure still
+    leaves the last worker/GPU snapshot in both the run log and status JSON. Returns the same
+    ``(hb_key, stage)`` pair as ``surface_heartbeat``; with no reader it is ``(last_hb_key, None)``.
+    """
+    if heartbeat_reader is None:
+        return last_hb_key, None
+    def forced_read() -> Any:
+        return heartbeat_reader(force=True)
+    return surface_heartbeat(forced_read, last_hb_key, say)

flash/providers/allocator.py ADDED Viewed

@@ -0,0 +1,193 @@
+"""GPU allocation: the cheapest fitting class across the active providers.
+Given a base model (+ algorithm), compute the VRAM the FULL run needs — sized for the
+heavier phase, GRPO, since the typical pipeline is SFT followed by GRPO — then rank every
+fitting candidate by $/hr and pick the cheapest:
+  runpod      every validated Flash-provisionable class (static $/hr)
+  lambda      every fitting class that currently has LIVE regional capacity (live $/hr); opt-in,
+              available only when LAMBDA_API_KEY is set on the control plane
+  hyperstack  every fitting class whose single-GPU flavor currently has STOCK (static $/hr); opt-in,
+              available only when HYPERSTACK_API_KEY is set on the control plane
+RunPod's cheaper static rates almost always win on price, so the instance providers (Lambda,
+Hyperstack) join the ranked list as capacity COMPLEMENTS: when RunPod's cheapest fitting class is
+out of capacity (THROTTLED / queue backstop), the runner's gpu-walk steps down the ranked list and
+reaches an in-capacity instance class. Both instance providers are capacity-filtered up front
+(``_lambda_candidates`` / ``_hyperstack_candidates`` only offer a class a region/flavor can supply
+right now), so the walk never lands on a class that would just fail to launch.
+Allocation happens at SUBMIT time in the runner. The parse-time resolution in schema is a
+RunPod-static provisional for validation/dry-run display.
+"""
+from __future__ import annotations
+from flash._logging import get_logger
+from flash.providers import PROVIDER_NAMES, available_providers, get_provider
+from flash.providers.base import (
+    Allocation,
+    Candidate,
+    UnsupportedGpuError,
+)
+logger = get_logger(__name__)
+# "Comfortably" = the open-model VRAM estimate plus headroom, so a full SFT+GRPO run
+# never lands in check_fit's "tight" band by construction. Curated catalog entries
+# already carry measured minimums and are used as-is. The headroom (default 1.1 ==
+# model_required_vram_gb's own default) is read at call time via vram_headroom() so allocate()
+# and the parse-time provisional_gpu size identically.
+def vram_headroom() -> float:
+    """Return the VRAM sizing headroom multiplier (currently the constant 1.1).
+    Both the submit-time allocator and the parse-time provisional_gpu are meant to read it
+    through this function so they never disagree."""
+    headroom = 1.1
+    return headroom
+def required_vram_gb(
+    model_id: str,
+    algorithm: str,
+    *,
+    train=None,
+    thinking: bool = False,
+) -> int:
+    """VRAM (GB) the full run needs, as computed by the shared ``model_required_vram_gb`` matrix.
+    ``train`` and ``thinking`` are forwarded unchanged so the requirement is sized to the run's
+    actual knobs (context length, LoRA rank, batch / group size, thinking); the headroom comes
+    from ``vram_headroom()``. Per the matrix's contract, catalog GRPO floors stay hard floors
+    (never under-provision a validated model) while the matrix sizes up for big contexts/groups
+    and down to a cheaper card for small runs; unlisted open models size from HF metadata and
+    fall back to the 24 GB tier when unreadable (handled inside model_required_vram_gb)."""
+    # Imported at call time rather than at module import.
+    from flash.engine.vram import model_required_vram_gb
+    sizing = {"train": train, "thinking": thinking, "headroom": vram_headroom()}
+    return model_required_vram_gb(model_id, algorithm, **sizing)
+def _runpod_candidates(need: int) -> list[Candidate]:
+    """RunPod candidates: every validated class with at least ``need`` GB, at the provider's hourly rate.
+    Restricted to the validated pool (``g.validated``): the deployed control plane rejects a submit
+    for any non-validated class, so allocating one would only fail at submit time.
+    """
+    provider = get_provider("runpod")
+    fitting: list[Candidate] = []
+    for gpu_class in provider.gpu_classes():
+        if gpu_class.vram_gb < need or not gpu_class.validated:
+            continue
+        rate = provider.hourly_rate(gpu_class.name)
+        fitting.append(Candidate("runpod", gpu_class.name, rate, gpu_class.vram_gb))
+    return fitting
+def _lambda_candidates(need: int) -> list[Candidate]:
+    """Lambda candidates: fitting classes for which ``usable_instances`` reports something, at the
+    provider's hourly rate.
+    Capacity-aware by design: a Lambda class with no region advertising capacity is EXCLUDED, so
+    the allocator never hands the runner a Lambda class that would immediately fail to launch (and
+    burn a retry). Any exception raised while listing classes, checking capacity or pricing
+    (no key / network blip) is logged as a warning and the whole provider is dropped (empty list),
+    so allocation degrades to the other providers instead of failing.
+    """
+    from flash.providers.lambdalabs.jobs import usable_instances
+    provider = get_provider("lambda")
+    try:
+        # usable_instances reads the cached /instance-types, so only the first call hits the API.
+        return [
+            Candidate("lambda", g.name, provider.hourly_rate(g.name), g.vram_gb)
+            for g in provider.gpu_classes()
+            if g.vram_gb >= need and usable_instances(g.name)
+        ]
+    except Exception as exc:
+        logger.warning("lambda capacity lookup failed (%s); allocating without lambda", exc)
+        return []
+def _hyperstack_candidates(need: int) -> list[Candidate]:
+    """Hyperstack candidates: fitting classes for which ``usable_instances`` reports something, at
+    the provider's hourly rate.
+    Capacity-aware, exactly like Lambda: a class with no in-stock flavor is excluded so the runner
+    never walks onto a class that would immediately fail to launch. Any exception raised while
+    listing classes, checking stock or pricing is logged as a warning and the whole provider is
+    dropped (empty list), so allocation degrades to the other providers.
+    """
+    from flash.providers.hyperstack.jobs import usable_instances
+    provider = get_provider("hyperstack")
+    try:
+        # usable_instances reads the cached /core/flavors, so only the first call hits the API.
+        return [
+            Candidate("hyperstack", g.name, provider.hourly_rate(g.name), g.vram_gb)
+            for g in provider.gpu_classes()
+            if g.vram_gb >= need and usable_instances(g.name)
+        ]
+    except Exception as exc:
+        logger.warning("hyperstack capacity lookup failed (%s); allocating without hyperstack", exc)
+        return []
+def allocate(
+    model_id: str,
+    algorithm: str,
+    *,
+    train=None,
+    thinking: bool = False,
+) -> Allocation:
+    """Choose the cheapest (provider, GPU class) pair that fits the job.
+    There is no GPU pin — every fitting class on every available provider is eligible, and the
+    cheapest wins. RunPod is restricted to its validated pool (``GpuClass.validated``) because the
+    deployed control plane rejects a submit for any non-validated class; the instance providers
+    (Lambda via LAMBDA_API_KEY, Hyperstack via HYPERSTACK_API_KEY — both opt-in) each contribute
+    their fitting classes that currently have live capacity/stock. RunPod's cheaper static rates
+    usually win, with Lambda and Hyperstack joining as capacity complements lower in the ranked list.
+    ``train``/``thinking`` size the requirement to the run's actual knobs (context, group, rank,
+    batch) via the matrix.
+    Returns an ``Allocation`` for the winner whose ``candidates`` holds the full ranked list.
+    Raises ``UnsupportedGpuError`` when no provider contributes a candidate.
+    """
+    need = required_vram_gb(model_id, algorithm, train=train, thinking=thinking)
+    available = available_providers()
+    # Candidate sources, consulted in this fixed order and only when the provider is available.
+    sources = (
+        ("runpod", _runpod_candidates),
+        ("lambda", _lambda_candidates),
+        ("hyperstack", _hyperstack_candidates),
+    )
+    candidates: list[Candidate] = []
+    for provider_name, list_candidates in sources:
+        if provider_name in available:
+            candidates.extend(list_candidates(need))
+    if not candidates:
+        raise UnsupportedGpuError(
+            f"no allocatable GPU (>= {need} GB VRAM for {model_id}) on any available provider "
+            f"({', '.join(available) or '(none)'}); the run genuinely exceeds every active GPU class"
+        )
+    # Cheapest first; equal rates prefer less VRAM (don't burn a big card on a small job),
+    # then registry order (providers missing from the registry sort last).
+    registry_rank = {name: idx for idx, name in enumerate(PROVIDER_NAMES)}
+    def sort_key(c: Candidate) -> tuple:
+        return (c.hourly_usd, c.vram_gb, registry_rank.get(c.provider, 99))
+    ranked = sorted(candidates, key=sort_key)
+    cheapest = ranked[0]
+    return Allocation(
+        provider=cheapest.provider,
+        gpu=cheapest.gpu,
+        hourly_usd=cheapest.hourly_usd,
+        min_vram_gb=need,
+        candidates=tuple(ranked),
+    )
+def allocation_summary(a: Allocation) -> str:
+    """One-line human-readable description of an allocation.
+    Names the chosen GPU, provider, hourly rate and VRAM requirement; when the ranked candidate
+    list has more than one entry, the second entry is appended as the next-best option."""
+    summary = (
+        f"allocated {a.gpu} on {a.provider} at ${a.hourly_usd:.2f}/hr "
+        f"(need >= {a.min_vram_gb} GB VRAM)"
+    )
+    if len(a.candidates) <= 1:
+        return summary
+    runner_up = a.candidates[1]
+    return summary + f"; next-best: {runner_up.gpu}@{runner_up.provider} ${runner_up.hourly_usd:.2f}/hr"