PyPI - freesolo-flash-dev - Versions diffs - 0.2.25__py3-none-any.whl - Mend

freesolo-flash-dev 0.2.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (111) hide show

flash/__init__.py +29 -0
flash/_channel.py +23 -0
flash/_fileio.py +35 -0
flash/_logging.py +49 -0
flash/_update_check.py +266 -0
flash/catalog.py +253 -0
flash/cli/__init__.py +1 -0
flash/cli/main/__init__.py +227 -0
flash/cli/main/__main__.py +6 -0
flash/cli/main/commands.py +636 -0
flash/cli/main/envpush.py +317 -0
flash/cli/main/render.py +599 -0
flash/cli/main/training_doc.py +455 -0
flash/client/__init__.py +14 -0
flash/client/config.py +70 -0
flash/client/http.py +372 -0
flash/client/runtime_secrets.py +69 -0
flash/client/specs.py +20 -0
flash/cost/__init__.py +16 -0
flash/cost/analytical.py +175 -0
flash/cost/facts.py +114 -0
flash/cost/spec.py +113 -0
flash/cost/types.py +158 -0
flash/engine/__init__.py +6 -0
flash/engine/accounting.py +36 -0
flash/engine/chalk_kernels.py +116 -0
flash/engine/multiturn_rollout.py +780 -0
flash/engine/recipe.py +86 -0
flash/engine/vram.py +603 -0
flash/engine/worker/__init__.py +2916 -0
flash/engine/worker/__main__.py +4 -0
flash/engine/worker/kernel_warmup.py +400 -0
flash/engine/worker/lora.py +796 -0
flash/engine/worker/packing.py +366 -0
flash/engine/worker/perf.py +1048 -0
flash/envs/__init__.py +10 -0
flash/envs/adapter/__init__.py +883 -0
flash/envs/adapter/rubric.py +222 -0
flash/envs/base.py +52 -0
flash/envs/registry.py +62 -0
flash/mcp/__init__.py +1 -0
flash/mcp/server.py +85 -0
flash/providers/__init__.py +59 -0
flash/providers/_auth.py +24 -0
flash/providers/_http.py +230 -0
flash/providers/_instance.py +416 -0
flash/providers/_instance_bootstrap.py +517 -0
flash/providers/_poll.py +311 -0
flash/providers/allocator.py +193 -0
flash/providers/base.py +431 -0
flash/providers/hyperstack/__init__.py +127 -0
flash/providers/hyperstack/api.py +522 -0
flash/providers/hyperstack/auth.py +17 -0
flash/providers/hyperstack/gpus.py +29 -0
flash/providers/hyperstack/jobs/__init__.py +632 -0
flash/providers/hyperstack/jobs/builders.py +122 -0
flash/providers/hyperstack/preflight.py +23 -0
flash/providers/hyperstack/pricing.py +26 -0
flash/providers/hyperstack/train.py +25 -0
flash/providers/lambdalabs/__init__.py +139 -0
flash/providers/lambdalabs/api.py +261 -0
flash/providers/lambdalabs/auth.py +18 -0
flash/providers/lambdalabs/gpus.py +29 -0
flash/providers/lambdalabs/jobs/__init__.py +724 -0
flash/providers/lambdalabs/jobs/builders.py +118 -0
flash/providers/lambdalabs/preflight.py +27 -0
flash/providers/lambdalabs/pricing.py +51 -0
flash/providers/lambdalabs/train.py +27 -0
flash/providers/preflight.py +55 -0
flash/providers/realized.py +80 -0
flash/providers/runpod/__init__.py +130 -0
flash/providers/runpod/api.py +186 -0
flash/providers/runpod/auth.py +37 -0
flash/providers/runpod/cost.py +57 -0
flash/providers/runpod/gpus.py +46 -0
flash/providers/runpod/jobs.py +956 -0
flash/providers/runpod/keys.py +139 -0
flash/providers/runpod/preflight.py +30 -0
flash/providers/runpod/preload.py +915 -0
flash/providers/runpod/pricing.py +18 -0
flash/providers/runpod/slots.py +79 -0
flash/providers/runpod/train/__init__.py +150 -0
flash/providers/runpod/train/deps.py +395 -0
flash/providers/runpod/train/endpoints.py +820 -0
flash/py.typed +0 -0
flash/runner/__init__.py +686 -0
flash/runner/checkpoints.py +82 -0
flash/runner/deploy.py +422 -0
flash/runner/lifecycle.py +672 -0
flash/schema/__init__.py +375 -0
flash/schema/fields.py +331 -0
flash/serve/__init__.py +1 -0
flash/serve/deploy.py +326 -0
flash/serve/pricing.py +60 -0
flash/server/__init__.py +1 -0
flash/server/__main__.py +20 -0
flash/server/app.py +961 -0
flash/server/auth.py +263 -0
flash/server/billing.py +124 -0
flash/server/checkpoints.py +110 -0
flash/server/db.py +160 -0
flash/server/environment_registry.py +102 -0
flash/server/envs.py +360 -0
flash/server/reconcile.py +163 -0
flash/server/run_registry.py +150 -0
flash/spec.py +333 -0
freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0

flash/schema/fields.py ADDED Viewed

@@ -0,0 +1,331 @@
+"""Field-level validators/coercers for Flash TOML config parsing.
+Leaf helpers split out of ``flash.schema``: the ``ConfigError`` type, the [train] scalar
+validators, the slug/worker-env/wandb validators, and the ``--set`` scalar coercer. None
+reference the rest of the schema package; the package ``__init__`` re-exports them.
+"""
+from __future__ import annotations
+import math
+import re
+import urllib.parse
+from typing import Any
+from flash.spec import WandbSpec
+_GITHUB_SAFE_PART_RE = re.compile(r"^[A-Za-z0-9._-]+$")
+def _train_int(train_raw: dict, key: str, *, minimum: int) -> int | None:
+    """Validate an optional integer [train] knob (>= minimum) -> ConfigError (HTTP 400).
+    None stays None (recipe default). Rejects bools, non-numbers, non-integers, and
+    out-of-range values at parse time instead of letting them reach a provisioned worker.
+    Args:
+        train_raw: the parsed ``[train]`` table.
+        key: name of the knob to read from ``train_raw``.
+        minimum: smallest accepted value (inclusive).
+    Returns:
+        The value as an ``int``, or ``None`` when the key is absent or null.
+    Raises:
+        ConfigError: the value is a bool, not a number, not integral, not finite, or
+            below ``minimum``.
+    """
+    v = train_raw.get(key)
+    if v is None:
+        return None
+    # bool is an int subclass, so it must be rejected explicitly before the numeric check.
+    if isinstance(v, bool) or not isinstance(v, (int, float)):
+        raise ConfigError(f"train.{key} must be an integer")
+    if isinstance(v, float):
+        # Check finiteness BEFORE int(v): int(inf) raises OverflowError and int(nan) ValueError
+        # (the former would be a 500); reject both as a clean 400.
+        if not math.isfinite(v) or not v.is_integer():
+            raise ConfigError(f"train.{key} must be a finite integer")
+    # A real int is already finite and integral, so it deliberately skips the float checks:
+    # round-tripping through float loses precision above 2**53 (a valid int would be
+    # reported as "not an integer") and overflows outright for ints beyond the double range.
+    v = int(v)
+    if v < minimum:
+        raise ConfigError(f"train.{key} must be >= {minimum}")
+    return v
+def _train_float(
+    train_raw: dict,
+    key: str,
+    *,
+    minimum: float,
+    exclusive: bool = False,
+    maximum: float | None = None,
+) -> float | None:
+    """Validate an optional float [train] knob -> ConfigError (HTTP 400). None stays None.
+    Args:
+        train_raw: the parsed ``[train]`` table.
+        key: name of the knob to read from ``train_raw``.
+        minimum: lower bound; inclusive unless ``exclusive`` is set.
+        exclusive: when True the value must be strictly greater than ``minimum``.
+        maximum: optional inclusive upper bound.
+    Returns:
+        The value as a ``float``, or ``None`` when the key is absent or null.
+    Raises:
+        ConfigError: the value is a bool, not a number, not finite, or out of range.
+    """
+    v = train_raw.get(key)
+    if v is None:
+        return None
+    if isinstance(v, bool) or not isinstance(v, (int, float)):
+        raise ConfigError(f"train.{key} must be a number")
+    try:
+        v = float(v)
+    except OverflowError:
+        # An int too large for a double (e.g. 10**400) makes float() raise OverflowError;
+        # report it as the same clean validation error as inf instead of an unhandled crash.
+        raise ConfigError(f"train.{key} must be a finite number") from None
+    # nan/inf slip past the range checks below (nan compares false, inf passes any minimum)
+    # and would reach TRL optimizer/sampling settings; reject them as a 400 here.
+    if not math.isfinite(v):
+        raise ConfigError(f"train.{key} must be a finite number")
+    if exclusive and v <= minimum:
+        raise ConfigError(f"train.{key} must be > {minimum}")
+    if not exclusive and v < minimum:
+        raise ConfigError(f"train.{key} must be >= {minimum}")
+    if maximum is not None and v > maximum:
+        raise ConfigError(f"train.{key} must be between {minimum} and {maximum}")
+    return v
+def _train_stops(train_raw: dict) -> tuple[str, ...]:
+    """Normalize ``train.stop_sequences`` into a tuple of non-empty strings.
+    Absent or null yields ``()``. A bare string counts as ONE stop sequence (it is never
+    split into characters). A list or tuple must contain only strings, and empty strings
+    are dropped. Any other type raises ``ConfigError``."""
+    raw = train_raw.get("stop_sequences")
+    if raw is None:
+        return ()
+    if isinstance(raw, str):
+        # Wrap rather than iterate: iterating a str would yield single characters.
+        return (raw,) if raw else ()
+    if not isinstance(raw, (list, tuple)):
+        raise ConfigError("train.stop_sequences must be a string or a list of strings")
+    if any(not isinstance(item, str) for item in raw):
+        raise ConfigError("train.stop_sequences entries must be strings")
+    # Every entry is a str here, so filtering on truthiness removes exactly the empty ones.
+    return tuple(filter(None, raw))
+class ConfigError(ValueError):
+    """A Flash config value failed validation (the validators in this module describe it
+    as surfacing to the caller as an HTTP 400)."""
+def _require_slug(value: str, message: str) -> None:
+    """Require an ``owner/name`` slug, raising ``ConfigError(message)`` for anything else.
+    Surrounding whitespace is ignored. The value is rejected when it is not a string, is
+    empty, contains ``:``, parses as a URL with a scheme or host, does not have exactly
+    two ``/``-separated parts, or has a part that fails ``_is_safe_github_path_parts``.
+    Args:
+        value: candidate slug.
+        message: text of the ``ConfigError`` raised on rejection.
+    Raises:
+        ConfigError: the value is not a valid slug.
+    """
+    # A non-string (e.g. a TOML integer) has no .strip(); fail as a config error rather
+    # than letting an AttributeError escape.
+    if not isinstance(value, str):
+        raise ConfigError(message)
+    text = value.strip()
+    if not text or ":" in text:
+        raise ConfigError(message)
+    try:
+        parsed = urllib.parse.urlparse(text)
+    except ValueError:
+        # urlparse raises a plain ValueError on a malformed bracketed host (e.g. "//[x/y");
+        # surface it as the caller's ConfigError like every other rejection here.
+        raise ConfigError(message) from None
+    if parsed.scheme or parsed.netloc:
+        raise ConfigError(message)
+    parts = text.split("/")
+    if len(parts) != 2 or not _is_safe_github_path_parts(parts):
+        raise ConfigError(message)
+def _require_environment_ref(value: str, message: str) -> None:
+    """Require a Freesolo environment id, raising ``ConfigError(message)`` for anything else.
+    Accepted forms:
+    - an ``owner/name`` slug (whatever ``_require_slug`` accepts);
+    - ``github:owner/repo``, optionally followed by ``@ref`` and/or ``:path``;
+    - ``http(s)://github.com/owner/repo`` (a trailing ``.git`` on the repo is ignored);
+    - ``http(s)://github.com/owner/repo/blob|tree/<ref>/<path>``.
+    Owner, repo and ref must pass ``_is_safe_github_path_parts``; a path must pass
+    ``_is_safe_environment_path``.
+    Args:
+        value: candidate environment reference.
+        message: text of the ``ConfigError`` raised on rejection.
+    Raises:
+        ConfigError: the value matches none of the accepted forms.
+    """
+    # A non-string (e.g. a TOML integer) has no .startswith(); fail as a config error
+    # rather than letting an AttributeError escape.
+    if not isinstance(value, str):
+        raise ConfigError(message)
+    try:
+        _require_slug(value, message)
+        return
+    except ValueError:
+        # Not a plain slug, so try the other forms. ConfigError subclasses ValueError, so
+        # this also absorbs a raw ValueError that URL parsing raises on a malformed host.
+        pass
+    if value.startswith("github:"):
+        body = value[len("github:") :]
+        repo_ref, sep, path = body.partition(":")
+        repo, at, ref = repo_ref.partition("@")
+        # "owner/repo@" names a ref separator but no ref.
+        if at and not ref:
+            raise ConfigError(message)
+        owner_repo = repo.split("/")
+        if (
+            len(owner_repo) == 2
+            and _is_safe_github_path_parts(owner_repo)
+            and (not at or _is_safe_github_path_parts([ref]))
+            and (not sep or _is_safe_environment_path(path))
+        ):
+            return
+        raise ConfigError(message)
+    if value.startswith(("https://github.com/", "http://github.com/")):
+        parsed = urllib.parse.urlparse(value)
+        if parsed.scheme not in {"http", "https"} or parsed.netloc.lower() != "github.com":
+            raise ConfigError(message)
+        # Percent-decode BEFORE splitting so an encoded "/" or ".." cannot dodge the checks.
+        parts = [part for part in urllib.parse.unquote(parsed.path).strip("/").split("/") if part]
+        if len(parts) < 2:
+            raise ConfigError(message)
+        owner, repo = parts[0], parts[1]
+        repo = repo[:-4] if repo.endswith(".git") else repo
+        if len(parts) == 2:
+            checked = [owner, repo]
+        elif len(parts) >= 5 and parts[2] in {"blob", "tree"}:
+            if not _is_safe_environment_path("/".join(parts[4:])):
+                raise ConfigError(message)
+            checked = [owner, repo, parts[3]]
+        else:
+            raise ConfigError(message)
+        if not _is_safe_github_path_parts(checked):
+            raise ConfigError(message)
+        return
+    raise ConfigError(message)
+def _is_safe_environment_path(path: str) -> bool:
+    """Return True when ``path`` is a relative path with no ``.`` or ``..`` segment.
+    An empty path is accepted. Backslashes are treated as ``/`` and surrounding
+    whitespace is ignored. A path that starts with ``/`` is rejected."""
+    if not path:
+        return True
+    normalized = path.strip().replace("\\", "/")
+    if normalized.startswith("/"):
+        return False
+    # Empty segments (doubled or trailing slashes) are ignored; with no segments left
+    # the all() below is vacuously True.
+    segments = [segment for segment in normalized.split("/") if segment]
+    return all(segment not in {".", ".."} for segment in segments)
+def _is_safe_github_path_parts(parts: list[str]) -> bool:
+    """Return True when every part is non-empty, is not ``.`` or ``..``, and fully
+    matches ``_GITHUB_SAFE_PART_RE``. An empty list is accepted."""
+    # First pass: reject empty and traversal components outright.
+    for part in parts:
+        if part in {".", "..", ""}:
+            return False
+    # Second pass: every remaining component must match the safe pattern in full.
+    for part in parts:
+        if not _GITHUB_SAFE_PART_RE.fullmatch(part):
+            return False
+    return True
+def _coerce_scalar(value: str):
+    """Coerce a ``--set`` string into a bool, int or float, else return it unchanged.
+    ``true``/``false`` are matched case-insensitively with surrounding whitespace ignored.
+    Otherwise ``int`` is tried before ``float``, and the original string is returned when
+    neither parses."""
+    lowered = value.strip().lower()
+    if lowered == "true":
+        return True
+    if lowered == "false":
+        return False
+    # int first, so "3" stays an int instead of becoming 3.0.
+    for caster in (int, float):
+        try:
+            return caster(value)
+        except ValueError:
+            continue
+    return value
+def _validate_env_var_names(names, context: str) -> None:
+    """Raise ``ConfigError`` when any name is empty or contains ``=``, a NUL byte, or one
+    of the whitespace characters space, tab, newline or carriage return.
+    ``context`` prefixes the error message; every offending name is listed, sorted by
+    its ``repr``."""
+    forbidden = "=\0 \t\n\r"
+    offenders = []
+    for name in names:
+        if not name or any(char in name for char in forbidden):
+            offenders.append(repr(name))
+    if not offenders:
+        return
+    offenders.sort()
+    raise ConfigError(
+        f"{context} has invalid environment variable name(s): {', '.join(offenders)}; an "
+        "env var name must be non-empty and contain no '=', whitespace, or NUL byte"
+    )
+# Platform-managed env-var names: [environment] secrets may not declare any of these.
+_RESERVED_ENVIRONMENT_SECRET_KEYS = frozenset(
+    (
+        "FLASH_ARM",
+        "FREESOLO_API_KEY",
+        "FREESOLO_INTERNAL_KEY",
+        "GITHUB_TOKEN",
+        "HF_REPO",
+        "HF_TOKEN",
+        "HUGGING_FACE_HUB_TOKEN",
+        "RUNPOD_API_KEY",
+        "RUN_ID",
+    )
+)
+def _environment_secrets(raw: Any) -> tuple[str, ...]:
+    """Parse ``[environment].secrets`` into a tuple of declared worker env-var names.
+    ``None`` yields ``()``. The value must be a list or tuple of strings. Duplicates are
+    dropped, keeping first-seen order. Raises ``ConfigError`` for a wrong type, an
+    invalid env-var name, or a platform-managed key."""
+    if raw is None:
+        return ()
+    # A str is neither a list nor a tuple, so this one check rejects it as well.
+    if not isinstance(raw, (list, tuple)):
+        raise ConfigError("[environment] secrets must be a list of environment variable names")
+    for name in raw:
+        if not isinstance(name, str):
+            raise ConfigError("[environment] secrets entries must be strings")
+    # dict.fromkeys de-duplicates while preserving insertion order.
+    unique = tuple(dict.fromkeys(raw))
+    _validate_env_var_names(unique, "[environment] secrets")
+    clashes = sorted(_RESERVED_ENVIRONMENT_SECRET_KEYS.intersection(unique))
+    if clashes:
+        raise ConfigError(
+            "[environment] secrets must not include platform-managed key(s): "
+            f"{', '.join(clashes)}"
+        )
+    return unique
+def _worker_env(raw: Any) -> dict[str, str]:
+    """Parse the optional ``[worker_env]`` table into per-run worker env overrides.
+    ``None`` yields ``{}``. Keys and values are both converted with ``str``. Raises
+    ``ConfigError`` when the value is not a table, when a name is not a usable env-var
+    name, or when a name looks secret-bearing."""
+    if raw is None:
+        return {}
+    if not isinstance(raw, dict):
+        raise ConfigError("[worker_env] must be a table of string key/values")
+    env: dict[str, str] = {}
+    for name, val in raw.items():
+        env[str(name)] = str(val)
+    # Env var NAMES must be usable by subprocess.Popen(env=...) on the worker, which raises
+    # ValueError for an empty name or one containing '=' or a NUL byte (and whitespace breaks
+    # most shells). Rejecting them at parse time makes a malformed [worker_env] fail on config
+    # load, not after a worker has been provisioned.
+    _validate_env_var_names(env, "[worker_env]")
+    # [worker_env] is serialized into job_spec_json (persisted + logged), so it must NOT carry
+    # secrets. Detection works on `_`-delimited WORDS, not substrings: a name is flagged when
+    # it contains a secret word, or the word KEY together with a credential qualifier. That
+    # flags HF_TOKEN, *_API_KEY, AWS_SECRET_ACCESS_KEY, GITHUB_PAT, SSH_KEY, DEPLOY_KEY, etc.,
+    # while allowing RL_VLLM_MAX_BATCHED_TOKENS (word TOKENS, not TOKEN) and a bare SORT_KEY.
+    secret_words = frozenset(
+        (
+            "TOKEN",
+            "SECRET",
+            "PASSWORD",
+            "PASSWD",
+            "PASSPHRASE",
+            "CREDENTIAL",
+            "CREDENTIALS",
+            "APIKEY",
+            "PRIVATEKEY",
+            "PAT",  # personal access token (e.g. GITHUB_PAT, GH_PAT)
+        )
+    )
+    key_qualifiers = frozenset(
+        (
+            "API",
+            "SECRET",
+            "PRIVATE",
+            "ACCESS",
+            "INTERNAL",
+            "AUTH",
+            "SIGNING",
+            "ENCRYPTION",
+            # credential-key contexts: SSH_KEY, DEPLOY_KEY, GPG_KEY, RSA_KEY, TLS/SSL/PEM keys
+            "SSH",
+            "DEPLOY",
+            "GPG",
+            "PGP",
+            "RSA",
+            "PEM",
+            "SSL",
+            "TLS",
+        )
+    )
+    flagged = []
+    for name in env:
+        words = set(name.upper().split("_"))
+        if words & secret_words or ("KEY" in words and words & key_qualifiers):
+            flagged.append(name)
+    flagged.sort()
+    if flagged:
+        raise ConfigError(
+            f"[worker_env] must not contain secret-bearing keys ({', '.join(flagged)}); these are "
+            "serialized into run artifacts; use provider process env or supported runtime secrets "
+            "instead"
+        )
+    return env
+# Allowed [wandb] config keys -> typed JobSpec.wandb fields (first-class spec config, NOT env vars).
+_WANDB_KEYS = ("project", "run_name")
+def _wandb_spec(raw: Any) -> WandbSpec:
+    """Parse the optional ``[wandb]`` table into a typed ``WandbSpec`` (project / run_name).
+    These are non-secret W&B naming labels carried as spec config, not environment
+    variables; the W&B credential (WANDB_API_KEY) stays an env-var secret. They can be set
+    in TOML (``[wandb] project = …``) or via ``--set wandb.project=… --set wandb.run_name=…``.
+    ``None`` yields a default ``WandbSpec()``. Raises ``ConfigError`` for a non-table, an
+    unknown key, or a value that is set but is not a non-empty string."""
+    if raw is None:
+        return WandbSpec()
+    if not isinstance(raw, dict):
+        raise ConfigError('[wandb] must be a table (e.g. project = "my-project")')
+    unknown = sorted(set(raw).difference(_WANDB_KEYS))
+    if unknown:
+        raise ConfigError(
+            f"[wandb] unknown key(s): {', '.join(unknown)} (allowed: {', '.join(_WANDB_KEYS)})"
+        )
+    fields: dict[str, str] = {}
+    for key in _WANDB_KEYS:
+        val = raw.get(key)
+        # Absent OR null means "unset". A serialized JobSpec round-trips unset wandb fields
+        # as null, and the control plane re-parses specs on submit, so null must be accepted
+        # or every run that omits [wandb] would be rejected.
+        if val is None:
+            continue
+        # Only an explicitly set value is validated: a bare "" or whitespace is a mistake.
+        if isinstance(val, str) and val.strip():
+            fields[key] = val.strip()
+        else:
+            raise ConfigError(f"[wandb] {key} must be a non-empty string")
+    return WandbSpec(**fields)

flash/serve/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Adapter serving helpers."""

flash/serve/deploy.py ADDED Viewed

@@ -0,0 +1,326 @@
+"""Serve a trained LoRA adapter via the freesolo platform's multi-LoRA serving app.
+Flash no longer runs its own per-run vLLM endpoint. Instead the control plane is a
+thin client of the freesolo serving service (a Modal multi-LoRA app that serves every
+adapter on shared base-model capacity — so there is no flash-side idle billing to
+track). The same CLI commands and control-plane endpoints
+(`deploy`/`undeploy`/`chat`/`deployments`) stay; only what they do under the hood
+changed.
+The serving service exposes:
+- ``POST {FREESOLO_SERVING_URL}/adapters`` — register/deploy an adapter (auth header).
+- ``DELETE {FREESOLO_SERVING_URL}/adapters/{adapterId}`` — undeploy (auth header).
+- ``POST {FREESOLO_SERVING_URL}/v1/chat/completions`` — OpenAI-style chat.
+- ``GET {FREESOLO_SERVING_URL}/healthz`` / ``GET .../adapters`` — health / list.
+The registration/teardown calls carry the shared ``X-Freesolo-Internal-Key`` header
+(the same internal credential flash already holds, ``FREESOLO_INTERNAL_KEY``). The chat
+calls also send it: the control plane is a trusted server-to-server caller (it has already
+authorized the user's key on its own ``/v1/runs/{run_id}/chat`` route), so it uses the
+serving app's internal-key bypass when serving enforces external chat auth.
+"""
+from __future__ import annotations
+import json
+import os
+from collections.abc import Iterator
+from dataclasses import asdict, dataclass
+import httpx
+from flash._logging import get_logger
+from flash.providers.base import canonical_gpu, gpu_short
+logger = get_logger(__name__)
+# Default freesolo serving base URL (the Modal multi-LoRA app). Overridable per-env.
+DEFAULT_FREESOLO_SERVING_URL = "https://clado-ai--freesolo-lora-serving.modal.run"
+class ServingError(RuntimeError):
+    """The freesolo serving backend (Modal LoRA app) rejected a request or was unreachable.
+    ``status_code`` holds the upstream HTTP status when there was a response, else ``None``.
+    It lets the API layer answer with a clean ``502 Bad Gateway`` and the real reason
+    instead of an ``httpx`` exception escaping as an unhandled ``500`` + traceback.
+    """
+    def __init__(self, message: str, *, status_code: int | None = None) -> None:
+        super().__init__(message)
+        # None when the failure happened before any HTTP response arrived.
+        self.status_code = status_code
+def _post_adapter_or_raise(url: str, body: dict) -> httpx.Response:
+    """POST an adapter registration to the serving backend and return the response.
+    Any failure is re-raised as ``ServingError``: a non-success status via
+    ``_serving_status_error``, a transport failure as "could not reach the serving backend"."""
+    try:
+        # follow_redirects: Modal answers a slow request with a 303 to an async-result poll
+        # URL (?__modal_function_call_id=...); without following it the 303 surfaces as an error.
+        response = httpx.post(
+            url,
+            json=body,
+            headers=_internal_key_header(),
+            timeout=60.0,
+            follow_redirects=True,
+        )
+        response.raise_for_status()
+    except httpx.HTTPStatusError as status_exc:
+        raise _serving_status_error(url, status_exc) from status_exc
+    except httpx.RequestError as transport_exc:
+        raise ServingError(
+            f"could not reach the serving backend at {url}: {transport_exc}"
+        ) from transport_exc
+    return response
+def _serving_status_error(url: str, exc: httpx.HTTPStatusError) -> ServingError:
+    """Build a ``ServingError`` from an upstream HTTP failure (shared by the deploy POST and
+    the undeploy DELETE).
+    The message names the URL, the status when known, up to 500 characters of the response
+    body, and a hint chosen by status: below 500 reads as a client/auth problem, 500 and
+    above (or unknown) as a backend problem."""
+    # raise_for_status() always carries a response, but a hand-built HTTPStatusError may
+    # not — guard so error translation can never itself raise.
+    response = exc.response
+    if response is None:
+        status, detail = None, ""
+    else:
+        status = response.status_code
+        detail = (response.text or "").strip()[:500]
+    pieces = [f"serving backend error for {url}"]
+    if status is not None:
+        pieces.append(f" (HTTP {status})")
+    if detail:
+        pieces.append(f": {detail}")
+    # A status below 500 is a problem with THIS request (e.g. a missing or invalid
+    # FREESOLO_INTERNAL_KEY), not a serving outage; anything else means the backend failed.
+    rejected_by_backend = status is not None and status < 500
+    if rejected_by_backend:
+        pieces.append(
+            " — the serving backend rejected the request (4xx); check FREESOLO_INTERNAL_KEY "
+            "and the request payload (this is a client/auth error, not a serving outage)"
+        )
+    else:
+        pieces.append(
+            " — the serving backend is unavailable or has no engine for this base model; "
+            "an operator must check the freesolo serving deployment"
+        )
+    return ServingError("".join(pieces), status_code=status)
+def serving_base_url() -> str:
+    """Return the freesolo serving base URL with trailing slashes removed.
+    ``FREESOLO_SERVING_URL`` wins when set and non-empty; otherwise the built-in default."""
+    configured = os.environ.get("FREESOLO_SERVING_URL")
+    chosen = configured if configured else DEFAULT_FREESOLO_SERVING_URL
+    return chosen.rstrip("/")
+def _internal_key_header() -> dict[str, str]:
+    """Return the ``X-Freesolo-Internal-Key`` header built from ``FREESOLO_INTERNAL_KEY``,
+    or an empty dict when that variable is unset or empty."""
+    key = os.environ.get("FREESOLO_INTERNAL_KEY", "")
+    if not key:
+        return {}
+    return {"X-Freesolo-Internal-Key": key}
+@dataclass
+class Deployment:
+    """Record describing one adapter registered (or dry-run shaped) by ``deploy_adapter``."""
+    # Training run id; deploy_adapter also sends it as the serving-side adapterId.
+    run_id: str
+    # Base model name, sent to serving as baseModel.
+    model: str
+    # Adapter subfolder inside the run's HF repo; deploy_adapter sets "<adapter_prefix>/adapter".
+    adapter_hf_prefix: str
+    # GPU class recorded for the deployment (deploy_adapter stores servable_gpu's result).
+    gpu: str
+    # Model name used for chat requests; deploy_adapter sets it to run_id.
+    openai_model: str
+    # deploy_adapter stores the serving base URL here.
+    endpoint_name: str
+    # "ready" by default; deploy_adapter sets "dry_run" when no network call is made.
+    state: str = "ready"
+    def to_dict(self) -> dict:
+        """Return the fields as a plain dict (via ``dataclasses.asdict``)."""
+        return asdict(self)
+def serve_endpoint_name(friendly_gpu: str, run_id: str) -> str:
+    """Build a cosmetic endpoint label: ``flash-serve-<gpu short name>[-<run id tail>]``.
+    The tail is the text after the last ``-`` in ``run_id``, capped at 24 characters, and
+    is omitted when empty. (The freesolo app serves all adapters on one endpoint.)"""
+    suffix = (run_id or "").split("-")[-1][:24]
+    prefix = f"flash-serve-{gpu_short(canonical_gpu(friendly_gpu))}"
+    if not suffix:
+        return prefix
+    return f"{prefix}-{suffix}"
+def servable_gpu(gpu_name: str) -> str:
+    """Resolve the friendly GPU class to record on a deployment.
+    Serving is delegated to freesolo (one GPU per base model, chosen there), so the result
+    is informational. The name is canonicalized; a class whose ``GPU_INFO`` entry has an
+    ``enum_member`` (a RunPod class) is returned as is, otherwise the result of
+    ``cheapest_gpu`` for that class's VRAM is returned. Junk GPU names still raise."""
+    from flash.providers.base import GPU_INFO, cheapest_gpu
+    canonical = canonical_gpu(gpu_name)
+    spec = GPU_INFO[canonical]
+    # Truthy enum_member marks a RunPod class; anything else falls back by VRAM size.
+    return canonical if spec.enum_member else cheapest_gpu(spec.vram_gb)
+def deploy_adapter(
+    run_id: str,
+    model: str,
+    hf_repo: str,
+    adapter_prefix: str,
+    gpu_name: str = "RTX 5090",
+    dry_run: bool = False,
+    thinking: bool = False,
+    org_id: str | None = None,
+) -> Deployment:
+    """Register the trained adapter with the freesolo serving app.
+    The adapter artifacts already live in the run's HF dataset repo; serving pulls them
+    from ``{hf_repo}:{adapter_prefix}/adapter``. With ``dry_run`` the deployment record is
+    built and returned with state ``"dry_run"`` and no network call is made. ``thinking``
+    is accepted but not used by this function.
+    Raises:
+        ServingError: the registration POST failed (raised by ``_post_adapter_or_raise``).
+    """
+    friendly = servable_gpu(gpu_name)
+    base = serving_base_url()
+    subfolder = f"{adapter_prefix}/adapter"
+    state = "dry_run" if dry_run else "ready"
+    deployment = Deployment(
+        run_id=run_id,
+        model=model,
+        adapter_hf_prefix=subfolder,
+        gpu=friendly,
+        openai_model=run_id,
+        endpoint_name=base,
+        state=state,
+    )
+    if dry_run:
+        return deployment
+    registration = {
+        "adapterId": run_id,
+        "repoId": hf_repo,
+        "baseModel": model,
+        "subfolder": subfolder,
+        # The trainer always streams the adapter into a *dataset* repo (the worker's
+        # hf_upload_folder uses repo_type="dataset"), so serving must pull from the dataset
+        # namespace. Without this the serving app defaults repoType to "model" and
+        # snapshot_download 404s — deploy returns 200 but the engine warmup fails, the adapter
+        # is silently disabled, and the first chat 404s.
+        "repoType": "dataset",
+        "status": "ready",
+    }
+    # Attribute the adapter to the deploying org so serving can authorize external chat by
+    # org. Strip the id and omit it when blank, so the registration shape is unchanged for
+    # older callers and a stray " org " cannot mis-attribute the adapter.
+    cleaned_org = (org_id or "").strip()
+    if cleaned_org:
+        registration["orgId"] = cleaned_org
+    _post_adapter_or_raise(f"{base}/adapters", registration)
+    logger.info("registered adapter %s with freesolo serving (%s)", run_id, base)
+    return deployment
+def undeploy_adapter(run_id: str) -> list[str]:
+    """Deregister the run's adapter from the freesolo serving app.
+    Returns ``[run_id]`` when the adapter was removed, ``[]`` when it was already gone
+    (404). Any other failure — a non-404 HTTP status or a transport error — is translated
+    into a ``ServingError`` (carrying the upstream status), exactly like ``deploy_adapter``,
+    so callers see a stable error surface instead of a raw ``httpx`` exception.
+    Args:
+        run_id: the adapter id to remove; it is percent-encoded into the request path.
+    Raises:
+        ServingError: the DELETE failed with a non-404 status or could not be sent.
+    """
+    from urllib.parse import quote
+    base = serving_base_url()
+    # Percent-encode the id so it always stays ONE path segment: an id containing "/", "?"
+    # or "#" would otherwise address a different route or be truncated at the query/fragment.
+    # An id made only of unreserved characters is unchanged.
+    url = f"{base}/adapters/{quote(run_id, safe='')}"
+    try:
+        resp = httpx.delete(
+            url,
+            headers=_internal_key_header(),
+            timeout=60.0,
+            # Modal answers a slow request with a 303 to an async-result poll URL; follow it.
+            follow_redirects=True,
+        )
+        # Undeploy is idempotent: an already-absent adapter (404) is a no-op success, not an
+        # error — handle it before raise_for_status() so it never becomes a ServingError.
+        if resp.status_code == 404:
+            return []
+        resp.raise_for_status()
+    except httpx.HTTPStatusError as exc:
+        raise _serving_status_error(url, exc) from exc
+    except httpx.RequestError as exc:
+        raise ServingError(f"could not reach the serving backend at {url}: {exc}") from exc
+    logger.info("deregistered adapter %s from freesolo serving (%s)", run_id, base)
+    return [run_id]
+def chat(
+    run_id: str,
+    messages: list[dict],
+    temperature: float = 0.0,
+    max_tokens: int = 512,
+    thinking: bool = False,
+) -> dict:
+    """Send an OpenAI-style chat request for the run's adapter to freesolo serving.
+    The adapter is addressed by ``model=run_id`` (its registered ``adapterId``). The return
+    value is the parsed JSON response, so ``resp["choices"][0]["message"]["content"]``
+    keeps working downstream. A non-success status raises from ``raise_for_status()``.
+    """
+    endpoint = f"{serving_base_url()}/v1/chat/completions"
+    payload = {
+        "model": run_id,
+        "messages": messages,
+        "max_tokens": int(max_tokens),
+        "temperature": float(temperature),
+        # Per-run thinking parity: a run trained with thinking must serve with thinking, so
+        # the flag is forwarded to the chat template as enable_thinking. Without it the
+        # served completions diverge from training behavior.
+        "chat_template_kwargs": {"enable_thinking": bool(thinking)},
+    }
+    # Cold starts (scale-from-zero per base model) can take minutes. Modal serves a slow
+    # request by 303-redirecting to an async-result poll URL, so redirects must be followed
+    # to get the eventual completion; max_redirects is raised because a long cold start
+    # polls across several redirect cycles.
+    session = httpx.Client(follow_redirects=True, max_redirects=100, timeout=30 * 60.0)
+    with session as client:
+        # The control plane is a trusted server-to-server caller, so it presents the internal
+        # key to pass serving's external chat-auth gate. No header is sent when the key is unset.
+        response = client.post(endpoint, json=payload, headers=_internal_key_header())
+    response.raise_for_status()
+    return response.json()
+def _openai_stream_content(lines: Iterator[str]) -> Iterator[str]:
+    """Yield the non-empty ``delta.content`` strings from OpenAI-style streaming lines.
+    Lines that do not start with ``data:`` and empty data payloads are skipped. The
+    ``[DONE]`` payload ends the stream. Every other payload is parsed as JSON."""
+    marker = "data:"
+    for raw_line in lines:
+        stripped = raw_line.strip()
+        if not stripped.startswith(marker):
+            continue
+        payload = stripped[len(marker) :].strip()
+        if payload == "[DONE]":
+            return
+        if not payload:
+            continue
+        event = json.loads(payload)
+        # "choices" and "delta" may be missing or null; treat both as empty.
+        for choice in event.get("choices") or []:
+            delta = choice.get("delta") or {}
+            text = delta.get("content") or ""
+            if text:
+                yield str(text)
+def chat_stream(
+    run_id: str,
+    messages: list[dict],
+    temperature: float = 0.0,
+    max_tokens: int = 512,
+    thinking: bool = False,
+) -> Iterator[str]:
+    """Yield text deltas from the freesolo OpenAI-compatible streaming endpoint.
+    The request is sent with ``stream=True``. When the backend answers with a plain
+    ``application/json`` body instead of an event stream, the first choice's message
+    content is yielded once (if non-empty). A non-success status raises from
+    ``raise_for_status()``.
+    Args:
+        run_id: adapter id, sent as the ``model``.
+        messages: OpenAI-style chat messages.
+        temperature: sampling temperature, sent as a float.
+        max_tokens: completion token cap, sent as an int.
+        thinking: forwarded as ``chat_template_kwargs.enable_thinking``.
+    """
+    base = serving_base_url()
+    body = {
+        "model": run_id,
+        "messages": messages,
+        "max_tokens": int(max_tokens),
+        "temperature": float(temperature),
+        "chat_template_kwargs": {"enable_thinking": bool(thinking)},
+        "stream": True,
+    }
+    with (
+        httpx.Client(follow_redirects=True, max_redirects=100, timeout=30 * 60.0) as client,
+        client.stream(
+            "POST", f"{base}/v1/chat/completions", json=body, headers=_internal_key_header()
+        ) as resp,
+    ):
+        resp.raise_for_status()
+        if "application/json" in resp.headers.get("content-type", ""):
+            # Inside client.stream() the body has not been loaded yet, and .json() on an
+            # unread streamed response raises httpx.ResponseNotRead — read it first.
+            resp.read()
+            payload = resp.json()
+            content = (((payload.get("choices") or [{}])[0].get("message") or {}).get("content"))
+            if content:
+                yield str(content)
+            return
+        yield from _openai_stream_content(resp.iter_lines())