PyPI - freesolo-flash-dev - Versions diffs - 0.2.25__py3-none-any.whl - Mend

freesolo-flash-dev 0.2.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (111) hide show

flash/__init__.py +29 -0
flash/_channel.py +23 -0
flash/_fileio.py +35 -0
flash/_logging.py +49 -0
flash/_update_check.py +266 -0
flash/catalog.py +253 -0
flash/cli/__init__.py +1 -0
flash/cli/main/__init__.py +227 -0
flash/cli/main/__main__.py +6 -0
flash/cli/main/commands.py +636 -0
flash/cli/main/envpush.py +317 -0
flash/cli/main/render.py +599 -0
flash/cli/main/training_doc.py +455 -0
flash/client/__init__.py +14 -0
flash/client/config.py +70 -0
flash/client/http.py +372 -0
flash/client/runtime_secrets.py +69 -0
flash/client/specs.py +20 -0
flash/cost/__init__.py +16 -0
flash/cost/analytical.py +175 -0
flash/cost/facts.py +114 -0
flash/cost/spec.py +113 -0
flash/cost/types.py +158 -0
flash/engine/__init__.py +6 -0
flash/engine/accounting.py +36 -0
flash/engine/chalk_kernels.py +116 -0
flash/engine/multiturn_rollout.py +780 -0
flash/engine/recipe.py +86 -0
flash/engine/vram.py +603 -0
flash/engine/worker/__init__.py +2916 -0
flash/engine/worker/__main__.py +4 -0
flash/engine/worker/kernel_warmup.py +400 -0
flash/engine/worker/lora.py +796 -0
flash/engine/worker/packing.py +366 -0
flash/engine/worker/perf.py +1048 -0
flash/envs/__init__.py +10 -0
flash/envs/adapter/__init__.py +883 -0
flash/envs/adapter/rubric.py +222 -0
flash/envs/base.py +52 -0
flash/envs/registry.py +62 -0
flash/mcp/__init__.py +1 -0
flash/mcp/server.py +85 -0
flash/providers/__init__.py +59 -0
flash/providers/_auth.py +24 -0
flash/providers/_http.py +230 -0
flash/providers/_instance.py +416 -0
flash/providers/_instance_bootstrap.py +517 -0
flash/providers/_poll.py +311 -0
flash/providers/allocator.py +193 -0
flash/providers/base.py +431 -0
flash/providers/hyperstack/__init__.py +127 -0
flash/providers/hyperstack/api.py +522 -0
flash/providers/hyperstack/auth.py +17 -0
flash/providers/hyperstack/gpus.py +29 -0
flash/providers/hyperstack/jobs/__init__.py +632 -0
flash/providers/hyperstack/jobs/builders.py +122 -0
flash/providers/hyperstack/preflight.py +23 -0
flash/providers/hyperstack/pricing.py +26 -0
flash/providers/hyperstack/train.py +25 -0
flash/providers/lambdalabs/__init__.py +139 -0
flash/providers/lambdalabs/api.py +261 -0
flash/providers/lambdalabs/auth.py +18 -0
flash/providers/lambdalabs/gpus.py +29 -0
flash/providers/lambdalabs/jobs/__init__.py +724 -0
flash/providers/lambdalabs/jobs/builders.py +118 -0
flash/providers/lambdalabs/preflight.py +27 -0
flash/providers/lambdalabs/pricing.py +51 -0
flash/providers/lambdalabs/train.py +27 -0
flash/providers/preflight.py +55 -0
flash/providers/realized.py +80 -0
flash/providers/runpod/__init__.py +130 -0
flash/providers/runpod/api.py +186 -0
flash/providers/runpod/auth.py +37 -0
flash/providers/runpod/cost.py +57 -0
flash/providers/runpod/gpus.py +46 -0
flash/providers/runpod/jobs.py +956 -0
flash/providers/runpod/keys.py +139 -0
flash/providers/runpod/preflight.py +30 -0
flash/providers/runpod/preload.py +915 -0
flash/providers/runpod/pricing.py +18 -0
flash/providers/runpod/slots.py +79 -0
flash/providers/runpod/train/__init__.py +150 -0
flash/providers/runpod/train/deps.py +395 -0
flash/providers/runpod/train/endpoints.py +820 -0
flash/py.typed +0 -0
flash/runner/__init__.py +686 -0
flash/runner/checkpoints.py +82 -0
flash/runner/deploy.py +422 -0
flash/runner/lifecycle.py +672 -0
flash/schema/__init__.py +375 -0
flash/schema/fields.py +331 -0
flash/serve/__init__.py +1 -0
flash/serve/deploy.py +326 -0
flash/serve/pricing.py +60 -0
flash/server/__init__.py +1 -0
flash/server/__main__.py +20 -0
flash/server/app.py +961 -0
flash/server/auth.py +263 -0
flash/server/billing.py +124 -0
flash/server/checkpoints.py +110 -0
flash/server/db.py +160 -0
flash/server/environment_registry.py +102 -0
flash/server/envs.py +360 -0
flash/server/reconcile.py +163 -0
flash/server/run_registry.py +150 -0
flash/spec.py +333 -0
freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0

flash/__init__.py ADDED Viewed

@@ -0,0 +1,29 @@
+"""Flash — managed LoRA post-training: log in with your freesolo key, train.
+A focused developer experience (TOML run specs, pluggable environments,
+CLI/API/MCP entry points, adapter deployment). Users authenticate with their
+freesolo API key (`flash login`); the control plane runs each job on a managed
+RunPod GPU behind the scenes.
+"""
+from importlib.metadata import version as _dist_version
+from flash._channel import DIST_NAME as _DIST_NAME
+__all__ = ["__version__"]
+# The single source of truth for the version is pyproject `[project].version`, which hatchling
+# bakes into the installed distribution metadata at build time. Read it back here instead of
+# keeping a second hand-maintained literal: that duplicate is what desynced in 0.2.20 (the wheel
+# said 0.2.20 while __init__ still hard-coded 0.2.19), making flash nag to upgrade forever while
+# uv reported nothing to upgrade. The distribution name (_DIST_NAME: "freesolo-flash", or
+# "freesolo-flash-dev" for the dev channel) differs from the import package name and is selected
+# by flash/_channel.py.
+try:
+    __version__ = _dist_version(_DIST_NAME)
+except Exception:
+    # Deliberately broad. No readable dist metadata means we are running from a source tree that
+    # was never installed, or the METADATA file is unreadable/corrupt. Fall back to a clearly-fake
+    # version rather than letting `import flash` (the package root, imported by every entry
+    # point) crash. This only happens off the installed path; a released wheel always has real
+    # metadata. A bare-checkout run on a TTY may then show the update notice, which is fine for
+    # an uninstalled dev build.
+    __version__ = "0+unknown"

flash/_channel.py ADDED Viewed

@@ -0,0 +1,23 @@
+"""Release channel of the installed distribution — the single switch between prod and dev.
+The production package ``freesolo-flash`` ships ``CHANNEL = "prod"`` (the checked-in default
+below): it installs a ``flash`` CLI that talks to the production control plane. The dev-channel
+package ``freesolo-flash-dev`` is built from this *same source* with only this one line rewritten
+to ``CHANNEL = "dev"`` (see ``scripts/build_dev_dist.py``); everything that differs between the two
+channels — the CLI name, the PyPI distribution name, the default control-plane URL — derives from
+it below, so there is exactly one thing to flip. An explicit ``FLASH_API_URL`` /
+``flash login --api-url`` always wins; the channel only picks the *default* plane.
+"""
+from __future__ import annotations
+# The one line scripts/build_dev_dist.py rewrites to "dev" for the dev-channel build. This
+# artifact is the dev-channel distribution, so the value shown here is already "dev"; leave the
+# assignment on a single line so the rewrite keeps working.
+CHANNEL = "dev"
+# Console-script + argparse program name. Kept in lockstep with [project.scripts] in
+# pyproject.toml (which the build script also rewrites: flash -> flash-dev).
+CLI_NAME = "flash-dev" if CHANNEL == "dev" else "flash"
+# PyPI distribution name. Used to read back __version__ from installed metadata and to point the
+# update check at the right project (kept in lockstep with [project].name in pyproject.toml).
+# Any CHANNEL value other than "dev" selects the production names here and in CLI_NAME.
+DIST_NAME = "freesolo-flash-dev" if CHANNEL == "dev" else "freesolo-flash"

flash/_fileio.py ADDED Viewed

@@ -0,0 +1,35 @@
+"""Small shared file-IO helpers for credential/manifest JSON under ``~/.flash``."""
+from __future__ import annotations
+import contextlib
+import json
+import os
+from pathlib import Path
+def read_json_or_empty(path: Path) -> dict:
+    """Parse a JSON object file, returning ``{}`` if it's missing, unreadable, or not an object.
+    Args:
+        path: file expected to hold a single JSON object.
+    Returns:
+        The parsed ``dict``. An empty dict is returned when the file cannot be read (``OSError``),
+        cannot be decoded or parsed (``ValueError``, which covers ``json.JSONDecodeError`` and
+        ``UnicodeDecodeError``), or parses to something other than an object (``[]``, ``3``,
+        ``null``...), so callers can always use ``.get()`` on the result.
+    """
+    try:
+        parsed = json.loads(path.read_text())
+    except (OSError, ValueError):
+        return {}
+    # Valid JSON is not necessarily an object; honour the ``-> dict`` contract for every caller.
+    return parsed if isinstance(parsed, dict) else {}
+def secure_json_write(path: Path, data: dict) -> None:
+    """Write ``data`` as JSON with private permissions (the file may hold a secret).
+    Creates the parent dir (0700) and opens the file 0600 from the start — never
+    write_text + chmod, which leaves it umask-readable in between. ``O_NOFOLLOW``
+    (where available) refuses to follow a symlink planted at ``path`` so the write
+    can't be redirected to clobber an arbitrary file.
+    Args:
+        path: destination file; it is truncated if it already exists.
+        data: JSON-serialisable mapping, written with ``indent=2`` and sorted keys.
+    Raises:
+        OSError: if the parent directory or the file cannot be created/opened.
+        TypeError / ValueError: if ``data`` is not JSON-serialisable.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with contextlib.suppress(OSError):
+        os.chmod(path.parent, 0o700)
+    flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC | getattr(os, "O_NOFOLLOW", 0)
+    fd = os.open(path, flags, 0o600)
+    # The mode passed to os.open only applies when the file is *created*. If ``path`` already
+    # existed with wider permissions, tighten it through the open descriptor before any secret
+    # is written, so the content is never readable by others in the meantime.
+    if hasattr(os, "fchmod"):
+        with contextlib.suppress(OSError):
+            os.fchmod(fd, 0o600)
+    with os.fdopen(fd, "w") as f:
+        json.dump(data, f, indent=2, sort_keys=True)
+    # Path-based fallback for platforms without os.fchmod; a no-op where it already succeeded.
+    with contextlib.suppress(OSError):
+        os.chmod(path, 0o600)

flash/_logging.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""Package logging helpers.
+Library code logs through the ``flash`` logger and never configures handlers on import (it
+attaches a :class:`logging.NullHandler`), so importing Flash stays silent for downstream
+applications. The CLI calls :func:`configure_logging` to attach a console handler whose
+level is controlled by ``-v/--verbose``.
+"""
+from __future__ import annotations
+import logging
+# Name of the package's top-level logger; get_logger() returns loggers under this namespace.
+_ROOT_NAME = "flash"
+# Attach a NullHandler once so "No handlers could be found" warnings never appear and
+# importing the library produces no output unless the app opts in. The isinstance scan keeps a
+# repeated execution of this module from stacking a second NullHandler.
+_root = logging.getLogger(_ROOT_NAME)
+if not any(isinstance(h, logging.NullHandler) for h in _root.handlers):
+    _root.addHandler(logging.NullHandler())
+def get_logger(name: str | None = None) -> logging.Logger:
+    """Return a logger inside the ``flash`` namespace.
+    An empty/``None`` name, or the root name itself, yields the package root logger. A name
+    already starting with ``flash.`` is used as-is; anything else is prefixed with ``flash.``
+    (so ``get_logger("cli")`` is the ``flash.cli`` logger).
+    """
+    if name and name != _ROOT_NAME:
+        already_scoped = name.startswith(_ROOT_NAME + ".")
+        qualified = name if already_scoped else f"{_ROOT_NAME}.{name}"
+    else:
+        qualified = _ROOT_NAME
+    return logging.getLogger(qualified)
+def configure_logging(verbosity: int = 0, level: int | None = None) -> None:
+    """Attach a console handler to the ``flash`` logger and set its level.
+    ``verbosity`` maps repeated ``-v`` flags to levels (<=0 or ``None`` = WARNING, 1 = INFO,
+    >=2 = DEBUG). An explicit ``level`` overrides the verbosity mapping.
+    Args:
+        verbosity: count of ``-v`` flags. ``None`` (what an argparse ``count`` action yields when
+            the flag is absent and no default is set) and negative values are treated as 0
+            instead of silently enabling DEBUG output.
+        level: explicit ``logging`` level; when given, ``verbosity`` is ignored.
+    """
+    if level is None:
+        count = verbosity or 0
+        if count <= 0:
+            level = logging.WARNING
+        elif count == 1:
+            level = logging.INFO
+        else:
+            level = logging.DEBUG
+    logger = logging.getLogger(_ROOT_NAME)
+    logger.setLevel(level)
+    # Replace any prior console handler we installed so repeated calls don't stack handlers.
+    for h in [h for h in logger.handlers if getattr(h, "_flash_console", False)]:
+        logger.removeHandler(h)
+    handler = logging.StreamHandler()  # stderr
+    handler.setLevel(level)
+    handler.setFormatter(logging.Formatter("%(levelname)s %(name)s: %(message)s"))
+    # Marker attribute used by the removal loop above to recognise handlers this function added.
+    handler._flash_console = True  # type: ignore[attr-defined]
+    logger.addHandler(handler)

flash/_update_check.py ADDED Viewed

@@ -0,0 +1,266 @@
+"""Background "a new release is available" notice for the `flash` CLI.
+The client CLI is pure standard library (no extra deps), so this is too: it queries PyPI
+with ``urllib`` and compares the published version against the installed ``__version__``.
+Design constraints that keep it from ever getting in the way:
+- **Stays out of the way.** The PyPI lookup runs in a daemon thread (so it overlaps the
+  command) and every failure (offline, timeout, bad JSON) is swallowed. Only the once-a-day
+  refresh waits briefly for that thread; every other command builds the notice from cache with
+  zero network I/O.
+- **Cached once per day.** The latest version is stored in ``~/.flash/update_check.json``;
+  we only hit PyPI when that cache is older than :data:`_CHECK_INTERVAL_S`. The check time is
+  stamped synchronously before the lookup so the daily back-off holds even if the worker thread
+  is killed at process exit before it records a result.
+- **stderr only, TTY only.** The notice prints to stderr (never stdout), so it can't corrupt
+  JSON piped to ``jq`` or captured output, and it's suppressed entirely when stderr isn't a
+  terminal (pipes, redirects, CI, tests). Color is dropped when ``NO_COLOR`` is set.
+- **Opt-out.** Set ``FLASH_NO_UPDATE_CHECK=1`` to disable the check and notice completely.
+"""
+from __future__ import annotations
+import contextlib
+import json
+import os
+import re
+import sys
+import threading
+import time
+import urllib.error
+import urllib.request
+from flash import __version__
+from flash._channel import DIST_NAME
+from flash._fileio import read_json_or_empty, secure_json_write
+from flash._logging import get_logger
+from flash.client.config import CONFIG_DIR
+logger = get_logger("flash.update_check")
+# The PyPI distribution name (== pyproject `name`) and the command that upgrades it. Follows the
+# installed channel (freesolo-flash, or freesolo-flash-dev for the dev build) — see flash/_channel.py.
+PACKAGE_NAME = DIST_NAME
+UPGRADE_COMMAND = f"uv tool upgrade {PACKAGE_NAME}"
+_PYPI_JSON_URL = f"https://pypi.org/pypi/{PACKAGE_NAME}/json"
+# On-disk cache holding the last check time ("checked_at") and the last version seen on PyPI
+# ("pypi_version").
+CACHE_PATH = CONFIG_DIR / "update_check.json"
+# Re-check PyPI at most once a day; the notice itself is shown on every command from cache.
+_CHECK_INTERVAL_S = 24 * 60 * 60
+# How long the lookup may take, and how long the once-a-day refresh waits for it at the end of a
+# command. Keep the join >= the fetch timeout so the worker thread finishes (and records its
+# result) within the wait instead of being killed at process exit mid-write.
+# NOTE(review): urlopen's timeout bounds each blocking socket operation rather than the whole
+# request, so a slow multi-read response could still outlast the join — confirm this is acceptable.
+_FETCH_TIMEOUT_S = 1.5
+_JOIN_TIMEOUT_S = 2.0
+# Any non-empty value of this environment variable disables the check and the notice.
+_OPT_OUT_ENV = "FLASH_NO_UPDATE_CHECK"
+# A PEP 440 version only uses this charset. We reject anything else (control chars, ANSI escape
+# sequences, newlines) before printing the value to a terminal, so a poisoned cache or a hostile
+# response can't inject escape codes into the notice. The length bound is just a sanity cap
+# (1 leading alphanumeric + up to 63 more characters).
+_SAFE_VERSION = re.compile(r"\A[A-Za-z0-9][A-Za-z0-9.+!_-]{0,63}\Z")
+# A coarse subset of the PEP 440 grammar: the numeric release plus optional pre/post/dev markers.
+# Enough to order the simple versions this package ships and to spot pre-releases; this is not a
+# full PEP 440 implementation (the stdlib-only client can't depend on `packaging`).
+# The pattern is anchored at the start only (``\A``, no ``\Z``), so any text after the recognised
+# markers — e.g. a ``+local`` suffix — is ignored rather than rejected.
+_VERSION_RE = re.compile(
+    r"""\A\s*v?
+        (?P<release>\d+(?:\.\d+)*)
+        (?P<pre>[._-]?(?:alpha|beta|preview|pre|rc|a|b|c)\d*)?
+        (?P<post>[._-]?(?:post|rev|r)\d*|-\d+)?
+        (?P<dev>[._-]?dev\d*)?
+    """,
+    re.IGNORECASE | re.VERBOSE,
+)
+def _enabled() -> bool:
+    """Report whether the update check may run: not opted out, and stderr is a terminal."""
+    opted_out = os.environ.get(_OPT_OUT_ENV)
+    if opted_out:
+        return False
+    try:
+        attached_to_tty = bool(sys.stderr.isatty())
+    except Exception:
+        # stderr can be detached, closed or replaced (some embedded contexts). Treat every such
+        # failure as "not a TTY" so this probe can never crash a command.
+        return False
+    return attached_to_tty
+def _normalize_release(release: tuple[int, ...]) -> tuple[int, ...]:
+    """Strip trailing zero segments so ``1.0`` equals ``1.0.0``; one segment is always kept."""
+    keep = len(release)
+    # Walk back from the end while the last kept segment is zero, never below one segment.
+    while keep > 1 and release[keep - 1] == 0:
+        keep -= 1
+    return tuple(release[:keep])
+def _version_key(version: str) -> tuple[tuple[int, ...], int, int]:
+    """Build a coarse PEP 440 sort key ``(release, final_rank, post)``; larger means newer.
+    ``release`` is the normalized numeric tuple (``1.0 == 1.0.0``). ``final_rank`` is 0 for a
+    pre-release or dev version and 1 otherwise, so those sort below the final release with the
+    same number. ``post`` is the post-release number (0 when absent), which sorts a post-release
+    above its final. Epochs and local versions are ignored — the catalog ships only simple
+    versions. Unparseable (or empty/``None``) input yields an empty release tuple, which sorts
+    below every real version.
+    """
+    parsed = _VERSION_RE.match(version or "")
+    if parsed is None:
+        return ((), 1, 0)
+    segments = tuple(int(piece) for piece in parsed.group("release").split("."))
+    final_rank = 0 if (parsed.group("pre") or parsed.group("dev")) else 1
+    # The post marker may carry no number ("1.0.post"); that counts as post 0.
+    post_number = re.search(r"\d+", parsed.group("post") or "")
+    post = int(post_number.group()) if post_number else 0
+    return (_normalize_release(segments), final_rank, post)
+def _is_prerelease(version: str) -> bool:
+    """Tell whether ``version`` has a pre-release or dev marker (a/b/c/rc/alpha/beta/dev)."""
+    parsed = _VERSION_RE.match(version or "")
+    if parsed is None:
+        return False
+    return bool(parsed.group("pre") or parsed.group("dev"))
+def _is_newer(latest: str, current: str) -> bool:
+    """Return True only if ``latest`` sorts strictly above ``current`` (coarse PEP 440 order).
+    An unparseable ``latest`` (empty release tuple) is never considered newer.
+    """
+    candidate = _version_key(latest)
+    if not candidate[0]:
+        return False
+    return candidate > _version_key(current)
+def _clean_version(value: object) -> str | None:
+    """Pass ``value`` through only when it is a safe, escape-free version string.
+    Applied to both the PyPI response and the on-disk cache. ``_version_key`` looks only at the
+    numeric/marker prefix, so without this filter an injected suffix (ANSI codes, newlines) could
+    reach the terminal. Non-strings and anything outside the PEP 440 charset give ``None``.
+    """
+    if isinstance(value, str) and _SAFE_VERSION.match(value):
+        return value
+    return None
+def _read_cache() -> dict:
+    """Load the update-check cache as a dict, falling back to ``{}`` for anything else."""
+    loaded = read_json_or_empty(CACHE_PATH)
+    # read_json_or_empty hands back whatever the file parses to. A non-object (e.g. ``[]``)
+    # would make the ``.get()`` callers raise, and _check_due runs before main()'s error
+    # handling, so anything that isn't a dict is replaced by an empty one.
+    if not isinstance(loaded, dict):
+        return {}
+    return loaded
+def _check_due(now: float) -> bool:
+    """True when there's no fresh cached check (so we should hit PyPI).
+    Args:
+        now: current time in seconds since the epoch (``time.time()``).
+    Returns:
+        ``False`` only when the cache holds a numeric ``checked_at`` that lies between 0 and
+        ``_CHECK_INTERVAL_S`` seconds in the past. A missing or non-numeric stamp, a stale stamp,
+        a stamp in the future (clock moved back, hand-edited or corrupt cache), NaN/inf, or an
+        integer too large to compare all count as "due", so a bad cache can neither disable the
+        check forever nor crash the command. Never raises (runs before main()'s error handling).
+    """
+    checked_at = _read_cache().get("checked_at")
+    if not isinstance(checked_at, (int, float)):
+        return True
+    try:
+        elapsed = now - checked_at
+    except OverflowError:
+        # JSON integers are unbounded; float - huge int cannot be represented.
+        return True
+    # NaN fails every comparison and a negative elapsed means a future stamp: both are "due".
+    return not (0 <= elapsed < _CHECK_INTERVAL_S)
+def _fetch_latest_version(timeout: float = _FETCH_TIMEOUT_S) -> str | None:
+    """Return PyPI's latest version for the package, or ``None`` on any failure/odd response.
+    Args:
+        timeout: socket timeout in seconds handed to ``urlopen``.
+    Returns:
+        The ``info.version`` string from the project's JSON document if it passes
+        ``_clean_version``; otherwise ``None``. Network errors, timeouts, protocol-level HTTP
+        errors, undecodable/invalid JSON and unexpected payload shapes are all logged at DEBUG
+        (where applicable) and reported as ``None``.
+    """
+    # Function-scope import keeps this fix self-contained; urllib.request already loads it.
+    import http.client
+    req = urllib.request.Request(
+        _PYPI_JSON_URL,
+        headers={
+            "Accept": "application/json",
+            "User-Agent": f"{PACKAGE_NAME}/{__version__}",
+        },
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            payload = json.loads(resp.read())
+    except (
+        urllib.error.URLError,
+        OSError,
+        ValueError,
+        TimeoutError,
+        # IncompleteRead / BadStatusLine etc. derive from HTTPException, which is *not* an
+        # OSError, so a truncated or malformed response would otherwise escape this handler.
+        http.client.HTTPException,
+    ) as exc:
+        logger.debug("update check: PyPI lookup failed: %s", exc)
+        return None
+    # Expected shape is {"info": {"version": ...}}; tolerate anything else (a proxy error page,
+    # ``[]``, ``{"info": null}``, ...) instead of letting a dereference raise into the caller.
+    info = payload.get("info") if isinstance(payload, dict) else None
+    version = info.get("version") if isinstance(info, dict) else None
+    return _clean_version(version)
+def _stamp_check_time() -> None:
+    """Persist "checked just now" while keeping any cached version; synchronous, best-effort.
+    Called before the background lookup is started, so the daily back-off still applies when the
+    daemon worker is killed at process exit before recording its own result — otherwise a
+    stale/missing cache would make every command re-run (and wait on) the lookup. Every error is
+    suppressed because this runs before main()'s error handling.
+    """
+    with contextlib.suppress(Exception):
+        stamped = {**_read_cache(), "checked_at": time.time()}
+        secure_json_write(CACHE_PATH, stamped)
+def _refresh_cache() -> None:
+    """Look up the latest version on PyPI and store it; daemon-thread target that never raises.
+    :func:`_stamp_check_time` has already recorded the attempt time, so when the lookup yields
+    nothing this simply returns and the daily back-off set there remains in force.
+    """
+    try:
+        fetched = _fetch_latest_version()
+        if fetched:
+            refreshed = {
+                **_read_cache(),
+                "checked_at": time.time(),
+                "pypi_version": fetched,
+            }
+            secure_json_write(CACHE_PATH, refreshed)
+    except Exception as exc:  # truly never let a background thread escape
+        logger.debug("update check: refresh failed: %s", exc)
+def _supports_color() -> bool:
+    """Color is allowed unless ``NO_COLOR`` is set to a non-empty value."""
+    return os.environ.get("NO_COLOR", "") == ""
+def _red(text: str) -> str:
+    """Wrap ``text`` in the ANSI red / reset sequences, or return it unchanged without color."""
+    if not _supports_color():
+        return text
+    return f"\033[31m{text}\033[0m"
+def _build_notice() -> str | None:
+    """Compose the upgrade notice from the cached PyPI version; ``None`` when nothing to show.
+    Nothing is shown when the cached value is missing or unsafe, is a pre-release (rc/dev — only
+    stable releases are advertised), or is not newer than the installed ``__version__``.
+    """
+    cached = _clean_version(_read_cache().get("pypi_version"))
+    if not cached:
+        return None
+    if _is_prerelease(cached):
+        return None
+    if not _is_newer(cached, __version__):
+        return None
+    headline = f"A new release of {PACKAGE_NAME} is available: {__version__} -> {cached}\n"
+    return _red(headline + f"Update with `{UPGRADE_COMMAND}`.")
+def maybe_start_update_check() -> threading.Thread | None:
+    """Start a background PyPI refresh when one is due and return its thread, else ``None``.
+    Hand the result to :func:`emit_update_notice`. Returns ``None`` without doing any work when
+    the feature is disabled or the cached check is still fresh, so the common path is free.
+    """
+    if not _enabled():
+        return None
+    if not _check_due(time.time()):
+        return None
+    # Record the attempt synchronously before the worker exists, so the daily back-off holds even
+    # if the daemon is killed at process exit before it writes (see _stamp_check_time).
+    _stamp_check_time()
+    worker = threading.Thread(target=_refresh_cache, name="flash-update-check", daemon=True)
+    try:
+        worker.start()
+    except RuntimeError:
+        # A thread can't be spawned (e.g. interpreter shutting down) — skip the check silently.
+        return None
+    else:
+        return worker
+def emit_update_notice(notifier: threading.Thread | None = None) -> None:
+    """Write the upgrade notice, if there is one, to stderr at the end of a command.
+    When ``notifier`` is given, waits up to ``_JOIN_TIMEOUT_S`` for that refresh so a version
+    fetched during this run can be shown; if it is not done in time, the cached value is used.
+    """
+    if _enabled():
+        if notifier is not None:
+            with contextlib.suppress(RuntimeError):
+                notifier.join(timeout=_JOIN_TIMEOUT_S)
+        # Called from main()'s finally block, so nothing may escape: a broken pipe
+        # (`flash ... | head`), full disk, or closed stderr would otherwise crash the command.
+        with contextlib.suppress(Exception):
+            message = _build_notice()
+            if message:
+                print(message, file=sys.stderr)

flash/catalog.py ADDED Viewed

@@ -0,0 +1,253 @@
+"""Curated model catalog for one-consumer-GPU LoRA jobs."""
+from __future__ import annotations
+import math
+from dataclasses import asdict, dataclass
+from typing import Any
+ALGORITHMS = ("sft", "grpo")
+def normalize_algorithm(value: str) -> str:
+    """Lowercase and validate an algorithm name; a falsy value selects ``"grpo"``.
+    Raises:
+        ValueError: if the lowercased name is not one of ``ALGORITHMS``.
+    """
+    algo = (value or "grpo").lower()
+    if algo in ALGORITHMS:
+        return algo
+    raise ValueError(f"unsupported algorithm: {algo}; known: {', '.join(ALGORITHMS)}")
+# The default GPU class, used as the open-model-policy sizing reference and as the
+# spec/from_dict fallback. The managed GPU class set (KNOWN) lives in providers.base; RunPod
+# pricing lives under providers/runpod. Defined above ModelInfo so it can back the
+# recommended_gpu field default.
+DEFAULT_GPU = "RTX 5090"
+# Output vocab (== config.vocab_size, the lm_head / logits width — the PADDED model vocab,
+# NOT the raw tokenizer token count). Sizes the GRPO fp32-logits VRAM term (engine.vram) and
+# the per-device completion cap (engine.worker.rl_per_device_comps). This is the open-model
+# fallback; curated per-model values live on each ModelInfo below and are read via
+# vocab_size_for(). Over-estimating is the memory-SAFE direction (smaller cap, larger VRAM
+# estimate), so the fallback is the largest catalog vocab.
+_DEFAULT_VOCAB_SIZE = 248_320
+@dataclass(frozen=True)
+class ModelInfo:
+    """Immutable catalog record describing one trainable model.
+    The first five fields (``id``, ``display_name``, ``params``, ``algos``, ``min_vram_gb``)
+    are required; the rest have defaults. ``to_dict`` returns a plain-dict copy of every field.
+    """
+    # Model identifier; catalog entries use the same string as their MODELS key.
+    id: str
+    # Human-readable name.
+    display_name: str
+    # Free-form size description for display (see ``params_b`` for the numeric size).
+    params: str
+    # Algorithm names this model supports (values from ALGORITHMS).
+    algos: tuple[str, ...]
+    # Minimum GPU memory (GB) for the model; also the GRPO floor when grpo_min_vram_gb is 0.
+    min_vram_gb: int
+    quant: str = "bf16"
+    recommended_gpu: str = DEFAULT_GPU
+    # GRPO needs more VRAM than SFT (a colocated vLLM rollout engine holds a second copy of
+    # the weights + KV cache). 0 => GRPO uses ``min_vram_gb`` like SFT; set it when the GRPO
+    # tier needs a bigger card than SFT (the colocate 2nd weight copy + KV pool). Consumed by
+    # engine.vram.model_required_vram_gb.
+    grpo_min_vram_gb: int = 0
+    notes: str = ""
+    # Worker container disk this model needs (GB). 0 = the platform default (64 GB)
+    # suffices. The runner raises gpu.disk_gb to at least this, so big-checkpoint
+    # models whose weights alone exceed 64 GB work out of the box.
+    min_disk_gb: int = 0
+    # Thinking/reasoning capability of the checkpoint's chat template:
+    #   "none"    no <think> support (or a non-thinking variant) — `thinking = true` is
+    #             rejected for these models
+    #   "hybrid"  template honors enable_thinking (Qwen3-style hybrid reasoning)
+    #   "always"  the model always emits reasoning; enable_thinking can't turn it off,
+    #             so `thinking = true` is required
+    #   "unknown" open-model-policy entries (capability not verified)
+    thinking: str = "none"
+    # Output vocab = config.vocab_size (lm_head / logits width, the padded model vocab — not
+    # the raw tokenizer count). Drives the GRPO fp32-logits memory term and the per-device
+    # completion cap. Curated per model below; defaults to the open-model fallback.
+    vocab_size: int = _DEFAULT_VOCAB_SIZE
+    # Total parameters in billions — the numeric model size the cost estimator reads directly
+    # (no parsing of the ``params`` display string). Curated per catalog model below.
+    params_b: float = 0.0
+    def to_dict(self) -> dict[str, Any]:
+        """Return all fields as a new dict (``dataclasses.asdict``)."""
+        return asdict(self)
+# The default model Flash trains when a config omits one. A current-gen dense 4B
+# (text-only fine-tune) on the modern worker stack — the safe out-of-the-box choice for
+# the average developer. It is thinking-"hybrid"; the thinking flag defaults OFF.
+DEFAULT_MODEL = "Qwen/Qwen3.5-4B"
+# Curated catalog, keyed by model id; each key is repeated as the entry's ``id`` field, so the
+# two must be kept identical when adding a model.
+MODELS: dict[str, ModelInfo] = {
+    "openbmb/MiniCPM5-1B": ModelInfo(
+        id="openbmb/MiniCPM5-1B",
+        display_name="MiniCPM5 1B",
+        params="1.2B dense (Llama arch)",
+        params_b=1.2,
+        vocab_size=130_560,
+        algos=("sft", "grpo"),
+        min_vram_gb=12,
+        recommended_gpu="RTX 4090",
+        thinking="hybrid",
+        notes="On-device class SLM (131k ctx); standard Llama architecture.",
+    ),
+    # ---- Qwen3.5 dense family: validated on the modern worker stack ----
+    # (trl 1.x / vllm 0.19 / transformers 5.x). Trained + served TEXT-ONLY: the
+    # checkpoints are natively multimodal, so LoRA excludes the vision tower and vLLM
+    # loads language_model_only (see flash.engine.worker). Each entry passed a real
+    # train+eval smoke on its recommended GPU (bench/results/phase1/).
+    "Qwen/Qwen3.5-0.8B": ModelInfo(
+        id="Qwen/Qwen3.5-0.8B",
+        display_name="Qwen3.5 0.8B",
+        params="0.9B (text-only fine-tune)",
+        params_b=0.9,
+        vocab_size=248_320,
+        algos=("sft", "grpo"),
+        min_vram_gb=12,
+        recommended_gpu="RTX 4090",
+        thinking="hybrid",
+        notes="Smallest Qwen3.5; cheap smoke/dev runs with the modern arch.",
+    ),
+    "Qwen/Qwen3.5-2B": ModelInfo(
+        id="Qwen/Qwen3.5-2B",
+        display_name="Qwen3.5 2B",
+        params="2.3B (text-only fine-tune)",
+        params_b=2.3,
+        vocab_size=248_320,
+        algos=("sft", "grpo"),
+        min_vram_gb=16,
+        recommended_gpu="RTX 4090",
+        thinking="hybrid",
+    ),
+    "Qwen/Qwen3.5-4B": ModelInfo(
+        id="Qwen/Qwen3.5-4B",
+        display_name="Qwen3.5 4B",
+        params="4.7B (text-only fine-tune)",
+        params_b=4.7,
+        vocab_size=248_320,
+        algos=("sft", "grpo"),
+        min_vram_gb=32,
+        recommended_gpu="RTX 5090",
+        thinking="hybrid",
+        notes="Current-gen 4B. GRPO uses the sleep-mode memory recipe (hybrid arch needs "
+        "extra engine state-cache); fused DeltaNet kernels ship in the default stack.",
+    ),
+    "Qwen/Qwen3.5-9B": ModelInfo(
+        id="Qwen/Qwen3.5-9B",
+        display_name="Qwen3.5 9B",
+        params="9.7B (text-only fine-tune)",
+        params_b=9.7,
+        vocab_size=248_320,
+        algos=("sft", "grpo"),
+        min_vram_gb=48,
+        # bf16 LoRA (NOT QLoRA). 4-bit QLoRA was abandoned for the 9B because the GRPO vLLM
+        # rollout MERGES the LoRA into the 4-bit base (peft bnb merge), and that rounding makes
+        # the sampler policy diverge from the bf16 trainer -> TRL importance-sampling ratio
+        # collapses to 0 (no learning) + runaway/non-terminating generations. bf16 keeps the
+        # rollout and trainer in the same precision so GRPO actually learns. Costs a bigger GPU:
+        # ~19 GB weights; SFT fits a 48 GB card, colocated GRPO (two bf16 copies + KV + the
+        # 248k-vocab fp32 logits) needs an 80 GB class -> grpo_min_vram_gb floor below.
+        grpo_min_vram_gb=80,
+        quant="bf16",
+        recommended_gpu="A100 PCIe",
+        thinking="hybrid",
+        notes="bf16 LoRA. ~19 GB of weights; SFT fits a 48 GB card, while colocated GRPO "
+        "(two bf16 copies + KV + the 248k-vocab fp32 logits) needs an 80 GB-class card "
+        "(grpo_min_vram_gb floor).",
+    ),
+}
+def list_models() -> list[ModelInfo]:
+    """Return every catalog model ordered by ``min_vram_gb``, then by ``id``."""
+    ordered = list(MODELS.values())
+    ordered.sort(key=lambda info: (info.min_vram_gb, info.id))
+    return ordered
+def get_model(model_id: str) -> ModelInfo:
+    """Look up a curated catalog entry by id.
+    Raises:
+        ValueError: if ``model_id`` is not in the catalog; the message lists the known ids and
+            points at the open-model policy. The original ``KeyError`` is chained as the cause.
+    """
+    try:
+        info = MODELS[model_id]
+    except KeyError as exc:
+        known_ids = ", ".join(MODELS)
+        raise ValueError(
+            f"unsupported model {model_id!r}; choose one of: {known_ids} — or set "
+            f'model_policy = "allow" in the config to run any HF model that fits the GPU '
+            f"(open-model policy)"
+        ) from exc
+    else:
+        return info
+def vocab_size_for(model_id: str) -> int:
+    """Output vocab (== config.vocab_size, the lm_head / logits width) for ``model_id``.
+    This is the number that sizes the GRPO fp32-logits VRAM term and the per-device completion
+    cap: the PADDED model vocab, not the raw tokenizer token count. Catalog models return their
+    curated value; any other id (open-model-policy entries) gets the safe default.
+    """
+    try:
+        return MODELS[model_id].vocab_size
+    except KeyError:
+        return _DEFAULT_VOCAB_SIZE
+def resolve_model(
+    model_id: str,
+    algorithm: str,
+    policy: str = "catalog",
+    gpu: str | None = None,
+) -> ModelInfo:
+    """Resolve ``model_id`` to a ModelInfo according to ``policy``.
+    ``catalog`` (default): the model must be a curated catalog entry.
+    ``allow``: any HF model is accepted; a coarse VRAM-fit estimate (HF safetensors
+    metadata, no download) blocks only provably-impossible fits and warns on tight ones.
+    A catalog model is always validated against the algorithm, whichever policy is set.
+    Raises:
+        ValueError: unknown algorithm, unsupported model/algorithm pair, a non-catalog model
+            without the ``allow`` policy, or an open model that does not fit the GPU.
+    """
+    algo = normalize_algorithm(algorithm)
+    if model_id in MODELS:
+        return validate_model_for_algorithm(model_id, algo)
+    if policy == "allow":
+        return _resolve_open_model(model_id, algo, gpu)
+    # Not in the catalog and not allowed: get_model raises the error (with the open-model hint).
+    return get_model(model_id)
+def _resolve_open_model(model_id: str, algo: str, gpu: str | None) -> ModelInfo:
+    """Synthesize a ModelInfo for the open-model "allow" policy from a coarse VRAM-fit
+    estimate (HF safetensors metadata, no download). Blocks provably-impossible fits and
+    warns on tight ones. Isolates the engine.vram dependency + disk-floor heuristic from
+    the curated-catalog path in resolve_model.
+    Args:
+        model_id: model identifier handed to ``check_fit``.
+        algo: already-normalized algorithm name.
+        gpu: requested GPU class; ``None``/empty falls back to ``DEFAULT_GPU``.
+    Returns:
+        A ModelInfo with ``thinking="unknown"``, all ``ALGORITHMS`` enabled, and VRAM/disk floors
+        derived from the estimate (24 GB VRAM when the estimate has no size).
+    Raises:
+        ValueError: when the estimate's verdict is ``"too_big"``.
+    """
+    import sys
+    from flash.engine.vram import check_fit
+    est = check_fit(model_id, algo, gpu or DEFAULT_GPU)
+    if est.verdict == "too_big":
+        raise ValueError(
+            f"{model_id} does not fit the requested GPU: {est.describe()}. "
+            f"Pick a smaller model or a larger supported GPU."
+        )
+    if est.verdict in ("tight", "unknown"):
+        # Diagnostics go to stderr, never stdout, so a warning can't corrupt machine-readable
+        # command output that a caller pipes or captures.
+        print(f"warning: open-model policy: {est.describe()}", file=sys.stderr)
+    params = f"{est.params_b:.1f}B" if est.params_b else "unknown size"
+    # Disk floor for the open model: a bf16 checkpoint is ~2 GB per billion params;
+    # add worker-stack headroom so a large model that passes the VRAM check can't
+    # provision a paid worker and then fail in prefetch_model when the checkpoint
+    # overflows the 64 GB container default. 0 (unknown size) leaves the default
+    # (the user can still raise it with gpu.disk_gb).
+    min_disk = int(est.params_b * 2) + 64 if est.params_b else 0
+    # NOTE(review): ``params_b`` is left at the ModelInfo default (0.0) even when the estimate
+    # knows the size — confirm the cost estimator handles open models without it.
+    return ModelInfo(
+        id=model_id,
+        display_name=model_id,
+        params=params,
+        algos=ALGORITHMS,
+        min_vram_gb=math.ceil(est.est_gb) if est.est_gb else 24,
+        min_disk_gb=min_disk,
+        recommended_gpu=gpu or DEFAULT_GPU,
+        thinking="unknown",
+        notes="unlisted model accepted via the open-model policy (not curated/validated)",
+    )
+def validate_model_for_algorithm(model_id: str, algorithm: str) -> ModelInfo:
+    """Return the catalog entry for ``model_id`` after checking it supports ``algorithm``.
+    Raises:
+        ValueError: unknown model, unknown algorithm, or an algorithm the model does not list.
+    """
+    info = get_model(model_id)
+    algo = normalize_algorithm(algorithm)
+    # Catalog entries advertise the capability classes "sft" and "grpo": grpo needs the
+    # colocated rollout engine, sft is trainer-only.
+    required = "sft" if algo != "grpo" else "grpo"
+    if required in info.algos:
+        return info
+    allowed = ", ".join(info.algos)
+    raise ValueError(f"{model_id} supports {allowed}, not {algo}")
+def public_model_rows() -> list[dict[str, Any]]:
+    """Return the catalog as plain dicts, in ``list_models()`` order."""
+    rows: list[dict[str, Any]] = []
+    for info in list_models():
+        rows.append(info.to_dict())
+    return rows

flash/cli/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """CLI package."""