PyPI - freesolo-flash - Versions diffs - 0.2.0__py3-none-any.whl - Mend

freesolo-flash 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

flash/__init__.py +11 -0
flash/_fileio.py +35 -0
flash/_logging.py +49 -0
flash/catalog.py +245 -0
flash/cli/__init__.py +1 -0
flash/cli/main/__init__.py +220 -0
flash/cli/main/__main__.py +6 -0
flash/cli/main/commands.py +430 -0
flash/cli/main/envpush.py +333 -0
flash/client/__init__.py +14 -0
flash/client/config.py +46 -0
flash/client/http.py +202 -0
flash/client/specs.py +23 -0
flash/engine/__init__.py +7 -0
flash/engine/accounting.py +37 -0
flash/engine/chalk_kernels.py +150 -0
flash/engine/multiturn_rollout.py +273 -0
flash/engine/recipe.py +86 -0
flash/engine/vram.py +382 -0
flash/engine/worker/__init__.py +1960 -0
flash/engine/worker/__main__.py +4 -0
flash/engine/worker/lora.py +137 -0
flash/engine/worker/perf.py +467 -0
flash/envs/__init__.py +10 -0
flash/envs/adapter/__init__.py +384 -0
flash/envs/adapter/rubric.py +222 -0
flash/envs/base.py +49 -0
flash/envs/registry.py +76 -0
flash/mcp/__init__.py +1 -0
flash/mcp/server.py +83 -0
flash/providers/__init__.py +59 -0
flash/providers/_auth.py +24 -0
flash/providers/_http.py +100 -0
flash/providers/_poll.py +87 -0
flash/providers/allocator.py +173 -0
flash/providers/base.py +496 -0
flash/providers/preflight.py +80 -0
flash/providers/runpod/__init__.py +108 -0
flash/providers/runpod/api.py +109 -0
flash/providers/runpod/auth.py +24 -0
flash/providers/runpod/gpus.py +46 -0
flash/providers/runpod/jobs.py +519 -0
flash/providers/runpod/preflight.py +30 -0
flash/providers/runpod/pricing.py +108 -0
flash/providers/runpod/train/__init__.py +141 -0
flash/providers/runpod/train/deps.py +371 -0
flash/providers/runpod/train/endpoints.py +501 -0
flash/providers/vast/__init__.py +120 -0
flash/providers/vast/_bootstrap.py +288 -0
flash/providers/vast/api.py +215 -0
flash/providers/vast/auth.py +19 -0
flash/providers/vast/gpus.py +21 -0
flash/providers/vast/jobs/__init__.py +555 -0
flash/providers/vast/jobs/builders.py +205 -0
flash/providers/vast/preflight.py +27 -0
flash/providers/vast/pricing.py +51 -0
flash/providers/vast/train.py +27 -0
flash/py.typed +0 -0
flash/runner/__init__.py +290 -0
flash/runner/deploy.py +349 -0
flash/runner/lifecycle.py +437 -0
flash/schema/__init__.py +285 -0
flash/schema/fields.py +210 -0
flash/serve/__init__.py +1 -0
flash/serve/deploy.py +195 -0
flash/server/__init__.py +1 -0
flash/server/__main__.py +20 -0
flash/server/app.py +424 -0
flash/server/auth.py +132 -0
flash/server/db.py +152 -0
flash/server/envs.py +449 -0
flash/spec.py +291 -0
freesolo_flash-0.2.0.dist-info/METADATA +99 -0
freesolo_flash-0.2.0.dist-info/RECORD +77 -0
freesolo_flash-0.2.0.dist-info/WHEEL +4 -0
freesolo_flash-0.2.0.dist-info/entry_points.txt +4 -0
freesolo_flash-0.2.0.dist-info/licenses/LICENSE +201 -0

flash/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+"""Flash — managed LoRA post-training: log in with your freesolo key, train.
+A focused developer experience (TOML run specs, pluggable environments,
+CLI/API/MCP entry points, adapter deployment). Users authenticate with their
+freesolo API key (`flash login`); the control plane runs each job on a managed
+GPU (RunPod or Vast.ai) behind the scenes.
+"""
+__all__ = ["__version__"]
+__version__ = "0.2.0"

flash/_fileio.py ADDED Viewed

@@ -0,0 +1,35 @@
+"""Small shared file-IO helpers for credential/manifest JSON under ``~/.flash``."""
+from __future__ import annotations
+import contextlib
+import json
+import os
+from pathlib import Path
def read_json_or_empty(path: Path) -> dict:
    """Parse a JSON object file, returning ``{}`` if it's missing or unreadable.

    A file that holds valid JSON whose top level is not an object (a list, string,
    number, ``null`` ...) is treated like an unreadable file, so the caller always
    receives a ``dict`` as the return annotation promises.
    """
    try:
        payload = json.loads(path.read_text())
    except (OSError, ValueError):
        # OSError: missing file / permission problem. ValueError: malformed JSON or
        # undecodable bytes (JSONDecodeError and UnicodeDecodeError both subclass it).
        return {}
    return payload if isinstance(payload, dict) else {}
def secure_json_write(path: Path, data: dict) -> None:
    """Write ``data`` as JSON with private permissions (the file may hold a secret).

    Creates the parent dir (0700) and opens the file 0600 from the start — never
    write_text + chmod, which leaves it umask-readable in between. ``O_NOFOLLOW``
    (where available) refuses to follow a symlink planted at ``path`` so the write
    can't be redirected to clobber an arbitrary file.

    The mode passed to ``os.open`` is only honored when the file is *created*; when
    ``path`` already exists with looser permissions, the descriptor is tightened to
    0600 with ``os.fchmod`` (where available) before any byte of ``data`` is written.

    Raises:
        OSError: if the parent directory or the file itself cannot be created/opened.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Best effort: the directory may be owned by someone else or live on a
    # filesystem without POSIX modes.
    with contextlib.suppress(OSError):
        os.chmod(path.parent, 0o700)
    flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC | getattr(os, "O_NOFOLLOW", 0)
    fd = os.open(path, flags, 0o600)
    with os.fdopen(fd, "w") as f:
        # Tighten a pre-existing file BEFORE the secret lands in it. Operating on
        # the descriptor (not the path) cannot be raced by a rename/symlink swap.
        if hasattr(os, "fchmod"):
            with contextlib.suppress(OSError):
                os.fchmod(f.fileno(), 0o600)
        json.dump(data, f, indent=2, sort_keys=True)
    # Path-based fallback for platforms that lack os.fchmod.
    with contextlib.suppress(OSError):
        os.chmod(path, 0o600)

flash/_logging.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""Package logging helpers.
+Library code logs through the ``flash`` logger and never configures handlers on import (it
+attaches a :class:`logging.NullHandler`), so importing Flash stays silent for downstream
+applications. The CLI calls :func:`configure_logging` to attach a console handler whose
+level is controlled by ``-v/--verbose``.
+"""
+from __future__ import annotations
+import logging
# Name of the package-level logger that every Flash logger hangs off.
_ROOT_NAME = "flash"
# Make sure the package logger carries a NullHandler (added at most once, even if
# this module is executed again) so "No handlers could be found" warnings never
# appear and importing the library stays silent unless the application opts in.
_root = logging.getLogger(_ROOT_NAME)
if all(not isinstance(h, logging.NullHandler) for h in _root.handlers):
    _root.addHandler(logging.NullHandler())
def get_logger(name: str | None = None) -> logging.Logger:
    """Return a logger that lives inside the ``flash`` namespace.

    ``None``, ``""`` and ``"flash"`` yield the package logger itself; a name that
    already starts with ``flash.`` is used verbatim; any other name is nested under
    the package logger (``"x"`` becomes ``"flash.x"``). Intended call:
    ``get_logger(__name__)``.
    """
    if not name:
        qualified = _ROOT_NAME
    elif name == _ROOT_NAME or name.startswith(f"{_ROOT_NAME}."):
        qualified = name
    else:
        qualified = f"{_ROOT_NAME}.{name}"
    return logging.getLogger(qualified)
def configure_logging(verbosity: int = 0, level: int | None = None) -> None:
    """Attach a console handler to the ``flash`` logger and set its level.

    ``verbosity`` maps repeated ``-v`` flags to levels (<=0=WARNING, 1=INFO,
    >=2=DEBUG). A negative count is treated like 0 rather than falling through to
    DEBUG. An explicit ``level`` overrides the verbosity mapping.

    Calling this again replaces the console handler installed by the previous call,
    so handlers never stack up.
    """
    if level is None:
        if verbosity <= 0:
            level = logging.WARNING
        elif verbosity == 1:
            level = logging.INFO
        else:
            level = logging.DEBUG
    logger = logging.getLogger(_ROOT_NAME)
    logger.setLevel(level)
    # Drop only the handler(s) tagged by an earlier call; handlers attached by the
    # application (and the import-time NullHandler) are left alone. Iterate over a
    # snapshot because removeHandler mutates logger.handlers.
    for stale in [h for h in logger.handlers if getattr(h, "_flash_console", False)]:
        logger.removeHandler(stale)
    handler = logging.StreamHandler()  # stderr
    handler.setLevel(level)
    handler.setFormatter(logging.Formatter("%(levelname)s %(name)s: %(message)s"))
    # Tag the handler so the next call can recognise and replace it.
    handler._flash_console = True  # type: ignore[attr-defined]
    logger.addHandler(handler)

flash/catalog.py ADDED Viewed

@@ -0,0 +1,245 @@
+"""Curated model catalog for one-consumer-GPU LoRA jobs."""
+from __future__ import annotations
+import math
+from dataclasses import asdict, dataclass
+from typing import Any
ALGORITHMS = ("sft", "grpo")


def normalize_algorithm(value: str) -> str:
    """Return the canonical (lower-cased) algorithm name.

    A falsy ``value`` (``""`` / ``None``) falls back to ``"grpo"``.

    Raises:
        ValueError: if the lower-cased name is not one of ``ALGORITHMS``.
    """
    canonical = value.lower() if value else "grpo"
    if canonical in ALGORITHMS:
        return canonical
    known = ", ".join(ALGORITHMS)
    raise ValueError(f"unsupported algorithm: {canonical}; known: {known}")
# GPU class a run lands on when none is pinned; also the sizing reference for the
# open-model policy and the spec/from_dict fallback. The managed GPU class set (KNOWN)
# lives in providers.base, and per-provider classes/pricing live under
# providers/{runpod,vast}. Declared ahead of ModelInfo because it backs the
# ``recommended_gpu`` field default.
DEFAULT_GPU = "RTX 5090"
# Fallback output vocab (== config.vocab_size, i.e. the lm_head / logits width: the
# PADDED model vocab, NOT the raw tokenizer token count). It sizes the GRPO fp32-logits
# VRAM term (engine.vram) and the per-device completion cap
# (engine.worker.rl_per_device_comps). Curated per-model values sit on each ModelInfo
# and are read through vocab_size_for(). Over-estimating is the memory-SAFE direction
# (smaller cap, larger VRAM estimate), hence the fallback is the largest catalog vocab.
_DEFAULT_VOCAB_SIZE = 248_320


@dataclass(frozen=True)
class ModelInfo:
    """Immutable description of one trainable base model (curated or open-policy)."""

    id: str
    display_name: str
    params: str
    algos: tuple[str, ...]
    min_vram_gb: int
    quant: str = "bf16"
    recommended_gpu: str = DEFAULT_GPU
    # VRAM floor for GRPO, which needs more than SFT: a colocated vLLM rollout engine
    # keeps a second copy of the weights plus a KV cache. 0 means "same as
    # ``min_vram_gb``"; set it when the GRPO tier needs a bigger card than SFT.
    # Consumed by engine.vram.model_required_vram_gb.
    grpo_min_vram_gb: int = 0
    notes: str = ""
    # Worker container disk (GB) this model needs. 0 means the platform default
    # (64 GB) is enough. The runner raises gpu.disk_gb to at least this value, so
    # big-checkpoint models whose weights alone exceed 64 GB work out of the box.
    min_disk_gb: int = 0
    # Thinking/reasoning capability of the checkpoint's chat template:
    #   "none"    no <think> support (or a non-thinking variant) — `thinking = true`
    #             is rejected for these models
    #   "hybrid"  template honors enable_thinking (Qwen3-style hybrid reasoning)
    #   "always"  the model always emits reasoning; enable_thinking can't turn it
    #             off, so `thinking = true` is required
    #   "unknown" open-model-policy entries (capability not verified)
    thinking: str = "none"
    # Output vocab = config.vocab_size (lm_head / logits width, the padded model
    # vocab — not the raw tokenizer count). Drives the GRPO fp32-logits memory term
    # and the per-device completion cap. Defaults to the open-model fallback.
    vocab_size: int = _DEFAULT_VOCAB_SIZE

    def to_dict(self) -> dict[str, Any]:
        """Return every field as a plain ``dict`` (via ``dataclasses.asdict``)."""
        return asdict(self)
# Model trained when a config omits one: a current-gen dense 4B (text-only fine-tune)
# on the modern worker stack — the safe out-of-the-box choice for the average
# developer. It is thinking-"hybrid"; the thinking flag now defaults ON.
DEFAULT_MODEL = "Qwen/Qwen3.5-4B"
# Curated catalog, keyed by each entry's own ``id`` (insertion order is the order
# listed here).
MODELS: dict[str, ModelInfo] = {
    entry.id: entry
    for entry in (
        ModelInfo(
            id="openbmb/MiniCPM5-1B",
            display_name="MiniCPM5 1B",
            params="1.2B dense (Llama arch)",
            algos=("sft", "grpo"),
            min_vram_gb=12,
            recommended_gpu="RTX 4090",
            thinking="hybrid",
            vocab_size=130_560,
            notes="On-device class SLM (131k ctx); standard Llama architecture.",
        ),
        # ---- Qwen3.5 dense family: validated on the modern worker stack ----
        # (trl 1.x / vllm 0.19 / transformers 5.x). Trained + served TEXT-ONLY: the
        # checkpoints are natively multimodal, so LoRA excludes the vision tower and
        # vLLM loads language_model_only (see flash.engine.worker). Each entry passed
        # a real train+eval smoke on its recommended GPU (bench/results/phase1/).
        ModelInfo(
            id="Qwen/Qwen3.5-0.8B",
            display_name="Qwen3.5 0.8B",
            params="0.9B (text-only fine-tune)",
            algos=("sft", "grpo"),
            min_vram_gb=12,
            recommended_gpu="RTX 4090",
            thinking="hybrid",
            vocab_size=248_320,
            notes="Smallest Qwen3.5; cheap smoke/dev runs with the modern arch.",
        ),
        ModelInfo(
            id="Qwen/Qwen3.5-2B",
            display_name="Qwen3.5 2B",
            params="2.3B (text-only fine-tune)",
            algos=("sft", "grpo"),
            min_vram_gb=16,
            recommended_gpu="RTX 4090",
            thinking="hybrid",
            vocab_size=248_320,
        ),
        ModelInfo(
            id="Qwen/Qwen3.5-4B",
            display_name="Qwen3.5 4B",
            params="4.7B (text-only fine-tune)",
            algos=("sft", "grpo"),
            min_vram_gb=32,
            recommended_gpu="RTX 5090",
            thinking="hybrid",
            vocab_size=248_320,
            notes="Current-gen 4B. GRPO uses the sleep-mode memory recipe (hybrid arch needs "
            "extra engine state-cache); fused DeltaNet kernels ship in the default stack.",
        ),
        ModelInfo(
            id="Qwen/Qwen3.5-9B",
            display_name="Qwen3.5 9B",
            params="9.7B (text-only fine-tune)",
            algos=("sft", "grpo"),
            min_vram_gb=16,
            # MEMORY-OPTIMIZED: 4-bit NF4 frozen base + bf16 LoRA adapter (QLoRA). The
            # base drops from ~19 GB bf16 to ~5.3 GB, so colocated GRPO holds two 4-bit
            # copies (trainer + bnb-quantized vLLM rollout) instead of two bf16 copies
            # -> it fits a ~24-32 GB card instead of an 80 GB A100. NF4 is
            # near-lossless for adapter training (QLoRA paper + follow-ups), a small
            # quality trade for a ~3x cheaper GPU. No GRPO floor: the matrix sizes the
            # (much smaller) 4-bit footprint directly.
            quant="4bit-qlora",
            grpo_min_vram_gb=0,
            recommended_gpu="RTX 5090",
            thinking="hybrid",
            vocab_size=248_320,
            notes="QLoRA (4-bit NF4 base + bf16 LoRA). GRPO's colocated vLLM rollout loads the "
            "base 4-bit via bitsandbytes too, so both copies are 4-bit -> fits ~24-32 GB "
            "instead of 80 GB bf16. ~near-lossless vs bf16 LoRA.",
        ),
    )
}
def list_models() -> list[ModelInfo]:
    """Return all catalog entries ordered by ``min_vram_gb``, ties broken by ``id``."""
    catalog = list(MODELS.values())
    catalog.sort(key=lambda info: (info.min_vram_gb, info.id))
    return catalog
def get_model(model_id: str) -> ModelInfo:
    """Return the curated catalog entry for ``model_id``.

    Raises:
        ValueError: if ``model_id`` is not in the catalog. The message lists the
            curated ids and points at the open-model policy; the original
            ``KeyError`` is chained as the cause.
    """
    try:
        info = MODELS[model_id]
    except KeyError as exc:
        choices = ", ".join(MODELS)
        message = (
            f"unsupported model {model_id!r}; choose one of: {choices} — or set "
            'model_policy = "allow" in the config to run any HF model that fits the GPU '
            "(open-model policy)"
        )
        raise ValueError(message) from exc
    return info
def vocab_size_for(model_id: str) -> int:
    """Output vocab (== config.vocab_size, the lm_head / logits width) for a model.

    This is the number that sizes the GRPO fp32-logits VRAM term and the per-device
    completion cap. Curated catalog entries return their own value; any other id
    (open-model-policy entries) gets the safe default. It is the PADDED model vocab,
    not the raw tokenizer token count.
    """
    if model_id in MODELS:
        return MODELS[model_id].vocab_size
    return _DEFAULT_VOCAB_SIZE
def resolve_model(
    model_id: str,
    algorithm: str,
    policy: str = "catalog",
    gpu: str | None = None,
) -> ModelInfo:
    """Resolve ``model_id`` under the configured model policy.

    ``catalog`` (default): the model must be a curated catalog entry.
    ``allow``: any HF model is accepted; a coarse VRAM-fit estimate (HF safetensors
    metadata, no download) blocks only provably-impossible fits and warns on tight ones.

    The algorithm name is validated first. Curated models are checked against the
    algorithm whatever the policy is.

    Raises:
        ValueError: unknown algorithm, uncurated model under a non-``allow`` policy,
            or any error raised by the catalog / open-model resolution.
    """
    algo = normalize_algorithm(algorithm)
    curated = model_id in MODELS
    if curated:
        return validate_model_for_algorithm(model_id, algo)
    if policy == "allow":
        return _resolve_open_model(model_id, algo, gpu)
    # Uncurated model under the catalog policy: get_model raises the standard
    # "unsupported model" error, which already carries the open-model hint.
    return get_model(model_id)
def _resolve_open_model(model_id: str, algo: str, gpu: str | None) -> ModelInfo:
    """Build a ModelInfo for an uncurated model accepted via the "allow" policy.

    Uses engine.vram's coarse fit estimate (HF safetensors metadata, no download):
    a ``too_big`` verdict is rejected, ``tight``/``unknown`` verdicts print a warning.
    Keeps the engine.vram dependency and the disk-floor heuristic out of the
    curated-catalog path in resolve_model.

    Raises:
        ValueError: if the estimate says the model cannot fit the requested GPU.
    """
    # Imported lazily so the curated-catalog path never pulls in engine.vram.
    from flash.engine.vram import check_fit

    target_gpu = gpu or DEFAULT_GPU
    est = check_fit(model_id, algo, target_gpu)
    if est.verdict == "too_big":
        raise ValueError(
            f"{model_id} does not fit the requested GPU: {est.describe()}. "
            f"Pick a smaller model or a larger supported GPU."
        )
    if est.verdict in ("tight", "unknown"):
        print(f"warning: open-model policy: {est.describe()}")

    if est.params_b:
        size_label = f"{est.params_b:.1f}B"
        # Disk floor: a bf16 checkpoint is ~2 GB per billion params, plus
        # worker-stack headroom, so a large model that passes the VRAM check can't
        # provision a paid worker and then fail in prefetch_model when the
        # checkpoint overflows the 64 GB container default.
        disk_floor = int(est.params_b * 2) + 64
    else:
        size_label = "unknown size"
        # Unknown size keeps the platform default (the user can still raise it with
        # gpu.disk_gb).
        disk_floor = 0
    vram_floor = math.ceil(est.est_gb) if est.est_gb else 24

    return ModelInfo(
        id=model_id,
        display_name=model_id,
        params=size_label,
        algos=ALGORITHMS,
        min_vram_gb=vram_floor,
        min_disk_gb=disk_floor,
        recommended_gpu=target_gpu,
        thinking="unknown",
        notes="unlisted model accepted via the open-model policy (not curated/validated)",
    )
def validate_model_for_algorithm(model_id: str, algorithm: str) -> ModelInfo:
    """Return the catalog entry for ``model_id`` if it supports ``algorithm``.

    Raises:
        ValueError: unknown model, unknown algorithm, or a model whose ``algos`` do
            not include the required capability class.
    """
    info = get_model(model_id)
    algo = normalize_algorithm(algorithm)
    # Catalog entries advertise the capability classes "sft" and "grpo": grpo needs
    # the colocated rollout engine, everything else is trainer-only ("sft").
    required = "sft" if algo != "grpo" else "grpo"
    if required in info.algos:
        return info
    supported = ", ".join(info.algos)
    raise ValueError(f"{model_id} supports {supported}, not {algo}")
def public_model_rows() -> list[dict[str, Any]]:
    """Return the catalog as plain dicts, in ``list_models()`` order."""
    ordered = list_models()
    return [info.to_dict() for info in ordered]

flash/cli/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """CLI package."""

flash/cli/main/__init__.py ADDED Viewed

@@ -0,0 +1,220 @@
+"""CLI for the managed Flash service.
+Every run-lifecycle command is a thin HTTP call to the Flash control plane —
+users authenticate with their freesolo API key (`flash login` verifies it against
+the freesolo backend), never with provider credentials. Config parsing/validation
+and `--dry-run` stay fully local.
+"""
+from __future__ import annotations
+import argparse
+import sys
+from flash import __version__
+from flash._logging import configure_logging, get_logger
+# Command handlers + the patched client surface live in submodules; re-export them so
+# `flash.cli.main` stays the single public import surface (and so monkeypatching
+# `flash.cli.main.commands` reaches the bare globals the handlers read).
+from flash.cli.main.commands import (  # noqa: F401
+    _CLI_DONE_STATES,
+    _OK_STATES,
+    _STARTER_ENV_PY,
+    _USER_ERRORS,
+    _follow_run,
+    _poll_logs,
+    client_from_config,
+    cmd_attach,
+    cmd_cancel,
+    cmd_chat,
+    cmd_cost,
+    cmd_deploy,
+    cmd_deployments,
+    cmd_env_init,
+    cmd_env_list,
+    cmd_gpus,
+    cmd_lab_setup,
+    cmd_login,
+    cmd_logs,
+    cmd_models,
+    cmd_ps,
+    cmd_status,
+    cmd_train,
+    cmd_undeploy,
+    cmd_version,
+    cmd_whoami,
+    verify_freesolo_key,
+)
+from flash.cli.main.envpush import cmd_env_install, cmd_env_push
+logger = get_logger("flash.cli.main")
def _build_parser() -> argparse.ArgumentParser:
    """Build the ``flash`` argument parser with every subcommand bound to its handler."""
    parser = argparse.ArgumentParser(prog="flash", description="Managed LoRA post-training")
    parser.add_argument("-V", "--version", action="version", version=f"flash {__version__}")
    parser.add_argument(
        "--debug",
        action="store_true",
        help="show full tracebacks on error",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="increase log verbosity (-v for info, -vv for debug)",
    )
    sub = parser.add_subparsers(dest="cmd", required=True)

    # ---- identity / account ----
    version = sub.add_parser("version", help="print the Flash version")
    version.set_defaults(func=cmd_version)
    login = sub.add_parser("login", help="log in with your freesolo API key (verified by freesolo)")
    login.add_argument(
        "--api-key",
        help="your freesolo API key (default: FREESOLO_API_KEY); created in the dashboard",
    )
    login.add_argument(
        "--freesolo-url",
        dest="freesolo_url",
        help="freesolo backend base URL (default: FREESOLO_BASE_URL or https://api.freesolo.co)",
    )
    login.add_argument(
        "--api-url", help="flash control-plane URL for training calls (default: FLASH_API_URL)"
    )
    login.set_defaults(func=cmd_login)
    whoami = sub.add_parser("whoami", help="show the identity behind your stored key")
    whoami.set_defaults(func=cmd_whoami)

    # ---- local scaffolding / catalogs ----
    lab = sub.add_parser("lab", help="local authoring scaffolds")
    lab_sub = lab.add_subparsers(dest="lab_cmd", required=True)
    setup = lab_sub.add_parser("setup", help="scaffold environments/ + configs/ in the cwd")
    setup.set_defaults(func=cmd_lab_setup)
    models = sub.add_parser("models", help="list supported base models")
    models.set_defaults(func=cmd_models)
    gpus = sub.add_parser("gpus", help="list managed GPU classes with live $/hr")
    gpus.set_defaults(func=cmd_gpus)

    # ---- environments ----
    env = sub.add_parser("env", help="manage verifiers environments")
    env_sub = env.add_subparsers(dest="env_cmd", required=True)
    init = env_sub.add_parser("init", help="scaffold a new local verifiers environment")
    init.add_argument("name")
    init.set_defaults(func=cmd_env_init)
    env_list = env_sub.add_parser("list", help="list installed + local environments")
    env_list.set_defaults(func=cmd_env_list)
    env_install = env_sub.add_parser("install", help="install a published Prime Hub environment")
    env_install.add_argument("env_id", help='the env id to install (a Hub slug, "owner/name")')
    env_install.set_defaults(func=cmd_env_install)
    env_push = env_sub.add_parser(
        "push", help="publish a local verifiers env to the Prime Hub (private); prints its env id"
    )
    env_push.add_argument("path", nargs="?", default=".")
    env_push.set_defaults(func=cmd_env_push)

    # ---- training runs ----
    train = sub.add_parser("train", help="submit a managed training run from a TOML config")
    train.add_argument("config")
    train.add_argument(
        "--config",
        dest="extra_configs",
        action="append",
        default=[],
        help="additional TOML to deep-merge (config composition); repeatable",
    )
    train.add_argument(
        "--set",
        dest="overrides",
        action="append",
        default=[],
        metavar="key=value",
        help="override a config value; repeatable",
    )
    train.add_argument("--dry-run", action="store_true")
    train.add_argument(
        "--background",
        action="store_true",
        help="submit and return immediately instead of following logs",
    )
    train.set_defaults(func=cmd_train)
    status = sub.add_parser("status", help="show a run's full status JSON")
    status.add_argument("run_id")
    status.set_defaults(func=cmd_status)
    attach = sub.add_parser(
        "attach", help="follow a running job's logs to completion (resumable any time)"
    )
    attach.add_argument("run_id")
    attach.set_defaults(func=cmd_attach)
    ps = sub.add_parser("ps", help="list runs and their state/cost")
    ps.set_defaults(func=cmd_ps)
    cost = sub.add_parser("cost", help="show a run's accrued cost (USD)")
    cost.add_argument("run_id")
    cost.set_defaults(func=cmd_cost)
    cancel = sub.add_parser("cancel", help="cancel a run (best-effort)")
    cancel.add_argument("run_id")
    cancel.set_defaults(func=cmd_cancel)
    # A subparser registered without ``help=`` is left out of argparse's described
    # command list, so ``logs`` and ``deploy`` carry one like every other command.
    logs = sub.add_parser("logs", help="print a run's logs")
    logs.add_argument("run_id")
    logs.add_argument("-f", "--follow", action="store_true", help="stream new log lines")
    logs.set_defaults(func=cmd_logs)

    # ---- serving ----
    deploy = sub.add_parser("deploy", help="deploy a run's trained adapter to a serving endpoint")
    deploy.add_argument("run_id")
    deploy.add_argument(
        "--mode",
        choices=["dev", "always-on"],
        default="dev",
        help="dev: scale-to-zero, cold start after idle, $0 when unused (default). "
        "always-on: one warm worker 24/7, no cold starts, continuous billing.",
    )
    deploy.add_argument(
        "--idle-timeout",
        type=int,
        default=300,
        help="dev mode: seconds of inactivity before the worker scales to zero (default 300)",
    )
    deploy.add_argument("--dry-run", action="store_true")
    deploy.set_defaults(func=cmd_deploy)
    undeploy = sub.add_parser("undeploy", help="tear down a run's serving endpoint")
    undeploy.add_argument("run_id")
    undeploy.set_defaults(func=cmd_undeploy)
    deployments = sub.add_parser("deployments", help="list active serving deployments")
    deployments.set_defaults(func=cmd_deployments)
    chat = sub.add_parser("chat", help="chat with a deployed adapter")
    chat.add_argument("run_id")
    chat.add_argument("-m", "--message", required=True)
    chat.add_argument("--max-tokens", type=int, default=512)
    chat.add_argument("--temperature", type=float, default=0.0)
    chat.set_defaults(func=cmd_chat)
    # The control plane is operator-only and run as a separate one-off service via the
    # `flash-server` console script (flash.server.__main__:main), not a `flash` subcommand.
    return parser


def main(argv: list[str] | None = None) -> int:
    """Entry point of the ``flash`` CLI: parse ``argv`` and run the chosen subcommand.

    Args:
        argv: argument list without the program name; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        The selected handler's return value; ``1`` when the handler raises one of
        ``_USER_ERRORS`` (reported on stderr as ``error: ...``); ``130`` when
        interrupted with Ctrl-C.

    With ``--debug``, ``_USER_ERRORS`` exceptions are re-raised so the full traceback
    is shown instead of the one-line message.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)
    configure_logging(verbosity=getattr(args, "verbose", 0))
    debug = getattr(args, "debug", False)
    try:
        return args.func(args)
    except _USER_ERRORS as exc:
        if debug:
            raise
        print(f"error: {exc}", file=sys.stderr)
        return 1
    except KeyboardInterrupt:
        print("aborted", file=sys.stderr)
        return 130

flash/cli/main/__main__.py ADDED Viewed

@@ -0,0 +1,6 @@
+import sys
+from flash.cli.main import main
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit status.
    exit_status = main()
    sys.exit(exit_status)