PyPI - freesolo-flash-dev - Versions diffs - 0.2.25__tar.gz → 0.2.26__tar.gz - Mend

freesolo-flash-dev 0.2.25__tar.gz → 0.2.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (274) hide show

{freesolo_flash_dev-0.2.25 → freesolo_flash_dev-0.2.26}/.env.example RENAMED Viewed

@@ -5,6 +5,11 @@
 # GPU substrate. RunPod is the default; Vast is opt-in (only required when set).
 RUNPOD_API_KEY=
 VAST_API_KEY=
+# Use the per-arch baked worker images (cu128-<sm>) to skip the ~10-15 min cold-start JIT. Requires
+# the per-SM images published first (.github/workflows/bake-kernel-cache.yml), and a manual re-bake
+# after any worker-deps change. The control-plane Dockerfile sets this to 1 by default; uncomment to
+# enable for a bare `flash-server` deploy.
+# FLASH_WORKER_IMAGE_PER_SM=1
 # HuggingFace token with write access to each run's [train] hf_repo (code upload +
 # streamed checkpoints/adapters land in that per-run dataset repo). The artifact repo
 # is per-run (set in the run TOML's [train] hf_repo), not an operator-wide env var.

{freesolo_flash_dev-0.2.25 → freesolo_flash_dev-0.2.26}/.github/workflows/bake-kernel-cache.yml RENAMED Viewed

@@ -56,11 +56,24 @@ jobs:
               gpu_type_id: "NVIDIA H100 80GB HBM3",
               allowed_cuda: "",
             }
-          # Blackwell needs CUDA-13 hosts to JIT its PTX (matches min_cuda_for in the provider).
+          # Blackwell needs CUDA-13 hosts to JIT its PTX (matches min_cuda_for in the provider). Bake
+          # sm120 on the RTX Pro 6000 (Server Edition): it's the same sm120 (cache is sm-keyed, so it's
+          # valid for RTX 5090 too) but has far better secure-cloud on-demand capacity -- the RTX 5090
+          # pool repeatedly returned "machine does not have the resources" at create_pod.
           - {
               sm: sm120,
               arch: "12.0",
-              gpu_type_id: "NVIDIA GeForce RTX 5090",
+              gpu_type_id: "NVIDIA RTX PRO 6000 Blackwell Server Edition",
+              allowed_cuda: "13.0",
+            }
+          # Datacenter Blackwell (sm100, distinct from the sm120 RTX Pro 6000). B200 has 180 GB and
+          # good secure-cloud capacity. CUDA-13 host like the other Blackwell parts. NOT in the default
+          # `sms` list yet (B200 is unvalidated) -- bake it explicitly via `sms=sm100` until a smoke
+          # passes, then add it to the default.
+          - {
+              sm: sm100,
+              arch: "10.0",
+              gpu_type_id: "NVIDIA B200",
               allowed_cuda: "13.0",
             }
     steps:
@@ -80,6 +93,11 @@ jobs:
       - name: Install uv
         if: steps.gate.outputs.run == 'true'
         uses: astral-sh/setup-uv@v5
+        with:
+          # no cache: the post-run cache-prune calls `uv`, but the "Free disk space" step deletes
+          # /opt/hostedtoolcache (where setup-uv put uv), so the post step fails ("uv not found") and
+          # marks the whole job red even though the bake + push already succeeded.
+          enable-cache: false
       - name: Sync deps (flash + runpod + hf)
         if: steps.gate.outputs.run == 'true'

{freesolo_flash_dev-0.2.25 → freesolo_flash_dev-0.2.26}/.github/workflows/publish-dev.yml RENAMED Viewed

@@ -8,7 +8,8 @@ name: Publish flash dev-channel package
 # merging to `dev` cuts a release; ordinary dev pushes (version unchanged -> already published)
 # no-op. This mirrors freesolo-flash's publish.yml, but keyed on the dev version and on `dev`
 # instead of `main`, and a no-op is a clean success (not a failure) since most dev pushes don't
-# bump it. Manual runs via workflow_dispatch force a publish attempt of the current dev version.
+# bump it. Manual runs via workflow_dispatch re-run the same version check for the current dev
+# version (still a no-op when it's already on PyPI — not a forced re-publish).
 on:
   push:
     branches:
@@ -45,15 +46,28 @@ jobs:
         with:
           python-version: "3.11"
-      - name: Read dev-channel version
+      - name: Read dev-channel version (and enforce parity with the prod version)
         id: meta
         run: |
           python3 - <<'PY' >> "$GITHUB_OUTPUT"
+          import sys
           import tomllib
           with open("pyproject.toml", "rb") as f:
               data = tomllib.load(f)
-          print(f"version={data['tool']['flash-dev']['version']}")
+          dev_version = data["tool"]["flash-dev"]["version"]
+          prod_version = data["project"]["version"]
+          # The two channels MUST ship in lockstep (version-parity.yml enforces this on PRs). Re-check
+          # it here so the PUBLISH path can never ship a dev wheel out of sync with freesolo-flash even
+          # if the parity CI was bypassed or branch protection isn't strict — fail before build/publish.
+          if dev_version != prod_version:
+              print(
+                  f"::error::version mismatch: [tool.flash-dev].version={dev_version} != "
+                  f"[project].version={prod_version}; bump both in lockstep before publishing.",
+                  file=sys.stderr,
+              )
+              sys.exit(1)
+          print(f"version={dev_version}")
           PY
       - name: Decide whether to publish

freesolo_flash_dev-0.2.26/.github/workflows/version-parity.yml ADDED Viewed

@@ -0,0 +1,49 @@
+name: Version parity
+# Keep the two release channels pinned to the same version: the prod package `freesolo-flash`
+# (pyproject `[project].version`) and the dev-channel package `freesolo-flash-dev`
+# (`[tool.flash-dev].version`). They publish from `main` and `dev` respectively, so a divergence
+# would ship two channels claiming different versions. Bump both together.
+on:
+  push:
+    branches: [main, dev]
+  pull_request:
+    branches: [main, dev]
+permissions:
+  contents: read
+jobs:
+  versions-match:
+    name: dev and main at the same version
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      # tomllib is stdlib only since 3.11; the runner's default python3 may predate that.
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Compare channel versions
+        run: |
+          python3 - <<'PY'
+          import sys
+          import tomllib
+          with open("pyproject.toml", "rb") as f:
+              data = tomllib.load(f)
+          prod = data["project"]["version"]
+          dev = data["tool"]["flash-dev"]["version"]
+          if prod != dev:
+              print(
+                  "::error::Channel version mismatch: "
+                  f"[project].version={prod} (freesolo-flash) != "
+                  f"[tool.flash-dev].version={dev} (freesolo-flash-dev). "
+                  "Bump both to the same version."
+              )
+              sys.exit(1)
+          print(f"OK: freesolo-flash and freesolo-flash-dev are both at {prod}.")
+          PY

{freesolo_flash_dev-0.2.25 → freesolo_flash_dev-0.2.26}/.github/workflows/worker-image.yml RENAMED Viewed

@@ -83,3 +83,15 @@ jobs:
           build-args: |
             FLASH_ATTN_SPEC=https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.9.0/flash_attn-2.8.3%2Bcu128torch2.10-cp312-cp312-linux_x86_64.whl
             FLASH_ATTN_3_SPEC=${{ github.event.inputs.flash_attn_3_spec || 'https://github.com/windreamer/flash-attention3-wheels/releases/download/2026.03.19-850211f/flash_attn_3-3.0.0%2B20260318.cu128torch2100cxx11abitrue.8afc61-cp39-abi3-linux_x86_64.whl' }}
+      # The per-arch baked images (cu128-<sm>) are rebaked MANUALLY (bake-kernel-cache.yml). When this
+      # rebuilds :cu128, those tags go stale -> prompt a rebake. Only when the cu128 tag was (re)built.
+      - name: Remind to rebake per-SM kernel-cache images
+        if: ${{ success() && (github.event.inputs.tag || 'cu128') == 'cu128' }}
+        run: |
+          echo "::warning title=Rebake per-SM images::A new :cu128 base was published. If FLASH_WORKER_IMAGE_PER_SM=1 is enabled, run bake-kernel-cache.yml to refresh cu128-sm{80,86,89,90,120}, else workers run stale baked deps."
+          {
+            echo "### ⚠️ Per-SM kernel-cache images may now be stale"
+            echo ""
+            echo "A fresh \`:cu128\` base was just published. If \`FLASH_WORKER_IMAGE_PER_SM=1\` is enabled, re-run **bake-kernel-cache.yml** to refresh the per-arch \`cu128-sm*\` tags, or workers will run the previous baked deps."
+          } >> "$GITHUB_STEP_SUMMARY"

freesolo_flash_dev-0.2.26/Dockerfile ADDED Viewed

@@ -0,0 +1,42 @@
+# Flash control plane (operator-side).
+#
+#   docker build -t flash-control-plane .
+#   docker run -p 8080:8080 \
+#     -e RUNPOD_API_KEY=... -e HF_TOKEN=... \
+#     -v flash-state:/root/.flash flash-control-plane
+#
+# All persistent state (key DB, run records, results) lives under ~/.flash (fixed paths,
+# = /root/.flash for the default root user) — mount a volume there. Run exactly ONE
+# container instance per state volume (state is local files + SQLite; no horizontal scaling).
+FROM python:3.12-slim
+WORKDIR /app
+COPY . .
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends ca-certificates git curl \
+    && curl -1sLf 'https://artifacts-cli.infisical.com/setup.deb.sh' | bash \
+    && apt-get update && apt-get install -y --no-install-recommends infisical \
+    && rm -rf /var/lib/apt/lists/* \
+    && chmod +x /app/infisical-entrypoint.sh
+RUN pip install --no-cache-dir ".[server]"
+VOLUME /root/.flash
+EXPOSE 8080
+# Use the per-arch baked worker images (ghcr.io/.../flash-worker:cu128-<sm>) so cold workers skip the
+# ~10-15 min first-use JIT; the allocator maps each GPU class to its matching -smXX tag. All validated
+# SMs (sm80/86/89/90/120) are published. Rebakes are MANUAL -- after a Dockerfile.worker/deps change
+# rebuilds :cu128, re-run bake-kernel-cache.yml so the -smXX tags don't ship stale deps (the
+# worker-image build posts a reminder). Override at runtime with `-e FLASH_WORKER_IMAGE_PER_SM=0`.
+#
+# NOTE: this ENV is the default for BARE (non-Infisical) `flash-server` deploys. Under the Infisical
+# entrypoint below, `infisical run` overrides the container env, so for the Infisical-managed deploy
+# set FLASH_WORKER_IMAGE_PER_SM in the vault (path /flash) or add it to INFISICAL_KEEP -- otherwise
+# this default may not reach the server.
+ENV FLASH_WORKER_IMAGE_PER_SM=1
+# secret injection wrapper: no-op passthrough unless INFISICAL_CLIENT_ID is set, else
+# `infisical login` (universal-auth) then `infisical run --path /flash` before the server.
+ENTRYPOINT ["/app/infisical-entrypoint.sh"]
+CMD ["python", "-m", "flash.server", "--host", "0.0.0.0", "--port", "8080"]

{freesolo_flash_dev-0.2.25 → freesolo_flash_dev-0.2.26}/Dockerfile.worker RENAMED Viewed

@@ -73,9 +73,9 @@ RUN pip install --no-cache-dir \
 ARG FLASH_ATTN_SPEC=flash-attn
 # Source-build fallback only (ignored when FLASH_ATTN_SPEC is a wheel): bound the compile so it
 # doesn't OOM. TORCH_CUDA_ARCH_LIST restricts to the catalog's arches — Ampere (8.0 A100 / 8.6
-# 3090/A40), Ada (8.9 4090), Hopper (9.0 H100), Blackwell (12.0 RTX 5090; sm120); MAX_JOBS bounds
+# 3090/A40), Ada (8.9 4090), Hopper (9.0 H100), Blackwell datacenter (10.0 B200; sm100) + workstation (12.0 RTX 5090/Pro 6000; sm120); MAX_JOBS bounds
 # peak compile memory. (A from-source build still needs a big-RAM host; the wheel avoids all of it.)
-RUN TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0 12.0" MAX_JOBS=4 \
+RUN TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0 10.0 12.0" MAX_JOBS=4 \
     pip install --no-cache-dir "${FLASH_ATTN_SPEC}" --no-build-isolation \
     && echo "flash-attn: installed (${FLASH_ATTN_SPEC})" || echo "flash-attn: build failed, SDPA fallback"
@@ -119,7 +119,7 @@ RUN if [ -n "${FLASH_ATTN_3_SPEC}" ]; then \
 # import fine but raise "no kernel image is available for execution on the device" at the first conv
 # forward on sm120 — GPU-verified. engine.worker.packing.gdn_packing_available runs a conv smoke too,
 # so even a wrong-arch build can't crash a run (it just keeps GDN packing off).
-RUN TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0 12.0" CAUSAL_CONV1D_FORCE_BUILD=TRUE MAX_JOBS=4 \
+RUN TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0 10.0 12.0" CAUSAL_CONV1D_FORCE_BUILD=TRUE MAX_JOBS=4 \
     pip install --no-cache-dir "causal-conv1d==1.6.2.post1" --no-build-isolation \
     && python -c "import causal_conv1d" \
     && echo "causal_conv1d: installed (GDN packing enabled)" \

{freesolo_flash_dev-0.2.25 → freesolo_flash_dev-0.2.26}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: freesolo-flash-dev
-Version: 0.2.25
+Version: 0.2.26
 Summary: Flash — managed LoRA post-training (SFT/GRPO) for Freesolo environments, driven by the `flash` CLI
 Project-URL: Homepage, https://github.com/freesolo-co/flash
 Project-URL: Repository, https://github.com/freesolo-co/flash
@@ -85,7 +85,6 @@ The allocator picks the cheapest validated RunPod GPU class that fits the run.
   and ready-to-run configs to start from
 - `flash/serve/`, `flash/server/` — adapter serving and the FastAPI control
   plane (run operator-side via the separate `flash-server` command)
-- `flash/mcp/` — stdio MCP bridge for coding agents
 - `Dockerfile` — the control-plane image (used by the repo docker-compose)
 - `tests/` — pytest suite (CPU-only; offline-by-default, no GPU/network)
@@ -117,11 +116,15 @@ Two channels are published to PyPI from the *same source*, distinguished by one
 | prod | `freesolo-flash` | `flash` | `flash.freesolo.co` | push to `main` that bumps `[project].version` (`.github/workflows/publish.yml`) |
 | dev | `freesolo-flash-dev` | `flash-dev` | `flash-dev.freesolo.co` | push to `dev` whose `[tool.flash-dev].version` isn't on PyPI yet (`.github/workflows/publish-dev.yml`) |
-The two install side by side (distinct package + CLI names). The dev build is produced by
+Each environment holds exactly **one** channel: both packages ship the same import package
+(`flash/`) with one baked `CHANNEL` line, so installing both into the same environment makes the
+later install win for *both* CLIs. For side-by-side prod and staging, install each channel in its
+own virtualenv (or via `pipx`, which isolates per tool). The dev build is produced by
 `scripts/build_dev_dist.py`, which renames the package/CLI and flips `CHANNEL` to `dev` before
-`uv build`. To cut a dev release, bump `[tool.flash-dev].version` and merge to `dev`. Either CLI
-still honours an explicit `FLASH_API_URL` / `flash login --api-url`; the channel only sets the
-default.
+`uv build`. Both channels ship at the **same version**: `[project].version` and
+`[tool.flash-dev].version` must match (CI enforces this via `.github/workflows/version-parity.yml`),
+so cutting a release means bumping both together. Either CLI still honours an explicit
+`FLASH_API_URL` / the `login --api-url` flag; the channel only sets the default.
 ## Serving From an API

{freesolo_flash_dev-0.2.25 → freesolo_flash_dev-0.2.26}/README.md RENAMED Viewed

@@ -36,7 +36,6 @@ The allocator picks the cheapest validated RunPod GPU class that fits the run.
   and ready-to-run configs to start from
 - `flash/serve/`, `flash/server/` — adapter serving and the FastAPI control
   plane (run operator-side via the separate `flash-server` command)
-- `flash/mcp/` — stdio MCP bridge for coding agents
 - `Dockerfile` — the control-plane image (used by the repo docker-compose)
 - `tests/` — pytest suite (CPU-only; offline-by-default, no GPU/network)
@@ -68,11 +67,15 @@ Two channels are published to PyPI from the *same source*, distinguished by one
 | prod | `freesolo-flash` | `flash` | `flash.freesolo.co` | push to `main` that bumps `[project].version` (`.github/workflows/publish.yml`) |
 | dev | `freesolo-flash-dev` | `flash-dev` | `flash-dev.freesolo.co` | push to `dev` whose `[tool.flash-dev].version` isn't on PyPI yet (`.github/workflows/publish-dev.yml`) |
-The two install side by side (distinct package + CLI names). The dev build is produced by
+Each environment holds exactly **one** channel: both packages ship the same import package
+(`flash/`) with one baked `CHANNEL` line, so installing both into the same environment makes the
+later install win for *both* CLIs. For side-by-side prod and staging, install each channel in its
+own virtualenv (or via `pipx`, which isolates per tool). The dev build is produced by
 `scripts/build_dev_dist.py`, which renames the package/CLI and flips `CHANNEL` to `dev` before
-`uv build`. To cut a dev release, bump `[tool.flash-dev].version` and merge to `dev`. Either CLI
-still honours an explicit `FLASH_API_URL` / `flash login --api-url`; the channel only sets the
-default.
+`uv build`. Both channels ship at the **same version**: `[project].version` and
+`[tool.flash-dev].version` must match (CI enforces this via `.github/workflows/version-parity.yml`),
+so cutting a release means bumping both together. Either CLI still honours an explicit
+`FLASH_API_URL` / the `login --api-url` flag; the channel only sets the default.
 ## Serving From an API

{freesolo_flash_dev-0.2.25 → freesolo_flash_dev-0.2.26}/docker/bake_kernel_cache.py RENAMED Viewed

@@ -69,7 +69,10 @@ def main() -> int:
     ap.add_argument("--gpu-type-id", required=True, help="RunPod gpuTypeId, e.g. 'NVIDIA H100 80GB HBM3'")
     ap.add_argument("--image", default="ghcr.io/freesolo-co/flash-worker:cu128")
     ap.add_argument("--out", default="build/kernel_cache")
-    ap.add_argument("--container-disk-gb", type=int, default=80)
+    # the warm pod only pulls the ~20GB image + writes the cache (no model download), so keep this
+    # modest -- an over-large ask shrinks the eligible host pool and trips "machine does not have the
+    # resources" on scarce classes (e.g. Blackwell sm120 on secure cloud).
+    ap.add_argument("--container-disk-gb", type=int, default=60)
     ap.add_argument("--deadline-min", type=int, default=45)
     ap.add_argument("--run-id", default="", help="unique suffix for the temp repo (default: time+uuid)")
     ap.add_argument(
@@ -221,7 +224,18 @@ def _verify(out: str, sm: str) -> int:
     blob = os.path.join(out, "mega_cache.bin")
     meta = os.path.join(out, "mega_cache.json")
     if not os.path.isfile(blob):
-        log(f"FAIL: no mega_cache.bin in {out}")
+        log(f"FAIL: no mega_cache.bin in {out}; what the warmup actually produced:")
+        for root, _, files in os.walk(out):
+            for f in sorted(files):
+                p = os.path.join(root, f)
+                log(f"   present: {os.path.relpath(p, out)} ({os.path.getsize(p)} b)")
+        wl = os.path.join(out, "warmup.log")
+        if os.path.isfile(wl):
+            log("   --- warmup.log tail ---")
+            with open(wl, errors="replace") as wlf:
+                tail = wlf.read().splitlines()[-40:]
+            for line in tail:
+                log(f"   | {line}")
         return 1
     try:
         with open(meta) as f:

{freesolo_flash_dev-0.2.25 → freesolo_flash_dev-0.2.26}/docker/bake_pod_entry.py RENAMED Viewed

@@ -49,7 +49,11 @@ def main() -> int:
     if arch:
         cmd += ["--arch", arch]
     print(f"[bake] running: {' '.join(cmd)}", flush=True)
-    rc = subprocess.call(cmd, env=env)
+    # capture the warmup output into /out so it ships back with the cache -- lets the CI helper show
+    # WHICH warm steps compiled and what save_cache_artifacts returned when no mega_cache.bin lands.
+    os.makedirs("/out", exist_ok=True)
+    with open("/out/warmup.log", "wb") as lf:
+        rc = subprocess.call(cmd, env=env, stdout=lf, stderr=subprocess.STDOUT)
     print(f"[bake] kernel_warmup rc={rc}", flush=True)
     # ship the whole cache tree back (mega blob + metadata + raw triton/inductor dirs).

{freesolo_flash_dev-0.2.25 → freesolo_flash_dev-0.2.26}/docs/cli-style/README.md RENAMED Viewed

@@ -16,8 +16,8 @@ A standardized, production-grade output theme for every `flash` command.
   disable the themed layout with `FLASH_STYLE=0`. `NO_COLOR` keeps the layout but drops ANSI color.
 - **No new dependencies:** pure standard library, like the rest of the client CLI.
-The rendering lives in `flash/cli/main/render.py`; the command wiring is in
-`flash/cli/main/commands.py` and `envpush.py`.
+The rendering lives in `flash/cli/render.py`; the command wiring is in
+`flash/cli/commands.py` and `envpush.py`.
 ## Preview

{freesolo_flash_dev-0.2.25 → freesolo_flash_dev-0.2.26}/docs/cli-style/generate.py RENAMED Viewed

@@ -22,8 +22,8 @@ import tempfile
 from contextlib import redirect_stderr, redirect_stdout
 from pathlib import Path
-from flash.cli import main as cli
-from flash.cli.main import render
+import flash.cli as cli
+from flash.cli import render
 class _Utf8(io.StringIO):
@@ -185,7 +185,7 @@ def _capture_argv(argv, *, styled, theme="dark", cwd=None, with_stderr=False) ->
     ``with_stderr`` is set, the stderr note (e.g. `flash train`'s hand-off line) is shown first,
     as it appears in a real terminal before the streamed logs."""
     _set_style(styled, theme)
-    commands = sys.modules["flash.cli.main.commands"]
+    commands = sys.modules["flash.cli.commands"]
     saved = commands.client_from_config
     commands.client_from_config = lambda *a, **k: FAKE
     out, err = _Utf8(), _Utf8()
@@ -476,7 +476,7 @@ def main():
     out_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(__file__).parent / "index.html"
     # Deterministic preview: pin the dry-run id (cmd_train calls new_run_id() every time) so the
     # committed gallery doesn't churn on every regeneration.
-    sys.modules["flash.cli.main.commands"].new_run_id = lambda: "flash-1718900000-d0cf00ed"
+    sys.modules["flash.cli.commands"].new_run_id = lambda: "flash-1718900000-d0cf00ed"
     with tempfile.TemporaryDirectory() as td:
         # Point the installed-env registry at an empty temp manifest so `flash env list` never
         # leaks a developer's real installed env slugs (~/.flash/envs.json) into the preview.

{freesolo_flash_dev-0.2.25 → freesolo_flash_dev-0.2.26}/flash/__init__.py RENAMED Viewed

@@ -1,7 +1,7 @@
 """Flash — managed LoRA post-training: log in with your freesolo key, train.
 A focused developer experience (TOML run specs, pluggable environments,
-CLI/API/MCP entry points, adapter deployment). Users authenticate with their
+CLI/API entry points, adapter deployment). Users authenticate with their
 freesolo API key (`flash login`); the control plane runs each job on a managed
 RunPod GPU behind the scenes.
 """

{freesolo_flash_dev-0.2.25 → freesolo_flash_dev-0.2.26}/flash/_channel.py RENAMED Viewed

@@ -5,8 +5,13 @@ below): it installs a ``flash`` CLI that talks to the production control plane.
 package ``freesolo-flash-dev`` is built from this *same source* with only this one line rewritten
 to ``CHANNEL = "dev"`` (see ``scripts/build_dev_dist.py``); everything that differs between the two
 channels — the CLI name, the PyPI distribution name, the default control-plane URL — derives from
-it below, so there is exactly one thing to flip. An explicit ``FLASH_API_URL`` /
-``flash login --api-url`` always wins; the channel only picks the *default* plane.
+it below, so there is exactly one thing to flip. An explicit ``FLASH_API_URL`` / the
+``login --api-url`` flag always wins; the channel only picks the *default* plane.
+Both channels ship the SAME import package (``flash/``) with this one baked line, so a single
+environment holds exactly ONE channel — installing ``freesolo-flash`` and ``freesolo-flash-dev``
+into the same environment makes the later install win for both CLIs. For side-by-side prod and
+staging, install each channel in its own virtualenv (or via ``pipx``, which isolates per tool).
 """
 from __future__ import annotations

{freesolo_flash_dev-0.2.25 → freesolo_flash_dev-0.2.26}/flash/catalog.py RENAMED Viewed

@@ -46,6 +46,13 @@ class ModelInfo:
     # tier needs a bigger card than SFT (the colocate 2nd weight copy + KV pool). Consumed by
     # engine.vram.model_required_vram_gb.
     grpo_min_vram_gb: int = 0
+    # SFT hard VRAM floor (GB). 0 => SFT sizes purely from the param-based estimate and is free to
+    # down-route to a smaller validated card (the default — e.g. a 4B SFT estimates ~17 GB and rents
+    # a 48 GB card, NOT its ``min_vram_gb`` reference). Set it ONLY when a curated model must not be
+    # placed on the cheapest card the estimate would otherwise allow — e.g. a very large checkpoint
+    # whose ~param-est margin over the frozen-weights floor is too thin on the next card down.
+    # Consumed by engine.vram.model_required_vram_gb (the SFT analog of ``grpo_min_vram_gb``).
+    sft_min_vram_gb: int = 0
     notes: str = ""
     # Worker container disk this model needs (GB). 0 = the platform default (64 GB)
     # suffices. The runner raises gpu.disk_gb to at least this, so big-checkpoint
@@ -64,8 +71,14 @@ class ModelInfo:
     # completion cap. Curated per model below; defaults to the open-model fallback.
     vocab_size: int = _DEFAULT_VOCAB_SIZE
     # Total parameters in billions — the numeric model size the cost estimator reads directly
-    # (no parsing of the ``params`` display string). Curated per catalog model below.
+    # (no parsing of the ``params`` display string). Drives the memory/size terms (VRAM, disk,
+    # download), which always size the FULL checkpoint. Curated per catalog model below.
     params_b: float = 0.0
+    # Parameters ACTIVE per token in billions — only meaningful for an MoE, where a token routes
+    # through a small subset of experts. The cost estimator's per-token FLOPs/step-time term reads
+    # this (a token exercises only the active params), while VRAM/disk/download keep using the total
+    # ``params_b``. 0.0 (the dense default) means "same as params_b" — every token hits every param.
+    active_params_b: float = 0.0
     def to_dict(self) -> dict[str, Any]:
         return asdict(self)
@@ -89,7 +102,7 @@ MODELS: dict[str, ModelInfo] = {
         thinking="hybrid",
         notes="On-device class SLM (131k ctx); standard Llama architecture.",
     ),
-    # ---- Qwen3.5 dense family: validated on the modern worker stack ----
+    # Qwen3.5 dense family: validated on the modern worker stack
     # (trl 1.x / vllm 0.19 / transformers 5.x). Trained + served TEXT-ONLY: the
     # checkpoints are natively multimodal, so LoRA excludes the vision tower and vLLM
     # loads language_model_only (see flash.engine.worker). Each entry passed a real
@@ -153,6 +166,64 @@ MODELS: dict[str, ModelInfo] = {
         "(two bf16 copies + KV + the 248k-vocab fp32 logits) needs an 80 GB-class card "
         "(grpo_min_vram_gb floor).",
     ),
+    # ---- Qwen3.6 MoE: the big-checkpoint tier (H200 for SFT, B200 for GRPO) ----
+    # 35B-A3B is a Mixture-of-Experts checkpoint: ~3B parameters are ACTIVE per token, but all 35B
+    # are materialized on the GPU, so the MEMORY/disk/download terms size the FULL 35B (~70 GB bf16)
+    # while the COMPUTE terms (activations, KV pool, rank-linear LoRA) size the ~3B active backbone
+    # (engine.vram is MoE-aware via active_params_b). bf16 LoRA, NOT QLoRA — same reason as the 9B.
+    # Because the resident weights dominate and the active compute is tiny, the GPU tier is set by
+    # how many weight copies each algorithm holds, NOT by context length:
+    #   * SFT — ONE ~70 GB copy + small active-compute (~82 GB peak, ~flat in context) -> fits the
+    #     141 GB H200 with wide margin (context ~unbounded by VRAM). Live-validated on a B200; the
+    #     H200 down-tier is the MoE-aware win (cheaper, plentiful stock).
+    #   * GRPO — colocates the vLLM rollout, so TWO ~70 GB copies (trainer + engine) are resident at
+    #     the rollout peak (~167 GB) -> needs the 180 GB B200; the H200 can't hold both. The MoE
+    #     rollout weight-sync needed a fused-expert name fix (engine.worker.lora._remap_vl_sync_weights
+    #     passes the multimodal ``model.language_model.*`` names through to vLLM's own mapper). Both
+    #     single- and multi-turn GRPO live-validated on a B200.
+    "Qwen/Qwen3.6-35B-A3B": ModelInfo(
+        id="Qwen/Qwen3.6-35B-A3B",
+        display_name="Qwen3.6 35B-A3B (MoE)",
+        params="35B total / ~3B active (MoE)",
+        # TOTAL parameters (billions) the SFT VRAM equation + cost projection read. For an MoE
+        # checkpoint the size term is the TOTAL count, not the ~3B active: download/VRAM/disk size the
+        # FULL checkpoint that lands on the GPU (all experts are materialized). 35.0 is the CALIBRATED
+        # total: the live-validated single-B200 SFT fit depends on it — the honest-peak equation lands
+        # at the 180 GB B200's usable budget, and the marketing "~35.95B" figure tips it over (186 GB,
+        # see test_sft_equation_covers_honest_peak_across_seq_boundary). Keep 35.0.
+        params_b=35.0,
+        # ~3B ACTIVE per token (the "A3B" in the name): a token routes through a small subset of
+        # experts, so cost/step-time FLOPs scale with ~3B, not the 35B total. Without this the
+        # estimator would price SFT as if every token exercised all 35B params — ~10x too slow/costly.
+        active_params_b=3.0,
+        vocab_size=248_320,
+        algos=("sft", "grpo"),
+        min_vram_gb=141,
+        # Hard SFT floor: with MoE-aware sizing the SFT estimate is ~82 GB (the 70 GB resident weights
+        # dominate; the active-3B activations/KV are tiny), which would otherwise down-route to the
+        # 96 GB RTX Pro 6000 (consumer Blackwell, thin margin over the 70 GB base) or the 80 GB H100
+        # (too tight). Floor to 100 GB so SFT lands on the 141 GB H200 — a datacenter card with wide
+        # margin, ~$1.50/hr cheaper than the B200 and not needed here.
+        sft_min_vram_gb=100,
+        # GRPO floor = the 180 GB B200 (colocated GRPO holds two ~70 GB weight copies + a KV pool; the
+        # 141 GB H200 can't hold the trainer + vLLM rollout). The base ~167 GB two-copy estimate already
+        # routes GRPO to the B200, but setting the floor ALSO ENGAGES the long-context escalation —
+        # model_required_vram_gb only adds grpo_seq_escalation_gb when a grpo floor is set. The
+        # escalation keys on the ~3B ACTIVE params, so default/moderate GRPO still fits the B200 but a
+        # long (>~16k-token, e.g. 32k) rollout is sized PAST 180 GB and rejected at parse time, instead
+        # of booting a B200 and OOMing in vLLM's KV allocation.
+        grpo_min_vram_gb=180,
+        quant="bf16",
+        recommended_gpu="H200",
+        thinking="hybrid",
+        # ~70 GB bf16 checkpoint. Peak disk = HF download (~70 GB) + Xet temp (~70 GB) + per-step
+        # deployable-checkpoint saves; floor to 200 GB so the rent doesn't hit "No space left on
+        # device" (the runner raises gpu.disk_gb to this out of the box).
+        min_disk_gb=200,
+        notes="MoE (35B total / ~3B active), bf16 LoRA. SFT runs on the 141 GB H200 (the ~70 GB "
+        "weights dominate; active-3B compute keeps activations/KV tiny, so context is ~unbounded by "
+        "VRAM); colocated GRPO needs the 180 GB B200 (trainer + vLLM rollout = two 70 GB copies).",
+    ),
 }

{freesolo_flash_dev-0.2.25/flash/cli/main → freesolo_flash_dev-0.2.26/flash/cli}/__init__.py RENAMED Viewed

@@ -17,9 +17,9 @@ from flash._logging import configure_logging, get_logger
 from flash._update_check import emit_update_notice, maybe_start_update_check
 # Command handlers + the patched client surface live in submodules; re-export them so
-# `flash.cli.main` stays the single public import surface (and so monkeypatching
-# `flash.cli.main.commands` reaches the bare globals the handlers read).
-from flash.cli.main.commands import (  # noqa: F401
+# `flash.cli` stays the single public import surface (and so monkeypatching
+# `flash.cli.commands` reaches the bare globals the handlers read).
+from flash.cli.commands import (  # noqa: F401
     _CLI_DONE_STATES,
     _OK_STATES,
     _STARTER_ENV_PY,
@@ -34,6 +34,7 @@ from flash.cli.main.commands import (  # noqa: F401
     cmd_deployments,
     cmd_env_list,
     cmd_env_setup,
+    cmd_export,
     cmd_gpus,
     cmd_login,
     cmd_models,
@@ -45,9 +46,9 @@ from flash.cli.main.commands import (  # noqa: F401
     cmd_whoami,
     verify_freesolo_key,
 )
-from flash.cli.main.envpush import cmd_env_install, cmd_env_push
+from flash.cli.envpush import cmd_env_install, cmd_env_push
-logger = get_logger("flash.cli.main")
+logger = get_logger("flash.cli")
 def main(argv: list[str] | None = None) -> int:
@@ -194,6 +195,39 @@ def main(argv: list[str] | None = None) -> int:
     undeploy.add_argument("run_id")
     undeploy.set_defaults(func=cmd_undeploy)
+    export = sub.add_parser(
+        "export", help="export a trained adapter to your own HuggingFace repo"
+    )
+    export.add_argument(
+        "--adapter-id",
+        dest="adapter_id",
+        required=True,
+        help="the Freesolo adapter id (the run id) to export",
+    )
+    export.add_argument(
+        "--repository",
+        required=True,
+        help="destination HuggingFace repo 'owner/name' (created if it doesn't exist)",
+    )
+    export.add_argument(
+        "--api-key",
+        help="HuggingFace token with write access to --repository "
+        "(default: HF_TOKEN from your shell or a local .env / .env.local)",
+    )
+    export.add_argument(
+        "--step",
+        type=int,
+        default=None,
+        help="export a specific intermediate checkpoint (see `flash checkpoints <adapter-id>`) "
+        "instead of the run's final adapter; works even for a run cancelled mid-RL",
+    )
+    export.add_argument(
+        "--public",
+        action="store_true",
+        help="create the destination repo as public (default: private)",
+    )
+    export.set_defaults(func=cmd_export)
     deployments = sub.add_parser("deployments", help="list active serving deployments")
     deployments.set_defaults(func=cmd_deployments)

{freesolo_flash_dev-0.2.25/flash/cli/main → freesolo_flash_dev-0.2.26/flash/cli}/__main__.py RENAMED Viewed

@@ -1,6 +1,6 @@
 import sys
-from flash.cli.main import main
+from flash.cli import main
 if __name__ == "__main__":
     sys.exit(main())

freesolo-flash-dev 0.2.25__tar.gz → 0.2.26__tar.gz

freesolo-flash-dev 0.2.25__tar.gz → 0.2.26__tar.gz