freesolo-flash-dev 0.2.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flash/__init__.py +29 -0
- flash/_channel.py +23 -0
- flash/_fileio.py +35 -0
- flash/_logging.py +49 -0
- flash/_update_check.py +266 -0
- flash/catalog.py +253 -0
- flash/cli/__init__.py +1 -0
- flash/cli/main/__init__.py +227 -0
- flash/cli/main/__main__.py +6 -0
- flash/cli/main/commands.py +636 -0
- flash/cli/main/envpush.py +317 -0
- flash/cli/main/render.py +599 -0
- flash/cli/main/training_doc.py +455 -0
- flash/client/__init__.py +14 -0
- flash/client/config.py +70 -0
- flash/client/http.py +372 -0
- flash/client/runtime_secrets.py +69 -0
- flash/client/specs.py +20 -0
- flash/cost/__init__.py +16 -0
- flash/cost/analytical.py +175 -0
- flash/cost/facts.py +114 -0
- flash/cost/spec.py +113 -0
- flash/cost/types.py +158 -0
- flash/engine/__init__.py +6 -0
- flash/engine/accounting.py +36 -0
- flash/engine/chalk_kernels.py +116 -0
- flash/engine/multiturn_rollout.py +780 -0
- flash/engine/recipe.py +86 -0
- flash/engine/vram.py +603 -0
- flash/engine/worker/__init__.py +2916 -0
- flash/engine/worker/__main__.py +4 -0
- flash/engine/worker/kernel_warmup.py +400 -0
- flash/engine/worker/lora.py +796 -0
- flash/engine/worker/packing.py +366 -0
- flash/engine/worker/perf.py +1048 -0
- flash/envs/__init__.py +10 -0
- flash/envs/adapter/__init__.py +883 -0
- flash/envs/adapter/rubric.py +222 -0
- flash/envs/base.py +52 -0
- flash/envs/registry.py +62 -0
- flash/mcp/__init__.py +1 -0
- flash/mcp/server.py +85 -0
- flash/providers/__init__.py +59 -0
- flash/providers/_auth.py +24 -0
- flash/providers/_http.py +230 -0
- flash/providers/_instance.py +416 -0
- flash/providers/_instance_bootstrap.py +517 -0
- flash/providers/_poll.py +311 -0
- flash/providers/allocator.py +193 -0
- flash/providers/base.py +431 -0
- flash/providers/hyperstack/__init__.py +127 -0
- flash/providers/hyperstack/api.py +522 -0
- flash/providers/hyperstack/auth.py +17 -0
- flash/providers/hyperstack/gpus.py +29 -0
- flash/providers/hyperstack/jobs/__init__.py +632 -0
- flash/providers/hyperstack/jobs/builders.py +122 -0
- flash/providers/hyperstack/preflight.py +23 -0
- flash/providers/hyperstack/pricing.py +26 -0
- flash/providers/hyperstack/train.py +25 -0
- flash/providers/lambdalabs/__init__.py +139 -0
- flash/providers/lambdalabs/api.py +261 -0
- flash/providers/lambdalabs/auth.py +18 -0
- flash/providers/lambdalabs/gpus.py +29 -0
- flash/providers/lambdalabs/jobs/__init__.py +724 -0
- flash/providers/lambdalabs/jobs/builders.py +118 -0
- flash/providers/lambdalabs/preflight.py +27 -0
- flash/providers/lambdalabs/pricing.py +51 -0
- flash/providers/lambdalabs/train.py +27 -0
- flash/providers/preflight.py +55 -0
- flash/providers/realized.py +80 -0
- flash/providers/runpod/__init__.py +130 -0
- flash/providers/runpod/api.py +186 -0
- flash/providers/runpod/auth.py +37 -0
- flash/providers/runpod/cost.py +57 -0
- flash/providers/runpod/gpus.py +46 -0
- flash/providers/runpod/jobs.py +956 -0
- flash/providers/runpod/keys.py +139 -0
- flash/providers/runpod/preflight.py +30 -0
- flash/providers/runpod/preload.py +915 -0
- flash/providers/runpod/pricing.py +18 -0
- flash/providers/runpod/slots.py +79 -0
- flash/providers/runpod/train/__init__.py +150 -0
- flash/providers/runpod/train/deps.py +395 -0
- flash/providers/runpod/train/endpoints.py +820 -0
- flash/py.typed +0 -0
- flash/runner/__init__.py +686 -0
- flash/runner/checkpoints.py +82 -0
- flash/runner/deploy.py +422 -0
- flash/runner/lifecycle.py +672 -0
- flash/schema/__init__.py +375 -0
- flash/schema/fields.py +331 -0
- flash/serve/__init__.py +1 -0
- flash/serve/deploy.py +326 -0
- flash/serve/pricing.py +60 -0
- flash/server/__init__.py +1 -0
- flash/server/__main__.py +20 -0
- flash/server/app.py +961 -0
- flash/server/auth.py +263 -0
- flash/server/billing.py +124 -0
- flash/server/checkpoints.py +110 -0
- flash/server/db.py +160 -0
- flash/server/environment_registry.py +102 -0
- flash/server/envs.py +360 -0
- flash/server/reconcile.py +163 -0
- flash/server/run_registry.py +150 -0
- flash/spec.py +333 -0
- freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
- freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
- freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
- freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
- freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,522 @@
|
|
|
1
|
+
"""Thin Hyperstack (NexGen Cloud) REST client (no SDK state): flavors/capacity + VM lifecycle.
|
|
2
|
+
|
|
3
|
+
Mirrors ``providers/lambdalabs/api.py``: stdlib urllib via the shared ``RestClient``, hardened
|
|
4
|
+
retries, nothing persisted locally. Hyperstack specifics:
|
|
5
|
+
|
|
6
|
+
* **Auth header.** Hyperstack presents the key as a bare ``api_key: <key>`` header (NOT
|
|
7
|
+
``Authorization: Bearer``); the ``RestClient`` is configured with ``auth_header_name="api_key"``.
|
|
8
|
+
* **Capacity = ``stock_available``.** ``/core/flavors`` carries per-flavor ``stock_available`` per
|
|
9
|
+
region (the Hyperstack analog of Lambda's ``regions_with_capacity_available``); a flavor with no
|
|
10
|
+
stock in any region can't launch.
|
|
11
|
+
* **Launch needs an environment + a keypair.** Every region has a ``default-<region>`` environment;
|
|
12
|
+
a launch requires exactly one keypair name (env-scoped). The box is bootstrapped via cloud-init
|
|
13
|
+
``user_data`` and we never SSH, so the key is a formality — ``resolve_key_name`` reuses an
|
|
14
|
+
existing key or imports a throwaway one (private half discarded; no inbound-SSH rule is opened).
|
|
15
|
+
* **Non-idempotent launch.** ``POST /core/virtual-machines`` provisions a NEW (billed) VM every
|
|
16
|
+
time it succeeds, so it is NEVER retried.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import os
|
|
22
|
+
import re
|
|
23
|
+
import subprocess
|
|
24
|
+
import tempfile
|
|
25
|
+
import time
|
|
26
|
+
from typing import Any
|
|
27
|
+
|
|
28
|
+
from flash._logging import get_logger
|
|
29
|
+
from flash.providers._http import RestClient, is_conflict, is_not_found
|
|
30
|
+
|
|
31
|
+
logger = get_logger(__name__)
|
|
32
|
+
|
|
33
|
+
# Root URL of the Hyperstack (NexGen Cloud) Infrahub REST API, version 1.
HYPERSTACK_BASE = "https://infrahub-api.nexgencloud.com/v1"

# User-Agent value identifying Flash to the Hyperstack API.
_USER_AGENT = "flash-hyperstack/1.0 (+https://freesolo.co)"

# Name of the managed (env-scoped) keypair. Operators who prefer an existing key can pin it through
# the HYPERSTACK_KEYPAIR_NAME environment variable.
_MANAGED_KEYPAIR = "flash-managed"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class HyperstackApiError(RuntimeError):
    """Error type for failed Hyperstack API calls and unusable Hyperstack responses."""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# The one shared REST client for this module. Hyperstack expects the raw key in an ``api_key``
# header, hence the custom header name and the bare ``{key}`` value format.
_CLIENT = RestClient(
    base_url=HYPERSTACK_BASE,
    env_var="HYPERSTACK_API_KEY",
    missing_key_message="HYPERSTACK_API_KEY not configured on the control-plane host",
    auth_header_name="api_key",
    auth_value_format="{key}",
    extra_headers={"User-Agent": _USER_AGENT},
    error_cls=HyperstackApiError,
)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def request_with_retries(
    path: str, method: str = "GET", body: dict | None = None, retries: int = 4, base_delay: float = 2.0
) -> Any:
    """Issue one Hyperstack API request through the module's shared ``_CLIENT``.

    All arguments are handed unchanged to ``_CLIENT.request_with_retries`` and its result is
    returned as-is.
    """
    options = {"method": method, "body": body, "retries": retries, "base_delay": base_delay}
    return _CLIENT.request_with_retries(path, **options)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# ---------------------------------------------------------------------------
|
|
63
|
+
# Flavors + capacity (cached: pricing, the allocator, and the launcher all read this)
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
_FLAVORS_TTL_S = 45.0
|
|
66
|
+
_flavors_cache: dict[str, Any] = {"ts": 0.0, "by_region": None}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# Regions excluded from allocation + launch. CANADA-1's on-demand stock is a known-broken-driver
|
|
70
|
+
# fleet: instances reach ACTIVE then die without a DONE sentinel (NVML init failure / cuda
|
|
71
|
+
# unavailable), so launching there burns GPU budget on guaranteed-failed retries. Skip it by default
|
|
72
|
+
# rather than waterfall through it. Override with HYPERSTACK_BLOCKED_REGIONS (comma-separated): unset
|
|
73
|
+
# -> the default below; set (even to "") -> exactly that list, so operators can re-enable CANADA-1
|
|
74
|
+
# (HYPERSTACK_BLOCKED_REGIONS="") or block others once capacity/driver health changes.
|
|
75
|
+
_DEFAULT_BLOCKED_REGIONS = frozenset({"CANADA-1"})
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _blocked_regions() -> set[str]:
|
|
79
|
+
env = os.environ.get("HYPERSTACK_BLOCKED_REGIONS")
|
|
80
|
+
if env is None:
|
|
81
|
+
return set(_DEFAULT_BLOCKED_REGIONS)
|
|
82
|
+
return {r.strip().upper() for r in env.split(",") if r.strip()}
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _regions() -> list[str]:
    """Return region names usable for launches: the ``/core/regions`` names minus blocked ones.

    If the API response yields no named region, a built-in list of four regions is used instead
    (and still filtered through the blocked set).
    """
    payload = request_with_retries("/core/regions")
    entries = payload.get("regions", []) if isinstance(payload, dict) else []
    found = []
    for entry in entries:
        if entry.get("name"):
            found.append(entry.get("name"))
    if not found:
        found = ["NORWAY-1", "CANADA-1", "US-1", "CANADA-2"]
    excluded = _blocked_regions()
    return [region_name for region_name in found if region_name.upper() not in excluded]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# Hyperstack regions that DON'T support block-volume operations. LIVE-FOUND: CANADA-2 returns HTTP 400
# "Volume operations are not supported in this region" on create — the region is launchable for COMPUTE
# but has no volume backend, so the cache can't live there. Mirrors RunPod's _VOLUME_INCAPABLE_DATACENTERS.
# These regions stay in `_regions()` (still usable for GPU launches), but the cache provisioner and the
# launch-time attach skip them rather than burning a guaranteed-failing API call (the launch path already
# degrades to a cold run there, so a stale list is graceful — but list a region here to silence the noise).
# Names are stored UPPER-CASE; lookups normalise before comparing.
_VOLUME_INCAPABLE_REGIONS = frozenset({"CANADA-2"})


def region_supports_cache(region: str) -> bool:
    """Whether the weight cache can be stored in ``region``.

    Returns False for known volume-incapable regions and True for everything else. The comparison
    ignores case and surrounding whitespace, matching how blocked regions are compared by their
    upper-cased name, so ``"canada-2"`` is rejected just like ``"CANADA-2"``.
    """
    # str() keeps a non-string argument (e.g. None) from raising; it simply counts as "not listed".
    return str(region).strip().upper() not in _VOLUME_INCAPABLE_REGIONS
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def cache_regions() -> list[str]:
    """Return the regions from ``_regions()`` for which ``region_supports_cache`` is true."""
    return list(filter(region_supports_cache, _regions()))
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def flavors_by_region(force: bool = False) -> dict[str, list[dict]]:
    """Return ``region -> [flavor dict]`` for every region from ``_regions()``.

    The listing is cached in ``_flavors_cache`` and reused for ``_FLAVORS_TTL_S`` seconds unless
    ``force`` is true. Each region's list flattens the ``flavors`` of every group in that region's
    ``/core/flavors`` response. Errors from the underlying requests propagate to the caller, and
    the cache is only updated after every region has been fetched.
    """
    fetched_at = time.time()
    cached = _flavors_cache["by_region"]
    still_fresh = cached is not None and fetched_at - _flavors_cache["ts"] < _FLAVORS_TTL_S
    if still_fresh and not force:
        return cached

    listing: dict[str, list[dict]] = {}
    for region in _regions():
        payload = request_with_retries(f"/core/flavors?region={region}")
        groups = payload.get("data", []) if isinstance(payload, dict) else []
        region_flavors: list[dict] = []
        for group in groups:
            region_flavors.extend(group.get("flavors", []))
        listing[region] = region_flavors
    _flavors_cache.update(ts=fetched_at, by_region=listing)
    return listing
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def regions_with_stock(flavor_name: str, force: bool = False) -> list[str]:
    """Return the regions in which a flavor named ``flavor_name`` has a truthy ``stock_available``.

    ``force`` is passed through to ``flavors_by_region`` to bypass its cache.
    """

    def _in_stock(flavors: list[dict]) -> bool:
        return any(
            flavor.get("name") == flavor_name and flavor.get("stock_available") for flavor in flavors
        )

    listing = flavors_by_region(force=force)
    return [region for region, flavors in listing.items() if _in_stock(flavors)]
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
# ---------------------------------------------------------------------------
|
|
143
|
+
# Environments (a launch targets ``default-<region>``)
|
|
144
|
+
# ---------------------------------------------------------------------------
|
|
145
|
+
_env_cache: dict[str, Any] = {"ts": 0.0, "by_region": None}
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def environment_for_region(region: str) -> str:
    """Return the environment name to launch into for ``region``.

    The region -> name mapping comes from ``/core/environments`` and is refreshed when it has never
    been loaded or is older than 300 seconds. A region missing from the mapping falls back to
    ``default-<region>``.
    """
    now = time.time()
    stale = _env_cache["by_region"] is None or now - _env_cache["ts"] > 300
    if stale:
        payload = request_with_retries("/core/environments")
        envs = payload.get("environments", []) if isinstance(payload, dict) else []
        mapping = {}
        for env in envs:
            if env.get("name"):
                mapping[env.get("region")] = env.get("name")
        _env_cache.update(ts=now, by_region=mapping)
    known = (_env_cache["by_region"] or {}).get(region)
    return known or f"default-{region}"
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# ---------------------------------------------------------------------------
|
|
159
|
+
# Keypairs (launch requires exactly one; we never SSH)
|
|
160
|
+
# ---------------------------------------------------------------------------
|
|
161
|
+
def list_keypairs() -> list[dict]:
    """Return the ``keypairs`` entry of the ``/core/keypairs`` response ([] for a non-dict reply)."""
    payload = request_with_retries("/core/keypairs")
    if not isinstance(payload, dict):
        return []
    return payload.get("keypairs", [])
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _generate_throwaway_public_key() -> str:
    """Return an OpenSSH ed25519 public key whose private half is immediately discarded.

    Hyperstack requires a key_name at launch even though the box is bootstrapped via cloud-init and
    we never SSH. The key pair is generated in a temporary directory that is deleted on return, so
    the private key never outlives this call.

    Shells out to ``ssh-keygen``; a slim control-plane image may not ship it.

    Raises:
        HyperstackApiError: ``ssh-keygen`` is missing, fails, times out, or leaves no readable
            public key. The message is actionable (install ``openssh-client`` or pin
            ``HYPERSTACK_KEYPAIR_NAME``) and includes ssh-keygen's stderr when there is any, so
            the failure does not look like an API/stock problem.
    """
    with tempfile.TemporaryDirectory() as workdir:
        key_path = os.path.join(workdir, "k")
        try:
            subprocess.run(
                # -C pins a fixed comment: the default comment is ``user@host``, which would
                # otherwise be uploaded to the provider along with the public key.
                ["ssh-keygen", "-t", "ed25519", "-f", key_path, "-N", "", "-q", "-C", "flash-managed"],
                check=True,
                timeout=30,
                stdin=subprocess.DEVNULL,  # never block on an interactive prompt
                capture_output=True,
                text=True,
            )
            # Read inside the same try: a missing/unreadable .pub file is a generation failure
            # too and must surface as HyperstackApiError, not a bare FileNotFoundError.
            with open(key_path + ".pub", encoding="utf-8") as pub_file:
                public_key = pub_file.read().strip()
        except (OSError, subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
            stderr = getattr(e, "stderr", None)
            if isinstance(stderr, bytes):
                stderr = stderr.decode("utf-8", "replace")
            detail = f"; stderr: {stderr.strip()}" if stderr and stderr.strip() else ""
            raise HyperstackApiError(
                "could not generate a throwaway Hyperstack keypair: ssh-keygen is unavailable or "
                f"failed ({e}{detail}). Install openssh-client on the control plane (to provide ssh-keygen), "
                "or set HYPERSTACK_KEYPAIR_NAME to an existing keypair to skip generation."
            ) from e
    if not public_key:
        raise HyperstackApiError(
            "could not generate a throwaway Hyperstack keypair: ssh-keygen produced an empty public "
            "key. Set HYPERSTACK_KEYPAIR_NAME to an existing keypair to skip generation."
        )
    return public_key
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def resolve_key_name(environment_name: str) -> str:
    """Return a keypair name usable to launch into ``environment_name``.

    Resolution order:
      1. ``HYPERSTACK_KEYPAIR_NAME`` if set (surrounding whitespace ignored);
      2. an existing keypair bound to exactly ``environment_name``;
      3. the managed ``flash-managed-<environment_name>`` key, imported with a throwaway public key
         when it does not exist yet.

    Idempotent per environment.

    Raises:
        HyperstackApiError: listing keypairs fails, the throwaway key cannot be generated, or the
            import is rejected for a reason other than a benign create race.
    """
    pinned = (os.environ.get("HYPERSTACK_KEYPAIR_NAME") or "").strip()
    if pinned:
        return pinned
    # Ignore malformed (non-dict) entries instead of crashing on ``.get``.
    existing = [k for k in list_keypairs() if isinstance(k, dict)]

    def _env_name(k: dict) -> str:
        # The environment is handled both as an object with a ``name`` and as a plain string.
        env = k.get("environment")
        return (env.get("name") or "") if isinstance(env, dict) else (env or "")

    # Reuse only a key bound to the EXACT target environment (Hyperstack keypairs are env-scoped, so
    # an env-less / other-env key may be rejected at launch). Otherwise fall through to creating the
    # env-scoped managed key below.
    for k in existing:
        if k.get("name") and _env_name(k) == environment_name:
            return k["name"]
    name = f"{_MANAGED_KEYPAIR}-{environment_name}"
    if any(k.get("name") == name for k in existing):
        return name
    try:
        request_with_retries(
            "/core/keypairs",
            method="POST",
            body={
                "name": name,
                "environment_name": environment_name,
                "public_key": _generate_throwaway_public_key(),
            },
            retries=0,
        )
    except HyperstackApiError as e:
        # Create race: two concurrent launches into the same env both see no managed key and both
        # POST. The loser gets an "already exists" rejection — that is SUCCESS, the env-scoped key now
        # exists (the winner created it), so return the name. Re-list to confirm the key is really
        # present before swallowing the error, so an UNRELATED 4xx (e.g. a bad public key / perms)
        # still surfaces rather than being silently treated as a benign duplicate.
        if _keypair_create_conflict(e) and any(
            isinstance(k, dict) and k.get("name") == name for k in list_keypairs()
        ):
            return name
        raise
    return name
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _keypair_create_conflict(err: Exception) -> bool:
|
|
245
|
+
"""True when a keypair POST failed because the name already exists (a benign create race), not
|
|
246
|
+
for some other reason. Hyperstack returns 409/400 with an 'already exists'/'duplicate' body."""
|
|
247
|
+
s = str(err).lower()
|
|
248
|
+
if "-> http 409" in s:
|
|
249
|
+
return True
|
|
250
|
+
return ("-> http 4" in s) and ("already exist" in s or "duplicate" in s or "in use" in s)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
# ---------------------------------------------------------------------------
|
|
254
|
+
# Images (need a Docker-preinstalled, CUDA-12.8 Ubuntu image to run WORKER_IMAGE)
|
|
255
|
+
# ---------------------------------------------------------------------------
|
|
256
|
+
_image_cache: dict[str, str] = {}
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _image_cuda(name: str) -> float:
|
|
260
|
+
"""Parse the CUDA version out of a Hyperstack image name (e.g. '... CUDA 12.8 ...' -> 12.8)."""
|
|
261
|
+
m = re.search(r"cuda (\d+(?:\.\d+)?)", name.lower())
|
|
262
|
+
return float(m.group(1)) if m else 0.0
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def docker_image_for_region(region: str, min_cuda: str = "12.8") -> str:
    """Return a Docker-preinstalled image name in ``region`` whose host CUDA covers the run.

    The host driver's CUDA must be at least the GPU class's floor (``min_cuda`` — e.g. 13.0 for
    Blackwell) AND the cu128 worker container's 12.8. Among the fitting Docker images the LOWEST
    qualifying CUDA is picked (closest to the worker stack), preferring a name containing "24.04".
    The choice is memoised per ``region``/required-CUDA for the life of the process.

    Args:
        region: Hyperstack region name to list images for.
        min_cuda: Minimum CUDA version as a string parseable by ``float``.

    Raises:
        HyperstackApiError: no Docker image in ``region`` meets the required CUDA, or the listing
            request fails. Callers skip the region on this error, so malformed entries in the
            listing (non-dict items, missing/non-string names) are ignored rather than allowed to
            escape as ``AttributeError``/``TypeError``.
        ValueError: ``min_cuda`` is not a number.
    """
    required = max(12.8, float(min_cuda))
    key = f"{region}|{required}"
    if key in _image_cache:
        return _image_cache[key]
    out = request_with_retries(f"/core/images?region={region}")
    groups = out.get("images") if isinstance(out, dict) else None
    flat: list[dict] = []
    for x in groups if isinstance(groups, list) else []:
        if not isinstance(x, dict):
            continue
        if "images" in x:
            # Grouped shape: the entry nests the real images under its own "images" key.
            nested = x["images"]
            if isinstance(nested, list):
                flat.extend(im for im in nested if isinstance(im, dict))
        else:
            flat.append(x)
    # Only string names can be matched; a null/absent name is skipped.
    names = [im["name"] for im in flat if isinstance(im.get("name"), str)]
    # Docker-preinstalled ONLY (the cloud-init does not install Docker) AND CUDA >= required.
    docker_imgs = [n for n in names if "with docker" in n.lower()]
    # The epsilon absorbs float rounding when comparing versions such as 12.8.
    fitting = [n for n in docker_imgs if _image_cuda(n) >= required - 1e-9]
    if not fitting:
        have = sorted({_image_cuda(n) for n in docker_imgs})
        raise HyperstackApiError(
            f"no Docker image with CUDA>={required} in {region} (available Docker CUDA: {have})"
        )
    # Lowest CUDA first; within the same CUDA, names containing "24.04" sort ahead (False < True).
    best = min(fitting, key=lambda n: (_image_cuda(n), "24.04" not in n))
    _image_cache[key] = best
    return best
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
# ---------------------------------------------------------------------------
|
|
300
|
+
# Virtual machines
|
|
301
|
+
# ---------------------------------------------------------------------------
|
|
302
|
+
def launch_vm(
    *, name: str, environment_name: str, image_name: str, flavor_name: str, key_name: str, user_data: str
) -> str:
    """Launch one VM and return its id as a string.

    NON-IDEMPOTENT: never retried (a blind retry on a timeout where Hyperstack accepted the first
    request would double-provision).

    Args:
        name: VM name, sent as given (not truncated here).
        environment_name: Environment to launch into.
        image_name: Image to boot.
        flavor_name: Flavor (GPU/size) to provision.
        key_name: Keypair name required by the launch API.
        user_data: cloud-init payload that bootstraps the box.

    Raises:
        HyperstackApiError: the request is rejected (no stock, etc.) or the response carries no
            usable VM id — including malformed responses, which are reported through this error
            type rather than as ``KeyError``/``AttributeError``.
    """
    body = {
        # ``name`` is bounded <=60 by ``_instance.run_label_prefix`` (NOT truncated here) so the
        # stored name always equals the prefix ``sweep_orphans`` matches on.
        "name": name,
        "environment_name": environment_name,
        "image_name": image_name,
        "flavor_name": flavor_name,
        "key_name": key_name,
        "count": 1,
        "assign_floating_ip": True,  # public IP for outbound (Docker pull + HF egress)
        "user_data": user_data,
    }
    out = request_with_retries("/core/virtual-machines", method="POST", body=body, retries=0)
    first = None
    if isinstance(out, dict):
        insts = out.get("instances")
        if isinstance(insts, list) and insts:
            first = insts[0]
        else:
            # Some responses nest a single instance under "instance".
            first = out.get("instance")
    vm_id = first.get("id") if isinstance(first, dict) else None
    # ``is None`` rather than truthiness, so a falsy-but-real id such as 0 is accepted — the
    # same rule the VM listing applies when de-duplicating ids.
    if vm_id is None or vm_id == "":
        raise HyperstackApiError(f"launch({flavor_name}@{environment_name}) returned no VM id: {out}")
    return str(vm_id)
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def get_vm(vm_id: str) -> dict | None:
    """Fetch one VM's detail dict; None when it no longer exists or the reply has no instance dict.

    Any ``HyperstackApiError`` that ``is_not_found`` does not recognise is re-raised.
    """
    try:
        payload = request_with_retries(f"/core/virtual-machines/{vm_id}")
    except HyperstackApiError as err:
        # Robust 404 check (NOT a bare "404" substring): Hyperstack VM ids are short integers, so a
        # transient 5xx on a VM whose id contains "404" must not be misread as "deleted".
        if not is_not_found(err):
            raise
        return None
    if not isinstance(payload, dict):
        return None
    detail = payload.get("instance")
    return detail if isinstance(detail, dict) else None
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
# Hyperstack paginates ``/core/virtual-machines``: a single GET returns only the FIRST page, so an
|
|
347
|
+
# account with more VMs than one page silently hides the rest. ``sweep_orphans`` lists every VM to
|
|
348
|
+
# reap orphans, so a missed page = a leaked, still-billing box. Walk every page (bounded so a buggy
|
|
349
|
+
# server that never shrinks the page can't loop forever) and concatenate.
|
|
350
|
+
_VM_PAGE_SIZE = 100
|
|
351
|
+
_VM_MAX_PAGES = 1000
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def list_vms() -> list[dict]:
    """Return every VM from ``/core/virtual-machines`` across ALL pages.

    Pages are requested via ``page``/``per_page`` until one of: an empty page, a short page, or a
    full page that contributes no previously unseen id (a server ignoring ``page``). Entries are
    de-duplicated by id; entries without an id are always kept.

    Raises:
        HyperstackApiError: a page is malformed (response not a dict, or ``instances`` not a
            list), or pagination is still returning full pages one page past ``_VM_MAX_PAGES``.
            In both cases a partial list is never returned as if it were the complete fleet.
            Request errors from a page fetch propagate as well.
    """
    fleet: list[dict] = []
    known_ids: set[str] = set()
    # One page beyond the cap acts as a probe: when page ``_VM_MAX_PAGES`` is full, only the next
    # page can tell "fleet is an exact multiple of the page size" (probe empty) from "truncated".
    probe_page = _VM_MAX_PAGES + 1
    for page in range(1, probe_page + 1):
        resp = request_with_retries(f"/core/virtual-machines?page={page}&per_page={_VM_PAGE_SIZE}")
        page_items = resp.get("instances") if isinstance(resp, dict) else None
        # A malformed page is NOT a valid empty page: the list gathered so far cannot be trusted
        # as authoritative, so fail instead of returning it.
        if not isinstance(page_items, list):
            raise HyperstackApiError(
                f"unexpected /core/virtual-machines response on page {page} "
                f"(no 'instances' list): {resp!r}"
            )
        if not page_items:
            break  # valid empty page -> pagination finished
        fresh = 0
        for vm in page_items:
            # ``is not None`` (not truthiness) so a legitimately falsy id like 0 still de-dupes.
            if isinstance(vm, dict) and vm.get("id") is not None:
                vid = str(vm["id"])
                if vid in known_ids:
                    continue
                known_ids.add(vid)
            fleet.append(vm)
            fresh += 1
        # Short page = last page; a full page with nothing new = server echoing the same page.
        if len(page_items) < _VM_PAGE_SIZE or fresh == 0:
            break
    else:
        # No break: even the probe page was full and added new VMs, so the fleet really extends
        # past the cap. Refuse to hand back the truncated list.
        raise HyperstackApiError(
            f"/core/virtual-machines pagination did not terminate within {_VM_MAX_PAGES} pages "
            f"(the page past the cap was still full at {_VM_PAGE_SIZE}/page, {len(fleet)} VMs "
            f"collected) — refusing to return a truncated fleet that an orphan sweep could read as "
            f"authoritative"
        )
    return fleet
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def delete_vm(vm_id: str) -> bool:
    """Delete (and stop billing for) a VM. Best-effort: never raises.

    Returns True when the DELETE succeeds or fails with a conflict (as judged by ``is_conflict``);
    False for a falsy ``vm_id`` or any other failure, which is logged as a warning.
    """
    if not vm_id:
        return False
    try:
        request_with_retries(f"/core/virtual-machines/{vm_id}", method="DELETE", retries=2)
    except Exception as exc:
        # Hyperstack returns 409 when a prior delete request is still being processed: the VM is
        # already queued for teardown so billing will stop — treat as success. The decision is made
        # by ``is_conflict``, NOT by a bare "409"/"conflict" substring — a non-409 failure whose
        # text merely contains "409" (e.g. a "4090" GPU name) must still surface, or a real delete
        # failure would be silently dropped and the VM left billing.
        if is_conflict(exc):
            return True
        logger.warning("hyperstack delete_vm(%s) failed: %s", vm_id, exc)
        return False
    return True
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
# ---------------------------------------------------------------------------
|
|
443
|
+
# Volumes (the weight cache). Region/environment-scoped block storage; created via the API, attached
|
|
444
|
+
# to a VM AFTER launch, then formatted+mounted in the VM by the cloud-init preamble. Block volumes
|
|
445
|
+
# are single-attach (one VM at a time) — concurrent same-region runs fall back to cold.
|
|
446
|
+
# ---------------------------------------------------------------------------
|
|
447
|
+
_CACHE_VOLUME_TYPE = "Cloud-SSD"
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
def list_volumes() -> list[dict]:
    """Return the ``volumes`` list from ``/core/volumes``; [] when the reply has no such list.

    Entries are expected to look like ``{id, name, environment:{name,region}, status, attachments}``.
    """
    payload = request_with_retries("/core/volumes")
    if isinstance(payload, dict) and isinstance(payload.get("volumes"), list):
        return payload["volumes"]
    return []
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def create_volume(name: str, environment_name: str, size_gb: int) -> dict:
    """Create a ``Cloud-SSD`` volume named ``name`` in ``environment_name``.

    Returns the ``volume`` object from the response (incl. its ``id``), or ``{}`` when the
    response carries none. ``size_gb`` is coerced with ``int`` before being sent.
    """
    payload = dict(
        name=name,
        environment_name=environment_name,
        volume_type=_CACHE_VOLUME_TYPE,
        size=int(size_gb),
    )
    resp = request_with_retries("/core/volumes", method="POST", body=payload, retries=2)
    if not isinstance(resp, dict):
        return {}
    return resp.get("volume") or {}
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def delete_volume(volume_id) -> bool:
    """Delete a volume by id. Best-effort: never raises.

    Args:
        volume_id: Id of the volume to delete. ``None`` or ``""`` means "no volume".

    Returns:
        True when the DELETE request succeeds; False when there is no id to delete or the request
        fails (a failure is logged as a warning).
    """
    # Nothing to delete: skip the guaranteed-failing ``/core/volumes/None`` call and its retries.
    # An explicit None/"" test keeps a falsy-but-real id such as 0 deletable.
    if volume_id is None or volume_id == "":
        return False
    try:
        request_with_retries(f"/core/volumes/{volume_id}", method="DELETE", retries=2)
        return True
    except Exception as exc:
        logger.warning("hyperstack delete_volume(%s) failed: %s", volume_id, exc)
        return False
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def attach_volume(vm_id: str, volume_id) -> bool:
    """Attach a volume to a VM. Best-effort: never raises.

    The cloud-init preamble then formats+mounts the device; if the attach fails the run still
    proceeds COLD (the preamble degrades when no device appears).

    Args:
        vm_id: Id of the target VM.
        volume_id: Id of the volume to attach. ``None`` or ``""`` means "no volume".

    Returns:
        True when the attach request succeeds; False when either id is missing or the request
        fails (a failure is logged as a warning).
    """
    # Without both ids the request can only fail; skip it (and its retries) and let the run go
    # cold. The volume id is tested against None/"" so a falsy-but-real id such as 0 still attaches.
    if not vm_id or volume_id is None or volume_id == "":
        return False
    try:
        request_with_retries(
            f"/core/virtual-machines/{vm_id}/attach-volumes",
            method="POST",
            body={"volume_ids": [volume_id]},
            retries=2,
        )
        return True
    except Exception as exc:
        logger.warning("hyperstack attach_volume(vm=%s, vol=%s) failed: %s", vm_id, volume_id, exc)
        return False
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
def cache_volume_name(base: str, region: str) -> str:
    """Return the lower-cased, region-specific physical name ``<base>-<region>`` of a cache volume.

    Hyperstack enforces GLOBAL volume-name uniqueness: a plain ``flash-weights`` can exist in only
    ONE environment account-wide, and a second create elsewhere returns HTTP 400 "This name is not
    available". The cache is one logical volume (``base`` == ``spec.gpu.network_volume``, e.g.
    ``flash-weights``) realized as one physical volume per region, so the region MUST be part of
    the name. This mirrors RunPod's per-DC ``weight_cache_volume_name``.

    The cloud-init preamble mounts by device size rather than by name, so the worker is
    unaffected: every cache volume mounts at the same ``/mnt/flash-weights``.
    """
    combined = "{}-{}".format(base, region)
    return combined.lower()
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
def ensure_volume(name: str, environment_name: str, size_gb: int) -> object:
    """Return the id of the cache volume ``name`` in ``environment_name``, creating it if absent.

    Idempotent: the first listed volume whose name and environment name both match is reused.
    Otherwise a new volume of ``size_gb`` is created and its ``id`` returned.

    ``name`` MUST be globally unique, so build it with ``cache_volume_name(base, region)``;
    Hyperstack rejects a duplicate name across environments.
    """
    for volume in list_volumes():
        if volume.get("name") != name:
            continue
        environment = volume.get("environment") or {}
        if environment.get("name") == environment_name:
            return volume.get("id")
    created = create_volume(name, environment_name, size_gb)
    return created.get("id")
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
def delete_vms(vm_ids: list[str]) -> list[str]:
    """Delete each VM in ``vm_ids`` (best-effort, per-id isolated) and return those that went away.

    Only ids for which ``delete_vm`` reported success are returned, so callers
    (sweep_orphans / terminate_run_instances) report only what was truly torn down. A partial
    failure must not log or return a still-billing VM as reaped. Falsy ids are skipped.
    """
    reaped = []
    for raw_id in vm_ids:
        if not raw_id:
            continue
        vm_key = str(raw_id)
        if delete_vm(vm_key):
            reaped.append(vm_key)
    return reaped
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Hyperstack credential handling (operator-side), mirroring the RunPod/Lambda auth modules.
|
|
2
|
+
|
|
3
|
+
The Hyperstack REST client authenticates via the ``HYPERSTACK_API_KEY`` environment variable, set
|
|
4
|
+
by the **operator** on the control-plane host. Env-only by design. Hyperstack presents the key in a
|
|
5
|
+
bare ``api_key`` header (not ``Authorization: Bearer``).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from .._auth import load_provider_key
|
|
11
|
+
|
|
12
|
+
# Name of the environment variable from which the Hyperstack API key is loaded.
_ENV_VAR = "HYPERSTACK_API_KEY"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def load_api_key() -> str | None:
    """Return the Hyperstack API key looked up via ``load_provider_key`` for ``_ENV_VAR``."""
    api_key = load_provider_key(_ENV_VAR)
    return api_key
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Hyperstack's GPU classes (its rows of the shared GPU table).
|
|
2
|
+
|
|
3
|
+
The class table is provider-agnostic and lives in ``providers/base.py``. This module carves out
|
|
4
|
+
Hyperstack's rows (``gpu_classes()`` == every class with a ``hyperstack_name``) and owns the
|
|
5
|
+
friendly-name -> Hyperstack flavor translation.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from flash.providers.base import GpuClass, UnsupportedGpuError, get_gpu_info, providers_for
|
|
11
|
+
|
|
12
|
+
# Names exported as this module's public API.
__all__ = ["flavor_for", "gpu_classes"]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def gpu_classes() -> list[GpuClass]:
    """Return every entry of the shared GPU table that has a truthy ``hyperstack_name``.

    These are the GPU classes Hyperstack can provision.
    """
    from flash.providers.base import GPU_INFO

    provisionable = []
    for gpu in GPU_INFO.values():
        if gpu.hyperstack_name:
            provisionable.append(gpu)
    return provisionable
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def flavor_for(name: str) -> str:
    """Translate a friendly GPU class name into its Hyperstack single-GPU flavor.

    Example flavor: ``'n3-L40x1'``.

    Raises:
        UnsupportedGpuError: the class has no ``hyperstack_name``; the message lists the
            providers reported by ``providers_for(name)``.
    """
    gpu = get_gpu_info(name)
    flavor = gpu.hyperstack_name
    if flavor:
        return flavor
    gpu_name = gpu.name
    offered_by = ", ".join(providers_for(name))
    raise UnsupportedGpuError(
        f"{gpu_name} is not available on Hyperstack (providers: {offered_by})"
    )
|