freesolo-flash-dev 0.2.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. flash/__init__.py +29 -0
  2. flash/_channel.py +23 -0
  3. flash/_fileio.py +35 -0
  4. flash/_logging.py +49 -0
  5. flash/_update_check.py +266 -0
  6. flash/catalog.py +253 -0
  7. flash/cli/__init__.py +1 -0
  8. flash/cli/main/__init__.py +227 -0
  9. flash/cli/main/__main__.py +6 -0
  10. flash/cli/main/commands.py +636 -0
  11. flash/cli/main/envpush.py +317 -0
  12. flash/cli/main/render.py +599 -0
  13. flash/cli/main/training_doc.py +455 -0
  14. flash/client/__init__.py +14 -0
  15. flash/client/config.py +70 -0
  16. flash/client/http.py +372 -0
  17. flash/client/runtime_secrets.py +69 -0
  18. flash/client/specs.py +20 -0
  19. flash/cost/__init__.py +16 -0
  20. flash/cost/analytical.py +175 -0
  21. flash/cost/facts.py +114 -0
  22. flash/cost/spec.py +113 -0
  23. flash/cost/types.py +158 -0
  24. flash/engine/__init__.py +6 -0
  25. flash/engine/accounting.py +36 -0
  26. flash/engine/chalk_kernels.py +116 -0
  27. flash/engine/multiturn_rollout.py +780 -0
  28. flash/engine/recipe.py +86 -0
  29. flash/engine/vram.py +603 -0
  30. flash/engine/worker/__init__.py +2916 -0
  31. flash/engine/worker/__main__.py +4 -0
  32. flash/engine/worker/kernel_warmup.py +400 -0
  33. flash/engine/worker/lora.py +796 -0
  34. flash/engine/worker/packing.py +366 -0
  35. flash/engine/worker/perf.py +1048 -0
  36. flash/envs/__init__.py +10 -0
  37. flash/envs/adapter/__init__.py +883 -0
  38. flash/envs/adapter/rubric.py +222 -0
  39. flash/envs/base.py +52 -0
  40. flash/envs/registry.py +62 -0
  41. flash/mcp/__init__.py +1 -0
  42. flash/mcp/server.py +85 -0
  43. flash/providers/__init__.py +59 -0
  44. flash/providers/_auth.py +24 -0
  45. flash/providers/_http.py +230 -0
  46. flash/providers/_instance.py +416 -0
  47. flash/providers/_instance_bootstrap.py +517 -0
  48. flash/providers/_poll.py +311 -0
  49. flash/providers/allocator.py +193 -0
  50. flash/providers/base.py +431 -0
  51. flash/providers/hyperstack/__init__.py +127 -0
  52. flash/providers/hyperstack/api.py +522 -0
  53. flash/providers/hyperstack/auth.py +17 -0
  54. flash/providers/hyperstack/gpus.py +29 -0
  55. flash/providers/hyperstack/jobs/__init__.py +632 -0
  56. flash/providers/hyperstack/jobs/builders.py +122 -0
  57. flash/providers/hyperstack/preflight.py +23 -0
  58. flash/providers/hyperstack/pricing.py +26 -0
  59. flash/providers/hyperstack/train.py +25 -0
  60. flash/providers/lambdalabs/__init__.py +139 -0
  61. flash/providers/lambdalabs/api.py +261 -0
  62. flash/providers/lambdalabs/auth.py +18 -0
  63. flash/providers/lambdalabs/gpus.py +29 -0
  64. flash/providers/lambdalabs/jobs/__init__.py +724 -0
  65. flash/providers/lambdalabs/jobs/builders.py +118 -0
  66. flash/providers/lambdalabs/preflight.py +27 -0
  67. flash/providers/lambdalabs/pricing.py +51 -0
  68. flash/providers/lambdalabs/train.py +27 -0
  69. flash/providers/preflight.py +55 -0
  70. flash/providers/realized.py +80 -0
  71. flash/providers/runpod/__init__.py +130 -0
  72. flash/providers/runpod/api.py +186 -0
  73. flash/providers/runpod/auth.py +37 -0
  74. flash/providers/runpod/cost.py +57 -0
  75. flash/providers/runpod/gpus.py +46 -0
  76. flash/providers/runpod/jobs.py +956 -0
  77. flash/providers/runpod/keys.py +139 -0
  78. flash/providers/runpod/preflight.py +30 -0
  79. flash/providers/runpod/preload.py +915 -0
  80. flash/providers/runpod/pricing.py +18 -0
  81. flash/providers/runpod/slots.py +79 -0
  82. flash/providers/runpod/train/__init__.py +150 -0
  83. flash/providers/runpod/train/deps.py +395 -0
  84. flash/providers/runpod/train/endpoints.py +820 -0
  85. flash/py.typed +0 -0
  86. flash/runner/__init__.py +686 -0
  87. flash/runner/checkpoints.py +82 -0
  88. flash/runner/deploy.py +422 -0
  89. flash/runner/lifecycle.py +672 -0
  90. flash/schema/__init__.py +375 -0
  91. flash/schema/fields.py +331 -0
  92. flash/serve/__init__.py +1 -0
  93. flash/serve/deploy.py +326 -0
  94. flash/serve/pricing.py +60 -0
  95. flash/server/__init__.py +1 -0
  96. flash/server/__main__.py +20 -0
  97. flash/server/app.py +961 -0
  98. flash/server/auth.py +263 -0
  99. flash/server/billing.py +124 -0
  100. flash/server/checkpoints.py +110 -0
  101. flash/server/db.py +160 -0
  102. flash/server/environment_registry.py +102 -0
  103. flash/server/envs.py +360 -0
  104. flash/server/reconcile.py +163 -0
  105. flash/server/run_registry.py +150 -0
  106. flash/spec.py +333 -0
  107. freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
  108. freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
  109. freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
  110. freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
  111. freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,522 @@
1
+ """Thin Hyperstack (NexGen Cloud) REST client (no SDK state): flavors/capacity + VM lifecycle.
2
+
3
+ Mirrors ``providers/lambdalabs/api.py``: stdlib urllib via the shared ``RestClient``, hardened
4
+ retries, nothing persisted locally. Hyperstack specifics:
5
+
6
+ * **Auth header.** Hyperstack presents the key as a bare ``api_key: <key>`` header (NOT
7
+ ``Authorization: Bearer``); the ``RestClient`` is configured with ``auth_header_name="api_key"``.
8
+ * **Capacity = ``stock_available``.** ``/core/flavors`` carries per-flavor ``stock_available`` per
9
+ region (the Hyperstack analog of Lambda's ``regions_with_capacity_available``); a flavor with no
10
+ stock in any region can't launch.
11
+ * **Launch needs an environment + a keypair.** Every region has a ``default-<region>`` environment;
12
+ a launch requires exactly one keypair name (env-scoped). The box is bootstrapped via cloud-init
13
+ ``user_data`` and we never SSH, so the key is a formality — ``resolve_key_name`` reuses an
14
+ existing key or imports a throwaway one (private half discarded; no inbound-SSH rule is opened).
15
+ * **Non-idempotent launch.** ``POST /core/virtual-machines`` provisions a NEW (billed) VM every
16
+ time it succeeds, so it is NEVER retried.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import os
22
+ import re
23
+ import subprocess
24
+ import tempfile
25
+ import time
26
+ from typing import Any
27
+
28
+ from flash._logging import get_logger
29
+ from flash.providers._http import RestClient, is_conflict, is_not_found
30
+
31
+ logger = get_logger(__name__)
32
+
33
+ HYPERSTACK_BASE = "https://infrahub-api.nexgencloud.com/v1"
34
+ _USER_AGENT = "flash-hyperstack/1.0 (+https://freesolo.co)"
35
+ # The managed keypair name (env-scoped). Operators can pin an existing key via HYPERSTACK_KEYPAIR_NAME.
36
+ _MANAGED_KEYPAIR = "flash-managed"
37
+
38
+
39
class HyperstackApiError(RuntimeError):
    """Error type for Hyperstack REST failures.

    Handed to the shared ``RestClient`` as its ``error_cls`` and raised directly by the helpers
    in this module when a response is rejected or malformed.
    """
41
+
42
+
43
# Shared REST client for every Hyperstack call in this module.
_CLIENT = RestClient(
    # Where requests go and how they identify themselves.
    base_url=HYPERSTACK_BASE,
    extra_headers={"User-Agent": _USER_AGENT},
    # Credentials: the key is read from the environment and sent as a bare ``api_key`` header
    # (the value is the key itself, with no ``Bearer`` prefix).
    env_var="HYPERSTACK_API_KEY",
    auth_header_name="api_key",
    auth_value_format="{key}",
    # Failure reporting.
    error_cls=HyperstackApiError,
    missing_key_message="HYPERSTACK_API_KEY not configured on the control-plane host",
)
52
+
53
+
54
def request_with_retries(
    path: str, method: str = "GET", body: dict | None = None, retries: int = 4, base_delay: float = 2.0
) -> Any:
    """Issue one Hyperstack REST call through the module-level ``_CLIENT``.

    All arguments are forwarded unchanged to ``_CLIENT.request_with_retries``; whatever that call
    returns is returned as-is.
    """
    options = {"method": method, "body": body, "retries": retries, "base_delay": base_delay}
    return _CLIENT.request_with_retries(path, **options)
60
+
61
+
62
+ # ---------------------------------------------------------------------------
63
+ # Flavors + capacity (cached: pricing, the allocator, and the launcher all read this)
64
+ # ---------------------------------------------------------------------------
65
+ _FLAVORS_TTL_S = 45.0
66
+ _flavors_cache: dict[str, Any] = {"ts": 0.0, "by_region": None}
67
+
68
+
69
+ # Regions excluded from allocation + launch. CANADA-1's on-demand stock is a known-broken-driver
70
+ # fleet: instances reach ACTIVE then die without a DONE sentinel (NVML init failure / cuda
71
+ # unavailable), so launching there burns GPU budget on guaranteed-failed retries. Skip it by default
72
+ # rather than waterfall through it. Override with HYPERSTACK_BLOCKED_REGIONS (comma-separated): unset
73
+ # -> the default below; set (even to "") -> exactly that list, so operators can re-enable CANADA-1
74
+ # (HYPERSTACK_BLOCKED_REGIONS="") or block others once capacity/driver health changes.
75
+ _DEFAULT_BLOCKED_REGIONS = frozenset({"CANADA-1"})
76
+
77
+
78
+ def _blocked_regions() -> set[str]:
79
+ env = os.environ.get("HYPERSTACK_BLOCKED_REGIONS")
80
+ if env is None:
81
+ return set(_DEFAULT_BLOCKED_REGIONS)
82
+ return {r.strip().upper() for r in env.split(",") if r.strip()}
83
+
84
+
85
def _regions() -> list[str]:
    """Launchable region names: the API's region list minus the blocked regions.

    Reads ``/core/regions`` and keeps every entry that is a dict with a non-empty string
    ``name``. A payload that is not a dict, a missing/``null`` ``regions`` value, or entries of
    an unexpected shape are skipped rather than raising, so a malformed response degrades to the
    built-in fallback list instead of surfacing ``TypeError``/``AttributeError``.

    Returns:
        Region names in API order (or the fallback order), excluding any whose upper-cased name
        is in ``_blocked_regions()``.

    Raises:
        HyperstackApiError: propagated from ``request_with_retries`` on a failed request.
    """
    out = request_with_retries("/core/regions")
    regs = out.get("regions") if isinstance(out, dict) else None
    names = [
        r["name"]
        for r in (regs or [])
        if isinstance(r, dict) and isinstance(r.get("name"), str) and r["name"]
    ]
    if not names:
        # Nothing usable came back: fall back to the statically known regions.
        names = ["NORWAY-1", "CANADA-1", "US-1", "CANADA-2"]
    blocked = _blocked_regions()
    return [n for n in names if n.upper() not in blocked]
92
+
93
+
94
+ # Hyperstack regions that DON'T support block-volume operations. LIVE-FOUND: CANADA-2 returns HTTP 400
95
+ # "Volume operations are not supported in this region" on create — the region is launchable for COMPUTE
96
+ # but has no volume backend, so the cache can't live there. Mirrors RunPod's _VOLUME_INCAPABLE_DATACENTERS.
97
+ # These regions stay in `_regions()` (still usable for GPU launches), but the cache provisioner and the
98
+ # launch-time attach skip them rather than burning a guaranteed-failing API call (the launch path already
99
+ # degrades to a cold run there, so a stale list is graceful — but list a region here to silence the noise).
100
_VOLUME_INCAPABLE_REGIONS = frozenset({"CANADA-2"})


def region_supports_cache(region: str) -> bool:
    """Whether the weight cache can be stored in ``region``.

    The comparison is case- and whitespace-insensitive, matching how ``_regions`` compares
    region names against the blocked set (upper-cased), so ``"canada-2"`` and ``"CANADA-2"`` are
    treated as the same region.

    Args:
        region: Hyperstack region name. An empty/``None`` value is treated as "not listed".

    Returns:
        ``False`` for regions listed in ``_VOLUME_INCAPABLE_REGIONS``, ``True`` otherwise.
    """
    normalized = (region or "").strip().upper()
    return normalized not in _VOLUME_INCAPABLE_REGIONS
106
+
107
+
108
def cache_regions() -> list[str]:
    """Regions from ``_regions()`` for which ``region_supports_cache`` is true, in the same order."""
    return list(filter(region_supports_cache, _regions()))
111
+
112
+
113
def flavors_by_region(force: bool = False) -> dict[str, list[dict]]:
    """Map each region from ``_regions()`` to the flat list of its flavor dicts.

    The result is memoized in ``_flavors_cache`` and reused while it is younger than
    ``_FLAVORS_TTL_S`` seconds; ``force=True`` bypasses the cache and refetches. Errors raised
    by ``request_with_retries`` propagate to the caller and leave the cache untouched.
    """
    stamp = time.time()
    cached = _flavors_cache["by_region"]
    still_fresh = stamp - _flavors_cache["ts"] < _FLAVORS_TTL_S
    if cached is not None and not force and still_fresh:
        return cached

    fetched: dict[str, list[dict]] = {}
    for region in _regions():
        payload = request_with_retries(f"/core/flavors?region={region}")
        groups = payload.get("data", []) if isinstance(payload, dict) else []
        # Each group nests its flavors under "flavors"; flatten them per region.
        flavors: list[dict] = []
        for group in groups:
            flavors.extend(group.get("flavors", []))
        fetched[region] = flavors

    _flavors_cache["ts"] = stamp
    _flavors_cache["by_region"] = fetched
    return fetched
129
+
130
+
131
def regions_with_stock(flavor_name: str, force: bool = False) -> list[str]:
    """Regions in which a flavor named ``flavor_name`` reports a truthy ``stock_available``.

    ``force`` is passed through to ``flavors_by_region`` to bypass its cache.
    """

    def _in_stock(flavors: list[dict]) -> bool:
        # True as soon as one matching flavor with stock is seen.
        return any(f.get("name") == flavor_name and f.get("stock_available") for f in flavors)

    catalog = flavors_by_region(force=force)
    return [region for region, flavors in catalog.items() if _in_stock(flavors)]
140
+
141
+
142
+ # ---------------------------------------------------------------------------
143
+ # Environments (a launch targets ``default-<region>``)
144
+ # ---------------------------------------------------------------------------
145
+ _env_cache: dict[str, Any] = {"ts": 0.0, "by_region": None}
146
+
147
+
148
def environment_for_region(region: str) -> str:
    """Environment name to launch into for ``region``.

    Looks the region up in a ``region -> environment name`` map built from
    ``/core/environments`` (refetched when absent or older than 300 seconds) and falls back to
    ``default-<region>`` when the map has no usable entry.
    """
    stamp = time.time()
    needs_refresh = _env_cache["by_region"] is None or stamp - _env_cache["ts"] > 300
    if needs_refresh:
        payload = request_with_retries("/core/environments")
        envs = payload.get("environments", []) if isinstance(payload, dict) else []
        mapping = {}
        for env in envs:
            if env.get("name"):
                mapping[env.get("region")] = env.get("name")
        _env_cache.update(ts=stamp, by_region=mapping)
    known = _env_cache["by_region"] or {}
    return known.get(region) or f"default-{region}"
156
+
157
+
158
+ # ---------------------------------------------------------------------------
159
+ # Keypairs (launch requires exactly one; we never SSH)
160
+ # ---------------------------------------------------------------------------
161
def list_keypairs() -> list[dict]:
    """The ``keypairs`` list from ``/core/keypairs`` (empty when the payload is not a dict)."""
    payload = request_with_retries("/core/keypairs")
    if not isinstance(payload, dict):
        return []
    return payload.get("keypairs", [])
164
+
165
+
166
def _generate_throwaway_public_key() -> str:
    """Return an OpenSSH ed25519 public key whose private half is immediately discarded.

    Hyperstack requires a key_name at launch even though the box is bootstrapped via cloud-init
    and we never SSH. The keypair is generated inside a temporary directory that is removed on
    exit, so the private key does not outlive this call.

    Shells out to ``ssh-keygen``; a slim control-plane image may not ship it. Any OS-level
    failure to run it (missing binary, not executable, ...), a non-zero exit, a timeout, or a
    failure to read back the ``.pub`` file is reported as a clear, actionable
    ``HyperstackApiError`` (install ``openssh-client`` or pin ``HYPERSTACK_KEYPAIR_NAME``)
    instead of a bare ``OSError`` that looks like an API/stock failure.

    Returns:
        The public key line, stripped of surrounding whitespace.

    Raises:
        HyperstackApiError: if the key could not be generated or read.
    """
    with tempfile.TemporaryDirectory() as d:
        kp = f"{d}/k"
        try:
            subprocess.run(
                ["ssh-keygen", "-t", "ed25519", "-f", kp, "-N", "", "-q"], check=True, timeout=30
            )
            # Read inside the guarded region so a missing/unreadable .pub is reported the same way.
            with open(kp + ".pub", encoding="utf-8") as f:
                return f.read().strip()
        except (OSError, subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
            # OSError covers FileNotFoundError (no ssh-keygen) as well as PermissionError etc.
            raise HyperstackApiError(
                "could not generate a throwaway Hyperstack keypair: ssh-keygen is unavailable or "
                f"failed ({e}). Install openssh-client on the control plane (to provide ssh-keygen), "
                "or set HYPERSTACK_KEYPAIR_NAME to an existing keypair to skip generation."
            ) from e
191
+
192
+
193
def resolve_key_name(environment_name: str) -> str:
    """Return a keypair name usable to launch into ``environment_name``.

    Resolution order:

    1. ``HYPERSTACK_KEYPAIR_NAME`` if set (returned as-is, no API call).
    2. Any existing keypair bound to exactly ``environment_name``.
    3. The managed ``flash-managed-<environment_name>`` key, created on demand from a throwaway
       public key (private half discarded).

    Raises:
        HyperstackApiError: if listing or creating the keypair fails for a reason other than a
            confirmed "already exists" create race.
    """
    # Uses the module-level ``os`` import; no function-local import is needed.
    pinned = os.environ.get("HYPERSTACK_KEYPAIR_NAME")
    if pinned:
        return pinned
    existing = list_keypairs()

    def _env_name(k: dict) -> str:
        # ``environment`` may be a nested object with a ``name`` or a plain string.
        env = k.get("environment")
        return env.get("name") if isinstance(env, dict) else (env or "")

    # Reuse only a key bound to the EXACT target environment (Hyperstack keypairs are env-scoped, so
    # an env-less / other-env key may be rejected at launch). Otherwise fall through to creating the
    # env-scoped managed key below.
    for k in existing:
        if k.get("name") and _env_name(k) == environment_name:
            return k["name"]
    name = f"{_MANAGED_KEYPAIR}-{environment_name}"
    if any(k.get("name") == name for k in existing):
        return name
    try:
        request_with_retries(
            "/core/keypairs",
            method="POST",
            body={
                "name": name,
                "environment_name": environment_name,
                "public_key": _generate_throwaway_public_key(),
            },
            retries=0,
        )
    except HyperstackApiError as e:
        # Create race: two concurrent launches into the same env both see no managed key and both
        # POST. The loser gets an "already exists" rejection — that is SUCCESS, the env-scoped key now
        # exists (the winner created it), so return the name. Re-list to confirm the key is really
        # present before swallowing the error, so an UNRELATED 4xx (e.g. a bad public key / perms)
        # still surfaces rather than being silently treated as a benign duplicate.
        if _keypair_create_conflict(e) and any(
            k.get("name") == name for k in list_keypairs()
        ):
            return name
        raise
    return name
242
+
243
+
244
+ def _keypair_create_conflict(err: Exception) -> bool:
245
+ """True when a keypair POST failed because the name already exists (a benign create race), not
246
+ for some other reason. Hyperstack returns 409/400 with an 'already exists'/'duplicate' body."""
247
+ s = str(err).lower()
248
+ if "-> http 409" in s:
249
+ return True
250
+ return ("-> http 4" in s) and ("already exist" in s or "duplicate" in s or "in use" in s)
251
+
252
+
253
+ # ---------------------------------------------------------------------------
254
+ # Images (need a Docker-preinstalled, CUDA-12.8 Ubuntu image to run WORKER_IMAGE)
255
+ # ---------------------------------------------------------------------------
256
+ _image_cache: dict[str, str] = {}
257
+
258
+
259
+ def _image_cuda(name: str) -> float:
260
+ """Parse the CUDA version out of a Hyperstack image name (e.g. '... CUDA 12.8 ...' -> 12.8)."""
261
+ m = re.search(r"cuda (\d+(?:\.\d+)?)", name.lower())
262
+ return float(m.group(1)) if m else 0.0
263
+
264
+
265
def docker_image_for_region(region: str, min_cuda: str = "12.8") -> str:
    """Pick a Docker-preinstalled image name in ``region`` with a high enough CUDA version.

    The required CUDA is the larger of 12.8 and ``min_cuda``. Among images whose name contains
    "with docker" and whose parsed CUDA meets that floor, the one with the LOWEST CUDA wins;
    ties prefer a name containing "24.04". The choice is memoized per ``region``/required-CUDA
    pair in ``_image_cache``.

    Raises:
        HyperstackApiError: when no Docker image in the region meets the CUDA floor.
    """
    required = max(12.8, float(min_cuda))
    cache_key = f"{region}|{required}"
    try:
        return _image_cache[cache_key]
    except KeyError:
        pass

    payload = request_with_retries(f"/core/images?region={region}")
    entries = payload.get("images", []) if isinstance(payload, dict) else []
    # Entries are either image dicts or groups that nest their images under "images".
    flat: list[dict] = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        if "images" in entry:
            flat.extend(entry["images"])
        else:
            flat.append(entry)
    names = [image.get("name", "") for image in flat]

    # Docker-preinstalled ONLY (the cloud-init does not install Docker) AND CUDA >= required.
    docker_imgs = [label for label in names if "with docker" in label.lower()]
    fitting = [label for label in docker_imgs if _image_cuda(label) >= required - 1e-9]
    if not fitting:
        have = sorted({_image_cuda(label) for label in docker_imgs})
        raise HyperstackApiError(
            f"no Docker image with CUDA>={required} in {region} (available Docker CUDA: {have})"
        )

    def _rank(label: str) -> tuple:
        # Lowest CUDA first; among equals, names mentioning 24.04 sort ahead.
        return (_image_cuda(label), "24.04" not in label)

    chosen = min(fitting, key=_rank)
    _image_cache[cache_key] = chosen
    return chosen
297
+
298
+
299
+ # ---------------------------------------------------------------------------
300
+ # Virtual machines
301
+ # ---------------------------------------------------------------------------
302
def launch_vm(
    *, name: str, environment_name: str, image_name: str, flavor_name: str, key_name: str, user_data: str
) -> str:
    """Create a single VM and return its id as a string.

    The POST is sent with ``retries=0``: it is NON-IDEMPOTENT, so a blind retry after a timeout
    on a request Hyperstack actually accepted would double-provision.

    Raises:
        HyperstackApiError: when the request is rejected, or when the response carries no VM id
            under either ``instances`` or ``instance``.
    """
    # ``name`` is bounded <=60 by ``_instance.run_label_prefix`` (NOT truncated here) so the
    # stored name always equals the prefix ``sweep_orphans`` matches on.
    body = dict(
        name=name,
        environment_name=environment_name,
        image_name=image_name,
        flavor_name=flavor_name,
        key_name=key_name,
        count=1,
        assign_floating_ip=True,  # public IP for outbound (Docker pull + HF egress)
        user_data=user_data,
    )
    out = request_with_retries("/core/virtual-machines", method="POST", body=body, retries=0)
    fields = out if isinstance(out, dict) else {}
    vms = fields.get("instances")
    if not vms:
        # Some responses nest a single instance under "instance".
        single = fields.get("instance")
        vms = [single] if single else None
    first = vms[0] if vms else None
    if not first or not first.get("id"):
        raise HyperstackApiError(f"launch({flavor_name}@{environment_name}) returned no VM id: {out}")
    return str(first["id"])
330
+
331
+
332
def get_vm(vm_id: str) -> dict | None:
    """Fetch one VM's detail dict.

    Returns ``None`` when the API reports the VM as not found, or when the response has no dict
    under ``instance``. Any other ``HyperstackApiError`` is re-raised.
    """
    try:
        out = request_with_retries(f"/core/virtual-machines/{vm_id}")
    except HyperstackApiError as err:
        # Robust 404 check (NOT a bare "404" substring): Hyperstack VM ids are short integers, so a
        # transient 5xx on a VM whose id contains "404" must not be misread as "deleted".
        if not is_not_found(err):
            raise
        return None
    if not isinstance(out, dict):
        return None
    detail = out.get("instance")
    return detail if isinstance(detail, dict) else None
344
+
345
+
346
+ # Hyperstack paginates ``/core/virtual-machines``: a single GET returns only the FIRST page, so an
347
+ # account with more VMs than one page silently hides the rest. ``sweep_orphans`` lists every VM to
348
+ # reap orphans, so a missed page = a leaked, still-billing box. Walk every page (bounded so a buggy
349
+ # server that never shrinks the page can't loop forever) and concatenate.
350
+ _VM_PAGE_SIZE = 100
351
+ _VM_MAX_PAGES = 1000
352
+
353
+
354
def list_vms() -> list[dict]:
    """Collect every VM from ``/core/virtual-machines`` across all pages.

    Pages are requested with ``page``/``per_page`` starting at 1. Walking stops normally on an
    empty page, a short page, or a full page that contributes no unseen id. A page-fetch error
    propagates, and a malformed page or a walk that never stops raises ``HyperstackApiError``,
    so callers never receive a partial list presented as the complete fleet.
    """
    out: list[dict] = []
    seen: set[str] = set()
    # One page beyond _VM_MAX_PAGES is requested so a fleet that exactly fills the cap can be
    # told apart from a larger one: an empty extra page confirms completeness.
    for page in range(1, _VM_MAX_PAGES + 2):
        resp = request_with_retries(
            f"/core/virtual-machines?page={page}&per_page={_VM_PAGE_SIZE}"
        )
        # A response without an "instances" list is not a valid (possibly empty) page; returning
        # what was gathered so far could hide VMs, so fail loudly instead.
        well_formed = isinstance(resp, dict) and isinstance(resp.get("instances"), list)
        if not well_formed:
            raise HyperstackApiError(
                f"unexpected /core/virtual-machines response on page {page} "
                f"(no 'instances' list): {resp!r}"
            )
        batch = resp["instances"]
        if not batch:
            break  # valid empty page -> pagination finished
        fresh = 0
        for vm in batch:
            # De-dupe by stringified id. ``is not None`` keeps a falsy id such as 0 eligible;
            # entries without an id are always kept.
            has_id = isinstance(vm, dict) and vm.get("id") is not None
            if has_id:
                vid = str(vm["id"])
                if vid in seen:
                    continue
                seen.add(vid)
            out.append(vm)
            fresh += 1
        # A page with nothing new (e.g. a server echoing the same page) or a short page ends the walk.
        if fresh == 0 or len(batch) < _VM_PAGE_SIZE:
            break
    else:
        # Every requested page, including the one past the cap, was full and added new VMs:
        # the fleet is larger than the cap allows, so refuse to return a truncated list.
        raise HyperstackApiError(
            f"/core/virtual-machines pagination did not terminate within {_VM_MAX_PAGES} pages "
            f"(the page past the cap was still full at {_VM_PAGE_SIZE}/page, {len(out)} VMs "
            f"collected) — refusing to return a truncated fleet that an orphan sweep could read as "
            f"authoritative"
        )
    return out
421
+
422
+
423
def delete_vm(vm_id: str) -> bool:
    """Delete a VM, reporting success as a bool instead of raising.

    Returns ``False`` for a falsy ``vm_id`` (no request is made) and for any failure other than
    a conflict; returns ``True`` when the DELETE succeeds or is rejected as a conflict.
    """
    if not vm_id:
        return False
    try:
        request_with_retries(f"/core/virtual-machines/{vm_id}", method="DELETE", retries=2)
    except Exception as exc:
        # Hyperstack returns 409 when a prior delete request is still being processed: the VM is
        # already queued for teardown, so treat it as success. ``is_conflict`` is used rather
        # than a "409"/"conflict" substring test so an unrelated failure whose text merely
        # contains "409" (e.g. a "4090" GPU name) is still reported as a failure.
        if is_conflict(exc):
            return True
        logger.warning("hyperstack delete_vm(%s) failed: %s", vm_id, exc)
        return False
    return True
440
+
441
+
442
+ # ---------------------------------------------------------------------------
443
+ # Volumes (the weight cache). Region/environment-scoped block storage; created via the API, attached
444
+ # to a VM AFTER launch, then formatted+mounted in the VM by the cloud-init preamble. Block volumes
445
+ # are single-attach (one VM at a time) — concurrent same-region runs fall back to cold.
446
+ # ---------------------------------------------------------------------------
447
+ _CACHE_VOLUME_TYPE = "Cloud-SSD"
448
+
449
+
450
def list_volumes() -> list[dict]:
    """The ``volumes`` list from ``/core/volumes``; empty when the payload lacks such a list."""
    payload = request_with_retries("/core/volumes")
    if isinstance(payload, dict):
        found = payload.get("volumes")
        if isinstance(found, list):
            return found
    return []
455
+
456
+
457
def create_volume(name: str, environment_name: str, size_gb: int) -> dict:
    """POST a new ``_CACHE_VOLUME_TYPE`` volume and return the ``volume`` object of the response.

    ``size_gb`` is coerced with ``int()``. Returns an empty dict when the response is not a dict
    or carries no truthy ``volume`` entry.
    """
    body = dict(
        name=name,
        environment_name=environment_name,
        volume_type=_CACHE_VOLUME_TYPE,
        size=int(size_gb),
    )
    out = request_with_retries("/core/volumes", method="POST", body=body, retries=2)
    if not isinstance(out, dict):
        return {}
    return out.get("volume") or {}
467
+
468
+
469
def delete_volume(volume_id) -> bool:
    """Delete a volume by id (best-effort: never raises).

    Args:
        volume_id: The volume's id. ``None`` or an empty string means "no volume": nothing is
            sent and ``False`` is returned, mirroring ``delete_vm``'s guard, instead of issuing
            a DELETE against a bogus ``/core/volumes/None`` path. A numeric id of 0 is still
            attempted.

    Returns:
        ``True`` when the DELETE request succeeded, ``False`` otherwise (the failure is logged).
    """
    if volume_id is None or volume_id == "":
        return False
    try:
        request_with_retries(f"/core/volumes/{volume_id}", method="DELETE", retries=2)
        return True
    except Exception as exc:
        logger.warning("hyperstack delete_volume(%s) failed: %s", volume_id, exc)
        return False
477
+
478
+
479
def attach_volume(vm_id: str, volume_id) -> bool:
    """Attach a volume to a VM (best-effort: never raises).

    The cloud-init preamble then formats+mounts the device; if this fails the run still proceeds
    COLD (the preamble degrades when no device appears).

    Args:
        vm_id: Target VM id. A falsy value short-circuits to ``False``.
        volume_id: Volume id to attach. ``None`` or an empty string (e.g. an ``ensure_volume``
            result with no id) short-circuits to ``False`` rather than posting
            ``{"volume_ids": [None]}``. A numeric id of 0 is still attempted.

    Returns:
        ``True`` when the attach request succeeded, ``False`` otherwise (request failures are
        logged).
    """
    if not vm_id or volume_id is None or volume_id == "":
        return False
    try:
        request_with_retries(
            f"/core/virtual-machines/{vm_id}/attach-volumes",
            method="POST",
            body={"volume_ids": [volume_id]},
            retries=2,
        )
        return True
    except Exception as exc:
        logger.warning("hyperstack attach_volume(vm=%s, vol=%s) failed: %s", vm_id, volume_id, exc)
        return False
493
+
494
+
495
def cache_volume_name(base: str, region: str) -> str:
    """Per-region physical name for the logical cache volume ``base``.

    Hyperstack enforces GLOBAL volume-name uniqueness (a second create of the same name in
    another environment is rejected with HTTP 400 "This name is not available"), so the region
    is appended to keep each region's volume distinct. The result is ``<base>-<region>``,
    lower-cased.
    """
    combined = "{}-{}".format(base, region)
    return combined.lower()
506
+
507
+
508
def ensure_volume(name: str, environment_name: str, size_gb: int) -> object:
    """Create-if-absent the cache volume ``name`` in ``environment_name`` and return its id.

    Idempotent: an existing volume with the same name in that environment is reused. ``name``
    MUST be globally unique — use ``cache_volume_name(base, region)`` (Hyperstack rejects a
    duplicate name across environments).

    A listed volume's ``environment`` is accepted either as a nested object with a ``name`` or
    as a plain string (the same two shapes ``resolve_key_name`` tolerates for keypairs), and
    non-dict list entries are skipped, so an unexpected listing shape does not raise
    ``AttributeError``.

    Returns:
        The volume's ``id`` as given by the API, or ``None`` when the matched/created volume
        object carries no ``id``.

    Raises:
        HyperstackApiError: propagated from listing or creating the volume.
    """
    for v in list_volumes():
        if not isinstance(v, dict) or v.get("name") != name:
            continue
        env = v.get("environment")
        env_name = env.get("name") if isinstance(env, dict) else env
        if env_name == environment_name:
            return v.get("id")
    return create_volume(name, environment_name, size_gb).get("id")
516
+
517
+
518
def delete_vms(vm_ids: list[str]) -> list[str]:
    """Delete each VM in ``vm_ids`` independently and return the ids that were deleted.

    Falsy ids are skipped without a request. Only ids for which ``delete_vm`` returned a truthy
    result are included (stringified), so a partial failure never reports a VM that was not
    torn down.
    """
    reaped: list[str] = []
    for raw in vm_ids:
        if not raw:
            continue
        vid = str(raw)
        if delete_vm(vid):
            reaped.append(vid)
    return reaped
@@ -0,0 +1,17 @@
1
+ """Hyperstack credential handling (operator-side), mirroring the RunPod/Lambda auth modules.
2
+
3
+ The Hyperstack REST client authenticates via the ``HYPERSTACK_API_KEY`` environment variable, set
4
+ by the **operator** on the control-plane host. Env-only by design. Hyperstack presents the key in a
5
+ bare ``api_key`` header (not ``Authorization: Bearer``).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from .._auth import load_provider_key
11
+
12
+ _ENV_VAR = "HYPERSTACK_API_KEY"
13
+
14
+
15
def load_api_key() -> str | None:
    """Return whatever ``load_provider_key`` yields for the ``HYPERSTACK_API_KEY`` variable."""
    api_key = load_provider_key(_ENV_VAR)
    return api_key
@@ -0,0 +1,29 @@
1
+ """Hyperstack's GPU classes (its rows of the shared GPU table).
2
+
3
+ The class table is provider-agnostic and lives in ``providers/base.py``. This module carves out
4
+ Hyperstack's rows (``gpu_classes()`` == every class with a ``hyperstack_name``) and owns the
5
+ friendly-name -> Hyperstack flavor translation.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from flash.providers.base import GpuClass, UnsupportedGpuError, get_gpu_info, providers_for
11
+
12
+ __all__ = ["flavor_for", "gpu_classes"]
13
+
14
+
15
def gpu_classes() -> list[GpuClass]:
    """Every entry of the shared ``GPU_INFO`` table that has a truthy ``hyperstack_name``."""
    from flash.providers.base import GPU_INFO

    supported = []
    for gpu in GPU_INFO.values():
        if gpu.hyperstack_name:
            supported.append(gpu)
    return supported
20
+
21
+
22
def flavor_for(name: str) -> str:
    """Translate a friendly GPU class name into its Hyperstack flavor name (e.g. 'n3-L40x1').

    Raises:
        UnsupportedGpuError: when the class has no ``hyperstack_name``; the message lists the
            providers returned by ``providers_for(name)``.
    """
    info = get_gpu_info(name)
    if info.hyperstack_name:
        return info.hyperstack_name
    raise UnsupportedGpuError(
        f"{info.name} is not available on Hyperstack (providers: {', '.join(providers_for(name))})"
    )