freesolo-flash-dev 0.2.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flash/__init__.py +29 -0
- flash/_channel.py +23 -0
- flash/_fileio.py +35 -0
- flash/_logging.py +49 -0
- flash/_update_check.py +266 -0
- flash/catalog.py +253 -0
- flash/cli/__init__.py +1 -0
- flash/cli/main/__init__.py +227 -0
- flash/cli/main/__main__.py +6 -0
- flash/cli/main/commands.py +636 -0
- flash/cli/main/envpush.py +317 -0
- flash/cli/main/render.py +599 -0
- flash/cli/main/training_doc.py +455 -0
- flash/client/__init__.py +14 -0
- flash/client/config.py +70 -0
- flash/client/http.py +372 -0
- flash/client/runtime_secrets.py +69 -0
- flash/client/specs.py +20 -0
- flash/cost/__init__.py +16 -0
- flash/cost/analytical.py +175 -0
- flash/cost/facts.py +114 -0
- flash/cost/spec.py +113 -0
- flash/cost/types.py +158 -0
- flash/engine/__init__.py +6 -0
- flash/engine/accounting.py +36 -0
- flash/engine/chalk_kernels.py +116 -0
- flash/engine/multiturn_rollout.py +780 -0
- flash/engine/recipe.py +86 -0
- flash/engine/vram.py +603 -0
- flash/engine/worker/__init__.py +2916 -0
- flash/engine/worker/__main__.py +4 -0
- flash/engine/worker/kernel_warmup.py +400 -0
- flash/engine/worker/lora.py +796 -0
- flash/engine/worker/packing.py +366 -0
- flash/engine/worker/perf.py +1048 -0
- flash/envs/__init__.py +10 -0
- flash/envs/adapter/__init__.py +883 -0
- flash/envs/adapter/rubric.py +222 -0
- flash/envs/base.py +52 -0
- flash/envs/registry.py +62 -0
- flash/mcp/__init__.py +1 -0
- flash/mcp/server.py +85 -0
- flash/providers/__init__.py +59 -0
- flash/providers/_auth.py +24 -0
- flash/providers/_http.py +230 -0
- flash/providers/_instance.py +416 -0
- flash/providers/_instance_bootstrap.py +517 -0
- flash/providers/_poll.py +311 -0
- flash/providers/allocator.py +193 -0
- flash/providers/base.py +431 -0
- flash/providers/hyperstack/__init__.py +127 -0
- flash/providers/hyperstack/api.py +522 -0
- flash/providers/hyperstack/auth.py +17 -0
- flash/providers/hyperstack/gpus.py +29 -0
- flash/providers/hyperstack/jobs/__init__.py +632 -0
- flash/providers/hyperstack/jobs/builders.py +122 -0
- flash/providers/hyperstack/preflight.py +23 -0
- flash/providers/hyperstack/pricing.py +26 -0
- flash/providers/hyperstack/train.py +25 -0
- flash/providers/lambdalabs/__init__.py +139 -0
- flash/providers/lambdalabs/api.py +261 -0
- flash/providers/lambdalabs/auth.py +18 -0
- flash/providers/lambdalabs/gpus.py +29 -0
- flash/providers/lambdalabs/jobs/__init__.py +724 -0
- flash/providers/lambdalabs/jobs/builders.py +118 -0
- flash/providers/lambdalabs/preflight.py +27 -0
- flash/providers/lambdalabs/pricing.py +51 -0
- flash/providers/lambdalabs/train.py +27 -0
- flash/providers/preflight.py +55 -0
- flash/providers/realized.py +80 -0
- flash/providers/runpod/__init__.py +130 -0
- flash/providers/runpod/api.py +186 -0
- flash/providers/runpod/auth.py +37 -0
- flash/providers/runpod/cost.py +57 -0
- flash/providers/runpod/gpus.py +46 -0
- flash/providers/runpod/jobs.py +956 -0
- flash/providers/runpod/keys.py +139 -0
- flash/providers/runpod/preflight.py +30 -0
- flash/providers/runpod/preload.py +915 -0
- flash/providers/runpod/pricing.py +18 -0
- flash/providers/runpod/slots.py +79 -0
- flash/providers/runpod/train/__init__.py +150 -0
- flash/providers/runpod/train/deps.py +395 -0
- flash/providers/runpod/train/endpoints.py +820 -0
- flash/py.typed +0 -0
- flash/runner/__init__.py +686 -0
- flash/runner/checkpoints.py +82 -0
- flash/runner/deploy.py +422 -0
- flash/runner/lifecycle.py +672 -0
- flash/schema/__init__.py +375 -0
- flash/schema/fields.py +331 -0
- flash/serve/__init__.py +1 -0
- flash/serve/deploy.py +326 -0
- flash/serve/pricing.py +60 -0
- flash/server/__init__.py +1 -0
- flash/server/__main__.py +20 -0
- flash/server/app.py +961 -0
- flash/server/auth.py +263 -0
- flash/server/billing.py +124 -0
- flash/server/checkpoints.py +110 -0
- flash/server/db.py +160 -0
- flash/server/environment_registry.py +102 -0
- flash/server/envs.py +360 -0
- flash/server/reconcile.py +163 -0
- flash/server/run_registry.py +150 -0
- flash/spec.py +333 -0
- freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
- freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
- freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
- freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
- freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,632 @@
|
|
|
1
|
+
"""Hyperstack run lifecycle: stock walk -> launch VM -> HF-artifact poll -> guaranteed delete.
|
|
2
|
+
|
|
3
|
+
The Hyperstack equivalent of ``providers/lambdalabs/jobs``. Hyperstack rents a single-GPU VM from a
|
|
4
|
+
region with stock, ships the shared cloud-init ``user_data`` (runs ``WORKER_IMAGE`` via Docker), and
|
|
5
|
+
detects completion from the worker's HF artifacts. Cost-safety: a launched VM is ALWAYS deleted —
|
|
6
|
+
the runner ``finally``, the poll deadline, cancel, and ``sweep_orphans`` each guarantee it. Like
|
|
7
|
+
Lambda, there is no in-box self-destruct (no instance-scoped key); ``sweep_orphans`` at startup is
|
|
8
|
+
the crash backstop.
|
|
9
|
+
|
|
10
|
+
Pure dataclasses + builders live in ``.builders`` and are re-exported. Lifecycle functions and the
|
|
11
|
+
constants tests monkeypatch stay in this ``__init__``.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import contextlib
|
|
17
|
+
import json
|
|
18
|
+
import time
|
|
19
|
+
from collections.abc import Callable
|
|
20
|
+
|
|
21
|
+
from flash._logging import get_logger
|
|
22
|
+
from flash.providers._poll import (
|
|
23
|
+
PollErrorTracker,
|
|
24
|
+
heartbeat_progress_ts,
|
|
25
|
+
make_say,
|
|
26
|
+
preload_box_reap_due,
|
|
27
|
+
surface_heartbeat,
|
|
28
|
+
)
|
|
29
|
+
from flash.providers.base import GPU_INFO, PollResult, min_cuda_modern
|
|
30
|
+
from flash.providers.hyperstack import api as hs_api
|
|
31
|
+
from flash.providers.hyperstack.jobs.builders import (
|
|
32
|
+
HyperstackInstance,
|
|
33
|
+
HyperstackJobHandle,
|
|
34
|
+
build_payload,
|
|
35
|
+
build_user_data,
|
|
36
|
+
instance_label,
|
|
37
|
+
run_label_prefix,
|
|
38
|
+
)
|
|
39
|
+
from flash.providers.runpod.jobs import make_hf_heartbeat_reader, make_hf_text_reader
|
|
40
|
+
|
|
41
|
+
logger = get_logger(__name__)
|
|
42
|
+
|
|
43
|
+
# How long a VM may sit in a non-active state (provisioning) before we give up and retry.
LOAD_TIMEOUT_S = 900.0
# Cold-start (Docker pull + pip + model download) emits no heartbeat -> larger setup grace until a
# training heartbeat; tighter window after.
SETUP_GRACE_S = 3000.0
STALL_AFTER_S = 1500.0
# Added on top of the run's max wall-clock seconds to form the client-side poll deadline.
PROVISION_GRACE_S = 3000.0

# Heartbeat stages that still count as setup: a heartbeat in one of these stages does not arm the
# tighter ``STALL_AFTER_S`` training window.
_SETUP_HEARTBEAT_STAGES = frozenset(
    {"boot", "sft_start", "rl_start", "sft_model_load", "rl_train_start"}
)

# Hyperstack VM statuses that mean "the box is gone / will not progress". Immutable (like
# ``_SETUP_HEARTBEAT_STAGES``) so a membership-only constant cannot be mutated by accident.
_DEAD_STATES = frozenset({"ERROR", "FAILED", "DELETING", "DELETED", "TERMINATED"})
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def usable_instances(gpu_class: str, force: bool = False) -> list[HyperstackInstance]:
    """Build one launch candidate per region whose flavor for ``gpu_class`` has stock right now.

    Pricing is per flavor rather than per region, so every returned candidate carries the same
    hourly rate; only ``region`` / ``environment`` differ. An empty list means Hyperstack has no
    capacity for this GPU class at the moment.

    Args:
        gpu_class: Managed GPU class; must be a key of ``GPU_INFO`` (``KeyError`` otherwise).
        force: Forwarded to ``hs_api.regions_with_stock`` to bypass the ``/core/flavors`` cache,
            so the in-launch refresh can see newly restocked regions.

    Returns:
        The candidates, in the order ``regions_with_stock`` yields the regions.
    """
    from flash.providers.hyperstack.gpus import flavor_for
    from flash.providers.hyperstack.pricing import hourly_rate

    gpu_info = GPU_INFO[gpu_class]
    flavor_name = flavor_for(gpu_class)
    usd_per_hour = hourly_rate(gpu_class)

    stocked: list[HyperstackInstance] = []
    for stocked_region in hs_api.regions_with_stock(flavor_name, force=force):
        stocked.append(
            HyperstackInstance(
                gpu=gpu_class,
                flavor=flavor_name,
                region=stocked_region,
                environment=hs_api.environment_for_region(stocked_region),
                vram_gb=gpu_info.vram_gb,
                price_usd_hr=usd_per_hour,
            )
        )
    return stocked
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _launch_rejection_is_clean(err: Exception) -> bool:
    """Tell whether a launch error is a definitive rejection that created no VM.

    The error text is matched against the shared RestClient's fast-fail format for a 4xx
    (``... -> HTTP 4xx: ...``). HTTP 429 is excluded: like a 5xx/timeout (``failed after N
    attempts``) or an accepted-but-no-id response (``returned no VM id``), it is ambiguous —
    Hyperstack may have created a billed VM — so the caller must not issue another launch.

    Returns:
        True only for a non-429 4xx rejection, i.e. when walking to the next region is safe.
    """
    message = str(err)
    if "HTTP 429" in message:
        # Rate limiting is never treated as a clean rejection.
        return False
    return "-> HTTP 4" in message
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def launch_and_submit(
    spec,
    seed: int,
    instances: list[HyperstackInstance],
    attempt: int = 0,
    log=None,
    runtime_secrets: dict | None = None,
    mode: str | None = None,
    models: list | None = None,
) -> HyperstackJobHandle:
    """Launch the first region that accepts the job; walk regions on a stock rejection, refresh once.

    ``mode="preload"`` + ``models`` launches a download-only warm (the bootstrap pulls the models into
    the mounted cache volume and exits — no worker). A preload therefore REQUIRES a weight-cache
    volume (``spec.gpu.network_volume``): the cold user_data carries no mode/models, so launching a
    preload cold would boot a full, billed training bootstrap and warm nothing.

    Args:
        spec: Run spec; ``spec.gpu.type``, ``spec.gpu.network_volume`` (optional),
            ``spec.gpu.network_volume_gb`` (optional) and ``spec.run_id`` are read here.
        seed: Seed forwarded to the payload builder and the instance label.
        instances: Region candidates, tried in order (each region at most once).
        attempt: Retry attempt index, forwarded to the payload and the instance label.
        log: Optional log sink wrapped by ``make_say``.
        runtime_secrets: Optional secrets forwarded to ``build_payload``.
        mode: ``"preload"`` for a download-only warm, else ``None``.
        models: Models to preload (only meaningful with ``mode="preload"``).

    Returns:
        The handle of the launched VM.

    Raises:
        hs_api.HyperstackApiError: no candidates; a preload without a cache volume; an ambiguous
            launch failure (after a best-effort reap by run name); or every region rejected.
    """
    say = make_say(log)
    if not instances:
        raise hs_api.HyperstackApiError(
            f"no Hyperstack stock for {spec.gpu.type} (no region advertises the flavor)"
        )
    # Weight cache: when wanted (runner-assigned network_volume), HF_HOME points at a block volume
    # formatted+mounted at /mnt/flash-weights (region-independent path) and bind-mounted into the
    # container; the volume is created-if-absent per environment and attached AFTER launch. If the
    # volume can't be ensured we launch cold; if attach fails the cloud-init preamble degrades to cold
    # (no device appears). Block volumes are single-attach, so a concurrent same-region run runs cold.
    # ``gpu=`` selects the per-GPU worker image (dev: worker_image_for_gpu via hyperstack_image).
    cache_name = getattr(spec.gpu, "network_volume", None)
    if mode == "preload" and not cache_name:
        # Without a cache volume only the cold user_data exists, and it carries no mode/models:
        # the box would run a full training bootstrap on paid GPU and warm nothing. Fail before
        # any launch instead.
        raise hs_api.HyperstackApiError(
            "preload requires a weight-cache volume (spec.gpu.network_volume is unset)"
        )
    cold_user_data = build_user_data(
        build_payload(spec, seed, attempt, runtime_secrets=runtime_secrets), gpu=spec.gpu.type
    )
    cache_user_data = None
    cache_gb = 0
    if cache_name:
        from flash.runner import WEIGHT_CACHE_VOLUME_GB
        from flash.spec import _volume_gb

        # Honor the runner-assigned size (mirrors RunPod), defaulting to the standard cache size. Parse
        # tolerantly via _volume_gb: this best-effort weight-cache path must never crash the run on a
        # non-int / stale / hand-edited size ("0", "", "abc", bool) — it just falls back to the default.
        cache_gb = _volume_gb(getattr(spec.gpu, "network_volume_gb", None), default=WEIGHT_CACHE_VOLUME_GB)
        cache_user_data = build_user_data(
            build_payload(
                spec, seed, attempt, runtime_secrets=runtime_secrets,
                cache_host_mount="/mnt/flash-weights", cache_block_device=True,
                mode=mode, models=models,
            ),
            gpu=spec.gpu.type,
        )
    name = instance_label(spec.run_id, seed, attempt)

    tried_regions: set[str] = set()
    candidates = list(instances)
    refreshed = False
    last_err: Exception | None = None

    def refresh_once(gpu: str) -> None:
        """One forced stock re-fetch when the walk is exhausted (the alloc cache is ~45s stale).

        NO-OP in preload mode: ``warm_instances`` pins each preload launch to ONE specific target
        region (``instances=[candidate]``) and reports that exact region as warmed. Refreshing to a
        DIFFERENT region here would warm region B while the caller reports the target region A as
        warmed (its cache actually still cold). A preload that can't run in its target region must
        FAIL that region (the walk exhausts -> raise), never silently warm another.
        """
        nonlocal refreshed, candidates
        if mode == "preload":
            return
        if not candidates and not refreshed:
            refreshed = True
            candidates = [
                c for c in usable_instances(gpu, force=True) if c.region not in tried_regions
            ]

    while candidates:
        inst = candidates.pop(0)
        if inst.region in tried_regions:
            continue
        tried_regions.add(inst.region)
        # Pre-launch resolution: pick a boot image whose host CUDA covers this GPU class's floor
        # (Blackwell needs 13) and the SSH key. These run BEFORE any non-idempotent launch, so a
        # failure here (e.g. the region advertises stock but has no qualifying CUDA image) created NO
        # VM — it is a CLEAN region skip, never an ambiguous phantom. Walk to the next region.
        try:
            image = hs_api.docker_image_for_region(inst.region, min_cuda=min_cuda_modern(inst.gpu))
            key_name = hs_api.resolve_key_name(inst.environment)
        except hs_api.HyperstackApiError as e:
            last_err = e
            say(f"region {inst.region} ({inst.gpu} {inst.flavor}) unusable (no boot image/key): {e}")
            refresh_once(inst.gpu)
            continue
        # Ensure the cache volume exists in this environment (create-if-absent); on success use the
        # cache user_data and attach the volume after launch. Any failure -> launch cold here — EXCEPT
        # in preload mode, where the cold user_data carries no mode/models, so a cold fallback would
        # boot a full training bootstrap (GPU billing, timeout) and warm nothing. There we SKIP the
        # region instead and let the walk try the next one (failing if none can host the cache).
        vol_id, user_data = None, cold_user_data
        cache_unavailable_reason = None
        if cache_name and not hs_api.region_supports_cache(inst.region):
            cache_unavailable_reason = "weight cache not supported in region"
        elif cache_name:
            try:
                # Per-region physical name (Hyperstack volume names are GLOBALLY unique — a bare
                # cache_name can exist in only one environment account-wide).
                vol_name = hs_api.cache_volume_name(cache_name, inst.region)
                vol_id = hs_api.ensure_volume(vol_name, inst.environment, cache_gb) or None
                if vol_id is not None:
                    user_data = cache_user_data
                else:
                    cache_unavailable_reason = "ensure_volume returned no id"
            except Exception as e:
                # Deliberately broad: the weight cache is best-effort and must never fail the run.
                vol_id = None
                cache_unavailable_reason = str(e)
        if cache_name and cache_unavailable_reason is not None:
            if mode == "preload":
                say(f"weight cache unavailable in {inst.region} ({cache_unavailable_reason}); "
                    "skipping (preload needs it)")
                last_err = hs_api.HyperstackApiError(
                    f"preload: weight cache unavailable in {inst.region} ({cache_unavailable_reason})"
                )
                refresh_once(inst.gpu)
                continue
            say(f"weight cache unavailable in {inst.region} ({cache_unavailable_reason}); launching cold")
        try:
            vm_id = hs_api.launch_vm(
                name=name,
                environment_name=inst.environment,
                image_name=image,
                flavor_name=inst.flavor,
                key_name=key_name,
                user_data=user_data,
            )
        except hs_api.HyperstackApiError as e:
            last_err = e
            if not _launch_rejection_is_clean(e):
                # Ambiguous failure (timeout / 5xx / 429 / accepted-but-no-id): Hyperstack may have
                # created a billed VM whose id we never got. Do NOT launch another in this attempt —
                # reconcile any phantom by run-name and stop; the runner's retry (+ gc /
                # sweep_orphans) re-provisions cleanly.
                say(f"ambiguous launch failure in {inst.region}: {e}; reconciling + retrying fresh")
                with contextlib.suppress(Exception):
                    terminate_run_instances(spec.run_id)
                raise hs_api.HyperstackApiError(
                    f"ambiguous Hyperstack launch failure (possible phantom reaped): {e}"
                ) from e
            say(f"region {inst.region} ({inst.gpu} {inst.flavor}) rejected: {e}")
            refresh_once(inst.gpu)
            continue
        # Attach the cache volume to the freshly-launched VM (best-effort: the cloud-init preamble
        # degrades to a cold run if no device appears, e.g. attach failed or the volume is busy on
        # another VM — block volumes are single-attach).
        if vol_id is not None:
            attached = False
            with contextlib.suppress(Exception):
                attached = hs_api.attach_volume(vm_id, vol_id)
            # A training run survives a failed attach (the preamble runs cold), but a PRELOAD box can't:
            # with no device it would refuse to warm ephemeral disk (the sentinel check) and just burn
            # paid GPU until the wall cap. Treat a failed preload attach as a launch failure — tear the
            # VM down and walk to the next region (failing the warm if none can attach the cache).
            if not attached and mode == "preload":
                say(f"preload: cache attach failed in {inst.region} (vol busy/absent); "
                    "terminating box and trying next region")
                with contextlib.suppress(Exception):
                    terminate_run_instances(spec.run_id)
                last_err = hs_api.HyperstackApiError(
                    f"preload: cache volume attach failed in {inst.region}"
                )
                refresh_once(inst.gpu)
                continue
            if not attached:
                # Surface the degradation instead of leaving a silently cold run.
                say(f"weight cache attach failed in {inst.region} (vol busy/absent); running cold")
        say(
            f"launched hyperstack vm {vm_id}: {inst.gpu} {inst.flavor} "
            f"${inst.price_usd_hr:.2f}/hr in {inst.region} attempt={attempt} seed={seed}"
        )
        return HyperstackJobHandle(
            vm_id=vm_id,
            flavor=inst.flavor,
            region=inst.region,
            name=name,
            gpu=inst.gpu,
            hourly_usd=inst.price_usd_hr,
            attempt=attempt,
            started_ts=time.time(),
        )
    # Walk exhausted: every tried region was skipped or cleanly rejected (ambiguous launch failures
    # already reconciled and raised above). Still best-effort reap any VM under our run name that no
    # handle owns before giving up (the post-run gc / sweep_orphans are the backstop, but this
    # closes the window now).
    with contextlib.suppress(Exception):
        terminate_run_instances(spec.run_id)
    raise hs_api.HyperstackApiError(
        f"all {len(tried_regions)} Hyperstack region(s) rejected the {spec.gpu.type} launch "
        f"(no stock): {last_err}"
    )
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
# Module-level alias for the shared HF text-reader factory used by the functions below.
# NOTE(review): presumably kept as a module attribute so it can be swapped in one place
# (e.g. monkeypatched by tests) — confirm.
_make_hf_file_reader = make_hf_text_reader
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def _failure_detail(hf_repo: str, prefix: str, phase: str, marker: dict | None) -> str:
    """Assemble the most useful root-cause text available from the run's HF artifacts.

    Hyperstack exposes no VM console API, so the box's ``hyperstack_boot.log`` (pushed to HF by
    the cloud-init host uploader) is the only window into a pre-worker failure (docker/GPU not
    ready, image-pull failure).

    Sections, in order and each only when present: the marker's ``error`` field, the last 2000
    characters of ``error_<phase>.txt``, and the last 3000 characters of ``hyperstack_boot.log``.

    Returns:
        The sections joined by newlines, or a fixed fallback sentence when none exist.
    """
    sections: list[str] = []

    marker_error = marker.get("error") if marker else None
    if marker_error:
        sections.append(str(marker_error))

    # Both artifact reads are forced so a cached miss cannot hide a freshly uploaded file.
    read_error_file = _make_hf_file_reader(hf_repo, f"{prefix}/error_{phase}.txt")
    error_text = read_error_file(force=True)
    if error_text:
        sections.append(f"--- error_{phase}.txt ---\n{error_text[-2000:]}")

    read_boot_log = _make_hf_file_reader(hf_repo, f"{prefix}/hyperstack_boot.log")
    boot_text = read_boot_log(force=True)
    if boot_text:
        sections.append(f"--- hyperstack_boot.log (host) ---\n{boot_text[-3000:]}")

    detail = "\n".join(sections)
    if detail:
        return detail
    return "hyperstack worker terminated without a DONE sentinel"
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def poll_hs_job(
    handle: HyperstackJobHandle,
    spec,
    seed: int,
    log=None,
    interval_s: float = 15.0,
    heartbeat_reader=None,
    setup_grace_s: float = SETUP_GRACE_S,
    stall_after_s: float = STALL_AFTER_S,
    deadline_s: float | None = None,
) -> PollResult:
    """Poll VM status + HF artifacts to a terminal state (cf. lambda.jobs.poll_lambda_job).

    Args:
        handle: Handle of the launched VM (id, attempt, rate, launch timestamp).
        spec: Run spec; ``spec.train.hf_repo``, ``spec.phase`` and ``spec.run_id`` locate the
            artifacts under ``<phase>/<run_id>/seed<seed>``.
        seed: Seed whose artifacts are polled.
        log: Optional log sink wrapped by ``make_say``.
        interval_s: Sleep between poll ticks, also handed to ``PollErrorTracker``.
        heartbeat_reader: Optional reader for the worker heartbeat.
        setup_grace_s: Max silence tolerated before any training heartbeat was seen.
        stall_after_s: Max silence tolerated once a training heartbeat was seen.
        deadline_s: Optional overall limit, measured from the VM's launch.

    Returns:
        A ``PollResult``: success with cost-stamped metrics, or a failure tagged
        ``job_failed`` / ``job_preempted`` / ``stalled`` / ``poll_error``.

    Malformed artifacts never raise out of the poller: a ``metrics.json`` that is not a JSON
    object yields ``job_failed``, and an attempt marker that is not a JSON object is ignored.
    """
    say = make_say(log)
    # Single source of truth for "when did this VM launch". started_ts is a non-Optional float that
    # HyperstackJobHandle.from_dict coerces to 0.0 when MISSING (old/corrupt handle), so 0.0 means
    # "unknown launch" (a real launch is a large epoch ts, never 0.0). Fall back to now so EVERY use
    # below -- the load/stall clocks AND done_is_fresh / finish_ok's wall+cost stamping -- treats a
    # recovered corrupt handle consistently, instead of billing/comparing from the 1970 epoch.
    launch_ts = handle.started_ts or time.time()
    hf_repo = spec.train.hf_repo
    prefix = f"{spec.phase}/{spec.run_id}/seed{seed}"
    done_reader = _make_hf_file_reader(hf_repo, f"{prefix}/DONE")
    marker_reader = _make_hf_file_reader(
        hf_repo, f"{prefix}/hyperstack_attempt{handle.attempt}.json", min_interval_s=60.0
    )
    metrics_reader = _make_hf_file_reader(hf_repo, f"{prefix}/metrics.json")

    def load_json_object(raw) -> dict | None:
        """Parse ``raw`` as JSON; None for empty input, invalid JSON, or a non-object payload."""
        if not raw:
            return None
        try:
            parsed = json.loads(raw)
        except (TypeError, ValueError):
            return None
        return parsed if isinstance(parsed, dict) else None

    def finish_ok(done_content: str | None = None) -> PollResult:
        raw = metrics_reader(force=True)
        if raw is None:
            return PollResult(False, failure="job_failed", detail="DONE without metrics.json")
        metrics = load_json_object(raw)
        if metrics is None:
            # Corrupt / non-object metrics must surface as a structured failure, not as an
            # exception escaping the poll loop (or being swallowed by a caller's suppress).
            return PollResult(
                False, failure="job_failed", detail="DONE with unparseable metrics.json"
            )
        end_ts = time.time()
        if done_content:
            try:
                done_ts = float(done_content.strip())
                # Only trust the worker's own finish time when it falls inside this VM's life.
                if launch_ts <= done_ts <= end_ts:
                    end_ts = done_ts
            except ValueError:
                pass
        wall_h = (end_ts - launch_ts) / 3600.0
        metrics["cost_usd"] = round(wall_h * handle.hourly_usd, 6)
        notes = metrics.get("notes") if isinstance(metrics.get("notes"), dict) else {}
        notes.update(
            {
                "provider": "hyperstack",
                "hyperstack_rate_usd_hr": handle.hourly_usd,
                "hyperstack_gpu": handle.gpu,
                "hyperstack_flavor": handle.flavor,
                "hyperstack_region": handle.region,
            }
        )
        metrics["notes"] = notes
        return PollResult(True, metrics=metrics)

    def done_is_fresh(content: str) -> bool:
        # launch_ts (not handle.started_ts) so an unknown-launch (0.0) handle doesn't accept every
        # leftover DONE as fresh.
        try:
            return float(content.strip()) > launch_ts - 120.0
        except ValueError:
            return False

    def finish_from_ok_marker() -> PollResult:
        # ok marker => the worker finished (it wrote metrics before the marker) even if DONE is STALE
        # (a retry hit the already-complete path). Treat ok-marker + metrics as terminal success.
        d = done_reader(force=True)
        return finish_ok(d if (d is not None and done_is_fresh(d)) else None)

    def fail_from_marker(marker: dict | None) -> PollResult:
        from flash.providers.runpod.jobs import worker_flagged_retriable

        # Host failure marker sets retriable=True; the worker stamps it for a RetriableInfraError.
        retriable = bool(marker and marker.get("retriable")) or worker_flagged_retriable(heartbeat_reader)
        return PollResult(
            False,
            failure="job_preempted" if retriable else "job_failed",
            detail=_failure_detail(hf_repo, prefix, spec.phase, marker),
        )

    def terminal_artifact_result() -> PollResult | None:
        # One forced read of the worker's terminal HF artifacts (DONE / attempt ok-marker). Returns a
        # terminal PollResult when the worker definitively finished or errored, else None. Used both
        # when the host is dead AND before returning a recovered client-side-deadline `stalled`: a
        # control-plane outage longer than max_wall+grace must not discard a seed the worker actually
        # completed during the downtime (the deadline check would otherwise fire before any DONE read).
        d = done_reader(force=True)
        if d is not None and done_is_fresh(d):
            return finish_ok(d)
        m = load_json_object(marker_reader(force=True))
        if m is None:
            return None
        if m.get("ok"):
            return finish_from_ok_marker()  # finished (stale DONE ok)
        return fail_from_marker(m)

    poll_errors = PollErrorTracker(say, interval_s)
    # Seed the load/stall clocks from the VM's LAUNCH (launch_ts), not this poll's start: on a
    # delayed reattach after a control-plane restart the box has been billing since launch, so a
    # still-booting VM that already blew LOAD_TIMEOUT_S must fail over NOW instead of getting another
    # full window. launch_ts already maps an unknown-launch (0.0) handle to now (see above), so a
    # fresh launch is a no-op and a corrupt handle won't peg the clocks to the epoch.
    start = launch_ts
    last_status = None
    last_hb_key = None
    last_progress = start
    became_active = False
    seen_training_hb = False
    missing_streak = 0
    while True:
        if deadline_s is not None and time.time() - start > deadline_s:
            # A recovered run can blow a launch-anchored deadline on the FIRST reattach tick (the
            # outage lasted past max_wall+grace). Read terminal artifacts once before giving up: if
            # the worker finished/errored during the downtime, persist that instead of retrying.
            terminal = terminal_artifact_result()
            if terminal is not None:
                return terminal
            return PollResult(False, failure="stalled", detail="client-side deadline exceeded")
        try:
            vm = hs_api.get_vm(handle.vm_id)
            poll_errors.reset()
        except hs_api.HyperstackApiError as e:
            if poll_errors.record(e):
                return PollResult(False, failure="poll_error", detail=str(e))
            continue
        missing_streak = missing_streak + 1 if vm is None else 0
        status = ((vm or {}).get("status") or ("missing" if vm is None else "unknown")).upper()
        if status != last_status:
            say(f"vm {handle.vm_id}: {status}")
            # Treat a status TRANSITION as progress, but NOT the first observation: last_status
            # starts None, so on a reattach the very first read always "changes" — counting it as
            # progress would overwrite the launch-anchored last_progress and hand a silent-since-
            # launch worker a fresh full setup grace after every control-plane restart.
            if last_status is not None:
                last_progress = time.time()
            last_status = status
        if status == "ACTIVE":
            became_active = True

        done = done_reader()
        if done is not None and done_is_fresh(done):
            return finish_ok(done)

        # A single missing read may be a transient listing gap; require three in a row.
        dead = missing_streak >= 3 or status in _DEAD_STATES
        if dead:
            terminal = terminal_artifact_result()
            if terminal is not None:
                return terminal
            return PollResult(
                False,
                failure="job_preempted",
                detail=_failure_detail(hf_repo, prefix, spec.phase, None),
            )

        marker = load_json_object(marker_reader())
        if marker:
            if not marker.get("ok"):
                return fail_from_marker(marker)
            return finish_from_ok_marker()  # ok marker + metrics == success (DONE may be stale)

        if not became_active and time.time() - start > LOAD_TIMEOUT_S:
            return PollResult(
                False,
                failure="stalled",
                detail=f"vm stuck in '{status}' for {int(time.time() - start)}s "
                "(never became active; provisioning / host issue)",
            )

        new_key, stage = surface_heartbeat(heartbeat_reader, last_hb_key, say)
        if new_key != last_hb_key:
            last_hb_key = new_key
            # Credit the heartbeat's OWN timestamp, not the poll time: a heartbeat that was
            # already stale before a control-plane restart must not reset the stall clock to now
            # on the first reattach read (last_hb_key starts None, so even an old heartbeat looks
            # "new"). Clamped to [launch, now]. Healthy workers heartbeat well inside the stall
            # window, so their ts ~= now (no behavior change on the normal path). ``fresh`` is False
            # for a LEFTOVER heartbeat from a prior attempt (ts < launch); we then neither advance
            # last_progress nor mark training seen, so a stale training heartbeat can't arm the
            # tighter training stall window before this attempt overwrites the file. Dates against
            # ``launch_ts`` (NOT the raw handle.started_ts) so an unknown-launch (0.0) handle is
            # anchored to the SAME ``now`` reference as done_is_fresh / the load+stall clocks: a
            # leftover heartbeat predating this reattach is then consistently rejected instead of
            # blanket-trusted (which could otherwise arm the tighter training window off a prior
            # attempt's training heartbeat). On a real launch this is exactly handle.started_ts.
            hb_ts, fresh = heartbeat_progress_ts(new_key, launch_ts)
            if fresh:
                last_progress = hb_ts
                if stage not in _SETUP_HEARTBEAT_STAGES:
                    seen_training_hb = True
        if became_active:
            limit = stall_after_s if seen_training_hb else setup_grace_s
            if time.time() - last_progress > limit:
                phase = "training" if seen_training_hb else "setup (pre-training)"
                return PollResult(
                    False,
                    failure="stalled",
                    detail=f"no worker progress for {int(time.time() - last_progress)}s "
                    f"during {phase} (vm status {status}, limit {int(limit)}s)",
                )
        time.sleep(interval_s)
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
def submit_run_hyperstack(
    spec,
    seed: int,
    log=None,
    on_handle=None,
    attempt: int = 0,
    runtime_secrets: dict | None = None,
    on_last_gpu: bool = False,
) -> PollResult:
    """Launch a Hyperstack VM for one run/seed, poll it to completion, then delete it.

    Hyperstack counterpart of ``runpod.jobs.submit_run``.

    Args:
        spec: Run spec. This function reads ``spec.gpu.type``, ``spec.gpu.max_wall_seconds``,
            ``spec.train.hf_repo``, ``spec.phase`` and ``spec.run_id``.
        seed: Seed of this run; also part of the heartbeat prefix.
        log: Forwarded unchanged to ``launch_and_submit`` and ``poll_hs_job``.
        on_handle: Optional callback invoked once with ``handle.to_dict()`` right after launch.
        attempt: Forwarded to ``launch_and_submit``.
        runtime_secrets: Forwarded to ``launch_and_submit``.
        on_last_gpu: When true, the setup grace window is stretched by 1.5x.

    Returns:
        The ``PollResult`` produced by ``poll_hs_job``.

    Raises:
        hs_api.HyperstackApiError: ``spec.gpu.type`` is not a key of ``GPU_INFO``.

    The ``finally`` delete is the cost-safety primary: every exit path after a successful
    launch tears the paid VM down.
    """
    if spec.gpu.type not in GPU_INFO:
        raise hs_api.HyperstackApiError(
            f"submit_run_hyperstack needs a concrete gpu class, got {spec.gpu.type!r}"
        )

    candidates = usable_instances(spec.gpu.type)
    handle = launch_and_submit(
        spec,
        seed,
        candidates,
        attempt=attempt,
        log=log,
        runtime_secrets=runtime_secrets,
    )
    try:
        # Hand the launched handle to the caller before the (long) poll starts.
        if on_handle is not None:
            on_handle(handle.to_dict())

        repo = spec.train.hf_repo
        hb_prefix = f"{spec.phase}/{spec.run_id}/seed{seed}"
        # Without an HF repo there is no heartbeat reader to build.
        if repo:
            hb_reader = make_hf_heartbeat_reader(repo, hb_prefix)
        else:
            hb_reader = None

        grace_factor = 1.5 if on_last_gpu else 1.0
        grace_s = SETUP_GRACE_S * grace_factor
        # Wall budget is floored at 60s, then padded with the provisioning grace.
        wall_deadline_s = max(60, int(spec.gpu.max_wall_seconds)) + PROVISION_GRACE_S

        return poll_hs_job(
            handle,
            spec,
            seed,
            log=log,
            heartbeat_reader=hb_reader,
            setup_grace_s=grace_s,
            deadline_s=wall_deadline_s,
        )
    finally:
        hs_api.delete_vm(handle.vm_id)
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
def terminate_run_instances(run_id: str) -> list[str]:
    """Delete every VM belonging to ONE run (names start with its run prefix). Best-effort.

    A VM belongs to the run when its name equals ``run_label_prefix(run_id)`` or starts with
    that prefix followed by ``-s``.

    Args:
        run_id: Run id; an empty/falsy value short-circuits to ``[]`` without any API call.

    Returns:
        Whatever ``hs_api.delete_vms`` returns for the matching ids (presumably the ids that
        were actually deleted, as ``sweep_orphans`` treats it), or ``[]`` when nothing
        matched or the VM listing failed.
    """
    if not run_id:
        return []
    try:
        vms = hs_api.list_vms()
    except Exception as exc:
        # Best-effort: do not raise, but do not hide the failure either -- when the listing
        # fails, any VM of this run keeps running (and billing), so leave a trace in the logs.
        logger.warning(
            "hyperstack terminate skipped for run %s: could not list vms: %s", run_id, exc
        )
        return []
    prefix = run_label_prefix(run_id)
    seed_prefix = prefix + "-s"
    ids: list[str] = []
    for v in vms:
        vid = v.get("id")
        if not vid:
            continue
        name = str(v.get("name") or "")
        if name == prefix or name.startswith(seed_prefix):
            ids.append(str(vid))
    return hs_api.delete_vms(ids) if ids else []
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def sweep_orphans(
    active_labels: set[str] | Callable[[], set[str]] | None = None,
) -> list[str]:
    """Delete Flash-named VMs that no live run owns; return deleted ids. Run at startup + post-run.

    Only VMs whose name starts with ``flash-`` are considered. ``active_labels`` may hold RAW
    run ids: each one is passed through ``run_label_prefix`` before being compared with VM
    names, so it matches the prefix the names carry.

    ``active_labels`` may also be a CALLABLE returning that set. It is then invoked AFTER the
    VM list has been fetched, so the protection set is read post-listing. The intent is to
    close the launch race in which a run started after a pre-captured set could have its
    fresh worker reaped as a phantom orphan.

    If listing the VMs or resolving the active set fails, the sweep is skipped: a warning is
    logged and ``[]`` is returned.
    """
    try:
        listed = hs_api.list_vms()
    except Exception as exc:
        logger.warning("hyperstack orphan sweep skipped: %s", exc)
        return []

    try:
        if callable(active_labels):
            resolved = active_labels()
        else:
            resolved = active_labels
    except Exception as exc:
        # The protection set could not be resolved (e.g. a db/status read error inside the
        # callable). Skip the sweep entirely: continuing with an empty set would classify
        # every live run's VM as an orphan and delete it.
        logger.warning("hyperstack orphan sweep skipped: could not resolve active set: %s", exc)
        return []

    protected = {run_label_prefix(raw) for raw in (resolved or set())}
    sweep_ts = time.time()
    doomed: list[str] = []
    for vm in listed:
        vm_name = str(vm.get("name") or "")
        if not vm_name.startswith("flash-"):
            continue

        # Warm/preload boxes (``flash-preload-...``) are driver-owned and never appear in the
        # active set, so they are exempt from the name-based reaping below. The one exception
        # is a box that ``preload_box_reap_due`` reports as overdue: it is reaped to bound
        # the leak rather than exempted forever.
        is_preload = vm_name.startswith("flash-preload-")
        if is_preload:
            if not preload_box_reap_due(vm_name, sweep_ts):
                continue
        elif any(vm_name == owner or vm_name.startswith(owner + "-s") for owner in protected):
            # Owned by a live run: the run's own VM or one of its ``-s...`` siblings.
            continue

        vm_id = vm.get("id")
        if not vm_id:
            continue
        doomed.append(str(vm_id))
        if is_preload:
            logger.warning(
                "reaping orphaned hyperstack preload vm %s (outlived its wall deadline + "
                "grace; driver lost)", vm_name)

    deleted = hs_api.delete_vms(doomed) if doomed else []
    for gone in deleted:
        logger.warning("deleted orphaned hyperstack vm %s", gone)
    return deleted
|