freesolo-flash-dev 0.2.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. flash/__init__.py +29 -0
  2. flash/_channel.py +23 -0
  3. flash/_fileio.py +35 -0
  4. flash/_logging.py +49 -0
  5. flash/_update_check.py +266 -0
  6. flash/catalog.py +253 -0
  7. flash/cli/__init__.py +1 -0
  8. flash/cli/main/__init__.py +227 -0
  9. flash/cli/main/__main__.py +6 -0
  10. flash/cli/main/commands.py +636 -0
  11. flash/cli/main/envpush.py +317 -0
  12. flash/cli/main/render.py +599 -0
  13. flash/cli/main/training_doc.py +455 -0
  14. flash/client/__init__.py +14 -0
  15. flash/client/config.py +70 -0
  16. flash/client/http.py +372 -0
  17. flash/client/runtime_secrets.py +69 -0
  18. flash/client/specs.py +20 -0
  19. flash/cost/__init__.py +16 -0
  20. flash/cost/analytical.py +175 -0
  21. flash/cost/facts.py +114 -0
  22. flash/cost/spec.py +113 -0
  23. flash/cost/types.py +158 -0
  24. flash/engine/__init__.py +6 -0
  25. flash/engine/accounting.py +36 -0
  26. flash/engine/chalk_kernels.py +116 -0
  27. flash/engine/multiturn_rollout.py +780 -0
  28. flash/engine/recipe.py +86 -0
  29. flash/engine/vram.py +603 -0
  30. flash/engine/worker/__init__.py +2916 -0
  31. flash/engine/worker/__main__.py +4 -0
  32. flash/engine/worker/kernel_warmup.py +400 -0
  33. flash/engine/worker/lora.py +796 -0
  34. flash/engine/worker/packing.py +366 -0
  35. flash/engine/worker/perf.py +1048 -0
  36. flash/envs/__init__.py +10 -0
  37. flash/envs/adapter/__init__.py +883 -0
  38. flash/envs/adapter/rubric.py +222 -0
  39. flash/envs/base.py +52 -0
  40. flash/envs/registry.py +62 -0
  41. flash/mcp/__init__.py +1 -0
  42. flash/mcp/server.py +85 -0
  43. flash/providers/__init__.py +59 -0
  44. flash/providers/_auth.py +24 -0
  45. flash/providers/_http.py +230 -0
  46. flash/providers/_instance.py +416 -0
  47. flash/providers/_instance_bootstrap.py +517 -0
  48. flash/providers/_poll.py +311 -0
  49. flash/providers/allocator.py +193 -0
  50. flash/providers/base.py +431 -0
  51. flash/providers/hyperstack/__init__.py +127 -0
  52. flash/providers/hyperstack/api.py +522 -0
  53. flash/providers/hyperstack/auth.py +17 -0
  54. flash/providers/hyperstack/gpus.py +29 -0
  55. flash/providers/hyperstack/jobs/__init__.py +632 -0
  56. flash/providers/hyperstack/jobs/builders.py +122 -0
  57. flash/providers/hyperstack/preflight.py +23 -0
  58. flash/providers/hyperstack/pricing.py +26 -0
  59. flash/providers/hyperstack/train.py +25 -0
  60. flash/providers/lambdalabs/__init__.py +139 -0
  61. flash/providers/lambdalabs/api.py +261 -0
  62. flash/providers/lambdalabs/auth.py +18 -0
  63. flash/providers/lambdalabs/gpus.py +29 -0
  64. flash/providers/lambdalabs/jobs/__init__.py +724 -0
  65. flash/providers/lambdalabs/jobs/builders.py +118 -0
  66. flash/providers/lambdalabs/preflight.py +27 -0
  67. flash/providers/lambdalabs/pricing.py +51 -0
  68. flash/providers/lambdalabs/train.py +27 -0
  69. flash/providers/preflight.py +55 -0
  70. flash/providers/realized.py +80 -0
  71. flash/providers/runpod/__init__.py +130 -0
  72. flash/providers/runpod/api.py +186 -0
  73. flash/providers/runpod/auth.py +37 -0
  74. flash/providers/runpod/cost.py +57 -0
  75. flash/providers/runpod/gpus.py +46 -0
  76. flash/providers/runpod/jobs.py +956 -0
  77. flash/providers/runpod/keys.py +139 -0
  78. flash/providers/runpod/preflight.py +30 -0
  79. flash/providers/runpod/preload.py +915 -0
  80. flash/providers/runpod/pricing.py +18 -0
  81. flash/providers/runpod/slots.py +79 -0
  82. flash/providers/runpod/train/__init__.py +150 -0
  83. flash/providers/runpod/train/deps.py +395 -0
  84. flash/providers/runpod/train/endpoints.py +820 -0
  85. flash/py.typed +0 -0
  86. flash/runner/__init__.py +686 -0
  87. flash/runner/checkpoints.py +82 -0
  88. flash/runner/deploy.py +422 -0
  89. flash/runner/lifecycle.py +672 -0
  90. flash/schema/__init__.py +375 -0
  91. flash/schema/fields.py +331 -0
  92. flash/serve/__init__.py +1 -0
  93. flash/serve/deploy.py +326 -0
  94. flash/serve/pricing.py +60 -0
  95. flash/server/__init__.py +1 -0
  96. flash/server/__main__.py +20 -0
  97. flash/server/app.py +961 -0
  98. flash/server/auth.py +263 -0
  99. flash/server/billing.py +124 -0
  100. flash/server/checkpoints.py +110 -0
  101. flash/server/db.py +160 -0
  102. flash/server/environment_registry.py +102 -0
  103. flash/server/envs.py +360 -0
  104. flash/server/reconcile.py +163 -0
  105. flash/server/run_registry.py +150 -0
  106. flash/spec.py +333 -0
  107. freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
  108. freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
  109. freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
  110. freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
  111. freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,632 @@
1
+ """Hyperstack run lifecycle: stock walk -> launch VM -> HF-artifact poll -> guaranteed delete.
2
+
3
+ The Hyperstack equivalent of ``providers/lambdalabs/jobs``. Hyperstack rents a single-GPU VM from a
4
+ region with stock, ships the shared cloud-init ``user_data`` (runs ``WORKER_IMAGE`` via Docker), and
5
+ detects completion from the worker's HF artifacts. Cost-safety: a launched VM is ALWAYS deleted —
6
+ the runner ``finally``, the poll deadline, cancel, and ``sweep_orphans`` each guarantee it. Like
7
+ Lambda, there is no in-box self-destruct (no instance-scoped key); ``sweep_orphans`` at startup is
8
+ the crash backstop.
9
+
10
+ Pure dataclasses + builders live in ``.builders`` and are re-exported. Lifecycle functions and the
11
+ constants tests monkeypatch stay in this ``__init__``.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import contextlib
17
+ import json
18
+ import time
19
+ from collections.abc import Callable
20
+
21
+ from flash._logging import get_logger
22
+ from flash.providers._poll import (
23
+ PollErrorTracker,
24
+ heartbeat_progress_ts,
25
+ make_say,
26
+ preload_box_reap_due,
27
+ surface_heartbeat,
28
+ )
29
+ from flash.providers.base import GPU_INFO, PollResult, min_cuda_modern
30
+ from flash.providers.hyperstack import api as hs_api
31
+ from flash.providers.hyperstack.jobs.builders import (
32
+ HyperstackInstance,
33
+ HyperstackJobHandle,
34
+ build_payload,
35
+ build_user_data,
36
+ instance_label,
37
+ run_label_prefix,
38
+ )
39
+ from flash.providers.runpod.jobs import make_hf_heartbeat_reader, make_hf_text_reader
40
+
41
+ logger = get_logger(__name__)
42
+
43
# How long (seconds since launch) a VM may sit in a non-active state (provisioning) before we give
# up and retry. poll_hs_job returns a ``stalled`` failure once this elapses without the VM ever
# having reported ACTIVE.
LOAD_TIMEOUT_S = 900.0
# Cold-start (Docker pull + pip + model download) emits no heartbeat -> larger setup grace until a
# training heartbeat; tighter window after. SETUP_GRACE_S / STALL_AFTER_S are the defaults of
# poll_hs_job's ``setup_grace_s`` / ``stall_after_s`` parameters.
SETUP_GRACE_S = 3000.0
STALL_AFTER_S = 1500.0
# Added on top of the spec's max wall time (floored at 60s) to form the client-side poll deadline in
# submit_run_hyperstack.
PROVISION_GRACE_S = 3000.0

# Heartbeat stages that still count as "setup": a fresh heartbeat whose stage is in this set does
# NOT switch poll_hs_job from the setup grace to the tighter training stall window.
_SETUP_HEARTBEAT_STAGES = frozenset(
    {"boot", "sft_start", "rl_start", "sft_model_load", "rl_train_start"}
)

# Hyperstack VM statuses that mean "the box is gone / will not progress". Compared against the
# upper-cased status string in poll_hs_job.
_DEAD_STATES = {"ERROR", "FAILED", "DELETING", "DELETED", "TERMINATED"}
57
+
58
+
59
def usable_instances(gpu_class: str, force: bool = False) -> list[HyperstackInstance]:
    """Build one launch candidate per region whose flavor for ``gpu_class`` has stock right now.

    The flavor and hourly rate are looked up once per call (Hyperstack prices per flavor, not per
    region), so every returned candidate carries the same $/hr and differs only in region and
    environment. An empty list means no Hyperstack capacity at the moment.

    ``force`` is forwarded to ``hs_api.regions_with_stock`` to bypass the ``/core/flavors`` cache;
    the in-launch refresh uses it so it can discover newly-restocked regions instead of re-reading
    the just-populated allocation cache.
    """
    from flash.providers.hyperstack.gpus import flavor_for
    from flash.providers.hyperstack.pricing import hourly_rate

    gpu_info = GPU_INFO[gpu_class]
    flavor_name = flavor_for(gpu_class)
    usd_per_hour = hourly_rate(gpu_class)

    found: list[HyperstackInstance] = []
    for stocked_region in hs_api.regions_with_stock(flavor_name, force=force):
        found.append(
            HyperstackInstance(
                gpu=gpu_class,
                flavor=flavor_name,
                region=stocked_region,
                environment=hs_api.environment_for_region(stocked_region),
                vram_gb=gpu_info.vram_gb,
                price_usd_hr=usd_per_hour,
            )
        )
    return found
84
+
85
+
86
+ def _launch_rejection_is_clean(err: Exception) -> bool:
87
+ """True when a launch error is a DEFINITIVE rejection that created NO VM (safe to walk). The
88
+ shared RestClient fast-fails a non-429 4xx as ``... -> HTTP 4xx: ...`` (request rejected, e.g.
89
+ no stock). Anything else — 429, 5xx/timeout (``failed after N attempts``), or accepted-but-no-id
90
+ (``returned no VM id``) — is AMBIGUOUS: Hyperstack may have created a billed VM, so we must NOT
91
+ issue another launch."""
92
+ s = str(err)
93
+ return "-> HTTP 4" in s and "HTTP 429" not in s
94
+
95
+
96
def launch_and_submit(
    spec,
    seed: int,
    instances: list[HyperstackInstance],
    attempt: int = 0,
    log=None,
    runtime_secrets: dict | None = None,
    mode: str | None = None,
    models: list | None = None,
) -> HyperstackJobHandle:
    """Launch the first region that accepts the job; walk regions on a stock rejection, refresh once.

    ``mode="preload"`` + ``models`` launches a download-only warm (the bootstrap pulls the models into
    the mounted cache volume and exits — no worker).

    Args:
        spec: Run spec; this function reads ``spec.gpu.type``, ``spec.run_id`` and (optionally, via
            ``getattr``) ``spec.gpu.network_volume`` / ``spec.gpu.network_volume_gb``.
        seed: Seed index, used in the VM name and the worker payload.
        instances: Region candidates to try in order (see ``usable_instances``).
        attempt: Retry counter, used in the VM name and recorded on the returned handle.
        log: Optional log sink handed to ``make_say``.
        runtime_secrets: Forwarded to ``build_payload``.
        mode: ``"preload"`` pins the walk to the given candidates (no stock refresh) and makes a
            missing/unattachable weight cache a region failure instead of a cold launch.
        models: Forwarded to ``build_payload`` for the cache-enabled user_data only.

    Returns:
        A ``HyperstackJobHandle`` for the launched VM, with ``started_ts`` stamped at return time.

    Raises:
        hs_api.HyperstackApiError: when ``instances`` is empty, when a launch fails ambiguously
            (no further launch is attempted), or when every candidate region was skipped/rejected.
    """
    say = make_say(log)
    if not instances:
        raise hs_api.HyperstackApiError(
            f"no Hyperstack stock for {spec.gpu.type} (no region advertises the flavor)"
        )
    # Weight cache: when wanted (runner-assigned network_volume), HF_HOME points at a block volume
    # formatted+mounted at /mnt/flash-weights (region-independent path) and bind-mounted into the
    # container; the volume is created-if-absent per environment and attached AFTER launch. If the
    # volume can't be ensured we launch cold; if attach fails the cloud-init preamble degrades to cold
    # (no device appears). Block volumes are single-attach, so a concurrent same-region run runs cold.
    # ``gpu=`` selects the per-GPU worker image (dev: worker_image_for_gpu via hyperstack_image).
    cache_name = getattr(spec.gpu, "network_volume", None)
    # NOTE: the cold user_data is built WITHOUT mode/models, so it always describes a full training
    # bootstrap; only the cache-enabled user_data below carries the preload parameters.
    cold_user_data = build_user_data(
        build_payload(spec, seed, attempt, runtime_secrets=runtime_secrets), gpu=spec.gpu.type
    )
    cache_user_data = None
    cache_gb = 0
    if cache_name:
        from flash.runner import WEIGHT_CACHE_VOLUME_GB
        from flash.spec import _volume_gb

        # Honor the runner-assigned size (mirrors RunPod), defaulting to the standard cache size. Parse
        # tolerantly via _volume_gb: this best-effort weight-cache path must never crash the run on a
        # non-int / stale / hand-edited size ("0", "", "abc", bool) — it just falls back to the default.
        cache_gb = _volume_gb(getattr(spec.gpu, "network_volume_gb", None), default=WEIGHT_CACHE_VOLUME_GB)
        cache_user_data = build_user_data(
            build_payload(
                spec, seed, attempt, runtime_secrets=runtime_secrets,
                cache_host_mount="/mnt/flash-weights", cache_block_device=True,
                mode=mode, models=models,
            ),
            gpu=spec.gpu.type,
        )
    # One VM name for every region tried in this call; terminate_run_instances later matches on it.
    name = instance_label(spec.run_id, seed, attempt)

    # Walk state: regions already attempted, the remaining queue, whether the single forced stock
    # refresh has been spent, and the most recent failure (reported if the walk is exhausted).
    tried_regions: set[str] = set()
    candidates = list(instances)
    refreshed = False
    last_err: Exception | None = None

    def refresh_once(gpu: str) -> None:
        """One forced stock re-fetch when the walk is exhausted (the alloc cache is ~45s stale).

        NO-OP in preload mode: ``warm_instances`` pins each preload launch to ONE specific target
        region (``instances=[candidate]``) and reports that exact region as warmed. Refreshing to a
        DIFFERENT region here would warm region B while the caller reports the target region A as
        warmed (its cache actually still cold). A preload that can't run in its target region must
        FAIL that region (the walk exhausts -> raise), never silently warm another.
        """
        nonlocal refreshed, candidates
        if mode == "preload":
            return
        if not candidates and not refreshed:
            refreshed = True
            candidates = [
                c for c in usable_instances(gpu, force=True) if c.region not in tried_regions
            ]

    while candidates:
        inst = candidates.pop(0)
        if inst.region in tried_regions:
            continue
        tried_regions.add(inst.region)
        # Pre-launch resolution: pick a boot image whose host CUDA covers this GPU class's floor
        # (Blackwell needs 13) and the SSH key. These run BEFORE any non-idempotent launch, so a
        # failure here (e.g. the region advertises stock but has no qualifying CUDA image) created NO
        # VM — it is a CLEAN region skip, never an ambiguous phantom. Walk to the next region.
        try:
            image = hs_api.docker_image_for_region(inst.region, min_cuda=min_cuda_modern(inst.gpu))
            key_name = hs_api.resolve_key_name(inst.environment)
        except hs_api.HyperstackApiError as e:
            last_err = e
            say(f"region {inst.region} ({inst.gpu} {inst.flavor}) unusable (no boot image/key): {e}")
            refresh_once(inst.gpu)
            continue
        # Ensure the cache volume exists in this environment (create-if-absent); on success use the
        # cache user_data and attach the volume after launch. Any failure -> launch cold here — EXCEPT
        # in preload mode, where the cold user_data carries no mode/models, so a cold fallback would
        # boot a full training bootstrap (GPU billing, timeout) and warm nothing. There we SKIP the
        # region instead and let the walk try the next one (failing if none can host the cache).
        vol_id, user_data = None, cold_user_data
        cache_unavailable_reason = None
        if cache_name and not hs_api.region_supports_cache(inst.region):
            cache_unavailable_reason = "weight cache not supported in region"
        elif cache_name:
            try:
                # Per-region physical name (Hyperstack volume names are GLOBALLY unique — a bare
                # cache_name can exist in only one environment account-wide).
                vol_name = hs_api.cache_volume_name(cache_name, inst.region)
                vol_id = hs_api.ensure_volume(vol_name, inst.environment, cache_gb) or None
                if vol_id is not None:
                    user_data = cache_user_data
                else:
                    cache_unavailable_reason = "ensure_volume returned no id"
            except Exception as e:
                # Deliberately broad: the weight cache is best-effort, so any failure while ensuring
                # the volume only downgrades this region to cold (or skips it in preload mode).
                vol_id = None
                cache_unavailable_reason = str(e)
        if cache_name and cache_unavailable_reason is not None:
            if mode == "preload":
                say(f"weight cache unavailable in {inst.region} ({cache_unavailable_reason}); "
                    "skipping (preload needs it)")
                last_err = hs_api.HyperstackApiError(
                    f"preload: weight cache unavailable in {inst.region} ({cache_unavailable_reason})"
                )
                refresh_once(inst.gpu)
                continue
            say(f"weight cache unavailable in {inst.region} ({cache_unavailable_reason}); launching cold")
        try:
            vm_id = hs_api.launch_vm(
                name=name,
                environment_name=inst.environment,
                image_name=image,
                flavor_name=inst.flavor,
                key_name=key_name,
                user_data=user_data,
            )
        except hs_api.HyperstackApiError as e:
            last_err = e
            if not _launch_rejection_is_clean(e):
                # Ambiguous failure (timeout / 5xx / 429 / accepted-but-no-id): Hyperstack may have
                # created a billed VM whose id we never got. Do NOT launch another in this attempt —
                # reconcile any phantom by run-name and stop; the runner's retry (+ gc /
                # sweep_orphans) re-provisions cleanly.
                say(f"ambiguous launch failure in {inst.region}: {e}; reconciling + retrying fresh")
                with contextlib.suppress(Exception):
                    terminate_run_instances(spec.run_id)
                raise hs_api.HyperstackApiError(
                    f"ambiguous Hyperstack launch failure (possible phantom reaped): {e}"
                ) from e
            say(f"region {inst.region} ({inst.gpu} {inst.flavor}) rejected: {e}")
            refresh_once(inst.gpu)
            continue
        # Attach the cache volume to the freshly-launched VM (best-effort: the cloud-init preamble
        # degrades to a cold run if no device appears, e.g. attach failed or the volume is busy on
        # another VM — block volumes are single-attach).
        if vol_id is not None:
            attached = False
            with contextlib.suppress(Exception):
                attached = hs_api.attach_volume(vm_id, vol_id)
            # A training run survives a failed attach (the preamble runs cold), but a PRELOAD box can't:
            # with no device it would refuse to warm ephemeral disk (the sentinel check) and just burn
            # paid GPU until the wall cap. Treat a failed preload attach as a launch failure — tear the
            # VM down and walk to the next region (failing the warm if none can attach the cache).
            if not attached and mode == "preload":
                say(f"preload: cache attach failed in {inst.region} (vol busy/absent); "
                    "terminating box and trying next region")
                # NOTE(review): teardown goes through the run-name match rather than the known
                # vm_id, so it depends on list_vms succeeding — confirm that is intended.
                with contextlib.suppress(Exception):
                    terminate_run_instances(spec.run_id)
                last_err = hs_api.HyperstackApiError(
                    f"preload: cache volume attach failed in {inst.region}"
                )
                refresh_once(inst.gpu)
                continue
        say(
            f"launched hyperstack vm {vm_id}: {inst.gpu} {inst.flavor} "
            f"${inst.price_usd_hr:.2f}/hr in {inst.region} attempt={attempt} seed={seed}"
        )
        return HyperstackJobHandle(
            vm_id=vm_id,
            flavor=inst.flavor,
            region=inst.region,
            name=name,
            gpu=inst.gpu,
            hourly_usd=inst.price_usd_hr,
            attempt=attempt,
            started_ts=time.time(),
        )
    # Walk exhausted. Backstop phantom-VM reap: best-effort delete of any VM still carrying our run
    # name before giving up (the post-run gc / sweep_orphans remain the final backstop).
    # NOTE(review): per _launch_rejection_is_clean's docstring an accepted-but-no-id launch is
    # classified as AMBIGUOUS and already raises inside the loop after its own reconcile, so by the
    # time control reaches here every tried region was skipped pre-launch, cleanly rejected, or (in
    # preload mode) torn down after a failed attach.
    with contextlib.suppress(Exception):
        terminate_run_instances(spec.run_id)
    # NOTE(review): the message always says "(no stock)" even when ``last_err`` is a boot-image/key or
    # weight-cache failure; the real cause is only visible in the appended ``last_err`` text.
    raise hs_api.HyperstackApiError(
        f"all {len(tried_regions)} Hyperstack region(s) rejected the {spec.gpu.type} launch "
        f"(no stock): {last_err}"
    )
287
+
288
+
289
# Module-level alias for the shared HF text-file reader factory: every HF artifact read in this
# module (_failure_detail, poll_hs_job) goes through this name instead of calling
# ``make_hf_text_reader`` directly.
# NOTE(review): presumably kept as a single seam that tests can monkeypatch — confirm.
_make_hf_file_reader = make_hf_text_reader
290
+
291
+
292
def _failure_detail(hf_repo: str, prefix: str, phase: str, marker: dict | None) -> str:
    """Assemble the best available root-cause text for a failed run from its HF artifacts.

    Sections, in order and each only when non-empty: the marker's ``error`` field, the tail of
    ``error_<phase>.txt`` (last 2000 chars), and the tail of ``hyperstack_boot.log`` (last 3000
    chars). Hyperstack exposes no VM console API, so the boot log (pushed to HF by the cloud-init
    host uploader) is the only window into a pre-worker failure (docker/GPU not ready, image-pull
    failure). Falls back to a fixed message when nothing was found.
    """
    sections: list[str] = []

    marker_error = marker.get("error") if marker else None
    if marker_error:
        sections.append(str(marker_error))

    # (artifact file name, section header, number of trailing characters to keep)
    artifacts = (
        (f"error_{phase}.txt", f"--- error_{phase}.txt ---", 2000),
        ("hyperstack_boot.log", "--- hyperstack_boot.log (host) ---", 3000),
    )
    for file_name, header, keep in artifacts:
        # force=True: always take a fresh read for the failure report.
        text = _make_hf_file_reader(hf_repo, f"{prefix}/{file_name}")(force=True)
        if text:
            sections.append(f"{header}\n{text[-keep:]}")

    joined = "\n".join(sections)
    return joined if joined else "hyperstack worker terminated without a DONE sentinel"
306
+
307
+
308
def poll_hs_job(
    handle: HyperstackJobHandle,
    spec,
    seed: int,
    log=None,
    interval_s: float = 15.0,
    heartbeat_reader=None,
    setup_grace_s: float = SETUP_GRACE_S,
    stall_after_s: float = STALL_AFTER_S,
    deadline_s: float | None = None,
) -> PollResult:
    """Poll VM status + HF artifacts to a terminal state (cf. lambda.jobs.poll_lambda_job).

    Each tick reads the VM status, then the worker's ``DONE`` file and per-attempt marker under
    ``<phase>/<run_id>/seed<seed>/`` in ``spec.train.hf_repo``, then the heartbeat, and sleeps
    ``interval_s`` seconds. All clocks are anchored to the VM's launch time, not to this call.

    Args:
        handle: Handle of the launched VM (``vm_id``, ``attempt``, ``started_ts``, ``hourly_usd``,
            ``gpu``, ``flavor``, ``region`` are read).
        spec: Run spec (``train.hf_repo``, ``phase``, ``run_id`` are read).
        seed: Seed index; part of the artifact prefix.
        log: Optional log sink handed to ``make_say``.
        interval_s: Sleep between ticks; also handed to ``PollErrorTracker``.
        heartbeat_reader: Optional heartbeat reader passed to ``surface_heartbeat`` and
            ``worker_flagged_retriable``.
        setup_grace_s: Max seconds without progress before a fresh non-setup heartbeat was seen.
        stall_after_s: Max seconds without progress once a fresh non-setup heartbeat was seen.
        deadline_s: Optional hard cap in seconds since launch; ``None`` disables it.

    Returns:
        ``PollResult(True, metrics=...)`` on success (metrics gain ``cost_usd`` and provider
        ``notes``), otherwise a failed ``PollResult`` whose ``failure`` is one of ``"job_failed"``,
        ``"job_preempted"``, ``"stalled"`` or ``"poll_error"``.
    """
    say = make_say(log)
    # Single source of truth for "when did this VM launch". started_ts is a non-Optional float that
    # HyperstackJobHandle.from_dict coerces to 0.0 when MISSING (old/corrupt handle), so 0.0 means
    # "unknown launch" (a real launch is a large epoch ts, never 0.0). Fall back to now so EVERY use
    # below -- the load/stall clocks AND done_is_fresh / finish_ok's wall+cost stamping -- treats a
    # recovered corrupt handle consistently, instead of billing/comparing from the 1970 epoch.
    launch_ts = handle.started_ts or time.time()
    hf_repo = spec.train.hf_repo
    prefix = f"{spec.phase}/{spec.run_id}/seed{seed}"
    done_reader = _make_hf_file_reader(hf_repo, f"{prefix}/DONE")
    # The attempt marker is read with a 60s minimum interval (it is polled every tick otherwise).
    marker_reader = _make_hf_file_reader(
        hf_repo, f"{prefix}/hyperstack_attempt{handle.attempt}.json", min_interval_s=60.0
    )
    metrics_reader = _make_hf_file_reader(hf_repo, f"{prefix}/metrics.json")

    def finish_ok(done_content: str | None = None) -> PollResult:
        # Build the success result: load metrics.json, stamp cost from the launch-to-end wall time
        # and attach provider notes. A missing metrics.json turns the "success" into job_failed.
        raw = metrics_reader(force=True)
        if raw is None:
            return PollResult(False, failure="job_failed", detail="DONE without metrics.json")
        # NOTE(review): malformed metrics.json raises here (json.loads is unguarded) — confirm the
        # worker always writes valid JSON.
        metrics = json.loads(raw)
        end_ts = time.time()
        if done_content:
            # Prefer the timestamp the worker wrote into DONE as the billing end, but only when it
            # is plausible (between launch and now); otherwise keep "now".
            try:
                done_ts = float(done_content.strip())
                if launch_ts <= done_ts <= end_ts:
                    end_ts = done_ts
            except ValueError:
                pass
        wall_h = (end_ts - launch_ts) / 3600.0
        metrics["cost_usd"] = round(wall_h * handle.hourly_usd, 6)
        notes = metrics.get("notes") if isinstance(metrics.get("notes"), dict) else {}
        notes.update(
            {
                "provider": "hyperstack",
                "hyperstack_rate_usd_hr": handle.hourly_usd,
                "hyperstack_gpu": handle.gpu,
                "hyperstack_flavor": handle.flavor,
                "hyperstack_region": handle.region,
            }
        )
        metrics["notes"] = notes
        return PollResult(True, metrics=metrics)

    def done_is_fresh(content: str) -> bool:
        # launch_ts (not handle.started_ts) so an unknown-launch (0.0) handle doesn't accept every
        # leftover DONE as fresh. A DONE stamped up to 120s before launch is still accepted;
        # unparseable content is never fresh.
        try:
            return float(content.strip()) > launch_ts - 120.0
        except ValueError:
            return False

    def finish_from_ok_marker() -> PollResult:
        # ok marker => the worker finished (it wrote metrics before the marker) even if DONE is STALE
        # (a retry hit the already-complete path). Treat ok-marker + metrics as terminal success.
        d = done_reader(force=True)
        return finish_ok(d if (d is not None and done_is_fresh(d)) else None)

    def fail_from_marker(marker: dict | None) -> PollResult:
        from flash.providers.runpod.jobs import worker_flagged_retriable

        # Host failure marker sets retriable=True; the worker stamps it for a RetriableInfraError.
        retriable = bool(marker and marker.get("retriable")) or worker_flagged_retriable(heartbeat_reader)
        return PollResult(
            False,
            failure="job_preempted" if retriable else "job_failed",
            detail=_failure_detail(hf_repo, prefix, spec.phase, marker),
        )

    def terminal_artifact_result() -> PollResult | None:
        # One forced read of the worker's terminal HF artifacts (DONE / attempt ok-marker). Returns a
        # terminal PollResult when the worker definitively finished or errored, else None. Used both
        # when the host is dead AND before returning a recovered client-side-deadline `stalled`: a
        # control-plane outage longer than max_wall+grace must not discard a seed the worker actually
        # completed during the downtime (the deadline check would otherwise fire before any DONE read).
        d = done_reader(force=True)
        if d is not None and done_is_fresh(d):
            return finish_ok(d)
        raw = marker_reader(force=True)
        if raw:
            # An unparseable marker is treated as "no marker" (ValueError suppressed -> None below).
            with contextlib.suppress(ValueError):
                m = json.loads(raw)
                if m.get("ok"):
                    return finish_from_ok_marker()  # finished (stale DONE ok)
                return fail_from_marker(m)
        return None

    poll_errors = PollErrorTracker(say, interval_s)
    # Seed the load/stall clocks from the VM's LAUNCH (launch_ts), not this poll's start: on a
    # delayed reattach after a control-plane restart the box has been billing since launch, so a
    # still-booting VM that already blew LOAD_TIMEOUT_S must fail over NOW instead of getting another
    # full window. launch_ts already maps an unknown-launch (0.0) handle to now (see above), so a
    # fresh launch is a no-op and a corrupt handle won't peg the clocks to the epoch.
    start = launch_ts
    last_status = None
    last_hb_key = None
    last_progress = start
    became_active = False
    seen_training_hb = False
    missing_streak = 0
    while True:
        if deadline_s is not None and time.time() - start > deadline_s:
            # A recovered run can blow a launch-anchored deadline on the FIRST reattach tick (the
            # outage lasted past max_wall+grace). Read terminal artifacts once before giving up: if
            # the worker finished/errored during the downtime, persist that instead of retrying.
            terminal = terminal_artifact_result()
            if terminal is not None:
                return terminal
            return PollResult(False, failure="stalled", detail="client-side deadline exceeded")
        try:
            vm = hs_api.get_vm(handle.vm_id)
            poll_errors.reset()
        except hs_api.HyperstackApiError as e:
            if poll_errors.record(e):
                return PollResult(False, failure="poll_error", detail=str(e))
            # NOTE(review): no sleep on this path in this function; presumably PollErrorTracker
            # (constructed with interval_s) paces the retry inside record() — confirm.
            continue
        # get_vm returning None counts as "missing"; three consecutive misses mark the VM dead below.
        missing_streak = missing_streak + 1 if vm is None else 0
        status = ((vm or {}).get("status") or ("missing" if vm is None else "unknown")).upper()
        if status != last_status:
            say(f"vm {handle.vm_id}: {status}")
            # Treat a status TRANSITION as progress, but NOT the first observation: last_status
            # starts None, so on a reattach the very first read always "changes" — counting it as
            # progress would overwrite the launch-anchored last_progress and hand a silent-since-
            # launch worker a fresh full setup grace after every control-plane restart.
            if last_status is not None:
                last_progress = time.time()
            last_status = status
        if status == "ACTIVE":
            became_active = True

        # Success fast-path: a fresh DONE ends the poll regardless of VM status.
        done = done_reader()
        if done is not None and done_is_fresh(done):
            return finish_ok(done)

        dead = missing_streak >= 3 or status in _DEAD_STATES
        if dead:
            # The box is gone: take whatever terminal artifact the worker left, else call it a
            # preemption.
            terminal = terminal_artifact_result()
            if terminal is not None:
                return terminal
            return PollResult(
                False,
                failure="job_preempted",
                detail=_failure_detail(hf_repo, prefix, spec.phase, None),
            )

        raw_marker = marker_reader()
        if raw_marker:
            try:
                marker = json.loads(raw_marker)
            except ValueError:
                # Unparseable marker: ignore it on this tick.
                marker = None
            if marker and not marker.get("ok"):
                return fail_from_marker(marker)
            if marker and marker.get("ok"):
                return finish_from_ok_marker()  # ok marker + metrics == success (DONE may be stale)

        if not became_active and time.time() - start > LOAD_TIMEOUT_S:
            return PollResult(
                False,
                failure="stalled",
                detail=f"vm stuck in '{status}' for {int(time.time() - start)}s "
                f"(never became active; provisioning / host issue)",
            )

        new_key, stage = surface_heartbeat(heartbeat_reader, last_hb_key, say)
        if new_key != last_hb_key:
            last_hb_key = new_key
            # Credit the heartbeat's OWN timestamp, not the poll time: a heartbeat that was
            # already stale before a control-plane restart must not reset the stall clock to now
            # on the first reattach read (last_hb_key starts None, so even an old heartbeat looks
            # "new"). Clamped to [launch, now]. Healthy workers heartbeat well inside the stall
            # window, so their ts ~= now (no behavior change on the normal path). ``fresh`` is False
            # for a LEFTOVER heartbeat from a prior attempt (ts < launch); we then neither advance
            # last_progress nor mark training seen, so a stale training heartbeat can't arm the
            # tighter training stall window before this attempt overwrites the file. Dates against
            # ``launch_ts`` (NOT the raw handle.started_ts) so an unknown-launch (0.0) handle is
            # anchored to the SAME ``now`` reference as done_is_fresh / the load+stall clocks: a
            # leftover heartbeat predating this reattach is then consistently rejected instead of
            # blanket-trusted (which could otherwise arm the tighter training window off a prior
            # attempt's training heartbeat). On a real launch this is exactly handle.started_ts.
            hb_ts, fresh = heartbeat_progress_ts(new_key, launch_ts)
            if fresh:
                last_progress = hb_ts
                if stage not in _SETUP_HEARTBEAT_STAGES:
                    seen_training_hb = True
        if became_active:
            # Stall check only applies once the VM has been ACTIVE; before that LOAD_TIMEOUT_S
            # (above) is the governing clock.
            limit = stall_after_s if seen_training_hb else setup_grace_s
            if time.time() - last_progress > limit:
                phase = "training" if seen_training_hb else "setup (pre-training)"
                return PollResult(
                    False,
                    failure="stalled",
                    detail=f"no worker progress for {int(time.time() - last_progress)}s "
                    f"during {phase} (vm status {status}, limit {int(limit)}s)",
                )
        time.sleep(interval_s)
515
+
516
+
517
def submit_run_hyperstack(
    spec,
    seed: int,
    log=None,
    on_handle=None,
    attempt: int = 0,
    runtime_secrets: dict | None = None,
    on_last_gpu: bool = False,
) -> PollResult:
    """Run one seed on Hyperstack end to end: launch, persist the handle, poll, delete.

    Hyperstack counterpart of ``runpod.jobs.submit_run``. Once a VM has been launched, the
    ``finally`` clause deletes it on every exit path (result, failure, or exception) — that delete
    is the cost-safety primary.

    ``on_handle``, when given, receives ``handle.to_dict()`` right after launch. ``on_last_gpu``
    stretches the setup grace by 1.5x. The poll deadline is the spec's max wall time (floored at
    60s) plus ``PROVISION_GRACE_S``.

    Raises:
        hs_api.HyperstackApiError: if ``spec.gpu.type`` is not a known GPU class, or if the launch
            itself fails.
    """
    if spec.gpu.type not in GPU_INFO:
        raise hs_api.HyperstackApiError(
            f"submit_run_hyperstack needs a concrete gpu class, got {spec.gpu.type!r}"
        )

    stocked = usable_instances(spec.gpu.type)
    handle = launch_and_submit(
        spec,
        seed,
        stocked,
        attempt=attempt,
        log=log,
        runtime_secrets=runtime_secrets,
    )
    # From here on a billed VM exists: everything else lives inside the try so it is always deleted.
    try:
        if on_handle is not None:
            on_handle(handle.to_dict())

        repo = spec.train.hf_repo
        artifact_prefix = f"{spec.phase}/{spec.run_id}/seed{seed}"
        if repo:
            hb_reader = make_hf_heartbeat_reader(repo, artifact_prefix)
        else:
            hb_reader = None

        grace_scale = 1.5 if on_last_gpu else 1.0
        wall_cap_s = max(60, int(spec.gpu.max_wall_seconds))
        return poll_hs_job(
            handle,
            spec,
            seed,
            log=log,
            heartbeat_reader=hb_reader,
            setup_grace_s=SETUP_GRACE_S * grace_scale,
            deadline_s=wall_cap_s + PROVISION_GRACE_S,
        )
    finally:
        hs_api.delete_vm(handle.vm_id)
552
+
553
+
554
def terminate_run_instances(run_id: str) -> list[str]:
    """Delete every VM belonging to ONE run (names start with its run prefix). Best-effort.

    A VM belongs to the run when its name equals ``run_label_prefix(run_id)`` or starts with that
    prefix followed by ``-s``. VMs without an id are ignored.

    Args:
        run_id: Raw run id; an empty/falsy value is a no-op.

    Returns:
        Whatever ``hs_api.delete_vms`` returns for the matched ids, or ``[]`` when ``run_id`` is
        empty, the VM listing failed, or nothing matched.
    """
    if not run_id:
        return []
    try:
        vms = hs_api.list_vms()
    except Exception as exc:
        # Still best-effort (never raise from the listing), but not silent: a skipped reap may leave
        # a billed VM behind, so leave a trace — same style as sweep_orphans' listing failure.
        logger.warning(
            "hyperstack run teardown skipped for %s: could not list vms: %s", run_id, exc
        )
        return []
    prefix = run_label_prefix(run_id)
    seed_prefix = prefix + "-s"
    ids: list[str] = []
    for v in vms:
        vid = v.get("id")
        if not vid:
            continue
        name = str(v.get("name") or "")
        if name == prefix or name.startswith(seed_prefix):
            ids.append(str(vid))
    return hs_api.delete_vms(ids) if ids else []
570
+
571
+
572
def sweep_orphans(
    active_labels: set[str] | Callable[[], set[str]] | None = None,
) -> list[str]:
    """Delete Flash-named VMs that no live run owns and return the deleted ids.

    Intended to run at startup and post-run. Only VMs whose name starts with ``flash-`` are ever
    considered. ``active_labels`` may hold RAW run ids: each one is mapped through
    ``run_label_prefix`` so it matches the forced prefix the VM names carry.

    ``active_labels`` may also be a CALLABLE returning that set. It is resolved only AFTER the VM
    list has been fetched, so the protection set is read post-listing: any VM present in the list
    had its run's status row committed before the VM was launched (hence before the list call), so
    the live set resolved afterwards is guaranteed to include it. That closes the launch race where
    a run started after a pre-captured set could have its fresh worker reaped as a phantom orphan.

    If either the VM listing or the resolution of ``active_labels`` fails, the sweep is skipped with
    a warning and ``[]`` is returned.
    """
    try:
        listing = hs_api.list_vms()
    except Exception as exc:
        logger.warning("hyperstack orphan sweep skipped: %s", exc)
        return []

    try:
        if callable(active_labels):
            resolved = active_labels()
        else:
            resolved = active_labels
    except Exception as exc:
        # The protection set could not be resolved (e.g. a db/status read error in the callable).
        # Skip the whole sweep: falling through with an empty set would classify every live run's
        # VM as an orphan and reap it.
        logger.warning("hyperstack orphan sweep skipped: could not resolve active set: %s", exc)
        return []

    protected = {run_label_prefix(label) for label in (resolved or set())}
    checked_at = time.time()
    doomed: list[str] = []

    for vm in listing:
        vm_name = str(vm.get("name") or "")
        if not vm_name.startswith("flash-"):
            continue

        if vm_name.startswith("flash-preload-"):
            # Warm/preload boxes are driver-owned: launched by preload.warm_instances
            # (mode="preload"), NEVER persisted in the run DB (so never in the protected set), and
            # terminated by their driver. A catalog warm can outlast a sweep period, so they are
            # normally exempt. The one exception is a box still alive past its embedded wall
            # deadline + grace (see preload_box_reap_due): its driver is gone and nothing on the
            # box terminates the VM, so it is reaped to bound the leak.
            if preload_box_reap_due(vm_name, checked_at):
                preload_id = vm.get("id")
                if preload_id:
                    doomed.append(str(preload_id))
                    logger.warning(
                        "reaping orphaned hyperstack preload vm %s (outlived its wall deadline + "
                        "grace; driver lost)", vm_name)
            continue

        # A VM is owned when its name is a live run's prefix or that prefix followed by "-s".
        owned = False
        for live_prefix in protected:
            if vm_name == live_prefix or vm_name.startswith(live_prefix + "-s"):
                owned = True
                break
        if owned:
            continue

        vm_id = vm.get("id")
        if vm_id:
            doomed.append(str(vm_id))

    if doomed:
        deleted = hs_api.delete_vms(doomed)
    else:
        deleted = []
    for gone_id in deleted:
        logger.warning("deleted orphaned hyperstack vm %s", gone_id)
    return deleted