freesolo-flash-dev 0.2.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flash/__init__.py +29 -0
- flash/_channel.py +23 -0
- flash/_fileio.py +35 -0
- flash/_logging.py +49 -0
- flash/_update_check.py +266 -0
- flash/catalog.py +253 -0
- flash/cli/__init__.py +1 -0
- flash/cli/main/__init__.py +227 -0
- flash/cli/main/__main__.py +6 -0
- flash/cli/main/commands.py +636 -0
- flash/cli/main/envpush.py +317 -0
- flash/cli/main/render.py +599 -0
- flash/cli/main/training_doc.py +455 -0
- flash/client/__init__.py +14 -0
- flash/client/config.py +70 -0
- flash/client/http.py +372 -0
- flash/client/runtime_secrets.py +69 -0
- flash/client/specs.py +20 -0
- flash/cost/__init__.py +16 -0
- flash/cost/analytical.py +175 -0
- flash/cost/facts.py +114 -0
- flash/cost/spec.py +113 -0
- flash/cost/types.py +158 -0
- flash/engine/__init__.py +6 -0
- flash/engine/accounting.py +36 -0
- flash/engine/chalk_kernels.py +116 -0
- flash/engine/multiturn_rollout.py +780 -0
- flash/engine/recipe.py +86 -0
- flash/engine/vram.py +603 -0
- flash/engine/worker/__init__.py +2916 -0
- flash/engine/worker/__main__.py +4 -0
- flash/engine/worker/kernel_warmup.py +400 -0
- flash/engine/worker/lora.py +796 -0
- flash/engine/worker/packing.py +366 -0
- flash/engine/worker/perf.py +1048 -0
- flash/envs/__init__.py +10 -0
- flash/envs/adapter/__init__.py +883 -0
- flash/envs/adapter/rubric.py +222 -0
- flash/envs/base.py +52 -0
- flash/envs/registry.py +62 -0
- flash/mcp/__init__.py +1 -0
- flash/mcp/server.py +85 -0
- flash/providers/__init__.py +59 -0
- flash/providers/_auth.py +24 -0
- flash/providers/_http.py +230 -0
- flash/providers/_instance.py +416 -0
- flash/providers/_instance_bootstrap.py +517 -0
- flash/providers/_poll.py +311 -0
- flash/providers/allocator.py +193 -0
- flash/providers/base.py +431 -0
- flash/providers/hyperstack/__init__.py +127 -0
- flash/providers/hyperstack/api.py +522 -0
- flash/providers/hyperstack/auth.py +17 -0
- flash/providers/hyperstack/gpus.py +29 -0
- flash/providers/hyperstack/jobs/__init__.py +632 -0
- flash/providers/hyperstack/jobs/builders.py +122 -0
- flash/providers/hyperstack/preflight.py +23 -0
- flash/providers/hyperstack/pricing.py +26 -0
- flash/providers/hyperstack/train.py +25 -0
- flash/providers/lambdalabs/__init__.py +139 -0
- flash/providers/lambdalabs/api.py +261 -0
- flash/providers/lambdalabs/auth.py +18 -0
- flash/providers/lambdalabs/gpus.py +29 -0
- flash/providers/lambdalabs/jobs/__init__.py +724 -0
- flash/providers/lambdalabs/jobs/builders.py +118 -0
- flash/providers/lambdalabs/preflight.py +27 -0
- flash/providers/lambdalabs/pricing.py +51 -0
- flash/providers/lambdalabs/train.py +27 -0
- flash/providers/preflight.py +55 -0
- flash/providers/realized.py +80 -0
- flash/providers/runpod/__init__.py +130 -0
- flash/providers/runpod/api.py +186 -0
- flash/providers/runpod/auth.py +37 -0
- flash/providers/runpod/cost.py +57 -0
- flash/providers/runpod/gpus.py +46 -0
- flash/providers/runpod/jobs.py +956 -0
- flash/providers/runpod/keys.py +139 -0
- flash/providers/runpod/preflight.py +30 -0
- flash/providers/runpod/preload.py +915 -0
- flash/providers/runpod/pricing.py +18 -0
- flash/providers/runpod/slots.py +79 -0
- flash/providers/runpod/train/__init__.py +150 -0
- flash/providers/runpod/train/deps.py +395 -0
- flash/providers/runpod/train/endpoints.py +820 -0
- flash/py.typed +0 -0
- flash/runner/__init__.py +686 -0
- flash/runner/checkpoints.py +82 -0
- flash/runner/deploy.py +422 -0
- flash/runner/lifecycle.py +672 -0
- flash/schema/__init__.py +375 -0
- flash/schema/fields.py +331 -0
- flash/serve/__init__.py +1 -0
- flash/serve/deploy.py +326 -0
- flash/serve/pricing.py +60 -0
- flash/server/__init__.py +1 -0
- flash/server/__main__.py +20 -0
- flash/server/app.py +961 -0
- flash/server/auth.py +263 -0
- flash/server/billing.py +124 -0
- flash/server/checkpoints.py +110 -0
- flash/server/db.py +160 -0
- flash/server/environment_registry.py +102 -0
- flash/server/envs.py +360 -0
- flash/server/reconcile.py +163 -0
- flash/server/run_registry.py +150 -0
- flash/spec.py +333 -0
- freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
- freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
- freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
- freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
- freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0
flash/providers/_http.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""Shared stdlib REST client for provider API modules.
|
|
2
|
+
|
|
3
|
+
Provider API clients use the same hardened-retry shape: a Bearer/Content-Type urllib
|
|
4
|
+
request, a jittered exponential backoff that retries 5xx/429 and fast-fails other 4xx
|
|
5
|
+
with the response body as the actionable detail, and a "failed after N attempts" raise.
|
|
6
|
+
This module factors that common core out so the backoff math lives in one place.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import contextlib
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
import random
|
|
15
|
+
import re
|
|
16
|
+
import time
|
|
17
|
+
import urllib.error
|
|
18
|
+
import urllib.request
|
|
19
|
+
from collections.abc import Callable
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
# An unambiguous ``HTTP 404`` token: ``http 404`` bounded so a longer status-LIKE number can't
# match. ``\b`` after ``404`` rejects ``HTTP 4040``/``HTTP 4041`` (digit immediately after), while
# still matching ``HTTP 404:``, ``HTTP 404 Not Found``, and a trailing ``HTTP 404`` at end-of-string.
# The pattern is lowercase on purpose: it is searched against already-lowercased error text.
_HTTP_404_RE = re.compile(r"\bhttp 404\b")
# Same shape for ``HTTP 409`` (conflict) — bounded so ``HTTP 4090``/``4091`` (and a ``4090`` GPU
# name) can't match: only a genuine ``HTTP 409`` token counts in the no-cause text fallback.
_HTTP_409_RE = re.compile(r"\bhttp 409\b")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _status_matches(err: Exception, code: int, token_re: re.Pattern[str]) -> bool:
    """True when ``err`` represents the given HTTP ``code``, keyed off the chained status when present.

    ``request_with_retries`` chains the original urllib ``HTTPError`` as ``__cause__`` for every
    fast-failed 4xx (and on the "failed after N attempts" path), so the status CODE is authoritative
    when a cause is present — anything else is a real failure that must NOT be swallowed. We only
    fall back to a text match when there is no HTTPError cause, and even then only on an unambiguous
    ``HTTP <code>`` TOKEN (``token_re``) — NEVER a bare substring, and never a longer number that
    just begins with ``code``: the token's trailing ``\\b`` rejects ``HTTP <code>0``/``<code>1`` (and
    a ``4090`` GPU name), so a transient error whose text embeds such an id is not misread.

    Args:
        err: The exception to classify.
        code: The HTTP status code to test for.
        token_re: Compiled pattern for the lowercase ``http <code>`` token, used only when ``err``
            has no ``HTTPError`` cause.

    Returns:
        Whether ``err`` stands for an HTTP response with status ``code``.
    """
    cause = getattr(err, "__cause__", None)
    if isinstance(cause, urllib.error.HTTPError):
        # A chained HTTPError wins outright: the message text is never consulted.
        return cause.code == code
    # No HTTPError cause: match the bounded token against the lowercased message.
    return bool(token_re.search(str(err).lower()))
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def is_not_found(err: Exception) -> bool:
    """True only when a provider API error represents a genuine HTTP 404 (resource already gone).

    See ``_status_matches`` for how the chained-HTTPError code is authoritative and why the no-cause
    fallback uses the bounded ``HTTP 404`` token (never a bare ``"404"`` substring). Mirrors
    ``runpod.api._is_not_found``.
    """
    return _status_matches(err, 404, _HTTP_404_RE)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def is_conflict(err: Exception) -> bool:
    """True only when a provider API error represents a genuine HTTP 409 (conflict).

    Same status-CODE-authoritative logic as ``is_not_found`` (see ``_status_matches``): a real 409
    has the chained ``HTTPError`` whose ``.code == 409``; a non-409 failure whose message merely
    contains ``"409"`` (e.g. a ``4090`` GPU name) plus the word "conflict" is NOT a conflict and must
    still surface. Used by Hyperstack ``delete_vm`` to treat an in-flight-teardown 409 as success.
    """
    return _status_matches(err, 409, _HTTP_409_RE)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class RestClient:
    """Parametrized urllib REST client with jittered-backoff retries.

    ``base_url`` is prefixed onto the ``target`` passed to each call. The key is read
    from ``env_var`` on each request (env-only by design — never persisted) and failures
    raise ``error_cls``.

    ``keys_provider`` (optional) supplies an *ordered* list of API keys to try per call:
    each key runs the full backoff loop, and a key that ends in a failover-class error
    (per ``failover_predicate``) hands off to the next key — used by providers whose key
    is a multi-account pool (see ``runpod.keys``). With no ``keys_provider`` the client
    uses the single ``env_var`` key and behaves exactly as a single-key client.
    """

    def __init__(
        self,
        *,
        env_var: str,
        error_cls: type[Exception],
        base_url: str = "",
        missing_key_message: str | None = None,
        keys_provider: Callable[[], list[str]] | None = None,
        failover_predicate: Callable[[Exception], bool] | None = None,
        extra_headers: dict[str, str] | None = None,
        auth_header_name: str = "Authorization",
        auth_value_format: str = "Bearer {key}",
    ) -> None:
        """Store the client configuration; performs no I/O.

        Args:
            env_var: Environment variable holding the API key for single-key clients.
            error_cls: Exception type raised for every failure the retrying methods report.
            base_url: Prefix joined (plain concatenation) onto each call's ``target``.
            missing_key_message: Error text when no key is available; defaults to a message
                naming ``env_var``.
            keys_provider: Optional callable returning the ordered key pool.
            failover_predicate: Optional callable deciding whether an error from one key
                should hand off to the next key in the pool.
            extra_headers: Static headers sent with every request.
            auth_header_name: Name of the header carrying the key.
            auth_value_format: ``str.format`` template for that header's value (``{key}``).
        """
        self.env_var = env_var
        self.error_cls = error_cls
        self.base_url = base_url
        self.missing_key_message = (
            missing_key_message or f"{env_var} not configured on the control-plane host"
        )
        self.keys_provider = keys_provider
        self.failover_predicate = failover_predicate
        # Static headers added to EVERY request (e.g. a custom User-Agent). Lambda Cloud sits
        # behind Cloudflare, which 403s the stdlib default ``Python-urllib/<v>`` UA — so the
        # Lambda client passes a real UA here. The auth + ``Content-Type`` headers are always set
        # by ``request`` and win on a key collision.
        self.extra_headers = dict(extra_headers or {})
        # How the API key is presented. Default is RunPod/Lambda's ``Authorization: Bearer <key>``;
        # Hyperstack uses a bare ``api_key: <key>`` header instead (``auth_header_name="api_key"``,
        # ``auth_value_format="{key}"``).
        self.auth_header_name = auth_header_name
        self.auth_value_format = auth_value_format

    def api_key(self) -> str:
        """Return the key from ``env_var``; raise ``error_cls`` when it is unset or empty."""
        key = os.environ.get(self.env_var)
        if not key:
            raise self.error_cls(self.missing_key_message)
        return key

    def _ordered_keys(self) -> list[str]:
        """The keys to try, in order. Single-key clients yield exactly the env key."""
        if self.keys_provider is None:
            return [self.api_key()]
        keys = self.keys_provider()
        if not keys:
            raise self.error_cls(self.missing_key_message)
        return keys

    def request(
        self,
        target: str,
        method: str = "GET",
        body: dict | None = None,
        timeout: float = 30.0,
        key: str | None = None,
    ) -> Any:
        """Perform ONE request (no retries) and return the decoded JSON body.

        ``body`` is JSON-encoded when given. An empty response body yields ``{}``. ``key``
        overrides the env key when truthy. This method does not translate transport or
        decode errors; the retrying wrappers below do that.
        """
        req = urllib.request.Request(
            f"{self.base_url}{target}",
            method=method,
            data=json.dumps(body).encode() if body is not None else None,
            headers={
                # Order matters: later entries win, so auth + Content-Type override extras.
                **self.extra_headers,
                self.auth_header_name: self.auth_value_format.format(key=key or self.api_key()),
                "Content-Type": "application/json",
            },
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
        return json.loads(raw) if raw else {}

    def _request_one_key(
        self,
        key: str,
        target: str,
        method: str,
        body: dict | None,
        retries: int,
        base_delay: float,
    ) -> Any:
        """One key's full backoff loop (the original single-key behavior).

        Retries 5xx/429 and transport errors with jittered exponential backoff; any other
        4xx fails fast with the response body as detail. Every failure is raised as
        ``error_cls`` with the underlying exception chained as ``__cause__``.
        """
        # Always make at least one attempt: a negative ``retries`` used to skip the loop
        # entirely and report "failed after 0 attempts: None" without sending anything.
        attempts = max(0, retries) + 1
        last: Exception | None = None
        for attempt in range(attempts):
            try:
                return self.request(target, method=method, body=body, key=key)
            except urllib.error.HTTPError as e:
                if e.code < 500 and e.code != 429:
                    # The response body usually carries the actionable error detail; e.reason
                    # alone (e.g. "Bad Request") is rarely enough to debug a 4xx.
                    detail = ""
                    with contextlib.suppress(Exception):
                        detail = e.read().decode("utf-8", "replace")[:500].strip()
                    suffix = f": {detail}" if detail else ""
                    raise self.error_cls(
                        f"{method} {target} -> HTTP {e.code}: {e.reason}{suffix}"
                    ) from e
                last = e
            except OSError as e:
                # URLError, TimeoutError and ConnectionError are all OSError subclasses.
                last = e
            except (json.JSONDecodeError, UnicodeDecodeError) as e:
                # A response that is not valid JSON is a failure like any other: surface it as
                # ``error_cls`` instead of leaking the decoder's exception type.
                raise self.error_cls(
                    f"{method} {target} returned a non-JSON response: {e}"
                ) from e
            if attempt < attempts - 1:
                delay = min(base_delay * (2 ** min(attempt, 6)), 30.0)
                time.sleep(delay * random.uniform(0.7, 1.3))
        # Chain the last exception so callers can inspect it: ``_is_not_found`` keys off the
        # HTTPError code, and the multi-key waterfall's ``failover_predicate`` needs it to see
        # a persistent 429's status (without the cause it can't tell the account is rate/quota
        # limited and would stop instead of trying the next account).
        raise self.error_cls(
            f"{method} {target} failed after {attempts} attempts: {last}"
        ) from last

    def request_with_retries_for_key(
        self,
        key: str,
        target: str,
        method: str = "GET",
        body: dict | None = None,
        retries: int = 4,
        base_delay: float = 2.0,
    ) -> Any:
        """Like request_with_retries but always uses the supplied key, bypassing the pool.

        Use this when you need to query each account in the pool independently (e.g.
        list_endpoints aggregation) rather than stopping at the first success.
        """
        return self._request_one_key(key, target, method, body, retries, base_delay)

    def request_with_retries(
        self,
        target: str,
        method: str = "GET",
        body: dict | None = None,
        retries: int = 4,
        base_delay: float = 2.0,
    ) -> Any:
        """REST call hardened against transient network/5xx blips (jittered backoff).

        With a multi-key ``keys_provider``, a key that fails with a failover-class error
        hands off to the next key in the pool; a hard, key-agnostic error (or the last
        key) is raised. Single-key clients try exactly one key — identical to before.
        """
        ordered = self._ordered_keys()
        last_exc: Exception | None = None
        for i, key in enumerate(ordered):
            try:
                return self._request_one_key(key, target, method, body, retries, base_delay)
            except self.error_cls as e:
                last_exc = e
                more_keys = i < len(ordered) - 1
                if more_keys and self.failover_predicate is not None and self.failover_predicate(e):
                    continue
                raise
        # Only reachable if ordered is empty, which _ordered_keys already guards against.
        raise last_exc or self.error_cls(self.missing_key_message)
|
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
"""Shared building blocks for the instance-based providers (Lambda, Hyperstack).
|
|
2
|
+
|
|
3
|
+
Both rent a single-GPU instance and bootstrap it identically: ship a cloud-init ``user_data`` that
|
|
4
|
+
runs the prebuilt ``WORKER_IMAGE`` via Docker on the host, detect completion from the worker's HF
|
|
5
|
+
artifacts, and guarantee teardown control-plane-side. The per-provider packages differ only in the
|
|
6
|
+
REST API (launch/list/terminate) and the capacity model; everything below — the run-derived
|
|
7
|
+
sweep-matchable label, the bootstrap payload, and the cloud-init script — is identical, so it lives
|
|
8
|
+
here (single source of truth, parameterized by the substrate ``arm`` and the run's image).
|
|
9
|
+
|
|
10
|
+
The shipped bootstrap is the sibling ``_instance_bootstrap.py``; ``arm`` (e.g. ``lambda`` /
|
|
11
|
+
``hyperstack``) travels in ``payload["flash_arm"]`` and decides FLASH_ARM + the ``<arm>_attempt<N>``
|
|
12
|
+
marker name.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import base64
|
|
18
|
+
import hashlib
|
|
19
|
+
import io
|
|
20
|
+
import json
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
# Lambda/Hyperstack cap an instance/VM ``name`` at 64 chars. We keep the label at or under this so
# the name is NEVER silently truncated at launch — truncation would desync the stored name from the
# ``run_label_prefix`` the orphan-sweep matches on, which could fail to protect (or wrongly reap) a
# live run. The seed/attempt suffix ``-s{seed}-a{attempt}`` is bounded (<=12 chars), so the prefix
# is bounded to leave room for it.
_MAX_NAME = 60
_PREFIX_BUDGET = _MAX_NAME - 12

# Above this many chars, the serialized job spec is spilled OUT of the inline cloud-init user_data
# (uploaded to HF; the bootstrap fetches it) so a large inline spec can't overflow the provider's
# user_data size limit and get the launch rejected before a handle is persisted. Below it the spec
# rides inline (the common, tiny-spec case) so launch needs no extra HF round-trip. The cap is well
# under the bootstrap's own 96_000-char execve threshold, and the base64 + heredoc framing inflates
# user_data ~1.4x, so the spilled ceiling keeps a typical run's user_data comfortably small.
_SPEC_SPILL_THRESHOLD = 16_000
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def run_label_prefix(run_id: str) -> str:
    """The prefix EVERY instance label for ``run_id`` starts with, bounded to the name budget.

    Platform run ids already start with ``flash-``; anything else (direct-API callers, tests) gets
    the prefix forced. A run id long enough to overflow the provider name cap is shortened
    DETERMINISTICALLY (a stable 8-char hash suffix) so launch AND ``sweep_orphans`` compute the
    IDENTICAL bounded prefix — and two distinct long run ids never collide onto the same name (which
    could otherwise reap the wrong live instance). Short ids (the common case) pass through
    unchanged.
    """
    base = run_id if run_id.startswith("flash-") else f"flash-{run_id}"
    if len(base) <= _PREFIX_BUDGET:
        return base
    # Hash the FULL (untruncated) base so ids sharing a long common head still differ.
    h = hashlib.sha1(base.encode()).hexdigest()[:8]
    # 9 = "-" + the 8 hex chars, so the result is exactly ``_PREFIX_BUDGET`` long.
    return f"{base[: _PREFIX_BUDGET - 9]}-{h}"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def instance_label(run_id: str, seed: int, attempt: int) -> str:
    """Instance name: run-derived so ``sweep_orphans`` can tell ours from anything else on the
    account, and bounded (via ``run_label_prefix``) so the provider never truncates it.

    The result is ``<run_label_prefix(run_id)>-s<seed>-a<attempt>``.
    """
    return f"{run_label_prefix(run_id)}-s{seed}-a{attempt}"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# The worker container path the per-region cache is bind-mounted at, and the HF cache under it. The
# host mount differs per provider (Lambda NFS /lambda/nfs/<name>; Hyperstack block /mnt/flash-weights)
# but the CONTAINER path is fixed, so HF_HOME is uniform regardless of substrate.
CACHE_CONTAINER_MOUNT = "/weight-cache"
CACHE_HF_HOME = f"{CACHE_CONTAINER_MOUNT}/hf-cache"
# Sentinel file written onto a SUCCESSFULLY-mounted block-volume cache (by the cloud-init preamble),
# so the in-container preload mount-check can tell a real mount from an empty Docker bind (a failed
# attach). Lives on the device itself -> absent when the volume isn't actually mounted.
CACHE_MOUNT_MARKER = ".flash-cache-mounted"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _cache_block_device_setup(payload: dict) -> str:
    """Cloud-init preamble (block-volume providers, e.g. Hyperstack): wait for the attached volume's
    block device, format it ONCE if it has no filesystem (NEVER reformat a populated cache — guarded
    by ``blkid``), and mount it at the host ``cache_host_mount``. No-op for NFS providers (Lambda
    auto-mounts) and for cold runs. Best-effort: if the device never appears / mount fails, the bind
    falls back to an empty dir (a correct cold run), never a hard failure.

    Args:
        payload: Bootstrap payload; reads ``cache_block_device``, ``cache_host_mount`` and
            ``cache_size_gb``.

    Returns:
        The shell snippet to splice into the cloud-init script, or ``""`` when the payload does
        not request a block-device cache.
    """
    if not payload.get("cache_block_device") or not payload.get("cache_host_mount"):
        return ""
    mount = payload["cache_host_mount"]
    # The attached cache volume is provisioned at an EXACT known size, so pick the candidate disk by
    # size (±20%) AND require that neither it nor any of its partitions is mounted. That excludes the
    # boot disk (its partition is mounted at /) and any differently-sized ephemeral/local NVMe — so we
    # never mkfs the wrong device. A warm cache disk (already ext4, unmounted) still matches, and the
    # blkid guard keeps its data. If nothing matches, run cold (format nothing).
    try:
        size_gb = int(payload.get("cache_size_gb") or 0)
    except (TypeError, ValueError):
        # Stay best-effort: an unparsable size hint means "match nothing" (the awk filter requires
        # e>0), i.e. a cold run, instead of crashing script generation.
        size_gb = 0
    expect_bytes = size_gb * 1000 * 1000 * 1000
    marker = CACHE_MOUNT_MARKER
    return f"""
# --- weight-cache block volume: wait-for-device (size-matched, unmounted), format-if-new, mount ---
echo "FLASH: waiting for the attached cache block device (~{payload.get('cache_size_gb')}GB)..."
EXPECT_BYTES={expect_bytes}
CACHE_DEV=""
for i in $(seq 1 60); do
for d in $(lsblk -dpbn -o NAME,TYPE,SIZE | awk -v e="$EXPECT_BYTES" \
'$2=="disk" && e>0 {{lo=e*0.8; hi=e*1.2; if ($3+0>=lo && $3+0<=hi) print $1}}'); do
# Skip any disk with a mounted partition (boot/data disks in use) — only a free disk is ours.
if lsblk -pnr -o MOUNTPOINT "$d" | grep -q '[^[:space:]]'; then continue; fi
CACHE_DEV="$d"; break
done
[ -n "$CACHE_DEV" ] && break
sleep 5
done
if [ -n "$CACHE_DEV" ]; then
echo "FLASH: cache device $CACHE_DEV"
blkid "$CACHE_DEV" >/dev/null 2>&1 || mkfs.ext4 -q "$CACHE_DEV" || true # format ONCE; never reformat a populated cache
mkdir -p '{mount}'
if mount "$CACHE_DEV" '{mount}' 2>/dev/null; then
# Sentinel written ONTO the mounted block device (not the underlying empty dir): it is only
# visible at the bind path inside the container when the REAL volume is mounted. The preload
# mount-check requires it, so a failed/absent attach (Docker binding an empty host dir) can't
# masquerade as a warm cache and silently warm ephemeral disk.
touch '{mount}/{marker}' 2>/dev/null || true
else
echo "FLASH: cache mount failed; running cold"
fi
else
echo "FLASH: no matching cache block device appeared; running cold"
fi
"""
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _cache_nfs_mount_check(payload: dict) -> str:
    """Cloud-init preamble (NFS providers, e.g. Lambda): the platform auto-mounts the weight-cache
    filesystem on the host at ``cache_host_mount`` — but ONLY if Lambda actually attached + readied it.
    Docker's ``-v`` bind silently auto-CREATES a missing host dir, so a launch where the FS never
    mounted would otherwise have the container warm an empty EPHEMERAL bind dir and report a false warm.
    Verify the host path is a REAL mountpoint (an auto-created empty dir on the boot disk is not) and,
    if so, drop the same sentinel the block-device path uses — ``run_preload`` requires it for the
    cache, so a not-actually-mounted NFS fails the preload mount-check instead of warming throwaway
    disk. No-op for block-volume providers (handled by ``_cache_block_device_setup``) and cold runs.
    Best-effort: a training run still degrades to cold; only preload hard-requires the sentinel.

    Args:
        payload: Bootstrap payload; reads ``cache_host_mount`` and ``cache_block_device``.

    Returns:
        The shell snippet to splice into the cloud-init script, or ``""`` when no cache mount is
        configured or the cache is a block device.
    """
    if not payload.get("cache_host_mount") or payload.get("cache_block_device"):
        return ""
    mount = payload["cache_host_mount"]
    marker = CACHE_MOUNT_MARKER
    return f"""
# --- weight-cache NFS mount: verify the platform actually mounted it, then drop the sentinel ---
if mountpoint -q '{mount}'; then
echo "FLASH: weight-cache NFS mounted at {mount}"
touch '{mount}/{marker}' 2>/dev/null || true
else
echo "FLASH: weight-cache NFS NOT mounted at {mount} (no sentinel; preload will refuse, train runs cold)"
fi
"""
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def build_payload(
    spec,
    seed: int,
    attempt: int,
    *,
    arm: str,
    runtime_secrets: dict | None = None,
    cache_host_mount: str | None = None,
    cache_block_device: bool = False,
    mode: str | None = None,
    models: list | None = None,
) -> dict:
    """The bootstrap's input — field-compatible with the RunPod ``_train_body`` payload, plus the
    bits the instance can't infer (HF prefix for markers, wall cap, attempt, and the substrate
    ``arm`` that the bootstrap stamps as FLASH_ARM + the marker name).

    ``cache_host_mount`` (set by the provider when it attaches a per-region weight cache) points
    HF_HOME at the bind-mounted cache (``/weight-cache/hf-cache``) instead of stripping the
    RunPod redirect; ``cache_block_device`` adds the format/mount preamble for block-volume providers.

    Args:
        spec: The run's job spec (project type; read for ``train.hf_repo``, ``phase``, ``run_id``,
            ``environment`` and ``gpu`` fields, and serialized with ``to_json()``).
        seed: Seed for this run; stored as ``int(seed)``.
        attempt: Attempt number; stored as ``int(attempt)``.
        arm: Substrate name carried as ``flash_arm``.
        runtime_secrets: Forwarded to ``build_worker_env``.
        cache_host_mount: Host path of the attached weight cache, if any.
        cache_block_device: Whether that cache is a block volume.
        mode: Optional bootstrap mode; when set, ``models`` is carried too.
        models: Model list for preload mode.

    Returns:
        The payload dict embedded in the instance's cloud-init.
    """
    # Imported here, at call time, rather than at module top.
    from flash.envs.registry import worker_pip_for_env
    from flash.providers.runpod.train import (
        build_worker_env,
        chalk_extra_pip,
        strip_runpod_volume_env,
    )

    # Start from the shared env with the RunPod /runpod-volume redirect stripped (that mount is
    # RunPod-only). If THIS provider attached a cache, point HF_HOME at the instance cache mount —
    # but DON'T clobber a per-run [worker_env].HF_HOME the user set on purpose. build_worker_env
    # merges [worker_env] LAST, so a user override survives the strip above (only /runpod-volume-
    # rooted vars are stripped); on RunPod that override wins, so honor it here too for parity. We
    # only install the cache path when HF_HOME is absent (i.e. the platform redirect was stripped and
    # the user set nothing).
    env = strip_runpod_volume_env(build_worker_env(spec, seed, runtime_secrets=runtime_secrets))
    if cache_host_mount and not env.get("HF_HOME"):
        env["HF_HOME"] = CACHE_HF_HOME
    payload = {
        "hf_repo": spec.train.hf_repo,
        "job_spec_json": spec.to_json(),
        "phase": spec.phase,
        "seed": int(seed),
        "flash_arm": arm,
        "env": env,
        # The bootstrap pip-installs extra_pip for every job, so the per-run env wheel + the opt-in
        # chalk spec ride along here to reach default runs (mirrors runpod/jobs.submit_run).
        "extra_pip": (list(spec.environment.pip) or worker_pip_for_env(spec.environment.id))
        + chalk_extra_pip(spec),
        "hf_prefix": f"{spec.phase}/{spec.run_id}/seed{seed}",
        "max_wall_s": max(60, int(spec.gpu.max_wall_seconds)),
        "attempt": int(attempt),
    }
    if cache_host_mount:
        payload["cache_host_mount"] = cache_host_mount
        # Carry the mount sentinel filename so the bootstrap's mount-check reads it from ONE source of
        # truth (this constant) instead of re-hardcoding the literal — BOTH cloud-init preambles
        # (_cache_block_device_setup for block volumes, _cache_nfs_mount_check for NFS) write the same
        # CACHE_MOUNT_MARKER onto a verified-real mount, so the in-container preload check can tell a
        # genuine mount from an empty Docker bind regardless of substrate.
        payload["cache_mount_marker"] = CACHE_MOUNT_MARKER
        if cache_block_device:
            payload["cache_block_device"] = True
            # The block-device preamble matches the attached volume by its EXACT provisioned size, so
            # carry the runner-assigned size (falls back to the default cache size). Parse tolerantly
            # via _volume_gb so a non-int / stale spec value ("0", "", "abc", bool) can't crash the
            # instance bootstrap on this best-effort device-matching hint — it defaults instead.
            from flash.runner import WEIGHT_CACHE_VOLUME_GB
            from flash.spec import _volume_gb

            payload["cache_size_gb"] = _volume_gb(
                getattr(spec.gpu, "network_volume_gb", None), default=WEIGHT_CACHE_VOLUME_GB
            )
    # Preload (warm) mode: the bootstrap downloads ``models`` into the mounted cache and exits — no
    # code fetch, no worker. Only meaningful with a cache attached (else there's nothing to warm).
    if mode:
        payload["mode"] = mode
        payload["models"] = list(models or [])
    return payload
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# Host helper: best-effort upload of the consolidated boot log to HF. Neither Lambda nor Hyperstack
# exposes an instance console/log API, so the box pushes its own boot log to HF — the only window
# into a failure BEFORE the worker container can write its own artifacts (docker/GPU not ready,
# image pull failure). Reads creds from the on-box payload.json. Never raises.
# NOTE(review): the indentation inside this literal was reconstructed from the script's syntax
# (the scraped text had none) — confirm the exact whitespace against the released wheel.
_HOSTLOG_PY = """\
import json
try:
    p = json.load(open("/opt/flash/payload.json"))
    from huggingface_hub import HfApi
    HfApi(token=(p.get("env") or {}).get("HF_TOKEN")).upload_file(
        path_or_fileobj="/opt/flash/host_boot.log",
        path_in_repo=p["hf_prefix"] + "/" + p.get("flash_arm", "instance") + "_boot.log",
        repo_id=p["hf_repo"],
        repo_type="dataset",
    )
except Exception:
    pass
"""
|
|
246
|
+
|
|
247
|
+
# Host helper: write the attempt-failure marker (<arm>_attempt<N>.json, ok=false, RETRIABLE) to HF
# when the box can't even start the worker container (docker/GPU never ready, image pull failure).
# Without it a pre-container failure leaves NO marker, so the poller would burn the whole setup
# grace (~50 min) before reporting a generic stall; this surfaces a fast, RETRYABLE failure so the
# runner re-provisions on a fresh host immediately. Reads creds from the on-box payload.json.
#
# CRITICAL: the worker OWNS this marker path. A container that starts but fast-fails on a real,
# non-retriable user/config error can exit before the host's ~5s liveness check, having ALREADY
# uploaded its own ok=false marker (the TRUE error) here. The host must NOT overwrite it with a
# RETRIABLE host marker — that would relabel a genuine user error as job_preempted and silently
# retry / hide the root cause. So this writes the host marker ONLY when no worker attempt marker
# yet exists at the path (i.e. the container never got far enough to write one). The check is
# best-effort: on a read error it stays conservative and SKIPS the write (never clobbers).
# NOTE(review): the indentation inside this literal was reconstructed from the script's syntax and
# the contract above (the scraped text had none) — confirm against the released wheel.
_FAILMARK_PY = """\
import json, sys
try:
    p = json.load(open("/opt/flash/payload.json"))
    arm = p.get("flash_arm", "instance"); att = int(p.get("attempt") or 0)
    reason = sys.argv[1] if len(sys.argv) > 1 else "host boot failure"
    marker_path = p["hf_prefix"] + "/" + arm + "_attempt" + str(att) + ".json"
    from huggingface_hub import HfApi
    api = HfApi(token=(p.get("env") or {}).get("HF_TOKEN"))
    try:
        worker_marker_exists = api.file_exists(repo_id=p["hf_repo"], filename=marker_path, repo_type="dataset")
    except Exception:
        worker_marker_exists = True # conservative: on a read error, never risk clobbering
    if not worker_marker_exists:
        open("/opt/flash/fm.json", "w").write(json.dumps({"ok": False, "attempt": att, "retriable": True, "error": "host: " + reason}))
        api.upload_file(
            path_or_fileobj="/opt/flash/fm.json",
            path_in_repo=marker_path,
            repo_id=p["hf_repo"], repo_type="dataset",
        )
except Exception:
    pass
"""
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def _spill_large_spec_to_hf(payload: dict) -> dict:
    """Move an oversized ``job_spec_json`` out of the payload and into the run's HF dataset repo.

    The payload is later embedded inline in the cloud-init user_data, where a big spec would push
    the script past typical provider/cloud-init size caps and get the launch rejected before any
    handle is persisted. When the spec string is longer than ``_SPEC_SPILL_THRESHOLD`` it is
    uploaded to ``<hf_prefix>/job_spec.json`` in the ``hf_repo`` dataset and the returned payload
    carries an empty ``job_spec_json`` plus a ``job_spec_in_hf`` flag instead. Specs at or below
    the threshold (the common case) are left inline and no HF call is made.

    Returns the payload to embed: the original object when nothing was spilled, otherwise a
    shallow copy with the two keys replaced (the caller's dict is never mutated).
    """
    inline_spec = payload.get("job_spec_json") or ""
    if len(inline_spec) <= _SPEC_SPILL_THRESHOLD:
        return payload

    # Imported lazily so the small-spec path never needs huggingface_hub.
    from huggingface_hub import HfApi

    api = HfApi(token=(payload.get("env") or {}).get("HF_TOKEN"))
    # Hand upload_file a file-like object rather than raw bytes: ``bytes`` is also a legal path
    # type, so BytesIO removes any chance of the content being treated as a filesystem path.
    spec_stream = io.BytesIO(inline_spec.encode("utf-8"))
    api.upload_file(
        path_or_fileobj=spec_stream,
        path_in_repo=f"{payload['hf_prefix']}/job_spec.json",
        repo_id=payload["hf_repo"],
        repo_type="dataset",
    )
    return {**payload, "job_spec_json": "", "job_spec_in_hf": True}
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def build_user_data(payload: dict, *, image: str) -> str:
    """Cloud-init ``user_data``: run the worker ``image`` via Docker on the host.

    cloud-init runs this once at first boot as root. Everything dynamic travels base64-encoded
    inside the script (never interpolated into shell syntax), so the job-spec JSON survives
    byte-exact. The full training stack is baked into the image, so the box only needs Docker + an
    NVIDIA GPU — both shipped by the providers' default Docker-capable images — and the container
    does the rest (fetch code from HF, run the worker, stream artifacts back to HF).

    A large job spec is spilled to HF first (see ``_spill_large_spec_to_hf``) so it never inflates
    user_data past the provider's size cap.

    Secrets-wise the script carries the same content as the worker env on RunPod (HF token, env
    secrets). The operator's provider API key is NEVER shipped (teardown is control-plane-side via
    the runner ``finally`` / poll deadline / ``sweep_orphans``).

    The two values that DO land in shell syntax — ``image`` and the optional ``cache_host_mount``
    path — are escaped with ``shlex.quote`` (``repr`` is not a shell-quoting function: it switches
    to double quotes when the value contains ``'`` and doubles backslashes).

    Args:
        payload: Job payload dict; embedded base64-encoded and written to
            ``/opt/flash/payload.json`` on the box. Reads ``flash_arm`` and ``cache_host_mount``.
        image: Worker container image reference passed to ``docker pull`` / ``docker run``.

    Returns:
        The complete bash script to hand to the provider as user_data.
    """
    import shlex  # function-scope: only needed here, to escape values placed into shell syntax

    payload = _spill_large_spec_to_hf(payload)
    # encodebytes wraps its output and always ends it with a newline, which is what puts the
    # heredoc terminator below on a line of its own.
    payload_b64 = base64.encodebytes(json.dumps(payload).encode()).decode()
    # Decode explicitly as UTF-8 (Python's source encoding) instead of the locale default, so
    # non-ASCII characters in the bootstrap source survive on any control-plane host.
    bootstrap_src = (Path(__file__).parent / "_instance_bootstrap.py").read_text(encoding="utf-8")
    # The heredoc terminator is appended directly after the source; without a final newline it
    # would be glued onto the last source line and the heredoc would never terminate.
    if not bootstrap_src.endswith("\n"):
        bootstrap_src += "\n"
    # Weight cache: the provider mounts its region-scoped persistent storage on the HOST at
    # ``cache_host_mount`` (Lambda auto-mounts its NFS filesystem there; Hyperstack's preamble below
    # formats+mounts the attached block device there). Bind it into the worker container at the FIXED
    # ``/weight-cache`` so the worker's HF_HOME=/weight-cache/hf-cache (set in build_payload) persists
    # the model download across runs in this region. Absent -> no bind (cold run).
    cache_host_mount = payload.get("cache_host_mount")
    # Shell-escape the host path in the docker -v (defensive; the path is a controlled constant).
    cache_bind = (
        f"-v {shlex.quote(str(cache_host_mount))}:{CACHE_CONTAINER_MOUNT} \\\n  " if cache_host_mount else ""
    )
    cache_setup = _cache_block_device_setup(payload) + _cache_nfs_mount_check(payload)
    image_quoted = shlex.quote(image)
    return f"""#!/bin/bash
# Flash instance worker (generated by flash.providers._instance.build_user_data; arm={payload.get('flash_arm')})
set -x
mkdir -p /opt/flash
# Consolidate ALL boot output (this script + the container) into one host log the uploader ships
# to HF — neither substrate has a console API, so this is the only window into a pre-worker failure.
exec >>/opt/flash/host_boot.log 2>&1
cat > /opt/flash/payload.b64 <<'FLASH_PAYLOAD_EOF'
{payload_b64}FLASH_PAYLOAD_EOF
base64 -d /opt/flash/payload.b64 > /opt/flash/payload.json
cat > /opt/flash/bootstrap.py <<'FLASH_BOOTSTRAP_EOF'
{bootstrap_src}FLASH_BOOTSTRAP_EOF
cat > /opt/flash/hostlog.py <<'FLASH_HOSTLOG_EOF'
{_HOSTLOG_PY}FLASH_HOSTLOG_EOF
cat > /opt/flash/failmark.py <<'FLASH_FAILMARK_EOF'
{_FAILMARK_PY}FLASH_FAILMARK_EOF
IMAGE={image_quoted}
# huggingface_hub on the host for the boot-log + failure-marker uploaders (best-effort).
pip3 install -q huggingface_hub >/dev/null 2>&1 \\
  || python3 -m pip install -q --break-system-packages huggingface_hub >/dev/null 2>&1 || true
fail() {{ echo "FLASH: $1" >&2; python3 /opt/flash/failmark.py "$1" >/dev/null 2>&1 || true; exit 1; }}
# The provider's default image ships Docker + the NVIDIA Container Toolkit, but cloud-init can run
# before they finish initializing — wait for both (up to ~10 min) before launching the worker.
for i in $(seq 1 100); do
  if docker info >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then break; fi
  echo "FLASH: waiting for docker+gpu ($i)"; sleep 6
done
docker info >/dev/null 2>&1 || fail "docker never became ready"
nvidia-smi >/dev/null 2>&1 || fail "gpu never became ready"
{cache_setup}
# Pull with retries (the image is large; a transient registry blip must not fail the run). On total
# failure, write a RETRYABLE marker and exit NOW instead of leaving a billed box idling the whole
# setup grace with no DONE/marker.
PULLED=0
for i in 1 2 3 4 5; do docker pull "$IMAGE" && {{ PULLED=1; break; }}; echo "FLASH: pull retry $i"; sleep 20; done
[ "$PULLED" -eq 1 ] || fail "worker image pull failed after retries"
# Run the worker container detached so cloud-init completes promptly; completion is signaled via the
# worker's HF artifacts (DONE/metrics.json/marker), never a return channel from the box.
docker run -d --name flashrun --gpus all --shm-size=16g --network host \\
  -v /opt/flash:/root/flash {cache_bind}-w /root/flash \\
  "$IMAGE" python /root/flash/bootstrap.py || fail "docker run failed"
sleep 5
# The container must be running OR have already exited CLEANLY. The bootstrap returns 0 ONLY on
# genuine success (it confirms metrics.json and uploads its ok-marker first) — so an exit code of 0
# is itself the success signal (e.g. an already-complete retry that finished in <5s), and the host
# must NOT write any marker for it: the worker OWNS the attempt marker, and writing to that path here
# would clobber its ok-marker (HF listing can lag the worker's just-finished upload). A NON-zero
# exit reaches fail(), but its failmark uploader is itself marker-aware: a container that started
# and then fast-failed on a real user/config error has ALREADY written its own ok=false marker here,
# and the host failmark SKIPS the write when that marker exists (so a genuine user error is never
# relabeled retriable/job_preempted). Only a never-started container — no worker marker — gets the
# retriable host failmark.
if ! docker ps --filter name=flashrun --filter status=running -q | grep -q .; then
  EXIT="$(docker inspect -f '{{{{.State.ExitCode}}}}' flashrun 2>/dev/null || echo 1)"
  docker logs flashrun >>/opt/flash/host_boot.log 2>&1 || true
  [ "$EXIT" = "0" ] || fail "worker container did not start (exit ${{EXIT}})"
fi
# Mirror the container's stdout into the host boot log (detached) so an early in-container crash is
# visible on HF even if it dies before uploading its own console artifact.
( docker logs -f flashrun >>/opt/flash/host_boot.log 2>&1 || true ) &
disown || true
# Host->HF boot-log uploader: THROTTLED to 120s and STOPPED once the container exits (bounded ~30
# min). The worker itself uploads rate-limited heartbeats/console once running, so a 30s diagnostic
# loop for the whole run would risk Hugging Face's per-repo hourly commit cap and starve the
# required metrics/DONE commits.
( for i in $(seq 1 15); do
    python3 /opt/flash/hostlog.py >/dev/null 2>&1 || true
    docker ps --filter name=flashrun --filter status=running -q | grep -q . || break
    sleep 120
  done ) &
disown || true
"""
|