freesolo-flash-dev 0.2.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. flash/__init__.py +29 -0
  2. flash/_channel.py +23 -0
  3. flash/_fileio.py +35 -0
  4. flash/_logging.py +49 -0
  5. flash/_update_check.py +266 -0
  6. flash/catalog.py +253 -0
  7. flash/cli/__init__.py +1 -0
  8. flash/cli/main/__init__.py +227 -0
  9. flash/cli/main/__main__.py +6 -0
  10. flash/cli/main/commands.py +636 -0
  11. flash/cli/main/envpush.py +317 -0
  12. flash/cli/main/render.py +599 -0
  13. flash/cli/main/training_doc.py +455 -0
  14. flash/client/__init__.py +14 -0
  15. flash/client/config.py +70 -0
  16. flash/client/http.py +372 -0
  17. flash/client/runtime_secrets.py +69 -0
  18. flash/client/specs.py +20 -0
  19. flash/cost/__init__.py +16 -0
  20. flash/cost/analytical.py +175 -0
  21. flash/cost/facts.py +114 -0
  22. flash/cost/spec.py +113 -0
  23. flash/cost/types.py +158 -0
  24. flash/engine/__init__.py +6 -0
  25. flash/engine/accounting.py +36 -0
  26. flash/engine/chalk_kernels.py +116 -0
  27. flash/engine/multiturn_rollout.py +780 -0
  28. flash/engine/recipe.py +86 -0
  29. flash/engine/vram.py +603 -0
  30. flash/engine/worker/__init__.py +2916 -0
  31. flash/engine/worker/__main__.py +4 -0
  32. flash/engine/worker/kernel_warmup.py +400 -0
  33. flash/engine/worker/lora.py +796 -0
  34. flash/engine/worker/packing.py +366 -0
  35. flash/engine/worker/perf.py +1048 -0
  36. flash/envs/__init__.py +10 -0
  37. flash/envs/adapter/__init__.py +883 -0
  38. flash/envs/adapter/rubric.py +222 -0
  39. flash/envs/base.py +52 -0
  40. flash/envs/registry.py +62 -0
  41. flash/mcp/__init__.py +1 -0
  42. flash/mcp/server.py +85 -0
  43. flash/providers/__init__.py +59 -0
  44. flash/providers/_auth.py +24 -0
  45. flash/providers/_http.py +230 -0
  46. flash/providers/_instance.py +416 -0
  47. flash/providers/_instance_bootstrap.py +517 -0
  48. flash/providers/_poll.py +311 -0
  49. flash/providers/allocator.py +193 -0
  50. flash/providers/base.py +431 -0
  51. flash/providers/hyperstack/__init__.py +127 -0
  52. flash/providers/hyperstack/api.py +522 -0
  53. flash/providers/hyperstack/auth.py +17 -0
  54. flash/providers/hyperstack/gpus.py +29 -0
  55. flash/providers/hyperstack/jobs/__init__.py +632 -0
  56. flash/providers/hyperstack/jobs/builders.py +122 -0
  57. flash/providers/hyperstack/preflight.py +23 -0
  58. flash/providers/hyperstack/pricing.py +26 -0
  59. flash/providers/hyperstack/train.py +25 -0
  60. flash/providers/lambdalabs/__init__.py +139 -0
  61. flash/providers/lambdalabs/api.py +261 -0
  62. flash/providers/lambdalabs/auth.py +18 -0
  63. flash/providers/lambdalabs/gpus.py +29 -0
  64. flash/providers/lambdalabs/jobs/__init__.py +724 -0
  65. flash/providers/lambdalabs/jobs/builders.py +118 -0
  66. flash/providers/lambdalabs/preflight.py +27 -0
  67. flash/providers/lambdalabs/pricing.py +51 -0
  68. flash/providers/lambdalabs/train.py +27 -0
  69. flash/providers/preflight.py +55 -0
  70. flash/providers/realized.py +80 -0
  71. flash/providers/runpod/__init__.py +130 -0
  72. flash/providers/runpod/api.py +186 -0
  73. flash/providers/runpod/auth.py +37 -0
  74. flash/providers/runpod/cost.py +57 -0
  75. flash/providers/runpod/gpus.py +46 -0
  76. flash/providers/runpod/jobs.py +956 -0
  77. flash/providers/runpod/keys.py +139 -0
  78. flash/providers/runpod/preflight.py +30 -0
  79. flash/providers/runpod/preload.py +915 -0
  80. flash/providers/runpod/pricing.py +18 -0
  81. flash/providers/runpod/slots.py +79 -0
  82. flash/providers/runpod/train/__init__.py +150 -0
  83. flash/providers/runpod/train/deps.py +395 -0
  84. flash/providers/runpod/train/endpoints.py +820 -0
  85. flash/py.typed +0 -0
  86. flash/runner/__init__.py +686 -0
  87. flash/runner/checkpoints.py +82 -0
  88. flash/runner/deploy.py +422 -0
  89. flash/runner/lifecycle.py +672 -0
  90. flash/schema/__init__.py +375 -0
  91. flash/schema/fields.py +331 -0
  92. flash/serve/__init__.py +1 -0
  93. flash/serve/deploy.py +326 -0
  94. flash/serve/pricing.py +60 -0
  95. flash/server/__init__.py +1 -0
  96. flash/server/__main__.py +20 -0
  97. flash/server/app.py +961 -0
  98. flash/server/auth.py +263 -0
  99. flash/server/billing.py +124 -0
  100. flash/server/checkpoints.py +110 -0
  101. flash/server/db.py +160 -0
  102. flash/server/environment_registry.py +102 -0
  103. flash/server/envs.py +360 -0
  104. flash/server/reconcile.py +163 -0
  105. flash/server/run_registry.py +150 -0
  106. flash/spec.py +333 -0
  107. freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
  108. freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
  109. freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
  110. freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
  111. freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,230 @@
1
+ """Shared stdlib REST client for provider API modules.
2
+
3
+ Provider API clients use the same hardened-retry shape: a Bearer/Content-Type urllib
4
+ request, a jittered exponential backoff that retries 5xx/429 and fast-fails other 4xx
5
+ with the response body as the actionable detail, and a "failed after N attempts" raise.
6
+ This module factors that common core out so the backoff math lives in one place.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import contextlib
12
+ import json
13
+ import os
14
+ import random
15
+ import re
16
+ import time
17
+ import urllib.error
18
+ import urllib.request
19
+ from collections.abc import Callable
20
+ from typing import Any
21
+
22
+ # An unambiguous ``HTTP 404`` token: ``http 404`` bounded so a longer status-LIKE number can't
23
+ # match. ``\b`` after ``404`` rejects ``HTTP 4040``/``HTTP 4041`` (digit immediately after), while
24
+ # still matching ``HTTP 404:``, ``HTTP 404 Not Found``, and a trailing ``HTTP 404`` at end-of-string.
25
+ _HTTP_404_RE = re.compile(r"\bhttp 404\b")
26
+ # Same shape for ``HTTP 409`` (conflict) — bounded so ``HTTP 4090``/``4091`` (and a ``4090`` GPU
27
+ # name) can't match: only a genuine ``HTTP 409`` token counts in the no-cause text fallback.
28
+ _HTTP_409_RE = re.compile(r"\bhttp 409\b")
29
+
30
+
31
+ def _status_matches(err: Exception, code: int, token_re: re.Pattern[str]) -> bool:
32
+ """True when ``err`` represents the given HTTP ``code``, keyed off the chained status when present.
33
+
34
+ ``request_with_retries`` chains the original urllib ``HTTPError`` as ``__cause__`` for every
35
+ fast-failed 4xx (and on the "failed after N attempts" path), so the status CODE is authoritative
36
+ when a cause is present — anything else is a real failure that must NOT be swallowed. We only
37
+ fall back to a text match when there is no HTTPError cause, and even then only on an unambiguous
38
+ ``HTTP <code>`` TOKEN (``token_re``) — NEVER a bare substring, and never a longer number that
39
+ just begins with ``code``: the token's trailing ``\\b`` rejects ``HTTP <code>0``/``<code>1`` (and
40
+ a ``4090`` GPU name), so a transient error whose text embeds such an id is not misread."""
41
+ cause = getattr(err, "__cause__", None)
42
+ if isinstance(cause, urllib.error.HTTPError):
43
+ return cause.code == code
44
+ return bool(token_re.search(str(err).lower()))
45
+
46
+
47
def is_not_found(err: Exception) -> bool:
    """Report whether a provider API error is a genuine HTTP 404 (the resource is already gone).

    Delegates to ``_status_matches``: a chained ``HTTPError`` status code decides when present,
    and the no-cause fallback only accepts the bounded ``HTTP 404`` token -- a bare ``"404"``
    substring never counts. Mirrors ``runpod.api._is_not_found``.
    """
    return _status_matches(err, code=404, token_re=_HTTP_404_RE)
54
+
55
+
56
def is_conflict(err: Exception) -> bool:
    """Report whether a provider API error is a genuine HTTP 409 (conflict).

    Uses the same status-code-first logic as ``is_not_found`` (see ``_status_matches``): a real
    409 carries a chained ``HTTPError`` with ``.code == 409``. A non-409 failure whose message
    merely contains ``"409"`` (e.g. a ``4090`` GPU name) alongside the word "conflict" is NOT a
    conflict and must still surface. Hyperstack ``delete_vm`` relies on this to treat an
    in-flight-teardown 409 as success.
    """
    return _status_matches(err, code=409, token_re=_HTTP_409_RE)
64
+
65
+
66
class RestClient:
    """urllib-based REST client whose calls are retried with jittered exponential backoff.

    Every call targets ``base_url + target``. The API key is looked up in the ``env_var``
    environment variable on each request (env-only by design -- never persisted), and failures
    are raised as ``error_cls``.

    ``keys_provider`` is optional and returns an *ordered* list of API keys to try per call. Each
    key gets the complete backoff loop; when a key ends in an error that ``failover_predicate``
    classifies as failover-worthy, the next key is tried -- for providers whose key is really a
    multi-account pool (see ``runpod.keys``). Without a ``keys_provider`` the single ``env_var``
    key is used and the client behaves exactly like a single-key client.
    """

    def __init__(
        self,
        *,
        env_var: str,
        error_cls: type[Exception],
        base_url: str = "",
        missing_key_message: str | None = None,
        keys_provider: Callable[[], list[str]] | None = None,
        failover_predicate: Callable[[Exception], bool] | None = None,
        extra_headers: dict[str, str] | None = None,
        auth_header_name: str = "Authorization",
        auth_value_format: str = "Bearer {key}",
    ) -> None:
        self.env_var = env_var
        self.error_cls = error_cls
        self.base_url = base_url
        if missing_key_message:
            self.missing_key_message = missing_key_message
        else:
            self.missing_key_message = f"{env_var} not configured on the control-plane host"
        self.keys_provider = keys_provider
        self.failover_predicate = failover_predicate
        # Static headers sent with EVERY request (e.g. a custom User-Agent). Lambda Cloud is
        # fronted by Cloudflare, which 403s the stdlib default ``Python-urllib/<v>`` UA, so the
        # Lambda client supplies a real UA here. ``request`` always sets the auth and
        # ``Content-Type`` headers itself, and those win on a key collision.
        self.extra_headers = dict(extra_headers) if extra_headers else {}
        # How the key is presented. The default is RunPod/Lambda's ``Authorization: Bearer <key>``;
        # Hyperstack sends a bare ``api_key: <key>`` header instead
        # (``auth_header_name="api_key"``, ``auth_value_format="{key}"``).
        self.auth_header_name = auth_header_name
        self.auth_value_format = auth_value_format

    def api_key(self) -> str:
        """Return the key from ``env_var``; raise ``error_cls`` when it is unset or empty."""
        value = os.environ.get(self.env_var)
        if value:
            return value
        raise self.error_cls(self.missing_key_message)

    def _ordered_keys(self) -> list[str]:
        """The keys to try, in order. Single-key clients yield exactly the env key."""
        provider = self.keys_provider
        if provider is None:
            return [self.api_key()]
        pool = provider()
        if pool:
            return pool
        raise self.error_cls(self.missing_key_message)

    def request(
        self,
        target: str,
        method: str = "GET",
        body: dict | None = None,
        timeout: float = 30.0,
        key: str | None = None,
    ) -> Any:
        """Perform ONE HTTP call (no retries) and return the decoded JSON response.

        ``body`` is JSON-encoded when given. ``key`` overrides the env key for this call. An empty
        response body decodes to ``{}``. urllib errors propagate unchanged to the caller.
        """
        encoded = None if body is None else json.dumps(body).encode()
        # Extra headers first, so the auth and Content-Type entries below overwrite collisions.
        headers = dict(self.extra_headers)
        headers[self.auth_header_name] = self.auth_value_format.format(key=key or self.api_key())
        headers["Content-Type"] = "application/json"
        req = urllib.request.Request(
            f"{self.base_url}{target}",
            data=encoded,
            headers=headers,
            method=method,
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
        if not raw:
            return {}
        return json.loads(raw)

    @staticmethod
    def _read_error_detail(http_err: urllib.error.HTTPError) -> str:
        """Best-effort: the first 500 decoded chars of an error response body, stripped."""
        try:
            return http_err.read().decode("utf-8", "replace")[:500].strip()
        except Exception:
            return ""

    def _request_one_key(
        self,
        key: str,
        target: str,
        method: str,
        body: dict | None,
        retries: int,
        base_delay: float,
    ) -> Any:
        """One key's full backoff loop (the original single-key behavior)."""
        last: Exception | None = None
        for attempt in range(retries + 1):
            try:
                return self.request(target, method=method, body=body, key=key)
            except urllib.error.HTTPError as e:
                retryable = e.code >= 500 or e.code == 429
                if not retryable:
                    # The response body usually holds the actionable detail; e.reason alone
                    # (e.g. "Bad Request") is rarely enough to debug a 4xx.
                    detail = self._read_error_detail(e)
                    suffix = f": {detail}" if detail else ""
                    raise self.error_cls(
                        f"{method} {target} -> HTTP {e.code}: {e.reason}{suffix}"
                    ) from e
                last = e
            except (urllib.error.URLError, TimeoutError, ConnectionError, OSError) as e:
                last = e
            if attempt < retries:
                # Exponential growth capped at 2**6 and at 30s, then jittered by +/-30%.
                growth = 2 ** min(attempt, 6)
                delay = min(base_delay * growth, 30.0)
                time.sleep(delay * random.uniform(0.7, 1.3))
        # Chain the last exception so callers can inspect it: ``_is_not_found`` keys off the
        # HTTPError code, and the multi-key waterfall's ``failover_predicate`` needs it to see a
        # persistent 429's status (without the cause it cannot tell the account is rate/quota
        # limited and would stop instead of trying the next account).
        raise self.error_cls(
            f"{method} {target} failed after {retries + 1} attempts: {last}"
        ) from last

    def request_with_retries_for_key(
        self,
        key: str,
        target: str,
        method: str = "GET",
        body: dict | None = None,
        retries: int = 4,
        base_delay: float = 2.0,
    ) -> Any:
        """Like ``request_with_retries`` but pinned to the supplied key, bypassing the pool.

        Use it to query each account in the pool independently (e.g. list_endpoints aggregation)
        instead of stopping at the first success.
        """
        return self._request_one_key(key, target, method, body, retries, base_delay)

    def request_with_retries(
        self,
        target: str,
        method: str = "GET",
        body: dict | None = None,
        retries: int = 4,
        base_delay: float = 2.0,
    ) -> Any:
        """REST call hardened against transient network/5xx blips (jittered backoff).

        With a multi-key ``keys_provider``, a key that fails with a failover-class error hands
        off to the next key in the pool; a hard, key-agnostic error (or a failure on the last
        key) is raised. Single-key clients try exactly one key.
        """
        keys = self._ordered_keys()
        final_index = len(keys) - 1
        last_exc: Exception | None = None
        for index, key in enumerate(keys):
            try:
                return self._request_one_key(key, target, method, body, retries, base_delay)
            except self.error_cls as exc:
                last_exc = exc
                should_fail_over = (
                    index < final_index
                    and self.failover_predicate is not None
                    and self.failover_predicate(exc)
                )
                if not should_fail_over:
                    raise
        # Only reachable if ``keys`` is empty, which _ordered_keys already guards against.
        raise last_exc or self.error_cls(self.missing_key_message)
@@ -0,0 +1,416 @@
1
+ """Shared building blocks for the instance-based providers (Lambda, Hyperstack).
2
+
3
+ Both rent a single-GPU instance and bootstrap it identically: ship a cloud-init ``user_data`` that
4
+ runs the prebuilt ``WORKER_IMAGE`` via Docker on the host, detect completion from the worker's HF
5
+ artifacts, and guarantee teardown control-plane-side. The per-provider packages differ only in the
6
+ REST API (launch/list/terminate) and the capacity model; everything below — the run-derived
7
+ sweep-matchable label, the bootstrap payload, and the cloud-init script — is identical, so it lives
8
+ here (single source of truth, parameterized by the substrate ``arm`` and the run's image).
9
+
10
+ The shipped bootstrap is the sibling ``_instance_bootstrap.py``; ``arm`` (e.g. ``lambda`` /
11
+ ``hyperstack``) travels in ``payload["flash_arm"]`` and decides FLASH_ARM + the ``<arm>_attempt<N>``
12
+ marker name.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import base64
18
+ import hashlib
19
+ import io
20
+ import json
21
+ from pathlib import Path
22
+
23
+ # Lambda/Hyperstack cap an instance/VM ``name`` at 64 chars. We keep the label at or under this so
24
+ # the name is NEVER silently truncated at launch — truncation would desync the stored name from the
25
+ # ``run_label_prefix`` the orphan-sweep matches on, which could fail to protect (or wrongly reap) a
26
+ # live run. The seed/attempt suffix ``-s{seed}-a{attempt}`` is bounded (<=12 chars), so the prefix
27
+ # is bounded to leave room for it.
28
+ _MAX_NAME = 60
29
+ _PREFIX_BUDGET = _MAX_NAME - 12
30
+
31
+ # Above this many chars, the serialized job spec is spilled OUT of the inline cloud-init user_data
32
+ # (uploaded to HF; the bootstrap fetches it) so a large inline spec can't overflow the provider's
33
+ # user_data size limit and get the launch rejected before a handle is persisted. Below it the spec
34
+ # rides inline (the common, tiny-spec case) so launch needs no extra HF round-trip. The cap is well
35
+ # under the bootstrap's own 96_000-char execve threshold, and the base64 + heredoc framing inflates
36
+ # user_data ~1.4x, so the spilled ceiling keeps a typical run's user_data comfortably small.
37
+ _SPEC_SPILL_THRESHOLD = 16_000
38
+
39
+
40
def run_label_prefix(run_id: str) -> str:
    """Return the prefix that EVERY instance label for ``run_id`` begins with, within the budget.

    Platform run ids already begin with ``flash-``; any other id (direct-API callers, tests) has
    that prefix forced on. An id too long for the provider name cap is shortened
    DETERMINISTICALLY by truncating it and appending a stable 8-char SHA-1 suffix, so launch and
    ``sweep_orphans`` derive the IDENTICAL bounded prefix, and two distinct long run ids do not
    collapse onto the same name (which could otherwise reap the wrong live instance). Short ids
    -- the common case -- are returned unchanged.
    """
    label = run_id
    if not label.startswith("flash-"):
        label = f"flash-{label}"
    if len(label) > _PREFIX_BUDGET:
        digest = hashlib.sha1(label.encode()).hexdigest()
        # Reserve 9 chars of the budget: one for the dash plus eight hash characters.
        keep = _PREFIX_BUDGET - 9
        label = f"{label[:keep]}-{digest[:8]}"
    return label
54
+
55
+
56
def instance_label(run_id: str, seed: int, attempt: int) -> str:
    """Build the instance name ``<run prefix>-s<seed>-a<attempt>``.

    The name is derived from the run so ``sweep_orphans`` can tell our instances apart from
    anything else on the account, and the prefix is bounded by ``run_label_prefix`` so the
    provider never truncates it.
    """
    prefix = run_label_prefix(run_id)
    return "-".join((prefix, f"s{seed}", f"a{attempt}"))
60
+
61
+
62
+ # The worker container path the per-region cache is bind-mounted at, and the HF cache under it. The
63
+ # host mount differs per provider (Lambda NFS /lambda/nfs/<name>; Hyperstack block /mnt/flash-weights)
64
+ # but the CONTAINER path is fixed, so HF_HOME is uniform regardless of substrate.
65
+ CACHE_CONTAINER_MOUNT = "/weight-cache"
66
+ CACHE_HF_HOME = f"{CACHE_CONTAINER_MOUNT}/hf-cache"
67
+ # Sentinel file written onto a SUCCESSFULLY-mounted block-volume cache (by the cloud-init preamble),
68
+ # so the in-container preload mount-check can tell a real mount from an empty Docker bind (a failed
69
+ # attach). Lives on the device itself -> absent when the volume isn't actually mounted.
70
+ CACHE_MOUNT_MARKER = ".flash-cache-mounted"
71
+
72
+
73
def _cache_block_device_setup(payload: dict) -> str:
    """Cloud-init preamble (block-volume providers, e.g. Hyperstack): wait for the attached volume's
    block device, format it ONCE if it has no filesystem (NEVER reformat a populated cache — guarded
    by ``blkid``), and mount it at the host ``cache_host_mount``. No-op for NFS providers (Lambda
    auto-mounts) and for cold runs. Best-effort: if the device never appears / mount fails, the bind
    falls back to an empty dir (a correct cold run), never a hard failure.

    Args:
        payload: Bootstrap payload. Reads ``cache_block_device``, ``cache_host_mount`` and
            ``cache_size_gb``.

    Returns:
        ``""`` when ``cache_block_device`` or ``cache_host_mount`` is missing/falsy; otherwise a
        shell snippet that polls up to 60 times (5s apart) for a size-matched, unmounted disk,
        formats it only if ``blkid`` finds no filesystem, mounts it, and touches the
        ``CACHE_MOUNT_MARKER`` sentinel on the mounted volume.
    """
    if not payload.get("cache_block_device") or not payload.get("cache_host_mount"):
        return ""
    mount = payload["cache_host_mount"]
    # The attached cache volume is provisioned at an EXACT known size, so pick the candidate disk by
    # size (±20%) AND require that neither it nor any of its partitions is mounted. That excludes the
    # boot disk (its partition is mounted at /) and any differently-sized ephemeral/local NVMe — so we
    # never mkfs the wrong device. A warm cache disk (already ext4, unmounted) still matches, and the
    # blkid guard keeps its data. If nothing matches, run cold (format nothing).
    # Size is computed in decimal GB (1000**3 bytes). A missing/zero ``cache_size_gb`` yields 0,
    # and the awk filter's ``e>0`` guard then selects no disk at all (cold run).
    expect_bytes = int(payload.get("cache_size_gb") or 0) * 1000 * 1000 * 1000
    marker = CACHE_MOUNT_MARKER
    # NOTE: ``mount`` is interpolated inside single quotes in the script below, so it must not
    # itself contain a single quote. Doubled braces are literal awk braces (f-string escaping).
    return f"""
# --- weight-cache block volume: wait-for-device (size-matched, unmounted), format-if-new, mount ---
echo "FLASH: waiting for the attached cache block device (~{payload.get('cache_size_gb')}GB)..."
EXPECT_BYTES={expect_bytes}
CACHE_DEV=""
for i in $(seq 1 60); do
for d in $(lsblk -dpbn -o NAME,TYPE,SIZE | awk -v e="$EXPECT_BYTES" \
'$2=="disk" && e>0 {{lo=e*0.8; hi=e*1.2; if ($3+0>=lo && $3+0<=hi) print $1}}'); do
# Skip any disk with a mounted partition (boot/data disks in use) — only a free disk is ours.
if lsblk -pnr -o MOUNTPOINT "$d" | grep -q '[^[:space:]]'; then continue; fi
CACHE_DEV="$d"; break
done
[ -n "$CACHE_DEV" ] && break
sleep 5
done
if [ -n "$CACHE_DEV" ]; then
echo "FLASH: cache device $CACHE_DEV"
blkid "$CACHE_DEV" >/dev/null 2>&1 || mkfs.ext4 -q "$CACHE_DEV" || true # format ONCE; never reformat a populated cache
mkdir -p '{mount}'
if mount "$CACHE_DEV" '{mount}' 2>/dev/null; then
# Sentinel written ONTO the mounted block device (not the underlying empty dir): it is only
# visible at the bind path inside the container when the REAL volume is mounted. The preload
# mount-check requires it, so a failed/absent attach (Docker binding an empty host dir) can't
# masquerade as a warm cache and silently warm ephemeral disk.
touch '{mount}/{marker}' 2>/dev/null || true
else
echo "FLASH: cache mount failed; running cold"
fi
else
echo "FLASH: no matching cache block device appeared; running cold"
fi
"""
121
+
122
+
123
def _cache_nfs_mount_check(payload: dict) -> str:
    """Cloud-init preamble (NFS providers, e.g. Lambda): the platform auto-mounts the weight-cache
    filesystem on the host at ``cache_host_mount`` — but ONLY if Lambda actually attached + readied it.
    Docker's ``-v`` bind silently auto-CREATES a missing host dir, so a launch where the FS never
    mounted would otherwise have the container warm an empty EPHEMERAL bind dir and report a false warm.
    Verify the host path is a REAL mountpoint (an auto-created empty dir on the boot disk is not) and,
    if so, drop the same sentinel the block-device path uses — ``run_preload`` requires it for the
    cache, so a not-actually-mounted NFS fails the preload mount-check instead of warming throwaway
    disk. No-op for block-volume providers (handled by ``_cache_block_device_setup``) and cold runs.
    Best-effort: a training run still degrades to cold; only preload hard-requires the sentinel.

    Args:
        payload: Bootstrap payload. Reads ``cache_host_mount`` and ``cache_block_device``.

    Returns:
        ``""`` when ``cache_host_mount`` is missing/falsy or ``cache_block_device`` is truthy;
        otherwise a shell snippet that runs ``mountpoint -q`` on the host path and touches the
        ``CACHE_MOUNT_MARKER`` sentinel only when the path is a real mountpoint.
    """
    # Mirror image of the block-device guard: this preamble applies only when a cache mount is
    # configured and the provider is NOT a block-volume one.
    if not payload.get("cache_host_mount") or payload.get("cache_block_device"):
        return ""
    mount = payload["cache_host_mount"]
    marker = CACHE_MOUNT_MARKER
    # NOTE: ``mount`` is interpolated into the script text below (inside single and double
    # quotes), so it must not contain shell quote characters.
    return f"""
# --- weight-cache NFS mount: verify the platform actually mounted it, then drop the sentinel ---
if mountpoint -q '{mount}'; then
echo "FLASH: weight-cache NFS mounted at {mount}"
touch '{mount}/{marker}' 2>/dev/null || true
else
echo "FLASH: weight-cache NFS NOT mounted at {mount} (no sentinel; preload will refuse, train runs cold)"
fi
"""
146
+
147
+
148
def build_payload(
    spec,
    seed: int,
    attempt: int,
    *,
    arm: str,
    runtime_secrets: dict | None = None,
    cache_host_mount: str | None = None,
    cache_block_device: bool = False,
    mode: str | None = None,
    models: list | None = None,
) -> dict:
    """Assemble the bootstrap's input dict for one instance launch.

    The result is field-compatible with the RunPod ``_train_body`` payload and adds what the
    instance cannot infer on its own: the HF prefix for markers, the wall-clock cap, the attempt
    number, and the substrate ``arm`` the bootstrap stamps as FLASH_ARM and into the marker name.

    ``cache_host_mount`` (set by a provider that attached a per-region weight cache) points
    HF_HOME at the bind-mounted cache (``/weight-cache/hf-cache``) rather than just stripping the
    RunPod redirect. ``cache_block_device`` additionally requests the format/mount preamble used
    by block-volume providers. ``mode`` (with ``models``) selects preload/warm mode.
    """
    from flash.envs.registry import worker_pip_for_env
    from flash.providers.runpod.train import (
        build_worker_env,
        chalk_extra_pip,
        strip_runpod_volume_env,
    )

    # Begin with the shared worker env minus the RunPod-only /runpod-volume redirect. When THIS
    # provider attached a cache, HF_HOME is aimed at the instance cache mount -- unless the user
    # deliberately set a per-run [worker_env].HF_HOME. build_worker_env merges [worker_env] LAST,
    # so such an override survives the strip (only /runpod-volume-rooted vars are removed); it
    # wins on RunPod, so it is honored here as well for parity. The cache path is therefore only
    # installed when HF_HOME is absent.
    worker_env = build_worker_env(spec, seed, runtime_secrets=runtime_secrets)
    worker_env = strip_runpod_volume_env(worker_env)
    if cache_host_mount and not worker_env.get("HF_HOME"):
        worker_env["HF_HOME"] = CACHE_HF_HOME

    # Keys are inserted one by one, in the same order as the serialized payload expects.
    payload: dict = {}
    payload["hf_repo"] = spec.train.hf_repo
    payload["job_spec_json"] = spec.to_json()
    payload["phase"] = spec.phase
    payload["seed"] = int(seed)
    payload["flash_arm"] = arm
    payload["env"] = worker_env
    # The bootstrap pip-installs extra_pip for every job, so the per-run env wheel and the opt-in
    # chalk spec travel here to reach default runs (mirrors runpod/jobs.submit_run).
    env_pip = list(spec.environment.pip) or worker_pip_for_env(spec.environment.id)
    payload["extra_pip"] = env_pip + chalk_extra_pip(spec)
    payload["hf_prefix"] = f"{spec.phase}/{spec.run_id}/seed{seed}"
    payload["max_wall_s"] = max(60, int(spec.gpu.max_wall_seconds))
    payload["attempt"] = int(attempt)

    if cache_host_mount:
        payload["cache_host_mount"] = cache_host_mount
        # Ship the sentinel filename so the bootstrap's mount-check reads it from ONE source of
        # truth (this constant) rather than re-hardcoding the literal. BOTH cloud-init preambles
        # (_cache_block_device_setup for block volumes, _cache_nfs_mount_check for NFS) write the
        # same CACHE_MOUNT_MARKER onto a verified-real mount, letting the in-container preload
        # check tell a genuine mount from an empty Docker bind on any substrate.
        payload["cache_mount_marker"] = CACHE_MOUNT_MARKER
        # NOTE(review): this branch is reconstructed as nested under the cache_host_mount check
        # -- confirm against the original layout.
        if cache_block_device:
            payload["cache_block_device"] = True
            # The block-device preamble identifies the attached volume by its EXACT provisioned
            # size, so pass the runner-assigned size (defaulting to the standard cache size).
            # _volume_gb parses tolerantly, so a non-int / stale spec value ("0", "", "abc", bool)
            # falls back to the default instead of crashing the instance bootstrap over this
            # best-effort device-matching hint.
            from flash.runner import WEIGHT_CACHE_VOLUME_GB
            from flash.spec import _volume_gb

            requested_gb = getattr(spec.gpu, "network_volume_gb", None)
            payload["cache_size_gb"] = _volume_gb(requested_gb, default=WEIGHT_CACHE_VOLUME_GB)

    # Preload (warm) mode: the bootstrap downloads ``models`` into the mounted cache and exits --
    # no code fetch, no worker. Only meaningful with a cache attached (otherwise nothing to warm).
    if mode:
        payload["mode"] = mode
        payload["models"] = list(models or [])
    return payload
226
+
227
+
228
# Host helper: best-effort upload of the consolidated boot log to HF. Neither Lambda nor Hyperstack
# exposes an instance console/log API, so the box pushes its own boot log to HF — the only window
# into a failure BEFORE the worker container can write its own artifacts (docker/GPU not ready,
# image pull failure). Reads creds from the on-box payload.json. Never raises.
#
# This is Python SOURCE TEXT held in a string, not code executed at import time. The snippet
# uploads /opt/flash/host_boot.log to "<hf_prefix>/<flash_arm>_boot.log" in the run's dataset
# repo and swallows every exception (``except Exception: pass``), so a failed upload is silent.
# NOTE(review): presumably written onto the host and run by the cloud-init script — confirm in
# build_user_data.
_HOSTLOG_PY: str = """\
import json
try:
    p = json.load(open("/opt/flash/payload.json"))
    from huggingface_hub import HfApi
    HfApi(token=(p.get("env") or {}).get("HF_TOKEN")).upload_file(
        path_or_fileobj="/opt/flash/host_boot.log",
        path_in_repo=p["hf_prefix"] + "/" + p.get("flash_arm", "instance") + "_boot.log",
        repo_id=p["hf_repo"],
        repo_type="dataset",
    )
except Exception:
    pass
"""
246
+
247
# Host helper: write the attempt-failure marker (<arm>_attempt<N>.json, ok=false, RETRIABLE) to HF
# when the box can't even start the worker container (docker/GPU never ready, image pull failure).
# Without it a pre-container failure leaves NO marker, so the poller would burn the whole setup
# grace (~50 min) before reporting a generic stall; this surfaces a fast, RETRYABLE failure so the
# runner re-provisions on a fresh host immediately. Reads creds from the on-box payload.json.
#
# CRITICAL: the worker OWNS this marker path. A container that starts but fast-fails on a real,
# non-retriable user/config error can exit before the host's ~5s liveness check, having ALREADY
# uploaded its own ok=false marker (the TRUE error) here. The host must NOT overwrite it with a
# RETRIABLE host marker — that would relabel a genuine user error as job_preempted and silently
# retry / hide the root cause. So this writes the host marker ONLY when no worker attempt marker
# yet exists at the path (i.e. the container never got far enough to write one). The check is
# best-effort: on a read error it stays conservative and SKIPS the write (never clobbers).
#
# This is Python SOURCE TEXT held in a string. The snippet takes the failure reason from
# ``sys.argv[1]`` (default "host boot failure"), stages the marker JSON at /opt/flash/fm.json
# before uploading it, and swallows every exception at the outermost level.
_FAILMARK_PY: str = """\
import json, sys
try:
    p = json.load(open("/opt/flash/payload.json"))
    arm = p.get("flash_arm", "instance"); att = int(p.get("attempt") or 0)
    reason = sys.argv[1] if len(sys.argv) > 1 else "host boot failure"
    marker_path = p["hf_prefix"] + "/" + arm + "_attempt" + str(att) + ".json"
    from huggingface_hub import HfApi
    api = HfApi(token=(p.get("env") or {}).get("HF_TOKEN"))
    try:
        worker_marker_exists = api.file_exists(repo_id=p["hf_repo"], filename=marker_path, repo_type="dataset")
    except Exception:
        worker_marker_exists = True # conservative: on a read error, never risk clobbering
    if not worker_marker_exists:
        open("/opt/flash/fm.json", "w").write(json.dumps({"ok": False, "attempt": att, "retriable": True, "error": "host: " + reason}))
        api.upload_file(
            path_or_fileobj="/opt/flash/fm.json",
            path_in_repo=marker_path,
            repo_id=p["hf_repo"], repo_type="dataset",
        )
except Exception:
    pass
"""
283
+
284
+
285
def _spill_large_spec_to_hf(payload: dict) -> dict:
    """Move an oversized ``job_spec_json`` out of the inline cloud-init user_data and into HF.

    Even a tiny spec yields ~17 KB of cloud-init; a 100 KB inline param pushes user_data past
    typical provider/cloud-init user-data caps, and the launch is then rejected before any handle
    is persisted (an unrecoverable, billing-invisible failure). For a spec longer than
    ``_SPEC_SPILL_THRESHOLD`` characters, the JSON is uploaded to the run's HF dataset repo at
    ``<hf_prefix>/job_spec.json`` and the inline value is swapped for a small ``job_spec_in_hf``
    sentinel; the bootstrap fetches it from the SAME repo it already pulls code from. Small specs
    (the common case) stay inline, with no extra HF round-trip.

    Returns:
        The payload to embed: the original object when nothing was spilled, otherwise a shallow
        copy with ``job_spec_json`` emptied and ``job_spec_in_hf`` set to True.
    """
    inline_spec = payload.get("job_spec_json") or ""
    if len(inline_spec) <= _SPEC_SPILL_THRESHOLD:
        return payload

    from huggingface_hub import HfApi

    hf_token = (payload.get("env") or {}).get("HF_TOKEN")
    api = HfApi(token=hf_token)
    # Hand over a BytesIO rather than raw bytes: upload_file also accepts a path-like for
    # path_or_fileobj, and ``bytes`` is itself a valid path type, so raw bytes could be mistaken
    # for a (huge) filesystem path. A BytesIO is unambiguously a file-like upload.
    spec_stream = io.BytesIO(inline_spec.encode("utf-8"))
    destination = f"{payload['hf_prefix']}/job_spec.json"
    api.upload_file(
        path_or_fileobj=spec_stream,
        path_in_repo=destination,
        repo_id=payload["hf_repo"],
        repo_type="dataset",
    )
    # Shallow copy; the caller's payload is left untouched.
    return {**payload, "job_spec_json": "", "job_spec_in_hf": True}
315
+
316
+
317
def build_user_data(payload: dict, *, image: str) -> str:
    """Cloud-init ``user_data``: run the worker ``image`` via Docker on the host.

    cloud-init runs this once at first boot as root. The payload travels base64-encoded inside a
    quoted heredoc (never interpolated into shell syntax), so the job-spec JSON survives
    byte-exact. The only values placed directly into shell syntax — ``image`` and the cache host
    mount — are passed through ``shlex.quote``. The full training stack is baked into the image,
    so the box only needs Docker + an NVIDIA GPU — both shipped by the providers' default
    Docker-capable images — and the container does the rest (fetch code from HF, run the worker,
    stream artifacts back to HF).

    A large job spec is spilled to HF first (see ``_spill_large_spec_to_hf``) so it never inflates
    user_data past the provider's size cap.

    Secrets-wise the script carries the same content as the worker env on RunPod (HF token, env
    secrets). The operator's provider API key is NEVER shipped (teardown is control-plane-side via
    the runner ``finally`` / poll deadline / ``sweep_orphans``).

    ``payload`` is the JSON-serializable worker payload; the keys read here are ``flash_arm``
    (echoed in the script header) and ``cache_host_mount`` (optional host path for the weight
    cache). ``image`` is the worker container image reference. Returns the complete bash script
    as a string.
    """
    import shlex

    payload = _spill_large_spec_to_hf(payload)
    # encodebytes wraps at 76 columns and always ends with a newline, so the heredoc terminator
    # below lands on its own line.
    payload_b64 = base64.encodebytes(json.dumps(payload).encode()).decode()
    # Decode explicitly as UTF-8 rather than with the control plane's locale default.
    bootstrap_src = (Path(__file__).parent / "_instance_bootstrap.py").read_text(encoding="utf-8")
    if not bootstrap_src.endswith("\n"):
        # Without a trailing newline the heredoc terminator would be glued to the last source
        # line and the heredoc would swallow the rest of the script.
        bootstrap_src += "\n"
    # Weight cache: the provider mounts its region-scoped persistent storage on the HOST at
    # ``cache_host_mount`` (Lambda auto-mounts its NFS filesystem there; Hyperstack's preamble below
    # formats+mounts the attached block device there). Bind it into the worker container at the FIXED
    # ``/weight-cache`` so the worker's HF_HOME=/weight-cache/hf-cache (set in build_payload) persists
    # the model download across runs in this region. Absent -> no bind (cold run).
    cache_host_mount = payload.get("cache_host_mount")
    # Shell-quote the host path in the docker -v (defensive; the path is a controlled constant).
    cache_bind = (
        f"-v {shlex.quote(str(cache_host_mount))}:{CACHE_CONTAINER_MOUNT} \\\n  " if cache_host_mount else ""
    )
    cache_setup = _cache_block_device_setup(payload) + _cache_nfs_mount_check(payload)
    # shlex.quote, not repr(): repr switches to double quotes when the value contains a single
    # quote, and bash expands $(...) / backticks inside double quotes.
    image_sh = shlex.quote(image)
    return f"""#!/bin/bash
# Flash instance worker (generated by flash.providers._instance.build_user_data; arm={payload.get('flash_arm')})
set -x
mkdir -p /opt/flash
# Consolidate ALL boot output (this script + the container) into one host log the uploader ships
# to HF — neither substrate has a console API, so this is the only window into a pre-worker failure.
exec >>/opt/flash/host_boot.log 2>&1
cat > /opt/flash/payload.b64 <<'FLASH_PAYLOAD_EOF'
{payload_b64}FLASH_PAYLOAD_EOF
base64 -d /opt/flash/payload.b64 > /opt/flash/payload.json
cat > /opt/flash/bootstrap.py <<'FLASH_BOOTSTRAP_EOF'
{bootstrap_src}FLASH_BOOTSTRAP_EOF
cat > /opt/flash/hostlog.py <<'FLASH_HOSTLOG_EOF'
{_HOSTLOG_PY}FLASH_HOSTLOG_EOF
cat > /opt/flash/failmark.py <<'FLASH_FAILMARK_EOF'
{_FAILMARK_PY}FLASH_FAILMARK_EOF
IMAGE={image_sh}
# huggingface_hub on the host for the boot-log + failure-marker uploaders (best-effort).
pip3 install -q huggingface_hub >/dev/null 2>&1 \\
  || python3 -m pip install -q --break-system-packages huggingface_hub >/dev/null 2>&1 || true
fail() {{ echo "FLASH: $1" >&2; python3 /opt/flash/failmark.py "$1" >/dev/null 2>&1 || true; exit 1; }}
# The provider's default image ships Docker + the NVIDIA Container Toolkit, but cloud-init can run
# before they finish initializing — wait for both (up to ~10 min) before launching the worker.
for i in $(seq 1 100); do
  if docker info >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then break; fi
  echo "FLASH: waiting for docker+gpu ($i)"; sleep 6
done
docker info >/dev/null 2>&1 || fail "docker never became ready"
nvidia-smi >/dev/null 2>&1 || fail "gpu never became ready"
{cache_setup}
# Pull with retries (the image is large; a transient registry blip must not fail the run). On total
# failure, write a RETRYABLE marker and exit NOW instead of leaving a billed box idling the whole
# setup grace with no DONE/marker.
PULLED=0
for i in 1 2 3 4 5; do docker pull "$IMAGE" && {{ PULLED=1; break; }}; echo "FLASH: pull retry $i"; sleep 20; done
[ "$PULLED" -eq 1 ] || fail "worker image pull failed after retries"
# Run the worker container detached so cloud-init completes promptly; completion is signaled via the
# worker's HF artifacts (DONE/metrics.json/marker), never a return channel from the box.
docker run -d --name flashrun --gpus all --shm-size=16g --network host \\
  -v /opt/flash:/root/flash {cache_bind}-w /root/flash \\
  "$IMAGE" python /root/flash/bootstrap.py || fail "docker run failed"
sleep 5
# The container must be running OR have already exited CLEANLY. The bootstrap returns 0 ONLY on
# genuine success (it confirms metrics.json and uploads its ok-marker first) — so an exit code of 0
# is itself the success signal (e.g. an already-complete retry that finished in <5s), and the host
# must NOT write any marker for it: the worker OWNS the attempt marker, and writing to that path here
# would clobber its ok-marker (HF listing can lag the worker's just-finished upload). A NON-zero
# exit reaches fail(), but its failmark uploader is itself marker-aware: a container that started
# and then fast-failed on a real user/config error has ALREADY written its own ok=false marker here,
# and the host failmark SKIPS the write when that marker exists (so a genuine user error is never
# relabeled retriable/job_preempted). Only a never-started container — no worker marker — gets the
# retriable host failmark.
if ! docker ps --filter name=flashrun --filter status=running -q | grep -q .; then
  EXIT="$(docker inspect -f '{{{{.State.ExitCode}}}}' flashrun 2>/dev/null || echo 1)"
  docker logs flashrun >>/opt/flash/host_boot.log 2>&1 || true
  [ "$EXIT" = "0" ] || fail "worker container did not start (exit ${{EXIT}})"
fi
# Mirror the container's stdout into the host boot log (detached) so an early in-container crash is
# visible on HF even if it dies before uploading its own console artifact.
( docker logs -f flashrun >>/opt/flash/host_boot.log 2>&1 || true ) &
disown || true
# Host->HF boot-log uploader: THROTTLED to 120s and STOPPED once the container exits (bounded ~30
# min). The worker itself uploads rate-limited heartbeats/console once running, so a 30s diagnostic
# loop for the whole run would risk Hugging Face's per-repo hourly commit cap and starve the
# required metrics/DONE commits.
( for i in $(seq 1 15); do
    python3 /opt/flash/hostlog.py >/dev/null 2>&1 || true
    docker ps --filter name=flashrun --filter status=running -q | grep -q . || break
    sleep 120
  done ) &
disown || true
"""