opencode-skills-collection 3.1.2 → 3.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +4 -1
  2. package/bundled-skills/agent-creator/SKILL.md +246 -0
  3. package/bundled-skills/ax-extract-workflow/SKILL.md +156 -0
  4. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  5. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  6. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  7. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  8. package/bundled-skills/docs/sources/sources.md +1 -1
  9. package/bundled-skills/docs/users/bundles.md +1 -1
  10. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  11. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  12. package/bundled-skills/docs/users/getting-started.md +1 -1
  13. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  14. package/bundled-skills/docs/users/usage.md +4 -4
  15. package/bundled-skills/docs/users/visual-guide.md +4 -4
  16. package/bundled-skills/lovable-cleanup/SKILL.md +2 -1
  17. package/bundled-skills/remote-gpu-trainer/.gitattributes +8 -0
  18. package/bundled-skills/remote-gpu-trainer/LICENSE +21 -0
  19. package/bundled-skills/remote-gpu-trainer/README.md +267 -0
  20. package/bundled-skills/remote-gpu-trainer/SKILL.md +249 -0
  21. package/bundled-skills/remote-gpu-trainer/evals/README.md +57 -0
  22. package/bundled-skills/remote-gpu-trainer/evals/RESULTS.md +44 -0
  23. package/bundled-skills/remote-gpu-trainer/evals/cases.jsonl +14 -0
  24. package/bundled-skills/remote-gpu-trainer/evals/run_evals.py +68 -0
  25. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/README.md +72 -0
  26. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/queue_1.txt +6 -0
  27. package/bundled-skills/remote-gpu-trainer/profiles/_schema.md +100 -0
  28. package/bundled-skills/remote-gpu-trainer/profiles/autodl.md +327 -0
  29. package/bundled-skills/remote-gpu-trainer/profiles/china.md +397 -0
  30. package/bundled-skills/remote-gpu-trainer/profiles/generic-ssh.md +450 -0
  31. package/bundled-skills/remote-gpu-trainer/profiles/lambda.md +342 -0
  32. package/bundled-skills/remote-gpu-trainer/profiles/paperspace.md +365 -0
  33. package/bundled-skills/remote-gpu-trainer/profiles/runpod.md +164 -0
  34. package/bundled-skills/remote-gpu-trainer/profiles/vastai.md +355 -0
  35. package/bundled-skills/remote-gpu-trainer/references/china-network.md +206 -0
  36. package/bundled-skills/remote-gpu-trainer/references/gotchas_universal.md +704 -0
  37. package/bundled-skills/remote-gpu-trainer/references/lifecycle_checklist.md +148 -0
  38. package/bundled-skills/remote-gpu-trainer/references/monitoring_patterns.md +327 -0
  39. package/bundled-skills/remote-gpu-trainer/references/multinode.md +190 -0
  40. package/bundled-skills/remote-gpu-trainer/references/parallel_ablation.md +196 -0
  41. package/bundled-skills/remote-gpu-trainer/references/principles.md +179 -0
  42. package/bundled-skills/remote-gpu-trainer/references/self-improvement.md +74 -0
  43. package/bundled-skills/remote-gpu-trainer/references/spot-resilience.md +235 -0
  44. package/bundled-skills/remote-gpu-trainer/references/ssh_transport.md +270 -0
  45. package/bundled-skills/remote-gpu-trainer/references/training/by-domain.md +230 -0
  46. package/bundled-skills/remote-gpu-trainer/references/training/checkpoint-resume.md +368 -0
  47. package/bundled-skills/remote-gpu-trainer/references/training/convergence-debugging.md +187 -0
  48. package/bundled-skills/remote-gpu-trainer/references/training/data-pipeline.md +119 -0
  49. package/bundled-skills/remote-gpu-trainer/references/training/distributed-launch.md +422 -0
  50. package/bundled-skills/remote-gpu-trainer/references/training/oom-memory.md +338 -0
  51. package/bundled-skills/remote-gpu-trainer/references/training/precision-stability.md +401 -0
  52. package/bundled-skills/remote-gpu-trainer/references/training/throughput-profiling.md +451 -0
  53. package/bundled-skills/remote-gpu-trainer/scripts/aggregate_to_fs.sh +55 -0
  54. package/bundled-skills/remote-gpu-trainer/scripts/check_staleness.py +70 -0
  55. package/bundled-skills/remote-gpu-trainer/scripts/download_loop.sh +67 -0
  56. package/bundled-skills/remote-gpu-trainer/scripts/gpu_health.sh +169 -0
  57. package/bundled-skills/remote-gpu-trainer/scripts/health_patrol.sh.template +67 -0
  58. package/bundled-skills/remote-gpu-trainer/scripts/mem_monitor.sh +67 -0
  59. package/bundled-skills/remote-gpu-trainer/scripts/reap_vram_zombies.sh +175 -0
  60. package/bundled-skills/remote-gpu-trainer/scripts/run_one.sh.template +104 -0
  61. package/bundled-skills/remote-gpu-trainer/scripts/run_queue.sh.template +83 -0
  62. package/bundled-skills/remote-gpu-trainer/scripts/setup-china-mirrors.sh +35 -0
  63. package/bundled-skills/remote-gpu-trainer/scripts/verify_local.py +145 -0
  64. package/package.json +1 -1
  65. package/skills_index.json +66 -0
@@ -0,0 +1,342 @@
1
+ ---
2
+ platform: lambda
3
+ kind: cloud-api # REST API / web console / SSH to a normal Ubuntu VM
4
+ meter_stop_verb: terminate # the ONLY action that stops billing; sudo shutdown does NOT
5
+ meter_stop_irreversible: true # terminate wipes local NVMe — there is no stop/suspend state
6
+ detach_primitive: tmux # plain Ubuntu; tmux/screen/nohup, install if absent
7
+ spot_available: false # no spot/preemptible tier; interruption is capacity-at-launch
8
+ spot_grace: n/a # no mid-run eviction → no grace window
9
+ shared_fs: true # region-locked NFS filesystem, attach-at-launch only
10
+ inode_cap: none # no documented inode cap; GiB quota only
11
+ free_egress: true # no ingress/egress fees on instances or filesystems
12
+ china_mirror_needed: false # US/global cloud, direct egress; no platform proxy
13
+ host_driver_cuda_max: lambda-stack-dependent # Lambda Stack bundles driver+CUDA+PyTorch; version moves per release — read nvidia-smi on the box, do NOT assume a number
14
+ local_nvme: true # ephemeral root/local NVMe, gone on terminate
15
+ ---
16
+
17
+ # Lambda Cloud — Profile
18
+
19
+ Lambda Cloud is a **cattle-not-pets** GPU cloud: on-demand + reserved instances, a prebuilt **Lambda
20
+ Stack** image, and **no stop/suspend state** — an instance can only be **launched, restarted, or
21
+ terminated**, and terminate destroys the local NVMe. Nothing on the box survives a teardown except what was
22
+ pushed off or written to an attached **region-locked NFS filesystem**. This inverts the AutoDL "关机保留数据" ("shut down, data is kept")
23
+ instinct: here, durable design (checkpoint-to-NFS + idempotent resume) is **mandatory, not optional**.
24
+
25
+ > **Surface to the user up front (principle #10):** ⚠️ Danger clocks — there is **no stop/suspend**: an instance can only be launched / restarted / **terminated, and terminate wipes the local NVMe** — only the attached **NFS filesystem** survives, and **it keeps billing until you delete it manually** (LAM6). Conveniences — one-click **JupyterLab** per instance, free egress both directions. A terminate→relaunch yields a **new IP**.
26
+
27
+ > Docs/console domain moved from `lambdalabs.com` to `lambda.ai` (docs at `docs.lambda.ai`, console at
28
+ > `cloud.lambda.ai`); the **REST API base is still `cloud.lambdalabs.com/api/v1`** and `cloud.lambda.ai`
29
+ > also resolves (verified docs.lambda.ai + cloud-api 2026-06). Treat both hosts as live.
30
+
31
+ To jump: `grep -in <keyword> profiles/lambda.md`.
32
+
33
+ **Table of contents** — 1. LAUNCH · 2. STORAGE MODEL (survival matrix) · 3. NETWORK ·
34
+ 4. SPOT / INTERRUPTION + RESUME · 5. TEARDOWN / BILLING · 6. DAEMON TOOL · 7. TOP GOTCHAS (LAM1–LAM13) +
35
+ Platform-specific debugging · 8. SCRIPT OVERRIDES.
36
+
37
+ Universal gotchas (CRLF, inode/`df -i`, silent sync, cgroup OOM, spot grace) are NOT repeated here —
38
+ see `references/gotchas_universal.md`. Universal invariants → `references/principles.md`.
39
+
40
+ ---
41
+
42
+ ## 1. LAUNCH
43
+
44
+ Entry points:
45
+ - **Web console** at `cloud.lambda.ai` → Instances → Launch (pick GPU type + region, attach a filesystem
46
+ here if one is needed — see §2; attach any per-instance firewall ruleset here too — see §3/LAM4).
47
+ - **REST API** — `https://cloud.lambdalabs.com/api/v1`, auth `curl -u $LAMBDA_API_KEY:` (basic-auth,
48
+ password empty). Canonical automation surface (verified docs.lambda.ai/api/cloud 2026-06):
49
+ - `GET /instance-types` — lists every GPU type **and** `regions_with_capacity_available[]` per type.
50
+ This field IS the capacity signal — poll it to know where a type can launch right now (drives LAM5
51
+ retry-until-available).
52
+ - `POST /instance-operations/launch` · `.../terminate` · `.../restart` — create / stop-meter / reboot.
53
+ - **SSH** — standard connection to a normal Ubuntu VM; **default user is `ubuntu`** (not `root`); use
54
+ `sudo` for root. One-click **JupyterLab** is offered per instance.
55
+ - **SkyPilot** — de-facto orchestration layer: `pip install "skypilot[lambda]"`, key file at
56
+ `~/.lambda_cloud/lambda_keys` containing a line `api_key = <KEY>` (verified docs.skypilot.co 2026-06).
57
+ Use it for retry-until-capacity + autostop (§4, §6).
58
+
59
+ **Env contract — the image/base IS the env.** Instances ship **Lambda Stack** (NVIDIA driver + CUDA +
60
+ cuDNN + PyTorch/TensorFlow, all upgraded together as one apt metapackage). Run in it directly on the
61
+ throwaway box — do **not** `conda create` on a rental (`references/principles.md` §2), and do not `pip
62
+ install torch` over the top (LAM7/LAM8). Lambda Stack's exact CUDA/driver/PyTorch **moves per release**;
63
+ read it off the box (`nvidia-smi`, `python -c "import torch;print(torch.__version__,torch.version.cuda)"`)
64
+ rather than assuming a number. The **durable** form of the env is a Docker image (Lambda recommends running
65
+ Docker inside the instance) or a setup script replayed on each launch — because terminate destroys the box.
66
+ Reserved / 1-Click Clusters provide flat-rate multi-node (own billing model — LAM12).
67
+
68
+ > **verify:** `ssh ubuntu@<IP> 'python -c "import torch;print(torch.cuda.is_available())"'` → `True`.
69
+
70
+ ---
71
+
72
+ ## 2. STORAGE MODEL *(survival matrix — principle #4)*
73
+
74
+ Two tiers, and the trap is that the default working location is the **volatile** one.
75
+
76
+ - **Local / root NVMe** — fast, per-instance, **ephemeral**. Docs: *"Data not stored in the mount location
77
+ is erased once you terminate your instance and cannot be recovered"* (verified docs.lambda.ai
78
+ creating-managing-instances 2026-06). This is where work lands by default.
79
+ - **NFS filesystem** — a regional network filesystem mounted at `/lambda/nfs/<name>` (docs example mount:
80
+ `/lambda/nfs/persistent-storage`). **The only durable home.** Hard constraints and limits (verified
81
+ docs.lambda.ai/public-cloud/filesystems 2026-06):
82
+ - **Region-locked** — *"The filesystem must reside in the same region as the instance or cluster"* and
83
+ *"Filesystems cannot currently be transferred between regions."* Pick the region deliberately at create.
84
+ - **Attach-at-launch only** — *"You must attach the filesystem … at the time that the instance … is
85
+ launched"* and *"You can't attach a filesystem after you've created an instance."*
86
+ - Billed **$0.20/GiB/month in 1-hour increments**, **free ingress/egress**; **up to 24 filesystems per
87
+ account**; most regions allow up to 8 EB/filesystem but **us-south-1 (Texas) caps at 10 TB**.
88
+ - **No documented inode cap** — GiB quota only; no `df -i` ceiling surfaced (still audit `df -i` per the
89
+ universal storage gotcha).
90
+
91
+ | Tier | Path | Survives RESTART? | Survives TERMINATE? | Cap |
92
+ |---|---|---|---|---|
93
+ | Local / root NVMe | `/`, `/home/ubuntu` | yes (data persists; **but cold reboot wipes RAM** — LAM9) | **NO** (erased, unrecoverable) | instance root volume |
94
+ | NFS filesystem | `/lambda/nfs/<name>` | yes | **yes** (separate lifecycle; keeps billing — LAM6) | GiB quota; ~10 TB in us-south-1, 8 EB elsewhere |
95
+
96
+ **Checkpoints MUST go to** `/lambda/nfs/<name>` (the durable tier) so that they survive the §5 `terminate` verb. A
97
+ checkpoint left on local NVMe dies with the box. If no filesystem was attached at launch, the only durable
98
+ path is to `pull` the result off-box (free egress) before terminating.
99
+
100
+ ---
101
+
102
+ ## 3. NETWORK
103
+
104
+ - **Direct, unproxied egress.** US/global cloud — egress to HF / GitHub / PyPI is direct; **no
105
+ `network_turbo`-style accelerator exists**, and none is needed. China-mirror relevance is **N/A as a
106
+ platform feature** (relevant only when operating from inside China; then `references/china-network.md`
107
+ applies to the user's own setup, nothing platform-provided).
108
+ - **Free egress both directions** — *"Transparent pricing with no egress fees"* (verified lambda.ai
109
+ pricing 2026-06). Re-pulling a large model or pushing results off-box costs nothing, making
110
+ "pull-before-terminate" the cheap, safe default when no NFS is attached.
111
+ - **Firewall** — default allows *"only incoming ICMP traffic or TCP traffic on port 22 (SSH)"*. Open more
112
+ via **global rules** (apply workspace-wide) or **per-instance rulesets** (region-scoped). Per-instance
113
+ rulesets: *"You must attach rulesets during the instance launch process. You can't attach them after the
114
+ instance has been launched"* and *"You can't remove rulesets from an instance after the instance has been
115
+ launched"* (verified docs.lambda.ai/public-cloud/firewalls 2026-06) → plan port exposure before launch
116
+ (gotcha LAM4). Global rules can still be edited on the workspace afterward.
117
+ - **Exposing TensorBoard (TB) / Jupyter** — instances get a public IP; tunnel over SSH rather than opening ports:
118
+ `ssh -L 8888:localhost:8888 -L 6006:localhost:6006 ubuntu@<IP>`. No platform-pinned TensorBoard dir —
119
+ run TB on `:6006` against the logdir under the NFS mount.
120
+ - **SSH flavor** — direct TCP to a normal VM (`ubuntu@<IP>`); full `scp`/`rsync` work, no proxy-jump quirk.
121
+ **No static IP feature** — *"On-Demand Cloud doesn't support static IP addresses"* (verified DeepTalk
122
+ staff 2026-06). The IP is fixed for an instance's life, but **terminate→relaunch yields a NEW IP**
123
+ (LAM10) — re-read it from the console/API every launch; never hard-code it in automation.
124
+
125
+ ---
126
+
127
+ ## 4. SPOT / INTERRUPTION + RESUME *(principle #7/#8)*
128
+
129
+ **No spot / preemptible tier — and no mid-run eviction.** This is the key divergence from vast.ai/RunPod:
130
+ there is **no SIGTERM→SIGKILL grace window to survive**, because a running instance is never evicted
131
+ mid-epoch. The interruption model is different in kind:
132
+
133
+ - **Capacity-at-launch is the real failure.** The desired GPU type may be **unavailable when launch is
134
+ attempted** — Lambda has **no spot tier to fall back to**, and real-world on-demand fill rates are
135
+ spiky (one published 6-month log: ~64% same-day A100 success — i.e. ~1 in 3 attempts blocked; a 26 h
136
+ "temporarily unavailable" stall scaling 2→4 H100; verified medium.com/@velinxs 2026-06). H100/B200
137
+ capacity is the tightest. The resilience pattern is **retry-until-available**, not survive-eviction:
138
+ poll `GET /instance-types` for `regions_with_capacity_available` and `POST .../launch` the moment a
139
+ region appears (or let SkyPilot's provisioner retry across regions/types).
140
+ - **Self-inflicted termination only.** Once running, the only destructive events are an operator
141
+ `terminate`, or an **improper `sudo shutdown`** that pushes the box to **Alert** while still billing
142
+ (LAM3 / §5), or a **cold reboot** that wipes RAM (LAM9).
143
+ - **Resume hook** — checkpoint full state to the NFS filesystem on a periodic timer, load-latest
144
+ unconditionally on startup, so a fresh post-capacity launch resumes instead of restarting. Because the
145
+ box is cattle, the resume path is exercised on *every* relaunch, not just after a rare preemption.
146
+
147
+ Cadence formula (Young/Daly) + atomic-write resume → `references/spot-resilience.md`. Here the formula's
148
+ μ is effectively "time between voluntary relaunches," not a preemption rate.
149
+
150
+ ---
151
+
152
+ ## 5. TEARDOWN / BILLING *(principle #9 + the Iron Law)*
153
+
154
+ **TERMINATE is the meter-stop verb — and it is irreversible.** *"Billing begins the moment you launch an
155
+ instance and the instance passes health checks, and ends the moment you terminate the instance"*, billed
156
+ in **one-minute increments**, *"regardless if they're actively being used"* (verified
157
+ docs.lambda.ai/public-cloud/billing 2026-06).
158
+
159
+ > **The shutdown trap (most error-prone fact on this platform):** *"Do not use commands such as `sudo
160
+ > shutdown -h now` or `sudo systemctl poweroff` … These commands will not work as expected and will cause
161
+ > your instances to go into Alert status, and billing will continue"* (verified docs.lambda.ai 2026-06).
162
+ > Also `halt` / `shutdown -P 0` only stop the OS, not the meter (DeepTalk staff). Stop the meter **only**
163
+ > via `terminate` from the console or `POST /instance-operations/terminate` — which works even from inside
164
+ > the instance itself.
165
+
166
+ What each action preserves:
167
+ - **terminate** — stops the instance meter; **erases the local NVMe** (unrecoverable). The NFS filesystem
168
+ has a **separate lifecycle** and survives — but it **keeps billing $0.20/GiB/month until explicitly
169
+ deleted** (*"Billing continues as long as a filesystem exists, even if it's not mounted to an instance"*),
170
+ so a terminated-but-forgotten filesystem is a silent ongoing charge (LAM6).
171
+ - **There is no stop/suspend state** — *"It currently isn't possible to pause (suspend) your instance …
172
+ Your only options are to launch, restart, or terminate"* (verified docs.lambda.ai 2026-06). Idle-cheap
173
+ pause is impossible; the only way to stop paying for compute is to destroy the box and rebuild later.
174
+ - **restart / cold reboot** — does **not** stop the meter and does **not** wipe disk, but a **cold reboot
175
+ erases RAM and bypasses safe shutdown** — reserve it for a frozen box only (LAM9).
176
+
177
+ **Iron Law (SKILL.md Phase 5):** NO `terminate` until checkpoints are **pulled to local OR confirmed on
178
+ NFS by load-test** AND the user approves the cost-affecting action. Because terminate is destructive and
179
+ irreversible, an unverified `cp`/`rsync` to NFS means **permanent loss** — verify the sync (checksum /
180
+ `ls -l` / a load) before terminating, not after. Egress is free, so a belt-and-suspenders `pull` to local
181
+ is cheap. Cross-link: `superpowers:verification-before-completion` (REQUIRED) for the general gate.
182
+
183
+ ---
184
+
185
+ ## 6. DAEMON TOOL
186
+
187
+ - **Detach primitive: `tmux`** (or `screen` / `nohup`) on a standard Ubuntu VM — same playbook as the
188
+ AutoDL tmux pattern. Install if absent (`sudo apt install -y tmux`); fall back to
189
+ `nohup … </dev/null >log 2>&1 &`.
190
+ - **Survives an SSH drop, NOT a terminate.** tmux keeps the job alive across a dropped connection, but
191
+ with no stop state the detach primitive can't survive a teardown — only the **checkpoint-to-NFS +
192
+ idempotent resume** spine does (principle #8). tmux is the SSH-resilience layer; the checkpoint is the
193
+ instance-resilience layer. (tmux also won't survive a cold reboot — LAM9.)
194
+ - **Native orchestration: SkyPilot** (managed jobs, autostop, retry-until-capacity) + **1-Click
195
+ Clusters** for multi-node; no platform job-queue otherwise. SkyPilot moves the box on capacity loss but
196
+ **restarts the process from scratch — the checkpoint-load restores progress** (don't assume the
197
+ framework resumes training state).
198
+
199
+ ---
200
+
201
+ ## 7. TOP GOTCHAS (Lambda-pinned — universal ones live in `references/gotchas_universal.md`)
202
+
203
+ - **LAM1 — Terminate erases the local NVMe; there is no stop/suspend.**
204
+ Symptom: relaunched instance is blank, yesterday's run gone. → Root cause: local storage is ephemeral
205
+ (*"Data not stored in the mount location is erased … and cannot be recovered"*) and no stop state
206
+ preserves it; the AutoDL "关机 (shutdown) keeps my data" assumption is false. → Fix: design every workflow around
207
+ destroy/recreate — checkpoint to `/lambda/nfs/<name>` or `pull` off-box before any terminate; never keep
208
+ the only copy on local NVMe. (docs.lambda.ai 2026-06)
209
+
210
+ - **LAM2 — Filesystem is attach-at-launch only and region-locked.**
211
+ Symptom: a running instance has no durable storage and one can't be added; or a us-east filesystem won't
212
+ mount on a us-west instance. → Root cause: filesystems attach only at create time and can't move between
213
+ regions. → Fix: decide the region and attach the filesystem **at launch**; co-locate instance +
214
+ filesystem in the same region. (filesystems doc 2026-06)
215
+
216
+ - **LAM3 — `sudo shutdown` / `poweroff` keeps the meter running (Alert state).**
217
+ Symptom: instance "powered off" but the bill keeps climbing. → Root cause: an in-OS shutdown sends the
218
+ instance to **Alert** without stopping billing; `halt`/`shutdown -P 0` only stop the OS, not the meter.
219
+ → Fix: stop the meter only via **terminate** (console or `POST /instance-operations/terminate`); never
220
+ rely on an in-box poweroff. (billing doc + DeepTalk staff 2026-06)
221
+
222
+ - **LAM4 — Per-instance firewall rulesets are immutable post-launch.**
223
+ Symptom: a needed inbound port can't be opened (or a wrong one removed) on a live instance. → Root cause:
224
+ per-instance rulesets *"must [be attached] during the instance launch process"* and *"can't [be removed]
225
+ after the instance has been launched."* → Fix: plan port exposure before launch, use an editable
226
+ **global** rule, or tunnel over SSH (`-L`, §3) instead of opening a port. (firewalls doc 2026-06)
227
+
228
+ - **LAM5 — Capacity, not eviction, is the bottleneck (no spot fallback).**
229
+ Symptom: launch fails / dashboard shows the desired GPU type unavailable; long stalls scaling up. → Root
230
+ cause: on-demand supply for a specific GPU/region is exhausted (worst for H100/B200), and there is no
231
+ spot tier to fall back to. → Fix: poll `GET /instance-types` for `regions_with_capacity_available` and
232
+ launch the instant a region appears (or use SkyPilot's cross-region/type provisioner); resume from the
233
+ NFS checkpoint once granted (§4). (cloud-api doc + medium.com/@velinxs 2026-06)
234
+
235
+ - **LAM6 — The NFS filesystem keeps billing after the instance is gone.**
236
+ Symptom: all instances terminated, but storage charges continue. → Root cause: *"Billing continues as
237
+ long as a filesystem exists, even if it's not mounted to an instance"* — $0.20/GiB/month until deleted.
238
+ → Fix: after the final `pull` + verify, **delete the filesystem** (console Storage → Delete; requires
239
+ terminating attached instances first) — a distinct teardown step. (billing + filesystems docs 2026-06)
240
+
241
+ - **LAM7 — `pip install torch` over Lambda Stack silently shadows or mismatches it.**
242
+ Symptom: a `pip install` in `base` reports *"Defaulting to user installation because normal site-packages
243
+ is not writeable"* and lands in `~/.local`, or a `torch==X` pin drags in a CUDA/torchvision combo that
244
+ conflicts with the system build → import/CUDA errors. → Root cause: Lambda Stack PyTorch lives in
245
+ system `/usr/lib/python3/dist-packages` (not pip-writable as `ubuntu`); pip's user install or a hard
246
+ version pin diverges from it. → Fix: use the Stack's PyTorch as-is (don't reinstall), loosen pins
247
+ (`torch>=2.x` not `==`), or fully isolate in a fresh venv/conda env and install torch there cleanly —
248
+ don't half-mix pip-over-system. (DeepTalk threads 2026-06)
249
+
250
+ - **LAM8 — conda/venv that "borrows" Stack PyTorch via system-site-packages then breaks on pip.**
251
+ Symptom: created a conda env to use the Stack's torch, then a later `pip install` pulls a second,
252
+ conflicting torch or can't write site-packages. → Root cause: mixing `--system-site-packages` (to see
253
+ the system torch) with pip installs into the same env creates two torch copies. → Fix: pick ONE model —
254
+ either run in the bare Stack base (preferred on a rental), or build a fully self-contained env with
255
+ `conda install pytorch torchvision` (no system-site-packages borrowing). (DeepTalk
256
+ bypassing-lambda-stack thread 2026-06)
257
+
258
+ - **LAM9 — Cold reboot wipes RAM and tmux; warm restart still bills.**
259
+ Symptom: after a "reboot" the detached training job is gone and the box came back clean-ish. → Root
260
+ cause: a **cold reboot** *"erases all data currently in the instance's memory and bypasses the operating
261
+ system's safe-shutdown mechanisms"* — kills tmux sessions and any in-RAM state; neither a cold reboot nor a warm restart stops the
262
+ meter. → Fix: only cold-reboot a frozen box; rely on checkpoint-to-NFS, not on process survival across a
263
+ reboot; expect to re-`ssh` and re-`tmux attach` (session may be gone). (console doc 2026-06)
264
+
265
+ - **LAM10 — No static IP; the public IP changes on terminate→relaunch.**
266
+ Symptom: automation/SSH config hard-coded to yesterday's IP fails after a relaunch. → Root cause:
267
+ *"On-Demand Cloud doesn't support static IP addresses"* — a fresh launch gets a fresh IP. → Fix: read
268
+ the IP from the console / `GET /instances` on every launch; template SSH config dynamically; never
269
+ hard-code it. (DeepTalk staff 2026-06)
270
+
271
+ - **LAM11 — `apt full-upgrade` on Lambda Stack images can break cuDNN/DOCA.**
272
+ Symptom: after a recommended `apt-get update && apt-get upgrade` (or `full-upgrade` on 24.04 images), PyTorch/TF
273
+ fails to find cuDNN, or full-upgrade itself fails on a DOCA package. → Root cause: a system cuDNN bump
274
+ or DOCA repo state diverges from the Stack-bundled libs. → Fix: avoid blanket `full-upgrade` on a
275
+ rental; if cuDNN is missing, symlink the Stack copies —
276
+ `for so in /usr/lib/python3/dist-packages/tensorflow/libcudnn*; do sudo ln -s "$so" /usr/lib/x86_64-linux-gnu/; done`
277
+ (note: Stack cuDNN is usable *only* by the Stack-installed PyTorch/TF). (troubleshooting doc 2026-06)
278
+
279
+ - **LAM12 — 1-Click Clusters / reserved bill differently than on-demand (commitment traps).**
280
+ Symptom: expected per-minute pricing, got a 2-week minimum / weekly invoice / a reservation that expired.
281
+ → Root cause: **1-Click Clusters** carry a **minimum 2-week commitment with weekly billing** (not
282
+ per-minute); **reserved** capacity requires Lambda approval and the **invoice must be paid within ~10
283
+ days or the reservation is forfeited**, on non-cancelable terms. → Fix: use plain on-demand single
284
+ instances for per-minute experiments; only enter a cluster/reservation with confirmed sustained need and
285
+ budget approval. (1-click-clusters docs + nOps/CheckThat 2026-06)
286
+
287
+ - **LAM13 — GH200 (ARM/aarch64) breaks `pip install torch` — needs the ARM build.**
288
+ Symptom: on a 1× GH200 box, `pip install torch` installs a **CPU-only** wheel (no CUDA), or a pinned
289
+ `torch==2.2.0` fails to resolve. → Root cause: GH200 is aarch64; the default PyPI torch wheel for
290
+ aarch64 is CPU-only. → Fix: use Lambda Stack's pre-compiled ARM PyTorch (e.g. 2.4.1) as-is, or install
291
+ from the CUDA index `pip install torch --index-url https://download.pytorch.org/whl/cu128` (aarch64 GPU
292
+ wheels live there), or compile from source for newer versions; relax exact pins. (DeepTalk GH200 thread
293
+ + pytorch.org 2026-06)
294
+
295
+ ### Platform-specific debugging
296
+ - **Confirm billing actually stopped:** after a teardown, check the instance is **gone** (not in *Alert*)
297
+ via the console or `curl -u $LAMBDA_API_KEY: https://cloud.lambdalabs.com/api/v1/instances` — an Alert-
298
+ state box (from an in-OS shutdown) is still charging (LAM3).
299
+ - **Capacity probe before launch:** `curl -u $LAMBDA_API_KEY: .../instance-types | jq '.data | to_entries[]
300
+ | {type:.key, regions:.value.regions_with_capacity_available}'` — empty `regions` ⇒ that GPU type can't
301
+ launch anywhere right now (LAM5); this is the loop condition for retry-until-available.
302
+ - **GPU sanity on the box:** `nvidia-smi` (driver/CUDA + util) and `python -c "import torch;
303
+ print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"` — mismatch between
304
+ `torch.version.cuda` and `nvidia-smi` CUDA usually means a pip-shadowed torch (LAM7/8/13), not a Stack
305
+ problem.
306
+ - **Read the real Stack version, never assume:** `apt list --installed 2>/dev/null | grep -i lambda-stack`
307
+ and `dpkg -l | grep -i cudnn` — confirm before debugging a "version mismatch."
308
+ - **Disk pressure on the ephemeral root:** `df -h /` and `df -h /lambda/nfs/<name>`; remember `/home/ubuntu`
309
+ is volatile — large datasets/checkpoints filling the root volume are also *lost* on terminate, so move
310
+ them to NFS for durability, not merely to free up space.
311
+ - **Detect a stalled download:** background the pull (`nohup … &`) and watch growth —
312
+ `watch -n5 'du -sh <target>; ls -l <target>'` (flat size for minutes ⇒ stalled; re-pull — ingress and egress are both free).
313
+ - **Stuck/unreachable after reboot:** if SSH dies post-reboot, the box may be in *Alert* or networking
314
+ failed to come up — check the console state and prefer a fresh **terminate→relaunch** (resume from NFS)
315
+ over fighting a cold-reboot that already wiped RAM (LAM9).
316
+
317
+ ---
318
+
319
+ ## 8. SCRIPT OVERRIDES
320
+
321
+ Values to parameterize the `scripts/` templates for Lambda:
322
+
323
+ ```
324
+ DATA_DIR= /home/ubuntu (ephemeral NVMe — lost on terminate)
325
+ DURABLE_DIR= /lambda/nfs/<name>
326
+ PROXY_HOOK= (none — direct egress; no network_turbo)
327
+ CRED_FILE= "" (Lambda key is the $LAMBDA_API_KEY env var, not a file on disk — run_one's [ -n "$CRED_FILE" ] guard skips the file read and the env var passes through; SkyPilot key file at ~/.lambda_cloud/lambda_keys, format `api_key = <KEY>`)
328
+ SCRATCH= prune periodic ckpts on local NVMe; keep only `best` on /lambda/nfs/<name>
329
+ HF_HOME= /lambda/nfs/<name>/.cache/huggingface (durable; survives terminate, free egress on re-pull)
330
+ DETACH= tmux (apt install if absent; nohup fallback)
331
+ SSH_USER= ubuntu (NOT root)
332
+ ```
333
+
334
+ Notes for the wrapper:
335
+ - Default checkpoint dir → the NFS mount, not `/home/ubuntu` — the latter is erased on terminate.
336
+ - If no NFS filesystem is attached, set the wrapper to `pull` checkpoints to local on the periodic timer
337
+ (free egress) instead of relying on durable on-box storage.
338
+ - Re-read the instance IP from the console/API on every launch (LAM10) — never hard-code it; regenerate any SSH config entry per launch.
339
+ - Do not `pip install torch` / blanket `apt full-upgrade` on the rental — use the Stack as-is (LAM7/8/11);
340
+ on GH200 use the ARM build (LAM13).
341
+ - The teardown step is **terminate via API**, gated by the Iron Law; verify billing stopped (no *Alert*
342
+ state) and add an explicit reminder to **delete the NFS filesystem** (LAM6) when the project is done.