opencode-skills-collection 3.1.2 → 3.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +4 -1
  2. package/bundled-skills/agent-creator/SKILL.md +246 -0
  3. package/bundled-skills/ax-extract-workflow/SKILL.md +156 -0
  4. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  5. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  6. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  7. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  8. package/bundled-skills/docs/sources/sources.md +1 -1
  9. package/bundled-skills/docs/users/bundles.md +1 -1
  10. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  11. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  12. package/bundled-skills/docs/users/getting-started.md +1 -1
  13. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  14. package/bundled-skills/docs/users/usage.md +4 -4
  15. package/bundled-skills/docs/users/visual-guide.md +4 -4
  16. package/bundled-skills/lovable-cleanup/SKILL.md +2 -1
  17. package/bundled-skills/remote-gpu-trainer/.gitattributes +8 -0
  18. package/bundled-skills/remote-gpu-trainer/LICENSE +21 -0
  19. package/bundled-skills/remote-gpu-trainer/README.md +267 -0
  20. package/bundled-skills/remote-gpu-trainer/SKILL.md +249 -0
  21. package/bundled-skills/remote-gpu-trainer/evals/README.md +57 -0
  22. package/bundled-skills/remote-gpu-trainer/evals/RESULTS.md +44 -0
  23. package/bundled-skills/remote-gpu-trainer/evals/cases.jsonl +14 -0
  24. package/bundled-skills/remote-gpu-trainer/evals/run_evals.py +68 -0
  25. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/README.md +72 -0
  26. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/queue_1.txt +6 -0
  27. package/bundled-skills/remote-gpu-trainer/profiles/_schema.md +100 -0
  28. package/bundled-skills/remote-gpu-trainer/profiles/autodl.md +327 -0
  29. package/bundled-skills/remote-gpu-trainer/profiles/china.md +397 -0
  30. package/bundled-skills/remote-gpu-trainer/profiles/generic-ssh.md +450 -0
  31. package/bundled-skills/remote-gpu-trainer/profiles/lambda.md +342 -0
  32. package/bundled-skills/remote-gpu-trainer/profiles/paperspace.md +365 -0
  33. package/bundled-skills/remote-gpu-trainer/profiles/runpod.md +164 -0
  34. package/bundled-skills/remote-gpu-trainer/profiles/vastai.md +355 -0
  35. package/bundled-skills/remote-gpu-trainer/references/china-network.md +206 -0
  36. package/bundled-skills/remote-gpu-trainer/references/gotchas_universal.md +704 -0
  37. package/bundled-skills/remote-gpu-trainer/references/lifecycle_checklist.md +148 -0
  38. package/bundled-skills/remote-gpu-trainer/references/monitoring_patterns.md +327 -0
  39. package/bundled-skills/remote-gpu-trainer/references/multinode.md +190 -0
  40. package/bundled-skills/remote-gpu-trainer/references/parallel_ablation.md +196 -0
  41. package/bundled-skills/remote-gpu-trainer/references/principles.md +179 -0
  42. package/bundled-skills/remote-gpu-trainer/references/self-improvement.md +74 -0
  43. package/bundled-skills/remote-gpu-trainer/references/spot-resilience.md +235 -0
  44. package/bundled-skills/remote-gpu-trainer/references/ssh_transport.md +270 -0
  45. package/bundled-skills/remote-gpu-trainer/references/training/by-domain.md +230 -0
  46. package/bundled-skills/remote-gpu-trainer/references/training/checkpoint-resume.md +368 -0
  47. package/bundled-skills/remote-gpu-trainer/references/training/convergence-debugging.md +187 -0
  48. package/bundled-skills/remote-gpu-trainer/references/training/data-pipeline.md +119 -0
  49. package/bundled-skills/remote-gpu-trainer/references/training/distributed-launch.md +422 -0
  50. package/bundled-skills/remote-gpu-trainer/references/training/oom-memory.md +338 -0
  51. package/bundled-skills/remote-gpu-trainer/references/training/precision-stability.md +401 -0
  52. package/bundled-skills/remote-gpu-trainer/references/training/throughput-profiling.md +451 -0
  53. package/bundled-skills/remote-gpu-trainer/scripts/aggregate_to_fs.sh +55 -0
  54. package/bundled-skills/remote-gpu-trainer/scripts/check_staleness.py +70 -0
  55. package/bundled-skills/remote-gpu-trainer/scripts/download_loop.sh +67 -0
  56. package/bundled-skills/remote-gpu-trainer/scripts/gpu_health.sh +169 -0
  57. package/bundled-skills/remote-gpu-trainer/scripts/health_patrol.sh.template +67 -0
  58. package/bundled-skills/remote-gpu-trainer/scripts/mem_monitor.sh +67 -0
  59. package/bundled-skills/remote-gpu-trainer/scripts/reap_vram_zombies.sh +175 -0
  60. package/bundled-skills/remote-gpu-trainer/scripts/run_one.sh.template +104 -0
  61. package/bundled-skills/remote-gpu-trainer/scripts/run_queue.sh.template +83 -0
  62. package/bundled-skills/remote-gpu-trainer/scripts/setup-china-mirrors.sh +35 -0
  63. package/bundled-skills/remote-gpu-trainer/scripts/verify_local.py +145 -0
  64. package/package.json +1 -1
  65. package/skills_index.json +66 -0
@@ -0,0 +1,450 @@
1
+ ---
2
+ platform: generic-ssh # the DEFAULT profile; Slurm / K8s / Colab-Kaggle are thin diffs below
3
+ kind: ssh # ssh | slurm | kubernetes | notebook (per sub-section)
4
+ meter_stop_verb: manual # nothing reclaims the box — a forgotten instance bills 24/7
5
+ meter_stop_irreversible: true # destroying the box deletes its disk; no platform undo
6
+ detach_primitive: tmux # tmux/nohup (bare) | sbatch (Slurm) | k8s-job (K8s) | kaggle-commit
7
+ spot_available: false # bare box: none by default; Slurm scavenger + spot rentals override
8
+ spot_grace: n/a # bare: n/a · Slurm: SIGTERM→KillWait(default 30s)→SIGKILL · K8s: terminationGracePeriodSeconds(default 30s)
9
+ shared_fs: host-dependent # bare: one disk you own · Slurm: parallel /scratch · K8s: a PVC
10
+ inode_cap: host-dependent # measure with df -i; do NOT assume an AutoDL ~200K constant
11
+ free_egress: host-dependent
12
+ china_mirror_needed: host-dependent # only if the box sits behind the GFW
13
+ host_driver_cuda_max: host-dependent
14
+ local_nvme: host-dependent
15
+ ---
16
+
17
+ # Profile: generic-SSH — the DEFAULT (bare box) + Slurm / Kubernetes / Colab-Kaggle diffs
18
+
19
+ One-line purpose: the lowest-common-denominator profile for a box where **SSH is the only control
20
+ channel and teardown is manual** — every other platform profile is a *diff* against this baseline.
21
+
22
+ > **Surface to the user up front (principle #10):** ⚠️ Danger clock — there is usually **no auto-release / idle timer to save you**: a forgotten box **bills 24/7** until you tear it down, and teardown is entirely manual (no platform safety net). Reality — you **expose ports yourself** (an `ssh -L` tunnel for TB/Jupyter); on Slurm a job dies at **walltime** — design the requeue.
23
+
24
+ Read this whole file before Phase 0 on any unbranded rental, then jump to the matching sub-section
25
+ (Slurm / Kubernetes / Colab-Kaggle) if the backend is a scheduler, a cluster, or a notebook.
26
+ **Universal gotchas are NOT restated here** — see `references/gotchas_universal.md`.
27
+
28
+ **Table of contents** (`grep -in '<keyword>' profiles/generic-ssh.md` to jump):
29
+ - BASELINE: 8-field schema for the bare-SSH box (sections 1–8)
30
+ - THIN DIFF — SLURM (sbatch replaces tmux)
31
+ - THIN DIFF — KUBERNETES (a Job manifest replaces the shell)
32
+ - THIN DIFF — COLAB / KAGGLE (not SSH-orchestratable)
33
+
34
+ The one load-bearing abstraction every backend below solves differently: **detach the job from the
35
+ connection, and make the result survive the session ending.** Checkpoint-to-durable + idempotent
36
+ resume (principle #8) is the invariant; the detach primitive (tmux / sbatch / Job / commit) is the
37
+ swappable plug.
38
+
39
+ ---
40
+
41
+ ## 1. LAUNCH
42
+
43
+ - **Entry point:** `ssh user@host` — key-based, fronted by an `~/.ssh/config` alias so the rest of
44
+ the workflow says `ssh gpu-box`. There is **no platform API, console, or CLI** — SSH is the *only*
45
+ control channel (this is what makes the box "generic"). Set the alias per `references/ssh_transport.md`.
46
+ - **Push code:** `rsync -avz --partial ./proj/ gpu-box:~/proj/` — resumable, delta-only on re-syncs;
47
+ prefer over `scp` (a reset `scp` restarts from zero). Pull results the same way, reversed.
48
+ - **Download weights/datasets ON the box**, not over the local uplink: `ssh gpu-box 'cd ~/proj &&
49
+ hf download <repo> --local-dir data'` (or `aws s3 cp`, `wget`). The box almost always has a fatter,
50
+ cheaper pipe to HF/S3 than a home connection — pushing a 50 GB checkpoint over a residential uplink
51
+ is the classic self-inflicted stall. Transport verbs → **REQUIRED:** `huggingface-skills:hf-cli`.
52
+ - **Env contract:** whatever the host ships. There is no prebuilt "base" guarantee — inspect
53
+ `which python && python -V && nvidia-smi` first. If the image has a usable env, treat it as AutoDL's
54
+ base (do not `conda create` on a throwaway box); if it is bare, `conda create` / `venv` once and
55
+ pin it. State the seed/determinism in the run itself — no platform does it here (**REQUIRED:**
56
+ `verifying-dl-experiments`).
57
+
58
+ → **verify:** `ssh gpu-box 'python -c "import torch;print(torch.cuda.is_available())"'` prints `True`.
59
+
60
+ ## 2. STORAGE MODEL *(the survival matrix — principle #4)*
61
+
62
+ The box gives **one persistent disk that is yours to manage** — no shared FS, no platform quota
63
+ service, no automatic reclamation. *Measure, never assume:* run `df -h && df -i <mount>` live on the
64
+ box. Caps are host-dependent — do **not** carry over an AutoDL ~200K-inode or ~200 GB constant.
65
+
66
+ | Tier | Path | Survives STOP? | Survives DESTROY? | Cap |
67
+ |---|---|---|---|---|
68
+ | Root / home disk | `/` , `~` | yes (box keeps running) | **no** (destroy deletes the box) | host-dependent — `df -h`/`df -i` |
69
+ | Attached block volume (if any) | `/path/to/mount` | yes | depends on provider — verify before destroy | host-dependent |
70
+
71
+ The only "survival matrix" subtlety on a bare box: there is **no stop/destroy distinction the
72
+ platform enforces** — the box runs until *manually* stopped, and a destroy wipes the disk with no
73
+ undo. So checkpoints must land on a mount that gets `rsync`-pulled to local **before** teardown
74
+ (§5). Disk fails on inodes before bytes and the real hog hides in a symlinked cache — audit the
75
+ actual mount with `du`, clean by value (keep tiny eval JSONs, prune large periodic checkpoints).
76
+
77
+ ## 3. NETWORK
78
+
79
+ - **Egress/proxy:** host-dependent; there is no platform proxy hook. If the box sits behind the GFW,
80
+ set the mirror manually — `export HF_ENDPOINT=https://hf-mirror.com` (or `HF_HUB_ENABLE_HF_TRANSFER=1`
81
+ off-GFW) — and validate the speed test on the **same route** the real transfer uses (principle #7).
82
+ - **Port exposure:** expose services yourself. TensorBoard / Jupyter ride an SSH tunnel from the
83
+ local machine: `ssh -L 6006:localhost:6006 gpu-box` then open `http://localhost:6006`. There is
84
+ no console port-forward button.
85
+ - **SSH flavor:** direct-TCP key-based SSH — `scp`/`rsync` work normally (unlike the proxied SSH on
86
+ some rental platforms). If the provider hands out a non-standard port, pin it in the alias.
87
+
88
+ ## 4. SPOT / INTERRUPTION + RESUME *(principle #7/#8)*
89
+
90
+ A bare on-demand box has **no spot/preemption model by default** — it runs until manually stopped, so
91
+ the interruption to design against is an **SSH drop**, not an eviction. Without a detach primitive an
92
+ SSH drop sends SIGHUP and kills the job; `tmux` (§6) is what severs the job from the connection.
93
+
94
+ Resume is **self-built**: checkpoint full state (model + optimizer + scheduler + epoch/step + RNG +
95
+ dataloader position) atomically (`tmp`→`fsync`→`os.rename`) on a periodic timer, and load-latest
96
+ unconditionally on startup so the *identical launch command* resumes. Cadence formula + atomic-write
97
+ pattern → `references/spot-resilience.md`. (Spot-rented bare boxes exist — if the provider can evict,
98
+ treat it like the vast.ai profile: tiny/zero grace, checkpoint continuously.)
99
+
100
+ ## 5. TEARDOWN / BILLING *(principle #9 + the Iron Law)*
101
+
102
+ **Teardown is MANUAL and is the number-one cost failure on this profile.** Nothing reclaims the box:
103
+ no idle timer, no auto-release, no scheduler that ends the job. **A forgotten box bills 24/7** — an
104
+ overnight idle instance is the most expensive single mistake on metered hardware.
105
+
106
+ - The meter-stopping action is **provider-manual** (a console "stop"/"destroy", a `terminate` API, or
107
+ a phone call) — and on most bare rentals it is **irreversible** (deletes the disk).
108
+ - "Stop after pulling results" is a **mandatory final phase**, not an afterthought. Honor the
109
+ **teardown Iron Law**: no stop/destroy until checkpoints are **pulled to local AND verified by
110
+ load** (`scripts/verify_local.py`) **AND** the user has approved the cost-affecting action.
111
+ "It looked done in the log" is not evidence (principle #3). **REQUIRED:**
112
+ `superpowers:verification-before-completion`.
113
+
114
+ ## 6. DAEMON TOOL
115
+
116
+ - **`tmux`** is the detach primitive: `tmux new -s train` → run inside → `Ctrl-b d` to detach;
117
+ `tmux attach -t train` to reattach, `tmux ls` to reconcile a watcher against the real session
118
+ (principle #3). It survives an SSH drop; it does **not** survive a box reboot — relaunch after one.
119
+ - **Fallback** when tmux is absent and cannot be installed: `nohup <cmd> </dev/null >log 2>&1 &` then
120
+ `disown`. Always redirect stdin from `/dev/null` so the job never blocks reading the terminal.
121
+ - **No native queue** — the operator IS the scheduler, monitor, and janitor. Use the parameterized
122
+ `scripts/run_queue.sh.template` for a resumable serial queue; never edit a queue script while it is
123
+ being read (principle #6 — version the filename).
124
+
125
+ ## 7. TOP GOTCHAS (platform-pinned; universal ones → `references/gotchas_universal.md`)
126
+
127
+ - **GEN1 — Forgotten box bills 24/7.** Symptom: a week-old invoice for an instance that finished
128
+ training on day one. → Root cause: nothing on a bare box reclaims it; the human is the only janitor.
129
+ → Fix: make teardown a tracked Phase-5 step; after the verified pull, prompt the user to stop/destroy
130
+ (never auto-act — principle #9); for cross-session safety set a `/schedule` reminder to re-check.
131
+ - **GEN2 — SSH drop kills the run (no tmux).** Symptom: training dies the moment the laptop sleeps or
132
+ the network blips. → Root cause: the job is a child of the SSH shell; the drop sends SIGHUP.
133
+ → Fix: launch inside `tmux` (or `nohup … & disown`) **before** the long run starts — not after it is
134
+ already orphaned.
135
+ - **GEN3 — `scp` restarts from zero on a reset; `rsync` does not.** Symptom: a 40 GB re-sync that
136
+ never finishes over a flaky link. → Root cause: `scp` has no resume. → Fix: `rsync -avz --partial`
137
+ for every code/data/result transfer; wrap bulk pulls in a `timeout`+resume loop (principle #7).
138
+ - **GEN4 — CRLF breaks `.sh` on the Linux box.** Symptom: `bash: $'\r': command not found`, or a
139
+ shebang that "isn't found." → Root cause: a script authored on Windows carries CRLF line endings.
140
+ → Fix: `.gitattributes` with `*.sh text eol=lf`; on-box unblock `sed -i 's/\r$//' run.sh`.
141
+ - **GEN5 — Heavy DL static-checked on the wrong machine.** Symptom: an OOM or a CUDA mismatch only
142
+ reproduces on the box. → Root cause: static/import checks ran locally, the real compute is remote.
143
+ → Fix: run the cheap CPU smoke locally (Phase 2), run the heavy DL **on the box**; for the
144
+ bug-vs-effect call once it runs, defer to **REQUIRED:** `verifying-dl-experiments`.
145
+ - **GEN6 — A box reboot silently orphans the run (`tmux` does not survive it).** Symptom: a detached
146
+ job vanishes with a clean `dmesg`, idle GPU, and low `uptime`; `tmux ls` shows no sessions.
147
+ → Root cause: `tmux`/`nohup` survive an SSH drop but **not** a host reboot — the rental rebooted (host
148
+ maintenance, kernel update, or an OOM that took the box) and every session died. → Fix: treat reboot
149
+ as one of the four "vanished process" causes (cross-link `references/gotchas_universal.md` U3); make
150
+ resume idempotent (§4) so the *same* launch command continues from the last checkpoint; for a box that
151
+ reboots often, add an `@reboot` cron or a systemd unit that re-launches the detached queue.
152
+ - **GEN7 — A second concurrent run silently halves throughput by oversubscribing the GPU.** Symptom: two
153
+ training runs on the "same idle GPU" both crawl, or the second OOMs on a card that looked free.
154
+ → Root cause: a bare box has **no scheduler** — nothing prevents two processes sharing one GPU, so they
155
+ contend for VRAM and SM time. → Fix: the operator *is* the scheduler — serialize with the
156
+ `scripts/run_queue.sh.template` queue, or pin each run to a distinct card with `CUDA_VISIBLE_DEVICES=<n>`; check
157
+ `nvidia-smi` for an existing holder before every launch (zombie holders → U11).
158
+ - **GEN8 — Watching a poll connection, not the run, declares a false death.** Symptom: the ssh-poll
159
+ drops and the run is pronounced dead, but the job finished fine and wrote `best.pth`. → Root cause: a
160
+ dropped *poll* connection ≠ the training dying; the two failure modes are conflated. → Fix: on any poll
161
+ drop, re-ssh and check ground truth directly (`pgrep -af train`, log tail, `best.pth` mtime) before
162
+ concluding anything (principle #3); robust short-connection poll template → U17.
163
+
164
+ ### Platform-specific debugging (bare SSH)
165
+
166
+ The box has no console — every diagnostic is an ssh one-liner. Run these *separately* (a kill drops the
167
+ SSH, U1/U4), and bound each with `ssh -o ConnectTimeout=15 -o ServerAliveInterval=10` so a blip
168
+ self-kills instead of half-open hanging:
169
+
170
+ - **Is the run alive or orphaned?** `ssh gpu-box 'tmux ls; pgrep -af <train-script> | head'` — empty
171
+ `tmux ls` after a vanished log ⇒ reboot/HUP (GEN6); reconcile the watcher against the real session.
172
+ - **Why did it die (the 4-cause ladder)?** `ssh gpu-box 'dmesg 2>/dev/null | grep -iE "killed process|out of memory|Xid" | tail; uptime'` — OOM line ⇒ U9/U10; clean dmesg + low uptime ⇒ reboot (GEN6); `Xid 48/79` ⇒ dead GPU, re-rent (U22).
173
+ - **GPU health, not just util%:** `ssh gpu-box 'nvidia-smi dmon -s pucvmet -d 1 -c 5'` — read SM clock + power, not `GPU-Util` (a liar, U21); a holder `nvidia-smi` cannot see ⇒ `fuser -v /dev/nvidia*` (U11).
174
+ - **Disk before it bites:** `ssh gpu-box 'df -h <mount>; df -i <mount>'` — inodes hit 100% before bytes (U7); the byte-hog often hides in `~/.cache/huggingface` (`du -sh ~/.cache/huggingface/hub/models--* | sort -rh`).
175
+ - **Stuck download?** A transfer with a live process but a flat `df` is stalled, not progressing —
176
+ `ssh gpu-box 'ls -la --time-style=+%H:%M data/*.tmp; df -h <mount>'`; if the size has not moved, kill and
177
+ resume the per-dir loop (`scripts/download_loop.sh`, U12), never restart from zero.
178
+
179
+ ## 8. SCRIPT OVERRIDES
180
+
181
+ Values to parameterize the `scripts/` templates for a bare-SSH box:
182
+
183
+ ```
184
+ DATA_DIR=$HOME/proj (working dir / data disk on the box)
185
+ DURABLE_DIR=$HOME/proj (durable mount = the measured persistent disk; pull to local before teardown)
186
+ PROXY_HOOK= (none by default; set HF_ENDPOINT=https://hf-mirror.com only if behind the GFW)
187
+ CRED_FILE=~/.netrc (on the box's local disk, streamed in via stdin — never onto a shared/durable FS)
188
+ SCRATCH=*.latest.pth and periodic checkpoints (prune on success; keep best + tiny eval JSONs)
189
+ HF_HOME=$HOME/proj/.hf (redirect off the default ~/.cache so it lands on the data disk)
190
+ DETACH=tmux (the swappable plug — replaced by sbatch / Job / commit in the diffs below)
191
+ ```
192
+
193
+ ---
194
+
195
+ # THIN DIFF — SLURM *(sbatch replaces tmux)*
196
+
197
+ `kind: slurm` · meter = walltime/fairshare **quota, not dollars** · detach = `sbatch` · no teardown.
198
+
199
+ The scheduler owns the job's lifecycle: the operator **submits**, Slurm runs and detaches it.
200
+ `tmux+nohup` is **replaced** (not supplemented) by `sbatch` — a submitted batch job survives logout
201
+ with no tmux. A bare `srun` still **blocks and dies on terminal close** like a foreground process, so
202
+ wrap `srun` *inside* an `sbatch` script for long runs.
203
+
204
+ - **Submit / monitor / kill:** `sbatch job.sh` (returns a jobid immediately) · `squeue -u $USER`
205
+ (status — replaces "reattach tmux") · `sacct -j <jobid>` (post-mortem: exit code, maxRSS, elapsed)
206
+ · `scancel <jobid>` (kill). Logs go to `slurm-%j.out` (arrays: `slurm-%A_%a.out`) — file-based, same
207
+ logs-to-file contract as the baseline.
208
+ - **GPUs are declarative:** `#SBATCH --gres=gpu:a100:2` (or `--gpus=volta:3`); request, do not place.
209
+ Slurm's GRES plugin sets `CUDA_VISIBLE_DEVICES` per step (verified slurm.schedmd.com/gres.html 2026-06).
210
+ - **Walltime ceiling — the hard new constraint:** `#SBATCH --time=HH:MM:SS` and at the limit each task
211
+ is sent **SIGTERM, then SIGKILL after `KillWait` (default 30 s)** (verified slurm.schedmd.com/sbatch.html
212
+ + slurm.conf 2026-06). Long training MUST checkpoint and requeue, not "run until done."
213
+ - **Preemption + checkpoint-on-signal:** on time-limit or scavenger-partition eviction the same
214
+ SIGTERM→KillWait→SIGKILL sequence applies. Arm `#SBATCH --signal=B:SIGTERM@360` for a ~6-minute warning
215
+ (the `B:` prefix signals the **batch shell**, not the steps; **Slurm may fire it up to 60 s EARLY** —
216
+ size the warning with that slack, verified slurm.schedmd.com/sbatch.html 2026-06), trap it to set a flag,
217
+ and `#SBATCH --requeue` to auto-return to the queue (the script restarts **from its beginning with the
218
+ same job ID**) and resume from the last checkpoint. Cadence formula → `references/spot-resilience.md`.
219
+ - **Native orchestration replaces hand-rolled fan-out:** `--array=0-15` (rate-limit with `%4`) fans out
220
+ ablation cells, `--dependency=afterok:<jobid>` chains stages (runs only on exit-code-0).
221
+ - **No per-hour teardown — watch fairshare.** You never `shutdown` a node; the job just ends. The
222
+ baseline's #1 risk (forgotten box) **disappears**, replaced by "don't blow the walltime/fairshare
223
+ allocation." There is nothing to stop.
224
+ - **No root, shared multi-tenant node:** cannot `apt install`. Use `module load cuda` or a container
225
+ (**Apptainer/Singularity** — Docker is usually banned).
226
+ - **Filesystem split:** the shared parallel FS (`$HOME`, `/scratch`) persists and is where checkpoints
227
+ go; node-local **`$TMPDIR` is wiped when the job ends** — stage scratch to `$TMPDIR`, checkpoint to
228
+ `/scratch`. Multi-node NCCL/fabric specifics → `references/multinode.md`.
229
+
230
+ ### Slurm gotchas (platform-pinned; universal → `references/gotchas_universal.md`)
231
+
232
+ - **SLURM1 — Checkpoint *inside* the signal handler corrupts the checkpoint.** Symptom: `--requeue`
233
+ works most of the time, then intermittently writes a corrupt `hpc_ckpt` and the requeued job won't
234
+ load. → Root cause: a Python signal handler can fire **after any bytecode instruction** — including
235
+ mid-backward-pass — so checkpointing directly in the handler races with training (verified
236
+ github.com/Lightning-AI/pytorch-lightning#21406 2026-06). → Fix: the handler does the **minimum** —
237
+ set a flag; poll the flag in the training loop and checkpoint at a **safe point** (end of step), then
238
+ `scontrol requeue $SLURM_JOB_ID` or exit so `--requeue` returns it.
239
+ - **SLURM2 — Warning signal arrives too late; the SIGKILL lands mid-write.** Symptom: the
240
+ `--signal=B:SIGTERM@360` trap fires but the checkpoint is half-written when SIGKILL hits. → Root cause: two
241
+ slacks compound — Slurm's signal timing is coarse (the warning may fire **up to 60 s early**, so never count on exact timing), and at the actual wall the
242
+ `KillWait` grace is only ~30 s (verified slurm.schedmd.com 2026-06). → Fix: budget the warning so a
243
+ full checkpoint fits *before* the wall even with the 60 s jitter; checkpoint *periodically* too (never
244
+ rely on the one signal); make the write atomic (`tmp`→`fsync`→`rename`, U6) so a truncated file is
245
+ never loaded.
246
+ - **SLURM3 — `srun` inside `sbatch` no longer inherits `--cpus-per-task` (Slurm ≥ 22.05).** Symptom: a
247
+ nested `srun` hangs, sees one CPU, or under-threads the dataloader. → Root cause: since 22.05 `srun`
248
+ stopped reading `SLURM_CPUS_PER_TASK` and must be told explicitly (verified docs.icer.msu.edu 2026-06).
249
+ → Fix: `srun -c $SLURM_CPUS_PER_TASK …`, or set `export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK`; pass
250
+ `--gpus-per-task`/`--gres` on the `srun` too — a step does not inherit the allocation's GRES by default.
251
+ - **SLURM4 — OOM is a job STATE, not a Python traceback.** Symptom: the job dies with no error in the
252
+ log; `sacct` shows `State=OUT_OF_MEMORY` (or `slurmstepd: Detected 1 oom-kill event(s)`). → Root cause:
253
+ Slurm cgroup sets a hard memory limit at (a fraction of) the requested `--mem`; exceeding it is an
254
+ OOM-kill the kernel performs (verified osc.edu / icer.msu.edu 2026-06). → Fix: read `sacct -o
255
+ MaxRSS,ReqMem` and raise `--mem`/`--mem-per-cpu` to MaxRSS×1.2; this is the cgroup-RAM OOM of U9
256
+ (dataloader workers × a big tensor), distinct from VRAM OOM (U10) — **do not** shrink batch for a
257
+ host-RAM OOM.
258
+ - **SLURM5 — `$TMPDIR` checkpoints evaporate when the job ends.** Symptom: a requeued/array job finds an
259
+ empty checkpoint dir. → Root cause: node-local `$TMPDIR` is wiped at job end; only the shared parallel
260
+ FS persists across a requeue or a different node. → Fix: stage *scratch* to `$TMPDIR` for speed, but
261
+ write **checkpoints to `/scratch/$USER`**; never point `DURABLE_DIR` at node-local storage.
262
+
263
+ ### Slurm debugging (squeue / sacct / cgroup triage)
264
+
265
+ - **Still queued or running?** `squeue -u $USER -o '%i %T %r %M %l %R'` — the `%r` Reason column explains
266
+ a `PENDING` (e.g. `Resources`, `Priority`, `QOSMaxGPUPerUserLimit`); `%R` on a running job is the nodelist.
267
+ - **Post-mortem (why it ended):** `sacct -j <jobid> --format=JobID,State,ExitCode,DerivedExitCode,Elapsed,MaxRSS,ReqMem,Timelimit,NodeList`
268
+ — `State=TIMEOUT` ⇒ walltime kill (raise `--time` or requeue); `OUT_OF_MEMORY` ⇒ SLURM4; `PREEMPTED`/`NODE_FAIL`
269
+ ⇒ requeue territory; `ExitCode` like `0:9` means killed by **signal 9** (SIGKILL — the KillWait expired).
270
+ - **Live resource use:** `sstat -j <jobid>.batch --format=JobID,MaxRSS,MaxVMSize` on a *running* step
271
+ (sacct only finalizes at exit); cross-check against `ReqMem` to catch a creeping leak before the cgroup kills it.
272
+ - **GPU actually allocated to the step?** inside the job: `echo $CUDA_VISIBLE_DEVICES && nvidia-smi -L`
273
+ — a mismatch ⇒ SLURM3 (`--gres`/`--gpus-per-task` not on the `srun`).
274
+ - **Multi-node hang** (job RUNNING, no progress) ⇒ NCCL/fabric, not Slurm → `references/multinode.md`.
275
+
276
+ **Slurm OVERRIDES:** `DETACH=sbatch` · `DURABLE_DIR=/scratch/$USER/proj` (durable) + `DATA_DIR=$TMPDIR`
277
+ (node-local, wiped) · `PROXY_HOOK=module load cuda` · teardown=`n/a (watch sacct + fairshare)`.
278
+
279
+ ---
280
+
281
+ # THIN DIFF — KUBERNETES *(a Job manifest replaces the shell)*
282
+
283
+ `kind: kubernetes` · detach = a `Job` manifest (no shell) · persistence = a **PVC, non-optional**.
284
+
285
+ The unit of work is a **manifest**, not a session: `kubectl apply -f job.yaml`; the control plane
286
+ schedules a pod and a `Job` controller **replaces it on failure** up to `backoffLimit` (**default 6** —
287
+ with `restartPolicy: Never` each failure creates a *new* pod rather than restarting the old one; verified kubernetes.io Jobs doc
288
+ 2026-06). The "detach from my connection" problem vanishes — the pod never had a connection to the shell.
289
+
290
+ - **GPUs:** `resources.limits: nvidia.com/gpu: 1`. Quirk (verified kubernetes.io scheduling-gpus 2026-06):
291
+ GPUs go in **`limits` only**; if `requests` is set it must **equal** `limits`, and you cannot set
292
+ `requests` without `limits`; GPUs are **integer, not shared or overcommitted** — one whole GPU per
293
+ container (absent MIG/time-slicing, which K8s does not provide out of the box). Provided by the NVIDIA
294
+ device-plugin DaemonSet.
295
+ - **Code delivery is different — no `rsync` into a pod.** Code is **baked into a container image**
296
+ (build → push to a registry) or pulled at pod start. This is the biggest workflow shift from the
297
+ baseline; pin the base image by `@sha256:` digest, not `:latest` (U30).
298
+ - **Persistence is the headline risk:** the **pod filesystem is EPHEMERAL by design.** On
299
+ death/restart/reschedule, anything written outside a mounted volume is **gone**. Checkpoints **must**
300
+ mount a **PersistentVolumeClaim** (or object storage) at `/checkpoints` — this is non-optional and is
301
+ the single most common way ML-on-K8s loses work.
302
+ - **Monitor:** `kubectl get pods` · `kubectl logs -f <pod>` (replaces `tail -f`). `kubectl exec -it …
303
+ -- bash` is a debugging tool, not the run mechanism — an exec session is not durable.
304
+ - **Declarative parallelism:** `Job` `parallelism`/`completions` (both default 1) for fan-out (the K8s
305
+ analog of Slurm arrays).
306
+ - **Lifecycle knobs:** `activeDeadlineSeconds` is the walltime analog (terminates the Job past the
307
+ deadline); `ttlSecondsAfterFinished` auto-GCs a finished Job; `terminationGracePeriodSeconds` (**default
308
+ 30 s**, verified kubernetes.io 2026-06) is the SIGTERM→SIGKILL window — the K8s analog of Slurm
309
+ `KillWait`, so the same checkpoint-on-SIGTERM discipline applies.
310
+ - **Teardown is two-layered:** `kubectl delete job <name>` frees the *pod* (cheap), but the underlying
311
+ **node/cluster keeps costing** unless an autoscaler scales it down. **delete ≠ scale-down** — the
312
+ node release is the real cost lever, distinct from the baseline's single "destroy the box."
313
+
314
+ ### Kubernetes gotchas (platform-pinned; universal → `references/gotchas_universal.md`)
315
+
316
+ - **K8S1 — Pod stuck `Pending`: `Insufficient nvidia.com/gpu`.** Symptom: `kubectl get pods` shows
317
+ `Pending`; the events read `0/N nodes are available: N Insufficient nvidia.com/gpu`. → Root cause:
318
+ *usually not* missing hardware — the **device-plugin DaemonSet** isn't running, so no node advertises
319
+ allocatable GPUs; or a taint blocks scheduling (verified kubenatives.com + GKE troubleshooting 2026-06).
320
+ → Fix: `kubectl describe node <n> | grep -A4 -E 'Capacity|Allocatable'` — if `nvidia.com/gpu` is `0`,
321
+ the plugin is down: `kubectl get ds -n kube-system | grep nvidia` and `kubectl logs -n kube-system -l
322
+ k8s-app=nvidia-device-plugin`; add the matching toleration if the GPU nodes are tainted.
323
+ - **K8S2 — `restartPolicy: Always` is rejected on a Job.** Symptom: `kubectl apply` errors that a Job's
324
+ pod template may only use `Never` or `OnFailure`. → Root cause: a Job is not a Deployment; only those
325
+ two restart policies are legal (verified kubernetes.io Jobs doc 2026-06). → Fix: use `OnFailure`
326
+ (restart the *container* in place — keeps `/checkpoints` warm) or `Never` (a fresh pod per attempt,
327
+ cleaner logs); never copy a Deployment's `Always`.
328
+ - **K8S3 — `ImagePullBackOff` / `ErrImagePull` after a registry push.** Symptom: the pod never starts;
329
+ events show `Back-off pulling image`. → Root cause: a private registry without an `imagePullSecrets`,
330
+ a wrong tag/digest, or a too-big layer timing out the pull. → Fix: `kubectl describe pod <p>` reads the
331
+ exact pull error; attach `imagePullSecrets`, pin a real `@sha256:` digest (U30), and pre-warm large
332
+ images onto the node pool.
333
+ - **K8S4 — `Multi-Attach error` on a rescheduled pod (RWO PVC).** Symptom: a pod stuck
334
+ `ContainerCreating` after a node failure: `Volume is already exclusively attached to one node`. → Root
335
+ cause: a **ReadWriteOnce** PVC can attach to **one node at a time**; on failover the old attachment
336
+ hasn't released, and two distributed-training pods on different nodes can never share an RWO volume
337
+ (verified discuss.kubernetes.io / bobcares.com 2026-06). → Fix: for multi-node training use
338
+ **ReadWriteMany** (NFS/EFS/CephFS) for the shared checkpoint dir, or pin co-dependent pods to one node
339
+ with affinity; on a stuck failover, force-detach via the cloud console or delete the old `VolumeAttachment`.
340
+ - **K8S5 — Pod `Evicted` mid-training under node disk pressure.** Symptom: a long run dies with
341
+ `status: Evicted, reason: The node was low on resource: ephemeral-storage`. → Root cause: container
342
+ logs, the writable layer, and `emptyDir` count as **ephemeral storage**; checkpoints/caches written
343
+ outside the PVC fill the node and the kubelet evicts the pod (verified jorijn.com / oneuptime.com
344
+ 2026-06). → Fix: write **everything large to the PVC**, set `resources.limits.ephemeral-storage`,
345
+ rotate logs, and back `emptyDir` scratch with `sizeLimit`; this is the K8s face of the disk-full crash
346
+ (U6/U7).
347
+ - **K8S6 — Container runs but trains on CPU (GPU never attached).** Symptom: a pod runs to completion,
348
+ loss curves normal, ~100× too slow. → Root cause: the GPU limit was omitted, or `nvidia-smi` works on
349
+ the *node* but the container lacks the runtime/library path. → Fix: **validate `kubectl exec <p> --
350
+ nvidia-smi` before trusting a run**; ensure `resources.limits.nvidia.com/gpu` is set and the NVIDIA
351
+ container runtime is the default (this is U31 surfaced through K8s).
352
+
353
+ ### Kubernetes debugging (kubectl triage)
354
+
355
+ - **Why is it Pending / not starting?** `kubectl describe pod <p>` — the **Events** section names it
356
+ directly (Insufficient GPU ⇒ K8S1; FailedScheduling taint; ImagePullBackOff ⇒ K8S3; FailedMount ⇒ K8S4).
357
+ - **Why did it die?** `kubectl get pod <p> -o jsonpath='{.status.containerStatuses[0].lastState.terminated}'`
358
+ — `reason: OOMKilled` ⇒ raise `resources.limits.memory` (cgroup-RAM, U9); `Error` + exit code ⇒ read logs.
359
+ - **Logs of a crashed/previous attempt:** `kubectl logs <p> --previous` (the current pod may be a fresh
360
+ retry with an empty log); `kubectl get events --sort-by=.lastTimestamp` for the cluster-wide timeline.
361
+ - **Did the node even offer GPUs?** `kubectl describe node <n> | grep -A8 Allocatable` — `nvidia.com/gpu: 0`
362
+ ⇒ device plugin down (K8S1).
363
+ - **Is the PVC bound and mounted?** `kubectl get pvc` (`Bound`?) and `kubectl describe pod <p>` Volumes
364
+ section — an unbound PVC stalls the pod in `Pending`.
365
+
366
+ **K8s OVERRIDES:** `DETACH=k8s-job` · `DURABLE_DIR=/checkpoints` (PVC mount — required; RWX for multi-node)
367
+ · `CRED_FILE=""` — credentials arrive as a K8s Secret injected as an env var (WANDB_API_KEY / HF_TOKEN),
368
+ never a file on disk and never baked into the image layer, so run_one's `[ -n "$CRED_FILE" ]` guard skips
369
+ the file read and the env var passes through · teardown=`kubectl delete` **+** scale the node pool down.
370
+
371
+ ---
372
+
373
+ # THIN DIFF — COLAB / KAGGLE *(not SSH-orchestratable)*
374
+
375
+ `kind: notebook` · **no SSH, no tmux, no persistent disk, no real job abstraction.** The generic
376
+ core's central primitive ("detach + survive the session") cannot be satisfied directly — degrade to
377
+ **checkpoint-to-cloud + idempotent resume**. Teardown is automatic and free; the *opposite* problem to
378
+ the baseline — the work cannot be kept alive long enough.
379
+
380
+ **Colab (free tier):**
381
+ - **Idle timeout ~90 min** (no cell activity) and a hard **~12 h max VM lifetime**; on disconnect all
382
+ RAM, variables, models, and the local `/content` filesystem are **lost**. Limits are **dynamic and
383
+ unpublished** — GPU type/availability and the exact ceilings "vary over time" and GPU is best-effort,
384
+ can be denied or downgraded (verified research.google.com/colaboratory/faq.html 2026-06).
385
+ - **Free tier requires the browser tab to STAY OPEN** — *(verified — corrects the draft's "anti-idle
386
+ tricks are unreliable" framing)*: **background execution is a Pro+ paid feature**; on free tier closing
387
+ the tab stops the runtime shortly after (verified github.com/googlecolab/colabtools#4151 + community
388
+ reports 2026-06). So keep-alive hacks aren't merely *unreliable* — there is **no supported headless
389
+ background run at all** on free Colab. Design for the disconnect, do not fight it.
390
+ - **Only survival mechanism:** mount Google Drive and **checkpoint every epoch to Drive**; make the
391
+ entrypoint **resume-from-Drive idempotent** so the inevitable reconnect continues, not restarts.
392
+
393
+ **Kaggle (free tier) — slightly better, because of one real primitive:**
394
+ - **30 GPU-hours/week** floating quota (T4×2 or P100; resets weekly); **interactive idle timeout ~60 min**
395
+ and a **~9 h** session cap (verified kaggle.com/docs/efficient-gpu-usage + product-feedback 2026-06).
396
+ - **The one genuine headless-background primitive: "Save Version → Save & Run All (commit)."** It
397
+ snapshots the notebook and runs it **on a separate machine with no idle timer, surviving browser
398
+ close**, and **persists `/kaggle/working` (20 GB) as the committed version's output** (commit times out
399
+ at ~9 h GPU / ~12 h CPU). This is the closest thing to `sbatch` in the free-tier world — single it out
400
+ as Kaggle's detach primitive. Live monitoring is weak (Colab: watch the cell; Kaggle commit: inspect
401
+ only the finished version's logs).
402
+ - **Code delivery:** clone from GitHub or pull the platform's dataset mounts — no scp.
403
+
404
+ ### Colab / Kaggle gotchas (platform-pinned; universal → `references/gotchas_universal.md`)
405
+
406
+ - **NB1 — Drive sync lag silently loses the "saved" checkpoint.** Symptom: training logs
407
+ `saved best.pth to /content/drive/...`, the runtime disconnects an hour later, and the file is **0 bytes
408
+ or absent** in Drive. → Root cause: writes to mounted Drive are **buffered and sync asynchronously** —
409
+ large files can take up to ~30 min to actually land, and an unmount/disconnect before the flush loses
410
+ them (verified github.com/googlecolab/colabtools#2607 + #4426 2026-06). → Fix: call
411
+ `drive.flush_and_unmount()` (then re-mount before the next write) or `os.fsync` right after each checkpoint, keep checkpoints small, and
412
+ treat a checkpoint as durable **only after** it is visible in Drive — re-list it before trusting resume.
413
+ - **NB2 — Kaggle commit fails if any cell errors → the whole output is lost.** Symptom: "Save & Run All"
414
+ shows `committing…` forever or fails with a non-zero/`Code 0` error, and **nothing** in `/kaggle/working`
415
+ is saved. → Root cause: a commit re-runs the notebook **top-to-bottom on a fresh machine**; one failing
416
+ cell (or an interactive-only state, or a flaky cell) aborts the commit and discards its output (verified
417
+ kaggle.com/product-feedback/334753 + 59557 2026-06). → Fix: before committing, **Run All interactively**
418
+ end-to-end on a clean kernel (catch order/state bugs); guard long sections so a late failure still writes
419
+ partial results to `/kaggle/working`; rely on `/kaggle/working` (persisted), not in-memory variables.
420
+ - **NB3 — Kaggle batch (commit) run picks the WRONG accelerator / has no internet.** Symptom: a committed
421
+ run is glacial (ran on CPU) or fails to `pip install`/download. → Root cause: the **accelerator and
422
+ internet toggle are notebook settings the commit inherits** — a notebook left on "None"/internet-off
423
+ commits that way; internet also requires phone verification on the account. → Fix: set Accelerator =
424
+ GPU and Internet = On in the notebook *before* committing; verify with `torch.cuda.is_available()` in an
425
+ early cell so a CPU commit fails fast instead of wasting the 9 h.
426
+ - **NB4 — `/content` (Colab) and `/kaggle/temp` are scratch, not durable.** Symptom: results written to
427
+ `/content/...` or `/kaggle/temp` vanish on disconnect. → Root cause: only Drive (Colab) and
428
+ `/kaggle/working` (Kaggle committed output) survive the session; everything else is ephemeral. → Fix:
429
+ point `DURABLE_DIR` at the surviving path; never let the final artifact land only on scratch.
430
+ - **NB5 — Free Colab disconnect mid-epoch with no warning.** Symptom: the session simply dies; there is
431
+ **no SIGTERM, no grace window** to catch. → Root cause: unlike Slurm/K8s, a notebook eviction gives no
432
+ signal — the resume contract is the *only* defense. → Fix: checkpoint every N steps to Drive
433
+ (NB1-safe), make cell-1 resume-from-latest idempotent, and chain runs across sessions under the
434
+ per-session ceiling. There is no checkpoint-on-signal here (contrast Slurm `--signal` / K8s SIGTERM).
435
+
436
+ ### Colab / Kaggle debugging (session-death triage)
437
+
438
+ - **What am I actually on?** First cell: `import torch; print(torch.cuda.is_available(), torch.cuda.get_device_name(0))`
439
+ and `!nvidia-smi` — catches a CPU-only Colab assignment or a CPU Kaggle commit (NB3) before wasting the session.
440
+ - **Is the checkpoint really in Drive?** `!ls -la /content/drive/MyDrive/proj/*.pth` *after* a
441
+ `drive.flush_and_unmount()` and a fresh `drive.mount('/content/drive')` — a 0-byte or missing file ⇒ sync lag (NB1); do not tear down trusting it.
442
+ - **Did the Kaggle commit succeed?** Open the Version's **Logs** tab (the only post-mortem for a committed
443
+ run) — a failed cell shows there; the committed `/kaggle/working` is the artifact, not the editor state.
444
+ - **Disk full inside the notebook?** `!df -h` — `/kaggle/working` caps at 20 GB; HF cache and intermediate
445
+ files exhaust it fast (U6/U7), prune before the commit's final write.
446
+
447
+ **Colab/Kaggle OVERRIDES:** `DETACH=`Drive-checkpoint loop (Colab) / Save&Run-All commit (Kaggle) ·
448
+ `DURABLE_DIR=`Drive `/content/drive/MyDrive/proj` (Colab) / `/kaggle/working` (Kaggle) · teardown=`automatic`
449
+ · the pattern, every run: checkpoint every N steps → idempotent resume from cell 1 → keep each run
450
+ under the per-session ceiling → chain runs across sessions.