opencode-skills-collection 3.1.2 → 3.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +4 -1
  2. package/bundled-skills/agent-creator/SKILL.md +246 -0
  3. package/bundled-skills/ax-extract-workflow/SKILL.md +156 -0
  4. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  5. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  6. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  7. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  8. package/bundled-skills/docs/sources/sources.md +1 -1
  9. package/bundled-skills/docs/users/bundles.md +1 -1
  10. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  11. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  12. package/bundled-skills/docs/users/getting-started.md +1 -1
  13. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  14. package/bundled-skills/docs/users/usage.md +4 -4
  15. package/bundled-skills/docs/users/visual-guide.md +4 -4
  16. package/bundled-skills/lovable-cleanup/SKILL.md +2 -1
  17. package/bundled-skills/remote-gpu-trainer/.gitattributes +8 -0
  18. package/bundled-skills/remote-gpu-trainer/LICENSE +21 -0
  19. package/bundled-skills/remote-gpu-trainer/README.md +267 -0
  20. package/bundled-skills/remote-gpu-trainer/SKILL.md +249 -0
  21. package/bundled-skills/remote-gpu-trainer/evals/README.md +57 -0
  22. package/bundled-skills/remote-gpu-trainer/evals/RESULTS.md +44 -0
  23. package/bundled-skills/remote-gpu-trainer/evals/cases.jsonl +14 -0
  24. package/bundled-skills/remote-gpu-trainer/evals/run_evals.py +68 -0
  25. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/README.md +72 -0
  26. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/queue_1.txt +6 -0
  27. package/bundled-skills/remote-gpu-trainer/profiles/_schema.md +100 -0
  28. package/bundled-skills/remote-gpu-trainer/profiles/autodl.md +327 -0
  29. package/bundled-skills/remote-gpu-trainer/profiles/china.md +397 -0
  30. package/bundled-skills/remote-gpu-trainer/profiles/generic-ssh.md +450 -0
  31. package/bundled-skills/remote-gpu-trainer/profiles/lambda.md +342 -0
  32. package/bundled-skills/remote-gpu-trainer/profiles/paperspace.md +365 -0
  33. package/bundled-skills/remote-gpu-trainer/profiles/runpod.md +164 -0
  34. package/bundled-skills/remote-gpu-trainer/profiles/vastai.md +355 -0
  35. package/bundled-skills/remote-gpu-trainer/references/china-network.md +206 -0
  36. package/bundled-skills/remote-gpu-trainer/references/gotchas_universal.md +704 -0
  37. package/bundled-skills/remote-gpu-trainer/references/lifecycle_checklist.md +148 -0
  38. package/bundled-skills/remote-gpu-trainer/references/monitoring_patterns.md +327 -0
  39. package/bundled-skills/remote-gpu-trainer/references/multinode.md +190 -0
  40. package/bundled-skills/remote-gpu-trainer/references/parallel_ablation.md +196 -0
  41. package/bundled-skills/remote-gpu-trainer/references/principles.md +179 -0
  42. package/bundled-skills/remote-gpu-trainer/references/self-improvement.md +74 -0
  43. package/bundled-skills/remote-gpu-trainer/references/spot-resilience.md +235 -0
  44. package/bundled-skills/remote-gpu-trainer/references/ssh_transport.md +270 -0
  45. package/bundled-skills/remote-gpu-trainer/references/training/by-domain.md +230 -0
  46. package/bundled-skills/remote-gpu-trainer/references/training/checkpoint-resume.md +368 -0
  47. package/bundled-skills/remote-gpu-trainer/references/training/convergence-debugging.md +187 -0
  48. package/bundled-skills/remote-gpu-trainer/references/training/data-pipeline.md +119 -0
  49. package/bundled-skills/remote-gpu-trainer/references/training/distributed-launch.md +422 -0
  50. package/bundled-skills/remote-gpu-trainer/references/training/oom-memory.md +338 -0
  51. package/bundled-skills/remote-gpu-trainer/references/training/precision-stability.md +401 -0
  52. package/bundled-skills/remote-gpu-trainer/references/training/throughput-profiling.md +451 -0
  53. package/bundled-skills/remote-gpu-trainer/scripts/aggregate_to_fs.sh +55 -0
  54. package/bundled-skills/remote-gpu-trainer/scripts/check_staleness.py +70 -0
  55. package/bundled-skills/remote-gpu-trainer/scripts/download_loop.sh +67 -0
  56. package/bundled-skills/remote-gpu-trainer/scripts/gpu_health.sh +169 -0
  57. package/bundled-skills/remote-gpu-trainer/scripts/health_patrol.sh.template +67 -0
  58. package/bundled-skills/remote-gpu-trainer/scripts/mem_monitor.sh +67 -0
  59. package/bundled-skills/remote-gpu-trainer/scripts/reap_vram_zombies.sh +175 -0
  60. package/bundled-skills/remote-gpu-trainer/scripts/run_one.sh.template +104 -0
  61. package/bundled-skills/remote-gpu-trainer/scripts/run_queue.sh.template +83 -0
  62. package/bundled-skills/remote-gpu-trainer/scripts/setup-china-mirrors.sh +35 -0
  63. package/bundled-skills/remote-gpu-trainer/scripts/verify_local.py +145 -0
  64. package/package.json +1 -1
  65. package/skills_index.json +66 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Yuyuan Han
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,267 @@
1
+ # remote-gpu-trainer
2
+
3
+ **An Agent Skill for running long GPU jobs on machines you rent but don't own.** Deploy, train,
4
+ monitor, and tear down safely across [AutoDL](https://www.autodl.com), RunPod, vast.ai, Lambda,
5
+ Paperspace, the Chinese platforms (恒源云 / 矩池云 / Featurize / 揽睿星舟), bare SSH boxes, Slurm, and
6
+ Kubernetes. One instance, or a fan-out of many.
7
+
8
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
9
+ [![Agent Skills standard](https://img.shields.io/badge/Agent%20Skills-SKILL.md-blue)](https://agentskills.io)
10
+ [![agentskills validate](https://img.shields.io/badge/agentskills%20validate-passing-brightgreen)](https://agentskills.io/specification)
11
+ [![Platforms](https://img.shields.io/badge/platform%20profiles-7-orange)](#whats-inside)
12
+ [![Status](https://img.shields.io/badge/status-pre--release-yellow)](#verification-status)
13
+
14
+ > **Disambiguation:** "AutoDL" here is the **autodl.com** GPU-rental platform, not AutoML or NAS. And
15
+ > this is an **Agent Skill** — a `SKILL.md` with reference docs and script templates — not a CLI or an
16
+ > SDK. It rides *above* each platform's API and encodes the operational survival knowledge those APIs
17
+ > leave out.
18
+
19
+ The whole skill is built on one mental model: **you are a short-term tenant on someone else's machine.**
20
+ So it teaches tenant survival — detach the job, make the result outlive the box, stop the meter without
21
+ losing data — and treats that as a single model across every backend. Only the per-platform specifics
22
+ (stop-vs-destroy billing, machine-locked volumes, `/root` ephemerality, acceleration proxy vs HF mirror,
23
+ spot grace) get pushed down into one profile per platform.
24
+
25
+ ```mermaid
26
+ flowchart TD
27
+ TASK(["Your task: deploy / train / monitor / tear down<br/>a job on a GPU box you rent, not own"])
28
+ TASK --> MATCH{"description keywords<br/>match the task?"}
29
+ MATCH -->|skill activates| HUB
30
+ HUB["<b>SKILL.md</b> — the always-loaded hub<br/>10 operating principles · 6-phase lifecycle · platform selector"]
31
+ HUB --> CORE["<b>references/</b><br/>platform-agnostic core"]
32
+ HUB --> PROF["<b>profiles/</b><br/>per-platform specifics"]
33
+ HUB --> EXEC["<b>scripts · examples · evals</b>"]
34
+ CORE --> CORE1["principles · gotchas U1–U39 · monitoring<br/>spot-resilience · ssh · china-network"]
35
+ CORE --> CORE2["training/ ×8 — the DL-debug layer<br/>OOM · NCCL-hang · NaN · throughput · ckpt · by-domain · convergence · data"]
36
+ PROF --> PROF1["autodl (deepest) · runpod · vastai · lambda<br/>paperspace · china · generic-ssh"]
37
+ EXEC --> EXEC1["runnable wrappers + monitors · one worked example<br/>no-API-key retrieval drift-guard"]
38
+ ```
39
+
40
+ ## Contents
41
+
42
+ [Why this exists](#why-this-exists) · [How it differs](#how-it-differs) ·
43
+ [Architecture and layout](#architecture-and-layout) · [Install and deploy](#install-and-deploy) ·
44
+ [What's inside](#whats-inside) · [Scope](#scope) · [Verification status](#verification-status) ·
45
+ [Disclaimer](#disclaimer) · [中文简介](#中文简介) · [Contributing](#contributing) ·
46
+ [License](#license) · [Citing](#citing)
47
+
48
+ ## Why this exists
49
+
50
+ Renting a GPU is the easy part. The expensive surprises come from everything around the job: a stopped
51
+ box that quietly keeps billing, a "synced" checkpoint that never actually wrote because the disk ran out
52
+ of inodes, a download that stalls behind the wrong mirror, a `terminate` that deletes the only copy of a
53
+ week's training. None of that is in a platform's API docs, and most of it only bites once you've already
54
+ paid for it.
55
+
56
+ This skill collects that knowledge into a form an agent can act on: ten operating principles for *why*
57
+ each step matters, a six-phase lifecycle that ends every phase in a runnable check, and one profile per
58
+ platform that pins the concrete commands. It is opinionated about the things that cost money or data, and
59
+ quiet about the rest.
60
+
61
+ ## How it differs
62
+
63
+ General orchestrators — **SkyPilot**, **dstack**, **Modal** — own or abstract the infrastructure and
64
+ price-shop across Western clouds. They are excellent at that, and this skill does not compete with them.
65
+ But none of them supports AutoDL or the Chinese platforms, and each assumes its own daemon or cluster
66
+ model.
67
+
68
+ `remote-gpu-trainer` meets you on the **raw rented instance you already control**, and concentrates on a
69
+ blind spot those tools leave open: the Chinese platforms and bare-SSH cheap rentals, where disk-budget
70
+ design, inode caps, mirror stalls, cgroup OOM, spot-grace windows, and *irreversible* teardown are the
71
+ actual job. The two approaches compose well: let SkyPilot or dstack move the box for you, then let this
72
+ skill make your *code* resume-correct so their recovery actually restores progress.
73
+
74
+ ## Architecture and layout
75
+
76
+ The design follows the Agent Skills idea of **progressive disclosure**: a small always-loaded hub, and
77
+ deeper material loaded only when a phase needs it. The split that makes it portable is
78
+ **platform-agnostic core, platform-specific edges** — the principles and lifecycle hold everywhere, and
79
+ every concrete path, proxy, billing verb, and spot rule lives in exactly one place, the profile.
80
+
81
+ The six-phase lifecycle is the operational spine. Each phase delegates its substrate to the active
82
+ profile and ends in a check you can run:
83
+
84
+ ```mermaid
85
+ flowchart LR
86
+ P0["0 · audit<br/>df -i · cgroup · GPU"] --> P1["1 · ssh + creds"]
87
+ P1 --> P2["2 · CPU smoke<br/><i>before you rent</i>"]
88
+ P2 --> P3["3 · detached launch"]
89
+ P3 --> P4["4 · durable monitor<br/>(four-layer)"]
90
+ P4 --> P5["5 · verify + teardown<br/><b>Iron Law</b>"]
91
+ ```
92
+
93
+ The folders map onto that architecture directly:
94
+
95
+ ```text
96
+ remote-gpu-trainer/
97
+ ├── SKILL.md # the hub: 10 principles + 6-phase lifecycle + platform selector
98
+ ├── references/ # platform-agnostic knowledge, loaded on demand
99
+ │ ├── principles.md # the 10 invariants, expanded with cross-platform nuance
100
+ │ ├── lifecycle_checklist.md # the 6 phases as a per-platform checklist
101
+ │ ├── gotchas_universal.md # U1–U39, symptom → root cause → fix (U36–U38 are cross-links)
102
+ │ ├── monitoring_patterns.md # four-layer durable monitoring + cross-host portability map
103
+ │ ├── spot-resilience.md # preemption signals, Young/Daly cadence, atomic-write resume
104
+ │ ├── ssh_transport.md # ssh config, resumable rsync/scp, secrets via stdin, CRLF
105
+ │ ├── china-network.md # mirrors, HF_ENDPOINT, the no_proxy trap
106
+ │ ├── parallel_ablation.md # FS-shared fan-out + the reconciliation step
107
+ │ ├── multinode.md # NCCL / fabric-manager / elastic training (advanced)
108
+ │ ├── self-improvement.md # how the skill captures new gotchas without corrupting itself
109
+ │ └── training/ # the DL-training debug layer — when the run breaks, not the box
110
+ │ ├── oom-memory.md # CUDA/host OOM + the fit-it ladder
111
+ │ ├── distributed-launch.md # torchrun/accelerate/deepspeed + the multi-GPU HANGS toolkit
112
+ │ ├── precision-stability.md # fp16/bf16/tf32, NaN/Inf hunting, LLM loss spikes
113
+ │ ├── throughput-profiling.md # GPU-bound vs data-bound vs comms-bound
114
+ │ ├── checkpoint-resume.md # full-state + sharded save/resume, the resume bugs
115
+ │ ├── by-domain.md # LLM / vision / diffusion / RL / multimodal gotchas
116
+ │ ├── convergence-debugging.md # runs but won't learn: optimizer/LR/loss-fn/freezing
117
+ │ └── data-pipeline.md # dataloader & dataset correctness (not speed)
118
+ ├── profiles/ # one file per platform — the only place concrete specifics live
119
+ │ ├── _schema.md # the shared 8-field contract every profile fills
120
+ │ ├── autodl.md # deepest, battle-tested
121
+ │ ├── runpod.md vastai.md lambda.md paperspace.md
122
+ │ ├── china.md # 恒源云 / 矩池云 / Featurize / 揽睿星舟
123
+ │ └── generic-ssh.md # bare SSH / Slurm / K8s / Colab-Kaggle
124
+ ├── scripts/ # parameterized, runnable templates
125
+ │ ├── run_one.sh.template run_queue.sh.template health_patrol.sh.template
126
+ │ ├── mem_monitor.sh gpu_health.sh reap_vram_zombies.sh
127
+ │ ├── aggregate_to_fs.sh download_loop.sh setup-china-mirrors.sh
128
+ │ └── verify_local.py # load-and-verify each artifact before any teardown
129
+ ├── examples/autodl_sweep/ # one complete worked case, end to end
130
+ └── evals/ # cases.jsonl + run_evals.py (no-API-key drift guard) + RESULTS.md
131
+ ```
132
+
133
+ Each profile fills the same eight fields, so a platform you've never used reads like one you have:
134
+ launch · storage survival-matrix · network · spot/resume · teardown/billing · daemon · gotchas · script
135
+ overrides.
136
+
137
+ ## Install and deploy
138
+
139
+ This is a standard [Agent Skill](https://agentskills.io): one folder with a `SKILL.md` at its root.
140
+ Installing it means cloning that folder into wherever your agent looks for skills, then restarting the
141
+ agent. It auto-triggers on remote or rented-GPU deploy / train / monitor tasks — you don't invoke it by
142
+ name. Keep the folder named `remote-gpu-trainer`; the standard requires the directory name to match the
143
+ skill's `name:` field.
144
+
145
+ **Claude Code**
146
+
147
+ ```bash
148
+ git clone https://github.com/Hanyuyuan6/remote-gpu-trainer.git ~/.claude/skills/remote-gpu-trainer
149
+ ```
150
+
151
+ **OpenAI Codex**
152
+
153
+ ```bash
154
+ git clone https://github.com/Hanyuyuan6/remote-gpu-trainer.git ~/.agents/skills/remote-gpu-trainer
155
+ ```
156
+
157
+ **Cursor · Trae · Gemini CLI · VS Code / Copilot · Goose · Kiro · other compatible agents**
158
+
159
+ Clone the same folder into that agent's skills directory (each agent's docs, or
160
+ [agentskills.io](https://agentskills.io), give the exact location). Because they all read the same open
161
+ `SKILL.md` standard, the folder works unchanged across every one of them.
162
+
163
+ **Verify the install (optional).** With [uv](https://github.com/astral-sh/uv):
164
+
165
+ ```bash
166
+ uvx --from skills-ref agentskills validate ~/.claude/skills/remote-gpu-trainer # → "Valid skill"
167
+ ```
168
+
169
+ > **Two caveats.** The companion skills this one cross-links (`verifying-dl-experiments`,
170
+ > `superpowers:*`, `huggingface-skills:*`) are optional separate installs; it works standalone without
171
+ > them. And a few durable-monitoring recipes assume a host background-task runner plus a scheduler — map
172
+ > those to your agent's equivalents, using the per-host table in `references/monitoring_patterns.md` §7.
173
+
174
+ ## What's inside
175
+
176
+ - **`SKILL.md`** — the hub. Ten platform-agnostic operating principles, the six-phase lifecycle with a
177
+ runnable gate per phase, the platform selector, and the cross-links into everything below.
178
+ - **`references/`** — the platform-agnostic knowledge: `principles.md` (the ten invariants expanded),
179
+ `gotchas_universal.md` (U1–U39, each a `symptom → root cause → fix`; U36–U38 are delegated cross-links), `monitoring_patterns.md`
180
+ (four-layer durable monitoring plus a cross-host portability map), and the focused playbooks for SSH
181
+ transport, China networking, spot resilience, parallel ablation, multi-node, and self-improvement.
182
+ - **`references/training/`** — the **DL-training debug layer**, eight files for when the *run* breaks
183
+ rather than the platform: OOM, distributed launch and multi-GPU hangs, precision and loss spikes,
184
+ throughput profiling, checkpoint/resume, per-domain gotchas, convergence ("runs but won't learn"), and
185
+ dataloader correctness.
186
+ - **`profiles/`** — one file per platform, the only place concrete specifics live. `autodl` is the
187
+ deepest; alongside it are `runpod`, `vastai`, `lambda`, `paperspace`, `china`, and `generic-ssh`
188
+ (covering Slurm, K8s, Colab, Kaggle). `_schema.md` defines the shared eight-field contract.
189
+ - **`scripts/`** — parameterized wrapper templates, a memory monitor, a GPU-health probe, a VRAM-zombie
190
+ reaper, a read-only health-patrol tick, FS aggregation, a resumable download loop, the China-mirror
191
+ setup, a load-and-verify checker, and `check_staleness.py`.
192
+ - **`examples/autodl_sweep/`** — one complete worked case, end to end.
193
+ - **`evals/`** — a retrieval drift-guard: `cases.jsonl` holds realistic scenarios, `run_evals.py` checks
194
+ with no API key that every scenario's answer is still present at its documented location, and
195
+ `RESULTS.md` records fresh-agent navigation runs.
196
+
197
+ ## Scope
198
+
199
+ - **For:** rented or remote GPU instances (Chinese and Western clouds, bare SSH, Slurm, K8s); single or
200
+ multi-instance; long-running jobs — training, eval, ablation sweeps, batch inference, large data
201
+ processing.
202
+ - **Not for:** purely-local single-GPU training, in-instance multi-GPU DDP (use `torchrun` /
203
+ `accelerate`), managed multi-cloud price-shopping (use SkyPilot's skill), or zero-ops serverless (use
204
+ Modal).
205
+
206
+ ## Verification status
207
+
208
+ The **AutoDL** profile reflects the author's hands-on, daily use. The other six profiles — RunPod,
209
+ vast.ai, Lambda, Paperspace, the Chinese platforms, and the generic SSH / Slurm / K8s core — are
210
+ researched from each platform's official documentation and community reports. Every money-affecting fact
211
+ is cited inline and stamped `verified <month>`, but these six profiles are **not yet independently live-tested** by the
212
+ author. Treat them as a well-sourced starting map, not a guarantee.
213
+
214
+ The skill is built to **verify before any irreversible or costly action** (the Phase-0 live measurement,
215
+ the teardown Iron Law), so a stale fact surfaces as "re-check the docs," not a silent loss. Corrections,
216
+ and "I ran this, here's what changed" reports, are very welcome — please open an issue or PR.
217
+
218
+ ## Disclaimer
219
+
220
+ This is an independent community resource. It is **not affiliated with, endorsed by, or sponsored by**
221
+ AutoDL, RunPod, vast.ai, Lambda, Paperspace, DigitalOcean, or any platform named here. All product names
222
+ and trademarks belong to their respective owners and are used **nominatively**, only to identify the
223
+ platform a piece of guidance applies to. Platform facts are synthesized from public documentation and
224
+ community reports (cited inline) and were accurate at the noted `verified` date. **Platforms change their
225
+ pricing, billing verbs, and limits, so verify against current official docs before relying on a teardown
226
+ or billing fact** (see `references/self-improvement.md` §5). Provided "as is" under the MIT License,
227
+ without warranty.
228
+
229
+ ## 中文简介
230
+
231
+ 面向在**租来的 / 远程 GPU**(不是你自己的机器)上跑长任务的研究者与工程师,覆盖 AutoDL、RunPod、
232
+ vast.ai、Lambda、Paperspace、国内平台(恒源云 / 矩池云 / Featurize / 揽睿星舟)、裸 SSH 机器、Slurm、
233
+ Kubernetes,单机或多机并行。
234
+
235
+ 核心隐喻:**你是别人机器上的短期租客。** 所以技能教的是「让作业活过这台租来的机器」:把作业 detach、
236
+ 让结果活得比实例更久、再安全地停掉计费。一套心智模型跨所有后端,只把每个平台的差异(停止 vs 销毁的计费、
237
+ 机器锁定的网盘、`/root` 是否易失、加速代理 vs HF 镜像、spot 抢占宽限)参数化下沉到各
238
+ `profiles/<平台>.md`。
239
+
240
+ 它专注的,正是 SkyPilot / dstack / Modal 这类抽象层略过的盲区:**AutoDL + 国内平台 + 裸 SSH 廉价租卡**
241
+ 上的磁盘预算、inode 上限、镜像卡顿、cgroup OOM、spot 宽限窗口,以及不可逆的销毁操作。安装方式见
242
+ [Install and deploy](#install-and-deploy):把整个文件夹克隆进对应 agent 的 skills 目录即可,重启后自动
243
+ 触发。
244
+
245
+ ## Contributing
246
+
247
+ Issues and PRs are welcome, especially **new platform profiles** and **new gotchas** with a concrete
248
+ `symptom → root cause → fix`. Keep every example generic: no real project names, hostnames, IPs, ports,
249
+ or keys. The `references/self-improvement.md` protocol describes the bar a new gotcha has to clear
250
+ (root-caused, reproduced, generalizable) before it earns a place in the catalog.
251
+
252
+ ## License
253
+
254
+ MIT — see [LICENSE](LICENSE). Copyright (c) 2026 Yuyuan Han.
255
+
256
+ ## Citing
257
+
258
+ A link back is plenty. If you need a formal reference:
259
+
260
+ ```bibtex
261
+ @software{han_remote_gpu_trainer_2026,
262
+ author = {Han, Yuyuan},
263
+ title = {remote-gpu-trainer: an Agent Skill for long GPU jobs on rented instances},
264
+ year = {2026},
265
+ url = {https://github.com/Hanyuyuan6/remote-gpu-trainer}
266
+ }
267
+ ```
@@ -0,0 +1,249 @@
1
+ ---
2
+ name: remote-gpu-trainer
3
+ description: "Deploy, monitor, and debug long GPU jobs on RENTED/remote instances (AutoDL, RunPod, vast.ai, Lambda, Slurm, K8s): teardown/billing safety, spot resilience, resumable checkpointing, OOM/NaN triage."
4
+ risk: safe
5
+ source: community
6
+ source_type: community
7
+ source_repo: Hanyuyuan6/remote-gpu-trainer
8
+ date_added: "2026-06-20"
9
+ category: ml-ops
10
+ license: "MIT"
11
+ license_source: "https://github.com/Hanyuyuan6/remote-gpu-trainer/blob/main/LICENSE"
12
+ compatibility: |
13
+ Any Agent-Skills (SKILL.md)-compatible agent — Claude Code, Codex, Cursor, Trae, Gemini CLI, etc.
14
+ Needs a shell + SSH (or a platform CLI/API) to drive the remote box; scripts are bash/python. A few
15
+ durable-monitoring recipes assume a host background-task runner + scheduler — map to the running
16
+ agent's equivalents (references/monitoring_patterns.md §7). Companion skills (verifying-dl-experiments,
17
+ superpowers:*, huggingface-skills:*) are optional separate installs.
18
+ ---
19
+
20
+ # remote-gpu-trainer — Remote GPU Job Orchestration
21
+
22
+ ## Overview
23
+
24
+ Deploy and babysit long-running GPU jobs on **rented boxes you don't own**, across any platform, and
25
+ get the result off the box before the meter or a preemption kills it. The core insight: **you are a
26
+ short-term tenant on someone else's machine** — so the job is to *detach the work, make the result
27
+ outlive the instance, and stop the meter safely*, not to provision a cluster.
28
+
29
+ This skill is **platform-agnostic at the core, platform-specific at the edges**: a fixed set of
30
+ operating principles + a 6-phase lifecycle that hold everywhere, plus one **profile per platform**
31
+ (`profiles/<platform>.md`) that owns every concrete path, proxy, billing verb, and spot semantic. Its
32
+ defensible value is the union the big orchestrators skip: **Chinese cgroup-isolated rentals + bare-SSH
33
+ cheap boxes + the disk-budget / monitoring / teardown reality** that *is* the job on metered hardware.
34
+
35
+ ## When to Use This Skill
36
+
37
+ Use whenever the user deploys, trains, monitors, or troubleshoots a long-running GPU job on a **RENTED
38
+ or remote instance they do not own** — training, eval, ablation sweeps, batch inference, or large data
39
+ processing — on AutoDL, RunPod, vast.ai, Lambda, Paperspace, Chinese platforms (恒源云/矩池云/Featurize/
40
+ 揽睿星舟), a bare SSH box, Slurm, or Kubernetes; single OR multi-instance. Triggers (multilingual):
41
+ "远程 GPU 训练", "GPU 租赁", "GPU rental", "租卡", "spot 抢占", "spot preemption", "断点续训",
42
+ "resumable training", "tmux 训练守护", "防 SSH 断线", "scp/rsync 上传", "多实例 ablation",
43
+ "远程 GPU 监控", "省钱关机/销毁实例", "stop vs terminate billing", "checkpoint 磁盘满",
44
+ "CUDA OOM/显存不足", "loss NaN/loss spike", "loss 不下降/不收敛", "overfit 单 batch",
45
+ "FSDP/DeepSpeed 配置", "多卡训练 hang", "dataloader worker/数据增广 bug". **NOT** for purely local
46
+ single-GPU training, in-instance multi-GPU DDP (use torchrun/accelerate), managed multi-cloud
47
+ price-shopping (use SkyPilot's skill), or zero-ops serverless (use Modal).
48
+
49
+ ## When NOT to use — and what to use instead
50
+
51
+ | Situation | Use instead |
52
+ |---|---|
53
+ | Local single-GPU, or multi-GPU **DDP inside one box** | `torchrun` / `accelerate` directly |
54
+ | Managed multi-cloud price-shopping + auto spot-recovery across **Western** clouds | **SkyPilot** (has its own Agent Skill) — then come back here to make your *code* resume-correct so its recovery actually works |
55
+ | Open BYOC dev environments | **dstack** |
56
+ | Zero-ops serverless inference | **Modal** |
57
+ | "Is this metric / ablation delta real?" | **REQUIRED:** `verifying-dl-experiments` (this skill owns *running* the job; that one owns *whether the number is true*) |
58
+
59
+ **This skill is for the blind spot those tools leave:** AutoDL + Chinese platforms, bare SSH/Slurm/K8s
60
+ rentals, and the operational gotchas (inode caps, mirror stalls, cgroup OOM, silent sync, spot grace
61
+ windows, irreversible teardown) that survive whichever provisioner you use.
62
+
63
+ ## Operating principles (the WHY — 10 invariants)
64
+
65
+ These hold on every metered, isolated, rented GPU; only the paths/CLI change. One line each; the deep
66
+ form with cross-platform nuance is in **`references/principles.md`** (read it before Phase 0).
67
+
68
+ 1. **Minimize paid wall-clock.** The meter runs the whole time — smoke locally on CPU before renting, launch detached, release the instant verification passes.
69
+ 2. **Cheap checks before expensive compute.** A 1–2 batch CPU smoke (logger off) kills import/config/shape/scale bugs for ~free. (Smoke *content* → `verifying-dl-experiments`.)
70
+ 3. **Trust artifacts you loaded, not log lines that claim success.** "synced/saved/done" lies under a silently-failed write; a watcher's own state is also a claim — reconcile it against the real process/artifact.
71
+ 4. **Know what survives stop vs destroy.** Per platform, identify exactly which mount survives a *stop* and which survives a *terminate* — the data you need often lives on the volatile one. (The single biggest portability trap.)
72
+ 5. **Storage fails on the dimension you're not watching.** Disk dies on **inodes** before bytes; the real hog hides in a symlinked cache; clean by value (keep tiny evidence, drop big scratch); monitor `df -i`, not just `df -h`.
73
+ 6. **Never mutate inputs under a live run.** A running shell script is read from disk incrementally, by byte offset, not loaded whole; overwriting one mid-run can re-execute or skip blocks. Version filenames.
74
+ 7. **Design for retry — failure is probabilistic, transfers are flaky, mirrors are route-specific.** Make wrappers idempotent + resumable; retry the *identical* config; wrap bulk transfers in `timeout`+resume loops; a mirror/proxy speeds ONE route — validate on the same route the real transfer uses.
75
+ 8. **Checkpoint-to-durable + idempotent resume is the universal spine.** File checkpoint to the platform's durable location + unconditional load-latest-on-startup is the *one* mechanism that survives an SSH drop, a Slurm walltime kill, a K8s reschedule, a spot preemption, and a Colab disconnect. The detach primitive (tmux/sbatch/Job/commit) is the swappable plug; this is the invariant.
76
+ 9. **Cost and destructive actions are the user's call.** Never auto-release/terminate, never delete durable files without confirmation; if cleanup can't free space, **ask to expand the disk** rather than silently shrink the experiment.
77
+ 10. **Teach the user the platform, don't just drive it.** Most users don't know a platform's non-obvious **conveniences** (one-click SSH-key registration, GPU-availability notifications, built-in panels) or its **danger clocks** (auto-release/auto-delete timers on a *stopped* box — AutoDL releases a 关机 instance after 15 days → data disk gone; a stop that keeps billing; low-balance purge). Surface them on first contact — #9 stops the agent *doing* the dangerous thing, #10 *warns the human* before the clock fires. Per-platform list → each profile's **Surface to the user** block.
78
+
79
+ > **Monitoring physics (substrate for #3):** foreground Bash hard-caps at 600 s; `run_in_background` has no cap and notifies on exit; a never-exiting watcher never notifies; an unquoted `|` in a poll regex reads stdin and hangs forever. The four-layer monitoring architecture is built on these facts → `references/monitoring_patterns.md`.
80
+
81
+ ## Code discipline (the wrapper & training scripts you write)
82
+
83
+ Two rules govern the launch/wrapper/training code this skill has you write — corollaries of #1 and #8, not new invariants:
84
+
85
+ 1. **Reuse before writing.** Take the lowest rung that already works before adding code: the base image's pre-installed stack + platform features → a framework/library utility (`torchrun` / `accelerate` / HF) → your existing `scripts/` templates → minimal new code. On a metered box a needless `pip install` also burns paid wall-clock and can break the image's ABI — Phase 1's rule (*the prebuilt image **is** the env; don't `conda create` on a rental*) is exactly this principle applied to dependencies.
86
+ 2. **Floor — `minimum` bounds scope, not correctness.** Shrinking code must never drop what makes an expensive run survivable: checkpoint-to-durable + idempotent resume (#8), atomic writes, the error handling that prevents losing a long run, or seed/determinism logging. Keep one minimal self-check for non-trivial logic.
87
+
88
+ ## Pick your platform profile FIRST
89
+
90
+ Read the matching profile **before Phase 0** — it owns every path, proxy, credential location, billing
91
+ verb, and spot rule the phases below delegate to. Each follows the same 8-field schema
92
+ (`profiles/_schema.md`).
93
+
94
+ > **New here? The path is:** (1) find your platform in the table below → (2) read that profile's **LAUNCH**
95
+ > section (it walks rent → register SSH key → reach the box) → (3) come back and run the 6 phases from Phase 0.
96
+ > Already have a box you can `ssh` into? Skip straight to Phase 0.
97
+
98
+ | You're on… | Profile | Kind | Detach primitive | Meter-stop verb |
99
+ |---|---|---|---|---|
100
+ | AutoDL (deepest, battle-tested) | `profiles/autodl.md` | ssh-rental | tmux | 关机 (stops meter, **keeps disk** — the AutoDL exception) |
101
+ | RunPod | `profiles/runpod.md` | ssh-rental | tmux | **terminate** (destroys the volume disk; a mere stop keeps that disk but still bills it at 2×) |
102
+ | vast.ai | `profiles/vastai.md` | ssh-rental (spot) | tmux | **destroy** (stop bills disk forever) |
103
+ | Lambda | `profiles/lambda.md` | cloud-api | tmux | **terminate** (no stop state) |
104
+ | Paperspace | `profiles/paperspace.md` | cloud-api | tmux | **destroy + release IP + delete storage** (shut-down stops compute only) |
105
+ | 恒源云 / 矩池云 / Featurize / 揽睿星舟 | `profiles/china.md` | ssh-rental | tmux | per-platform (data disk often bills while stopped) |
106
+ | Bare SSH box / Slurm / K8s / Colab-Kaggle | `profiles/generic-ssh.md` | ssh / slurm / k8s | tmux / sbatch / Job / commit | **manual** (a forgotten box bills 24/7) |
107
+
108
+ > **Profile confidence:** AutoDL is battle-tested from the author's daily use; the other six profiles are
109
+ > built from each platform's official docs + community reports (cited inline, `verified <month>`) and not
110
+ > yet independently live-tested — lean on the Phase-0 live measurements and **re-verify any
110
+ > teardown/billing fact against current docs before betting money or data** (`references/self-improvement.md` §5).
112
+
113
+ **Mental verb model** (one API across all platforms; the profile binds each verb to real commands):
114
+ `up` (rent+reach) → `push` (code/data on) → `run` (detached + checkpointing) → `watch` (durable monitor) → `pull` (results off + verify) → `down` (stop the meter).
115
+
116
+ ## Default workflow (6 phases)
117
+
118
+ Skip phases already done. Each phase delegates substrate to the profile and **ends in a runnable check**.
119
+
120
+ **Phase 0 — Environment audit.** Read the profile's STORAGE survival-matrix + region/DC-lock. Measure live:
121
+ `df -h && df -i <data-mount>`, cgroup `memory.max`, `nvidia-smi`. Pre-compute the checkpoint disk budget
122
+ (`ckpt_size × N + scratch`). → **verify:** `nvidia-smi` shows the expected GPU and `df -i` is not near 100%.
123
+
124
+ **Phase 1 — SSH + credentials.** Set the alias/env per the profile (the prebuilt image/base IS the env —
125
+ do not `conda create` on a rental). **Never rented before? The profile's LAUNCH section walks rent → register SSH key → connect.** Push secrets via **stdin, never onto a shared/durable FS**
126
+ (`references/ssh_transport.md`). → **verify:** `ssh <alias> 'python -c "import torch;print(torch.cuda.is_available())"'`.
127
+
128
+ **Phase 2 — Wrapper + CPU-smoke gate.** Build an idempotent `run_one`/`run_queue` from `scripts/` (parameterized
129
+ from the profile's OVERRIDES; **size batch/workers to the box for a standalone run, but PIN them across cells for a fair comparison** — `references/training/throughput-profiling.md`). **Run the cheap CPU smoke locally BEFORE renting** — it kills the dumb,
130
+ expensive failures (e.g. `python -m <your.train.module> --limit-batches 2 --epochs 1` — substitute your own entrypoint; this gate needs your training code plugged in). → **verify:** that smoke exits 0 on 2 batches with the logger disabled.
131
+
132
+ **Phase 3 — Detached launch.** Launch via the profile's detach primitive; probe briefly (log head + alive +
133
+ no traceback), then **hand back** — never a blocking foreground `sleep`. → **verify:** within 60 s, the detach
134
+ session is alive and the first log line shows the expected step/epoch.
135
+
136
+ **Phase 4 — Durable monitoring.** For anything over ~1–2 h, deploy the **four-layer architecture**
137
+ (`references/monitoring_patterns.md`): on-box self-completion chain + session patrol loop + event sentinels +
138
+ recovery handbook. **On Claude Code, fire the L2 patrol via `/loop 30m` (or `ScheduleWakeup`) running `scripts/health_patrol.sh.template`**; a host with no local recurring runner wires the on-box self-push instead (`references/monitoring_patterns.md` §7). A session-bound watcher alone dies with the session. Classify each outcome →
139
+ fixed remediation; **never blind-retry**. → **verify:** the patrol reports even when nothing changed.
140
+
141
+ **Phase 5 — Aggregate + verify + teardown.** Checked-sync to durable storage (gate the success line on the
142
+ copy result — principle #3), then **load-and-verify each artifact** (`scripts/verify_local.py`), THEN the profile's
143
+ meter-stopping action. → **verify:** `verify_local.py` reports 100% OK *before* any teardown.
144
+
145
+ > **Iron Law — teardown gate:** NO `release` / `terminate` / `destroy` / file-delete until checkpoints are
146
+ > **pulled to local AND verified by load**, and the user has explicitly approved the cost-affecting action.
147
+ > "It looked done in the log" is not evidence (principle #3). On most platforms the meter-stopping action is
148
+ > **irreversible** (deletes the disk) — confirmation matters more, not less.
149
+
150
+ ## Parallel ablation fan-out
151
+
152
+ For N ablation cells: one job per cell, an **isolated write path per job** (no shared mutable output), launched
153
+ across instances/queues. **REQUIRED (where installed):** `superpowers:dispatching-parallel-agents` supplies the independence
154
+ predicate (don't fan out onto shared state) and the mandatory post-fan-out reconciliation. FS-shared deployment
155
+ pattern → `references/parallel_ablation.md`.
156
+
157
+ ## Quick reference — the four facts that bite per platform
158
+
159
+ Full detail in each profile; this table is the at-a-glance.
160
+
161
+ | Platform | Survives **stop** | Survives **destroy** | Spot grace | China mirror needed |
162
+ |---|---|---|---|---|
163
+ | AutoDL | /root + data + FS | FS only | n/a | yes (`/etc/network_turbo`, hf-mirror) |
164
+ | RunPod | volume disk (bills 2×) | Network Volume only | ~5 s SIGTERM→KILL | no (`hf_transfer`) |
165
+ | vast.ai | disk (bills forever) | nothing | ~0 s (abrupt) | no |
166
+ | Lambda | n/a (no stop) | nothing | n/a (on-demand) | no |
167
+ | China (恒源云/矩池云/…) | varies; data disk bills | per-platform persistent vol | n/a | yes |
168
+ | generic-SSH/Slurm/K8s | you own it | you own it | Slurm SIGTERM→KillWait (def 30 s) | only if in China |
169
+
170
+ ## Common gotchas (top 8 inline — full catalog in references/)
171
+
172
+ The universal ones that cost the most GPU-hours. Symptom → fix; root cause + the rest in
173
+ **`references/gotchas_universal.md`** (run `grep -i '<keyword>' references/gotchas_universal.md` to jump).
174
+
175
+ 1. **SSH drops on `pkill -9`** (exit 255 + "Connection reset") — normal; re-ssh to verify, don't panic.
176
+ 2. **tmux holds the script in memory** — editing it mid-run re-executes blocks; version the filename.
177
+ 3. **Disk-full crashes `torch.save`** (`iostream error`) — pre-budget; auto-prune `latest.pth`, keep `best`.
178
+ 4. **cgroup OOM with no traceback** (bare `Killed` / exit 137) — `num_workers × big-tensor`; size workers vs `memory.max`, not CPU count.
179
+ 5. **Silent sync failure** — `cp … 2>/dev/null; echo synced` lies on a full/inode-exhausted FS; gate the success line on the actual copy result.
180
+ 6. **Spot preemption grace is tiny (~5 s → ~0 s on the platforms profiled here; AWS-style 2-min grace only on clouds not profiled)** — a SIGTERM-flush handler is NOT a safety net; checkpoint on a timer to durable storage, load-latest unconditionally (`references/spot-resilience.md`).
181
+ 7. **"Stop" rarely stops the meter** — only `terminate`/`destroy` does, and it's irreversible (deletes the disk). Know the verb from the profile before you click, and on RunPod a stopped Pod can even restart with zero GPUs.
182
+ 8. **CRLF breaks `.sh` on Linux** — author on Windows → `.gitattributes` `*.sh text eol=lf`; on-box unblock `sed -i 's/\r$//' <script>.sh`.
183
+
184
+ ## When training itself breaks (the model, not the platform)
185
+
186
+ Platform ops is only half the job — once the box is running, training breaks in its own ways. The
187
+ `references/training/` layer is the debug knowledge for the run itself. Boundary: **this layer owns
188
+ "make it run, fast, and not crash"; `verifying-dl-experiments` owns "is the *number* real"** —
189
+ cross-link it for collapse / leakage / metric-validity. Every entry is symptom → root cause → fix with
190
+ cited current docs.
191
+
192
+ - `references/training/oom-memory.md` — CUDA/VRAM + host-RAM OOM and the fit-it ladder (grad-accum → bf16 → activation-checkpointing → `expandable_segments` → FSDP/ZeRO → CPU/NVMe offload → LoRA/QLoRA); OOM-at-a-specific-step (first backward / val / longest batch); the memory snapshot + visualizer.
193
+ - `references/training/distributed-launch.md` — `torchrun`/`accelerate`/`deepspeed` launch + env contract, DDP/FSDP/ZeRO config, and the multi-GPU **HANGS** toolkit (one-rank-diverged, rank-conditional collective, dataloader-length mismatch). Multi-node wire → `references/multinode.md`.
194
+ - `references/training/precision-stability.md` — fp16/bf16/tf32 + AMP/GradScaler, NaN/Inf hunting (`detect_anomaly`), LLM **loss spikes** + divergence (warmup, clip, init, z-loss).
195
+ - `references/training/throughput-profiling.md` — GPU-bound vs data-bound vs comms-bound; dataloader knobs; `torch.compile` traps; flash-attention; `torch.profiler` / Nsight.
196
+ - `references/training/checkpoint-resume.md` — full-state save/resume mechanics, sharded (FSDP/DeepSpeed) checkpoints, and the resume bugs (epoch restart, data reshuffle, scaler/EMA dropped). Spot cadence → `references/spot-resilience.md`.
197
+ - `references/training/by-domain.md` — per-domain gotchas: LLM/transformer, vision (det/seg), diffusion, RL, multimodal/VLM.
198
+ - `references/training/convergence-debugging.md` — the **"runs but won't learn / learns badly"** layer: the overfit-one-batch smoke, params-not-updating, optimizer/LR/weight-decay/schedule config, loss-function footguns (double-softmax, BCEWithLogits, CE-target form), fine-tuning/freezing (frozen-BN drift, discriminative LR, LoRA wiring), and the training-dynamics dashboard (update:weight ratio, dead-ReLU, GradScaler-scale).
199
+ - `references/training/data-pipeline.md` — dataloader/dataset **correctness** (not speed): the worker-RNG augmentation-duplication bug, IterableDataset worker/rank sharding, collate/`__len__`/`pin_memory`/`spawn` contracts, and preprocessing/label/shuffle traps (RGB-vs-BGR, ToTensor ÷255, `set_epoch`).
200
+
201
+ ## Companion skills (separate installs; REQUIRED reading where present)
202
+
203
+ These are **separate** Agent Skills, not bundled here — install them for the full experience. On an
204
+ agent where a companion isn't installed, treat its pointer below as an optional cross-reference; this
205
+ skill still works standalone.
206
+
207
+ - **`verifying-dl-experiments`** — owns *is-the-number-real*: smoke content, retry-vs-safeguard, keepable-checkpoint, eval sizing, tracker forensics, GPU-0%-util diagnosis. This skill owns *where/when/how-much-$*.
208
+ - **`huggingface-skills:hf-cli`** — the transport verbs (`hf download --resume`, `hf upload-large-folder`, `hf cache verify`); this skill owns the China-mirror swap + stall-retry (`references/china-network.md`).
209
+ - **`huggingface-skills:huggingface-trackio`** — hosted tracker so metrics survive teardown (gotcha U20); poll `trackio` alerts as a structured monitor instead of brittle ssh-tail.
210
+ - **`superpowers:verification-before-completion`** — the Iron Law's general form; gates every "training done / synced / teardown complete" claim.
211
+ - **`superpowers:dispatching-parallel-agents`** — independence predicate + reconciliation for ablation fan-out.
212
+
213
+ ## Getting better over time (capture new gotchas + personalize)
214
+
215
+ This skill is static, but every run can teach it something — without corrupting it.
216
+ Protocol → **`references/self-improvement.md`**. In short: when a run surfaces a gotcha the catalog
217
+ lacks, **only record a root-caused, reproduced, generalizable one** (a one-off flake is a hypothesis,
218
+ not a gotcha — principle #3); **route it** — user/project-specific → the host's memory system,
219
+ generalizable → propose adding to `references/gotchas_universal.md` / the profile §7 /
220
+ `references/training/` (and offer an upstream PR); **never silently rewrite a skill file — draft the
221
+ `symptom → root cause → fix` and let the user approve.** On first use, capture the user's platforms +
222
+ paths + tracker entity into memory so later runs are pre-parameterized. Platform facts carry a `verified
223
+ <month>` stamp — re-verify any teardown/billing fact against current docs before betting money or data.
224
+
225
+ ## Limitations
226
+
227
+ - Does not replace a real cloud orchestrator or managed provisioner; use it to make rented-box work survivable, not to optimize multi-cloud procurement.
228
+ - Platform billing, stop, destroy, and data-retention behavior can drift; re-check current provider docs before destructive or money-impacting actions.
229
+ - Requires user-owned credentials, SSH/API access, and explicit confirmation before teardown, deletion, or other irreversible cleanup.
230
+ - Companion skills named above are not bundled here; treat them as optional references unless installed in the current agent environment.
231
+
232
+ ## Bundled resources
233
+
234
+ Load only what the current phase needs.
235
+
236
+ - `references/principles.md` — the 10 invariants expanded, with the cross-platform nuance behind each.
237
+ - `references/lifecycle_checklist.md` — the 6-phase runbook as a per-platform checklist.
238
+ - `references/gotchas_universal.md` — universal + mixed gotchas (TOC + grep index at top).
239
+ - `references/monitoring_patterns.md` — the four-layer durable-monitoring architecture + robust ssh-poll template.
240
+ - `references/ssh_transport.md` — ssh config, rsync/scp resumable patterns, secrets-via-stdin, CRLF, two-SSH-flavor caveat.
241
+ - `references/china-network.md` — mirrors table + HF_ENDPOINT + resumable-download ladder + the `no_proxy` trap (all CN platforms).
242
+ - `references/spot-resilience.md` — preemption signals, Young/Daly checkpoint cadence, atomic-write resume.
243
+ - `references/parallel_ablation.md` — FS-shared fan-out + the independence predicate + reconciliation.
244
+ - `references/multinode.md` — (advanced) NCCL / fabric-manager / elastic-training gotchas; single-box users skip.
245
+ - `references/training/` — the **DL-training debug layer** (8 files: oom-memory, distributed-launch, precision-stability, throughput-profiling, checkpoint-resume, by-domain, convergence-debugging, data-pipeline) — see "When training itself breaks" above.
246
+ - `references/self-improvement.md` — the feedback loop: capture a new gotcha (only one that clears the quality bar) into memory or the catalog, personalize on first run, keep platform facts fresh.
247
+ - `scripts/` — wrapper templates (`run_one`/`run_queue`), monitors (`mem_monitor`, `gpu_health`, `reap_vram_zombies`), the read-only patrol (`health_patrol.sh.template`), transfer/aggregation (`download_loop`, `aggregate_to_fs`, `setup-china-mirrors`), the load-and-verify checker (`verify_local.py`), and the `verified`-stamp freshness linter (`check_staleness.py`).
248
+ - `profiles/<platform>.md` — the per-platform substrate (one per platform; `_schema.md` defines the 8 fields).
249
+ - `examples/autodl_sweep/` — one complete, runnable worked case end to end.
@@ -0,0 +1,57 @@
1
+ # Evals — does the skill actually route to the right answer?
2
+
3
+ A skill is only as good as an agent's ability to *find and apply* the right entry under a real
4
+ problem. These evals test that, in two tiers, against a fixed set of realistic scenarios
5
+ ([`cases.jsonl`](cases.jsonl)) spanning both halves of the skill (remote-GPU operations on every
6
+ platform family + the DL-training-debug layer, including the `convergence-debugging` and
7
+ `data-pipeline` files).
8
+
9
+ ## Tier 1 — structural reachability (runnable, no API key)
10
+
11
+ ```bash
12
+ python evals/run_evals.py # exits non-zero if any case regresses
13
+ ```
14
+
15
+ For each scenario it asserts the answer is **present, at the documented location, with the
16
+ expected entry IDs / keywords intact**: every `expect_files` exists, every `expect_ids` is still a
17
+ `### <ID>` header there, every `expect_grep` term is still in the text. This is a **drift guard** —
18
+ it catches a renamed/removed entry, a moved section, a deleted file, or a fact rewritten away from
19
+ its key term. Run it in CI; it needs nothing but Python 3.
20
+
21
+ What it does **not** prove: that an agent actually *navigates* there (Tier 2), or that the platform
22
+ *facts* are correct on a live box (see Verification status).
23
+
24
+ ## Tier 2 — agentic navigation (the gold standard)
25
+
26
+ The real test: give a **fresh agent** the skill and one scenario's `prompt`, let it navigate **from
27
+ SKILL.md only** (following the documented routing, not blind grep), and check it reaches a correct,
28
+ specific answer covering the case's `must_cover` points within ~2 hops. Each case records its last
29
+ such run in the `agentic` field; the collected runs are in [`RESULTS.md`](RESULTS.md).
30
+
31
+ To re-run Tier 2 with any agent/harness: load the skill, paste a case `prompt`, and grade the
32
+ answer against `expect_files` / `expect_ids` / `must_cover`. (Anthropic's skill best-practices
33
+ recommend ≥3 evals across Haiku/Sonnet/Opus — re-running these cases per model is the way to meet
34
+ that bar; results to date were gathered on the development model and are labelled as such.)
35
+
36
+ ## Adding a case
37
+
38
+ Append one JSON object per line to `cases.jsonl`:
39
+
40
+ ```json
41
+ {"id": "kebab-id", "prompt": "the user's situation, verbatim-ish",
42
+ "expect_files": ["references/training/<file>.md"], "expect_ids": ["O7"],
43
+ "expect_grep": ["lr finder"], "must_cover": "the key points a correct answer must hit",
44
+ "agentic": "PASS/FAIL (date): the navigation path observed"}
45
+ ```
46
+
47
+ Use `expect_ids` for the training catalogs (they have `### O7 / DP1 / M17 …` headers) and
48
+ `expect_grep` for platform profiles (which are section-structured). Then `python evals/run_evals.py`.
49
+
50
+ ## Verification status (important)
51
+
52
+ These evals test **retrieval and routing inside the skill** — not the truth of the platform facts
53
+ on a live instance. Only the AutoDL profile is battle-tested by the author; the other six platform
54
+ profiles are researched from official docs + community reports and **not yet live-validated** (see
55
+ the repo README's "Verification status" and `references/self-improvement.md` §5). A case passing
56
+ here means "the skill leads an agent to *this documented answer*," not "this answer was confirmed on
57
+ a rented box."