opencode-skills-collection 3.1.2 → 3.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +4 -1
  2. package/bundled-skills/agent-creator/SKILL.md +246 -0
  3. package/bundled-skills/ax-extract-workflow/SKILL.md +156 -0
  4. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  5. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  6. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  7. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  8. package/bundled-skills/docs/sources/sources.md +1 -1
  9. package/bundled-skills/docs/users/bundles.md +1 -1
  10. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  11. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  12. package/bundled-skills/docs/users/getting-started.md +1 -1
  13. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  14. package/bundled-skills/docs/users/usage.md +4 -4
  15. package/bundled-skills/docs/users/visual-guide.md +4 -4
  16. package/bundled-skills/lovable-cleanup/SKILL.md +2 -1
  17. package/bundled-skills/remote-gpu-trainer/.gitattributes +8 -0
  18. package/bundled-skills/remote-gpu-trainer/LICENSE +21 -0
  19. package/bundled-skills/remote-gpu-trainer/README.md +267 -0
  20. package/bundled-skills/remote-gpu-trainer/SKILL.md +249 -0
  21. package/bundled-skills/remote-gpu-trainer/evals/README.md +57 -0
  22. package/bundled-skills/remote-gpu-trainer/evals/RESULTS.md +44 -0
  23. package/bundled-skills/remote-gpu-trainer/evals/cases.jsonl +14 -0
  24. package/bundled-skills/remote-gpu-trainer/evals/run_evals.py +68 -0
  25. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/README.md +72 -0
  26. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/queue_1.txt +6 -0
  27. package/bundled-skills/remote-gpu-trainer/profiles/_schema.md +100 -0
  28. package/bundled-skills/remote-gpu-trainer/profiles/autodl.md +327 -0
  29. package/bundled-skills/remote-gpu-trainer/profiles/china.md +397 -0
  30. package/bundled-skills/remote-gpu-trainer/profiles/generic-ssh.md +450 -0
  31. package/bundled-skills/remote-gpu-trainer/profiles/lambda.md +342 -0
  32. package/bundled-skills/remote-gpu-trainer/profiles/paperspace.md +365 -0
  33. package/bundled-skills/remote-gpu-trainer/profiles/runpod.md +164 -0
  34. package/bundled-skills/remote-gpu-trainer/profiles/vastai.md +355 -0
  35. package/bundled-skills/remote-gpu-trainer/references/china-network.md +206 -0
  36. package/bundled-skills/remote-gpu-trainer/references/gotchas_universal.md +704 -0
  37. package/bundled-skills/remote-gpu-trainer/references/lifecycle_checklist.md +148 -0
  38. package/bundled-skills/remote-gpu-trainer/references/monitoring_patterns.md +327 -0
  39. package/bundled-skills/remote-gpu-trainer/references/multinode.md +190 -0
  40. package/bundled-skills/remote-gpu-trainer/references/parallel_ablation.md +196 -0
  41. package/bundled-skills/remote-gpu-trainer/references/principles.md +179 -0
  42. package/bundled-skills/remote-gpu-trainer/references/self-improvement.md +74 -0
  43. package/bundled-skills/remote-gpu-trainer/references/spot-resilience.md +235 -0
  44. package/bundled-skills/remote-gpu-trainer/references/ssh_transport.md +270 -0
  45. package/bundled-skills/remote-gpu-trainer/references/training/by-domain.md +230 -0
  46. package/bundled-skills/remote-gpu-trainer/references/training/checkpoint-resume.md +368 -0
  47. package/bundled-skills/remote-gpu-trainer/references/training/convergence-debugging.md +187 -0
  48. package/bundled-skills/remote-gpu-trainer/references/training/data-pipeline.md +119 -0
  49. package/bundled-skills/remote-gpu-trainer/references/training/distributed-launch.md +422 -0
  50. package/bundled-skills/remote-gpu-trainer/references/training/oom-memory.md +338 -0
  51. package/bundled-skills/remote-gpu-trainer/references/training/precision-stability.md +401 -0
  52. package/bundled-skills/remote-gpu-trainer/references/training/throughput-profiling.md +451 -0
  53. package/bundled-skills/remote-gpu-trainer/scripts/aggregate_to_fs.sh +55 -0
  54. package/bundled-skills/remote-gpu-trainer/scripts/check_staleness.py +70 -0
  55. package/bundled-skills/remote-gpu-trainer/scripts/download_loop.sh +67 -0
  56. package/bundled-skills/remote-gpu-trainer/scripts/gpu_health.sh +169 -0
  57. package/bundled-skills/remote-gpu-trainer/scripts/health_patrol.sh.template +67 -0
  58. package/bundled-skills/remote-gpu-trainer/scripts/mem_monitor.sh +67 -0
  59. package/bundled-skills/remote-gpu-trainer/scripts/reap_vram_zombies.sh +175 -0
  60. package/bundled-skills/remote-gpu-trainer/scripts/run_one.sh.template +104 -0
  61. package/bundled-skills/remote-gpu-trainer/scripts/run_queue.sh.template +83 -0
  62. package/bundled-skills/remote-gpu-trainer/scripts/setup-china-mirrors.sh +35 -0
  63. package/bundled-skills/remote-gpu-trainer/scripts/verify_local.py +145 -0
  64. package/package.json +1 -1
  65. package/skills_index.json +66 -0
@@ -0,0 +1,235 @@
1
+ # Spot / Preemption Resilience
2
+
3
+ Make a job survive being killed at a random instant — the price of riding the 50–90 %-cheaper
4
+ spot/preemptible/interruptible tier. The whole layer reduces to **principle #8**
5
+ (`references/principles.md`): checkpoint full state to durable storage on a Young/Daly timer, load-latest
6
+ unconditionally on startup, write atomically, treat the preemption signal only as an opportunistic
7
+ last-flush. This file is the deep form: per-platform grace windows, the cadence formula with a worked
8
+ number, the atomic-write resume recipe, and a commented Python skeleton.
9
+
10
+ To jump: `grep -in '<keyword>' references/spot-resilience.md` — keywords: `grace`, `signal`, `young`,
11
+ `daly`, `cadence`, `atomic`, `rename`, `resume`, `skeleton`, `managed`, `skypilot`, `sagemaker`, `slurm`.
12
+
13
+ ## Table of contents
14
+
15
+ 1. [Preemption signals + grace windows (per platform)](#1-preemption-signals--grace-windows-per-platform)
16
+ 2. [Checkpoint cadence — the Young/Daly formula](#2-checkpoint-cadence--the-youngdaly-formula)
17
+ 3. [The atomic-write resume recipe](#3-the-atomic-write-resume-recipe)
18
+ 4. [Managed-spot frameworks move the box; the checkpoint-load restores the state](#4-managed-spot-frameworks-move-the-box-the-checkpoint-load-restores-the-state)
19
+ 5. [Python checkpoint/resume skeleton](#5-python-checkpointresume-skeleton)
20
+
21
+ ---
22
+
23
+ ## 1. Preemption signals + grace windows (per platform)
24
+
25
+ The grace window dictates the design: it decides whether checkpoint-on-signal is even possible, or
26
+ whether the timer is the *only* durability. **The window is NOT the safety net** — see the design-breaking
27
+ gotcha below the table. Concrete per-platform reach/billing detail lives in each `profiles/<platform>.md`
28
+ §4; this is the cross-platform signal map.
29
+
30
+ | Platform | Detection signal | Grace window | Implication |
31
+ |---|---|---|---|
32
+ | **AWS EC2 Spot** | IMDS `http://169.254.169.254/latest/meta-data/spot/instance-action` (404 = none, 200 = pending); a rebalance-recommendation *may* fire earlier (sometimes ~10–20 min), but it is best-effort and can arrive together with the 2-min notice | **~120 s** | On-signal flush of a *small* checkpoint is viable; still timer-checkpoint for the big one |
33
+ | **GCP Spot** | metadata preemption flag + ACPI G2 Soft Off → shutdown script | **~30 s** default (configurable up to 120 s, Preview) | Timer-primary; on-signal flush only if checkpoint write < window |
34
+ | **GCP Preemptible (legacy)** | same signal, **plus a hard 24 h cap** regardless of capacity | ~30 s **+ guillotined at 24 h** | Prefer Spot for long runs; Preemptible dies at 24 h even idle |
35
+ | **Azure Spot** | IMDS Scheduled Events `/metadata/scheduledevents`, event type `Preempt` | **≥30 s** (Preempt is the short event; others give ≥5 min) | Timer-primary |
36
+ | **Slurm preemption / walltime** | `SIGTERM` (then `SIGKILL`); with `#SBATCH --signal=B:SIGTERM@360` the batch step gets SIGTERM ~360 s before the kill | **SIGTERM → ~30 s** default; widen via `--signal` lead time | `--requeue` + an in-script SIGTERM trap to checkpoint, then resume on requeue |
37
+ | **RunPod Spot** | OS **SIGTERM → SIGKILL** (also "interruptible without notice") | **~5 s** | Far too short to flush a large checkpoint — timer is the only real durability |
38
+ | **vast.ai Interruptible** | **no signal** — bid-based; instance is *paused* (processes killed) the instant it is outbid | **~0 s (abrupt)** | Pure timer; assume cold restart + reload every time |
39
+
40
+ URLs: AWS [spot-instance-termination-notices](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-instance-termination-notices.html),
41
+ [rebalance-recommendations](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/rebalance-recommendations.html);
42
+ GCP [preemptible](https://docs.cloud.google.com/compute/docs/instances/preemptible),
43
+ [spot](https://docs.cloud.google.com/compute/docs/instances/spot);
44
+ Azure [scheduled-events](https://learn.microsoft.com/en-us/azure/virtual-machines/windows/scheduled-events);
45
+ Slurm [sbatch `--signal`](https://slurm.schedmd.com/sbatch.html);
46
+ RunPod [spot-vs-on-demand](https://www.runpod.io/blog/spot-vs-on-demand-instances-runpod);
47
+ vast.ai [Rental-Types](https://vast.ai/article/Rental-Types).
48
+
49
+ **Gotcha — the design-breaking one.**
50
+ Symptom: a "catch SIGTERM, flush the 40 GB checkpoint to durable storage" handler works in testing on AWS
51
+ (120 s) but the job dies before the flush completes on RunPod (5 s) / vast.ai (0 s).
52
+ Root cause: treating the grace window as the *primary* durability mechanism — it spans 2 min down to ~0
53
+ across platforms, so any handler that needs more than a few seconds is a coin flip.
54
+ Fix: checkpoint on a **periodic timer to durable storage** (§2); use the signal trap **only** as an
55
+ opportunistic "save a final partial checkpoint if there is time" bonus, never as the safety net.
56
+
57
+ **Gotcha — GCP Preemptible 24 h guillotine.**
58
+ Symptom: a multi-day run on a Preemptible VM stops dead at 24 h even though nothing reclaimed it.
59
+ Root cause: legacy Preemptible has a hard 24 h max runtime; Spot VMs have no cap.
60
+ Fix: use **Spot, not legacy Preemptible** for any run longer than a day.
61
+
62
+ ---
63
+
64
+ ## 2. Checkpoint cadence — the Young/Daly formula
65
+
66
+ Cadence is a **formula, not a guess.** The optimal checkpoint interval that minimizes total wasted
67
+ wall-clock (rollback re-compute after a kill **plus** checkpoint-write overhead) is the Young/Daly result:
68
+
69
+ ```
70
+ W = sqrt(2 * mu * C)
71
+ ```
72
+
73
+ - `mu` = mean time between preemptions (MTBF), in seconds.
74
+ - `C` = time to write one checkpoint to durable storage, in seconds.
75
+ - `W` = checkpoint interval (write a checkpoint every `W` seconds).
76
+
77
+ **Worked example.** A checkpoint takes `C = 30 s` to write; the instance is preempted on average every
78
+ `mu = 3 h = 10800 s`. Then:
79
+
80
+ ```
81
+ W = sqrt(2 * 10800 * 30) = sqrt(648000) ≈ 805 s ≈ 13.4 min → checkpoint every ~13 min.
82
+ ```
83
+
84
+ Higher preemption rate (smaller `mu`) → shorter interval. Slower checkpoint (larger `C`) → longer interval
85
+ (each save costs more, so amortize it over more progress).
86
+
87
+ **Round W DOWN to an iteration/epoch boundary.** Young/Daly assumes a checkpoint can be taken at *any*
88
+ instant, but real iterative training can only snapshot at a step or epoch boundary. So convert `W` to an
89
+ integer number of iterations and round *down*: at ~2 s/iteration, `805 s → 402 iters → checkpoint every
90
+ 400 iters`. Rounding down checkpoints slightly more often than optimal, which is the safe direction.
91
+
92
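+ **Distributed multiplier.** With `N` workers, one preemption wastes `N×` the compute (the whole group rolls
93
+ back), and — assuming roughly independent per-node preemptions — the group's effective `mu` drops to ≈ `mu / N`. Plug that into the formula: `W` shrinks by ≈ `sqrt(N)`, so distributed jobs should checkpoint *more* frequently than the single-GPU `W` suggests.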
94
+
95
+ URLs: Young/Daly [robustness paper, INRIA](https://people.bordeaux.inria.fr/gaupy/ressources/pub/confs/icpp20_robustness.pdf),
96
+ [Optimal Checkpointing Period, LAWN 281](https://www.netlib.org/lapack/lawnspdf/lawn281.pdf),
97
+ [Optimal Checkpointing for Iterative Applications, IEEE](https://ieeexplore.ieee.org/document/9495174/).
98
+
99
+ ---
100
+
101
+ ## 3. The atomic-write resume recipe
102
+
103
+ Two failure modes turn "I have checkpoints" into "my resume is broken": a **partial weight save** and a
104
+ **corrupt-on-kill checkpoint**. The recipe fixes both.
105
+
106
+ **Save FULL training state, not just model weights.** A resume that restores only weights silently
107
+ restarts the epoch, reshuffles data, and degrades accuracy. The checkpoint must include:
108
+
109
+ - model `state_dict`
110
+ - optimizer `state_dict`
111
+ - LR-scheduler `state_dict`
112
+ - epoch **and** global step/iteration counter
113
+ - RNG state (Python `random`, NumPy, `torch`, and CUDA)
114
+ - dataloader position (sampler epoch / resumable-sampler offset)
115
+
116
+ **Write atomically: tmp → fsync → os.replace.** A preemption mid-write corrupts the file, and a naive
117
+ overwrite can leave zero good checkpoints. `os.replace` maps to the atomic POSIX `rename(2)` on the same
118
+ filesystem (and, unlike `os.rename`, overwrites atomically on Windows too), so:
119
+
120
+ 1. Write the whole state to `latest.pt.tmp`.
121
+ 2. `fsync` the file so its bytes hit disk before the rename (and `fsync` the parent directory after step 3, so the rename itself survives a crash).
122
+ 3. `os.replace("latest.pt.tmp", "latest.pt")` — the swap is all-or-nothing.
123
+ 4. Keep the previous `latest.pt` until the new one is committed; a kill at any point leaves one intact file.
124
+
125
+ **Checkpoint to the platform's DURABLE location, not local scratch** (principle #4). A managed replacement
126
+ node is *fresh* — anything not on a cloud bucket / network volume / shared FS is gone. On a marketplace box
127
+ where local disk persists across a pause, still mirror to durable storage at intervals.
128
+
129
+ **Load-latest UNCONDITIONALLY on startup.** Use the *same code path* for first launch (no checkpoint →
130
+ start fresh) and every restart-after-preemption (checkpoint exists → resume). This is what makes the job
131
+ idempotent: the **identical launch command** run any number of times converges to the same end state, which
132
+ is exactly what makes principle #7's "retry the identical config" actually resume progress instead of
133
+ restarting from zero.
134
+
135
+ URLs: [Check-N-Run, arXiv](https://arxiv.org/pdf/2010.08679),
136
+ [SkyPilot training-guide](https://docs.skypilot.co/en/latest/reference/training-guide.html),
137
+ [SageMaker resume-from-checkpoint](https://docs.aws.amazon.com/sagemaker/latest/dg/model-checkpoints-resume.html).
138
+
139
+ ---
140
+
141
+ ## 4. Managed-spot frameworks move the box; the checkpoint-load restores the state
142
+
143
+ Managed frameworks **auto-provision a replacement** on preemption — but they restart the **process from
144
+ scratch**. The framework moves the box; the checkpoint-load written in §3/§5 is what restores progress.
145
+ This is the single most-misunderstood point: the framework does **not** resume training on its own.
146
+
147
+ - **SkyPilot Managed Jobs** — strongest cross-cloud recommendation (re-provisions in a different
148
+ region/cloud to chase capacity, then re-runs the task). Caveat: it auto-recovers **only**
149
+ preemption/hardware failures — a user-code non-zero exit is **not** auto-recovered.
150
+ [managed-jobs](https://docs.skypilot.co/en/latest/examples/managed-jobs.html).
151
+ - **AWS SageMaker Managed Spot** — set `use_spot_instances=True` + `checkpoint_s3_uri`; SageMaker syncs the
152
+ checkpoint dir to S3 during training and copies it back on restart (up to ~90 % savings). Gotcha:
153
+ **`max_wait` must be ≥ `max_run`** (the API rejects anything smaller; in practice leave generous headroom) — `max_wait` covers wait-for-capacity *plus* run time
154
+ *plus* interruption gaps; set it too tight and the job is killed mid-resume.
155
+ [managed-spot docs](https://docs.aws.amazon.com/sagemaker/latest/dg/model-managed-spot-training.html).
156
+
157
+ Universal multi-cloud auto-failover is **out of scope for this skill** — use SkyPilot/dstack for that, then
158
+ return here to make the *code* resume-correct so their recovery actually lands on progress
159
+ (`superpowers:verification-before-completion` gates the "it resumed" claim against a loaded checkpoint, not
160
+ a log line). For the elastic / multi-node tier (torchrun `--max-restarts`, Elastic Horovod) see
161
+ `references/multinode.md`; the same invariant holds — the framework restarts processes, the per-epoch
162
+ snapshot restores state.
163
+
164
+ ---
165
+
166
+ ## 5. Python checkpoint/resume skeleton
167
+
168
+ Read this for the algorithm; adapt into the training script. The shape is platform-agnostic — only
169
+ `DURABLE_DIR` changes per profile (§8 SCRIPT OVERRIDES).
170
+
171
+ ```python
172
+ import os, random, signal, time
173
+ import numpy as np
174
+ import torch
175
+
176
+ DURABLE_DIR = os.environ["DURABLE_DIR"] # profile-supplied bucket/FS/volume mount, NOT local scratch
177
+ CKPT = os.path.join(DURABLE_DIR, "latest.pt")
178
+ CKPT_EVERY_ITERS = 400 # = round_down(Young/Daly W / sec_per_iter); see section 2
179
+
180
+ def save_full_state(model, opt, sched, epoch, step):
181
+ """Atomic write: tmp -> fsync -> os.replace. A kill at any point leaves one intact file."""
182
+ state = {
183
+ "model": model.state_dict(),
184
+ "opt": opt.state_dict(),
185
+ "sched": sched.state_dict(),
186
+ "epoch": epoch, "step": step, # resume the exact position, not the epoch start
187
+ "rng_python": random.getstate(),
188
+ "rng_numpy": np.random.get_state(),
189
+ "rng_torch": torch.get_rng_state(),
190
+ "rng_cuda": torch.cuda.get_rng_state_all(),
191
+ }
192
+ tmp = CKPT + ".tmp"
193
+ with open(tmp, "wb") as f:
194
+ torch.save(state, f)
195
+ f.flush()
196
+ os.fsync(f.fileno()) # bytes hit disk BEFORE the rename
197
+ os.replace(tmp, CKPT) # POSIX-atomic swap; prev file valid until this returns
198
+
199
+ def load_latest_if_any(model, opt, sched):
200
+ """Unconditional load-latest: identical command resumes OR starts fresh. Returns (epoch, step)."""
201
+ if not os.path.exists(CKPT):
202
+ return 0, 0 # first run, no checkpoint -> start from scratch
203
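+ s = torch.load(CKPT, map_location="cpu", weights_only=False)  # own trusted file; it holds NumPy/Python RNG objects that the weights_only=True default (PyTorch ≥ 2.6) refuses to unpickle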
204
+ model.load_state_dict(s["model"])
205
+ opt.load_state_dict(s["opt"])
206
+ sched.load_state_dict(s["sched"])
207
+ random.setstate(s["rng_python"])
208
+ np.random.set_state(s["rng_numpy"])
209
+ torch.set_rng_state(s["rng_torch"])
210
+ torch.cuda.set_rng_state_all(s["rng_cuda"])
211
+ return s["epoch"], s["step"] # caller skips dataloader to this position
212
+
213
+ # --- opportunistic last-flush only; NOT the safety net (section 1) ---
214
+ _preempted = {"flag": False}
215
+ def _on_sigterm(signum, frame):
216
+ _preempted["flag"] = True # set a flag; flush at the next safe boundary, do not block here
217
+ signal.signal(signal.SIGTERM, _on_sigterm)
218
+
219
+ def train(model, opt, sched, dataloader, total_epochs):
220
+ start_epoch, start_step = load_latest_if_any(model, opt, sched)
221
+ step = start_step
222
+ for epoch in range(start_epoch, total_epochs):
223
+ for batch in dataloader: # a resumable sampler should fast-forward to start_step
224
+ # ... forward / backward / opt.step() / sched.step() ...
225
+ step += 1
226
+ if step % CKPT_EVERY_ITERS == 0 or _preempted["flag"]:
227
+ save_full_state(model, opt, sched, epoch, step)
228
+ if _preempted["flag"]:
229
+ return # grace window may be ~0s; exit cleanly after the flush
230
+ ```
231
+
232
+ Verify the resume path before trusting it: kill the process mid-epoch, relaunch the *identical* command,
233
+ and confirm step/epoch/loss continue rather than reset (this is the `verifying-dl-experiments`
234
+ reproducibility check, applied to preemption). Trust the **loaded** checkpoint, not the "resumed" log line
235
+ (principle #3).
@@ -0,0 +1,270 @@
1
+ # SSH Transport — keys, keepalive, resumable copy, secrets-via-stdin
2
+
3
+ Platform-agnostic SSH + file-transfer substrate for every `ssh-rental` profile (AutoDL, RunPod,
4
+ vast.ai, Lambda, Paperspace, China, bare SSH). One-time config so subsequent commands are short and
5
+ password-less, plus the copy/secret patterns that survive flaky networks and short rentals. Concrete
6
+ hosts, ports, and credential locations are **profile facts** — this file owns the *mechanism*, the
7
+ profile (`profiles/<platform>.md` §1/§3/§8) owns the *values*.
8
+
9
+ To jump: `grep -in '<keyword>' references/ssh_transport.md` (e.g. `keepalive`, `rsync`, `stdin`, `crlf`).
10
+
11
+ ## Table of contents
12
+
13
+ 1. Key generation
14
+ 2. Push the public key to an instance
15
+ 3. `~/.ssh/config` alias + keepalive tuning
16
+ 4. Verify the alias
17
+ 5. Resumable copy — rsync vs scp, and WHY rsync
18
+ 6. Bulk per-dir download loop
19
+ 7. Move secrets via stdin — never inline a key, never on a durable FS
20
+ 8. CRLF — `.sh` authored on Windows breaks on Linux
21
+ 9. Two SSH flavors — proxied/basic SSH cannot `scp`
22
+ 10. Transport gotchas (Symptom → Root cause → Fix)
23
+
24
+ ---
25
+
26
+ ## 1. Key generation
27
+
28
+ Skip if `~/.ssh/id_ed25519` already exists.
29
+
30
+ ```bash
31
+ ssh-keygen -t ed25519 -C "<label>"
32
+ # Save path: Enter for the default ~/.ssh/id_ed25519
33
+ # Passphrase: optional (Enter for none, or set one + use ssh-agent)
34
+ ```
35
+
36
+ `ed25519` is shorter and more secure than RSA; every rental platform accepts both. One local key is
37
+ reused across all instances — generate once, push the **public** half (§2) to each box. The private
38
+ half (`~/.ssh/id_ed25519`, no `.pub`) never leaves the local machine and **never** goes onto a rental,
39
+ a shared FS, or a cloud agent (a cloud scheduler runs in an isolated sandbox with no access to it — and
40
+ putting a private key there is a secret leak; see `references/monitoring_patterns.md`).
41
+
42
+ ## 2. Push the public key to an instance
43
+
44
+ Copy the connection string from the platform's web console / API; it has the shape
45
+ `ssh -p <PORT> root@connect.<region>.<provider>.com`. Push the public key once:
46
+
47
+ ```bash
48
+ ssh-copy-id -p <PORT> root@connect.<region>.<provider>.com
49
+ # enter the platform-provided password ONCE
50
+ ```
51
+
52
+ If `ssh-copy-id` is absent (common on Windows-native shells), append the key manually:
53
+
54
+ ```bash
55
+ cat ~/.ssh/id_ed25519.pub # copy the entire line
56
+ ssh -p <PORT> root@connect.<region>.<provider>.com
57
+ # on the remote:
58
+ mkdir -p ~/.ssh && chmod 700 ~/.ssh
59
+ echo "<paste the public key line>" >> ~/.ssh/authorized_keys
60
+ chmod 600 ~/.ssh/authorized_keys
61
+ exit
62
+ ```
63
+
64
+ Test: re-running the `ssh …` line should connect **without** a password prompt.
65
+
66
+ ## 3. `~/.ssh/config` alias + keepalive tuning
67
+
68
+ One block per instance turns `ssh -p <PORT> root@connect.<region>.<provider>.com` into `ssh <alias>`,
69
+ and folds in the keepalive options that keep long monitoring/transfer connections from dropping.
70
+
71
+ ```ssh-config
72
+ Host proj-1
73
+ HostName connect.<region>.<provider>.com
74
+ Port <PORT>
75
+ User root
76
+ IdentityFile ~/.ssh/id_ed25519
77
+ ServerAliveInterval 60
78
+ ServerAliveCountMax 120
79
+ TCPKeepAlive yes
80
+ # LogLevel VERBOSE # uncomment to debug a refused/hung connection
81
+
82
+ Host proj-2
83
+ HostName connect.<region>.<provider>.com
84
+ Port <PORT>
85
+ User root
86
+ IdentityFile ~/.ssh/id_ed25519
87
+ ServerAliveInterval 60
88
+ ServerAliveCountMax 120
89
+ ```
90
+
91
+ **Naming**: `<project>-<index>` (e.g. `proj-1`, `proj-2`) reads cleanly in a fan-out loop; avoid bare
92
+ `gpu1`. **Why the three keepalive options**:
93
+
94
+ - `ServerAliveInterval 60` — send an application-layer heartbeat every 60 s, so a NAT/idle timeout on
95
+ the path does not silently drop a parked connection (mid-`scp`, or an open monitor).
96
+ - `ServerAliveCountMax 120` — tolerate up to 120 missed heartbeats before declaring the link dead (≈2 h
97
+ of network instability survived). Lower it (e.g. 3) for a *bounded* monitor that should self-kill on a
98
+ blip rather than hang — see the short-connection poll in `references/monitoring_patterns.md`.
99
+ - `TCPKeepAlive yes` — let the OS also emit TCP-layer keepalives, catching a peer that vanishes
100
+ ungracefully.
101
+
102
+ Ports change when a profile re-issues an instance (`ssh-rental` boxes assign a new port on
103
+ re-creation) — update the `Port` line after each create/recreate, then re-run §4.
104
+
105
+ ## 4. Verify the alias
106
+
107
+ ```bash
108
+ for a in proj-1 proj-2 proj-3 proj-4; do
109
+ echo "=== $a ==="
110
+ ssh -o ConnectTimeout=10 "$a" "hostname; date"
111
+ done
112
+ ```
113
+
114
+ Each should print a distinct hostname. Then the env probe (SKILL.md Phase 1):
115
+ `ssh <alias> 'python -c "import torch;print(torch.cuda.is_available())"'`.
116
+
117
+ ## 5. Resumable copy — rsync vs scp, and WHY rsync
118
+
119
+ `scp` opens **one** SSH stream for the whole transfer and **cannot resume**: any blip mid-copy aborts
120
+ the entire run and a re-run starts from zero. `rsync` compares source/dest and ships only the delta, so
121
+ a re-run after a drop **continues** instead of restarting — the single most important property on a
122
+ metered box where a 130 GB pull can blip at minute 45.
123
+
124
+ **Prefer `rsync` for anything large or multi-file:**
125
+
126
+ ```bash
127
+ rsync -avz --partial --inplace --progress \
128
+ -e ssh \
129
+ <alias>:/root/autodl-tmp/checkpoints/ /path/to/local/checkpoints/
130
+ ```
131
+
132
+ - `-a` archive (recurse + preserve perms/times/symlinks), `-v` verbose, `-z` compress on the wire.
133
+ - `--partial` keeps a partially-transferred file on interruption so the next run resumes mid-file
134
+ (without it, rsync deletes the partial and re-sends from the start).
135
+ - `--inplace` writes directly into the destination file (resume-friendly; avoids a full temp copy on a
136
+ tight local disk). Drop it if atomic-replace of an existing dest matters more than resumability.
137
+ - Re-run the **identical** command after any failure — that *is* the resume (principle #7).
138
+
139
+ Use plain `scp` only for a **single small** file (a config, one checkpoint < ~1 GB) where resume is
140
+ moot. For a large *tree*, even `scp` users should fall back to the **per-dir loop** (§6) so one dir's
141
+ failure doesn't lose the rest. If `rsync` is missing on the remote image, `apt-get install rsync` (when
142
+ online) or use the §6 loop.
143
+
144
+ > The bulk-download stall-retry ladder (HF/ModelScope mirror swaps, `timeout … && break` loops) is a
145
+ > *download-from-the-internet* concern, not host↔host copy — that lives in `references/china-network.md`.
146
+
147
+ ## 6. Bulk per-dir download loop
148
+
149
+ For a large directory tree (many run/checkpoint dirs), wrap each dir in its **own** SSH session so a
150
+ single drop loses only that dir, and a re-run **skips already-complete dirs**:
151
+
152
+ → `scripts/download_loop.sh` (parameterize `LOCAL_TARGET`, `REMOTE_ALIAS`, `REMOTE_PATH`).
153
+
154
+ Its shape, and why each piece matters:
155
+
156
+ - **List once, copy per-dir** — each `scp -r <alias>:<remote>/$d ./` is an independent session; one
157
+ failure ≠ whole-transfer loss (the `scp` single-stream trap, §5).
158
+ - **Size-threshold skip** — a dir already ≥ threshold counts as complete and is skipped; a partial dir
159
+ is removed and re-pulled. Re-running the whole script is therefore idempotent and resumable.
160
+ - **Per-dir `ConnectTimeout` + the §3 keepalive flags** on every `scp` so a hung session self-kills
161
+ instead of blocking the loop.
162
+
163
+ ## 7. Move secrets via stdin — never inline a key, never on a durable FS
164
+
165
+ Putting a credential **in a command** (`ssh host "echo 'KEY' > …"`, or `scp key.txt host:…`) leaks the
166
+ value into shell history, agent transcripts, and hook logs. Putting it on a **shared /
167
+ durable FS** is worse: the value persists for every co-tenant, and some platforms' upload classifiers
168
+ *block or corrupt* a file matching a known key pattern — so a credential written to the cross-instance
169
+ FS may silently never arrive. **Push credentials to each box's per-instance system disk, via stdin**, so
170
+ the value flows file → pipe → file and appears in no command text or output:
171
+
172
+ ```bash
173
+ # stream exactly one credential block — value never appears on a command line
174
+ grep -A 2 "machine api.<provider>.com" ~/.netrc \
175
+ | ssh <alias> 'umask 077; cat > /root/.netrc && chmod 600 /root/.netrc'
176
+ ```
177
+
178
+ ```bash
179
+ # or a single token, same principle (stdin in, file out, chmod 600)
180
+ printf '%s\n' "$TOKEN_FROM_ENV" \
181
+ | ssh <alias> 'umask 077; cat > /root/.<service>_key && chmod 600 /root/.<service>_key'
182
+ ```
183
+
184
+ Rules that make this safe:
185
+
186
+ - **One block, not the whole file.** Stream a single `machine …` stanza, never the entire `~/.netrc` —
187
+ it carries unrelated machines' credentials, and security hooks (rightly) block copying the whole file.
188
+ - **Reference, never echo.** Source the token from an env var (`$TOKEN_FROM_ENV`) or a keyring; never
189
+ paste the literal value into the command.
190
+ - **Per-instance system disk, not the shared FS.** Write to `/root/.<service>_key` (volatile but
191
+ private), not the cross-instance durable mount. The wrapper reads it and exports the env var before
192
+ launch (e.g. `export WANDB_API_KEY=$(cat /root/.wandb_key)`).
193
+ - **Verify by capability, not by echoing the value:**
194
+ `ssh <alias> 'python -c "import wandb; print(wandb.Api(timeout=20).default_entity)"'`.
195
+
196
+ ## 8. CRLF — `.sh` authored on Windows breaks on Linux
197
+
198
+ Symptom → Root cause → Fix:
199
+
200
+ - **Symptom**: a synced launcher does nothing (empty log); run by hand it errors `set: -: invalid
201
+ option`, `cd: /path\r: No such file or directory`, or `syntax error near unexpected token $'do\r'` —
202
+ every line "ends in `\r`".
203
+ - **Root cause**: Windows `core.autocrlf=true` (or `git archive` exporting with the working-tree EOL)
204
+ writes `.sh` with CRLF; Linux `bash` treats the trailing `\r` as part of each token. (`.py` is
205
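+ unaffected when run as `python file.py` — Python's universal newlines tolerate CRLF — though a directly executed `./file.py` still fails on its `#!…python\r` shebang; specifically `bash`/`.sh` breaks.)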
206
+ - **Fix**: add `.gitattributes` with `*.sh text eol=lf` so `git archive`/checkout always emits LF; as an
207
+ immediate on-box unblock, `sed -i 's/\r$//' scripts/*.sh`.
208
+
209
+ Every shell script in `scripts/` ships LF and starts `#!/usr/bin/env bash` + `set -u`; keep that
210
+ contract when authoring new ones. **Never** put an unquoted `|` inside a `grep` regex in a transport or
211
+ poll script — the shell splits it into piped commands and the first reads stdin → hangs forever
212
+ (`references/monitoring_patterns.md`).
213
+
214
+ ## 9. Two SSH flavors — proxied/basic SSH cannot `scp`
215
+
216
+ Some `ssh-rental` platforms expose **two** SSH endpoints, and the difference dictates whether file
217
+ transfer works at all:
218
+
219
+ - **Direct TCP SSH** — a real TCP port to the container (the `connect.<region>.<provider>.com:<PORT>`
220
+ shape above). Full `scp`/`rsync`/`sftp` work. This is what every transfer in this file assumes.
221
+ - **Proxied / "basic" SSH** — a relayed or web-terminal SSH (common on RunPod and vast.ai for the
222
+ default exposed endpoint). It carries an **interactive shell only**: `scp`/`rsync`/`sftp` fail (often
223
+ with `subsystem request failed` / a hung handshake) because the proxy doesn't forward the SFTP
224
+ subsystem.
225
+
226
+ **Fix**: for any code/data/checkpoint transfer, use the **direct-TCP** endpoint — on RunPod expose a
227
+ TCP port (the `ssh root@<ip> -p <PORT>` form, not the proxied `ssh <pod>@ssh.runpod.io` one); on vast.ai
228
+ use the instance's direct SSH port. Each profile's §3 NETWORK names which endpoint is which and whether
229
+ ports change on restart. If only proxied SSH is available, transfer out-of-band instead (push results to
230
+ object storage / HF Hub from on-box and pull from there).
231
+
232
+ ## 10. Transport gotchas (Symptom → Root cause → Fix)
233
+
234
+ Universal gotchas (disk-full, inode, OOM, silent sync) are **not** repeated here — see
235
+ `references/gotchas_universal.md`. These are transport-specific.
236
+
237
+ **T1 — SSH exits 255 / "Connection reset" right after a `pkill`/`kill`.**
238
+ Symptom: `ssh <alias> 'pkill -9 -f src.train'` returns `Connection reset by peer`, exit 255. → Root
239
+ cause: killing the process tree disrupts the PTY chain; the SSH client receives EOF and exits — and
240
+ anything *after* the kill in that same one-liner never runs. → Fix: this is **normal**, not a failure.
241
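+ Re-ssh to verify (`ssh <alias> "pgrep -af '[s]rc.train' || echo CLEAN"` — the `[s]` bracket stops `pgrep -f` from matching the ssh-spawned shell's own command line, and dropping `| head -1` lets the `||` see `pgrep`'s exit status). Split kill and relaunch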
242
+ into **two** ssh calls — never `pkill X; relaunch X` in one command, the relaunch is dropped with the
243
+ session.
244
+
245
+ **T2 — large `scp -r` drops with "Read from remote host … reset by peer" 30–60 min in.**
246
+ Symptom: a 130 GB `scp -r` aborts mid-transfer; the local tree has only the first few dirs, the rest
247
+ gone. → Root cause: one SSH stream for the whole transfer; any blip kills it and `scp` does not resume.
248
+ → Fix: use `rsync --partial` (§5) or the per-dir loop (§6) — each dir an independent session, re-run
249
+ skips completed dirs.
250
+
251
+ **T3 — `.sh` "ends in `\r`" after a Windows→Linux sync.**
252
+ See §8 (`.gitattributes` `*.sh text eol=lf`; on-box `sed -i 's/\r$//'`).
253
+
254
+ **T4 — a credential leaks into history / a shared FS, or its FS upload silently fails.**
255
+ Symptom: a key pasted into an `ssh`/`scp` command lands in transcripts and hook logs; an scp of the key
256
+ to the shared FS "succeeds" but the file is missing or corrupt. → Root cause: the value appeared in a
257
+ command line; and some platforms' FS classifiers block/corrupt credential-shaped uploads. → Fix: §7 —
258
+ stream one block via stdin to the per-instance disk, verify by capability not by echo.
259
+
260
+ **T5 — `scp dest open "/root/x/": Failure` instantly.**
261
+ Symptom: a (often parallel/background) `scp big.tar <alias>:/root/x/` fails at once because the
262
+ destination dir doesn't exist — a sibling command meant to `mkdir` it ran later, or was blocked. → Root
263
+ cause: the transfer assumed a directory a *different* command was supposed to create (a parallel-setup
264
+ race). → Fix: make every transfer self-sufficient — create the dest in the same command:
265
+ `ssh <alias> 'mkdir -p /root/x' && scp … || retry`. Never assume a sibling created the destination.
266
+
267
+ **T6 — `Host key verification failed` after an instance is recreated.**
268
+ Symptom: same `connect.<region>.<provider>.com` host, new host key, so SSH refuses. → Root cause: the
269
+ recreated container presents a different host key on the reused hostname/port. → Fix:
270
+ `ssh-keygen -R '[connect.<region>.<provider>.com]:<PORT>'`, then reconnect (re-accepts the new key).