opencode-skills-collection 3.1.2 → 3.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +4 -1
  2. package/bundled-skills/agent-creator/SKILL.md +246 -0
  3. package/bundled-skills/ax-extract-workflow/SKILL.md +156 -0
  4. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  5. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  6. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  7. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  8. package/bundled-skills/docs/sources/sources.md +1 -1
  9. package/bundled-skills/docs/users/bundles.md +1 -1
  10. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  11. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  12. package/bundled-skills/docs/users/getting-started.md +1 -1
  13. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  14. package/bundled-skills/docs/users/usage.md +4 -4
  15. package/bundled-skills/docs/users/visual-guide.md +4 -4
  16. package/bundled-skills/lovable-cleanup/SKILL.md +2 -1
  17. package/bundled-skills/remote-gpu-trainer/.gitattributes +8 -0
  18. package/bundled-skills/remote-gpu-trainer/LICENSE +21 -0
  19. package/bundled-skills/remote-gpu-trainer/README.md +267 -0
  20. package/bundled-skills/remote-gpu-trainer/SKILL.md +249 -0
  21. package/bundled-skills/remote-gpu-trainer/evals/README.md +57 -0
  22. package/bundled-skills/remote-gpu-trainer/evals/RESULTS.md +44 -0
  23. package/bundled-skills/remote-gpu-trainer/evals/cases.jsonl +14 -0
  24. package/bundled-skills/remote-gpu-trainer/evals/run_evals.py +68 -0
  25. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/README.md +72 -0
  26. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/queue_1.txt +6 -0
  27. package/bundled-skills/remote-gpu-trainer/profiles/_schema.md +100 -0
  28. package/bundled-skills/remote-gpu-trainer/profiles/autodl.md +327 -0
  29. package/bundled-skills/remote-gpu-trainer/profiles/china.md +397 -0
  30. package/bundled-skills/remote-gpu-trainer/profiles/generic-ssh.md +450 -0
  31. package/bundled-skills/remote-gpu-trainer/profiles/lambda.md +342 -0
  32. package/bundled-skills/remote-gpu-trainer/profiles/paperspace.md +365 -0
  33. package/bundled-skills/remote-gpu-trainer/profiles/runpod.md +164 -0
  34. package/bundled-skills/remote-gpu-trainer/profiles/vastai.md +355 -0
  35. package/bundled-skills/remote-gpu-trainer/references/china-network.md +206 -0
  36. package/bundled-skills/remote-gpu-trainer/references/gotchas_universal.md +704 -0
  37. package/bundled-skills/remote-gpu-trainer/references/lifecycle_checklist.md +148 -0
  38. package/bundled-skills/remote-gpu-trainer/references/monitoring_patterns.md +327 -0
  39. package/bundled-skills/remote-gpu-trainer/references/multinode.md +190 -0
  40. package/bundled-skills/remote-gpu-trainer/references/parallel_ablation.md +196 -0
  41. package/bundled-skills/remote-gpu-trainer/references/principles.md +179 -0
  42. package/bundled-skills/remote-gpu-trainer/references/self-improvement.md +74 -0
  43. package/bundled-skills/remote-gpu-trainer/references/spot-resilience.md +235 -0
  44. package/bundled-skills/remote-gpu-trainer/references/ssh_transport.md +270 -0
  45. package/bundled-skills/remote-gpu-trainer/references/training/by-domain.md +230 -0
  46. package/bundled-skills/remote-gpu-trainer/references/training/checkpoint-resume.md +368 -0
  47. package/bundled-skills/remote-gpu-trainer/references/training/convergence-debugging.md +187 -0
  48. package/bundled-skills/remote-gpu-trainer/references/training/data-pipeline.md +119 -0
  49. package/bundled-skills/remote-gpu-trainer/references/training/distributed-launch.md +422 -0
  50. package/bundled-skills/remote-gpu-trainer/references/training/oom-memory.md +338 -0
  51. package/bundled-skills/remote-gpu-trainer/references/training/precision-stability.md +401 -0
  52. package/bundled-skills/remote-gpu-trainer/references/training/throughput-profiling.md +451 -0
  53. package/bundled-skills/remote-gpu-trainer/scripts/aggregate_to_fs.sh +55 -0
  54. package/bundled-skills/remote-gpu-trainer/scripts/check_staleness.py +70 -0
  55. package/bundled-skills/remote-gpu-trainer/scripts/download_loop.sh +67 -0
  56. package/bundled-skills/remote-gpu-trainer/scripts/gpu_health.sh +169 -0
  57. package/bundled-skills/remote-gpu-trainer/scripts/health_patrol.sh.template +67 -0
  58. package/bundled-skills/remote-gpu-trainer/scripts/mem_monitor.sh +67 -0
  59. package/bundled-skills/remote-gpu-trainer/scripts/reap_vram_zombies.sh +175 -0
  60. package/bundled-skills/remote-gpu-trainer/scripts/run_one.sh.template +104 -0
  61. package/bundled-skills/remote-gpu-trainer/scripts/run_queue.sh.template +83 -0
  62. package/bundled-skills/remote-gpu-trainer/scripts/setup-china-mirrors.sh +35 -0
  63. package/bundled-skills/remote-gpu-trainer/scripts/verify_local.py +145 -0
  64. package/package.json +1 -1
  65. package/skills_index.json +66 -0
@@ -0,0 +1,355 @@
1
+ ---
2
+ platform: vastai
3
+ kind: ssh-rental
4
+ meter_stop_verb: destroy # the action that STOPS billing; stop keeps billing disk forever (compute off, storage on)
5
+ meter_stop_irreversible: true # destroy permanently deletes container disk
6
+ detach_primitive: tmux # auto-attached on login; dies on container restart → onstart.sh is the durable hook
7
+ spot_available: true # interruptible (bid) auction — central to the platform
8
+ spot_grace: ~0s # preemption is an abrupt pause, no documented notice / no SIGTERM
9
+ shared_fs: false # NO platform-wide FS; Volumes are machine-locked (per-GPU bound on restart)
10
+ inode_cap: host-dependent # undocumented; whatever the host's Docker storage driver gives
11
+ free_egress: host-dependent # CORRECTED: host-set bandwidth price; billed per byte in AND out, often $0 but not guaranteed
12
+ china_mirror_needed: false # no China DCs and no platform proxy; fix HF at workload level
13
+ host_driver_cuda_max: image-dependent # CUDA ships in the chosen Docker image; must be ≤ host driver
14
+ local_nvme: host-dependent
15
+ ---
16
+
17
+ # vast.ai — platform profile
18
+
19
+ One-line purpose: rent a marketplace GPU as a **Docker image on a third-party host**, run a spot-resumable
20
+ job, and **copy results off before `destroy`** — the only verb that stops the full meter.
21
+
22
+ > **Surface to the user up front (principle #10):** ⚠️ Danger clocks — a **`stop`ped instance bills its disk FOREVER** (only `destroy` stops the full meter, and `destroy` deletes everything); **bandwidth/egress bills continuously**, host-priced. Risk — rent only **verified, high-reliability** hosts with a direct port (an unverified host can vanish mid-run); cloud-sync works even while stopped (§2), the cleanest durable target.
23
+
24
+ **Table of contents** (`grep -in '^## ' profiles/vastai.md` to jump):
25
+ - §1 LAUNCH — offer-driven, Docker-image-is-the-env
26
+ - §2 STORAGE MODEL — per-machine-local disk; survival matrix; cloud-sync escape hatch
27
+ - §3 NETWORK — proxy vs direct SSH; random ports; host-set bandwidth; no China proxy
28
+ - §4 SPOT / INTERRUPTION + RESUME — bid auction, ~0 s pause, GPU-bound resume, status-poll loop
29
+ - §5 TEARDOWN / BILLING — `destroy` is the meter-stop; `stop` bills disk forever; bandwidth bills always
30
+ - §6 DAEMON TOOL — tmux dies on restart; `onstart.sh` is the durable relaunch
31
+ - §7 TOP GOTCHAS — VAST1–VAST13, platform-pinned + Platform-specific debugging
32
+ - §8 SCRIPT OVERRIDES — values to parameterize `scripts/`
33
+
34
+ Universal gotchas are NOT restated here — see `references/gotchas_universal.md`. Spot cadence math and
35
+ atomic-resume live in `references/spot-resilience.md`.
36
+
37
+ **The one fact that reshapes everything:** vast.ai is a **decentralized marketplace of third-party hosts**,
38
+ not a uniform first-party cloud. Consequences that diverge from AutoDL: **no platform-wide shared FS**, **no
39
+ China-mirror proxy**, **no single prebuilt conda env** (the Docker image IS the env), **storage is locked to
40
+ one physical host and even one GPU ID**, **bandwidth is host-priced (not free by fiat)**, and
41
+ **interruptible (bid) preemption is a real, central, abrupt model**.
42
+
43
+ ---
44
+
45
+ ## 1. LAUNCH
46
+
47
+ **Entry points** (all equivalent): web console (`cloud.vast.ai`), the `vastai` CLI / Python SDK, the REST
48
+ API (`https://console.vast.ai/api/v1/...`, Bearer token), and SSH into the running container. The CLI is the
49
+ orchestration surface: `pip install vastai`, then `vastai set api-key $VAST_API_KEY` (env-var name only —
50
+ never inline the key).
51
+
52
+ **Env contract — the Docker image IS the env.** A bare VM is not offered by default; the create call MUST
53
+ specify `--image` (e.g. `pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime`). **CUDA version is whatever the
54
+ image ships** — a mismatch with the host driver is a real failure mode (VAST5). The image's default Python
55
+ env is the low-friction place to run — do not `conda create` on a rental (the remote-base exception holds).
56
+ Note: **Docker-in-Docker is not supported** "due to security constraints" (verified
57
+ docs.vast.ai/.../faq/instances 2026-06) — a containerized inner runtime is not an option here.
58
+
59
+ **Launch is offer-driven and two-step** (search a marketplace offer → create onto it):
60
+
61
+ ```bash
62
+ #!/usr/bin/env bash
63
+ set -u
64
+ # 1) find a verified, rentable offer with at least one direct port, cheapest $/dlperf first
65
+ vastai search offers 'gpu_name=RTX_4090 num_gpus=1 verified=true rentable=true direct_port_count>=1' -o 'dlperf_usd-'
66
+ # 2) create onto the chosen OFFER_ID; --direct enables direct-TCP SSH (see §3)
67
+ vastai create instance OFFER_ID --image pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime \
68
+ --disk 50 --ssh --direct --onstart-cmd 'nvidia-smi && bash /workspace/onstart.sh'
69
+ ```
70
+
71
+ `--onstart-cmd` (**max 16 KB**; for a longer script, gzip+base64-encode it) is written to `/root/onstart.sh`
72
+ and **re-runs on every container start** — this is the platform-native boot hook and the durable relaunch
73
+ path (§6) (verified docs.vast.ai/cli/commands 2026-06). Filter offers hard: an unverified, low-reliability
74
+ host can simply vanish (`Offline`) mid-run (VAST7). Boot is not instant: the host must **pull the Docker
75
+ image and boot — typically 1–5 min depending on image size** (verified docs.vast.ai CLI Hello World 2026-06);
76
+ a fat image stuck in `Loading` is the slow-download symptom (VAST13).
77
+
78
+ → **verify:** `vastai show instance <id>` (the instance ID returned by `create`, not the OFFER_ID) lists the new instance `running`, and an in-container
79
+ `nvidia-smi` (via `--onstart-cmd` or first SSH) shows the expected GPU with a CUDA that matches the image.
80
+
81
+ ---
82
+
83
+ ## 2. STORAGE MODEL *(survival matrix — principle #4)*
84
+
85
+ Three usable tiers (the table's fourth row only marks Network Volume as unavailable); the persistence + region story is the single biggest divergence from AutoDL — **there is no
86
+ region-wide shared FS** to sync to. (verified docs.vast.ai/.../storage/types 2026-06)
87
+
88
+ | Tier | Path | Speed | Survives STOP? | Survives DESTROY? | Cap |
89
+ |---|---|---|---|---|---|
90
+ | Container / instance disk (`--disk N`) | `/` + `/workspace` | local | **yes** (bills) | **NO — gone** | fixed at create, **non-resizable**, min **10 GB** (default) |
91
+ | Volume (local) | mounted path | local | yes | **yes, until volume deleted** (bills per-GB while it exists) | fixed; **machine-locked**, non-resizable |
92
+ | Cloud sync (S3 / GDrive / Backblaze / Dropbox) | off-box bucket | network | yes | **yes — fully off-box** | provider's; **works even while instance is stopped** |
93
+ | Network Volume (cross-machine) | — | — | — | — | **not in current storage docs — treat as unavailable** |
94
+
95
+ **Machine-lock — and per-GPU-lock — is the trap.** A Volume "is tied to the physical machine where created"
96
+ and "cannot migrate between different physical machines." Worse, a stopped instance is **bound to a specific
97
+ GPU ID**, not just the machine: "When an instance is created, it is bound to a specific GPU ID. If the
98
+ instance is stopped, it remains bound to the same GPU ID and waits for that GPU to become available again"
99
+ (verified vast-ai.crisp.help scheduling article 2026-06). So a machine can show **available for rent** (other
100
+ GPUs free) while the stopped instance is stuck in `Scheduling` waiting for *its* GPU (VAST3).
101
+
102
+ **Where checkpoints MUST go for the §5 verb:** there is **no durable mount that survives `destroy`** on the
103
+ container disk — so the durable target is **off-box**. Two real off-box paths: (a) `vastai copy` the result
104
+ to local / another instance / a Volume **before** `destroy`; (b) **Cloud sync** (`vastai cloud copy`) to
105
+ S3/GDrive/Backblaze/Dropbox — notably **works even while the instance is stopped** (verified
106
+ docs.vast.ai/.../data-movement 2026-06), which makes it the cleanest durable target for a spot job. Always
107
+ assume the instance is lost once its lifetime expires. Inode caps and FS type are **undocumented and
108
+ host-dependent** (whatever the host's Docker storage driver gives) — `df -i` per host, do not assume an
109
+ AutoDL-style platform constant.
110
+
111
+ → **verify:** before any teardown, `vastai copy <id>:/path/to/ckpt local:/path/to/local` exits 0 (or
112
+ `vastai cloud copy` completes) AND the local artifact loads (`scripts/verify_local.py`).
113
+
114
+ ---
115
+
116
+ ## 3. NETWORK
117
+
118
+ **Shared public IP + random external port.** Each instance sits behind its host's public IP, which is usually shared with other instances;
119
+ "each open internal port (such as 22 or 8080 etc) is mapped to a *random* external port" read from the
120
+ **"IP Port Info" pop-up** (button on the instance) or `vastai show instance` — format
121
+ `PUBLIC_IP:33526 -> 8081/tcp` (verified docs.vast.ai/.../connect/networking 2026-06). Ports change per
122
+ instance — discover them at runtime, never hard-code. **Hard cap 64 open ports per instance.**
123
+
124
+ **Two SSH flavors — and the scp size trap:**
125
+ - **Proxy SSH** (default, via Vast's proxy): "works on all machines, slower for data transfer." It carries
126
+ `scp` but is throttled — vast's own guidance is **scp over proxy only for transfers under ~1 GB**; above
127
+ that "using the direct ssh connection is recommended" (verified docs.vast.ai/.../data-movement 2026-06).
128
+ - **Direct SSH** (direct-TCP to the host): "requires machines with open ports, faster and more reliable, the
129
+ preferred method." This is the one that carries large `scp`/`rsync`/`vastai copy` without stalling. It
130
+ **requires the offer to expose open ports** → filter `direct_port_count>=1` and create with `--direct`.
131
+
132
+ **Rule:** if bulk transfer must work, require **direct-TCP** at create time. `vastai copy` "uses rsync and is
133
+ generally fast and efficient, subject to single-link upload/download constraints" — for a multi-GB result,
134
+ direct + a resumable loop (`references/gotchas_universal.md` U12). For a big *inbound* dataset, prefer
135
+ `wget`/`curl` from a cloud bucket over proxied SSH (much higher throughput). Custom services use Docker `-p`
136
+ (e.g. `-p 8081:8081`); Jupyter defaults to internal 8080 gated by `JUPYTER_TOKEN` (override the port via
137
+ `JUPYTER_PORT`).
138
+
139
+ **Bandwidth is metered and host-priced — NOT free by fiat (corrected).** "You are charged bandwidth prices
140
+ for every byte sent or received to or from the instance, regardless of what state it is in," and "pricing is
141
+ set by the host and is specific to each offer" (verified docs.vast.ai/.../reference/billing +
142
+ .../instances/pricing 2026-06). In practice many hosts price egress at ~$0 (vast is generally a low/zero
143
+ egress option), but a given offer **can** charge per-GB in *both* directions — read the per-offer bandwidth
144
+ rate (hover the price on the instance card / search page) before a transfer-heavy job. This is why the
145
+ frontmatter is `free_egress: host-dependent`, not `true`.
146
+
147
+ **China relevance: none at the platform level.** No China datacenters, no `/etc/network_turbo` equivalent, no
148
+ built-in HF mirror. The HF-unreachable problem still exists at the *workload* level from some hosts, but the
149
+ fix is the job's **own** `HF_ENDPOINT=https://hf-mirror.com` / `hf_transfer`, not a platform script — see
150
+ `references/gotchas_universal.md` (HF download) for the resumable-download ladder.
151
+
152
+ → **verify:** `ssh <alias> 'echo ok'` over the **direct** endpoint, then a 1-file `vastai copy` round-trip
153
+ exits 0.
154
+
155
+ ---
156
+
157
+ ## 4. SPOT / INTERRUPTION + RESUME *(principle #7/#8)*
158
+
159
+ vast.ai's **interruptible** rentals are a **live continuous-bid auction** — the cheap-GPU core of the
160
+ platform ("can reduce costs by fifty percent or even more"), far more first-class than anything on AutoDL.
161
+ (verified vast.ai/article/Rental-Types 2026-06)
162
+
163
+ - **Bidding:** clients set a bid price; "the current highest bid is the instance that runs, the others are
164
+ paused." **On-demand always beats interruptible** regardless of bid amount ("on-demand instances will
165
+ always take precedence").
166
+ - **The bid is fixed at create.** "The bidding method cannot be changed after an instance is rented"
167
+ (verified Rental-Types 2026-06) — so the resume lever is **not** "raise this instance's bid." To recover an
168
+ out-priced run, either wait for the higher bid to finish, or **re-launch the identical job on a fresh
169
+ offer** (cheaper/on-demand) — which is why off-box checkpoints (§2) matter.
170
+ - **Preemption = pause, not destroy.** A preempted instance is paused (disk survives) until its bid regains
171
+ top priority or the higher bid finishes. Because storage is machine-/GPU-locked, it can only resume **on
172
+ the original host's original GPU** — the resumability cliff (VAST3).
173
+ - **Detection signal + grace window:** **little/no advance notice — treat the grace as ~0 s, an abrupt
174
+ pause.** No documented termination signal; a SIGTERM-flush handler is **NOT** a safety net. Detect via the
175
+ API: `show_instance` returns `actual_status` (current container state), `intended_status` (desired state),
176
+ `cur_state` (contract/hardware allocation), and `status_msg` (human string, e.g. "success, running ...")
177
+ (verified docs.vast.ai/api-reference/instances/show-instances 2026-06). A preempted instance stops being
178
+ `running`; the UI shows **Inactive** (stopped, data preserved) / **Scheduling** (waiting for the GPU to
179
+ free) / **Offline** (host gone).
180
+ - **Resume hook:** wait for the higher bid to finish or restart the instance; it returns
181
+ `Scheduling → running` **only if the same GPU is still free** (else it sticks — VAST3), then
182
+ **`/root/onstart.sh` re-runs** and relaunches training (§6). The job itself must be checkpoint-resumable
183
+ (`--resume`, load-latest unconditionally) so the identical command resumes idempotently.
184
+
185
+ **Orchestrator pattern:** poll `actual_status` / `status_msg` on a timer; on preemption, restart (or
186
+ re-launch on a new offer) and let `onstart.sh` + checkpoint-resume recover. Cadence formula (Young/Daly) and
187
+ atomic temp→fsync→rename resume → `references/spot-resilience.md`.
188
+
189
+ → **verify:** kill-and-resume drill — `vastai stop instance <id>` then `vastai start instance <id>`; the job resumes from the last
190
+ checkpoint step, not epoch 0.
191
+
192
+ ---
193
+
194
+ ## 5. TEARDOWN / BILLING *(principle #9 + the Iron Law)*
195
+
196
+ This is the most error-prone section — be precise. (verified docs.vast.ai/.../reference/billing +
197
+ .../manage-instances 2026-06)
198
+
199
+ - **`destroy` is the ONLY thing that stops the full meter** (compute **and** disk). It is **irreversible** —
200
+ all container-disk data is permanently deleted. (`vastai destroy instance <id>`)
201
+ - **`stop` is a trap:** it detaches the GPU and halts compute billing, but **disk keeps charging
202
+ indefinitely** while stopped — "stopping an instance does not avoid storage costs," "you will continue to
203
+ be billed for disk storage, even if your balance is negative." The #1 surprise bill on vast.ai.
204
+ "Stopped" ≠ "meter off."
205
+ - **Bandwidth bills in EVERY state.** Charged "for every byte sent or received... regardless of what state it
206
+ is in" — so even a transfer to/from a *stopped* instance (cloud sync) accrues host-set bandwidth cost (§3).
207
+ - **A Volume keeps billing after the instance is destroyed** until the volume itself is deleted ("charged per
208
+ GB while volume exists," independently from instances).
209
+ - **On-demand instances auto-stop when their host-set lifetime expires** — "when the rental end date is
210
+ reached, the rental contract expires and the instance is stopped." Data remains until destroyed. An
211
+ unattended job can silently end, so checkpoint as if the box disappears at any moment.
212
+ - **Zero / negative balance → deletion.** At $0.00 "your instances, storage volumes, and data will be
213
+ scheduled for deletion unless you add credits"; without a saved card "your instances and stored data will
214
+ be destroyed." There is a "short grace period where your balance may go negative before deletion occurs" —
215
+ do not rely on it.
216
+ - **Poll-loop cost trap:** a status-poll loop with no timeout/error check will loop forever while the
217
+ instance keeps accruing disk + bandwidth charges. Bound every poll loop with `timeout` + an exit check.
218
+
219
+ **Teardown Iron Law (vast.ai instance):** NO `destroy` until checkpoints are **copied off-box AND verified by
220
+ load** — either `vastai copy`-ed to local (`scripts/verify_local.py` reports 100% OK) or `vastai cloud copy`
221
+ confirmed — the copy exit status is checked (VAST2), and the user has **explicitly approved** the
222
+ cost-affecting action. "It looked done in the log" is not evidence (principle #3). Because `destroy` deletes
223
+ the disk and there is **no shared FS to fall back on**, the confirmation gate matters more here, not less.
224
+
225
+ ---
226
+
227
+ ## 6. DAEMON TOOL
228
+
229
+ - **Auto-tmux on SSH login** (same as AutoDL): login attaches a tmux session "to keep the session active
230
+ even if you disconnect." Disable with `touch ~/.no_auto_tmux` then reconnect (verified docs.vast.ai
231
+ jupyter-ssh FAQ 2026-06).
232
+ - **tmux survives an SSH disconnect but NOT a container restart/reboot/spot-resume** — a reboot or
233
+ spot-resume wipes the tmux session. The **durable relaunch hook is `/root/onstart.sh`** (the
234
+ `--onstart-cmd`), which re-runs on every container start. Put the training relaunch there, **not** in
235
+ tmux, so a spot-resume actually restarts the job.
236
+ - **SSH keys apply only to instances created AFTER the key is added** — existing instances do not get a new
237
+ key automatically. Set the account key **before** creating, or inject it via `onstart`. A pasted key missing
238
+ its `ssh-rsa`/`ssh-ed25519` prefix or `user@host` suffix fails key auth and falls back to a password prompt — copy the whole
239
+ line (verified docs.vast.ai jupyter-ssh FAQ 2026-06).
240
+ - **Native queue:** vast.ai has **Serverless / autoscaler** for queue-style workloads, but single-instance
241
+ training has no managed scheduler — the orchestrator + `onstart.sh` + checkpoint-resume **is** the queue.
242
+
243
+ ---
244
+
245
+ ## 7. TOP GOTCHAS (platform-pinned; Symptom → Root cause → Fix)
246
+
247
+ Universal gotchas (CRLF, cgroup OOM, silent-sync, HF stalls, zombie VRAM, GPU-0%-util, scp-resets,
248
+ egress-surcharge) live in `references/gotchas_universal.md` — not repeated here.
249
+
250
+ - **VAST1 — surprise bill on a "stopped" instance.** Symptom: a stopped, idle instance keeps charging for
251
+ days, even past a negative balance. → Root cause: `stop` halts compute only; **disk bills forever while
252
+ stopped**, and bandwidth bills in every state. → Fix: to stop the meter, **`destroy`** (after copy-out per
253
+ §5); never leave an instance merely stopped to "save money."
254
+ - **VAST2 — results gone after teardown.** Symptom: `destroy` run, checkpoints irrecoverable. → Root cause:
255
+ `destroy` permanently nukes container disk and there's **no platform-wide FS to fall back on**. → Fix:
256
+ `vastai copy` out (or `vastai cloud copy` to a bucket) and **check its exit status** BEFORE `destroy`; gate
257
+ the success line on the copy result, never on a log claim.
258
+ - **VAST3 — paused/stopped instance stuck in `Scheduling` though the machine shows "available."** Symptom:
259
+ preempted or stopped run never resumes; the portal still lists the same machine as rentable. → Root cause:
260
+ the instance is **bound to a specific GPU ID** (not the machine); if that GPU was re-rented, it waits
261
+ indefinitely while *other* GPUs on the host stay free. "If stuck >30 s, GPU likely rented by another user."
262
+ → Fix: stop the scheduling attempt, **create a NEW instance on the same host and re-attach the same Volume**
263
+ (works because other GPUs are free), or re-launch on a different offer from an off-box checkpoint; don't
264
+ wait for the same GPU to come back (verified vast-ai.crisp.help + manage-instances 2026-06).
265
+ - **VAST4 — job dies mid-step with no warning.** Symptom: interruptible run vanishes abruptly. → Root cause:
266
+ bid preemption with **~0 s notice and no SIGTERM**; a flush handler never fires. → Fix: periodic checkpoint
267
+ to disk on a Young/Daly timer + load-latest-on-resume; poll `actual_status`/`status_msg` and restart (§4,
268
+ `references/spot-resilience.md`). The bid can't be raised on a live instance — re-launch elsewhere if the
269
+ GPU is gone.
270
+ - **VAST5 — CUDA driver mismatch on a fresh box.** Symptom: `torch.cuda.is_available()` is False / driver
271
+ mismatch error. → Root cause: **CUDA ships in the Docker image, not the host**; the image's CUDA may be
272
+ newer than the host driver supports (image CUDA must be ≤ host driver). → Fix: pick an image whose CUDA ≤
273
+ host driver; verify `nvidia-smi`/`nvcc` inside the container in `onstart` before training (general triangle:
274
+ `gotchas_universal.md` U28).
275
+ - **VAST6 — a service is unreachable on its "own" port.** Symptom: TB/Jupyter/API not reachable at the
276
+ internal port. → Root cause: internal ports map to **random external ports** and there's a **64-port cap**
277
+ per instance. → Fix: open ports with `-p` at create, **discover the external mapping at runtime**
278
+ (`vastai show instance` / IP Port Info pop-up), never hard-code a port.
279
+ - **VAST7 — host vanishes mid-run.** Symptom: instance flips to `Offline`, work lost. → Root cause: it's a
280
+ **marketplace** — an unverified/low-reliability host can disconnect. → Fix: filter offers on
281
+ `verified=true`, high `reliability`, and `direct_port_count>=1`; treat any single host as disposable and
282
+ checkpoint off-box accordingly.
283
+ - **VAST8 — bulk `scp` over the default SSH stalls / crawls.** Symptom: a multi-GB result copy over the
284
+ default endpoint hangs or runs at a trickle. → Root cause: the **default is proxy SSH**, throttled and
285
+ recommended only for <1 GB; large transfers need direct-TCP. → Fix: create with `--direct` (offer must have
286
+ `direct_port_count>=1`) and use that endpoint for `scp`/`vastai copy`; for big *inbound* data prefer
287
+ `wget`/`curl` from a bucket (verified data-movement docs 2026-06).
288
+ - **VAST9 — bandwidth shows up on the bill.** Symptom: a transfer-heavy job costs more than the GPU-hours
289
+ alone. → Root cause: bandwidth is **host-priced and metered per byte in both directions, in every state** —
290
+ some offers are not $0-egress. → Fix: read the per-offer bandwidth rate before committing; pull a dataset
291
+ **once** to durable local/Volume, not per-epoch from a remote bucket (general form: `gotchas_universal.md`
292
+ U14/U15).
293
+ - **VAST10 — disk full, and you can't grow it.** Symptom: `No space left on device` mid-run; `--disk` can't
294
+ be raised. → Root cause: container disk is **fixed at create (min 10 GB) and non-resizable**; Docker
295
+ layers + HF cache + checkpoints overrun it. → Fix: over-provision `--disk` at create; redirect `HF_HOME`
296
+ onto the data disk; prune `latest`/periodic checkpoints, keep only `best` (inode/byte audit:
297
+ `gotchas_universal.md` U6/U7).
298
+ - **VAST11 — secret baked into the image or onstart-cmd is recoverable.** Symptom: a key embedded at build
299
+ time or in `--onstart-cmd` is stored by the platform. → Root cause: image layers and the 16 KB onstart
300
+ string are persisted server-side. → Fix: inject `WANDB_API_KEY`/`HF_TOKEN` via **env vars at create**, never
301
+ baked into image layers or `--onstart-cmd`; stream creds via stdin at runtime (`gotchas_universal.md` U34).
302
+ - **VAST12 — assuming a cross-machine Network Volume exists.** Symptom: a plan relies on a Volume following
303
+ the job to a different host. → Root cause: Volumes are **machine-locked**; cross-machine Network Volumes are
304
+ **not in the current storage docs**. → Fix: design for off-box durability (`vastai cloud copy` to a bucket),
305
+ not a portable volume; only same-machine re-attach is reliable.
306
+ - **VAST13 — instance stuck in `Loading`, never reaches `running`.** Symptom: a new instance sits in
307
+ `Loading`/`Connecting` for many minutes. → Root cause: the host is **pulling a large Docker image** (boot is
308
+ 1–5 min, longer for fat images) or the host link is slow. → Fix: wait out the documented window, then read
309
+ `vastai show logs <id>` (below) for the pull progress; if still stuck, `destroy` and re-create on a faster
310
+ offer with a slimmer image.
311
+
312
+ ### Platform-specific debugging (commands + what to check)
313
+
314
+ - **Read the boot/container/system logs from off-box:**
315
+ `vastai show logs <id> --tail 200 [--filter <grep>] [--daemon-logs]` — uploads container logs (and, with
316
+ `--daemon-logs`, host/system logs) to a generated URL. This is the first stop for a box that won't connect,
317
+ a stuck `Loading`, or a silent `onstart` failure (verified docs.vast.ai/api-reference/instances/show-logs
318
+ 2026-06). The GUI equivalent is the **"Logs" button** on the instance card.
319
+ - **Inspect the live state machine without SSH:** `vastai show instance <id>` (or the API) — compare
320
+ `actual_status` (where the container *is*), `intended_status` (where it *should* be), `cur_state` (contract/
321
+ hardware allocation) and `status_msg`. `intended=running` but `actual≠running` + `Scheduling` ⇒ VAST3
322
+ (GPU-bound wait); `Offline` ⇒ VAST7 (host gone).
323
+ - **Confirm the GPU is really attached:** in `onstart` / first SSH run `nvidia-smi` and
324
+ `python -c "import torch; print(torch.cuda.is_available(), torch.version.cuda)"` — `False`/CPU-only ⇒ VAST5
325
+ (image CUDA > host driver) or no-GPU container (`gotchas_universal.md` U31).
326
+ - **Detect a stuck download inside the box:** `du -sh "${HF_HOME:-$HOME/.cache/huggingface}/hub"` over time (no growth = stalled
327
+ HF pull), `df -h /` (filling = active download) and `df -i /` (inodes), then the resumable-download ladder
328
+ in `gotchas_universal.md` (HF). A fat-image stall *before* SSH is visible only via `vastai show logs`.
329
+ - **Find the real external ports / SSH target:** `vastai show instance <id>` lists the port map and
330
+ `vastai ssh-url <id>` prints the connection string — never assume port 22 is reachable (VAST6).
331
+
332
+ ---
333
+
334
+ ## 8. SCRIPT OVERRIDES
335
+
336
+ Values to parameterize the `scripts/` templates for vast.ai:
337
+
338
+ ```bash
339
+ # DATA_DIR — data + (only) checkpoint mount; NOTHING survives destroy, so durable = off-box copy-out/cloud-sync
340
+ DATA_DIR=/workspace # container disk; survives stop, bills forever, GONE on destroy
341
+ DURABLE_DIR=off-box # no destroy-surviving mount: vastai copy / vastai cloud copy before destroy (§5)
342
+ # PROXY_HOOK — none at platform level (no /etc/network_turbo). HF mirror is the JOB's own env if needed:
343
+ PROXY_HOOK='' # set HF_ENDPOINT=https://hf-mirror.com in the job env only if a host can't reach HF
344
+ # CRED_FILE — empty: vast's key is the VAST_API_KEY env var, not a file. WANDB_API_KEY/HF_TOKEN also arrive via env.
345
+ CRED_FILE="" # no cred FILE on disk → run_one's [ -n "$CRED_FILE" ] guard skips the cat; VAST_API_KEY + WANDB_API_KEY/HF_TOKEN injected via env at create, NOT into the image or onstart-cmd
346
+ # SCRATCH — what to prune (disk is fixed-size, non-resizable → prune aggressively)
347
+ SCRATCH='latest.pth periodic-*.pth *.tmp ~/.cache/huggingface/hub/blobs' # keep only best + tiny eval JSONs
348
+ # HF_HOME — redirect cache off the small root onto the data disk
349
+ HF_HOME=/workspace/.cache/huggingface
350
+ # DETACH — durable relaunch is onstart.sh, NOT tmux (tmux dies on container restart/spot-resume)
351
+ DETACH='/root/onstart.sh' # re-runs on every container start; tmux only for an attached SSH session
352
+ ```
353
+
354
+ **Secrets note:** inject `WANDB_API_KEY` / `HF_TOKEN` via **env vars at create**, never baked into the Docker
355
+ image layers or the 16 KB `--onstart-cmd` (both are stored by the platform — VAST11).
@@ -0,0 +1,206 @@
1
+ # China network + model-download reference
2
+
3
+ Universal recipe for pulling code, packages, and model weights onto **any GPU box behind the GFW** —
4
+ AutoDL, 矩池云, 恒源云, Featurize, 揽睿星舟, or a bare CN SSH instance. The whole problem reduces to **four
5
+ orthogonal env-var switches** (mirror, cache location, resume tier, proxy scope); none requires editing
6
+ training code. This file owns the CN-specific transport swap and stall-retry; **REQUIRED:**
7
+ `huggingface-skills:hf-cli` owns the generic `hf download` / `hf upload` verbs underneath it.
8
+
9
+ Universal gotchas (inode caps, silent sync, symlinked caches) are **not** restated here — see
10
+ `references/gotchas_universal.md`. The AutoDL-pinned form lives in `profiles/autodl.md`.
11
+
12
+ To jump: `grep -in '<keyword>' references/china-network.md` (try `mirror`, `HF_ENDPOINT`, `hfd`,
13
+ `no_proxy`, `hf_transfer`, `decision`).
14
+
15
+ ## Table of contents
16
+
17
+ 1. Mirrors table — PyPI / conda / HuggingFace / alt hub
18
+ 2. Env switchboard — the four switches + the import-time trap + cache redirect
19
+ 3. Resumable-download ladder — three tiers + the `hf_transfer` caution
20
+ 4. The `no_proxy` trap — a proxy that fixes one domain breaks all the others
21
+ 5. Decision rule + `scripts/setup-china-mirrors.sh`
22
+
23
+ ---
24
+
25
+ ## 1. Mirrors table
26
+
27
+ Swap the *source*, not the workflow. Same package names, same repo IDs — only the endpoint changes. Ship
28
+ this verbatim; it is identical across every CN platform.
29
+
30
+ | Channel | Set | Endpoint(s) |
31
+ |---|---|---|
32
+ | **PyPI** | `pip config set global.index-url <url>` or `pip install -i <url> pkg` | Tsinghua TUNA `https://pypi.tuna.tsinghua.edu.cn/simple` · Aliyun `https://mirrors.aliyun.com/pypi/simple` · USTC `https://pypi.mirrors.ustc.edu.cn/simple` |
33
+ | **conda** | channels in `~/.condarc` (TUNA Anaconda) | `https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main` + `.../free` + the `cloud/` channels (pytorch, conda-forge) |
34
+ | **HuggingFace** | `export HF_ENDPOINT=https://hf-mirror.com` | drop-in reverse proxy — identical repo IDs, identical `hf download` / `from_pretrained` calls |
35
+ | **Alt model hub** | ModelScope CLI / SDK | `pip install modelscope`; `modelscope download <id>` or `snapshot_download(id, ...)` — often hosts the same Qwen / GLM / Llama weights domestically |
36
+
37
+ **conda trap — NEVER mirror `pytorch-nightly`.** TUNA (and every CN Anaconda mirror) syncs the stable
38
+ `pytorch` channel but **does not carry `pytorch-nightly`** — pointing the nightly channel at a mirror
39
+ silently resolves to a stale or absent build. Install nightly only from the official channel (over a real
40
+ proxy if the box cannot reach it directly), and mirror just the stable channels.
41
+
42
+ Source: HF-Mirror `https://hf-mirror.com/`; TUNA PyPI `https://mirrors.tuna.tsinghua.edu.cn/help/pypi/`;
43
+ TUNA Anaconda `https://mirrors.tuna.tsinghua.edu.cn/help/anaconda/`; ModelScope client
44
+ `https://github.com/modelscope/modelscope_hub`.
45
+
46
+ ---
47
+
48
+ ## 2. Env switchboard + the import-time trap
49
+
50
+ Everything below is **environment variables only** — no code edits. Export them once per shell (or bake
51
+ them into `scripts/setup-china-mirrors.sh`, §5) before anything that touches the wire.
52
+
53
+ ```bash
54
+ # --- mirror routing ---
55
+ export HF_ENDPOINT=https://hf-mirror.com # MUST precede any HF import (see trap below)
56
+ # --- caches OFF the small reset-on-release system disk, ONTO the data disk ---
57
+ export HF_HOME=/path/to/datadisk/hf # parent for hub/, datasets/, etc.
58
+ export HF_HUB_CACHE=/path/to/datadisk/hf/hub # the model-blob cache specifically
59
+ export MODELSCOPE_CACHE=/path/to/datadisk/modelscope
60
+ # --- keep hf_transfer OFF on flaky CN links (see §3) ---
61
+ export HF_HUB_ENABLE_HF_TRANSFER=0
62
+ ```
63
+
64
+ **The import-time trap — `HF_ENDPOINT` is read once, at import.** `huggingface_hub` / `transformers` /
65
+ `datasets` snapshot `HF_ENDPOINT` the moment they are **imported**. Setting it *after* the import (or in a
66
+ notebook cell run after the first `import transformers`) is a no-op — the library already cached the
67
+ international endpoint and every download hits the slow path. Two safe forms:
68
+
69
+ ```bash
70
+ # Inline on the command — the env is set before the interpreter starts:
71
+ HF_ENDPOINT=https://hf-mirror.com python train.py
72
+ # Or export in the wrapper, ABOVE any python invocation:
73
+ export HF_ENDPOINT=https://hf-mirror.com # then later: python -m src.train ...
74
+ ```
75
+
76
+ **Cache redirect — why it matters.** Most CN images pair a tiny reset-on-release system disk with a larger
77
+ persistent data disk. Left at defaults, `~/.cache/huggingface` lands on the system disk and either fills it
78
+ (crashing downloads) or is **wiped on restart** on platforms where `/root` is ephemeral. Redirecting
79
+ `HF_HOME` / `HF_HUB_CACHE` / `MODELSCOPE_CACHE` onto the data disk ties model storage to the same
80
+ disk-budget discipline as checkpoints (principle #5; survival matrix in each profile).
81
+
82
+ Source: HF-Mirror `https://hf-mirror.com/`; ModelScope client
83
+ `https://github.com/modelscope/modelscope_hub`.
84
+
85
+ ---
86
+
87
+ ## 3. Resumable-download ladder
88
+
89
+ Bulk weight pulls are the prototypically flaky step on a CN link — a stall is **not** a permanent failure,
90
+ and every tier below accumulates progress across kills. Escalate by file size and instability.
91
+
92
+ **Tier 1 — `hf download <repo> --resume-download` (default).**
93
+ Writes partial blobs as `*.incomplete`; re-running the identical command resumes from the byte offset. Best
94
+ for single repos under ~10 GB. Wrap in a `timeout … && break` retry loop so a stall self-recovers (the loop gives up silently after its last attempt, so verify the download before training):
95
+
96
+ ```bash
97
+ #!/usr/bin/env bash
98
+ set -u
99
+ for _ in $(seq 1 20); do
100
+ timeout 600 hf download "$REPO" --local-dir "$DIR" --resume-download && break
101
+ echo "stall, retrying (progress is saved)"; sleep 5
102
+ done
103
+ ```
104
+
105
+ (Underlying verbs — `hf download --resume-download`, `hf cache verify` — belong to **REQUIRED:**
106
+ `huggingface-skills:hf-cli`; this ladder only wraps them with CN-mirror routing + stall-retry.)
107
+
108
+ **Tier 2 — `hfd.sh` (aria2 multi-connection) for any single file > 10 GB.**
109
+ `hfd.sh` (the HF-Mirror companion script) drives `aria2c` with many parallel connections per file —
110
+ markedly faster and more stall-resistant than the single-stream CLI on large `.safetensors` shards over a
111
+ congested evening link. Reach for it whenever one file exceeds ~10 GB:
112
+
113
+ ```bash
114
+ ./hfd.sh "$REPO" --tool aria2c -x 8 # 8 connections per file, resumes on re-run
115
+ ```
116
+
117
+ **Tier 3 — ModelScope `snapshot_download` (HTTP-Range resume).**
118
+ When a model exists on ModelScope (most CN-origin models do), pull it domestically — `snapshot_download`
119
+ does per-file HTTP-Range resume, per-file retry with backoff, and SHA256 verification, all over a domestic
120
+ route that never touches the GFW:
121
+
122
+ ```python
123
+ from modelscope import snapshot_download
124
+ snapshot_download("Org/Model", local_dir="/path/to/datadisk/model")
125
+ ```
126
+
127
+ Note: ModelScope writes a plain directory and does **not** populate the HF cache, so
128
+ `from_pretrained("Org/Model")` won't find it — point the load at the local dir.
129
+
130
+ **`hf_transfer` caution — keep `HF_HUB_ENABLE_HF_TRANSFER=0` on flaky CN networks.**
131
+ `hf_transfer` is a Rust accelerator that helps on fast, stable links, but it has a **documented
132
+ hang-with-no-error** in exactly the unstable-bandwidth conditions CN ops hit — the download wedges with no
133
+ progress and no exception, defeating every retry loop above. Leave it **off** by default on any CN box;
134
+ only enable it once a route is verified fast and stable.
135
+
136
+ Source: hf CLI resume `https://github.com/huggingface/huggingface_hub/issues/3580`; hf_transfer hang
137
+ `https://github.com/huggingface/hf_transfer/issues/30`; ModelScope download
138
+ `https://deepwiki.com/modelscope/modelscope/3.1-model-download-and-caching`.
139
+
140
+ ---
141
+
142
+ ## 4. The `no_proxy` trap
143
+
144
+ **The highest-value gotcha in this file.** A Clash / VPN proxy added to reach `huggingface.co`
145
+ **simultaneously breaks every domestic mirror** — `pip`, the TUNA index, ModelScope, intra-cloud OSS all
146
+ get routed out through an overseas exit node, producing `ProxyError` or multi-minute stalls (principle #7:
147
+ a proxy speeds ONE route and slows the others).
148
+
149
+ **Symptom** → after exporting `http_proxy`/`https_proxy` to fix HF, `pip install` and ModelScope downloads
150
+ hang or raise `ProxyError`, while `huggingface.co` now works.
151
+ **Root cause** → the proxy is global; domestic mirrors that were fast on the direct route are now hauled
152
+ overseas and back.
153
+ **Fix** → exempt every domestic host from the proxy with a `no_proxy` allowlist, minding these library
154
+ quirks:
155
+
156
+ - **Leading-dot domains, no `*` wildcards.** `requests` honors `no_proxy` but does **not** expand `*` — use
157
+ `.modelscope.cn` (leading dot matches the domain and all subdomains), never `*.modelscope.cn`.
158
+ - **Set BOTH `no_proxy` and `NO_PROXY`.** Different libraries read different casings; set both to the same
159
+ value.
160
+ - **List `127.0.0.1` AND `localhost`.** They are distinct entries; omitting either lets a loopback call
161
+ (TensorBoard, a local API) get proxied.
162
+ - **`pip` ignores `no_proxy` for its own connections** — pass `pip install --proxy ""` to force pip onto the
163
+ direct route regardless of an inherited proxy env.
164
+
165
+ ```bash
166
+ # Only export this WHEN a proxy is present (see below):
167
+ DOMESTIC=".tuna.tsinghua.edu.cn,.aliyun.com,.aliyuncs.com,.ustc.edu.cn,.modelscope.cn,.tencentyun.com"
168
+ export no_proxy="127.0.0.1,localhost,${DOMESTIC}"
169
+ export NO_PROXY="$no_proxy"
170
+ ```
171
+
172
+ **A clean box with no proxy needs no `no_proxy` at all.** `no_proxy` only un-routes a proxy that is already
173
+ set. On a freshly rented box with no `http_proxy`/`https_proxy` exported, adding `no_proxy` does nothing —
174
+ add it **only** in the same breath as exporting a proxy (§5's "real overseas proxy" branch), and clear it
175
+ when the proxy is unset.
176
+
177
+ Source: requests `no_proxy` `https://github.com/psf/requests/issues/4871`; no_proxy guide
178
+ `https://www.browserstack.com/guide/no_proxy-environment-variable`; Clash pip ProxyError
179
+ `https://github.com/clash-verge-rev/clash-verge-rev/issues/2607`.
180
+
181
+ ---
182
+
183
+ ## 5. Decision rule + delivery
184
+
185
+ **Pick the cheapest route that reaches the weights, in order:**
186
+
187
+ 1. **hf-mirror first** — `HF_ENDPOINT=https://hf-mirror.com`. Drop-in, same repo IDs, no proxy, no
188
+ `no_proxy` to manage. Default for everything.
189
+ 2. **ModelScope** if the model is absent on the mirror or the mirror route is flaky — same Qwen / GLM /
190
+ Llama weights domestically, Tier-3 resume, no GFW crossing.
191
+ 3. **`hfd.sh`** for any single file > 10 GB on a stable-but-slow link — aria2 multi-connection.
192
+ 4. **A real overseas proxy ONLY when a model exists *only* on `huggingface.co`** and neither mirror nor
193
+ ModelScope carries it. The moment a proxy goes on, **immediately apply the §4 `no_proxy` block** so the
194
+ domestic mirrors keep working — and unset both when the pull is done.
195
+
196
+ **Never** reach for a proxy by reflex: it is the slowest, most fragile option and the one that breaks
197
+ everything else. Mirror → alt hub → multi-connection → proxy, in that order of preference.
198
+
199
+ **Ship `scripts/setup-china-mirrors.sh`** — the orchestrator `scp`s it onto the box and `source`s it on
200
+ first connect. It bakes §1 (PyPI + conda mirrors), §2 (the four env switches + cache redirect off the
201
+ system disk), and the §3 default (`HF_HUB_ENABLE_HF_TRANSFER=0`) into one idempotent step, leaving the §4
202
+ proxy block commented out (added only on the rare proxy branch). Author it with `#!/usr/bin/env bash` +
203
+ `set -u`, forward-slash paths, and **no unquoted `|` inside any `grep`** (the shell parses an unquoted pipe as a
203
+ pipeline, so the truncated `grep` is left with no file operand, reads stdin, and hangs the setup forever).
205
+
206
+ Source: HF-Mirror `https://hf-mirror.com/`; ModelScope `https://github.com/modelscope/modelscope_hub`.