opencode-skills-collection 3.1.2 → 3.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +4 -1
  2. package/bundled-skills/agent-creator/SKILL.md +246 -0
  3. package/bundled-skills/ax-extract-workflow/SKILL.md +156 -0
  4. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  5. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  6. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  7. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  8. package/bundled-skills/docs/sources/sources.md +1 -1
  9. package/bundled-skills/docs/users/bundles.md +1 -1
  10. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  11. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  12. package/bundled-skills/docs/users/getting-started.md +1 -1
  13. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  14. package/bundled-skills/docs/users/usage.md +4 -4
  15. package/bundled-skills/docs/users/visual-guide.md +4 -4
  16. package/bundled-skills/lovable-cleanup/SKILL.md +2 -1
  17. package/bundled-skills/remote-gpu-trainer/.gitattributes +8 -0
  18. package/bundled-skills/remote-gpu-trainer/LICENSE +21 -0
  19. package/bundled-skills/remote-gpu-trainer/README.md +267 -0
  20. package/bundled-skills/remote-gpu-trainer/SKILL.md +249 -0
  21. package/bundled-skills/remote-gpu-trainer/evals/README.md +57 -0
  22. package/bundled-skills/remote-gpu-trainer/evals/RESULTS.md +44 -0
  23. package/bundled-skills/remote-gpu-trainer/evals/cases.jsonl +14 -0
  24. package/bundled-skills/remote-gpu-trainer/evals/run_evals.py +68 -0
  25. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/README.md +72 -0
  26. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/queue_1.txt +6 -0
  27. package/bundled-skills/remote-gpu-trainer/profiles/_schema.md +100 -0
  28. package/bundled-skills/remote-gpu-trainer/profiles/autodl.md +327 -0
  29. package/bundled-skills/remote-gpu-trainer/profiles/china.md +397 -0
  30. package/bundled-skills/remote-gpu-trainer/profiles/generic-ssh.md +450 -0
  31. package/bundled-skills/remote-gpu-trainer/profiles/lambda.md +342 -0
  32. package/bundled-skills/remote-gpu-trainer/profiles/paperspace.md +365 -0
  33. package/bundled-skills/remote-gpu-trainer/profiles/runpod.md +164 -0
  34. package/bundled-skills/remote-gpu-trainer/profiles/vastai.md +355 -0
  35. package/bundled-skills/remote-gpu-trainer/references/china-network.md +206 -0
  36. package/bundled-skills/remote-gpu-trainer/references/gotchas_universal.md +704 -0
  37. package/bundled-skills/remote-gpu-trainer/references/lifecycle_checklist.md +148 -0
  38. package/bundled-skills/remote-gpu-trainer/references/monitoring_patterns.md +327 -0
  39. package/bundled-skills/remote-gpu-trainer/references/multinode.md +190 -0
  40. package/bundled-skills/remote-gpu-trainer/references/parallel_ablation.md +196 -0
  41. package/bundled-skills/remote-gpu-trainer/references/principles.md +179 -0
  42. package/bundled-skills/remote-gpu-trainer/references/self-improvement.md +74 -0
  43. package/bundled-skills/remote-gpu-trainer/references/spot-resilience.md +235 -0
  44. package/bundled-skills/remote-gpu-trainer/references/ssh_transport.md +270 -0
  45. package/bundled-skills/remote-gpu-trainer/references/training/by-domain.md +230 -0
  46. package/bundled-skills/remote-gpu-trainer/references/training/checkpoint-resume.md +368 -0
  47. package/bundled-skills/remote-gpu-trainer/references/training/convergence-debugging.md +187 -0
  48. package/bundled-skills/remote-gpu-trainer/references/training/data-pipeline.md +119 -0
  49. package/bundled-skills/remote-gpu-trainer/references/training/distributed-launch.md +422 -0
  50. package/bundled-skills/remote-gpu-trainer/references/training/oom-memory.md +338 -0
  51. package/bundled-skills/remote-gpu-trainer/references/training/precision-stability.md +401 -0
  52. package/bundled-skills/remote-gpu-trainer/references/training/throughput-profiling.md +451 -0
  53. package/bundled-skills/remote-gpu-trainer/scripts/aggregate_to_fs.sh +55 -0
  54. package/bundled-skills/remote-gpu-trainer/scripts/check_staleness.py +70 -0
  55. package/bundled-skills/remote-gpu-trainer/scripts/download_loop.sh +67 -0
  56. package/bundled-skills/remote-gpu-trainer/scripts/gpu_health.sh +169 -0
  57. package/bundled-skills/remote-gpu-trainer/scripts/health_patrol.sh.template +67 -0
  58. package/bundled-skills/remote-gpu-trainer/scripts/mem_monitor.sh +67 -0
  59. package/bundled-skills/remote-gpu-trainer/scripts/reap_vram_zombies.sh +175 -0
  60. package/bundled-skills/remote-gpu-trainer/scripts/run_one.sh.template +104 -0
  61. package/bundled-skills/remote-gpu-trainer/scripts/run_queue.sh.template +83 -0
  62. package/bundled-skills/remote-gpu-trainer/scripts/setup-china-mirrors.sh +35 -0
  63. package/bundled-skills/remote-gpu-trainer/scripts/verify_local.py +145 -0
  64. package/package.json +1 -1
  65. package/skills_index.json +66 -0
@@ -0,0 +1,397 @@
1
+ ---
2
+ platform: china-family # 矩池云 Matpool · 恒源云 Gpushare · Featurize · 揽睿星舟 LanRui
3
+ kind: ssh-rental # all four: SSH + Jupyter + tmux, cgroup-isolated, prebuilt conda base
4
+ meter_stop_verb: per-platform # 停止并释放 (Matpool) | 关机→释放 (Gpushare) | 实例归还 (Featurize) | 停止+销毁数据盘 (LanRui)
5
+ meter_stop_irreversible: mixed # releasing the instance is; the persistent vol survives — EXCEPT LanRui 数据盘 bills while stopped
6
+ detach_primitive: tmux # preinstalled on most images; backgrounded python survives tab-close too
7
+ spot_available: false # on-demand only — NO mid-run spot reclaim (see §4)
8
+ spot_grace: n/a # the involuntary-loss vector is auto-release of STOPPED instances, not preemption
9
+ shared_fs: per-platform # /mnt | /hy-netdisk(+/hy-nas) | work+/cloud | /home/user/netdisk/data — region/machine-scoped, see §2
10
+ inode_cap: undocumented # size caps documented (5/20/30/10 GB free); inode caps NOT — measure df -i live
11
+ free_egress: true # intra-China; cross-GFW pulls need a mirror (see references/china-network.md)
12
+ china_mirror_needed: true # all four sit behind the GFW — mirror/proxy story is shared, not per-platform
13
+ host_driver_cuda_max: image-dependent
14
+ local_nvme: per-platform # Gpushare /hy-tmp, LanRui /home/user/datadisk, Featurize local scratch
15
+ ---
16
+
17
+ # Profile: Chinese GPU-rental family (Matpool · Gpushare · Featurize · LanRui)
18
+
19
+ One-line purpose: the AutoDL-shaped Chinese rentals — near-clones that share AutoDL's SSH+tmux+prebuilt-base
20
+ spine but diverge on **what survives a stop**, **whether a stopped data disk still bills**, and **which (if
21
+ any) academic proxy** ships. Treat AutoDL (`profiles/autodl.md`) as the reference implementation; this profile
22
+ records only the deltas, at the FAMILY level first, then a per-platform comparison table.
23
+
24
+ > **Surface to the user up front (principle #10):** ⚠️ Danger clocks (per platform, §5) — a **stopped instance is auto-released** (Gpushare ~10 days, others vary) → data gone; **LanRui's 数据盘 bills while stopped**; Gpushare's **`/hy-tmp` is wiped 24 h after stop** and `/root` resets to the image. Conveniences — built-in **JupyterLab / TensorBoard** quick-tools (all four); **declare any custom port at rent time** ("高级选项") — it can't be opened later.
25
+
26
+ **To jump:** `grep -in '<keyword>' profiles/china.md` (e.g. `proxy`, `ephemeral`, `bills`, `inode`, `LanRui`).
27
+
28
+ ## Table of contents
29
+ 1. LAUNCH · 2. STORAGE MODEL (survival matrix + `/root`-ephemeral trap) · 3. NETWORK (→ `references/china-network.md`)
30
+ · 4. SPOT/INTERRUPTION · 5. TEARDOWN/BILLING · 6. DAEMON TOOL · 7. TOP GOTCHAS (universal → `references/gotchas_universal.md`)
31
+ + Platform-specific debugging · 8. SCRIPT OVERRIDES · 9. Per-platform comparison table
32
+
33
+ > Universal gotchas (CRLF, cgroup OOM, silent sync, tmux-holds-script, disk-budget, secrets-off-shared-FS)
34
+ > are NOT restated here — see `references/gotchas_universal.md`. The mirror/proxy/download story is NOT
35
+ > restated either — it is shared across all CN platforms and lives in `references/china-network.md`.
36
+
37
+ ---
38
+
39
+ ## 1. LAUNCH
40
+
41
+ All four: web console rents a marketplace machine → pick GPU count + a **prebuilt image** (PyTorch/TF + CUDA)
42
+ that *is* the env → connect via SSH (auto-generated password or pushed public key) + JupyterLab; VS Code
43
+ Remote-SSH works on all of them.
44
+
45
+ **Env contract — the image/base IS the env; do not `conda create` on a rental.** Same rule as AutoDL, with
46
+ per-platform base-activation wrinkles (verified per-platform docs 2026-06):
47
+ - **Featurize** — base is fully provisioned and used directly; `pip`/`conda install` into base persists on the
48
+ `work` workspace. Activate and run.
49
+ - **Matpool** — ships a **`myconda` env that auto-activates on startup** (interpreter at
50
+ `/root/miniconda3/envs/myconda/bin/python`). Run directly; no re-enable needed (verified matpool conda docs
51
+ 2026-06 — this corrects the earlier "auto-activate off" note, which was true only for Gpushare).
52
+ - **Gpushare** — ships miniconda but **base auto-activate is *disabled*** (`登陆终端默认取消了自动进入 base 环境`).
53
+ Re-enable (`conda config --set auto_activate_base true`) or activate the named env per session
54
+ (verified gpushare.com/docs/best_practices/conda 2026-06).
55
+ - **LanRui** — image-provisioned (PyTorch images are a purchasable image option); base used directly.
56
+
57
+ An unavoidable custom env goes **on the persistent disk** (`--prefix /<persistent-mount>/myenv`), never the
58
+ small system disk (§2) — a system-disk env is wiped wherever `/root` is ephemeral. On Gpushare specifically,
59
+ docs recommend `conda create -p /hy-netdisk/myenv` (NOT `/hy-tmp` — that auto-clears 24 h after shutdown, GS5).
60
+
61
+ → **verify:** `ssh <alias> 'python -c "import torch;print(torch.cuda.is_available())"'` returns `True`
62
+ against the *prebuilt* interpreter, before any install.
63
+
64
+ ---
65
+
66
+ ## 2. STORAGE MODEL *(survival matrix — principle #4)*
67
+
68
+ **The family-level trap that breaks ported AutoDL habits: `/root` (the system disk) is NOT durable on every
69
+ platform.** AutoDL persists `/root` across a power-off; here it ranges from "resets to image state on every
70
+ restart" (Gpushare) to "wiped the instant the instance is returned" (Featurize). Checkpoints MUST go to the
71
+ platform's persistent mount, not `/root`.
72
+
73
+ Each platform pairs a **small reset-prone system disk** with a **persistent network/data disk**:
74
+
75
+ | Platform | System disk (`/root` etc.) | Persistent mount | Local fast scratch | Free quota |
76
+ |---|---|---|---|---|
77
+ | Matpool | `/root` (instance-local; snapshot-captured) | **`/mnt`** netdisk (survives release, expandable, region-scoped) | `/root` (local) | 5 GB netdisk |
78
+ | Gpushare | `/` incl. `/root` — **resets to image on stop/restart** | **`/hy-netdisk`** (only on *marked* machines) | **`/hy-tmp`** (local SSD; auto-cleared 24 h after stop) | 20 GB sys disk |
79
+ | Featurize | wiped on return | **`work` = `/home/featurize`** (persists) + **`/cloud`** sync drive | local (non-persistent) | **30 GB** free cloud |
80
+ | LanRui | system disk lost on stop | **数据盘 = `/home/user/datadisk`** + shared **`/home/user/netdisk/data`** | 数据盘 (block-storage, ≈ sys-disk speed) | 网盘 10 GB free |
81
+
82
+ **Survival matrix (family):**
83
+
84
+ | Tier | Survives STOP? | Survives RELEASE/RETURN? | Notes |
85
+ |---|---|---|---|
86
+ | System disk / `/root` | varies (Gpushare: **NO, resets to image**; Featurize: wiped on return) | NO | never the checkpoint target |
87
+ | Persistent netdisk / 数据盘 | YES | YES (but the LanRui 数据盘 keeps **billing** until destroyed — §5) | the only safe checkpoint target |
88
+ | Cross-instance shared folder | YES | YES | region/zone/machine-scoped; **clobber risk** (see gotchas) |
89
+ | Gpushare `/hy-tmp` (local SSD) | **NO — auto-cleared 24 h after shutdown** | NO | fast scratch only; copy results to `/hy-netdisk` before stop (GS5) |
90
+
91
+ **Region / scope locks** (analog of AutoDL's region-scoped FS):
92
+ - **Matpool** — `/mnt` netdisk is **region-scoped**: different regions have separate netdisks that don't
93
+ interconnect; pick the region before expanding storage (verified matpool FAQ 2026-06).
94
+ - **Featurize** — code in `work`/`/cloud` persists; per common usage reports the cloud sync drive does not
95
+ share across different regions, while *datasets* are reusable — confirm all instances of a sweep are
96
+ same-region before fan-out. *(med confidence — official wording not re-verified 2026-06.)*
97
+ - **Gpushare** — `/hy-netdisk` exists **only on machines marked as supporting 内网存储**; an unmarked machine
98
+ has no shared mount. A separate **`/hy-nas`** shared storage (0.0007 元/GB·h) exists on specific instances
99
+ (verified gpushare.com/docs/data 2026-06).
100
+ - **LanRui** — `/home/user/netdisk/data` is a **per-availability-zone shared folder auto-mounted into *every*
101
+ workspace in that zone** (`data 文件夹下的任何数据,都可以在该可用区下的所有工作空间中使用`) — convenient,
102
+ but a parallel-ablation clobber hazard (gotcha LR2). The 网盘 is poor at many-small-file *writes*; use the
103
+ 数据盘 for that (verified docs.lanrui.co storage 2026-06).
104
+
105
+ **Inode caps:** size caps are documented (5 / 20 / 30 / 10 GB free across the four); **explicit inode caps are
106
+ NOT documented by any of them**. The many-small-files metadata-exhaustion risk still transfers to any shared
107
+ FS — measure `df -i <persistent-mount>` on a live instance in Phase 0 rather than assuming a number. Redirect
108
+ HF/ModelScope caches off the small system disk → see `references/china-network.md` §2.
109
+
110
+ State the checkpoint mount for §5's teardown verb: write to the **persistent netdisk/数据盘**, never `/root`.
111
+ On Gpushare, also stage hot datasets to `/hy-tmp` (local SSD) for IO, but copy results back to `/hy-netdisk`
112
+ before stopping — `/hy-tmp` is local AND auto-wiped 24 h after shutdown (GS5).
113
+
114
+ ---
115
+
116
+ ## 3. NETWORK
117
+
118
+ **The entire mirror / proxy / resumable-download story is shared across all CN platforms and lives in
119
+ `references/china-network.md` — do NOT duplicate it here.** That reference owns the mirrors table
120
+ (PyPI/conda/HF), `HF_ENDPOINT=https://hf-mirror.com`, the ModelScope fallback, the resumable-download retry
121
+ ladder, the `hf_transfer` hang caution, and the `no_proxy` trap. Only the per-platform **egress accelerator**
122
+ differs and is recorded here (verified per-platform docs 2026-06):
123
+
124
+ - **Gpushare — has a real academic proxy** (the closest analog to AutoDL's `/etc/network_turbo`):
125
+ `export https_proxy=http://turbo.gpushare.com:<PORT> http_proxy=http://turbo.gpushare.com:<PORT>`
126
+ (a `turbo2.gpushare.com:<PORT>` backup host also exists). Two critical differences from AutoDL: (a) it is
127
+ **per-session export**, NOT auto-sourced — re-run it in every new terminal/tmux pane; (b) it **whitelists
128
+ only `*.github.com`, `*.github.io`, `*.githubusercontent.com`, `*.githubassets.com`, `*.huggingface.co`,
129
+ `*.pytorch.org`, `*.kaggle.com` and *restricts every other host*** — so
130
+ `unset http_proxy https_proxy` (or `unset http_proxy && unset https_proxy`) the moment the accelerated pull
131
+ finishes, or `pip`/`apt`/domestic mirrors mystery-fail (gotcha GS2). This is exactly the
132
+ `no_proxy`/route-specific trap in principle #7 — validate the speed test on the same route the real transfer
133
+ uses (verified gpushare.com/docs/instance/network_turbo 2026-06).
134
+ - **Matpool** — no one-command egress proxy; ships source-switch scripts under `/public/script/`
135
+ (`switch_conda_source.sh`, `switch_pip_source.sh`, `switch_apt_source.sh`). Fall back to mirrors
136
+ (`references/china-network.md`).
137
+ - **Featurize / LanRui** — no documented one-command academic proxy surfaced; mirrors only.
138
+
139
+ **Port exposure:** JupyterLab/TensorBoard are built-in quick-tools (all four). **Custom ports must be declared
140
+ at rent time** ("高级选项" on Matpool, e.g. HTTP-6006 TensorBoard / HTTP-8888) — they cannot be opened
141
+ post-launch. Ports may change on restart — re-read the console, don't hard-code a port in an alias. SSH is
142
+ standard OpenSSH (scp/rsync work directly; no proxied-SSH `scp` limitation). Sanitized shapes:
143
+ `ssh -p <PORT> root@<region>.matpool.com` (Matpool, e.g. `hz.matpool.com` / `hz-t2.matpool.com`),
144
+ `ssh -p <PORT> root@<host>.gpushare.com` (Gpushare),
145
+ `ssh user@ssh.<region>.lanrui-ai.com -p <PORT> -i ~/.ssh/id_rsa` (LanRui — public-key must be uploaded to the
146
+ console first).
147
+
148
+ ---
149
+
150
+ ## 4. SPOT / INTERRUPTION + RESUME *(principle #7/#8)*
151
+
152
+ **These are on-demand-only platforms — there is NO spot bid and NO documented mid-run reclaim.** Do not
153
+ build SIGTERM-grace preemption handling here; aggressive retry-on-preemption is over-engineering on this
154
+ family. The real involuntary-loss vectors are:
155
+
156
+ 1. **Auto-release of *stopped* instances.** Gpushare auto-releases (deletes, unrecoverable) a stopped
157
+ pay-as-you-go instance **10 days after stop** (`实例停止 10 天后,会自动释放` — verified
158
+ gpushare.com/docs/instance/manage 2026-06). On arrears, **at noon on the 15th day** Gpushare deletes
159
+ personal data + the `/hy-nas` shared storage + custom images. A stopped box is not a parked box — pull
160
+ anything needed off it before that window.
161
+ 2. **`/hy-tmp` 24-hour auto-clear (Gpushare).** Distinct from instance release: even while the stopped instance *still exists*,
162
+ `/hy-tmp` data is deleted **24 h after the instance is shut down**, and is also wiped on instance migration
163
+ (GS5).
164
+ 3. **GPU-idle auto-shutdown.** Most platforms offer an opt-in "idle → auto-stop" policy to prevent waste; if
165
+ enabled it can stop a job that merely went quiet (e.g. between epochs with no GPU util) — keep it off for
166
+ long single-GPU jobs unless heartbeat is guaranteed.
167
+ 4. **Platform churn (LanRui).** LanRui migrated domain `lanrui-ai.com` → **`lanrui.co`** (old-domain data not
168
+ retained after **2024-11-01**) and retired its **T1/T2 zones on 2025-06-30**, moving users to a new "Cova"
169
+ platform — **re-verify current console paths/domain before scripting against any cached LanRui path**.
170
+
171
+ **Resume hook:** checkpoint-to-durable + load-latest-on-startup (principle #8) is still the right spine — here
172
+ it guards against a forgotten stop, a 10-day auto-release, and a `/hy-tmp` 24 h wipe, not a spot kill. The
173
+ cadence formula in `references/spot-resilience.md` still applies if a job is long enough to span a forced stop.
174
+
175
+ ---
176
+
177
+ ## 5. TEARDOWN / BILLING *(principle #9 + the Iron Law)*
178
+
179
+ **The meter-stop verb is per-platform — bind it from the table below before clicking anything.** The Iron Law
180
+ (SKILL.md Phase 5) holds unchanged: NO release/return/destroy until checkpoints are **pulled to local AND
181
+ verified by load**, and the user has approved the cost-affecting action.
182
+
183
+ | Platform | Meter-stop verb | What it preserves | Cost trap |
184
+ |---|---|---|---|
185
+ | Matpool | **停止并释放** (stop+release) | `/mnt` netdisk persists (region-scoped) | `.snap` snapshots silently eat the 5 GB netdisk (MP1) |
186
+ | Gpushare | **关机** stops compute → **释放** deletes | `/hy-netdisk` persists; `/hy-tmp` cleared 24 h post-stop; `/root` **resets to image** | stopped instance **auto-released at 10 days** (GS4); arrears purge day-15 noon |
187
+ | Featurize | **实例归还** (return) | only `work` (`/home/featurize`) + `/cloud` persist | everything else **wiped immediately on return** (FZ1) |
188
+ | LanRui | **停止** stops compute; **must *销毁数据盘*** (destroy the 数据盘) to stop disk billing | 网盘 + 数据盘 persist | **数据盘 bills hourly while the workspace is merely STOPPED** (LR1) |
189
+
190
+ **The single most dangerous divergence: on LanRui, "stop to save money" is wrong.** The 数据盘
191
+ (`/home/user/datadisk`, block storage, bought in 200 G / 500 G specs) bills hourly from *creation* until
192
+ *destroyed*, even while the workspace is stopped — `工作空间停止运行,未销毁的数据盘也将持续计费` (verified
193
+ docs.lanrui.co storage + lanrui.co/pricing 2026-06). So a stopped LanRui workspace keeps a meter running. To
194
+ actually stop all billing: stop the workspace AND destroy the 数据盘 (after the Iron-Law pull+verify). The 网盘
195
+ (10 GB free, 0.15 元/GB·月 overage) persists separately. Contrast: on Matpool/Gpushare/Featurize,
196
+ release/return/归还 ends compute billing and the persistent volume simply survives (Gpushare /hy-netdisk and
197
+ /hy-nas bill per-GB but are not destroyed by stopping).
198
+
199
+ **Cost-pause analogs (cheaper than full release, data kept):** Gpushare **无卡模式 / 无卡启动** (low-core
200
+ CPU-only restart, no GPU) is the analog of AutoDL's no-GPU restart — keeps `/hy-netdisk` data while paused at a
201
+ fraction of the GPU rate, ideal for env-config + dataset download (verified gpushare 无卡启动 announcement
202
+ 2026-06). LanRui supports an **auto-stop timer** (set a stop time at workspace start) and per-hour billing.
203
+
204
+ ---
205
+
206
+ ## 6. DAEMON TOOL
207
+
208
+ **tmux** is the family detach primitive — preinstalled on most images. Caveat (from Matpool docs, true
209
+ family-wide): **run tmux from a local SSH session, NOT the Jupyter web terminal** — keybindings collide with
210
+ tmux's prefix. A backgrounded `nohup python … </dev/null >log 2>&1 &` also survives a tab-close / page refresh
211
+ on Featurize (process not killed; only notebook cell state lost) — but tmux is preferred for a named,
212
+ re-attachable session.
213
+
214
+ tmux survives an **SSH drop** but **NOT** an instance **stop/restart** on any platform (on Gpushare the restart
215
+ resets `/root`, taking the tmux server and any `/root` logs with it) — so the durable spine is
216
+ checkpoint-to-persistent-disk (§2, principle #8), not the tmux session. LanRui additionally supports
217
+ **multi-machine multi-GPU distributed training** — if used, see `references/multinode.md`.
218
+
219
+ ---
220
+
221
+ ## 7. TOP GOTCHAS *(platform-pinned; universal ones → `references/gotchas_universal.md`)*
222
+
223
+ ### Family-wide (China-specific, not in the universal catalog)
224
+
225
+ **CN1 — `/root` ephemerality silently loses work.**
226
+ Symptom: code/checkpoints written to `/root` vanish after a stop/restart (Gpushare) or instance return
227
+ (Featurize). → Root cause: the system disk resets to image state / is wiped on return — unlike AutoDL, which
228
+ persists `/root` across power-off. → Fix: write *everything* to the persistent mount (§2); treat `/root` as
229
+ RAM. Audit with `ls <persistent-mount>` after a test stop before trusting it for a real run.
230
+
231
+ **CN2 — GPU-idle auto-stop kills a quiet job.**
232
+ Symptom: a long job dies mid-run with no error; console shows "auto-stopped (idle)". → Root cause: an opt-in
233
+ idle-shutdown policy stopped the instance during a low-GPU-util phase (data loading, eval, between epochs).
234
+ → Fix: disable idle-auto-stop for long jobs, or emit a periodic GPU-touching heartbeat; confirm the policy
235
+ state in Phase 0.
236
+
237
+ ### Matpool (matpool.com)
238
+
239
+ **MP1 — `.snap` snapshots silently consume the 5 GB netdisk.**
240
+ Symptom: "保存环境" / snapshot saves fail or the netdisk fills with no obvious culprit. → Root cause: snapshots
241
+ are written as `.snap` files **into the netdisk** and count against its tiny 5 GB quota (verified matpool
242
+ snapshot docs 2026-06). → Fix: prune old `.snap` files (deleting one frees the quota); keep only the latest
243
+ needed env snapshot.
244
+
245
+ **MP2 — `/mnt` is excluded from snapshots, and the machine is locked while saving.**
246
+ Symptom: "保存环境" doesn't capture code under `/mnt`; the instance is unusable during the save. → Root cause:
247
+ a snapshot captures **everything *except* `/mnt`** (the netdisk mount), and the machine cannot be used while
248
+ the snapshot writes. → Fix: to *shrink* a snapshot move code/data to `/mnt` first (it won't be captured); to
249
+ *preserve* code via snapshot keep it OFF `/mnt`. Ensure no running process before triggering a save.
250
+
251
+ **MP3 — region-scoped netdisk strands data on a sweep across regions.**
252
+ Symptom: a second instance in another region can't see files written by the first; expanded storage "missing".
253
+ → Root cause: `/mnt` netdisks are separate per region and do not interconnect. → Fix: keep all instances of a
254
+ sweep in one region; pick region before expanding (verified matpool FAQ 2026-06).
255
+
256
+ ### Gpushare (gpushare.com)
257
+
258
+ **GS1 — `/root` resets to image state on every shutdown/restart.** (The instance of CN1 to remember by name.)
259
+ Symptom: installed packages / code / logs under `/root` gone after restart. → Root cause: only
259
+ `/hy-netdisk` persists (`/hy-tmp` is kept only until 24 h after a stop, GS5); `/` reverts to the image. → Fix: env on `/hy-netdisk`, hot data on `/hy-tmp`, results
261
+ synced to `/hy-netdisk` before stop.
262
+
263
+ **GS2 — turbo proxy left on blocks non-whitelisted hosts.**
264
+ Symptom: after `export …turbo.gpushare.com…`, `pip install` / `apt` / domestic mirrors hang or `ProxyError`.
265
+ → Root cause: the academic proxy whitelists only GitHub/HF/PyTorch/Kaggle and **restricts everything else**
266
+ (verified network_turbo docs 2026-06). → Fix: `unset http_proxy https_proxy` the moment the accelerated pull
267
+ finishes (§3). Same shape as the `no_proxy` trap in `references/china-network.md`.
268
+
269
+ **GS3 — `/hy-netdisk` absent on unmarked machines.**
270
+ Symptom: scripts referencing `/hy-netdisk` fail on some rentals. → Root cause: the shared netdisk exists only
271
+ on machines marked as supporting 内网存储. → Fix: check `mount | grep hy-netdisk` in Phase 0; fall back to
272
+ personal cloud storage via `oss cp` (OSS tool, ~300 Mbps, compressed archives only) if absent.
273
+
274
+ **GS4 — stopped instance auto-released at 10 days; arrears purge at day 15.** Symptom: a parked stopped
275
+ instance disappears, or shared/personal data is gone after non-payment. → Root cause: pay-as-you-go auto-
276
+ release 10 days after stop (`实例停止 10 天后自动释放`); on arrears, day-15-noon deletes personal data +
277
+ `/hy-nas` + custom images (verified gpushare docs 2026-06). → Fix: pull results off a stopped box promptly;
278
+ don't treat "stopped" as durable parking; keep the balance positive.
279
+
280
+ **GS5 — `/hy-tmp` auto-cleared 24 h after shutdown (and on migration).** *(NEW — corrects the prior "/hy-tmp
281
+ persists" assumption.)* Symptom: training data/scratch under `/hy-tmp` gone the day after a stop, even though
282
+ the instance still exists. → Root cause: `/hy-tmp` is per-server local scratch, auto-deleted 24 h after
283
+ shutdown and wiped on instance migration (verified gpushare.com/docs/data/storage 2026-06). → Fix: treat
284
+ `/hy-tmp` as IO scratch only; sync anything durable to `/hy-netdisk` before stopping; do NOT
285
+ `conda create -p /hy-tmp/...` for a persistent env (use `/hy-netdisk`).
286
+
287
+ ### Featurize (featurize.cn)
288
+
289
+ **FZ1 — anything outside `work`/`/cloud` is wiped the instant the instance is returned.** (The strictest
290
+ "what survives" rule of the four.) Symptom: results outside `/home/featurize` or `/cloud` gone after 归还.
291
+ → Root cause: only `work` (per-user cloud storage, `工作区可以一直保存项目文件`) and the `/cloud` sync drive
292
+ persist; everything else is destroyed on return (verified Featurize tutorials 2026-06). → Fix: write all
293
+ durable output under `work`/`/cloud`; verify before returning.
294
+
295
+ **FZ2 — `/cloud` sync drive lag makes edits *look* saved but not land.** Symptom: VS Code edits / files appear
296
+ saved locally but are missing after reconnect or return (the "工作区中修改代码后无法保存" complaint). → Root
297
+ cause: the Remote-SSH sync to the cloud drive is not always real-time, especially on slow links or large files.
298
+ → Fix: explicit `Ctrl+S`, then verify on the server (`ls -la` / `cat` the file) before trusting it; on a
299
+ flaky connection, close and re-open the Remote-SSH session (transient failures are expected).
300
+
301
+ **FZ3 — 30 GB free cloud quota silently breaks large writes / `conda create`.** *(corrects the prior "~20 GB"
302
+ figure.)* Symptom: env creation or large copies into `work`/`/cloud` fail or truncate. → Root cause: the free
303
+ cloud storage is **30 GB** (verified featurize.cn 2026-06); over it, writes fail. → Fix: `du -sh ~/work /cloud`
304
+ to watch headroom; keep only the active env there; large reproducible scratch belongs on local
305
+ non-persistent disk, not the cloud drive.
306
+
307
+ ### LanRui (lanrui.co / lanrui-ai.com)
308
+
309
+ **LR1 — 数据盘 keeps billing while the workspace is merely *stopped*.** (The most expensive divergence — see
310
+ §5.) Symptom: a stopped LanRui workspace still accrues cost. → Root cause: the 数据盘
311
+ (`/home/user/datadisk`) bills hourly from creation until *destroyed*, independent of workspace run-state
312
+ (`工作空间停止运行,未销毁的数据盘也将持续计费` — verified docs.lanrui.co storage 2026-06). → Fix: to stop
313
+ all billing, stop the workspace **and** 销毁 the 数据盘 — only after the Iron-Law pull+verify; the 网盘 keeps
314
+ only what was already copied onto it — move anything still needed off the 数据盘 first.
315
+
316
+ **LR2 — shared `netdisk/data` folder mounted into every same-zone workspace → cross-run clobber.** Symptom:
317
+ a parallel ablation overwrites another run's outputs. → Root cause: `/home/user/netdisk/data` is auto-mounted
318
+ and shared across *all* workspaces in the same availability zone. → Fix: per-job isolated write paths
319
+ (`references/parallel_ablation.md`); never share a mutable output dir under `netdisk/data`. Also: the 网盘 is
320
+ poor at many-small-file *writes* — route those to the 数据盘.
321
+
322
+ **LR3 — platform/domain churn invalidates cached paths.** Symptom: scripted paths/domain fail post-migration.
323
+ → Root cause: domain `lanrui-ai.com` → `lanrui.co` (old data dropped after 2024-11-01); T1/T2 zones retired
324
+ 2025-06-30 → "Cova" platform. → Fix: re-verify console domain + paths in-session before scripting against any
325
+ cached LanRui path.
326
+
327
+ ### Platform-specific debugging
328
+
329
+ Before trusting a run, in Phase 0 (per platform):
330
+ - **Confirm persistence path is real, not `/root`.** `mount | grep -E 'mnt|hy-netdisk|cloud|datadisk|netdisk'`
331
+ then `touch <persistent-mount>/.probe && ls -l <persistent-mount>/.probe`. On Gpushare also confirm
332
+ `/hy-netdisk` is present (GS3) — `mount | grep hy-netdisk` (absent on unmarked machines).
333
+ - **GPU + driver sanity.** `nvidia-smi` (GPU visible, mem free, driver/CUDA), then
334
+ `python -c "import torch;print(torch.__version__, torch.cuda.is_available(), torch.cuda.get_device_name(0))"`
335
+ against the prebuilt interpreter. Mismatched local-vs-server PyTorch silently breaks checkpoint loads on
336
+ Featurize — match versions.
337
+ - **Detect a stuck / throttled download.** `du -sh <cache-dir>` twice ~30 s apart — flat size = stalled (often
338
+ the GFW or a left-on Gpushare turbo proxy restricting a non-whitelisted host, GS2). Cross-check with
339
+ `curl -sI -x "$https_proxy" https://hf-mirror.com` / `env | grep -i proxy`; `unset http_proxy https_proxy`
340
+ and retry on the mirror.
341
+ - **Disk / inode pressure (the silent §2 risk).** `df -h <persistent-mount>` AND `df -i <persistent-mount>` —
342
+ a full inode table fails writes while `df -h` still shows free GB. On Matpool, a filling 5 GB netdisk is
343
+ usually stale `.snap` files (`ls -la /mnt/*.snap`, MP1).
344
+ - **Verify the meter-stop did what was intended.** After "stop", re-check the console billing line — on LanRui
345
+ a stopped workspace whose 数据盘 was NOT destroyed is still metering (LR1); on Gpushare a stopped box still
346
+ counts toward the 10-day auto-release clock (GS4).
347
+ - **Read the running job's log, don't infer from silence.** Job is in tmux/nohup → `tmux capture-pane -pt
348
+ <session>` or `tail -f <persistent-mount>/run.log`. A vanished tmux server after a "restart" means `/root`
349
+ reset (GS1) — the log must live on the persistent mount to survive.
350
+
351
+ ---
352
+
353
+ ## 8. SCRIPT OVERRIDES
354
+
355
+ Parameterize the `scripts/` templates per platform. `PROXY_HOOK`, `HF_HOME`, and the mirror env all defer to
356
+ `references/china-network.md`; only the **mounts** truly differ.
357
+
358
+ | Var | Matpool | Gpushare | Featurize | LanRui |
359
+ |---|---|---|---|---|
360
+ | `DURABLE_DIR=` (durable) | `/mnt` | `/hy-netdisk` | `/home/featurize` (+`/cloud`) | `/home/user/datadisk` (or `/home/user/netdisk/data`) |
361
+ | `DATA_DIR=` (fast/ephemeral) | `/root` | `/hy-tmp` (24 h post-stop wipe) | local tmp | `/home/user/datadisk` scratch |
362
+ | `SCRATCH=` (local, prune) | `/root` | `/hy-tmp` | local tmp | `/home/user/datadisk` scratch |
363
+ | `HF_HOME=` | `/mnt/.cache/hf` | `/hy-netdisk/.cache/hf` | `/cloud/.cache/hf` | `/home/user/datadisk/.cache/hf` |
364
+ | `PROXY_HOOK=` | (mirrors only) | `export …turbo.gpushare.com:<PORT>…` then `unset` | (mirrors only) | (mirrors only) |
365
+ | `CRED_FILE=""` (no file — env var) | `$WANDB_API_KEY` / `$HF_TOKEN` on **ephemeral** disk, never the shared netdisk | same | same | same |
366
+ | `DETACH=` | tmux | tmux | tmux | tmux |
367
+
368
+ `CRED_FILE=""` is deliberate: on these CN platforms the credential is an **env var** (or `.netrc`) on the ephemeral
369
+ disk, not a file on the netdisk — leave it empty so run_one's `[ -n "$CRED_FILE" ]` guard skips the file read
370
+ and `$WANDB_API_KEY` / `$HF_TOKEN` pass through from the platform env.
371
+
372
+ Common to all: the credential lives in an env var or `.netrc` on the **ephemeral system disk**, never on the
373
+ shared/persistent netdisk (a shared `data` folder mounted into every same-zone workspace, like LanRui's, is
374
+ especially leaky — see the universal secrets-off-shared-FS gotcha in `references/gotchas_universal.md`).
375
+
376
+ ---
377
+
378
+ ## 9. Per-platform comparison — the load-bearing differences at a glance
379
+
380
+ The questions the schema asks, answered per platform. This is the table to read first when picking which
381
+ delta applies.
382
+
383
+ | Question | Matpool | Gpushare | Featurize | LanRui |
384
+ |---|---|---|---|---|
385
+ | Prebuilt base-conda env? | yes (**`myconda`, auto-activated**) | yes (miniconda, base auto-activate **off**) | yes (full PyTorch/TF base, pip persists on `work`) | yes (image-provisioned; PyTorch images purchasable) |
386
+ | Academic-acceleration proxy? | no (source-switch scripts only) | **yes** `turbo.gpushare.com:<PORT>` (per-session, 7-host whitelist) | no (mirrors only) | no (mirrors only) |
387
+ | Shared / region FS? | `/mnt` netdisk (**region-scoped**, expandable) | `/hy-netdisk` (only on *marked* machines) + `/hy-nas` | `work`+`/cloud` (cloud sync; not cross-region, med-conf) | `/home/user/netdisk/data` (shared into *every same-zone* workspace) |
388
+ | Inode cap? | undocumented — measure `df -i` | undocumented — measure `df -i` | undocumented — measure `df -i` | undocumented — measure `df -i` |
389
+ | Data disk bills while **stopped**? | no (release ends billing) | no (but stopped box auto-released at 10 d; `/hy-tmp` cleared 24 h) | no (return ends billing) | **YES — 数据盘 bills until destroyed** |
390
+ | Meter-stop verb | 停止并释放 | 关机 → 释放 (+ 无卡模式 pause) | 实例归还 | **停止 + 销毁数据盘** |
391
+ | `/root` survives a stop? | local, lost on release | **NO — resets to image** | **NO — wiped on return** | system disk lost; use 数据盘 |
392
+
393
+ **Bottom line for porting an AutoDL workflow:** the SSH/tmux/smoke/checkpoint spine transfers verbatim; the
394
+ three things to re-bind per platform are (1) the **persistent mount** (never `/root`; on Gpushare never
395
+ `/hy-tmp` either), (2) the **meter-stop verb** — on LanRui, stopping alone is not enough: the 数据盘 must be
396
+ destroyed — and (3) the **proxy hook** (real proxy only on Gpushare, with a strict whitelist; mirrors-only
397
+ elsewhere → `references/china-network.md`).