opencode-skills-collection 3.1.2 → 3.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundled-skills/.antigravity-install-manifest.json +4 -1
- package/bundled-skills/agent-creator/SKILL.md +246 -0
- package/bundled-skills/ax-extract-workflow/SKILL.md +156 -0
- package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
- package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
- package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
- package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
- package/bundled-skills/docs/sources/sources.md +1 -1
- package/bundled-skills/docs/users/bundles.md +1 -1
- package/bundled-skills/docs/users/claude-code-skills.md +1 -1
- package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
- package/bundled-skills/docs/users/getting-started.md +1 -1
- package/bundled-skills/docs/users/kiro-integration.md +1 -1
- package/bundled-skills/docs/users/usage.md +4 -4
- package/bundled-skills/docs/users/visual-guide.md +4 -4
- package/bundled-skills/lovable-cleanup/SKILL.md +2 -1
- package/bundled-skills/remote-gpu-trainer/.gitattributes +8 -0
- package/bundled-skills/remote-gpu-trainer/LICENSE +21 -0
- package/bundled-skills/remote-gpu-trainer/README.md +267 -0
- package/bundled-skills/remote-gpu-trainer/SKILL.md +249 -0
- package/bundled-skills/remote-gpu-trainer/evals/README.md +57 -0
- package/bundled-skills/remote-gpu-trainer/evals/RESULTS.md +44 -0
- package/bundled-skills/remote-gpu-trainer/evals/cases.jsonl +14 -0
- package/bundled-skills/remote-gpu-trainer/evals/run_evals.py +68 -0
- package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/README.md +72 -0
- package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/queue_1.txt +6 -0
- package/bundled-skills/remote-gpu-trainer/profiles/_schema.md +100 -0
- package/bundled-skills/remote-gpu-trainer/profiles/autodl.md +327 -0
- package/bundled-skills/remote-gpu-trainer/profiles/china.md +397 -0
- package/bundled-skills/remote-gpu-trainer/profiles/generic-ssh.md +450 -0
- package/bundled-skills/remote-gpu-trainer/profiles/lambda.md +342 -0
- package/bundled-skills/remote-gpu-trainer/profiles/paperspace.md +365 -0
- package/bundled-skills/remote-gpu-trainer/profiles/runpod.md +164 -0
- package/bundled-skills/remote-gpu-trainer/profiles/vastai.md +355 -0
- package/bundled-skills/remote-gpu-trainer/references/china-network.md +206 -0
- package/bundled-skills/remote-gpu-trainer/references/gotchas_universal.md +704 -0
- package/bundled-skills/remote-gpu-trainer/references/lifecycle_checklist.md +148 -0
- package/bundled-skills/remote-gpu-trainer/references/monitoring_patterns.md +327 -0
- package/bundled-skills/remote-gpu-trainer/references/multinode.md +190 -0
- package/bundled-skills/remote-gpu-trainer/references/parallel_ablation.md +196 -0
- package/bundled-skills/remote-gpu-trainer/references/principles.md +179 -0
- package/bundled-skills/remote-gpu-trainer/references/self-improvement.md +74 -0
- package/bundled-skills/remote-gpu-trainer/references/spot-resilience.md +235 -0
- package/bundled-skills/remote-gpu-trainer/references/ssh_transport.md +270 -0
- package/bundled-skills/remote-gpu-trainer/references/training/by-domain.md +230 -0
- package/bundled-skills/remote-gpu-trainer/references/training/checkpoint-resume.md +368 -0
- package/bundled-skills/remote-gpu-trainer/references/training/convergence-debugging.md +187 -0
- package/bundled-skills/remote-gpu-trainer/references/training/data-pipeline.md +119 -0
- package/bundled-skills/remote-gpu-trainer/references/training/distributed-launch.md +422 -0
- package/bundled-skills/remote-gpu-trainer/references/training/oom-memory.md +338 -0
- package/bundled-skills/remote-gpu-trainer/references/training/precision-stability.md +401 -0
- package/bundled-skills/remote-gpu-trainer/references/training/throughput-profiling.md +451 -0
- package/bundled-skills/remote-gpu-trainer/scripts/aggregate_to_fs.sh +55 -0
- package/bundled-skills/remote-gpu-trainer/scripts/check_staleness.py +70 -0
- package/bundled-skills/remote-gpu-trainer/scripts/download_loop.sh +67 -0
- package/bundled-skills/remote-gpu-trainer/scripts/gpu_health.sh +169 -0
- package/bundled-skills/remote-gpu-trainer/scripts/health_patrol.sh.template +67 -0
- package/bundled-skills/remote-gpu-trainer/scripts/mem_monitor.sh +67 -0
- package/bundled-skills/remote-gpu-trainer/scripts/reap_vram_zombies.sh +175 -0
- package/bundled-skills/remote-gpu-trainer/scripts/run_one.sh.template +104 -0
- package/bundled-skills/remote-gpu-trainer/scripts/run_queue.sh.template +83 -0
- package/bundled-skills/remote-gpu-trainer/scripts/setup-china-mirrors.sh +35 -0
- package/bundled-skills/remote-gpu-trainer/scripts/verify_local.py +145 -0
- package/package.json +1 -1
- package/skills_index.json +66 -0
|
@@ -0,0 +1,397 @@
|
|
|
1
|
+
---
|
|
2
|
+
platform: china-family # 矩池云 Matpool · 恒源云 Gpushare · Featurize · 揽睿星舟 LanRui
|
|
3
|
+
kind: ssh-rental # all four: SSH + Jupyter + tmux, cgroup-isolated, prebuilt conda base
|
|
4
|
+
meter_stop_verb: per-platform # 停止并释放 (Matpool) | 关机→释放 (Gpushare) | 实例归还 (Featurize) | 停止+销毁数据盘 (LanRui)
|
|
5
|
+
meter_stop_irreversible: mixed # releasing the instance is; the persistent vol survives — EXCEPT LanRui 数据盘 bills while stopped
|
|
6
|
+
detach_primitive: tmux # preinstalled on most images; backgrounded python survives tab-close too
|
|
7
|
+
spot_available: false # on-demand only — NO mid-run spot reclaim (see §4)
|
|
8
|
+
spot_grace: n/a # the involuntary-loss vector is auto-release of STOPPED instances, not preemption
|
|
9
|
+
shared_fs: per-platform # /mnt | /hy-netdisk(+/hy-nas) | work+/cloud | /home/user/netdisk/data — region/machine-scoped, see §2
|
|
10
|
+
inode_cap: undocumented # size caps documented (5/20/30/10 GB free); inode caps NOT — measure df -i live
|
|
11
|
+
free_egress: true # intra-China; cross-GFW pulls need a mirror (see references/china-network.md)
|
|
12
|
+
china_mirror_needed: true # all four sit behind the GFW — mirror/proxy story is shared, not per-platform
|
|
13
|
+
host_driver_cuda_max: image-dependent
|
|
14
|
+
local_nvme: per-platform # Gpushare /hy-tmp, LanRui /home/user/datadisk, Featurize local scratch
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
# Profile: Chinese GPU-rental family (Matpool · Gpushare · Featurize · LanRui)
|
|
18
|
+
|
|
19
|
+
One-line purpose: the AutoDL-shaped Chinese rentals — near-clones that share AutoDL's SSH+tmux+prebuilt-base
|
|
20
|
+
spine but diverge on **what survives a stop**, **whether a stopped data disk still bills**, and **which (if
|
|
21
|
+
any) academic proxy** ships. Treat AutoDL (`profiles/autodl.md`) as the reference implementation; this profile
|
|
22
|
+
records only the deltas, at the FAMILY level first, then a per-platform comparison table.
|
|
23
|
+
|
|
24
|
+
> **Surface to the user up front (principle #10):** ⚠️ Danger clocks (per platform, §5) — a **stopped instance is auto-released** (Gpushare ~10 days, others vary) → data gone; **LanRui's 数据盘 bills while stopped**; Gpushare's **`/hy-tmp` is wiped 24 h after stop** and `/root` resets to the image. Conveniences — built-in **JupyterLab / TensorBoard** quick-tools (all four); **declare any custom port at rent time** ("高级选项") — it can't be opened later.
|
|
25
|
+
|
|
26
|
+
**To jump:** `grep -in '<keyword>' profiles/china.md` (e.g. `proxy`, `ephemeral`, `bills`, `inode`, `LanRui`).
|
|
27
|
+
|
|
28
|
+
## Table of contents
|
|
29
|
+
1. LAUNCH · 2. STORAGE MODEL (survival matrix + `/root`-ephemeral trap) · 3. NETWORK (→ `references/china-network.md`)
|
|
30
|
+
· 4. SPOT/INTERRUPTION · 5. TEARDOWN/BILLING · 6. DAEMON TOOL · 7. TOP GOTCHAS (universal → `references/gotchas_universal.md`)
|
|
31
|
+
+ Platform-specific debugging · 8. SCRIPT OVERRIDES · 9. Per-platform comparison table
|
|
32
|
+
|
|
33
|
+
> Universal gotchas (CRLF, cgroup OOM, silent sync, tmux-holds-script, disk-budget, secrets-off-shared-FS)
|
|
34
|
+
> are NOT restated here — see `references/gotchas_universal.md`. The mirror/proxy/download story is NOT
|
|
35
|
+
> restated either — it is shared across all CN platforms and lives in `references/china-network.md`.
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## 1. LAUNCH
|
|
40
|
+
|
|
41
|
+
All four: web console rents a marketplace machine → pick GPU count + a **prebuilt image** (PyTorch/TF + CUDA)
|
|
42
|
+
that *is* the env → connect via SSH (auto-generated password or pushed public key) + JupyterLab; VS Code
|
|
43
|
+
Remote-SSH works on all of them.
|
|
44
|
+
|
|
45
|
+
**Env contract — the image/base IS the env; do not `conda create` on a rental.** Same rule as AutoDL, with
|
|
46
|
+
per-platform base-activation wrinkles (verified per-platform docs 2026-06):
|
|
47
|
+
- **Featurize** — base is fully provisioned and used directly; `pip`/`conda install` into base persists on the
|
|
48
|
+
`work` workspace. Activate and run.
|
|
49
|
+
- **Matpool** — ships a **`myconda` env that auto-activates on startup** (interpreter at
|
|
50
|
+
`/root/miniconda3/envs/myconda/bin/python`). Run directly; no re-enable needed (verified matpool conda docs
|
|
51
|
+
2026-06 — this corrects the earlier "auto-activate off" note, which was true only for Gpushare).
|
|
52
|
+
- **Gpushare** — ships miniconda but **base auto-activate is *disabled*** (`登陆终端默认取消了自动进入 base 环境`).
|
|
53
|
+
Re-enable (`conda config --set auto_activate_base true`) or activate the named env per session
|
|
54
|
+
(verified gpushare.com/docs/best_practices/conda 2026-06).
|
|
55
|
+
- **LanRui** — image-provisioned (PyTorch images are a purchasable image option); base used directly.
|
|
56
|
+
|
|
57
|
+
An unavoidable custom env goes **on the persistent disk** (`--prefix /<persistent-mount>/myenv`), never the
|
|
58
|
+
small system disk (§2) — a system-disk env is wiped wherever `/root` is ephemeral. On Gpushare specifically,
|
|
59
|
+
docs recommend `conda create -p /hy-netdisk/myenv` (NOT `/hy-tmp` — that auto-clears 24 h after shutdown, GS5).
|
|
60
|
+
|
|
61
|
+
→ **verify:** `ssh <alias> 'python -c "import torch;print(torch.cuda.is_available())"'` returns `True`
|
|
62
|
+
against the *prebuilt* interpreter, before any install.
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## 2. STORAGE MODEL *(survival matrix — principle #4)*
|
|
67
|
+
|
|
68
|
+
**The family-level trap that breaks ported AutoDL habits: `/root` (the system disk) is NOT durable on every
|
|
69
|
+
platform.** AutoDL persists `/root` across a power-off; here it ranges from "resets to image state on every
|
|
70
|
+
restart" (Gpushare) to "wiped the instant the instance is returned" (Featurize). Checkpoints MUST go to the
|
|
71
|
+
platform's persistent mount, not `/root`.
|
|
72
|
+
|
|
73
|
+
Each platform pairs a **small reset-prone system disk** with a **persistent network/data disk**:
|
|
74
|
+
|
|
75
|
+
| Platform | System disk (`/root` etc.) | Persistent mount | Local fast scratch | Free quota |
|
|
76
|
+
|---|---|---|---|---|
|
|
77
|
+
| Matpool | `/root` (instance-local; snapshot-captured) | **`/mnt`** netdisk (survives release, expandable, region-scoped) | `/root` (local) | 5 GB netdisk |
|
|
78
|
+
| Gpushare | `/` incl. `/root` — **resets to image on stop/restart** | **`/hy-netdisk`** (only on *marked* machines) | **`/hy-tmp`** (local SSD; auto-cleared 24 h after stop) | 20 GB sys disk |
|
|
79
|
+
| Featurize | wiped on return | **`work` = `/home/featurize`** (persists) + **`/cloud`** sync drive | local (non-persistent) | **30 GB** free cloud |
|
|
80
|
+
| LanRui | system disk lost on stop | **数据盘 = `/home/user/datadisk`** + shared **`/home/user/netdisk/data`** | 数据盘 (block-storage, ≈ sys-disk speed) | 网盘 10 GB free |
|
|
81
|
+
|
|
82
|
+
**Survival matrix (family):**
|
|
83
|
+
|
|
84
|
+
| Tier | Survives STOP? | Survives RELEASE/RETURN? | Notes |
|
|
85
|
+
|---|---|---|---|
|
|
86
|
+
| System disk / `/root` | varies (Gpushare: **NO, resets to image**; Featurize: wiped on return) | NO | never the checkpoint target |
|
|
87
|
+
| Persistent netdisk / 数据盘 | YES | YES (but note the LanRui 数据盘 keeps **billing** until it is destroyed — §5) | the only safe checkpoint target |
|
|
88
|
+
| Cross-instance shared folder | YES | YES | region/zone/machine-scoped; **clobber risk** (see gotchas) |
|
|
89
|
+
| Gpushare `/hy-tmp` (local SSD) | **NO — auto-cleared 24 h after shutdown** | NO | fast scratch only; copy results to `/hy-netdisk` before stop (GS5) |
|
|
90
|
+
|
|
91
|
+
**Region / scope locks** (analog of AutoDL's region-scoped FS):
|
|
92
|
+
- **Matpool** — `/mnt` netdisk is **region-scoped**: different regions have separate netdisks that don't
|
|
93
|
+
interconnect; pick the region before expanding storage (verified matpool FAQ 2026-06).
|
|
94
|
+
- **Featurize** — code in `work`/`/cloud` persists; per common usage reports the cloud sync drive does not
|
|
95
|
+
share across different regions, while *datasets* are reusable — confirm all instances of a sweep are
|
|
96
|
+
same-region before fan-out. *(med confidence — official wording not re-verified 2026-06.)*
|
|
97
|
+
- **Gpushare** — `/hy-netdisk` exists **only on machines marked as supporting 内网存储**; an unmarked machine
|
|
98
|
+
has no shared mount. A separate **`/hy-nas`** shared storage (0.0007 元/GB·h) exists on specific instances
|
|
99
|
+
(verified gpushare.com/docs/data 2026-06).
|
|
100
|
+
- **LanRui** — `/home/user/netdisk/data` is a **per-availability-zone shared folder auto-mounted into *every*
|
|
101
|
+
workspace in that zone** (`data 文件夹下的任何数据,都可以在该可用区下的所有工作空间中使用`) — convenient,
|
|
102
|
+
but a parallel-ablation clobber hazard (gotcha LR2). The 网盘 is poor at many-small-file *writes*; use the
|
|
103
|
+
数据盘 for that (verified docs.lanrui.co storage 2026-06).
|
|
104
|
+
|
|
105
|
+
**Inode caps:** size caps are documented (5 / 20 / 30 / 10 GB free across the four); **explicit inode caps are
|
|
106
|
+
NOT documented by any of them**. The many-small-files metadata-exhaustion risk still transfers to any shared
|
|
107
|
+
FS — measure `df -i <persistent-mount>` on a live instance in Phase 0 rather than assuming a number. Redirect
|
|
108
|
+
HF/ModelScope caches off the small system disk → see `references/china-network.md` §2.
|
|
109
|
+
|
|
110
|
+
State the checkpoint mount for §5's teardown verb: write to the **persistent netdisk/数据盘**, never `/root`.
|
|
111
|
+
On Gpushare, also stage hot datasets to `/hy-tmp` (local SSD) for IO, but copy results back to `/hy-netdisk`
|
|
112
|
+
before stopping — `/hy-tmp` is local AND auto-wiped 24 h after shutdown (GS5).
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## 3. NETWORK
|
|
117
|
+
|
|
118
|
+
**The entire mirror / proxy / resumable-download story is shared across all CN platforms and lives in
|
|
119
|
+
`references/china-network.md` — do NOT duplicate it here.** That reference owns the mirrors table
|
|
120
|
+
(PyPI/conda/HF), `HF_ENDPOINT=https://hf-mirror.com`, the ModelScope fallback, the resumable-download retry
|
|
121
|
+
ladder, the `hf_transfer` hang caution, and the `no_proxy` trap. Only the per-platform **egress accelerator**
|
|
122
|
+
differs and is recorded here (verified per-platform docs 2026-06):
|
|
123
|
+
|
|
124
|
+
- **Gpushare — has a real academic proxy** (the closest analog to AutoDL's `/etc/network_turbo`):
|
|
125
|
+
`export https_proxy=http://turbo.gpushare.com:<PORT> http_proxy=http://turbo.gpushare.com:<PORT>`
|
|
126
|
+
(a `turbo2.gpushare.com:<PORT>` backup host also exists). Two critical differences from AutoDL: (a) it is
|
|
127
|
+
**per-session export**, NOT auto-sourced — re-run it in every new terminal/tmux pane; (b) it **whitelists
|
|
128
|
+
only `*.github.com`, `*.github.io`, `*.githubusercontent.com`, `*.githubassets.com`, `*.huggingface.co`,
|
|
129
|
+
`*.pytorch.org`, `*.kaggle.com` and *restricts every other host*** — so
|
|
130
|
+
`unset http_proxy https_proxy` (or `unset http_proxy && unset https_proxy`) the moment the accelerated pull
|
|
131
|
+
finishes, or `pip`/`apt`/domestic mirrors mystery-fail (gotcha GS2). This is exactly the
|
|
132
|
+
`no_proxy`/route-specific trap in principle #7 — validate the speed test on the same route the real transfer
|
|
133
|
+
uses (verified gpushare.com/docs/instance/network_turbo 2026-06).
|
|
134
|
+
- **Matpool** — no one-command egress proxy; ships source-switch scripts under `/public/script/`
|
|
135
|
+
(`switch_conda_source.sh`, `switch_pip_source.sh`, `switch_apt_source.sh`). Fall back to mirrors
|
|
136
|
+
(`references/china-network.md`).
|
|
137
|
+
- **Featurize / LanRui** — no documented one-command academic proxy surfaced; mirrors only.
|
|
138
|
+
|
|
139
|
+
**Port exposure:** JupyterLab/TensorBoard are built-in quick-tools (all four). **Custom ports must be declared
|
|
140
|
+
at rent time** ("高级选项" on Matpool, e.g. HTTP-6006 TensorBoard / HTTP-8888) — they cannot be opened
|
|
141
|
+
post-launch. Ports may change on restart — re-read the console, don't hard-code a port in an alias. SSH is
|
|
142
|
+
standard OpenSSH (scp/rsync work directly; no proxied-SSH `scp` limitation). Sanitized shapes:
|
|
143
|
+
`ssh -p <PORT> root@<region>.matpool.com` (Matpool, e.g. `hz.matpool.com` / `hz-t2.matpool.com`),
|
|
144
|
+
`ssh -p <PORT> root@<host>.gpushare.com` (Gpushare),
|
|
145
|
+
`ssh user@ssh.<region>.lanrui-ai.com -p <PORT> -i ~/.ssh/id_rsa` (LanRui — legacy-domain shape; re-verify the current host in the console, see LR3; public-key must be uploaded to the
|
|
146
|
+
console first).
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## 4. SPOT / INTERRUPTION + RESUME *(principle #7/#8)*
|
|
151
|
+
|
|
152
|
+
**These are on-demand-only platforms — there is NO spot bid and NO documented mid-run reclaim.** Do not
|
|
153
|
+
build SIGTERM-grace preemption handling here; aggressive retry-on-preemption is over-engineering on this
|
|
154
|
+
family. The real involuntary-loss vectors are:
|
|
155
|
+
|
|
156
|
+
1. **Auto-release of *stopped* instances.** Gpushare auto-releases (deletes unrecoverably) a stopped
|
|
157
|
+
pay-as-you-go instance **10 days after stop** (`实例停止 10 天后,会自动释放` — verified
|
|
158
|
+
gpushare.com/docs/instance/manage 2026-06). On arrears, **at noon on the 15th day** Gpushare deletes
|
|
159
|
+
personal data + the `/hy-nas` shared storage + custom images. A stopped box is not a parked box — pull
|
|
160
|
+
anything needed off it before that window.
|
|
161
|
+
2. **`/hy-tmp` 24-hour auto-clear (Gpushare).** Distinct from instance release: even while the instance itself still exists (stopped, not released),
|
|
162
|
+
`/hy-tmp` data is deleted **24 h after the instance is shut down**, and is also wiped on instance migration
|
|
163
|
+
(GS5).
|
|
164
|
+
3. **GPU-idle auto-shutdown.** Most platforms offer an opt-in "idle → auto-stop" policy to prevent waste; if
|
|
165
|
+
enabled it can stop a job that merely went quiet (e.g. between epochs with no GPU util) — keep it off for
|
|
166
|
+
long single-GPU jobs unless heartbeat is guaranteed.
|
|
167
|
+
4. **Platform churn (LanRui).** LanRui migrated domain `lanrui-ai.com` → **`lanrui.co`** (old-domain data not
|
|
168
|
+
retained after **2024-11-01**) and retired its **T1/T2 zones on 2025-06-30**, moving users to a new "Cova"
|
|
169
|
+
platform — **re-verify current console paths/domain before scripting against any cached LanRui path**.
|
|
170
|
+
|
|
171
|
+
**Resume hook:** checkpoint-to-durable + load-latest-on-startup (principle #8) is still the right spine — here
|
|
172
|
+
it guards against a forgotten stop, a 10-day auto-release, and a `/hy-tmp` 24 h wipe, not a spot kill. The
|
|
173
|
+
cadence formula in `references/spot-resilience.md` still applies if a job is long enough to span a forced stop.
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## 5. TEARDOWN / BILLING *(principle #9 + the Iron Law)*
|
|
178
|
+
|
|
179
|
+
**The meter-stop verb is per-platform — bind it from the table below before clicking anything.** The Iron Law
|
|
180
|
+
(SKILL.md Phase 5) holds unchanged: NO release/return/destroy until checkpoints are **pulled to local AND
|
|
181
|
+
verified by load**, and the user has approved the cost-affecting action.
|
|
182
|
+
|
|
183
|
+
| Platform | Meter-stop verb | What it preserves | Cost trap |
|
|
184
|
+
|---|---|---|---|
|
|
185
|
+
| Matpool | **停止并释放** (stop+release) | `/mnt` netdisk persists (region-scoped) | `.snap` snapshots silently eat the 5 GB netdisk (MP1) |
|
|
186
|
+
| Gpushare | **关机** stops compute → **释放** deletes | `/hy-netdisk` persists; `/hy-tmp` cleared 24 h post-stop; `/root` **resets to image** | stopped instance **auto-released at 10 days** (GS4); arrears purge day-15 noon |
|
|
187
|
+
| Featurize | **实例归还** (return) | only `work` (`/home/featurize`) + `/cloud` persist | everything else **wiped immediately on return** (FZ1) |
|
|
188
|
+
| LanRui | **停止** stops compute; **must *销毁数据盘*** (destroy the 数据盘) to stop disk billing | 网盘 + 数据盘 persist | **数据盘 bills hourly while the workspace is merely STOPPED** (LR1) |
|
|
189
|
+
|
|
190
|
+
**The single most dangerous divergence: on LanRui, "stop to save money" is wrong.** The 数据盘
|
|
191
|
+
(`/home/user/datadisk`, block storage, bought in 200 G / 500 G specs) bills hourly from *creation* until
|
|
192
|
+
*destroyed*, even while the workspace is stopped — `工作空间停止运行,未销毁的数据盘也将持续计费` (verified
|
|
193
|
+
docs.lanrui.co storage + lanrui.co/pricing 2026-06). So a stopped LanRui workspace keeps a meter running. To
|
|
194
|
+
actually stop all billing: stop the workspace AND destroy the 数据盘 (after the Iron-Law pull+verify). The 网盘
|
|
195
|
+
(10 GB free, 0.15 元/GB·月 overage) persists separately. Contrast: on Matpool/Gpushare/Featurize,
|
|
196
|
+
release/return/归还 ends compute billing and the persistent volume simply survives (Gpushare /hy-netdisk and
|
|
197
|
+
/hy-nas bill per-GB but are not destroyed by stopping).
|
|
198
|
+
|
|
199
|
+
**Cost-pause analogs (cheaper than full release, data kept):** Gpushare **无卡模式 / 无卡启动** (low-core
|
|
200
|
+
CPU-only restart, no GPU) is the analog of AutoDL's no-GPU restart — keeps `/hy-netdisk` data while paused at a
|
|
201
|
+
fraction of the GPU rate, ideal for env-config + dataset download (verified gpushare 无卡启动 announcement
|
|
202
|
+
2026-06). LanRui supports an **auto-stop timer** (set a stop time at workspace start) and per-hour billing.
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
## 6. DAEMON TOOL
|
|
207
|
+
|
|
208
|
+
**tmux** is the family detach primitive — preinstalled on most images. Caveat (from Matpool docs, true
|
|
209
|
+
family-wide): **run tmux from a local SSH session, NOT the Jupyter web terminal** — keybindings collide with
|
|
210
|
+
tmux's prefix. A backgrounded `nohup python … </dev/null >log 2>&1 &` also survives a tab-close / page refresh
|
|
211
|
+
on Featurize (process not killed; only notebook cell state lost) — but tmux is preferred for a named,
|
|
212
|
+
re-attachable session.
|
|
213
|
+
|
|
214
|
+
tmux survives an **SSH drop** but **NOT** an instance **stop/restart** on any platform (on Gpushare the restart
|
|
215
|
+
resets `/root`, taking the tmux server and any `/root` logs with it) — so the durable spine is
|
|
216
|
+
checkpoint-to-persistent-disk (§2, principle #8), not the tmux session. LanRui additionally supports
|
|
217
|
+
**multi-machine multi-GPU distributed training** — if used, see `references/multinode.md`.
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
## 7. TOP GOTCHAS *(platform-pinned; universal ones → `references/gotchas_universal.md`)*
|
|
222
|
+
|
|
223
|
+
### Family-wide (China-specific, not in the universal catalog)
|
|
224
|
+
|
|
225
|
+
**CN1 — `/root` ephemerality silently loses work.**
|
|
226
|
+
Symptom: code/checkpoints written to `/root` vanish after a stop/restart (Gpushare) or instance return
|
|
227
|
+
(Featurize). → Root cause: the system disk resets to image state / is wiped on return — unlike AutoDL, which
|
|
228
|
+
persists `/root` across power-off. → Fix: write *everything* to the persistent mount (§2); treat `/root` as
|
|
229
|
+
RAM. Audit with `ls <persistent-mount>` after a test stop before trusting it for a real run.
|
|
230
|
+
|
|
231
|
+
**CN2 — GPU-idle auto-stop kills a quiet job.**
|
|
232
|
+
Symptom: a long job dies mid-run with no error; console shows "auto-stopped (idle)". → Root cause: an opt-in
|
|
233
|
+
idle-shutdown policy stopped the instance during a low-GPU-util phase (data loading, eval, between epochs).
|
|
234
|
+
→ Fix: disable idle-auto-stop for long jobs, or emit a periodic GPU-touching heartbeat; confirm the policy
|
|
235
|
+
state in Phase 0.
|
|
236
|
+
|
|
237
|
+
### Matpool (matpool.com)
|
|
238
|
+
|
|
239
|
+
**MP1 — `.snap` snapshots silently consume the 5 GB netdisk.**
|
|
240
|
+
Symptom: "保存环境" / snapshot saves fail or the netdisk fills with no obvious culprit. → Root cause: snapshots
|
|
241
|
+
are written as `.snap` files **into the netdisk** and count against its tiny 5 GB quota (verified matpool
|
|
242
|
+
snapshot docs 2026-06). → Fix: prune old `.snap` files (deleting one frees the quota); keep only the latest
|
|
243
|
+
needed env snapshot.
|
|
244
|
+
|
|
245
|
+
**MP2 — `/mnt` is excluded from snapshots, and the machine is locked while saving.**
|
|
246
|
+
Symptom: "保存环境" doesn't capture code under `/mnt`; the instance is unusable during the save. → Root cause:
|
|
247
|
+
a snapshot captures **everything *except* `/mnt`** (the netdisk mount), and the machine cannot be used while
|
|
248
|
+
the snapshot writes. → Fix: to *shrink* a snapshot move code/data to `/mnt` first (it won't be captured); to
|
|
249
|
+
*preserve* code via snapshot keep it OFF `/mnt`. Ensure no running process before triggering a save.
|
|
250
|
+
|
|
251
|
+
**MP3 — region-scoped netdisk strands data on a sweep across regions.**
|
|
252
|
+
Symptom: a second instance in another region can't see files written by the first; expanded storage "missing".
|
|
253
|
+
→ Root cause: `/mnt` netdisks are separate per region and do not interconnect. → Fix: keep all instances of a
|
|
254
|
+
sweep in one region; pick region before expanding (verified matpool FAQ 2026-06).
|
|
255
|
+
|
|
256
|
+
### Gpushare (gpushare.com)
|
|
257
|
+
|
|
258
|
+
**GS1 — `/root` resets to image state on every shutdown/restart.** (The instance of CN1 to remember by name.)
|
|
259
|
+
Symptom: installed packages / code / logs under `/root` gone after restart. → Root cause: only `/hy-tmp` and
|
|
260
|
+
`/hy-netdisk` survive a restart (and `/hy-tmp` only until 24 h after shutdown, GS5); `/` reverts to the image. → Fix: env on `/hy-netdisk`, hot data on `/hy-tmp`, results
|
|
261
|
+
synced to `/hy-netdisk` before stop.
|
|
262
|
+
|
|
263
|
+
**GS2 — turbo proxy left on blocks non-whitelisted hosts.**
|
|
264
|
+
Symptom: after `export …turbo.gpushare.com…`, `pip install` / `apt` / domestic mirrors hang or fail with `ProxyError`.
|
|
265
|
+
→ Root cause: the academic proxy whitelists only GitHub/HF/PyTorch/Kaggle and **restricts everything else**
|
|
266
|
+
(verified network_turbo docs 2026-06). → Fix: `unset http_proxy https_proxy` the moment the accelerated pull
|
|
267
|
+
finishes (§3). Same shape as the `no_proxy` trap in `references/china-network.md`.
|
|
268
|
+
|
|
269
|
+
**GS3 — `/hy-netdisk` absent on unmarked machines.**
|
|
270
|
+
Symptom: scripts referencing `/hy-netdisk` fail on some rentals. → Root cause: the shared netdisk exists only
|
|
271
|
+
on machines marked as supporting 内网存储. → Fix: check `mount | grep hy-netdisk` in Phase 0; fall back to
|
|
272
|
+
personal cloud storage via `oss cp` (OSS tool, ~300 Mbps, compressed archives only) if absent.
|
|
273
|
+
|
|
274
|
+
**GS4 — stopped instance auto-released at 10 days; arrears purge at day 15.** Symptom: a parked stopped
|
|
275
|
+
instance disappears, or shared/personal data is gone after non-payment. → Root cause: pay-as-you-go auto-
|
|
276
|
+
release 10 days after stop (`实例停止 10 天后自动释放`); on arrears, day-15-noon deletes personal data +
|
|
277
|
+
`/hy-nas` + custom images (verified gpushare docs 2026-06). → Fix: pull results off a stopped box promptly;
|
|
278
|
+
don't treat "stopped" as durable parking; keep the balance positive.
|
|
279
|
+
|
|
280
|
+
**GS5 — `/hy-tmp` auto-cleared 24 h after shutdown (and on migration).** *(NEW — corrects the prior "/hy-tmp
|
|
281
|
+
persists" assumption.)* Symptom: training data/scratch under `/hy-tmp` gone the day after a stop, even though
|
|
282
|
+
the instance still exists. → Root cause: `/hy-tmp` is per-server local scratch, auto-deleted 24 h after
|
|
283
|
+
shutdown and wiped on instance migration (verified gpushare.com/docs/data/storage 2026-06). → Fix: treat
|
|
284
|
+
`/hy-tmp` as IO scratch only; sync anything durable to `/hy-netdisk` before stopping; do NOT
|
|
285
|
+
`conda create -p /hy-tmp/...` for a persistent env (use `/hy-netdisk`).
|
|
286
|
+
|
|
287
|
+
### Featurize (featurize.cn)
|
|
288
|
+
|
|
289
|
+
**FZ1 — anything outside `work`/`/cloud` is wiped the instant the instance is returned.** (The strictest
|
|
290
|
+
"what survives" rule of the four.) Symptom: results outside `/home/featurize` or `/cloud` gone after 归还.
|
|
291
|
+
→ Root cause: only `work` (per-user cloud storage, `工作区可以一直保存项目文件`) and the `/cloud` sync drive
|
|
292
|
+
persist; everything else is destroyed on return (verified Featurize tutorials 2026-06). → Fix: write all
|
|
293
|
+
durable output under `work`/`/cloud`; verify before returning.
|
|
294
|
+
|
|
295
|
+
**FZ2 — `/cloud` sync drive lag makes edits *look* saved but not land.** Symptom: VS Code edits / files appear
|
|
296
|
+
saved locally but are missing after reconnect or return (the "工作区中修改代码后无法保存" complaint). → Root
|
|
297
|
+
cause: the Remote-SSH sync to the cloud drive is not always real-time, especially on slow links or large files.
|
|
298
|
+
→ Fix: explicit `Ctrl+S`, then verify on the server (`ls -la` / `cat` the file) before trusting it; on a
|
|
299
|
+
flaky connection, close and re-open the Remote-SSH session (transient failures are expected).
|
|
300
|
+
|
|
301
|
+
**FZ3 — 30 GB free cloud quota silently breaks large writes / `conda create`.** *(corrects the prior "~20 GB"
|
|
302
|
+
figure.)* Symptom: env creation or large copies into `work`/`/cloud` fail or truncate. → Root cause: the free
|
|
303
|
+
cloud storage is **30 GB** (verified featurize.cn 2026-06); over it, writes fail. → Fix: `du -sh ~/work /cloud`
|
|
304
|
+
to watch headroom; keep only the active env there; large reproducible scratch belongs on local
|
|
305
|
+
non-persistent disk, not the cloud drive.
|
|
306
|
+
|
|
307
|
+
### LanRui (lanrui.co / lanrui-ai.com)
|
|
308
|
+
|
|
309
|
+
**LR1 — 数据盘 keeps billing while the workspace is merely *stopped*.** (The most expensive divergence — see
|
|
310
|
+
§5.) Symptom: a stopped LanRui workspace still accrues cost. → Root cause: the 数据盘
|
|
311
|
+
(`/home/user/datadisk`) bills hourly from creation until *destroyed*, independent of workspace run-state
|
|
312
|
+
(`工作空间停止运行,未销毁的数据盘也将持续计费` — verified docs.lanrui.co storage 2026-06). → Fix: to stop
|
|
313
|
+
all billing, stop the workspace **and** 销毁 the 数据盘 — only after the Iron-Law pull+verify; the 网盘 keeps
|
|
314
|
+
only the data already stored on it — everything on the destroyed 数据盘 is gone.
|
|
315
|
+
|
|
316
|
+
**LR2 — shared `netdisk/data` folder mounted into every same-zone workspace → cross-run clobber.** Symptom:
|
|
317
|
+
a parallel ablation overwrites another run's outputs. → Root cause: `/home/user/netdisk/data` is auto-mounted
|
|
318
|
+
and shared across *all* workspaces in the same availability zone. → Fix: per-job isolated write paths
|
|
319
|
+
(`references/parallel_ablation.md`); never share a mutable output dir under `netdisk/data`. Also: the 网盘 is
|
|
320
|
+
poor at many-small-file *writes* — route those to the 数据盘.
|
|
321
|
+
|
|
322
|
+
**LR3 — platform/domain churn invalidates cached paths.** Symptom: scripted paths/domain fail post-migration.
|
|
323
|
+
→ Root cause: domain `lanrui-ai.com` → `lanrui.co` (old data dropped after 2024-11-01); T1/T2 zones retired
|
|
324
|
+
2025-06-30 → "Cova" platform. → Fix: re-verify console domain + paths in-session before scripting against any
|
|
325
|
+
cached LanRui path.
|
|
326
|
+
|
|
327
|
+
### Platform-specific debugging
|
|
328
|
+
|
|
329
|
+
Before trusting a run, in Phase 0 (per platform):
|
|
330
|
+
- **Confirm persistence path is real, not `/root`.** `mount | grep -E 'mnt|hy-netdisk|cloud|datadisk|netdisk'`
|
|
331
|
+
then `touch <persistent-mount>/.probe && ls -l <persistent-mount>/.probe`. On Gpushare also confirm
|
|
332
|
+
`/hy-netdisk` is present (GS3) — `mount | grep hy-netdisk` (absent on unmarked machines).
|
|
333
|
+
- **GPU + driver sanity.** `nvidia-smi` (GPU visible, mem free, driver/CUDA), then
|
|
334
|
+
`python -c "import torch;print(torch.__version__, torch.cuda.is_available(), torch.cuda.get_device_name(0))"`
|
|
335
|
+
against the prebuilt interpreter. Mismatched local-vs-server PyTorch silently breaks checkpoint loads on
|
|
336
|
+
Featurize — match versions.
|
|
337
|
+
- **Detect a stuck / throttled download.** `du -sh <cache-dir>` twice ~30 s apart — flat size = stalled (often
|
|
338
|
+
the GFW or a left-on Gpushare turbo proxy restricting a non-whitelisted host, GS2). Cross-check with
|
|
339
|
+
`curl -sI -x "$https_proxy" https://hf-mirror.com` / `env | grep -i proxy`; `unset http_proxy https_proxy`
|
|
340
|
+
and retry on the mirror.
|
|
341
|
+
- **Disk / inode pressure (the silent §2 risk).** `df -h <persistent-mount>` AND `df -i <persistent-mount>` —
|
|
342
|
+
a full inode table fails writes while `df -h` still shows free GB. On Matpool, a filling 5 GB netdisk is
|
|
343
|
+
usually stale `.snap` files (`ls -la /mnt/*.snap`, MP1).
|
|
344
|
+
- **Verify the meter-stop did what was intended.** After "stop", re-check the console billing line — on LanRui
|
|
345
|
+
a stopped workspace whose 数据盘 was NOT destroyed is still metering (LR1); on Gpushare a stopped box still
|
|
346
|
+
counts toward the 10-day auto-release clock (GS4).
|
|
347
|
+
- **Read the running job's log, don't infer from silence.** Job is in tmux/nohup → `tmux capture-pane -pt
|
|
348
|
+
<session>` or `tail -f <persistent-mount>/run.log`. A vanished tmux server after a "restart" means `/root`
|
|
349
|
+
reset (GS1) — the log must live on the persistent mount to survive.
|
|
350
|
+
|
|
351
|
+
---
|
|
352
|
+
|
|
353
|
+
## 8. SCRIPT OVERRIDES
|
|
354
|
+
|
|
355
|
+
Parameterize the `scripts/` templates per platform. `PROXY_HOOK`, `HF_HOME`, and the mirror env all defer to
|
|
356
|
+
`references/china-network.md`; only the **mounts** (and, on Gpushare, the proxy hook) truly differ.
|
|
357
|
+
|
|
358
|
+
| Var | Matpool | Gpushare | Featurize | LanRui |
|
|
359
|
+
|---|---|---|---|---|
|
|
360
|
+
| `DURABLE_DIR=` (durable) | `/mnt` | `/hy-netdisk` | `/home/featurize` (+`/cloud`) | `/home/user/datadisk` (or `/home/user/netdisk/data`) |
|
|
361
|
+
| `DATA_DIR=` (fast/ephemeral) | `/root` | `/hy-tmp` (24 h post-stop wipe) | local tmp | `/home/user/datadisk` scratch |
|
|
362
|
+
| `SCRATCH=` (local, prune) | `/root` | `/hy-tmp` | local tmp | 数据盘 scratch |
|
|
363
|
+
| `HF_HOME=` | `/mnt/.cache/hf` | `/hy-netdisk/.cache/hf` | `/cloud/.cache/hf` | `/home/user/datadisk/.cache/hf` |
|
|
364
|
+
| `PROXY_HOOK=` | (mirrors only) | `export …turbo.gpushare.com:<PORT>…` then `unset` | (mirrors only) | (mirrors only) |
|
|
365
|
+
| `CRED_FILE=""` (no file — env var) | `$WANDB_API_KEY` / `$HF_TOKEN` from the environment (or `.netrc` on the **ephemeral** disk), never the shared netdisk | same | same | same |
|
|
366
|
+
| `DETACH=` | tmux | tmux | tmux | tmux |
|
|
367
|
+
|
|
368
|
+
`CRED_FILE=""` because on these CN platforms the credential is an **env var** (or `.netrc`) on the ephemeral
|
|
369
|
+
disk, not a file on the netdisk — leave it empty so run_one's `[ -n "$CRED_FILE" ]` guard skips the file read
|
|
370
|
+
and `$WANDB_API_KEY` / `$HF_TOKEN` pass through from the platform env.
|
|
371
|
+
|
|
372
|
+
Common to all: the credential lives in an env var or `.netrc` on the **ephemeral system disk**, never on the
|
|
373
|
+
shared/persistent netdisk (a shared `data` folder mounted into every same-zone workspace, like LanRui's, is
|
|
374
|
+
especially leaky — universal secrets-off-shared-FS gotcha in `references/gotchas_universal.md`).
|
|
375
|
+
|
|
376
|
+
---
|
|
377
|
+
|
|
378
|
+
## 9. Per-platform comparison — the load-bearing differences at a glance
|
|
379
|
+
|
|
380
|
+
The seven questions the schema asks, answered per platform. This is the table to read first when picking which
|
|
381
|
+
delta applies.
|
|
382
|
+
|
|
383
|
+
| Question | Matpool | Gpushare | Featurize | LanRui |
|
|
384
|
+
|---|---|---|---|---|
|
|
385
|
+
| Prebuilt base-conda env? | yes (**`myconda`, auto-activated**) | yes (miniconda, base auto-activate **off**) | yes (full PyTorch/TF base, pip persists on `work`) | yes (image-provisioned; PyTorch images purchasable) |
|
|
386
|
+
| Academic-acceleration proxy? | no (source-switch scripts only) | **yes** `turbo.gpushare.com:<PORT>` (per-session, 7-host whitelist) | no (mirrors only) | no (mirrors only) |
|
|
387
|
+
| Shared / region FS? | `/mnt` netdisk (**region-scoped**, expandable) | `/hy-netdisk` (only on *marked* machines) + `/hy-nas` | `work`+`/cloud` (cloud sync; not cross-region, med-conf) | `/home/user/netdisk/data` (shared into *every same-zone* workspace) |
|
|
388
|
+
| Inode cap? | undocumented — measure `df -i` | undocumented — measure `df -i` | undocumented — measure `df -i` | undocumented — measure `df -i` |
|
|
389
|
+
| Data disk bills while **stopped**? | no (release ends billing) | no (but a stopped box is auto-released at 10 d; `/hy-tmp` is cleared 24 h after stop) | no (return ends billing) | **YES — 数据盘 bills until destroyed** |
|
|
390
|
+
| Meter-stop verb | 停止并释放 | 关机 → 释放 (+ 无卡模式 pause) | 实例归还 | **停止 + 销毁数据盘** |
|
|
391
|
+
| `/root` survives a stop? | local, lost on release | **NO — resets to image** | **NO — wiped on return** | system disk lost; use 数据盘 |
|
|
392
|
+
|
|
393
|
+
**Bottom line for porting an AutoDL workflow:** the SSH/tmux/smoke/checkpoint spine transfers verbatim; the
|
|
394
|
+
three things to re-bind per platform are (1) the **persistent mount** (never `/root`; on Gpushare never
|
|
395
|
+
`/hy-tmp` either), (2) the **meter-stop verb** — on LanRui stopping alone is not enough; the 数据盘 must also be
|
|
396
|
+
destroyed — and (3) the **proxy hook** (real proxy only on Gpushare, with a strict whitelist; mirrors-only
|
|
397
|
+
elsewhere → `references/china-network.md`).
|