opencode-skills-collection 3.1.2 → 3.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +4 -1
  2. package/bundled-skills/agent-creator/SKILL.md +246 -0
  3. package/bundled-skills/ax-extract-workflow/SKILL.md +156 -0
  4. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  5. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  6. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  7. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  8. package/bundled-skills/docs/sources/sources.md +1 -1
  9. package/bundled-skills/docs/users/bundles.md +1 -1
  10. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  11. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  12. package/bundled-skills/docs/users/getting-started.md +1 -1
  13. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  14. package/bundled-skills/docs/users/usage.md +4 -4
  15. package/bundled-skills/docs/users/visual-guide.md +4 -4
  16. package/bundled-skills/lovable-cleanup/SKILL.md +2 -1
  17. package/bundled-skills/remote-gpu-trainer/.gitattributes +8 -0
  18. package/bundled-skills/remote-gpu-trainer/LICENSE +21 -0
  19. package/bundled-skills/remote-gpu-trainer/README.md +267 -0
  20. package/bundled-skills/remote-gpu-trainer/SKILL.md +249 -0
  21. package/bundled-skills/remote-gpu-trainer/evals/README.md +57 -0
  22. package/bundled-skills/remote-gpu-trainer/evals/RESULTS.md +44 -0
  23. package/bundled-skills/remote-gpu-trainer/evals/cases.jsonl +14 -0
  24. package/bundled-skills/remote-gpu-trainer/evals/run_evals.py +68 -0
  25. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/README.md +72 -0
  26. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/queue_1.txt +6 -0
  27. package/bundled-skills/remote-gpu-trainer/profiles/_schema.md +100 -0
  28. package/bundled-skills/remote-gpu-trainer/profiles/autodl.md +327 -0
  29. package/bundled-skills/remote-gpu-trainer/profiles/china.md +397 -0
  30. package/bundled-skills/remote-gpu-trainer/profiles/generic-ssh.md +450 -0
  31. package/bundled-skills/remote-gpu-trainer/profiles/lambda.md +342 -0
  32. package/bundled-skills/remote-gpu-trainer/profiles/paperspace.md +365 -0
  33. package/bundled-skills/remote-gpu-trainer/profiles/runpod.md +164 -0
  34. package/bundled-skills/remote-gpu-trainer/profiles/vastai.md +355 -0
  35. package/bundled-skills/remote-gpu-trainer/references/china-network.md +206 -0
  36. package/bundled-skills/remote-gpu-trainer/references/gotchas_universal.md +704 -0
  37. package/bundled-skills/remote-gpu-trainer/references/lifecycle_checklist.md +148 -0
  38. package/bundled-skills/remote-gpu-trainer/references/monitoring_patterns.md +327 -0
  39. package/bundled-skills/remote-gpu-trainer/references/multinode.md +190 -0
  40. package/bundled-skills/remote-gpu-trainer/references/parallel_ablation.md +196 -0
  41. package/bundled-skills/remote-gpu-trainer/references/principles.md +179 -0
  42. package/bundled-skills/remote-gpu-trainer/references/self-improvement.md +74 -0
  43. package/bundled-skills/remote-gpu-trainer/references/spot-resilience.md +235 -0
  44. package/bundled-skills/remote-gpu-trainer/references/ssh_transport.md +270 -0
  45. package/bundled-skills/remote-gpu-trainer/references/training/by-domain.md +230 -0
  46. package/bundled-skills/remote-gpu-trainer/references/training/checkpoint-resume.md +368 -0
  47. package/bundled-skills/remote-gpu-trainer/references/training/convergence-debugging.md +187 -0
  48. package/bundled-skills/remote-gpu-trainer/references/training/data-pipeline.md +119 -0
  49. package/bundled-skills/remote-gpu-trainer/references/training/distributed-launch.md +422 -0
  50. package/bundled-skills/remote-gpu-trainer/references/training/oom-memory.md +338 -0
  51. package/bundled-skills/remote-gpu-trainer/references/training/precision-stability.md +401 -0
  52. package/bundled-skills/remote-gpu-trainer/references/training/throughput-profiling.md +451 -0
  53. package/bundled-skills/remote-gpu-trainer/scripts/aggregate_to_fs.sh +55 -0
  54. package/bundled-skills/remote-gpu-trainer/scripts/check_staleness.py +70 -0
  55. package/bundled-skills/remote-gpu-trainer/scripts/download_loop.sh +67 -0
  56. package/bundled-skills/remote-gpu-trainer/scripts/gpu_health.sh +169 -0
  57. package/bundled-skills/remote-gpu-trainer/scripts/health_patrol.sh.template +67 -0
  58. package/bundled-skills/remote-gpu-trainer/scripts/mem_monitor.sh +67 -0
  59. package/bundled-skills/remote-gpu-trainer/scripts/reap_vram_zombies.sh +175 -0
  60. package/bundled-skills/remote-gpu-trainer/scripts/run_one.sh.template +104 -0
  61. package/bundled-skills/remote-gpu-trainer/scripts/run_queue.sh.template +83 -0
  62. package/bundled-skills/remote-gpu-trainer/scripts/setup-china-mirrors.sh +35 -0
  63. package/bundled-skills/remote-gpu-trainer/scripts/verify_local.py +145 -0
  64. package/package.json +1 -1
  65. package/skills_index.json +66 -0
@@ -0,0 +1,190 @@
1
+ # Multi-node NCCL & elastic-training gotchas — ADVANCED
2
+
3
+ **Single-box users skip this entire file.** None of it applies to a single instance — one node, however many GPUs,
4
+ runs DDP/FSDP over NVLink/PCIe and never touches the inter-node NCCL transport, fabric-manager, or rendezvous logic
5
+ below. This file is **only** for jobs spanning ≥2 rented instances (multi-node DDP, FSDP, pipeline/tensor parallel,
6
+ or elastic training). It assumes the checkpoint-to-durable + idempotent-resume spine is already in place
7
+ (`references/principles.md` #8; cadence + atomic-write in `references/spot-resilience.md`) — multi-node only changes
8
+ *how the process group forms and breaks*, never the resume mechanism.
9
+
10
+ These are all **[P] platform/topology-specific** gotchas. The universal ones (disk, OOM, CRLF, silent sync, spot
11
+ grace) are **not** restated here — see `references/gotchas_universal.md`.
12
+
13
+ ## Table of contents
14
+
15
+ - **Fabric-manager** — one bad node hangs the WHOLE job at NCCL init (MN1)
16
+ - **NIC selection** — NCCL picks docker0/loopback/slow NIC (MN2)
17
+ - **Timeout masking** — default 1800 s hides a straggler/dead rank (MN3)
18
+ - **MTU mismatch** — jumbo frames silently dropped, small messages fine (MN4)
19
+ - **Elastic restart ≠ state restore** — torchrun `--max-restarts` (MN5)
20
+ - **Elastic Horovod pause-below-min-np** — pauses then errors (MN6)
21
+ - **First-move checklist** — bring up a healthy multi-node group
22
+
23
+ To jump: `grep -in <keyword> references/multinode.md` (e.g. `grep -in fabric references/multinode.md`).
24
+
25
+ ---
26
+
27
+ ## MN1 — Fabric-manager down on one node hangs the entire job at NCCL init
28
+
29
+ **Symptom.** Launch a multi-node job; every rank prints up to NCCL init then freezes — no traceback, no progress,
30
+ no OOM, just a silent hang at the first collective. Killing and relaunching reproduces the exact same stall. A
31
+ single-node run of the identical code works fine.
32
+
33
+ **Root cause.** On NVSwitch-based nodes (HGX/DGX A100/H100), `nvidia-fabricmanager` must be running and healthy on
34
+ **every** node for NVLink/NVSwitch routing to come up. One node with a stopped, crashed, or version-mismatched
35
+ fabric-manager cannot establish its NVLink fabric, so its ranks never join the collective — and because NCCL init is
36
+ a **global barrier**, all the healthy nodes block waiting for the one that can never arrive. The failure is global;
37
+ the cause is local to one box.
38
+
39
+ **Fix.**
40
+ - Check fabric-manager on **every** node before launching, not just the head:
41
+ `systemctl status nvidia-fabricmanager` (or `nvidia-smi -q | grep -i fabric`). It must be `active (running)` and
42
+ its version must match the driver on that node.
43
+ - Turn the silent hang into a diagnosable one: launch with `NCCL_DEBUG=INFO` (optionally `NCCL_DEBUG_SUBSYS=INIT,NET`).
44
+ The first node whose log **stops** before the others have printed their topology is the culprit.
45
+ - On a rental the fix is operational, not a reseat: restart the service (`systemctl restart nvidia-fabricmanager`)
46
+ if permitted, otherwise **stop that instance and re-rent a different box** — a fabric-manager that won't start is
47
+ usually a sick host (overlaps the Xid hardware-failure logic in `references/gotchas_universal.md`).
48
+
49
+ URL: https://support.crusoecloud.com/hc/en-us/articles/46061806112155-NCCL-Hangs-and-Multi-Node-Training-Stalls-Caused-by-Failed-nvidia-fabricmanager
50
+
51
+ ---
52
+
53
+ ## MN2 — NCCL picks the wrong NIC (docker0 / loopback / a slow interface)
54
+
55
+ **Symptom.** Multi-node init hangs forever, OR it connects but inter-node bandwidth is 10× too slow (allreduce
56
+ dominates the step time; single-node throughput was fine). `NCCL_DEBUG=INFO` shows NCCL binding to `docker0`, `lo`,
57
+ or a 1 GbE management NIC instead of the fast data-plane interface.
58
+
59
+ **Root cause.** NCCL auto-discovers network interfaces and, with no guidance, can select an unroutable bridge
60
+ (`docker0`), the loopback, or the slow management NIC — none of which carry traffic between the real nodes. The
61
+ job either never connects (unroutable) or runs over the wrong, slow path.
62
+
63
+ **Fix.** Pin the transport explicitly on **all** nodes (identical env on every rank):
64
+ - `export NCCL_SOCKET_IFNAME=<real-iface>` — a prefix filter; e.g. `eth`, `ens`, `bond`. Exclude bad ones with a
65
+ leading `^`: `NCCL_SOCKET_IFNAME=^docker0,lo`.
66
+ - On RDMA/InfiniBand fabrics pin the HCA: `export NCCL_IB_HCA=mlx5` (the active adapter prefix).
67
+ - **No RDMA (TCP-only rental):** `export NCCL_IB_DISABLE=1` so NCCL stops probing for nonexistent IB and falls back
68
+ to the socket path cleanly instead of stalling on IB discovery.
69
+ - Confirm the chosen interface with `NCCL_DEBUG=INFO` — the `NET/Socket` or `NET/IB` line names the bound device;
70
+ verify it is the fast data-plane NIC, not the bridge.
71
+
72
+ URL: https://github.com/NVIDIA/nccl/issues/1580
73
+
74
+ ---
75
+
76
+ ## MN3 — Default 1800 s NCCL timeout masks a straggler or a dead rank
77
+
78
+ **Symptom.** A run that was progressing freezes at a collective for exactly **30 minutes**, then dies with a
79
+ `Watchdog ... collective ... timed out` / `NCCL timeout` error — or worse, hangs far longer because the watchdog is
80
+ off. The real failure (a rank that OOM'd, crashed, or fell behind) happened 30 min earlier and is buried.
81
+
82
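+ **Root cause.** The process-group timeout that PyTorch applies to NCCL collectives has long defaulted to **1800 s** (recent PyTorch releases lower the NCCL default to 600 s — check your version). When one rank dies or stalls, the others sit in the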
83
+ collective waiting out the full 30-minute window before anything surfaces — so the symptom appears half an hour after
84
+ the cause, and a transient straggler can trip a hard abort it should have survived.
85
+
86
+ **Fix.**
87
+ - **Fail fast on a dead rank:** `export NCCL_ASYNC_ERROR_HANDLING=1` (newer PyTorch: `TORCH_NCCL_ASYNC_ERROR_HANDLING=1`)
88
+ so a crashed/unreachable rank tears the group down promptly instead of waiting out the timeout — the surviving ranks
89
+ get an actionable error near the true failure point.
90
+ - **Tune the window deliberately, both directions.** Genuinely slow collectives (huge allreduce, slow checkpoint
91
+ barrier) need a **longer** timeout to avoid false aborts: raise it via the process-group init
92
+ (`torch.distributed.init_process_group(..., timeout=timedelta(minutes=60))`). To surface a hung straggler **sooner**,
93
+ lower it. The default rarely fits a real job — set it on purpose.
94
+ - Pair with MN1's `NCCL_DEBUG=INFO` so the timeout error names which rank went silent.
95
+
96
+ URL: https://repost.aws/questions/QURXddiuikQLesRDGz39RhIw/nccl-socket-timeout-when-using-large-dataset-in-multi-node-pretraining
97
+
98
+ ---
99
+
100
+ ## MN4 — Jumbo-frame MTU mismatch silently drops large NCCL frames
101
+
102
+ **Symptom.** Small collectives work (rendezvous succeeds, tiny tensors allreduce fine), but the job hangs or throws a
103
+ transport error the moment a **large** payload is sent — large gradient buckets, the first big allreduce, or a model
104
+ broadcast. The break correlates with message **size**, not with which ranks are involved.
105
+
106
+ **Root cause.** The container's interface is configured for jumbo frames (MTU 9000) but the host veth / bridge it
107
+ attaches to is still at 1500 (or vice-versa). Small packets fit under the smaller MTU and pass; oversized frames are
108
+ silently dropped at the mismatched hop with no application-level error, so NCCL stalls waiting for data that never
109
+ lands. Classic on containerized rentals where the container MTU and the host bridge MTU were set independently.
110
+
111
+ **Fix.**
112
+ - Match MTU end to end: set the **host veth/bridge** to the same MTU as the container interface (9000 ↔ 9000, or drop
113
+ both to 1500). Inspect with `ip link show` on both the container and the host side.
114
+ - Quick confirm the path actually carries jumbo frames between nodes:
115
+ `ping -M do -s 8972 <other-node-ip>` (8972 = 9000 − 28 bytes of IPv4 + ICMP headers). If it fails but `-s 1472` succeeds, the large
116
+ frames are being dropped → fix the MTU, do not blame NCCL.
117
+ - If the host bridge MTU cannot be changed on the rental, **lower** the container interface's MTU to 1500 to match — a
118
+ uniform-but-smaller MTU works; a mismatch does not.
119
+
120
+ URL: https://github.com/moby/moby/issues/4378
121
+
122
+ ---
123
+
124
+ ## MN5 — torchrun / TorchElastic `--max-restarts` restarts the process group but does NOT restore training state
125
+
126
+ **Symptom.** A worker dies (preemption, transient fault); torchrun's `--max-restarts=N` dutifully re-runs rendezvous
127
+ and relaunches **all** workers — but training resumes from **step 0** (or the wrong epoch), silently throwing away the
128
+ progress before the failure. The restart "worked" yet the run is set back hours.
129
+
130
+ **Root cause.** TorchElastic guarantees only that the **worker group** is reconstituted: it re-runs the c10d
131
+ rendezvous, re-derives `world_size`/`rank`, and relaunches every worker process. It does **not** persist or reload
132
+ training state — that is entirely the script's responsibility. A `--max-restarts` with no in-script
133
+ load-latest-checkpoint just re-runs `main()` from scratch on every restart.
134
+
135
+ **Fix.**
136
+ - The per-epoch (or per-N-step) snapshot is what restores state, exactly per `references/principles.md` #8: write
137
+ full state (model + optimizer + LR scheduler + epoch/step + RNG + dataloader position) **atomically**
138
+ (`tmp`→`fsync`→`os.rename`), and **load-latest unconditionally at the top of every launch** so a torchrun restart
139
+ resumes instead of restarting. Cadence formula + atomic-write detail live in `references/spot-resilience.md`.
140
+ - Use the c10d rendezvous backend hosted on a `host:port` (no etcd dependency); set `--max-restarts` to survive the
141
+ expected number of preemptions, not 0.
142
+ - **REQUIRED:** treat the restored run as a *resume of the identical config*, never a hand-patched relaunch — a
143
+ silently-restarted or reshuffled run is the exact contamination `verifying-dl-experiments` guards against; confirm
144
+ the resumed step/epoch against the loaded checkpoint before trusting any post-restart metric.
145
+
146
+ URL: https://docs.pytorch.org/tutorials/beginner/ddp_series_fault_tolerance.html
147
+
148
+ ---
149
+
150
+ ## MN6 — Elastic Horovod pauses below `--min-np`, then errors at `HOROVOD_ELASTIC_TIMEOUT`
151
+
152
+ **Symptom.** Under elastic Horovod (`horovodrun -np 8 --min-np 4 --max-np 12`), enough workers get preempted to drop
153
+ the live count below `--min-np`; the job does **not** fail immediately — it appears to hang (paused, no progress) —
154
+ and then, minutes later, dies with a timeout error. Operators waiting for an instant failure miss the pause window.
155
+
156
+ **Root cause.** Elastic Horovod **pauses** (does not fail) when available workers fall below `--min-np`, waiting for
157
+ capacity to return. It only errors once `HOROVOD_ELASTIC_TIMEOUT` (default **600 s**) elapses without recovering to
158
+ `--min-np`. So a too-high `--min-np` turns a routine couple-of-preemptions event into a hard failure after a silent
159
+ 10-minute wait.
160
+
161
+ **Fix.**
162
+ - Set `--min-np` **low enough** that the typical number of concurrent preemptions does not breach it — survivors keep
163
+ training through membership changes instead of pausing.
164
+ - Raise `HOROVOD_ELASTIC_TIMEOUT` if the spot tier's capacity routinely returns slower than 600 s, so a temporary
165
+ capacity dip resumes rather than aborts.
166
+ - **Pin LR-scaling and data-sharding to `--max-np`, not the live worker count** — otherwise the effective learning
167
+ rate and shard assignment drift on every membership change, quietly corrupting the run (a `verifying-dl-experiments`
168
+ concern: a metric from a run whose LR silently rescaled is not a clean datapoint).
169
+
170
+ URL: https://horovod.readthedocs.io/en/stable/elastic_include.html
171
+
172
+ ---
173
+
174
+ ## First-move checklist — bring up a healthy multi-node group
175
+
176
+ Run these checks, in this order, **before** trusting any multi-node throughput number; they isolate MN1–MN4 cheaply (no full job needed):
177
+
178
+ - [ ] **Every** node: `systemctl status nvidia-fabricmanager` → `active (running)`, version matches driver (MN1).
179
+ - [ ] Identical NCCL env exported on **all** ranks: `NCCL_SOCKET_IFNAME`, `NCCL_IB_HCA` (or `NCCL_IB_DISABLE=1`), and
180
+ the chosen `init_process_group` timeout (MN2, MN3).
181
+ - [ ] MTU path check between nodes: `ping -M do -s 8972 <other-node-ip>` succeeds, or both ends pinned to 1500 (MN4).
182
+ - [ ] First real launch with `NCCL_DEBUG=INFO` + `NCCL_ASYNC_ERROR_HANDLING=1`; confirm each rank's `NET/...` line
183
+ names the fast data-plane NIC, not a bridge (MN2, MN3).
184
+ - [ ] In-script load-latest-checkpoint verified to fire on restart **before** relying on `--max-restarts` /
185
+ elastic membership recovery (MN5, MN6; spine in `references/principles.md` #8).
186
+ - [ ] Distributed jobs checkpoint **more** often than single-GPU — one preemption wastes N× compute; cadence per
187
+ `references/spot-resilience.md`.
188
+
189
+ Fanning a *sweep* across nodes (independent cells, not one job over many nodes) is not covered by this file — use
190
+ `references/parallel_ablation.md` + **REQUIRED** `superpowers:dispatching-parallel-agents` instead.
@@ -0,0 +1,196 @@
1
+ # Parallel Ablation Fan-out — FS-shared deployment, isolated write paths, reconciliation
2
+
3
+ Run N ablation cells in parallel across instances/queues without corrupting shared state, then
4
+ reconcile and re-verify every cell before any teardown. The mechanism is **one job per cell with an
5
+ isolated write path**; the discipline is **`superpowers:dispatching-parallel-agents`'s independence
6
+ predicate + reconciliation**. **REQUIRED:** `superpowers:dispatching-parallel-agents` and
7
+ **REQUIRED:** `superpowers:verification-before-completion`.
8
+
9
+ To jump: `grep -in <keyword> references/parallel_ablation.md`.
10
+
11
+ ## Table of contents
12
+
13
+ 1. The fan-out model (one job per cell)
14
+ 2. FS-shared wrapper deployment (place once, never mutate mid-run)
15
+ 3. The independence predicate (isolated write path = the analogue of a git worktree)
16
+ 4. The portable job request (describe once, run on any profile)
17
+ 5. Queue-file format + resume via `start_index`
18
+ 6. Mandatory post-fan-out reconciliation + full re-verify
19
+ 7. Gotchas
20
+
21
+ ---
22
+
23
+ ## 1. The fan-out model
24
+
25
+ Parallelism comes from running **multiple queues on multiple instances simultaneously** — never from
26
+ parallel jobs inside one instance (sequential per instance keeps memory predictable and prevents disk
27
+ contention). The unit of work is the **ablation cell**: one
28
+ `(cfg, task, epochs)` row → one `run_one` invocation → one isolated output directory.
29
+
30
+ ```
31
+ shared FS: /path/to/shared/run_one.sh, run_queue.sh (ONE version, all instances read it)
32
+ instance A tmux ──> run_queue.sh queueA.txt ──> cell a1 ──> cell a2 ──> ...
33
+ instance B tmux ──> run_queue.sh queueB.txt ──> cell b1 ──> cell b2 ──> ...
34
+ instance C tmux ──> run_queue.sh queueC.txt ──> cell c1 ──> ...
35
+ each cell writes ONLY to its own /ckpt/<name>/ + FS/final_ckpts/<name>/
36
+ ```
37
+
38
+ Split the N cells across queue files (one per instance) by cost, not count — route the long cells
39
+ (detection at 50 epochs) onto faster/idle instances so the queues finish near-simultaneously.
40
+
41
+ ---
42
+
43
+ ## 2. FS-shared wrapper deployment
44
+
45
+ Place a **single copy** of `run_one`/`run_queue` on the cross-instance shared filesystem
46
+ (`profiles/<platform>.md` STORAGE names the mount; on AutoDL it is the FS tier, on RunPod a Network
47
+ Volume, on a bare box a synced NFS/`rsync` target). Every instance reads the **same version** — no
48
+ per-instance drift, no "fixed it on A but not B."
49
+
50
+ **Recall principle #6 — never mutate inputs under a live run.** A running queue does not hold
51
+ `run_queue.sh`/`run_one.sh` in memory — bash reads them from disk incrementally, by byte-offset; overwriting either mid-run lands bash in the
52
+ middle of a *different* file and re-executes blocks (duplicate runs, stalled queues). Therefore:
53
+
54
+ - **Deploy the wrapper before launching any queue.** Treat the FS copy as immutable for the fan-out's
55
+ lifetime. Edit only when nothing reads it (`pgrep -af run_queue.sh` empty on every instance).
56
+ - **Appending to a queue *file* mid-flight is safe** (streaming read re-reads on each iteration);
57
+ editing the *script* is not. New cells → append a line, or start a fresh queue file.
58
+ - A fix that must reach in-flight jobs → **version the filename** (`run_one.v2.sh`), drain the old
59
+ queues, point new queues at the new file. Never `scp` over the path a live queue is reading.
60
+
61
+ The FS copy is also the durable safety net: `run_one`'s post-success step syncs `best.pth` +
62
+ metrics + log to `FS/final_ckpts/<name>/`, so a released/dead instance still leaves its cell's result on the FS.
63
+
64
+ ---
65
+
66
+ ## 3. The independence predicate
67
+
68
+ **REQUIRED:** `superpowers:dispatching-parallel-agents` — fan out only over work whose units share no
69
+ mutable state. Here the predicate is concrete: **each cell writes to its own output directory and
70
+ nothing else.** The per-job output dir is the platform analogue of a **git worktree** — an isolated
71
+ workspace where one agent's writes can never collide with another's.
72
+
73
+ Hold the predicate by routing every per-cell write to a name-scoped path:
74
+
75
+ | Write target | Isolation key | Set via |
76
+ |---|---|---|
77
+ | checkpoints | `<ckpt_root>/<name>/` | `training.checkpoint_dir` override (per `<name>`) |
78
+ | FS final copy | `FS/final_ckpts/<name>/` | `run_one` post-success sync |
79
+ | tracker run | `group=<task>_<cat>`, unique run name | `wandb.group` / `wandb.tags` overrides |
80
+ | per-cell log | `<name>.log` | `run_queue` per-line logging |
81
+
82
+ **Never fan out onto shared mutable output.** Two cells writing `latest.pth`, the same
83
+ `checkpoint_dir`, or one tracker run id = the exact shared-state violation the predicate forbids —
84
+ it produces silently interleaved checkpoints and unattributable metrics, which no amount of
85
+ post-hoc reconciliation can untangle. The `<name>` derives 1:1 from the cfg, so distinct cfgs →
86
+ distinct paths automatically; **two queue lines must never share a `<name>`.**
87
+
88
+ What is read-shared (the immutable wrappers, the dataset, the base image) is fine — the predicate
89
+ only forbids shared **mutable** state.
90
+
91
+ ---
92
+
93
+ ## 4. The portable job request
94
+
95
+ Describe a sweep once so the *same* fan-out runs against any profile (the launcher resolves it
96
+ against `profiles/<platform>.md`; the profile supplies paths/verbs, the job supplies the work — see
97
+ `profiles/_schema.md`):
98
+
99
+ ```yaml
100
+ resources:
101
+ gpu: {name: A100, count: 1, memory: 24GB+} # a CONSTRAINT, never a platform SKU
102
+ disk: 100GB # ckpt_size × cells_per_instance + scratch
103
+ candidates: [autodl, china, runpod] # ordered fallback → describe once, run anywhere
104
+ run: "bash run_queue.sh queue.txt" # the per-instance entry point
105
+ ```
106
+
107
+ Per-instance disk budget = `ckpt_size × cells_in_this_queue + scratch` (principle #5). Pre-compute it
108
+ in Phase 0; a fan-out that under-budgets disk fails the *last* cells of each queue, not the first.
109
+
110
+ ---
111
+
112
+ ## 5. Queue-file format + resume
113
+
114
+ One ablation cell per line, whitespace-separated (`while IFS=' ' read -r cfg task epochs`):
115
+
116
+ ```
117
+ <cfg_path> <task> [epochs]
118
+ ```
119
+
120
+ - `cfg_path` — yaml file relative to repo root; its basename is the cell `<name>` (the isolation key).
121
+ - `task` — reconstruction / segmentation / detection (or other supported task) — sets tracker group/tags.
122
+ - `epochs` — optional integer; omitted → wrapper default (e.g. `20`). The optional 3rd field lets one
123
+ queue mix per-task budgets (detection 50, recon/seg 20).
124
+
125
+ ```
126
+ configs/experiments/ablation/recon/recon_baseline.yaml reconstruction 20
126
+ configs/experiments/ablation/det/det_baseline.yaml detection 50
128
+ configs/experiments/ablation/seg/no_aug.yaml segmentation
129
+ ```
130
+
131
+ **Resume via `start_index`.** A queue killed at cell k (SSH drop, preemption, OOM) resumes with
132
+ `bash run_queue.sh queue.txt <k>` — it skips the first k lines and continues. This is the queue-level
133
+ form of principle #8 (idempotent resume); combined with per-cell checkpoint-load-on-startup, a
134
+ half-finished cell resumes mid-cell, not from scratch. Keep `start_index` aligned to the queue file:
135
+ appending lines is safe, **reordering or deleting earlier lines shifts every index** — append only.
136
+
137
+ ---
138
+
139
+ ## 6. Mandatory post-fan-out reconciliation + full re-verify
140
+
141
+ **REQUIRED:** `superpowers:dispatching-parallel-agents` (reconcile) and
142
+ **REQUIRED:** `superpowers:verification-before-completion` (evidence before any success claim). When
143
+ queues report done, the watcher's "done" is a **claim** (principle #3), not ground truth — a cell can
144
+ report success despite a silently-failed sync, die of OOM mid-write, or never run at all because its instance died.
145
+
146
+ Reconcile and re-verify **every cell before any teardown** — this is a hard gate, not a spot check:
147
+
148
+ 1. **Roster.** Enumerate the expected cell `<name>` set from all queue files (the ground-truth roster).
149
+ 2. **Reconcile.** For each `<name>`, confirm `FS/final_ckpts/<name>/` exists and holds `best.pth` +
150
+ metrics + log. List the delta: missing, zero-byte, or duplicate-`<name>` collisions (a predicate
151
+ violation that slipped through — see Gotchas).
152
+ 3. **Re-verify by load.** Run `scripts/verify_local.py` over the durable copies — *load* each
153
+ checkpoint and metrics file. "The file exists" / "the log said synced" is not evidence; a load
154
+ that succeeds is (principle #3; the `verifying-dl-experiments` boundary owns whether the *number*
155
+ is real — **REQUIRED:** `verifying-dl-experiments`).
156
+ 4. **Remediate, never blind-retry.** Each missing/failed cell → classify the cause, then re-launch the
157
+ **identical config** (principle #7) on a live instance via `start_index`, or append its line to a
158
+ fresh queue. Do not patch one cell's config to make it pass — that destroys comparability.
159
+
160
+ Only after the roster is 100% reconciled AND every cell loads does the teardown Iron Law unlock
161
+ (SKILL.md Phase 5): no `release`/`terminate`/`destroy` until results are pulled to local AND verified
162
+ by load AND the user approves the cost-affecting action.
163
+
164
+ ---
165
+
166
+ ## 7. Gotchas
167
+
168
+ **Two cells share a `<name>` → interleaved checkpoints, unattributable metrics.**
169
+ Symptom: one cell's `best.pth` overwritten, a tracker run with mixed curves, reconciliation finds N-1
170
+ output dirs for N cells. → Root cause: independence-predicate violation — two queue lines mapped to
171
+ the same isolation key (same cfg basename / hand-set identical `checkpoint_dir`). → Fix: enforce
172
+ distinct `<name>` per line *before* launch (the cfg→`<name>` map must be injective); on collision,
173
+ rename one cfg and rerun both — interleaved output cannot be un-mixed after the fact.
174
+
175
+ **Editing the FS wrapper mid-fan-out → duplicate / stalled cells across instances.**
176
+ Symptom: cells re-run or queues hang after a "quick fix" to `run_queue.sh`/`run_one.sh` on the FS. →
177
+ Root cause: principle #6 — live bash holds the script by byte-offset; overwriting the shared copy
178
+ corrupts every reader at once. → Fix: treat the FS wrapper as immutable for the fan-out's lifetime;
179
+ version the filename and repoint new queues; edit only when `pgrep -af run_queue.sh` is empty
180
+ everywhere.
181
+
182
+ **Queue reports "all done" but a cell never ran.**
183
+ Symptom: roster has N cells, FS has fewer; no error in the surviving logs. → Root cause: the instance
184
+ died (released, preempted, host fault) and its queue's "done" was never emitted — absence of failure
185
+ is not presence of success (principle #3). → Fix: reconcile against the **roster**, not against the
186
+ watcher's last status; re-launch missing cells with `start_index` on a live instance.
187
+
188
+ **`start_index` resumes the wrong cell after a queue edit.**
189
+ Symptom: resume skips or re-runs the wrong rows. → Root cause: a line was inserted/deleted/reordered,
190
+ shifting every subsequent index. → Fix: append-only to in-flight queue files; to drop a cell, comment
191
+ it (don't delete) so indices stay stable, or start a fresh queue file for the remainder.
192
+
193
+ > Universal gotchas (SSH drop on `pkill`, CRLF, cgroup OOM, silent sync, inode exhaustion on
194
+ > many-small-files eval output across a shared FS) are **not** restated here — see
195
+ > `references/gotchas_universal.md`. Shared-FS inode pressure (principle #5) bites hardest exactly
196
+ > during fan-out, when N cells write eval artifacts to one FS at once.
@@ -0,0 +1,179 @@
1
+ # Operating Principles — the 10 invariants, expanded
2
+
3
+ These are the *why* behind every phase and gotcha. They hold on any **metered, isolated, rented GPU**
4
+ — AutoDL, RunPod, vast.ai, Lambda, Paperspace, a Chinese platform, a bare SSH box, Slurm, or K8s. Only
5
+ the concrete paths/CLI change (those live in `profiles/<platform>.md`). Internalize these; the recipes
6
+ follow. The one-line form is in `SKILL.md`; this file carries the cross-platform nuance.
7
+
8
+ To jump: `grep -n '^## ' references/principles.md`.
9
+
10
+ ---
11
+
12
+ ## 1. Minimize paid wall-clock
13
+
14
+ The meter runs the *entire* time the box is up, not just while the GPU computes. Three consequences:
15
+ smoke-test correctness **locally on CPU before renting** (principle #2); **launch detached and hand
16
+ control back** rather than babysitting a blocking `sleep`; and **release the instant verification
17
+ passes** (principle #9 governs the *who-decides*). Every idle paid minute — a stuck download, a forgotten
18
+ box overnight, a human-in-the-loop pause on a live instance — is money.
19
+
20
+ *Universal.* Even on Slurm where the "meter" is walltime/fairshare quota rather than dollars, the same
21
+ discipline applies: don't hold an allocation idle.
22
+
23
+ ---
24
+
25
+ ## 2. Cheap checks before expensive compute
26
+
27
+ A CPU smoke (1–2 batches, logger disabled, tiny shapes) kills import errors, config drift, tensor-shape
28
+ and measurement-**scale** bugs for ~free, **before** they bill GPU-hours. It is *necessary, not
29
+ sufficient* — it won't catch convergence — but it catches the dumb-and-expensive failures that otherwise
30
+ only surface after an instance spins up.
31
+
32
+ *Boundary:* this skill owns *when* to run the smoke (the pre-rent gate). The smoke's *content* — what to
33
+ assert, how to shrink the problem — belongs to **`verifying-dl-experiments`**. Don't duplicate it here.
34
+
35
+ ---
36
+
37
+ ## 3. Trust artifacts you loaded, not log lines that claim success
38
+
39
+ "synced / saved / done / 100% complete" is a **claim**, and claims lie under a silently-failed write —
40
+ a full disk, exhausted inodes, a swallowed error, a half-uploaded blob. Confirm the file **exists and
41
+ loads** before releasing the only copy.
42
+
43
+ **A watcher's own state is also a claim**, not ground truth. An async condition-waiter whose job you
44
+ superseded polls a marker that will never arrive (a zombie that loops forever). A session-scoped monitor
45
+ dies on context reset while the job runs on. Reconcile watchers against the job's *real* process and
46
+ artifacts (`tmux ls` / `squeue` / `pgrep`, output `mtime`, a load-test), tear a watcher down when you
47
+ supersede its job, and match a watcher's lifetime to the wait's duration.
48
+
49
+ > **Monitoring physics this rests on:** foreground Bash hard-caps at 600 s (a long foreground wait is
50
+ > killed at 10 min); `run_in_background` has **no** cap and notifies on exit; a never-*exiting* watcher
51
+ > never notifies; an unquoted `|` inside a poll regex splits into piped commands and the first reads
52
+ > stdin → hangs forever. See `references/monitoring_patterns.md`.
53
+
54
+ *Universal — the load-bearing spine.* It is the platform instance of
55
+ `superpowers:verification-before-completion`'s Iron Law ("no completion claim without fresh verification
56
+ evidence"). Shared with `verifying-dl-experiments`.
57
+
58
+ ---
59
+
60
+ ## 4. Know what survives stop vs destroy
61
+
62
+ **The single biggest portability trap.** AutoDL persists `/root` across a power-off — so the AutoDL
63
+ habit is "just 关机 (power off), my data's fine." That assumption is **false almost everywhere else**:
64
+
65
+ - **RunPod** wipes the *container disk* on stop; only the *volume disk* (`/workspace`) survives a stop,
66
+ and only a **Network Volume** survives a terminate.
67
+ - **vast.ai** keeps disk across a stop but **bills it forever**; a destroy loses everything.
68
+ - **K8s** wipes the pod filesystem on every reschedule unless a PVC is mounted.
69
+ - **Colab** loses `/content` and RAM on disconnect.
70
+
71
+ So the principle is not a path — it's a **discipline**: for each platform, before Phase 0, read the
72
+ profile's STORAGE survival-matrix and write your checkpoints to the mount that survives the teardown verb
73
+ you intend to use. The data you need most often lives on the *volatile* tier by default.
74
+
75
+ *Mixed:* the *rule* is universal; the *which-mount* value is a profile fact.
76
+
77
+ ---
78
+
79
+ ## 5. Storage fails on the dimension — and the location — you're not watching
80
+
81
+ Disk dies on **inodes before bytes** (`df -h` shows 34% while `cp` fails "No space left" because `df -i`
82
+ is at 100% — classic on a shared FS full of many-small-files eval output). The real space hog often
83
+ lives where you didn't look — a **symlinked cache** (`~/.cache/huggingface` mapped onto the data disk)
84
+ can outweigh the `runs/` you created. **Audit with `du` on the actual mount, not assumptions.** Clean by
85
+ **value**: keep the tiny irreplaceable evidence (metric/eval JSONs), discard the large reproducible
86
+ scratch (periodic checkpoints, unused model caches — one observed sweep left **179 GB** of superseded `latest.pt`/`epoch_*.pt` while the real evidence was **<200 MB** of JSON). Pre-compute the budget; monitor `df -i`, not just
87
+ `df -h`.
88
+
89
+ *Mixed:* the inode-cap *number* is a profile fact (AutoDL/China enforce ~200K inodes; RunPod/vast/Lambda specify
90
+ GB quotas with no documented inode cap). The "audit the real mount, clean by value" discipline is core.
91
+ The general fix for the many-small-files trap is **shard into tar** (WebDataset) — see
92
+ `references/gotchas_universal.md` U25.
93
+
94
+ ---
95
+
96
+ ## 6. Never mutate inputs under a live run
97
+
98
+ A running job tracks its scripts **by byte-offset, not as an in-memory copy**. A tmux-launched `run_queue.sh` stays open for its whole run; bash
99
+ reads a script by seeking to a saved offset, so `scp`-ing a new version mid-run makes bash land in the
100
+ middle of a *different* file and re-execute blocks (duplicate runs, stalled queues). Version filenames;
101
+ edit only when nothing is reading them (`pgrep -af <script>` empty).
102
+
103
+ *Universal — pure bash/tmux physics.* Identical across every SSH backend.
104
+
105
+ ---
106
+
107
+ ## 7. Design for retry — failure is probabilistic, transfers are flaky, mirrors are route-specific
108
+
109
+ Some fraction of identical launches die (a network blip during `wandb.init`, a transient kernel fault, a
110
+ spot preemption). Wrappers must be **idempotent and resumable**; retry the **identical config** rather
111
+ than hand-patching one run (which destroys comparability — see `verifying-dl-experiments`).
112
+
113
+ **Bulk transfers are the prototypical flaky step:** wrap them in `timeout`+resume retry loops — a stall
114
+ ≠ permanent failure, and resumable downloads accumulate progress across kills. An acceleration
115
+ **mirror/proxy/cache speeds ONE route, not all** — it may cover the metadata/API path while the bulk-data
116
+ path (a CDN/blob backend) still fails, and a *domestic* source routed through a *foreign*-acceleration
117
+ proxy is slower. Match the route to the origin; validate a speed test on the **same route** the real
118
+ transfer uses (a no-proxy probe of a proxied transfer measures nothing).
119
+
120
+ *Universal.* The **spot/preemption** sub-case is profile-parameterized (central on vast/RunPod; on
121
+ Lambda/Paperspace/China the interruption is auto-shutdown/auto-release/capacity instead) — see principle
122
+ #8 and `references/spot-resilience.md`.
123
+
124
+ ---
125
+
126
+ ## 8. Checkpoint-to-durable + idempotent resume is the universal spine
127
+
128
+ Detaching the job is necessary but not sufficient. The **one** mechanism that survives every failure
129
+ mode — SSH drop, Slurm walltime kill, K8s pod reschedule, spot preemption, Colab disconnect — is:
130
+
131
+ 1. **Checkpoint full state to the platform's durable location** on a periodic timer (model + optimizer +
132
+ LR-scheduler + epoch/step + RNG + dataloader position), written **atomically** (`tmp`→`fsync`→
133
+ `os.rename`) so a mid-write kill never corrupts the latest good checkpoint.
134
+ 2. **Load-latest-on-startup unconditionally**, so the *identical launch command* resumes instead of
135
+ restarting. This is what makes principle #7's "retry the identical config" actually resume progress.
136
+
137
+ The **detach primitive is the swappable plug** — tmux on a bare box, `sbatch --requeue` on Slurm, a Job
138
+ manifest on K8s, a Save&Run commit on Kaggle, a checkpoint-to-Drive loop on Colab. Checkpoint+resume is
139
+ the invariant underneath all of them.
140
+
141
+ *Universal.* Cadence is a formula, not a guess — Young/Daly `W = √(2·μ·C)` (W = checkpoint interval, μ = mean time between
142
+ preemptions, C = checkpoint write time); round *down* to an iteration boundary. Managed frameworks
143
+ (SkyPilot Managed Jobs, SageMaker) move the box for you but **restart your process from scratch — your
144
+ checkpoint-load is what restores progress.** Details + worked numbers in `references/spot-resilience.md`.
145
+
146
+ ---
147
+
148
+ ## 9. Cost and destructive actions are the user's call
149
+
150
+ Never auto-release/terminate an instance, never delete durable/shared files without explicit
151
+ confirmation, and if your own cleanup can't free enough space, **ask to expand the disk** (state the GB
152
+ needed) rather than silently shrinking the experiment (fewer seeds, smaller eval, capped vis).
153
+
154
+ This is sharpened, not softened, by going multi-platform: on RunPod/vast/Lambda the meter-stopping action
155
+ is the **irreversible** `terminate`/`destroy` that deletes the disk — so the confirmation gate matters
156
+ *more*. Operationalize it as the **teardown Iron Law** (SKILL.md Phase 5): no teardown before checkpoints
157
+ are pulled to local AND verified by load AND the user approves the specific cost-affecting action.
158
+
159
+ *Universal.* A shared FS is also multi-project: work inside your project's own folder, delete only your
160
+ own redundancy, never a top-level dir you didn't create.
161
+
162
+ ---
163
+
164
+ ## 10. Teach the user the platform, don't just drive it
165
+
166
+ Most users — especially on a platform they rent only occasionally — don't know its non-obvious
167
+ **conveniences** or its **danger clocks**, and the skill's job is not just to operate the box but to *tell
168
+ them*. On first contact with a platform, proactively surface:
169
+ - **Conveniences they'd otherwise miss:** one-click SSH-key registration (so the agent can connect
170
+ non-interactively), GPU-availability notifications, the built-in panels (JupyterLab / the TensorBoard tile).
171
+ - **Danger clocks that cost data or money:** auto-release / auto-delete timers on *stopped* instances
172
+ (AutoDL releases a 关机 box after **15 days** → the data disk is gone; several CN platforms after ~10 days), a
173
+ stop that keeps billing (vast.ai forever, RunPod 2×), low-balance / arrears purge.
174
+
175
+ The per-platform list lives in each profile's **Surface to the user** block. This pairs with #9: #9 stops
176
+ the agent from *doing* the dangerous thing; #10 makes the agent *warn the human* about the danger clock
177
+ before it fires. The most expensive surprises on rented hardware are the silent timers (a parked box
178
+ released, a stopped disk still billing), not the visible failures — surfacing them early is the cheapest
179
+ insurance.
@@ -0,0 +1,74 @@
1
+ # Getting better over time — capture new gotchas + personalize (without corrupting the skill)
2
+
3
+ This skill is a static reference, so it does **not** evolve on its own. But every real run teaches
4
+ something — a new platform quirk, a training bug not in the catalog, or the user's own setup. This file
5
+ is the protocol for **sedimenting that knowledge in the right place, at the right bar, without silently
6
+ rewriting the skill**. Apply it whenever a run surfaces something the catalog did not already cover.
7
+
8
+ To jump: `grep -in '<keyword>' references/self-improvement.md`.
9
+
10
+ ## Table of contents
11
+ 1. The bar — what qualifies as a keepable gotcha
12
+ 2. Route the learning — user memory vs the catalog vs an upstream PR
13
+ 3. Propose, don't auto-edit
14
+ 4. First-run personalization — capture the user's setup into memory
15
+ 5. Freshness — platform facts rot; stamp and re-verify
16
+
17
+ ## 1. The bar — do NOT enshrine a one-off
18
+
19
+ A surprising failure is a **hypothesis, not a gotcha** (principle #3; **REQUIRED:**
20
+ `verifying-dl-experiments`). Sediment a NEW gotcha ONLY when all three hold:
21
+
22
+ - **Root-caused** — the mechanism is understood, not just "it worked after I retried."
23
+ - **Reproduced or clearly mechanistic** — not a single flaky incident (a transient network blip is not a gotcha).
24
+ - **Generalizable** — another user on this platform / training setup would hit it too.
25
+
26
+ If it fails the bar it is a *project note* (→ §4 memory), not a catalog entry. Enshrining unverified
27
+ one-offs is exactly the catalog rot this bar exists to prevent.
28
+
29
+ ## 2. Route the learning
30
+
31
+ | What was learned | Where it goes | Form |
32
+ |---|---|---|
33
+ | **User/personal/host-specific** — this account's quirk, the user's preference, the usual GPU plan, "on MY box X is true" | **the host's `memory/` system** (host-specific, personal, may be ephemeral) | a `reference` / `project` / `feedback` fact, one per file, deduped |
34
+ | **A project-level fact or recurring error the user keeps hitting on THEIR project** — a config quirk, "always run X first", a path that must be Y, an env that must be activated | a **project instructions file in the user's repo** — `CLAUDE.md` / `AGENTS.md` / `.cursorrules` (persists cross-session AND cross-tool AND for collaborators, unlike host memory) | a short imperative rule; **propose, don't auto-write** (§3) |
35
+ | **Generalizable platform gotcha** | `profiles/<platform>.md` §7 (platform-pinned) or `references/gotchas_universal.md` (cross-platform) | `symptom → root cause → fix` + a source URL |
36
+ | **Generalizable training-debug gotcha** | `references/training/<topic>.md` | same form |
37
+ | **A correction** (a fact here is now wrong/stale) | edit that file; re-stamp its `verified <month>` | note old → new + URL |
38
+
39
+ Because this skill is open-source, a generalizable addition/correction is also a candidate for an
40
+ **upstream PR** — offer to open one so every user benefits, not just this install.
41
+
42
+ ## 3. Propose, don't auto-edit
43
+
44
+ **NEVER silently rewrite a skill file from a single run** — a wrong "fix" or a broken structure is worse
45
+ than a missing entry. Instead:
46
+
47
+ - Draft the entry (`symptom → root cause → fix`, with its source) and **show it to the user for approval.**
48
+ - For an out-of-scope or larger change, spin it off (the host's task / PR mechanism) rather than bloating
49
+ the current run.
50
+ - Apply only after the user okays it; then re-run the skill's own checks — cross-refs resolve, **no secret
51
+ value written**, structure + TOC intact.
52
+
53
+ ## 4. First-run personalization → memory
54
+
55
+ The first time this skill runs for a user, capture their **actual setup** into memory so later runs are
56
+ pre-parameterized instead of re-asked:
57
+
58
+ - which platform(s) they rent, and the per-platform §8 SCRIPT OVERRIDES that worked (paths, proxy hook, cred location);
59
+ - the project repo path / training entrypoint / config layout;
60
+ - the tracker entity (wandb / trackio) and **where the key lives** (its env-var name or file path — never the value);
61
+ - the usual GPU plan + disk budget.
62
+
63
+ Store as a `project` / `reference` memory. Record only the credential's *name or path*,
64
+ never the secret itself. Next session the profile overrides + `run_one` params come pre-filled.
65
+
66
+ ## 5. Freshness — this skill has a shelf life
67
+
68
+ Platform prices, billing verbs, and limits **change**. Every platform fact is annotated `verified
69
+ <month>` at authoring time. Before betting **money or data** on a teardown/billing fact — the
70
+ irreversible ones (`terminate` / `destroy` / release) — **re-verify it against the platform's current
71
+ docs in the same session.** If a fact is stale, fix it (the §2 correction row) and re-stamp the date. A
72
+ quarterly re-verification of each profile's §5 TEARDOWN/BILLING section keeps the highest-stakes facts
73
+ honest — schedulable via the host's `/schedule`. Run `scripts/check_staleness.py` to list every `verified`
74
+ stamp older than N months (a mechanical reminder of WHAT to re-check — it does not verify the fact itself).