opencode-skills-collection 3.1.1 → 3.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +4 -1
  2. package/bundled-skills/2slides-ppt-generator/SKILL.md +8 -7
  3. package/bundled-skills/agent-creator/SKILL.md +246 -0
  4. package/bundled-skills/android-cli/SKILL.md +19 -7
  5. package/bundled-skills/android-ui-journey-testing/SKILL.md +5 -5
  6. package/bundled-skills/apple-notes-search/SKILL.md +12 -2
  7. package/bundled-skills/atlas-ledger/SKILL.md +8 -0
  8. package/bundled-skills/ax-extract-workflow/SKILL.md +156 -0
  9. package/bundled-skills/codex-fable5/SKILL.md +10 -2
  10. package/bundled-skills/competitor-analysis/scripts/gate_candidates.mjs +45 -15
  11. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  12. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  13. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  14. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  15. package/bundled-skills/docs/sources/sources.md +1 -1
  16. package/bundled-skills/docs/users/bundles.md +145 -1
  17. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  18. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  19. package/bundled-skills/docs/users/getting-started.md +1 -1
  20. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  21. package/bundled-skills/docs/users/specialized-plugin-roadmap.md +11 -4
  22. package/bundled-skills/docs/users/usage.md +4 -4
  23. package/bundled-skills/docs/users/visual-guide.md +4 -4
  24. package/bundled-skills/dos-verify-done-claims/SKILL.md +16 -4
  25. package/bundled-skills/ecl-harness-engineer/agents/creator-config.md +1 -1
  26. package/bundled-skills/ecl-harness-engineer/references/environment-config-guide.md +2 -2
  27. package/bundled-skills/ecl-harness-engineer/references/environment-detection-guide.md +4 -4
  28. package/bundled-skills/event-staffing-ordering/SKILL.md +4 -0
  29. package/bundled-skills/loop-library/SKILL.md +7 -4
  30. package/bundled-skills/lovable-cleanup/SKILL.md +11 -8
  31. package/bundled-skills/macos-screen-recorder/SKILL.md +9 -1
  32. package/bundled-skills/mailtrap-managing-contacts/SKILL.md +1 -1
  33. package/bundled-skills/mailtrap-sending-emails/SKILL.md +1 -1
  34. package/bundled-skills/mailtrap-setting-up-sending-domain/SKILL.md +1 -1
  35. package/bundled-skills/remote-gpu-trainer/.gitattributes +8 -0
  36. package/bundled-skills/remote-gpu-trainer/LICENSE +21 -0
  37. package/bundled-skills/remote-gpu-trainer/README.md +267 -0
  38. package/bundled-skills/remote-gpu-trainer/SKILL.md +249 -0
  39. package/bundled-skills/remote-gpu-trainer/evals/README.md +57 -0
  40. package/bundled-skills/remote-gpu-trainer/evals/RESULTS.md +44 -0
  41. package/bundled-skills/remote-gpu-trainer/evals/cases.jsonl +14 -0
  42. package/bundled-skills/remote-gpu-trainer/evals/run_evals.py +68 -0
  43. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/README.md +72 -0
  44. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/queue_1.txt +6 -0
  45. package/bundled-skills/remote-gpu-trainer/profiles/_schema.md +100 -0
  46. package/bundled-skills/remote-gpu-trainer/profiles/autodl.md +327 -0
  47. package/bundled-skills/remote-gpu-trainer/profiles/china.md +397 -0
  48. package/bundled-skills/remote-gpu-trainer/profiles/generic-ssh.md +450 -0
  49. package/bundled-skills/remote-gpu-trainer/profiles/lambda.md +342 -0
  50. package/bundled-skills/remote-gpu-trainer/profiles/paperspace.md +365 -0
  51. package/bundled-skills/remote-gpu-trainer/profiles/runpod.md +164 -0
  52. package/bundled-skills/remote-gpu-trainer/profiles/vastai.md +355 -0
  53. package/bundled-skills/remote-gpu-trainer/references/china-network.md +206 -0
  54. package/bundled-skills/remote-gpu-trainer/references/gotchas_universal.md +704 -0
  55. package/bundled-skills/remote-gpu-trainer/references/lifecycle_checklist.md +148 -0
  56. package/bundled-skills/remote-gpu-trainer/references/monitoring_patterns.md +327 -0
  57. package/bundled-skills/remote-gpu-trainer/references/multinode.md +190 -0
  58. package/bundled-skills/remote-gpu-trainer/references/parallel_ablation.md +196 -0
  59. package/bundled-skills/remote-gpu-trainer/references/principles.md +179 -0
  60. package/bundled-skills/remote-gpu-trainer/references/self-improvement.md +74 -0
  61. package/bundled-skills/remote-gpu-trainer/references/spot-resilience.md +235 -0
  62. package/bundled-skills/remote-gpu-trainer/references/ssh_transport.md +270 -0
  63. package/bundled-skills/remote-gpu-trainer/references/training/by-domain.md +230 -0
  64. package/bundled-skills/remote-gpu-trainer/references/training/checkpoint-resume.md +368 -0
  65. package/bundled-skills/remote-gpu-trainer/references/training/convergence-debugging.md +187 -0
  66. package/bundled-skills/remote-gpu-trainer/references/training/data-pipeline.md +119 -0
  67. package/bundled-skills/remote-gpu-trainer/references/training/distributed-launch.md +422 -0
  68. package/bundled-skills/remote-gpu-trainer/references/training/oom-memory.md +338 -0
  69. package/bundled-skills/remote-gpu-trainer/references/training/precision-stability.md +401 -0
  70. package/bundled-skills/remote-gpu-trainer/references/training/throughput-profiling.md +451 -0
  71. package/bundled-skills/remote-gpu-trainer/scripts/aggregate_to_fs.sh +55 -0
  72. package/bundled-skills/remote-gpu-trainer/scripts/check_staleness.py +70 -0
  73. package/bundled-skills/remote-gpu-trainer/scripts/download_loop.sh +67 -0
  74. package/bundled-skills/remote-gpu-trainer/scripts/gpu_health.sh +169 -0
  75. package/bundled-skills/remote-gpu-trainer/scripts/health_patrol.sh.template +67 -0
  76. package/bundled-skills/remote-gpu-trainer/scripts/mem_monitor.sh +67 -0
  77. package/bundled-skills/remote-gpu-trainer/scripts/reap_vram_zombies.sh +175 -0
  78. package/bundled-skills/remote-gpu-trainer/scripts/run_one.sh.template +104 -0
  79. package/bundled-skills/remote-gpu-trainer/scripts/run_queue.sh.template +83 -0
  80. package/bundled-skills/remote-gpu-trainer/scripts/setup-china-mirrors.sh +35 -0
  81. package/bundled-skills/remote-gpu-trainer/scripts/verify_local.py +145 -0
  82. package/bundled-skills/screenstudio-alt/SKILL.md +9 -1
  83. package/bundled-skills/vibecode-production-qa-validator/SKILL.md +1 -1
  84. package/bundled-skills/youtube-notetaker/scripts/serve.py +63 -14
  85. package/package.json +1 -1
  86. package/skills_index.json +128 -49
@@ -0,0 +1,451 @@
1
+ # Throughput & profiling — make training FAST, find the one bottleneck
2
+
3
+ How to tell *why* a rented GPU is underfed (GPU-bound vs data-bound vs comms-bound), then apply the
4
+ right speedup in cost order — from a free dataloader knob to `torch.compile` and fused attention. This
5
+ layer owns *making it RUN fast + locating the mechanical bottleneck*; **verifying-dl-experiments** owns
6
+ *is the resulting number correct*. Cross-link it (**REQUIRED**) wherever a speedup risks changing the
7
+ science (a kernel that alters numerics, a precision swap, dropping samples to "go faster").
8
+
9
+ > **Size the run to the box — then PIN it for any comparison.** Auto-sizing batch/`num_workers` to the
10
+ > measured GPU/VRAM/vCPU (Phase 0) to use the card well is fine for a STANDALONE job; but for an ablation
11
+ > or baseline-vs-variant comparison, **pin the same batch across all cells** — auto-maximizing per-box
12
+ > silently changes a variable and breaks comparability (**verifying-dl-experiments**, REQUIRED).
13
+
14
+ To jump: `grep -in '<keyword>' references/training/throughput-profiling.md` (e.g. `bound`, `workers`,
15
+ `compile`, `recompile`, `flash`, `sdpa`, `nsys`, `py-spy`, `channels_last`, `tf32`, `overlap`).
16
+
17
+ ## Table of contents
18
+
19
+ - **Diagnose first** — T1 the 3-way split (GPU/data/comms-bound) · T2 util%-is-a-liar pointer · T3 the cheap CPU/GPU-busy triage
20
+ - **Dataloader (the #1 cause of a starved GPU)** — T4 num_workers · T5 persistent_workers · T6 pin_memory + non_blocking · T7 prefetch_factor · T8 IO-bound vs CPU-transform-bound
21
+ - **Free / near-free knobs** — T9 TF32 + matmul precision · T10 cudnn.benchmark · T11 channels_last · T12 set_to_none + disable debug APIs
22
+ - **Mixed precision for speed** — T13 bf16/fp16 throughput
23
+ - **Kernels** — T14 SDPA / FlashAttention · T15 torch.compile gains · T16 torch.compile recompilation traps
24
+ - **Memory↔speed trades** — T17 activation checkpointing speed cost · T18 batch size vs throughput
25
+ - **Profilers** — T19 torch.profiler (is-it-data-bound) · T20 nsys / Nsight Systems · T21 py-spy (live, no restart) · T22 memory-snapshot pointer
26
+ - **Multi-GPU / multi-node comms** — T23 DDP/FSDP compute-comm overlap
27
+ - **Pointers** — gotchas_universal.md U8/U21/U24/U25/U38 · oom-memory.md · distributed-launch.md · multinode.md · verifying-dl-experiments (skill)
28
+
29
+ ---
30
+
31
+ ## Diagnose first — do NOT tune blind
32
+
33
+ ### T1 — The 3-way split: GPU-bound vs data-bound vs comms-bound (decide before touching a knob)
34
+
35
+ **Symptom**: training is "slow" and the instinct is to change the model or batch size at random.
36
+
37
+ **Root cause**: throughput is gated by exactly one of three resources at a time; the fix for each is
38
+ disjoint, so guessing wastes paid wall-clock (principle #1).
39
+
40
+ **Fix — classify with one cheap reading each** (heuristic: util consistently >90% ⇒ GPU-bound;
41
+ low/fluctuating ⇒ elsewhere; both CPU+GPU low ⇒ I/O —
42
+ https://apxml.com/courses/planning-optimizing-ai-infrastructure/chapter-5-strategies-for-performance-optimization/identifying-performance-bottlenecks):
43
+ - **GPU-bound** (the good case): util high *and* SM clock/power high (T2); adding workers doesn't help. Only
44
+ levers left: kernels (T14–T15), precision (T13), a bigger card.
45
+ - **Data-bound**: util low-but-nonzero or sawtoothing, host CPU busy in `DataLoader`/transforms; a trace
46
+ shows GPU-idle gaps lining up with CPU data work (T19). Go to T4–T8.
47
+ - **Comms-bound** (multi-GPU/-node only): per-GPU util high, scaling efficiency poor; time in
48
+ `nccl:all_reduce`/`all_gather` not overlapped with compute. Go to T23.
49
+
50
+ The highest-signal instrument is a **profiler trace** (T19) — read it before changing anything.
51
+
52
+ ### T2 — `nvidia-smi` GPU-Util % lies; correlate clock + power → gotchas_universal.md U21
53
+
54
+ A 100% GPU-Util reading can hide a starved GPU (a trickle of tiny kernels reads as 100%). The full diagnosis —
55
+ correlate `clocks.current.sm` + mem-bandwidth util + power via `nvidia-smi dmon -s pucvmet -d 1`, and the
56
+ thermal/power-throttle slowdown — lives in **gotchas_universal.md U21/U23**; read it before concluding a run
57
+ is GPU-bound. The *0%-util-but-running* (CPU-data-bound) inverse is **U38**, owned by verifying-dl-experiments.
58
+
59
+ ### T3 — Cheap triage when no profiler is wired yet: is the host CPU busy?
60
+
61
+ **Symptom**: need a 30-second answer to "GPU or data?" before instrumenting.
62
+
63
+ **Fix**: watch GPU and CPU at once for ~10 s —
64
+ ```bash
65
+ nvidia-smi dmon -s pu -d 1 -c 10 # per-second SM% + power; sawtooth/low = starved
66
+ top -b -n 1 | grep -i python | head # a worker pegged at ~100% CPU = CPU-transform-bound
67
+ ```
68
+ GPU SM% high and steady ⇒ GPU-bound (stop here, go to kernels/precision). GPU SM% sawtoothing while a
69
+ python worker is CPU-pegged ⇒ data-bound (T4–T8). Both idle ⇒ I/O-bound (stage to NVMe, U8). Then confirm
70
+ with a real trace (T19) before investing in a fix. **GPU SM% low while *many* python threads thrash a few
71
+ cores (not one worker pegged) ⇒ intra-op thread oversubscription** on a vCPU slice, not data-bound — cap
72
+ `OMP_NUM_THREADS` to your cgroup quota (gotchas_universal.md **U40**), don't add dataloader workers.
73
+
74
+ ---
75
+
76
+ ## Dataloader — the #1 reason a rented GPU sits idle
77
+
78
+ The partial-starve knob set (and its order) is **gotchas_universal.md U24**; this section is the per-knob
79
+ *why/when*. Each helps a *different* failure, so apply by symptom, not as a blanket cargo-cult.
80
+
81
+ ### T4 — `num_workers`: 0 means the main process loads serially (the default starves the GPU)
82
+
83
+ **Symptom**: `DataLoader(num_workers=0)` (the default) — every batch is fetched on the main thread, GPU
84
+ waits the whole fetch.
85
+
86
+ **Root cause**: with `num_workers=0` "the data will be loaded in the main process" — no overlap between
87
+ data prep and compute (https://docs.pytorch.org/docs/2.12/data.html).
88
+
89
+ **Fix**: set `num_workers > 0` to load asynchronously and overlap fetch with the GPU step. Start at
90
+ `cores − 1`, but **size against per-worker RAM, not CPU count** — each worker `fork`s a full copy of any
91
+ large in-dataset object; too many OOM the cgroup with a bare `Killed` (the quadratic trap + sizing rule are
92
+ **gotchas_universal.md U9**). Not monotonic: past the point where the GPU is fed, extra workers only add RAM
93
+ and startup cost.
94
+
95
+ ### T5 — `persistent_workers=True`: stop paying worker-startup every epoch
96
+
97
+ **Symptom**: a visible stall at the **start of every epoch** (especially short epochs / many epochs); GPU
98
+ idle while workers respawn.
99
+
100
+ **Root cause**: default `persistent_workers=False` shuts down all workers after the dataset is consumed
101
+ once and **re-forks them next epoch** — re-importing, re-opening files, rebuilding the dataset object each
102
+ time (https://docs.pytorch.org/docs/2.12/data.html).
103
+
104
+ **Fix**: `persistent_workers=True` keeps the worker Dataset instances alive between epochs, removing the
105
+ per-epoch respawn cost. Requires `num_workers > 0`. Biggest win when epochs are short or the dataset's
106
+ `__init__` is heavy (loads an index/manifest).
107
+
108
+ ### T6 — `pin_memory=True` + `non_blocking=True`: overlap the host→device copy
109
+
110
+ **Symptom**: the H2D copy (`x.to('cuda')`) sits on the critical path between fetch and forward.
111
+
112
+ **Root cause**: a pageable-memory tensor must be staged through a pinned buffer by the driver before DMA;
113
+ a synchronous `.to(device)` blocks the step. "When using a GPU it's better to set `pin_memory=True`"
114
+ (https://docs.pytorch.org/tutorials/recipes/recipes/tuning_guide.html).
115
+
116
+ **Fix**: `DataLoader(pin_memory=True)` allocates batches page-locked, **then** transfer
117
+ `x = x.to(device, non_blocking=True)` so the copy runs async on a copy stream and overlaps compute. Both
118
+ halves needed — `pin_memory` alone still blocks; `non_blocking` without pinned memory silently falls back to
119
+ a blocking copy. Costs host RAM (pinned pages aren't swappable) — back off if it pressures the cgroup (U9).
120
+
121
+ ### T7 — `prefetch_factor`: deepen the queue when fetch time is bursty
122
+
123
+ **Symptom**: with workers on, the GPU still periodically stalls — every *Nth* step (N = `num_workers`) has
124
+ a long idle gap because all workers were busy producing the next batch when the GPU asked
125
+ (https://docs.pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html).
126
+
127
+ **Root cause**: `prefetch_factor` defaults to **2** when `num_workers>0` (None when 0) — "2 means there
128
+ will be a total of `2 * num_workers` batches prefetched across all workers"
129
+ (https://docs.pytorch.org/docs/2.12/data.html). A shallow queue can't absorb a variance spike in
130
+ per-sample fetch/decode time.
131
+
132
+ **Fix**: raise `prefetch_factor` (3–4) so workers run ahead and bursts hide — at the cost of more resident
133
+ batches in RAM (re-check U9). A *smoothing* knob, not a multiplier: if the **average** fetch rate is below
134
+ the GPU's consume rate, no depth helps — fix the rate (workers T4, GPU transform T8, NVMe U8) instead.
135
+
136
+ ### T8 — IO-bound vs CPU-transform-bound are different data-bound cases (different fix)
137
+
138
+ **Symptom**: data-bound (T1), but adding workers barely helps.
139
+
140
+ **Root cause — split the case**:
141
+ - **IO-bound**: bytes arrive slowly from network/HDD/object store; workers sit in `read`. Stage the working
142
+ set to instance-local **NVMe** (HDD→NVMe gaps reach ~35×) = **gotchas_universal.md U8**; the many-tiny-files
143
+ transaction death + **shard-into-tar / WebDataset** fix = **U25**.
144
+ - **CPU-transform-bound**: a heavy per-sample augment (resize/decode/FFT) saturates CPU; workers CPU-pegged
145
+ (T3), capping at core count. Move the transform to the **GPU** (NVIDIA DALI, `torchvision.transforms.v2`
146
+ on tensors, kornia) onto idle GPU cycles. The *0%-util* serialized-transform variant is **U38**, owned by
147
+ verifying-dl-experiments **REQUIRED** (which also owns whether a GPU-side transform shifted the data
148
+ distribution).
149
+
150
+ **Fix**: read the trace (T19) — time in `read`/`stat` ⇒ U8/U25; time in a transform fn ⇒ move to GPU.
151
+
152
+ ---
153
+
154
+ ## Free / near-free knobs (set these once at startup on any box)
155
+
156
+ ### T9 — TF32 / `set_float32_matmul_precision("high")` — the "why is my A100 slow" footgun
157
+
158
+ The biggest free speedup on Ampere+ for any fp32 matmul path; **OFF by default since PyTorch 1.12**. The
159
+ decision and exact knobs (`torch.set_float32_matmul_precision("high")`, the legacy `allow_tf32` flags,
160
+ `--tf32 1` in HF Trainer, convergence impact) are owned by **references/training/precision-stability.md P2**
161
+ (cross-link there; do NOT restate). If a fresh PyTorch 2.x rental's fp32-heavy run is 2–4× slower than expected with no bug,
162
+ this is the first suspect.
163
+
164
+ ### T10 — `cudnn.benchmark=True`: autotune conv algorithms (fixed input shapes only)
165
+
166
+ **Symptom**: a conv-heavy net (CNN/UNet) is slower than it should be; input shapes are constant.
167
+
168
+ **Root cause**: by default cuDNN picks a generic conv algorithm; the autotuner benchmarks variants on the
169
+ first batch of each new shape and caches the fastest
170
+ (https://docs.pytorch.org/tutorials/recipes/recipes/tuning_guide.html).
171
+
172
+ **Fix**: `torch.backends.cudnn.benchmark = True` once at startup. **Only helps when input shapes are
173
+ stable** — with variable shapes (dynamic resolution, ragged batches) it re-benchmarks every new shape and
174
+ *loses* time. Trade-off: it is **nondeterministic** (picks by first-batch timing), so it fights the
175
+ determinism knobs — whether to enable it for a clean datapoint is owned by precision-stability P19 /
176
+ verifying-dl-experiments (U36, **REQUIRED**).
177
+
178
+ ### T11 — `channels_last`: free Tensor-Core speedup for conv nets under AMP
179
+
180
+ **Symptom**: a CNN under mixed precision isn't hitting Tensor Cores; throughput below the card's potential.
181
+
182
+ **Root cause**: default NCHW contiguous layout forces layout transposes around Tensor-Core convolutions.
183
+
184
+ **Fix**: convert model and inputs to `memory_format=torch.channels_last` —
185
+ `model = model.to(memory_format=torch.channels_last)` and `x = x.to(memory_format=torch.channels_last)`.
186
+ Optimizes convolutional networks with Tensor Cores + AMP
187
+ (https://docs.pytorch.org/tutorials/recipes/recipes/tuning_guide.html). Marked experimental and CNN-specific
188
+ (no benefit for pure transformers). No numerics change — purely a layout speedup.
189
+
190
+ ### T12 — `set_to_none` + disable debug APIs (two free per-step taxes to remove)
191
+
192
+ - **`optimizer.zero_grad(set_to_none=True)`** (the **default** since PyTorch 2.0) over zero-filling —
193
+ assigning `None` skips a memory-write kernel per param and lets the next backward write fresh
194
+ (https://docs.pytorch.org/tutorials/recipes/recipes/tuning_guide.html). Edge case: code reading `.grad`
195
+ between steps must tolerate `None`.
196
+ - **Turn OFF debug APIs for the real run** — `torch.autograd.set_detect_anomaly(True)`,
197
+ `torch.autograd.profiler.profile`, `gradcheck` add per-op bookkeeping (anomaly detection is ~10× slower,
198
+ precision-stability P9). Grep `detect_anomaly` / leftover `with profile(` wrappers before a long launch
199
+ (https://docs.pytorch.org/tutorials/recipes/recipes/tuning_guide.html); easy to leave on after a NaN hunt.
200
+
201
+ ---
202
+
203
+ ## Mixed precision for speed
204
+
205
+ ### T13 — bf16/fp16 is a throughput lever, not just a memory lever
206
+
207
+ **Symptom**: fp32 training under-uses Tensor Cores; the GPU has bf16/fp16 tensor cores.
208
+
209
+ **Root cause**: 16-bit matmuls run on Tensor Cores at much higher FLOP/s and halve activation
210
+ read/write bandwidth — a speedup *on top of* the memory saving (oom-memory.md M6).
211
+
212
+ **Fix**: `torch.autocast("cuda", dtype=torch.bfloat16)` on Ampere+ (the modern default; no GradScaler —
213
+ precision-stability P6) or `bf16=True` in HF `TrainingArguments`. The full precision decision (bf16 vs fp16
214
+ vs the V100/T4 fp16-only path, GradScaler mechanics, NaN/overflow) is owned by
215
+ **references/training/precision-stability.md P1–P10** (cross-link; do NOT restate). The *memory* angle and
216
+ the activation-bucket math are **oom-memory.md M6**. A NaN/divergence after the swap is a numerics question →
217
+ precision-stability / verifying-dl-experiments (**REQUIRED**).
218
+
219
+ ---
220
+
221
+ ## Kernels — the levers left once the GPU is fed
222
+
223
+ ### T14 — SDPA / FlashAttention: stop materializing the O(seq²) attention matrix
224
+
225
+ **Symptom**: a transformer is attention-bound; long sequences are slow and memory-heavy; or `flash_attn`
226
+ "installed" but the run is no faster.
227
+
228
+ **Root cause**: the eager/`math` attention path materializes the full `seq×seq` score matrix. The fused
229
+ **FlashAttention** / **memory-efficient** backends never do, but PyTorch's `scaled_dot_product_attention`
230
+ **silently falls back to the slow `math` backend** when the fused kernel's input constraints aren't met
231
+ (wrong dtype, head dim, mask shape). A warning appears only when a fused backend is explicitly forced — "if a fused implementation is not available, a warning will be
232
+ raised" (https://docs.pytorch.org/docs/2.12/generated/torch.nn.functional.scaled_dot_product_attention.html).
233
+
234
+ **Fix**:
235
+ - Use `F.scaled_dot_product_attention(q,k,v)` (or `attn_implementation="sdpa"`, the HF Transformers default on PyTorch 2.1.1+),
236
+ which auto-picks FlashAttention / memory-efficient / cuDNN / math. Feed it **fp16/bf16** inputs — the
237
+ fused backends need 16-bit (the `math` fallback is what runs in fp32).
238
+ - **Force-verify** the fast backend instead of trusting silence:
239
+ ```python
240
+ from torch.nn.attention import sdpa_kernel, SDPBackend
241
+ with sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION]): # errors loudly if it can't be used
242
+ out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
243
+ ```
244
+ - **Installing `flash_attn` from source is a trap**: without `ninja` (`pip install ninja`) the CUDA
245
+ extension compiles single-threaded ~2 h; with ninja ~3–5 min on a 64-core box. With many cores but
246
+ `<96 GB` RAM ninja over-parallelizes and OOMs the build — cap `MAX_JOBS=4 pip install flash-attn
247
+ --no-build-isolation`. Prefer a **prebuilt wheel** matching the `cuXX/torchYY/cpZZ` triple
248
+ (https://github.com/Dao-AILab/flash-attention/issues/1038, https://pypi.org/project/flash-attn/). A
249
+ torch/CUDA mismatch is **gotchas_universal.md U28**. Whether the fused kernel changes outputs (causal-mask
250
+ edge cases) is a numerics check → verifying-dl-experiments (**REQUIRED**).
251
+
252
+ ### T15 — `torch.compile`: fuse kernels + cut launch overhead (one line, real gains)
253
+
254
+ **Symptom**: many small pointwise/elementwise ops; Python/launch overhead dominates between big matmuls.
255
+
256
+ **Root cause**: eager launches each op separately; Inductor fuses adjacent ops into Triton kernels and
257
+ (in CUDA-graph modes) eliminates per-step launch overhead, reusing the execution plan across steps.
258
+
259
+ **Fix**: wrap the model — `model = torch.compile(model)`. Modes
260
+ (https://huggingface.co/docs/transformers/en/perf_torch_compile):
261
+ - `default` — balanced speed/memory.
262
+ - `mode="reduce-overhead"` — uses **CUDA graphs** to kill Python overhead (best for many tiny ops /
263
+ small-batch / inference), at a little more memory.
264
+ - `mode="max-autotune"` — longest compile, fastest steady-state.
265
+ - HF `TrainingArguments(torch_compile=True, torch_compile_mode="reduce-overhead")`.
266
+
267
+ Reported ~2.2× mean-inference speedups; training gains real but model-dependent. **First step(s) are slow**
268
+ — compilation is lazy on first call (https://huggingface.co/docs/transformers/en/perf_torch_compile); exclude
269
+ warm-up from any throughput measurement. Set `fullgraph=True` while developing to surface graph breaks loudly
270
+ instead of silently losing speed. Whether the compiled *numbers* match eager → verifying-dl-experiments
271
+ (**REQUIRED**).
272
+
273
+ ### T16 — `torch.compile` recompilation trap: variable shapes silently blow the cache → eager
274
+
275
+ **Symptom**: a compiled run is *slower* than eager, or stutters periodically; throughput never stabilizes.
276
+ Common with variable batch/seq-len, dynamic padding, or per-step changing shapes.
277
+
278
+ **Root cause**: compile creates **guards** on traced shapes; a new shape violates a guard and triggers a
279
+ **recompile**. Past the recompile cap (`torch._dynamo.config.recompile_limit`, default **8**; legacy
280
+ `cache_size_limit`) Dynamo **stops compiling that function and runs it eagerly** — paying all the compile
281
+ cost and getting none of the benefit
282
+ (https://docs.pytorch.org/docs/stable/compile/programming_model.recompilation.html,
283
+ https://github.com/pytorch/pytorch/issues/93457).
284
+
285
+ **Fix**:
286
+ - **See it**: `TORCH_LOGS=recompiles python train.py` logs which function recompiled and the failed guard;
287
+ `TORCH_LOGS=graph_breaks` and `torch._dynamo.explain(...)` locate graph breaks
288
+ (https://docs.pytorch.org/docs/stable/torch.compiler_troubleshooting.html).
289
+ - **Tame shapes**: pad/bucket to a few fixed shapes so guards stop firing; or mark the varying dim dynamic
290
+ — `torch.compile(model, dynamic=True)` (or `mark_dynamic` / `TORCH_COMPILE_DYNAMIC_SOURCES`) compiles
291
+ one shape-generic graph instead of one per size. `dynamic=False` forces a fresh recompile per distinct
292
+ size (use only with truly few shapes)
293
+ (https://docs.pytorch.org/docs/stable/compile/programming_model.html).
294
+ - **Last resort**: raise `torch._dynamo.config.recompile_limit` only if a handful of *stable* extra shapes
295
+ legitimately exist — raising it to mask genuinely unbounded shapes just thrashes.
296
+
297
+ ---
298
+
299
+ ## Memory ↔ speed trades
300
+
301
+ ### T17 — Activation checkpointing buys memory by spending ~20–30% compute (know the cost)
302
+
303
+ **Symptom**: gradient/activation checkpointing is on "to be safe" and training is slow — but the model
304
+ actually fits without it.
305
+
306
+ **Fix**: checkpointing **recomputes** activations in backward instead of storing them — trading **~20–30%
307
+ extra compute** for a large memory cut (https://docs.pytorch.org/tutorials/recipes/recipes/tuning_guide.html,
308
+ oom-memory.md M7). Enable it **only when activations actually OOM** (full rationale + `use_reentrant=False` /
309
+ `use_cache=False` gotchas = **oom-memory.md M7**); if it fits without, turning it off is a free ~25% speedup.
310
+ When the model only just fails to fit, checkpoint only the *fewest/heaviest* blocks needed to fit, not the whole model.
311
+
312
+ ### T18 — Bigger micro-batch ≈ better GPU utilization (up to the memory wall)
313
+
314
+ **Symptom**: tiny batches under-feed the GPU; util and throughput both low though VRAM is mostly free (small
315
+ batches under-fill Tensor Cores and amortize launch/sync overhead poorly).
316
+
317
+ **Fix**: raise micro-batch toward the VRAM limit; keep the **effective** batch fixed with grad-accum if the
318
+ result depends on it (`batch 4 × accum 16` beats `batch 1 × accum 64` — oom-memory.md M5).
319
+ Accuracy/effective-batch implications (LR scaling, accumulation loss-weighting) → verifying-dl-experiments (**REQUIRED**).
320
+ Sizing alongside a concurrent job + `expandable_segments` = **gotchas_universal.md U10** / oom-memory.md M8.
321
+
322
+ ---
323
+
324
+ ## Profilers — measure the bottleneck, don't guess it
325
+
326
+ ### T19 — `torch.profiler`: the definitive data-bound vs compute-bound verdict
327
+
328
+ **Symptom**: need to *prove* where time goes (which T1 case), not infer from util%.
329
+
330
+ **Fix — scheduled profile of a few steps**
331
+ (https://docs.pytorch.org/tutorials/recipes/recipes/profiler_recipe.html):
332
+ ```python
333
+ from torch.profiler import profile, schedule, ProfilerActivity, tensorboard_trace_handler
334
+ with profile(
335
+ activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
336
+ schedule=schedule(wait=1, warmup=1, active=3), # skip warm-up; record 3 steps
337
+ on_trace_ready=tensorboard_trace_handler("./tb_trace"),
338
+ record_shapes=True, with_stack=True,
339
+ ) as prof:
340
+ for step, batch in enumerate(loader):
341
+ train_step(batch); prof.step()
342
+ if step >= 6: break
343
+ print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=15))
344
+ ```
345
+ **Read it**: large **GPU-timeline gaps** with CPU busy in `DataLoader`/transforms during the gap ⇒
346
+ **data-bound** (T4–T8); the TensorBoard "Performance Recommendation" panel names the DataLoader directly
347
+ (https://docs.pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). Densely-packed GPU
348
+ timeline ⇒ GPU-bound; sort by `self_cuda_time_total` for the hottest kernel (T14/T15). Time in `nccl:*` not
349
+ overlapped ⇒ comms-bound (T23). On a remote box write the trace and view locally — `scp` it down
350
+ (references/ssh_transport.md) and, for a raw `export_chrome_trace("trace.json")`, open it at `chrome://tracing`;
351
+ never run a viewer over ssh.
352
+
353
+ ### T20 — `nsys` / Nsight Systems: system-wide timeline when the gap is below PyTorch's view
354
+
355
+ **Symptom**: torch.profiler shows GPU-idle gaps but not *why* (CPU launch latency, a hidden sync, a memcpy,
356
+ a kernel-launch storm); or want CUDA-API + NVTX + OS-runtime on one timeline.
357
+
358
+ **Root cause**: torch.profiler sees PyTorch ops; `nsys` traces the whole system — CUDA API, kernels,
359
+ memcpy, NVTX ranges, OS-runtime — so it exposes launch-bound stalls and CPU↔GPU sync that PyTorch can't.
360
+ "Periodic gaps in the CUDA HW row are moments when the GPU is idle — a red flag"
361
+ (https://docs.lxp.lu/howto/pytorch-profiling-with-nsight/).
362
+
363
+ **Fix — profile a bounded window on the box, view locally** (canonical PyTorch recipe,
364
+ https://gist.github.com/mcarilli/376821aa1a7182dfcf59928a7cde3223):
365
+ ```bash
366
+ nsys profile -w true -t cuda,nvtx,osrt,cudnn,cublas -s cpu \
367
+ --capture-range=cudaProfilerApi -x true -o report python train.py
368
+ ```
369
+ In the script, bound the window so the `.nsys-rep` stays small:
370
+ ```python
371
+ torch.cuda.profiler.cudart().cudaProfilerStart() # after warm-up
372
+ # ... a handful of steps, optionally wrapped in torch.cuda.nvtx.range_push/pop ...
373
+ torch.cuda.profiler.cudart().cudaProfilerStop()
374
+ ```
375
+ `scp` the `.nsys-rep` down, open in the Nsight Systems GUI. Nsight **Systems** finds *which* kernel is slow;
376
+ Nsight **Compute** (`ncu`) finds *why* (occupancy, bandwidth, warp stalls) — but `ncu` is heavy, reserve it
377
+ for one hot kernel (https://www.spheron.network/blog/gpu-profiling-ai-workloads-nsight-compute-pytorch-profiler-guide/).
378
+
379
+ ### T21 — `py-spy`: profile a LIVE training process with no restart, no code change
380
+
381
+ **Symptom**: a long run is mysteriously slow or apparently hung; restarting it to add a profiler would cost
382
+ hours and might not reproduce.
383
+
384
+ **Root cause**: a Python-side bottleneck or deadlock (a slow transform, a lock, a blocking collective) that
385
+ needs inspection *in situ*.
386
+
387
+ **Fix — attach by PID, zero instrumentation** (https://github.com/benfred/py-spy):
388
+ ```bash
389
+ py-spy dump --pid <PID> # one-shot stack of every thread → where it's hung RIGHT NOW
390
+ py-spy top --pid <PID> # live "which functions burn time" (Unix top-style)
391
+ py-spy record -o prof.svg --pid <PID> # flame graph over a window
392
+ ```
393
+ "The profiled program needs no import, no decorator, and no restart." On a rented box mid-run, `py-spy dump`
394
+ instantly distinguishes a *hung* process (stuck in `recv`/lock/`all_reduce`) from a *slow* one (busy in a
395
+ transform) — pairs with the "is it actually hung?" check (gotchas_universal.md U17, verifying-dl-experiments
396
+ **REQUIRED**). May need `--native` for C-extension frames and `sudo`/`SYS_PTRACE` to attach.
397
+
398
+ ### T22 — CUDA memory snapshot/visualizer → oom-memory.md M19
399
+
400
+ For *what allocated the memory* (not time), the `torch.cuda.memory._record_memory_history` snapshot +
401
+ https://pytorch.org/memory_viz timeline is owned by **references/training/oom-memory.md M19/M18**. It is a
402
+ memory tool, not a throughput tool — listed here only so the profiler menu is complete. Do NOT restate.
403
+
404
+ ---
405
+
406
+ ## Multi-GPU / multi-node communication
407
+
408
+ ### T23 — Compute-comms overlap: DDP overlaps by default; tune the bucket, watch for breakers
409
+
410
+ **Symptom**: scaling efficiency is poor — per-GPU util high, but N GPUs deliver far less than N× throughput;
411
+ trace shows `all_reduce`/`all_gather` *not* overlapped with backward compute.
412
+
413
+ **Root cause**: DDP overlaps gradient all-reduce with backward by bucketing gradients and launching each
414
+ bucket's reduce on a separate CUDA stream as soon as it's ready
415
+ (https://github.com/pytorch/pytorch/issues/67570). Overlap *breaks* when something forces a sync: an
416
+ unused-parameter recompute, enabling the off-by-default `find_unused_parameters=True`, a `.item()`/print/`.cpu()` in
417
+ the step, or too-small/too-large buckets.
418
+
419
+ **Fix (single box, DDP/FSDP — the launch/sharding mechanics live in
420
+ references/training/distributed-launch.md, REQUIRED)**:
421
+ - Tune `bucket_cap_mb` (DDP) to batch gradient chunks into fewer, larger all-reduces; set
422
+ `gradient_as_bucket_view=True` to cut a copy. Buckets too small = launch overhead; too large = late
423
+ overlap.
424
+ - FSDP: enable `backward_prefetch` (prefetch the next layer's all-gather during current backward) and
425
+ `forward_prefetch` so comms hide under compute; `limit_all_gathers` if memory-pressured.
426
+ - Remove per-step host syncs (`loss.item()` every step, prints, eager `.cpu()`) that serialize the stream.
427
+
428
+ **Inter-node** transport (NCCL picking the wrong NIC, fabric-manager hang, 1800 s timeout masking a
429
+ straggler, MTU mismatch) is **references/multinode.md** (**REQUIRED** for ≥2 instances) — a comms "slowdown"
430
+ across boxes is usually one of those, not a bucket-size tune. Whether a world-size change silently rescaled
431
+ the effective batch/LR is a science question → verifying-dl-experiments (**REQUIRED**).
432
+
433
+ ---
434
+
435
+ ## Pointers — throughput gotchas catalogued elsewhere (do NOT restate)
436
+
437
+ - **gotchas_universal.md** — **U8** stage hot data to local NVMe (IO-bound) · **U21** `nvidia-smi` util% is
438
+ a liar (+ **U23** thermal/power throttle) · **U24** dataloader-starvation knob order · **U25** millions of
439
+ small files → shard into tar/WebDataset · **U38** GPU 0%-util CPU-data-bound (owned by verifying-dl).
440
+ - **references/training/oom-memory.md** — M5 micro-batch/grad-accum · M6 bf16 activations · M7 activation
441
+ checkpointing memory rationale · M8 `expandable_segments` · M19 memory snapshot/visualizer.
442
+ - **references/training/precision-stability.md** — P1–P10 the precision decision + AMP mechanics · P2 the
443
+ TF32-off footgun · P19 determinism-vs-`cudnn.benchmark` speed trade.
444
+ - **references/training/distributed-launch.md** — torchrun/Accelerate/DeepSpeed launch, DDP/FSDP sharding,
445
+ and the desync/hang toolkit (the launch substrate this file's T23 sits on).
446
+ - **references/multinode.md** — inter-node NCCL/NIC/fabric/timeout/MTU (the wire between boxes). Single-box
447
+ users skip.
448
+ - **verifying-dl-experiments** (**REQUIRED**) — owns *is-the-number-real*: whether a kernel/precision/compile
449
+ swap changed the result, whether dropping samples or a GPU-side transform shifted the distribution, the
450
+ 0%-util diagnosis (U38), determinism (U36). This file makes training *fast*; that skill decides if the
451
+ *faster result is still true*.
@@ -0,0 +1,55 @@
1
#!/usr/bin/env bash
# Aggregate completed ablation results from the per-instance data disk to durable storage.
# Idempotent (cp -f overwrites, so a retry result overwrites an epoch-1-failure snapshot).
#
# Override DATA_DIR / DURABLE_DIR per your platform profile (profiles/<platform>.md §8). Defaults = AutoDL.
#
# Usage: bash aggregate_to_fs.sh (run on each instance after its queue completes)
#
# This is a SAFETY NET — run_one.sh already auto-syncs per ablation. Use it when an auto-sync failed,
# an older run_one lacked it, or as a final pass before releasing an instance.
#
# Output: one "SKIP" / "OK" / "!! FAIL" line per ablation dir on stdout, then a summary.
#         "WARN" lines go to stderr when a secondary artifact that exists could not be copied,
#         or when the source checkpoint tree is missing.
# Exit status: 0 when no ablation failed its best.pth copy, 1 otherwise. WARNs do not change it.
set -u

# copy_optional SRC DEST_DIR
# Best-effort copy of a secondary artifact (metrics, protocol dir, log).
# An absent SRC is normal and silent. A SRC that exists but cannot be copied is reported on
# stderr and tallied in the caller's `warn` counter (bash dynamic scoping) instead of being
# swallowed, so "OK" never hides a half-synced ablation. Always returns 0.
copy_optional() {
  local src=$1 dest_dir=$2
  [ -e "$src" ] || return 0
  if ! cp -rf -- "$src" "$dest_dir/"; then
    echo "WARN ${dest_dir##*/}: could not copy ${src##*/} to $dest_dir (best.pth did land)" >&2
    warn=$((warn+1))
  fi
  return 0
}

# main
# Walks $DATA_DIR/checkpoints/*/ and mirrors each completed ablation into
# $DURABLE_DIR/final_ckpts/<name>/. Returns 0 when nothing failed, 1 otherwise.
main() {
  local data_dir="${DATA_DIR:-/root/autodl-tmp}"
  local durable_dir="${DURABLE_DIR:-/root/autodl-fs}"

  local fs_base="$durable_dir/final_ckpts"
  local ckpt_base="$data_dir/checkpoints"
  local log_base="$data_dir/runs/logs"

  local count=0 fail=0 warn=0
  local d name fs_dir

  mkdir -p -- "$fs_base"

  # A wrong DATA_DIR would otherwise read as "0 ablations, all fine".
  if [ ! -d "$ckpt_base" ]; then
    echo "WARN: $ckpt_base does not exist -- nothing to aggregate (check DATA_DIR)." >&2
  fi

  for d in "$ckpt_base"/*/; do
    [ -d "$d" ] || continue   # unmatched glob stays literal; skip it
    d=${d%/}
    name=${d##*/}

    # Skip an ablation that never reached epoch 1 (no metrics written).
    if [ ! -f "$d/best_metrics.json" ]; then
      echo "SKIP $name (no best_metrics.json)"
      continue
    fi

    fs_dir="$fs_base/$name"
    # GATE on the copy result — never echo OK unconditionally. A full / inode-exhausted durable FS
    # makes mkdir/cp fail; an unconditional "OK" would lie (references/gotchas_universal.md,
    # silent-sync; principle #3). Verify best.pth landed before counting it.
    if mkdir -p -- "$fs_dir" && cp -f -- "$d/best.pth" "$fs_dir/" && [ -f "$fs_dir/best.pth" ]; then
      copy_optional "$d/best_metrics.json" "$fs_dir"
      copy_optional "$d/protocol" "$fs_dir"
      copy_optional "$log_base/$name.log" "$fs_dir"
      echo "OK $name"
      count=$((count+1))
    else
      echo "!! FAIL $name — durable copy did not land (check 'df -i $durable_dir'). Data-disk copy is source-of-truth."
      fail=$((fail+1))
    fi
  done

  echo
  echo "=== Aggregated $count ablations to $fs_base ($fail failed) ==="
  if [ "$warn" -gt 0 ]; then
    echo "WARN: $warn secondary artifact(s) could not be copied -- see WARN lines above." >&2
  fi
  echo "Total dirs on durable FS now: $(find "$fs_base" -mindepth 1 -maxdepth 1 -type d | wc -l)"
  df -h "$fs_base" | tail -1
  df -i "$fs_base" | tail -1
  [ "$fail" -eq 0 ]
}

main "$@" || exit 1
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/env python3
2
+ """Flag platform/teardown facts whose `verified <YYYY-MM>` stamp has gone stale.
3
+
4
+ Every money-affecting platform fact in this skill is pinned with a `verified ... YYYY-MM`
5
+ stamp (references/self-improvement.md section 5). Billing verbs, spot rules, and auto-release
6
+ clocks drift silently; this scans every stamp and warns past an age threshold so the
7
+ quarterly re-verify is mechanical, not a memory ritual. It flags WHAT to re-check against
8
+ current platform docs -- it does NOT (and cannot) verify whether the fact is still true.
9
+ Pure stdlib, no network calls.
10
+
11
+ Usage:
12
+ python scripts/check_staleness.py [--root .] [--max-age-months 6] [--today YYYY-MM]
13
+
14
+ Exit code: 0 = every stamp within the threshold; 1 = at least one stale stamp (or none found).
15
+ """
16
+ from __future__ import annotations
17
+ import argparse
18
+ import re
19
+ import sys
20
+ from datetime import date
21
+ from pathlib import Path
22
+
23
# A stamp is a YYYY-MM token (2000-2099) sitting on a line that also says "verified".
# The digit guards stop the pattern firing inside a longer number ("12024-05", "2024-123");
# a full date such as 2024-05-17 still matches on its YYYY-MM prefix.
DATE = re.compile(r"(?<!\d)(20\d\d)-(0[1-9]|1[0-2])(?!\d)")

# Accepted shape for --today: four-digit year, one- or two-digit month (range-checked below).
_TODAY = re.compile(r"(\d{4})-(\d{1,2})")


def main(argv: "list[str] | None" = None) -> int:
    """Scan every ``*.md`` under ``--root`` for ``verified ... YYYY-MM`` stamps and report stale ones.

    Args:
        argv: Command-line arguments without the program name. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        0 when at least one stamp was found and none is older than ``--max-age-months``;
        1 when any stamp is stale or no stamp was found at all.

    Raises:
        SystemExit: with code 2 (via ``ArgumentParser.error``) when ``--today`` is not a
            valid ``YYYY-MM`` value, or on any other argparse usage error.
    """
    ap = argparse.ArgumentParser(description="Warn on stale `verified YYYY-MM` platform-fact stamps.")
    ap.add_argument("--root", default=".", help="repo root to scan (default: cwd)")
    ap.add_argument("--max-age-months", type=int, default=6, help="warn past this many months (default: 6)")
    ap.add_argument("--today", help="override current month as YYYY-MM (default: system clock)")
    a = ap.parse_args(argv)

    if a.today:
        parsed = _TODAY.fullmatch(a.today.strip())
        if parsed is None or not 1 <= int(parsed.group(2)) <= 12:
            # A usage error, not a traceback: a bad override would silently skew every age.
            ap.error(f"--today must be YYYY-MM with month 01-12, got {a.today!r}")
        ty, tm = int(parsed.group(1)), int(parsed.group(2))
    else:
        t = date.today()
        ty, tm = t.year, t.month
    # Months since year 0; ages are plain differences of these ordinals.
    now = ty * 12 + tm

    root = Path(a.root)
    stamps = 0
    stale = []
    for f in sorted(root.rglob("*.md")):
        # Skip VCS internals and anything that merely looks like a markdown file (e.g. a dir).
        if ".git" in f.parts or not f.is_file():
            continue
        for n, line in enumerate(f.read_text(encoding="utf-8", errors="replace").splitlines(), 1):
            if "verified" not in line.lower():
                continue
            for m in DATE.finditer(line):
                stamps += 1
                age = now - (int(m.group(1)) * 12 + int(m.group(2)))
                if age > a.max_age_months:
                    stale.append((f.as_posix(), n, m.group(0), age))

    print(f"Scanned {stamps} `verified <YYYY-MM>` stamp(s); threshold = {a.max_age_months} months.")
    if stamps == 0:
        # Zero stamps is treated as a failure so a wrong --root cannot pass as "all fresh".
        print("WARNING: no stamps found -- wrong --root, or stamps were dropped.")
        return 1
    if not stale:
        print("All stamps within threshold. (Still re-verify before betting money/data -- self-improvement.md section 5.)")
        return 0
    print(f"\n{len(stale)} STALE stamp(s) -- re-verify against current platform docs (self-improvement.md section 5):")
    for path, n, stamp, age in stale:
        print(f" {path}:{n} verified {stamp} ({age} mo old)")
    return 1


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,67 @@
1
#!/usr/bin/env bash
# Per-dir resumable download loop — robust to mid-transfer connection drops.
#
# Each dir is pulled in its own session, so one network blip never loses the rest,
# and re-running skips already-complete dirs. A single `scp -r` of a huge tree dies
# on any blip and does NOT resume — see references/gotchas_universal.md (transfer
# resets). This uses rsync --partial, which resumes a half-pulled dir in place.
#
# Usage (override any var from the environment):
# LOCAL_TARGET=/path/to/local/final_ckpts \
# REMOTE_ALIAS=my-gpu-1 \
# REMOTE_PATH=/durable/final_ckpts \
# bash download_loop.sh
#
# NOTE: `du -sb` is GNU coreutils. On a non-GNU local shell (macOS/Windows) the
# size-skip heuristic may need adjusting; the download itself is unaffected.
#
# NOTE: the skip is a size heuristic, not a verification: a local dir of at least
# MIN_DIR_SIZE_BYTES is assumed complete and is not handed to rsync again. Raise
# MIN_DIR_SIZE_BYTES to make rsync re-check more dirs.
#
# Exit status: 0 when every entry was downloaded or skipped (or the remote dir is
# reachable but empty); 1 when LOCAL_TARGET is unusable, the remote listing failed,
# or at least one rsync failed.
set -u

LOCAL_TARGET="${LOCAL_TARGET:-/path/to/local/final_ckpts}"
REMOTE_ALIAS="${REMOTE_ALIAS:-my-gpu-1}"
REMOTE_PATH="${REMOTE_PATH:-/root/autodl-fs/final_ckpts}" # override from your profile (durable mount)
MIN_DIR_SIZE_BYTES="${MIN_DIR_SIZE_BYTES:-2000000000}" # 2 GB = "looks complete"

mkdir -p -- "$LOCAL_TARGET"
cd -- "$LOCAL_TARGET" || exit 1

echo "Listing remote dirs in $REMOTE_ALIAS:$REMOTE_PATH ..."
# The path is re-parsed by the remote shell, so single-quote it and escape any embedded
# single quote as '\'' -- otherwise a quote in REMOTE_PATH breaks (or injects into) the command.
sq="'"
sq_escaped="'\\''"
remote_path_quoted="'${REMOTE_PATH//$sq/$sq_escaped}'"

# Capture the listing AND its exit status separately. A bare `mapfile < <(ssh ...)`
# discards ssh's exit code, so an unreachable host or a wrong path yields an empty
# array that then reads as "nothing to download" -- a silent success right before a
# pre-teardown pull (principle #3). Fail loud on a listing error instead.
remote_listing=$(ssh -o ConnectTimeout=15 "$REMOTE_ALIAS" "ls -1 -- $remote_path_quoted")
ssh_rc=$?
if [ "$ssh_rc" -ne 0 ]; then
  echo "ERROR: could not list $REMOTE_ALIAS:$REMOTE_PATH (ssh/ls exit $ssh_rc) -- refusing to treat an unreachable host as an empty download." >&2
  exit 1
fi
# mapfile preserves names with spaces; guard the empty string so it yields 0 elems, not 1.
if [ -z "$remote_listing" ]; then remote_dirs=(); else mapfile -t remote_dirs <<< "$remote_listing"; fi
n_total=${#remote_dirs[@]}
echo "Found $n_total remote dirs"
if [ "$n_total" -eq 0 ]; then echo "Remote dir is reachable but empty -- nothing to download."; exit 0; fi

# rsync before 3.2.4 lets the remote shell re-split the source path, so a name with a space
# would not transfer even though mapfile kept it intact. -s (--protect-args, later renamed
# --secluded-args) sends the path verbatim; add it only when the local rsync advertises it,
# so an old rsync without the option keeps working as before.
rsync_opts=(-az --partial)
if rsync --help 2>&1 | grep -Eq -e '--(protect|secluded)-args'; then
  rsync_opts+=(-s)
fi

ok=0; skip=0; fail=0
for d in "${remote_dirs[@]}"; do
  [ -n "$d" ] || continue
  if [ -d "$d" ]; then
    # Empty or failed du output counts as size 0, so the entry is retried rather than skipped.
    size=$(du -sb -- "$d" 2>/dev/null | cut -f1)
    if [ "${size:-0}" -ge "$MIN_DIR_SIZE_BYTES" ] 2>/dev/null; then
      echo "SKIP $d (already complete)"
      skip=$((skip+1)); continue
    fi
    echo "RETRY $d (partial — rsync will resume in place)"
  fi
  echo "DOWNLOADING $d ..."
  if rsync "${rsync_opts[@]}" -e 'ssh -o ConnectTimeout=15 -o ServerAliveInterval=60 -o ServerAliveCountMax=120' \
    "$REMOTE_ALIAS:$REMOTE_PATH/$d" ./ ; then
    echo "OK $d"; ok=$((ok+1))
  else
    echo "FAIL $d"; fail=$((fail+1))
  fi
done

echo
echo "=== Done === OK: $ok SKIP: $skip FAIL: $fail (of $n_total expected)"
echo "Local dirs now: $(find . -mindepth 1 -maxdepth 1 -type d | wc -l)"
[ "$fail" -eq 0 ] || { echo "Re-run to retry the failed dirs (resumable)."; exit 1; }