opencode-skills-collection 3.1.1 → 3.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +4 -1
  2. package/bundled-skills/2slides-ppt-generator/SKILL.md +8 -7
  3. package/bundled-skills/agent-creator/SKILL.md +246 -0
  4. package/bundled-skills/android-cli/SKILL.md +19 -7
  5. package/bundled-skills/android-ui-journey-testing/SKILL.md +5 -5
  6. package/bundled-skills/apple-notes-search/SKILL.md +12 -2
  7. package/bundled-skills/atlas-ledger/SKILL.md +8 -0
  8. package/bundled-skills/ax-extract-workflow/SKILL.md +156 -0
  9. package/bundled-skills/codex-fable5/SKILL.md +10 -2
  10. package/bundled-skills/competitor-analysis/scripts/gate_candidates.mjs +45 -15
  11. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  12. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  13. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  14. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  15. package/bundled-skills/docs/sources/sources.md +1 -1
  16. package/bundled-skills/docs/users/bundles.md +145 -1
  17. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  18. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  19. package/bundled-skills/docs/users/getting-started.md +1 -1
  20. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  21. package/bundled-skills/docs/users/specialized-plugin-roadmap.md +11 -4
  22. package/bundled-skills/docs/users/usage.md +4 -4
  23. package/bundled-skills/docs/users/visual-guide.md +4 -4
  24. package/bundled-skills/dos-verify-done-claims/SKILL.md +16 -4
  25. package/bundled-skills/ecl-harness-engineer/agents/creator-config.md +1 -1
  26. package/bundled-skills/ecl-harness-engineer/references/environment-config-guide.md +2 -2
  27. package/bundled-skills/ecl-harness-engineer/references/environment-detection-guide.md +4 -4
  28. package/bundled-skills/event-staffing-ordering/SKILL.md +4 -0
  29. package/bundled-skills/loop-library/SKILL.md +7 -4
  30. package/bundled-skills/lovable-cleanup/SKILL.md +11 -8
  31. package/bundled-skills/macos-screen-recorder/SKILL.md +9 -1
  32. package/bundled-skills/mailtrap-managing-contacts/SKILL.md +1 -1
  33. package/bundled-skills/mailtrap-sending-emails/SKILL.md +1 -1
  34. package/bundled-skills/mailtrap-setting-up-sending-domain/SKILL.md +1 -1
  35. package/bundled-skills/remote-gpu-trainer/.gitattributes +8 -0
  36. package/bundled-skills/remote-gpu-trainer/LICENSE +21 -0
  37. package/bundled-skills/remote-gpu-trainer/README.md +267 -0
  38. package/bundled-skills/remote-gpu-trainer/SKILL.md +249 -0
  39. package/bundled-skills/remote-gpu-trainer/evals/README.md +57 -0
  40. package/bundled-skills/remote-gpu-trainer/evals/RESULTS.md +44 -0
  41. package/bundled-skills/remote-gpu-trainer/evals/cases.jsonl +14 -0
  42. package/bundled-skills/remote-gpu-trainer/evals/run_evals.py +68 -0
  43. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/README.md +72 -0
  44. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/queue_1.txt +6 -0
  45. package/bundled-skills/remote-gpu-trainer/profiles/_schema.md +100 -0
  46. package/bundled-skills/remote-gpu-trainer/profiles/autodl.md +327 -0
  47. package/bundled-skills/remote-gpu-trainer/profiles/china.md +397 -0
  48. package/bundled-skills/remote-gpu-trainer/profiles/generic-ssh.md +450 -0
  49. package/bundled-skills/remote-gpu-trainer/profiles/lambda.md +342 -0
  50. package/bundled-skills/remote-gpu-trainer/profiles/paperspace.md +365 -0
  51. package/bundled-skills/remote-gpu-trainer/profiles/runpod.md +164 -0
  52. package/bundled-skills/remote-gpu-trainer/profiles/vastai.md +355 -0
  53. package/bundled-skills/remote-gpu-trainer/references/china-network.md +206 -0
  54. package/bundled-skills/remote-gpu-trainer/references/gotchas_universal.md +704 -0
  55. package/bundled-skills/remote-gpu-trainer/references/lifecycle_checklist.md +148 -0
  56. package/bundled-skills/remote-gpu-trainer/references/monitoring_patterns.md +327 -0
  57. package/bundled-skills/remote-gpu-trainer/references/multinode.md +190 -0
  58. package/bundled-skills/remote-gpu-trainer/references/parallel_ablation.md +196 -0
  59. package/bundled-skills/remote-gpu-trainer/references/principles.md +179 -0
  60. package/bundled-skills/remote-gpu-trainer/references/self-improvement.md +74 -0
  61. package/bundled-skills/remote-gpu-trainer/references/spot-resilience.md +235 -0
  62. package/bundled-skills/remote-gpu-trainer/references/ssh_transport.md +270 -0
  63. package/bundled-skills/remote-gpu-trainer/references/training/by-domain.md +230 -0
  64. package/bundled-skills/remote-gpu-trainer/references/training/checkpoint-resume.md +368 -0
  65. package/bundled-skills/remote-gpu-trainer/references/training/convergence-debugging.md +187 -0
  66. package/bundled-skills/remote-gpu-trainer/references/training/data-pipeline.md +119 -0
  67. package/bundled-skills/remote-gpu-trainer/references/training/distributed-launch.md +422 -0
  68. package/bundled-skills/remote-gpu-trainer/references/training/oom-memory.md +338 -0
  69. package/bundled-skills/remote-gpu-trainer/references/training/precision-stability.md +401 -0
  70. package/bundled-skills/remote-gpu-trainer/references/training/throughput-profiling.md +451 -0
  71. package/bundled-skills/remote-gpu-trainer/scripts/aggregate_to_fs.sh +55 -0
  72. package/bundled-skills/remote-gpu-trainer/scripts/check_staleness.py +70 -0
  73. package/bundled-skills/remote-gpu-trainer/scripts/download_loop.sh +67 -0
  74. package/bundled-skills/remote-gpu-trainer/scripts/gpu_health.sh +169 -0
  75. package/bundled-skills/remote-gpu-trainer/scripts/health_patrol.sh.template +67 -0
  76. package/bundled-skills/remote-gpu-trainer/scripts/mem_monitor.sh +67 -0
  77. package/bundled-skills/remote-gpu-trainer/scripts/reap_vram_zombies.sh +175 -0
  78. package/bundled-skills/remote-gpu-trainer/scripts/run_one.sh.template +104 -0
  79. package/bundled-skills/remote-gpu-trainer/scripts/run_queue.sh.template +83 -0
  80. package/bundled-skills/remote-gpu-trainer/scripts/setup-china-mirrors.sh +35 -0
  81. package/bundled-skills/remote-gpu-trainer/scripts/verify_local.py +145 -0
  82. package/bundled-skills/screenstudio-alt/SKILL.md +9 -1
  83. package/bundled-skills/vibecode-production-qa-validator/SKILL.md +1 -1
  84. package/bundled-skills/youtube-notetaker/scripts/serve.py +63 -14
  85. package/package.json +1 -1
  86. package/skills_index.json +128 -49
@@ -0,0 +1,169 @@
1
+ #!/usr/bin/env bash
2
+ # gpu_health.sh — portable pre-flight GPU-health probe for a rented box (see references/gotchas_universal.md U21-U23).
3
+ #
4
+ # Runs three independent checks and prints ONE PASS / WARN / FAIL summary:
5
+ # 1. live sampling — nvidia-smi dmon over a few seconds (power/util/clocks/mem/temp)
6
+ # 2. Xid scan — dmesg for hardware-failure Xid codes; Xid 48 / 79 are HARD failures
7
+ # 3. throttle scan — SM clock crushed below base while hot, or nvidia-smi throttle reasons
8
+ #
9
+ # Exit codes (so a launch wrapper can react before it pays for GPU-hours):
10
+ # 0 PASS or WARN — safe to launch (WARN = degraded but usable; see stderr notes)
11
+ # 2 HARD FAIL — dead/throttling GPU; re-rent a DIFFERENT box, do not launch here
12
+ #
13
+ # Usage: bash gpu_health.sh [GPU_INDEX] # default 0
14
+ # On a rental there is no "reseat the card" — a HARD fail means stop + re-rent (see references/gotchas_universal.md U21-U23).
15
+ # NEVER an unquoted pipe inside a grep regex (it reads stdin and hangs).
16
+
17
+ set -u
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Tunable constants — every magic number is documented here, no voodoo.
21
+ # ---------------------------------------------------------------------------
22
+ GPU="${1:-0}" # which GPU to probe (nvidia-smi index); first CLI argument, defaults to 0
23
+ SAMPLE_COUNT=5 # dmon sample COUNT (-c N = N one-second samples); 5 samples ~= 5 s,
24
+ # enough to catch a clock dip without burning metered time on a no-op probe.
25
+ TEMP_HOT_C=83 # H100/A100-class throttle onset ~83 °C (U23). At/above this the
26
+ # board down-clocks itself; temp >= 83 °C (compared with -ge) while the SM clock is low
27
+ # is the thermal-throttle signature.
28
+ SM_CLOCK_FLOOR_FRAC=70 # percent of the board's *max* SM clock (clocks.max.sm) below which the SM clock is "crushed".
29
+ # 70% chosen as a conservative gap: boost variance is normal,
30
+ # but a 30%+ drop below max while hot is throttling, not jitter.
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Result accumulators. status escalates PASS -> WARN -> FAIL, never downgrades.
34
+ # ---------------------------------------------------------------------------
35
+ STATUS="PASS"
36
+ NOTES="" # human-readable findings, one per line, emitted to stderr
37
+
38
+ # escalate <LEVEL> <message> — append "[LEVEL] message" to NOTES and ratchet STATUS upward.
39
+ escalate() {
40
+ local severity="$1"
41
+ shift
42
+ NOTES="${NOTES} [${severity}] $*"$'\n'
43
+ # Ratchet only: FAIL always wins; WARN applies unless STATUS is already FAIL; other levels only log.
44
+ case "$severity" in
45
+ FAIL) STATUS="FAIL" ;;
46
+ WARN) [ "$STATUS" = "FAIL" ] || STATUS="WARN" ;;
47
+ esac
48
+ }
49
+
50
+ # ---------------------------------------------------------------------------
51
+ # Pre-flight: nvidia-smi must exist, and the requested GPU index must resolve.
52
+ # ---------------------------------------------------------------------------
53
+ if ! command -v nvidia-smi >/dev/null 2>&1; then
54
+ echo "FAIL: nvidia-smi not found — no NVIDIA driver on this box." >&2
55
+ exit 2
56
+ fi
57
+ if ! nvidia-smi -i "$GPU" -L >/dev/null 2>&1; then
58
+ echo "FAIL: GPU index $GPU does not exist (nvidia-smi -L)." >&2
59
+ exit 2
60
+ fi
61
+
62
+ GPU_NAME="$(nvidia-smi -i "$GPU" --query-gpu=name --format=csv,noheader 2>/dev/null)"
63
+ echo "== gpu_health: GPU $GPU ($GPU_NAME), sampling ${SAMPLE_COUNT}s =="
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # CHECK 1 — live sampling with nvidia-smi dmon.
67
+ # -s pucvmet selects: p=power, u=util(sm/mem), c=clocks(sm/mem), v=power/thermal
68
+ # violations, m=mem usage, e=ECC errors, t=temp. -c N takes N one-second samples.
69
+ # We capture the raw table; later checks parse the peak temp / current SM clock out
70
+ # of the per-GPU query API (more robust than column-slicing dmon across driver versions).
71
+ # ---------------------------------------------------------------------------
72
+ DMON_OUT="$(nvidia-smi dmon -i "$GPU" -s pucvmet -c "$SAMPLE_COUNT" 2>/dev/null || true)"
73
+ if [ -n "$DMON_OUT" ]; then
74
+ echo "$DMON_OUT"
75
+ else
76
+ escalate WARN "dmon produced no samples (old driver?); falling back to point queries."
77
+ fi
78
+
79
+ # Point-in-time query: temperature, current SM clock, and BASE-equivalent reference.
80
+ # query-gpu fields are stable across drivers, unlike dmon column order.
81
+ read -r TEMP_C SM_CUR SM_MAX <<EOF
82
+ $(nvidia-smi -i "$GPU" \
83
+ --query-gpu=temperature.gpu,clocks.current.sm,clocks.max.sm \
84
+ --format=csv,noheader,nounits 2>/dev/null | tr ',' ' ')
85
+ EOF
86
+ TEMP_C="${TEMP_C:-0}"
87
+ SM_CUR="${SM_CUR:-0}"
88
+ SM_MAX="${SM_MAX:-0}"
89
+ echo " temp=${TEMP_C}C sm_clock=${SM_CUR}MHz sm_max=${SM_MAX}MHz"
90
+
91
+ # ---------------------------------------------------------------------------
92
+ # CHECK 2 — Xid hardware-error scan (see references/gotchas_universal.md U21-U23).
93
+ # Xid is the canonical NVIDIA hardware-failure channel in the kernel ring buffer.
94
+ # Xid 48 = double-bit (uncorrectable) ECC -> the GPU is effectively DEAD.
95
+ # Xid 79 = "GPU has fallen off the bus" -> PCIe link lost; board is gone.
96
+ # Other Xids (e.g. 13, 31, 43, 45) are usually app faults, not hardware death -> WARN.
97
+ # dmesg may need root; if it is unreadable we cannot clear the GPU, so WARN (not silent PASS).
98
+ # IMPORTANT: grep alternation is fully quoted — an unquoted '|' would fork a pipe that
99
+ # reads stdin and hangs the probe forever.
100
+ # ---------------------------------------------------------------------------
101
+ if DMESG_OUT="$(dmesg 2>/dev/null)" && [ -n "$DMESG_OUT" ]; then
102
+ # Any Xid line at all is worth surfacing.
103
+ XID_LINES="$(printf '%s\n' "$DMESG_OUT" | grep -iE 'NVRM: Xid' || true)"
104
+ if [ -n "$XID_LINES" ]; then
105
+ # HARD-failure Xid codes. Kernel lines look like "Xid (PCI:0000:3b:00): 79, ..."; the PCI address is HEX (3b, af, ...), so skip the whole "(...)" group instead of assuming digits. Bare "Xid 79" still matches; the code must end at a non-digit so 480/791 do not count.
106
+ HARD_XID="$(printf '%s\n' "$XID_LINES" | grep -iE 'Xid( *\([^)]*\))?:? *(48|79)([^0-9]|$)' || true)"
107
+ if [ -n "$HARD_XID" ]; then
108
+ escalate FAIL "Xid 48/79 detected (dead GPU / off-the-bus): $(printf '%s' "$HARD_XID" | tail -n1)"
109
+ else
110
+ escalate WARN "Non-fatal Xid present (likely app fault): $(printf '%s' "$XID_LINES" | tail -n1)"
111
+ fi
112
+ fi
113
+ else
114
+ escalate WARN "dmesg unreadable (need root?) — cannot rule out an Xid hardware fault. — exit code is non-authoritative; have a human confirm GPU health when dmesg is unreadable."
115
+ fi
116
+
117
+ # ---------------------------------------------------------------------------
118
+ # CHECK 3 — thermal / power throttling (see references/gotchas_universal.md U21-U23).
119
+ # Two independent signatures, either one trips a HARD fail:
120
+ # (a) the kernel-reported clocks-throttle reasons via nvidia-smi -q -d PERFORMANCE
121
+ # (HW thermal slowdown / HW power brake / SW thermal slowdown active = throttling now);
122
+ # (b) heuristic: SM clock crushed below SM_CLOCK_FLOOR_FRAC% of sm_max WHILE temp >= 83 °C
123
+ # — the classic "same code slower than yesterday" silent 25–40% loss.
124
+ # On a shared rental the cooling cannot be fixed, so confirmed throttling => re-rent.
125
+ # ---------------------------------------------------------------------------
126
+ PERF_OUT="$(nvidia-smi -i "$GPU" -q -d PERFORMANCE 2>/dev/null || true)"
127
+ # Look ONLY for reasons reported "Active" — the static list is always present.
128
+ # Quoted alternation again: never an unquoted pipe in the regex.
129
+ THROTTLE_ACTIVE="$(printf '%s\n' "$PERF_OUT" \
130
+ | grep -iE 'slowdown|power brake|hw thermal|sw thermal' \
131
+ | grep -i 'active' \
132
+ | grep -iv ': not active' || true)"
133
+ if [ -n "$THROTTLE_ACTIVE" ]; then
134
+ escalate FAIL "nvidia-smi reports active throttling: $(printf '%s' "$THROTTLE_ACTIVE" | tr -s ' ' | tail -n1)"
135
+ fi
136
+
137
+ # Heuristic clock-vs-temp check — only meaningful when we read real numbers.
138
+ # Integer math only (clocks are whole MHz); guards against a zero or non-numeric sm_max.
139
+ if [ "$SM_MAX" -gt 0 ] 2>/dev/null; then
140
+ SM_FLOOR=$(( SM_MAX * SM_CLOCK_FLOOR_FRAC / 100 )) # SM_CLOCK_FLOOR_FRAC% of max = "crushed" threshold
141
+ if { [ "$SM_CUR" -lt "$SM_FLOOR" ] && [ "$TEMP_C" -ge "$TEMP_HOT_C" ]; } 2>/dev/null; then
142
+ escalate FAIL "thermal throttle: sm_clock ${SM_CUR}MHz < ${SM_FLOOR}MHz (${SM_CLOCK_FLOOR_FRAC}% of max) while temp ${TEMP_C}C >= ${TEMP_HOT_C}C"
143
+ elif [ "$TEMP_C" -ge "$TEMP_HOT_C" ] 2>/dev/null; then
144
+ # Hot but clock still high (or SM clock unreadable): borderline, warn so the caller watches it.
145
+ escalate WARN "running hot (${TEMP_C}C >= ${TEMP_HOT_C}C) but SM clock not yet crushed — watch for throttling."
146
+ fi
147
+ fi
148
+
149
+ # ---------------------------------------------------------------------------
150
+ # Summary + exit. HARD fail => exit 2 so a wrapper aborts the launch.
151
+ # ---------------------------------------------------------------------------
152
+ echo "------------------------------------------------------------"
153
+ if [ -n "$NOTES" ]; then
154
+ printf 'findings:\n%s' "$NOTES" >&2
155
+ fi
156
+ case "$STATUS" in
157
+ FAIL)
158
+ echo "RESULT: FAIL — GPU $GPU is unhealthy. Stop this instance and re-rent a different box."
159
+ exit 2
160
+ ;;
161
+ WARN)
162
+ echo "RESULT: WARN — GPU $GPU usable but degraded; review findings above before a long run."
163
+ exit 0
164
+ ;;
165
+ *)
166
+ echo "RESULT: PASS — GPU $GPU healthy (no Xid, no throttling, clocks nominal)."
167
+ exit 0
168
+ ;;
169
+ esac
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env bash
2
+ # health_patrol.sh.template — ONE read-only patrol tick for a detached remote GPU job.
3
+ #
4
+ # Fire on a cadence from the host's recurring runner (Claude Code `/loop 30m`; cron
5
+ # `3,33 * * * *` — offset off :00/:30 to dodge platform load spikes; Codex/Cursor
6
+ # Automations → references/monitoring_patterns.md §7). This is the §3 **L2 patrol** body:
7
+ # one combined ssh round-trip → a decision → a 3-5 line report EVEN IF nothing changed.
8
+ #
9
+ # READ-ONLY: never edits, restarts, or deletes anything (principles #6/#9). Watches
10
+ # LIVENESS only; to make the RESULT outlive the box, pair with an on-box L1 self-
11
+ # completion chain (§3 L1). Exit 0 = healthy / in-progress / cleanly done;
12
+ # exit 1 = ESCALATE ("崩了") so the loop/cron surfaces the tick loudly.
13
+ set -u
14
+
15
+ # ── PROFILE BLOCK — bind from profiles/<platform>.md §8 SCRIPT OVERRIDES ────────────────
16
+ HOST="${HOST:-autodl-1}" # ssh alias (profile §1)
17
+ RUN_GLOB="${RUN_GLOB:-scripts.train}" # pgrep -af pattern for the train process
18
+ RESULT_DIR="${RESULT_DIR:-/root/autodl-tmp/runs/results}" # dir holding one file per finished cell
19
+ RUN_LOG="${RUN_LOG:-/root/autodl-tmp/runs/logs/train.log}" # the PER-RUN log (NOT a tee'd master — see ‡)
20
+ DATA_MOUNT="${DATA_MOUNT:-/root/autodl-tmp}" # disk to watch (bytes AND inodes)
21
+ N_TOTAL="${N_TOTAL:-0}" # expected cell count (0 = don't grade completion)
22
+ DISK_PCT_MAX="${DISK_PCT_MAX:-95}" # escalate when used% (bytes or inodes) >= this
23
+
24
+ # ── ONE combined READ-ONLY round-trip (quoted heredoc → sent verbatim; args arrive as $1..$4 via bash -s). Prints KEY=value lines; a failed ssh exits 1 ──
25
+ OUT="$(ssh -o ConnectTimeout=15 -o ServerAliveInterval=10 -o ServerAliveCountMax=3 "$HOST" \
26
+ bash -s "$RUN_GLOB" "$RESULT_DIR" "$RUN_LOG" "$DATA_MOUNT" <<'REMOTE'
27
+ set -u
28
+ RUN_GLOB=$1; RESULT_DIR=$2; RUN_LOG=$3; DATA_MOUNT=$4
29
+ RESULT_GLOB='*.json' # CUSTOMIZE: one file per finished cell
30
+ CRASH_RE='Traceback|Error|CUDA out of memory|OutOfMemory|Killed' # CUSTOMIZE; QUOTED → | is alternation, never a pipe
31
+ echo "ALIVE=$(pgrep -af "$RUN_GLOB" 2>/dev/null | grep -v grep | grep -vF -- "bash -s $RUN_GLOB" | wc -l)" # drop this wrapper + its subshells: their argv ("bash -s <RUN_GLOB> …") matches pgrep -f, so ALIVE could otherwise never reach 0
32
+ echo "DONE=$(ls "$RESULT_DIR"/$RESULT_GLOB 2>/dev/null | wc -l)"
33
+ echo "EPOCH=$(grep -hoE 'Epoch[ =:]*[0-9]+(/[0-9]+)?' "$RUN_LOG" 2>/dev/null | tail -1)"
34
+ echo "CRASH=$(grep -hE "$CRASH_RE" "$RUN_LOG" 2>/dev/null | wc -l)" # ‡ scope to per-run log, never run_all.out (§2)
35
+ echo "DISK=$(df -h "$DATA_MOUNT" 2>/dev/null | awk 'NR==2{print $5}')"
36
+ echo "INODE=$(df -i "$DATA_MOUNT" 2>/dev/null | awk 'NR==2{print $5}')"
37
+ REMOTE
38
+ )" || { echo "PATROL $HOST: ssh FAILED — only YOU can see the console (balance / power / preemption). Check it."; exit 1; }
39
+
40
+ # ── parse ──
41
+ g(){ printf '%s\n' "$OUT" | sed -n "s/^$1=//p"; }
42
+ ALIVE=$(g ALIVE); DONE=$(g DONE); EPOCH=$(g EPOCH); CRASH=$(g CRASH); DISK=$(g DISK); INODE=$(g INODE)
43
+ dp=${DISK%\%}; ip=${INODE%\%}
44
+
45
+ # ── always report, even if nothing changed (§3-L2) ──
46
+ echo "PATROL $HOST: proc=${ALIVE:-?} done=${DONE:-?}/${N_TOTAL} epoch=${EPOCH:-n/a} disk=${DISK:-?} inode=${INODE:-?}"
47
+
48
+ # ── escalate? crash signature / disk / inode / process-gone-while-incomplete ──
49
+ esc=0; why=""
50
+ [ "${CRASH:-0}" -gt 0 ] 2>/dev/null && { esc=1; why="crash x${CRASH} in $(basename "$RUN_LOG")"; }
51
+ [ "${dp:-0}" -ge "$DISK_PCT_MAX" ] 2>/dev/null && { esc=1; why="${why:+$why; }disk ${DISK}"; }
52
+ [ "${ip:-0}" -ge "$DISK_PCT_MAX" ] 2>/dev/null && { esc=1; why="${why:+$why; }inodes ${INODE}"; }
53
+ if [ "${ALIVE:-0}" -eq 0 ] 2>/dev/null && [ "${N_TOTAL:-0}" -gt 0 ] && [ "${DONE:-0}" -lt "$N_TOTAL" ] 2>/dev/null; then
54
+ esc=1; why="${why:+$why; }process gone at ${DONE}/${N_TOTAL} (incomplete)"
55
+ fi
56
+
57
+ if [ "$esc" -eq 1 ]; then
58
+ echo "PATROL: 崩了 — ${why}. Triage: ssh $HOST \"grep -B2 -A20 -E 'Traceback' '$RUN_LOG' | head -50\" (§6). Do NOT blind-restart; classify → fixed remediation."
59
+ exit 1
60
+ fi
61
+ if [ "${N_TOTAL:-0}" -gt 0 ] && [ "${DONE:-0}" -ge "$N_TOTAL" ] 2>/dev/null && [ "${ALIVE:-0}" -eq 0 ] 2>/dev/null; then
62
+ echo "PATROL: all ${N_TOTAL} done, process exited → load-verify + pull, THEN teardown (SKILL.md Phase 5 Iron Law)."; exit 0
63
+ fi
64
+ if [ "${ALIVE:-0}" -eq 0 ] 2>/dev/null && [ "${N_TOTAL:-0}" -eq 0 ]; then
65
+ echo "PATROL: process not running and completion-grading off — set N_TOTAL to auto-classify, or verify by hand."; exit 0
66
+ fi
67
+ echo "PATROL: healthy / in progress — nothing to do."; exit 0
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env bash
2
+ # 5-second resolution memory + CPU + GPU profiler for AutoDL training.
3
+ # Catches val-phase memory spikes that can cgroup-wedge an instance.
4
+ #
5
+ # Usage: bash mem_monitor.sh > /root/autodl-tmp/runs/logs/mem.tsv 2>&1 &
6
+ # Run in tmux session (separate from training tmux).
7
+ #
8
+ # Output: TSV with columns:
9
+ # timestamp cgroup_gb cpu_pct main_pid main_rss_gb main_threads main_fds n_python total_python_rss_gb wandb_pid wandb_rss_gb gpu_util_pct gpu_mem_mb
10
+
11
+ set -u
12
+
13
+ # Which training process to track for the "main" RSS columns. Override to match your launcher's
14
+ # `pgrep -f` pattern, e.g. TRAIN_PROC=train.py or TRAIN_PROC=accelerate (default: src.train).
15
+ TRAIN_PROC="${TRAIN_PROC:-src.train}"
16
+
17
+ # Header
18
+ printf "timestamp\tcgroup_gb\tcpu_pct\tmain_pid\tmain_rss_gb\tmain_threads\tmain_fds\tn_python\ttotal_python_rss_gb\twandb_pid\twandb_rss_gb\tgpu_util_pct\tgpu_mem_mb\n"
19
+
20
+ while true; do
21
+ ts=$(date '+%Y-%m-%d %H:%M:%S')
22
+
23
+ # cgroup current memory (bytes → GB)
24
+ cgroup_bytes=$(cat /sys/fs/cgroup/memory.current 2>/dev/null || echo 0)
25
+ cgroup_gb=$(awk "BEGIN{printf \"%.2f\", $cgroup_bytes/1073741824}")
26
+
27
+ # Total CPU usage from ONE batch-mode `top` snapshot (not /proc/stat): sums fields 2 and 4 of the "Cpu(s)" line — presumably user + system %; field layout depends on the top build, TODO confirm.
28
+ cpu_pct=$(top -bn1 | grep "Cpu(s)" | awk '{print $2+$4}')
29
+
30
+ # Main training python PID + RSS (pattern overridable via $TRAIN_PROC)
31
+ main_pid=$(pgrep -f "$TRAIN_PROC" | head -1)
32
+ if [ -n "$main_pid" ]; then
33
+ main_rss=$(awk '/VmRSS/ {print $2}' /proc/$main_pid/status 2>/dev/null || echo 0)
34
+ main_rss_gb=$(awk "BEGIN{printf \"%.2f\", $main_rss/1048576}")
35
+ main_threads=$(awk '/Threads/ {print $2}' /proc/$main_pid/status 2>/dev/null || echo 0)
36
+ main_fds=$(ls /proc/$main_pid/fd 2>/dev/null | wc -l)
37
+ else
38
+ main_pid=0; main_rss_gb=0; main_threads=0; main_fds=0
39
+ fi
40
+
41
+ # All python processes total RSS
42
+ n_python=$(pgrep -f python | wc -l)
43
+ total_python_rss_kb=$(ps -eo rss,comm | awk '$2 ~ /python/ {sum+=$1} END {print sum+0}')
44
+ total_python_rss_gb=$(awk "BEGIN{printf \"%.2f\", $total_python_rss_kb/1048576}")
45
+
46
+ # wandb process
47
+ wandb_pid=$(pgrep -f wandb-service | head -1)
48
+ if [ -n "$wandb_pid" ]; then
49
+ wandb_rss=$(awk '/VmRSS/ {print $2}' /proc/$wandb_pid/status 2>/dev/null || echo 0)
50
+ wandb_rss_gb=$(awk "BEGIN{printf \"%.2f\", $wandb_rss/1048576}")
51
+ else
52
+ wandb_pid=0; wandb_rss_gb=0
53
+ fi
54
+
55
+ # GPU util + memory
56
+ gpu_info=$(nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv,noheader,nounits 2>/dev/null | head -1)
57
+ gpu_util=$(echo "$gpu_info" | cut -d',' -f1 | tr -d ' ')
58
+ gpu_mem=$(echo "$gpu_info" | cut -d',' -f2 | tr -d ' ')
59
+ gpu_util=${gpu_util:-0}
60
+ gpu_mem=${gpu_mem:-0}
61
+
62
+ printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
63
+ "$ts" "$cgroup_gb" "$cpu_pct" "$main_pid" "$main_rss_gb" "$main_threads" "$main_fds" \
64
+ "$n_python" "$total_python_rss_gb" "$wandb_pid" "$wandb_rss_gb" "$gpu_util" "$gpu_mem"
65
+
66
+ sleep 5
67
+ done
@@ -0,0 +1,175 @@
1
+ #!/usr/bin/env bash
2
+ # reap_vram_zombies.sh — find (and optionally kill) PIDs holding VRAM that the
3
+ # nvidia-smi process list cannot see (see references/gotchas_universal.md U11).
4
+ #
5
+ # After a crashed DDP run or a killed container, a process can keep a CUDA context
6
+ # (and its VRAM) alive while NOT appearing in `nvidia-smi`'s process table — so a
7
+ # fresh job OOMs on an "empty" GPU. Such holders DO still have the /dev/nvidia*
8
+ # device files open, so fuser/lsof can find them when nvidia-smi cannot.
9
+ #
10
+ # Strategy:
11
+ # 1. enumerate every PID with /dev/nvidia* open (fuser -v, lsof fallback)
12
+ # 2. subtract the PIDs nvidia-smi already accounts for (those are live, visible jobs)
13
+ # 3. of the remainder, flag any that is idle (~0% GPU util) and has lived past a timeout
14
+ # 4. DRY-RUN by default: print candidates only. --force is required to kill -9.
15
+ #
16
+ # Usage:
17
+ # bash reap_vram_zombies.sh # dry-run: list zombie candidates, kill nothing
18
+ # bash reap_vram_zombies.sh --force # actually kill -9 the flagged candidates
19
+ #
20
+ # A DRY-RUN exits 0 and never touches a process. Killing is destructive:
21
+ # it is gated behind an explicit --force so the orchestrator never auto-reaps.
22
+ # If the holder is inside another container, kill -9 from the host may not clear it —
23
+ # restart that container instead.
24
+ # NEVER an unquoted pipe inside a grep regex (it reads stdin and hangs forever).
25
+
26
+ set -u
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # Tunable constants — documented, no magic numbers buried in logic.
30
+ # ---------------------------------------------------------------------------
31
+ FORCE=0 # 0 = dry-run (default), 1 = actually kill. Set by --force.
32
+ MIN_AGE_SECS=120 # only reap a holder that has lived > 2 min. A genuinely new
33
+ # process may briefly hold a context while warming up; 2 min
34
+ # is well past CUDA-context init, so survivors are stragglers.
35
+ IDLE_UTIL_PCT=5 # intended "idle" ceiling: per-process GPU util <= 5%. NOTE(review): never read
36
+ # anywhere else in this script — a holder invisible to nvidia-smi is simply assumed idle.
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Arg parse — only --force is recognized; anything else is a usage error.
40
+ # ---------------------------------------------------------------------------
41
+ for arg in "$@"; do
42
+ case "$arg" in
43
+ --force) FORCE=1 ;;
44
+ -h|--help)
45
+ echo "usage: bash reap_vram_zombies.sh [--force]" >&2
46
+ echo " (default is a dry-run; --force enables kill -9)" >&2
47
+ exit 0
48
+ ;;
49
+ *)
50
+ echo "unknown argument: $arg (only --force is supported)" >&2
51
+ exit 64 # EX_USAGE
52
+ ;;
53
+ esac
54
+ done
55
+
56
+ if ! command -v nvidia-smi >/dev/null 2>&1; then
57
+ echo "nvidia-smi not found — no NVIDIA driver on this box." >&2
58
+ exit 1
59
+ fi
60
+
61
+ # ---------------------------------------------------------------------------
62
+ # Step 1 — enumerate PIDs holding /dev/nvidia* open.
63
+ # fuser prints PIDs (mode letters attached, e.g. "12345m"); strip non-digits.
64
+ # lsof is the fallback when fuser is absent. Expand /dev/nvidia* to only the real
65
+ # device nodes first: with no NVIDIA driver the glob matches nothing, and passing
66
+ # the literal "/dev/nvidia*" to fuser/lsof would otherwise error and mislead.
67
+ # ---------------------------------------------------------------------------
68
+ collect_dev_holders() {
69
+ local holder_pids="" node
70
+ local nodes=()
71
+ for node in /dev/nvidia*; do [ ! -e "$node" ] || nodes+=("$node"); done
72
+ if [ "${#nodes[@]}" -eq 0 ]; then
73
+ echo "no /dev/nvidia* device nodes present — cannot enumerate device holders." >&2
74
+ return 1
75
+ fi
76
+ # Prefer fuser: its stdout carries the PID list while the verbose table goes to
77
+ # stderr (discarded here). lsof -t, one PID per line, is the fallback.
78
+ if command -v fuser >/dev/null 2>&1; then
79
+ holder_pids="$(fuser "${nodes[@]}" 2>/dev/null || true)"
80
+ elif command -v lsof >/dev/null 2>&1; then
81
+ holder_pids="$(lsof -t "${nodes[@]}" 2>/dev/null || true)"
82
+ else
83
+ echo "neither fuser nor lsof is available — cannot enumerate device holders." >&2
84
+ return 1
85
+ fi
86
+ # Squeeze every run of non-digit characters (newlines, fuser's mode letters)
87
+ # into a single space, leaving whitespace-separated bare PIDs on stdout.
88
+ tr -cs '0-9' ' ' <<<"$holder_pids"
89
+ }
90
+
91
+ DEV_HOLDERS="$(collect_dev_holders)" || exit 1
92
+ DEV_HOLDERS="$(printf '%s\n' "$DEV_HOLDERS" | tr ' ' '\n' | grep -E '^[0-9]+$' || true)"
93
+
94
+ if [ -z "$DEV_HOLDERS" ]; then
95
+ echo "RESULT: clean — no process is holding /dev/nvidia* open."
96
+ exit 0
97
+ fi
98
+
99
+ # ---------------------------------------------------------------------------
100
+ # Step 2 — PIDs nvidia-smi already accounts for. These are visible, legitimate
101
+ # jobs; never reap them. (Empty when the zombie is the ONLY holder — the U11 case.)
102
+ # ---------------------------------------------------------------------------
103
+ VISIBLE_PIDS="$(nvidia-smi --query-compute-apps=pid --format=csv,noheader 2>/dev/null \
104
+ | grep -E '^[0-9]+$' || true)"
105
+
106
+ # is_visible <pid> — exit status 0 when <pid> is a whole line of VISIBLE_PIDS (nvidia-smi's compute-app list).
107
+ is_visible() {
108
+ local candidate="$1"
109
+ grep -qx "$candidate" <<<"$VISIBLE_PIDS"
110
+ }
111
+
112
+ # ---------------------------------------------------------------------------
113
+ # Step 3 — classify each remaining holder. A candidate is a holder that is
114
+ # (a) NOT in nvidia-smi's list, (b) older than MIN_AGE_SECS, (c) ~idle on the GPU.
115
+ # Process age comes from `ps -o etimes` (elapsed seconds, integer, portable).
116
+ # ---------------------------------------------------------------------------
117
+ CANDIDATES=""
118
+ echo "== reap_vram_zombies: scanning $(printf '%s' "$DEV_HOLDERS" | tr '\n' ' ')=="
119
+ for pid in $DEV_HOLDERS; do
120
+ # Skip any PID that vanished mid-scan.
121
+ [ -d "/proc/$pid" ] || continue
122
+ CMD="$(ps -o comm= -p "$pid" 2>/dev/null || true)"
123
+ AGE="$(ps -o etimes= -p "$pid" 2>/dev/null | tr -d ' ' || true)"
124
+ AGE="${AGE:-0}"
125
+ # Driver/display daemons keep /dev/nvidia* open by design, are long-lived, and are never compute apps,
126
+ # so they would otherwise be flagged and kill -9'd under --force. comm is truncated to 15 chars, hence the glob.
127
+ case "$CMD" in
128
+ nvidia-persiste*|nv-hostengine|Xorg)
129
+ echo " pid $pid ($CMD): driver/display daemon, not a job — skip."; continue ;;
130
+ esac
131
+ if is_visible "$pid"; then
132
+ echo " pid $pid ($CMD): visible to nvidia-smi — live job, skip."
133
+ continue
134
+ fi
135
+ if [ "$AGE" -lt "$MIN_AGE_SECS" ] 2>/dev/null; then
136
+ echo " pid $pid ($CMD): age ${AGE}s < ${MIN_AGE_SECS}s — too young, skip (may be warming up)."
137
+ continue
138
+ fi
139
+ # This PID holds /dev/nvidia*, is invisible to nvidia-smi, and is old. nvidia-smi cannot report a
140
+ # per-process util for an unlisted PID, so by the U11 definition (held VRAM + invisible) it is treated as idle.
141
+ echo " pid $pid ($CMD): age ${AGE}s, holds VRAM, INVISIBLE to nvidia-smi -> ZOMBIE candidate."
142
+ CANDIDATES="${CANDIDATES}${pid} "
143
+ done
144
+
145
+ CANDIDATES="$(printf '%s' "$CANDIDATES" | tr -s ' ' )"
146
+ CANDIDATES="${CANDIDATES# }"; CANDIDATES="${CANDIDATES% }"
147
+
148
+ # ---------------------------------------------------------------------------
149
+ # Step 4 — act. Dry-run prints and exits; --force kills -9.
150
+ # ---------------------------------------------------------------------------
151
+ echo "------------------------------------------------------------"
152
+ if [ -z "$CANDIDATES" ]; then
153
+ echo "RESULT: clean — holders exist but none qualifies as a zombie (all visible/young)."
154
+ exit 0
155
+ fi
156
+
157
+ echo "zombie VRAM holders: $CANDIDATES"
158
+ if [ "$FORCE" -ne 1 ]; then
159
+ echo "RESULT: DRY-RUN — nothing killed. Re-run with --force to 'kill -9' the PIDs above."
160
+ echo " (If a holder lives inside another container, restart that container instead.)"
161
+ exit 0
162
+ fi
163
+
164
+ # --force path: kill each candidate, report per-PID outcome.
165
+ RC=0
166
+ for pid in $CANDIDATES; do
167
+ if kill -9 "$pid" 2>/dev/null; then
168
+ echo "killed -9 $pid"
169
+ else
170
+ echo "FAILED to kill $pid (gone already, or owned by another container)." >&2
171
+ RC=1
172
+ fi
173
+ done
174
+ echo "RESULT: reaped zombie VRAM holders (--force)."
175
+ exit "$RC"
@@ -0,0 +1,104 @@
1
#!/usr/bin/env bash
# Per-job (per-ablation) wrapper — platform-agnostic skeleton.
#
# Parameterize the PROFILE BLOCK below from your platform profile's "SCRIPT OVERRIDES"
# section (profiles/<platform>.md §8). The defaults shown are AutoDL's.
#
# Mandatory: run the network-acceleration hook before any external call (wandb / HF / pip / git).
# On AutoDL that is `source /etc/network_turbo`; on a clean box set PROXY_HOOK=":" (a no-op).
# Without the right hook, wandb.init can hang and a flaky link can drop already-uploaded cloud
# runs — see references/gotchas_universal.md and references/china-network.md.
#
# Usage: ./run_one.sh <config_yaml> <task> [epochs]
#   config_yaml  passed to the trainer via -c; its basename minus ".yaml" names the run, the
#                log file and the checkpoint directory. The trainer is started after
#                `cd "$PROJECT_REPO_DIR"`, so a relative path is resolved from there.
#   task         passed to the trainer via --task; also prefixes the tracker group and tags.
#   epochs       optional; defaults to 20.
#
# Exit status: 1 on a usage error or when PROJECT_REPO_DIR cannot be entered; otherwise the
# trainer's own exit status. A failed durable sync is reported but does not change it.
#
# Disclose every CLI override applied below in any paper's Implementation Details — reproducibility
# depends on the list being complete (the yaml/source stay untouched). See references/gotchas_universal.md.
set -u

# Arg-count guard FIRST — under `set -u`, CFG="$1" below would abort with an unbound-variable
# error (and no usage hint) when run with no args. Fail with a readable usage line instead.
if [ "$#" -lt 2 ]; then
  echo "usage: $0 <config_yaml> <task> [epochs]" >&2
  exit 1
fi

# ===== PROFILE BLOCK — override from profiles/<platform>.md §8 (defaults = AutoDL) =====
# PROJECT_REPO_DIR and DATA_DIR use ':-' — an empty path is never valid, so empty falls back
# to the default. DURABLE_DIR, PROXY_HOOK and CRED_FILE use '-' (default only when UNSET) so
# that an explicit "" is honored as "disabled"; with ':-' an empty value was silently replaced
# by the default and the documented "" settings below could never take effect.
PROJECT_REPO_DIR="${PROJECT_REPO_DIR:-/root/PROJECT_NAME}"   # where your code lives on the instance
DATA_DIR="${DATA_DIR:-/root/autodl-tmp}"                     # fast per-instance scratch (checkpoints land here)
DURABLE_DIR="${DURABLE_DIR-/root/autodl-fs}"                 # survives teardown (profile survival matrix); set "" to skip sync
PROXY_HOOK="${PROXY_HOOK-source /etc/network_turbo}"         # network-accel hook; ":" or "" (no-op) on a clean box
CRED_FILE="${CRED_FILE-/root/.wandb_key}"                    # file holding the tracker key; "" if WANDB_API_KEY already in env
# =======================================================================================

# PROXY_HOOK is an OPERATOR-supplied snippet from your platform profile (e.g. `source /etc/network_turbo`,
# `module load cuda`, or empty). It is eval'd intentionally so a profile can run an arbitrary setup hook —
# set it ONLY from your own trusted profile, never from untrusted or remote-derived input.
#
# Nounset is relaxed while third-party shell code runs: an unbound variable inside the hook or
# inside conda's scripts would abort this non-interactive shell, and because their stderr is
# silenced the abort would leave no message behind.
set +u
if ! eval "${PROXY_HOOK}" 2>/dev/null; then
  # Best-effort by design (do not abort), but say so: the header calls this hook mandatory.
  echo "warning: PROXY_HOOK failed (${PROXY_HOOK}); continuing without it — external calls may hang" >&2
fi
# The prebuilt base IS the env on most rentals (do not conda create). Activate if present.
source /root/miniconda3/etc/profile.d/conda.sh 2>/dev/null && conda activate base 2>/dev/null || true
set -u

# Load the tracker key from CRED_FILE when one is configured and present. Read first, export
# second: `export VAR="$(cmd)"` hides a failed read and would export an empty key over a
# WANDB_API_KEY that is already in the environment.
if [ -n "$CRED_FILE" ] && [ -f "$CRED_FILE" ]; then
  if CRED_VALUE="$(cat "$CRED_FILE")" && [ -n "$CRED_VALUE" ]; then
    export WANDB_API_KEY="$CRED_VALUE"
  else
    echo "warning: no tracker key could be read from $CRED_FILE; WANDB_API_KEY left unchanged" >&2
  fi
  unset CRED_VALUE
fi

export WANDB_MODE="${WANDB_MODE:-online}"   # offline without a key => W&B silently DISABLED (gotchas_universal)
export WANDB_START_METHOD=thread
export PYTHONUNBUFFERED=1

CKPT_ROOT="$DATA_DIR/checkpoints"
LOG_DIR="$DATA_DIR/runs/logs"
mkdir -p "$DATA_DIR/wandb" "$LOG_DIR" "$CKPT_ROOT"

cd "${PROJECT_REPO_DIR}" || { echo "PROJECT_REPO_DIR not found: $PROJECT_REPO_DIR" >&2; exit 1; }

CFG="$1"
TASK="$2"
EPOCHS="${3:-20}"
NAME="$(basename -- "$CFG" .yaml)"

# CUSTOMIZE: classify the ablation by name pattern -> tracker group + tags (example scheme; extend freely)
# Arms are tried top to bottom and the first match wins, so keep specific patterns above `*)`.
case "$NAME" in
  aug_*|seg_aug_*|det_aug_*)                    GRP="${TASK}_aug";       TAGS="[$TASK,aug]" ;;
  *_no_*)                                       GRP="${TASK}_module";    TAGS="[$TASK,module]" ;;
  precision_*|seg_precision_*|det_precision_*)  GRP="${TASK}_precision"; TAGS="[$TASK,precision]" ;;
  *mask_*)                                      GRP="${TASK}_rate";      TAGS="[$TASK,rate]" ;;
  baseline_*)                                   GRP="${TASK}_baseline";  TAGS="[$TASK,baseline]" ;;
  *)                                            GRP="${TASK}_other";     TAGS="[$TASK,other]" ;;
esac

CKPT_DIR="$CKPT_ROOT/$NAME"
mkdir -p "$CKPT_DIR"

# CUSTOMIZE: replace `src.train` with your project's training entrypoint module + its override flags
python -m src.train --no-strict \
  -o wandb.group="$GRP" \
  -o wandb.tags="$TAGS" \
  -o data.num_workers=2 \
  -o data.pin_memory=False \
  -o training.val_metric_sample_cap=256 \
  -o training.checkpoint_dir="$CKPT_DIR" \
  -c "$CFG" --task "$TASK" --epochs "$EPOCHS" \
  --experiment-name "abla_$NAME" \
  2>&1 | tee "$LOG_DIR/$NAME.log"

# Must be read immediately after the pipeline: PIPESTATUS[0] is the trainer's status, whereas
# $? would be tee's.
EXIT=${PIPESTATUS[0]}

# Post-success: keep best.pth only, prune scratch latest.pth (disk-budget, principle #5).
if [ "$EXIT" -eq 0 ] && [ -f "$CKPT_DIR/best.pth" ]; then
  rm -f "$CKPT_DIR/latest.pth"
  echo "[$(date +%H:%M:%S)] kept best.pth, pruned latest.pth for $NAME"
fi

# Auto-sync to durable storage. GATE the success line on the actual copy result — an unconditional
# "synced" echo lies when the durable FS is full / inode-exhausted (references/gotchas_universal.md,
# silent-sync). Verify best.pth landed before claiming success (principle #3). Skip if DURABLE_DIR="".
if [ "$EXIT" -eq 0 ] && [ -f "$CKPT_DIR/best.pth" ] && [ -n "$DURABLE_DIR" ]; then
  FS_DIR="$DURABLE_DIR/final_ckpts/$NAME"
  if mkdir -p "$FS_DIR" && cp -f "$CKPT_DIR/best.pth" "$FS_DIR/" && [ -f "$FS_DIR/best.pth" ]; then
    # Optional artifacts: copied when present, never allowed to fail the sync.
    cp -f "$CKPT_DIR/best_metrics.json" "$FS_DIR/" 2>/dev/null || true
    cp -rf "$CKPT_DIR/protocol" "$FS_DIR/" 2>/dev/null || true
    cp -f "$LOG_DIR/$NAME.log" "$FS_DIR/" 2>/dev/null || true
    echo "[$(date +%H:%M:%S)] synced $NAME to durable storage ($FS_DIR)"
  else
    echo "[$(date +%H:%M:%S)] !! DURABLE SYNC FAILED for $NAME — check 'df -i $DURABLE_DIR'. The data-disk copy is source-of-truth."
  fi
fi

exit "$EXIT"