npm - opencode-skills-collection - Versions diffs - 3.1.2 → 3.1.3 - Mend

opencode-skills-collection 3.1.2 → 3.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

package/bundled-skills/remote-gpu-trainer/scripts/gpu_health.sh ADDED Viewed

@@ -0,0 +1,169 @@
+#!/usr/bin/env bash
+# gpu_health.sh — portable pre-flight GPU-health probe for a rented box (see references/gotchas_universal.md U21-U23).
+#
+# Runs three independent checks and prints ONE PASS / WARN / FAIL summary:
+#   1. live sampling     — nvidia-smi dmon over a few seconds (power/util/clocks/mem/temp)
+#   2. Xid scan          — dmesg for hardware-failure Xid codes; Xid 48 / 79 are HARD failures
+#   3. throttle scan     — SM clock crushed below base while hot, or nvidia-smi throttle reasons
+#
+# Exit codes (so a launch wrapper can react before it pays for GPU-hours):
+#   0  PASS or WARN  — safe to launch (WARN = degraded but usable; see stderr notes)
+#   2  HARD FAIL     — dead/throttling GPU; re-rent a DIFFERENT box, do not launch here
+#
+# Usage:  bash gpu_health.sh [GPU_INDEX]      # default 0
+# On a rental there is no "reseat the card" — a HARD fail means stop + re-rent (see references/gotchas_universal.md U21-U23).
+# NEVER an unquoted pipe inside a grep regex (it reads stdin and hangs).
+set -u
+# ---------------------------------------------------------------------------
+# Tunable constants — every magic number is documented here, no voodoo.
+# ---------------------------------------------------------------------------
+GPU="${1:-0}"            # which GPU to probe (nvidia-smi index)
+SAMPLE_COUNT=5          # dmon sample COUNT (-c N = N one-second samples); 5 samples ~= 5 s,
+                        # enough to catch a clock dip without burning metered time on a no-op probe.
+TEMP_HOT_C=83           # H100/A100-class throttle onset ~83 °C (U23). At/above this the
+                        # board down-clocks itself; sustained >83 °C while SM clock is low
+                        # is the thermal-throttle signature.
+SM_CLOCK_FLOOR_FRAC=70  # treat SM clock < 70% of the board's *base* clock as "crushed".
+                        # 70% chosen as a conservative gap: boost variance is normal,
+                        # but a 30%+ drop below BASE under load is throttling, not jitter.
+# ---------------------------------------------------------------------------
+# Result accumulators. status escalates PASS -> WARN -> FAIL, never downgrades.
+# ---------------------------------------------------------------------------
+STATUS="PASS"
+NOTES=""                # human-readable findings, one per line, emitted to stderr
+# escalate <LEVEL> <message> — raise overall status and record the reason.
+escalate() {
+    local level="$1"; shift
+    NOTES="${NOTES}  [${level}] $*"$'\n'
+    # FAIL beats WARN beats PASS; only ever climb the ladder.
+    if [ "$level" = "FAIL" ]; then
+        STATUS="FAIL"
+    elif [ "$level" = "WARN" ] && [ "$STATUS" != "FAIL" ]; then
+        STATUS="WARN"
+    fi
+}
+# ---------------------------------------------------------------------------
+# Pre-flight: nvidia-smi must exist, and the requested GPU index must resolve.
+# ---------------------------------------------------------------------------
+if ! command -v nvidia-smi >/dev/null 2>&1; then
+    echo "FAIL: nvidia-smi not found — no NVIDIA driver on this box." >&2
+    exit 2
+fi
+if ! nvidia-smi -i "$GPU" -L >/dev/null 2>&1; then
+    echo "FAIL: GPU index $GPU does not exist (nvidia-smi -L)." >&2
+    exit 2
+fi
+GPU_NAME="$(nvidia-smi -i "$GPU" --query-gpu=name --format=csv,noheader 2>/dev/null)"
+echo "== gpu_health: GPU $GPU ($GPU_NAME), sampling ${SAMPLE_COUNT}s =="
+# ---------------------------------------------------------------------------
+# CHECK 1 — live sampling.
+#   `nvidia-smi dmon -s pucvmet`: p=power, u=util(sm/mem), c=clocks(sm/mem),
+#   v=power/thermal violations, m=mem usage, e=ECC errors, t=temp;
+#   -c N takes N one-second samples.
+# The dmon table is only echoed for the operator. The numbers used by later
+# checks come from --query-gpu, whose fields are stable across driver versions
+# (dmon column order is not).
+# ---------------------------------------------------------------------------
+DMON_OUT=$(nvidia-smi dmon -i "$GPU" -s pucvmet -c "$SAMPLE_COUNT" 2>/dev/null || true)
+if [ -z "$DMON_OUT" ]; then
+    escalate WARN "dmon produced no samples (old driver?); falling back to point queries."
+else
+    echo "$DMON_OUT"
+fi
+# Point-in-time query: temperature, current SM clock, maximum SM clock.
+# The CSV commas are turned into spaces so `read` can split the three values.
+POINT_QUERY=$(nvidia-smi -i "$GPU" \
+    --query-gpu=temperature.gpu,clocks.current.sm,clocks.max.sm \
+    --format=csv,noheader,nounits 2>/dev/null | tr ',' ' ')
+read -r TEMP_C SM_CUR SM_MAX <<<"$POINT_QUERY"
+# Any value the query did not supply falls back to 0.
+: "${TEMP_C:=0}" "${SM_CUR:=0}" "${SM_MAX:=0}"
+echo "   temp=${TEMP_C}C  sm_clock=${SM_CUR}MHz  sm_max=${SM_MAX}MHz"
+# ---------------------------------------------------------------------------
+# CHECK 2 — Xid hardware-error scan (see references/gotchas_universal.md U21-U23).
+#   Xid is the canonical NVIDIA hardware-failure channel in the kernel ring buffer.
+#   Xid 48 = double-bit (uncorrectable) ECC  -> the GPU is effectively DEAD.
+#   Xid 79 = "GPU has fallen off the bus"     -> PCIe link lost; board is gone.
+#   Other Xids (e.g. 13, 31, 43, 45) are usually app faults, not hardware death -> WARN.
+# dmesg may need root; if it is unreadable we cannot clear the GPU, so WARN (not silent PASS).
+# IMPORTANT: grep alternation is fully quoted — an unquoted '|' would fork a pipe that
+# reads stdin and hangs the probe forever.
+#
+# Hard-Xid pattern, piece by piece:
+#   Xid                 the literal tag
+#   ( *\([^)]*\))?      optional "(PCI:0000:3b:00)" device field. Its content is skipped
+#                       as a whole, so hex letters in the bus id cannot break the match
+#                       and digits inside it are never mistaken for the code.
+#   [: ]*               the ": " (or bare space in "Xid 79") before the code
+#   (48|79)([^0-9]|$)   the code itself, not followed by another digit (so 480/790 miss)
+# ---------------------------------------------------------------------------
+if DMESG_OUT="$(dmesg 2>/dev/null)" && [ -n "$DMESG_OUT" ]; then
+    # Any Xid line at all is worth surfacing.
+    XID_LINES="$(printf '%s\n' "$DMESG_OUT" | grep -iE 'NVRM: Xid' || true)"
+    if [ -n "$XID_LINES" ]; then
+        HARD_XID="$(printf '%s\n' "$XID_LINES" | grep -iE 'Xid( *\([^)]*\))?[: ]*(48|79)([^0-9]|$)' || true)"
+        if [ -n "$HARD_XID" ]; then
+            # Report the most recent hard-failure line.
+            escalate FAIL "Xid 48/79 detected (dead GPU / off-the-bus): $(printf '%s' "$HARD_XID" | tail -n1)"
+        else
+            escalate WARN "Non-fatal Xid present (likely app fault): $(printf '%s' "$XID_LINES" | tail -n1)"
+        fi
+    fi
+else
+    escalate WARN "dmesg unreadable (need root?) — cannot rule out an Xid hardware fault. — exit code is non-authoritative; have a human confirm GPU health when dmesg is unreadable."
+fi
+# ---------------------------------------------------------------------------
+# CHECK 3 — thermal / power throttling (see references/gotchas_universal.md U21-U23).
+# Two independent signatures, either one trips a HARD fail:
+#   (a) throttle reasons reported as active by `nvidia-smi -q -d PERFORMANCE`
+#       (HW thermal slowdown / HW power brake / SW thermal slowdown);
+#   (b) heuristic: current SM clock below SM_CLOCK_FLOOR_FRAC% of clocks.max.sm
+#       WHILE temp >= TEMP_HOT_C — the "same code slower than yesterday" case.
+#       NOTE: the reference is the MAX SM clock from the point query, not a base clock.
+# On a shared rental the cooling cannot be fixed, so confirmed throttling => re-rent.
+# ---------------------------------------------------------------------------
+PERF_OUT="$(nvidia-smi -i "$GPU" -q -d PERFORMANCE 2>/dev/null || true)"
+# Keep only throttle-reason lines whose value is "Active"; the list of reasons
+# itself is always printed, normally as ": Not Active".
+# Quoted alternation again: never an unquoted pipe in the regex.
+THROTTLE_ACTIVE="$(printf '%s\n' "$PERF_OUT" \
+    | grep -iE 'slowdown|power brake|hw thermal|sw thermal' \
+    | grep -i 'active' \
+    | grep -iv ': not active' || true)"
+if [ -n "$THROTTLE_ACTIVE" ]; then
+    escalate FAIL "nvidia-smi reports active throttling: $(printf '%s' "$THROTTLE_ACTIVE" | tr -s ' ' | tail -n1)"
+fi
+# Heuristic clock-vs-temp check. Integer math only (clocks are whole MHz).
+# Every numeric test carries its own 2>/dev/null so a non-numeric reading
+# (e.g. "[N/A]") makes that test quietly false instead of printing an error.
+if [ "$SM_MAX" -gt 0 ] 2>/dev/null; then
+    SM_FLOOR=$(( SM_MAX * SM_CLOCK_FLOOR_FRAC / 100 ))   # "crushed" threshold in MHz
+    if [ "$TEMP_C" -ge "$TEMP_HOT_C" ] 2>/dev/null; then
+        if [ "$SM_CUR" -lt "$SM_FLOOR" ] 2>/dev/null; then
+            # The message quotes the tunable, so it stays true if the constant is edited.
+            escalate FAIL "thermal throttle: sm_clock ${SM_CUR}MHz < ${SM_FLOOR}MHz (${SM_CLOCK_FLOOR_FRAC}% of max) while temp ${TEMP_C}C >= ${TEMP_HOT_C}C"
+        else
+            # Hot but clock still high (or unreadable): borderline, warn so the caller watches it.
+            escalate WARN "running hot (${TEMP_C}C >= ${TEMP_HOT_C}C) but SM clock not yet crushed — watch for throttling."
+        fi
+    fi
+fi
+# ---------------------------------------------------------------------------
+# Summary + exit code. Findings go to stderr, the one-line verdict to stdout.
+# FAIL exits 2 so a launch wrapper aborts; WARN and PASS both exit 0.
+# ---------------------------------------------------------------------------
+echo "------------------------------------------------------------"
+[ -z "$NOTES" ] || printf 'findings:\n%s' "$NOTES" >&2
+if [ "$STATUS" = "FAIL" ]; then
+    echo "RESULT: FAIL — GPU $GPU is unhealthy. Stop this instance and re-rent a different box."
+    exit 2
+fi
+if [ "$STATUS" = "WARN" ]; then
+    echo "RESULT: WARN — GPU $GPU usable but degraded; review findings above before a long run."
+else
+    echo "RESULT: PASS — GPU $GPU healthy (no Xid, no throttling, clocks nominal)."
+fi
+exit 0

package/bundled-skills/remote-gpu-trainer/scripts/health_patrol.sh.template ADDED Viewed

@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# health_patrol.sh.template — ONE read-only patrol tick for a detached remote GPU job.
+#
+# Fire on a cadence from the host's recurring runner (Claude Code `/loop 30m`; cron
+# `3,33 * * * *` — offset off :00/:30 to dodge platform load spikes; Codex/Cursor
+# Automations → references/monitoring_patterns.md §7). This is the §3 **L2 patrol** body:
+# one combined ssh round-trip → a decision → a 3-5 line report EVEN IF nothing changed.
+#
+# READ-ONLY: never edits, restarts, or deletes anything (principles #6/#9). Watches
+# LIVENESS only; to make the RESULT outlive the box, pair with an on-box L1 self-
+# completion chain (§3 L1). Exit 0 = healthy / in-progress / cleanly done;
+# exit 1 = ESCALATE ("崩了") so the loop/cron surfaces the tick loudly.
+set -u
+# ── PROFILE BLOCK — bind from profiles/<platform>.md §8 SCRIPT OVERRIDES ────────────────
+HOST="${HOST:-autodl-1}"                                    # ssh alias (profile §1)
+RUN_GLOB="${RUN_GLOB:-scripts.train}"                       # pgrep -af pattern for the train process
+RESULT_DIR="${RESULT_DIR:-/root/autodl-tmp/runs/results}"  # dir holding one file per finished cell
+RUN_LOG="${RUN_LOG:-/root/autodl-tmp/runs/logs/train.log}" # the PER-RUN log (NOT a tee'd master — see ‡)
+DATA_MOUNT="${DATA_MOUNT:-/root/autodl-tmp}"               # disk to watch (bytes AND inodes)
+N_TOTAL="${N_TOTAL:-0}"                                     # expected cell count (0 = don't grade completion)
+DISK_PCT_MAX="${DISK_PCT_MAX:-95}"                          # escalate when used% (bytes or inodes) >= this
+# ── ONE combined READ-ONLY round-trip (quoted heredoc → sent verbatim; args via bash -s) ──
+# The remote script prints KEY=value lines (ALIVE, DONE, EPOCH, CRASH, DISK, INODE) that
+# the local side parses. If ssh itself fails, the tick escalates with exit 1.
+OUT="$(ssh -o ConnectTimeout=15 -o ServerAliveInterval=10 -o ServerAliveCountMax=3 "$HOST" \
+        bash -s "$RUN_GLOB" "$RESULT_DIR" "$RUN_LOG" "$DATA_MOUNT" <<'REMOTE'
+set -u
+RUN_GLOB=$1; RESULT_DIR=$2; RUN_LOG=$3; DATA_MOUNT=$4
+RESULT_GLOB='*.json'                                        # CUSTOMIZE: one file per finished cell
+CRASH_RE='Traceback|Error|CUDA out of memory|OutOfMemory|Killed'  # CUSTOMIZE; QUOTED → | is alternation, never a pipe
+# ALIVE: count processes matching RUN_GLOB, EXCLUDING this probe. RUN_GLOB is an argv
+# word of this very "bash -s" invocation, so "pgrep -f" also matches this shell, the
+# wrapper shell sshd may have started it from, and any forked subshell of it. Counting
+# those would keep ALIVE above zero forever and hide a dead training process.
+alive=0
+for p in $(pgrep -f -- "$RUN_GLOB" 2>/dev/null); do
+  [ "$p" = "$$" ] && continue                       # this shell
+  [ "$p" = "$PPID" ] && continue                    # the shell that launched it
+  pp=$(ps -o ppid= -p "$p" 2>/dev/null | tr -d ' ')
+  [ -n "$pp" ] || continue                          # already exited (e.g. a short-lived subshell)
+  [ "$pp" = "$$" ] && continue                      # forked subshell of this probe
+  alive=$((alive + 1))
+done
+echo "ALIVE=$alive"
+# DONE: count glob matches directly rather than parsing ls output.
+done_n=0
+for f in "$RESULT_DIR"/$RESULT_GLOB; do
+  [ -e "$f" ] && done_n=$((done_n + 1))
+done
+echo "DONE=$done_n"
+echo "EPOCH=$(grep -hoE 'Epoch[ =:]*[0-9]+(/[0-9]+)?' "$RUN_LOG" 2>/dev/null | tail -1)"
+echo "CRASH=$(grep -hE "$CRASH_RE" "$RUN_LOG" 2>/dev/null | wc -l)"   # ‡ scope to per-run log, never run_all.out [§2]
+# df -P keeps each filesystem on ONE line even when the device name is long, so
+# "line 2, column 5" is reliably the used-percent column.
+echo "DISK=$(df -P "$DATA_MOUNT" 2>/dev/null | awk 'NR==2{print $5}')"
+echo "INODE=$(df -Pi "$DATA_MOUNT" 2>/dev/null | awk 'NR==2{print $5}')"
+REMOTE
+)" || { echo "PATROL $HOST: ssh FAILED — only YOU can see the console (balance / power / preemption). Check it."; exit 1; }
+# ── parse ──
+# g KEY — print the value part of every "KEY=value" line found in $OUT.
+g() {
+  sed -n "s/^$1=//p" <<<"$OUT"
+}
+ALIVE=$(g ALIVE)
+DONE=$(g DONE)
+EPOCH=$(g EPOCH)
+CRASH=$(g CRASH)
+DISK=$(g DISK)
+INODE=$(g INODE)
+# Drop the trailing "%" so the percentages can be compared as integers.
+dp=${DISK%\%}
+ip=${INODE%\%}
+# ── always report, even if nothing changed (§3-L2) ──
+echo "PATROL $HOST: proc=${ALIVE:-?} done=${DONE:-?}/${N_TOTAL} epoch=${EPOCH:-n/a} disk=${DISK:-?} inode=${INODE:-?}"
+# ── escalate? crash signature / disk / inode / process-gone-while-incomplete ──
+esc=0; why=""
+[ "${CRASH:-0}" -gt 0 ] 2>/dev/null            && { esc=1; why="crash x${CRASH} in $(basename "$RUN_LOG")"; }
+[ "${dp:-0}" -ge "$DISK_PCT_MAX" ] 2>/dev/null && { esc=1; why="${why:+$why; }disk ${DISK}"; }
+[ "${ip:-0}" -ge "$DISK_PCT_MAX" ] 2>/dev/null && { esc=1; why="${why:+$why; }inodes ${INODE}"; }
+if [ "${ALIVE:-0}" -eq 0 ] 2>/dev/null && [ "${N_TOTAL:-0}" -gt 0 ] && [ "${DONE:-0}" -lt "$N_TOTAL" ] 2>/dev/null; then
+  esc=1; why="${why:+$why; }process gone at ${DONE}/${N_TOTAL} (incomplete)"
+fi
+if [ "$esc" -eq 1 ]; then
+  echo "PATROL: 崩了 — ${why}. Triage: ssh $HOST \"grep -B2 -A20 -E 'Traceback' '$RUN_LOG' | head -50\" (§6). Do NOT blind-restart; classify → fixed remediation."
+  exit 1
+fi
+if [ "${N_TOTAL:-0}" -gt 0 ] && [ "${DONE:-0}" -ge "$N_TOTAL" ] 2>/dev/null && [ "${ALIVE:-0}" -eq 0 ] 2>/dev/null; then
+  echo "PATROL: all ${N_TOTAL} done, process exited → load-verify + pull, THEN teardown (SKILL.md Phase 5 Iron Law)."; exit 0
+fi
+if [ "${ALIVE:-0}" -eq 0 ] 2>/dev/null && [ "${N_TOTAL:-0}" -eq 0 ]; then
+  echo "PATROL: process not running and completion-grading off — set N_TOTAL to auto-classify, or verify by hand."; exit 0
+fi
+echo "PATROL: healthy / in progress — nothing to do."; exit 0

package/bundled-skills/remote-gpu-trainer/scripts/mem_monitor.sh ADDED Viewed

@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# 5-second resolution memory + CPU + GPU profiler for AutoDL training.
+# Catches val-phase memory spikes that can cgroup-wedge an instance.
+#
+# Usage: bash mem_monitor.sh > /root/autodl-tmp/runs/logs/mem.tsv 2>&1 &
+#   Run in tmux session (separate from training tmux).
+#
+# Output: TSV with columns:
+#   timestamp  cgroup_gb  cpu_pct  main_pid  main_rss_gb  main_threads  main_fds  n_python  total_python_rss_gb  wandb_pid  wandb_rss_gb  gpu_util_pct  gpu_mem_mb
+set -u
+# Which training process to track for the "main" RSS columns. Override to match your launcher's
+# `pgrep -f` pattern, e.g. TRAIN_PROC=train.py or TRAIN_PROC=accelerate (default: src.train).
+TRAIN_PROC="${TRAIN_PROC:-src.train}"
+# to_gb VALUE DIVISOR — print VALUE/DIVISOR with two decimals. VALUE is passed to awk
+# as data (-v), never spliced into the program text, so an empty reading is simply 0
+# instead of an awk syntax error that would corrupt the TSV row.
+to_gb() {
+    awk -v v="${1:-0}" -v d="$2" 'BEGIN { printf "%.2f", v / d }'
+}
+# status_field PID KEY — print the second field of the "KEY:" line in /proc/PID/status.
+# Prints nothing when the process is gone or the line is absent.
+status_field() {
+    awk -v key="$2:" '$1 == key { print $2; exit }' "/proc/$1/status" 2>/dev/null
+}
+# Header
+printf "timestamp\tcgroup_gb\tcpu_pct\tmain_pid\tmain_rss_gb\tmain_threads\tmain_fds\tn_python\ttotal_python_rss_gb\twandb_pid\twandb_rss_gb\tgpu_util_pct\tgpu_mem_mb\n"
+while true; do
+    ts=$(date '+%Y-%m-%d %H:%M:%S')
+    # cgroup current memory (bytes → GB): cgroup v2 file first, then the v1 equivalent, else 0.
+    cgroup_bytes=$(cat /sys/fs/cgroup/memory.current 2>/dev/null \
+        || cat /sys/fs/cgroup/memory/memory.usage_in_bytes 2>/dev/null \
+        || echo 0)
+    cgroup_gb=$(to_gb "$cgroup_bytes" 1073741824)
+    # Total CPU usage (user + system) from one batch-mode `top` frame (rough).
+    # Split the summary line on ':' and ',' so the first two values are picked
+    # regardless of the spacing or "%" placement around them.
+    cpu_pct=$(LC_ALL=C top -bn1 2>/dev/null | awk -F'[:,]' '/Cpu\(s\)/ { print $2 + $3; exit }')
+    cpu_pct=${cpu_pct:-0}
+    # Main training python PID + RSS (pattern overridable via $TRAIN_PROC)
+    main_pid=$(pgrep -f "$TRAIN_PROC" | head -1)
+    if [ -n "$main_pid" ]; then
+        main_rss_gb=$(to_gb "$(status_field "$main_pid" VmRSS)" 1048576)   # VmRSS is in kB
+        main_threads=$(status_field "$main_pid" Threads)
+        main_threads=${main_threads:-0}
+        main_fds=$(ls "/proc/$main_pid/fd" 2>/dev/null | wc -l)
+    else
+        main_pid=0; main_rss_gb=0; main_threads=0; main_fds=0
+    fi
+    # All python processes total RSS
+    n_python=$(pgrep -f python | wc -l)
+    total_python_rss_kb=$(ps -eo rss,comm | awk '$2 ~ /python/ {sum+=$1} END {print sum+0}')
+    total_python_rss_gb=$(to_gb "$total_python_rss_kb" 1048576)
+    # wandb process
+    wandb_pid=$(pgrep -f wandb-service | head -1)
+    if [ -n "$wandb_pid" ]; then
+        wandb_rss_gb=$(to_gb "$(status_field "$wandb_pid" VmRSS)" 1048576)
+    else
+        wandb_pid=0; wandb_rss_gb=0
+    fi
+    # GPU util + memory (first GPU reported by nvidia-smi only)
+    gpu_info=$(nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv,noheader,nounits 2>/dev/null | head -1)
+    gpu_util=$(echo "$gpu_info" | cut -d',' -f1 | tr -d ' ')
+    gpu_mem=$(echo "$gpu_info" | cut -d',' -f2 | tr -d ' ')
+    gpu_util=${gpu_util:-0}
+    gpu_mem=${gpu_mem:-0}
+    printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
+        "$ts" "$cgroup_gb" "$cpu_pct" "$main_pid" "$main_rss_gb" "$main_threads" "$main_fds" \
+        "$n_python" "$total_python_rss_gb" "$wandb_pid" "$wandb_rss_gb" "$gpu_util" "$gpu_mem"
+    sleep 5
+done

package/bundled-skills/remote-gpu-trainer/scripts/reap_vram_zombies.sh ADDED Viewed

@@ -0,0 +1,175 @@
+#!/usr/bin/env bash
+# reap_vram_zombies.sh — find (and optionally kill) PIDs holding VRAM that the
+# nvidia-smi process list cannot see (see references/gotchas_universal.md U11).
+#
+# After a crashed DDP run or a killed container, a process can keep a CUDA context
+# (and its VRAM) alive while NOT appearing in `nvidia-smi`'s process table — so a
+# fresh job OOMs on an "empty" GPU. Such holders DO still have the /dev/nvidia*
+# device files open, so fuser/lsof can find them when nvidia-smi cannot.
+#
+# Strategy:
+#   1. enumerate every PID with /dev/nvidia* open       (fuser -v, lsof fallback)
+#   2. subtract the PIDs nvidia-smi already accounts for (those are live, visible jobs)
+#   3. of the remainder, flag any that is idle (~0% GPU util) and has lived past a timeout
+#   4. DRY-RUN by default: print candidates only. --force is required to kill -9.
+#
+# Usage:
+#   bash reap_vram_zombies.sh             # dry-run: list zombie candidates, kill nothing
+#   bash reap_vram_zombies.sh --force     # actually kill -9 the flagged candidates
+#
+# A DRY-RUN exits 0 and never touches a process. Killing is destructive:
+# it is gated behind an explicit --force so the orchestrator never auto-reaps.
+# If the holder is inside another container, kill -9 from the host may not clear it —
+# restart that container instead.
+# NEVER an unquoted pipe inside a grep regex (it reads stdin and hangs forever).
+set -u
+# ---------------------------------------------------------------------------
+# Tunable constants — documented, no magic numbers buried in logic.
+# ---------------------------------------------------------------------------
+FORCE=0                 # 0 = dry-run (default), 1 = actually kill. Set by --force.
+MIN_AGE_SECS=120        # only reap a holder that has lived > 2 min. A genuinely new
+                        # process may briefly hold a context while warming up; 2 min
+                        # is well past CUDA-context init, so survivors are stragglers.
+IDLE_UTIL_PCT=5         # treat per-process GPU util <= 5% as "idle". A real training
+                        # job pegs util far higher; ~0% + held VRAM = a zombie, not work.
+# ---------------------------------------------------------------------------
+# Arg parse — only --force is recognized; anything else is a usage error.
+# ---------------------------------------------------------------------------
+for arg in "$@"; do
+    case "$arg" in
+        --force) FORCE=1 ;;
+        -h|--help)
+            echo "usage: bash reap_vram_zombies.sh [--force]" >&2
+            echo "  (default is a dry-run; --force enables kill -9)" >&2
+            exit 0
+            ;;
+        *)
+            echo "unknown argument: $arg (only --force is supported)" >&2
+            exit 64   # EX_USAGE
+            ;;
+    esac
+done
+if ! command -v nvidia-smi >/dev/null 2>&1; then
+    echo "nvidia-smi not found — no NVIDIA driver on this box." >&2
+    exit 1
+fi
+# ---------------------------------------------------------------------------
+# Step 1 — enumerate PIDs holding /dev/nvidia* open.
+# collect_dev_holders
+#   Outputs: space-separated bare PIDs on stdout (possibly none).
+#   Returns: 1 (with a message on stderr) when no /dev/nvidia* node exists or
+#            neither fuser nor lsof is installed; otherwise the status of the
+#            final normalizing pipeline.
+# The glob is expanded to existing nodes first, so a literal "/dev/nvidia*" is
+# never handed to fuser/lsof when nothing matches.
+# ---------------------------------------------------------------------------
+collect_dev_holders() {
+    local node raw=""
+    local nodes=()
+    for node in /dev/nvidia*; do
+        if [ -e "$node" ]; then
+            nodes+=("$node")
+        fi
+    done
+    (( ${#nodes[@]} )) || {
+        echo "no /dev/nvidia* device nodes present — cannot enumerate device holders." >&2
+        return 1
+    }
+    if command -v fuser >/dev/null 2>&1; then
+        # fuser: PIDs on stdout; everything else goes to stderr and is dropped.
+        raw=$(fuser "${nodes[@]}" 2>/dev/null || true)
+    elif command -v lsof >/dev/null 2>&1; then
+        # lsof -t: one PID per line for the listed device files.
+        raw=$(lsof -t "${nodes[@]}" 2>/dev/null || true)
+    else
+        echo "neither fuser nor lsof is available — cannot enumerate device holders." >&2
+        return 1
+    fi
+    # Squeeze every run of non-digits (mode letters, newlines) into one space.
+    printf '%s\n' "$raw" | tr -cs '0-9' ' '
+}
+# Gather the holders; abort if enumeration itself was impossible.
+if ! DEV_HOLDERS=$(collect_dev_holders); then
+    exit 1
+fi
+# One bare PID per line. The input contains only digits and blanks, so pulling
+# out each run of digits yields exactly the PIDs.
+DEV_HOLDERS=$(printf '%s\n' "$DEV_HOLDERS" | grep -oE '[0-9]+' || true)
+if [ -z "$DEV_HOLDERS" ]; then
+    echo "RESULT: clean — no process is holding /dev/nvidia* open."
+    exit 0
+fi
+# ---------------------------------------------------------------------------
+# Step 2 — PIDs that nvidia-smi lists as compute apps. Those are visible jobs and
+# are never reaped. The list is empty when nothing is visible (the U11 case).
+# ---------------------------------------------------------------------------
+VISIBLE_PIDS=$(
+    nvidia-smi --query-compute-apps=pid --format=csv,noheader 2>/dev/null \
+        | grep -E '^[0-9]+$' || true
+)
+# is_visible PID — succeeds iff PID is exactly one line of VISIBLE_PIDS.
+is_visible() {
+    local candidate=$1
+    grep -qx "$candidate" <<<"$VISIBLE_PIDS"
+}
+# ---------------------------------------------------------------------------
+# Step 3 — classify each remaining holder. A candidate is a holder that is
+# (a) NOT in nvidia-smi's list, (b) older than MIN_AGE_SECS, and (c) not one of
+# the processes that must never be reaped:
+#   - PID 1, this script, and the shell that launched it;
+#   - NVIDIA service daemons that keep /dev/nvidia* open BY DESIGN and never
+#     appear as compute apps (persistence daemon, fabric manager, DCGM host
+#     engine). Without this guard --force would kill them as "zombies".
+# Process age comes from `ps -o etimes` (elapsed seconds, integer).
+# ---------------------------------------------------------------------------
+CANDIDATES=""
+echo "== reap_vram_zombies: scanning $(printf '%s' "$DEV_HOLDERS" | tr '\n' ' ')=="
+for pid in $DEV_HOLDERS; do
+    # Skip any PID that vanished mid-scan.
+    if [ ! -d "/proc/$pid" ]; then
+        continue
+    fi
+    # Skip the init edge and ourselves.
+    if [ "$pid" -le 1 ] || [ "$pid" = "$$" ] || [ "$pid" = "$PPID" ]; then
+        continue
+    fi
+    CMD="$(ps -o comm= -p "$pid" 2>/dev/null || true)"
+    AGE="$(ps -o etimes= -p "$pid" 2>/dev/null | tr -d ' ' || true)"
+    # An empty or non-numeric age is treated as 0, so an unreadable age lands in
+    # the "too young" branch and the process is never classified as old.
+    case "$AGE" in
+        ''|*[!0-9]*) AGE=0 ;;
+    esac
+    # `comm` is truncated to 15 characters, hence the prefix patterns.
+    case "$CMD" in
+        nvidia-persiste*|nv-fabricmanage*|nv-hostengine*)
+            echo "   pid $pid ($CMD): NVIDIA service daemon — holds the device by design, skip."
+            continue
+            ;;
+    esac
+    if is_visible "$pid"; then
+        echo "   pid $pid ($CMD): visible to nvidia-smi — live job, skip."
+        continue
+    fi
+    if [ "$AGE" -lt "$MIN_AGE_SECS" ]; then
+        echo "   pid $pid ($CMD): age ${AGE}s < ${MIN_AGE_SECS}s — too young, skip (may be warming up)."
+        continue
+    fi
+    # This PID holds /dev/nvidia*, is invisible to nvidia-smi, and is old.
+    # nvidia-smi cannot give us a per-process util for an unlisted PID, so by the
+    # U11 definition (held VRAM + invisible) it is already idle on the GPU.
+    echo "   pid $pid ($CMD): age ${AGE}s, holds VRAM, INVISIBLE to nvidia-smi -> ZOMBIE candidate."
+    CANDIDATES="${CANDIDATES}${pid} "
+done
+# Normalize to a single-space-separated list with no leading/trailing blank.
+CANDIDATES="$(printf '%s' "$CANDIDATES" | tr -s ' ' )"
+CANDIDATES="${CANDIDATES# }"; CANDIDATES="${CANDIDATES% }"
+# ---------------------------------------------------------------------------
+# Step 4 — act. Dry-run prints and exits 0; --force sends kill -9 to each
+# candidate. Exit status under --force: 0 if every kill succeeded, 1 otherwise.
+# ---------------------------------------------------------------------------
+echo "------------------------------------------------------------"
+if [ -z "$CANDIDATES" ]; then
+    echo "RESULT: clean — holders exist but none qualifies as a zombie (all visible/young)."
+    exit 0
+fi
+echo "zombie VRAM holders: $CANDIDATES"
+if [ "$FORCE" -ne 1 ]; then
+    echo "RESULT: DRY-RUN — nothing killed. Re-run with --force to 'kill -9' the PIDs above."
+    echo "        (If a holder lives inside another container, restart that container instead.)"
+    exit 0
+fi
+# --force path: kill each candidate, report per-PID outcome.
+RC=0
+KILLED=0
+for pid in $CANDIDATES; do
+    if kill -9 "$pid" 2>/dev/null; then
+        echo "killed -9 $pid"
+        KILLED=$((KILLED + 1))
+    else
+        echo "FAILED to kill $pid (gone already, or owned by another container)." >&2
+        RC=1
+    fi
+done
+# Only claim success when every kill actually succeeded; otherwise say so, so
+# the summary line never contradicts the per-PID failures and the exit code.
+if [ "$RC" -eq 0 ]; then
+    echo "RESULT: reaped zombie VRAM holders (--force)."
+else
+    echo "RESULT: PARTIAL — killed ${KILLED} holder(s), but at least one kill failed (see stderr)."
+fi
+exit "$RC"

package/bundled-skills/remote-gpu-trainer/scripts/run_one.sh.template ADDED Viewed

@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+# Per-job (per-ablation) wrapper — platform-agnostic skeleton.
+#
+# Parameterize the PROFILE BLOCK below from your platform profile's "SCRIPT OVERRIDES"
+# section (profiles/<platform>.md §8). The defaults shown are AutoDL's.
+#
+# Mandatory: run the network-acceleration hook before any external call (wandb / HF / pip / git).
+# On AutoDL that is `source /etc/network_turbo`; on a clean box set PROXY_HOOK=":" (a no-op).
+# Without the right hook, wandb.init can hang and a flaky link can drop already-uploaded cloud
+# runs — see references/gotchas_universal.md and references/china-network.md.
+#
+# Usage: ./run_one.sh <config_yaml> <task> [epochs]
+#
+# Disclose every CLI override applied below in any paper's Implementation Details — reproducibility
+# depends on the list being complete (the yaml/source stay untouched). See references/gotchas_universal.md.
+set -u
+# Arg-count guard FIRST — under `set -u`, CFG="$1" below would abort with an unbound-variable
+# error (and no usage hint) when run with no args. Fail with a readable usage line instead.
+if [ "$#" -lt 2 ]; then
+    echo "usage: $0 <config_yaml> <task> [epochs]" >&2
+    exit 1
+fi
+# ===== PROFILE BLOCK — override from profiles/<platform>.md §8 (defaults = AutoDL) =====
+# PROJECT_REPO_DIR / DATA_DIR use ":-" (an empty value falls back to the default,
+# since an empty path is never useful). DURABLE_DIR, PROXY_HOOK and CRED_FILE use
+# "-" (default ONLY when unset) because an explicitly EMPTY value is a documented
+# setting for them: "" skips the durable sync / runs no hook / reads no key file.
+PROJECT_REPO_DIR="${PROJECT_REPO_DIR:-/root/PROJECT_NAME}"   # where your code lives on the instance
+DATA_DIR="${DATA_DIR:-/root/autodl-tmp}"                     # fast per-instance scratch (checkpoints land here)
+DURABLE_DIR="${DURABLE_DIR-/root/autodl-fs}"                 # survives teardown (profile survival matrix); set "" to skip sync
+PROXY_HOOK="${PROXY_HOOK-source /etc/network_turbo}"         # network-accel hook; ":" or "" (no-op) on a clean box
+CRED_FILE="${CRED_FILE-/root/.wandb_key}"                    # file holding the tracker key; "" if WANDB_API_KEY already in env
+# =======================================================================================
+# PROXY_HOOK is an OPERATOR-supplied snippet from your platform profile (e.g. `source /etc/network_turbo`,
+# `module load cuda`, or empty). It is eval'd intentionally so a profile can run an arbitrary setup hook —
+# set it ONLY from your own trusted profile, never from untrusted or remote-derived input.
+# Best-effort: hook errors are silenced and never abort the run.
+eval "${PROXY_HOOK}" 2>/dev/null || true
+# The prebuilt base IS the env on most rentals (do not conda create). Activate if present.
+source /root/miniconda3/etc/profile.d/conda.sh 2>/dev/null && conda activate base 2>/dev/null || true
+# Load the tracker key from CRED_FILE only when one is configured and exists.
+if [ -n "$CRED_FILE" ] && [ -f "$CRED_FILE" ]; then export WANDB_API_KEY="$(cat "$CRED_FILE")"; fi
+export WANDB_MODE="${WANDB_MODE:-online}"     # offline without a key => W&B silently DISABLED (gotchas_universal)
+export WANDB_START_METHOD=thread
+export PYTHONUNBUFFERED=1
+CKPT_ROOT="$DATA_DIR/checkpoints"
+LOG_DIR="$DATA_DIR/runs/logs"
+mkdir -p "$DATA_DIR/wandb" "$LOG_DIR" "$CKPT_ROOT"
+# Diagnostics belong on stderr so they are not mistaken for training output.
+cd "${PROJECT_REPO_DIR}" || { echo "PROJECT_REPO_DIR not found: $PROJECT_REPO_DIR" >&2; exit 1; }
+# Positional arguments (count already validated above). A relative config path
+# is resolved from PROJECT_REPO_DIR, because the cd has already happened.
+CFG="$1"
+TASK="$2"
+EPOCHS="${3:-20}"
+# Run name = config file name without directory or ".yaml" suffix.
+NAME="$(basename "$CFG" .yaml)"
+# CUSTOMIZE: map the run name onto an ablation family; the family becomes the
+# tracker group ("<task>_<family>") and the second tag. First matching rule wins,
+# so the order below matters (example scheme; extend freely).
+if [[ $NAME == aug_* || $NAME == seg_aug_* || $NAME == det_aug_* ]]; then
+    KIND=aug
+elif [[ $NAME == *_no_* ]]; then
+    KIND=module
+elif [[ $NAME == precision_* || $NAME == seg_precision_* || $NAME == det_precision_* ]]; then
+    KIND=precision
+elif [[ $NAME == *mask_* ]]; then
+    KIND=rate
+elif [[ $NAME == baseline_* ]]; then
+    KIND=baseline
+else
+    KIND=other
+fi
+GRP="${TASK}_${KIND}"
+TAGS="[$TASK,${KIND}]"
+# Per-run checkpoint directory under the scratch checkpoint root.
+CKPT_DIR="$CKPT_ROOT/$NAME"
+mkdir -p "$CKPT_DIR"
+# CUSTOMIZE: replace `src.train` with your project's training entrypoint module + its override flags
+# The argument list is built as an array so each override stays one word.
+TRAIN_ARGS=(
+    --no-strict
+    -o "wandb.group=$GRP"
+    -o "wandb.tags=$TAGS"
+    -o data.num_workers=2
+    -o data.pin_memory=False
+    -o training.val_metric_sample_cap=256
+    -o "training.checkpoint_dir=$CKPT_DIR"
+    -c "$CFG" --task "$TASK" --epochs "$EPOCHS"
+    --experiment-name "abla_$NAME"
+)
+# stdout+stderr are shown live and mirrored into the per-run log.
+python -m src.train "${TRAIN_ARGS[@]}" 2>&1 | tee "$LOG_DIR/$NAME.log"
+# Exit status of python (first pipeline stage), not of tee. Must be read
+# immediately: any intervening command overwrites PIPESTATUS.
+EXIT=${PIPESTATUS[0]}
+# stamp — print the current wall-clock time as "[HH:MM:SS]" (no newline) for log lines.
+stamp() {
+    printf '[%s]' "$(date +%H:%M:%S)"
+}
+# Everything below runs only after a successful training exit that left a best.pth.
+if [ "$EXIT" -eq 0 ] && [ -f "$CKPT_DIR/best.pth" ]; then
+    # Disk budget (principle #5): keep best.pth, drop the scratch latest.pth.
+    rm -f "$CKPT_DIR/latest.pth"
+    echo "$(stamp) kept best.pth, pruned latest.pth for $NAME"
+    # Copy results to durable storage unless DURABLE_DIR is empty. The "synced"
+    # line is printed only when mkdir and the best.pth copy both succeeded and the
+    # file is present at the destination; an unconditional message would lie when
+    # the durable FS is full or out of inodes (principle #3).
+    if [ -n "$DURABLE_DIR" ]; then
+        FS_DIR="$DURABLE_DIR/final_ckpts/$NAME"
+        if mkdir -p "$FS_DIR" && cp -f "$CKPT_DIR/best.pth" "$FS_DIR/" && [ -f "$FS_DIR/best.pth" ]; then
+            # Extras are best-effort: absent files are skipped silently.
+            cp -f "$CKPT_DIR/best_metrics.json" "$FS_DIR/" 2>/dev/null || true
+            cp -rf "$CKPT_DIR/protocol" "$FS_DIR/" 2>/dev/null || true
+            cp -f "$LOG_DIR/$NAME.log" "$FS_DIR/" 2>/dev/null || true
+            echo "$(stamp) synced $NAME to durable storage ($FS_DIR)"
+        else
+            echo "$(stamp) !! DURABLE SYNC FAILED for $NAME — check 'df -i $DURABLE_DIR'. The data-disk copy is source-of-truth."
+        fi
+    fi
+fi
+# Propagate the training process's exit status to the caller.
+exit "$EXIT"