npm - opencode-skills-collection - Versions diffs - 3.1.2 → 3.1.3 - Mend

opencode-skills-collection 3.1.2 → 3.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

package/bundled-skills/remote-gpu-trainer/scripts/run_queue.sh.template ADDED Viewed

@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+# Queue iterator for multi-ablation deployment — platform-agnostic.
+#
+# Network-accel hook so every child (incl. the tracker client) inherits it; ":" no-op on a clean box.
+# See references/gotchas_universal.md and references/china-network.md.
+PROXY_HOOK="${PROXY_HOOK:-source /etc/network_turbo}"
+# PROXY_HOOK is an OPERATOR-supplied profile snippet (source a file / module load / export ...), eval'd
+# on purpose to run an arbitrary setup hook. Set it ONLY from your own trusted profile, never from
+# untrusted or remote-derived input.
+eval "${PROXY_HOOK}" 2>/dev/null || true
+#
+# Each queue line: <config_yaml_path> <task> [epochs]   (epochs defaults to 20)
+# Calls $RUN_ONE per line — defaults to $DURABLE_DIR/run_one.sh (the durable/shared mount from
+# profiles/<platform>.md §8). Export DURABLE_DIR, or set RUN_ONE directly if run_one.sh lives elsewhere.
+#
+# Usage: ./run_queue.sh <queue_file> [start_index]
+#   start_index defaults to 1 (run all). Pass N to RESUME from ablation N (principle #8 — see
+#   references/parallel_ablation.md §5).
+#
+# IMPORTANT: tmux/bash loads THIS script into memory at launch. Editing it mid-flight does NOT affect
+# the running queue; only a NEW launch sees changes. Never overwrite it while a queue reads it
+# (references/gotchas_universal.md, never-mutate-inputs-under-a-live-run; principle #6).
+set -u
+RUN_ONE="${RUN_ONE:-${DURABLE_DIR:-/root/autodl-fs}/run_one.sh}"   # = <durable mount>/run_one.sh; export DURABLE_DIR (profile §8) or set RUN_ONE directly
+# Print the usage line on stderr (diagnostics must not mix into the stdout queue log) and abort.
+usage() {
+    echo "Usage: $0 <queue_file> [start_index]" >&2
+    exit 1
+}
+# Arg-count guard FIRST — under `set -u`, QUEUE="$1" below would abort with an unbound-variable
+# error before the Usage check could run. Guard so the Usage message is reachable.
+if [ "$#" -lt 1 ]; then usage; fi
+QUEUE="$1"
+START="${2:-1}"
+if [ -z "$QUEUE" ] || [ ! -f "$QUEUE" ]; then usage; fi
+# start_index must be a plain non-negative integer: with a typo every `-lt` test in the loop would
+# error out, the `if` would read that as false, and the WHOLE queue would silently re-run.
+case "$START" in
+    *[!0-9]*) echo "ERROR: start_index must be a non-negative integer, got '$START'" >&2; usage ;;
+esac
+# Fail fast when the per-cell runner is missing instead of logging one spurious failure per cell.
+if [ ! -f "$RUN_ONE" ]; then
+    echo "ERROR: RUN_ONE script not found: $RUN_ONE (export DURABLE_DIR or set RUN_ONE)" >&2
+    exit 1
+fi
+HOSTNAME_SHORT=$(hostname -s)
+# Count ablation CELLS only (skip #-comments + blank lines) so $TOTAL and the resume index are
+# CELL numbers, not raw line numbers — `start_index=N` then resumes from ablation N regardless of how
+# many comment/blank lines precede it (the loop below increments i only after the same skip guards).
+TOTAL=$(grep -cvE '^[[:space:]]*(#|$)' "$QUEUE")
+i=0
+ran=0
+fail=0
+failed_names=()
+echo "=== Queue $(basename "$QUEUE"): $TOTAL ablations, starting from $START on $HOSTNAME_SHORT ==="
+# `|| [ -n "$line" ]` keeps a final cell that lacks a trailing newline: read returns non-zero at EOF
+# but still fills $line, and grep above counts that line, so dropping it would desync i and $TOTAL.
+while IFS= read -r line || [ -n "$line" ]; do
+    line="${line%$'\r'}"   # tolerate CRLF queue files (edited on Windows, then copied over)
+    IFS=$' \t' read -r cfg task epochs <<< "$line"
+    # Skip comment/blank lines BEFORE counting so i (and the START resume index) count CELLS, not lines.
+    if [ -z "$cfg" ]; then continue; fi
+    case "$cfg" in \#*) continue ;; esac   # skip #-prefixed comment lines
+    i=$((i+1))
+    if [ "$i" -lt "$START" ]; then continue; fi
+    ran=$((ran+1))
+    EPOCHS="${epochs:-20}"
+    NAME=$(basename -- "$cfg" .yaml)
+    echo "================================================================"
+    echo "[$(date +'%Y-%m-%d %H:%M:%S')] [$i/$TOTAL] STARTING $NAME ($task, ${EPOCHS}ep)"
+    echo "================================================================"
+    # </dev/null: the child must NOT inherit the loop's stdin (the queue file). Anything inside it that
+    # reads stdin (ssh, ffmpeg, a prompt) would swallow the remaining cells and end the queue early.
+    bash "$RUN_ONE" "$cfg" "$task" "$EPOCHS" </dev/null
+    RC=$?
+    if [ "$RC" -ne 0 ]; then fail=$((fail+1)); failed_names+=("$NAME"); fi
+    echo "[$(date +'%Y-%m-%d %H:%M:%S')] [$i/$TOTAL] FINISHED $NAME (exit=$RC)"
+done < "$QUEUE"
+echo "================================================================"
+if [ "$ran" -eq 0 ]; then
+    echo "WARNING: no cells ran (start_index $START, $TOTAL cell(s) in queue) -- nothing was executed" >&2
+fi
+if [ "$fail" -ne 0 ]; then
+    echo "[$(date +'%Y-%m-%d %H:%M:%S')] === QUEUE DONE on $HOSTNAME_SHORT -- $fail cell(s) FAILED: ${failed_names[*]} ==="
+elif [ "$ran" -eq "$TOTAL" ]; then
+    echo "[$(date +'%Y-%m-%d %H:%M:%S')] === QUEUE DONE on $HOSTNAME_SHORT -- all $TOTAL cell(s) exited 0 ==="
+else
+    # Resumed/partial run: do not claim "all $TOTAL" when only $ran cells actually executed.
+    echo "[$(date +'%Y-%m-%d %H:%M:%S')] === QUEUE DONE on $HOSTNAME_SHORT -- $ran of $TOTAL cell(s) ran (from index $START), all exited 0 ==="
+fi
+echo "================================================================"
+# Propagate failure: a queue with any failed cell must NOT exit 0, or tmux/patrol
+# automation reads "QUEUE DONE" as success and a broken ablation hides for hours.
+[ "$fail" -eq 0 ] || exit 1

package/bundled-skills/remote-gpu-trainer/scripts/setup-china-mirrors.sh ADDED Viewed

@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# One-shot China-network setup for a rented GPU box behind the GFW.
+# scp this to the instance, then `source` it IN THE SHELL THAT WILL LAUNCH TRAINING (it exports env
+# vars into the CURRENT shell only):
+#   scp scripts/setup-china-mirrors.sh <alias>:/root/
+#   ssh <alias>        # then, inside that session (or tmux pane):
+#   source /root/setup-china-mirrors.sh
+# A one-shot `ssh <alias> 'source /root/setup-china-mirrors.sh'` persists only the pip config and the
+# cache dirs — every exported variable is gone as soon as that ssh command returns.
+# Full rationale + the no_proxy trap + the resumable-download ladder: references/china-network.md
+#
+# Deliberately NO `set -u`: this file is sourced, so the option would stay switched on in the caller's
+# shell afterwards and can break unrelated tooling that expands unset variables. Every expansion below
+# uses an explicit ${VAR:-default} instead.
+# 1. HuggingFace -> hf-mirror (drop-in; identical repo IDs). MUST be set BEFORE importing
+#    huggingface_hub / transformers / datasets — they read HF_ENDPOINT at import time.
+export HF_ENDPOINT="${HF_ENDPOINT:-https://hf-mirror.com}"
+# Keep hf_transfer OFF on flaky CN links — documented hang-with-no-error in exactly these conditions.
+export HF_HUB_ENABLE_HF_TRANSFER=0
+# 2. Redirect model caches off the small system disk onto the data disk (override DATA_DIR per profile).
+DATA_DIR="${DATA_DIR:-/root/autodl-tmp}"
+export HF_HOME="${HF_HOME:-$DATA_DIR/huggingface}"
+export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
+export MODELSCOPE_CACHE="${MODELSCOPE_CACHE:-$DATA_DIR/modelscope}"
+mkdir -p -- "$HF_HOME" "$MODELSCOPE_CACHE" \
+  || echo "[setup-china-mirrors] WARNING: could not create $HF_HOME / $MODELSCOPE_CACHE (is the data disk mounted?)" >&2
+# 3. pip index -> Tsinghua TUNA (Aliyun / USTC are alternates).
+pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple 2>/dev/null \
+  || export PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple
+# 4. no_proxy hygiene — ONLY when an overseas proxy is exported in THIS shell (either spelling). A proxy
+#    that fixes huggingface.co will route every domestic mirror overseas and break it unless exempted
+#    here. Use leading-dot domains, set BOTH spellings, include loopback. Entries are APPENDED to any
+#    existing no_proxy (skipping duplicates) so exemptions the operator already set are not clobbered.
+if [ -n "${http_proxy:-}${https_proxy:-}${HTTP_PROXY:-}${HTTPS_PROXY:-}" ]; then
+  _scm_np="${no_proxy:-${NO_PROXY:-}}"
+  for _scm_host in 127.0.0.1 localhost .tuna.tsinghua.edu.cn .aliyuncs.com .modelscope.cn .hf-mirror.com; do
+    case ",${_scm_np}," in
+      *",${_scm_host},"*) ;;   # already exempted
+      *) _scm_np="${_scm_np:+${_scm_np},}${_scm_host}" ;;
+    esac
+  done
+  export no_proxy="$_scm_np"
+  export NO_PROXY="$_scm_np"
+  unset _scm_np _scm_host   # sourced file: do not leave helper variables in the caller's shell
+  echo "[setup-china-mirrors] proxy detected -> exempted domestic mirrors via no_proxy"
+fi
+echo "[setup-china-mirrors] HF_ENDPOINT=$HF_ENDPOINT  HF_HOME=$HF_HOME"
+echo "[setup-china-mirrors] done. conda: edit ~/.condarc per references/china-network.md (NEVER mirror pytorch-nightly)."

package/bundled-skills/remote-gpu-trainer/scripts/verify_local.py ADDED Viewed

@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+"""Verify integrity of downloaded ckpt directories.
+For each <name>/ in the target dir, check:
+  - best.pth exists
+  - best.pth loads cleanly via torch.load
+  - best.pth contains a weights key ('model_state_dict' / 'model' / 'state_dict')
+  - best_metrics.json exists and is valid JSON
+  - reports best epoch + main metric per ablation
+Usage:
+    python verify_local.py <path_to_final_ckpts_dir> [--expect N] [--list-metrics] [--allow-pickle]
+Exit code:
+    0 = all OK
+    1 = at least one error, an empty input dir, or a dir count != --expect
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from pathlib import Path
+def main() -> int:
+    """Check every ablation subdirectory of the ``ckpt_dir`` argument and print a summary.
+    Returns 0 only when every subdirectory passed every check; 1 for a missing or non-directory
+    root, an empty root, an ``--expect`` mismatch, torch not importable, or any per-ablation error.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("ckpt_dir", help="Directory containing ablation subdirs (each with best.pth + best_metrics.json)")
+    ap.add_argument("--list-metrics", action="store_true", help="Print per-ablation epoch + main metric")
+    ap.add_argument("--expect", type=int, default=None,
+                    help="Assert exactly N ablation subdirs are present -- guards a teardown gate against a partial/empty pull")
+    ap.add_argument("--allow-pickle", action="store_true",
+                    help="Permit the weights_only=False fallback (executes pickle) for checkpoints you trust -- "
+                         "needed only when a checkpoint pickles non-tensor objects (e.g. an args Namespace); OFF by default")
+    args = ap.parse_args()
+    root = Path(args.ckpt_dir)
+    if not root.exists():
+        print(f"ERROR: {root} does not exist")
+        return 1
+    if not root.is_dir():
+        print(f"ERROR: {root} is not a directory")
+        return 1
+    # Structural checks BEFORE importing torch: an empty (or short) input must fail
+    # LOUDLY here -- never silently print "OK: 0/0" and return success, which would let
+    # a Phase-5 teardown gate destroy the rented disk having verified nothing
+    # (principle #3: trust the artifact, not a success line; the teardown Iron Law).
+    dirs = sorted([d for d in root.iterdir() if d.is_dir()])
+    if not dirs:
+        print(f"ERROR: no ablation subdirectories found in {root} -- refusing to report success on an empty input")
+        return 1
+    if args.expect is not None and len(dirs) != args.expect:
+        print(f"ERROR: expected {args.expect} ablation dirs but found {len(dirs)} in {root} -- partial/incomplete pull")
+        return 1
+    try:
+        import torch
+    except ImportError:
+        print("ERROR: torch not installed in this environment")
+        return 1
+    print(f"Found {len(dirs)} ablation dirs in {root}")
+    print()
+    ok = 0
+    errors: list[tuple[str, str]] = []
+    # epoch is whatever the JSON holds (normally an int; "?" when absent or null), hence `object`.
+    metrics_rows: list[tuple[str, object, str]] = []
+    total_size_bytes = 0
+    for d in dirs:
+        name = d.name
+        pth = d / "best.pth"
+        metrics_path = d / "best_metrics.json"
+        if not pth.exists():
+            errors.append((name, "missing best.pth"))
+            continue
+        if not metrics_path.exists():
+            errors.append((name, "missing best_metrics.json"))
+            continue
+        # Load safe-by-default: weights_only=True refuses to execute pickle, so a poisoned or
+        # compromised remote checkpoint cannot run code on the operator's machine. The unsafe
+        # weights_only=False path (which DOES execute pickle) is OPT-IN via --allow-pickle: an attacker
+        # who controls the remote file could otherwise craft one that fails the safe load to FORCE the
+        # fallback, so auto-falling-back would defeat the gate. Pass --allow-pickle ONLY for your own ckpts.
+        try:
+            ckpt = torch.load(pth, map_location="cpu", weights_only=True)
+        except Exception as e_safe:
+            if not args.allow_pickle:
+                errors.append((name, f"safe load (weights_only=True) failed: {str(e_safe)[:70]} "
+                                     "-- re-run with --allow-pickle if this is your own checkpoint"))
+                continue
+            try:
+                print(
+                    f"  [warn] {name}: weights_only=True failed; --allow-pickle set, retrying "
+                    "weights_only=False (executes pickle -- trust this file)"
+                )
+                ckpt = torch.load(pth, map_location="cpu", weights_only=False)
+            except Exception as e:
+                errors.append((name, f"torch.load failed: {str(e)[:100]}"))
+                continue
+        if not isinstance(ckpt, dict) or not any(k in ckpt for k in ("model_state_dict", "model", "state_dict")):
+            errors.append((name, "no model/model_state_dict/state_dict key in checkpoint"))
+            continue
+        try:
+            # JSON is UTF-8 by specification; pin it so the operator's locale default (e.g. GBK or
+            # cp1252 on Windows) cannot turn a valid metrics file into a decode error.
+            with open(metrics_path, encoding="utf-8") as f:
+                m = json.load(f)
+        except Exception as e:
+            errors.append((name, f"best_metrics.json invalid: {str(e)[:80]}"))
+            continue
+        if not isinstance(m, dict):
+            # Valid JSON that is not an object (list / number / null): m.get below would raise
+            # AttributeError and abort the whole run with a traceback instead of one clean error row.
+            errors.append((name, f"best_metrics.json invalid: expected a JSON object, got {type(m).__name__}"))
+            continue
+        epoch = m.get("epoch", "?")
+        if epoch is None:  # {"epoch": null} → .get returns None (not the default); guard the :3 format. `or` would wrongly eat epoch 0.
+            epoch = "?"
+        # Pick main metric (PSNR for recon, mAP50 for det, dice for seg, fall back to loss)
+        main_metric_key = next(
+            (k for k in ["psnr", "mAP50", "dice"] if k in m),
+            "loss",
+        )
+        main_metric_val = m.get(main_metric_key, "?")
+        metrics_rows.append((name, epoch, f"{main_metric_key}={main_metric_val}"))
+        total_size_bytes += pth.stat().st_size
+        ok += 1
+    print(f"OK: {ok}/{len(dirs)}")
+    print(f"Errors: {len(errors)}")
+    for name, err in errors[:20]:
+        print(f"  - {name}: {err}")
+    if len(errors) > 20:
+        # Say so when the listing is cut, so the operator does not assume the 20 shown are all of them.
+        print(f"  ... and {len(errors) - 20} more")
+    print(f"Total best.pth size: {total_size_bytes / 1e9:.1f} GB")
+    if args.list_metrics:
+        print()
+        print("=== Per-ablation metrics ===")
+        for name, epoch, metric in metrics_rows:
+            # int/float/str accept the width-3 spec directly (same output as before); any other JSON
+            # value (list, dict) rejects a non-empty format spec with TypeError, so render it via str().
+            epoch_txt = f"{epoch:3}" if isinstance(epoch, (int, float, str)) else f"{str(epoch):3}"
+            print(f"  {name:40s} epoch={epoch_txt} {metric}")
+    return 0 if not errors else 1
+if __name__ == "__main__":
+    sys.exit(main())

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "opencode-skills-collection",
-  "version": "3.1.2",
+  "version": "3.1.3",
   "description": "OpenCode CLI plugin that automatically downloads and keeps skills up to date.",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",

package/skills_index.json CHANGED Viewed

@@ -551,6 +551,28 @@
       "reasons": []
     }
   },
+  {
+    "id": "agent-creator",
+    "path": "skills/agent-creator",
+    "category": "ai-ml",
+    "name": "agent-creator",
+    "description": "Create custom AI subagents with proper plugin structure, persona generation, and companion routing skills.",
+    "risk": "critical",
+    "source": "community",
+    "date_added": "2026-06-20",
+    "plugin": {
+      "targets": {
+        "codex": "supported",
+        "claude": "supported"
+      },
+      "setup": {
+        "type": "none",
+        "summary": "",
+        "docs": null
+      },
+      "reasons": []
+    }
+  },
   {
     "id": "agent-evaluation",
     "path": "skills/agent-evaluation",
@@ -3388,6 +3410,28 @@
       "reasons": []
     }
   },
+  {
+    "id": "ax-extract-workflow",
+    "path": "skills/ax-extract-workflow",
+    "category": "development",
+    "name": "ax-extract-workflow",
+    "description": "Reconstruct workflow behind a past coding-agent artifact using local ax sessions/commits/skills/tool traces. Use when asked how X was built.",
+    "risk": "safe",
+    "source": "community",
+    "date_added": "2026-06-21",
+    "plugin": {
+      "targets": {
+        "codex": "supported",
+        "claude": "supported"
+      },
+      "setup": {
+        "type": "none",
+        "summary": "",
+        "docs": null
+      },
+      "reasons": []
+    }
+  },
   {
     "id": "axiom",
     "path": "skills/axiom",
@@ -27273,6 +27317,28 @@
       "reasons": []
     }
   },
+  {
+    "id": "remote-gpu-trainer",
+    "path": "skills/remote-gpu-trainer",
+    "category": "ml-ops",
+    "name": "remote-gpu-trainer",
+    "description": "Deploy, monitor, and debug long GPU jobs on RENTED/remote instances (AutoDL, RunPod, vast.ai, Lambda, Slurm, K8s): teardown/billing safety, spot resilience, resumable checkpointing, OOM/NaN triage.",
+    "risk": "safe",
+    "source": "community",
+    "date_added": "2026-06-20",
+    "plugin": {
+      "targets": {
+        "codex": "supported",
+        "claude": "supported"
+      },
+      "setup": {
+        "type": "none",
+        "summary": "",
+        "docs": null
+      },
+      "reasons": []
+    }
+  },
   {
     "id": "remotion",
     "path": "skills/remotion",