opencode-skills-collection 3.1.2 → 3.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +4 -1
  2. package/bundled-skills/agent-creator/SKILL.md +246 -0
  3. package/bundled-skills/ax-extract-workflow/SKILL.md +156 -0
  4. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  5. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  6. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  7. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  8. package/bundled-skills/docs/sources/sources.md +1 -1
  9. package/bundled-skills/docs/users/bundles.md +1 -1
  10. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  11. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  12. package/bundled-skills/docs/users/getting-started.md +1 -1
  13. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  14. package/bundled-skills/docs/users/usage.md +4 -4
  15. package/bundled-skills/docs/users/visual-guide.md +4 -4
  16. package/bundled-skills/lovable-cleanup/SKILL.md +2 -1
  17. package/bundled-skills/remote-gpu-trainer/.gitattributes +8 -0
  18. package/bundled-skills/remote-gpu-trainer/LICENSE +21 -0
  19. package/bundled-skills/remote-gpu-trainer/README.md +267 -0
  20. package/bundled-skills/remote-gpu-trainer/SKILL.md +249 -0
  21. package/bundled-skills/remote-gpu-trainer/evals/README.md +57 -0
  22. package/bundled-skills/remote-gpu-trainer/evals/RESULTS.md +44 -0
  23. package/bundled-skills/remote-gpu-trainer/evals/cases.jsonl +14 -0
  24. package/bundled-skills/remote-gpu-trainer/evals/run_evals.py +68 -0
  25. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/README.md +72 -0
  26. package/bundled-skills/remote-gpu-trainer/examples/autodl_sweep/queue_1.txt +6 -0
  27. package/bundled-skills/remote-gpu-trainer/profiles/_schema.md +100 -0
  28. package/bundled-skills/remote-gpu-trainer/profiles/autodl.md +327 -0
  29. package/bundled-skills/remote-gpu-trainer/profiles/china.md +397 -0
  30. package/bundled-skills/remote-gpu-trainer/profiles/generic-ssh.md +450 -0
  31. package/bundled-skills/remote-gpu-trainer/profiles/lambda.md +342 -0
  32. package/bundled-skills/remote-gpu-trainer/profiles/paperspace.md +365 -0
  33. package/bundled-skills/remote-gpu-trainer/profiles/runpod.md +164 -0
  34. package/bundled-skills/remote-gpu-trainer/profiles/vastai.md +355 -0
  35. package/bundled-skills/remote-gpu-trainer/references/china-network.md +206 -0
  36. package/bundled-skills/remote-gpu-trainer/references/gotchas_universal.md +704 -0
  37. package/bundled-skills/remote-gpu-trainer/references/lifecycle_checklist.md +148 -0
  38. package/bundled-skills/remote-gpu-trainer/references/monitoring_patterns.md +327 -0
  39. package/bundled-skills/remote-gpu-trainer/references/multinode.md +190 -0
  40. package/bundled-skills/remote-gpu-trainer/references/parallel_ablation.md +196 -0
  41. package/bundled-skills/remote-gpu-trainer/references/principles.md +179 -0
  42. package/bundled-skills/remote-gpu-trainer/references/self-improvement.md +74 -0
  43. package/bundled-skills/remote-gpu-trainer/references/spot-resilience.md +235 -0
  44. package/bundled-skills/remote-gpu-trainer/references/ssh_transport.md +270 -0
  45. package/bundled-skills/remote-gpu-trainer/references/training/by-domain.md +230 -0
  46. package/bundled-skills/remote-gpu-trainer/references/training/checkpoint-resume.md +368 -0
  47. package/bundled-skills/remote-gpu-trainer/references/training/convergence-debugging.md +187 -0
  48. package/bundled-skills/remote-gpu-trainer/references/training/data-pipeline.md +119 -0
  49. package/bundled-skills/remote-gpu-trainer/references/training/distributed-launch.md +422 -0
  50. package/bundled-skills/remote-gpu-trainer/references/training/oom-memory.md +338 -0
  51. package/bundled-skills/remote-gpu-trainer/references/training/precision-stability.md +401 -0
  52. package/bundled-skills/remote-gpu-trainer/references/training/throughput-profiling.md +451 -0
  53. package/bundled-skills/remote-gpu-trainer/scripts/aggregate_to_fs.sh +55 -0
  54. package/bundled-skills/remote-gpu-trainer/scripts/check_staleness.py +70 -0
  55. package/bundled-skills/remote-gpu-trainer/scripts/download_loop.sh +67 -0
  56. package/bundled-skills/remote-gpu-trainer/scripts/gpu_health.sh +169 -0
  57. package/bundled-skills/remote-gpu-trainer/scripts/health_patrol.sh.template +67 -0
  58. package/bundled-skills/remote-gpu-trainer/scripts/mem_monitor.sh +67 -0
  59. package/bundled-skills/remote-gpu-trainer/scripts/reap_vram_zombies.sh +175 -0
  60. package/bundled-skills/remote-gpu-trainer/scripts/run_one.sh.template +104 -0
  61. package/bundled-skills/remote-gpu-trainer/scripts/run_queue.sh.template +83 -0
  62. package/bundled-skills/remote-gpu-trainer/scripts/setup-china-mirrors.sh +35 -0
  63. package/bundled-skills/remote-gpu-trainer/scripts/verify_local.py +145 -0
  64. package/package.json +1 -1
  65. package/skills_index.json +66 -0
@@ -0,0 +1,83 @@
1
#!/usr/bin/env bash
# Queue iterator for multi-ablation deployment — platform-agnostic.
#
# Network-accel hook so every child (incl. the tracker client) inherits it. Set PROXY_HOOK=":" for an
# explicit no-op; the default hook also degrades to a no-op when it fails, because its stderr is
# discarded and its failure is ignored below.
# See references/gotchas_universal.md and references/china-network.md.
PROXY_HOOK="${PROXY_HOOK:-source /etc/network_turbo}"
# PROXY_HOOK is an OPERATOR-supplied profile snippet (source a file / module load / export ...), eval'd
# on purpose to run an arbitrary setup hook. Set it ONLY from your own trusted profile, never from
# untrusted or remote-derived input.
eval "${PROXY_HOOK}" 2>/dev/null || true
#
# Each queue line: <config_yaml_path> <task> [epochs]   (epochs defaults to 20)
# Calls $RUN_ONE per line — defaults to $DURABLE_DIR/run_one.sh (the durable/shared mount from
# profiles/<platform>.md §8). Export DURABLE_DIR, or set RUN_ONE directly if run_one.sh lives elsewhere.
#
# Usage: ./run_queue.sh <queue_file> [start_index]
#   start_index defaults to 1 (run all). Pass N to RESUME from ablation N (principle #8 — see
#   references/parallel_ablation.md §5).
#
# Exit status: 0 when every cell that ran exited 0; 1 on a usage error, a missing run_one script,
# or at least one failed cell.
#
# IMPORTANT: tmux/bash loads THIS script into memory at launch. Editing it mid-flight does NOT affect
# the running queue; only a NEW launch sees changes. Never overwrite it while a queue reads it
# (references/gotchas_universal.md, never-mutate-inputs-under-a-live-run; principle #6).
set -u

RUN_ONE="${RUN_ONE:-${DURABLE_DIR:-/root/autodl-fs}/run_one.sh}"  # = <durable mount>/run_one.sh; export DURABLE_DIR (profile §8) or set RUN_ONE directly

# Arg-count guard FIRST — under `set -u`, QUEUE="$1" below would abort with an unbound-variable
# error before the Usage check could run. Guard so the Usage message is reachable.
if [ "$#" -lt 1 ]; then
  echo "Usage: $0 <queue_file> [start_index]"
  exit 1
fi

QUEUE="$1"
START="${2:-1}"

if [ -z "$QUEUE" ] || [ ! -f "$QUEUE" ]; then
  echo "Usage: $0 <queue_file> [start_index]"
  exit 1
fi

# start_index must be a plain non-negative integer. Without this check a typo such as "3a" makes the
# `[ "$i" -lt "$START" ]` test below error out on every cell, which evaluates as false and silently
# re-runs the WHOLE queue instead of resuming.
case "$START" in
  ''|*[!0-9]*)
    echo "ERROR: start_index must be a non-negative integer, got '$START'"
    echo "Usage: $0 <queue_file> [start_index]"
    exit 1
    ;;
esac

# Fail fast when the per-cell runner is missing, instead of failing every cell one by one.
if [ ! -f "$RUN_ONE" ]; then
  echo "ERROR: run_one script not found: $RUN_ONE (export DURABLE_DIR or set RUN_ONE)"
  exit 1
fi

HOSTNAME_SHORT=$(hostname -s)
# Count ablation CELLS only (skip #-comments + blank lines) so $TOTAL and the resume index are
# CELL numbers, not raw line numbers — `start_index=N` then resumes from ablation N regardless of how
# many comment/blank lines precede it (the loop below increments i only after the same skip guards).
TOTAL=$(grep -cvE '^[[:space:]]*(#|$)' "$QUEUE")
i=0
fail=0
failed_names=()

echo "=== Queue $(basename "$QUEUE"): $TOTAL ablations, starting from $START on $HOSTNAME_SHORT ==="

# `|| [ -n "${cfg:-}" ]` keeps a final line that has no trailing newline: `read` returns non-zero at
# EOF but has still filled cfg, and grep above already counted that line in $TOTAL.
while IFS=$' \t' read -r cfg task epochs || [ -n "${cfg:-}" ]; do
  # Skip comment/blank lines BEFORE counting so i (and the START resume index) count CELLS, not lines.
  if [ -z "$cfg" ]; then continue; fi
  case "$cfg" in \#*) continue ;; esac   # skip #-prefixed comment lines
  i=$((i+1))
  if [ "$i" -lt "$START" ]; then continue; fi

  EPOCHS="${epochs:-20}"
  NAME=$(basename "$cfg" .yaml)

  echo "================================================================"
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] [$i/$TOTAL] STARTING $NAME ($task, ${EPOCHS}ep)"
  echo "================================================================"

  # stdin is redirected from /dev/null: inside this loop stdin IS the queue file, so a child that
  # reads stdin would otherwise swallow the remaining queue lines and silently end the queue early.
  bash "$RUN_ONE" "$cfg" "$task" "$EPOCHS" </dev/null
  RC=$?
  if [ "$RC" -ne 0 ]; then fail=$((fail+1)); failed_names+=("$NAME"); fi

  echo "[$(date +'%Y-%m-%d %H:%M:%S')] [$i/$TOTAL] FINISHED $NAME (exit=$RC)"
done < "$QUEUE"

echo "================================================================"
if [ "$fail" -eq 0 ]; then
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] === QUEUE DONE on $HOSTNAME_SHORT -- all $TOTAL cell(s) exited 0 ==="
else
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] === QUEUE DONE on $HOSTNAME_SHORT -- $fail cell(s) FAILED: ${failed_names[*]} ==="
fi
echo "================================================================"
# Propagate failure: a queue with any failed cell must NOT exit 0, or tmux/patrol
# automation reads "QUEUE DONE" as success and a broken ablation hides for hours.
[ "$fail" -eq 0 ] || exit 1
@@ -0,0 +1,35 @@
1
#!/usr/bin/env bash
# One-shot China-network setup for a rented GPU box behind the GFW.
# scp this to the instance, then `source` it in the shell that will launch training. It exports env
# vars into the CURRENT shell only, so they do not outlive that shell:
#   scp scripts/setup-china-mirrors.sh <alias>:/root/
#   ssh <alias>                              # then, inside the session / tmux pane that runs the job:
#   source /root/setup-china-mirrors.sh
# A one-shot `ssh <alias> 'source /root/setup-china-mirrors.sh'` keeps only the `pip config` change;
# the exported variables are gone as soon as that remote command's shell exits.
# Full rationale + the no_proxy trap + the resumable-download ladder: references/china-network.md
#
# Deliberately no `set -u`: this file is sourced, so any shell option set here would stay enabled in
# the operator's shell after the script returns. Every expansion below supplies its own default.

# 1. HuggingFace -> hf-mirror (drop-in; identical repo IDs). MUST be set BEFORE importing
#    huggingface_hub / transformers / datasets — they read HF_ENDPOINT at import time.
export HF_ENDPOINT="${HF_ENDPOINT:-https://hf-mirror.com}"
# Keep hf_transfer OFF on flaky CN links — documented hang-with-no-error in exactly these conditions.
export HF_HUB_ENABLE_HF_TRANSFER=0

# 2. Redirect model caches off the small system disk onto the data disk (override DATA_DIR per profile).
DATA_DIR="${DATA_DIR:-/root/autodl-tmp}"
export HF_HOME="${HF_HOME:-$DATA_DIR/huggingface}"
export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
export MODELSCOPE_CACHE="${MODELSCOPE_CACHE:-$DATA_DIR/modelscope}"
mkdir -p "$HF_HOME" "$MODELSCOPE_CACHE"

# 3. pip index -> Tsinghua TUNA (Aliyun / USTC are alternates). Falls back to the env var when
#    `pip config` is unavailable or fails.
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple 2>/dev/null \
  || export PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple

# 4. no_proxy hygiene — ONLY when an overseas proxy is exported in THIS shell. A proxy that fixes
#    huggingface.co will route every domestic mirror overseas and break it unless exempted here.
#    Use leading-dot domains, set BOTH spellings, include loopback.
#    Both spellings of the proxy variables are checked, and any exemptions the operator already had
#    are kept (the mirror list is appended, not substituted).
if [ -n "${http_proxy:-}${https_proxy:-}${HTTP_PROXY:-}${HTTPS_PROXY:-}" ]; then
  _scm_exempt="127.0.0.1,localhost,.tuna.tsinghua.edu.cn,.aliyuncs.com,.modelscope.cn,.hf-mirror.com"
  _scm_prev="${no_proxy:-${NO_PROXY:-}}"
  case ",${_scm_prev}," in
    *",${_scm_exempt},"*)
      # Already applied (e.g. the script was sourced twice) — do not append a duplicate list.
      export no_proxy="$_scm_prev"
      ;;
    *)
      export no_proxy="${_scm_prev:+${_scm_prev},}${_scm_exempt}"
      ;;
  esac
  export NO_PROXY="$no_proxy"
  unset _scm_exempt _scm_prev   # scratch names must not linger in the operator's shell
  echo "[setup-china-mirrors] proxy detected -> exempted domestic mirrors via no_proxy"
fi

echo "[setup-china-mirrors] HF_ENDPOINT=$HF_ENDPOINT HF_HOME=$HF_HOME"
echo "[setup-china-mirrors] done. conda: edit ~/.condarc per references/china-network.md (NEVER mirror pytorch-nightly)."
@@ -0,0 +1,145 @@
1
+ #!/usr/bin/env python
2
+ """Verify integrity of downloaded ckpt directories.
3
+
4
+ For each <name>/ in the target dir, check:
5
+ - best.pth exists
6
+ - best.pth loads cleanly via torch.load
7
+ - best.pth contains a weights key ('model_state_dict' / 'model' / 'state_dict')
8
+ - best_metrics.json exists and is valid JSON
9
+ - reports best epoch + main metric per ablation
10
+
11
+ Usage:
12
+ python verify_local.py <path_to_final_ckpts_dir> [--expect N] [--list-metrics] [--allow-pickle]
13
+
14
+ Exit code:
15
+ 0 = all OK
16
+ 1 = at least one error, an empty input dir, or a dir count != --expect
17
+ """
18
+ from __future__ import annotations
19
+ import argparse
20
+ import json
21
+ import sys
22
+ from pathlib import Path
23
+
24
+
25
def main() -> int:
    """Verify every ablation subdirectory of a downloaded checkpoint directory.

    Command-line arguments are read from ``sys.argv``:
      ckpt_dir        directory whose subdirectories each hold best.pth + best_metrics.json
      --expect N      require exactly N subdirectories
      --list-metrics  print the epoch and main metric of every verified ablation
      --allow-pickle  permit the weights_only=False retry (executes pickle) for trusted files

    Returns:
        0 when every subdirectory verified cleanly; 1 when the input directory is missing, is not
        a directory, is empty, does not match --expect, torch is not importable, or at least one
        subdirectory failed a check.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("ckpt_dir", help="Directory containing ablation subdirs (each with best.pth + best_metrics.json)")
    ap.add_argument("--list-metrics", action="store_true", help="Print per-ablation epoch + main metric")
    ap.add_argument("--expect", type=int, default=None,
                    help="Assert exactly N ablation subdirs are present -- guards a teardown gate against a partial/empty pull")
    ap.add_argument("--allow-pickle", action="store_true",
                    help="Permit the weights_only=False fallback (executes pickle) for checkpoints you trust -- "
                         "needed only when a checkpoint pickles non-tensor objects (e.g. an args Namespace); OFF by default")
    args = ap.parse_args()

    root = Path(args.ckpt_dir)
    if not root.exists():
        print(f"ERROR: {root} does not exist")
        return 1
    if not root.is_dir():
        print(f"ERROR: {root} is not a directory")
        return 1

    # Structural checks BEFORE importing torch: an empty (or short) input must fail
    # LOUDLY here -- never silently print "OK: 0/0" and return success, which would let
    # a Phase-5 teardown gate destroy the rented disk having verified nothing
    # (principle #3: trust the artifact, not a success line; the teardown Iron Law).
    dirs = sorted([d for d in root.iterdir() if d.is_dir()])
    if not dirs:
        print(f"ERROR: no ablation subdirectories found in {root} -- refusing to report success on an empty input")
        return 1
    if args.expect is not None and len(dirs) != args.expect:
        print(f"ERROR: expected {args.expect} ablation dirs but found {len(dirs)} in {root} -- partial/incomplete pull")
        return 1

    try:
        import torch
    except ImportError:
        print("ERROR: torch not installed in this environment")
        return 1

    print(f"Found {len(dirs)} ablation dirs in {root}")
    print()

    ok = 0
    errors: list[tuple[str, str]] = []
    # The epoch is whatever best_metrics.json holds (or "?" when absent/null), so it is not always an int.
    metrics_rows: list[tuple[str, object, str]] = []
    total_size_bytes = 0

    for d in dirs:
        name = d.name
        pth = d / "best.pth"
        metrics_path = d / "best_metrics.json"

        if not pth.exists():
            errors.append((name, "missing best.pth"))
            continue
        if not metrics_path.exists():
            errors.append((name, "missing best_metrics.json"))
            continue

        # Load safe-by-default: weights_only=True refuses to execute pickle, so a poisoned or
        # compromised remote checkpoint cannot run code on the operator's machine. The unsafe
        # weights_only=False path (which DOES execute pickle) is OPT-IN via --allow-pickle: an attacker
        # who controls the remote file could otherwise craft one that fails the safe load to FORCE the
        # fallback, so auto-falling-back would defeat the gate. Pass --allow-pickle ONLY for your own ckpts.
        try:
            ckpt = torch.load(pth, map_location="cpu", weights_only=True)
        except Exception as e_safe:
            if not args.allow_pickle:
                errors.append((name, f"safe load (weights_only=True) failed: {str(e_safe)[:70]} "
                                     "-- re-run with --allow-pickle if this is your own checkpoint"))
                continue
            try:
                print(
                    f"  [warn] {name}: weights_only=True failed; --allow-pickle set, retrying "
                    "weights_only=False (executes pickle -- trust this file)"
                )
                ckpt = torch.load(pth, map_location="cpu", weights_only=False)
            except Exception as e:
                errors.append((name, f"torch.load failed: {str(e)[:100]}"))
                continue

        if not isinstance(ckpt, dict) or not any(k in ckpt for k in ("model_state_dict", "model", "state_dict")):
            errors.append((name, "no model/model_state_dict/state_dict key in checkpoint"))
            continue

        try:
            # JSON is UTF-8 by specification; do not depend on the operator machine's locale encoding.
            with open(metrics_path, encoding="utf-8") as f:
                m = json.load(f)
        except Exception as e:
            errors.append((name, f"best_metrics.json invalid: {str(e)[:80]}"))
            continue

        # Valid JSON is not necessarily an object: a top-level list/number/string has no .get and
        # would otherwise abort the whole run with an AttributeError before the summary is printed.
        if not isinstance(m, dict):
            errors.append((name, f"best_metrics.json is not a JSON object (got {type(m).__name__})"))
            continue

        epoch = m.get("epoch", "?")
        if epoch is None:  # {"epoch": null} → .get returns None (not the default); guard the :3 format. `or` would wrongly eat epoch 0.
            epoch = "?"
        # Pick main metric (PSNR for recon, mAP50 for det, dice for seg, fall back to loss)
        main_metric_key = next(
            (k for k in ["psnr", "mAP50", "dice"] if k in m),
            "loss",
        )
        main_metric_val = m.get(main_metric_key, "?")
        metrics_rows.append((name, epoch, f"{main_metric_key}={main_metric_val}"))

        total_size_bytes += pth.stat().st_size
        ok += 1

    print(f"OK: {ok}/{len(dirs)}")
    print(f"Errors: {len(errors)}")
    shown = errors[:20]
    for name, err in shown:
        print(f"  - {name}: {err}")
    if len(errors) > len(shown):
        # Say that the list was cut, so a long failure list is not mistaken for the full picture.
        print(f"  ... and {len(errors) - len(shown)} more")
    print(f"Total best.pth size: {total_size_bytes / 1e9:.1f} GB")

    if args.list_metrics:
        print()
        print("=== Per-ablation metrics ===")
        for name, epoch, metric in metrics_rows:
            # int/float/str accept the width spec directly; any other JSON value (list, dict)
            # would raise TypeError on ":3", so it is stringified first.
            if isinstance(epoch, (int, float, str)):
                epoch_txt = f"{epoch:3}"
            else:
                epoch_txt = f"{str(epoch):3}"
            print(f"  {name:40s}  epoch={epoch_txt}  {metric}")

    return 0 if not errors else 1
142
+
143
+
144
+ if __name__ == "__main__":
145
+ sys.exit(main())
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "opencode-skills-collection",
3
- "version": "3.1.2",
3
+ "version": "3.1.3",
4
4
  "description": "OpenCode CLI plugin that automatically downloads and keeps skills up to date.",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
package/skills_index.json CHANGED
@@ -551,6 +551,28 @@
551
551
  "reasons": []
552
552
  }
553
553
  },
554
+ {
555
+ "id": "agent-creator",
556
+ "path": "skills/agent-creator",
557
+ "category": "ai-ml",
558
+ "name": "agent-creator",
559
+ "description": "Create custom AI subagents with proper plugin structure, persona generation, and companion routing skills.",
560
+ "risk": "critical",
561
+ "source": "community",
562
+ "date_added": "2026-06-20",
563
+ "plugin": {
564
+ "targets": {
565
+ "codex": "supported",
566
+ "claude": "supported"
567
+ },
568
+ "setup": {
569
+ "type": "none",
570
+ "summary": "",
571
+ "docs": null
572
+ },
573
+ "reasons": []
574
+ }
575
+ },
554
576
  {
555
577
  "id": "agent-evaluation",
556
578
  "path": "skills/agent-evaluation",
@@ -3388,6 +3410,28 @@
3388
3410
  "reasons": []
3389
3411
  }
3390
3412
  },
3413
+ {
3414
+ "id": "ax-extract-workflow",
3415
+ "path": "skills/ax-extract-workflow",
3416
+ "category": "development",
3417
+ "name": "ax-extract-workflow",
3418
+ "description": "Reconstruct workflow behind a past coding-agent artifact using local ax sessions/commits/skills/tool traces. Use when asked how X was built.",
3419
+ "risk": "safe",
3420
+ "source": "community",
3421
+ "date_added": "2026-06-21",
3422
+ "plugin": {
3423
+ "targets": {
3424
+ "codex": "supported",
3425
+ "claude": "supported"
3426
+ },
3427
+ "setup": {
3428
+ "type": "none",
3429
+ "summary": "",
3430
+ "docs": null
3431
+ },
3432
+ "reasons": []
3433
+ }
3434
+ },
3391
3435
  {
3392
3436
  "id": "axiom",
3393
3437
  "path": "skills/axiom",
@@ -27273,6 +27317,28 @@
27273
27317
  "reasons": []
27274
27318
  }
27275
27319
  },
27320
+ {
27321
+ "id": "remote-gpu-trainer",
27322
+ "path": "skills/remote-gpu-trainer",
27323
+ "category": "ml-ops",
27324
+ "name": "remote-gpu-trainer",
27325
+ "description": "Deploy, monitor, and debug long GPU jobs on RENTED/remote instances (AutoDL, RunPod, vast.ai, Lambda, Slurm, K8s): teardown/billing safety, spot resilience, resumable checkpointing, OOM/NaN triage.",
27326
+ "risk": "safe",
27327
+ "source": "community",
27328
+ "date_added": "2026-06-20",
27329
+ "plugin": {
27330
+ "targets": {
27331
+ "codex": "supported",
27332
+ "claude": "supported"
27333
+ },
27334
+ "setup": {
27335
+ "type": "none",
27336
+ "summary": "",
27337
+ "docs": null
27338
+ },
27339
+ "reasons": []
27340
+ }
27341
+ },
27276
27342
  {
27277
27343
  "id": "remotion",
27278
27344
  "path": "skills/remotion",