dt-skills 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,92 @@
1
+ # dt-skills CLI
2
+
3
+ Browse and install AI skills for Claude Code, GitHub Copilot, OpenAI Codex, and OpenCode.
4
+
5
+ ## Quick Start
6
+
7
+ ```sh
8
+ npx dt-skills list
9
+ npx dt-skills install code-review
10
+ ```
11
+
12
+ ## Commands
13
+
14
+ ### `list`
15
+
16
+ List all available skills in the registry.
17
+
18
+ ```sh
19
+ npx dt-skills list
20
+ ```
21
+
22
+ ### `search <query>`
23
+
24
+ Search skills by name or description.
25
+
26
+ ```sh
27
+ npx dt-skills search review
28
+ ```
29
+
30
+ ### `info <skill>`
31
+
32
+ Show detailed information about a skill, including its instructions, configuration, and related skills.
33
+
34
+ ```sh
35
+ npx dt-skills info code-review
36
+ ```
37
+
38
+ ### `install <skill>`
39
+
40
+ Install a skill to your local environment. Skills with related skills will install all of them automatically.
41
+
42
+ ```sh
43
+ npx dt-skills install code-review
44
+ ```
45
+
46
+ **Options:**
47
+
48
+ | Flag | Description |
49
+ | ------------------------- | ----------------------------------------------------------------------- |
50
+ | `-g, --global` | Install globally (personal config) instead of the current project |
51
+ | `-t, --harness <harness>` | Target harness: `claude-code` (default), `copilot`, `codex`, `opencode` |
52
+
53
+ **Examples:**
54
+
55
+ ```sh
56
+ # Install to current project for Claude Code (default)
57
+ npx dt-skills install code-review
58
+
59
+ # Install for GitHub Copilot
60
+ npx dt-skills install code-review -t copilot
61
+
62
+ # Install for OpenAI Codex
63
+ npx dt-skills install code-review -t codex
64
+
65
+ # Install for OpenCode
66
+ npx dt-skills install code-review -t opencode
67
+
68
+ # Install globally (personal config)
69
+ npx dt-skills install code-review -g
70
+ ```
71
+
72
+ ## Install Locations
73
+
74
+ | Harness | Project | Personal |
75
+ | ----------- | ---------------------------------------------- | ------------------------------------------------ |
76
+ | Claude Code | `.claude/skills/<skill>/SKILL.md` | `~/.claude/skills/<skill>/SKILL.md` |
77
+ | Copilot | `.github/instructions/<skill>.instructions.md` | `~/.github/instructions/<skill>.instructions.md` |
78
+ | Codex | `.codex/agents/<skill>.md` | `~/.codex/instructions.md` (appended) |
79
+ | OpenCode | `.opencode/skills/<skill>/SKILL.md` | `~/.config/opencode/skills/<skill>/SKILL.md` |
80
+
81
+ ## Related Skills
82
+
83
+ Some skills bundle related skills that get installed together. For example, `code-review` ships with `review-pr` and `review-diff`. When you install the parent skill, all related skills are installed as separate, independent skills.
84
+
85
+ ```sh
86
+ npx dt-skills info code-review # see related skills
87
+ npx dt-skills install code-review # installs all 3
88
+ ```
89
+
90
+ ## License
91
+
92
+ MIT
package/dist/cli.d.ts ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env node
2
+ export {};
package/dist/cli.js ADDED
@@ -0,0 +1,33 @@
1
+ #!/usr/bin/env node
2
+ import { Command } from "commander";
3
+ import { listCommand } from "./commands/list.js";
4
+ import { infoCommand } from "./commands/info.js";
5
+ import { installCommand } from "./commands/install.js";
6
+ import { searchCommand } from "./commands/search.js";
7
+ const program = new Command();
8
+ program
9
+ .name("dt-skills")
10
+ .description("dt-skills AI Skills Registry - browse and install Claude Code skills")
11
+ .version("0.1.0");
12
+ program
13
+ .command("list")
14
+ .alias("ls")
15
+ .description("List all available skills in the registry")
16
+ .action(listCommand);
17
+ program
18
+ .command("info <skill>")
19
+ .description("Show detailed information about a skill")
20
+ .action(infoCommand);
21
+ program
22
+ .command("install <skill>")
23
+ .alias("i")
24
+ .description("Install a skill to your local environment")
25
+ .option("-g, --global", "Install globally instead of to the current project")
26
+ .option("-t, --harness <harness>", "Target harness: claude-code, copilot, codex, opencode", "claude-code")
27
+ .action(installCommand);
28
+ program
29
+ .command("search <query>")
30
+ .alias("s")
31
+ .description("Search skills by name or description")
32
+ .action(searchCommand);
33
+ program.parse();
@@ -0,0 +1 @@
1
+ export declare function infoCommand(name: string): void;
@@ -0,0 +1,53 @@
1
+ import chalk from "chalk";
2
+ import { findSkill } from "../registry.js";
3
+ export function infoCommand(name) {
4
+ const skill = findSkill(name);
5
+ if (!skill) {
6
+ console.log(chalk.red(`Skill "${name}" not found.`));
7
+ console.log(chalk.gray(`Run ${chalk.white("dt-skills list")} to see available skills.`));
8
+ process.exitCode = 1;
9
+ return;
10
+ }
11
+ console.log(chalk.bold(`\n${skill.name}\n`));
12
+ console.log(chalk.gray("─".repeat(60)));
13
+ console.log(`${chalk.bold("Description:")} ${skill.description}`);
14
+ // Show frontmatter config
15
+ const fm = skill.frontmatter;
16
+ if (fm["disable-model-invocation"]) {
17
+ console.log(`${chalk.bold("Invocation:")} Manual only (/${skill.name})`);
18
+ }
19
+ else if (fm["user-invocable"] === false) {
20
+ console.log(`${chalk.bold("Invocation:")} Auto only (Claude invokes when relevant)`);
21
+ }
22
+ else {
23
+ console.log(`${chalk.bold("Invocation:")} Auto + manual (/${skill.name})`);
24
+ }
25
+ if (fm["allowed-tools"]) {
26
+ console.log(`${chalk.bold("Tools:")} ${fm["allowed-tools"]}`);
27
+ }
28
+ if (fm.context) {
29
+ console.log(`${chalk.bold("Context:")} ${fm.context}`);
30
+ }
31
+ if (fm.agent) {
32
+ console.log(`${chalk.bold("Agent:")} ${fm.agent}`);
33
+ }
34
+ // Show files
35
+ if (skill.files.length > 1) {
36
+ console.log(`\n${chalk.bold("Files:")}`);
37
+ for (const file of skill.files) {
38
+ console.log(` ${chalk.gray("•")} ${file}`);
39
+ }
40
+ }
41
+ // Show related skills
42
+ if (skill.relatedSkills.length > 0) {
43
+ console.log(`\n${chalk.bold("Related skills:")}`);
44
+ for (const related of skill.relatedSkills) {
45
+ console.log(` ${chalk.gray("•")} ${chalk.cyan(related.name)} - ${related.description}`);
46
+ }
47
+ }
48
+ // Show skill content
49
+ console.log(`\n${chalk.bold("Instructions:")}`);
50
+ console.log(chalk.gray("─".repeat(60)));
51
+ console.log(skill.content);
52
+ console.log();
53
+ }
@@ -0,0 +1,4 @@
1
+ export declare function installCommand(name: string, options: {
2
+ global?: boolean;
3
+ harness?: string;
4
+ }): void;
@@ -0,0 +1,36 @@
1
+ import chalk from "chalk";
2
+ import { findSkill } from "../registry.js";
3
+ import { harnesses, harnessNames } from "../harnesses.js";
4
+ export function installCommand(name, options) {
5
+ const skill = findSkill(name);
6
+ if (!skill) {
7
+ console.log(chalk.red(`Skill "${name}" not found.`));
8
+ console.log(chalk.gray(`Run ${chalk.white("dt-skills list")} to see available skills.`));
9
+ process.exitCode = 1;
10
+ return;
11
+ }
12
+ const harnessKey = options.harness || "claude-code";
13
+ const harness = harnesses[harnessKey];
14
+ if (!harness) {
15
+ console.log(chalk.red(`Unknown harness "${harnessKey}".`));
16
+ console.log(chalk.gray(`Available: ${harnessNames.join(", ")}`));
17
+ process.exitCode = 1;
18
+ return;
19
+ }
20
+ const project = !(options.global ?? false);
21
+ const scope = project ? "project" : "personal";
22
+ const paths = harness.install(skill, { project });
23
+ const relatedCount = skill.relatedSkills.length;
24
+ if (relatedCount > 0) {
25
+ console.log(chalk.green(`\nInstalled "${skill.name}" + ${relatedCount} related skill${relatedCount === 1 ? "" : "s"} for ${harness.name} (${scope}).`));
26
+ }
27
+ else {
28
+ console.log(chalk.green(`\nInstalled "${skill.name}" for ${harness.name} (${scope}).`));
29
+ }
30
+ for (const p of paths) {
31
+ console.log(chalk.gray(` ${p}`));
32
+ }
33
+ console.log();
34
+ console.log(harness.usageHint(skill));
35
+ console.log();
36
+ }
@@ -0,0 +1 @@
1
+ export declare function listCommand(): void;
@@ -0,0 +1,23 @@
1
+ import chalk from "chalk";
2
+ import { loadRegistry } from "../registry.js";
3
+ export function listCommand() {
4
+ const skills = loadRegistry();
5
+ if (skills.length === 0) {
6
+ console.log(chalk.yellow("No skills found in the registry."));
7
+ return;
8
+ }
9
+ console.log(chalk.bold(`\ndt-skills Skills Registry (${skills.length} skills)\n`));
10
+ console.log(chalk.gray("─".repeat(60)));
11
+ for (const skill of skills) {
12
+ const invocable = skill.frontmatter["disable-model-invocation"]
13
+ ? chalk.gray(" (manual only)")
14
+ : "";
15
+ const related = skill.relatedSkills.length > 0
16
+ ? chalk.magenta(` (+${skill.relatedSkills.length} related)`)
17
+ : "";
18
+ console.log(` ${chalk.cyan(skill.name)}${invocable}${related}`);
19
+ console.log(` ${chalk.gray(skill.description)}`);
20
+ console.log();
21
+ }
22
+ console.log(chalk.gray(`Run ${chalk.white("dt-skills info <skill>")} for details or ${chalk.white("dt-skills install <skill>")} to install.`));
23
+ }
@@ -0,0 +1 @@
1
+ export declare function searchCommand(query: string): void;
@@ -0,0 +1,19 @@
1
+ import chalk from "chalk";
2
+ import { loadRegistry } from "../registry.js";
3
+ export function searchCommand(query) {
4
+ const skills = loadRegistry();
5
+ const q = query.toLowerCase();
6
+ const matches = skills.filter((s) => s.name.toLowerCase().includes(q) ||
7
+ s.description.toLowerCase().includes(q) ||
8
+ s.content.toLowerCase().includes(q));
9
+ if (matches.length === 0) {
10
+ console.log(chalk.yellow(`No skills matching "${query}".`));
11
+ return;
12
+ }
13
+ console.log(chalk.bold(`\n${matches.length} skill${matches.length === 1 ? "" : "s"} matching "${query}"\n`));
14
+ for (const skill of matches) {
15
+ console.log(` ${chalk.cyan(skill.name)}`);
16
+ console.log(` ${chalk.gray(skill.description)}`);
17
+ console.log();
18
+ }
19
+ }
@@ -0,0 +1,12 @@
1
+ import type { Skill } from "./registry.js";
2
+ export interface Harness {
3
+ name: string;
4
+ /** How to use the skill after install */
5
+ usageHint: (skill: Skill) => string;
6
+ /** Install the skill, returns the paths where it was installed */
7
+ install: (skill: Skill, opts: {
8
+ project: boolean;
9
+ }) => string[];
10
+ }
11
+ export declare const harnesses: Record<string, Harness>;
12
+ export declare const harnessNames: string[];
@@ -0,0 +1,188 @@
1
+ import fs from "node:fs";
2
+ import path from "node:path";
3
+ function copyDir(src, dest, excludeDirs = []) {
4
+ fs.mkdirSync(dest, { recursive: true });
5
+ for (const entry of fs.readdirSync(src, { withFileTypes: true })) {
6
+ if (excludeDirs.includes(entry.name))
7
+ continue;
8
+ const srcPath = path.join(src, entry.name);
9
+ const destPath = path.join(dest, entry.name);
10
+ if (entry.isDirectory()) {
11
+ copyDir(srcPath, destPath);
12
+ }
13
+ else {
14
+ fs.copyFileSync(srcPath, destPath);
15
+ }
16
+ }
17
+ }
18
+ function writeCopilotFile(dest, skill) {
19
+ const lines = [];
20
+ lines.push("---");
21
+ lines.push(`applyTo: "**"`);
22
+ lines.push("---");
23
+ lines.push("");
24
+ lines.push(`# ${skill.name}`);
25
+ lines.push("");
26
+ lines.push(skill.content);
27
+ fs.writeFileSync(dest, lines.join("\n"), "utf-8");
28
+ }
29
+ const home = process.env.HOME || "~";
30
+ // ---------------------------------------------------------------------------
31
+ // Claude Code
32
+ // Personal: ~/.claude/skills/<skill>/SKILL.md
33
+ // Project: .claude/skills/<skill>/SKILL.md
34
+ // ---------------------------------------------------------------------------
35
+ const claudeCode = {
36
+ name: "Claude Code",
37
+ usageHint(skill) {
38
+ if (skill.frontmatter["disable-model-invocation"]) {
39
+ return `Use in Claude Code: /${skill.name}`;
40
+ }
41
+ return `Use in Claude Code: /${skill.name} or let Claude invoke it automatically`;
42
+ },
43
+ install(skill, { project }) {
44
+ const base = project
45
+ ? path.join(process.cwd(), ".claude", "skills")
46
+ : path.join(home, ".claude", "skills");
47
+ const dest = path.join(base, skill.id);
48
+ overwriteDir(skill.registryPath, dest, ["related-skills"]);
49
+ const paths = [dest];
50
+ for (const related of skill.relatedSkills) {
51
+ const relDest = path.join(base, related.id);
52
+ overwriteDir(related.registryPath, relDest);
53
+ paths.push(relDest);
54
+ }
55
+ return paths;
56
+ },
57
+ };
58
+ // ---------------------------------------------------------------------------
59
+ // GitHub Copilot
60
+ // Project only: .github/instructions/<skill>.instructions.md
61
+ // Each file uses YAML frontmatter with `applyTo` for scoping.
62
+ // ---------------------------------------------------------------------------
63
+ const copilot = {
64
+ name: "GitHub Copilot",
65
+ usageHint(skill) {
66
+ return `Copilot will apply these instructions automatically when editing matching files.`;
67
+ },
68
+ install(skill, { project }) {
69
+ const base = project
70
+ ? path.join(process.cwd(), ".github", "instructions")
71
+ : path.join(home, ".github", "instructions");
72
+ fs.mkdirSync(base, { recursive: true });
73
+ const dest = path.join(base, `${skill.id}.instructions.md`);
74
+ // Convert to Copilot instruction format
75
+ writeCopilotFile(dest, skill);
76
+ const paths = [dest];
77
+ for (const related of skill.relatedSkills) {
78
+ const relDest = path.join(base, `${related.id}.instructions.md`);
79
+ writeCopilotFile(relDest, related);
80
+ paths.push(relDest);
81
+ }
82
+ return paths;
83
+ },
84
+ };
85
+ // ---------------------------------------------------------------------------
86
+ // OpenAI Codex
87
+ // Personal: ~/.codex/instructions.md (single file, append)
88
+ // Project: AGENTS.md or .codex/agents/<skill>.md
89
+ // ---------------------------------------------------------------------------
90
+ const codex = {
91
+ name: "Codex",
92
+ usageHint(skill) {
93
+ return `Codex will load these instructions automatically at the start of each session.`;
94
+ },
95
+ install(skill, { project }) {
96
+ if (project) {
97
+ // Install as a scoped agent file in .codex/agents/
98
+ const base = path.join(process.cwd(), ".codex", "agents");
99
+ fs.mkdirSync(base, { recursive: true });
100
+ const dest = path.join(base, `${skill.id}.md`);
101
+ writeCodexAgentFile(dest, skill);
102
+ const paths = [dest];
103
+ for (const related of skill.relatedSkills) {
104
+ const relDest = path.join(base, `${related.id}.md`);
105
+ writeCodexAgentFile(relDest, related);
106
+ paths.push(relDest);
107
+ }
108
+ return paths;
109
+ }
110
+ else {
111
+ // Append to global instructions
112
+ const dest = path.join(home, ".codex", "instructions.md");
113
+ fs.mkdirSync(path.dirname(dest), { recursive: true });
114
+ const allSkills = [skill, ...skill.relatedSkills];
115
+ for (const s of allSkills) {
116
+ appendCodexGlobal(dest, s);
117
+ }
118
+ return [dest];
119
+ }
120
+ },
121
+ };
122
+ // ---------------------------------------------------------------------------
123
+ // OpenCode
124
+ // Personal: ~/.config/opencode/skills/<skill>/SKILL.md
125
+ // Project: .opencode/skills/<skill>/SKILL.md
126
+ // ---------------------------------------------------------------------------
127
+ const openCode = {
128
+ name: "OpenCode",
129
+ usageHint(skill) {
130
+ if (skill.frontmatter["disable-model-invocation"]) {
131
+ return `Use in OpenCode: /${skill.name}`;
132
+ }
133
+ return `Use in OpenCode: /${skill.name} or let OpenCode invoke it automatically`;
134
+ },
135
+ install(skill, { project }) {
136
+ const base = project
137
+ ? path.join(process.cwd(), ".opencode", "skills")
138
+ : path.join(home, ".config", "opencode", "skills");
139
+ const dest = path.join(base, skill.id);
140
+ overwriteDir(skill.registryPath, dest, ["related-skills"]);
141
+ const paths = [dest];
142
+ for (const related of skill.relatedSkills) {
143
+ const relDest = path.join(base, related.id);
144
+ overwriteDir(related.registryPath, relDest);
145
+ paths.push(relDest);
146
+ }
147
+ return paths;
148
+ },
149
+ };
150
+ function writeCodexAgentFile(dest, skill) {
151
+ const lines = [];
152
+ lines.push(`# ${skill.name}`);
153
+ lines.push("");
154
+ lines.push(skill.content);
155
+ fs.writeFileSync(dest, lines.join("\n"), "utf-8");
156
+ }
157
+ function appendCodexGlobal(dest, skill) {
158
+ const section = `\n\n## ${skill.name}\n\n${skill.content}\n`;
159
+ if (fs.existsSync(dest)) {
160
+ const existing = fs.readFileSync(dest, "utf-8");
161
+ if (existing.includes(`## ${skill.name}`)) {
162
+ const regex = new RegExp(`## ${escapeRegex(skill.name)}\\n[\\s\\S]*?(?=\\n## |$)`);
163
+ fs.writeFileSync(dest, existing.replace(regex, `## ${skill.name}\n\n${skill.content}\n`), "utf-8");
164
+ }
165
+ else {
166
+ fs.appendFileSync(dest, section, "utf-8");
167
+ }
168
+ }
169
+ else {
170
+ fs.writeFileSync(dest, `# dt-skills Skills${section}`, "utf-8");
171
+ }
172
+ }
173
+ function escapeRegex(str) {
174
+ return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
175
+ }
176
+ function overwriteDir(src, dest, excludeDirs = []) {
177
+ if (fs.existsSync(dest)) {
178
+ fs.rmSync(dest, { recursive: true });
179
+ }
180
+ copyDir(src, dest, excludeDirs);
181
+ }
182
+ export const harnesses = {
183
+ "claude-code": claudeCode,
184
+ copilot,
185
+ codex,
186
+ opencode: openCode,
187
+ };
188
+ export const harnessNames = Object.keys(harnesses);
@@ -0,0 +1,21 @@
1
+ export interface Skill {
2
+ /** Directory name of the skill */
3
+ id: string;
4
+ /** Display name from frontmatter or directory name */
5
+ name: string;
6
+ /** What the skill does */
7
+ description: string;
8
+ /** Full path to the skill directory in the registry */
9
+ registryPath: string;
10
+ /** Raw SKILL.md content (without frontmatter) */
11
+ content: string;
12
+ /** All frontmatter fields */
13
+ frontmatter: Record<string, unknown>;
14
+ /** List of supporting files relative to the skill directory */
15
+ files: string[];
16
+ /** Related skills bundled with this skill */
17
+ relatedSkills: Skill[];
18
+ }
19
+ export declare function loadSkill(skillDir: string): Skill | null;
20
+ export declare function loadRegistry(): Skill[];
21
+ export declare function findSkill(nameOrId: string): Skill | undefined;
@@ -0,0 +1,80 @@
1
+ import fs from "node:fs";
2
+ import path from "node:path";
3
+ import matter from "gray-matter";
4
+ function getRegistryDir() {
5
+ // Skills live in the `skills/` directory next to the package
6
+ const thisFile = new URL(import.meta.url).pathname;
7
+ const packageRoot = path.resolve(path.dirname(thisFile), "..");
8
+ return path.join(packageRoot, "skills");
9
+ }
10
+ function listFiles(dir, base = "", excludeDirs = []) {
11
+ const results = [];
12
+ for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
13
+ if (excludeDirs.includes(entry.name))
14
+ continue;
15
+ const rel = path.join(base, entry.name);
16
+ if (entry.isDirectory()) {
17
+ results.push(...listFiles(path.join(dir, entry.name), rel));
18
+ }
19
+ else {
20
+ results.push(rel);
21
+ }
22
+ }
23
+ return results;
24
+ }
25
+ export function loadSkill(skillDir) {
26
+ const skillMd = path.join(skillDir, "SKILL.md");
27
+ if (!fs.existsSync(skillMd))
28
+ return null;
29
+ const raw = fs.readFileSync(skillMd, "utf-8");
30
+ const { data, content } = matter(raw);
31
+ const id = path.basename(skillDir);
32
+ const allFiles = listFiles(skillDir, "", ["related-skills"]);
33
+ // Load related skills from related-skills/ subdirectory
34
+ const relatedSkills = [];
35
+ const relatedDir = path.join(skillDir, "related-skills");
36
+ if (fs.existsSync(relatedDir)) {
37
+ for (const entry of fs.readdirSync(relatedDir, {
38
+ withFileTypes: true,
39
+ })) {
40
+ if (!entry.isDirectory())
41
+ continue;
42
+ const related = loadSkill(path.join(relatedDir, entry.name));
43
+ if (related)
44
+ relatedSkills.push(related);
45
+ }
46
+ }
47
+ return {
48
+ id,
49
+ name: data.name || id,
50
+ description: data.description ||
51
+ content
52
+ .split("\n")
53
+ .find((l) => l.trim())
54
+ ?.trim() ||
55
+ "",
56
+ registryPath: skillDir,
57
+ content: content.trim(),
58
+ frontmatter: data,
59
+ files: allFiles,
60
+ relatedSkills,
61
+ };
62
+ }
63
+ export function loadRegistry() {
64
+ const registryDir = getRegistryDir();
65
+ if (!fs.existsSync(registryDir))
66
+ return [];
67
+ const skills = [];
68
+ for (const entry of fs.readdirSync(registryDir, { withFileTypes: true })) {
69
+ if (!entry.isDirectory())
70
+ continue;
71
+ const skill = loadSkill(path.join(registryDir, entry.name));
72
+ if (skill)
73
+ skills.push(skill);
74
+ }
75
+ return skills.sort((a, b) => a.name.localeCompare(b.name));
76
+ }
77
+ export function findSkill(nameOrId) {
78
+ const skills = loadRegistry();
79
+ return skills.find((s) => s.id === nameOrId || s.name === nameOrId);
80
+ }
package/package.json ADDED
@@ -0,0 +1,41 @@
1
+ {
2
+ "name": "dt-skills",
3
+ "version": "0.1.0",
4
+ "description": "DataThink AI Skills Registry - browse and install agent skills",
5
+ "type": "module",
6
+ "bin": {
7
+ "dt-skills": "dist/cli.js"
8
+ },
9
+ "publishConfig": {
10
+ "access": "public"
11
+ },
12
+ "scripts": {
13
+ "build": "tsc",
14
+ "dev": "tsx src/cli.ts",
15
+ "prepublishOnly": "npm run build"
16
+ },
17
+ "files": [
18
+ "dist",
19
+ "skills"
20
+ ],
21
+ "keywords": [
22
+ "claude-code",
23
+ "codex",
24
+ "copilot",
25
+ "opencode",
26
+ "skills",
27
+ "ai"
28
+ ],
29
+ "author": "eglenn-dev",
30
+ "license": "MIT",
31
+ "dependencies": {
32
+ "chalk": "^5.6.2",
33
+ "commander": "^14.0.3",
34
+ "gray-matter": "^4.0.3"
35
+ },
36
+ "devDependencies": {
37
+ "@types/node": "^25.3.3",
38
+ "tsx": "^4.21.0",
39
+ "typescript": "^5.9.3"
40
+ }
41
+ }
@@ -0,0 +1,110 @@
1
+ ---
2
+ description: Code review a pull request
3
+ user-invocable: true
4
+ ---
5
+
6
+ Provide a code review for the given pull request.
7
+
8
+ **Agent assumptions (applies to all agents and subagents):**
9
+
10
+ - All tools are functional and will work without error. Do not test tools or make exploratory calls. Make sure this is clear to every subagent that is launched.
11
+ - Only call a tool if it is required to complete the task. Every tool call should have a clear purpose.
12
+
13
+ To do this, follow these steps precisely:
14
+
15
+ 1. Launch a haiku agent to check if any of the following are true:
16
+ - The pull request is closed
17
+ - The pull request is a draft
18
+ - The pull request does not need code review (e.g. automated PR, trivial change that is obviously correct)
19
+ - Claude has already commented on this PR (check `gh pr view <PR> --comments` for comments left by claude)
20
+
21
+ If any condition is true, stop and do not proceed.
22
+
23
+ Note: Still review Claude generated PR's.
24
+
25
+ 2. Launch a haiku agent to return a list of file paths (not their contents) for all relevant CLAUDE.md files including:
26
+ - The root CLAUDE.md file, if it exists
27
+ - Any CLAUDE.md files in directories containing files modified by the pull request
28
+
29
+ 3. Launch a sonnet agent to view the pull request and return a summary of the changes
30
+
31
+ 4. Launch 4 agents in parallel to independently review the changes. Each agent should return the list of issues, where each issue includes a description and the reason it was flagged (e.g. "CLAUDE.md adherence", "bug"). The agents should do the following:
32
+
33
+ Agents 1 + 2: CLAUDE.md compliance sonnet agents
34
+ Audit changes for CLAUDE.md compliance in parallel. Note: When evaluating CLAUDE.md compliance for a file, you should only consider CLAUDE.md files that share a file path with the file or parents.
35
+
36
+ Agent 3: Opus bug agent (parallel subagent with agent 4)
37
+ Scan for obvious bugs. Focus only on the diff itself without reading extra context. Flag only significant bugs; ignore nitpicks and likely false positives. Do not flag issues that you cannot validate without looking at context outside of the git diff.
38
+
39
+ Agent 4: Opus bug agent (parallel subagent with agent 3)
40
+ Look for problems that exist in the introduced code. This could be security issues, incorrect logic, etc. Only look for issues that fall within the changed code.
41
+
42
+ **CRITICAL: We only want HIGH SIGNAL issues.** Flag issues where:
43
+ - The code will fail to compile or parse (syntax errors, type errors, missing imports, unresolved references)
44
+ - The code will definitely produce wrong results regardless of inputs (clear logic errors)
45
+ - Clear, unambiguous CLAUDE.md violations where you can quote the exact rule being broken
46
+
47
+ Do NOT flag:
48
+ - Code style or quality concerns
49
+ - Potential issues that depend on specific inputs or state
50
+ - Subjective suggestions or improvements
51
+
52
+ If you are not certain an issue is real, do not flag it. False positives erode trust and waste reviewer time.
53
+
54
+ In addition to the above, each subagent should be told the PR title and description. This will help provide context regarding the author's intent.
55
+
56
+ 5. For each issue found in the previous step by agents 3 and 4, launch parallel subagents to validate the issue. These subagents should get the PR title and description along with a description of the issue. The agent's job is to review the issue to validate that the stated issue is truly an issue with high confidence. For example, if an issue such as "variable is not defined" was flagged, the subagent's job would be to validate that is actually true in the code. Another example would be CLAUDE.md issues. The agent should validate that the CLAUDE.md rule that was violated is scoped for this file and is actually violated. Use Opus subagents for bugs and logic issues, and sonnet agents for CLAUDE.md violations.
57
+
58
+ 6. Filter out any issues that were not validated in step 5. This step will give us our list of high signal issues for our review.
59
+
60
+ 7. Output a summary of the review findings to the terminal:
61
+ - If issues were found, list each issue with a brief description.
62
+ - If no issues were found, state: "No issues found. Checked for bugs and CLAUDE.md compliance."
63
+
64
+ If `--comment` argument was NOT provided, stop here. Do not post any GitHub comments.
65
+
66
+ If `--comment` argument IS provided and NO issues were found, post a summary comment using `gh pr comment` and stop.
67
+
68
+ If `--comment` argument IS provided and issues were found, continue to step 8.
69
+
70
+ 8. Create a list of all comments that you plan on leaving. This is only for you to make sure you are comfortable with the comments. Do not post this list anywhere.
71
+
72
+ 9. Post inline comments for each issue using `mcp__github_inline_comment__create_inline_comment`. For each comment:
73
+ - Provide a brief description of the issue
74
+ - For small, self-contained fixes, include a committable suggestion block
75
+ - For larger fixes (6+ lines, structural changes, or changes spanning multiple locations), describe the issue and suggested fix without a suggestion block
76
+ - Never post a committable suggestion UNLESS committing the suggestion fixes the issue entirely. If follow up steps are required, do not leave a committable suggestion.
77
+
78
+ **IMPORTANT: Only post ONE comment per unique issue. Do not post duplicate comments.**
79
+
80
+ Use this list when evaluating issues in Steps 4 and 5 (these are false positives, do NOT flag):
81
+
82
+ - Pre-existing issues
83
+ - Something that appears to be a bug but is actually correct
84
+ - Pedantic nitpicks that a senior engineer would not flag
85
+ - Issues that a linter will catch (do not run the linter to verify)
86
+ - General code quality concerns (e.g., lack of test coverage, general security issues) unless explicitly required in CLAUDE.md
87
+ - Issues mentioned in CLAUDE.md but explicitly silenced in the code (e.g., via a lint ignore comment)
88
+
89
+ Notes:
90
+
91
+ - Use gh CLI to interact with GitHub (e.g., fetch pull requests, create comments). Do not use web fetch.
92
+ - Create a todo list before starting.
93
+ - You must cite and link each issue in inline comments (e.g., if referring to a CLAUDE.md, include a link to it).
94
+ - If no issues are found and `--comment` argument is provided, post a comment with the following format:
95
+
96
+ ---
97
+
98
+ ## Code review
99
+
100
+ No issues found. Checked for bugs and CLAUDE.md compliance.
101
+
102
+ ---
103
+
104
+ - When linking to code in inline comments, follow the following format precisely, otherwise the Markdown preview won't render correctly: https://github.com/anthropics/claude-code/blob/c21d3c10bc8e898b7ac1a2d745bdc9bc4e423afe/package.json#L10-L15
105
+ - Requires full git sha
106
+ - You must provide the full sha. Commands like `https://github.com/owner/repo/blob/$(git rev-parse HEAD)/foo/bar` will not work, since your comment will be directly rendered in Markdown.
107
+ - Repo name must match the repo you're code reviewing
108
+ - # sign after the file name
109
+ - Line range format is L[start]-L[end]
110
+ - Provide at least 1 line of context before and after, centered on the line you are commenting about (eg. if you are commenting about lines 5-6, you should link to `L4-7`)
@@ -0,0 +1,17 @@
1
+ ---
2
+ description: Create a git commit
3
+ user-invocable: true
4
+ ---
5
+
6
+ ## Context
7
+
8
+ - Current git status: !`git status`
9
+ - Current git diff (staged and unstaged changes): !`git diff HEAD`
10
+ - Current branch: !`git branch --show-current`
11
+ - Recent commits: !`git log --oneline -10`
12
+
13
+ ## Your task
14
+
15
+ Based on the above changes, create a single git commit.
16
+
17
+ You have the capability to call multiple tools in a single response. Stage and create the commit using a single message. Do not use any other tools or do anything else. Do not send any other text or messages besides these tool calls.
@@ -0,0 +1,13 @@
1
+ ---
2
+ name: explain-code
3
+ description: Explains code with visual diagrams and analogies. Use when explaining how code works, teaching about a codebase, or when asked "how does this work?"
4
+ ---
5
+
6
+ When explaining code, always include:
7
+
8
+ 1. **Start with an analogy**: Compare the code to something from everyday life
9
+ 2. **Draw a diagram**: Use ASCII art to show the flow, structure, or relationships
10
+ 3. **Walk through the code**: Explain step-by-step what happens when the code runs
11
+ 4. **Highlight a gotcha**: What's a common mistake or misconception?
12
+
13
+ Keep explanations conversational. For complex concepts, use multiple analogies at different levels of abstraction.
@@ -0,0 +1,23 @@
1
+ ---
2
+ name: pr-summary
3
+ description: Generate a pull request summary from the current branch changes
4
+ disable-model-invocation: true
5
+ ---
6
+
7
+ ## Pull request context
8
+
9
+ - PR diff: !`git diff main...HEAD`
10
+ - Changed files: !`git diff main...HEAD --name-only`
11
+ - Commit log: !`git log main...HEAD --oneline`
12
+
13
+ ## Your task
14
+
15
+ Summarize this pull request by:
16
+
17
+ 1. **Title**: Write a concise PR title (under 70 characters)
18
+ 2. **Summary**: 1-3 bullet points describing what changed and why
19
+ 3. **Key changes**: Group file changes by area (e.g., API, UI, tests, config)
20
+ 4. **Testing notes**: Suggest what reviewers should test
21
+ 5. **Risk assessment**: Low/Medium/High with brief justification
22
+
23
+ Format the output as a ready-to-use PR description in markdown.
@@ -0,0 +1,506 @@
1
+ ---
2
+ name: skill-creator
3
+ description: Create new skills, modify and improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, update or optimize an existing skill, run evals to test a skill, benchmark skill performance with variance analysis, or optimize a skill's description for better triggering accuracy.
4
+ ---
5
+
6
+ # Skill Creator
7
+
8
+ A skill for creating new skills and iteratively improving them.
9
+
10
+ At a high level, the process of creating a skill goes like this:
11
+
12
+ - Decide what you want the skill to do and roughly how it should do it
13
+ - Write a draft of the skill
14
+ - Create a few test prompts that are realistic and representative of what users would actually say when they want to use this skill
15
+ - Help the user evaluate the results both qualitatively and quantitatively
16
+ - While the runs happen in the background, draft some quantitative evals if there aren't any (if there are some, you can either use as is or modify if you feel something needs to change about them). Then explain them to the user (or if they already existed, explain the ones that already exist)
17
+ - Rewrite the skill based on feedback from the user's evaluation of the results (and also if there are any glaring flaws that become apparent from the quantitative benchmarks)
18
+ - Repeat until you're satisfied
19
+ - Expand the test set and try again at larger scale
20
+
21
+ Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress through these stages. So for instance, maybe they're like "I want to make a skill for X". You can help narrow down what they mean, write a draft, write the test cases, figure out how they want to evaluate, run all the prompts, and repeat.
22
+
23
+ On the other hand, maybe they already have a draft of the skill. In this case you can go straight to the eval/iterate part of the loop.
24
+
25
+ Of course, you should always be flexible and if the user is like "I don't need to run a bunch of evaluations, just vibe with me", you can do that instead.
26
+
27
+ Then after the skill is done (but again, the order is flexible), you can also run the skill description improver, which we have a whole separate script for, to optimize the triggering of the skill.
28
+
29
+ Cool? Cool.
30
+
31
+ ## Communicating with the user
32
+
33
+ The skill creator is liable to be used by people across a wide range of familiarity with coding jargon. If you haven't heard (and how could you, it's only very recently that it started), there's a trend now where the power of Claude is inspiring plumbers to open up their terminals, parents and grandparents to google "how to install npm". On the other hand, the bulk of users are probably fairly computer-literate.
34
+
35
+ So please pay attention to context cues to understand how to phrase your communication! In the default case, just to give you some idea:
36
+
37
+ - "evaluation" and "benchmark" are borderline, but OK
38
+ - for "JSON" and "assertion" you want to see serious cues from the user that they know what those things are before using them without explaining them
39
+
40
+ It's OK to briefly explain terms if you're in doubt — offer a short definition whenever you're unsure the user will get it.
41
+
42
+ ---
43
+
44
+ ## Creating a skill
45
+
46
+ ### Capture Intent
47
+
48
+ Start by understanding the user's intent. The current conversation might already contain a workflow the user wants to capture (e.g., they say "turn this into a skill"). If so, extract answers from the conversation history first — the tools used, the sequence of steps, corrections the user made, input/output formats observed. The user may need to fill in the gaps and should confirm before you proceed to the next step.
49
+
50
+ 1. What should this skill enable Claude to do?
51
+ 2. When should this skill trigger? (what user phrases/contexts)
52
+ 3. What's the expected output format?
53
+ 4. Should we set up test cases to verify the skill works? Skills with objectively verifiable outputs (file transforms, data extraction, code generation, fixed workflow steps) benefit from test cases. Skills with subjective outputs (writing style, art) often don't need them. Suggest the appropriate default based on the skill type, but let the user decide.
54
+
55
+ ### Interview and Research
56
+
57
+ Proactively ask questions about edge cases, input/output formats, example files, success criteria, and dependencies. Wait to write test prompts until you've got this part ironed out.
58
+
59
+ Check available MCPs - if useful for research (searching docs, finding similar skills, looking up best practices), research in parallel via subagents if available, otherwise inline. Come prepared with context to reduce burden on the user.
60
+
61
+ ### Write the SKILL.md
62
+
63
+ Based on the user interview, fill in these components:
64
+
65
+ - **name**: Skill identifier
66
+ - **description**: When to trigger, what it does. This is the primary triggering mechanism - include both what the skill does AND specific contexts for when to use it. All "when to use" info goes here, not in the body. Note: currently Claude has a tendency to "undertrigger" skills -- to not use them when they'd be useful. To combat this, please make the skill descriptions a little bit "pushy". So for instance, instead of "How to build a simple fast dashboard to display internal Anthropic data.", you might write "How to build a simple fast dashboard to display internal Anthropic data. Make sure to use this skill whenever the user mentions dashboards, data visualization, internal metrics, or wants to display any kind of company data, even if they don't explicitly ask for a 'dashboard.'"
67
+ - **compatibility**: Required tools, dependencies (optional, rarely needed)
68
+ - **the rest of the skill :)**
69
+
70
+ ### Skill Writing Guide
71
+
72
+ #### Anatomy of a Skill
73
+
74
+ ```
75
+ skill-name/
76
+ ├── SKILL.md (required)
77
+ │ ├── YAML frontmatter (name, description required)
78
+ │ └── Markdown instructions
79
+ └── Bundled Resources (optional)
80
+ ├── scripts/ - Executable code for deterministic/repetitive tasks
81
+ ├── references/ - Docs loaded into context as needed
82
+ └── assets/ - Files used in output (templates, icons, fonts)
83
+ ```
84
+
85
+ #### Progressive Disclosure
86
+
87
+ Skills use a three-level loading system:
88
+
89
+ 1. **Metadata** (name + description) - Always in context (~100 words)
90
+ 2. **SKILL.md body** - In context whenever skill triggers (<500 lines ideal)
91
+ 3. **Bundled resources** - As needed (unlimited, scripts can execute without loading)
92
+
93
+ These word counts are approximate and you can feel free to go longer if needed.
94
+
95
+ **Key patterns:**
96
+
97
+ - Keep SKILL.md under 500 lines; if you're approaching this limit, add an additional layer of hierarchy along with clear pointers about where the model using the skill should go next to follow up.
98
+ - Reference files clearly from SKILL.md with guidance on when to read them
99
+ - For large reference files (>300 lines), include a table of contents
100
+
101
+ **Domain organization**: When a skill supports multiple domains/frameworks, organize by variant:
102
+
103
+ ```
104
+ cloud-deploy/
105
+ ├── SKILL.md (workflow + selection)
106
+ └── references/
107
+ ├── aws.md
108
+ ├── gcp.md
109
+ └── azure.md
110
+ ```
111
+
112
+ Claude reads only the relevant reference file.
113
+
114
+ #### Principle of Lack of Surprise
115
+
116
+ This goes without saying, but skills must not contain malware, exploit code, or any content that could compromise system security. A skill's contents should not surprise the user in their intent if described. Don't go along with requests to create misleading skills or skills designed to facilitate unauthorized access, data exfiltration, or other malicious activities. Things like a "roleplay as an XYZ" are OK though.
117
+
118
+ #### Writing Patterns
119
+
120
+ Prefer using the imperative form in instructions.
121
+
122
+ **Defining output formats** - You can do it like this:
123
+
124
+ ```markdown
125
+ ## Report structure
126
+
127
+ ALWAYS use this exact template:
128
+
129
+ # [Title]
130
+
131
+ ## Executive summary
132
+
133
+ ## Key findings
134
+
135
+ ## Recommendations
136
+ ```
137
+
138
+ **Examples pattern** - It's useful to include examples. You can format them like this (but if "Input" and "Output" are in the examples you might want to deviate a little):
139
+
140
+ ```markdown
141
+ ## Commit message format
142
+
143
+ **Example 1:**
144
+ Input: Added user authentication with JWT tokens
145
+ Output: feat(auth): implement JWT-based authentication
146
+ ```
147
+
148
+ ### Writing Style
149
+
150
+ Try to explain to the model why things are important in lieu of heavy-handed musty MUSTs. Use theory of mind and try to make the skill general and not super-narrow to specific examples. Start by writing a draft and then look at it with fresh eyes and improve it.
151
+
152
+ ### Test Cases
153
+
154
+ After writing the skill draft, come up with 2-3 realistic test prompts — the kind of thing a real user would actually say. Share them with the user: [you don't have to use this exact language] "Here are a few test cases I'd like to try. Do these look right, or do you want to add more?" Then run them.
155
+
156
+ Save test cases to `evals/evals.json`. Don't write assertions yet — just the prompts. You'll draft assertions in the next step while the runs are in progress.
157
+
158
+ ```json
159
+ {
160
+ "skill_name": "example-skill",
161
+ "evals": [
162
+ {
163
+ "id": 1,
164
+ "prompt": "User's task prompt",
165
+ "expected_output": "Description of expected result",
166
+ "files": []
167
+ }
168
+ ]
169
+ }
170
+ ```
171
+
172
+ See `references/schemas.md` for the full schema (including the `assertions` field, which you'll add later).
173
+
174
+ ## Running and evaluating test cases
175
+
176
+ This section is one continuous sequence — don't stop partway through. Do NOT use `/skill-test` or any other testing skill.
177
+
178
+ Put results in `<skill-name>-workspace/` as a sibling to the skill directory. Within the workspace, organize results by iteration (`iteration-1/`, `iteration-2/`, etc.) and within that, each test case gets a directory (`eval-0/`, `eval-1/`, etc.). Don't create all of this upfront — just create directories as you go.
179
+
180
+ ### Step 1: Spawn all runs (with-skill AND baseline) in the same turn
181
+
182
+ For each test case, spawn two subagents in the same turn — one with the skill, one without. This is important: don't spawn the with-skill runs first and then come back for baselines later. Launch everything at once so it all finishes around the same time.
183
+
184
+ **With-skill run:**
185
+
186
+ ```
187
+ Execute this task:
188
+ - Skill path: <path-to-skill>
189
+ - Task: <eval prompt>
190
+ - Input files: <eval files if any, or "none">
191
+ - Save outputs to: <workspace>/iteration-<N>/eval-<ID>/with_skill/outputs/
192
+ - Outputs to save: <what the user cares about — e.g., "the .docx file", "the final CSV">
193
+ ```
194
+
195
+ **Baseline run** (same prompt, but the baseline depends on context):
196
+
197
+ - **Creating a new skill**: no skill at all. Same prompt, no skill path, save to `without_skill/outputs/`.
198
+ - **Improving an existing skill**: the old version. Before editing, snapshot the skill (`cp -r <skill-path> <workspace>/skill-snapshot/`), then point the baseline subagent at the snapshot. Save to `old_skill/outputs/`.
199
+
200
+ Write an `eval_metadata.json` for each test case (assertions can be empty for now). Give each eval a descriptive name based on what it's testing — not just "eval-0". Use this name for the directory too. If this iteration uses new or modified eval prompts, create these files for each new eval directory — don't assume they carry over from previous iterations.
201
+
202
+ ```json
203
+ {
204
+ "eval_id": 0,
205
+ "eval_name": "descriptive-name-here",
206
+ "prompt": "The user's task prompt",
207
+ "assertions": []
208
+ }
209
+ ```
210
+
211
+ ### Step 2: While runs are in progress, draft assertions
212
+
213
+ Don't just wait for the runs to finish — you can use this time productively. Draft quantitative assertions for each test case and explain them to the user. If assertions already exist in `evals/evals.json`, review them and explain what they check.
214
+
215
+ Good assertions are objectively verifiable and have descriptive names — they should read clearly in the benchmark viewer so someone glancing at the results immediately understands what each one checks. Subjective skills (writing style, design quality) are better evaluated qualitatively — don't force assertions onto things that need human judgment.
216
+
217
+ Update the `eval_metadata.json` files and `evals/evals.json` with the assertions once drafted. Also explain to the user what they'll see in the viewer — both the qualitative outputs and the quantitative benchmark.
218
+
219
+ ### Step 3: As runs complete, capture timing data
220
+
221
+ When each subagent task completes, you receive a notification containing `total_tokens` and `duration_ms`. Save this data immediately to `timing.json` in the run directory:
222
+
223
+ ```json
224
+ {
225
+ "total_tokens": 84852,
226
+ "duration_ms": 23332,
227
+ "total_duration_seconds": 23.3
228
+ }
229
+ ```
230
+
231
+ This is the only opportunity to capture this data — it comes through the task notification and isn't persisted elsewhere. Process each notification as it arrives rather than trying to batch them.
232
+
233
+ ### Step 4: Grade, aggregate, and launch the viewer
234
+
235
+ Once all runs are done:
236
+
237
+ 1. **Grade each run** — spawn a grader subagent (or grade inline) that reads `agents/grader.md` and evaluates each assertion against the outputs. Save results to `grading.json` in each run directory. The grading.json expectations array must use the fields `text`, `passed`, and `evidence` (not `name`/`met`/`details` or other variants) — the viewer depends on these exact field names. For assertions that can be checked programmatically, write and run a script rather than eyeballing it — scripts are faster, more reliable, and can be reused across iterations.
238
+
239
+ 2. **Aggregate into benchmark** — run the aggregation script from the skill-creator directory:
240
+
241
+ ```bash
242
+ python -m scripts.aggregate_benchmark <workspace>/iteration-N --skill-name <name>
243
+ ```
244
+
245
+ This produces `benchmark.json` and `benchmark.md` with pass_rate, time, and tokens for each configuration, including mean ± stddev and the delta. If generating benchmark.json manually, see `references/schemas.md` for the exact schema the viewer expects.
246
+ Put each with_skill version before its baseline counterpart.
247
+
248
+ 3. **Do an analyst pass** — read the benchmark data and surface patterns the aggregate stats might hide. See `agents/analyzer.md` (the "Analyzing Benchmark Results" section) for what to look for — things like assertions that always pass regardless of skill (non-discriminating), high-variance evals (possibly flaky), and time/token tradeoffs.
249
+
250
+ 4. **Launch the viewer** with both qualitative outputs and quantitative data:
251
+
252
+ ```bash
253
+ nohup python <skill-creator-path>/eval-viewer/generate_review.py \
254
+ <workspace>/iteration-N \
255
+ --skill-name "my-skill" \
256
+ --benchmark <workspace>/iteration-N/benchmark.json \
257
+ > /dev/null 2>&1 &
258
+ VIEWER_PID=$!
259
+ ```
260
+
261
+ For iteration 2+, also pass `--previous-workspace <workspace>/iteration-<N-1>`.
262
+
263
+ **Cowork / headless environments:** If `webbrowser.open()` is not available or the environment has no display, use `--static <output_path>` to write a standalone HTML file instead of starting a server. Feedback will be downloaded as a `feedback.json` file when the user clicks "Submit All Reviews". After download, copy `feedback.json` into the workspace directory for the next iteration to pick up.
264
+
265
+ Note: please use generate_review.py to create the viewer; there's no need to write custom HTML.
266
+
267
+ 5. **Tell the user** something like: "I've opened the results in your browser. There are two tabs — 'Outputs' lets you click through each test case and leave feedback, 'Benchmark' shows the quantitative comparison. When you're done, come back here and let me know."
268
+
269
+ ### What the user sees in the viewer
270
+
271
+ The "Outputs" tab shows one test case at a time:
272
+
273
+ - **Prompt**: the task that was given
274
+ - **Output**: the files the skill produced, rendered inline where possible
275
+ - **Previous Output** (iteration 2+): collapsed section showing last iteration's output
276
+ - **Formal Grades** (if grading was run): collapsed section showing assertion pass/fail
277
+ - **Feedback**: a textbox that auto-saves as they type
278
+ - **Previous Feedback** (iteration 2+): their comments from last time, shown below the textbox
279
+
280
+ The "Benchmark" tab shows the stats summary: pass rates, timing, and token usage for each configuration, with per-eval breakdowns and analyst observations.
281
+
282
+ Navigation is via prev/next buttons or arrow keys. When done, they click "Submit All Reviews" which saves all feedback to `feedback.json`.
283
+
284
+ ### Step 5: Read the feedback
285
+
286
+ When the user tells you they're done, read `feedback.json`:
287
+
288
+ ```json
289
+ {
290
+ "reviews": [
291
+ {
292
+ "run_id": "eval-0-with_skill",
293
+ "feedback": "the chart is missing axis labels",
294
+ "timestamp": "..."
295
+ },
296
+ { "run_id": "eval-1-with_skill", "feedback": "", "timestamp": "..." },
297
+ {
298
+ "run_id": "eval-2-with_skill",
299
+ "feedback": "perfect, love this",
300
+ "timestamp": "..."
301
+ }
302
+ ],
303
+ "status": "complete"
304
+ }
305
+ ```
306
+
307
+ Empty feedback means the user thought it was fine. Focus your improvements on the test cases where the user had specific complaints.
308
+
309
+ Kill the viewer server when you're done with it:
310
+
311
+ ```bash
312
+ kill $VIEWER_PID 2>/dev/null
313
+ ```
314
+
315
+ ---
316
+
317
+ ## Improving the skill
318
+
319
+ This is the heart of the loop. You've run the test cases, the user has reviewed the results, and now you need to make the skill better based on their feedback.
320
+
321
+ ### How to think about improvements
322
+
323
+ 1. **Generalize from the feedback.** The big picture thing that's happening here is that we're trying to create skills that can be used a million times (maybe literally, maybe even more who knows) across many different prompts. Here you and the user are iterating on only a few examples over and over again because it helps move faster. The user knows these examples in and out and it's quick for them to assess new outputs. But if the skill you and the user are codeveloping works only for those examples, it's useless. Rather than put in fiddly overfitty changes, or oppressively constrictive MUSTs, if there's some stubborn issue, you might try branching out and using different metaphors, or recommending different patterns of working. It's relatively cheap to try and maybe you'll land on something great.
324
+
325
+ 2. **Keep the prompt lean.** Remove things that aren't pulling their weight. Make sure to read the transcripts, not just the final outputs — if it looks like the skill is making the model waste a bunch of time doing things that are unproductive, you can try getting rid of the parts of the skill that are making it do that and seeing what happens.
326
+
327
+ 3. **Explain the why.** Try hard to explain the **why** behind everything you're asking the model to do. Today's LLMs are _smart_. They have good theory of mind and when given a good harness can go beyond rote instructions and really make things happen. Even if the feedback from the user is terse or frustrated, try to actually understand the task and why the user is writing what they wrote, and what they actually wrote, and then transmit this understanding into the instructions. If you find yourself writing ALWAYS or NEVER in all caps, or using super rigid structures, that's a yellow flag — if possible, reframe and explain the reasoning so that the model understands why the thing you're asking for is important. That's a more humane, powerful, and effective approach.
328
+
329
+ 4. **Look for repeated work across test cases.** Read the transcripts from the test runs and notice if the subagents all independently wrote similar helper scripts or took the same multi-step approach to something. If all 3 test cases resulted in the subagent writing a `create_docx.py` or a `build_chart.py`, that's a strong signal the skill should bundle that script. Write it once, put it in `scripts/`, and tell the skill to use it. This saves every future invocation from reinventing the wheel.
330
+
331
+ This task is pretty important (we are trying to create billions a year in economic value here!) and your thinking time is not the blocker; take your time and really mull things over. I'd suggest writing a draft revision and then looking at it anew and making improvements. Really do your best to get into the head of the user and understand what they want and need.
332
+
333
+ ### The iteration loop
334
+
335
+ After improving the skill:
336
+
337
+ 1. Apply your improvements to the skill
338
+ 2. Rerun all test cases into a new `iteration-<N+1>/` directory, including baseline runs. If you're creating a new skill, the baseline is always `without_skill` (no skill) — that stays the same across iterations. If you're improving an existing skill, use your judgment on what makes sense as the baseline: the original version the user came in with, or the previous iteration.
339
+ 3. Launch the reviewer with `--previous-workspace` pointing at the previous iteration
340
+ 4. Wait for the user to review and tell you they're done
341
+ 5. Read the new feedback, improve again, repeat
342
+
343
+ Keep going until:
344
+
345
+ - The user says they're happy
346
+ - The feedback is all empty (everything looks good)
347
+ - You're not making meaningful progress
348
+
349
+ ---
350
+
351
+ ## Advanced: Blind comparison
352
+
353
+ For situations where you want a more rigorous comparison between two versions of a skill (e.g., the user asks "is the new version actually better?"), there's a blind comparison system. Read `agents/comparator.md` and `agents/analyzer.md` for the details. The basic idea is: give two outputs to an independent agent without telling it which is which, and let it judge quality. Then analyze why the winner won.
354
+
355
+ This is optional, requires subagents, and most users won't need it. The human review loop is usually sufficient.
356
+
357
+ ---
358
+
359
+ ## Description Optimization
360
+
361
+ The description field in SKILL.md frontmatter is the primary mechanism that determines whether Claude invokes a skill. After creating or improving a skill, offer to optimize the description for better triggering accuracy.
362
+
363
+ ### Step 1: Generate trigger eval queries
364
+
365
+ Create 20 eval queries — a mix of should-trigger and should-not-trigger. Save as JSON:
366
+
367
+ ```json
368
+ [
369
+ { "query": "the user prompt", "should_trigger": true },
370
+ { "query": "another prompt", "should_trigger": false }
371
+ ]
372
+ ```
373
+
374
+ The queries must be realistic and something a Claude Code or Claude.ai user would actually type. Not abstract requests, but requests that are concrete and specific and have a good amount of detail. For instance, file paths, personal context about the user's job or situation, column names and values, company names, URLs. A little bit of backstory. Some might be in lowercase or contain abbreviations or typos or casual speech. Use a mix of different lengths, and focus on edge cases rather than making them clear-cut (the user will get a chance to sign off on them).
375
+
376
+ Bad: `"Format this data"`, `"Extract text from PDF"`, `"Create a chart"`
377
+
378
+ Good: `"ok so my boss just sent me this xlsx file (its in my downloads, called something like 'Q4 sales final FINAL v2.xlsx') and she wants me to add a column that shows the profit margin as a percentage. The revenue is in column C and costs are in column D i think"`
379
+
380
+ For the **should-trigger** queries (8-10), think about coverage. You want different phrasings of the same intent — some formal, some casual. Include cases where the user doesn't explicitly name the skill or file type but clearly needs it. Throw in some uncommon use cases and cases where this skill competes with another but should win.
381
+
382
+ For the **should-not-trigger** queries (8-10), the most valuable ones are the near-misses — queries that share keywords or concepts with the skill but actually need something different. Think adjacent domains, ambiguous phrasing where a naive keyword match would trigger but shouldn't, and cases where the query touches on something the skill does but in a context where another tool is more appropriate.
383
+
384
+ The key thing to avoid: don't make should-not-trigger queries obviously irrelevant. "Write a fibonacci function" as a negative test for a PDF skill is too easy — it doesn't test anything. The negative cases should be genuinely tricky.
385
+
386
+ ### Step 2: Review with user
387
+
388
+ Present the eval set to the user for review using the HTML template:
389
+
390
+ 1. Read the template from `assets/eval_review.html`
391
+ 2. Replace the placeholders:
392
+ - `__EVAL_DATA_PLACEHOLDER__` → the JSON array of eval items (no quotes around it — it's a JS variable assignment)
393
+ - `__SKILL_NAME_PLACEHOLDER__` → the skill's name
394
+ - `__SKILL_DESCRIPTION_PLACEHOLDER__` → the skill's current description
395
+ 3. Write to a temp file (e.g., `/tmp/eval_review_<skill-name>.html`) and open it: `open /tmp/eval_review_<skill-name>.html`
396
+ 4. The user can edit queries, toggle should-trigger, add/remove entries, then click "Export Eval Set"
397
+ 5. The file downloads to `~/Downloads/eval_set.json` — check the Downloads folder for the most recent version in case there are multiple (e.g., `eval_set (1).json`)
398
+
399
+ This step matters — bad eval queries lead to bad descriptions.
400
+
401
+ ### Step 3: Run the optimization loop
402
+
403
+ Tell the user: "This will take some time — I'll run the optimization loop in the background and check on it periodically."
404
+
405
+ Save the eval set to the workspace, then run in the background:
406
+
407
+ ```bash
408
+ python -m scripts.run_loop \
409
+ --eval-set <path-to-trigger-eval.json> \
410
+ --skill-path <path-to-skill> \
411
+ --model <model-id-powering-this-session> \
412
+ --max-iterations 5 \
413
+ --verbose
414
+ ```
415
+
416
+ Use the model ID from your system prompt (the one powering the current session) so the triggering test matches what the user actually experiences.
417
+
418
+ While it runs, periodically tail the output to give the user updates on which iteration it's on and what the scores look like.
419
+
420
+ This handles the full optimization loop automatically. It splits the eval set into 60% train and 40% held-out test, evaluates the current description (running each query 3 times to get a reliable trigger rate), then calls Claude with extended thinking to propose improvements based on what failed. It re-evaluates each new description on both train and test, iterating up to 5 times. When it's done, it opens an HTML report in the browser showing the results per iteration and returns JSON with `best_description` — selected by test score rather than train score to avoid overfitting.
421
+
422
+ ### How skill triggering works
423
+
424
+ Understanding the triggering mechanism helps design better eval queries. Skills appear in Claude's `available_skills` list with their name + description, and Claude decides whether to consult a skill based on that description. The important thing to know is that Claude only consults skills for tasks it can't easily handle on its own — simple, one-step queries like "read this PDF" may not trigger a skill even if the description matches perfectly, because Claude can handle them directly with basic tools. Complex, multi-step, or specialized queries reliably trigger skills when the description matches.
425
+
426
+ This means your eval queries should be substantive enough that Claude would actually benefit from consulting a skill. Simple queries like "read file X" are poor test cases — they won't trigger skills regardless of description quality.
427
+
428
+ ### Step 4: Apply the result
429
+
430
+ Take `best_description` from the JSON output and update the skill's SKILL.md frontmatter. Show the user before/after and report the scores.
431
+
432
+ ---
433
+
434
+ ### Package and Present (only if `present_files` tool is available)
435
+
436
+ Check whether you have access to the `present_files` tool. If you don't, skip this step. If you do, package the skill and present the .skill file to the user:
437
+
438
+ ```bash
439
+ python -m scripts.package_skill <path/to/skill-folder>
440
+ ```
441
+
442
+ After packaging, direct the user to the resulting `.skill` file path so they can install it.
443
+
444
+ ---
445
+
446
+ ## Claude.ai-specific instructions
447
+
448
+ In Claude.ai, the core workflow is the same (draft → test → review → improve → repeat), but because Claude.ai doesn't have subagents, some mechanics change. Here's what to adapt:
449
+
450
+ **Running test cases**: No subagents means no parallel execution. For each test case, read the skill's SKILL.md, then follow its instructions to accomplish the test prompt yourself. Do them one at a time. This is less rigorous than independent subagents (you wrote the skill and you're also running it, so you have full context), but it's a useful sanity check — and the human review step compensates. Skip the baseline runs — just use the skill to complete the task as requested.
451
+
452
+ **Reviewing results**: If you can't open a browser (e.g., Claude.ai's VM has no display, or you're on a remote server), skip the browser reviewer entirely. Instead, present results directly in the conversation. For each test case, show the prompt and the output. If the output is a file the user needs to see (like a .docx or .xlsx), save it to the filesystem and tell them where it is so they can download and inspect it. Ask for feedback inline: "How does this look? Anything you'd change?"
453
+
454
+ **Benchmarking**: Skip the quantitative benchmarking — it relies on baseline comparisons which aren't meaningful without subagents. Focus on qualitative feedback from the user.
455
+
456
+ **The iteration loop**: Same as before — improve the skill, rerun the test cases, ask for feedback — just without the browser reviewer in the middle. You can still organize results into iteration directories on the filesystem if you have one.
457
+
458
+ **Description optimization**: This section requires the `claude` CLI tool (specifically `claude -p`) which is only available in Claude Code. Skip it if you're on Claude.ai.
459
+
460
+ **Blind comparison**: Requires subagents. Skip it.
461
+
462
+ **Packaging**: The `package_skill.py` script works anywhere with Python and a filesystem. On Claude.ai, you can run it and the user can download the resulting `.skill` file.
463
+
464
+ ---
465
+
466
+ ## Cowork-Specific Instructions
467
+
468
+ If you're in Cowork, the main things to know are:
469
+
470
+ - You have subagents, so the main workflow (spawn test cases in parallel, run baselines, grade, etc.) all works. (However, if you run into severe problems with timeouts, it's OK to run the test prompts in series rather than parallel.)
471
+ - You don't have a browser or display, so when generating the eval viewer, use `--static <output_path>` to write a standalone HTML file instead of starting a server. Then proffer a link that the user can click to open the HTML in their browser.
472
+ - For whatever reason, the Cowork setup seems to discourage Claude from generating the eval viewer after running the tests, so just to reiterate: whether you're in Cowork or in Claude Code, after running tests, you should always generate the eval viewer for the human to look at examples before revising the skill yourself and trying to make corrections, using `generate_review.py` (not writing your own bespoke HTML code). Sorry in advance but I'm gonna go all caps here: GENERATE THE EVAL VIEWER _BEFORE_ evaluating outputs yourself. You want to get them in front of the human ASAP!
473
+ - Feedback works differently: since there's no running server, the viewer's "Submit All Reviews" button will download `feedback.json` as a file. You can then read it from there (you may have to request access first).
474
+ - Packaging works — `package_skill.py` just needs Python and a filesystem.
475
+ - Description optimization (`run_loop.py` / `run_eval.py`) should work in Cowork just fine since it uses `claude -p` via subprocess, not a browser, but please save it until you've fully finished making the skill and the user agrees it's in good shape.
476
+
477
+ ---
478
+
479
+ ## Reference files
480
+
481
+ The agents/ directory contains instructions for specialized subagents. Read them when you need to spawn the relevant subagent.
482
+
483
+ - `agents/grader.md` — How to evaluate assertions against outputs
484
+ - `agents/comparator.md` — How to do blind A/B comparison between two outputs
485
+ - `agents/analyzer.md` — How to analyze why one version beat another
486
+
487
+ The references/ directory has additional documentation:
488
+
489
+ - `references/schemas.md` — JSON structures for evals.json, grading.json, etc.
490
+
491
+ ---
492
+
493
+ Repeating one more time the core loop here for emphasis:
494
+
495
+ - Figure out what the skill is about
496
+ - Draft or edit the skill
497
+ - Run claude-with-access-to-the-skill on test prompts
498
+ - With the user, evaluate the outputs:
499
+ - Create benchmark.json and run `eval-viewer/generate_review.py` to help the user review them
500
+ - Run quantitative evals
501
+ - Repeat until you and the user are satisfied
502
+ - Package the final skill and return it to the user.
503
+
504
+ Please add steps to your TodoList, if you have such a thing, to make sure you don't forget. If you're in Cowork, please specifically put "Create evals JSON and run `eval-viewer/generate_review.py` so human can review test cases" in your TodoList to make sure it happens.
505
+
506
+ Good luck!