@masslessai/push-todo 4.5.0 → 4.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/SKILL.md +6 -0
- package/lib/daemon.js +23 -3
- package/package.json +2 -1
- package/scripts/postinstall.js +88 -4
- package/skills/skill-builder/SKILL.md +313 -0
- package/skills/skill-builder/agents/analyzer.md +274 -0
- package/skills/skill-builder/agents/comparator.md +202 -0
- package/skills/skill-builder/agents/grader.md +223 -0
- package/skills/skill-builder/references/description-optimization.md +212 -0
- package/skills/skill-builder/references/eval-framework.md +230 -0
- package/skills/skill-builder/references/json-schemas.md +430 -0
- package/skills/skill-builder/references/plugin-skill-patterns.md +122 -0
- package/skills/skill-builder/references/writing-guide.md +330 -0
- package/skills/skill-builder/scripts/__init__.py +0 -0
- package/skills/skill-builder/scripts/aggregate_benchmark.py +401 -0
- package/skills/skill-builder/scripts/generate_report.py +326 -0
- package/skills/skill-builder/scripts/improve_description.py +248 -0
- package/skills/skill-builder/scripts/quick_validate.py +103 -0
- package/skills/skill-builder/scripts/run_eval.py +310 -0
- package/skills/skill-builder/scripts/run_loop.py +332 -0
- package/skills/skill-builder/scripts/utils.py +47 -0
package/SKILL.md
CHANGED
|
@@ -433,6 +433,12 @@ The `push-todo` CLI supports these commands:
|
|
|
433
433
|
| `push-todo --status` | Show connection status |
|
|
434
434
|
| `push-todo --mark-completed <uuid>` | Mark task as completed |
|
|
435
435
|
| `push-todo --json` | Output as JSON |
|
|
436
|
+
| `push-todo create <title>` | Create a todo from CLI |
|
|
437
|
+
| `push-todo create <title> --remind <text>` | Create with reminder (e.g., "tomorrow night", "in 2 hours") |
|
|
438
|
+
| `push-todo create <title> --remind-at <iso>` | Create with exact reminder date (ISO8601) |
|
|
439
|
+
| `push-todo create <title> --alarm` | Mark reminder as urgent (bypasses Focus) |
|
|
440
|
+
| `push-todo create <title> --content <text>` | Create with detailed content |
|
|
441
|
+
| `push-todo create <title> --backlog` | Create as backlog item |
|
|
436
442
|
| `push-todo schedule add` | Create a remote schedule (Supabase-backed) |
|
|
437
443
|
| `push-todo schedule list` | List all remote schedules |
|
|
438
444
|
| `push-todo schedule remove <id>` | Remove a schedule |
|
package/lib/daemon.js
CHANGED
|
@@ -2564,16 +2564,30 @@ async function handleTaskCompletion(displayNumber, exitCode) {
|
|
|
2564
2564
|
executionSummary += ` PR: ${prUrl}`;
|
|
2565
2565
|
}
|
|
2566
2566
|
|
|
2567
|
+
// Build structured recap for briefing visualization (JSONB column)
|
|
2568
|
+
const prNumber = prUrl ? parseInt((prUrl.match(/\/(\d+)$/) || [])[1] || '0', 10) || null : null;
|
|
2569
|
+
const executionRecap = {
|
|
2570
|
+
durationStr,
|
|
2571
|
+
durationSeconds: duration,
|
|
2572
|
+
diagram: visualArtifact || null,
|
|
2573
|
+
machineName,
|
|
2574
|
+
outcome: {
|
|
2575
|
+
prUrl: prUrl || null,
|
|
2576
|
+
prNumber,
|
|
2577
|
+
},
|
|
2578
|
+
};
|
|
2579
|
+
|
|
2567
2580
|
const statusUpdated = await updateTaskStatus(displayNumber, 'session_finished', {
|
|
2568
2581
|
duration,
|
|
2569
2582
|
sessionId,
|
|
2570
|
-
summary: executionSummary
|
|
2583
|
+
summary: executionSummary,
|
|
2584
|
+
executionRecap,
|
|
2571
2585
|
}, info.taskId);
|
|
2572
2586
|
if (!statusUpdated) {
|
|
2573
2587
|
logError(`Task #${displayNumber}: Failed to update status to session_finished — will retry`);
|
|
2574
2588
|
// Retry once
|
|
2575
2589
|
await updateTaskStatus(displayNumber, 'session_finished', {
|
|
2576
|
-
duration, sessionId, summary: executionSummary
|
|
2590
|
+
duration, sessionId, summary: executionSummary, executionRecap,
|
|
2577
2591
|
}, info.taskId);
|
|
2578
2592
|
}
|
|
2579
2593
|
|
|
@@ -2692,7 +2706,13 @@ async function handleTaskCompletion(displayNumber, exitCode) {
|
|
|
2692
2706
|
? `${failureSummary}\nExit code ${exitCode}. Ran for ${durationStr} on ${machineName}.`
|
|
2693
2707
|
: `Exit code ${exitCode}: ${stderr.slice(0, 200)}`;
|
|
2694
2708
|
|
|
2695
|
-
|
|
2709
|
+
const failedRecap = {
|
|
2710
|
+
durationStr,
|
|
2711
|
+
durationSeconds: duration,
|
|
2712
|
+
machineName,
|
|
2713
|
+
exitCode,
|
|
2714
|
+
};
|
|
2715
|
+
await updateTaskStatus(displayNumber, 'failed', { error: errorMsg, sessionId, executionRecap: failedRecap }, info.taskId);
|
|
2696
2716
|
|
|
2697
2717
|
if (NOTIFY_ON_FAILURE) {
|
|
2698
2718
|
sendMacNotification(
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@masslessai/push-todo",
|
|
3
|
-
"version": "4.5.
|
|
3
|
+
"version": "4.5.2",
|
|
4
4
|
"description": "Voice tasks from Push iOS app for Claude Code",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
"hooks/",
|
|
20
20
|
"natives/",
|
|
21
21
|
"scripts/",
|
|
22
|
+
"skills/",
|
|
22
23
|
"SKILL.md",
|
|
23
24
|
"LICENSE"
|
|
24
25
|
],
|
package/scripts/postinstall.js
CHANGED
|
@@ -6,10 +6,11 @@
|
|
|
6
6
|
* 1. Claude Code - symlink to ~/.claude/skills/ (gives clean /push-todo command)
|
|
7
7
|
* 2. OpenAI Codex - AGENTS.md in ~/.codex/
|
|
8
8
|
* 3. OpenClaw - SKILL.md in ~/.openclaw/skills/ (legacy: ~/.clawdbot/)
|
|
9
|
-
* 4. Downloads native keychain helper binary (macOS)
|
|
9
|
+
* 4. Bundled skills - auto-discovers bundled skills and creates per-skill symlinks
|
|
10
|
+
* 5. Downloads native keychain helper binary (macOS)
|
|
10
11
|
*/
|
|
11
12
|
|
|
12
|
-
import { createWriteStream, existsSync, mkdirSync, unlinkSync, readFileSync, writeFileSync, symlinkSync, lstatSync, readlinkSync, rmSync, appendFileSync } from 'fs';
|
|
13
|
+
import { createWriteStream, existsSync, mkdirSync, unlinkSync, readFileSync, writeFileSync, symlinkSync, lstatSync, readlinkSync, rmSync, appendFileSync, readdirSync } from 'fs';
|
|
13
14
|
import { chmod, stat } from 'fs/promises';
|
|
14
15
|
import { pipeline } from 'stream/promises';
|
|
15
16
|
import { join, dirname } from 'path';
|
|
@@ -301,6 +302,68 @@ function setupOpenClaw() {
|
|
|
301
302
|
}
|
|
302
303
|
}
|
|
303
304
|
|
|
305
|
+
/**
|
|
306
|
+
* Auto-discover bundled skills and create per-skill symlinks.
|
|
307
|
+
* Each bundled skill gets: <targetDir>/push-<name> -> PACKAGE_ROOT/skills/<name>
|
|
308
|
+
*
|
|
309
|
+
* Claude Code scans one level deep (~/.claude/skills/X/SKILL.md), but bundled
|
|
310
|
+
* skills are two levels deep from the root push-todo symlink. Per-skill symlinks
|
|
311
|
+
* give each bundled skill its own top-level entry for independent discovery.
|
|
312
|
+
*
|
|
313
|
+
* @param {string} targetSkillsDir - e.g. ~/.claude/skills/
|
|
314
|
+
* @param {string} clientLabel - e.g. "Claude Code"
|
|
315
|
+
* @returns {string[]} names of skills that were symlinked
|
|
316
|
+
*/
|
|
317
|
+
function setupBundledSkills(targetSkillsDir, clientLabel) {
|
|
318
|
+
const bundledDir = join(PACKAGE_ROOT, 'skills');
|
|
319
|
+
if (!existsSync(bundledDir)) return [];
|
|
320
|
+
|
|
321
|
+
let entries;
|
|
322
|
+
try {
|
|
323
|
+
entries = readdirSync(bundledDir);
|
|
324
|
+
} catch {
|
|
325
|
+
return [];
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
const installed = [];
|
|
329
|
+
|
|
330
|
+
for (const name of entries) {
|
|
331
|
+
const skillSource = join(bundledDir, name);
|
|
332
|
+
const skillMd = join(skillSource, 'SKILL.md');
|
|
333
|
+
|
|
334
|
+
// Only process directories containing SKILL.md
|
|
335
|
+
if (!existsSync(skillMd)) continue;
|
|
336
|
+
|
|
337
|
+
// Namespace with push- prefix (skip if already prefixed)
|
|
338
|
+
const linkName = name.startsWith('push-') ? name : `push-${name}`;
|
|
339
|
+
const linkPath = join(targetSkillsDir, linkName);
|
|
340
|
+
|
|
341
|
+
try {
|
|
342
|
+
if (existsSync(linkPath)) {
|
|
343
|
+
const stats = lstatSync(linkPath);
|
|
344
|
+
if (stats.isSymbolicLink()) {
|
|
345
|
+
const currentTarget = readlinkSync(linkPath);
|
|
346
|
+
if (currentTarget === skillSource) {
|
|
347
|
+
installed.push(linkName);
|
|
348
|
+
continue; // Already correct
|
|
349
|
+
}
|
|
350
|
+
unlinkSync(linkPath);
|
|
351
|
+
} else {
|
|
352
|
+
rmSync(linkPath, { recursive: true });
|
|
353
|
+
}
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
symlinkSync(skillSource, linkPath);
|
|
357
|
+
installed.push(linkName);
|
|
358
|
+
console.log(`[push-todo] ${clientLabel}: Bundled skill installed: ${linkName}`);
|
|
359
|
+
} catch (err) {
|
|
360
|
+
console.log(`[push-todo] ${clientLabel}: Failed to install bundled skill ${linkName}: ${err.message}`);
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
|
|
364
|
+
return installed;
|
|
365
|
+
}
|
|
366
|
+
|
|
304
367
|
/**
|
|
305
368
|
* Download a file from URL to destination.
|
|
306
369
|
*
|
|
@@ -356,15 +419,36 @@ async function main() {
|
|
|
356
419
|
// Step 3: Set up Claude Code skill symlink
|
|
357
420
|
console.log('[push-todo] Setting up Claude Code skill...');
|
|
358
421
|
const claudeSuccess = setupClaudeSkill();
|
|
422
|
+
if (claudeSuccess) {
|
|
423
|
+
const bundled = setupBundledSkills(SKILL_DIR, 'Claude Code');
|
|
424
|
+
if (bundled.length > 0) {
|
|
425
|
+
console.log(`[push-todo] Claude Code: ${bundled.length} bundled skill(s) installed`);
|
|
426
|
+
}
|
|
427
|
+
}
|
|
359
428
|
console.log('');
|
|
360
429
|
|
|
361
430
|
// Step 4: Set up OpenAI Codex (if installed)
|
|
362
431
|
const codexSuccess = setupCodex();
|
|
363
|
-
if (codexSuccess)
|
|
432
|
+
if (codexSuccess) {
|
|
433
|
+
const codexSkillsDir = join(CODEX_DIR, 'skills');
|
|
434
|
+
const bundled = setupBundledSkills(codexSkillsDir, 'Codex');
|
|
435
|
+
if (bundled.length > 0) {
|
|
436
|
+
console.log(`[push-todo] Codex: ${bundled.length} bundled skill(s) installed`);
|
|
437
|
+
}
|
|
438
|
+
console.log('');
|
|
439
|
+
}
|
|
364
440
|
|
|
365
441
|
// Step 5: Set up OpenClaw (if installed — formerly Clawdbot)
|
|
366
442
|
const openclawSuccess = setupOpenClaw();
|
|
367
|
-
if (openclawSuccess)
|
|
443
|
+
if (openclawSuccess) {
|
|
444
|
+
const clawDir = existsSync(OPENCLAW_DIR) ? OPENCLAW_DIR : OPENCLAW_LEGACY_DIR;
|
|
445
|
+
const clawSkillsDir = join(clawDir, 'skills');
|
|
446
|
+
const bundled = setupBundledSkills(clawSkillsDir, 'OpenClaw');
|
|
447
|
+
if (bundled.length > 0) {
|
|
448
|
+
console.log(`[push-todo] OpenClaw: ${bundled.length} bundled skill(s) installed`);
|
|
449
|
+
}
|
|
450
|
+
console.log('');
|
|
451
|
+
}
|
|
368
452
|
|
|
369
453
|
// Track which clients were set up
|
|
370
454
|
const clients = [];
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: skill-builder
|
|
3
|
+
description: Create new skills, modify and improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, update or optimize an existing skill, run evals to test a skill, benchmark skill performance with variance analysis, optimize a skill's description for better triggering, or improve an existing skill's quality. This skill should be used whenever the user mentions "create a skill", "build a skill", "improve skill", "test skill", "eval skill", "benchmark skill", "skill description", "skill triggering", or asks to turn a workflow into a reusable skill.
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Skill Builder
|
|
8
|
+
|
|
9
|
+
This skill guides the complete lifecycle of building, testing, and improving Claude Code skills — from initial idea through quantitative benchmarking and iterative refinement.
|
|
10
|
+
|
|
11
|
+
## The Core Loop
|
|
12
|
+
|
|
13
|
+
Skill development follows an iterative cycle:
|
|
14
|
+
|
|
15
|
+
1. **Capture Intent** — Understand what the skill should do and when it should trigger
|
|
16
|
+
2. **Interview & Research** — Gather details about edge cases, formats, and dependencies
|
|
17
|
+
3. **Write SKILL.md** — Create the skill with metadata and instructions
|
|
18
|
+
4. **Test** — Run eval cases to measure skill performance
|
|
19
|
+
5. **Evaluate** — Grade outputs, aggregate benchmarks, review with user
|
|
20
|
+
6. **Improve** — Refine based on feedback and data, then loop back to step 4
|
|
21
|
+
|
|
22
|
+
The job when using this skill is to figure out where in this loop the user currently is and help them move forward. Not every skill needs the full eval pipeline — a simple domain knowledge skill might only need steps 1-3, while a complex workflow skill benefits from the complete cycle.
|
|
23
|
+
|
|
24
|
+
## Communicating with the User
|
|
25
|
+
|
|
26
|
+
Adapt communication level to the user's context. If they're casually asking "help me make a skill," keep things simple. If they're asking about "eval assertions" and "benchmark variance," they want the full technical depth.
|
|
27
|
+
|
|
28
|
+
Brief explanations of technical terms are always appropriate:
|
|
29
|
+
- "assertions" = specific pass/fail checks on skill output
|
|
30
|
+
- "trigger eval" = test whether a description causes the skill to activate for the right queries
|
|
31
|
+
- "benchmark" = statistical comparison across multiple test runs
|
|
32
|
+
|
|
33
|
+
## About Skills
|
|
34
|
+
|
|
35
|
+
Skills are modular, self-contained packages that extend Claude's capabilities with specialized knowledge, workflows, and tools. They transform Claude from a general-purpose agent into a specialized agent equipped with procedural knowledge that no model can fully possess.
|
|
36
|
+
|
|
37
|
+
### What Skills Provide
|
|
38
|
+
|
|
39
|
+
1. **Specialized workflows** — Multi-step procedures for specific domains
|
|
40
|
+
2. **Tool integrations** — Instructions for working with specific file formats or APIs
|
|
41
|
+
3. **Domain expertise** — Company-specific knowledge, schemas, business logic
|
|
42
|
+
4. **Bundled resources** — Scripts, references, and assets for complex and repetitive tasks
|
|
43
|
+
|
|
44
|
+
### Anatomy of a Skill
|
|
45
|
+
|
|
46
|
+
```
|
|
47
|
+
skill-name/
|
|
48
|
+
├── SKILL.md (required)
|
|
49
|
+
│ ├── YAML frontmatter (name, description required)
|
|
50
|
+
│ └── Markdown instructions
|
|
51
|
+
└── Bundled Resources (optional)
|
|
52
|
+
├── scripts/ - Executable code (Python/Bash/etc.)
|
|
53
|
+
├── references/ - Documentation loaded as needed
|
|
54
|
+
└── assets/ - Files used in output (templates, icons, fonts)
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Progressive Disclosure
|
|
58
|
+
|
|
59
|
+
Skills use a three-level loading system to manage context efficiently:
|
|
60
|
+
|
|
61
|
+
1. **Metadata (name + description)** — Always in context (~100 words)
|
|
62
|
+
2. **SKILL.md body** — When skill triggers (<500 lines ideal)
|
|
63
|
+
3. **Bundled resources** — As needed by Claude (unlimited)
|
|
64
|
+
|
|
65
|
+
This matters because the context window is a shared resource. Keep SKILL.md lean (under 500 lines) and use references/ for detailed content.
|
|
66
|
+
|
|
67
|
+
## Creating a Skill
|
|
68
|
+
|
|
69
|
+
### Step 1: Capture Intent
|
|
70
|
+
|
|
71
|
+
Start by understanding concrete examples of how the skill will be used. Good questions:
|
|
72
|
+
|
|
73
|
+
- "What functionality should the skill support?"
|
|
74
|
+
- "Can you give examples of how this skill would be used?"
|
|
75
|
+
- "What would a user say that should trigger this skill?"
|
|
76
|
+
|
|
77
|
+
Avoid asking too many questions at once. Start with the most important ones and follow up as needed.
|
|
78
|
+
|
|
79
|
+
### Step 2: Interview & Research
|
|
80
|
+
|
|
81
|
+
Analyze each example to identify reusable resources:
|
|
82
|
+
|
|
83
|
+
- Does the task require **rewriting the same code** each time? → Bundle a `scripts/` utility
|
|
84
|
+
- Does the task need **domain knowledge** to reference? → Create a `references/` file
|
|
85
|
+
- Does the task use **templates or assets** in its output? → Add to `assets/`
|
|
86
|
+
|
|
87
|
+
### Step 3: Write the SKILL.md
|
|
88
|
+
|
|
89
|
+
Initialize the skill directory:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
mkdir -p skill-name/{references,scripts}
|
|
93
|
+
touch skill-name/SKILL.md
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
#### Frontmatter
|
|
97
|
+
|
|
98
|
+
The description is the most important part — it determines when Claude will use the skill:
|
|
99
|
+
|
|
100
|
+
```yaml
|
|
101
|
+
---
|
|
102
|
+
name: skill-name
|
|
103
|
+
description: Create new X, modify existing X, and optimize X performance. This skill should be used when the user asks to "specific phrase 1", "specific phrase 2", or mentions related concepts. Use whenever the user wants to...
|
|
104
|
+
version: 0.1.0
|
|
105
|
+
---
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Make descriptions **"pushy"** — explicitly state when to use the skill, including near-miss scenarios:
|
|
109
|
+
|
|
110
|
+
> "Make sure to use this skill whenever the user mentions dashboards, data visualization, internal metrics, or wants to display any kind of company data, even if they don't explicitly ask for a 'dashboard.'"
|
|
111
|
+
|
|
112
|
+
This combats undertriggering, which is the more common failure mode.
|
|
113
|
+
|
|
114
|
+
#### Body
|
|
115
|
+
|
|
116
|
+
Write using **imperative form** (verb-first instructions). Use objective, instructional language:
|
|
117
|
+
|
|
118
|
+
```markdown
|
|
119
|
+
# Good
|
|
120
|
+
Parse the frontmatter using sed.
|
|
121
|
+
To accomplish X, do Y.
|
|
122
|
+
|
|
123
|
+
# Bad
|
|
124
|
+
You should parse the frontmatter...
|
|
125
|
+
If you need to do X, you can...
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
For detailed writing guidance including progressive disclosure patterns, degrees of freedom, and common mistakes, read `references/writing-guide.md`.
|
|
129
|
+
|
|
130
|
+
For plugin-specific skill patterns (auto-discovery, testing with `--plugin-dir`, packaging), read `references/plugin-skill-patterns.md`.
|
|
131
|
+
|
|
132
|
+
### Step 4: Create Test Cases
|
|
133
|
+
|
|
134
|
+
Draft 2-3 realistic test prompts that exercise the skill's core functionality. Save them in a workspace:
|
|
135
|
+
|
|
136
|
+
```json
|
|
137
|
+
{
|
|
138
|
+
"skill_name": "my-skill",
|
|
139
|
+
"evals": [
|
|
140
|
+
{
|
|
141
|
+
"id": 0,
|
|
142
|
+
"prompt": "Realistic user request...",
|
|
143
|
+
"expected_output": "Description of expected result",
|
|
144
|
+
"assertions": ["Output contains X", "Format is correct", "Edge case Y handled"]
|
|
145
|
+
}
|
|
146
|
+
]
|
|
147
|
+
}
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Running and Evaluating Test Cases
|
|
151
|
+
|
|
152
|
+
This is the heart of skill improvement. For the full eval framework details, read `references/eval-framework.md`.
|
|
153
|
+
|
|
154
|
+
### Overview
|
|
155
|
+
|
|
156
|
+
For each test case, spawn **two subagent runs simultaneously**:
|
|
157
|
+
|
|
158
|
+
- **With-skill run**: Uses the skill being developed
|
|
159
|
+
- **Baseline run**: Either no skill (new skills) or previous version (iterations)
|
|
160
|
+
|
|
161
|
+
While runs execute, draft assertions — specific pass/fail criteria for each test case.
|
|
162
|
+
|
|
163
|
+
After runs complete:
|
|
164
|
+
|
|
165
|
+
1. **Grade** outputs using the grader agent (`agents/grader.md`)
|
|
166
|
+
2. **Aggregate** results into benchmark statistics
|
|
167
|
+
3. **Review** with user — present findings conversationally
|
|
168
|
+
4. **Collect feedback** for the next iteration
|
|
169
|
+
|
|
170
|
+
### Workspace Structure
|
|
171
|
+
|
|
172
|
+
```
|
|
173
|
+
<skill-name>-workspace/
|
|
174
|
+
├── iteration-1/
|
|
175
|
+
│ ├── eval-0-name/
|
|
176
|
+
│ │ ├── with_skill/outputs/
|
|
177
|
+
│ │ ├── baseline/outputs/
|
|
178
|
+
│ │ └── eval_metadata.json
|
|
179
|
+
│ ├── benchmark.json
|
|
180
|
+
│ └── feedback.json
|
|
181
|
+
├── iteration-2/
|
|
182
|
+
│ └── [same structure]
|
|
183
|
+
└── skill-snapshot/ (previous version when improving)
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Improving the Skill
|
|
187
|
+
|
|
188
|
+
After reviewing eval results, apply improvements following these principles:
|
|
189
|
+
|
|
190
|
+
1. **Generalize from feedback** — Don't overfit to specific test cases. The skill must work across future uses.
|
|
191
|
+
2. **Keep the prompt lean** — Remove instructions that don't pull their weight.
|
|
192
|
+
3. **Explain the WHY** — Help Claude understand reasoning rather than following rigid rules. Prefer "This format matters because X" over "MUST use this format."
|
|
193
|
+
4. **Look for repeated work** — If all test runs independently wrote the same helper script, bundle it in `scripts/`.
|
|
194
|
+
5. **Set appropriate degrees of freedom** — High-stakes formatting? Lock it down. Creative decisions? Leave room for judgment.
|
|
195
|
+
|
|
196
|
+
### The Iteration Loop
|
|
197
|
+
|
|
198
|
+
1. Apply improvements to the skill
|
|
199
|
+
2. Rerun all test cases into `iteration-<N+1>/`
|
|
200
|
+
3. Compare against previous iteration
|
|
201
|
+
4. Read feedback and repeat until satisfied
|
|
202
|
+
|
|
203
|
+
For advanced comparison, use the **blind comparator** (`agents/comparator.md`) to eliminate bias — it judges two outputs without knowing which version produced them. Then use the **analyzer** (`agents/analyzer.md`) to explain why the winner won and generate actionable improvement suggestions.
|
|
204
|
+
|
|
205
|
+
## Description Optimization
|
|
206
|
+
|
|
207
|
+
After the skill is working well, optimize the description for accurate triggering.
|
|
208
|
+
|
|
209
|
+
### Quick Method
|
|
210
|
+
|
|
211
|
+
Generate 20 realistic eval queries:
|
|
212
|
+
- 8-10 that **should** trigger the skill
|
|
213
|
+
- 8-10 that **should not** (near-misses from adjacent domains)
|
|
214
|
+
|
|
215
|
+
Review with the user, then iteratively improve the description. For the full automated optimization pipeline using `scripts/run_loop.py`, read `references/description-optimization.md`.
|
|
216
|
+
|
|
217
|
+
### How Triggering Works
|
|
218
|
+
|
|
219
|
+
Claude decides whether to invoke a skill based on the skill's name and description (always in context) matched against the user's request. The SKILL.md body is never read during the triggering decision — only the frontmatter description matters.
|
|
220
|
+
|
|
221
|
+
## Validation
|
|
222
|
+
|
|
223
|
+
Before finalizing, validate the skill:
|
|
224
|
+
|
|
225
|
+
```bash
|
|
226
|
+
python3 scripts/quick_validate.py <path/to/skill>
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
This checks frontmatter format, required fields, description quality, and file organization.
|
|
230
|
+
|
|
231
|
+
### Manual Checklist
|
|
232
|
+
|
|
233
|
+
**Structure:**
|
|
234
|
+
- [ ] SKILL.md exists with valid YAML frontmatter
|
|
235
|
+
- [ ] `name` and `description` fields present
|
|
236
|
+
- [ ] Referenced files actually exist
|
|
237
|
+
|
|
238
|
+
**Description:**
|
|
239
|
+
- [ ] Includes specific trigger phrases users would say
|
|
240
|
+
- [ ] Lists concrete scenarios
|
|
241
|
+
- [ ] "Pushy" enough to combat undertriggering
|
|
242
|
+
|
|
243
|
+
**Content:**
|
|
244
|
+
- [ ] Body under 500 lines (use references/ for details)
|
|
245
|
+
- [ ] Imperative form, objective language
|
|
246
|
+
- [ ] Progressive disclosure applied
|
|
247
|
+
|
|
248
|
+
**Resources:**
|
|
249
|
+
- [ ] Scripts are executable and documented
|
|
250
|
+
- [ ] References don't duplicate SKILL.md content
|
|
251
|
+
- [ ] Examples are complete and working
|
|
252
|
+
|
|
253
|
+
## Bundled Scripts
|
|
254
|
+
|
|
255
|
+
This skill includes Python scripts for automated evaluation:
|
|
256
|
+
|
|
257
|
+
| Script | Purpose | Dependencies |
|
|
258
|
+
|--------|---------|-------------|
|
|
259
|
+
| `quick_validate.py` | Validate skill structure | `pyyaml` |
|
|
260
|
+
| `run_eval.py` | Test description triggering | `anthropic` |
|
|
261
|
+
| `improve_description.py` | Improve descriptions with Claude | `anthropic` |
|
|
262
|
+
| `run_loop.py` | Iterative eval + improve loop | `anthropic` |
|
|
263
|
+
| `generate_report.py` | HTML report generation | stdlib |
|
|
264
|
+
| `aggregate_benchmark.py` | Benchmark statistics | stdlib |
|
|
265
|
+
|
|
266
|
+
**Requirements:** Python 3.10+, `pip install anthropic pyyaml`
|
|
267
|
+
|
|
268
|
+
## Reference Files
|
|
269
|
+
|
|
270
|
+
For detailed documentation on specific topics:
|
|
271
|
+
|
|
272
|
+
| File | When to Read |
|
|
273
|
+
|------|-------------|
|
|
274
|
+
| `references/eval-framework.md` | Setting up and running the full eval pipeline |
|
|
275
|
+
| `references/description-optimization.md` | Automated description improvement workflow |
|
|
276
|
+
| `references/json-schemas.md` | JSON format specifications for all tracking files |
|
|
277
|
+
| `references/writing-guide.md` | Skill writing style, philosophy, and common mistakes |
|
|
278
|
+
| `references/plugin-skill-patterns.md` | Building skills within Claude Code plugins |
|
|
279
|
+
|
|
280
|
+
## Push Integration
|
|
281
|
+
|
|
282
|
+
This skill is bundled with the `push-todo` CLI. When working on a Push voice task (invoked via `/push-todo`), skills can report progress back to the Push iOS app using CLI commands.
|
|
283
|
+
|
|
284
|
+
### Available Callbacks
|
|
285
|
+
|
|
286
|
+
| Command | Purpose |
|
|
287
|
+
|---------|---------|
|
|
288
|
+
| `push-todo --mark-completed <uuid>` | Mark a task as done |
|
|
289
|
+
| `push-todo --mark-completed <uuid> --completion-comment "..."` | Mark done with summary |
|
|
290
|
+
| `push-todo --learn-vocabulary <uuid> --keywords 'term1,term2'` | Teach routing keywords |
|
|
291
|
+
|
|
292
|
+
### When to Use
|
|
293
|
+
|
|
294
|
+
These callbacks are relevant when:
|
|
295
|
+
- The user invoked `/push-todo` and selected a task to work on
|
|
296
|
+
- A skill completes work that resolves a Push task
|
|
297
|
+
- The daemon is executing a task autonomously
|
|
298
|
+
|
|
299
|
+
For skill-builder specifically: after creating or improving a skill as part of a Push task, mark it complete and contribute vocabulary terms (e.g., `skill,SKILL.md,eval,benchmark,frontmatter`).
|
|
300
|
+
|
|
301
|
+
### Not Always Relevant
|
|
302
|
+
|
|
303
|
+
Many skill-builder invocations have nothing to do with Push — the user just wants to create a skill. The Push callbacks only matter when there's an active Push task context. Don't call them unprompted.
|
|
304
|
+
|
|
305
|
+
## Agent Files
|
|
306
|
+
|
|
307
|
+
Specialized subagent instructions (read when spawning subagents for evaluation):
|
|
308
|
+
|
|
309
|
+
| File | Purpose |
|
|
310
|
+
|------|---------|
|
|
311
|
+
| `agents/grader.md` | Assertion-based output grading (8-step process) |
|
|
312
|
+
| `agents/comparator.md` | Blind A/B comparison between skill versions |
|
|
313
|
+
| `agents/analyzer.md` | Post-hoc analysis and improvement suggestions |
|