guild-agents 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/README.md +16 -0
  2. package/bin/guild.js +46 -0
  3. package/package.json +1 -1
  4. package/src/commands/eval.js +225 -0
  5. package/src/commands/stats.js +147 -0
  6. package/src/templates/skills/build-feature/evals/triggers.json +16 -0
  7. package/src/templates/skills/council/evals/triggers.json +16 -0
  8. package/src/templates/skills/create-pr/evals/evals.json +44 -0
  9. package/src/templates/skills/create-pr/evals/triggers.json +16 -0
  10. package/src/templates/skills/debug/SKILL.md +1 -1
  11. package/src/templates/skills/debug/evals/triggers.json +16 -0
  12. package/src/templates/skills/dev-flow/evals/evals.json +36 -0
  13. package/src/templates/skills/dev-flow/evals/triggers.json +16 -0
  14. package/src/templates/skills/guild-specialize/evals/evals.json +54 -0
  15. package/src/templates/skills/guild-specialize/evals/triggers.json +16 -0
  16. package/src/templates/skills/new-feature/evals/evals.json +41 -0
  17. package/src/templates/skills/new-feature/evals/triggers.json +16 -0
  18. package/src/templates/skills/qa-cycle/evals/evals.json +46 -0
  19. package/src/templates/skills/qa-cycle/evals/triggers.json +16 -0
  20. package/src/templates/skills/re-specialize/evals/evals.json +48 -0
  21. package/src/templates/skills/re-specialize/evals/triggers.json +16 -0
  22. package/src/templates/skills/review/evals/evals.json +43 -0
  23. package/src/templates/skills/review/evals/triggers.json +16 -0
  24. package/src/templates/skills/session-end/evals/evals.json +40 -0
  25. package/src/templates/skills/session-end/evals/triggers.json +16 -0
  26. package/src/templates/skills/session-start/evals/evals.json +50 -0
  27. package/src/templates/skills/session-start/evals/triggers.json +16 -0
  28. package/src/templates/skills/status/evals/evals.json +40 -0
  29. package/src/templates/skills/status/evals/triggers.json +16 -0
  30. package/src/templates/skills/tdd/evals/triggers.json +16 -0
  31. package/src/templates/skills/verify/evals/triggers.json +16 -0
  32. package/src/utils/accounting.js +139 -0
  33. package/src/utils/benchmark.js +128 -0
  34. package/src/utils/description-analyzer.js +92 -0
  35. package/src/utils/pricing.js +28 -0
  36. package/src/utils/semantic-matcher.js +91 -0
  37. package/src/utils/trigger-matcher.js +64 -0
  38. package/src/utils/trigger-runner.js +132 -0
package/README.md CHANGED
@@ -93,11 +93,27 @@ guild list # List agents and skills
93
93
  guild run <skill> # Preview a skill's execution plan (dry-run)
94
94
  guild logs # View execution traces
95
95
  guild logs clean # Remove old traces (--days N, --all)
96
+ guild stats # Token usage and cost estimates
97
+ guild eval # Run structural skill evaluations
98
+ guild eval --triggers # Run trigger accuracy tests (keyword matcher)
99
+ guild eval --semantic # Run trigger tests with LLM semantic matcher
100
+ guild eval --suggest # Show description improvement suggestions
96
101
  guild workspace init <name> <members...> # Create a workspace
97
102
  guild workspace add <path> # Add a member repo
98
103
  guild workspace status # Show workspace state
99
104
  ```
100
105
 
106
+ ## Skill Evaluations
107
+
108
+ Guild includes a built-in evaluation framework for validating skill quality:
109
+
110
+ - **Structural evals** (`guild eval`) -- assert workflow structure: steps exist, roles are correct, gates are present
111
+ - **Trigger tests** (`guild eval --triggers`) -- verify that user prompts route to the correct skill using keyword overlap scoring
112
+ - **Semantic matcher** (`guild eval --semantic`) -- optional LLM-based scoring via Anthropic Haiku for higher-fidelity trigger testing (requires `ANTHROPIC_API_KEY`)
113
+ - **Description suggestions** (`guild eval --suggest`) -- analyzes keyword gaps in skill descriptions based on failed triggers
114
+
115
+ Every trigger run automatically records results to `benchmarks/benchmark.json` (rolling 30-entry history) and generates `benchmarks/benchmark.md` with per-skill accuracy, precision, recall, and delta vs previous run. Regressions (>5% accuracy drop with 2+ tests flipped) are flagged automatically.
116
+
101
117
  ## Under the Hood
102
118
 
103
119
  Guild coordinates 10 specialized agents through the pipeline. Each agent handles one phase.
package/bin/guild.js CHANGED
@@ -8,6 +8,7 @@
8
8
  * guild status — view project status
9
9
  * guild doctor — verify setup and report issues
10
10
  * guild list — list installed agents and skills
11
+ * guild stats — view token usage and cost stats
11
12
  */
12
13
 
13
14
  import { program } from 'commander';
@@ -168,6 +169,51 @@ logsCmd
168
169
  }
169
170
  });
170
171
 
172
// guild eval — structural evals by default; any of --triggers / --semantic /
// --suggest switches the command to the trigger-test runner instead.
program
  .command('eval')
  .description('Run skill structural evaluations')
  .argument('[skill]', 'Skill name to evaluate (or all if omitted)')
  .option('--triggers', 'Run trigger tests instead of structural evals')
  .option('--semantic', 'Use LLM-based semantic matcher for trigger tests')
  .option('--suggest', 'Show description improvement suggestions')
  .action(async (skill, options) => {
    const wantsTriggerRun = Boolean(options.triggers || options.semantic || options.suggest);
    try {
      // Loaded lazily so the eval machinery is only imported when this command runs.
      const evalCommands = await import('../src/commands/eval.js');
      if (!wantsTriggerRun) {
        await evalCommands.runEval(skill);
        return;
      }
      await evalCommands.runEvalTriggers(skill, {
        semantic: options.semantic || false,
        suggest: options.suggest || false,
      });
    } catch (err) {
      console.error(err.message);
      process.exit(1);
    }
  });
197
+
198
// guild stats — the parsed options object is handed to runStats unchanged.
program
  .command('stats')
  .description('View token usage stats and cost estimates')
  .option('--period <period>', 'Filter by period: today, week, month, all', 'month')
  .option('--compare', 'Compare cost across model profiles')
  .option('--reset', 'Delete all usage history')
  .option('-f, --force', 'Skip confirmation prompt (for --reset)')
  .option('--export <format>', 'Export data (csv)')
  .action(async (options) => {
    try {
      // Loaded lazily so the stats module is only imported when this command runs.
      const statsCommands = await import('../src/commands/stats.js');
      await statsCommands.runStats(options);
    } catch (err) {
      console.error(err.message);
      process.exit(1);
    }
  });
216
+
171
217
  // guild workspace
172
218
  const workspaceCmd = program
173
219
  .command('workspace')
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "guild-agents",
3
- "version": "1.3.0",
3
+ "version": "1.4.0",
4
4
  "description": "Specification-driven development CLI for Claude Code — think before you build",
5
5
  "type": "module",
6
6
  "files": [
@@ -0,0 +1,225 @@
1
+ /**
2
+ * eval.js — Run skill structural evaluations.
3
+ *
4
+ * Usage:
5
+ * guild eval — Run all skills that have evals
6
+ * guild eval build-feature — Run evals for a specific skill
7
+ */
8
+
9
+ import * as p from '@clack/prompts';
10
+ import chalk from 'chalk';
11
+ import { readdirSync, readFileSync, writeFileSync } from 'fs';
12
+ import { join, dirname } from 'path';
13
+ import { fileURLToPath } from 'url';
14
+ import { loadEvals, runEvals } from '../utils/eval-runner.js';
15
+ import { loadTriggers, runTriggerTests, computeAccuracy, loadAllSkillDescriptions } from '../utils/trigger-runner.js';
16
+
17
+ const __dirname = dirname(fileURLToPath(import.meta.url));
18
+ const SKILLS_DIR = join(__dirname, '..', 'templates', 'skills');
19
+
20
/**
 * Runs the structural evals and prints one pass/fail line per eval.
 *
 * With a skill name, only that skill is evaluated. Without one, every directory
 * under SKILLS_DIR for which loadEvals() returns a non-null value is evaluated.
 * A skill whose run throws is reported and counted as one failure. The process
 * is terminated with exit code 1 when at least one eval failed.
 *
 * @param {string} [skillName] - Specific skill to evaluate, or all if omitted
 * @returns {Promise<void>}
 */
export async function runEval(skillName) {
  let targets;
  if (skillName) {
    targets = [skillName];
  } else {
    targets = readdirSync(SKILLS_DIR, { withFileTypes: true })
      .filter(entry => entry.isDirectory())
      .map(entry => entry.name)
      .filter(name => loadEvals(name) !== null);
  }

  p.intro(chalk.bold.cyan(`Guild Eval — ${skillName || 'all skills'}`));

  const tally = { passed: 0, failed: 0 };

  // Prints a single eval outcome and, for failures, each unmet expectation.
  const reportOutcome = (skill, outcome) => {
    const line = `${chalk.gray(skill)} ${outcome.description}`;
    if (outcome.passed) {
      p.log.success(line);
      tally.passed += 1;
      return;
    }
    p.log.error(line);
    outcome.expectations
      .filter(expectation => !expectation.passed)
      .forEach((expectation) => {
        p.log.info(chalk.red(` ↳ ${expectation.text}: ${expectation.evidence}`));
      });
    tally.failed += 1;
  };

  for (const skill of targets) {
    try {
      const report = runEvals(skill);
      for (const outcome of report.results) {
        reportOutcome(skill, outcome);
      }
    } catch (err) {
      // A skill that cannot be evaluated at all counts as a single failure.
      p.log.error(`${skill}: ${err.message}`);
      tally.failed += 1;
    }
  }

  const failedPart = tally.failed > 0 ? `, ${chalk.red(`${tally.failed} failed`)}` : '';
  const passedPart = chalk.green(`${tally.passed} passed`);
  p.outro(`${tally.passed + tally.failed} evals: ${passedPart}${failedPart}`);

  if (tally.failed > 0) {
    process.exit(1);
  }
}
63
+
64
/**
 * Runs trigger evaluations with optional semantic matcher, benchmarks, and suggestions.
 *
 * For every selected skill the trigger tests are executed, per-skill accuracy,
 * precision and recall are printed, and each mismatching prompt is listed as
 * MISS (expected to trigger) or FALSE+ (expected not to trigger).
 *
 * When at least one test ran, the run is appended to `benchmarks/benchmark.json`
 * (resolved two directories above this module) and `benchmarks/benchmark.md` is
 * regenerated. Deltas and regressions are computed against the most recent
 * earlier entry recorded with the same matcher, because scores from the keyword
 * and semantic matchers are not comparable with each other.
 *
 * Exits the process with code 1 when `semantic` is requested but
 * ANTHROPIC_API_KEY is not set.
 *
 * @param {string} [skillName] - Specific skill or all
 * @param {object} [options] - CLI options
 * @param {boolean} [options.semantic=false] - Use semantic matcher
 * @param {boolean} [options.suggest=false] - Show description suggestions
 * @returns {Promise<void>}
 */
export async function runEvalTriggers(skillName, options = {}) {
  const { semantic = false, suggest = false } = options;
  const allSkills = loadAllSkillDescriptions();

  // The semantic matcher cannot run without an API key, so stop before doing any work.
  if (semantic && !process.env.ANTHROPIC_API_KEY) {
    p.log.warn(chalk.yellow('ANTHROPIC_API_KEY not set — semantic matcher requires it'));
    process.exit(1);
  }

  // Lazy-load semantic matcher only when needed
  let scoreMatchSemantic;
  if (semantic) {
    const mod = await import('../utils/semantic-matcher.js');
    scoreMatchSemantic = mod.scoreMatchSemantic;
  }

  const skills = skillName
    ? [skillName]
    : readdirSync(SKILLS_DIR, { withFileTypes: true })
      .filter(d => d.isDirectory())
      .map(d => d.name)
      .filter(name => loadTriggers(name) !== null);

  const matcherLabel = semantic ? 'semantic' : 'keyword';
  p.intro(chalk.bold.cyan(`Guild Trigger Tests [${matcherLabel}] — ${skillName || 'all skills'}`));

  let totalSkills = 0;
  let totalTests = 0;
  let totalCorrect = 0;
  const allResults = [];
  const benchmarkSkills = [];

  for (const skill of skills) {
    const triggers = loadTriggers(skill);
    if (!triggers) {
      p.log.warn(`${skill}: no triggers.json`);
      continue;
    }

    const results = await runTriggerTests(triggers, allSkills, {
      semantic,
      scoreMatchSemantic,
    });
    const acc = computeAccuracy(results);
    totalSkills++;
    totalTests += acc.total;
    totalCorrect += acc.tp + acc.tn;

    const icon = acc.accuracy === 1 ? chalk.green('✓') : acc.accuracy >= 0.75 ? chalk.yellow('~') : chalk.red('✗');
    p.log.info(`${icon} ${chalk.bold(skill)} accuracy=${(acc.accuracy * 100).toFixed(0)}% precision=${(acc.precision * 100).toFixed(0)}% recall=${(acc.recall * 100).toFixed(0)}%`);

    // Show failures
    for (const r of results) {
      if (r.expected === r.actual) continue;
      const label = r.expected ? chalk.red('MISS') : chalk.yellow('FALSE+');
      let detail = `(score=${r.score.toFixed(2)}`;
      // Treat a missing rank like a null one so "rank=#undefined" is never printed.
      if (r.rank !== null && r.rank !== undefined) detail += `, rank=#${r.rank}`;
      if (r.reasoning) detail += `, reason: ${r.reasoning}`;
      detail += ')';
      p.log.info(chalk.gray(` ${label} "${r.prompt}" ${detail}`));
    }

    allResults.push({ skill, results, triggers });
    benchmarkSkills.push({
      name: skill,
      accuracy: acc.accuracy,
      precision: acc.precision,
      recall: acc.recall,
      tp: acc.tp,
      fp: acc.fp,
      fn: acc.fn,
      tn: acc.tn,
    });
  }

  const overallAcc = totalTests > 0 ? ((totalCorrect / totalTests) * 100).toFixed(0) : '0';

  if (totalTests > 0) {
    // Record benchmark
    const { recordBenchmark, generateReport, detectRegressions } = await import('../utils/benchmark.js');
    const benchmarkDir = join(__dirname, '..', '..', 'benchmarks');
    const benchmarkPath = join(benchmarkDir, 'benchmark.json');
    const reportPath = join(benchmarkDir, 'benchmark.md');

    const skillCount = benchmarkSkills.length || 1;
    const entry = {
      timestamp: new Date().toISOString(),
      matcher: matcherLabel,
      model: semantic ? (process.env.GUILD_SEMANTIC_MODEL || 'claude-haiku-4-5-20251001') : null,
      skills: benchmarkSkills,
      aggregate: {
        accuracy: totalCorrect / totalTests,
        // Precision and recall are the unweighted mean of the per-skill values.
        precision: benchmarkSkills.reduce((s, sk) => s + sk.precision, 0) / skillCount,
        recall: benchmarkSkills.reduce((s, sk) => s + sk.recall, 0) / skillCount,
        total: totalTests,
      },
    };

    recordBenchmark(entry, benchmarkPath);

    // Baseline = latest earlier run that used the same matcher. The entry just
    // recorded is the last element, so it is excluded from the search.
    const history = JSON.parse(readFileSync(benchmarkPath, 'utf8'));
    const previous = history
      .slice(0, -1)
      .reverse()
      .find(past => past.matcher === entry.matcher) ?? null;

    const report = generateReport(entry, previous);
    writeFileSync(reportPath, report);
    p.log.info(chalk.gray(`Benchmark recorded → benchmarks/benchmark.json`));

    // Check for regressions
    const regressions = detectRegressions(entry, previous);
    if (regressions.length > 0) {
      p.log.warn(chalk.yellow.bold('Regressions detected:'));
      for (const reg of regressions) {
        p.log.warn(chalk.yellow(` ${reg.skill}: ${(reg.previousAccuracy * 100).toFixed(0)}% → ${(reg.currentAccuracy * 100).toFixed(0)}% (${reg.flippedTests} tests flipped)`));
      }
    }
  } else {
    // An all-zero entry would pollute the rolling history and become the next
    // run's baseline, so runs without any test are not recorded.
    p.log.warn('No trigger tests ran — benchmark not recorded');
  }

  // Description suggestions
  if (suggest) {
    const { analyzeGaps, generateSuggestions } = await import('../utils/description-analyzer.js');

    const gapsList = [];
    for (const { skill, results, triggers } of allResults) {
      const skillDesc = allSkills.find(s => s.name === skill);
      // Prefer the loaded skill description; fall back to the one in triggers.json.
      const currentDescription = skillDesc?.description || triggers.description;
      const gaps = analyzeGaps(results, currentDescription);
      if (gaps.missingKeywords.length > 0) {
        gapsList.push({
          skill,
          currentDescription,
          ...gaps,
        });
      }
    }

    const suggestions = generateSuggestions(gapsList);
    if (suggestions.length > 0) {
      p.log.info('');
      p.log.info(chalk.bold.cyan('Description Suggestions:'));
      for (const sug of suggestions) {
        const highWords = sug.suggestedKeywords.filter(k => k.confidence === 'high').map(k => k.word);
        const medWords = sug.suggestedKeywords.filter(k => k.confidence === 'medium').map(k => k.word);
        const parts = [];
        if (highWords.length > 0) parts.push(`${highWords.join(', ')} (high)`);
        if (medWords.length > 0) parts.push(`${medWords.join(', ')} (medium)`);
        p.log.warn(` ${chalk.bold(sug.skill)} — ${sug.suggestedKeywords.length} missing keywords`);
        p.log.info(chalk.gray(` Missing: ${parts.join(', ')}`));
        p.log.info(chalk.gray(` Current: "${sug.currentDescription}"`));
      }
    } else {
      p.log.success('No description gaps found');
    }
  }

  p.outro(`${totalSkills} skills, ${totalTests} tests, ${overallAcc}% overall accuracy`);
}
@@ -0,0 +1,147 @@
1
+ /**
2
+ * stats.js — Token usage stats command.
3
+ */
4
+
5
+ import * as p from '@clack/prompts';
6
+ import chalk from 'chalk';
7
+ import { existsSync, unlinkSync, copyFileSync } from 'fs';
8
+ import { join } from 'path';
9
+ import { loadUsage, aggregate, estimateWithProfile } from '../utils/accounting.js';
10
+ import { getModelShortName } from '../utils/pricing.js';
11
+
12
+ const USAGE_PATH = join('.claude', 'guild', 'usage.json');
13
+
14
/**
 * Formats a number using en-US locale conventions (thousands separators).
 * @param {number} n - Value to format
 * @returns {string} Locale-formatted text, e.g. "1,234,567"
 */
function fmt(n) {
  const grouped = n.toLocaleString('en-US');
  return grouped;
}
17
+
18
/**
 * Formats an amount as dollars with exactly two decimal places.
 * @param {number} n - Amount in USD
 * @returns {string} Text such as "$12.50"
 */
function usd(n) {
  const amount = n.toFixed(2);
  return `$${amount}`;
}
21
+
22
/**
 * Expresses `part` as a whole-number percentage of `total`.
 * @param {number} part - Portion of the total
 * @param {number} total - Whole amount; a total of exactly 0 yields "0%"
 * @returns {string} Rounded percentage followed by "%"
 */
function pct(part, total) {
  if (total !== 0) {
    const share = Math.round((part / total) * 100);
    return `${share}%`;
  }
  // Avoid dividing by zero when there is nothing to take a share of.
  return '0%';
}
26
+
27
// Heading text shown for each value accepted by the --period option.
// A period without an entry here is displayed verbatim by runStats.
const PERIOD_LABELS = {
  today: 'Today',
  week: 'Last 7 days',
  month: 'Last 30 days',
  all: 'All time',
};
33
+
34
/**
 * Serializes usage entries as CSV text with a header row.
 *
 * Text fields are quoted when they contain a comma, a double quote, or a line
 * break, and embedded double quotes are doubled, so such values cannot shift
 * columns or split a row. Values without those characters are written
 * unchanged. The cost column always has six decimal places.
 *
 * @param {Array<object>} entries - Usage entries with timestamp, workflow, agent,
 *   tier, model, inputTokens, outputTokens, totalTokens and estimatedCostUSD
 * @returns {string} CSV document terminated by a newline
 */
export function formatCsv(entries) {
  const headers = 'timestamp,workflow,agent,tier,model,inputTokens,outputTokens,totalTokens,estimatedCostUSD';

  // Quotes a single field only when it would otherwise break the row.
  const csvField = (value) => {
    const text = String(value);
    return /[",\r\n]/.test(text) ? `"${text.replaceAll('"', '""')}"` : text;
  };

  const rows = entries.map(e =>
    [
      e.timestamp,
      e.workflow,
      e.agent,
      e.tier,
      e.model,
      e.inputTokens,
      e.outputTokens,
      e.totalTokens,
      e.estimatedCostUSD.toFixed(6),
    ].map(csvField).join(',')
  );

  return [headers, ...rows].join('\n') + '\n';
}
41
+
42
/**
 * Entry point for `guild stats`: prints token usage totals for the current
 * working directory, or performs a reset / export when requested.
 *
 * Modes, checked in this order:
 *  - `options.reset`  — delegates to handleReset and returns its result.
 *  - `options.export` — writes the usage entries to stdout as CSV. Only "csv"
 *    is supported; any other value raises an error instead of being ignored.
 *  - otherwise        — prints the summary for `options.period` with per-tier,
 *    per-model and per-workflow token breakdowns, plus a profile cost
 *    comparison when `options.compare` is set.
 *
 * @param {object} [options] - Parsed CLI options
 * @param {string} [options.period='month'] - Period passed to aggregate()
 * @param {boolean} [options.compare] - Also print the profile cost comparison
 * @param {boolean} [options.reset] - Delete the usage history instead of reporting
 * @param {boolean} [options.force] - Skip the reset confirmation prompt
 * @param {string} [options.export] - Export format; only "csv" is supported
 * @returns {Promise<void>}
 * @throws {Error} When `options.export` names an unsupported format
 */
export async function runStats(options = {}) {
  const root = process.cwd();

  if (options.reset) {
    return handleReset(root, options.force);
  }

  if (options.export !== undefined && options.export !== null) {
    if (options.export !== 'csv') {
      throw new Error(`Unsupported export format "${options.export}". Supported formats: csv`);
    }
    const usage = loadUsage(root);
    if (usage.entries.length === 0) {
      // stderr keeps this notice out of a redirected CSV file.
      console.error('No usage data to export.');
      return;
    }
    process.stdout.write(formatCsv(usage.entries));
    return;
  }

  const period = options.period || 'month';
  const totals = aggregate(root, period);

  p.intro(chalk.bold.cyan(`Guild Usage Stats — ${PERIOD_LABELS[period] || period}`));

  if (totals.totalTokens === 0) {
    p.log.info('No usage data yet. Token tracking will begin when workflows record usage.');
    p.outro('');
    return;
  }

  p.log.step('Summary');
  p.log.info(` Workflows executed: ${chalk.bold(fmt(totals.workflowCount))}`);
  p.log.info(` Total tokens: ${chalk.bold(fmt(totals.totalTokens))}`);
  p.log.info(` Estimated cost: ${chalk.bold.green(usd(totals.totalCostUSD))}`);

  if (Object.keys(totals.tokensByTier).length > 0) {
    p.log.step('By tier');
    for (const [tier, tokens] of Object.entries(totals.tokensByTier)) {
      p.log.info(` ${tier.padEnd(12)} ${fmt(tokens).padStart(10)} tok (${pct(tokens, totals.totalTokens).padStart(4)})`);
    }
  }

  if (Object.keys(totals.tokensByModel).length > 0) {
    p.log.step('By model');
    for (const [model, tokens] of Object.entries(totals.tokensByModel)) {
      p.log.info(` ${getModelShortName(model).padEnd(12)} ${fmt(tokens).padStart(10)} tok`);
    }
  }

  if (Object.keys(totals.tokensByWorkflow).length > 0) {
    p.log.step('Top workflows');
    // Largest token consumers first.
    const sorted = Object.entries(totals.tokensByWorkflow).sort((a, b) => b[1] - a[1]);
    for (const [wf, tokens] of sorted) {
      p.log.info(` ${wf.padEnd(20)} ${fmt(tokens).padStart(10)} tok`);
    }
  }

  if (options.compare) {
    // NOTE(review): the comparison is computed over every recorded entry, while
    // the sections above cover only `period`. Confirm whether it should be
    // restricted to the same period as the rest of the report.
    const { entries } = loadUsage(root);
    const maxCost = estimateWithProfile(entries, 'max');
    const proCost = estimateWithProfile(entries, 'pro');
    const allOpusCost = estimateWithProfile(entries, 'all-opus');

    p.log.step('Profile comparison');
    // "max" is the baseline the other profiles are compared against.
    p.log.info(` ${'max'.padEnd(12)} ${usd(maxCost).padStart(10)} —`);
    p.log.info(` ${'pro'.padEnd(12)} ${usd(proCost).padStart(10)} ${diffLabel(proCost, maxCost)}`);
    p.log.info(` ${'all-opus'.padEnd(12)} ${usd(allOpusCost).padStart(10)} ${diffLabel(allOpusCost, maxCost)}`);
  }

  p.outro('');
}
112
+
113
/**
 * Builds the gray "+N%" / "-N%" label comparing a cost with a baseline cost.
 * @param {number} cost - Cost being compared
 * @param {number} baseline - Reference cost
 * @returns {string} Styled percentage difference, or '' when the baseline is 0
 */
function diffLabel(cost, baseline) {
  // No meaningful percentage exists relative to a zero baseline.
  if (baseline === 0) {
    return '';
  }
  const deltaPct = ((cost - baseline) / baseline) * 100;
  // Negative values already carry their own minus sign.
  let sign = '';
  if (deltaPct >= 0) {
    sign = '+';
  }
  return chalk.gray(`${sign}${Math.round(deltaPct)}%`);
}
119
+
120
/**
 * Deletes the usage history file under `root`, copying it to a sibling ".bak"
 * file first. Unless `force` is truthy the user is asked to confirm; declining
 * or cancelling the prompt leaves the file untouched.
 *
 * @param {string} root - Project root that contains the usage file
 * @param {boolean} [force] - Skip the confirmation prompt
 * @returns {Promise<void>}
 */
async function handleReset(root, force) {
  const usageFile = join(root, USAGE_PATH);

  p.intro(chalk.bold.cyan('Guild — Reset Usage Stats'));

  const hasHistory = existsSync(usageFile);
  if (!hasHistory) {
    p.log.info('No usage data found. Nothing to reset.');
    p.outro('');
    return;
  }

  if (!force) {
    const answer = await p.confirm({
      message: 'This will delete all usage history. Continue?',
      initialValue: false,
    });

    // Only an explicit "yes" proceeds; a cancelled prompt counts as "no".
    const approved = !p.isCancel(answer) && answer;
    if (!approved) {
      p.cancel('Reset cancelled.');
      return;
    }
  }

  // Keep a copy next to the original before removing it.
  const backupFile = usageFile + '.bak';
  copyFileSync(usageFile, backupFile);
  unlinkSync(usageFile);
  p.log.success(`${chalk.green('✓')} Usage history deleted. Backup saved as usage.json.bak.`);
  p.outro('');
}
@@ -0,0 +1,16 @@
1
+ {
2
+ "skill": "build-feature",
3
+ "matcherType": "keyword",
4
+ "description": "Full pipeline: evaluation -> spec -> implementation -> review -> QA",
5
+ "threshold": 0.3,
6
+ "tests": [
7
+ { "prompt": "build a new feature with full pipeline", "shouldTrigger": true },
8
+ { "prompt": "implement this feature end to end", "shouldTrigger": true, "keywordExpected": false },
9
+ { "prompt": "run the full implementation pipeline", "shouldTrigger": true },
10
+ { "prompt": "I want to ship this end to end", "shouldTrigger": true, "keywordExpected": false },
11
+ { "prompt": "review my code", "shouldTrigger": false },
12
+ { "prompt": "create a pull request", "shouldTrigger": false },
13
+ { "prompt": "save my session", "shouldTrigger": false },
14
+ { "prompt": "debug this bug", "shouldTrigger": false }
15
+ ]
16
+ }
@@ -0,0 +1,16 @@
1
+ {
2
+ "skill": "council",
3
+ "matcherType": "keyword",
4
+ "description": "Convenes multiple agents to debate an important decision",
5
+ "threshold": 0.3,
6
+ "tests": [
7
+ { "prompt": "convene a council to debate this decision", "shouldTrigger": true },
8
+ { "prompt": "I need multiple agents to debate this", "shouldTrigger": true },
9
+ { "prompt": "let the council decide", "shouldTrigger": true, "keywordExpected": false },
10
+ { "prompt": "I need help making a decision", "shouldTrigger": true, "keywordExpected": false },
11
+ { "prompt": "build a new feature", "shouldTrigger": false },
12
+ { "prompt": "review my code", "shouldTrigger": false },
13
+ { "prompt": "save my session", "shouldTrigger": false },
14
+ { "prompt": "debug this bug", "shouldTrigger": false }
15
+ ]
16
+ }
@@ -0,0 +1,44 @@
1
+ {
2
+ "skill": "create-pr",
3
+ "evals": [
4
+ {
5
+ "id": "cpr-has-core-steps",
6
+ "description": "PR creation has verify, gather, generate, create steps",
7
+ "expectations": [
8
+ { "text": "Has verify-branch step", "assertion": "step-exists:verify-branch" },
9
+ { "text": "Has gather-context step", "assertion": "step-exists:gather-context" },
10
+ { "text": "Has generate-description step", "assertion": "step-exists:generate-description" },
11
+ { "text": "Has create-pr step", "assertion": "step-exists:create-pr" }
12
+ ]
13
+ },
14
+ {
15
+ "id": "cpr-all-system-role",
16
+ "description": "All steps use system role (no agent delegation)",
17
+ "expectations": [
18
+ { "text": "verify-branch is system", "assertion": "step-role:verify-branch:system" },
19
+ { "text": "gather-context is system", "assertion": "step-role:gather-context:system" },
20
+ { "text": "generate-description is system", "assertion": "step-role:generate-description:system" },
21
+ { "text": "create-pr is system", "assertion": "step-role:create-pr:system" },
22
+ { "text": "post-creation is system", "assertion": "step-role:post-creation:system" }
23
+ ]
24
+ },
25
+ {
26
+ "id": "cpr-gates",
27
+ "description": "Gates at description generation and post-creation",
28
+ "expectations": [
29
+ { "text": "Generate-description has gate", "assertion": "gate-exists:generate-description" },
30
+ { "text": "Post-creation has gate", "assertion": "gate-exists:post-creation" }
31
+ ]
32
+ },
33
+ {
34
+ "id": "cpr-dependencies",
35
+ "description": "Steps have correct dependency chain",
36
+ "expectations": [
37
+ { "text": "gather-context requires branch-state", "assertion": "step-requires:gather-context:branch-state" },
38
+ { "text": "generate-description requires commit-list", "assertion": "step-requires:generate-description:commit-list" },
39
+ { "text": "create-pr requires pr-description", "assertion": "step-requires:create-pr:pr-description" },
40
+ { "text": "post-creation requires pr-url", "assertion": "step-requires:post-creation:pr-url" }
41
+ ]
42
+ }
43
+ ]
44
+ }
@@ -0,0 +1,16 @@
1
+ {
2
+ "skill": "create-pr",
3
+ "matcherType": "keyword",
4
+ "description": "Create a pull request from the current branch with structured summary",
5
+ "threshold": 0.3,
6
+ "tests": [
7
+ { "prompt": "create a pull request", "shouldTrigger": true },
8
+ { "prompt": "open a PR for this branch", "shouldTrigger": true },
9
+ { "prompt": "push and create PR", "shouldTrigger": true },
10
+ { "prompt": "I'm ready to submit this for review", "shouldTrigger": true, "keywordExpected": false },
11
+ { "prompt": "review my code changes", "shouldTrigger": false },
12
+ { "prompt": "start a new feature", "shouldTrigger": false },
13
+ { "prompt": "deploy to production", "shouldTrigger": false },
14
+ { "prompt": "save my session", "shouldTrigger": false }
15
+ ]
16
+ }
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: debug
3
- description: "Discipline skill — systematic debugging process. Use when encountering any bug, test failure, or unexpected behavior, before proposing fixes."
3
+ description: "Discipline skill — systematic debugging process. Use when encountering any bug, debug issue, test failure, broken function, or unexpected behavior, before proposing fixes."
4
4
  user-invocable: true
5
5
  ---
6
6
 
@@ -0,0 +1,16 @@
1
+ {
2
+ "skill": "debug",
3
+ "matcherType": "keyword",
4
+ "description": "Discipline skill — systematic debugging process. Use when encountering any bug, debug issue, test failure, broken function, or unexpected behavior, before proposing fixes.",
5
+ "threshold": 0.3,
6
+ "tests": [
7
+ { "prompt": "I have a bug in the login flow", "shouldTrigger": true, "keywordExpected": false },
8
+ { "prompt": "tests are failing unexpectedly", "shouldTrigger": true },
9
+ { "prompt": "unexpected behavior in the API", "shouldTrigger": true },
10
+ { "prompt": "help me debug this function", "shouldTrigger": true },
11
+ { "prompt": "create a new feature", "shouldTrigger": false },
12
+ { "prompt": "review my code", "shouldTrigger": false },
13
+ { "prompt": "save my session", "shouldTrigger": false },
14
+ { "prompt": "what phase am I in", "shouldTrigger": false }
15
+ ]
16
+ }
@@ -0,0 +1,36 @@
1
+ {
2
+ "skill": "dev-flow",
3
+ "evals": [
4
+ {
5
+ "id": "df-has-steps",
6
+ "description": "Dev flow has read-state and present-flow steps",
7
+ "expectations": [
8
+ { "text": "Has read-state step", "assertion": "step-exists:read-state" },
9
+ { "text": "Has present-flow step", "assertion": "step-exists:present-flow" }
10
+ ]
11
+ },
12
+ {
13
+ "id": "df-all-system",
14
+ "description": "All steps are system role",
15
+ "expectations": [
16
+ { "text": "read-state is system", "assertion": "step-role:read-state:system" },
17
+ { "text": "present-flow is system", "assertion": "step-role:present-flow:system" }
18
+ ]
19
+ },
20
+ {
21
+ "id": "df-presentation-gate",
22
+ "description": "Present-flow step has a gate for user confirmation",
23
+ "expectations": [
24
+ { "text": "present-flow has gate", "assertion": "gate-exists:present-flow" }
25
+ ]
26
+ },
27
+ {
28
+ "id": "df-dependencies",
29
+ "description": "Present-flow requires session state",
30
+ "expectations": [
31
+ { "text": "present-flow requires session-state", "assertion": "step-requires:present-flow:session-state" },
32
+ { "text": "present-flow requires current-phase", "assertion": "step-requires:present-flow:current-phase" }
33
+ ]
34
+ }
35
+ ]
36
+ }
@@ -0,0 +1,16 @@
1
+ {
2
+ "skill": "dev-flow",
3
+ "matcherType": "keyword",
4
+ "description": "Shows current pipeline phase and what comes next",
5
+ "threshold": 0.3,
6
+ "tests": [
7
+ { "prompt": "what phase am I in", "shouldTrigger": true },
8
+ { "prompt": "show the current pipeline phase", "shouldTrigger": true },
9
+ { "prompt": "what comes next in the flow", "shouldTrigger": true },
10
+ { "prompt": "where did I leave off", "shouldTrigger": true, "keywordExpected": false },
11
+ { "prompt": "create a pull request", "shouldTrigger": false },
12
+ { "prompt": "review my code", "shouldTrigger": false },
13
+ { "prompt": "fix this bug", "shouldTrigger": false },
14
+ { "prompt": "run the tests", "shouldTrigger": false }
15
+ ]
16
+ }