guild-agents 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -0
- package/bin/guild.js +73 -0
- package/package.json +5 -2
- package/src/commands/eval.js +225 -0
- package/src/commands/stats.js +147 -0
- package/src/commands/workspace.js +38 -1
- package/src/templates/skills/build-feature/evals/evals.json +53 -0
- package/src/templates/skills/build-feature/evals/triggers.json +16 -0
- package/src/templates/skills/council/SKILL.md +27 -6
- package/src/templates/skills/council/evals/evals.json +41 -0
- package/src/templates/skills/council/evals/triggers.json +16 -0
- package/src/templates/skills/create-pr/evals/evals.json +44 -0
- package/src/templates/skills/create-pr/evals/triggers.json +16 -0
- package/src/templates/skills/debug/SKILL.md +1 -1
- package/src/templates/skills/debug/evals/triggers.json +16 -0
- package/src/templates/skills/dev-flow/evals/evals.json +36 -0
- package/src/templates/skills/dev-flow/evals/triggers.json +16 -0
- package/src/templates/skills/guild-specialize/evals/evals.json +54 -0
- package/src/templates/skills/guild-specialize/evals/triggers.json +16 -0
- package/src/templates/skills/new-feature/evals/evals.json +41 -0
- package/src/templates/skills/new-feature/evals/triggers.json +16 -0
- package/src/templates/skills/qa-cycle/evals/evals.json +46 -0
- package/src/templates/skills/qa-cycle/evals/triggers.json +16 -0
- package/src/templates/skills/re-specialize/evals/evals.json +48 -0
- package/src/templates/skills/re-specialize/evals/triggers.json +16 -0
- package/src/templates/skills/review/evals/evals.json +43 -0
- package/src/templates/skills/review/evals/triggers.json +16 -0
- package/src/templates/skills/session-end/evals/evals.json +40 -0
- package/src/templates/skills/session-end/evals/triggers.json +16 -0
- package/src/templates/skills/session-start/evals/evals.json +50 -0
- package/src/templates/skills/session-start/evals/triggers.json +16 -0
- package/src/templates/skills/status/evals/evals.json +40 -0
- package/src/templates/skills/status/evals/triggers.json +16 -0
- package/src/templates/skills/tdd/evals/triggers.json +16 -0
- package/src/templates/skills/verify/evals/triggers.json +16 -0
- package/src/utils/accounting.js +139 -0
- package/src/utils/benchmark.js +128 -0
- package/src/utils/description-analyzer.js +92 -0
- package/src/utils/eval-runner.js +139 -0
- package/src/utils/pricing.js +28 -0
- package/src/utils/semantic-matcher.js +91 -0
- package/src/utils/trigger-matcher.js +64 -0
- package/src/utils/trigger-runner.js +132 -0
- package/src/utils/workspace.js +89 -0
package/README.md
CHANGED
|
@@ -93,11 +93,27 @@ guild list # List agents and skills
|
|
|
93
93
|
guild run <skill> # Preview a skill's execution plan (dry-run)
|
|
94
94
|
guild logs # View execution traces
|
|
95
95
|
guild logs clean # Remove old traces (--days N, --all)
|
|
96
|
+
guild stats # Token usage and cost estimates
|
|
97
|
+
guild eval # Run structural skill evaluations
|
|
98
|
+
guild eval --triggers # Run trigger accuracy tests (keyword matcher)
|
|
99
|
+
guild eval --semantic # Run trigger tests with LLM semantic matcher
|
|
100
|
+
guild eval --suggest # Show description improvement suggestions
|
|
96
101
|
guild workspace init <name> <members...> # Create a workspace
|
|
97
102
|
guild workspace add <path> # Add a member repo
|
|
98
103
|
guild workspace status # Show workspace state
|
|
99
104
|
```
|
|
100
105
|
|
|
106
|
+
## Skill Evaluations
|
|
107
|
+
|
|
108
|
+
Guild includes a built-in evaluation framework for validating skill quality:
|
|
109
|
+
|
|
110
|
+
- **Structural evals** (`guild eval`) -- assert workflow structure: steps exist, roles are correct, gates are present
|
|
111
|
+
- **Trigger tests** (`guild eval --triggers`) -- verify that user prompts route to the correct skill using keyword overlap scoring
|
|
112
|
+
- **Semantic matcher** (`guild eval --semantic`) -- optional LLM-based scoring via Anthropic Haiku for higher-fidelity trigger testing (requires `ANTHROPIC_API_KEY`)
|
|
113
|
+
- **Description suggestions** (`guild eval --suggest`) -- analyzes keyword gaps in skill descriptions based on failed triggers
|
|
114
|
+
|
|
115
|
+
Every trigger run automatically records results to `benchmarks/benchmark.json` (rolling 30-entry history) and generates `benchmarks/benchmark.md` with per-skill accuracy, precision, recall, and delta vs previous run. Regressions (>5% accuracy drop with 2+ tests flipped) are flagged automatically.
|
|
116
|
+
|
|
101
117
|
## Under the Hood
|
|
102
118
|
|
|
103
119
|
Guild coordinates 10 specialized agents through the pipeline. Each agent handles one phase.
|
package/bin/guild.js
CHANGED
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
* guild status — view project status
|
|
9
9
|
* guild doctor — verify setup and report issues
|
|
10
10
|
* guild list — list installed agents and skills
|
|
11
|
+
* guild stats — view token usage and cost stats
|
|
11
12
|
*/
|
|
12
13
|
|
|
13
14
|
import { program } from 'commander';
|
|
@@ -168,6 +169,51 @@ logsCmd
|
|
|
168
169
|
}
|
|
169
170
|
});
|
|
170
171
|
|
|
172
|
+
// guild eval — structural evals by default; any of --triggers, --semantic or
// --suggest routes to the trigger-test runner instead.
program
  .command('eval')
  .description('Run skill structural evaluations')
  .argument('[skill]', 'Skill name to evaluate (or all if omitted)')
  .option('--triggers', 'Run trigger tests instead of structural evals')
  .option('--semantic', 'Use LLM-based semantic matcher for trigger tests')
  .option('--suggest', 'Show description improvement suggestions')
  .action(async (skill, options) => {
    try {
      const { triggers, semantic, suggest } = options;
      // The command module is loaded on demand, inside the action.
      const evalCommands = await import('../src/commands/eval.js');

      if (triggers || semantic || suggest) {
        await evalCommands.runEvalTriggers(skill, {
          semantic: semantic || false,
          suggest: suggest || false,
        });
        return;
      }

      await evalCommands.runEval(skill);
    } catch (err) {
      console.error(err.message);
      process.exit(1);
    }
  });
|
|
197
|
+
|
|
198
|
+
// guild stats — all flags are forwarded untouched to runStats().
program
  .command('stats')
  .description('View token usage stats and cost estimates')
  .option('--period <period>', 'Filter by period: today, week, month, all', 'month')
  .option('--compare', 'Compare cost across model profiles')
  .option('--reset', 'Delete all usage history')
  .option('-f, --force', 'Skip confirmation prompt (for --reset)')
  .option('--export <format>', 'Export data (csv)')
  .action(async (options) => {
    try {
      // The command module is loaded on demand, inside the action.
      const statsCommands = await import('../src/commands/stats.js');
      await statsCommands.runStats(options);
    } catch (err) {
      console.error(err.message);
      process.exit(1);
    }
  });
|
|
216
|
+
|
|
171
217
|
// guild workspace
|
|
172
218
|
const workspaceCmd = program
|
|
173
219
|
.command('workspace')
|
|
@@ -225,4 +271,31 @@ workspaceCmd
|
|
|
225
271
|
}
|
|
226
272
|
});
|
|
227
273
|
|
|
274
|
+
// guild workspace run — prints one status line per member (plus captured
// output for failures) and exits with code 1 if any member failed.
workspaceCmd
  .command('run')
  .description('Run a command in a workspace member repo')
  .argument('[member]', 'Member name (or omit with --all)')
  .argument('[preset]', 'Preset command: test, lint, build')
  .option('--cmd <command>', 'Custom command to run')
  .option('--all', 'Run in all workspace members')
  .action(async (member, preset, options) => {
    try {
      const workspaceCommands = await import('../src/commands/workspace.js');
      const results = workspaceCommands.runWorkspaceCommand(member, preset, options);

      let anyFailed = false;
      for (const { member: name, status, duration, output } of results) {
        const hasFailed = status === 'failed';
        const icon = status === 'passed' ? '\u2705' : '\u274C';
        console.log(`${icon} ${name}: ${status} (${duration}ms)`);
        if (hasFailed && output) {
          console.log(output);
        }
        anyFailed = anyFailed || hasFailed;
      }

      if (anyFailed) process.exit(1);
    } catch (err) {
      console.error(err.message);
      process.exit(1);
    }
  });
|
|
300
|
+
|
|
228
301
|
program.parse();
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "guild-agents",
|
|
3
|
-
"version": "1.2.0",
|
|
3
|
+
"version": "1.4.0",
|
|
4
4
|
"description": "Specification-driven development CLI for Claude Code — think before you build",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"files": [
|
|
@@ -28,7 +28,10 @@
|
|
|
28
28
|
"publish:snapshot": "npm run version:snapshot && npm publish --tag snapshot",
|
|
29
29
|
"publish:beta": "npm run version:beta && npm publish --tag beta",
|
|
30
30
|
"publish:stable": "npm run version:stable && npm publish --tag latest",
|
|
31
|
-
"publish:promote-beta": "npm dist-tag add guild-agents@$(node --input-type=commonjs -p \"require('./package.json').version\") beta"
|
|
31
|
+
"publish:promote-beta": "npm dist-tag add guild-agents@$(node --input-type=commonjs -p \"require('./package.json').version\") beta",
|
|
32
|
+
"eval": "node scripts/run-evals.js",
|
|
33
|
+
"eval:build-feature": "node scripts/run-evals.js build-feature",
|
|
34
|
+
"eval:council": "node scripts/run-evals.js council"
|
|
32
35
|
},
|
|
33
36
|
"keywords": [
|
|
34
37
|
"claude",
|
|
package/src/commands/eval.js
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* eval.js — Run skill structural evaluations.
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* guild eval — Run all skills that have evals
|
|
6
|
+
* guild eval build-feature — Run evals for a specific skill
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import * as p from '@clack/prompts';
|
|
10
|
+
import chalk from 'chalk';
|
|
11
|
+
import { readdirSync, readFileSync, writeFileSync } from 'fs';
|
|
12
|
+
import { join, dirname } from 'path';
|
|
13
|
+
import { fileURLToPath } from 'url';
|
|
14
|
+
import { loadEvals, runEvals } from '../utils/eval-runner.js';
|
|
15
|
+
import { loadTriggers, runTriggerTests, computeAccuracy, loadAllSkillDescriptions } from '../utils/trigger-runner.js';
|
|
16
|
+
|
|
17
|
+
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Bundled skill templates, resolved relative to this module: ../templates/skills.
const SKILLS_DIR = join(__dirname, '../templates/skills');
|
|
19
|
+
|
|
20
|
+
/**
 * Runs structural evals for one skill, or for every skill directory under
 * SKILLS_DIR for which loadEvals() returns something.
 *
 * Logs one line per eval (failed evals also list their failing expectations),
 * prints a pass/fail summary, and exits the process with code 1 when at least
 * one eval failed or a skill's evals threw.
 *
 * @param {string} [skillName] - Specific skill to evaluate, or all if omitted
 */
export async function runEval(skillName) {
  let skills;
  if (skillName) {
    skills = [skillName];
  } else {
    skills = readdirSync(SKILLS_DIR, { withFileTypes: true })
      .filter((entry) => entry.isDirectory())
      .map((entry) => entry.name)
      .filter((name) => loadEvals(name) !== null);
  }

  p.intro(chalk.bold.cyan(`Guild Eval — ${skillName || 'all skills'}`));

  const tally = { passed: 0, failed: 0 };

  for (const skill of skills) {
    try {
      const { results } = runEvals(skill);
      for (const evalResult of results) {
        const line = `${chalk.gray(skill)} ${evalResult.description}`;
        if (evalResult.passed) {
          p.log.success(line);
          tally.passed++;
          continue;
        }

        p.log.error(line);
        const unmet = evalResult.expectations.filter((e) => !e.passed);
        for (const exp of unmet) {
          p.log.info(chalk.red(` ↳ ${exp.text}: ${exp.evidence}`));
        }
        tally.failed++;
      }
    } catch (err) {
      // A skill whose evals cannot be run counts as a single failure.
      p.log.error(`${skill}: ${err.message}`);
      tally.failed++;
    }
  }

  let summary = `${tally.passed + tally.failed} evals: ${chalk.green(`${tally.passed} passed`)}`;
  if (tally.failed > 0) {
    summary += `, ${chalk.red(`${tally.failed} failed`)}`;
  }
  p.outro(summary);

  if (tally.failed > 0) process.exit(1);
}
|
|
63
|
+
|
|
64
|
+
/**
 * Runs trigger tests for one skill, or for every skill directory under
 * SKILLS_DIR that has triggers, then:
 *   1. prints per-skill accuracy / precision / recall and each mismatched prompt,
 *   2. records the run in benchmarks/benchmark.json and rewrites benchmarks/benchmark.md,
 *   3. warns about regressions against the previous recorded run,
 *   4. optionally prints description keyword suggestions.
 *
 * When no skill was actually tested (e.g. the requested skill has no
 * triggers.json) nothing is recorded, so an empty entry cannot become the
 * baseline of the next comparison.
 *
 * Exits the process with code 1 when semantic mode is requested without
 * ANTHROPIC_API_KEY.
 *
 * @param {string} [skillName] - Specific skill or all
 * @param {object} [options] - CLI options
 * @param {boolean} [options.semantic=false] - Use semantic matcher
 * @param {boolean} [options.suggest=false] - Show description suggestions
 * @returns {Promise<void>}
 */
export async function runEvalTriggers(skillName, options = {}) {
  const { semantic = false, suggest = false } = options;
  const allSkills = loadAllSkillDescriptions();

  // Semantic mode cannot work without an API key: bail out before any test runs.
  if (semantic && !process.env.ANTHROPIC_API_KEY) {
    p.log.warn(chalk.yellow('ANTHROPIC_API_KEY not set — semantic matcher requires it'));
    process.exit(1);
  }

  // Lazy-load the semantic matcher only when needed.
  let scoreMatchSemantic;
  if (semantic) {
    ({ scoreMatchSemantic } = await import('../utils/semantic-matcher.js'));
  }

  const skills = skillName
    ? [skillName]
    : readdirSync(SKILLS_DIR, { withFileTypes: true })
      .filter((d) => d.isDirectory())
      .map((d) => d.name)
      .filter((name) => loadTriggers(name) !== null);

  const matcherLabel = semantic ? 'semantic' : 'keyword';
  p.intro(chalk.bold.cyan(`Guild Trigger Tests [${matcherLabel}] — ${skillName || 'all skills'}`));

  let totalTests = 0;
  let totalCorrect = 0;
  const allResults = [];
  const benchmarkSkills = [];

  for (const skill of skills) {
    const triggers = loadTriggers(skill);
    if (!triggers) {
      p.log.warn(`${skill}: no triggers.json`);
      continue;
    }

    const results = await runTriggerTests(triggers, allSkills, { semantic, scoreMatchSemantic });
    const acc = computeAccuracy(results);
    totalTests += acc.total;
    totalCorrect += acc.tp + acc.tn;

    let icon = chalk.red('✗');
    if (acc.accuracy === 1) {
      icon = chalk.green('✓');
    } else if (acc.accuracy >= 0.75) {
      icon = chalk.yellow('~');
    }
    p.log.info(`${icon} ${chalk.bold(skill)} accuracy=${(acc.accuracy * 100).toFixed(0)}% precision=${(acc.precision * 100).toFixed(0)}% recall=${(acc.recall * 100).toFixed(0)}%`);

    logTriggerFailures(results);

    allResults.push({ skill, results, triggers });
    benchmarkSkills.push({
      name: skill,
      accuracy: acc.accuracy,
      precision: acc.precision,
      recall: acc.recall,
      tp: acc.tp,
      fp: acc.fp,
      fn: acc.fn,
      tn: acc.tn,
    });
  }

  const totalSkills = benchmarkSkills.length;
  const overallAcc = totalTests > 0 ? ((totalCorrect / totalTests) * 100).toFixed(0) : 0;

  if (totalSkills === 0) {
    // The entry before the newest one is used as the comparison baseline below,
    // so an empty run must not be written to the history.
    p.log.warn('No trigger tests were run — benchmark not recorded');
  } else {
    const { recordBenchmark, generateReport, detectRegressions } = await import('../utils/benchmark.js');
    const benchmarkDir = join(__dirname, '..', '..', 'benchmarks');
    const benchmarkPath = join(benchmarkDir, 'benchmark.json');
    const reportPath = join(benchmarkDir, 'benchmark.md');

    const entry = {
      timestamp: new Date().toISOString(),
      matcher: matcherLabel,
      model: semantic ? (process.env.GUILD_SEMANTIC_MODEL || 'claude-haiku-4-5-20251001') : null,
      skills: benchmarkSkills,
      aggregate: {
        accuracy: totalTests > 0 ? totalCorrect / totalTests : 0,
        // Precision and recall are unweighted means of the per-skill values.
        precision: benchmarkSkills.reduce((sum, sk) => sum + sk.precision, 0) / totalSkills,
        recall: benchmarkSkills.reduce((sum, sk) => sum + sk.recall, 0) / totalSkills,
        total: totalTests,
      },
    };

    recordBenchmark(entry, benchmarkPath);

    // Re-read the history; the entry before the one just recorded is the baseline.
    const entries = JSON.parse(readFileSync(benchmarkPath, 'utf8'));
    const previous = entries.length >= 2 ? entries[entries.length - 2] : null;

    writeFileSync(reportPath, generateReport(entry, previous));
    p.log.info(chalk.gray(`Benchmark recorded → benchmarks/benchmark.json`));

    const regressions = detectRegressions(entry, previous);
    if (regressions.length > 0) {
      p.log.warn(chalk.yellow.bold('Regressions detected:'));
      for (const reg of regressions) {
        p.log.warn(chalk.yellow(` ${reg.skill}: ${(reg.previousAccuracy * 100).toFixed(0)}% → ${(reg.currentAccuracy * 100).toFixed(0)}% (${reg.flippedTests} tests flipped)`));
      }
    }
  }

  if (suggest) {
    await printDescriptionSuggestions(allResults, allSkills);
  }

  p.outro(`${totalSkills} skills, ${totalTests} tests, ${overallAcc}% overall accuracy`);
}

/**
 * Logs every trigger test whose actual outcome differs from the expected one.
 * MISS = expected to trigger but did not; FALSE+ = triggered unexpectedly.
 */
function logTriggerFailures(results) {
  for (const r of results) {
    if (r.expected === r.actual) continue;

    const label = r.expected ? chalk.red('MISS') : chalk.yellow('FALSE+');
    const details = [`score=${r.score.toFixed(2)}`];
    // `!= null` also skips an absent (undefined) rank instead of printing "rank=#undefined".
    if (r.rank != null) details.push(`rank=#${r.rank}`);
    if (r.reasoning) details.push(`reason: ${r.reasoning}`);
    p.log.info(chalk.gray(` ${label} "${r.prompt}" (${details.join(', ')})`));
  }
}

/**
 * Analyzes keyword gaps per skill and prints the suggested keywords grouped by
 * confidence, or a success line when no gaps were found.
 */
async function printDescriptionSuggestions(allResults, allSkills) {
  const { analyzeGaps, generateSuggestions } = await import('../utils/description-analyzer.js');

  const gapsList = [];
  for (const { skill, results, triggers } of allResults) {
    // Prefer the loaded skill description; fall back to the one in triggers.json.
    const skillDesc = allSkills.find((s) => s.name === skill);
    const currentDescription = skillDesc?.description || triggers.description;
    const gaps = analyzeGaps(results, currentDescription);
    if (gaps.missingKeywords.length > 0) {
      gapsList.push({ skill, currentDescription, ...gaps });
    }
  }

  const suggestions = generateSuggestions(gapsList);
  if (suggestions.length === 0) {
    p.log.success('No description gaps found');
    return;
  }

  p.log.info('');
  p.log.info(chalk.bold.cyan('Description Suggestions:'));
  for (const sug of suggestions) {
    const wordsWith = (level) => sug.suggestedKeywords
      .filter((k) => k.confidence === level)
      .map((k) => k.word);
    const highWords = wordsWith('high');
    const medWords = wordsWith('medium');

    const parts = [];
    if (highWords.length > 0) parts.push(`${highWords.join(', ')} (high)`);
    if (medWords.length > 0) parts.push(`${medWords.join(', ')} (medium)`);

    p.log.warn(` ${chalk.bold(sug.skill)} — ${sug.suggestedKeywords.length} missing keywords`);
    p.log.info(chalk.gray(` Missing: ${parts.join(', ')}`));
    p.log.info(chalk.gray(` Current: "${sug.currentDescription}"`));
  }
}
|
|
package/src/commands/stats.js
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* stats.js — Token usage stats command.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import * as p from '@clack/prompts';
|
|
6
|
+
import chalk from 'chalk';
|
|
7
|
+
import { existsSync, unlinkSync, copyFileSync } from 'fs';
|
|
8
|
+
import { join } from 'path';
|
|
9
|
+
import { loadUsage, aggregate, estimateWithProfile } from '../utils/accounting.js';
|
|
10
|
+
import { getModelShortName } from '../utils/pricing.js';
|
|
11
|
+
|
|
12
|
+
// Usage history file, relative to the project root it is later joined with.
const USAGE_PATH = join('.claude/guild', 'usage.json');
|
|
13
|
+
|
|
14
|
+
/** Formats a number with en-US digit grouping, e.g. 1234567 -> "1,234,567". */
function fmt(n) {
  const locale = 'en-US';
  return n.toLocaleString(locale);
}
|
|
17
|
+
|
|
18
|
+
/** Formats a number as a dollar amount with exactly two decimals, e.g. "$3.14". */
function usd(n) {
  const amount = n.toFixed(2);
  return '$' + amount;
}
|
|
21
|
+
|
|
22
|
+
/**
 * Returns `part` as a whole-number percentage of `total`, e.g. pct(1, 4) -> "25%".
 * A zero total yields "0%" rather than dividing by zero.
 */
function pct(part, total) {
  const share = total === 0 ? 0 : Math.round((part / total) * 100);
  return `${share}%`;
}
|
|
26
|
+
|
|
27
|
+
// Human-readable heading for each period key; an unknown key is shown as-is
// by the caller.
const PERIOD_LABELS = {
  'today': 'Today',
  'week': 'Last 7 days',
  'month': 'Last 30 days',
  'all': 'All time',
};
|
|
33
|
+
|
|
34
|
+
/**
 * Serializes usage entries as CSV: a header row, one row per entry, and a
 * trailing newline.
 *
 * Cells containing a comma, double quote, CR or LF are wrapped in double
 * quotes with embedded quotes doubled, so free-form values (such as workflow
 * names) cannot shift columns. Missing (null/undefined) fields become empty
 * cells, and a missing or non-numeric cost is written as 0.000000 instead of
 * throwing.
 *
 * @param {Array<object>} entries - Usage records carrying the properties named in the header row.
 * @returns {string} CSV text.
 */
export function formatCsv(entries) {
  const textColumns = [
    'timestamp',
    'workflow',
    'agent',
    'tier',
    'model',
    'inputTokens',
    'outputTokens',
    'totalTokens',
  ];

  const escapeCell = (value) => {
    if (value == null) return '';
    const text = String(value);
    return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
  };

  const header = [...textColumns, 'estimatedCostUSD'].join(',');
  const rows = entries.map((entry) => {
    const cost = Number(entry.estimatedCostUSD);
    const cells = textColumns.map((column) => escapeCell(entry[column]));
    cells.push((Number.isFinite(cost) ? cost : 0).toFixed(6));
    return cells.join(',');
  });

  return [header, ...rows].join('\n') + '\n';
}
|
|
41
|
+
|
|
42
|
+
/**
 * Entry point for `guild stats`. Modes are checked in this order:
 *   1. `reset`  — delegate to handleReset() and return its result,
 *   2. `export` — write all usage entries to stdout as CSV,
 *   3. default  — print a usage report for the selected period.
 *
 * @param {object} [options]
 * @param {string} [options.period='month'] - Period key passed to aggregate().
 * @param {boolean} [options.compare] - Also print a cost comparison across profiles.
 * @param {boolean} [options.reset] - Delete the usage history instead of reporting.
 * @param {boolean} [options.force] - Skip the reset confirmation prompt.
 * @param {string} [options.export] - Export format; only 'csv' is supported.
 * @returns {Promise<void>}
 * @throws {Error} When an export format other than 'csv' is requested.
 */
export async function runStats(options = {}) {
  const root = process.cwd();

  if (options.reset) {
    return handleReset(root, options.force);
  }

  // Reject unknown formats explicitly instead of silently printing the
  // interactive report into whatever the caller is piping the export to.
  if (options.export != null && options.export !== 'csv') {
    throw new Error(`Unsupported export format: "${options.export}". Supported formats: csv.`);
  }

  if (options.export === 'csv') {
    const usage = loadUsage(root);
    if (usage.entries.length === 0) {
      console.log('No usage data to export.');
      return;
    }
    process.stdout.write(formatCsv(usage.entries));
    return;
  }

  const period = options.period || 'month';
  const totals = aggregate(root, period);

  p.intro(chalk.bold.cyan(`Guild Usage Stats — ${PERIOD_LABELS[period] || period}`));

  if (totals.totalTokens === 0) {
    p.log.info('No usage data yet. Token tracking will begin when workflows record usage.');
    p.outro('');
    return;
  }

  p.log.step('Summary');
  p.log.info(` Workflows executed: ${chalk.bold(fmt(totals.workflowCount))}`);
  p.log.info(` Total tokens: ${chalk.bold(fmt(totals.totalTokens))}`);
  p.log.info(` Estimated cost: ${chalk.bold.green(usd(totals.totalCostUSD))}`);

  const tierRows = Object.entries(totals.tokensByTier);
  if (tierRows.length > 0) {
    p.log.step('By tier');
    for (const [tier, tokens] of tierRows) {
      p.log.info(` ${tier.padEnd(12)} ${fmt(tokens).padStart(10)} tok (${pct(tokens, totals.totalTokens).padStart(4)})`);
    }
  }

  const modelRows = Object.entries(totals.tokensByModel);
  if (modelRows.length > 0) {
    p.log.step('By model');
    for (const [model, tokens] of modelRows) {
      p.log.info(` ${getModelShortName(model).padEnd(12)} ${fmt(tokens).padStart(10)} tok`);
    }
  }

  const workflowRows = Object.entries(totals.tokensByWorkflow);
  if (workflowRows.length > 0) {
    p.log.step('Top workflows');
    // Heaviest token consumers first.
    workflowRows.sort((a, b) => b[1] - a[1]);
    for (const [wf, tokens] of workflowRows) {
      p.log.info(` ${wf.padEnd(20)} ${fmt(tokens).padStart(10)} tok`);
    }
  }

  if (options.compare) {
    // NOTE(review): the comparison runs over ALL recorded entries, while the
    // totals above cover only `period` — confirm whether it should be filtered too.
    const { entries } = loadUsage(root);
    const maxCost = estimateWithProfile(entries, 'max');
    const proCost = estimateWithProfile(entries, 'pro');
    const allOpusCost = estimateWithProfile(entries, 'all-opus');

    // 'max' is the baseline the other profiles are diffed against.
    p.log.step('Profile comparison');
    p.log.info(` ${'max'.padEnd(12)} ${usd(maxCost).padStart(10)} —`);
    p.log.info(` ${'pro'.padEnd(12)} ${usd(proCost).padStart(10)} ${diffLabel(proCost, maxCost)}`);
    p.log.info(` ${'all-opus'.padEnd(12)} ${usd(allOpusCost).padStart(10)} ${diffLabel(allOpusCost, maxCost)}`);
  }

  p.outro('');
}
|
|
112
|
+
|
|
113
|
+
/**
 * Renders the relative difference between `cost` and `baseline` as a gray,
 * signed, whole-number percentage (e.g. "+25%", "-40%").
 * Returns an empty string when the baseline is zero.
 */
function diffLabel(cost, baseline) {
  if (baseline === 0) {
    return '';
  }

  const relativeChange = ((cost - baseline) / baseline) * 100;
  // The sign comes from the unrounded value; negatives already carry their "-".
  let prefix = '';
  if (relativeChange >= 0) {
    prefix = '+';
  }
  return chalk.gray(`${prefix}${Math.round(relativeChange)}%`);
}
|
|
119
|
+
|
|
120
|
+
/**
 * Deletes the usage history file under `root`, after copying it to a `.bak`
 * file next to it. Asks for confirmation unless `force` is truthy; a declined
 * or cancelled prompt leaves everything untouched.
 *
 * @param {string} root - Project root that USAGE_PATH is resolved against.
 * @param {boolean} [force] - Skip the confirmation prompt.
 * @returns {Promise<void>}
 */
async function handleReset(root, force) {
  const usageFile = join(root, USAGE_PATH);

  p.intro(chalk.bold.cyan('Guild — Reset Usage Stats'));

  const hasHistory = existsSync(usageFile);
  if (!hasHistory) {
    p.log.info('No usage data found. Nothing to reset.');
    p.outro('');
    return;
  }

  if (!force) {
    const answer = await p.confirm({
      message: 'This will delete all usage history. Continue?',
      initialValue: false,
    });

    const declined = p.isCancel(answer) || !answer;
    if (declined) {
      p.cancel('Reset cancelled.');
      return;
    }
  }

  // Back up first so the history can be restored by hand.
  const backupFile = filePathWithBak(usageFile);
  copyFileSync(usageFile, backupFile);
  unlinkSync(usageFile);
  p.log.success(`${chalk.green('✓')} Usage history deleted. Backup saved as usage.json.bak.`);
  p.outro('');

  function filePathWithBak(path) {
    return path + '.bak';
  }
}
|
|
package/src/commands/workspace.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'fs';
|
|
2
2
|
import { basename, join } from 'path';
|
|
3
|
-
import { findWorkspaceRoot, WORKSPACE_FILE } from '../utils/workspace.js';
|
|
3
|
+
import { findWorkspaceRoot, loadWorkspace, runInMember, PRESET_COMMANDS, WORKSPACE_FILE } from '../utils/workspace.js';
|
|
4
4
|
|
|
5
5
|
export async function createWorkspaceFile(name, memberPaths) {
|
|
6
6
|
const members = memberPaths.map(p => ({
|
|
@@ -38,6 +38,43 @@ export async function addWorkspaceMember(memberPath) {
|
|
|
38
38
|
writeFileSync(filePath, JSON.stringify(config, null, 2) + '\n', 'utf8');
|
|
39
39
|
}
|
|
40
40
|
|
|
41
|
+
/**
 * Runs a preset or custom command in one workspace member, or in all of them.
 *
 * @param {string} [memberName] - Member to run in. With `options.all` and no
 *   `preset`, this positional is treated as the preset name, so that
 *   `guild workspace run --all test` works (the CLI binds the first positional
 *   to the member argument).
 * @param {string} [preset] - Key of PRESET_COMMANDS.
 * @param {object} [options]
 * @param {string} [options.cmd] - Custom command line; takes precedence over
 *   the preset. Split on whitespace, so quoted arguments are not supported.
 * @param {boolean} [options.all] - Run in every workspace member.
 * @returns {Array} One runInMember() result per target, in member order.
 * @throws {Error} When no workspace is found, the command is missing or
 *   unknown, or the member does not exist.
 */
export function runWorkspaceCommand(memberName, preset, options = {}) {
  const workspace = loadWorkspace();
  if (!workspace) throw new Error('No workspace found. Run `guild workspace init` first.');

  // With --all there is no member to name, so a lone positional is the preset.
  const presetName = preset ?? (options.all && !options.cmd ? memberName : undefined);

  // Resolve command
  let cmd;
  let args;
  if (options.cmd) {
    // Trim first: leading whitespace would otherwise yield an empty executable.
    [cmd, ...args] = options.cmd.trim().split(/\s+/);
    if (!cmd) throw new Error('Empty command passed to --cmd.');
  } else if (!presetName) {
    throw new Error('No command given. Use test, lint, build, or --cmd "...".');
  } else if (Object.prototype.hasOwnProperty.call(PRESET_COMMANDS, presetName)) {
    // Own-property check so inherited keys such as "constructor" are rejected.
    ({ cmd, args } = PRESET_COMMANDS[presetName]);
  } else {
    throw new Error(`Unknown command: "${presetName}". Use test, lint, build, or --cmd "...".`);
  }

  // Resolve members
  let targets;
  if (options.all) {
    targets = workspace.members;
  } else {
    const member = workspace.members.find((m) => m.name === memberName);
    if (!member) {
      const available = workspace.members.map((m) => m.name).join(', ');
      throw new Error(`Member "${memberName}" not found. Available: ${available}`);
    }
    targets = [member];
  }

  // Execute sequentially and collect every result, including failures.
  return targets.map((target) => runInMember(target, cmd, args));
}
|
|
77
|
+
|
|
41
78
|
export async function getWorkspaceStatus() {
|
|
42
79
|
const root = findWorkspaceRoot();
|
|
43
80
|
if (!root) throw new Error('No workspace found. Run `guild workspace init` first.');
|
|
package/src/templates/skills/build-feature/evals/evals.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "build-feature",
|
|
3
|
+
"evals": [
|
|
4
|
+
{
|
|
5
|
+
"id": "bf-has-core-phases",
|
|
6
|
+
"description": "Plan contains evaluate, specify, design, implement phases",
|
|
7
|
+
"expectations": [
|
|
8
|
+
{ "text": "Has evaluate step", "assertion": "step-exists:evaluate" },
|
|
9
|
+
{ "text": "Has specify step", "assertion": "step-exists:specify" },
|
|
10
|
+
{ "text": "Has design step", "assertion": "step-exists:design" },
|
|
11
|
+
{ "text": "Has implement step", "assertion": "step-exists:implement" }
|
|
12
|
+
]
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"id": "bf-has-quality-phases",
|
|
16
|
+
"description": "Plan contains review, QA, and completion phases",
|
|
17
|
+
"expectations": [
|
|
18
|
+
{ "text": "Has review step", "assertion": "step-exists:review" },
|
|
19
|
+
{ "text": "Has QA phase", "assertion": "step-exists:qa-phase" },
|
|
20
|
+
{ "text": "Has completion step", "assertion": "step-exists:completion" }
|
|
21
|
+
]
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"id": "bf-advisor-uses-reasoning",
|
|
25
|
+
"description": "Advisor (evaluate) uses reasoning tier",
|
|
26
|
+
"expectations": [
|
|
27
|
+
{ "text": "Evaluate uses reasoning tier", "assertion": "step-model-tier:evaluate:reasoning" }
|
|
28
|
+
]
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
"id": "bf-developer-uses-execution",
|
|
32
|
+
"description": "Developer (implement) uses execution tier",
|
|
33
|
+
"expectations": [
|
|
34
|
+
{ "text": "Implement uses execution tier", "assertion": "step-model-tier:implement:execution" }
|
|
35
|
+
]
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
"id": "bf-gates-exist",
|
|
39
|
+
"description": "Quality gates exist at pre-review and final",
|
|
40
|
+
"expectations": [
|
|
41
|
+
{ "text": "Pre-review gate exists", "assertion": "gate-exists:gate-pre-review" },
|
|
42
|
+
{ "text": "Final gate exists", "assertion": "gate-exists:gate-final" }
|
|
43
|
+
]
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"id": "bf-minimum-steps",
|
|
47
|
+
"description": "Plan has at least 10 steps",
|
|
48
|
+
"expectations": [
|
|
49
|
+
{ "text": "At least 10 steps", "assertion": "step-count:10" }
|
|
50
|
+
]
|
|
51
|
+
}
|
|
52
|
+
]
|
|
53
|
+
}
|
|
package/src/templates/skills/build-feature/evals/triggers.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "build-feature",
|
|
3
|
+
"matcherType": "keyword",
|
|
4
|
+
"description": "Full pipeline: evaluation -> spec -> implementation -> review -> QA",
|
|
5
|
+
"threshold": 0.3,
|
|
6
|
+
"tests": [
|
|
7
|
+
{ "prompt": "build a new feature with full pipeline", "shouldTrigger": true },
|
|
8
|
+
{ "prompt": "implement this feature end to end", "shouldTrigger": true, "keywordExpected": false },
|
|
9
|
+
{ "prompt": "run the full implementation pipeline", "shouldTrigger": true },
|
|
10
|
+
{ "prompt": "I want to ship this end to end", "shouldTrigger": true, "keywordExpected": false },
|
|
11
|
+
{ "prompt": "review my code", "shouldTrigger": false },
|
|
12
|
+
{ "prompt": "create a pull request", "shouldTrigger": false },
|
|
13
|
+
{ "prompt": "save my session", "shouldTrigger": false },
|
|
14
|
+
{ "prompt": "debug this bug", "shouldTrigger": false }
|
|
15
|
+
]
|
|
16
|
+
}
|