guild-agents 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/README.md +16 -0
  2. package/bin/guild.js +46 -0
  3. package/package.json +1 -1
  4. package/src/commands/eval.js +225 -0
  5. package/src/commands/stats.js +147 -0
  6. package/src/templates/skills/build-feature/evals/triggers.json +16 -0
  7. package/src/templates/skills/council/evals/triggers.json +16 -0
  8. package/src/templates/skills/create-pr/evals/evals.json +44 -0
  9. package/src/templates/skills/create-pr/evals/triggers.json +16 -0
  10. package/src/templates/skills/debug/SKILL.md +1 -1
  11. package/src/templates/skills/debug/evals/triggers.json +16 -0
  12. package/src/templates/skills/dev-flow/evals/evals.json +36 -0
  13. package/src/templates/skills/dev-flow/evals/triggers.json +16 -0
  14. package/src/templates/skills/guild-specialize/evals/evals.json +54 -0
  15. package/src/templates/skills/guild-specialize/evals/triggers.json +16 -0
  16. package/src/templates/skills/new-feature/evals/evals.json +41 -0
  17. package/src/templates/skills/new-feature/evals/triggers.json +16 -0
  18. package/src/templates/skills/qa-cycle/evals/evals.json +46 -0
  19. package/src/templates/skills/qa-cycle/evals/triggers.json +16 -0
  20. package/src/templates/skills/re-specialize/evals/evals.json +48 -0
  21. package/src/templates/skills/re-specialize/evals/triggers.json +16 -0
  22. package/src/templates/skills/review/evals/evals.json +43 -0
  23. package/src/templates/skills/review/evals/triggers.json +16 -0
  24. package/src/templates/skills/session-end/evals/evals.json +40 -0
  25. package/src/templates/skills/session-end/evals/triggers.json +16 -0
  26. package/src/templates/skills/session-start/evals/evals.json +50 -0
  27. package/src/templates/skills/session-start/evals/triggers.json +16 -0
  28. package/src/templates/skills/status/evals/evals.json +40 -0
  29. package/src/templates/skills/status/evals/triggers.json +16 -0
  30. package/src/templates/skills/tdd/evals/triggers.json +16 -0
  31. package/src/templates/skills/verify/evals/triggers.json +16 -0
  32. package/src/utils/accounting.js +139 -0
  33. package/src/utils/benchmark.js +128 -0
  34. package/src/utils/description-analyzer.js +92 -0
  35. package/src/utils/pricing.js +28 -0
  36. package/src/utils/semantic-matcher.js +91 -0
  37. package/src/utils/trigger-matcher.js +64 -0
  38. package/src/utils/trigger-runner.js +132 -0
@@ -0,0 +1,139 @@
1
+ /**
2
+ * accounting.js — Token usage recording, persistence, and aggregation.
3
+ *
4
+ * Persists usage data to .claude/guild/usage.json.
5
+ */
6
+
7
+ import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
8
+ import { join, dirname } from 'path';
9
+ import { estimateCost } from './pricing.js';
10
+
11
+ const USAGE_PATH = join('.claude', 'guild', 'usage.json');
12
+
13
/**
 * Builds a fresh, zeroed usage ledger.
 * @returns {object} Usage structure with version, timestamp, empty entries,
 *   and all totals initialized to zero / empty maps.
 */
export function emptyUsage() {
  const zeroedTotals = {
    totalTokens: 0,
    totalInputTokens: 0,
    totalOutputTokens: 0,
    totalCostUSD: 0,
    tokensByModel: {},
    tokensByTier: {},
    tokensByWorkflow: {},
    workflowCount: 0,
  };
  return {
    version: 1,
    lastUpdated: new Date().toISOString(),
    entries: [],
    totals: zeroedTotals,
  };
}
30
+
31
/**
 * Builds a single usage-ledger entry for one agent step.
 * @param {object} params
 * @param {string} params.workflow - Workflow the step belongs to
 * @param {string} params.agent - Agent that ran the step
 * @param {string} params.tier - Model tier (reasoning/execution/routine)
 * @param {string} params.model - Model id used
 * @param {number} params.inputTokens
 * @param {number} params.outputTokens
 * @returns {object} Timestamped entry with token totals and estimated cost.
 */
export function createEntry({ workflow, agent, tier, model, inputTokens, outputTokens }) {
  const entry = {
    timestamp: new Date().toISOString(),
    workflow,
    agent,
    tier,
    model,
    inputTokens,
    outputTokens,
    totalTokens: inputTokens + outputTokens,
    estimatedCostUSD: estimateCost(model, inputTokens, outputTokens),
  };
  return entry;
}
45
+
46
/**
 * Loads the usage ledger from .claude/guild/usage.json under root.
 * A missing or unparseable file yields a fresh empty ledger.
 * @param {string} root - Project root directory
 * @returns {object} Usage ledger
 */
export function loadUsage(root) {
  const filePath = join(root, USAGE_PATH);
  if (existsSync(filePath)) {
    try {
      return JSON.parse(readFileSync(filePath, 'utf8'));
    } catch {
      // Corrupted or unreadable file: fall through to a fresh ledger.
    }
  }
  return emptyUsage();
}
55
+
56
/**
 * Persists the usage ledger, creating parent directories as needed
 * and refreshing the lastUpdated timestamp.
 * @param {string} root - Project root directory
 * @param {object} usage - Usage ledger (mutated: lastUpdated is refreshed)
 */
export function saveUsage(root, usage) {
  const target = join(root, USAGE_PATH);
  mkdirSync(dirname(target), { recursive: true });
  usage.lastUpdated = new Date().toISOString();
  const serialized = `${JSON.stringify(usage, null, 2)}\n`;
  writeFileSync(target, serialized);
}
62
+
63
/**
 * Folds one entry into a totals accumulator in place.
 * @param {object} totals - Totals object (mutated)
 * @param {object} entry - Usage entry from createEntry
 */
function updateTotals(totals, entry) {
  totals.totalTokens += entry.totalTokens;
  totals.totalInputTokens += entry.inputTokens;
  totals.totalOutputTokens += entry.outputTokens;
  totals.totalCostUSD += entry.estimatedCostUSD;
  // Bucket the step's total tokens by model, tier, and workflow.
  const buckets = [
    ['tokensByModel', entry.model],
    ['tokensByTier', entry.tier],
    ['tokensByWorkflow', entry.workflow],
  ];
  for (const [bucket, key] of buckets) {
    totals[bucket][key] = (totals[bucket][key] ?? 0) + entry.totalTokens;
  }
  totals.workflowCount += 1;
}
73
+
74
/**
 * Records one agent step: load ledger, append the new entry,
 * fold it into the running totals, and write the ledger back.
 * @param {string} root - Project root directory
 * @param {object} params - Entry parameters (see createEntry)
 */
export function recordStep(root, params) {
  const ledger = loadUsage(root);
  const step = createEntry(params);
  ledger.entries.push(step);
  updateTotals(ledger.totals, step);
  saveUsage(root, ledger);
}
81
+
82
// Model profiles for "what if" cost projections (see estimateWithProfile):
// each maps a workflow tier (reasoning / execution / routine) to a model id
// from the pricing table.
const PROFILES = {
  max: { reasoning: 'claude-opus-4-6', execution: 'claude-sonnet-4-5', routine: 'claude-haiku-4-5' },
  pro: { reasoning: 'claude-sonnet-4-5', execution: 'claude-sonnet-4-5', routine: 'claude-haiku-4-5' },
  'all-opus': { reasoning: 'claude-opus-4-6', execution: 'claude-opus-4-6', routine: 'claude-opus-4-6' },
};
87
+
88
/**
 * Resolves the inclusive start date for a reporting period.
 * @param {string} period - 'today' | 'week' | 'month'; anything else = all time
 * @param {Date} now - Reference "current" time
 * @returns {Date} Cutoff; entries at or after it are included.
 */
function periodCutoff(period, now) {
  switch (period) {
    case 'today':
      // Midnight local time today.
      return new Date(now.getFullYear(), now.getMonth(), now.getDate());
    case 'week': {
      const d = new Date(now);
      d.setDate(d.getDate() - 7);
      return d;
    }
    case 'month': {
      const d = new Date(now);
      d.setDate(d.getDate() - 30);
      return d;
    }
    default:
      // Epoch: include everything.
      return new Date(0);
  }
}

/**
 * Aggregates usage totals for entries recorded within a period.
 * @param {string} root - Project root directory
 * @param {string} period - 'today' | 'week' | 'month' | anything else = all time
 * @returns {object} Totals object in the same shape as emptyUsage().totals.
 */
export function aggregate(root, period) {
  const usage = loadUsage(root);
  const cutoff = periodCutoff(period, new Date());

  // Reuse the zeroed shape from emptyUsage() so the totals schema is
  // defined in exactly one place.
  const totals = emptyUsage().totals;

  for (const entry of usage.entries) {
    if (new Date(entry.timestamp) >= cutoff) {
      updateTotals(totals, entry);
    }
  }

  return totals;
}
128
+
129
/**
 * Re-prices recorded entries as if they had run under a named profile.
 * Each entry's tier is mapped to the profile's model; entries whose tier
 * is not in the profile keep their original model.
 * @param {Array} entries - Usage entries (see createEntry)
 * @param {string} profileName - Key into PROFILES
 * @returns {number} Projected total cost in USD; 0 for an unknown profile.
 */
export function estimateWithProfile(entries, profileName) {
  const profile = PROFILES[profileName];
  if (!profile) return 0;

  return entries.reduce(
    (sum, { tier, model, inputTokens, outputTokens }) =>
      sum + estimateCost(profile[tier] || model, inputTokens, outputTokens),
    0,
  );
}
@@ -0,0 +1,128 @@
1
+ /**
2
+ * benchmark.js — Records, reports, and detects regressions in eval benchmarks.
3
+ *
4
+ * Persists results to benchmarks/benchmark.json with 30-entry rotation.
5
+ * Generates benchmarks/benchmark.md as a human-readable report.
6
+ */
7
+
8
+ import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs';
9
+ import { dirname } from 'path';
10
+
11
/** Maximum number of benchmark entries retained in benchmark.json. */
const MAX_ENTRIES = 30;

/**
 * Appends a benchmark entry to the JSON file, rotating old entries so at
 * most MAX_ENTRIES remain. A corrupted or non-array existing file is
 * discarded and the rotation restarts, mirroring the tolerant load
 * behavior used for usage.json elsewhere in this package.
 * @param {object} entry - Benchmark entry with timestamp, matcher, skills, aggregate
 * @param {string} filePath - Path to benchmark.json
 */
export function recordBenchmark(entry, filePath) {
  const dir = dirname(filePath);
  if (!existsSync(dir)) {
    mkdirSync(dir, { recursive: true });
  }

  let entries = [];
  if (existsSync(filePath)) {
    try {
      const parsed = JSON.parse(readFileSync(filePath, 'utf8'));
      // Guard against a hand-edited file that is valid JSON but not an array.
      if (Array.isArray(parsed)) entries = parsed;
    } catch {
      // Corrupted history: start a fresh rotation rather than crashing.
    }
  }

  entries.push(entry);

  // Keep only the newest MAX_ENTRIES entries.
  if (entries.length > MAX_ENTRIES) {
    entries = entries.slice(-MAX_ENTRIES);
  }

  writeFileSync(filePath, JSON.stringify(entries, null, 2));
}
37
+
38
/**
 * Generates a markdown report from a benchmark entry: a header, a per-skill
 * table with accuracy/precision/recall and deltas vs the previous run, and
 * an aggregate section. Drops of more than 5 points are flagged with "!!".
 * @param {object} current - Current benchmark entry
 * @param {object|null} previous - Previous entry for delta comparison
 * @returns {string} Markdown report
 */
export function generateReport(current, previous) {
  const pct = (value) => `${(value * 100).toFixed(1)}%`;

  // Formats an accuracy delta in percentage points ("+1.2" / "-3.4");
  // returns null when the change is below the 0.1-point display threshold.
  const formatDiff = (currentAcc, previousAcc) => {
    const diff = (currentAcc - previousAcc) * 100;
    if (Math.abs(diff) < 0.1) return null;
    return `${diff > 0 ? '+' : ''}${diff.toFixed(1)}`;
  };

  const modelSuffix = current.model ? ` (${current.model})` : '';
  const out = [
    `# Eval Benchmark — ${current.timestamp}`,
    `Matcher: ${current.matcher}${modelSuffix} | Skills: ${current.skills.length} | Total tests: ${current.aggregate.total}`,
    '',
    '| Skill | Accuracy | Precision | Recall | Delta |',
    '|-------|----------|-----------|--------|-------|',
  ];

  for (const skill of current.skills) {
    const prev = previous?.skills.find((s) => s.name === skill.name);
    let delta = '—';
    if (prev) {
      const formatted = formatDiff(skill.accuracy, prev.accuracy);
      if (formatted !== null) {
        // Flag drops of more than 5 points.
        const warn = (skill.accuracy - prev.accuracy) * 100 < -5 ? ' !!' : '';
        delta = `${formatted}%${warn}`;
      }
    }
    out.push(`| ${skill.name} | ${pct(skill.accuracy)} | ${pct(skill.precision)} | ${pct(skill.recall)} | ${delta} |`);
  }

  out.push('', '## Aggregate');

  let aggDelta = '';
  if (previous) {
    const formatted = formatDiff(current.aggregate.accuracy, previous.aggregate.accuracy);
    if (formatted !== null) {
      aggDelta = ` (Delta ${formatted}%)`;
    }
  }

  out.push(`Accuracy: ${pct(current.aggregate.accuracy)}${aggDelta}`);
  out.push(`Precision: ${pct(current.aggregate.precision)}`);
  out.push(`Recall: ${pct(current.aggregate.recall)}`);
  out.push('');

  return out.join('\n');
}
92
+
93
/**
 * Detects regressions between two benchmark entries.
 * A regression is: accuracy dropped >5% AND at least 2 tests flipped.
 * @param {object} current
 * @param {object|null} previous
 * @returns {Array<{ skill: string, currentAccuracy: number, previousAccuracy: number, delta: number, flippedTests: number }>}
 */
export function detectRegressions(current, previous) {
  if (!previous) return [];

  // Index the previous run by skill name for O(1) lookups.
  const prevByName = new Map(previous.skills.map((s) => [s.name, s]));
  const regressions = [];

  for (const skill of current.skills) {
    const prev = prevByName.get(skill.name);
    if (!prev) continue;

    const delta = skill.accuracy - prev.accuracy;
    // "Flipped" is the net change in correct results (tp + tn).
    const flippedTests = Math.abs((skill.tp + skill.tn) - (prev.tp + prev.tn));

    if (delta <= -0.05 && flippedTests >= 2) {
      regressions.push({
        skill: skill.name,
        currentAccuracy: skill.accuracy,
        previousAccuracy: prev.accuracy,
        delta,
        flippedTests,
      });
    }
  }

  return regressions;
}
@@ -0,0 +1,92 @@
1
+ /**
2
+ * description-analyzer.js — Analyzes keyword gaps in skill descriptions.
3
+ *
4
+ * Uses token analysis to identify which keywords are missing from
5
+ * skill descriptions based on failed trigger tests. No LLM required.
6
+ */
7
+
8
+ import { tokenize } from './trigger-matcher.js';
9
+
10
// Common English filler words excluded from keyword-gap analysis so the
// suggested keywords focus on domain-meaningful terms.
const STOP_WORDS = new Set([
  'the', 'is', 'at', 'in', 'on', 'to', 'of', 'for', 'and', 'or', 'an',
  'it', 'by', 'as', 'be', 'do', 'if', 'no', 'so', 'up', 'we', 'my',
  'use', 'when', 'with', 'from', 'this', 'that', 'will', 'can', 'has',
  'not', 'are', 'was', 'but', 'all', 'any', 'its', 'you', 'your',
  'want', 'need', 'just', 'let', 'get', 'make', 'help', 'me',
]);
17
+
18
/**
 * Checks whether a token matches any description token, either exactly
 * or as a substring in either direction.
 * @param {string} token - Candidate prompt token
 * @param {string[]} descTokens - Tokens from the skill description
 * @returns {boolean}
 */
function tokenMatchesDescription(token, descTokens) {
  for (const candidate of descTokens) {
    const related =
      candidate === token || candidate.includes(token) || token.includes(candidate);
    if (related) {
      return true;
    }
  }
  return false;
}
29
+
30
/**
 * Analyzes gaps between failed trigger prompts and a skill description.
 * A "failed positive" is a test that should have triggered but did not;
 * each of its non-stop-word tokens that appears nowhere in the description
 * (by exact or substring match) is reported as a missing keyword.
 * @param {Array} triggerResults - Results from runTriggerTests
 * @param {string} description - Skill description
 * @returns {{ missingKeywords: string[], failedPrompts: string[] }}
 */
export function analyzeGaps(triggerResults, description) {
  const misses = triggerResults.filter((r) => r.expected && !r.actual);
  if (misses.length === 0) {
    return { missingKeywords: [], failedPrompts: [] };
  }

  const contentWords = (text) => tokenize(text).filter((w) => !STOP_WORDS.has(w));
  const descTokens = contentWords(description);

  const failedPrompts = misses.map((r) => r.prompt);
  const missingKeywords = misses.flatMap((r) =>
    contentWords(r.prompt).filter((token) => !tokenMatchesDescription(token, descTokens)),
  );

  return { missingKeywords, failedPrompts };
}
60
+
61
/**
 * Generates keyword suggestions from gap analysis results. Keywords are
 * ranked by how many failed prompts contained them; words seen at least
 * twice are marked high confidence.
 * @param {Array<{ skill: string, currentDescription: string, missingKeywords: string[], failedPrompts: string[] }>} gapsList
 * @returns {Array<{ skill: string, currentDescription: string, suggestedKeywords: Array<{ word: string, confidence: string }> }>}
 */
export function generateSuggestions(gapsList) {
  const out = [];

  for (const gap of gapsList) {
    if (gap.missingKeywords.length === 0) continue;

    // Count occurrences of each missing keyword across failed prompts.
    const counts = new Map();
    for (const word of gap.missingKeywords) {
      counts.set(word, (counts.get(word) ?? 0) + 1);
    }

    const suggestedKeywords = Array.from(counts)
      .sort(([, a], [, b]) => b - a)
      .map(([word, count]) => ({
        word,
        confidence: count >= 2 ? 'high' : 'medium',
      }));

    out.push({
      skill: gap.skill,
      currentDescription: gap.currentDescription,
      suggestedKeywords,
    });
  }

  return out;
}
@@ -0,0 +1,28 @@
1
+ /**
2
+ * pricing.js — Model pricing table and cost calculation.
3
+ *
4
+ * Prices per million tokens (USD).
5
+ * Source: https://docs.anthropic.com/en/docs/about-claude/models
6
+ */
7
+
8
/**
 * Per-model pricing in USD per million tokens; input and output tokens
 * are priced separately. Keys are full model ids.
 */
export const DEFAULT_PRICING = {
  'claude-opus-4-6': { input: 15.00, output: 75.00 },
  'claude-sonnet-4-5': { input: 3.00, output: 15.00 },
  'claude-haiku-4-5': { input: 0.80, output: 4.00 },
};

/** Human-friendly display names for known model ids. */
const SHORT_NAMES = {
  'claude-opus-4-6': 'Opus',
  'claude-sonnet-4-5': 'Sonnet',
  'claude-haiku-4-5': 'Haiku',
};

/**
 * Estimates the USD cost of a request against the pricing table.
 * @param {string} model - Model id (a DEFAULT_PRICING key)
 * @param {number} inputTokens
 * @param {number} outputTokens
 * @returns {number} Estimated cost in USD; 0 for unknown models.
 */
export function estimateCost(model, inputTokens, outputTokens) {
  const rates = DEFAULT_PRICING[model];
  if (!rates) return 0;
  const weighted = inputTokens * rates.input + outputTokens * rates.output;
  return weighted / 1_000_000;
}

/**
 * Maps a model id to its short display name, falling back to the raw id.
 * @param {string} model
 * @returns {string}
 */
export function getModelShortName(model) {
  return SHORT_NAMES[model] ?? model;
}
@@ -0,0 +1,91 @@
1
+ /**
2
+ * semantic-matcher.js — LLM-based trigger scoring via Anthropic Haiku.
3
+ *
4
+ * Calls the Anthropic Messages API to score how well a user prompt
5
+ * matches a skill. Optional complement to the keyword matcher.
6
+ */
7
+
8
// Default Haiku model for semantic trigger scoring; overridable via the
// GUILD_SEMANTIC_MODEL environment variable (see scoreMatchSemantic).
export const SEMANTIC_MODEL_DEFAULT = 'claude-haiku-4-5-20251001';

// Anthropic Messages API endpoint.
const ANTHROPIC_API_URL = 'https://api.anthropic.com/v1/messages';

// Classifier instructions: the model must answer with a bare JSON object
// so parseResponse can extract {score, reasoning}.
const SYSTEM_PROMPT = `You are a skill-routing classifier. Given a user prompt and a skill name + description, score how likely the user wants to trigger this skill.

Respond with ONLY a JSON object, no other text:
{"score": <0-100>, "reasoning": "<one sentence>"}

Score guide:
- 90-100: Clear, direct match
- 60-89: Likely match, related intent
- 30-59: Possible but ambiguous
- 0-29: Unrelated`;
22
+
23
/**
 * Scores a prompt against a skill using the Anthropic Messages API.
 * Requires ANTHROPIC_API_KEY in the environment; the model may be
 * overridden via GUILD_SEMANTIC_MODEL.
 * @param {string} prompt - User prompt to classify
 * @param {string} skillName - Skill identifier
 * @param {string} skillDescription - Skill description text
 * @returns {Promise<{ score: number, reasoning: string, error?: boolean }>}
 *   Score normalized to 0-1; { error: true } on any failure.
 */
export async function scoreMatchSemantic(prompt, skillName, skillDescription) {
  const apiKey = process.env.ANTHROPIC_API_KEY;
  // Fail fast with a clear message instead of sending an unauthenticated
  // request (x-api-key: undefined) that would come back as an opaque 401.
  if (!apiKey) {
    return { score: 0, reasoning: 'ANTHROPIC_API_KEY is not set', error: true };
  }

  const model = process.env.GUILD_SEMANTIC_MODEL || SEMANTIC_MODEL_DEFAULT;

  try {
    const response = await fetch(ANTHROPIC_API_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-api-key': apiKey,
        'anthropic-version': '2023-06-01',
      },
      body: JSON.stringify({
        model,
        max_tokens: 100,
        system: SYSTEM_PROMPT,
        messages: [
          {
            role: 'user',
            content: `User prompt: "${prompt}"\nSkill: ${skillName}\nDescription: ${skillDescription}`,
          },
        ],
      }),
    });

    if (!response.ok) {
      return { score: 0, reasoning: `API error: ${response.status} ${response.statusText}`, error: true };
    }

    const data = await response.json();
    const text = data.content[0].text;

    return parseResponse(text);
  } catch (err) {
    return { score: 0, reasoning: err.message, error: true };
  }
}
67
+
68
/**
 * Parses the LLM response text into {score, reasoning}, with a fallback
 * that extracts the first {...} object embedded in surrounding prose.
 * The 0-100 score is validated as numeric, clamped, and normalized to 0-1;
 * a reply without a numeric score is treated as a parse error (the
 * original code would have produced NaN and reported it as success).
 * @param {string} text
 * @returns {{ score: number, reasoning: string, error?: boolean }}
 */
function parseResponse(text) {
  // Normalizes a parsed object, rejecting non-numeric scores.
  const toResult = (parsed) => {
    if (typeof parsed.score !== 'number' || Number.isNaN(parsed.score)) return null;
    const clamped = Math.min(100, Math.max(0, parsed.score));
    return { score: clamped / 100, reasoning: parsed.reasoning ?? '' };
  };

  // Try direct parse first.
  try {
    const result = toResult(JSON.parse(text));
    if (result) return result;
  } catch {
    // Fall through to the extraction fallback.
  }

  // Fallback: extract the first JSON object embedded in the text.
  const match = text.match(/\{[^}]+\}/);
  if (match) {
    try {
      const result = toResult(JSON.parse(match[0]));
      if (result) return result;
    } catch {
      // Fall through to the error result.
    }
  }

  return { score: 0, reasoning: 'parse-error', error: true };
}
@@ -0,0 +1,64 @@
1
+ /**
2
+ * trigger-matcher.js — Scores prompts against skill descriptions.
3
+ *
4
+ * Uses keyword overlap scoring to determine how well a user prompt
5
+ * matches a skill's description. No LLM calls — purely programmatic.
6
+ */
7
+
8
+ /**
9
+ * Tokenizes text into lowercase words, stripping punctuation.
10
+ * @param {string} text
11
+ * @returns {string[]}
12
+ */
13
+ export function tokenize(text) {
14
+ return text
15
+ .toLowerCase()
16
+ .replace(/[—–\-/]/g, ' ')
17
+ .replace(/[^\w\s]/g, '')
18
+ .split(/\s+/)
19
+ .filter(w => w.length > 1);
20
+ }
21
+
22
+ const STOP_WORDS = new Set([
23
+ 'the', 'is', 'at', 'in', 'on', 'to', 'of', 'for', 'and', 'or', 'an',
24
+ 'it', 'by', 'as', 'be', 'do', 'if', 'no', 'so', 'up', 'we', 'my',
25
+ 'use', 'when', 'with', 'from', 'this', 'that', 'will', 'can', 'has',
26
+ 'not', 'are', 'was', 'but', 'all', 'any', 'its', 'you', 'your',
27
+ 'skill', 'discipline',
28
+ ]);
29
+
30
+ /**
31
+ * Scores how well a prompt matches a description.
32
+ * Returns 0-1.
33
+ */
34
+ export function scoreMatch(prompt, description) {
35
+ const promptTokens = tokenize(prompt).filter(w => !STOP_WORDS.has(w));
36
+ if (promptTokens.length === 0) return 0;
37
+
38
+ const descTokens = new Set(tokenize(description).filter(w => !STOP_WORDS.has(w)));
39
+
40
+ let matches = 0;
41
+ for (const token of promptTokens) {
42
+ if (descTokens.has(token)) {
43
+ matches++;
44
+ } else {
45
+ for (const dt of descTokens) {
46
+ if (dt.includes(token) || token.includes(dt)) {
47
+ matches += 0.5;
48
+ break;
49
+ }
50
+ }
51
+ }
52
+ }
53
+
54
+ return matches / promptTokens.length;
55
+ }
56
+
57
+ /**
58
+ * Ranks all skills by match score descending.
59
+ */
60
+ export function rankSkills(prompt, skills) {
61
+ return skills
62
+ .map(s => ({ ...s, score: scoreMatch(prompt, s.description) }))
63
+ .sort((a, b) => b.score - a.score);
64
+ }
@@ -0,0 +1,132 @@
1
+ /**
2
+ * trigger-runner.js — Loads and executes trigger tests for skills.
3
+ */
4
+
5
+ import { readFileSync, existsSync, readdirSync } from 'fs';
6
+ import { join, dirname } from 'path';
7
+ import { fileURLToPath } from 'url';
8
+ import { rankSkills } from './trigger-matcher.js';
9
+ import { extractFrontmatterBlock, parseYamlFrontmatter } from './workflow-parser.js';
10
+
11
// Resolve the packaged skill-template directory relative to this module
// (ESM has no __dirname, so derive it from import.meta.url).
const __dirname = dirname(fileURLToPath(import.meta.url));
const TEMPLATES_DIR = join(__dirname, '..', 'templates', 'skills');
13
+
14
/**
 * Loads evals/triggers.json for a skill template.
 * @param {string} skillName
 * @returns {object|null} Parsed trigger config, or null when the skill
 *   template has no triggers file.
 */
export function loadTriggers(skillName) {
  const triggersPath = join(TEMPLATES_DIR, skillName, 'evals', 'triggers.json');
  if (!existsSync(triggersPath)) {
    return null;
  }
  const raw = readFileSync(triggersPath, 'utf8');
  return JSON.parse(raw);
}
24
+
25
/**
 * Loads all skill names and descriptions from the template directory.
 * Skips entries that are not directories, lack a SKILL.md, have no
 * frontmatter block, or whose frontmatter has no description.
 * @returns {{ name: string, description: string }[]}
 */
export function loadAllSkillDescriptions() {
  const dirEntries = readdirSync(TEMPLATES_DIR, { withFileTypes: true });
  const skills = [];

  for (const entry of dirEntries) {
    if (!entry.isDirectory()) continue;

    const skillPath = join(TEMPLATES_DIR, entry.name, 'SKILL.md');
    if (!existsSync(skillPath)) continue;

    const block = extractFrontmatterBlock(readFileSync(skillPath, 'utf8'));
    if (!block) continue;

    const { description } = parseYamlFrontmatter(block.yaml);
    if (description) {
      skills.push({ name: entry.name, description });
    }
  }

  return skills;
}
48
+
49
/**
 * Runs trigger tests for a skill.
 *
 * When matcherType is "keyword" and a test has keywordExpected defined,
 * that value overrides shouldTrigger for accuracy calculation. This lets
 * tests document the ideal (semantic) expectation while being honest
 * about what keyword matching can achieve.
 *
 * @param {object} triggers - Trigger test config from triggers.json
 * @param {Array} allSkills - All skill descriptions
 * @param {object} [options] - Options
 * @param {boolean} [options.semantic=false] - Use semantic matcher
 * @param {Function} [options.scoreMatchSemantic] - Semantic scoring function (injected for testability)
 * @returns {Promise<Array>} One result per test with prompt/expected/actual/score/rank
 */
export async function runTriggerTests(triggers, allSkills, options = {}) {
  const { semantic = false, scoreMatchSemantic: semanticFn } = options;
  // `??`, not `||`: an explicit threshold of 0 is a valid configuration
  // and must not silently fall back to the 0.3 default.
  const threshold = triggers.threshold ?? 0.3;
  const isKeyword = !semantic && triggers.matcherType === 'keyword';
  const results = [];

  for (const test of triggers.tests) {
    let actual, score, rank, reasoning;

    if (semantic && semanticFn) {
      const targetSkill = allSkills.find(s => s.name === triggers.skill);
      const semanticResult = await semanticFn(test.prompt, triggers.skill, targetSkill?.description || triggers.description);
      score = semanticResult.score;
      actual = score >= threshold;
      rank = null;
      reasoning = semanticResult.reasoning;
    } else {
      const ranked = rankSkills(test.prompt, allSkills);
      const targetRank = ranked.findIndex(s => s.name === triggers.skill);
      score = targetRank >= 0 ? ranked[targetRank].score : 0;
      // Keyword matching only triggers when the skill ranks first AND clears the threshold.
      actual = targetRank === 0 && score >= threshold;
      rank = targetRank + 1;
    }

    const hasOverride = isKeyword && test.keywordExpected !== undefined;
    const expected = hasOverride ? test.keywordExpected : test.shouldTrigger;

    const result = {
      prompt: test.prompt,
      expected,
      actual,
      score,
      rank,
      matcherUsed: semantic ? 'semantic' : 'keyword',
    };

    if (reasoning) {
      result.reasoning = reasoning;
    }

    if (hasOverride) {
      // Preserve the ideal (semantic) expectation alongside the override.
      result.semanticExpected = test.shouldTrigger;
    }

    results.push(result);
  }

  return results;
}
112
+
113
/**
 * Computes precision, recall, and accuracy from trigger test results.
 * @param {Array<{expected: boolean, actual: boolean}>} results
 * @returns {{ precision: number, recall: number, accuracy: number, total: number, tp: number, fp: number, fn: number, tn: number }}
 */
export function computeAccuracy(results) {
  const counts = { tp: 0, fp: 0, fn: 0, tn: 0 };
  if (results.length === 0) {
    return { precision: 0, recall: 0, accuracy: 0, total: 0, ...counts };
  }

  for (const { expected, actual } of results) {
    if (expected && actual) counts.tp++;
    else if (expected) counts.fn++;
    else if (actual) counts.fp++;
    else counts.tn++;
  }

  const { tp, fp, fn, tn } = counts;
  return {
    precision: tp + fp > 0 ? tp / (tp + fp) : 0,
    recall: tp + fn > 0 ? tp / (tp + fn) : 0,
    accuracy: (tp + tn) / results.length,
    total: results.length,
    tp,
    fp,
    fn,
    tn,
  };
}