guild-agents 1.3.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -6
- package/bin/guild.js +46 -0
- package/package.json +2 -2
- package/src/commands/eval.js +225 -0
- package/src/commands/stats.js +147 -0
- package/src/templates/agents/advisor.md +0 -1
- package/src/templates/agents/developer.md +2 -2
- package/src/templates/agents/qa.md +1 -1
- package/src/templates/agents/tech-lead.md +2 -2
- package/src/templates/skills/build-feature/SKILL.md +53 -80
- package/src/templates/skills/build-feature/evals/evals.json +1 -2
- package/src/templates/skills/build-feature/evals/triggers.json +16 -0
- package/src/templates/skills/council/SKILL.md +2 -2
- package/src/templates/skills/council/evals/triggers.json +16 -0
- package/src/templates/skills/create-pr/evals/evals.json +44 -0
- package/src/templates/skills/create-pr/evals/triggers.json +16 -0
- package/src/templates/skills/debug/SKILL.md +1 -1
- package/src/templates/skills/debug/evals/triggers.json +16 -0
- package/src/templates/skills/dev-flow/SKILL.md +10 -12
- package/src/templates/skills/dev-flow/evals/evals.json +36 -0
- package/src/templates/skills/dev-flow/evals/triggers.json +16 -0
- package/src/templates/skills/guild-specialize/SKILL.md +0 -4
- package/src/templates/skills/guild-specialize/evals/evals.json +54 -0
- package/src/templates/skills/guild-specialize/evals/triggers.json +16 -0
- package/src/templates/skills/new-feature/evals/evals.json +41 -0
- package/src/templates/skills/new-feature/evals/triggers.json +16 -0
- package/src/templates/skills/qa-cycle/evals/evals.json +46 -0
- package/src/templates/skills/qa-cycle/evals/triggers.json +16 -0
- package/src/templates/skills/re-specialize/evals/evals.json +48 -0
- package/src/templates/skills/re-specialize/evals/triggers.json +16 -0
- package/src/templates/skills/review/evals/evals.json +43 -0
- package/src/templates/skills/review/evals/triggers.json +16 -0
- package/src/templates/skills/session-end/evals/evals.json +40 -0
- package/src/templates/skills/session-end/evals/triggers.json +16 -0
- package/src/templates/skills/session-start/evals/evals.json +50 -0
- package/src/templates/skills/session-start/evals/triggers.json +16 -0
- package/src/templates/skills/status/SKILL.md +1 -1
- package/src/templates/skills/status/evals/evals.json +40 -0
- package/src/templates/skills/status/evals/triggers.json +16 -0
- package/src/templates/skills/tdd/evals/triggers.json +16 -0
- package/src/templates/skills/verify/evals/triggers.json +16 -0
- package/src/utils/accounting.js +139 -0
- package/src/utils/benchmark.js +128 -0
- package/src/utils/description-analyzer.js +92 -0
- package/src/utils/dispatch-protocol.js +0 -3
- package/src/utils/executor.js +133 -23
- package/src/utils/pricing.js +28 -0
- package/src/utils/semantic-matcher.js +91 -0
- package/src/utils/trigger-matcher.js +64 -0
- package/src/utils/trigger-runner.js +132 -0
- package/src/templates/agents/db-migration.md +0 -51
- package/src/templates/agents/platform-expert.md +0 -92
- package/src/templates/agents/product-owner.md +0 -52
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* benchmark.js — Records, reports, and detects regressions in eval benchmarks.
|
|
3
|
+
*
|
|
4
|
+
* Persists results to benchmarks/benchmark.json with 30-entry rotation.
|
|
5
|
+
* Generates benchmarks/benchmark.md as a human-readable report.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs';
|
|
9
|
+
import { dirname } from 'path';
|
|
10
|
+
|
|
11
|
+
const MAX_ENTRIES = 30;
|
|
12
|
+
|
|
13
|
+
/**
 * Persists one benchmark entry to the JSON history file.
 *
 * Creates the parent directory when it does not exist, loads the existing
 * history (if the file is present), appends `entry`, keeps only the newest
 * MAX_ENTRIES items, and writes the result back pretty-printed.
 *
 * @param {object} entry - Benchmark entry with timestamp, matcher, skills, aggregate
 * @param {string} filePath - Path to benchmark.json
 */
export function recordBenchmark(entry, filePath) {
  const parentDir = dirname(filePath);
  if (!existsSync(parentDir)) {
    mkdirSync(parentDir, { recursive: true });
  }

  // Start from the stored history when there is one, otherwise from scratch.
  const history = existsSync(filePath)
    ? JSON.parse(readFileSync(filePath, 'utf8'))
    : [];
  history.push(entry);

  // Rotation: drop the oldest entries once the cap is exceeded.
  const overflow = history.length - MAX_ENTRIES;
  const retained = overflow > 0 ? history.slice(overflow) : history;

  writeFileSync(filePath, JSON.stringify(retained, null, 2));
}
|
|
37
|
+
|
|
38
|
+
/**
 * Renders a benchmark entry as a markdown report.
 *
 * The report consists of a title, a summary line, a per-skill table
 * (accuracy, precision, recall, accuracy delta versus `previous`) and an
 * aggregate section. Deltas are expressed in percentage points and are only
 * shown when their magnitude is at least 0.1; a per-skill drop of more than
 * 5 points is suffixed with ` !!`.
 *
 * @param {object} current - Current benchmark entry (timestamp, matcher,
 *   optional model, skills, aggregate)
 * @param {object|null} previous - Previous entry for delta comparison. An
 *   entry without `skills` or `aggregate` simply yields no deltas.
 * @returns {string} Markdown report (the last line is empty, so the text ends
 *   with a newline)
 */
export function generateReport(current, previous) {
  const toPercent = (ratio) => `${(ratio * 100).toFixed(1)}%`;

  // Percentage-point change, or null when it is too small to report.
  // A NaN difference (missing metric) also maps to null.
  const pointDelta = (now, before) => {
    const diff = (now - before) * 100;
    return Math.abs(diff) >= 0.1 ? diff : null;
  };

  const signed = (diff) => `${diff > 0 ? '+' : ''}${diff.toFixed(1)}%`;

  // Index the previous skills once instead of scanning the list per row.
  // The first occurrence of a name wins, matching a left-to-right search.
  const previousByName = new Map();
  for (const skill of previous?.skills ?? []) {
    if (!previousByName.has(skill.name)) {
      previousByName.set(skill.name, skill);
    }
  }

  const modelSuffix = current.model ? ` (${current.model})` : '';
  const lines = [
    `# Eval Benchmark — ${current.timestamp}`,
    `Matcher: ${current.matcher}${modelSuffix} | Skills: ${current.skills.length} | Total tests: ${current.aggregate.total}`,
    '',
    '| Skill | Accuracy | Precision | Recall | Delta |',
    '|-------|----------|-----------|--------|-------|',
  ];

  for (const skill of current.skills) {
    const before = previousByName.get(skill.name);
    const diff = before ? pointDelta(skill.accuracy, before.accuracy) : null;
    const delta = diff === null ? '—' : `${signed(diff)}${diff < -5 ? ' !!' : ''}`;

    lines.push(
      `| ${skill.name} | ${toPercent(skill.accuracy)} | ${toPercent(skill.precision)} | ${toPercent(skill.recall)} | ${delta} |`,
    );
  }

  lines.push('', '## Aggregate');

  const aggregateDiff = previous?.aggregate
    ? pointDelta(current.aggregate.accuracy, previous.aggregate.accuracy)
    : null;
  const aggDelta = aggregateDiff === null ? '' : ` (Delta ${signed(aggregateDiff)})`;

  lines.push(
    `Accuracy: ${toPercent(current.aggregate.accuracy)}${aggDelta}`,
    `Precision: ${toPercent(current.aggregate.precision)}`,
    `Recall: ${toPercent(current.aggregate.recall)}`,
    '',
  );

  return lines.join('\n');
}
|
|
92
|
+
|
|
93
|
+
/**
 * Compares two benchmark entries skill by skill and reports regressions.
 *
 * A skill present in both entries is reported when its accuracy fell by
 * 0.05 or more AND the number of correct results (tp + tn) differs by at
 * least 2 between the entries. Skills missing from `previous` are ignored.
 *
 * @param {object} current
 * @param {object|null} previous
 * @returns {Array<{ skill: string, currentAccuracy: number, previousAccuracy: number, delta: number, flippedTests: number }>}
 */
export function detectRegressions(current, previous) {
  if (!previous) {
    return [];
  }

  const correctCount = ({ tp, tn }) => tp + tn;
  const found = [];

  for (const skill of current.skills) {
    const baseline = previous.skills.find((candidate) => candidate.name === skill.name);
    if (!baseline) {
      continue;
    }

    const delta = skill.accuracy - baseline.accuracy;
    const flippedTests = Math.abs(correctCount(skill) - correctCount(baseline));

    // Written as negations so that NaN values are treated exactly as in the
    // guard-clause form (`delta > -0.05` / `flippedTests < 2` both false).
    const accuracyDropped = !(delta > -0.05);
    const enoughFlips = !(flippedTests < 2);

    if (accuracyDropped && enoughFlips) {
      found.push({
        skill: skill.name,
        currentAccuracy: skill.accuracy,
        previousAccuracy: baseline.accuracy,
        delta,
        flippedTests,
      });
    }
  }

  return found;
}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* description-analyzer.js — Analyzes keyword gaps in skill descriptions.
|
|
3
|
+
*
|
|
4
|
+
* Uses token analysis to identify which keywords are missing from
|
|
5
|
+
* skill descriptions based on failed trigger tests. No LLM required.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { tokenize } from './trigger-matcher.js';
|
|
9
|
+
|
|
10
|
+
const STOP_WORDS = new Set([
|
|
11
|
+
'the', 'is', 'at', 'in', 'on', 'to', 'of', 'for', 'and', 'or', 'an',
|
|
12
|
+
'it', 'by', 'as', 'be', 'do', 'if', 'no', 'so', 'up', 'we', 'my',
|
|
13
|
+
'use', 'when', 'with', 'from', 'this', 'that', 'will', 'can', 'has',
|
|
14
|
+
'not', 'are', 'was', 'but', 'all', 'any', 'its', 'you', 'your',
|
|
15
|
+
'want', 'need', 'just', 'let', 'get', 'make', 'help', 'me',
|
|
16
|
+
]);
|
|
17
|
+
|
|
18
|
+
/**
 * Reports whether `token` overlaps with any description token.
 *
 * Two tokens overlap when they are equal or when either one contains the
 * other as a substring.
 *
 * @param {string} token - Prompt token to look for
 * @param {Iterable<string>} descTokens - Tokens taken from the description
 * @returns {boolean} true on the first overlapping description token
 */
function tokenMatchesDescription(token, descTokens) {
  const overlaps = (candidate) =>
    candidate === token || candidate.includes(token) || token.includes(candidate);

  for (const candidate of descTokens) {
    if (overlaps(candidate)) return true;
  }
  return false;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Finds prompt keywords that a skill description does not cover.
 *
 * Only results that were expected to trigger but did not are considered.
 * Their prompts are tokenized, stop words are removed, and every remaining
 * token with no match in the description tokens is collected. Keywords are
 * not de-duplicated, so a word appears once per occurrence.
 *
 * @param {Array} triggerResults - Results from runTriggerTests
 * @param {string} description - Skill description
 * @returns {{ missingKeywords: string[], failedPrompts: string[] }}
 */
export function analyzeGaps(triggerResults, description) {
  const missed = triggerResults.filter((result) => result.expected && !result.actual);
  if (missed.length === 0) {
    return { missingKeywords: [], failedPrompts: [] };
  }

  const isContentWord = (word) => !STOP_WORDS.has(word);
  const descTokens = tokenize(description).filter(isContentWord);

  const failedPrompts = [];
  const missingKeywords = [];

  for (const { prompt } of missed) {
    failedPrompts.push(prompt);

    const uncovered = tokenize(prompt)
      .filter(isContentWord)
      .filter((token) => !tokenMatchesDescription(token, descTokens));
    missingKeywords.push(...uncovered);
  }

  return { missingKeywords, failedPrompts };
}
|
|
60
|
+
|
|
61
|
+
/**
 * Turns gap-analysis results into ranked keyword suggestions.
 *
 * Entries without missing keywords are dropped. For the rest, keywords are
 * counted, ordered by descending count, and labelled `high` when seen at
 * least twice and `medium` otherwise.
 *
 * @param {Array<{ skill: string, currentDescription: string, missingKeywords: string[], failedPrompts: string[] }>} gapsList
 * @returns {Array<{ skill: string, currentDescription: string, suggestedKeywords: Array<{ word: string, confidence: string }> }>}
 */
export function generateSuggestions(gapsList) {
  const byCountDescending = ([, left], [, right]) => right - left;
  const result = [];

  for (const { skill, currentDescription, missingKeywords } of gapsList) {
    if (missingKeywords.length === 0) {
      continue;
    }

    // Map keeps first-seen order, which the stable sort preserves for ties.
    const counts = new Map();
    for (const keyword of missingKeywords) {
      const seen = counts.has(keyword) ? counts.get(keyword) : 0;
      counts.set(keyword, seen + 1);
    }

    const suggestedKeywords = Array.from(counts)
      .sort(byCountDescending)
      .map(([word, occurrences]) => ({
        word,
        confidence: occurrences >= 2 ? 'high' : 'medium',
      }));

    result.push({ skill, currentDescription, suggestedKeywords });
  }

  return result;
}
|
|
@@ -34,14 +34,11 @@ export const DEFAULT_FAILURE_STRATEGY = 'abort';
|
|
|
34
34
|
*/
|
|
35
35
|
export const DEFAULT_AGENT_TIERS = {
|
|
36
36
|
'advisor': 'reasoning',
|
|
37
|
-
'product-owner': 'reasoning',
|
|
38
37
|
'tech-lead': 'reasoning',
|
|
39
38
|
'code-reviewer': 'reasoning',
|
|
40
39
|
'developer': 'execution',
|
|
41
40
|
'bugfix': 'execution',
|
|
42
|
-
'db-migration': 'execution',
|
|
43
41
|
'qa': 'execution',
|
|
44
|
-
'platform-expert': 'execution',
|
|
45
42
|
'learnings-extractor': 'routine',
|
|
46
43
|
};
|
|
47
44
|
|
package/src/utils/executor.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Drives a plan to completion by iterating through steps, dispatching
|
|
5
5
|
* agent steps to a provider function and system steps to local commands.
|
|
6
|
-
*
|
|
6
|
+
* Supports parallel execution (v1.2) and delegation to sub-skills.
|
|
7
7
|
*/
|
|
8
8
|
|
|
9
9
|
import { execFile } from 'child_process';
|
|
@@ -11,8 +11,15 @@ import {
|
|
|
11
11
|
advanceStep,
|
|
12
12
|
getNextSteps,
|
|
13
13
|
isPlanComplete,
|
|
14
|
+
MAX_DELEGATION_DEPTH,
|
|
15
|
+
createExecutionPlan,
|
|
14
16
|
} from './orchestrator.js';
|
|
15
|
-
import {
|
|
17
|
+
import {
|
|
18
|
+
buildStepContext,
|
|
19
|
+
recordStepTrace,
|
|
20
|
+
loadWorkflow,
|
|
21
|
+
resolveStepDispatch,
|
|
22
|
+
} from './orchestrator-io.js';
|
|
16
23
|
|
|
17
24
|
const SYSTEM_STEP_TIMEOUT = 120_000; // 2 minutes
|
|
18
25
|
|
|
@@ -70,7 +77,7 @@ async function executeSystemStep(step, options = {}) {
|
|
|
70
77
|
}
|
|
71
78
|
|
|
72
79
|
if (step.delegatesTo) {
|
|
73
|
-
return { status: 'passed', output: `
|
|
80
|
+
return { status: 'passed', output: `System step with delegation — handled by executeDelegation` };
|
|
74
81
|
}
|
|
75
82
|
|
|
76
83
|
return { status: 'passed', output: 'System step completed' };
|
|
@@ -92,12 +99,111 @@ function findStepInPlan(plan, stepId) {
|
|
|
92
99
|
return null;
|
|
93
100
|
}
|
|
94
101
|
|
|
102
|
+
/**
 * Routes one step to the handler that matches its kind and returns the result.
 *
 * - system step with `delegatesTo` -> executeDelegation (receives the full
 *   options object originally passed to execute())
 * - any other system step          -> executeSystemStep
 * - everything else                -> the agent provider, with a context
 *   built by buildStepContext
 *
 * @param {object} step - Step definition
 * @param {object} dispatch - Dispatch info for this step
 * @param {object} context - Execution context
 * @param {import('./orchestrator.js').ExecutionPlan} context.currentPlan - Current plan state
 * @param {Function} context.provider - Agent step provider
 * @param {string} context.projectRoot - Working directory
 * @param {string} context.skillBody - Skill body text
 * @param {object} context.executeOptions - Full options passed to execute()
 * @returns {Promise<{ status: string, output: string, outcome?: object, error?: string }>}
 */
async function dispatchStep(step, dispatch, context) {
  const { currentPlan, provider, projectRoot, skillBody, executeOptions } = context;

  if (step.role !== 'system') {
    const agentContext = buildStepContext(step, currentPlan, { skillBody });
    return provider(step, dispatch, agentContext);
  }

  return step.delegatesTo
    ? executeDelegation(step, executeOptions)
    : executeSystemStep(step, { projectRoot });
}
|
|
129
|
+
|
|
130
|
+
/**
 * Runs the sub-skill named by `step.delegatesTo` as a nested execution.
 *
 * Fails without running anything when the nesting depth has reached
 * MAX_DELEGATION_DEPTH or when the sub-skill cannot be loaded. Otherwise a
 * plan is created from the sub-skill's workflow, dispatch info is resolved
 * for each of its steps, and the plan is run through execute() with the
 * depth incremented by one.
 *
 * @param {object} step - Delegation step (with delegatesTo field)
 * @param {object} options - Execute options from parent
 * @returns {Promise<{ status: string, output: string, error?: string }>}
 */
async function executeDelegation(step, options) {
  const {
    delegationDepth = 0,
    profile = 'max',
    provider,
    trace,
    projectRoot,
    onStepStart,
    onStepEnd,
  } = options;

  const target = step.delegatesTo;
  const failure = (error) => ({ status: 'failed', output: '', error });

  if (delegationDepth >= MAX_DELEGATION_DEPTH) {
    return failure(
      `Delegation depth limit (${MAX_DELEGATION_DEPTH}) exceeded at step "${step.id}" delegating to "${target}"`,
    );
  }

  let subSkill;
  try {
    subSkill = loadWorkflow(target);
  } catch (err) {
    return failure(`Failed to load delegated skill "${target}": ${err.message}`);
  }

  const subPlan = createExecutionPlan(subSkill.workflow, {
    skillName: subSkill.name || target,
  });

  // Resolve dispatch info up front for every step of every group.
  const subDispatchMap = {};
  for (const { steps: groupSteps } of subPlan.groups) {
    for (const subStep of groupSteps) {
      subDispatchMap[subStep.id] = resolveStepDispatch(subStep, { profile, projectRoot });
    }
  }

  const nestedOptions = {
    provider,
    trace,
    projectRoot,
    skillBody: subSkill.body || '',
    onStepStart,
    onStepEnd,
    delegationDepth: delegationDepth + 1,
    profile,
  };
  const finalSubPlan = await execute(subPlan, subDispatchMap, nestedOptions);

  if (finalSubPlan.status !== 'completed') {
    return failure(`Delegated skill "${target}" ended with status: ${finalSubPlan.status}`);
  }
  return { status: 'passed', output: `Delegation to "${target}" completed` };
}
|
|
199
|
+
|
|
95
200
|
/**
|
|
96
201
|
* Executes a workflow plan to completion.
|
|
97
202
|
*
|
|
98
203
|
* Drives the orchestrator state machine by repeatedly calling getNextSteps,
|
|
99
204
|
* dispatching each step (agent via provider, system via local commands),
|
|
100
|
-
* and advancing the plan with the result.
|
|
205
|
+
* and advancing the plan with the result. Parallel groups are dispatched
|
|
206
|
+
* concurrently via Promise.all.
|
|
101
207
|
*
|
|
102
208
|
* @param {import('./orchestrator.js').ExecutionPlan} plan - Initial execution plan
|
|
103
209
|
* @param {Object.<string, import('./orchestrator-io.js').StepDispatchInfo>} dispatchInfoMap - Dispatch info per step
|
|
@@ -108,6 +214,8 @@ function findStepInPlan(plan, stepId) {
|
|
|
108
214
|
* @param {string} [options.skillBody=''] - Skill body text for context building
|
|
109
215
|
* @param {Function} [options.onStepStart] - Callback before each step: (step, dispatch) => void
|
|
110
216
|
* @param {Function} [options.onStepEnd] - Callback after each step: (step, result) => void
|
|
217
|
+
* @param {number} [options.delegationDepth=0] - Current delegation nesting depth
|
|
218
|
+
* @param {string} [options.profile='max'] - Model profile for delegation dispatch
|
|
111
219
|
* @returns {Promise<import('./orchestrator.js').ExecutionPlan>} Final plan state
|
|
112
220
|
*/
|
|
113
221
|
export async function execute(plan, dispatchInfoMap, options = {}) {
|
|
@@ -127,7 +235,6 @@ export async function execute(plan, dispatchInfoMap, options = {}) {
|
|
|
127
235
|
while (!isPlanComplete(currentPlan)) {
|
|
128
236
|
const { steps, skipped } = getNextSteps(currentPlan);
|
|
129
237
|
|
|
130
|
-
// Advance skipped steps first
|
|
131
238
|
for (const stepId of skipped) {
|
|
132
239
|
currentPlan = advanceStep(currentPlan, stepId, { status: 'skipped' });
|
|
133
240
|
|
|
@@ -140,7 +247,6 @@ export async function execute(plan, dispatchInfoMap, options = {}) {
|
|
|
140
247
|
}
|
|
141
248
|
}
|
|
142
249
|
|
|
143
|
-
// If no executable steps remain, check completion again
|
|
144
250
|
if (steps.length === 0) {
|
|
145
251
|
if (isPlanComplete(currentPlan)) break;
|
|
146
252
|
if (++emptyIterations > MAX_EMPTY_ITERATIONS) {
|
|
@@ -151,30 +257,34 @@ export async function execute(plan, dispatchInfoMap, options = {}) {
|
|
|
151
257
|
}
|
|
152
258
|
emptyIterations = 0;
|
|
153
259
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
260
|
+
const dispatchContext = {
|
|
261
|
+
currentPlan,
|
|
262
|
+
provider,
|
|
263
|
+
projectRoot,
|
|
264
|
+
skillBody,
|
|
265
|
+
executeOptions: options,
|
|
266
|
+
};
|
|
157
267
|
|
|
158
|
-
|
|
268
|
+
const settled = await Promise.all(
|
|
269
|
+
steps.map(async (step) => {
|
|
270
|
+
const dispatch = dispatchInfoMap[step.id] || {};
|
|
271
|
+
onStepStart?.(step, dispatch);
|
|
272
|
+
const result = await dispatchStep(step, dispatch, dispatchContext);
|
|
273
|
+
return { step, dispatch, result };
|
|
274
|
+
})
|
|
275
|
+
);
|
|
159
276
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
result = await executeSystemStep(step, { projectRoot });
|
|
163
|
-
} else {
|
|
164
|
-
const context = buildStepContext(step, currentPlan, { skillBody });
|
|
165
|
-
result = await provider(step, dispatch, context);
|
|
166
|
-
}
|
|
277
|
+
for (const { step, dispatch, result } of settled) {
|
|
278
|
+
currentPlan = advanceStep(currentPlan, step.id, result);
|
|
167
279
|
|
|
168
|
-
|
|
280
|
+
if (trace) {
|
|
281
|
+
recordStepTrace(trace, step, currentPlan.stepStates[step.id], dispatch);
|
|
282
|
+
}
|
|
169
283
|
|
|
170
|
-
|
|
171
|
-
recordStepTrace(trace, step, currentPlan.stepStates[step.id], dispatch);
|
|
284
|
+
onStepEnd?.(step, result);
|
|
172
285
|
}
|
|
173
|
-
|
|
174
|
-
onStepEnd?.(step, result);
|
|
175
286
|
}
|
|
176
287
|
|
|
177
|
-
// Mark plan as completed if all steps reached terminal state and plan is still running
|
|
178
288
|
if (currentPlan.status === 'running' && isPlanComplete(currentPlan)) {
|
|
179
289
|
currentPlan = { ...currentPlan, status: 'completed' };
|
|
180
290
|
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pricing.js — Model pricing table and cost calculation.
|
|
3
|
+
*
|
|
4
|
+
* Prices per million tokens (USD).
|
|
5
|
+
* Source: https://docs.anthropic.com/en/docs/about-claude/models
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
export const DEFAULT_PRICING = {
|
|
9
|
+
'claude-opus-4-6': { input: 15.00, output: 75.00 },
|
|
10
|
+
'claude-sonnet-4-5': { input: 3.00, output: 15.00 },
|
|
11
|
+
'claude-haiku-4-5': { input: 0.80, output: 4.00 },
|
|
12
|
+
};
|
|
13
|
+
|
|
14
|
+
const SHORT_NAMES = {
|
|
15
|
+
'claude-opus-4-6': 'Opus',
|
|
16
|
+
'claude-sonnet-4-5': 'Sonnet',
|
|
17
|
+
'claude-haiku-4-5': 'Haiku',
|
|
18
|
+
};
|
|
19
|
+
|
|
20
|
+
/**
 * Estimates the USD cost of a model call from its token counts.
 *
 * Prices in the table are per million tokens. The model id is looked up as
 * an own key of the table; if that fails and the id ends in a `-YYYYMMDD`
 * suffix (e.g. `claude-haiku-4-5-20251001`), the lookup is retried without
 * the suffix. Unknown models cost 0.
 *
 * @param {string} model - Model identifier
 * @param {number} inputTokens - Number of input tokens
 * @param {number} outputTokens - Number of output tokens
 * @param {Object.<string, { input: number, output: number }>} [pricingTable=DEFAULT_PRICING]
 *   Price table to use instead of the built-in one
 * @returns {number} Estimated cost in USD, or 0 when the model has no price entry
 */
export function estimateCost(model, inputTokens, outputTokens, pricingTable = DEFAULT_PRICING) {
  // Own-key check: inherited names such as "constructor" are not models and
  // would otherwise produce NaN.
  const priceFor = (key) =>
    typeof key === 'string' && Object.prototype.hasOwnProperty.call(pricingTable, key)
      ? pricingTable[key]
      : undefined;

  let pricing = priceFor(model);
  if (!pricing && typeof model === 'string') {
    pricing = priceFor(model.replace(/-\d{8}$/, ''));
  }
  if (!pricing) return 0;

  return (inputTokens * pricing.input + outputTokens * pricing.output) / 1_000_000;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Returns the display name for a model id.
 *
 * The id is looked up as an own key of SHORT_NAMES; if that fails and the id
 * ends in a `-YYYYMMDD` suffix (e.g. `claude-haiku-4-5-20251001`), the lookup
 * is retried without the suffix. When no short name is known the input is
 * returned unchanged.
 *
 * @param {string} model - Model identifier
 * @returns {string} Short name, or `model` itself when none is registered
 */
export function getModelShortName(model) {
  if (typeof model !== 'string') return model;

  // Own-key check so inherited names such as "constructor" are never returned.
  const shortNameFor = (key) =>
    Object.prototype.hasOwnProperty.call(SHORT_NAMES, key) ? SHORT_NAMES[key] : undefined;

  return shortNameFor(model) || shortNameFor(model.replace(/-\d{8}$/, '')) || model;
}
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* semantic-matcher.js — LLM-based trigger scoring via Anthropic Haiku.
|
|
3
|
+
*
|
|
4
|
+
* Calls the Anthropic Messages API to score how well a user prompt
|
|
5
|
+
* matches a skill. Optional complement to the keyword matcher.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
export const SEMANTIC_MODEL_DEFAULT = 'claude-haiku-4-5-20251001';
|
|
9
|
+
|
|
10
|
+
const ANTHROPIC_API_URL = 'https://api.anthropic.com/v1/messages';
|
|
11
|
+
|
|
12
|
+
const SYSTEM_PROMPT = `You are a skill-routing classifier. Given a user prompt and a skill name + description, score how likely the user wants to trigger this skill.
|
|
13
|
+
|
|
14
|
+
Respond with ONLY a JSON object, no other text:
|
|
15
|
+
{"score": <0-100>, "reasoning": "<one sentence>"}
|
|
16
|
+
|
|
17
|
+
Score guide:
|
|
18
|
+
- 90-100: Clear, direct match
|
|
19
|
+
- 60-89: Likely match, related intent
|
|
20
|
+
- 30-59: Possible but ambiguous
|
|
21
|
+
- 0-29: Unrelated`;
|
|
22
|
+
|
|
23
|
+
/**
 * Asks the Anthropic Messages API how well a prompt matches a skill.
 *
 * The API key is read from ANTHROPIC_API_KEY and the model from
 * GUILD_SEMANTIC_MODEL (falling back to SEMANTIC_MODEL_DEFAULT). The function
 * never rejects: a non-OK HTTP status or any thrown error is reported as
 * `{ score: 0, reasoning, error: true }`.
 *
 * @param {string} prompt - User prompt to classify
 * @param {string} skillName - Skill identifier
 * @param {string} skillDescription - Skill description text
 * @returns {Promise<{ score: number, reasoning: string, error?: boolean }>}
 */
export async function scoreMatchSemantic(prompt, skillName, skillDescription) {
  const failure = (reasoning) => ({ score: 0, reasoning, error: true });

  const apiKey = process.env.ANTHROPIC_API_KEY;
  const model = process.env.GUILD_SEMANTIC_MODEL || SEMANTIC_MODEL_DEFAULT;

  try {
    const userMessage = {
      role: 'user',
      content: `User prompt: "${prompt}"\nSkill: ${skillName}\nDescription: ${skillDescription}`,
    };
    const payload = {
      model,
      max_tokens: 100,
      system: SYSTEM_PROMPT,
      messages: [userMessage],
    };
    const headers = {
      'Content-Type': 'application/json',
      'x-api-key': apiKey,
      'anthropic-version': '2023-06-01',
    };

    const response = await fetch(ANTHROPIC_API_URL, {
      method: 'POST',
      headers,
      body: JSON.stringify(payload),
    });

    if (!response.ok) {
      return failure(`API error: ${response.status} ${response.statusText}`);
    }

    // The classifier's answer is the text of the first content item.
    const data = await response.json();
    return parseResponse(data.content[0].text);
  } catch (err) {
    return failure(err.message);
  }
}
|
|
67
|
+
|
|
68
|
+
/**
 * Parses the classifier's reply into a normalized score.
 *
 * The whole text is tried as JSON first; if that does not yield a usable
 * object, the first `{...}` span found in the text is tried. A usable object
 * has a `score` that is a finite number (or a numeric string). The 0-100
 * score is scaled to 0-1 and clamped to that range. Anything else, including
 * non-string input, yields the parse-error result instead of NaN or a throw.
 *
 * @param {string} text - Raw text returned by the model
 * @returns {{ score: number, reasoning: string, error?: boolean }}
 */
function parseResponse(text) {
  // Converts a parsed JSON value into a result, or null when it is unusable.
  const toResult = (parsed) => {
    if (parsed === null || typeof parsed !== 'object') return null;

    const raw = parsed.score;
    let numeric = NaN;
    if (typeof raw === 'number') {
      numeric = raw;
    } else if (typeof raw === 'string' && raw.trim() !== '') {
      numeric = Number(raw);
    }
    if (!Number.isFinite(numeric)) return null;

    return {
      score: Math.min(1, Math.max(0, numeric / 100)),
      reasoning: typeof parsed.reasoning === 'string' ? parsed.reasoning : '',
    };
  };

  const tryParse = (candidate) => {
    try {
      return toResult(JSON.parse(candidate));
    } catch {
      return null;
    }
  };

  if (typeof text === 'string') {
    const direct = tryParse(text);
    if (direct) return direct;

    // Fallback: the model wrapped the JSON object in extra prose.
    const embedded = text.match(/\{[^}]+\}/);
    const extracted = embedded ? tryParse(embedded[0]) : null;
    if (extracted) return extracted;
  }

  return { score: 0, reasoning: 'parse-error', error: true };
}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* trigger-matcher.js — Scores prompts against skill descriptions.
|
|
3
|
+
*
|
|
4
|
+
* Uses keyword overlap scoring to determine how well a user prompt
|
|
5
|
+
* matches a skill's description. No LLM calls — purely programmatic.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Splits text into lowercase word tokens.
 *
 * Hyphens, en/em dashes and slashes act as word separators; every other
 * character that is not a letter, combining mark, digit, underscore or
 * whitespace is removed. Letters are matched with Unicode property escapes,
 * so accented and non-Latin words are kept intact (ASCII input tokenizes
 * exactly as with `\w`). Tokens of a single character are dropped.
 *
 * @param {string} text
 * @returns {string[]} Tokens in order of appearance; `[]` for null/undefined
 */
export function tokenize(text) {
  if (text == null) return [];

  return String(text)
    .toLowerCase()
    .replace(/[—–\-/]/g, ' ')
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '')
    .split(/\s+/)
    .filter((word) => word.length > 1);
}
|
|
21
|
+
|
|
22
|
+
const STOP_WORDS = new Set([
|
|
23
|
+
'the', 'is', 'at', 'in', 'on', 'to', 'of', 'for', 'and', 'or', 'an',
|
|
24
|
+
'it', 'by', 'as', 'be', 'do', 'if', 'no', 'so', 'up', 'we', 'my',
|
|
25
|
+
'use', 'when', 'with', 'from', 'this', 'that', 'will', 'can', 'has',
|
|
26
|
+
'not', 'are', 'was', 'but', 'all', 'any', 'its', 'you', 'your',
|
|
27
|
+
'skill', 'discipline',
|
|
28
|
+
]);
|
|
29
|
+
|
|
30
|
+
/**
 * Computes a keyword-overlap score between a prompt and a description.
 *
 * Both texts are tokenized and stripped of stop words. Each prompt token
 * earns 1 point when it appears verbatim in the description and 0.5 when it
 * only overlaps a description token as a substring (in either direction).
 * The total is divided by the number of prompt tokens.
 *
 * @param {string} prompt - User prompt
 * @param {string} description - Skill description
 * @returns {number} Score between 0 and 1; 0 when the prompt has no usable tokens
 */
export function scoreMatch(prompt, description) {
  const isContentWord = (word) => !STOP_WORDS.has(word);

  const promptWords = tokenize(prompt).filter(isContentWord);
  if (promptWords.length === 0) {
    return 0;
  }

  const descWords = new Set(tokenize(description).filter(isContentWord));
  const descList = [...descWords];

  const credit = promptWords.reduce((sum, word) => {
    if (descWords.has(word)) {
      return sum + 1;
    }
    const partial = descList.some((desc) => desc.includes(word) || word.includes(desc));
    return partial ? sum + 0.5 : sum;
  }, 0);

  return credit / promptWords.length;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Scores every skill against the prompt and orders them best-first.
 *
 * Each returned item is a shallow copy of the skill with an added `score`
 * (from scoreMatch on its description); the input array is not modified.
 *
 * @param {string} prompt - User prompt
 * @param {Array<object>} skills - Skills, each with a `description`
 * @returns {Array<object>} Scored copies sorted by descending score
 */
export function rankSkills(prompt, skills) {
  const scored = skills.map((skill) => {
    const score = scoreMatch(prompt, skill.description);
    return { ...skill, score };
  });

  scored.sort((left, right) => right.score - left.score);
  return scored;
}
|