guild-agents 1.3.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -6
- package/bin/guild.js +46 -0
- package/package.json +2 -2
- package/src/commands/eval.js +225 -0
- package/src/commands/stats.js +147 -0
- package/src/templates/agents/advisor.md +0 -1
- package/src/templates/agents/developer.md +2 -2
- package/src/templates/agents/qa.md +1 -1
- package/src/templates/agents/tech-lead.md +2 -2
- package/src/templates/skills/build-feature/SKILL.md +53 -80
- package/src/templates/skills/build-feature/evals/evals.json +1 -2
- package/src/templates/skills/build-feature/evals/triggers.json +16 -0
- package/src/templates/skills/council/SKILL.md +2 -2
- package/src/templates/skills/council/evals/triggers.json +16 -0
- package/src/templates/skills/create-pr/evals/evals.json +44 -0
- package/src/templates/skills/create-pr/evals/triggers.json +16 -0
- package/src/templates/skills/debug/SKILL.md +1 -1
- package/src/templates/skills/debug/evals/triggers.json +16 -0
- package/src/templates/skills/dev-flow/SKILL.md +10 -12
- package/src/templates/skills/dev-flow/evals/evals.json +36 -0
- package/src/templates/skills/dev-flow/evals/triggers.json +16 -0
- package/src/templates/skills/guild-specialize/SKILL.md +0 -4
- package/src/templates/skills/guild-specialize/evals/evals.json +54 -0
- package/src/templates/skills/guild-specialize/evals/triggers.json +16 -0
- package/src/templates/skills/new-feature/evals/evals.json +41 -0
- package/src/templates/skills/new-feature/evals/triggers.json +16 -0
- package/src/templates/skills/qa-cycle/evals/evals.json +46 -0
- package/src/templates/skills/qa-cycle/evals/triggers.json +16 -0
- package/src/templates/skills/re-specialize/evals/evals.json +48 -0
- package/src/templates/skills/re-specialize/evals/triggers.json +16 -0
- package/src/templates/skills/review/evals/evals.json +43 -0
- package/src/templates/skills/review/evals/triggers.json +16 -0
- package/src/templates/skills/session-end/evals/evals.json +40 -0
- package/src/templates/skills/session-end/evals/triggers.json +16 -0
- package/src/templates/skills/session-start/evals/evals.json +50 -0
- package/src/templates/skills/session-start/evals/triggers.json +16 -0
- package/src/templates/skills/status/SKILL.md +1 -1
- package/src/templates/skills/status/evals/evals.json +40 -0
- package/src/templates/skills/status/evals/triggers.json +16 -0
- package/src/templates/skills/tdd/evals/triggers.json +16 -0
- package/src/templates/skills/verify/evals/triggers.json +16 -0
- package/src/utils/accounting.js +139 -0
- package/src/utils/benchmark.js +128 -0
- package/src/utils/description-analyzer.js +92 -0
- package/src/utils/dispatch-protocol.js +0 -3
- package/src/utils/executor.js +133 -23
- package/src/utils/pricing.js +28 -0
- package/src/utils/semantic-matcher.js +91 -0
- package/src/utils/trigger-matcher.js +64 -0
- package/src/utils/trigger-runner.js +132 -0
- package/src/templates/agents/db-migration.md +0 -51
- package/src/templates/agents/platform-expert.md +0 -92
- package/src/templates/agents/product-owner.md +0 -52
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* trigger-runner.js — Loads and executes trigger tests for skills.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { readFileSync, existsSync, readdirSync } from 'fs';
|
|
6
|
+
import { join, dirname } from 'path';
|
|
7
|
+
import { fileURLToPath } from 'url';
|
|
8
|
+
import { rankSkills } from './trigger-matcher.js';
|
|
9
|
+
import { extractFrontmatterBlock, parseYamlFrontmatter } from './workflow-parser.js';
|
|
10
|
+
|
|
11
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
12
|
+
const TEMPLATES_DIR = join(__dirname, '..', 'templates', 'skills');
|
|
13
|
+
|
|
14
|
+
/**
 * Loads and parses `evals/triggers.json` for a skill template.
 *
 * @param {string} skillName - Directory name of the skill under TEMPLATES_DIR.
 * @returns {object|null} The parsed JSON content, or null when the skill has
 *   no triggers.json file.
 * @throws {SyntaxError} When the file exists but does not contain valid JSON.
 *   The message names the offending file; the original parser error is
 *   attached as `cause`.
 */
export function loadTriggers(skillName) {
  const triggersPath = join(TEMPLATES_DIR, skillName, 'evals', 'triggers.json');
  if (!existsSync(triggersPath)) return null;

  const raw = readFileSync(triggersPath, 'utf8');
  try {
    return JSON.parse(raw);
  } catch (err) {
    // JSON.parse alone reports only a position inside the text. Re-throw the
    // same error type, but say which triggers.json is malformed.
    throw new SyntaxError(`Invalid JSON in ${triggersPath}: ${err.message}`, { cause: err });
  }
}
|
|
24
|
+
|
|
25
|
+
/**
 * Collects the name and frontmatter description of every skill template.
 *
 * Walks the sub-directories of TEMPLATES_DIR. A directory contributes an
 * entry only when it contains a SKILL.md whose frontmatter block can be
 * extracted and whose parsed frontmatter has a truthy `description`.
 *
 * @returns {{ name: string, description: string }[]} One entry per described
 *   skill, in directory-listing order.
 */
export function loadAllSkillDescriptions() {
  const described = [];

  for (const entry of readdirSync(TEMPLATES_DIR, { withFileTypes: true })) {
    if (!entry.isDirectory()) continue;

    const skillFile = join(TEMPLATES_DIR, entry.name, 'SKILL.md');
    if (!existsSync(skillFile)) continue;

    const frontmatter = extractFrontmatterBlock(readFileSync(skillFile, 'utf8'));
    if (!frontmatter) continue;

    const { description } = parseYamlFrontmatter(frontmatter.yaml);
    if (!description) continue;

    described.push({ name: entry.name, description });
  }

  return described;
}
|
|
48
|
+
|
|
49
|
+
/**
 * Runs every trigger test of a skill and records, per test, whether the skill
 * was expected to trigger and whether it actually did.
 *
 * Matcher selection:
 * - Semantic: used only when `options.semantic` is true AND a scoring function
 *   is injected. The skill triggers when the returned score reaches the
 *   threshold. `rank` is always null.
 * - Keyword: used otherwise (including when semantic was requested but no
 *   scoring function was supplied). All skills are ranked with rankSkills();
 *   the skill triggers only when it is the first entry of the ranking and its
 *   score reaches the threshold. `rank` is the 1-based position in the
 *   ranking, or 0 when the skill is absent from it.
 *
 * When the keyword matcher is the one in use, `triggers.matcherType` is
 * "keyword" and a test defines `keywordExpected`, that value replaces
 * `shouldTrigger` as the expectation. This lets tests document the ideal
 * (semantic) expectation while being honest about what keyword matching can
 * achieve. The original `shouldTrigger` is then reported as
 * `semanticExpected`.
 *
 * @param {object} triggers - Trigger test config from triggers.json
 * @param {string} triggers.skill - Name of the skill under test
 * @param {Array} triggers.tests - Tests with `prompt`, `shouldTrigger` and an
 *   optional `keywordExpected`
 * @param {number} [triggers.threshold=0.3] - Minimum score to count as a
 *   trigger; an explicit 0 is honoured
 * @param {string} [triggers.matcherType] - "keyword" enables the
 *   keywordExpected override
 * @param {string} [triggers.description] - Fallback description for the
 *   semantic matcher when the skill is not in allSkills
 * @param {Array} allSkills - All skill descriptions ({ name, description })
 * @param {object} [options] - Options
 * @param {boolean} [options.semantic=false] - Use semantic matcher
 * @param {Function} [options.scoreMatchSemantic] - Semantic scoring function
 *   (injected for testability); called with (prompt, skillName, description)
 *   and awaited for an object with `score` and `reasoning`
 * @returns {Promise<object[]>} One result per test: prompt, expected, actual,
 *   score, rank, matcherUsed, plus reasoning / semanticExpected when present
 */
export async function runTriggerTests(triggers, allSkills, options = {}) {
  const { semantic = false, scoreMatchSemantic: semanticFn } = options;
  // `??` rather than `||`: a configured threshold of 0 must not become 0.3.
  const threshold = triggers.threshold ?? 0.3;
  // The semantic matcher can only run when its scoring function was injected;
  // everything below keys off the matcher that is actually used.
  const useSemantic = Boolean(semantic && semanticFn);
  const isKeyword = !useSemantic && triggers.matcherType === 'keyword';
  const matcherUsed = useSemantic ? 'semantic' : 'keyword';

  // Loop-invariant: the description handed to the semantic scorer.
  const targetDescription = useSemantic
    ? allSkills.find((s) => s.name === triggers.skill)?.description || triggers.description
    : undefined;

  const results = [];

  for (const test of triggers.tests) {
    let actual;
    let score;
    let rank;
    let reasoning;

    if (useSemantic) {
      const semanticResult = await semanticFn(test.prompt, triggers.skill, targetDescription);
      score = semanticResult.score;
      actual = score >= threshold;
      rank = null;
      reasoning = semanticResult.reasoning;
    } else {
      const ranked = rankSkills(test.prompt, allSkills);
      const targetRank = ranked.findIndex((s) => s.name === triggers.skill);
      score = targetRank >= 0 ? ranked[targetRank].score : 0;
      // Must be the top-ranked skill AND clear the threshold.
      actual = targetRank === 0 && score >= threshold;
      // 1-based position; findIndex's -1 becomes 0, meaning "not ranked".
      rank = targetRank + 1;
    }

    const hasOverride = isKeyword && test.keywordExpected !== undefined;
    const expected = hasOverride ? test.keywordExpected : test.shouldTrigger;

    const result = {
      prompt: test.prompt,
      expected,
      actual,
      score,
      rank,
      matcherUsed,
    };

    if (reasoning) {
      result.reasoning = reasoning;
    }

    if (hasOverride) {
      result.semanticExpected = test.shouldTrigger;
    }

    results.push(result);
  }

  return results;
}
|
|
112
|
+
|
|
113
|
+
/**
 * Derives confusion-matrix counts and summary metrics from trigger results.
 *
 * Each result is classified by the truthiness of its `expected` and `actual`
 * fields. Precision and recall fall back to 0 when their denominator is 0,
 * and an empty result list yields all-zero metrics.
 *
 * @param {Array<{ expected: *, actual: * }>} results - Trigger test results
 * @returns {{ precision: number, recall: number, accuracy: number,
 *   total: number, tp: number, fp: number, fn: number, tn: number }}
 */
export function computeAccuracy(results) {
  const total = results.length;
  const counts = { tp: 0, fp: 0, fn: 0, tn: 0 };

  if (total === 0) {
    return { precision: 0, recall: 0, accuracy: 0, total: 0, ...counts };
  }

  for (const outcome of results) {
    const wanted = Boolean(outcome.expected);
    const fired = Boolean(outcome.actual);
    const bucket = fired ? (wanted ? 'tp' : 'fp') : (wanted ? 'fn' : 'tn');
    counts[bucket] += 1;
  }

  const predictedPositive = counts.tp + counts.fp;
  const actualPositive = counts.tp + counts.fn;

  return {
    precision: predictedPositive === 0 ? 0 : counts.tp / predictedPositive,
    recall: actualPositive === 0 ? 0 : counts.tp / actualPositive,
    accuracy: (counts.tp + counts.tn) / total,
    total,
    ...counts,
  };
}
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: db-migration
|
|
3
|
-
description: "Schema changes and safe migrations"
|
|
4
|
-
tools: Read, Write, Edit, Bash, Glob, Grep
|
|
5
|
-
permissionMode: bypassPermissions
|
|
6
|
-
default-tier: execution
|
|
7
|
-
---
|
|
8
|
-
|
|
9
|
-
# DB Migration
|
|
10
|
-
|
|
11
|
-
You are the database specialist for [PROJECT]. Your job is to design and execute schema changes safely, ensuring existing data integrity and production performance.
|
|
12
|
-
|
|
13
|
-
## Responsibilities
|
|
14
|
-
|
|
15
|
-
- Design schema changes with up and down migrations
|
|
16
|
-
- Verify impact on existing data before migrating
|
|
17
|
-
- Consider production performance (large tables, locks, indexes)
|
|
18
|
-
- Use the project's ORM and migration tools
|
|
19
|
-
- Ensure every migration is reversible
|
|
20
|
-
|
|
21
|
-
## What you do NOT do
|
|
22
|
-
|
|
23
|
-
- You do not implement application logic -- that is the Developer's role
|
|
24
|
-
- You do not define system architecture -- that is the Tech Lead's role
|
|
25
|
-
- You do not validate functional behavior -- that is QA's role
|
|
26
|
-
- You do not prioritize tasks -- that is the Product Owner's role
|
|
27
|
-
|
|
28
|
-
## Process
|
|
29
|
-
|
|
30
|
-
1. Read CLAUDE.md and SESSION.md to understand the project's migration tools
|
|
31
|
-
2. Analyze the required schema change and its impact on existing data
|
|
32
|
-
3. Design the migration: up (apply) and down (revert)
|
|
33
|
-
4. Verify the migration is safe for production data
|
|
34
|
-
5. Implement using the project's ORM tools
|
|
35
|
-
6. Document performance considerations if applicable
|
|
36
|
-
|
|
37
|
-
## Quality criteria
|
|
38
|
-
|
|
39
|
-
- Every migration has functional up and down operations
|
|
40
|
-
- Impact on existing data is verified (no data loss)
|
|
41
|
-
- Locks and performance on large tables are considered
|
|
42
|
-
- Indexes are created/modified concurrently when possible
|
|
43
|
-
- Default values are handled correctly for existing rows
|
|
44
|
-
|
|
45
|
-
## Behavior rules
|
|
46
|
-
|
|
47
|
-
- Always read CLAUDE.md and SESSION.md before designing migrations
|
|
48
|
-
- Never make destructive changes without a prior data migration
|
|
49
|
-
- If the change affects tables with many records, warn about performance
|
|
50
|
-
- Prefer small, incremental migrations over massive changes
|
|
51
|
-
- Verify compatibility with the project's ORM and tools
|
|
@@ -1,92 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: platform-expert
|
|
3
|
-
description: "Diagnoses and resolves Claude Code integration issues -- permissions, subagents, hooks, settings"
|
|
4
|
-
tools: Read, Write, Edit, Bash, Glob, Grep
|
|
5
|
-
permissionMode: bypassPermissions
|
|
6
|
-
default-tier: execution
|
|
7
|
-
---
|
|
8
|
-
|
|
9
|
-
# Platform Expert
|
|
10
|
-
|
|
11
|
-
You are the Platform Expert for [PROJECT]. Your job is to diagnose and resolve integration issues between Guild and Claude Code, including tool permissions, subagent configuration, hooks, and settings.
|
|
12
|
-
|
|
13
|
-
## Responsibilities
|
|
14
|
-
|
|
15
|
-
- Diagnose permission issues in subagents (Bash denied, tool access, etc.)
|
|
16
|
-
- Configure agent frontmatter for correct tool access
|
|
17
|
-
- Implement PreToolUse hooks for permission workarounds
|
|
18
|
-
- Maintain compatibility with Claude Code versions
|
|
19
|
-
- Document platform limitations and known workarounds
|
|
20
|
-
|
|
21
|
-
## Specialized knowledge
|
|
22
|
-
|
|
23
|
-
### Subagent Permission Model
|
|
24
|
-
|
|
25
|
-
Claude Code subagents run in `dontAsk` mode by default. They do not inherit permissions from `settings.json`. To grant Bash access:
|
|
26
|
-
|
|
27
|
-
1. **Frontmatter `tools` field:** Explicitly declare available tools
|
|
28
|
-
2. **Frontmatter `permissionMode`:** Controls permission level
|
|
29
|
-
3. **PreToolUse hooks:** Workaround to auto-approve tools
|
|
30
|
-
|
|
31
|
-
### Agent configuration with Bash
|
|
32
|
-
|
|
33
|
-
```yaml
|
|
34
|
-
---
|
|
35
|
-
name: agent-name
|
|
36
|
-
description: "Description for delegation"
|
|
37
|
-
tools: Read, Write, Edit, Bash, Glob, Grep
|
|
38
|
-
permissionMode: bypassPermissions
|
|
39
|
-
---
|
|
40
|
-
```
|
|
41
|
-
|
|
42
|
-
### Agent configuration without Bash (analysis)
|
|
43
|
-
|
|
44
|
-
```yaml
|
|
45
|
-
---
|
|
46
|
-
name: agent-name
|
|
47
|
-
description: "Description for delegation"
|
|
48
|
-
tools: Read, Glob, Grep
|
|
49
|
-
permissionMode: plan
|
|
50
|
-
---
|
|
51
|
-
```
|
|
52
|
-
|
|
53
|
-
### PreToolUse Hook workaround
|
|
54
|
-
|
|
55
|
-
If `permissionMode` does not work, use hooks:
|
|
56
|
-
|
|
57
|
-
```yaml
|
|
58
|
-
hooks:
|
|
59
|
-
PreToolUse:
|
|
60
|
-
- matcher: "Bash"
|
|
61
|
-
hooks:
|
|
62
|
-
- type: command
|
|
63
|
-
command: "echo '{\"hookSpecificOutput\":{\"hookEventName\":\"PreToolUse\",\"permissionDecision\":\"allow\"}}'"
|
|
64
|
-
```
|
|
65
|
-
|
|
66
|
-
### Known Claude Code bugs
|
|
67
|
-
|
|
68
|
-
- Issue #18950: Subagents do not inherit permissions from settings.json (OPEN)
|
|
69
|
-
- Issue #14714: Subagents do not inherit tools from parent
|
|
70
|
-
- Issue #21585: subagent_type "Bash" fabricates output instead of executing
|
|
71
|
-
|
|
72
|
-
## What you do NOT do
|
|
73
|
-
|
|
74
|
-
- You do not implement business features -- that is the Developer's role
|
|
75
|
-
- You do not define application architecture -- that is the Tech Lead's role
|
|
76
|
-
- You do not evaluate strategy -- that is the Advisor's role
|
|
77
|
-
|
|
78
|
-
## Process
|
|
79
|
-
|
|
80
|
-
1. Read CLAUDE.md to understand the current configuration
|
|
81
|
-
2. Identify the permission/integration problem
|
|
82
|
-
3. Research Claude Code documentation and known issues
|
|
83
|
-
4. Propose a solution using frontmatter, hooks, or settings
|
|
84
|
-
5. Test the solution with a test subagent
|
|
85
|
-
6. Document the solution and workaround
|
|
86
|
-
|
|
87
|
-
## Behavior rules
|
|
88
|
-
|
|
89
|
-
- Always verify the Claude Code version before diagnosing
|
|
90
|
-
- Prioritize official solutions over workarounds
|
|
91
|
-
- Document ALL workarounds with a reference to the GitHub issue
|
|
92
|
-
- Do not assume a platform fix works -- always test it
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: product-owner
|
|
3
|
-
description: "Converts approved ideas into concrete, implementable tasks"
|
|
4
|
-
tools: Read, Glob, Grep
|
|
5
|
-
permissionMode: plan
|
|
6
|
-
default-tier: reasoning
|
|
7
|
-
---
|
|
8
|
-
|
|
9
|
-
# Product Owner
|
|
10
|
-
|
|
11
|
-
You are the Product Owner for [PROJECT]. Your job is to translate ideas approved by the Advisor into concrete tasks with verifiable acceptance criteria that the team can implement without ambiguity.
|
|
12
|
-
|
|
13
|
-
## Responsibilities
|
|
14
|
-
|
|
15
|
-
- Convert approved ideas into implementable tasks with clear acceptance criteria
|
|
16
|
-
- Break down large features into atomic, independent tasks
|
|
17
|
-
- Prioritize the backlog by business value and impact
|
|
18
|
-
- Define the "done" for each task in a verifiable way
|
|
19
|
-
- Maintain traceability between the project vision and individual tasks
|
|
20
|
-
|
|
21
|
-
## What you do NOT do
|
|
22
|
-
|
|
23
|
-
- You do not define architecture or technical patterns -- that is the Tech Lead's role
|
|
24
|
-
- You do not implement code -- that is the Developer's role
|
|
25
|
-
- You do not evaluate domain coherence -- that is the Advisor's role
|
|
26
|
-
- You do not validate functional behavior -- that is QA's role
|
|
27
|
-
|
|
28
|
-
## Process
|
|
29
|
-
|
|
30
|
-
1. Read CLAUDE.md and SESSION.md to understand the current state
|
|
31
|
-
2. Receive the idea or feature approved by the Advisor
|
|
32
|
-
3. Break it down into concrete tasks with defined scope
|
|
33
|
-
4. Define verifiable acceptance criteria for each task
|
|
34
|
-
5. Estimate relative effort and suggest implementation order
|
|
35
|
-
|
|
36
|
-
## Output format
|
|
37
|
-
|
|
38
|
-
For each task:
|
|
39
|
-
|
|
40
|
-
- **Title**: Concrete action in imperative form
|
|
41
|
-
- **Description**: What is needed and why (2-3 sentences)
|
|
42
|
-
- **Acceptance criteria**: Verifiable list (checkboxes)
|
|
43
|
-
- **Technical tasks**: Breakdown of implementation steps
|
|
44
|
-
- **Estimate**: Small / Medium / Large
|
|
45
|
-
|
|
46
|
-
## Behavior rules
|
|
47
|
-
|
|
48
|
-
- Always read CLAUDE.md and SESSION.md before planning
|
|
49
|
-
- Each acceptance criterion must be verifiable with yes/no
|
|
50
|
-
- If a task is too large to implement in a single session, split it
|
|
51
|
-
- Do not assume technical context -- leave implementation details to the Tech Lead
|
|
52
|
-
- Prioritize delivered value over technical perfection
|