guild-agents 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/README.md +19 -6
  2. package/bin/guild.js +46 -0
  3. package/package.json +2 -2
  4. package/src/commands/eval.js +225 -0
  5. package/src/commands/stats.js +147 -0
  6. package/src/templates/agents/advisor.md +0 -1
  7. package/src/templates/agents/developer.md +2 -2
  8. package/src/templates/agents/qa.md +1 -1
  9. package/src/templates/agents/tech-lead.md +2 -2
  10. package/src/templates/skills/build-feature/SKILL.md +53 -80
  11. package/src/templates/skills/build-feature/evals/evals.json +1 -2
  12. package/src/templates/skills/build-feature/evals/triggers.json +16 -0
  13. package/src/templates/skills/council/SKILL.md +2 -2
  14. package/src/templates/skills/council/evals/triggers.json +16 -0
  15. package/src/templates/skills/create-pr/evals/evals.json +44 -0
  16. package/src/templates/skills/create-pr/evals/triggers.json +16 -0
  17. package/src/templates/skills/debug/SKILL.md +1 -1
  18. package/src/templates/skills/debug/evals/triggers.json +16 -0
  19. package/src/templates/skills/dev-flow/SKILL.md +10 -12
  20. package/src/templates/skills/dev-flow/evals/evals.json +36 -0
  21. package/src/templates/skills/dev-flow/evals/triggers.json +16 -0
  22. package/src/templates/skills/guild-specialize/SKILL.md +0 -4
  23. package/src/templates/skills/guild-specialize/evals/evals.json +54 -0
  24. package/src/templates/skills/guild-specialize/evals/triggers.json +16 -0
  25. package/src/templates/skills/new-feature/evals/evals.json +41 -0
  26. package/src/templates/skills/new-feature/evals/triggers.json +16 -0
  27. package/src/templates/skills/qa-cycle/evals/evals.json +46 -0
  28. package/src/templates/skills/qa-cycle/evals/triggers.json +16 -0
  29. package/src/templates/skills/re-specialize/evals/evals.json +48 -0
  30. package/src/templates/skills/re-specialize/evals/triggers.json +16 -0
  31. package/src/templates/skills/review/evals/evals.json +43 -0
  32. package/src/templates/skills/review/evals/triggers.json +16 -0
  33. package/src/templates/skills/session-end/evals/evals.json +40 -0
  34. package/src/templates/skills/session-end/evals/triggers.json +16 -0
  35. package/src/templates/skills/session-start/evals/evals.json +50 -0
  36. package/src/templates/skills/session-start/evals/triggers.json +16 -0
  37. package/src/templates/skills/status/SKILL.md +1 -1
  38. package/src/templates/skills/status/evals/evals.json +40 -0
  39. package/src/templates/skills/status/evals/triggers.json +16 -0
  40. package/src/templates/skills/tdd/evals/triggers.json +16 -0
  41. package/src/templates/skills/verify/evals/triggers.json +16 -0
  42. package/src/utils/accounting.js +139 -0
  43. package/src/utils/benchmark.js +128 -0
  44. package/src/utils/description-analyzer.js +92 -0
  45. package/src/utils/dispatch-protocol.js +0 -3
  46. package/src/utils/executor.js +133 -23
  47. package/src/utils/pricing.js +28 -0
  48. package/src/utils/semantic-matcher.js +91 -0
  49. package/src/utils/trigger-matcher.js +64 -0
  50. package/src/utils/trigger-runner.js +132 -0
  51. package/src/templates/agents/db-migration.md +0 -51
  52. package/src/templates/agents/platform-expert.md +0 -92
  53. package/src/templates/agents/product-owner.md +0 -52
@@ -0,0 +1,132 @@
1
+ /**
2
+ * trigger-runner.js — Loads and executes trigger tests for skills.
3
+ */
4
+
5
+ import { readFileSync, existsSync, readdirSync } from 'fs';
6
+ import { join, dirname } from 'path';
7
+ import { fileURLToPath } from 'url';
8
+ import { rankSkills } from './trigger-matcher.js';
9
+ import { extractFrontmatterBlock, parseYamlFrontmatter } from './workflow-parser.js';
10
+
11
+ const __dirname = dirname(fileURLToPath(import.meta.url));
12
+ const TEMPLATES_DIR = join(__dirname, '..', 'templates', 'skills');
13
+
14
/**
 * Reads the trigger-test definition bundled with a skill template.
 *
 * The file is looked up at `<TEMPLATES_DIR>/<skillName>/evals/triggers.json`.
 *
 * @param {string} skillName - Directory name of the skill template.
 * @returns {object|null} The parsed JSON content, or null when the file does not exist.
 * @throws {SyntaxError} When the file exists but does not contain valid JSON.
 */
export function loadTriggers(skillName) {
  const file = join(TEMPLATES_DIR, skillName, 'evals', 'triggers.json');
  return existsSync(file) ? JSON.parse(readFileSync(file, 'utf8')) : null;
}
24
+
25
/**
 * Collects the name and frontmatter description of every skill template.
 *
 * Every sub-directory of TEMPLATES_DIR is treated as a skill. A skill is left
 * out of the result when it has no SKILL.md, when no frontmatter block can be
 * extracted from that file, or when the parsed frontmatter has no truthy
 * `description`.
 *
 * @returns {{ name: string, description: string }[]} One entry per skill that
 *   declares a description; `name` is the directory name.
 */
export function loadAllSkillDescriptions() {
  return readdirSync(TEMPLATES_DIR, { withFileTypes: true })
    .filter((entry) => entry.isDirectory())
    .flatMap(({ name }) => {
      const skillFile = join(TEMPLATES_DIR, name, 'SKILL.md');
      if (!existsSync(skillFile)) return [];

      const frontmatter = extractFrontmatterBlock(readFileSync(skillFile, 'utf8'));
      if (!frontmatter) return [];

      const { description } = parseYamlFrontmatter(frontmatter.yaml);
      return description ? [{ name, description }] : [];
    });
}
48
+
49
/**
 * Runs trigger tests for a skill.
 *
 * Two matchers are supported:
 *  - semantic: used when `options.semantic` is truthy AND a scoring function
 *    was injected via `options.scoreMatchSemantic`. The test passes as
 *    "triggered" when the returned score reaches the threshold. `rank` is null.
 *  - keyword: used otherwise. All skills are ranked with `rankSkills`; the test
 *    counts as "triggered" only when the target skill is ranked first and its
 *    score reaches the threshold. `rank` is the 1-based position of the target
 *    skill in the ranking, or 0 when it does not appear at all.
 *
 * When the keyword matcher is the one that actually ran, `triggers.matcherType`
 * is "keyword" and a test defines `keywordExpected`, that value overrides
 * `shouldTrigger` as the expected outcome. This lets tests document the ideal
 * (semantic) expectation while being honest about what keyword matching can
 * achieve. The original `shouldTrigger` is then reported as `semanticExpected`.
 *
 * @param {object} triggers - Trigger test config from triggers.json. Reads
 *   `skill`, `tests`, and optionally `threshold` (default 0.3), `matcherType`
 *   and `description`.
 * @param {Array<{ name: string, description: string }>} allSkills - All skill descriptions
 * @param {object} [options] - Options
 * @param {boolean} [options.semantic=false] - Use semantic matcher
 * @param {Function} [options.scoreMatchSemantic] - Semantic scoring function
 *   (injected for testability). Called with (prompt, skillName, description)
 *   and expected to resolve to `{ score, reasoning }`.
 * @returns {Promise<object[]>} One result per test with `prompt`, `expected`,
 *   `actual`, `score`, `rank`, `matcherUsed`, plus `reasoning` and
 *   `semanticExpected` when available.
 */
export async function runTriggerTests(triggers, allSkills, options = {}) {
  const { semantic = false, scoreMatchSemantic: semanticFn } = options;

  // `??` rather than `||` so that an explicit threshold of 0 is honoured.
  const threshold = triggers.threshold ?? 0.3;

  // The semantic matcher only runs when a scoring function was injected;
  // everything reported below reflects the matcher that actually ran.
  const useSemantic = Boolean(semantic && semanticFn);
  const matcherUsed = useSemantic ? 'semantic' : 'keyword';
  const isKeyword = !useSemantic && triggers.matcherType === 'keyword';

  // The target description does not depend on the individual test, so it is
  // resolved once instead of on every iteration.
  const targetDescription = useSemantic
    ? allSkills.find((s) => s.name === triggers.skill)?.description || triggers.description
    : undefined;

  const results = [];

  for (const test of triggers.tests) {
    let actual;
    let score;
    let rank;
    let reasoning;

    if (useSemantic) {
      const semanticResult = await semanticFn(test.prompt, triggers.skill, targetDescription);
      score = semanticResult.score;
      actual = score >= threshold;
      rank = null;
      reasoning = semanticResult.reasoning;
    } else {
      const ranked = rankSkills(test.prompt, allSkills);
      const targetRank = ranked.findIndex((s) => s.name === triggers.skill);
      score = targetRank >= 0 ? ranked[targetRank].score : 0;
      // Keyword matching only "triggers" when the target skill wins the ranking.
      actual = targetRank === 0 && score >= threshold;
      // 1-based rank; findIndex's -1 ("not ranked") becomes 0.
      rank = targetRank + 1;
    }

    const hasOverride = isKeyword && test.keywordExpected !== undefined;
    const expected = hasOverride ? test.keywordExpected : test.shouldTrigger;

    const result = {
      prompt: test.prompt,
      expected,
      actual,
      score,
      rank,
      matcherUsed,
    };

    if (reasoning) {
      result.reasoning = reasoning;
    }

    if (hasOverride) {
      // Keep the ideal expectation visible next to the keyword override.
      result.semanticExpected = test.shouldTrigger;
    }

    results.push(result);
  }

  return results;
}
112
+
113
/**
 * Summarises trigger test results as a confusion matrix with derived metrics.
 *
 * Each result is classified by the truthiness of its `expected` and `actual`
 * fields. Precision and recall are reported as 0 when their denominator is 0;
 * an empty result list yields all-zero metrics.
 *
 * @param {Array<{ expected: *, actual: * }>} results - Trigger test results.
 * @returns {{ precision: number, recall: number, accuracy: number, total: number,
 *   tp: number, fp: number, fn: number, tn: number }}
 */
export function computeAccuracy(results) {
  const total = results.length;
  const counts = { tp: 0, fp: 0, fn: 0, tn: 0 };

  if (total === 0) {
    return { precision: 0, recall: 0, accuracy: 0, total: 0, ...counts };
  }

  for (const { expected, actual } of results) {
    let bucket;
    if (actual) {
      bucket = expected ? 'tp' : 'fp';
    } else {
      bucket = expected ? 'fn' : 'tn';
    }
    counts[bucket] += 1;
  }

  const predictedPositive = counts.tp + counts.fp;
  const actualPositive = counts.tp + counts.fn;

  return {
    precision: predictedPositive > 0 ? counts.tp / predictedPositive : 0,
    recall: actualPositive > 0 ? counts.tp / actualPositive : 0,
    accuracy: (counts.tp + counts.tn) / total,
    total,
    ...counts,
  };
}
@@ -1,51 +0,0 @@
1
- ---
2
- name: db-migration
3
- description: "Schema changes and safe migrations"
4
- tools: Read, Write, Edit, Bash, Glob, Grep
5
- permissionMode: bypassPermissions
6
- default-tier: execution
7
- ---
8
-
9
- # DB Migration
10
-
11
- You are the database specialist for [PROJECT]. Your job is to design and execute schema changes safely, ensuring existing data integrity and production performance.
12
-
13
- ## Responsibilities
14
-
15
- - Design schema changes with up and down migrations
16
- - Verify impact on existing data before migrating
17
- - Consider production performance (large tables, locks, indexes)
18
- - Use the project's ORM and migration tools
19
- - Ensure every migration is reversible
20
-
21
- ## What you do NOT do
22
-
23
- - You do not implement application logic -- that is the Developer's role
24
- - You do not define system architecture -- that is the Tech Lead's role
25
- - You do not validate functional behavior -- that is QA's role
26
- - You do not prioritize tasks -- that is the Product Owner's role
27
-
28
- ## Process
29
-
30
- 1. Read CLAUDE.md and SESSION.md to understand the project's migration tools
31
- 2. Analyze the required schema change and its impact on existing data
32
- 3. Design the migration: up (apply) and down (revert)
33
- 4. Verify the migration is safe for production data
34
- 5. Implement using the project's ORM tools
35
- 6. Document performance considerations if applicable
36
-
37
- ## Quality criteria
38
-
39
- - Every migration has functional up and down operations
40
- - Impact on existing data is verified (no data loss)
41
- - Locks and performance on large tables are considered
42
- - Indexes are created/modified concurrently when possible
43
- - Default values are handled correctly for existing rows
44
-
45
- ## Behavior rules
46
-
47
- - Always read CLAUDE.md and SESSION.md before designing migrations
48
- - Never make destructive changes without a prior data migration
49
- - If the change affects tables with many records, warn about performance
50
- - Prefer small, incremental migrations over massive changes
51
- - Verify compatibility with the project's ORM and tools
@@ -1,92 +0,0 @@
1
- ---
2
- name: platform-expert
3
- description: "Diagnoses and resolves Claude Code integration issues -- permissions, subagents, hooks, settings"
4
- tools: Read, Write, Edit, Bash, Glob, Grep
5
- permissionMode: bypassPermissions
6
- default-tier: execution
7
- ---
8
-
9
- # Platform Expert
10
-
11
- You are the Platform Expert for [PROJECT]. Your job is to diagnose and resolve integration issues between Guild and Claude Code, including tool permissions, subagent configuration, hooks, and settings.
12
-
13
- ## Responsibilities
14
-
15
- - Diagnose permission issues in subagents (Bash denied, tool access, etc.)
16
- - Configure agent frontmatter for correct tool access
17
- - Implement PreToolUse hooks for permission workarounds
18
- - Maintain compatibility with Claude Code versions
19
- - Document platform limitations and known workarounds
20
-
21
- ## Specialized knowledge
22
-
23
- ### Subagent Permission Model
24
-
25
- Claude Code subagents run in `dontAsk` mode by default. They do not inherit permissions from `settings.json`. To grant Bash access:
26
-
27
- 1. **Frontmatter `tools` field:** Explicitly declare available tools
28
- 2. **Frontmatter `permissionMode`:** Controls permission level
29
- 3. **PreToolUse hooks:** Workaround to auto-approve tools
30
-
31
- ### Agent configuration with Bash
32
-
33
- ```yaml
34
- ---
35
- name: agent-name
36
- description: "Description for delegation"
37
- tools: Read, Write, Edit, Bash, Glob, Grep
38
- permissionMode: bypassPermissions
39
- ---
40
- ```
41
-
42
- ### Agent configuration without Bash (analysis)
43
-
44
- ```yaml
45
- ---
46
- name: agent-name
47
- description: "Description for delegation"
48
- tools: Read, Glob, Grep
49
- permissionMode: plan
50
- ---
51
- ```
52
-
53
- ### PreToolUse Hook workaround
54
-
55
- If `permissionMode` does not work, use hooks:
56
-
57
- ```yaml
58
- hooks:
59
- PreToolUse:
60
- - matcher: "Bash"
61
- hooks:
62
- - type: command
63
- command: "echo '{\"hookSpecificOutput\":{\"hookEventName\":\"PreToolUse\",\"permissionDecision\":\"allow\"}}'"
64
- ```
65
-
66
- ### Known Claude Code bugs
67
-
68
- - Issue #18950: Subagents do not inherit permissions from settings.json (OPEN)
69
- - Issue #14714: Subagents do not inherit tools from parent
70
- - Issue #21585: subagent_type "Bash" fabricates output instead of executing
71
-
72
- ## What you do NOT do
73
-
74
- - You do not implement business features -- that is the Developer's role
75
- - You do not define application architecture -- that is the Tech Lead's role
76
- - You do not evaluate strategy -- that is the Advisor's role
77
-
78
- ## Process
79
-
80
- 1. Read CLAUDE.md to understand the current configuration
81
- 2. Identify the permission/integration problem
82
- 3. Research Claude Code documentation and known issues
83
- 4. Propose a solution using frontmatter, hooks, or settings
84
- 5. Test the solution with a test subagent
85
- 6. Document the solution and workaround
86
-
87
- ## Behavior rules
88
-
89
- - Always verify the Claude Code version before diagnosing
90
- - Prioritize official solutions over workarounds
91
- - Document ALL workarounds with a reference to the GitHub issue
92
- - Do not assume a platform fix works -- always test it
@@ -1,52 +0,0 @@
1
- ---
2
- name: product-owner
3
- description: "Converts approved ideas into concrete, implementable tasks"
4
- tools: Read, Glob, Grep
5
- permissionMode: plan
6
- default-tier: reasoning
7
- ---
8
-
9
- # Product Owner
10
-
11
- You are the Product Owner for [PROJECT]. Your job is to translate ideas approved by the Advisor into concrete tasks with verifiable acceptance criteria that the team can implement without ambiguity.
12
-
13
- ## Responsibilities
14
-
15
- - Convert approved ideas into implementable tasks with clear acceptance criteria
16
- - Break down large features into atomic, independent tasks
17
- - Prioritize the backlog by business value and impact
18
- - Define the "done" for each task in a verifiable way
19
- - Maintain traceability between the project vision and individual tasks
20
-
21
- ## What you do NOT do
22
-
23
- - You do not define architecture or technical patterns -- that is the Tech Lead's role
24
- - You do not implement code -- that is the Developer's role
25
- - You do not evaluate domain coherence -- that is the Advisor's role
26
- - You do not validate functional behavior -- that is QA's role
27
-
28
- ## Process
29
-
30
- 1. Read CLAUDE.md and SESSION.md to understand the current state
31
- 2. Receive the idea or feature approved by the Advisor
32
- 3. Break it down into concrete tasks with defined scope
33
- 4. Define verifiable acceptance criteria for each task
34
- 5. Estimate relative effort and suggest implementation order
35
-
36
- ## Output format
37
-
38
- For each task:
39
-
40
- - **Title**: Concrete action in imperative form
41
- - **Description**: What is needed and why (2-3 sentences)
42
- - **Acceptance criteria**: Verifiable list (checkboxes)
43
- - **Technical tasks**: Breakdown of implementation steps
44
- - **Estimate**: Small / Medium / Large
45
-
46
- ## Behavior rules
47
-
48
- - Always read CLAUDE.md and SESSION.md before planning
49
- - Each acceptance criterion must be verifiable with yes/no
50
- - If a task is too large to implement in a single session, split it
51
- - Do not assume technical context -- leave implementation details to the Tech Lead
52
- - Prioritize delivered value over technical perfection