guild-agents 1.1.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -1
- package/bin/guild.js +84 -0
- package/package.json +5 -2
- package/src/commands/init.js +8 -1
- package/src/commands/workspace.js +92 -0
- package/src/templates/skills/build-feature/evals/evals.json +53 -0
- package/src/templates/skills/council/SKILL.md +27 -6
- package/src/templates/skills/council/evals/evals.json +41 -0
- package/src/templates/skills/debug/SKILL.md +145 -0
- package/src/templates/skills/guild-specialize/SKILL.md +20 -0
- package/src/templates/skills/re-specialize/SKILL.md +153 -0
- package/src/templates/skills/tdd/SKILL.md +159 -0
- package/src/templates/skills/verify/SKILL.md +114 -0
- package/src/utils/eval-runner.js +139 -0
- package/src/utils/generators.js +11 -9
- package/src/utils/workspace.js +171 -0
- package/src/utils/zones.js +39 -0
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: re-specialize
|
|
3
|
+
description: "Incremental re-specialization — re-scans the project and updates only auto-generated zones in CLAUDE.md and agents"
|
|
4
|
+
user-invocable: true
|
|
5
|
+
workflow:
|
|
6
|
+
version: 1
|
|
7
|
+
steps:
|
|
8
|
+
- id: read-current
|
|
9
|
+
role: system
|
|
10
|
+
intent: "Read current CLAUDE.md and agent files to identify existing zones."
|
|
11
|
+
commands: [cat CLAUDE.md]
|
|
12
|
+
produces: [current-claude-md, current-agents]
|
|
13
|
+
- id: explore-project
|
|
14
|
+
role: system
|
|
15
|
+
intent: "Scan project for current stack, dependencies, architecture, conventions."
|
|
16
|
+
commands: [ls -R src/, cat package.json]
|
|
17
|
+
produces: [detected-stack, detected-architecture, detected-conventions]
|
|
18
|
+
gate: true
|
|
19
|
+
- id: check-zones
|
|
20
|
+
role: system
|
|
21
|
+
intent: "Check if CLAUDE.md has guild zone markers. If not, offer to inject them."
|
|
22
|
+
requires: [current-claude-md]
|
|
23
|
+
produces: [zone-status]
|
|
24
|
+
gate: true
|
|
25
|
+
- id: regenerate-zones
|
|
26
|
+
role: tech-lead
|
|
27
|
+
intent: "Generate new content for each auto zone based on fresh project scan. Present diff to user."
|
|
28
|
+
requires: [current-claude-md, detected-stack, detected-architecture, detected-conventions, zone-status]
|
|
29
|
+
produces: [updated-claude-md, zone-diffs]
|
|
30
|
+
model-tier: reasoning
|
|
31
|
+
- id: update-agents
|
|
32
|
+
role: tech-lead
|
|
33
|
+
intent: "Update agent-context zones in agent files with fresh project context."
|
|
34
|
+
requires: [detected-stack, detected-architecture, detected-conventions]
|
|
35
|
+
produces: [updated-agents]
|
|
36
|
+
model-tier: execution
|
|
37
|
+
- id: confirm
|
|
38
|
+
role: system
|
|
39
|
+
intent: "Present summary of changes and get user confirmation."
|
|
40
|
+
requires: [zone-diffs, updated-agents]
|
|
41
|
+
produces: [confirmation]
|
|
42
|
+
gate: true
|
|
43
|
+
- id: commit
|
|
44
|
+
role: system
|
|
45
|
+
intent: "Commit re-specialized files."
|
|
46
|
+
commands: [git add CLAUDE.md .claude/agents/*.md, git commit -m "chore: re-specialize via guild-re-specialize"]
|
|
47
|
+
requires: [updated-claude-md, updated-agents, confirmation]
|
|
48
|
+
produces: [re-specialize-commit]
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
# Re-Specialize
|
|
52
|
+
|
|
53
|
+
Incrementally updates auto-generated content in CLAUDE.md and agent files
|
|
54
|
+
without touching user customizations. Uses protected zone markers to identify
|
|
55
|
+
what can be safely regenerated.
|
|
56
|
+
|
|
57
|
+
## When to use
|
|
58
|
+
|
|
59
|
+
- When project dependencies have changed (new framework, updated versions)
|
|
60
|
+
- When architecture has evolved (new patterns, restructured folders)
|
|
61
|
+
- When agents need refreshed context about the project
|
|
62
|
+
- Periodically to keep CLAUDE.md in sync with the actual codebase
|
|
63
|
+
|
|
64
|
+
## Process
|
|
65
|
+
|
|
66
|
+
### Step 1 -- Read current state
|
|
67
|
+
|
|
68
|
+
Read CLAUDE.md and all agent files in `.claude/agents/`:
|
|
69
|
+
|
|
70
|
+
- Identify existing zone markers (`<!-- guild:auto-start:ID -->` / `<!-- guild:auto-end:ID -->`)
|
|
71
|
+
- Note which zones exist and their current content
|
|
72
|
+
- Identify any user customizations outside of zones
|
|
73
|
+
|
|
74
|
+
### Step 2 -- Explore the project
|
|
75
|
+
|
|
76
|
+
Same exploration as guild-specialize:
|
|
77
|
+
|
|
78
|
+
- Scan dependency files for current stack and versions
|
|
79
|
+
- Analyze project structure and architecture patterns
|
|
80
|
+
- Detect code conventions from linter/formatter configs
|
|
81
|
+
- Check environment variable examples
|
|
82
|
+
|
|
83
|
+
### Step 3 -- Check zone markers
|
|
84
|
+
|
|
85
|
+
If CLAUDE.md has zone markers, proceed to regeneration.
|
|
86
|
+
|
|
87
|
+
If CLAUDE.md does NOT have zone markers (legacy project):
|
|
88
|
+
|
|
89
|
+
- Offer to inject markers around the auto-generated sections
|
|
90
|
+
- Show the user where markers would be placed
|
|
91
|
+
- Require explicit confirmation before modifying
|
|
92
|
+
- If user declines, abort gracefully
|
|
93
|
+
|
|
94
|
+
### Step 4 -- Regenerate zone content
|
|
95
|
+
|
|
96
|
+
Invoke the Tech Lead agent using Task tool with `model: "opus"` (reasoning tier):
|
|
97
|
+
|
|
98
|
+
- Generate fresh content for each zone based on the project scan
|
|
99
|
+
- Compare new content with existing zone content
|
|
100
|
+
- Present a diff for each zone to the user
|
|
101
|
+
- Only replace zones where content has actually changed
|
|
102
|
+
|
|
103
|
+
Zones to regenerate:
|
|
104
|
+
|
|
105
|
+
| Zone ID | Content |
|
|
106
|
+
|----------------|--------------------------------------------|
|
|
107
|
+
| `structure` | Project folder structure with descriptions |
|
|
108
|
+
| `architecture` | Architecture patterns and design decisions |
|
|
109
|
+
| `conventions` | Code conventions from linter/formatter |
|
|
110
|
+
| `env-vars` | Environment variables from .env.example |
|
|
111
|
+
|
|
112
|
+
### Step 5 -- Update agent context
|
|
113
|
+
|
|
114
|
+
Invoke the Tech Lead agent using Task tool with `model: "sonnet"` (execution tier):
|
|
115
|
+
|
|
116
|
+
- For each agent in `.claude/agents/*.md`, update the `agent-context` zone
|
|
117
|
+
- If no `agent-context` zone exists, append one at the bottom
|
|
118
|
+
- Preserve everything outside the zone (role definition, process, rules)
|
|
119
|
+
|
|
120
|
+
### Step 6 -- Confirm changes
|
|
121
|
+
|
|
122
|
+
Present a summary:
|
|
123
|
+
|
|
124
|
+
```text
|
|
125
|
+
Re-specialization complete for [project-name]
|
|
126
|
+
|
|
127
|
+
Zones updated:
|
|
128
|
+
- structure: [changed/unchanged]
|
|
129
|
+
- architecture: [changed/unchanged]
|
|
130
|
+
- conventions: [changed/unchanged]
|
|
131
|
+
- env-vars: [changed/unchanged]
|
|
132
|
+
|
|
133
|
+
Agents updated: [count] of [total]
|
|
134
|
+
|
|
135
|
+
Review the changes above. Confirm to commit.
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Step 7 -- Commit
|
|
139
|
+
|
|
140
|
+
Commit all changes as an atomic commit:
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
git add CLAUDE.md .claude/agents/*.md
|
|
144
|
+
git commit -m "chore: re-specialize via guild-re-specialize"
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Important Notes
|
|
148
|
+
|
|
149
|
+
- NEVER modify content outside of zone markers
|
|
150
|
+
- NEVER read real `.env` files -- only `.env.example`
|
|
151
|
+
- If a zone's content hasn't changed, skip it (no-op)
|
|
152
|
+
- Present diffs before applying changes
|
|
153
|
+
- User confirmation is required before committing
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: tdd
|
|
3
|
+
description: "Discipline skill — TDD red-green-refactor cycle. Use when implementing any feature or bugfix, before writing implementation code."
|
|
4
|
+
user-invocable: true
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Test-Driven Development (TDD)
|
|
8
|
+
|
|
9
|
+
Write the test first. Watch it fail. Write minimal code to pass.
|
|
10
|
+
|
|
11
|
+
**Core principle:** If you didn't watch the test fail, you don't know if it tests the right thing.
|
|
12
|
+
|
|
13
|
+
## Usage
|
|
14
|
+
|
|
15
|
+
`/tdd`
|
|
16
|
+
|
|
17
|
+
Invoke this skill before implementing any feature or bugfix. It establishes the discipline for your implementation session.
|
|
18
|
+
|
|
19
|
+
## When to use
|
|
20
|
+
|
|
21
|
+
- New features
|
|
22
|
+
- Bug fixes
|
|
23
|
+
- Refactoring
|
|
24
|
+
- Behavior changes
|
|
25
|
+
|
|
26
|
+
**Exceptions (ask the user):** throwaway prototypes, generated code, configuration files.
|
|
27
|
+
|
|
28
|
+
## The Iron Law
|
|
29
|
+
|
|
30
|
+
```text
|
|
31
|
+
NO PRODUCTION CODE WITHOUT A FAILING TEST FIRST
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Write code before the test? Delete it. Start over.
|
|
35
|
+
|
|
36
|
+
- Don't keep it as "reference"
|
|
37
|
+
- Don't "adapt" it while writing tests
|
|
38
|
+
- Don't look at it
|
|
39
|
+
- Delete means delete
|
|
40
|
+
|
|
41
|
+
## Red-Green-Refactor
|
|
42
|
+
|
|
43
|
+
### RED - Write Failing Test
|
|
44
|
+
|
|
45
|
+
Write one minimal test showing what should happen.
|
|
46
|
+
|
|
47
|
+
**Requirements:**
|
|
48
|
+
|
|
49
|
+
- One behavior per test
|
|
50
|
+
- Clear name that describes behavior
|
|
51
|
+
- Real code (no mocks unless unavoidable)
|
|
52
|
+
|
|
53
|
+
**Run the test. Confirm:**
|
|
54
|
+
|
|
55
|
+
- Test fails (not errors)
|
|
56
|
+
- Failure message is expected
|
|
57
|
+
- Fails because feature missing (not typos)
|
|
58
|
+
|
|
59
|
+
Test passes? You're testing existing behavior. Fix the test.
|
|
60
|
+
|
|
61
|
+
### GREEN - Minimal Code
|
|
62
|
+
|
|
63
|
+
Write the simplest code to pass the test.
|
|
64
|
+
|
|
65
|
+
Don't add features, refactor other code, or "improve" beyond the test.
|
|
66
|
+
|
|
67
|
+
**Run the test. Confirm:**
|
|
68
|
+
|
|
69
|
+
- Test passes
|
|
70
|
+
- Other tests still pass
|
|
71
|
+
- Output pristine (no errors, warnings)
|
|
72
|
+
|
|
73
|
+
Test fails? Fix code, not test.
|
|
74
|
+
|
|
75
|
+
### REFACTOR - Clean Up
|
|
76
|
+
|
|
77
|
+
After green only:
|
|
78
|
+
|
|
79
|
+
- Remove duplication
|
|
80
|
+
- Improve names
|
|
81
|
+
- Extract helpers
|
|
82
|
+
|
|
83
|
+
Keep tests green. Don't add behavior.
|
|
84
|
+
|
|
85
|
+
### Repeat
|
|
86
|
+
|
|
87
|
+
Next failing test for next behavior.
|
|
88
|
+
|
|
89
|
+
## Good Tests
|
|
90
|
+
|
|
91
|
+
| Quality | Good | Bad |
|
|
92
|
+
| ---------------- | ----------------------------------- | --------------------------------------------------- |
|
|
93
|
+
| **Minimal** | One thing. "and" in name? Split it. | `test('validates email and domain and whitespace')` |
|
|
94
|
+
| **Clear** | Name describes behavior | `test('test1')` |
|
|
95
|
+
| **Shows intent** | Demonstrates desired API | Obscures what code should do |
|
|
96
|
+
|
|
97
|
+
## Common Rationalizations
|
|
98
|
+
|
|
99
|
+
| Excuse | Reality |
|
|
100
|
+
| -------------------------------- | ----------------------------------------------------------------------- |
|
|
101
|
+
| "Too simple to test" | Simple code breaks. Test takes 30 seconds. |
|
|
102
|
+
| "I'll test after" | Tests passing immediately prove nothing. |
|
|
103
|
+
| "Tests after achieve same goals" | Tests-after = "what does this do?" Tests-first = "what should this do?" |
|
|
104
|
+
| "Already manually tested" | Ad-hoc is not systematic. No record, can't re-run. |
|
|
105
|
+
| "Deleting X hours is wasteful" | Sunk cost fallacy. Keeping unverified code is technical debt. |
|
|
106
|
+
| "Need to explore first" | Fine. Throw away exploration, start with TDD. |
|
|
107
|
+
| "Test hard = design unclear" | Listen to the test. Hard to test = hard to use. |
|
|
108
|
+
| "TDD will slow me down" | TDD is faster than debugging. |
|
|
109
|
+
|
|
110
|
+
## Red Flags - STOP and Start Over
|
|
111
|
+
|
|
112
|
+
- Code before test
|
|
113
|
+
- Test after implementation
|
|
114
|
+
- Test passes immediately
|
|
115
|
+
- Can't explain why test failed
|
|
116
|
+
- Rationalizing "just this once"
|
|
117
|
+
- "I already manually tested it"
|
|
118
|
+
- "Keep as reference"
|
|
119
|
+
|
|
120
|
+
**All of these mean: Delete code. Start over with TDD.**
|
|
121
|
+
|
|
122
|
+
## Bug Fix Flow
|
|
123
|
+
|
|
124
|
+
1. Write failing test reproducing the bug
|
|
125
|
+
2. Verify RED (test fails as expected)
|
|
126
|
+
3. Implement minimal fix
|
|
127
|
+
4. Verify GREEN (test passes, all tests pass)
|
|
128
|
+
5. Refactor if needed
|
|
129
|
+
|
|
130
|
+
Never fix bugs without a test.
|
|
131
|
+
|
|
132
|
+
## When Stuck
|
|
133
|
+
|
|
134
|
+
| Problem | Solution |
|
|
135
|
+
| ------------------------ | ----------------------------------------------------- |
|
|
136
|
+
| Don't know how to test | Write wished-for API. Write assertion first. |
|
|
137
|
+
| Test too complicated | Design too complicated. Simplify interface. |
|
|
138
|
+
| Must mock everything | Code too coupled. Use dependency injection. |
|
|
139
|
+
| Test setup huge | Extract helpers. Still complex? Simplify design. |
|
|
140
|
+
|
|
141
|
+
## Verification Checklist
|
|
142
|
+
|
|
143
|
+
Before marking work complete:
|
|
144
|
+
|
|
145
|
+
- [ ] Every new function/method has a test
|
|
146
|
+
- [ ] Watched each test fail before implementing
|
|
147
|
+
- [ ] Each test failed for expected reason
|
|
148
|
+
- [ ] Wrote minimal code to pass each test
|
|
149
|
+
- [ ] All tests pass
|
|
150
|
+
- [ ] Output pristine (no errors, warnings)
|
|
151
|
+
- [ ] Tests use real code (mocks only if unavoidable)
|
|
152
|
+
- [ ] Edge cases and errors covered
|
|
153
|
+
|
|
154
|
+
Can't check all boxes? You skipped TDD. Start over.
|
|
155
|
+
|
|
156
|
+
## Related Skills
|
|
157
|
+
|
|
158
|
+
- `/debug` — systematic debugging when tests reveal unexpected failures
|
|
159
|
+
- `/verify` — verification before claiming work is complete
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: verify
|
|
3
|
+
description: "Discipline skill — verification before completion. Use when about to claim work is complete, fixed, or passing, before committing or creating PRs."
|
|
4
|
+
user-invocable: true
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Verification Before Completion
|
|
8
|
+
|
|
9
|
+
Claiming work is complete without verification is dishonesty, not efficiency.
|
|
10
|
+
|
|
11
|
+
**Core principle:** Evidence before claims, always.
|
|
12
|
+
|
|
13
|
+
## Usage
|
|
14
|
+
|
|
15
|
+
`/verify`
|
|
16
|
+
|
|
17
|
+
Invoke this skill before claiming any work is done, before committing, and before creating PRs.
|
|
18
|
+
|
|
19
|
+
## When to use
|
|
20
|
+
|
|
21
|
+
**ALWAYS before:**
|
|
22
|
+
|
|
23
|
+
- Any success or completion claim
|
|
24
|
+
- Committing, pushing, or creating PRs
|
|
25
|
+
- Moving to the next task
|
|
26
|
+
- Expressing satisfaction about work state
|
|
27
|
+
|
|
28
|
+
## The Iron Law
|
|
29
|
+
|
|
30
|
+
```text
|
|
31
|
+
NO COMPLETION CLAIMS WITHOUT FRESH VERIFICATION EVIDENCE
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
If you haven't run the verification command in this step, you cannot claim it passes.
|
|
35
|
+
|
|
36
|
+
## The Gate Function
|
|
37
|
+
|
|
38
|
+
```text
|
|
39
|
+
BEFORE claiming any status:
|
|
40
|
+
|
|
41
|
+
1. IDENTIFY: What command proves this claim?
|
|
42
|
+
2. RUN: Execute the FULL command (fresh, complete)
|
|
43
|
+
3. READ: Full output, check exit code, count failures
|
|
44
|
+
4. VERIFY: Does output confirm the claim?
|
|
45
|
+
- If NO: State actual status with evidence
|
|
46
|
+
- If YES: State claim WITH evidence
|
|
47
|
+
5. ONLY THEN: Make the claim
|
|
48
|
+
|
|
49
|
+
Skip any step = lying, not verifying
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## What Each Claim Requires
|
|
53
|
+
|
|
54
|
+
| Claim | Requires | Not Sufficient |
|
|
55
|
+
| ---------------- | ------------------------------- | ------------------------------ |
|
|
56
|
+
| Tests pass | Test command output: 0 failures | Previous run, "should pass" |
|
|
57
|
+
| Linter clean | Linter output: 0 errors | Partial check, extrapolation |
|
|
58
|
+
| Build succeeds | Build command: exit 0 | Linter passing, logs look good |
|
|
59
|
+
| Bug fixed | Test original symptom: passes | Code changed, assumed fixed |
|
|
60
|
+
| Regression test | Red-green cycle verified | Test passes once |
|
|
61
|
+
| Requirements met | Line-by-line checklist | Tests passing |
|
|
62
|
+
|
|
63
|
+
## Red Flags - STOP
|
|
64
|
+
|
|
65
|
+
- Using "should", "probably", "seems to"
|
|
66
|
+
- Expressing satisfaction before verification ("Great!", "Perfect!", "Done!")
|
|
67
|
+
- About to commit/push/PR without verification
|
|
68
|
+
- Relying on partial verification
|
|
69
|
+
- Thinking "just this once"
|
|
70
|
+
- ANY wording implying success without having run verification
|
|
71
|
+
|
|
72
|
+
## Common Rationalizations
|
|
73
|
+
|
|
74
|
+
| Excuse | Reality |
|
|
75
|
+
| ---------------------------- | ------------------------------ |
|
|
76
|
+
| "Should work now" | RUN the verification |
|
|
77
|
+
| "I'm confident" | Confidence is not evidence |
|
|
78
|
+
| "Just this once" | No exceptions |
|
|
79
|
+
| "Linter passed" | Linter is not compiler |
|
|
80
|
+
| "Partial check is enough" | Partial proves nothing |
|
|
81
|
+
|
|
82
|
+
## Verification Patterns
|
|
83
|
+
|
|
84
|
+
**Tests:**
|
|
85
|
+
|
|
86
|
+
```text
|
|
87
|
+
OK: [Run test command] [See: 34/34 pass] "All tests pass"
|
|
88
|
+
BAD: "Should pass now" / "Looks correct"
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
**Build:**
|
|
92
|
+
|
|
93
|
+
```text
|
|
94
|
+
OK: [Run build] [See: exit 0] "Build passes"
|
|
95
|
+
BAD: "Linter passed" (linter doesn't check compilation)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
**Requirements:**
|
|
99
|
+
|
|
100
|
+
```text
|
|
101
|
+
OK: Re-read plan -> Create checklist -> Verify each -> Report gaps or completion
|
|
102
|
+
BAD: "Tests pass, phase complete"
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## The Bottom Line
|
|
106
|
+
|
|
107
|
+
Run the command. Read the output. THEN claim the result.
|
|
108
|
+
|
|
109
|
+
No shortcuts. Non-negotiable.
|
|
110
|
+
|
|
111
|
+
## Related Skills
|
|
112
|
+
|
|
113
|
+
- `/tdd` — TDD ensures tests exist before claiming code works
|
|
114
|
+
- `/debug` — systematic debugging when verification reveals failures
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* eval-runner.js — Skill evaluation framework for Guild.
|
|
3
|
+
*
|
|
4
|
+
* Runs assertions against parsed skill workflows to verify
|
|
5
|
+
* structural correctness. Compatible with anthropics/skills eval format.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { readFileSync, existsSync } from 'fs';
|
|
9
|
+
import { join, dirname } from 'path';
|
|
10
|
+
import { fileURLToPath } from 'url';
|
|
11
|
+
import { parseSkill } from './workflow-parser.js';
|
|
12
|
+
|
|
13
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
14
|
+
const TEMPLATES_DIR = join(__dirname, '..', 'templates', 'skills');
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* Evaluates a single assertion against a parsed workflow.
|
|
18
|
+
* @param {object} workflow - Parsed workflow with { version, steps[] }
|
|
19
|
+
* @param {string} assertion - Assertion string (e.g. "step-exists:evaluate")
|
|
20
|
+
* @returns {{ passed: boolean, evidence: string }}
|
|
21
|
+
*/
|
|
22
|
+
export function evaluateAssertion(workflow, assertion) {
|
|
23
|
+
const colonIdx = assertion.indexOf(':');
|
|
24
|
+
if (colonIdx === -1) {
|
|
25
|
+
return { passed: false, evidence: `Malformed assertion: "${assertion}"` };
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
const type = assertion.slice(0, colonIdx);
|
|
29
|
+
const args = assertion.slice(colonIdx + 1);
|
|
30
|
+
|
|
31
|
+
switch (type) {
|
|
32
|
+
case 'step-exists': {
|
|
33
|
+
const step = workflow.steps.find(s => s.id === args);
|
|
34
|
+
return step
|
|
35
|
+
? { passed: true, evidence: `Step "${args}" found` }
|
|
36
|
+
: { passed: false, evidence: `Step "${args}" not found in ${workflow.steps.map(s => s.id).join(', ')}` };
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
case 'step-role': {
|
|
40
|
+
const [stepId, expectedRole] = args.split(':');
|
|
41
|
+
const step = workflow.steps.find(s => s.id === stepId);
|
|
42
|
+
if (!step) return { passed: false, evidence: `Step "${stepId}" not found` };
|
|
43
|
+
return step.role === expectedRole
|
|
44
|
+
? { passed: true, evidence: `Step "${stepId}" has role "${expectedRole}"` }
|
|
45
|
+
: { passed: false, evidence: `Step "${stepId}" has role "${step.role}", expected "${expectedRole}"` };
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
case 'step-model-tier': {
|
|
49
|
+
const [stepId, expectedTier] = args.split(':');
|
|
50
|
+
const step = workflow.steps.find(s => s.id === stepId);
|
|
51
|
+
if (!step) return { passed: false, evidence: `Step "${stepId}" not found` };
|
|
52
|
+
return step.modelTier === expectedTier
|
|
53
|
+
? { passed: true, evidence: `Step "${stepId}" uses tier "${expectedTier}"` }
|
|
54
|
+
: { passed: false, evidence: `Step "${stepId}" uses tier "${step.modelTier}", expected "${expectedTier}"` };
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
case 'step-requires': {
|
|
58
|
+
const [stepId, dep] = args.split(':');
|
|
59
|
+
const step = workflow.steps.find(s => s.id === stepId);
|
|
60
|
+
if (!step) return { passed: false, evidence: `Step "${stepId}" not found` };
|
|
61
|
+
return step.requires.includes(dep)
|
|
62
|
+
? { passed: true, evidence: `Step "${stepId}" requires "${dep}"` }
|
|
63
|
+
: { passed: false, evidence: `Step "${stepId}" requires [${step.requires.join(', ')}], missing "${dep}"` };
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
case 'step-parallel': {
|
|
67
|
+
const step = workflow.steps.find(s => s.id === args);
|
|
68
|
+
if (!step) return { passed: false, evidence: `Step "${args}" not found` };
|
|
69
|
+
return step.parallel && step.parallel.length > 0
|
|
70
|
+
? { passed: true, evidence: `Step "${args}" is parallel with [${step.parallel.join(', ')}]` }
|
|
71
|
+
: { passed: false, evidence: `Step "${args}" has no parallel group` };
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
case 'gate-exists': {
|
|
75
|
+
const step = workflow.steps.find(s => s.id === args);
|
|
76
|
+
if (!step) return { passed: false, evidence: `Step "${args}" not found` };
|
|
77
|
+
return step.gate === true
|
|
78
|
+
? { passed: true, evidence: `Step "${args}" has gate: true` }
|
|
79
|
+
: { passed: false, evidence: `Step "${args}" has gate: ${step.gate}` };
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
case 'step-count': {
|
|
83
|
+
const min = parseInt(args, 10);
|
|
84
|
+
const actual = workflow.steps.length;
|
|
85
|
+
return actual >= min
|
|
86
|
+
? { passed: true, evidence: `Workflow has ${actual} steps (minimum ${min})` }
|
|
87
|
+
: { passed: false, evidence: `Workflow has ${actual} steps, expected at least ${min}` };
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
default:
|
|
91
|
+
return { passed: false, evidence: `Unknown assertion type: "${type}"` };
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/**
 * Loads evals.json for a skill template.
 * @param {string} skillName - Skill directory name (e.g. 'build-feature')
 * @returns {object|null} Parsed evals object or null if no evals exist
 */
export function loadEvals(skillName) {
  const manifestPath = join(TEMPLATES_DIR, skillName, 'evals', 'evals.json');
  if (existsSync(manifestPath)) {
    return JSON.parse(readFileSync(manifestPath, 'utf8'));
  }
  return null;
}
|
|
105
|
+
|
|
106
|
+
/**
 * Runs all evals for a skill template.
 * Parses the SKILL.md, loads evals.json, and evaluates each assertion.
 * @param {string} skillName - Skill directory name
 * @returns {{ skill: string, results: Array<{ id: string, description: string, passed: boolean, expectations: Array }> }}
 * @throws {Error} When no evals.json exists for the skill, or the skill
 *                 defines no workflow to assert against.
 */
export function runEvals(skillName) {
  const evals = loadEvals(skillName);
  if (!evals) throw new Error(`No evals found for skill "${skillName}"`);

  const markdown = readFileSync(join(TEMPLATES_DIR, skillName, 'SKILL.md'), 'utf8');
  const skill = parseSkill(markdown);

  if (!skill.workflow) {
    throw new Error(`Skill "${skillName}" has no workflow definition`);
  }

  const results = [];
  for (const evalCase of evals.evals) {
    const expectations = [];
    for (const exp of evalCase.expectations) {
      const outcome = evaluateAssertion(skill.workflow, exp.assertion);
      expectations.push({ text: exp.text, assertion: exp.assertion, ...outcome });
    }
    results.push({
      id: evalCase.id,
      description: evalCase.description,
      // An eval case passes only when every one of its expectations passed.
      passed: expectations.every(e => e.passed),
      expectations,
    });
  }

  return { skill: skillName, results };
}
|
package/src/utils/generators.js
CHANGED
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
5
|
import { writeFileSync } from 'fs';
|
|
6
|
+
import { wrapZone } from './zones.js';
|
|
7
|
+
import { generateWorkspaceContext } from './workspace.js';
|
|
6
8
|
|
|
7
9
|
/**
|
|
8
10
|
* Generates PROJECT.md with the onboarding data.
|
|
@@ -95,7 +97,10 @@ export function inferEnvVars(type, stack) {
|
|
|
95
97
|
/**
|
|
96
98
|
* Generates CLAUDE.md — central document with placeholders for guild-specialize.
|
|
97
99
|
*/
|
|
98
|
-
export async function generateClaudeMd(data) {
|
|
100
|
+
export async function generateClaudeMd(data, workspace = null, currentMemberName = null) {
|
|
101
|
+
const wsContext = generateWorkspaceContext(workspace, currentMemberName);
|
|
102
|
+
const workspaceSection = wsContext ? `\n${wsContext}\n` : '';
|
|
103
|
+
|
|
99
104
|
const content = `# ${data.name}
|
|
100
105
|
|
|
101
106
|
## Framework
|
|
@@ -105,20 +110,17 @@ This project uses Guild. Read SESSION.md at the start of each session.
|
|
|
105
110
|
${data.stack}
|
|
106
111
|
|
|
107
112
|
## Project structure
|
|
108
|
-
[PENDING: guild-specialize]
|
|
109
|
-
|
|
110
|
-
docs/
|
|
111
|
-
specs/ # Design documents (SDD specs)
|
|
113
|
+
${wrapZone('structure', '[PENDING: guild-specialize]\n\ndocs/\n specs/ # Design documents (SDD specs)')}
|
|
112
114
|
|
|
113
115
|
## Code conventions
|
|
114
|
-
${inferCodeConventions(data.type, data.stack)}
|
|
116
|
+
${wrapZone('conventions', inferCodeConventions(data.type, data.stack))}
|
|
115
117
|
|
|
116
118
|
## Architecture patterns
|
|
117
|
-
[PENDING: guild-specialize]
|
|
119
|
+
${wrapZone('architecture', '[PENDING: guild-specialize]')}
|
|
118
120
|
|
|
119
121
|
## Environment variables
|
|
120
|
-
${inferEnvVars(data.type, data.stack)}
|
|
121
|
-
|
|
122
|
+
${wrapZone('env-vars', inferEnvVars(data.type, data.stack))}
|
|
123
|
+
${workspaceSection}
|
|
122
124
|
## Global rules
|
|
123
125
|
- Do not implement without an approved plan
|
|
124
126
|
- Update SESSION.md at the end of each session
|