specweave 1.0.390 → 1.0.392
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -2
- package/dist/src/generators/spec/spec-parser.js +1 -1
- package/dist/src/generators/spec/spec-parser.js.map +1 -1
- package/package.json +1 -1
- package/plugins/specweave-github/skills/github-sync/evals/HOW-TO-RUN-EVALS.md +50 -0
- package/plugins/specweave-github/skills/github-sync/evals/evals.json +136 -0
package/README.md
CHANGED
|
@@ -99,9 +99,34 @@ You can also invoke commands directly for fine-grained control.
|
|
|
99
99
|
|
|
100
100
|
## Why SpecWeave?
|
|
101
101
|
|
|
102
|
-
|
|
102
|
+
**The plan is more important than the code.**
|
|
103
103
|
|
|
104
|
-
|
|
104
|
+
AI coding agents are powerful, but without structured planning they produce what practitioners call "slop" — code generated through unstructured chat that creates technical debt from day one. The [Research-Plan-Implement (RPI)](https://www.youtube.com/watch?v=rmvDxxNubIg) methodology demonstrates that **bad plans create orders of magnitude more problems than bad code**. Human review should focus on the plan — the highest-leverage checkpoint where misunderstandings are cheapest to correct.
|
|
105
|
+
|
|
106
|
+
SpecWeave embodies this principle. Each **increment** is a focused, reviewable unit of work — spec, plan, tasks — that you control through a sprint or external tools (GitHub, JIRA, ADO). As long as the spec and plan are solid, the implementation follows naturally, one step at a time.
|
|
107
|
+
|
|
108
|
+
The more structure in your workflow, the harder the problems you can solve:
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
Hardest problem big refactors, ○ ← /sw:team-lead
|
|
112
|
+
you can solve whole new features (multi-agent)
|
|
113
|
+
↑ ○
|
|
114
|
+
│ medium features ← /sw:brainstorm → /sw:increment → /sw:auto
|
|
115
|
+
│ across repos (research + plan + implement)
|
|
116
|
+
│ ○
|
|
117
|
+
│ small features ← /sw:increment → /sw:do
|
|
118
|
+
│ across 3-5 files (plan + implement)
|
|
119
|
+
│ ○
|
|
120
|
+
│ small fixes ← just talk to AI
|
|
121
|
+
│ copy changes (no planning)
|
|
122
|
+
└────────────────────────────────────────────→
|
|
123
|
+
just talk simple plan research → multi-phase
|
|
124
|
+
to AI then work it plan → impl agent teams
|
|
125
|
+
|
|
126
|
+
amount of context engineering →
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
SpecWeave solves multi-agent chaos with **file-based coordination**:
|
|
105
130
|
|
|
106
131
|
```
|
|
107
132
|
.specweave/increments/0001-oauth/
|
|
@@ -121,7 +121,7 @@ function parseFrontmatter(lines) {
|
|
|
121
121
|
'See CLAUDE.md Rule #16 for details.');
|
|
122
122
|
}
|
|
123
123
|
// Validate increment ID format (0001-feature-name, 0417J-name, 0111E-name)
|
|
124
|
-
const incrementIdRegex = /^[0-9]{4}[EGJA]?-[a-z0-9-]
|
|
124
|
+
const incrementIdRegex = /^[0-9]{4}[EGJA]?-[a-z0-9-]+$/;
|
|
125
125
|
if (!incrementIdRegex.test(frontmatter.increment)) {
|
|
126
126
|
throw new Error(`Invalid increment ID format: "${frontmatter.increment}"\n\n` +
|
|
127
127
|
'Expected format: 4-digit number + hyphen + kebab-case name\n\n' +
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"spec-parser.js","sourceRoot":"","sources":["../../../../src/generators/spec/spec-parser.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAElC,OAAO,KAAK,IAAI,MAAM,SAAS,CAAC;AA0ChC;;;;;;GAMG;AACH,MAAM,UAAU,WAAW,CAAC,QAAgB;IAC1C,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAChD,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAElC,kDAAkD;QAClD,MAAM,EAAE,WAAW,EAAE,KAAK,EAAE,GAAG,gBAAgB,CAAC,KAAK,CAAC,CAAC;QAEvD,uBAAuB;QACvB,MAAM,WAAW,GAAG,kBAAkB,CAAC,KAAK,CAAC,CAAC;QAE9C,qBAAqB;QACrB,MAAM,QAAQ,GAAG,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,kBAAkB,CAAC,CAAC;QAElE,OAAO;YACL,WAAW;YACX,KAAK;YACL,WAAW;YACX,QAAQ;SACT,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,IAAI,KAAK,CAAC,8BAA8B,QAAQ,KAAK,KAAK,EAAE,CAAC,CAAC;IACtE,CAAC;AACH,CAAC;AAED;;;;;;;;;GASG;AACH,SAAS,gBAAgB,CAAC,KAAe;IACvC,IAAI,aAAa,GAAG,KAAK,CAAC;IAC1B,MAAM,gBAAgB,GAAa,EAAE,CAAC;IACtC,IAAI,gBAAgB,GAAG,CAAC,CAAC,CAAC;IAC1B,IAAI,cAAc,GAAG,CAAC,CAAC,CAAC;IAExB,gDAAgD;IAChD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACtB,IAAI,IAAI,CAAC,IAAI,EAAE,KAAK,KAAK,EAAE,CAAC;YAC1B,IAAI,CAAC,aAAa,EAAE,CAAC;gBACnB,aAAa,GAAG,IAAI,CAAC;gBACrB,gBAAgB,GAAG,CAAC,GAAG,CAAC,CAAC;gBACzB,SAAS;YACX,CAAC;iBAAM,CAAC;gBACN,cAAc,GAAG,CAAC,GAAG,CAAC,CAAC;gBACvB,MAAM,CAAC,qBAAqB;YAC9B,CAAC;QACH,CAAC;QACD,IAAI,aAAa,EAAE,CAAC;YAClB,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC;IAED,8BAA8B;IAC9B,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAClC,MAAM,IAAI,KAAK,CACb,2CAA2C;YAC3C,4BAA4B;YAC5B,OAAO;YACP,gCAAgC;YAChC,oCAAoC;YACpC,SAAS;YACT,qCAAqC,CACtC,CAAC;IACJ,CAAC;IAED,gEAAgE;IAChE,IAAI,WAAgB,CAAC;IACrB,IAAI,CAAC;QACH,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IACvD,CAAC;IAAC,OAAO,KAAU,EAAE,CAAC;QACpB,MAAM,QAAQ,GAAG,KAAK,CAAC,OAAO,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC;QAChD,MAAM,IAAI,KAAK,CACb,qCAAqC,gBAAgB,IAAI,cAAc,QAAQ;YAC/E,GAAG,QAAQ,MAAM;YA
CjB,oBAAoB;YACpB,oCAAoC;YACpC,kCAAkC;YAClC,oCAAoC;YACpC,+CAA+C;YAC/C,kBAAkB;YAClB,OAAO;YACP,gCAAgC;YAChC,wBAAwB;YACxB,sBAAsB;YACtB,SAAS;YACT,qCAAqC,CACtC,CAAC;IACJ,CAAC;IAED,oCAAoC;IACpC,IAAI,CAAC,WAAW,IAAI,OAAO,WAAW,KAAK,QAAQ,EAAE,CAAC;QACpD,MAAM,IAAI,KAAK,CACb,8DAA8D;YAC9D,gBAAgB;YAChB,iBAAiB;YACjB,SAAS;YACT,cAAc;YACd,gCAAgC;YAChC,KAAK,CACN,CAAC;IACJ,CAAC;IAED,qCAAqC;IACrC,IAAI,CAAC,WAAW,CAAC,SAAS,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CACb,uCAAuC;YACvC,uBAAuB;YACvB,kCAAkC;YAClC,qCAAqC,CACtC,CAAC;IACJ,CAAC;IAED,2EAA2E;IAC3E,MAAM,gBAAgB,GAAG
|
|
1
|
+
{"version":3,"file":"spec-parser.js","sourceRoot":"","sources":["../../../../src/generators/spec/spec-parser.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAElC,OAAO,KAAK,IAAI,MAAM,SAAS,CAAC;AA0ChC;;;;;;GAMG;AACH,MAAM,UAAU,WAAW,CAAC,QAAgB;IAC1C,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAChD,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAElC,kDAAkD;QAClD,MAAM,EAAE,WAAW,EAAE,KAAK,EAAE,GAAG,gBAAgB,CAAC,KAAK,CAAC,CAAC;QAEvD,uBAAuB;QACvB,MAAM,WAAW,GAAG,kBAAkB,CAAC,KAAK,CAAC,CAAC;QAE9C,qBAAqB;QACrB,MAAM,QAAQ,GAAG,WAAW,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,kBAAkB,CAAC,CAAC;QAElE,OAAO;YACL,WAAW;YACX,KAAK;YACL,WAAW;YACX,QAAQ;SACT,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,IAAI,KAAK,CAAC,8BAA8B,QAAQ,KAAK,KAAK,EAAE,CAAC,CAAC;IACtE,CAAC;AACH,CAAC;AAED;;;;;;;;;GASG;AACH,SAAS,gBAAgB,CAAC,KAAe;IACvC,IAAI,aAAa,GAAG,KAAK,CAAC;IAC1B,MAAM,gBAAgB,GAAa,EAAE,CAAC;IACtC,IAAI,gBAAgB,GAAG,CAAC,CAAC,CAAC;IAC1B,IAAI,cAAc,GAAG,CAAC,CAAC,CAAC;IAExB,gDAAgD;IAChD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACtB,IAAI,IAAI,CAAC,IAAI,EAAE,KAAK,KAAK,EAAE,CAAC;YAC1B,IAAI,CAAC,aAAa,EAAE,CAAC;gBACnB,aAAa,GAAG,IAAI,CAAC;gBACrB,gBAAgB,GAAG,CAAC,GAAG,CAAC,CAAC;gBACzB,SAAS;YACX,CAAC;iBAAM,CAAC;gBACN,cAAc,GAAG,CAAC,GAAG,CAAC,CAAC;gBACvB,MAAM,CAAC,qBAAqB;YAC9B,CAAC;QACH,CAAC;QACD,IAAI,aAAa,EAAE,CAAC;YAClB,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC;IAED,8BAA8B;IAC9B,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAClC,MAAM,IAAI,KAAK,CACb,2CAA2C;YAC3C,4BAA4B;YAC5B,OAAO;YACP,gCAAgC;YAChC,oCAAoC;YACpC,SAAS;YACT,qCAAqC,CACtC,CAAC;IACJ,CAAC;IAED,gEAAgE;IAChE,IAAI,WAAgB,CAAC;IACrB,IAAI,CAAC;QACH,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IACvD,CAAC;IAAC,OAAO,KAAU,EAAE,CAAC;QACpB,MAAM,QAAQ,GAAG,KAAK,CAAC,OAAO,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC;QAChD,MAAM,IAAI,KAAK,CACb,qCAAqC,gBAAgB,IAAI,cAAc,QAAQ;YAC/E,GAAG,QAAQ,MAAM;YA
CjB,oBAAoB;YACpB,oCAAoC;YACpC,kCAAkC;YAClC,oCAAoC;YACpC,+CAA+C;YAC/C,kBAAkB;YAClB,OAAO;YACP,gCAAgC;YAChC,wBAAwB;YACxB,sBAAsB;YACtB,SAAS;YACT,qCAAqC,CACtC,CAAC;IACJ,CAAC;IAED,oCAAoC;IACpC,IAAI,CAAC,WAAW,IAAI,OAAO,WAAW,KAAK,QAAQ,EAAE,CAAC;QACpD,MAAM,IAAI,KAAK,CACb,8DAA8D;YAC9D,gBAAgB;YAChB,iBAAiB;YACjB,SAAS;YACT,cAAc;YACd,gCAAgC;YAChC,KAAK,CACN,CAAC;IACJ,CAAC;IAED,qCAAqC;IACrC,IAAI,CAAC,WAAW,CAAC,SAAS,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CACb,uCAAuC;YACvC,uBAAuB;YACvB,kCAAkC;YAClC,qCAAqC,CACtC,CAAC;IACJ,CAAC;IAED,2EAA2E;IAC3E,MAAM,gBAAgB,GAAG,8BAA8B,CAAC;IACxD,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,EAAE,CAAC;QAClD,MAAM,IAAI,KAAK,CACb,iCAAiC,WAAW,CAAC,SAAS,OAAO;YAC7D,gEAAgE;YAChE,mBAAmB;YACnB,yBAAyB;YACzB,oBAAoB;YACpB,uBAAuB;YACvB,qBAAqB;YACrB,sCAAsC;YACtC,gDAAgD;YAChD,qCAAqC;YACrC,6BAA6B;YAC7B,qCAAqC,CACtC,CAAC;IACJ,CAAC;IAED,OAAO;QACL,WAAW,EAAE,WAAW,CAAC,SAAS;QAClC,KAAK,EAAE,WAAW,CAAC,KAAK,IAAI,WAAW,CAAC,SAAS;KAClD,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,kBAAkB,CAAC,KAAe;IACzC,MAAM,WAAW,GAAgB,EAAE,CAAC;IACpC,IAAI,SAAS,GAAqB,IAAI,CAAC;IACvC,IAAI,WAAW,GAAG,KAAK,CAAC;IAExB,4DAA4D;IAC5D,kDAAkD;IAClD,MAAM,aAAa,GAAG,gCAAgC,CAAC,CAAE,0CAA0C;IACnG,MAAM,OAAO,GAAG,4CAA4C,CAAC,CAAE,uBAAuB;IACtF,MAAM,aAAa,GAAG,8BAA8B,CAAC;IAErD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACtB,MAAM,UAAU,GAAG,CAAC,GAAG,CAAC,CAAC;QAEzB,8BAA8B;QAC9B,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QAC1C,IAAI,OAAO,EAAE,CAAC;YACZ,6BAA6B;YAC7B,IAAI,SAAS,EAAE,CAAC;gBACd,WAAW,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YAC9B,CAAC;YAED,eAAe;YACf,SAAS,GAAG;gBACV,EAAE,EAAE,OAAO,CAAC,CAAC,CAAC;gBACd,KAAK,EAAE,OAAO,CAAC,CAAC,CAAC;gBACjB,kBAAkB,EAAE,EAAE;gBACtB,UAAU;aACX,CAAC;YACF,WAAW,GAAG,KAAK,CAAC;YACpB,SAAS;QACX,CAAC;QAED,wBAAwB;QACxB,IAAI,CAAC,SAAS;YAAE,SAAS;QAEzB,wCAAwC;QACxC,IAAI,IAAI,CAAC,QAAQ,CAAC,0BAA0B,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,qBAAqB,CAAC,EAAE,CAAC;YACtF,WAAW,GAAG,IAAI,CAAC;YACnB,SAAS;QACX,CAAC;QAED,mDAAmD;QACnD,IAAI,IAAI,
CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,EAAE,CAAC;YACxD,WAAW,GAAG,KAAK,CAAC;QACtB,CAAC;QAED,kCAAkC;QAClC,IAAI,WAAW,EAAE,CAAC;YAChB,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;YACpC,IAAI,OAAO,EAAE,CAAC;gBACZ,MAAM,IAAI,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;gBACxB,SAAS,CAAC,kBAAkB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC1C,CAAC;QACH,CAAC;QAED,8BAA8B;QAC9B,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QAChD,IAAI,aAAa,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,CAAC;YACzC,SAAS,CAAC,QAAQ,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;QACxC,CAAC;IACH,CAAC;IAED,eAAe;IACf,IAAI,SAAS,EAAE,CAAC;QACd,WAAW,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC9B,CAAC;IAED,OAAO,WAAW,CAAC;AACrB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,QAAgB;IAC1C,OAAO,WAAW,CAAC,QAAQ,CAAC,CAAC,WAAW,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC;AAC5D,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,QAAgB;IAC1C,OAAO,WAAW,CAAC,QAAQ,CAAC,CAAC,QAAQ,CAAC;AACxC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,QAAgB,EAAE,IAAY;IACxD,MAAM,EAAE,GAAG,WAAW,CAAC,QAAQ,CAAC,CAAC,WAAW,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,EAAE,KAAK,IAAI,CAAC,CAAC;IAC9E,OAAO,EAAE,EAAE,kBAAkB,IAAI,EAAE,CAAC;AACtC,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,qBAAqB,CAAC,IAAY,EAAE,IAAY;IAC9D,8CAA8C;IAC9C,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,oBAAoB,CAAC,CAAC;IACjD,IAAI,CAAC,OAAO;QAAE,OAAO,KAAK,CAAC;IAE3B,MAAM,UAAU,GAAG,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAE5C,2DAA2D;IAC3D,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC;IAC5C,IAAI,CAAC,OAAO;QAAE,OAAO,KAAK,CAAC;IAE3B,MAAM,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAE1C,OAAO,UAAU,KAAK,QAAQ,CAAC;AACjC,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "specweave",
|
|
3
|
-
"version": "1.0.390",
|
|
3
|
+
"version": "1.0.392",
|
|
4
4
|
"description": "Spec-driven development framework for AI coding agents. Works with Claude Code, Codex, Antigravity, Cursor, Copilot & more. 100+ skills, 49 CLI commands, verified skill certification, autonomous execution, and living documentation.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# How to Run Evals for the GitHub Sync Skill
|
|
2
|
+
|
|
3
|
+
This guide explains how to add test cases, run them, and compare quality with and without the skill.
|
|
4
|
+
|
|
5
|
+
## What Are Evals?
|
|
6
|
+
|
|
7
|
+
Evals are test cases that prove the skill works. Each one has:
|
|
8
|
+
- A **prompt** -- something you'd actually ask (e.g., "How do I set up GitHub sync?")
|
|
9
|
+
- **Assertions** -- specific things the skill should make Claude do (e.g., "mentions gh auth login", "explains spec-to-project mapping")
|
|
10
|
+
|
|
11
|
+
We run each prompt twice: once with the skill loaded, once without. The difference in scores shows exactly how much value the skill adds.
|
|
12
|
+
|
|
13
|
+
## File Structure
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
github-sync/
|
|
17
|
+
evals/
|
|
18
|
+
evals.json <-- Test cases
|
|
19
|
+
HOW-TO-RUN-EVALS.md <-- This file
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Running Evals
|
|
23
|
+
|
|
24
|
+
### Via vskill CLI
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# Set the model (Opus 4.6 recommended for judging)
|
|
28
|
+
export VSKILL_EVAL_MODEL=claude-opus-4-6
|
|
29
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
30
|
+
|
|
31
|
+
# Run evals for this skill
|
|
32
|
+
vskill eval run specweave-github/github-sync
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Via Claude Code
|
|
36
|
+
|
|
37
|
+
Ask Claude:
|
|
38
|
+
> "Run the github-sync evals -- both with-skill and without-skill."
|
|
39
|
+
|
|
40
|
+
## Adding Test Cases
|
|
41
|
+
|
|
42
|
+
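Open `evals.json` and add entries to the `evals` array, following the shape of the existing entries (`id`, `name`, `prompt`, `expected_output`, `files`, `assertions`). Each case needs: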
|
|
43
|
+
- A realistic prompt with specific details
|
|
44
|
+
- Objectively verifiable assertions (boolean pass/fail)
|
|
45
|
+
|
|
46
|
+
## Model Selection
|
|
47
|
+
|
|
48
|
+
For eval generation and judging, use:
|
|
49
|
+
- `claude-opus-4-6` -- highest quality, recommended for final evals
|
|
50
|
+
- `claude-sonnet-4-6` -- faster, good for iteration
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill_name": "github-sync",
|
|
3
|
+
"evals": [
|
|
4
|
+
{
|
|
5
|
+
"id": 1,
|
|
6
|
+
"name": "initial-github-sync-setup",
|
|
7
|
+
"prompt": "I just started using SpecWeave on my project at github.com/myorg/backend-api. I want to sync my specs to GitHub Projects so my team can track progress. How do I set up the GitHub integration? I need two-way sync.",
|
|
8
|
+
"expected_output": "Should guide through GitHub CLI authentication (gh auth login), explain the config.json setup for specweave-github plugin with repo, autoSyncSpecs, syncDirection settings, explain that specs (not increments) are synced to GitHub Projects, and mention the /sw-github:sync-spec command for manual sync.",
|
|
9
|
+
"files": [],
|
|
10
|
+
"assertions": [
|
|
11
|
+
{
|
|
12
|
+
"id": "gh-cli-auth",
|
|
13
|
+
"text": "Mentions GitHub CLI (gh) installation and authentication steps (gh auth login or gh auth status)",
|
|
14
|
+
"type": "boolean"
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"id": "config-json-setup",
|
|
18
|
+
"text": "Shows or references the .specweave/config.json configuration with specweave-github plugin settings including repo, autoSyncSpecs, and syncDirection fields",
|
|
19
|
+
"type": "boolean"
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
"id": "specs-not-increments",
|
|
23
|
+
"text": "Clarifies that SpecWeave syncs specs (living docs) to GitHub, NOT increments, and explains why (specs are permanent, increments are temporary)",
|
|
24
|
+
"type": "boolean"
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
"id": "two-way-sync",
|
|
28
|
+
"text": "Explains two-way sync capability and mentions syncDirection: two-way as the default configuration",
|
|
29
|
+
"type": "boolean"
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"id": "sync-command",
|
|
33
|
+
"text": "References the /sw-github:sync-spec command for triggering manual sync operations",
|
|
34
|
+
"type": "boolean"
|
|
35
|
+
}
|
|
36
|
+
]
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
"id": 2,
|
|
40
|
+
"name": "sync-troubleshooting-permissions",
|
|
41
|
+
"prompt": "My GitHub sync keeps failing with a 403 error when I run /sw-github:sync-spec spec-003. I'm getting 'insufficient permissions' errors. I set up the token but something is wrong. How do I fix this?",
|
|
42
|
+
"expected_output": "Should diagnose GitHub permissions issues: check gh auth status, verify token scopes (needs repo and project access), check repository write access, suggest re-authenticating with correct scopes, and mention rate limit checking with gh api rate_limit.",
|
|
43
|
+
"files": [],
|
|
44
|
+
"assertions": [
|
|
45
|
+
{
|
|
46
|
+
"id": "check-auth-status",
|
|
47
|
+
"text": "Suggests running gh auth status to verify current authentication state and token validity",
|
|
48
|
+
"type": "boolean"
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
"id": "token-permissions",
|
|
52
|
+
"text": "Addresses token scope/permissions requirements - mentions needing repo write access or project access scopes",
|
|
53
|
+
"type": "boolean"
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
"id": "re-auth-suggestion",
|
|
57
|
+
"text": "Suggests re-authenticating (gh auth login) with correct scopes as a resolution step",
|
|
58
|
+
"type": "boolean"
|
|
59
|
+
},
|
|
60
|
+
{
|
|
61
|
+
"id": "rate-limit-check",
|
|
62
|
+
"text": "Mentions checking GitHub API rate limits (gh api rate_limit) as a potential cause of failures",
|
|
63
|
+
"type": "boolean"
|
|
64
|
+
},
|
|
65
|
+
{
|
|
66
|
+
"id": "config-verification",
|
|
67
|
+
"text": "Suggests verifying the .specweave/config.json sync configuration and repo field correctness",
|
|
68
|
+
"type": "boolean"
|
|
69
|
+
}
|
|
70
|
+
]
|
|
71
|
+
},
|
|
72
|
+
{
|
|
73
|
+
"id": 3,
|
|
74
|
+
"name": "understand-sync-mapping",
|
|
75
|
+
"prompt": "Can you explain how SpecWeave maps to GitHub? I want to understand what becomes what. We have spec-001 with 5 user stories and each story has 3-4 acceptance criteria. What will this look like in GitHub?",
|
|
76
|
+
"expected_output": "Should explain the mapping: Spec -> GitHub Project (with title, description, columns), User Stories -> GitHub Issues (with AC as checkboxes, labels), progress tracking via checkbox updates, and automatic project column movement (Backlog -> In Progress -> Done).",
|
|
77
|
+
"files": [],
|
|
78
|
+
"assertions": [
|
|
79
|
+
{
|
|
80
|
+
"id": "spec-to-project",
|
|
81
|
+
"text": "Explains that a spec maps to a GitHub Project with title format [SPEC-001], description, and columns (Backlog, In Progress, Done)",
|
|
82
|
+
"type": "boolean"
|
|
83
|
+
},
|
|
84
|
+
{
|
|
85
|
+
"id": "story-to-issue",
|
|
86
|
+
"text": "Explains that each user story maps to a GitHub Issue with title format [US-001] and acceptance criteria as checkboxes in the issue body",
|
|
87
|
+
"type": "boolean"
|
|
88
|
+
},
|
|
89
|
+
{
|
|
90
|
+
"id": "ac-checkboxes",
|
|
91
|
+
"text": "Describes acceptance criteria being represented as checkboxes in GitHub Issues that get checked/unchecked as ACs are completed",
|
|
92
|
+
"type": "boolean"
|
|
93
|
+
},
|
|
94
|
+
{
|
|
95
|
+
"id": "labels-mentioned",
|
|
96
|
+
"text": "Mentions GitHub labels being applied to issues (e.g., user-story, spec:spec-001, priority labels)",
|
|
97
|
+
"type": "boolean"
|
|
98
|
+
},
|
|
99
|
+
{
|
|
100
|
+
"id": "progress-tracking",
|
|
101
|
+
"text": "Explains automatic progress tracking - project board column movement or percentage updates as tasks/ACs are completed",
|
|
102
|
+
"type": "boolean"
|
|
103
|
+
}
|
|
104
|
+
]
|
|
105
|
+
},
|
|
106
|
+
{
|
|
107
|
+
"id": 4,
|
|
108
|
+
"name": "projects-v2-configuration",
|
|
109
|
+
"prompt": "We're using GitHub Projects V2 (the new board view). I want to set up SpecWeave to use our existing Projects V2 board (project number 5) with custom status fields. Our board has columns: Triage, Ready, Building, Review, Shipped. How do I configure this?",
|
|
110
|
+
"expected_output": "Should explain Projects V2 configuration with projectV2Enabled: true and projectV2Number: 5, custom status field mappings from SpecWeave states (planned, in-progress, completed) to the user's custom column names, and priority field mappings.",
|
|
111
|
+
"files": [],
|
|
112
|
+
"assertions": [
|
|
113
|
+
{
|
|
114
|
+
"id": "v2-config-fields",
|
|
115
|
+
"text": "Shows or references the projectV2Enabled: true and projectV2Number: 5 configuration settings in the sync config",
|
|
116
|
+
"type": "boolean"
|
|
117
|
+
},
|
|
118
|
+
{
|
|
119
|
+
"id": "status-field-mapping",
|
|
120
|
+
"text": "Explains statusFieldMapping configuration that maps SpecWeave states (planned, in-progress, completed) to custom column names",
|
|
121
|
+
"type": "boolean"
|
|
122
|
+
},
|
|
123
|
+
{
|
|
124
|
+
"id": "priority-field-mapping",
|
|
125
|
+
"text": "Mentions or shows priorityFieldMapping configuration for mapping P1-P4 priorities to custom priority field values",
|
|
126
|
+
"type": "boolean"
|
|
127
|
+
},
|
|
128
|
+
{
|
|
129
|
+
"id": "existing-board-reuse",
|
|
130
|
+
"text": "Addresses using an existing Projects V2 board by referencing its project number rather than creating a new one",
|
|
131
|
+
"type": "boolean"
|
|
132
|
+
}
|
|
133
|
+
]
|
|
134
|
+
}
|
|
135
|
+
]
|
|
136
|
+
}
|