kushi-agents 5.0.2 → 5.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/README.md +35 -0
  2. package/bin/cli.mjs +103 -0
  3. package/package.json +6 -2
  4. package/plugin/agents/kushi.agent.md +3 -1
  5. package/plugin/instructions/skill-authoring.instructions.md +147 -0
  6. package/plugin/instructions/skill-evals.instructions.md +130 -0
  7. package/plugin/skills/aggregate-project/evals/evals.json +33 -0
  8. package/plugin/skills/apply-ado-update/evals/evals.json +33 -0
  9. package/plugin/skills/ask-project/SKILL.md +10 -0
  10. package/plugin/skills/ask-project/evals/evals.json +34 -0
  11. package/plugin/skills/bootstrap-project/evals/evals.json +34 -0
  12. package/plugin/skills/build-state/evals/evals.json +31 -0
  13. package/plugin/skills/consolidate-evidence/evals/evals.json +33 -0
  14. package/plugin/skills/dashboard/evals/evals.json +33 -0
  15. package/plugin/skills/emit-vertex/evals/evals.json +33 -0
  16. package/plugin/skills/eval/SKILL.md +90 -0
  17. package/plugin/skills/eval/evals.schema.json +73 -0
  18. package/plugin/skills/eval/run-evals.ps1 +372 -0
  19. package/plugin/skills/fde-intake/evals/evals.json +33 -0
  20. package/plugin/skills/fde-report/evals/evals.json +33 -0
  21. package/plugin/skills/fde-triage/evals/evals.json +33 -0
  22. package/plugin/skills/intro/SKILL.md +160 -451
  23. package/plugin/skills/intro/evals/evals.json +33 -0
  24. package/plugin/skills/intro/references/walkthrough.md +310 -0
  25. package/plugin/skills/link-entities/evals/evals.json +31 -0
  26. package/plugin/skills/project-status/SKILL.md +10 -1
  27. package/plugin/skills/project-status/evals/evals.json +33 -0
  28. package/plugin/skills/propose-ado-update/evals/evals.json +33 -0
  29. package/plugin/skills/pull-ado/evals/evals.json +35 -0
  30. package/plugin/skills/pull-crm/evals/evals.json +35 -0
  31. package/plugin/skills/pull-email/evals/evals.json +35 -0
  32. package/plugin/skills/pull-loop/evals/evals.json +35 -0
  33. package/plugin/skills/pull-meetings/evals/evals.json +35 -0
  34. package/plugin/skills/pull-misc/evals/evals.json +35 -0
  35. package/plugin/skills/pull-onenote/evals/evals.json +35 -0
  36. package/plugin/skills/pull-sharepoint/evals/evals.json +35 -0
  37. package/plugin/skills/pull-teams/evals/evals.json +35 -0
  38. package/plugin/skills/refresh-project/evals/evals.json +31 -0
  39. package/plugin/skills/self-check/SKILL.md +2 -0
  40. package/plugin/skills/self-check/evals/evals.json +28 -0
  41. package/plugin/skills/self-check/run.ps1 +144 -0
  42. package/plugin/skills/setup/SKILL.md +10 -0
  43. package/plugin/skills/setup/evals/evals.json +33 -0
  44. package/plugin/skills/skill-checker/SKILL.md +136 -0
  45. package/plugin/skills/skill-checker/check-skill.ps1 +416 -0
  46. package/plugin/skills/skill-checker/evals/evals.json +41 -0
  47. package/plugin/skills/skill-creator/SKILL.md +134 -0
  48. package/plugin/skills/skill-creator/evals/evals.json +40 -0
  49. package/plugin/skills/skill-creator/generate-eval-review.ps1 +101 -0
  50. package/plugin/skills/skill-creator/optimize-description.ps1 +87 -0
  51. package/plugin/skills/skill-creator/scaffold.ps1 +180 -0
  52. package/plugin/skills/skill-creator/templates/evals-starter.template.json +27 -0
  53. package/plugin/skills/skill-creator/templates/gotchas-stub.template.md +9 -0
  54. package/plugin/skills/skill-creator/templates/skill-skeleton.template.md +28 -0
  55. package/plugin/skills/tour/evals/evals.json +33 -0
  56. package/plugin/skills/vertex-link/SKILL.md +10 -0
  57. package/plugin/skills/vertex-link/evals/evals.json +33 -0
  58. package/src/eval-aggregator.mjs +209 -0
  59. package/src/eval-aggregator.test.mjs +64 -0
  60. package/src/eval-runner.test.mjs +69 -0
  61. package/src/skill-checker.test.mjs +118 -0
  62. package/src/skill-creator.test.mjs +92 -0
package/README.md CHANGED
@@ -235,6 +235,41 @@ npm pack --dry-run
235
235
 
236
236
  The self-check validates frontmatter, agent inventory, prompt → skill routing, profile manifest, reference packs, cross-links, the verbs table in this README, and the layout diagram in `docs/reference/where-things-live.md`. Full reference: [docs/reference/self-check.md](docs/reference/self-check.md).
237
237
 
238
+ ## Authoring a new skill (v5.0.4+)
239
+
240
+ Adding a new skill takes one command + a per-skill lint loop:
241
+
242
+ ```powershell
243
+ node bin/cli.mjs create-skill my-thing --type writer --description "Generates the foo report from State/"
244
+ node bin/cli.mjs check-skill my-thing
245
+ node bin/cli.mjs check-skill my-thing --retrofit --apply # auto-fix additive findings
246
+ npm run eval -- my-thing
247
+ ```
248
+
249
+ Full walkthrough: [`docs/contributing/skill-authoring.md`](docs/contributing/skill-authoring.md). Doctrine: [`plugin/instructions/skill-authoring.instructions.md`](plugin/instructions/skill-authoring.instructions.md). Repo-wide dogfood baseline: [`docs/audits/v5.0.4-skill-creator-dogfood.md`](docs/audits/v5.0.4-skill-creator-dogfood.md).
250
+
251
+ ## Evaluating skills (v5.0.3+)
252
+
253
+ Every skill ships per-case evals at `plugin/skills/<name>/evals/evals.json`, aligned with the [agentskills.io evaluating-skills spec](https://agentskills.io/skill-creation/evaluating-skills). Doctrine: [`plugin/instructions/skill-evals.instructions.md`](plugin/instructions/skill-evals.instructions.md).
254
+
255
+ Quickstart:
256
+
257
+ ```powershell
258
+ npm run eval:canary # ~6 skills, runs in seconds — what PRs run
259
+ npm run eval:all # full suite (every plugin/skills/<name>/)
260
+ npm run eval -- ask-project # one skill
261
+ npm run eval:baseline # maintainer-only: refresh evals/baseline.json
262
+ ```
263
+
264
+ Outputs:
265
+
266
+ - `Evidence/_evals/<utc-ts>.json` — per-run JSON (pass/fail + duration + tokens per case).
267
+ - `Evidence/_evals/benchmark.json` — per-skill mean/stddev for `pass_rate`, `duration_ms`, `tokens_total` + regression flags vs `evals/baseline.json`.
268
+
269
+ Regressions are flagged on a ≥10pp pass-rate drop OR a ≥50% increase in latency or token usage. The canary subset is `ask-project`, `bootstrap-project`, `refresh-project`, `link-entities`, `build-state`, `self-check`.
270
+
271
+ **Privacy:** fixtures under `evals/fixtures/` are synthetic. NEVER copy real customer data into the evals tree.
272
+
238
273
  ## License
239
274
 
240
275
  See [LICENSE](LICENSE).
package/bin/cli.mjs CHANGED
@@ -5,6 +5,16 @@ import { runMultiHost } from '../src/multi-host.mjs';
5
5
 
6
6
  const args = process.argv.slice(2);
7
7
 
8
+ // ── skill-authoring verbs (v5.0.4+) ─────────────────────────────────────────
9
+ // Dispatch directly to the skill-creator / skill-checker pwsh scripts.
10
// Verbs that bypass the installer flow and are routed to the PowerShell
// skill-authoring scripts instead.
const SKILL_VERBS = new Set([
  'create-skill',
  'check-skill',
  'optimize-description',
  'review-evals',
]);

// An empty argv yields `args[0] === undefined`, which is never in the set,
// so the membership test alone is enough to guard the dispatch.
if (SKILL_VERBS.has(args[0])) {
  const [verb, ...rest] = args;
  await dispatchSkillVerb(verb, rest);
  // Safety net: stop here so the installer flow below never runs for a skill verb.
  process.exit(0);
}
17
+
8
18
  if (args.includes('--help') || args.includes('-h')) {
9
19
  console.log(`
10
20
  Usage: npx kushi-agents [options]
@@ -41,6 +51,16 @@ if (args.includes('--help') || args.includes('-h')) {
41
51
 
42
52
  --help, -h Show this help
43
53
 
54
+ Skill authoring (v5.0.4+):
55
+ create-skill <name> --type <pull|writer|orchestrator|other> --description "<d>"
56
+ Scaffold a new plugin/skills/<name>/ tree.
57
+ check-skill <name> Lint a skill against the agentskills.io blueprint.
58
+ check-skill --all [--retrofit [--apply]]
59
+ Audit (or retrofit) every skill in plugin/skills/.
60
+ optimize-description <skill>
61
+ Rewrite a skill's description per the optimizer rules.
62
+ review-evals <skill> Render an HTML side-by-side eval-review viewer.
63
+
44
64
  After install, talk to Kushi:
45
65
  bootstrap <project> First-time setup
46
66
  refresh <project> Incremental refresh + rebuild State/
@@ -114,3 +134,86 @@ function getFlag(flag) {
114
134
  const match = args.find((a) => a.startsWith(prefix));
115
135
  return match ? match.slice(prefix.length) : undefined;
116
136
  }
137
+
138
+ // ── skill-authoring verb dispatch (v5.0.4+) ─────────────────────────────────
139
/**
 * Resolve the skill name from the arguments that follow a skill verb.
 *
 * An explicit `--name <value>` / `--name=<value>` wins. Otherwise the first
 * positional token is used, skipping the values that belong to `--type`,
 * `--description` and `--name`, so that
 * `create-skill --type writer my-thing` yields `my-thing`, not `writer`.
 *
 * @param {string[]} tokens Arguments after the verb.
 * @returns {string|undefined} The skill name, or undefined when none was given.
 */
function resolveSkillName(tokens) {
  const explicit = pickFlag(tokens, '--name');
  if (explicit && !explicit.startsWith('-')) return explicit;

  // Flags whose *next* token is a value rather than a positional argument.
  const valueFlags = new Set(['--name', '--type', '--description']);
  for (let i = 0; i < tokens.length; i += 1) {
    const token = tokens[i];
    if (valueFlags.has(token)) {
      i += 1; // skip the flag's value
      continue;
    }
    if (!token.startsWith('-')) return token;
  }
  return undefined;
}

/**
 * Run one skill-authoring verb by invoking the matching PowerShell script
 * with `pwsh -NoProfile -File <script> ...`, then exit the process with the
 * script's exit status. This function never returns normally.
 *
 * Verb → script mapping:
 *   - `create-skill`          → skill-creator/scaffold.ps1
 *   - `check-skill`           → skill-checker/check-skill.ps1
 *   - `optimize-description`  → skill-checker/check-skill.ps1 -OptimizeDescription
 *   - `review-evals`          → skill-checker/check-skill.ps1 -Review
 *
 * Exits with status 1 on usage errors, on an unknown verb, when `pwsh`
 * cannot be launched, or when the child ends without an exit status.
 *
 * @param {string} verb One of the skill-authoring verbs.
 * @param {string[]} rest Command-line arguments that followed the verb.
 * @returns {Promise<never>}
 */
async function dispatchSkillVerb(verb, rest) {
  const { spawnSync } = await import('node:child_process');
  const path = await import('node:path');
  const url = await import('node:url');

  // Scripts are located relative to this file: <package root>/plugin/skills/...
  const here = path.dirname(url.fileURLToPath(import.meta.url));
  const repoRoot = path.resolve(here, '..');
  const creatorDir = path.join(repoRoot, 'plugin', 'skills', 'skill-creator');
  const checkerDir = path.join(repoRoot, 'plugin', 'skills', 'skill-checker');

  let script;
  let scriptArgs = [];
  switch (verb) {
    case 'create-skill': {
      // Usage: kushi create-skill <name> [--type <t>] [--description "<d>"] [--force]
      const name = resolveSkillName(rest);
      if (!name) {
        console.error('Usage: kushi-agents create-skill <name> --type <pull|writer|orchestrator|other> --description "USE WHEN ... DO NOT USE FOR ..."');
        process.exit(1);
      }
      const type = pickFlag(rest, '--type') || 'other';
      const desc = pickFlag(rest, '--description') || `USE WHEN ${name} is invoked. DO NOT USE FOR unrelated tasks.`;
      script = path.join(creatorDir, 'scaffold.ps1');
      scriptArgs = ['-Name', name, '-Type', type, '-Description', desc];
      if (rest.includes('--force')) scriptArgs.push('-Force');
      if (rest.includes('--dry-run')) scriptArgs.push('-DryRun');
      break;
    }
    case 'check-skill': {
      // Usage: kushi check-skill <name> | --all [--retrofit] [--apply]
      script = path.join(checkerDir, 'check-skill.ps1');
      const allFlag = rest.includes('--all') || rest.includes('-All');
      const name = resolveSkillName(rest);
      // --all takes precedence over a skill name when both are supplied.
      if (allFlag) scriptArgs.push('-All');
      else if (name) scriptArgs.push('-Skill', name);
      else {
        console.error('Usage: kushi-agents check-skill <name> | --all [--retrofit] [--apply] [--dry-run]');
        process.exit(1);
      }
      if (rest.includes('--retrofit')) scriptArgs.push('-Retrofit');
      if (rest.includes('--apply')) scriptArgs.push('-Apply');
      if (rest.includes('--dry-run')) scriptArgs.push('-DryRun');
      if (rest.includes('--json')) scriptArgs.push('-Json');
      break;
    }
    case 'optimize-description': {
      // Usage: kushi optimize-description <skill>
      const name = resolveSkillName(rest);
      if (!name) {
        console.error('Usage: kushi-agents optimize-description <skill>');
        process.exit(1);
      }
      script = path.join(checkerDir, 'check-skill.ps1');
      scriptArgs = ['-Skill', name, '-OptimizeDescription'];
      break;
    }
    case 'review-evals': {
      // Usage: kushi review-evals <skill>
      const name = resolveSkillName(rest);
      if (!name) {
        console.error('Usage: kushi-agents review-evals <skill>');
        process.exit(1);
      }
      script = path.join(checkerDir, 'check-skill.ps1');
      scriptArgs = ['-Skill', name, '-Review'];
      break;
    }
    default:
      console.error(`Unknown skill verb: ${verb}`);
      process.exit(1);
  }

  const result = spawnSync('pwsh', ['-NoProfile', '-File', script, ...scriptArgs], { stdio: 'inherit' });
  if (result.error) {
    // spawnSync reports launch failures (e.g. pwsh not on PATH) via `error`
    // with a null status; surface the cause instead of exiting silently.
    const hint = result.error.code === 'ENOENT' ? ' PowerShell 7+ (pwsh) must be installed and on PATH.' : '';
    console.error(`Failed to run pwsh for "${verb}": ${result.error.message}.${hint}`);
    process.exit(1);
  }
  // A null status means the child was terminated by a signal; treat it as failure.
  process.exit(result.status ?? 1);
}
212
+
213
/**
 * Read the value of a command-line flag from a token list.
 *
 * Two spellings are recognised, checked in this order:
 *   1. `--flag value` — the token right after the first exact `flag` match.
 *   2. `--flag=value` — the text after `=` in the first token with that prefix.
 *
 * @param {string[]} args Tokens to search.
 * @param {string} flag Flag name including its dashes, e.g. `--type`.
 * @returns {string|undefined} The flag's value, or undefined when absent.
 */
function pickFlag(args, flag) {
  const position = args.indexOf(flag);
  const hasFollowingToken = position !== -1 && position + 1 < args.length;
  if (hasFollowingToken) {
    return args[position + 1];
  }

  // The flag was missing, or was the final token: fall back to `--flag=value`.
  const inlinePrefix = `${flag}=`;
  for (const token of args) {
    if (token.startsWith(inlinePrefix)) {
      return token.slice(inlinePrefix.length);
    }
  }
  return undefined;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "kushi-agents",
3
- "version": "5.0.2",
3
+ "version": "5.0.4",
4
4
  "description": "Install Kushi — multi-source project evidence agent with Comprehensive Structured Capture (CSC) into weekly-only files across Email, Teams, OneNote, Loop, SharePoint, Meetings, CRM, ADO. Meetings retain a sibling verbatim/ audit folder. WorkIQ-only for M365 sources (Graph / m365_* FORBIDDEN as fallbacks; user-paste is first-class). Host-agnostic.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -41,9 +41,13 @@
41
41
  },
42
42
  "license": "MIT",
43
43
  "scripts": {
44
- "test": "node --test src/check-workiq.test.mjs src/seed-config.test.mjs src/sanitize-workiq-input.test.mjs src/detect-vertex-repo.test.mjs src/vertex-validate.test.mjs src/emit-vertex.e2e.test.mjs src/config-root-resolve.test.mjs src/forbidden-workiq-phrasings.test.mjs src/multi-host-install.test.mjs",
44
+ "test": "node --test src/check-workiq.test.mjs src/seed-config.test.mjs src/sanitize-workiq-input.test.mjs src/detect-vertex-repo.test.mjs src/vertex-validate.test.mjs src/emit-vertex.e2e.test.mjs src/config-root-resolve.test.mjs src/forbidden-workiq-phrasings.test.mjs src/multi-host-install.test.mjs src/eval-aggregator.test.mjs src/eval-runner.test.mjs src/skill-creator.test.mjs src/skill-checker.test.mjs",
45
45
  "test:integration:bootstrap": "node src/bootstrap-dryrun.integration.test.mjs",
46
46
  "smoke": "node scripts/smoke.mjs",
47
+ "eval": "pwsh plugin/skills/eval/run-evals.ps1 -Skill",
48
+ "eval:all": "pwsh plugin/skills/eval/run-evals.ps1 -All",
49
+ "eval:canary": "pwsh plugin/skills/eval/run-evals.ps1 -Canary",
50
+ "eval:baseline": "pwsh plugin/skills/eval/run-evals.ps1 -All -UpdateBaseline",
47
51
  "prepublishOnly": "npm test && npm run smoke"
48
52
  },
49
53
  "publishConfig": {
@@ -16,7 +16,7 @@ Kushi ships in three profiles. The installed profile is recorded in `kushi-insta
16
16
 
17
17
  | Profile | What's installed | Verbs available |
18
18
  |---|---|---|
19
- | `core` | Aggregator only: `setup`, `pull-*`, `consolidate-evidence`, `aggregate-project`, `ask-project`, `project-status`, `vertex-link`, `emit-vertex`, `self-check`, `intro` | `setup`, `aggregate`, `consolidate`, `status`, `pull`, `ask`, `vertex-link`, `emit-vertex` |
19
+ | `core` | Aggregator only: `setup`, `pull-*`, `consolidate-evidence`, `aggregate-project`, `ask-project`, `project-status`, `vertex-link`, `emit-vertex`, `self-check`, `eval`, `intro` | `setup`, `aggregate`, `consolidate`, `status`, `pull`, `ask`, `vertex-link`, `emit-vertex` |
20
20
  | `standard` *(default)* | core + `bootstrap-project`, `refresh-project`, `fde-intake`, `fde-report`, `fde-triage` + FDE reference pack | core + `bootstrap`, `refresh`, `fde-intake`, `fde-report`, `fde-triage` |
21
21
  | `full` | standard + `build-state` | standard + `state` |
22
22
  | **`preview`** *(opt-in)* | standard + `propose-ado-update`, `apply-ado-update` | standard + `propose-ado`, `apply-ado` |
@@ -182,4 +182,6 @@ Meta skills (not called by verbs):
182
182
  | Skill | Role |
183
183
  |---|---|
184
184
  | `self-check` | Pre-commit consistency check across skills, instructions, prompts, and docs. Run with `pwsh plugin/skills/self-check/run.ps1` (or `./run.sh` on macOS/Linux) or by asking "kushi self-check". |
185
+ | `skill-creator` | (v5.0.4) Scaffolds a new compliant skill — frontmatter + USE WHEN description + type-driven required section + starter evals. Run with `node bin/cli.mjs create-skill --name <kebab> --type <writer\|orchestrator\|pull\|other> --description "..."`. |
186
+ | `skill-checker` | (v5.0.4) Lints + retrofits every SKILL.md against `skill-authoring.instructions.md`. Modes: `-Lint` / `-Retrofit` / `-Apply` / `-OptimizeDescription` / `-Review` / `-All`. Run with `node bin/cli.mjs check-skill --all`. |
185
187
  | `intro` | Self-introduction + interactive walkthrough. Triggered by "what is kushi", "what can you do", "kushi intro", "i'm new to kushi", "kushi help". |
@@ -0,0 +1,147 @@
1
+ ---
2
+ name: "skill-authoring"
3
+ description: "v5.0.4 — How to author a new kushi skill so it ships conformant to the agentskills.io blueprint on day one. Codifies the required SKILL.md sections, file layout, evals starter, naming, and the description-optimization rules. Read this before running `npx kushi-agents create-skill`. Enforced by self-check D34.creator-output-conforms + by `kushi check-skill --lint`."
4
+ applies_to: "every plugin/skills/<name>/ created from v5.0.4 onward; existing skills are audited via the dogfood gate"
5
+ since: "kushi v5.0.4"
6
+ ---
7
+
8
+ # skill-authoring — doctrine
9
+
10
+ > Inspired by **Anthropic's [skill-creator](https://github.com/anthropics/skills/blob/main/skills/skill-creator/SKILL.md)**. Adapted to kushi's PowerShell-first stack, our 2-host install matrix, and the reality that the first 30 kushi skills were authored before any harness existed — hence the **retrofit** path in `skill-checker`.
11
+
12
+ ## Why this exists
13
+
14
+ A SKILL.md is the prompt that loads into the agent the moment its trigger fires. Drift between intent and spec is silent until evals catch it (and only if there are evals). This doctrine + the `skill-creator` + `skill-checker` skills make conformant authoring the default, not an afterthought.
15
+
16
+ ## Required files per skill
17
+
18
+ ```text
19
+ plugin/skills/<name>/
20
+ ├── SKILL.md ← REQUIRED — agent-loaded prompt; ≤500 lines, ≤5000 tokens
21
+ ├── evals/
22
+ │ └── evals.json ← REQUIRED — ≥2 cases, each with ≥1 assertion
23
+ ├── references/ ← OPTIONAL — load-on-trigger bulk content (>500 lines splits here)
24
+ └── .created-by-skill-creator ← OPTIONAL marker — set by `scaffold.ps1`; opts into the strict D34 gate
25
+ ```
26
+
27
+ A skill MAY also ship a runner (`run.ps1`, `run.sh`, `*.mjs`) when it's a tool-skill (`self-check`, `eval`, `skill-checker`, etc.). Pure prompt-skills don't need one.
28
+
29
+ ## Required SKILL.md sections
30
+
31
+ Every SKILL.md MUST have:
32
+
33
+ 1. **YAML frontmatter** — `name` (kebab-case, matches dir) + `description` (lead with `USE WHEN`).
34
+ 2. **`# Skill: <name>`** H1.
35
+ 3. **One-paragraph purpose** immediately after the H1.
36
+ 4. **At least one** of these procedure-shape sections, picked by skill **type**:
37
+ - `## Gotchas` — REQUIRED for `pull-*` and discovery skills (top-5 failure modes).
38
+ - `## Step checklist` — REQUIRED for orchestrators (`bootstrap-project`, `refresh-project`, `build-state`, `link-entities`, `dashboard`, `tour`, etc.); use GitHub `- [ ]` checkboxes.
39
+ - `## Validation loop` — REQUIRED for writer skills (anything that writes to `Evidence/`, `State/`, `_graph/`, `dashboards/`, `tours/`).
40
+ - `## Steps` or `## Procedure` — acceptable for other skills, plus one of the three above where it applies.
41
+
42
+ Skills can have more than one (e.g. `eval` ships all three). The checker is satisfied by **at least one** of `Gotchas` / `Step checklist` / `Validation loop` plus type-specific rules.
43
+
44
+ ## Description optimization (per agentskills.io)
45
+
46
+ The `description:` is the sole trigger signal. Optimize it:
47
+
48
+ ```yaml
49
+ description: "USE WHEN <situational trigger> AND <precondition>. DO NOT USE for <near-miss>. <one-line capability summary>."
50
+ ```
51
+
52
+ Rules (enforced by `D30.description-optimized` + `skill-checker --optimize-description`):
53
+
54
+ - Lead with `USE WHEN` (first 160 chars).
55
+ - Include a `DO NOT USE` clause for the most likely near-miss invocation.
56
+ - Be specific about the trigger (concrete user phrases or file/state conditions).
57
+ - No marketing fluff ("powerful", "comprehensive", "blazing").
58
+ - ≤1024 characters total.
59
+
60
+ | Bad | Good |
61
+ |---|---|
62
+ | `"Pulls OneNote pages."` | `"USE WHEN refreshing project evidence for a known kushi project AND boundaries.onenote.section_ids is non-empty. DO NOT USE for global OneNote search."` |
63
+ | `"Comprehensive eval framework."` | `"USE WHEN the user says 'run evals', 'eval canary', or before tagging a release. DO NOT USE for evidence validation of a real project."` |
64
+
65
+ ## Size caps
66
+
67
+ - ≤ 500 lines (`D30.skill-size`)
68
+ - ≤ 5000 tokens (~20 KB)
69
+
70
+ When you exceed either, **split into `references/<topic>.md` files** and cite them with explicit triggers:
71
+
72
+ ```markdown
73
+ Load `references/canonical-prompts.md` when constructing the WorkIQ query.
74
+ Load `references/error-modes.md` if the API returns non-200.
75
+ ```
76
+
77
+ Passive links (`[see foo](references/foo.md)`) do NOT count. The checker requires the literal substring `references/<file>.md` somewhere in SKILL.md if `references/` exists.
78
+
79
+ ## Evals (≥2 cases)
80
+
81
+ Every skill MUST ship `evals/evals.json` validated against `plugin/skills/eval/evals.schema.json`. Per `skill-evals.instructions.md`:
82
+
83
+ - `id`, `name`, `input`, `expected_assertions[]` (≥1), `grader_type` (`script` | `llm`).
84
+ - ≥2 cases. Mark canary-worthy ones `"canary": true`.
85
+ - Synthetic fixtures only — never real customer data.
86
+
87
+ The `create-skill` scaffold emits a starter `evals.json` with one `file-exists` and one `regex-match` case so the skill ships green from minute one.
88
+
89
+ ## Naming conventions
90
+
91
+ - **Skill directory + frontmatter `name`** — kebab-case, verb-led (`pull-onenote`, `consolidate-evidence`, `apply-ado-update`).
92
+ - **Skill types** (informational; pick at create time so the scaffold picks the right sections):
93
+ - `pull` — fetches evidence from a source.
94
+ - `writer` — writes files to `Evidence/` or `State/`.
95
+ - `orchestrator` — coordinates other skills.
96
+ - `other` — utility / tool / meta.
97
+ - **Instruction files** — `<topic>.instructions.md` in `plugin/instructions/`. Front-matter `name:` matches filename minus `.instructions.md`.
98
+
99
+ ## Contributor workflow
100
+
101
+ ```powershell
102
+ # 1. Scaffold
103
+ npx kushi-agents create-skill my-new-skill
104
+ # → optionally pass --type <pull|writer|orchestrator|other> and --description "<one-liner>"; when omitted, the CLI uses type `other` and a generic USE WHEN description
105
+ # → emits plugin/skills/my-new-skill/{SKILL.md, evals/evals.json}
106
+ # → marker file .created-by-skill-creator is written
107
+
108
+ # 2. Fill in the placeholders (search for "TODO(skill-creator)")
109
+
110
+ # 3. Validate
111
+ npx kushi-agents check-skill my-new-skill # lint mode
112
+ npm run eval -- my-new-skill # run evals
113
+
114
+ # 4. Optimize description before PR
115
+ npx kushi-agents optimize-description my-new-skill
116
+ # → emits a rewritten description; you decide whether to apply
117
+
118
+ # 5. Self-check + commit
119
+ pwsh plugin/skills/self-check/run.ps1 -Deep
120
+ git add plugin/skills/my-new-skill/
121
+ git commit -m "v<x.y.z>: my-new-skill"
122
+ ```
123
+
124
+ ## Retrofit (existing skills predating the harness)
125
+
126
+ Run `npx kushi-agents check-skill --all --retrofit` to identify gaps in legacy skills. `--apply` adds missing section stubs with `<!-- TODO(retrofit): fill in -->` markers — never overwrites existing content. The v5.0.4 dogfood report at `docs/audits/v5.0.4-skill-creator-dogfood.md` records the baseline.
127
+
128
+ ## Enforcement
129
+
130
+ | Check | What it does |
131
+ |---|---|
132
+ | `D34.skill-creator-exists` | `plugin/skills/skill-creator/scaffold.ps1` is parseable. |
133
+ | `D34.skill-checker-exists` | `plugin/skills/skill-checker/check-skill.ps1` is parseable. |
134
+ | `D34.creator-output-conforms` | Every skill carrying `.created-by-skill-creator` passes `check-skill --lint`. |
135
+ | `D34.retrofit-clean` | `check-skill --all --retrofit --dry-run` shows no unresolved non-additive gaps. |
136
+ | `D34.dogfood-report-fresh` | `docs/audits/v5.0.4-skill-creator-dogfood.md` was touched within 14 days (warn-only). |
137
+
138
+ ## References
139
+
140
+ - `plugin/instructions/agentskills-compliance.instructions.md` — the spec rules this builds on.
141
+ - `plugin/instructions/skill-evals.instructions.md` — the evals doctrine.
142
+ - `plugin/skills/skill-creator/SKILL.md` — the scaffolder.
143
+ - `plugin/skills/skill-checker/SKILL.md` — the linter / retrofitter.
144
+ - `docs/contributing/skill-authoring.md` — the human walkthrough.
145
+ - <https://github.com/anthropics/skills/blob/main/skills/skill-creator/SKILL.md> — upstream inspiration.
146
+ - <https://agentskills.io/skill-creation/best-practices>
147
+ - <https://agentskills.io/skill-creation/optimizing-descriptions>
@@ -0,0 +1,130 @@
1
+ ---
2
+ description: "v5.0.3 — Skill evals doctrine, adapted from https://agentskills.io/skill-creation/evaluating-skills. Every skill MUST ship an evals/ folder with at least 2 deterministic cases plus structured assertions; a per-skill pass-rate is the objective regression signal. Canary subset runs on every PR; full suite runs on demand. Real customer data is FORBIDDEN in fixtures — use synthetic data only."
3
+ ---
4
+
5
+ # Skill evals — doctrine
6
+
7
+ > Inspired by **<https://agentskills.io/skill-creation/evaluating-skills>**. Adapted to kushi's PowerShell + Node test stack and to our 2-host install matrix.
8
+
9
+ ## Why
10
+
11
+ Skills are prompts plus a runner. Prompts drift silently. Without an objective per-skill regression signal, every change is a gamble. Evals make that signal cheap:
12
+
13
+ - **Per-skill pass-rate** is the headline metric.
14
+ - **Latency** and **tokens** are secondary metrics. A regression against the baseline is flagged on a ≥10pp pass-rate drop, or on a ≥50% increase in latency or tokens.
15
+ - A **canary subset** runs on every PR (target: < 60s wall clock); the **full suite** runs on demand (`npm run eval:all`).
16
+
17
+ ## Where evals live
18
+
19
+ ```text
20
+ plugin/skills/<name>/
21
+ ├── SKILL.md
22
+ └── evals/
23
+ ├── evals.json ← REQUIRED — case list + assertions
24
+ └── fixtures/ ← OPTIONAL per-skill fixtures
25
+ ```
26
+
27
+ Cross-skill fixtures live at the repo root:
28
+
29
+ ```text
30
+ evals/
31
+ ├── baseline.json ← Committed; maintainer updates with `npm run eval:baseline`
32
+ └── fixtures/ ← Tiny synthetic evidence trees, ADO fixtures, etc.
33
+ ```
34
+
35
+ Per-run output goes to `Evidence/_evals/<timestamp>.json` (gitignored; not customer data).
36
+
37
+ ## Case schema
38
+
39
+ ```jsonc
40
+ {
41
+ "skill": "<skill-name>",
42
+ "cases": [
43
+ {
44
+ "id": "ap-citations-format",
45
+ "name": "ask-project emits weekly-csc citation form",
46
+ "input": "what was decided about MACC for fixture-acme?",
47
+ "fixture": "evals/fixtures/fixture-acme", // optional
48
+ "canary": true,
49
+ "grader_type": "script", // "script" | "llm"
50
+ "expected_assertions": [
51
+ { "type": "regex-match", "pattern": "\\[source:\\s*fixture-acme/email/weekly/" },
52
+ { "type": "regex-match", "pattern": "Source-layout:\\s*weekly-csc" }
53
+ ]
54
+ }
55
+ ]
56
+ }
57
+ ```
58
+
59
+ ### Required fields per case
60
+
61
+ - `id` — unique within the skill; kebab-case.
62
+ - `name` — human-readable.
63
+ - `input` — what gets passed to the skill (string OR object).
64
+ - `expected_assertions` — array, **≥ 1** entry (enforced by `D33.evals-have-assertions`).
65
+ - `grader_type` — `"script"` for deterministic graders, `"llm"` for rubric-based.
66
+
67
+ ### Optional fields
68
+
69
+ - `fixture` — repo-relative path to the fixture to point the skill at.
70
+ - `canary` — `true` to include in the fast CI subset.
71
+ - `args` — extra args forwarded to the skill script (e.g. `{ "DryRun": true }`).
72
+ - `skip` — `true` to skip (must include `skip_reason`).
73
+ - `timeout_ms` — override the runner default (30 000 ms).
74
+
75
+ ## Assertion types
76
+
77
+ | Type | Shape | Passes when |
78
+ |---|---|---|
79
+ | `file-exists` | `{ "type": "file-exists", "path": "..." }` | Path exists post-run (relative to fixture or evidence dir). |
80
+ | `file-contains` | `{ "type": "file-contains", "path": "...", "needle": "..." }` | File exists and substring is present. |
81
+ | `json-path-equals` | `{ "type": "json-path-equals", "path": "...", "json_path": "$.foo.bar", "equals": "v" }` | JSON file parses; dotted path value === expected. |
82
+ | `regex-match` | `{ "type": "regex-match", "pattern": "...", "flags": "i" }` | Captured stdout matches the regex. |
83
+ | `llm-rubric` | `{ "type": "llm-rubric", "rubric": "...", "min_score": 4 }` | LLM grader scores ≥ min on a 1–5 rubric. |
84
+
85
+ ## Run modes
86
+
87
+ The runner (`plugin/skills/eval/run-evals.ps1`) supports three dispatch modes:
88
+
89
+ 1. **Direct invocation** (default for `script` graders). Runs the skill's executable artifact (`run.ps1`, `*.mjs`, or a small probe stub) with the given input and fixture. Pure deterministic.
90
+ 2. **Sub-agent dispatch** (optional, gated by `-Live`). Forwards the case to a sub-agent. Used only for `llm-rubric` cases. Skipped in canary mode.
91
+ 3. **Recorded fixture replay** (for `pull-*` skills). Reads a recorded `--cached` output of a real pull and asserts against that, so no live M365 calls are needed.
92
+
93
+ For each case the runner records: `pass`, `duration_ms`, `tokens_in`, `tokens_out`, `stdout`, `stderr`, per-assertion `pass`/`reason`. The aggregate is a JSON file under `Evidence/_evals/` plus a one-line `benchmark.json` summary.
94
+
95
+ ## Canary set
96
+
97
+ Marked with `"canary": true`. Kept tiny so PRs stay fast.
98
+
99
+ Default canary set (v5.0.3):
100
+
101
+ - `ask-project`
102
+ - `bootstrap-project`
103
+ - `refresh-project`
104
+ - `link-entities`
105
+ - `build-state`
106
+ - `self-check`
107
+
108
+ ## Baseline + regression detection
109
+
110
+ - `evals/baseline.json` is **committed**.
111
+ - Each per-skill record carries the last green `pass_rate`, `mean_duration_ms`, and `mean_tokens_total`.
112
+ - `src/eval-aggregator.mjs` flags **regressions**:
113
+ - `pass_rate` drop ≥ 10 percentage points
114
+ - `mean_duration_ms` increase ≥ 50 %
115
+ - `mean_tokens_total` increase ≥ 50 %
116
+ - Maintainers refresh the baseline with `npm run eval:baseline` after deliberate behavior changes.
117
+
118
+ ## Privacy + safety
119
+
120
+ - **No real customer data** in any fixture. Use `fixture-acme`-style synthetic names.
121
+ - `Evidence/_evals/` is in `.gitignore`.
122
+ - `pull-*` evals NEVER hit live M365 endpoints in canary mode. Use recorded `--cached` payloads or `--dry-run`.
123
+ - Tenant IDs / GUIDs in fixtures must be obviously fake (e.g. `00000000-...`).
124
+
125
+ ## References
126
+
127
+ - [agentskills.io — evaluating skills](https://agentskills.io/skill-creation/evaluating-skills) (source of truth)
128
+ - `plugin/skills/eval/SKILL.md` (the runner skill)
129
+ - `plugin/skills/eval/evals.schema.json` (JSON schema; self-check D33.evals-schema)
130
+ - `plugin/instructions/agentskills-compliance.instructions.md` (sibling doctrine — size + section caps)
@@ -0,0 +1,33 @@
1
+ {
2
+ "skill": "aggregate-project",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for aggregate-project. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "aggregate-project-smoke-1",
8
+ "name": "aggregate-project produces a non-empty response",
9
+ "input": "synthetic aggregate-project probe — canary smoke",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": ".+"
16
+ }
17
+ ]
18
+ },
19
+ {
20
+ "id": "aggregate-project-smoke-2",
21
+ "name": "aggregate-project echoes case id",
22
+ "input": "case-id aggregate-project-smoke-2",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ {
27
+ "type": "regex-match",
28
+ "pattern": "aggregate-project-smoke-2"
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
@@ -0,0 +1,33 @@
1
+ {
2
+ "skill": "apply-ado-update",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for apply-ado-update. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "apply-ado-update-smoke-1",
8
+ "name": "apply-ado-update produces a non-empty response",
9
+ "input": "synthetic apply-ado-update probe — canary smoke",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": ".+"
16
+ }
17
+ ]
18
+ },
19
+ {
20
+ "id": "apply-ado-update-smoke-2",
21
+ "name": "apply-ado-update echoes case id",
22
+ "input": "case-id apply-ado-update-smoke-2",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ {
27
+ "type": "regex-match",
28
+ "pattern": "apply-ado-update-smoke-2"
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
@@ -197,3 +197,13 @@ Explicit triggers also accepted:
197
197
 
198
198
  - **v4.0.0 (kushi v5.0.0, 2026-05-26)**: graph-first cross-source resolution — consult `Evidence/_graph/project-graph.json` before walking weekly files when a question spans sources. Falls back to v4.9.0 walking strategy if graph absent/stale.
199
199
  - **v3.0.0 (kushi v4.9.0, 2026-05-26)**: 3-step reader fallback chain (`_index/entities.yml` → `weekly/*.md` → legacy `snapshot/` + `stream/`). New citation form `weekly/<YYYY-MM-DD>_<source>-csc.md#<anchor>`. Legacy citations suffixed `(legacy pre-v4.9.0 layout)`. Output marked with `Source-layout:` footer.
200
+
201
+
202
+ ## Validation loop
203
+
204
+ <!-- TODO(retrofit): fill in — describe how to verify this skill ran correctly. Auto-added by skill-checker --retrofit --apply per skill-authoring.instructions.md. -->
205
+
206
+ 1. Run `pwsh plugin/skills/self-check/run.ps1 -Targeted <area>`.
207
+ 2. Fix any findings, then re-run the affected step.
208
+ 3. Repeat until self-check exits 0.
209
+ 4. Only then update
@@ -0,0 +1,34 @@
1
+ {
2
+ "skill": "ask-project",
3
+ "version": "1.0.0",
4
+ "description": "Verifies citation format (weekly-csc) and Source-layout footer.",
5
+ "cases": [
6
+ {
7
+ "id": "ap-macc-citation",
8
+ "name": "answers MACC question with weekly-csc citation",
9
+ "input": "what was decided about MACC for fixture-acme?",
10
+ "fixture": "evals/fixtures/fixture-acme",
11
+ "canary": true,
12
+ "grader_type": "script",
13
+ "args": { "read_fixture": "../outputs/ask-project.macc.txt" },
14
+ "expected_assertions": [
15
+ { "type": "regex-match", "pattern": "MACC" },
16
+ { "type": "regex-match", "pattern": "\\[source:\\s*fixture-acme/email/weekly/" },
17
+ { "type": "regex-match", "pattern": "Source-layout:\\s*weekly-csc" }
18
+ ]
19
+ },
20
+ {
21
+ "id": "ap-em-question",
22
+ "name": "answers who-is-the-EM with cited person",
23
+ "input": "who is the lead on fixture-acme?",
24
+ "fixture": "evals/fixtures/fixture-acme",
25
+ "canary": false,
26
+ "grader_type": "script",
27
+ "args": { "read_fixture": "../outputs/ask-project.lead.txt" },
28
+ "expected_assertions": [
29
+ { "type": "regex-match", "pattern": "alice@fixture\\.local" },
30
+ { "type": "regex-match", "pattern": "Source-layout:\\s*weekly-csc" }
31
+ ]
32
+ }
33
+ ]
34
+ }
@@ -0,0 +1,34 @@
1
+ {
2
+ "skill": "bootstrap-project",
3
+ "version": "1.0.0",
4
+ "description": "Asserts the canonical engagement folder layout from a dry-run.",
5
+ "cases": [
6
+ {
7
+ "id": "bp-dryrun-layout",
8
+ "name": "dry-run prints the canonical layout",
9
+ "input": "bootstrap fixture-acme --dry-run",
10
+ "fixture": "evals/fixtures/fixture-acme",
11
+ "canary": true,
12
+ "grader_type": "script",
13
+ "args": { "read_fixture": "../outputs/bootstrap-project.dryrun.txt" },
14
+ "expected_assertions": [
15
+ { "type": "regex-match", "pattern": "DRY-RUN" },
16
+ { "type": "regex-match", "pattern": "fixture-acme/Evidence/fixture-alias/email/weekly/" },
17
+ { "type": "regex-match", "pattern": "fixture-acme/State/" },
18
+ { "type": "regex-match", "pattern": "fixture-acme/Evidence/_graph/" }
19
+ ]
20
+ },
21
+ {
22
+ "id": "bp-fixture-tree-present",
23
+ "name": "fixture engagement tree exists on disk",
24
+ "input": "verify fixture tree",
25
+ "fixture": "evals/fixtures/fixture-acme",
26
+ "canary": false,
27
+ "grader_type": "script",
28
+ "expected_assertions": [
29
+ { "type": "file-exists", "path": "Evidence/fixture-alias/email/weekly/2026-05-18_email-csc.md" },
30
+ { "type": "file-exists", "path": "State/index.md" }
31
+ ]
32
+ }
33
+ ]
34
+ }