kushi-agents 5.0.2 → 5.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/README.md +22 -0
  2. package/package.json +6 -2
  3. package/plugin/agents/kushi.agent.md +1 -1
  4. package/plugin/instructions/skill-evals.instructions.md +130 -0
  5. package/plugin/skills/aggregate-project/evals/evals.json +33 -0
  6. package/plugin/skills/apply-ado-update/evals/evals.json +33 -0
  7. package/plugin/skills/ask-project/evals/evals.json +34 -0
  8. package/plugin/skills/bootstrap-project/evals/evals.json +34 -0
  9. package/plugin/skills/build-state/evals/evals.json +31 -0
  10. package/plugin/skills/consolidate-evidence/evals/evals.json +33 -0
  11. package/plugin/skills/dashboard/evals/evals.json +33 -0
  12. package/plugin/skills/emit-vertex/evals/evals.json +33 -0
  13. package/plugin/skills/eval/SKILL.md +90 -0
  14. package/plugin/skills/eval/evals.schema.json +73 -0
  15. package/plugin/skills/eval/run-evals.ps1 +372 -0
  16. package/plugin/skills/fde-intake/evals/evals.json +33 -0
  17. package/plugin/skills/fde-report/evals/evals.json +33 -0
  18. package/plugin/skills/fde-triage/evals/evals.json +33 -0
  19. package/plugin/skills/intro/evals/evals.json +33 -0
  20. package/plugin/skills/link-entities/evals/evals.json +31 -0
  21. package/plugin/skills/project-status/evals/evals.json +33 -0
  22. package/plugin/skills/propose-ado-update/evals/evals.json +33 -0
  23. package/plugin/skills/pull-ado/evals/evals.json +35 -0
  24. package/plugin/skills/pull-crm/evals/evals.json +35 -0
  25. package/plugin/skills/pull-email/evals/evals.json +35 -0
  26. package/plugin/skills/pull-loop/evals/evals.json +35 -0
  27. package/plugin/skills/pull-meetings/evals/evals.json +35 -0
  28. package/plugin/skills/pull-misc/evals/evals.json +35 -0
  29. package/plugin/skills/pull-onenote/evals/evals.json +35 -0
  30. package/plugin/skills/pull-sharepoint/evals/evals.json +35 -0
  31. package/plugin/skills/pull-teams/evals/evals.json +35 -0
  32. package/plugin/skills/refresh-project/evals/evals.json +31 -0
  33. package/plugin/skills/self-check/SKILL.md +1 -0
  34. package/plugin/skills/self-check/evals/evals.json +28 -0
  35. package/plugin/skills/self-check/run.ps1 +63 -0
  36. package/plugin/skills/setup/evals/evals.json +33 -0
  37. package/plugin/skills/tour/evals/evals.json +33 -0
  38. package/plugin/skills/vertex-link/evals/evals.json +33 -0
  39. package/src/eval-aggregator.mjs +209 -0
  40. package/src/eval-aggregator.test.mjs +64 -0
  41. package/src/eval-runner.test.mjs +69 -0
package/README.md CHANGED
@@ -235,6 +235,28 @@ npm pack --dry-run
235
235
 
236
236
  The self-check validates frontmatter, agent inventory, prompt → skill routing, profile manifest, reference packs, cross-links, the verbs table in this README, and the layout diagram in `docs/reference/where-things-live.md`. Full reference: [docs/reference/self-check.md](docs/reference/self-check.md).
237
237
 
238
+ ## Evaluating skills (v5.0.3+)
239
+
240
+ Every skill ships per-case evals at `plugin/skills/<name>/evals/evals.json`, aligned with the [agentskills.io evaluating-skills spec](https://agentskills.io/skill-creation/evaluating-skills). Doctrine: [`plugin/instructions/skill-evals.instructions.md`](plugin/instructions/skill-evals.instructions.md).
241
+
242
+ Quickstart:
243
+
244
+ ```powershell
245
+ npm run eval:canary # ~6 skills, runs in seconds — what PRs run
246
+ npm run eval:all # full suite (every plugin/skills/<name>/)
247
+ npm run eval -- ask-project # one skill
248
+ npm run eval:baseline # maintainer-only: refresh evals/baseline.json
249
+ ```
250
+
251
+ Outputs:
252
+
253
+ - `Evidence/_evals/<utc-ts>.json` — per-run JSON (pass/fail + duration + tokens per case).
254
+ - `Evidence/_evals/benchmark.json` — per-skill mean/stddev for `pass_rate`, `duration_ms`, `tokens_total` + regression flags vs `evals/baseline.json`.
255
+
256
+ Regressions are flagged on a ≥10pp pass-rate drop OR a ≥50% latency/token increase. The canary subset is `ask-project`, `bootstrap-project`, `refresh-project`, `link-entities`, `build-state`, `self-check`.
257
+
258
+ **Privacy:** fixtures under `evals/fixtures/` are synthetic. NEVER copy real customer data into the evals tree.
259
+
238
260
  ## License
239
261
 
240
262
  See [LICENSE](LICENSE).
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "kushi-agents",
3
- "version": "5.0.2",
3
+ "version": "5.0.3",
4
4
  "description": "Install Kushi — multi-source project evidence agent with Comprehensive Structured Capture (CSC) into weekly-only files across Email, Teams, OneNote, Loop, SharePoint, Meetings, CRM, ADO. Meetings retain a sibling verbatim/ audit folder. WorkIQ-only for M365 sources (Graph / m365_* FORBIDDEN as fallbacks; user-paste is first-class). Host-agnostic.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -41,9 +41,13 @@
41
41
  },
42
42
  "license": "MIT",
43
43
  "scripts": {
44
- "test": "node --test src/check-workiq.test.mjs src/seed-config.test.mjs src/sanitize-workiq-input.test.mjs src/detect-vertex-repo.test.mjs src/vertex-validate.test.mjs src/emit-vertex.e2e.test.mjs src/config-root-resolve.test.mjs src/forbidden-workiq-phrasings.test.mjs src/multi-host-install.test.mjs",
44
+ "test": "node --test src/check-workiq.test.mjs src/seed-config.test.mjs src/sanitize-workiq-input.test.mjs src/detect-vertex-repo.test.mjs src/vertex-validate.test.mjs src/emit-vertex.e2e.test.mjs src/config-root-resolve.test.mjs src/forbidden-workiq-phrasings.test.mjs src/multi-host-install.test.mjs src/eval-aggregator.test.mjs src/eval-runner.test.mjs",
45
45
  "test:integration:bootstrap": "node src/bootstrap-dryrun.integration.test.mjs",
46
46
  "smoke": "node scripts/smoke.mjs",
47
+ "eval": "pwsh plugin/skills/eval/run-evals.ps1 -Skill",
48
+ "eval:all": "pwsh plugin/skills/eval/run-evals.ps1 -All",
49
+ "eval:canary": "pwsh plugin/skills/eval/run-evals.ps1 -Canary",
50
+ "eval:baseline": "pwsh plugin/skills/eval/run-evals.ps1 -All -UpdateBaseline",
47
51
  "prepublishOnly": "npm test && npm run smoke"
48
52
  },
49
53
  "publishConfig": {
@@ -16,7 +16,7 @@ Kushi ships in three profiles. The installed profile is recorded in `kushi-insta
16
16
 
17
17
  | Profile | What's installed | Verbs available |
18
18
  |---|---|---|
19
- | `core` | Aggregator only: `setup`, `pull-*`, `consolidate-evidence`, `aggregate-project`, `ask-project`, `project-status`, `vertex-link`, `emit-vertex`, `self-check`, `intro` | `setup`, `aggregate`, `consolidate`, `status`, `pull`, `ask`, `vertex-link`, `emit-vertex` |
19
+ | `core` | Aggregator only: `setup`, `pull-*`, `consolidate-evidence`, `aggregate-project`, `ask-project`, `project-status`, `vertex-link`, `emit-vertex`, `self-check`, `eval`, `intro` | `setup`, `aggregate`, `consolidate`, `status`, `pull`, `ask`, `vertex-link`, `emit-vertex` |
20
20
  | `standard` *(default)* | core + `bootstrap-project`, `refresh-project`, `fde-intake`, `fde-report`, `fde-triage` + FDE reference pack | core + `bootstrap`, `refresh`, `fde-intake`, `fde-report`, `fde-triage` |
21
21
  | `full` | standard + `build-state` | standard + `state` |
22
22
  | **`preview`** *(opt-in)* | standard + `propose-ado-update`, `apply-ado-update` | standard + `propose-ado`, `apply-ado` |
@@ -0,0 +1,130 @@
1
+ ---
2
+ description: "v5.0.3 — Skill evals doctrine, adapted from https://agentskills.io/skill-creation/evaluating-skills. Every skill MUST ship an evals/ folder with at least 2 deterministic cases plus structured assertions; a per-skill pass-rate is the objective regression signal. Canary subset runs on every PR; full suite runs on demand. Real customer data is FORBIDDEN in fixtures — use synthetic data only."
3
+ ---
4
+
5
+ # Skill evals — doctrine
6
+
7
+ > Inspired by **<https://agentskills.io/skill-creation/evaluating-skills>**. Adapted to kushi's PowerShell + Node test stack and to our 2-host install matrix.
8
+
9
+ ## Why
10
+
11
+ Skills are prompts plus a runner. Prompts drift silently. Without an objective per-skill regression signal, every change is a gamble. Evals make that signal cheap:
12
+
13
+ - **Per-skill pass-rate** is the headline metric.
14
+ - **Latency** and **tokens** are secondary metrics (a pass-rate drop ≥10pp, or a latency or token increase ≥50%, flags a regression against the baseline).
15
+ - A **canary subset** runs on every PR (target: < 60s wall clock); the **full suite** runs on demand (`npm run eval:all`).
16
+
17
+ ## Where evals live
18
+
19
+ ```text
20
+ plugin/skills/<name>/
21
+ ├── SKILL.md
22
+ └── evals/
23
+ ├── evals.json ← REQUIRED — case list + assertions
24
+ └── fixtures/ ← OPTIONAL per-skill fixtures
25
+ ```
26
+
27
+ Cross-skill fixtures live at the repo root:
28
+
29
+ ```text
30
+ evals/
31
+ ├── baseline.json ← Committed; maintainer updates with `npm run eval:baseline`
32
+ └── fixtures/ ← Tiny synthetic evidence trees, ADO fixtures, etc.
33
+ ```
34
+
35
+ Per-run output goes to `Evidence/_evals/<timestamp>.json` (gitignored; not customer data).
36
+
37
+ ## Case schema
38
+
39
+ ```jsonc
40
+ {
41
+ "skill": "<skill-name>",
42
+ "cases": [
43
+ {
44
+ "id": "ap-citations-format",
45
+ "name": "ask-project emits weekly-csc citation form",
46
+ "input": "what was decided about MACC for fixture-acme?",
47
+ "fixture": "evals/fixtures/fixture-acme", // optional
48
+ "canary": true,
49
+ "grader_type": "script", // "script" | "llm"
50
+ "expected_assertions": [
51
+ { "type": "regex-match", "pattern": "\\[source:\\s*fixture-acme/email/weekly/" },
52
+ { "type": "regex-match", "pattern": "Source-layout:\\s*weekly-csc" }
53
+ ]
54
+ }
55
+ ]
56
+ }
57
+ ```
58
+
59
+ ### Required fields per case
60
+
61
+ - `id` — unique within the skill; kebab-case.
62
+ - `name` — human-readable.
63
+ - `input` — what gets passed to the skill (string OR object).
64
+ - `expected_assertions` — array, **≥ 1** entry (enforced by `D33.evals-have-assertions`).
65
+ - `grader_type` — `"script"` for deterministic graders, `"llm"` for rubric-based.
66
+
67
+ ### Optional fields
68
+
69
+ - `fixture` — repo-relative path to the fixture to point the skill at.
70
+ - `canary` — `true` to include in the fast CI subset.
71
+ - `args` — extra args forwarded to the skill script (e.g. `{ "DryRun": true }`).
72
+ - `skip` — `true` to skip (must include `skip_reason`).
73
+ - `timeout_ms` — override the runner default (30 000 ms).
74
+
75
+ ## Assertion types
76
+
77
+ | Type | Shape | Passes when |
78
+ |---|---|---|
79
+ | `file-exists` | `{ "type": "file-exists", "path": "..." }` | Path exists post-run (relative to fixture or evidence dir). |
80
+ | `file-contains` | `{ "type": "file-contains", "path": "...", "needle": "..." }` | File exists and substring is present. |
81
+ | `json-path-equals` | `{ "type": "json-path-equals", "path": "...", "json_path": "$.foo.bar", "equals": "v" }` | JSON file parses; dotted path value === expected. |
82
+ | `regex-match` | `{ "type": "regex-match", "pattern": "...", "flags": "i" }` | Captured stdout matches the regex. |
83
+ | `llm-rubric` | `{ "type": "llm-rubric", "rubric": "...", "min_score": 4 }` | LLM grader scores ≥ min on a 1–5 rubric. |
84
+
85
+ ## Run modes
86
+
87
+ The runner (`plugin/skills/eval/run-evals.ps1`) supports three dispatch modes:
88
+
89
+ 1. **Direct invocation** (default for `script` graders). Runs the skill's executable artifact (`run.ps1`, `*.mjs`, or a small probe stub) with the given input and fixture. Pure deterministic.
90
+ 2. **Sub-agent dispatch** (optional, gated by `-Live`). Forwards the case to a sub-agent. Used only for `llm-rubric` cases. Skipped in canary mode.
91
+ 3. **Recorded fixture replay** (for `pull-*` skills). Reads a recorded `--cached` output of a real pull and asserts against that, so no live M365 calls are needed.
92
+
93
+ For each case the runner records: `pass`, `duration_ms`, `tokens_in`, `tokens_out`, `stdout`, `stderr`, per-assertion `pass`/`reason`. The aggregate is a JSON file under `Evidence/_evals/` plus a one-line `benchmark.json` summary.
94
+
95
+ ## Canary set
96
+
97
+ Marked with `"canary": true`. Kept tiny so PRs stay fast.
98
+
99
+ Default canary set (v5.0.3):
100
+
101
+ - `ask-project`
102
+ - `bootstrap-project`
103
+ - `refresh-project`
104
+ - `link-entities`
105
+ - `build-state`
106
+ - `self-check`
107
+
108
+ ## Baseline + regression detection
109
+
110
+ - `evals/baseline.json` is **committed**.
111
+ - Each per-skill record carries the last green `pass_rate`, `mean_duration_ms`, and `mean_tokens_total`.
112
+ - `src/eval-aggregator.mjs` flags **regressions**:
113
+ - `pass_rate` drop ≥ 10 percentage points
114
+ - `mean_duration_ms` increase ≥ 50 %
115
+ - `mean_tokens_total` increase ≥ 50 %
116
+ - Maintainers refresh the baseline with `npm run eval:baseline` after deliberate behavior changes.
117
+
118
+ ## Privacy + safety
119
+
120
+ - **No real customer data** in any fixture. Use `fixture-acme`-style synthetic names.
121
+ - `Evidence/_evals/` is in `.gitignore`.
122
+ - `pull-*` evals NEVER hit live M365 endpoints in canary mode. Use recorded `--cached` payloads or `--dry-run`.
123
+ - Tenant IDs / GUIDs in fixtures must be obviously fake (e.g. `00000000-...`).
124
+
125
+ ## References
126
+
127
+ - [agentskills.io — evaluating skills](https://agentskills.io/skill-creation/evaluating-skills) (source of truth)
128
+ - `plugin/skills/eval/SKILL.md` (the runner skill)
129
+ - `plugin/skills/eval/evals.schema.json` (JSON schema; self-check D33.evals-schema)
130
+ - `plugin/instructions/agentskills-compliance.instructions.md` (sibling doctrine — size + section caps)
@@ -0,0 +1,33 @@
1
+ {
2
+ "skill": "aggregate-project",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for aggregate-project. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "aggregate-project-smoke-1",
8
+ "name": "aggregate-project produces a non-empty response",
9
+ "input": "synthetic aggregate-project probe — canary smoke",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": ".+"
16
+ }
17
+ ]
18
+ },
19
+ {
20
+ "id": "aggregate-project-smoke-2",
21
+ "name": "aggregate-project echoes case id",
22
+ "input": "case-id aggregate-project-smoke-2",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ {
27
+ "type": "regex-match",
28
+ "pattern": "aggregate-project-smoke-2"
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
@@ -0,0 +1,33 @@
1
+ {
2
+ "skill": "apply-ado-update",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for apply-ado-update. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "apply-ado-update-smoke-1",
8
+ "name": "apply-ado-update produces a non-empty response",
9
+ "input": "synthetic apply-ado-update probe — canary smoke",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": ".+"
16
+ }
17
+ ]
18
+ },
19
+ {
20
+ "id": "apply-ado-update-smoke-2",
21
+ "name": "apply-ado-update echoes case id",
22
+ "input": "case-id apply-ado-update-smoke-2",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ {
27
+ "type": "regex-match",
28
+ "pattern": "apply-ado-update-smoke-2"
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
@@ -0,0 +1,34 @@
1
+ {
2
+ "skill": "ask-project",
3
+ "version": "1.0.0",
4
+ "description": "Verifies citation format (weekly-csc) and Source-layout footer.",
5
+ "cases": [
6
+ {
7
+ "id": "ap-macc-citation",
8
+ "name": "answers MACC question with weekly-csc citation",
9
+ "input": "what was decided about MACC for fixture-acme?",
10
+ "fixture": "evals/fixtures/fixture-acme",
11
+ "canary": true,
12
+ "grader_type": "script",
13
+ "args": { "read_fixture": "../outputs/ask-project.macc.txt" },
14
+ "expected_assertions": [
15
+ { "type": "regex-match", "pattern": "MACC" },
16
+ { "type": "regex-match", "pattern": "\\[source:\\s*fixture-acme/email/weekly/" },
17
+ { "type": "regex-match", "pattern": "Source-layout:\\s*weekly-csc" }
18
+ ]
19
+ },
20
+ {
21
+ "id": "ap-em-question",
22
+ "name": "answers who-is-the-EM with cited person",
23
+ "input": "who is the lead on fixture-acme?",
24
+ "fixture": "evals/fixtures/fixture-acme",
25
+ "canary": false,
26
+ "grader_type": "script",
27
+ "args": { "read_fixture": "../outputs/ask-project.lead.txt" },
28
+ "expected_assertions": [
29
+ { "type": "regex-match", "pattern": "alice@fixture\\.local" },
30
+ { "type": "regex-match", "pattern": "Source-layout:\\s*weekly-csc" }
31
+ ]
32
+ }
33
+ ]
34
+ }
@@ -0,0 +1,34 @@
1
+ {
2
+ "skill": "bootstrap-project",
3
+ "version": "1.0.0",
4
+ "description": "Asserts the canonical engagement folder layout from a dry-run.",
5
+ "cases": [
6
+ {
7
+ "id": "bp-dryrun-layout",
8
+ "name": "dry-run prints the canonical layout",
9
+ "input": "bootstrap fixture-acme --dry-run",
10
+ "fixture": "evals/fixtures/fixture-acme",
11
+ "canary": true,
12
+ "grader_type": "script",
13
+ "args": { "read_fixture": "../outputs/bootstrap-project.dryrun.txt" },
14
+ "expected_assertions": [
15
+ { "type": "regex-match", "pattern": "DRY-RUN" },
16
+ { "type": "regex-match", "pattern": "fixture-acme/Evidence/fixture-alias/email/weekly/" },
17
+ { "type": "regex-match", "pattern": "fixture-acme/State/" },
18
+ { "type": "regex-match", "pattern": "fixture-acme/Evidence/_graph/" }
19
+ ]
20
+ },
21
+ {
22
+ "id": "bp-fixture-tree-present",
23
+ "name": "fixture engagement tree exists on disk",
24
+ "input": "verify fixture tree",
25
+ "fixture": "evals/fixtures/fixture-acme",
26
+ "canary": false,
27
+ "grader_type": "script",
28
+ "expected_assertions": [
29
+ { "type": "file-exists", "path": "Evidence/fixture-alias/email/weekly/2026-05-18_email-csc.md" },
30
+ { "type": "file-exists", "path": "State/index.md" }
31
+ ]
32
+ }
33
+ ]
34
+ }
@@ -0,0 +1,31 @@
1
+ {
2
+ "skill": "build-state",
3
+ "version": "1.0.0",
4
+ "description": "Karpathy State layout — index.md + log.md + per-category folders.",
5
+ "cases": [
6
+ {
7
+ "id": "bs-state-index",
8
+ "name": "fixture State/index.md has kushi_state_page front-matter",
9
+ "input": "validate state fixture",
10
+ "fixture": "evals/fixtures/fixture-acme",
11
+ "canary": true,
12
+ "grader_type": "script",
13
+ "args": { "read_fixture": "State/index.md" },
14
+ "expected_assertions": [
15
+ { "type": "regex-match", "pattern": "kushi_state_page:\\s*true" }
16
+ ]
17
+ },
18
+ {
19
+ "id": "bs-state-log-exists",
20
+ "name": "fixture State/log.md exists",
21
+ "input": "verify log",
22
+ "fixture": "evals/fixtures/fixture-acme",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ { "type": "file-exists", "path": "State/log.md" },
27
+ { "type": "file-exists", "path": "State/index.md" }
28
+ ]
29
+ }
30
+ ]
31
+ }
@@ -0,0 +1,33 @@
1
+ {
2
+ "skill": "consolidate-evidence",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for consolidate-evidence. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "consolidate-evidence-smoke-1",
8
+ "name": "consolidate-evidence produces a non-empty response",
9
+ "input": "synthetic consolidate-evidence probe — canary smoke",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": ".+"
16
+ }
17
+ ]
18
+ },
19
+ {
20
+ "id": "consolidate-evidence-smoke-2",
21
+ "name": "consolidate-evidence echoes case id",
22
+ "input": "case-id consolidate-evidence-smoke-2",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ {
27
+ "type": "regex-match",
28
+ "pattern": "consolidate-evidence-smoke-2"
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
@@ -0,0 +1,33 @@
1
+ {
2
+ "skill": "dashboard",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for dashboard. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "dashboard-smoke-1",
8
+ "name": "dashboard produces a non-empty response",
9
+ "input": "synthetic dashboard probe — canary smoke",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": ".+"
16
+ }
17
+ ]
18
+ },
19
+ {
20
+ "id": "dashboard-smoke-2",
21
+ "name": "dashboard echoes case id",
22
+ "input": "case-id dashboard-smoke-2",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ {
27
+ "type": "regex-match",
28
+ "pattern": "dashboard-smoke-2"
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
@@ -0,0 +1,33 @@
1
+ {
2
+ "skill": "emit-vertex",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for emit-vertex. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "emit-vertex-smoke-1",
8
+ "name": "emit-vertex produces a non-empty response",
9
+ "input": "synthetic emit-vertex probe — canary smoke",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": ".+"
16
+ }
17
+ ]
18
+ },
19
+ {
20
+ "id": "emit-vertex-smoke-2",
21
+ "name": "emit-vertex echoes case id",
22
+ "input": "case-id emit-vertex-smoke-2",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ {
27
+ "type": "regex-match",
28
+ "pattern": "emit-vertex-smoke-2"
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
@@ -0,0 +1,90 @@
1
+ ---
2
+ name: "eval"
3
+ version: "1.0.0"
4
+ description: "USE WHEN the user says \"run evals\", \"eval canary\", \"eval ask-project\", \"check skill regression\", \"update eval baseline\", or before tagging a release. DO NOT USE for evidence validation of a real project (use ask-project / project-status). Capability: runs per-skill evals (deterministic script graders + optional LLM-rubric graders) from each skill's evals/evals.json, aggregates pass-rate / latency / token metrics, and compares against evals/baseline.json to flag regressions. Synthetic fixtures only — never live customer data."
5
+ ---
6
+
7
+ # Skill: eval
8
+
9
+ The objective regression signal for every other kushi skill. Spec: <https://agentskills.io/skill-creation/evaluating-skills>. Doctrine: `instructions/skill-evals.instructions.md`.
10
+
11
+ User triggers: "run evals", "eval canary", "eval <skill>", "eval all", "update eval baseline", "check skill regression".
12
+
13
+ ## USE WHEN
14
+
15
+ - About to commit a behavioral change to any `plugin/skills/<name>/SKILL.md` or its companion script.
16
+ - Reviewing a PR that touches skills, prompts, or shared instructions.
17
+ - Tagging a release (full suite is implied by the release script).
18
+ - Investigating "did this skill get worse?" after a refactor.
19
+
20
+ ## DO NOT USE FOR
21
+
22
+ - Validating real customer evidence (use `ask-project` / `project-status`).
23
+ - Live M365 / ADO / CRM probes (use `pull-*` directly).
24
+ - Generating new fixtures from real engagements — that violates the privacy rule.
25
+
26
+ ## Gotchas
27
+
28
+ 1. **Canary ≠ full.** `npm run eval:canary` runs ~6 skills. Use `npm run eval:all` before tagging.
29
+ 2. **Synthetic fixtures only.** Never copy real customer evidence into `evals/fixtures/`. The runner does NOT enforce this — humans do.
30
+ 3. **`pull-*` skills run in `--cached` / `--dry-run`.** Live network calls are explicitly disabled in canary mode; the LLM-rubric subset is skipped unless `-Live` is passed.
31
+ 4. **Baseline drift is OK after intentional changes.** Refresh with `npm run eval:baseline`. NEVER auto-update baseline in CI.
32
+ 5. **`Evidence/_evals/` is gitignored.** If you need to share a run, copy the JSON manually.
33
+ 6. **Tokens are estimates.** When a grader can't measure tokens (e.g. a pwsh-only run), it records `0` and the aggregator excludes those from the mean.
34
+
35
+ ## Step checklist
36
+
37
+ - [ ] Pick mode: `-Skill <name>` (one skill) · `-Canary` (fast subset) · `-All` (full suite).
38
+ - [ ] Confirm `evals/baseline.json` exists; if missing, run with `-UpdateBaseline` first.
39
+ - [ ] Run: `pwsh plugin/skills/eval/run-evals.ps1 -Canary` (or other mode).
40
+ - [ ] Inspect the output JSON under `Evidence/_evals/` and the `benchmark.json` summary.
41
+ - [ ] If regressions flagged: re-run the specific skill with `-Skill <name>` for detail.
42
+ - [ ] If intentional change: bump the skill version, then `npm run eval:baseline`.
43
+
44
+ ## Validation loop
45
+
46
+ After running, verify:
47
+
48
+ - Output JSON exists at the path printed by the runner.
49
+ - Every case has `pass: true|false` AND `duration_ms` AND a per-assertion breakdown.
50
+ - `benchmark.json` has `summary.regressions` array (empty if clean).
51
+ - Exit code: `0` = all green vs baseline; `1` = at least one regression (or, with `-StrictExit`, at least one failed case).
52
+
53
+ If a case errors out before any assertion runs (e.g. fixture missing), it counts as `pass: false` with `error: "<message>"` so totals stay honest.
54
+
55
+ ## What the runner does (`run-evals.ps1`)
56
+
57
+ 1. Discovers `plugin/skills/<name>/evals/evals.json` for the requested skills.
58
+ 2. Validates each file against `plugin/skills/eval/evals.schema.json`.
59
+ 3. For each case:
60
+ - Resolves the fixture path (`fixture` field, relative to repo root).
61
+ - Dispatches per `grader_type`:
62
+ - `script` → invokes the skill's known executable (`run.ps1`, a `*.mjs` probe, or a deterministic shim under `evals/probe.*`) with the case input.
63
+ - `llm` → only when `-Live` is set; otherwise marked `skipped`.
64
+ - Captures `stdout`, `stderr`, `duration_ms`.
65
+ - Runs every assertion in `expected_assertions`; collects per-assertion pass/fail.
66
+ - Case `pass` = all assertions pass AND no error.
67
+ 4. Writes per-run JSON to `Evidence/_evals/<utc-timestamp>.json` (or `-Output` override).
68
+ 5. Calls `node src/eval-aggregator.mjs` to compute `benchmark.json` and compare against `evals/baseline.json`.
69
+ 6. Prints a one-screen summary; exits 0 on clean, 1 on regression.
70
+
71
+ ## Arguments
72
+
73
+ | Flag | Purpose |
74
+ |---|---|
75
+ | `-Skill <name>` | Run only one skill's evals. |
76
+ | `-All` | Run every `plugin/skills/<name>/evals/evals.json`. |
77
+ | `-Canary` | Run only cases marked `"canary": true`. |
78
+ | `-Output <path>` | Override per-run JSON path. Default `Evidence/_evals/<ts>.json`. |
79
+ | `-Baseline` | Compare against `evals/baseline.json` (default ON). |
80
+ | `-UpdateBaseline` | Write current run's metrics into `evals/baseline.json`. |
81
+ | `-Live` | Allow LLM-rubric cases (requires `m_*` tools / sub-agent). |
82
+ | `-StrictExit` | Exit 1 on any case failure (CI mode). |
83
+
84
+ ## References
85
+
86
+ - `plugin/instructions/skill-evals.instructions.md` (doctrine)
87
+ - `plugin/instructions/agentskills-compliance.instructions.md` (size + section caps for SKILL.md)
88
+ - `src/eval-aggregator.mjs` (mean/stddev + regression detection)
89
+ - `evals/baseline.json` (committed baseline)
90
+ - <https://agentskills.io/skill-creation/evaluating-skills>
@@ -0,0 +1,73 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "$id": "https://gim-home.github.io/kushi/schemas/evals.schema.json",
4
+ "title": "Kushi per-skill evals file",
5
+ "description": "Schema for plugin/skills/<name>/evals/evals.json. Validated by self-check D33.evals-schema.",
6
+ "type": "object",
7
+ "required": ["skill", "cases"],
8
+ "additionalProperties": false,
9
+ "properties": {
10
+ "skill": {
11
+ "type": "string",
12
+ "minLength": 1,
13
+ "description": "Skill name (must match the parent directory and SKILL.md frontmatter name)."
14
+ },
15
+ "version": { "type": "string" },
16
+ "description": { "type": "string" },
17
+ "cases": {
18
+ "type": "array",
19
+ "minItems": 1,
20
+ "items": { "$ref": "#/definitions/case" }
21
+ }
22
+ },
23
+ "definitions": {
24
+ "case": {
25
+ "type": "object",
26
+ "required": ["id", "name", "input", "expected_assertions", "grader_type"],
27
+ "additionalProperties": false,
28
+ "properties": {
29
+ "id": { "type": "string", "pattern": "^[a-z0-9][a-z0-9-]*$" },
30
+ "name": { "type": "string", "minLength": 1 },
31
+ "input": {
32
+ "anyOf": [{ "type": "string" }, { "type": "object" }]
33
+ },
34
+ "fixture": { "type": "string" },
35
+ "canary": { "type": "boolean", "default": false },
36
+ "skip": { "type": "boolean", "default": false },
37
+ "skip_reason": { "type": "string" },
38
+ "timeout_ms": { "type": "integer", "minimum": 100 },
39
+ "args": { "type": "object" },
40
+ "grader_type": { "enum": ["script", "llm"] },
41
+ "expected_assertions": {
42
+ "type": "array",
43
+ "minItems": 1,
44
+ "items": { "$ref": "#/definitions/assertion" }
45
+ }
46
+ }
47
+ },
48
+ "assertion": {
49
+ "type": "object",
50
+ "required": ["type"],
51
+ "properties": {
52
+ "type": {
53
+ "enum": [
54
+ "file-exists",
55
+ "file-contains",
56
+ "json-path-equals",
57
+ "regex-match",
58
+ "llm-rubric"
59
+ ]
60
+ },
61
+ "path": { "type": "string" },
62
+ "needle": { "type": "string" },
63
+ "pattern": { "type": "string" },
64
+ "flags": { "type": "string" },
65
+ "json_path": { "type": "string" },
66
+ "equals": {},
67
+ "rubric": { "type": "string" },
68
+ "min_score": { "type": "number" }
69
+ },
70
+ "additionalProperties": false
71
+ }
72
+ }
73
+ }