npm - kushi-agents - Versions diffs - 5.0.2 → 5.0.3 - Mend

kushi-agents 5.0.2 → 5.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

package/README.md +22 -0
package/package.json +6 -2
package/plugin/agents/kushi.agent.md +1 -1
package/plugin/instructions/skill-evals.instructions.md +130 -0
package/plugin/skills/aggregate-project/evals/evals.json +33 -0
package/plugin/skills/apply-ado-update/evals/evals.json +33 -0
package/plugin/skills/ask-project/evals/evals.json +34 -0
package/plugin/skills/bootstrap-project/evals/evals.json +34 -0
package/plugin/skills/build-state/evals/evals.json +31 -0
package/plugin/skills/consolidate-evidence/evals/evals.json +33 -0
package/plugin/skills/dashboard/evals/evals.json +33 -0
package/plugin/skills/emit-vertex/evals/evals.json +33 -0
package/plugin/skills/eval/SKILL.md +90 -0
package/plugin/skills/eval/evals.schema.json +73 -0
package/plugin/skills/eval/run-evals.ps1 +372 -0
package/plugin/skills/fde-intake/evals/evals.json +33 -0
package/plugin/skills/fde-report/evals/evals.json +33 -0
package/plugin/skills/fde-triage/evals/evals.json +33 -0
package/plugin/skills/intro/evals/evals.json +33 -0
package/plugin/skills/link-entities/evals/evals.json +31 -0
package/plugin/skills/project-status/evals/evals.json +33 -0
package/plugin/skills/propose-ado-update/evals/evals.json +33 -0
package/plugin/skills/pull-ado/evals/evals.json +35 -0
package/plugin/skills/pull-crm/evals/evals.json +35 -0
package/plugin/skills/pull-email/evals/evals.json +35 -0
package/plugin/skills/pull-loop/evals/evals.json +35 -0
package/plugin/skills/pull-meetings/evals/evals.json +35 -0
package/plugin/skills/pull-misc/evals/evals.json +35 -0
package/plugin/skills/pull-onenote/evals/evals.json +35 -0
package/plugin/skills/pull-sharepoint/evals/evals.json +35 -0
package/plugin/skills/pull-teams/evals/evals.json +35 -0
package/plugin/skills/refresh-project/evals/evals.json +31 -0
package/plugin/skills/self-check/SKILL.md +1 -0
package/plugin/skills/self-check/evals/evals.json +28 -0
package/plugin/skills/self-check/run.ps1 +63 -0
package/plugin/skills/setup/evals/evals.json +33 -0
package/plugin/skills/tour/evals/evals.json +33 -0
package/plugin/skills/vertex-link/evals/evals.json +33 -0
package/src/eval-aggregator.mjs +209 -0
package/src/eval-aggregator.test.mjs +64 -0
package/src/eval-runner.test.mjs +69 -0

package/README.md CHANGED Viewed

@@ -235,6 +235,28 @@ npm pack --dry-run
 The self-check validates frontmatter, agent inventory, prompt → skill routing, profile manifest, reference packs, cross-links, the verbs table in this README, and the layout diagram in `docs/reference/where-things-live.md`. Full reference: [docs/reference/self-check.md](docs/reference/self-check.md).
+## Evaluating skills (v5.0.3+)
+Every skill ships per-case evals at `plugin/skills/<name>/evals/evals.json`, aligned with the [agentskills.io evaluating-skills spec](https://agentskills.io/skill-creation/evaluating-skills). Doctrine: [`plugin/instructions/skill-evals.instructions.md`](plugin/instructions/skill-evals.instructions.md).
+Quickstart:
+```powershell
+npm run eval:canary        # ~6 skills, runs in seconds — what PRs run
+npm run eval:all           # full suite (every plugin/skills/<name>/)
+npm run eval -- ask-project   # one skill
+npm run eval:baseline      # maintainer-only: refresh evals/baseline.json
+```
+Outputs:
+- `Evidence/_evals/<utc-ts>.json` — per-run JSON (pass/fail + duration + tokens per case).
+- `Evidence/_evals/benchmark.json` — per-skill mean/stddev for `pass_rate`, `duration_ms`, `tokens_total` + regression flags vs `evals/baseline.json`.
+Regressions flagged at ≥10pp pass-rate drop OR ≥50% latency/token increase. The canary subset is `ask-project`, `bootstrap-project`, `refresh-project`, `link-entities`, `build-state`, `self-check`.
+**Privacy:** fixtures under `evals/fixtures/` are synthetic. NEVER copy real customer data into the evals tree.
 ## License
 See [LICENSE](LICENSE).

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "kushi-agents",
-  "version": "5.0.2",
+  "version": "5.0.3",
   "description": "Install Kushi — multi-source project evidence agent with Comprehensive Structured Capture (CSC) into weekly-only files across Email, Teams, OneNote, Loop, SharePoint, Meetings, CRM, ADO. Meetings retain a sibling verbatim/ audit folder. WorkIQ-only for M365 sources (Graph / m365_* FORBIDDEN as fallbacks; user-paste is first-class). Host-agnostic.",
   "type": "module",
   "bin": {
@@ -41,9 +41,13 @@
   },
   "license": "MIT",
   "scripts": {
-    "test": "node --test src/check-workiq.test.mjs src/seed-config.test.mjs src/sanitize-workiq-input.test.mjs src/detect-vertex-repo.test.mjs src/vertex-validate.test.mjs src/emit-vertex.e2e.test.mjs src/config-root-resolve.test.mjs src/forbidden-workiq-phrasings.test.mjs src/multi-host-install.test.mjs",
+    "test": "node --test src/check-workiq.test.mjs src/seed-config.test.mjs src/sanitize-workiq-input.test.mjs src/detect-vertex-repo.test.mjs src/vertex-validate.test.mjs src/emit-vertex.e2e.test.mjs src/config-root-resolve.test.mjs src/forbidden-workiq-phrasings.test.mjs src/multi-host-install.test.mjs src/eval-aggregator.test.mjs src/eval-runner.test.mjs",
     "test:integration:bootstrap": "node src/bootstrap-dryrun.integration.test.mjs",
     "smoke": "node scripts/smoke.mjs",
+    "eval": "pwsh plugin/skills/eval/run-evals.ps1 -Skill",
+    "eval:all": "pwsh plugin/skills/eval/run-evals.ps1 -All",
+    "eval:canary": "pwsh plugin/skills/eval/run-evals.ps1 -Canary",
+    "eval:baseline": "pwsh plugin/skills/eval/run-evals.ps1 -All -UpdateBaseline",
     "prepublishOnly": "npm test && npm run smoke"
   },
   "publishConfig": {

package/plugin/agents/kushi.agent.md CHANGED Viewed

@@ -16,7 +16,7 @@ Kushi ships in three profiles. The installed profile is recorded in `kushi-insta
 | Profile | What's installed | Verbs available |
 |---|---|---|
-| `core` | Aggregator only: `setup`, `pull-*`, `consolidate-evidence`, `aggregate-project`, `ask-project`, `project-status`, `vertex-link`, `emit-vertex`, `self-check`, `intro` | `setup`, `aggregate`, `consolidate`, `status`, `pull`, `ask`, `vertex-link`, `emit-vertex` |
+| `core` | Aggregator only: `setup`, `pull-*`, `consolidate-evidence`, `aggregate-project`, `ask-project`, `project-status`, `vertex-link`, `emit-vertex`, `self-check`, `eval`, `intro` | `setup`, `aggregate`, `consolidate`, `status`, `pull`, `ask`, `vertex-link`, `emit-vertex` |
 | `standard` *(default)* | core + `bootstrap-project`, `refresh-project`, `fde-intake`, `fde-report`, `fde-triage` + FDE reference pack | core + `bootstrap`, `refresh`, `fde-intake`, `fde-report`, `fde-triage` |
 | `full` | standard + `build-state` | standard + `state` |
 | **`preview`** *(opt-in)* | standard + `propose-ado-update`, `apply-ado-update` | standard + `propose-ado`, `apply-ado` |

package/plugin/instructions/skill-evals.instructions.md ADDED Viewed

@@ -0,0 +1,130 @@
+---
+description: "v5.0.3 — Skill evals doctrine, adapted from https://agentskills.io/skill-creation/evaluating-skills. Every skill MUST ship an evals/ folder with at least 2 deterministic cases plus structured assertions; a per-skill pass-rate is the objective regression signal. Canary subset runs on every PR; full suite runs on demand. Real customer data is FORBIDDEN in fixtures — use synthetic data only."
+---
+# Skill evals — doctrine
+> Inspired by **<https://agentskills.io/skill-creation/evaluating-skills>**. Adapted to kushi's PowerShell + Node test stack and to our 2-host install matrix.
+## Why
+Skills are prompts plus a runner. Prompts drift silently. Without an objective per-skill regression signal, every change is a gamble. Evals make that signal cheap:
+- **Per-skill pass-rate** is the headline metric.
+- **Latency** and **tokens** are secondary metrics (a pass-rate drop of ≥10pp, or a latency or token increase of ≥50%, is flagged as a regression against the baseline).
+- A **canary subset** runs on every PR (target: < 60s wall clock); the **full suite** runs on demand (`npm run eval:all`).
+## Where evals live
+```text
+plugin/skills/<name>/
+├── SKILL.md
+└── evals/
+    ├── evals.json        ← REQUIRED — case list + assertions
+    └── fixtures/         ← OPTIONAL per-skill fixtures
+```
+Cross-skill fixtures live at the repo root:
+```text
+evals/
+├── baseline.json         ← Committed; maintainer updates with `npm run eval:baseline`
+└── fixtures/             ← Tiny synthetic evidence trees, ADO fixtures, etc.
+```
+Per-run output goes to `Evidence/_evals/<timestamp>.json` (gitignored; not customer data).
+## Case schema
+```jsonc
+{
+  "skill": "<skill-name>",
+  "cases": [
+    {
+      "id": "ap-citations-format",
+      "name": "ask-project emits weekly-csc citation form",
+      "input": "what was decided about MACC for fixture-acme?",
+      "fixture": "evals/fixtures/fixture-acme",        // optional
+      "canary": true,
+      "grader_type": "script",                          // "script" | "llm"
+      "expected_assertions": [
+        { "type": "regex-match", "pattern": "\\[source:\\s*fixture-acme/email/weekly/" },
+        { "type": "regex-match", "pattern": "Source-layout:\\s*weekly-csc" }
+      ]
+    }
+  ]
+}
+```
+### Required fields per case
+- `id` — unique within the skill; kebab-case.
+- `name` — human-readable.
+- `input` — what gets passed to the skill (string OR object).
+- `expected_assertions` — array, **≥ 1** entry (enforced by `D33.evals-have-assertions`).
+- `grader_type` — `"script"` for deterministic graders, `"llm"` for rubric-based.
+### Optional fields
+- `fixture` — repo-relative path to the fixture to point the skill at.
+- `canary` — `true` to include in the fast CI subset.
+- `args` — extra args forwarded to the skill script (e.g. `{ "DryRun": true }`).
+- `skip` — `true` to skip (must include `skip_reason`).
+- `timeout_ms` — override the runner default (30 000 ms).
+## Assertion types
+| Type | Shape | Passes when |
+|---|---|---|
+| `file-exists` | `{ "type": "file-exists", "path": "..." }` | Path exists post-run (relative to fixture or evidence dir). |
+| `file-contains` | `{ "type": "file-contains", "path": "...", "needle": "..." }` | File exists and substring is present. |
+| `json-path-equals` | `{ "type": "json-path-equals", "path": "...", "json_path": "$.foo.bar", "equals": "v" }` | File at `path` parses as JSON and the value at the dotted `json_path` strictly equals (`===`) the `equals` value. |
+| `regex-match` | `{ "type": "regex-match", "pattern": "...", "flags": "i" }` | Captured stdout matches the regex. |
+| `llm-rubric` | `{ "type": "llm-rubric", "rubric": "...", "min_score": 4 }` | LLM grader scores ≥ min on a 1–5 rubric. |
+## Run modes
+The runner (`plugin/skills/eval/run-evals.ps1`) supports three dispatch modes:
+1. **Direct invocation** (default for `script` graders). Runs the skill's executable artifact (`run.ps1`, `*.mjs`, or a small probe stub) with the given input and fixture. Pure deterministic.
+2. **Sub-agent dispatch** (optional, gated by `-Live`). Forwards the case to a sub-agent. Used only for `llm-rubric` cases. Skipped in canary mode.
+3. **Recorded fixture replay** (for `pull-*` skills). Reads a recorded `--cached` output of a real pull and asserts against that, so no live M365 calls are needed.
+For each case the runner records: `pass`, `duration_ms`, `tokens_in`, `tokens_out`, `stdout`, `stderr`, per-assertion `pass`/`reason`. The aggregate is a per-run JSON file under `Evidence/_evals/` plus a `benchmark.json` summary (per-skill aggregates and regression flags).
+## Canary set
+Marked with `"canary": true`. Kept tiny so PRs stay fast.
+Default canary set (v5.0.3):
+- `ask-project`
+- `bootstrap-project`
+- `refresh-project`
+- `link-entities`
+- `build-state`
+- `self-check`
+## Baseline + regression detection
+- `evals/baseline.json` is **committed**.
+- Each per-skill record carries the last green `pass_rate`, `mean_duration_ms`, and `mean_tokens_total`.
+- `src/eval-aggregator.mjs` flags **regressions**:
+  - `pass_rate` drop ≥ 10 percentage points
+  - `mean_duration_ms` increase ≥ 50 %
+  - `mean_tokens_total` increase ≥ 50 %
+- Maintainers refresh the baseline with `npm run eval:baseline` after deliberate behavior changes.
+## Privacy + safety
+- **No real customer data** in any fixture. Use `fixture-acme`-style synthetic names.
+- `Evidence/_evals/` is in `.gitignore`.
+- `pull-*` evals NEVER hit live M365 endpoints in canary mode. Use recorded `--cached` payloads or `--dry-run`.
+- Tenant IDs / GUIDs in fixtures must be obviously fake (e.g. `00000000-...`).
+## References
+- [agentskills.io — evaluating skills](https://agentskills.io/skill-creation/evaluating-skills) (source of truth)
+- `plugin/skills/eval/SKILL.md` (the runner skill)
+- `plugin/skills/eval/evals.schema.json` (JSON schema; self-check D33.evals-schema)
+- `plugin/instructions/agentskills-compliance.instructions.md` (sibling doctrine — size + section caps)

package/plugin/skills/aggregate-project/evals/evals.json ADDED Viewed

@@ -0,0 +1,33 @@
+{
+  "skill": "aggregate-project",
+  "version": "1.0.0",
+  "description": "Auto-seeded evals for aggregate-project. Replace with real cases as the skill matures.",
+  "cases": [
+    {
+      "id": "aggregate-project-smoke-1",
+      "name": "aggregate-project produces a non-empty response",
+      "input": "synthetic aggregate-project probe — canary smoke",
+      "canary": false,
+      "grader_type": "script",
+      "expected_assertions": [
+        {
+          "type": "regex-match",
+          "pattern": ".+"
+        }
+      ]
+    },
+    {
+      "id": "aggregate-project-smoke-2",
+      "name": "aggregate-project echoes case id",
+      "input": "case-id aggregate-project-smoke-2",
+      "canary": false,
+      "grader_type": "script",
+      "expected_assertions": [
+        {
+          "type": "regex-match",
+          "pattern": "aggregate-project-smoke-2"
+        }
+      ]
+    }
+  ]
+}

package/plugin/skills/apply-ado-update/evals/evals.json ADDED Viewed

@@ -0,0 +1,33 @@
+{
+  "skill": "apply-ado-update",
+  "version": "1.0.0",
+  "description": "Auto-seeded evals for apply-ado-update. Replace with real cases as the skill matures.",
+  "cases": [
+    {
+      "id": "apply-ado-update-smoke-1",
+      "name": "apply-ado-update produces a non-empty response",
+      "input": "synthetic apply-ado-update probe — canary smoke",
+      "canary": false,
+      "grader_type": "script",
+      "expected_assertions": [
+        {
+          "type": "regex-match",
+          "pattern": ".+"
+        }
+      ]
+    },
+    {
+      "id": "apply-ado-update-smoke-2",
+      "name": "apply-ado-update echoes case id",
+      "input": "case-id apply-ado-update-smoke-2",
+      "canary": false,
+      "grader_type": "script",
+      "expected_assertions": [
+        {
+          "type": "regex-match",
+          "pattern": "apply-ado-update-smoke-2"
+        }
+      ]
+    }
+  ]
+}

package/plugin/skills/ask-project/evals/evals.json ADDED Viewed

@@ -0,0 +1,34 @@
+{
+  "skill": "ask-project",
+  "version": "1.0.0",
+  "description": "Verifies citation format (weekly-csc) and Source-layout footer.",
+  "cases": [
+    {
+      "id": "ap-macc-citation",
+      "name": "answers MACC question with weekly-csc citation",
+      "input": "what was decided about MACC for fixture-acme?",
+      "fixture": "evals/fixtures/fixture-acme",
+      "canary": true,
+      "grader_type": "script",
+      "args": { "read_fixture": "../outputs/ask-project.macc.txt" },
+      "expected_assertions": [
+        { "type": "regex-match", "pattern": "MACC" },
+        { "type": "regex-match", "pattern": "\\[source:\\s*fixture-acme/email/weekly/" },
+        { "type": "regex-match", "pattern": "Source-layout:\\s*weekly-csc" }
+      ]
+    },
+    {
+      "id": "ap-em-question",
+      "name": "answers who-is-the-EM with cited person",
+      "input": "who is the lead on fixture-acme?",
+      "fixture": "evals/fixtures/fixture-acme",
+      "canary": false,
+      "grader_type": "script",
+      "args": { "read_fixture": "../outputs/ask-project.lead.txt" },
+      "expected_assertions": [
+        { "type": "regex-match", "pattern": "alice@fixture\\.local" },
+        { "type": "regex-match", "pattern": "Source-layout:\\s*weekly-csc" }
+      ]
+    }
+  ]
+}

package/plugin/skills/bootstrap-project/evals/evals.json ADDED Viewed

@@ -0,0 +1,34 @@
+{
+  "skill": "bootstrap-project",
+  "version": "1.0.0",
+  "description": "Asserts the canonical engagement folder layout from a dry-run.",
+  "cases": [
+    {
+      "id": "bp-dryrun-layout",
+      "name": "dry-run prints the canonical layout",
+      "input": "bootstrap fixture-acme --dry-run",
+      "fixture": "evals/fixtures/fixture-acme",
+      "canary": true,
+      "grader_type": "script",
+      "args": { "read_fixture": "../outputs/bootstrap-project.dryrun.txt" },
+      "expected_assertions": [
+        { "type": "regex-match", "pattern": "DRY-RUN" },
+        { "type": "regex-match", "pattern": "fixture-acme/Evidence/fixture-alias/email/weekly/" },
+        { "type": "regex-match", "pattern": "fixture-acme/State/" },
+        { "type": "regex-match", "pattern": "fixture-acme/Evidence/_graph/" }
+      ]
+    },
+    {
+      "id": "bp-fixture-tree-present",
+      "name": "fixture engagement tree exists on disk",
+      "input": "verify fixture tree",
+      "fixture": "evals/fixtures/fixture-acme",
+      "canary": false,
+      "grader_type": "script",
+      "expected_assertions": [
+        { "type": "file-exists", "path": "Evidence/fixture-alias/email/weekly/2026-05-18_email-csc.md" },
+        { "type": "file-exists", "path": "State/index.md" }
+      ]
+    }
+  ]
+}

package/plugin/skills/build-state/evals/evals.json ADDED Viewed

@@ -0,0 +1,31 @@
+{
+  "skill": "build-state",
+  "version": "1.0.0",
+  "description": "Karpathy State layout — index.md + log.md + per-category folders.",
+  "cases": [
+    {
+      "id": "bs-state-index",
+      "name": "fixture State/index.md has kushi_state_page front-matter",
+      "input": "validate state fixture",
+      "fixture": "evals/fixtures/fixture-acme",
+      "canary": true,
+      "grader_type": "script",
+      "args": { "read_fixture": "State/index.md" },
+      "expected_assertions": [
+        { "type": "regex-match", "pattern": "kushi_state_page:\\s*true" }
+      ]
+    },
+    {
+      "id": "bs-state-log-exists",
+      "name": "fixture State/log.md exists",
+      "input": "verify log",
+      "fixture": "evals/fixtures/fixture-acme",
+      "canary": false,
+      "grader_type": "script",
+      "expected_assertions": [
+        { "type": "file-exists", "path": "State/log.md" },
+        { "type": "file-exists", "path": "State/index.md" }
+      ]
+    }
+  ]
+}

package/plugin/skills/consolidate-evidence/evals/evals.json ADDED Viewed

@@ -0,0 +1,33 @@
+{
+  "skill": "consolidate-evidence",
+  "version": "1.0.0",
+  "description": "Auto-seeded evals for consolidate-evidence. Replace with real cases as the skill matures.",
+  "cases": [
+    {
+      "id": "consolidate-evidence-smoke-1",
+      "name": "consolidate-evidence produces a non-empty response",
+      "input": "synthetic consolidate-evidence probe — canary smoke",
+      "canary": false,
+      "grader_type": "script",
+      "expected_assertions": [
+        {
+          "type": "regex-match",
+          "pattern": ".+"
+        }
+      ]
+    },
+    {
+      "id": "consolidate-evidence-smoke-2",
+      "name": "consolidate-evidence echoes case id",
+      "input": "case-id consolidate-evidence-smoke-2",
+      "canary": false,
+      "grader_type": "script",
+      "expected_assertions": [
+        {
+          "type": "regex-match",
+          "pattern": "consolidate-evidence-smoke-2"
+        }
+      ]
+    }
+  ]
+}

package/plugin/skills/dashboard/evals/evals.json ADDED Viewed

@@ -0,0 +1,33 @@
+{
+  "skill": "dashboard",
+  "version": "1.0.0",
+  "description": "Auto-seeded evals for dashboard. Replace with real cases as the skill matures.",
+  "cases": [
+    {
+      "id": "dashboard-smoke-1",
+      "name": "dashboard produces a non-empty response",
+      "input": "synthetic dashboard probe — canary smoke",
+      "canary": false,
+      "grader_type": "script",
+      "expected_assertions": [
+        {
+          "type": "regex-match",
+          "pattern": ".+"
+        }
+      ]
+    },
+    {
+      "id": "dashboard-smoke-2",
+      "name": "dashboard echoes case id",
+      "input": "case-id dashboard-smoke-2",
+      "canary": false,
+      "grader_type": "script",
+      "expected_assertions": [
+        {
+          "type": "regex-match",
+          "pattern": "dashboard-smoke-2"
+        }
+      ]
+    }
+  ]
+}

package/plugin/skills/emit-vertex/evals/evals.json ADDED Viewed

@@ -0,0 +1,33 @@
+{
+  "skill": "emit-vertex",
+  "version": "1.0.0",
+  "description": "Auto-seeded evals for emit-vertex. Replace with real cases as the skill matures.",
+  "cases": [
+    {
+      "id": "emit-vertex-smoke-1",
+      "name": "emit-vertex produces a non-empty response",
+      "input": "synthetic emit-vertex probe — canary smoke",
+      "canary": false,
+      "grader_type": "script",
+      "expected_assertions": [
+        {
+          "type": "regex-match",
+          "pattern": ".+"
+        }
+      ]
+    },
+    {
+      "id": "emit-vertex-smoke-2",
+      "name": "emit-vertex echoes case id",
+      "input": "case-id emit-vertex-smoke-2",
+      "canary": false,
+      "grader_type": "script",
+      "expected_assertions": [
+        {
+          "type": "regex-match",
+          "pattern": "emit-vertex-smoke-2"
+        }
+      ]
+    }
+  ]
+}

package/plugin/skills/eval/SKILL.md ADDED Viewed

@@ -0,0 +1,90 @@
+---
+name: "eval"
+version: "1.0.0"
+description: "USE WHEN the user says \"run evals\", \"eval canary\", \"eval ask-project\", \"check skill regression\", \"update eval baseline\", or before tagging a release. DO NOT USE for evidence validation of a real project (use ask-project / project-status). Capability: runs per-skill evals (deterministic script graders + optional LLM-rubric graders) from each skill's evals/evals.json, aggregates pass-rate / latency / token metrics, and compares against evals/baseline.json to flag regressions. Synthetic fixtures only — never live customer data."
+---
+# Skill: eval
+The objective regression signal for every other kushi skill. Spec: <https://agentskills.io/skill-creation/evaluating-skills>. Doctrine: `instructions/skill-evals.instructions.md`.
+User triggers: "run evals", "eval canary", "eval <skill>", "eval all", "update eval baseline", "check skill regression".
+## USE WHEN
+- About to commit a behavioral change to any `plugin/skills/<name>/SKILL.md` or its companion script.
+- Reviewing a PR that touches skills, prompts, or shared instructions.
+- Tagging a release (full suite is implied by the release script).
+- Investigating "did this skill get worse?" after a refactor.
+## DO NOT USE FOR
+- Validating real customer evidence (use `ask-project` / `project-status`).
+- Live M365 / ADO / CRM probes (use `pull-*` directly).
+- Generating new fixtures from real engagements — that violates the privacy rule.
+## Gotchas
+1. **Canary ≠ full.** `npm run eval:canary` runs ~6 skills. Use `npm run eval:all` before tagging.
+2. **Synthetic fixtures only.** Never copy real customer evidence into `evals/fixtures/`. The runner does NOT enforce this — humans do.
+3. **`pull-*` skills run in `--cached` / `--dry-run`.** Live network calls are explicitly disabled in canary mode; the LLM-rubric subset is skipped unless `-Live` is passed.
+4. **Baseline drift is OK after intentional changes.** Refresh with `npm run eval:baseline`. NEVER auto-update baseline in CI.
+5. **`Evidence/_evals/` is gitignored.** If you need to share a run, copy the JSON manually.
+6. **Tokens are estimates.** When a grader can't measure tokens (e.g. a pwsh-only run), it records `0`, and the aggregator excludes those zero-token records from the token mean.
+## Step checklist
+- [ ] Pick mode: `-Skill <name>` (one skill) · `-Canary` (fast subset) · `-All` (full suite).
+- [ ] Confirm `evals/baseline.json` exists; if missing, run with `-UpdateBaseline` first.
+- [ ] Run: `pwsh plugin/skills/eval/run-evals.ps1 -Canary` (or other mode).
+- [ ] Inspect the output JSON under `Evidence/_evals/` and the `benchmark.json` summary.
+- [ ] If regressions flagged: re-run the specific skill with `-Skill <name>` for detail.
+- [ ] If intentional change: bump the skill version, then `npm run eval:baseline`.
+## Validation loop
+After running, verify:
+- Output JSON exists at the path printed by the runner.
+- Every case has `pass: true|false` AND `duration_ms` AND a per-assertion breakdown.
+- `benchmark.json` has `summary.regressions` array (empty if clean).
+- Exit code: `0` = all green vs baseline; `1` = at least one regression (or, with `-StrictExit`, any case failure).
+If a case errors out before any assertion runs (e.g. fixture missing), it counts as `pass: false` with `error: "<message>"` so totals stay honest.
+## What the runner does (`run-evals.ps1`)
+1. Discovers `plugin/skills/<name>/evals/evals.json` for the requested skills.
+2. Validates each file against `plugin/skills/eval/evals.schema.json`.
+3. For each case:
+   - Resolves the fixture path (`fixture` field, relative to repo root).
+   - Dispatches per `grader_type`:
+     - `script` → invokes the skill's known executable (`run.ps1`, a `*.mjs` probe, or a deterministic shim under `evals/probe.*`) with the case input.
+     - `llm` → only when `-Live` is set; otherwise marked `skipped`.
+   - Captures `stdout`, `stderr`, `duration_ms`.
+   - Runs every assertion in `expected_assertions`; collects per-assertion pass/fail.
+   - Case `pass` = all assertions pass AND no error.
+4. Writes per-run JSON to `Evidence/_evals/<utc-timestamp>.json` (or `-Output` override).
+5. Calls `node src/eval-aggregator.mjs` to compute `benchmark.json` and compare against `evals/baseline.json`.
+6. Prints a one-screen summary; exits 0 on clean, 1 on regression.
+## Arguments
+| Flag | Purpose |
+|---|---|
+| `-Skill <name>` | Run only one skill's evals. |
+| `-All` | Run every `plugin/skills/<name>/evals/evals.json`. |
+| `-Canary` | Run only cases marked `"canary": true`. |
+| `-Output <path>` | Override per-run JSON path. Default `Evidence/_evals/<ts>.json`. |
+| `-Baseline` | Compare against `evals/baseline.json` (default ON). |
+| `-UpdateBaseline` | Write current run's metrics into `evals/baseline.json`. |
+| `-Live` | Allow LLM-rubric cases (requires `m_*` tools / sub-agent). |
+| `-StrictExit` | Exit 1 on any case failure (CI mode). |
+## References
+- `plugin/instructions/skill-evals.instructions.md` (doctrine)
+- `plugin/instructions/agentskills-compliance.instructions.md` (size + section caps for SKILL.md)
+- `src/eval-aggregator.mjs` (mean/stddev + regression detection)
+- `evals/baseline.json` (committed baseline)
+- <https://agentskills.io/skill-creation/evaluating-skills>

package/plugin/skills/eval/evals.schema.json ADDED Viewed

@@ -0,0 +1,73 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://gim-home.github.io/kushi/schemas/evals.schema.json",
+  "title": "Kushi per-skill evals file",
+  "description": "Schema for plugin/skills/<name>/evals/evals.json. Validated by self-check D33.evals-schema.",
+  "type": "object",
+  "required": ["skill", "cases"],
+  "additionalProperties": false,
+  "properties": {
+    "skill": {
+      "type": "string",
+      "minLength": 1,
+      "description": "Skill name (must match the parent directory and SKILL.md frontmatter name)."
+    },
+    "version": { "type": "string" },
+    "description": { "type": "string" },
+    "cases": {
+      "type": "array",
+      "minItems": 1,
+      "items": { "$ref": "#/definitions/case" }
+    }
+  },
+  "definitions": {
+    "case": {
+      "type": "object",
+      "required": ["id", "name", "input", "expected_assertions", "grader_type"],
+      "additionalProperties": false,
+      "properties": {
+        "id": { "type": "string", "pattern": "^[a-z0-9][a-z0-9-]*$" },
+        "name": { "type": "string", "minLength": 1 },
+        "input": {
+          "anyOf": [{ "type": "string" }, { "type": "object" }]
+        },
+        "fixture": { "type": "string" },
+        "canary": { "type": "boolean", "default": false },
+        "skip": { "type": "boolean", "default": false },
+        "skip_reason": { "type": "string" },
+        "timeout_ms": { "type": "integer", "minimum": 100 },
+        "args": { "type": "object" },
+        "grader_type": { "enum": ["script", "llm"] },
+        "expected_assertions": {
+          "type": "array",
+          "minItems": 1,
+          "items": { "$ref": "#/definitions/assertion" }
+        }
+      }
+    },
+    "assertion": {
+      "type": "object",
+      "required": ["type"],
+      "properties": {
+        "type": {
+          "enum": [
+            "file-exists",
+            "file-contains",
+            "json-path-equals",
+            "regex-match",
+            "llm-rubric"
+          ]
+        },
+        "path": { "type": "string" },
+        "needle": { "type": "string" },
+        "pattern": { "type": "string" },
+        "flags": { "type": "string" },
+        "json_path": { "type": "string" },
+        "equals": {},
+        "rubric": { "type": "string" },
+        "min_score": { "type": "number" }
+      },
+      "additionalProperties": false
+    }
+  }
+}