kushi-agents 5.0.2 → 5.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/README.md +35 -0
  2. package/bin/cli.mjs +103 -0
  3. package/package.json +6 -2
  4. package/plugin/agents/kushi.agent.md +3 -1
  5. package/plugin/instructions/skill-authoring.instructions.md +147 -0
  6. package/plugin/instructions/skill-evals.instructions.md +130 -0
  7. package/plugin/skills/aggregate-project/evals/evals.json +33 -0
  8. package/plugin/skills/apply-ado-update/evals/evals.json +33 -0
  9. package/plugin/skills/ask-project/SKILL.md +10 -0
  10. package/plugin/skills/ask-project/evals/evals.json +34 -0
  11. package/plugin/skills/bootstrap-project/evals/evals.json +34 -0
  12. package/plugin/skills/build-state/evals/evals.json +31 -0
  13. package/plugin/skills/consolidate-evidence/evals/evals.json +33 -0
  14. package/plugin/skills/dashboard/evals/evals.json +33 -0
  15. package/plugin/skills/emit-vertex/evals/evals.json +33 -0
  16. package/plugin/skills/eval/SKILL.md +90 -0
  17. package/plugin/skills/eval/evals.schema.json +73 -0
  18. package/plugin/skills/eval/run-evals.ps1 +372 -0
  19. package/plugin/skills/fde-intake/evals/evals.json +33 -0
  20. package/plugin/skills/fde-report/evals/evals.json +33 -0
  21. package/plugin/skills/fde-triage/evals/evals.json +33 -0
  22. package/plugin/skills/intro/SKILL.md +160 -451
  23. package/plugin/skills/intro/evals/evals.json +33 -0
  24. package/plugin/skills/intro/references/walkthrough.md +310 -0
  25. package/plugin/skills/link-entities/evals/evals.json +31 -0
  26. package/plugin/skills/project-status/SKILL.md +10 -1
  27. package/plugin/skills/project-status/evals/evals.json +33 -0
  28. package/plugin/skills/propose-ado-update/evals/evals.json +33 -0
  29. package/plugin/skills/pull-ado/evals/evals.json +35 -0
  30. package/plugin/skills/pull-crm/evals/evals.json +35 -0
  31. package/plugin/skills/pull-email/evals/evals.json +35 -0
  32. package/plugin/skills/pull-loop/evals/evals.json +35 -0
  33. package/plugin/skills/pull-meetings/evals/evals.json +35 -0
  34. package/plugin/skills/pull-misc/evals/evals.json +35 -0
  35. package/plugin/skills/pull-onenote/evals/evals.json +35 -0
  36. package/plugin/skills/pull-sharepoint/evals/evals.json +35 -0
  37. package/plugin/skills/pull-teams/evals/evals.json +35 -0
  38. package/plugin/skills/refresh-project/evals/evals.json +31 -0
  39. package/plugin/skills/self-check/SKILL.md +2 -0
  40. package/plugin/skills/self-check/evals/evals.json +28 -0
  41. package/plugin/skills/self-check/run.ps1 +144 -0
  42. package/plugin/skills/setup/SKILL.md +10 -0
  43. package/plugin/skills/setup/evals/evals.json +33 -0
  44. package/plugin/skills/skill-checker/SKILL.md +136 -0
  45. package/plugin/skills/skill-checker/check-skill.ps1 +416 -0
  46. package/plugin/skills/skill-checker/evals/evals.json +41 -0
  47. package/plugin/skills/skill-creator/SKILL.md +134 -0
  48. package/plugin/skills/skill-creator/evals/evals.json +40 -0
  49. package/plugin/skills/skill-creator/generate-eval-review.ps1 +101 -0
  50. package/plugin/skills/skill-creator/optimize-description.ps1 +87 -0
  51. package/plugin/skills/skill-creator/scaffold.ps1 +180 -0
  52. package/plugin/skills/skill-creator/templates/evals-starter.template.json +27 -0
  53. package/plugin/skills/skill-creator/templates/gotchas-stub.template.md +9 -0
  54. package/plugin/skills/skill-creator/templates/skill-skeleton.template.md +28 -0
  55. package/plugin/skills/tour/evals/evals.json +33 -0
  56. package/plugin/skills/vertex-link/SKILL.md +10 -0
  57. package/plugin/skills/vertex-link/evals/evals.json +33 -0
  58. package/src/eval-aggregator.mjs +209 -0
  59. package/src/eval-aggregator.test.mjs +64 -0
  60. package/src/eval-runner.test.mjs +69 -0
  61. package/src/skill-checker.test.mjs +118 -0
  62. package/src/skill-creator.test.mjs +92 -0
@@ -0,0 +1,31 @@
1
+ {
2
+ "skill": "build-state",
3
+ "version": "1.0.0",
4
+ "description": "Karpathy State layout — index.md + log.md + per-category folders.",
5
+ "cases": [
6
+ {
7
+ "id": "bs-state-index",
8
+ "name": "fixture State/index.md has kushi_state_page front-matter",
9
+ "input": "validate state fixture",
10
+ "fixture": "evals/fixtures/fixture-acme",
11
+ "canary": true,
12
+ "grader_type": "script",
13
+ "args": { "read_fixture": "State/index.md" },
14
+ "expected_assertions": [
15
+ { "type": "regex-match", "pattern": "kushi_state_page:\\s*true" }
16
+ ]
17
+ },
18
+ {
19
+ "id": "bs-state-log-exists",
20
+ "name": "fixture State/log.md exists",
21
+ "input": "verify log",
22
+ "fixture": "evals/fixtures/fixture-acme",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ { "type": "file-exists", "path": "State/log.md" },
27
+ { "type": "file-exists", "path": "State/index.md" }
28
+ ]
29
+ }
30
+ ]
31
+ }
@@ -0,0 +1,33 @@
1
+ {
2
+ "skill": "consolidate-evidence",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for consolidate-evidence. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "consolidate-evidence-smoke-1",
8
+ "name": "consolidate-evidence produces a non-empty response",
9
+ "input": "synthetic consolidate-evidence probe — canary smoke",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": ".+"
16
+ }
17
+ ]
18
+ },
19
+ {
20
+ "id": "consolidate-evidence-smoke-2",
21
+ "name": "consolidate-evidence echoes case id",
22
+ "input": "case-id consolidate-evidence-smoke-2",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ {
27
+ "type": "regex-match",
28
+ "pattern": "consolidate-evidence-smoke-2"
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
@@ -0,0 +1,33 @@
1
+ {
2
+ "skill": "dashboard",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for dashboard. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "dashboard-smoke-1",
8
+ "name": "dashboard produces a non-empty response",
9
+ "input": "synthetic dashboard probe — canary smoke",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": ".+"
16
+ }
17
+ ]
18
+ },
19
+ {
20
+ "id": "dashboard-smoke-2",
21
+ "name": "dashboard echoes case id",
22
+ "input": "case-id dashboard-smoke-2",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ {
27
+ "type": "regex-match",
28
+ "pattern": "dashboard-smoke-2"
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
@@ -0,0 +1,33 @@
1
+ {
2
+ "skill": "emit-vertex",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for emit-vertex. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "emit-vertex-smoke-1",
8
+ "name": "emit-vertex produces a non-empty response",
9
+ "input": "synthetic emit-vertex probe — canary smoke",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": ".+"
16
+ }
17
+ ]
18
+ },
19
+ {
20
+ "id": "emit-vertex-smoke-2",
21
+ "name": "emit-vertex echoes case id",
22
+ "input": "case-id emit-vertex-smoke-2",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ {
27
+ "type": "regex-match",
28
+ "pattern": "emit-vertex-smoke-2"
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
@@ -0,0 +1,90 @@
1
+ ---
2
+ name: "eval"
3
+ version: "1.0.0"
4
+ description: "USE WHEN the user says \"run evals\", \"eval canary\", \"eval ask-project\", \"check skill regression\", \"update eval baseline\", or before tagging a release. DO NOT USE for evidence validation of a real project (use ask-project / project-status). Capability: runs per-skill evals (deterministic script graders + optional LLM-rubric graders) from each skill's evals/evals.json, aggregates pass-rate / latency / token metrics, and compares against evals/baseline.json to flag regressions. Synthetic fixtures only — never live customer data."
5
+ ---
6
+
7
+ # Skill: eval
8
+
9
+ The objective regression signal for every other kushi skill. Spec: <https://agentskills.io/skill-creation/evaluating-skills>. Doctrine: `instructions/skill-evals.instructions.md`.
10
+
11
+ User triggers: "run evals", "eval canary", "eval <skill>", "eval all", "update eval baseline", "check skill regression".
12
+
13
+ ## USE WHEN
14
+
15
+ - About to commit a behavioral change to any `plugin/skills/<name>/SKILL.md` or its companion script.
16
+ - Reviewing a PR that touches skills, prompts, or shared instructions.
17
+ - Tagging a release (full suite is implied by the release script).
18
+ - Investigating "did this skill get worse?" after a refactor.
19
+
20
+ ## DO NOT USE FOR
21
+
22
+ - Validating real customer evidence (use `ask-project` / `project-status`).
23
+ - Live M365 / ADO / CRM probes (use `pull-*` directly).
24
+ - Generating new fixtures from real engagements — that violates the privacy rule.
25
+
26
+ ## Gotchas
27
+
28
+ 1. **Canary ≠ full.** `npm run eval:canary` runs ~6 skills. Use `npm run eval:all` before tagging.
29
+ 2. **Synthetic fixtures only.** Never copy real customer evidence into `evals/fixtures/`. The runner does NOT enforce this — humans do.
30
+ 3. **`pull-*` skills run in `--cached` / `--dry-run`.** Live network calls are explicitly disabled in canary mode; the LLM-rubric subset is skipped unless `-Live` is passed.
31
+ 4. **Baseline drift is OK after intentional changes.** Refresh with `npm run eval:baseline`. NEVER auto-update baseline in CI.
32
+ 5. **`Evidence/_evals/` is gitignored.** If you need to share a run, copy the JSON manually.
33
+ 6. **Tokens are estimates.** When a grader can't measure tokens (e.g. a pwsh-only run), it records `0` and the aggregator excludes those from the mean.
34
+
35
+ ## Step checklist
36
+
37
+ - [ ] Pick mode: `-Skill <name>` (one skill) · `-Canary` (fast subset) · `-All` (full suite).
38
+ - [ ] Confirm `evals/baseline.json` exists; if missing, run with `-UpdateBaseline` first.
39
+ - [ ] Run: `pwsh plugin/skills/eval/run-evals.ps1 -Canary` (or other mode).
40
+ - [ ] Inspect the output JSON under `Evidence/_evals/` and the `benchmark.json` summary.
41
+ - [ ] If regressions flagged: re-run the specific skill with `-Skill <name>` for detail.
42
+ - [ ] If intentional change: bump the skill version, then `npm run eval:baseline`.
43
+
44
+ ## Validation loop
45
+
46
+ After running, verify:
47
+
48
+ - Output JSON exists at the path printed by the runner.
49
+ - Every case has `pass: true|false` AND `duration_ms` AND a per-assertion breakdown.
50
+ - `benchmark.json` has `summary.regressions` array (empty if clean).
51
+ - Exit code: `0` = all green vs baseline; `1` = at least one regression (or, with `-StrictExit`, any failed case); `2` = invalid arguments or schema problems.
52
+
53
+ If a case errors out before any assertion runs (e.g. fixture missing), it counts as `pass: false` with `error: "<message>"` so totals stay honest.
54
+
55
+ ## What the runner does (`run-evals.ps1`)
56
+
57
+ 1. Discovers `plugin/skills/<name>/evals/evals.json` for the requested skills.
58
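+ 2. Checks each file's shape in-script (required fields, valid `grader_type`, at least one assertion); it does not run a full JSON Schema validation against `plugin/skills/eval/evals.schema.json`.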
59
+ 3. For each case:
60
+ - Resolves the fixture path (`fixture` field, relative to repo root).
61
+ - Dispatches per `grader_type`:
62
+ - `script` → runs the skill's `evals/probe.ps1` if present, else `evals/probe.mjs`, else the built-in fixture-echo probe (emits the file named by `args.read_fixture`, or echoes the case input).
63
+ - `llm` → only when `-Live` is set; otherwise marked `skipped`.
64
+ - Captures `stdout`, `stderr`, `duration_ms`.
65
+ - Runs every assertion in `expected_assertions`; collects per-assertion pass/fail.
66
+ - Case `pass` = all assertions pass AND no error.
67
+ 4. Writes per-run JSON to `Evidence/_evals/<utc-timestamp>.json` (or `-Output` override).
68
+ 5. Calls `node src/eval-aggregator.mjs` to compute `benchmark.json` and compare against `evals/baseline.json`.
69
+ 6. Prints a one-screen summary; exits 0 on clean, 1 on regression.
70
+
71
+ ## Arguments
72
+
73
+ | Flag | Purpose |
74
+ |---|---|
75
+ | `-Skill <name>` | Run only one skill's evals. |
76
+ | `-All` | Run every `plugin/skills/<name>/evals/evals.json`. |
77
+ | `-Canary` | Run only cases marked `"canary": true`. |
78
+ | `-Output <path>` | Override per-run JSON path. Default `Evidence/_evals/<ts>.json`. |
79
+ | `-Baseline` | Compare against `evals/baseline.json` (default ON). |
80
+ | `-UpdateBaseline` | Write current run's metrics into `evals/baseline.json`. |
81
+ | `-Live` | Allow LLM-rubric cases (requires `m_*` tools / sub-agent). |
82
+ | `-StrictExit` | Exit 1 on any case failure (CI mode). |
83
+
84
+ ## References
85
+
86
+ - `plugin/instructions/skill-evals.instructions.md` (doctrine)
87
+ - `plugin/instructions/agentskills-compliance.instructions.md` (size + section caps for SKILL.md)
88
+ - `src/eval-aggregator.mjs` (mean/stddev + regression detection)
89
+ - `evals/baseline.json` (committed baseline)
90
+ - <https://agentskills.io/skill-creation/evaluating-skills>
@@ -0,0 +1,73 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "$id": "https://gim-home.github.io/kushi/schemas/evals.schema.json",
4
+ "title": "Kushi per-skill evals file",
5
+ "description": "Schema for plugin/skills/<name>/evals/evals.json. Validated by self-check D33.evals-schema.",
6
+ "type": "object",
7
+ "required": ["skill", "cases"],
8
+ "additionalProperties": false,
9
+ "properties": {
10
+ "skill": {
11
+ "type": "string",
12
+ "minLength": 1,
13
+ "description": "Skill name (must match the parent directory and SKILL.md frontmatter name)."
14
+ },
15
+ "version": { "type": "string" },
16
+ "description": { "type": "string" },
17
+ "cases": {
18
+ "type": "array",
19
+ "minItems": 1,
20
+ "items": { "$ref": "#/definitions/case" }
21
+ }
22
+ },
23
+ "definitions": {
24
+ "case": {
25
+ "type": "object",
26
+ "required": ["id", "name", "input", "expected_assertions", "grader_type"],
27
+ "additionalProperties": false,
28
+ "properties": {
29
+ "id": { "type": "string", "pattern": "^[a-z0-9][a-z0-9-]*$" },
30
+ "name": { "type": "string", "minLength": 1 },
31
+ "input": {
32
+ "anyOf": [{ "type": "string" }, { "type": "object" }]
33
+ },
34
+ "fixture": { "type": "string" },
35
+ "canary": { "type": "boolean", "default": false },
36
+ "skip": { "type": "boolean", "default": false },
37
+ "skip_reason": { "type": "string" },
38
+ "timeout_ms": { "type": "integer", "minimum": 100 },
39
+ "args": { "type": "object" },
40
+ "grader_type": { "enum": ["script", "llm"] },
41
+ "expected_assertions": {
42
+ "type": "array",
43
+ "minItems": 1,
44
+ "items": { "$ref": "#/definitions/assertion" }
45
+ }
46
+ }
47
+ },
48
+ "assertion": {
49
+ "type": "object",
50
+ "required": ["type"],
51
+ "properties": {
52
+ "type": {
53
+ "enum": [
54
+ "file-exists",
55
+ "file-contains",
56
+ "json-path-equals",
57
+ "regex-match",
58
+ "llm-rubric"
59
+ ]
60
+ },
61
+ "path": { "type": "string" },
62
+ "needle": { "type": "string" },
63
+ "pattern": { "type": "string" },
64
+ "flags": { "type": "string" },
65
+ "json_path": { "type": "string" },
66
+ "equals": {},
67
+ "rubric": { "type": "string" },
68
+ "min_score": { "type": "number" }
69
+ },
70
+ "additionalProperties": false
71
+ }
72
+ }
73
+ }
@@ -0,0 +1,372 @@
1
+ <#
2
+ .SYNOPSIS
3
+ Kushi skill evals runner (v5.0.3+).
4
+
5
+ .DESCRIPTION
6
+ Discovers per-skill evals at `plugin/skills/<name>/evals/evals.json`,
7
+ validates against `plugin/skills/eval/evals.schema.json`, dispatches each case,
8
+ runs the assertions, and writes a per-run JSON to `Evidence/_evals/<ts>.json`.
9
+
10
+ Per the doctrine in `plugin/instructions/skill-evals.instructions.md`, the
11
+ runner NEVER makes live M365 / ADO / CRM calls. Skills that genuinely require
12
+ live calls (`pull-*`) ship cached fixtures.
13
+
14
+ Dispatch:
15
+ - `grader_type: "script"` cases are dispatched to:
16
+ 1. `plugin/skills/<skill>/evals/probe.ps1` (preferred, per-skill probe)
17
+ 2. `plugin/skills/<skill>/evals/probe.mjs` (Node probe)
18
+ 3. The built-in "fixture-echo" probe, which reads `case.args.read_fixture`
19
+ and emits its content. Lets simple read-only assertions work with no
20
+ per-skill code at all.
21
+ - `grader_type: "llm"` cases are SKIPPED unless `-Live` is set; under `-Live`
22
+ they require an `m_*` sub-agent dispatcher (not implemented in OSS — flagged
23
+ as skipped with reason).
24
+
25
+ .PARAMETER Skill
26
+ Single skill to run. Conflicts with -All / -Canary.
27
+
28
+ .PARAMETER All
29
+ Run every plugin/skills/<name>/evals/evals.json (except `eval/`; `self-check/` is included).
30
+
31
+ .PARAMETER Canary
32
+ Run only cases marked `"canary": true`.
33
+
34
+ .PARAMETER Output
35
+ Override the per-run JSON output path.
36
+
37
+ .PARAMETER Baseline
38
+ Compare against `evals/baseline.json` (default ON).
39
+
40
+ .PARAMETER UpdateBaseline
41
+ Write the current run's metrics into `evals/baseline.json` instead of comparing.
42
+
43
+ .PARAMETER Live
44
+ Allow LLM-rubric cases to attempt to dispatch (not implemented in OSS; will skip).
45
+
46
+ .PARAMETER StrictExit
47
+ Exit code 1 if any case fails (default: exit 1 only on regression vs baseline).
48
+
49
+ .PARAMETER Root
50
+ Repo root (default: three levels above this script's directory).
51
+ #>
52
+ [CmdletBinding()]
53
+ param(
54
+ [string]$Skill,
55
+ [switch]$All,
56
+ [switch]$Canary,
57
+ [string]$Output,
58
+ [switch]$Baseline = $true, # on by default; a caller turns it off with -Baseline:$false
59
+ [switch]$UpdateBaseline,
60
+ [switch]$Live,
61
+ [switch]$StrictExit,
62
+ [string]$Root = (Resolve-Path (Join-Path $PSScriptRoot "..\..\..")).Path # three levels above this script's folder
63
+ )
64
+ # Treat errors as terminating so a failure stops the run instead of being skipped over.
65
+ $ErrorActionPreference = 'Stop'
66
+ # Console log helpers: each writes "kushi-eval: <message>" via Write-Host; Write-Warn in yellow, Write-Err in red.
67
+ function Write-Info($m) { Write-Host "kushi-eval: $m" }
68
+ function Write-Warn($m) { Write-Host "kushi-eval: $m" -ForegroundColor Yellow }
69
+ function Write-Err($m) { Write-Host "kushi-eval: $m" -ForegroundColor Red }
70
+
71
+ # ---------- Discover skills ----------
72
+ # Get-EvalFiles: with -Skill, yields that skill's evals.json (throws when the file is absent); otherwise yields every existing plugin/skills/<name>/evals/evals.json except the 'eval' skill's.
73
+ function Get-EvalFiles {
74
+ param([string]$Root, [string]$Skill)
75
+ $skillsDir = Join-Path $Root 'plugin/skills'
76
+ if ($Skill) {
77
+ $f = Join-Path $skillsDir "$Skill/evals/evals.json"
78
+ if (-not (Test-Path $f)) { throw "No evals.json for skill '$Skill' at $f" }
79
+ return ,$f
80
+ }
81
+ # Discover every skill that ships an evals/evals.json. The eval skill itself is
82
+ # excluded (it has no SKILL evals — it IS the runner). self-check IS included
83
+ # because it ships its own meta-evals.
84
+ return Get-ChildItem -Path $skillsDir -Directory |
85
+ Where-Object { $_.Name -ne 'eval' } |
86
+ ForEach-Object { Join-Path $_.FullName 'evals/evals.json' } |
87
+ Where-Object { Test-Path $_ } # skills without an evals file are silently dropped
88
+ }
89
+
90
+ # ---------- Lightweight JSON-Schema-ish validator (just the cases we care about) ----------
91
+ # Test-EvalsShape: structural check of one parsed evals.json ($Obj); $Path only prefixes the messages. Emits one string per problem and nothing when the shape is valid.
92
+ function Test-EvalsShape {
93
+ param([Parameter(Mandatory)] $Obj, [string]$Path)
94
+ $problems = New-Object System.Collections.Generic.List[string]
95
+ if (-not $Obj.skill) { $problems.Add("${Path}: missing 'skill'") }
96
+ if (-not $Obj.cases -or $Obj.cases.Count -lt 1) { $problems.Add("${Path}: needs >=1 case") }
97
+ foreach ($c in $Obj.cases) {
98
+ foreach ($req in 'id', 'name', 'input', 'grader_type', 'expected_assertions') {
99
+ if (-not $c.PSObject.Properties[$req]) { $problems.Add("${Path}: case missing '$req'") } # presence check, so falsy values still count as present
100
+ }
101
+ if ($c.grader_type -and ($c.grader_type -notin 'script', 'llm')) {
102
+ $problems.Add("${Path}: case '$($c.id)' has invalid grader_type '$($c.grader_type)'")
103
+ }
104
+ if ($c.PSObject.Properties['expected_assertions'] -and @($c.expected_assertions | Where-Object { $null -ne $_ }).Count -lt 1) { # an empty array is falsy, so test presence and count items rather than truthiness
105
+ $problems.Add("${Path}: case '$($c.id)' needs >=1 assertion")
106
+ }
107
+ }
108
+ return $problems
109
+ }
110
+
111
+ # ---------- Dispatch a single case ----------
112
+ # Invoke-Case: runs one eval case for $SkillName and returns a result object (pass flag, duration, token estimates, captured output, per-assertion results, error text).
113
+ function Invoke-Case {
114
+ param(
115
+ [string]$Root,
116
+ [string]$SkillName,
117
+ $Case
118
+ )
119
+ $skillDir = Join-Path $Root "plugin/skills/$SkillName"
120
+ $probePs1 = Join-Path $skillDir 'evals/probe.ps1'
121
+ $probeMjs = Join-Path $skillDir 'evals/probe.mjs'
122
+
123
+ $result = [ordered]@{
124
+ id = $Case.id
125
+ name = $Case.name
126
+ canary = [bool]$Case.canary
127
+ grader_type = $Case.grader_type
128
+ pass = $false
129
+ duration_ms = 0
130
+ tokens_in = 0
131
+ tokens_out = 0
132
+ stdout = ''
133
+ stderr = ''
134
+ assertions = @()
135
+ error = $null
136
+ }
137
+ # Skips are reported through the error field with a "skipped" prefix and pass = $false; no probe or assertion runs.
138
+ if ($Case.skip) {
139
+ $result.error = "skipped: $($Case.skip_reason)"
140
+ return [pscustomobject]$result
141
+ }
142
+ if ($Case.grader_type -eq 'llm' -and -not $Live) { # $Live is the script-level switch, not a parameter of this function
143
+ $result.error = 'skipped: llm-rubric cases require -Live'
144
+ return [pscustomobject]$result
145
+ }
146
+ if ($Case.grader_type -eq 'llm' -and $Live) {
147
+ $result.error = 'skipped: llm dispatch not implemented in OSS runner'
148
+ return [pscustomobject]$result
149
+ }
150
+
151
+ $fixturePath = $null
152
+ if ($Case.fixture) { $fixturePath = Join-Path $Root $Case.fixture } # fixture is taken relative to the repo root
153
+
154
+ $inputStr = if ($Case.input -is [string]) { $Case.input } else { ($Case.input | ConvertTo-Json -Compress -Depth 8) }
155
+ # Probe order: evals/probe.ps1, then evals/probe.mjs, then the built-in fixture-echo fallback.
156
+ $sw = [System.Diagnostics.Stopwatch]::StartNew()
157
+ try {
158
+ if (Test-Path $probePs1) {
159
+ $output = & pwsh -NoLogo -NoProfile -File $probePs1 -Input $inputStr -Fixture $fixturePath -CaseId $Case.id 2>&1 # 2>&1 folds stderr into the captured output. NOTE(review): $fixturePath is $null when the case has no fixture; confirm probe.ps1 tolerates that (the Node branch substitutes '')
160
+ $result.stdout = ($output | Out-String).Trim()
161
+ } elseif (Test-Path $probeMjs) {
162
+ $output = & node $probeMjs --input $inputStr --fixture ($fixturePath ?? '') --case-id $Case.id 2>&1
163
+ $result.stdout = ($output | Out-String).Trim()
164
+ } else {
165
+ # Built-in fixture-echo probe.
166
+ $readPath = $null
167
+ if ($Case.args -and $Case.args.read_fixture) {
168
+ if ($fixturePath) { $readPath = Join-Path $fixturePath $Case.args.read_fixture }
169
+ else { $readPath = Join-Path $Root $Case.args.read_fixture }
170
+ }
171
+ if ($readPath -and (Test-Path $readPath)) {
172
+ $result.stdout = Get-Content -Raw -Path $readPath
173
+ } else {
174
+ # Default: echo the input as the "answer".
175
+ $result.stdout = $inputStr # also reached when read_fixture names a file that does not exist
176
+ }
177
+ }
178
+ } catch {
179
+ $result.error = $_.Exception.Message
180
+ } finally {
181
+ $sw.Stop()
182
+ $result.duration_ms = [int]$sw.Elapsed.TotalMilliseconds
183
+ }
184
+
185
+ # Rough token estimate: 4 chars/token.
186
+ $result.tokens_in = [int][math]::Ceiling($inputStr.Length / 4.0)
187
+ $result.tokens_out = [int][math]::Ceiling($result.stdout.Length / 4.0)
188
+
189
+ # Run assertions
190
+ $allPass = $true
191
+ foreach ($a in $Case.expected_assertions) {
192
+ $ares = Invoke-Assertion -Assertion $a -CaseResult $result -FixturePath $fixturePath -Root $Root
193
+ $result.assertions += $ares
194
+ if (-not $ares.pass) { $allPass = $false }
195
+ }
196
+ if ($result.error) { $allPass = $false } # a probe error fails the case even when every assertion passed
197
+ $result.pass = $allPass
198
+ return [pscustomobject]$result
199
+ }
200
+
201
+ # ---------- Assertion implementations ----------
202
+ # Resolve-AssertPath: maps an assertion's path to a filesystem path. Rooted paths are returned as-is; otherwise the fixture-relative path wins when that file exists, else the path is taken relative to $Root. An empty path is rejected.
203
+ function Resolve-AssertPath {
204
+ param([ValidateNotNullOrEmpty()][string]$P, [string]$FixturePath, [string]$Root) # without the validation an absent 'path' binds as '' and resolves to the fixture/root directory, which always exists
205
+ if ([System.IO.Path]::IsPathRooted($P)) { return $P }
206
+ if ($FixturePath -and (Test-Path (Join-Path $FixturePath $P))) { return Join-Path $FixturePath $P }
207
+ return Join-Path $Root $P
208
+ }
209
+ # Invoke-Assertion: evaluates one assertion and returns { type, pass, reason }. Any exception raised while evaluating is caught and reported as a failed assertion.
210
+ function Invoke-Assertion {
211
+ param($Assertion, $CaseResult, [string]$FixturePath, [string]$Root)
212
+ $ar = [ordered]@{ type = $Assertion.type; pass = $false; reason = '' }
213
+ try {
214
+ switch ($Assertion.type) {
215
+ 'file-exists' {
216
+ $p = Resolve-AssertPath -P $Assertion.path -FixturePath $FixturePath -Root $Root
217
+ $ar.pass = Test-Path $p
218
+ if (-not $ar.pass) { $ar.reason = "missing: $p" }
219
+ }
220
+ 'file-contains' {
221
+ $p = Resolve-AssertPath -P $Assertion.path -FixturePath $FixturePath -Root $Root
222
+ if (-not (Test-Path $p)) { $ar.reason = "missing: $p" }
223
+ else {
224
+ $txt = Get-Content -Raw -Path $p
225
+ $ar.pass = $txt -and $txt.Contains([string]$Assertion.needle) # case-sensitive substring test; an empty file never matches
226
+ if (-not $ar.pass) { $ar.reason = "needle not found in $p" }
227
+ }
228
+ }
229
+ 'regex-match' {
230
+ $flags = if ($Assertion.flags) { $Assertion.flags } else { '' }
231
+ $opts = [System.Text.RegularExpressions.RegexOptions]::None
232
+ if ($flags -match 'i') { $opts = $opts -bor [System.Text.RegularExpressions.RegexOptions]::IgnoreCase }
233
+ if ($flags -match 'm') { $opts = $opts -bor [System.Text.RegularExpressions.RegexOptions]::Multiline }
234
+ $rx = [System.Text.RegularExpressions.Regex]::new($Assertion.pattern, $opts)
235
+ $hay = "$($CaseResult.stdout)`n$($CaseResult.stderr)" # the pattern is searched in stdout and stderr joined by a newline
236
+ $ar.pass = $rx.IsMatch($hay)
237
+ if (-not $ar.pass) { $ar.reason = "no match for /$($Assertion.pattern)/" }
238
+ }
239
+ 'json-path-equals' {
240
+ $p = Resolve-AssertPath -P $Assertion.path -FixturePath $FixturePath -Root $Root
241
+ if (-not (Test-Path $p)) { $ar.reason = "missing: $p" }
242
+ else {
243
+ $j = Get-Content -Raw -Path $p | ConvertFrom-Json
244
+ $val = $j
245
+ foreach ($seg in ($Assertion.json_path -replace '^\$\.?', '' -split '\.' | Where-Object { $_ })) { # drops a leading '$' or '$.' and walks dot-separated property names
246
+ if ($null -eq $val) { break }
247
+ $val = $val.$seg
248
+ }
249
+ if ($val -is [array]) { $val = ($val -join ',') }
250
+ $ar.pass = ("$val" -eq "$($Assertion.equals)") # both sides are compared as strings
251
+ if (-not $ar.pass) { $ar.reason = "expected '$($Assertion.equals)', got '$val'" }
252
+ }
253
+ }
254
+ 'llm-rubric' {
255
+ $ar.pass = $false
256
+ $ar.reason = 'llm-rubric not evaluable in script runner'
257
+ }
258
+ default {
259
+ $ar.reason = "unknown assertion type: $($Assertion.type)"
260
+ }
261
+ }
262
+ } catch {
263
+ $ar.reason = "assertion error: $($_.Exception.Message)" # pass stays $false
264
+ }
265
+ return [pscustomobject]$ar
266
+ }
267
+
268
+ # ---------- Main ----------
269
+ # Validate the mode flags, discover the evals files, run every selected case, and write the per-run JSON. Exit code 2 signals bad arguments or schema problems.
270
+ if ($Skill -and ($All -or $Canary)) {
271
+ Write-Err "-Skill cannot be combined with -All / -Canary"; exit 2
272
+ }
273
+ if (-not $Skill -and -not $All -and -not $Canary) {
274
+ Write-Err "Pick a mode: -Skill <name> | -All | -Canary"; exit 2
275
+ }
276
+
277
+ $evalFiles = Get-EvalFiles -Root $Root -Skill $Skill
278
+
279
+ if (-not $Output) {
280
+ $stamp = (Get-Date).ToUniversalTime().ToString('yyyyMMdd-HHmmss')
281
+ $Output = Join-Path $Root "Evidence/_evals/$stamp.json"
282
+ }
283
+ New-Item -ItemType Directory -Force -Path (Split-Path $Output -Parent) | Out-Null
284
+
285
+ $runRecord = [ordered]@{
286
+ schema = 'kushi.evals.run/v1'
287
+ generated_at = (Get-Date).ToUniversalTime().ToString('o')
288
+ mode = if ($Skill) { "skill:$Skill" } elseif ($Canary) { 'canary' } else { 'all' }
289
+ root = $Root
290
+ skills = @()
291
+ }
292
+ $schemaProblems = New-Object System.Collections.Generic.List[string]
293
+ $anyCaseFailed = $false
294
+
295
+ foreach ($f in $evalFiles) {
296
+ $obj = Get-Content -Raw $f | ConvertFrom-Json
297
+ $problems = Test-EvalsShape -Obj $obj -Path $f
298
+ foreach ($p in $problems) { $schemaProblems.Add($p) }
299
+ if ($problems.Count -gt 0) { continue } # a malformed file is recorded and skipped; the remaining files still run
300
+
301
+ $cases = $obj.cases
302
+ if ($Canary) { $cases = $cases | Where-Object { $_.canary } }
303
+ $skillEntry = [ordered]@{
304
+ skill = $obj.skill
305
+ evals_file = (Resolve-Path -Relative $f)
306
+ cases = @()
307
+ pass_count = 0
308
+ fail_count = 0
309
+ skipped_count = 0
310
+ }
311
+
312
+ foreach ($c in $cases) {
313
+ Write-Info "[$($obj.skill)] $($c.id) ..."
314
+ $r = Invoke-Case -Root $Root -SkillName $obj.skill -Case $c
315
+ $skillEntry.cases += $r
316
+ if ($r.error -and $r.error.StartsWith('skipped')) { $skillEntry.skipped_count++ } # skipped cases are identified by their error-text prefix
317
+ elseif ($r.pass) { $skillEntry.pass_count++ }
318
+ else { $skillEntry.fail_count++; $anyCaseFailed = $true }
319
+ }
320
+ $runRecord.skills += $skillEntry
321
+ }
322
+ # Schema problems abort the run here, after the valid files have executed and before anything is written.
323
+ if ($schemaProblems.Count -gt 0) {
324
+ Write-Err "Schema problems:"
325
+ foreach ($p in $schemaProblems) { Write-Err " - $p" }
326
+ exit 2
327
+ }
328
+
329
+ $runRecord | ConvertTo-Json -Depth 10 | Set-Content -LiteralPath $Output -Encoding UTF8
330
+ Write-Info "Run written: $Output"
331
+
332
+ # Aggregator + baseline compare. The baseline file is passed only when -Baseline is on (the default) and the file exists, or when -UpdateBaseline has to write it.
333
+ $aggregator = Join-Path $Root 'src/eval-aggregator.mjs'
334
+ $baselineFile = Join-Path $Root 'evals/baseline.json'
335
+ $benchFile = Join-Path $Root 'Evidence/_evals/benchmark.json'
336
+
337
+ if (Test-Path $aggregator) {
338
+ $aggArgs = @($aggregator, '--run', $Output, '--bench', $benchFile)
339
+ if (($Baseline -and (Test-Path $baselineFile -ErrorAction SilentlyContinue)) -or $UpdateBaseline) { # honor -Baseline:$false, which was previously ignored
340
+ $aggArgs += @('--baseline', $baselineFile)
341
+ }
342
+ if ($UpdateBaseline) { $aggArgs += '--update-baseline' }
343
+ & node @aggArgs
344
+ $aggExit = $LASTEXITCODE # propagated as the script's exit code when non-zero
345
+ } else {
346
+ Write-Warn 'src/eval-aggregator.mjs not found — skipping aggregation'
347
+ $aggExit = 0
348
+ }
349
+
350
+ # Print a one-screen summary
351
+ $total = 0; $pass = 0; $fail = 0; $skip = 0
352
+ foreach ($s in $runRecord.skills) {
353
+ $total += $s.pass_count + $s.fail_count + $s.skipped_count
354
+ $pass += $s.pass_count
355
+ $fail += $s.fail_count
356
+ $skip += $s.skipped_count
357
+ }
358
+ Write-Host ""
359
+ Write-Host "### Kushi evals — $($runRecord.mode)"
360
+ Write-Host "| Skill | Pass | Fail | Skip |"
361
+ Write-Host "|-------|-----:|-----:|-----:|"
362
+ foreach ($s in $runRecord.skills) {
363
+ Write-Host ("| {0} | {1} | {2} | {3} |" -f $s.skill, $s.pass_count, $s.fail_count, $s.skipped_count)
364
+ }
365
+ Write-Host ""
366
+ Write-Host "Total: $total · Pass: $pass · Fail: $fail · Skip: $skip"
367
+ Write-Host "Run JSON: $Output"
368
+ if (Test-Path $benchFile) { Write-Host "Benchmark: $benchFile" }
369
+ # Exit code: 1 when -StrictExit is set and any case failed; otherwise the aggregator's non-zero code; otherwise 0.
370
+ if ($StrictExit -and $anyCaseFailed) { exit 1 }
371
+ if ($aggExit -ne 0) { exit $aggExit }
372
+ exit 0