kushi-agents 5.0.2 → 5.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/README.md +22 -0
  2. package/package.json +6 -2
  3. package/plugin/agents/kushi.agent.md +1 -1
  4. package/plugin/instructions/skill-evals.instructions.md +130 -0
  5. package/plugin/skills/aggregate-project/evals/evals.json +33 -0
  6. package/plugin/skills/apply-ado-update/evals/evals.json +33 -0
  7. package/plugin/skills/ask-project/evals/evals.json +34 -0
  8. package/plugin/skills/bootstrap-project/evals/evals.json +34 -0
  9. package/plugin/skills/build-state/evals/evals.json +31 -0
  10. package/plugin/skills/consolidate-evidence/evals/evals.json +33 -0
  11. package/plugin/skills/dashboard/evals/evals.json +33 -0
  12. package/plugin/skills/emit-vertex/evals/evals.json +33 -0
  13. package/plugin/skills/eval/SKILL.md +90 -0
  14. package/plugin/skills/eval/evals.schema.json +73 -0
  15. package/plugin/skills/eval/run-evals.ps1 +372 -0
  16. package/plugin/skills/fde-intake/evals/evals.json +33 -0
  17. package/plugin/skills/fde-report/evals/evals.json +33 -0
  18. package/plugin/skills/fde-triage/evals/evals.json +33 -0
  19. package/plugin/skills/intro/evals/evals.json +33 -0
  20. package/plugin/skills/link-entities/evals/evals.json +31 -0
  21. package/plugin/skills/project-status/evals/evals.json +33 -0
  22. package/plugin/skills/propose-ado-update/evals/evals.json +33 -0
  23. package/plugin/skills/pull-ado/evals/evals.json +35 -0
  24. package/plugin/skills/pull-crm/evals/evals.json +35 -0
  25. package/plugin/skills/pull-email/evals/evals.json +35 -0
  26. package/plugin/skills/pull-loop/evals/evals.json +35 -0
  27. package/plugin/skills/pull-meetings/evals/evals.json +35 -0
  28. package/plugin/skills/pull-misc/evals/evals.json +35 -0
  29. package/plugin/skills/pull-onenote/evals/evals.json +35 -0
  30. package/plugin/skills/pull-sharepoint/evals/evals.json +35 -0
  31. package/plugin/skills/pull-teams/evals/evals.json +35 -0
  32. package/plugin/skills/refresh-project/evals/evals.json +31 -0
  33. package/plugin/skills/self-check/SKILL.md +1 -0
  34. package/plugin/skills/self-check/evals/evals.json +28 -0
  35. package/plugin/skills/self-check/run.ps1 +63 -0
  36. package/plugin/skills/setup/evals/evals.json +33 -0
  37. package/plugin/skills/tour/evals/evals.json +33 -0
  38. package/plugin/skills/vertex-link/evals/evals.json +33 -0
  39. package/src/eval-aggregator.mjs +209 -0
  40. package/src/eval-aggregator.test.mjs +64 -0
  41. package/src/eval-runner.test.mjs +69 -0
@@ -0,0 +1,372 @@
1
+ <#
2
+ .SYNOPSIS
3
+ Kushi skill evals runner (v5.0.3+).
4
+
5
+ .DESCRIPTION
6
+ Discovers per-skill evals at `plugin/skills/<name>/evals/evals.json`,
7
+ validates against `plugin/skills/eval/evals.schema.json`, dispatches each case,
8
+ runs the assertions, and writes a per-run JSON to `Evidence/_evals/<ts>.json`.
9
+
10
+ Per the doctrine in `plugin/instructions/skill-evals.instructions.md`, the
11
+ runner NEVER makes live M365 / ADO / CRM calls. Skills that genuinely require
12
+ live calls (`pull-*`) ship cached fixtures.
13
+
14
+ Dispatch:
15
+ - `grader_type: "script"` cases are dispatched to:
16
+ 1. `plugin/skills/<skill>/evals/probe.ps1` (preferred, per-skill probe)
17
+ 2. `plugin/skills/<skill>/evals/probe.mjs` (Node probe)
18
+ 3. The built-in "fixture-echo" probe, which reads `case.args.read_fixture`
19
+ and emits its content. Lets simple read-only assertions work with no
20
+ per-skill code at all.
21
+ - `grader_type: "llm"` cases are SKIPPED unless `-Live` is set; under `-Live`
22
+ they require an `m_*` sub-agent dispatcher (not implemented in OSS — flagged
23
+ as skipped with reason).
24
+
25
+ .PARAMETER Skill
26
+ Single skill to run. Conflicts with -All / -Canary.
27
+
28
+ .PARAMETER All
29
+ Run every plugin/skills/<name>/evals/evals.json except the `eval/` skill itself (`self-check/` is included because it ships its own meta-evals).
30
+
31
+ .PARAMETER Canary
32
+ Run only cases marked `"canary": true`.
33
+
34
+ .PARAMETER Output
35
+ Override the per-run JSON output path.
36
+
37
+ .PARAMETER Baseline
38
+ Compare against `evals/baseline.json` (default ON).
39
+
40
+ .PARAMETER UpdateBaseline
41
+ Write the current run's metrics into `evals/baseline.json` instead of comparing.
42
+
43
+ .PARAMETER Live
44
+ Allow LLM-rubric cases to attempt to dispatch (not implemented in OSS; will skip).
45
+
46
+ .PARAMETER StrictExit
47
+ Exit code 1 if any case fails (default: exit 1 only on regression vs baseline).
48
+
49
+ .PARAMETER Root
50
+ Repo root (default: three directory levels above the folder that contains this script, i.e. `..\..\..` relative to `plugin/skills/eval`).
51
+ #>
52
[CmdletBinding()]
param(
    # Name of a single skill to run; its evals file is looked up by Get-EvalFiles.
    [string]$Skill,
    # Run every skill that ships an evals/evals.json (discovery lives in Get-EvalFiles).
    [switch]$All,
    # Keep only the cases whose `canary` field is truthy.
    [switch]$Canary,
    # Path of the per-run JSON; when empty, a UTC-timestamped file under
    # Evidence/_evals is chosen in the main section.
    [string]$Output,
    # NOTE(review): defaults to on, but nothing else in this script reads $Baseline;
    # the baseline file is handed to the aggregator whenever it exists.
    [switch]$Baseline = $true,
    # Adds --baseline and --update-baseline to the aggregator invocation.
    [switch]$UpdateBaseline,
    # Read by Invoke-Case; llm-graded cases are reported as skipped either way.
    [switch]$Live,
    # Exit with code 1 when any case failed.
    [switch]$StrictExit,
    # Repository root; the default resolves ..\..\.. relative to this script's folder.
    [string]$Root = (Resolve-Path (Join-Path $PSScriptRoot "..\..\..")).Path
)

# Turn non-terminating cmdlet errors into terminating ones for the whole script.
$ErrorActionPreference = 'Stop'
66
+
67
# Console logging helpers. Each one prefixes the message with "kushi-eval: ";
# warnings are printed in yellow and errors in red.
function Write-Info {
    param($m)
    $line = "kushi-eval: $m"
    Write-Host $line
}

function Write-Warn {
    param($m)
    $line = "kushi-eval: $m"
    Write-Host $line -ForegroundColor Yellow
}

function Write-Err {
    param($m)
    $line = "kushi-eval: $m"
    Write-Host $line -ForegroundColor Red
}
70
+
71
+ # ---------- Discover skills ----------
72
+
73
function Get-EvalFiles {
    # Returns the evals.json path(s) to run.
    #   -Skill given : the single <Root>/plugin/skills/<Skill>/evals/evals.json,
    #                  wrapped in a one-element array; throws when it does not exist.
    #   otherwise    : one path per skill directory that contains evals/evals.json,
    #                  skipping the directory named 'eval' (the runner itself).
    param([string]$Root, [string]$Skill)

    $skillsRoot = Join-Path $Root 'plugin/skills'

    if (-not $Skill) {
        foreach ($dir in (Get-ChildItem -Path $skillsRoot -Directory)) {
            if ($dir.Name -eq 'eval') { continue }
            $candidate = Join-Path $dir.FullName 'evals/evals.json'
            # Emit only the candidates that actually exist on disk.
            if (Test-Path $candidate) { $candidate }
        }
        return
    }

    $single = Join-Path $skillsRoot "$Skill/evals/evals.json"
    if (Test-Path $single) { return ,$single }
    throw "No evals.json for skill '$Skill' at $single"
}
89
+
90
+ # ---------- Lightweight JSON-Schema-ish validator (just the cases we care about) ----------
91
+
92
function Test-EvalsShape {
    <#
    .SYNOPSIS
    Lightweight structural check of a parsed evals.json document.

    .DESCRIPTION
    Verifies that the document names a skill, has at least one case, that every
    case carries the required fields, that grader_type is 'script' or 'llm', and
    that expected_assertions holds at least one entry.

    .PARAMETER Obj
    The object produced by ConvertFrom-Json for one evals.json file.

    .PARAMETER Path
    Path of the file, used only as a prefix in the problem messages.

    .OUTPUTS
    Zero or more problem strings; no output means the document passed.
    #>
    param([Parameter(Mandatory)] $Obj, [string]$Path)
    $problems = New-Object System.Collections.Generic.List[string]
    if (-not $Obj.skill) { $problems.Add("${Path}: missing 'skill'") }
    if (-not $Obj.cases -or $Obj.cases.Count -lt 1) { $problems.Add("${Path}: needs >=1 case") }
    foreach ($c in $Obj.cases) {
        foreach ($req in 'id', 'name', 'input', 'grader_type', 'expected_assertions') {
            if (-not $c.PSObject.Properties[$req]) { $problems.Add("${Path}: case missing '$req'") }
        }
        if ($c.grader_type -and ($c.grader_type -notin 'script', 'llm')) {
            $problems.Add("${Path}: case '$($c.id)' has invalid grader_type '$($c.grader_type)'")
        }
        # An empty array (and $null) is falsy in PowerShell, so the emptiness check
        # must not be gated on the truthiness of the value itself. Gate on the
        # property being present instead; a missing property is already reported
        # by the required-field loop above.
        if ($c.PSObject.Properties['expected_assertions']) {
            $assertions = $c.expected_assertions
            if ($null -eq $assertions -or @($assertions).Count -lt 1) {
                $problems.Add("${Path}: case '$($c.id)' needs >=1 assertion")
            }
        }
    }
    return $problems
}
110
+
111
+ # ---------- Dispatch a single case ----------
112
+
113
function Invoke-Case {
    <#
    .SYNOPSIS
    Runs one eval case and grades it against its expected assertions.

    .DESCRIPTION
    Cases flagged `skip`, and every `llm`-graded case, return early with an
    `error` string that starts with "skipped". Script-graded cases are dispatched
    in this order: the skill's evals/probe.ps1 (through pwsh), then
    evals/probe.mjs (through node), then the built-in fixture-echo probe, which
    emits the file named by case.args.read_fixture or, failing that, the input.
    Probe stderr is merged into the captured stdout (2>&1); the `stderr` field
    is left empty. A probe that exits with a non-zero code is recorded as an
    error, which fails the case.
    $Live is not a parameter here; it is read from the calling scope.

    .PARAMETER Root
    Repository root used to locate the skill directory and the fixture.

    .PARAMETER SkillName
    Skill directory name under plugin/skills.

    .PARAMETER Case
    One entry of the `cases` array of a parsed evals.json.

    .OUTPUTS
    pscustomobject with id, name, canary, grader_type, pass, duration_ms,
    tokens_in, tokens_out, stdout, stderr, assertions and error.
    #>
    param(
        [string]$Root,
        [string]$SkillName,
        $Case
    )
    $skillDir = Join-Path $Root "plugin/skills/$SkillName"
    $probePs1 = Join-Path $skillDir 'evals/probe.ps1'
    $probeMjs = Join-Path $skillDir 'evals/probe.mjs'

    $result = [ordered]@{
        id          = $Case.id
        name        = $Case.name
        canary      = [bool]$Case.canary
        grader_type = $Case.grader_type
        pass        = $false
        duration_ms = 0
        tokens_in   = 0
        tokens_out  = 0
        stdout      = ''
        stderr      = ''
        assertions  = @()
        error       = $null
    }

    if ($Case.skip) {
        $result.error = "skipped: $($Case.skip_reason)"
        return [pscustomobject]$result
    }
    if ($Case.grader_type -eq 'llm' -and -not $Live) {
        $result.error = 'skipped: llm-rubric cases require -Live'
        return [pscustomobject]$result
    }
    if ($Case.grader_type -eq 'llm' -and $Live) {
        $result.error = 'skipped: llm dispatch not implemented in OSS runner'
        return [pscustomobject]$result
    }

    $fixturePath = $null
    if ($Case.fixture) { $fixturePath = Join-Path $Root $Case.fixture }

    # Non-string inputs are serialised so every probe receives a single string.
    $inputStr = if ($Case.input -is [string]) { $Case.input } else { ($Case.input | ConvertTo-Json -Compress -Depth 8) }

    $sw = [System.Diagnostics.Stopwatch]::StartNew()
    try {
        if (Test-Path $probePs1) {
            # Build the argument list explicitly: a $null argument is dropped when
            # passed to a native command, which would leave a bare -Fixture that
            # swallows -CaseId. Only pass -Fixture when a fixture is configured.
            $probeArgs = @('-NoLogo', '-NoProfile', '-File', $probePs1, '-Input', $inputStr)
            if ($fixturePath) { $probeArgs += @('-Fixture', $fixturePath) }
            $probeArgs += @('-CaseId', $Case.id)
            $output = & pwsh @probeArgs 2>&1
            $probeExit = $LASTEXITCODE
            $result.stdout = ($output | Out-String).Trim()
            if ($probeExit) { $result.error = "probe exited with code $probeExit" }
        } elseif (Test-Path $probeMjs) {
            $output = & node $probeMjs --input $inputStr --fixture ($fixturePath ?? '') --case-id $Case.id 2>&1
            $probeExit = $LASTEXITCODE
            $result.stdout = ($output | Out-String).Trim()
            if ($probeExit) { $result.error = "probe exited with code $probeExit" }
        } else {
            # Built-in fixture-echo probe.
            $readPath = $null
            if ($Case.args -and $Case.args.read_fixture) {
                if ($fixturePath) { $readPath = Join-Path $fixturePath $Case.args.read_fixture }
                else { $readPath = Join-Path $Root $Case.args.read_fixture }
            }
            if ($readPath -and (Test-Path $readPath)) {
                $result.stdout = Get-Content -Raw -Path $readPath
            } else {
                # Default: echo the input as the "answer".
                $result.stdout = $inputStr
            }
        }
    } catch {
        $result.error = $_.Exception.Message
    } finally {
        $sw.Stop()
        $result.duration_ms = [int]$sw.Elapsed.TotalMilliseconds
    }

    # Rough token estimate: 4 chars/token.
    $result.tokens_in = [int][math]::Ceiling($inputStr.Length / 4.0)
    $result.tokens_out = [int][math]::Ceiling($result.stdout.Length / 4.0)

    # Run assertions; the case passes only when all of them pass and no error
    # was recorded during dispatch.
    $allPass = $true
    foreach ($a in $Case.expected_assertions) {
        $ares = Invoke-Assertion -Assertion $a -CaseResult $result -FixturePath $fixturePath -Root $Root
        $result.assertions += $ares
        if (-not $ares.pass) { $allPass = $false }
    }
    if ($result.error) { $allPass = $false }
    $result.pass = $allPass
    return [pscustomobject]$result
}
200
+
201
+ # ---------- Assertion implementations ----------
202
+
203
function Resolve-AssertPath {
    # Maps an assertion's `path` to a concrete location:
    #   - a rooted path is returned unchanged;
    #   - a relative path is taken under the fixture directory when it exists there;
    #   - anything else is joined onto the repository root.
    param([string]$P, [string]$FixturePath, [string]$Root)

    if (-not [System.IO.Path]::IsPathRooted($P)) {
        if ($FixturePath) {
            $underFixture = Join-Path $FixturePath $P
            if (Test-Path $underFixture) { return $underFixture }
        }
        return Join-Path $Root $P
    }
    return $P
}
209
+
210
function Invoke-Assertion {
    # Evaluates a single assertion and reports { type, pass, reason }.
    # Supported types: file-exists, file-contains, regex-match (against the case's
    # stdout and stderr joined by a newline), json-path-equals (dot-separated path,
    # string comparison) and llm-rubric (always fails in this runner). Any exception
    # raised while evaluating is captured in `reason` instead of propagating.
    param($Assertion, $CaseResult, [string]$FixturePath, [string]$Root)

    $verdict = [ordered]@{ type = $Assertion.type; pass = $false; reason = '' }
    try {
        switch ($Assertion.type) {
            'file-exists' {
                $target = Resolve-AssertPath -P $Assertion.path -FixturePath $FixturePath -Root $Root
                $verdict.pass = Test-Path $target
                if (-not $verdict.pass) { $verdict.reason = "missing: $target" }
            }
            'file-contains' {
                $target = Resolve-AssertPath -P $Assertion.path -FixturePath $FixturePath -Root $Root
                if (Test-Path $target) {
                    $content = Get-Content -Raw -Path $target
                    # Empty content never contains the needle.
                    $found = $false
                    if ($content) { $found = $content.Contains([string]$Assertion.needle) }
                    $verdict.pass = $found
                    if (-not $verdict.pass) { $verdict.reason = "needle not found in $target" }
                } else {
                    $verdict.reason = "missing: $target"
                }
            }
            'regex-match' {
                $flagText = if ($Assertion.flags) { $Assertion.flags } else { '' }
                # Only the 'i' and 'm' flags are understood.
                $flagMap = [ordered]@{
                    i = [System.Text.RegularExpressions.RegexOptions]::IgnoreCase
                    m = [System.Text.RegularExpressions.RegexOptions]::Multiline
                }
                $regexOptions = [System.Text.RegularExpressions.RegexOptions]::None
                foreach ($letter in $flagMap.Keys) {
                    if ($flagText -match $letter) { $regexOptions = $regexOptions -bor $flagMap[$letter] }
                }
                $matcher = [System.Text.RegularExpressions.Regex]::new($Assertion.pattern, $regexOptions)
                $haystack = "$($CaseResult.stdout)`n$($CaseResult.stderr)"
                $verdict.pass = $matcher.IsMatch($haystack)
                if (-not $verdict.pass) { $verdict.reason = "no match for /$($Assertion.pattern)/" }
            }
            'json-path-equals' {
                $target = Resolve-AssertPath -P $Assertion.path -FixturePath $FixturePath -Root $Root
                if (Test-Path $target) {
                    $node = Get-Content -Raw -Path $target | ConvertFrom-Json
                    # Strip the leading "$." and walk the remaining property names.
                    $segments = $Assertion.json_path -replace '^\$\.?', '' -split '\.' | Where-Object { $_ }
                    foreach ($name in $segments) {
                        if ($null -eq $node) { break }
                        $node = $node.$name
                    }
                    if ($node -is [array]) { $node = ($node -join ',') }
                    $verdict.pass = ("$node" -eq "$($Assertion.equals)")
                    if (-not $verdict.pass) { $verdict.reason = "expected '$($Assertion.equals)', got '$node'" }
                } else {
                    $verdict.reason = "missing: $target"
                }
            }
            'llm-rubric' {
                $verdict.pass = $false
                $verdict.reason = 'llm-rubric not evaluable in script runner'
            }
            default {
                $verdict.reason = "unknown assertion type: $($Assertion.type)"
            }
        }
    } catch {
        $verdict.reason = "assertion error: $($_.Exception.Message)"
    }
    return [pscustomobject]$verdict
}
267
+
268
+ # ---------- Main ----------
269
+
270
# Mode validation: -Skill is exclusive with -All / -Canary, and at least one mode
# must be chosen. Usage errors exit with code 2.
if ($Skill -and ($All -or $Canary)) {
    Write-Err "-Skill cannot be combined with -All / -Canary"; exit 2
}
if (-not $Skill -and -not $All -and -not $Canary) {
    Write-Err "Pick a mode: -Skill <name> | -All | -Canary"; exit 2
}

# With an empty $Skill this discovers every skill that ships an evals.json.
$evalFiles = Get-EvalFiles -Root $Root -Skill $Skill

# Default output: Evidence/_evals/<UTC timestamp>.json under the root.
if (-not $Output) {
    $stamp = (Get-Date).ToUniversalTime().ToString('yyyyMMdd-HHmmss')
    $Output = Join-Path $Root "Evidence/_evals/$stamp.json"
}
New-Item -ItemType Directory -Force -Path (Split-Path $Output -Parent) | Out-Null

# Accumulator for the whole run; one entry per skill is appended to `skills`.
$runRecord = [ordered]@{
    schema = 'kushi.evals.run/v1'
    generated_at = (Get-Date).ToUniversalTime().ToString('o')
    mode = if ($Skill) { "skill:$Skill" } elseif ($Canary) { 'canary' } else { 'all' }
    root = $Root
    skills = @()
}
$schemaProblems = New-Object System.Collections.Generic.List[string]
$anyCaseFailed = $false

foreach ($f in $evalFiles) {
    $obj = Get-Content -Raw $f | ConvertFrom-Json
    $problems = Test-EvalsShape -Obj $obj -Path $f
    foreach ($p in $problems) { $schemaProblems.Add($p) }
    # Files with shape problems are not executed; the problems are reported after the loop.
    if ($problems.Count -gt 0) { continue }

    $cases = $obj.cases
    if ($Canary) { $cases = $cases | Where-Object { $_.canary } }
    $skillEntry = [ordered]@{
        skill = $obj.skill
        evals_file = (Resolve-Path -Relative $f)
        cases = @()
        pass_count = 0
        fail_count = 0
        skipped_count = 0
    }

    foreach ($c in $cases) {
        Write-Info "[$($obj.skill)] $($c.id) ..."
        $r = Invoke-Case -Root $Root -SkillName $obj.skill -Case $c
        $skillEntry.cases += $r
        # A result whose error text starts with "skipped" counts as skipped, not failed.
        if ($r.error -and $r.error.StartsWith('skipped')) { $skillEntry.skipped_count++ }
        elseif ($r.pass) { $skillEntry.pass_count++ }
        else { $skillEntry.fail_count++; $anyCaseFailed = $true }
    }
    $runRecord.skills += $skillEntry
}

# Any schema problem aborts with exit code 2 here, i.e. after the valid files
# have been run and before the run JSON is written.
if ($schemaProblems.Count -gt 0) {
    Write-Err "Schema problems:"
    foreach ($p in $schemaProblems) { Write-Err " - $p" }
    exit 2
}

$runRecord | ConvertTo-Json -Depth 10 | Set-Content -LiteralPath $Output -Encoding UTF8
Write-Info "Run written: $Output"

# Aggregator + baseline compare
$aggregator = Join-Path $Root 'src/eval-aggregator.mjs'
$baselineFile = Join-Path $Root 'evals/baseline.json'
$benchFile = Join-Path $Root 'Evidence/_evals/benchmark.json'

if (Test-Path $aggregator) {
    $aggArgs = @($aggregator, '--run', $Output, '--bench', $benchFile)
    # --baseline is passed when the baseline file exists or is about to be (re)written.
    if ((Test-Path $baselineFile -ErrorAction SilentlyContinue) -or $UpdateBaseline) {
        $aggArgs += @('--baseline', $baselineFile)
    }
    if ($UpdateBaseline) { $aggArgs += '--update-baseline' }
    & node @aggArgs
    # Keep the aggregator's exit code; it is propagated at the end of the script.
    $aggExit = $LASTEXITCODE
} else {
    Write-Warn 'src/eval-aggregator.mjs not found — skipping aggregation'
    $aggExit = 0
}

# Print a one-screen summary
$total = 0; $pass = 0; $fail = 0; $skip = 0
foreach ($s in $runRecord.skills) {
    $total += $s.pass_count + $s.fail_count + $s.skipped_count
    $pass += $s.pass_count
    $fail += $s.fail_count
    $skip += $s.skipped_count
}
Write-Host ""
Write-Host "### Kushi evals — $($runRecord.mode)"
Write-Host "| Skill | Pass | Fail | Skip |"
Write-Host "|-------|-----:|-----:|-----:|"
foreach ($s in $runRecord.skills) {
    Write-Host ("| {0} | {1} | {2} | {3} |" -f $s.skill, $s.pass_count, $s.fail_count, $s.skipped_count)
}
Write-Host ""
Write-Host "Total: $total · Pass: $pass · Fail: $fail · Skip: $skip"
Write-Host "Run JSON: $Output"
if (Test-Path $benchFile) { Write-Host "Benchmark: $benchFile" }

# Exit code precedence: -StrictExit with a failed case (1), then a non-zero
# aggregator exit code, otherwise 0.
if ($StrictExit -and $anyCaseFailed) { exit 1 }
if ($aggExit -ne 0) { exit $aggExit }
exit 0
@@ -0,0 +1,33 @@
1
+ {
2
+ "skill": "fde-intake",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for fde-intake. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "fde-intake-smoke-1",
8
+ "name": "fde-intake produces a non-empty response",
9
+ "input": "synthetic fde-intake probe — canary smoke",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": ".+"
16
+ }
17
+ ]
18
+ },
19
+ {
20
+ "id": "fde-intake-smoke-2",
21
+ "name": "fde-intake echoes case id",
22
+ "input": "case-id fde-intake-smoke-2",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ {
27
+ "type": "regex-match",
28
+ "pattern": "fde-intake-smoke-2"
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
@@ -0,0 +1,33 @@
1
+ {
2
+ "skill": "fde-report",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for fde-report. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "fde-report-smoke-1",
8
+ "name": "fde-report produces a non-empty response",
9
+ "input": "synthetic fde-report probe — canary smoke",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": ".+"
16
+ }
17
+ ]
18
+ },
19
+ {
20
+ "id": "fde-report-smoke-2",
21
+ "name": "fde-report echoes case id",
22
+ "input": "case-id fde-report-smoke-2",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ {
27
+ "type": "regex-match",
28
+ "pattern": "fde-report-smoke-2"
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
@@ -0,0 +1,33 @@
1
+ {
2
+ "skill": "fde-triage",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for fde-triage. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "fde-triage-smoke-1",
8
+ "name": "fde-triage produces a non-empty response",
9
+ "input": "synthetic fde-triage probe — canary smoke",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": ".+"
16
+ }
17
+ ]
18
+ },
19
+ {
20
+ "id": "fde-triage-smoke-2",
21
+ "name": "fde-triage echoes case id",
22
+ "input": "case-id fde-triage-smoke-2",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ {
27
+ "type": "regex-match",
28
+ "pattern": "fde-triage-smoke-2"
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
@@ -0,0 +1,33 @@
1
+ {
2
+ "skill": "intro",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for intro. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "intro-smoke-1",
8
+ "name": "intro produces a non-empty response",
9
+ "input": "synthetic intro probe — canary smoke",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": ".+"
16
+ }
17
+ ]
18
+ },
19
+ {
20
+ "id": "intro-smoke-2",
21
+ "name": "intro echoes case id",
22
+ "input": "case-id intro-smoke-2",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ {
27
+ "type": "regex-match",
28
+ "pattern": "intro-smoke-2"
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
@@ -0,0 +1,31 @@
1
+ {
2
+ "skill": "link-entities",
3
+ "version": "1.0.0",
4
+ "description": "Deterministic entity-graph output — closed edge taxonomy + nodes/edges schema.",
5
+ "cases": [
6
+ {
7
+ "id": "le-graph-schema",
8
+ "name": "fixture project-graph.json has v1 schema + nodes + edges",
9
+ "input": "validate graph schema",
10
+ "fixture": "evals/fixtures/fixture-acme",
11
+ "canary": true,
12
+ "grader_type": "script",
13
+ "expected_assertions": [
14
+ { "type": "file-exists", "path": "Evidence/_graph/project-graph.json" },
15
+ { "type": "json-path-equals", "path": "Evidence/_graph/project-graph.json", "json_path": "$.schema", "equals": "kushi.project-graph/v1" }
16
+ ]
17
+ },
18
+ {
19
+ "id": "le-closed-taxonomy",
20
+ "name": "every edge uses a kind from the closed taxonomy",
21
+ "input": "validate edges",
22
+ "fixture": "evals/fixtures/fixture-acme",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "args": { "read_fixture": "Evidence/_graph/project-graph.json" },
26
+ "expected_assertions": [
27
+ { "type": "regex-match", "pattern": "\"kind\":\\s*\"(references|decides|action-item-tracks|discusses|produced-by|follow-up-of|same-thread|participant-of)\"" }
28
+ ]
29
+ }
30
+ ]
31
+ }
@@ -0,0 +1,33 @@
1
+ {
2
+ "skill": "project-status",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for project-status. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "project-status-smoke-1",
8
+ "name": "project-status produces a non-empty response",
9
+ "input": "synthetic project-status probe — canary smoke",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": ".+"
16
+ }
17
+ ]
18
+ },
19
+ {
20
+ "id": "project-status-smoke-2",
21
+ "name": "project-status echoes case id",
22
+ "input": "case-id project-status-smoke-2",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ {
27
+ "type": "regex-match",
28
+ "pattern": "project-status-smoke-2"
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
@@ -0,0 +1,33 @@
1
+ {
2
+ "skill": "propose-ado-update",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for propose-ado-update. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "propose-ado-update-smoke-1",
8
+ "name": "propose-ado-update produces a non-empty response",
9
+ "input": "synthetic propose-ado-update probe — canary smoke",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": ".+"
16
+ }
17
+ ]
18
+ },
19
+ {
20
+ "id": "propose-ado-update-smoke-2",
21
+ "name": "propose-ado-update echoes case id",
22
+ "input": "case-id propose-ado-update-smoke-2",
23
+ "canary": false,
24
+ "grader_type": "script",
25
+ "expected_assertions": [
26
+ {
27
+ "type": "regex-match",
28
+ "pattern": "propose-ado-update-smoke-2"
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
@@ -0,0 +1,35 @@
1
+ {
2
+ "skill": "pull-ado",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for pull-ado. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "pull-ado-cached-1",
8
+ "name": "pull-ado cached/dry-run produces output",
9
+ "input": "--cached --dry-run fixture-acme",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": "fixture-acme",
16
+ "flags": "i"
17
+ }
18
+ ]
19
+ },
20
+ {
21
+ "id": "pull-ado-rubric-1",
22
+ "name": "pull-ado output quality (LLM-rubric, skipped in canary)",
23
+ "input": "summarize fixture-acme pull-ado pulls",
24
+ "canary": false,
25
+ "grader_type": "llm",
26
+ "expected_assertions": [
27
+ {
28
+ "type": "llm-rubric",
29
+ "rubric": "Does the pull-ado response cite a source file path and an ISO timestamp?",
30
+ "min_score": 4
31
+ }
32
+ ]
33
+ }
34
+ ]
35
+ }
@@ -0,0 +1,35 @@
1
+ {
2
+ "skill": "pull-crm",
3
+ "version": "1.0.0",
4
+ "description": "Auto-seeded evals for pull-crm. Replace with real cases as the skill matures.",
5
+ "cases": [
6
+ {
7
+ "id": "pull-crm-cached-1",
8
+ "name": "pull-crm cached/dry-run produces output",
9
+ "input": "--cached --dry-run fixture-acme",
10
+ "canary": false,
11
+ "grader_type": "script",
12
+ "expected_assertions": [
13
+ {
14
+ "type": "regex-match",
15
+ "pattern": "fixture-acme",
16
+ "flags": "i"
17
+ }
18
+ ]
19
+ },
20
+ {
21
+ "id": "pull-crm-rubric-1",
22
+ "name": "pull-crm output quality (LLM-rubric, skipped in canary)",
23
+ "input": "summarize fixture-acme pull-crm pulls",
24
+ "canary": false,
25
+ "grader_type": "llm",
26
+ "expected_assertions": [
27
+ {
28
+ "type": "llm-rubric",
29
+ "rubric": "Does the pull-crm response cite a source file path and an ISO timestamp?",
30
+ "min_score": 4
31
+ }
32
+ ]
33
+ }
34
+ ]
35
+ }