kushi-agents 5.0.2 → 5.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -0
- package/package.json +6 -2
- package/plugin/agents/kushi.agent.md +1 -1
- package/plugin/instructions/skill-evals.instructions.md +130 -0
- package/plugin/skills/aggregate-project/evals/evals.json +33 -0
- package/plugin/skills/apply-ado-update/evals/evals.json +33 -0
- package/plugin/skills/ask-project/evals/evals.json +34 -0
- package/plugin/skills/bootstrap-project/evals/evals.json +34 -0
- package/plugin/skills/build-state/evals/evals.json +31 -0
- package/plugin/skills/consolidate-evidence/evals/evals.json +33 -0
- package/plugin/skills/dashboard/evals/evals.json +33 -0
- package/plugin/skills/emit-vertex/evals/evals.json +33 -0
- package/plugin/skills/eval/SKILL.md +90 -0
- package/plugin/skills/eval/evals.schema.json +73 -0
- package/plugin/skills/eval/run-evals.ps1 +372 -0
- package/plugin/skills/fde-intake/evals/evals.json +33 -0
- package/plugin/skills/fde-report/evals/evals.json +33 -0
- package/plugin/skills/fde-triage/evals/evals.json +33 -0
- package/plugin/skills/intro/evals/evals.json +33 -0
- package/plugin/skills/link-entities/evals/evals.json +31 -0
- package/plugin/skills/project-status/evals/evals.json +33 -0
- package/plugin/skills/propose-ado-update/evals/evals.json +33 -0
- package/plugin/skills/pull-ado/evals/evals.json +35 -0
- package/plugin/skills/pull-crm/evals/evals.json +35 -0
- package/plugin/skills/pull-email/evals/evals.json +35 -0
- package/plugin/skills/pull-loop/evals/evals.json +35 -0
- package/plugin/skills/pull-meetings/evals/evals.json +35 -0
- package/plugin/skills/pull-misc/evals/evals.json +35 -0
- package/plugin/skills/pull-onenote/evals/evals.json +35 -0
- package/plugin/skills/pull-sharepoint/evals/evals.json +35 -0
- package/plugin/skills/pull-teams/evals/evals.json +35 -0
- package/plugin/skills/refresh-project/evals/evals.json +31 -0
- package/plugin/skills/self-check/SKILL.md +1 -0
- package/plugin/skills/self-check/evals/evals.json +28 -0
- package/plugin/skills/self-check/run.ps1 +63 -0
- package/plugin/skills/setup/evals/evals.json +33 -0
- package/plugin/skills/tour/evals/evals.json +33 -0
- package/plugin/skills/vertex-link/evals/evals.json +33 -0
- package/src/eval-aggregator.mjs +209 -0
- package/src/eval-aggregator.test.mjs +64 -0
- package/src/eval-runner.test.mjs +69 -0
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
<#
|
|
2
|
+
.SYNOPSIS
|
|
3
|
+
Kushi skill evals runner (v5.0.3+).
|
|
4
|
+
|
|
5
|
+
.DESCRIPTION
|
|
6
|
+
Discovers per-skill evals at `plugin/skills/<name>/evals/evals.json`,
|
|
7
|
+
validates against `plugin/skills/eval/evals.schema.json`, dispatches each case,
|
|
8
|
+
runs the assertions, and writes a per-run JSON to `Evidence/_evals/<ts>.json`.
|
|
9
|
+
|
|
10
|
+
Per the doctrine in `plugin/instructions/skill-evals.instructions.md`, the
|
|
11
|
+
runner NEVER makes live M365 / ADO / CRM calls. Skills that genuinely require
|
|
12
|
+
live calls (`pull-*`) ship cached fixtures.
|
|
13
|
+
|
|
14
|
+
Dispatch:
|
|
15
|
+
- `grader_type: "script"` cases are dispatched to:
|
|
16
|
+
1. `plugin/skills/<skill>/evals/probe.ps1` (preferred, per-skill probe)
|
|
17
|
+
2. `plugin/skills/<skill>/evals/probe.mjs` (Node probe)
|
|
18
|
+
3. The built-in "fixture-echo" probe, which reads `case.args.read_fixture`
|
|
19
|
+
and emits its content. Lets simple read-only assertions work with no
|
|
20
|
+
per-skill code at all.
|
|
21
|
+
- `grader_type: "llm"` cases are SKIPPED unless `-Live` is set; under `-Live`
|
|
22
|
+
they require an `m_*` sub-agent dispatcher (not implemented in OSS — flagged
|
|
23
|
+
as skipped with reason).
|
|
24
|
+
|
|
25
|
+
.PARAMETER Skill
|
|
26
|
+
Single skill to run. Conflicts with -All / -Canary.
|
|
27
|
+
|
|
28
|
+
.PARAMETER All
|
|
29
|
+
Run every plugin/skills/<name>/evals/evals.json (except `eval/`, which is the runner itself; `self-check/` is included because it ships its own meta-evals).
|
|
30
|
+
|
|
31
|
+
.PARAMETER Canary
|
|
32
|
+
Run only cases marked `"canary": true`.
|
|
33
|
+
|
|
34
|
+
.PARAMETER Output
|
|
35
|
+
Override the per-run JSON output path.
|
|
36
|
+
|
|
37
|
+
.PARAMETER Baseline
|
|
38
|
+
Compare against `evals/baseline.json` (default ON).
|
|
39
|
+
|
|
40
|
+
.PARAMETER UpdateBaseline
|
|
41
|
+
Write the current run's metrics into `evals/baseline.json` instead of comparing.
|
|
42
|
+
|
|
43
|
+
.PARAMETER Live
|
|
44
|
+
Allow LLM-rubric cases to attempt to dispatch (not implemented in OSS; will skip).
|
|
45
|
+
|
|
46
|
+
.PARAMETER StrictExit
|
|
47
|
+
Exit code 1 if any case fails (default: exit 1 only on regression vs baseline).
|
|
48
|
+
|
|
49
|
+
.PARAMETER Root
|
|
50
|
+
Repo root (default: three directory levels above the folder that contains this script, i.e. the directory that holds `plugin/`).
|
|
51
|
+
#>
|
|
52
|
+
[CmdletBinding()]
param(
    # Name of the single skill whose evals should run.
    [string]
    $Skill,

    # Run the evals of every discovered skill.
    [switch]
    $All,

    # Restrict the run to cases flagged as canary.
    [switch]
    $Canary,

    # Explicit path for the per-run JSON record.
    [string]
    $Output,

    # Baseline comparison toggle; on unless explicitly switched off.
    [switch]
    $Baseline = $true,

    # Ask the aggregator to rewrite the baseline instead of comparing against it.
    [switch]
    $UpdateBaseline,

    # Opt in to LLM-graded cases.
    [switch]
    $Live,

    # Make any failing case produce exit code 1.
    [switch]
    $StrictExit,

    # Repository root; derived from the script location when not supplied.
    [string]
    $Root = (Resolve-Path (Join-Path $PSScriptRoot "..\..\..")).Path
)

# Turn every error into a terminating one so failures surface in try/catch.
$ErrorActionPreference = 'Stop'

# Console helpers: all messages share the "kushi-eval: " prefix and differ only in colour.
function Write-Info {
    param($m)
    Write-Host "kushi-eval: $m"
}

function Write-Warn {
    param($m)
    Write-Host "kushi-eval: $m" -ForegroundColor Yellow
}

function Write-Err {
    param($m)
    Write-Host "kushi-eval: $m" -ForegroundColor Red
}
|
|
70
|
+
|
|
71
|
+
# ---------- Discover skills ----------
|
|
72
|
+
|
|
73
|
+
function Get-EvalFiles {
    <#
    .SYNOPSIS
    Lists the evals.json file(s) that the current run should process.

    .DESCRIPTION
    With -Skill, returns exactly that skill's `evals/evals.json` and throws when the
    file does not exist. Without -Skill, walks every directory under
    `<Root>/plugin/skills` and emits each existing `evals/evals.json`, skipping the
    `eval` directory (the runner itself). `self-check` is not skipped.

    .PARAMETER Root
    Repository root that contains `plugin/skills`.

    .PARAMETER Skill
    Optional skill name; empty means "discover all".

    .OUTPUTS
    Zero or more evals.json paths (strings).
    #>
    param([string]$Root, [string]$Skill)

    $skillsRoot = Join-Path $Root 'plugin/skills'

    if (-not $Skill) {
        # Discovery mode: one candidate per skill folder, kept only if the file exists.
        foreach ($folder in (Get-ChildItem -Path $skillsRoot -Directory)) {
            if ($folder.Name -eq 'eval') { continue }
            $candidate = Join-Path $folder.FullName 'evals/evals.json'
            if (Test-Path $candidate) { $candidate }
        }
        return
    }

    # Single-skill mode: the file must be there.
    $single = Join-Path $skillsRoot "$Skill/evals/evals.json"
    if (Test-Path $single) {
        return ,$single
    }
    throw "No evals.json for skill '$Skill' at $single"
}
|
|
89
|
+
|
|
90
|
+
# ---------- Lightweight JSON-Schema-ish validator (just the cases we care about) ----------
|
|
91
|
+
|
|
92
|
+
function Test-EvalsShape {
    <#
    .SYNOPSIS
    Lightweight structural validation of a parsed evals.json document.

    .DESCRIPTION
    Checks that the document names a skill, has at least one case, and that every
    case carries the required properties, a known grader_type, and at least one
    assertion. Nothing is thrown; every violation is reported as a message.

    .PARAMETER Obj
    The object produced by ConvertFrom-Json for one evals.json file.

    .PARAMETER Path
    File path used as the prefix of every reported problem.

    .OUTPUTS
    Zero or more problem strings; no output means the document is acceptable.
    #>
    param([Parameter(Mandatory)] $Obj, [string]$Path)

    $problems = New-Object System.Collections.Generic.List[string]

    if (-not $Obj.skill) { $problems.Add("${Path}: missing 'skill'") }
    if (-not $Obj.cases -or $Obj.cases.Count -lt 1) { $problems.Add("${Path}: needs >=1 case") }

    foreach ($c in $Obj.cases) {
        foreach ($req in 'id', 'name', 'input', 'grader_type', 'expected_assertions') {
            if (-not $c.PSObject.Properties[$req]) { $problems.Add("${Path}: case missing '$req'") }
        }

        if ($c.grader_type -and ($c.grader_type -notin 'script', 'llm')) {
            $problems.Add("${Path}: case '$($c.id)' has invalid grader_type '$($c.grader_type)'")
        }

        # An empty array is falsy in PowerShell, so a truthiness guard can never reach
        # the "count < 1" branch. Test for the property itself instead; a missing
        # property is already reported by the required-properties loop above.
        if ($c.PSObject.Properties['expected_assertions']) {
            $assertions = $c.expected_assertions
            # @($null).Count is 1, so null has to be checked separately.
            if ($null -eq $assertions -or @($assertions).Count -lt 1) {
                $problems.Add("${Path}: case '$($c.id)' needs >=1 assertion")
            }
        }
    }

    return $problems
}
|
|
110
|
+
|
|
111
|
+
# ---------- Dispatch a single case ----------
|
|
112
|
+
|
|
113
|
+
function Invoke-Case {
    <#
    .SYNOPSIS
    Runs one eval case for a skill and grades it against its expected assertions.

    .DESCRIPTION
    Skipped cases (an explicit `skip`, or any `llm` grader) return immediately with
    an `error` that starts with "skipped". Every other case is dispatched, in order
    of preference, to:
      1. `plugin/skills/<skill>/evals/probe.ps1` (run through `pwsh`)
      2. `plugin/skills/<skill>/evals/probe.mjs` (run through `node`)
      3. the built-in fixture-echo probe, which returns the content of
         `case.args.read_fixture` when that file exists and otherwise echoes the
         case input.
    The captured output is then checked by Invoke-Assertion for each entry of
    `expected_assertions`. A case passes only when every assertion passes and no
    error was recorded.

    The `-Live` decision is read from the `$Live` variable of the calling scope
    (the runner's script-level switch); it is not a parameter of this function.

    .PARAMETER Root
    Repository root; probe and fixture paths are resolved against it.

    .PARAMETER SkillName
    Name of the skill directory under `plugin/skills`.

    .PARAMETER Case
    One case object from evals.json.

    .OUTPUTS
    A PSCustomObject with id, name, canary, grader_type, pass, duration_ms,
    tokens_in, tokens_out, stdout, stderr, assertions and error.
    #>
    param(
        [string]$Root,
        [string]$SkillName,
        $Case
    )

    $skillDir = Join-Path $Root "plugin/skills/$SkillName"
    $probePs1 = Join-Path $skillDir 'evals/probe.ps1'
    $probeMjs = Join-Path $skillDir 'evals/probe.mjs'

    $result = [ordered]@{
        id          = $Case.id
        name        = $Case.name
        canary      = [bool]$Case.canary
        grader_type = $Case.grader_type
        pass        = $false
        duration_ms = 0
        tokens_in   = 0
        tokens_out  = 0
        stdout      = ''
        stderr      = ''
        assertions  = @()
        error       = $null
    }

    if ($Case.skip) {
        $result.error = "skipped: $($Case.skip_reason)"
        return [pscustomobject]$result
    }
    if ($Case.grader_type -eq 'llm' -and -not $Live) {
        $result.error = 'skipped: llm-rubric cases require -Live'
        return [pscustomobject]$result
    }
    if ($Case.grader_type -eq 'llm' -and $Live) {
        $result.error = 'skipped: llm dispatch not implemented in OSS runner'
        return [pscustomobject]$result
    }

    $fixturePath = $null
    if ($Case.fixture) { $fixturePath = Join-Path $Root $Case.fixture }

    # Non-string inputs are handed to probes as compact JSON.
    $inputStr = if ($Case.input -is [string]) { $Case.input } else { ($Case.input | ConvertTo-Json -Compress -Depth 8) }

    $sw = [System.Diagnostics.Stopwatch]::StartNew()
    try {
        if (Test-Path $probePs1) {
            # Build the argument list explicitly. PowerShell drops a $null argument
            # when calling a native command, so passing "-Fixture $null" would leave
            # a dangling "-Fixture" directly in front of "-CaseId" and break
            # parameter binding in the probe. Only pass -Fixture when there is one.
            $probeArgs = @('-NoLogo', '-NoProfile', '-File', $probePs1, '-Input', $inputStr)
            if ($fixturePath) { $probeArgs += @('-Fixture', $fixturePath) }
            $probeArgs += @('-CaseId', $Case.id)

            $output = & pwsh @probeArgs 2>&1
            $probeExit = $LASTEXITCODE
            $result.stdout = ($output | Out-String).Trim()
            # A probe that exits non-zero did not complete; do not let partial
            # output satisfy the assertions.
            if ($probeExit -ne 0) { $result.error = "probe.ps1 exited with code $probeExit" }
        } elseif (Test-Path $probeMjs) {
            $output = & node $probeMjs --input $inputStr --fixture ($fixturePath ?? '') --case-id $Case.id 2>&1
            $probeExit = $LASTEXITCODE
            $result.stdout = ($output | Out-String).Trim()
            if ($probeExit -ne 0) { $result.error = "probe.mjs exited with code $probeExit" }
        } else {
            # Built-in fixture-echo probe.
            $readPath = $null
            if ($Case.args -and $Case.args.read_fixture) {
                if ($fixturePath) { $readPath = Join-Path $fixturePath $Case.args.read_fixture }
                else { $readPath = Join-Path $Root $Case.args.read_fixture }
            }
            if ($readPath -and (Test-Path $readPath)) {
                $result.stdout = Get-Content -Raw -Path $readPath
            } else {
                # Default: echo the input as the "answer".
                $result.stdout = $inputStr
            }
        }
    } catch {
        $result.error = $_.Exception.Message
    } finally {
        $sw.Stop()
        $result.duration_ms = [int]$sw.Elapsed.TotalMilliseconds
    }

    # Rough token estimate: 4 chars/token.
    $result.tokens_in = [int][math]::Ceiling($inputStr.Length / 4.0)
    $result.tokens_out = [int][math]::Ceiling($result.stdout.Length / 4.0)

    # Run assertions. They are evaluated even after a dispatch error so the run
    # record shows what each assertion saw; the error still forces a failure below.
    $allPass = $true
    foreach ($a in $Case.expected_assertions) {
        $ares = Invoke-Assertion -Assertion $a -CaseResult $result -FixturePath $fixturePath -Root $Root
        $result.assertions += $ares
        if (-not $ares.pass) { $allPass = $false }
    }
    if ($result.error) { $allPass = $false }
    $result.pass = $allPass
    return [pscustomobject]$result
}
|
|
200
|
+
|
|
201
|
+
# ---------- Assertion implementations ----------
|
|
202
|
+
|
|
203
|
+
function Resolve-AssertPath {
    <#
    .SYNOPSIS
    Turns an assertion's `path` into the concrete path to inspect.

    .DESCRIPTION
    Resolution order:
      1. a rooted path is returned unchanged;
      2. if a fixture directory is given and the path exists beneath it, the
         fixture-relative location wins;
      3. otherwise the path is taken relative to the repository root (whether or
         not it exists there).

    .PARAMETER P
    Path as written in the assertion.

    .PARAMETER FixturePath
    Optional fixture directory of the current case.

    .PARAMETER Root
    Repository root used as the fallback base.

    .OUTPUTS
    The resolved path string.
    #>
    param([string]$P, [string]$FixturePath, [string]$Root)

    if ([System.IO.Path]::IsPathRooted($P)) {
        return $P
    }

    if ($FixturePath) {
        $underFixture = Join-Path $FixturePath $P
        if (Test-Path $underFixture) {
            return $underFixture
        }
    }

    return (Join-Path $Root $P)
}
|
|
209
|
+
|
|
210
|
+
function Invoke-Assertion {
    <#
    .SYNOPSIS
    Evaluates a single assertion against a case result.

    .DESCRIPTION
    Supported assertion types:
      file-exists       `path` must exist.
      file-contains     file at `path` must contain the literal `needle`.
      regex-match       `pattern` must match the case's stdout + newline + stderr;
                        `flags` may contain `i` (ignore case) and/or `m` (multiline).
      json-path-equals  value reached by the dotted `json_path` in the JSON file at
                        `path` must equal `equals` (compared as strings,
                        case-sensitively; arrays are joined with commas).
      llm-rubric        always fails here; it cannot be graded by a script runner.
    Any other type fails with an "unknown assertion type" reason. Exceptions raised
    while evaluating are caught and reported as the failure reason.

    File-based assertions without a `path`, and `file-contains` without a `needle`,
    fail explicitly. Otherwise an empty path would resolve to the root/fixture
    directory itself and an empty needle would match any file, so a malformed
    assertion would pass vacuously.

    .PARAMETER Assertion
    One entry of a case's `expected_assertions`.

    .PARAMETER CaseResult
    The case result whose `stdout` / `stderr` are inspected by regex-match.

    .PARAMETER FixturePath
    Optional fixture directory used to resolve relative paths.

    .PARAMETER Root
    Repository root used to resolve relative paths.

    .OUTPUTS
    A PSCustomObject with `type`, `pass` and `reason`.
    #>
    param($Assertion, $CaseResult, [string]$FixturePath, [string]$Root)

    $ar = [ordered]@{ type = $Assertion.type; pass = $false; reason = '' }
    try {
        $needsPath = $Assertion.type -in 'file-exists', 'file-contains', 'json-path-equals'
        if ($needsPath -and [string]::IsNullOrWhiteSpace([string]$Assertion.path)) {
            $ar.reason = "assertion '$($Assertion.type)' is missing 'path'"
            return [pscustomobject]$ar
        }

        switch ($Assertion.type) {
            'file-exists' {
                $p = Resolve-AssertPath -P $Assertion.path -FixturePath $FixturePath -Root $Root
                $ar.pass = Test-Path $p
                if (-not $ar.pass) { $ar.reason = "missing: $p" }
            }
            'file-contains' {
                $needle = [string]$Assertion.needle
                $p = Resolve-AssertPath -P $Assertion.path -FixturePath $FixturePath -Root $Root
                if ($needle.Length -eq 0) { $ar.reason = "assertion 'file-contains' is missing 'needle'" }
                elseif (-not (Test-Path $p)) { $ar.reason = "missing: $p" }
                else {
                    $txt = Get-Content -Raw -Path $p
                    $ar.pass = $txt -and $txt.Contains($needle)
                    if (-not $ar.pass) { $ar.reason = "needle not found in $p" }
                }
            }
            'regex-match' {
                $flags = if ($Assertion.flags) { $Assertion.flags } else { '' }
                $opts = [System.Text.RegularExpressions.RegexOptions]::None
                if ($flags -match 'i') { $opts = $opts -bor [System.Text.RegularExpressions.RegexOptions]::IgnoreCase }
                if ($flags -match 'm') { $opts = $opts -bor [System.Text.RegularExpressions.RegexOptions]::Multiline }
                $rx = [System.Text.RegularExpressions.Regex]::new($Assertion.pattern, $opts)
                # Both streams are searched as one text block.
                $hay = "$($CaseResult.stdout)`n$($CaseResult.stderr)"
                $ar.pass = $rx.IsMatch($hay)
                if (-not $ar.pass) { $ar.reason = "no match for /$($Assertion.pattern)/" }
            }
            'json-path-equals' {
                $p = Resolve-AssertPath -P $Assertion.path -FixturePath $FixturePath -Root $Root
                if (-not (Test-Path $p)) { $ar.reason = "missing: $p" }
                else {
                    $j = Get-Content -Raw -Path $p | ConvertFrom-Json
                    $val = $j
                    # Strip the leading "$" / "$." and walk the remaining dotted segments.
                    $trimmed = $Assertion.json_path -replace '^\$\.?', ''
                    $segments = $trimmed -split '\.' | Where-Object { $_ }
                    foreach ($seg in $segments) {
                        if ($null -eq $val) { break }
                        $val = $val.$seg
                    }
                    if ($val -is [array]) { $val = ($val -join ',') }
                    # PowerShell's -eq ignores case for strings; an equality assertion
                    # must not, so compare ordinally.
                    $ar.pass = [string]::Equals("$val", "$($Assertion.equals)", [System.StringComparison]::Ordinal)
                    if (-not $ar.pass) { $ar.reason = "expected '$($Assertion.equals)', got '$val'" }
                }
            }
            'llm-rubric' {
                $ar.pass = $false
                $ar.reason = 'llm-rubric not evaluable in script runner'
            }
            default {
                $ar.reason = "unknown assertion type: $($Assertion.type)"
            }
        }
    } catch {
        $ar.reason = "assertion error: $($_.Exception.Message)"
    }
    return [pscustomobject]$ar
}
|
|
267
|
+
|
|
268
|
+
# ---------- Main ----------
|
|
269
|
+
|
|
270
|
+
# Exactly one mode must be selected: a single skill, or -All / -Canary.
if ($Skill -and ($All -or $Canary)) {
    Write-Err "-Skill cannot be combined with -All / -Canary"; exit 2
}
if (-not $Skill -and -not $All -and -not $Canary) {
    Write-Err "Pick a mode: -Skill <name> | -All | -Canary"; exit 2
}

$evalFiles = Get-EvalFiles -Root $Root -Skill $Skill

# Default output: Evidence/_evals/<UTC timestamp>.json under the repo root.
if (-not $Output) {
    $stamp = (Get-Date).ToUniversalTime().ToString('yyyyMMdd-HHmmss')
    $Output = Join-Path $Root "Evidence/_evals/$stamp.json"
}
New-Item -ItemType Directory -Force -Path (Split-Path $Output -Parent) | Out-Null

$runRecord = [ordered]@{
    schema       = 'kushi.evals.run/v1'
    generated_at = (Get-Date).ToUniversalTime().ToString('o')
    mode         = if ($Skill) { "skill:$Skill" } elseif ($Canary) { 'canary' } else { 'all' }
    root         = $Root
    skills       = @()
}
$schemaProblems = New-Object System.Collections.Generic.List[string]
$anyCaseFailed = $false

foreach ($f in $evalFiles) {
    # A file that cannot be parsed is reported with the other schema problems
    # (exit code 2) instead of aborting the whole run with an unhandled error.
    try {
        $obj = Get-Content -Raw $f | ConvertFrom-Json
    } catch {
        $schemaProblems.Add("${f}: invalid JSON: $($_.Exception.Message)")
        continue
    }
    if ($null -eq $obj) {
        $schemaProblems.Add("${f}: empty or null JSON document")
        continue
    }

    $problems = Test-EvalsShape -Obj $obj -Path $f
    foreach ($p in $problems) { $schemaProblems.Add($p) }
    if ($problems.Count -gt 0) { continue }

    $cases = $obj.cases
    if ($Canary) { $cases = $cases | Where-Object { $_.canary } }
    $skillEntry = [ordered]@{
        skill         = $obj.skill
        evals_file    = (Resolve-Path -Relative $f)
        cases         = @()
        pass_count    = 0
        fail_count    = 0
        skipped_count = 0
    }

    foreach ($c in $cases) {
        Write-Info "[$($obj.skill)] $($c.id) ..."
        $r = Invoke-Case -Root $Root -SkillName $obj.skill -Case $c
        $skillEntry.cases += $r
        # Skips are signalled through an error text that starts with "skipped".
        if ($r.error -and $r.error.StartsWith('skipped')) { $skillEntry.skipped_count++ }
        elseif ($r.pass) { $skillEntry.pass_count++ }
        else { $skillEntry.fail_count++; $anyCaseFailed = $true }
    }
    $runRecord.skills += $skillEntry
}

# Any malformed evals file invalidates the run: report and stop before writing.
if ($schemaProblems.Count -gt 0) {
    Write-Err "Schema problems:"
    foreach ($p in $schemaProblems) { Write-Err "  - $p" }
    exit 2
}

$runRecord | ConvertTo-Json -Depth 10 | Set-Content -LiteralPath $Output -Encoding UTF8
Write-Info "Run written: $Output"

# Aggregator + baseline compare
$aggregator = Join-Path $Root 'src/eval-aggregator.mjs'
$baselineFile = Join-Path $Root 'evals/baseline.json'
$benchFile = Join-Path $Root 'Evidence/_evals/benchmark.json'

if (Test-Path $aggregator) {
    $aggArgs = @($aggregator, '--run', $Output, '--bench', $benchFile)
    # Honour -Baseline:$false: the baseline file is only handed to the aggregator
    # when comparison is enabled (the default) and the file exists, or when the
    # baseline is being rewritten.
    $compareBaseline = $Baseline -and (Test-Path $baselineFile -ErrorAction SilentlyContinue)
    if ($compareBaseline -or $UpdateBaseline) {
        $aggArgs += @('--baseline', $baselineFile)
    }
    if ($UpdateBaseline) { $aggArgs += '--update-baseline' }
    & node @aggArgs
    $aggExit = $LASTEXITCODE
} else {
    Write-Warn 'src/eval-aggregator.mjs not found — skipping aggregation'
    $aggExit = 0
}

# Print a one-screen summary
$total = 0; $pass = 0; $fail = 0; $skip = 0
foreach ($s in $runRecord.skills) {
    $total += $s.pass_count + $s.fail_count + $s.skipped_count
    $pass += $s.pass_count
    $fail += $s.fail_count
    $skip += $s.skipped_count
}
Write-Host ""
Write-Host "### Kushi evals — $($runRecord.mode)"
Write-Host "| Skill | Pass | Fail | Skip |"
Write-Host "|-------|-----:|-----:|-----:|"
foreach ($s in $runRecord.skills) {
    Write-Host ("| {0} | {1} | {2} | {3} |" -f $s.skill, $s.pass_count, $s.fail_count, $s.skipped_count)
}
Write-Host ""
Write-Host "Total: $total · Pass: $pass · Fail: $fail · Skip: $skip"
Write-Host "Run JSON: $Output"
if (Test-Path $benchFile) { Write-Host "Benchmark: $benchFile" }

# Exit code: 1 for any failed case under -StrictExit, otherwise whatever the
# aggregator returned, otherwise success.
if ($StrictExit -and $anyCaseFailed) { exit 1 }
if ($aggExit -ne 0) { exit $aggExit }
exit 0
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "fde-intake",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for fde-intake. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "fde-intake-smoke-1",
|
|
8
|
+
"name": "fde-intake produces a non-empty response",
|
|
9
|
+
"input": "synthetic fde-intake probe — canary smoke",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": ".+"
|
|
16
|
+
}
|
|
17
|
+
]
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "fde-intake-smoke-2",
|
|
21
|
+
"name": "fde-intake echoes case id",
|
|
22
|
+
"input": "case-id fde-intake-smoke-2",
|
|
23
|
+
"canary": false,
|
|
24
|
+
"grader_type": "script",
|
|
25
|
+
"expected_assertions": [
|
|
26
|
+
{
|
|
27
|
+
"type": "regex-match",
|
|
28
|
+
"pattern": "fde-intake-smoke-2"
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "fde-report",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for fde-report. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "fde-report-smoke-1",
|
|
8
|
+
"name": "fde-report produces a non-empty response",
|
|
9
|
+
"input": "synthetic fde-report probe — canary smoke",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": ".+"
|
|
16
|
+
}
|
|
17
|
+
]
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "fde-report-smoke-2",
|
|
21
|
+
"name": "fde-report echoes case id",
|
|
22
|
+
"input": "case-id fde-report-smoke-2",
|
|
23
|
+
"canary": false,
|
|
24
|
+
"grader_type": "script",
|
|
25
|
+
"expected_assertions": [
|
|
26
|
+
{
|
|
27
|
+
"type": "regex-match",
|
|
28
|
+
"pattern": "fde-report-smoke-2"
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "fde-triage",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for fde-triage. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "fde-triage-smoke-1",
|
|
8
|
+
"name": "fde-triage produces a non-empty response",
|
|
9
|
+
"input": "synthetic fde-triage probe — canary smoke",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": ".+"
|
|
16
|
+
}
|
|
17
|
+
]
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "fde-triage-smoke-2",
|
|
21
|
+
"name": "fde-triage echoes case id",
|
|
22
|
+
"input": "case-id fde-triage-smoke-2",
|
|
23
|
+
"canary": false,
|
|
24
|
+
"grader_type": "script",
|
|
25
|
+
"expected_assertions": [
|
|
26
|
+
{
|
|
27
|
+
"type": "regex-match",
|
|
28
|
+
"pattern": "fde-triage-smoke-2"
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "intro",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for intro. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "intro-smoke-1",
|
|
8
|
+
"name": "intro produces a non-empty response",
|
|
9
|
+
"input": "synthetic intro probe — canary smoke",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": ".+"
|
|
16
|
+
}
|
|
17
|
+
]
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "intro-smoke-2",
|
|
21
|
+
"name": "intro echoes case id",
|
|
22
|
+
"input": "case-id intro-smoke-2",
|
|
23
|
+
"canary": false,
|
|
24
|
+
"grader_type": "script",
|
|
25
|
+
"expected_assertions": [
|
|
26
|
+
{
|
|
27
|
+
"type": "regex-match",
|
|
28
|
+
"pattern": "intro-smoke-2"
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "link-entities",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Deterministic entity-graph output — closed edge taxonomy + nodes/edges schema.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "le-graph-schema",
|
|
8
|
+
"name": "fixture project-graph.json has v1 schema + nodes + edges",
|
|
9
|
+
"input": "validate graph schema",
|
|
10
|
+
"fixture": "evals/fixtures/fixture-acme",
|
|
11
|
+
"canary": true,
|
|
12
|
+
"grader_type": "script",
|
|
13
|
+
"expected_assertions": [
|
|
14
|
+
{ "type": "file-exists", "path": "Evidence/_graph/project-graph.json" },
|
|
15
|
+
{ "type": "json-path-equals", "path": "Evidence/_graph/project-graph.json", "json_path": "$.schema", "equals": "kushi.project-graph/v1" }
|
|
16
|
+
]
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"id": "le-closed-taxonomy",
|
|
20
|
+
"name": "every edge uses a kind from the closed taxonomy",
|
|
21
|
+
"input": "validate edges",
|
|
22
|
+
"fixture": "evals/fixtures/fixture-acme",
|
|
23
|
+
"canary": false,
|
|
24
|
+
"grader_type": "script",
|
|
25
|
+
"args": { "read_fixture": "Evidence/_graph/project-graph.json" },
|
|
26
|
+
"expected_assertions": [
|
|
27
|
+
{ "type": "regex-match", "pattern": "\"kind\":\\s*\"(references|decides|action-item-tracks|discusses|produced-by|follow-up-of|same-thread|participant-of)\"" }
|
|
28
|
+
]
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "project-status",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for project-status. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "project-status-smoke-1",
|
|
8
|
+
"name": "project-status produces a non-empty response",
|
|
9
|
+
"input": "synthetic project-status probe — canary smoke",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": ".+"
|
|
16
|
+
}
|
|
17
|
+
]
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "project-status-smoke-2",
|
|
21
|
+
"name": "project-status echoes case id",
|
|
22
|
+
"input": "case-id project-status-smoke-2",
|
|
23
|
+
"canary": false,
|
|
24
|
+
"grader_type": "script",
|
|
25
|
+
"expected_assertions": [
|
|
26
|
+
{
|
|
27
|
+
"type": "regex-match",
|
|
28
|
+
"pattern": "project-status-smoke-2"
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "propose-ado-update",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for propose-ado-update. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "propose-ado-update-smoke-1",
|
|
8
|
+
"name": "propose-ado-update produces a non-empty response",
|
|
9
|
+
"input": "synthetic propose-ado-update probe — canary smoke",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": ".+"
|
|
16
|
+
}
|
|
17
|
+
]
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "propose-ado-update-smoke-2",
|
|
21
|
+
"name": "propose-ado-update echoes case id",
|
|
22
|
+
"input": "case-id propose-ado-update-smoke-2",
|
|
23
|
+
"canary": false,
|
|
24
|
+
"grader_type": "script",
|
|
25
|
+
"expected_assertions": [
|
|
26
|
+
{
|
|
27
|
+
"type": "regex-match",
|
|
28
|
+
"pattern": "propose-ado-update-smoke-2"
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "pull-ado",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for pull-ado. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "pull-ado-cached-1",
|
|
8
|
+
"name": "pull-ado cached/dry-run produces output",
|
|
9
|
+
"input": "--cached --dry-run fixture-acme",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": "fixture-acme",
|
|
16
|
+
"flags": "i"
|
|
17
|
+
}
|
|
18
|
+
]
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"id": "pull-ado-rubric-1",
|
|
22
|
+
"name": "pull-ado output quality (LLM-rubric, skipped in canary)",
|
|
23
|
+
"input": "summarize fixture-acme pull-ado pulls",
|
|
24
|
+
"canary": false,
|
|
25
|
+
"grader_type": "llm",
|
|
26
|
+
"expected_assertions": [
|
|
27
|
+
{
|
|
28
|
+
"type": "llm-rubric",
|
|
29
|
+
"rubric": "Does the pull-ado response cite a source file path and an ISO timestamp?",
|
|
30
|
+
"min_score": 4
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "pull-crm",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for pull-crm. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "pull-crm-cached-1",
|
|
8
|
+
"name": "pull-crm cached/dry-run produces output",
|
|
9
|
+
"input": "--cached --dry-run fixture-acme",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": "fixture-acme",
|
|
16
|
+
"flags": "i"
|
|
17
|
+
}
|
|
18
|
+
]
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"id": "pull-crm-rubric-1",
|
|
22
|
+
"name": "pull-crm output quality (LLM-rubric, skipped in canary)",
|
|
23
|
+
"input": "summarize fixture-acme pull-crm pulls",
|
|
24
|
+
"canary": false,
|
|
25
|
+
"grader_type": "llm",
|
|
26
|
+
"expected_assertions": [
|
|
27
|
+
{
|
|
28
|
+
"type": "llm-rubric",
|
|
29
|
+
"rubric": "Does the pull-crm response cite a source file path and an ISO timestamp?",
|
|
30
|
+
"min_score": 4
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
}
|