kushi-agents 5.0.2 → 5.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/README.md +22 -0
  2. package/package.json +6 -2
  3. package/plugin/agents/kushi.agent.md +1 -1
  4. package/plugin/instructions/skill-evals.instructions.md +130 -0
  5. package/plugin/skills/aggregate-project/evals/evals.json +33 -0
  6. package/plugin/skills/apply-ado-update/evals/evals.json +33 -0
  7. package/plugin/skills/ask-project/evals/evals.json +34 -0
  8. package/plugin/skills/bootstrap-project/evals/evals.json +34 -0
  9. package/plugin/skills/build-state/evals/evals.json +31 -0
  10. package/plugin/skills/consolidate-evidence/evals/evals.json +33 -0
  11. package/plugin/skills/dashboard/evals/evals.json +33 -0
  12. package/plugin/skills/emit-vertex/evals/evals.json +33 -0
  13. package/plugin/skills/eval/SKILL.md +90 -0
  14. package/plugin/skills/eval/evals.schema.json +73 -0
  15. package/plugin/skills/eval/run-evals.ps1 +372 -0
  16. package/plugin/skills/fde-intake/evals/evals.json +33 -0
  17. package/plugin/skills/fde-report/evals/evals.json +33 -0
  18. package/plugin/skills/fde-triage/evals/evals.json +33 -0
  19. package/plugin/skills/intro/evals/evals.json +33 -0
  20. package/plugin/skills/link-entities/evals/evals.json +31 -0
  21. package/plugin/skills/project-status/evals/evals.json +33 -0
  22. package/plugin/skills/propose-ado-update/evals/evals.json +33 -0
  23. package/plugin/skills/pull-ado/evals/evals.json +35 -0
  24. package/plugin/skills/pull-crm/evals/evals.json +35 -0
  25. package/plugin/skills/pull-email/evals/evals.json +35 -0
  26. package/plugin/skills/pull-loop/evals/evals.json +35 -0
  27. package/plugin/skills/pull-meetings/evals/evals.json +35 -0
  28. package/plugin/skills/pull-misc/evals/evals.json +35 -0
  29. package/plugin/skills/pull-onenote/evals/evals.json +35 -0
  30. package/plugin/skills/pull-sharepoint/evals/evals.json +35 -0
  31. package/plugin/skills/pull-teams/evals/evals.json +35 -0
  32. package/plugin/skills/refresh-project/evals/evals.json +31 -0
  33. package/plugin/skills/self-check/SKILL.md +1 -0
  34. package/plugin/skills/self-check/evals/evals.json +28 -0
  35. package/plugin/skills/self-check/run.ps1 +63 -0
  36. package/plugin/skills/setup/evals/evals.json +33 -0
  37. package/plugin/skills/tour/evals/evals.json +33 -0
  38. package/plugin/skills/vertex-link/evals/evals.json +33 -0
  39. package/src/eval-aggregator.mjs +209 -0
  40. package/src/eval-aggregator.test.mjs +64 -0
  41. package/src/eval-runner.test.mjs +69 -0
@@ -0,0 +1,209 @@
1
+ #!/usr/bin/env node
2
+ // kushi v5.0.3 — eval-aggregator.mjs
3
+ //
4
+ // Reads one (or many) per-run JSON files produced by plugin/skills/eval/run-evals.ps1,
5
+ // computes per-skill pass_rate / mean_duration_ms / mean_tokens_total + stddev, writes
6
+ // Evidence/_evals/benchmark.json, optionally compares against evals/baseline.json and
7
+ // flags regressions:
8
+ // - pass_rate drop >= 10 percentage points
9
+ // - mean_duration >= 50 % slower
10
+ // - mean_tokens >= 50 % more
11
+ //
12
+ // With --update-baseline, overwrites evals/baseline.json with the current run's metrics.
13
+
14
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
16
+
17
/**
 * Thresholds at which a change versus the baseline counts as a regression.
 * Frozen so that no code in this module can loosen the gate at runtime.
 */
const REGRESSION = Object.freeze({
  // Pass-rate drop (baseline - current), in percentage points, that is flagged.
  passRatePpDrop: 10,
  // current / baseline ratio of mean duration at or above which a slowdown is flagged.
  durationFactor: 1.5,
  // current / baseline ratio of mean total tokens at or above which growth is flagged.
  tokensFactor: 1.5,
});
22
+
23
/**
 * Parses the aggregator's command-line flags.
 *
 * Recognised flags:
 *   --run <file>        add one per-run JSON file (repeatable)
 *   --runs-dir <dir>    add every *.json in <dir> except benchmark.json
 *   --bench <file>      where to write the benchmark output
 *   --baseline <file>   baseline file to compare against / update
 *   --update-baseline   boolean switch
 * Unrecognised arguments are ignored.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {{runs: string[], bench: (string|null), baseline: (string|null), updateBaseline: boolean}}
 * @throws {Error} When a value-taking flag is the last argument and has no value.
 */
function parseArgs(argv) {
  const out = { runs: [], bench: null, baseline: null, updateBaseline: false };

  // Returns the argument following the flag at `index`. Failing here gives a
  // clear message instead of letting `undefined` reach readFileSync/readdirSync.
  const takeValue = (index, flag) => {
    const value = argv[index + 1];
    if (value === undefined) {
      throw new Error(`eval-aggregator: ${flag} requires a value`);
    }
    return value;
  };

  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    switch (flag) {
      case '--run':
        out.runs.push(takeValue(i, flag));
        i += 1;
        break;
      case '--runs-dir': {
        const dir = takeValue(i, flag);
        i += 1;
        for (const f of fs.readdirSync(dir)) {
          // benchmark.json is this tool's own output, not a run file.
          if (f.endsWith('.json') && f !== 'benchmark.json') out.runs.push(path.join(dir, f));
        }
        break;
      }
      case '--bench':
        out.bench = takeValue(i, flag);
        i += 1;
        break;
      case '--baseline':
        out.baseline = takeValue(i, flag);
        i += 1;
        break;
      case '--update-baseline':
        out.updateBaseline = true;
        break;
      default:
        // Unknown arguments are ignored.
        break;
    }
  }
  return out;
}
40
+
41
/**
 * Reads and parses one per-run JSON file and checks that it has a `skills` array.
 *
 * @param {string} filePath - Path of the run file to read.
 * @returns {object} The parsed run object (with an array-valued `skills`).
 * @throws {SyntaxError} When the file is not valid JSON; the message names the file.
 * @throws {Error} When the parsed value has no `skills` array.
 */
export function loadRun(filePath) {
  const raw = fs.readFileSync(filePath, 'utf-8');
  // JSON.parse rejects a leading byte-order mark. Run files are written by a
  // PowerShell script, which may emit one depending on version/encoding
  // settings (NOTE(review): confirm against run-evals.ps1), so drop it.
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;

  let obj;
  try {
    obj = JSON.parse(text);
  } catch (err) {
    // Keep the SyntaxError type but say which file was malformed.
    throw new SyntaxError(`${filePath}: invalid JSON (${err.message})`, { cause: err });
  }

  if (!obj || !Array.isArray(obj.skills)) {
    throw new Error(`${filePath}: not a kushi.evals.run/v1 JSON (missing skills[])`);
  }
  return obj;
}
49
+
50
/**
 * Arithmetic mean of an array of numbers.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or 0 when `xs` is empty.
 */
function mean(xs) {
  let total = 0;
  for (const x of xs) {
    total += x;
  }
  return xs.length === 0 ? 0 : total / xs.length;
}
54
+
55
/**
 * Sample standard deviation (divides by n - 1) of an array of numbers.
 *
 * @param {number[]} xs - Values to measure.
 * @returns {number} The standard deviation, or 0 when fewer than two values are given.
 */
function stddev(xs) {
  const count = xs.length;
  if (count < 2) {
    return 0;
  }

  // Mean of the values (count is known to be non-zero here).
  let total = 0;
  for (const x of xs) {
    total += x;
  }
  const center = total / count;

  // Sum of squared deviations from the mean.
  let squares = 0;
  for (const x of xs) {
    squares += (x - center) * (x - center);
  }
  return Math.sqrt(squares / (count - 1));
}
61
+
62
/**
 * Folds one or more run objects into per-skill benchmark statistics.
 *
 * Cases whose `error` starts with "skipped" are excluded from every metric.
 * Token statistics only consider cases that reported a positive token total.
 * A skill entry whose `cases` is missing or not an array contributes no cases
 * (the skill is still listed, with n = 0) instead of aborting the aggregation.
 *
 * @param {Array<{skills: Array<{skill: string, cases?: object[]}>}>} runs - Parsed run objects.
 * @returns {{schema: string, generated_at: string, skills: object[]}} Benchmark
 *   with one row per skill, sorted by skill name.
 */
export function aggregate(runs) {
  // skill name -> [{ pass, duration_ms, tokens }]
  const bySkill = new Map();
  for (const run of runs) {
    for (const s of run.skills) {
      if (!bySkill.has(s.skill)) bySkill.set(s.skill, []);
      const bucket = bySkill.get(s.skill);
      // Tolerate a malformed skill entry rather than crash on `for...of undefined`.
      const cases = Array.isArray(s.cases) ? s.cases : [];
      for (const c of cases) {
        if (!c) continue;
        if (c.error && String(c.error).startsWith('skipped')) continue;
        bucket.push({
          pass: !!c.pass,
          duration_ms: c.duration_ms || 0,
          tokens: (c.tokens_in || 0) + (c.tokens_out || 0),
        });
      }
    }
  }

  const skills = [];
  for (const [skill, cases] of bySkill.entries()) {
    const n = cases.length;
    const passes = cases.filter((c) => c.pass).length;
    const durations = cases.map((c) => c.duration_ms);
    // Zero totals mean "no token data", so they are left out of the token stats.
    const tokens = cases.map((c) => c.tokens).filter((t) => t > 0);
    skills.push({
      skill,
      n,
      pass_rate: n === 0 ? 0 : passes / n,
      mean_duration_ms: Math.round(mean(durations)),
      stddev_duration_ms: Math.round(stddev(durations)),
      mean_tokens_total: Math.round(mean(tokens)),
      stddev_tokens_total: Math.round(stddev(tokens)),
    });
  }
  skills.sort((a, b) => a.skill.localeCompare(b.skill));
  return { schema: 'kushi.evals.benchmark/v1', generated_at: new Date().toISOString(), skills };
}
97
+
98
/**
 * Compares benchmark rows against a baseline and lists the metrics that regressed.
 *
 * A skill is only compared when the baseline has a row with the same name.
 * A current row with `n === 0` (no graded cases) is skipped, because its
 * pass_rate of 0 means "nothing was measured", not "everything failed".
 *
 * @param {{skills: object[]}} bench - Current benchmark (as produced by `aggregate`).
 * @param {{skills: object[]}|null|undefined} baseline - Baseline to compare against.
 * @param {{passRatePpDrop: number, durationFactor: number, tokensFactor: number}} [thresholds]
 *   Regression thresholds; defaults to the module-level REGRESSION values.
 * @returns {object[]} One entry per regressed metric; empty when there is no usable baseline.
 */
export function detectRegressions(bench, baseline, thresholds = REGRESSION) {
  if (!baseline || !Array.isArray(baseline.skills)) return [];
  const byName = new Map(baseline.skills.map((s) => [s.skill, s]));
  const regs = [];

  // Flags `metric` when current / baseline reaches `limit`. A baseline that is
  // not positive gives no meaningful ratio, so it is not compared.
  const checkFactor = (cur, base, metric, limit) => {
    if (!(base[metric] > 0)) return;
    const factor = cur[metric] / base[metric];
    if (factor >= limit) {
      regs.push({
        skill: cur.skill,
        metric,
        baseline: base[metric],
        current: cur[metric],
        factor: Number(factor.toFixed(2)),
        threshold_factor: limit,
      });
    }
  };

  for (const cur of bench.skills) {
    const base = byName.get(cur.skill);
    if (!base) continue;
    if (cur.n === 0) continue;

    // Round away binary floating-point noise before comparing: (1.0 - 0.9) * 100
    // evaluates to 9.999999999999998, which would miss an exact 10 pp drop.
    const ppDrop = Math.round((base.pass_rate - cur.pass_rate) * 100 * 1e6) / 1e6;
    if (ppDrop >= thresholds.passRatePpDrop) {
      regs.push({
        skill: cur.skill,
        metric: 'pass_rate',
        baseline: base.pass_rate,
        current: cur.pass_rate,
        delta_pp: Number(ppDrop.toFixed(1)),
        threshold_pp: thresholds.passRatePpDrop,
      });
    }
    checkFactor(cur, base, 'mean_duration_ms', thresholds.durationFactor);
    checkFactor(cur, base, 'mean_tokens_total', thresholds.tokensFactor);
  }
  return regs;
}
145
+
146
/**
 * CLI entry point: loads the run files, aggregates them, compares against the
 * baseline (when one is given and exists), writes the benchmark to --bench or
 * stdout, optionally rewrites the baseline, and reports regressions.
 *
 * Exit status is set through `process.exitCode` rather than `process.exit()`
 * so that output already queued on stdout/stderr is flushed before the process
 * ends: 2 when no run file was supplied, 1 when regressions were flagged.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));
  if (args.runs.length === 0) {
    console.error('eval-aggregator: at least one --run <file> required');
    process.exitCode = 2;
    return;
  }
  if (args.updateBaseline && !args.baseline) {
    // Previously a silent no-op; tell the user why nothing will be written.
    console.error('eval-aggregator: --update-baseline ignored because no --baseline <file> was given');
  }

  const runs = args.runs.map(loadRun);
  const bench = aggregate(runs);

  // A missing baseline file is not an error: there is simply nothing to compare with.
  let baseline = null;
  if (args.baseline && fs.existsSync(args.baseline)) {
    baseline = JSON.parse(fs.readFileSync(args.baseline, 'utf-8'));
  }
  const regressions = detectRegressions(bench, baseline);
  bench.summary = {
    n_skills: bench.skills.length,
    mean_pass_rate: Number(mean(bench.skills.map((s) => s.pass_rate)).toFixed(3)),
    regressions,
  };

  const benchJson = JSON.stringify(bench, null, 2);
  if (args.bench) {
    fs.mkdirSync(path.dirname(args.bench), { recursive: true });
    fs.writeFileSync(args.bench, benchJson);
    console.log(`eval-aggregator: wrote ${args.bench}`);
  } else {
    process.stdout.write(benchJson);
  }

  if (args.updateBaseline && args.baseline) {
    const nextBaseline = {
      schema: 'kushi.evals.baseline/v1',
      generated_at: bench.generated_at,
      skills: bench.skills,
    };
    fs.mkdirSync(path.dirname(args.baseline), { recursive: true });
    fs.writeFileSync(args.baseline, JSON.stringify(nextBaseline, null, 2));
    console.log(`eval-aggregator: baseline updated -> ${args.baseline}`);
  }

  if (regressions.length > 0) {
    console.error(`eval-aggregator: ${regressions.length} regression(s) flagged:`);
    for (const r of regressions) console.error(' -', JSON.stringify(r));
    process.exitCode = 1;
  }
}
198
+
199
// Only run when invoked directly, not when imported by tests.
//
// The module's own path is taken with fileURLToPath() because URL#pathname
// stays percent-encoded (a space becomes "%20"), so a checkout under a path
// with spaces or non-ASCII characters would never match argv[1] and the CLI
// would silently do nothing. fileURLToPath() also handles Windows drive letters.
const isDirect = (() => {
  try {
    if (!process.argv[1]) return false;
    const argv1 = path.resolve(process.argv[1]);
    const here = fileURLToPath(import.meta.url);
    // Also compare the real path, in case the script was started through a symlink.
    return argv1 === here || fs.realpathSync(argv1) === here;
  } catch {
    // Anything unexpected (e.g. an unresolvable path) means "not the entry point".
    return false;
  }
})();
if (isDirect) {
  main().catch((e) => {
    console.error(e);
    process.exit(2);
  });
}
@@ -0,0 +1,64 @@
1
+ // kushi v5.0.3 — eval-aggregator unit tests.
2
+ //
3
+ // Sanity-checks aggregation + regression detection logic. Pure functions, no IO
4
+ // against the real repo.
5
+
6
+ import test from 'node:test';
7
+ import assert from 'node:assert/strict';
8
+ import { aggregate, detectRegressions } from './eval-aggregator.mjs';
9
+
10
/**
 * Builds a minimal run object containing a single skill entry.
 *
 * @param {string} skill - Skill name for the entry.
 * @param {object[]} cases - Case results for that skill.
 * @returns {{schema: string, mode: string, skills: Array<{skill: string, cases: object[]}>}}
 */
function makeRun(skill, cases) {
  const entry = { skill, cases };
  const run = { schema: 'kushi.evals.run/v1', mode: 'test' };
  run.skills = [entry];
  return run;
}
17
+
18
// Aggregation: three graded cases, two of which pass.
test('aggregate: computes pass_rate per skill', () => {
  const cases = [
    { pass: true, duration_ms: 100, tokens_in: 10, tokens_out: 5 },
    { pass: false, duration_ms: 200, tokens_in: 10, tokens_out: 5 },
    { pass: true, duration_ms: 150, tokens_in: 10, tokens_out: 5 },
  ];
  const { skills } = aggregate([makeRun('foo', cases)]);
  assert.equal(skills.length, 1);
  const [foo] = skills;
  assert.equal(foo.skill, 'foo');
  assert.equal(foo.n, 3);
  assert.equal(foo.pass_rate.toFixed(2), '0.67');
  assert.equal(foo.mean_duration_ms, 150);
});

// Aggregation: a case whose error starts with "skipped" is not counted.
test('aggregate: ignores skipped cases', () => {
  const graded = { pass: true, duration_ms: 100, tokens_in: 1, tokens_out: 1 };
  const skipped = { pass: false, duration_ms: 0, tokens_in: 0, tokens_out: 0, error: 'skipped: llm needs -Live' };
  const { skills } = aggregate([makeRun('foo', [graded, skipped])]);
  assert.equal(skills[0].n, 1);
  assert.equal(skills[0].pass_rate, 1);
});

// Regression detection: a 15 pp pass-rate drop yields exactly one pass_rate entry.
test('detectRegressions: pass-rate drop >= 10pp flags', () => {
  const baseline = { skills: [{ skill: 'foo', pass_rate: 1.0, mean_duration_ms: 100, mean_tokens_total: 50 }] };
  const bench = { skills: [{ skill: 'foo', pass_rate: 0.85, mean_duration_ms: 100, mean_tokens_total: 50 }] };
  const [first, ...rest] = detectRegressions(bench, baseline);
  assert.equal(rest.length + (first === undefined ? 0 : 1), 1);
  assert.equal(first.metric, 'pass_rate');
  assert.ok(first.delta_pp >= 10);
});

// Regression detection: 1.4x the baseline duration passes, 1.6x is flagged.
test('detectRegressions: latency factor >=1.5x flags; <1.5x does not', () => {
  const withDuration = (mean_duration_ms) => ({
    skills: [{ skill: 'foo', pass_rate: 1.0, mean_duration_ms, mean_tokens_total: 50 }],
  });
  const baseline = withDuration(100);
  const fast = withDuration(140);
  const slow = withDuration(160);
  assert.equal(detectRegressions(fast, baseline).length, 0);
  const slowRegs = detectRegressions(slow, baseline);
  assert.equal(slowRegs.length, 1);
  assert.equal(slowRegs[0].metric, 'mean_duration_ms');
});

// Regression detection: a skill absent from the baseline is never flagged.
test('detectRegressions: no-op when baseline missing the skill', () => {
  const baseline = { skills: [{ skill: 'other', pass_rate: 1.0, mean_duration_ms: 100, mean_tokens_total: 50 }] };
  const bench = { skills: [{ skill: 'foo', pass_rate: 0.0, mean_duration_ms: 1000, mean_tokens_total: 99999 }] };
  const regs = detectRegressions(bench, baseline);
  assert.deepEqual(regs, []);
});
@@ -0,0 +1,69 @@
1
+ // kushi v5.0.3 — eval-runner smoke tests.
2
+ //
3
+ // Asserts the runner script + schema + at least the canary evals files are
4
+ // present and parseable. Does NOT exec pwsh (keeps the test suite OS-agnostic
5
+ // and fast — the pwsh integration is covered by self-check D33 + the
6
+ // `npm run eval:canary` smoke).
7
+
8
+ import test from 'node:test';
9
+ import assert from 'node:assert/strict';
10
+ import fs from 'node:fs';
11
+ import path from 'node:path';
12
+ import { fileURLToPath } from 'node:url';
13
+
14
// Package root: one directory above the folder that holds this test file.
const thisFile = fileURLToPath(import.meta.url);
const repoRoot = path.resolve(path.dirname(thisFile), '..');

// The runner script must exist and mention its param block and three switches.
test('eval-runner: run-evals.ps1 ships in plugin/skills/eval/', () => {
  const p = path.join(repoRoot, 'plugin/skills/eval/run-evals.ps1');
  assert.ok(fs.existsSync(p), `missing: ${p}`);
  const txt = fs.readFileSync(p, 'utf-8');
  const expectations = [
    [/param\(/, 'run-evals.ps1 has no [CmdletBinding] param block'],
    [/\$Canary/, 'run-evals.ps1 missing -Canary flag'],
    [/\$All/, 'run-evals.ps1 missing -All flag'],
    [/\$UpdateBaseline/, 'run-evals.ps1 missing -UpdateBaseline flag'],
  ];
  for (const [pattern, message] of expectations) {
    assert.match(txt, pattern, message);
  }
});

// The schema file must parse and declare the expected title and case definition.
test('eval-runner: evals.schema.json validates structurally', () => {
  const p = path.join(repoRoot, 'plugin/skills/eval/evals.schema.json');
  assert.ok(fs.existsSync(p), `missing: ${p}`);
  const s = JSON.parse(fs.readFileSync(p, 'utf-8'));
  assert.equal(s.title, 'Kushi per-skill evals file');
  assert.ok(s.definitions && s.definitions.case);
  const requiredFields = s.definitions.case.required;
  assert.ok(requiredFields.includes('expected_assertions'));
});

// Every skill directory other than eval/ needs a well-formed evals/evals.json.
test('eval-runner: every plugin/skills/<name>/ (except eval/) has evals/evals.json that parses', () => {
  const skillsDir = path.join(repoRoot, 'plugin/skills');
  const missing = [];
  for (const entry of fs.readdirSync(skillsDir, { withFileTypes: true })) {
    if (!entry.isDirectory() || entry.name === 'eval') continue;
    const skill = entry.name;
    const f = path.join(skillsDir, skill, 'evals/evals.json');
    if (!fs.existsSync(f)) {
      // Collected and reported together at the end.
      missing.push(skill);
      continue;
    }
    const obj = JSON.parse(fs.readFileSync(f, 'utf-8'));
    assert.equal(obj.skill, skill, `${skill}: evals.json declares skill '${obj.skill}'`);
    assert.ok(Array.isArray(obj.cases) && obj.cases.length >= 2, `${skill}: needs >=2 cases`);
    for (const c of obj.cases) {
      assert.ok(c.id && c.name && c.input !== undefined, `${skill}/${c.id}: missing core fields`);
      assert.ok(
        Array.isArray(c.expected_assertions) && c.expected_assertions.length >= 1,
        `${skill}/${c.id}: needs >=1 assertion`,
      );
      assert.ok(['script', 'llm'].includes(c.grader_type), `${skill}/${c.id}: bad grader_type`);
    }
  }
  assert.deepEqual(missing, [], `skills missing evals/evals.json: ${missing.join(', ')}`);
});

// Each of the six canary skills must have at least one case marked canary: true.
test('eval-runner: canary subset includes the documented six skills', () => {
  const expected = new Set([
    'ask-project', 'bootstrap-project', 'refresh-project',
    'link-entities', 'build-state', 'self-check',
  ]);
  const found = [...expected].filter((skill) => {
    const f = path.join(repoRoot, 'plugin/skills', skill, 'evals/evals.json');
    const obj = JSON.parse(fs.readFileSync(f, 'utf-8'));
    return obj.cases.some((c) => c.canary === true);
  });
  assert.deepEqual(found.sort(), [...expected].sort());
});