kushi-agents 5.0.2 → 5.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -0
- package/bin/cli.mjs +103 -0
- package/package.json +6 -2
- package/plugin/agents/kushi.agent.md +3 -1
- package/plugin/instructions/skill-authoring.instructions.md +147 -0
- package/plugin/instructions/skill-evals.instructions.md +130 -0
- package/plugin/skills/aggregate-project/evals/evals.json +33 -0
- package/plugin/skills/apply-ado-update/evals/evals.json +33 -0
- package/plugin/skills/ask-project/SKILL.md +10 -0
- package/plugin/skills/ask-project/evals/evals.json +34 -0
- package/plugin/skills/bootstrap-project/evals/evals.json +34 -0
- package/plugin/skills/build-state/evals/evals.json +31 -0
- package/plugin/skills/consolidate-evidence/evals/evals.json +33 -0
- package/plugin/skills/dashboard/evals/evals.json +33 -0
- package/plugin/skills/emit-vertex/evals/evals.json +33 -0
- package/plugin/skills/eval/SKILL.md +90 -0
- package/plugin/skills/eval/evals.schema.json +73 -0
- package/plugin/skills/eval/run-evals.ps1 +372 -0
- package/plugin/skills/fde-intake/evals/evals.json +33 -0
- package/plugin/skills/fde-report/evals/evals.json +33 -0
- package/plugin/skills/fde-triage/evals/evals.json +33 -0
- package/plugin/skills/intro/SKILL.md +160 -451
- package/plugin/skills/intro/evals/evals.json +33 -0
- package/plugin/skills/intro/references/walkthrough.md +310 -0
- package/plugin/skills/link-entities/evals/evals.json +31 -0
- package/plugin/skills/project-status/SKILL.md +10 -1
- package/plugin/skills/project-status/evals/evals.json +33 -0
- package/plugin/skills/propose-ado-update/evals/evals.json +33 -0
- package/plugin/skills/pull-ado/evals/evals.json +35 -0
- package/plugin/skills/pull-crm/evals/evals.json +35 -0
- package/plugin/skills/pull-email/evals/evals.json +35 -0
- package/plugin/skills/pull-loop/evals/evals.json +35 -0
- package/plugin/skills/pull-meetings/evals/evals.json +35 -0
- package/plugin/skills/pull-misc/evals/evals.json +35 -0
- package/plugin/skills/pull-onenote/evals/evals.json +35 -0
- package/plugin/skills/pull-sharepoint/evals/evals.json +35 -0
- package/plugin/skills/pull-teams/evals/evals.json +35 -0
- package/plugin/skills/refresh-project/evals/evals.json +31 -0
- package/plugin/skills/self-check/SKILL.md +2 -0
- package/plugin/skills/self-check/evals/evals.json +28 -0
- package/plugin/skills/self-check/run.ps1 +144 -0
- package/plugin/skills/setup/SKILL.md +10 -0
- package/plugin/skills/setup/evals/evals.json +33 -0
- package/plugin/skills/skill-checker/SKILL.md +136 -0
- package/plugin/skills/skill-checker/check-skill.ps1 +416 -0
- package/plugin/skills/skill-checker/evals/evals.json +41 -0
- package/plugin/skills/skill-creator/SKILL.md +134 -0
- package/plugin/skills/skill-creator/evals/evals.json +40 -0
- package/plugin/skills/skill-creator/generate-eval-review.ps1 +101 -0
- package/plugin/skills/skill-creator/optimize-description.ps1 +87 -0
- package/plugin/skills/skill-creator/scaffold.ps1 +180 -0
- package/plugin/skills/skill-creator/templates/evals-starter.template.json +27 -0
- package/plugin/skills/skill-creator/templates/gotchas-stub.template.md +9 -0
- package/plugin/skills/skill-creator/templates/skill-skeleton.template.md +28 -0
- package/plugin/skills/tour/evals/evals.json +33 -0
- package/plugin/skills/vertex-link/SKILL.md +10 -0
- package/plugin/skills/vertex-link/evals/evals.json +33 -0
- package/src/eval-aggregator.mjs +209 -0
- package/src/eval-aggregator.test.mjs +64 -0
- package/src/eval-runner.test.mjs +69 -0
- package/src/skill-checker.test.mjs +118 -0
- package/src/skill-creator.test.mjs +92 -0
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// kushi v5.0.3 — eval-aggregator.mjs
|
|
3
|
+
//
|
|
4
|
+
// Reads one (or many) per-run JSON files produced by plugin/skills/eval/run-evals.ps1,
|
|
5
|
+
// computes per-skill pass_rate / mean_duration_ms / mean_tokens_total + stddev, writes
|
|
6
|
+
// Evidence/_evals/benchmark.json, optionally compares against evals/baseline.json and
|
|
7
|
+
// flags regressions:
|
|
8
|
+
// - pass_rate drop >= 10 percentage points
|
|
9
|
+
// - mean_duration >= 50 % slower
|
|
10
|
+
// - mean_tokens >= 50 % more
|
|
11
|
+
//
|
|
12
|
+
// With --update-baseline, overwrites evals/baseline.json with the current run's metrics.
|
|
13
|
+
|
|
14
|
+
import fs from 'node:fs';
|
|
15
|
+
import path from 'node:path';
|
|
16
|
+
|
|
17
|
+
/**
 * Thresholds at which a metric is reported as a regression against the baseline.
 * Frozen so the limits cannot be altered by accident at runtime.
 */
const REGRESSION = Object.freeze({
  // Drop in pass rate (baseline minus current), in percentage points.
  passRatePpDrop: 10,
  // Ratio current / baseline of the mean duration (1.5 = 50 % slower).
  durationFactor: 1.5,
  // Ratio current / baseline of the mean token total (1.5 = 50 % more).
  tokensFactor: 1.5,
});
|
|
22
|
+
|
|
23
|
+
/**
 * Parses the aggregator's command-line flags.
 *
 * Recognised flags: `--run <file>` (repeatable), `--runs-dir <dir>` (adds every
 * `*.json` entry of the directory except `benchmark.json`), `--bench <file>`,
 * `--baseline <file>` and `--update-baseline`. Unrecognised arguments are ignored.
 *
 * @param {string[]} argv - Arguments following the script path.
 * @returns {{runs: string[], bench: (string|null), baseline: (string|null), updateBaseline: boolean}}
 * @throws {Error} When a value-taking flag is the last argument and has no value.
 */
function parseArgs(argv) {
  const out = { runs: [], bench: null, baseline: null, updateBaseline: false };
  let i = 0;
  // Consumes the argument after a flag. A flag with nothing after it is a usage
  // error; report it here rather than letting `undefined` travel on as a path.
  const valueFor = (flag) => {
    i += 1;
    if (i >= argv.length) {
      throw new Error(`eval-aggregator: ${flag} requires a value`);
    }
    return argv[i];
  };
  for (; i < argv.length; i++) {
    const flag = argv[i];
    switch (flag) {
      case '--run':
        out.runs.push(valueFor(flag));
        break;
      case '--runs-dir': {
        const dir = valueFor(flag);
        for (const f of fs.readdirSync(dir)) {
          // benchmark.json is excluded by name so it is never treated as a run file.
          if (f.endsWith('.json') && f !== 'benchmark.json') out.runs.push(path.join(dir, f));
        }
        break;
      }
      case '--bench':
        out.bench = valueFor(flag);
        break;
      case '--baseline':
        out.baseline = valueFor(flag);
        break;
      case '--update-baseline':
        out.updateBaseline = true;
        break;
      default:
        break;
    }
  }
  return out;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Reads and parses one per-run JSON file.
 *
 * @param {string} filePath - Path of the run file.
 * @returns {object} The parsed run object; guaranteed to have a `skills` array.
 * @throws {SyntaxError} When the file is not valid JSON.
 * @throws {Error} When the parsed value has no `skills` array.
 */
export function loadRun(filePath) {
  const raw = fs.readFileSync(filePath, 'utf-8');
  // Decoding as UTF-8 keeps a leading byte-order mark, which JSON.parse rejects.
  // Drop it so a run file saved with a BOM still loads.
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
  const obj = JSON.parse(text);
  if (!obj || !Array.isArray(obj.skills)) {
    throw new Error(`${filePath}: not a kushi.evals.run/v1 JSON (missing skills[])`);
  }
  return obj;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean(xs) {
  const count = xs.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  for (const x of xs) {
    total += x;
  }
  return total / count;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Sample standard deviation (divides by n - 1) of a list of numbers.
 *
 * @param {number[]} xs - Values to measure.
 * @returns {number} The standard deviation, or 0 when fewer than two values are given.
 */
function stddev(xs) {
  const count = xs.length;
  if (count < 2) {
    return 0;
  }
  const center = mean(xs);
  let squaredSum = 0;
  for (const x of xs) {
    squaredSum += (x - center) * (x - center);
  }
  return Math.sqrt(squaredSum / (count - 1));
}
|
|
61
|
+
|
|
62
|
+
/**
 * Folds one or more run objects into per-skill benchmark metrics.
 *
 * Cases whose `error` starts with "skipped" are left out entirely. Token
 * statistics only consider cases that reported a positive token total, while
 * duration statistics consider every counted case.
 *
 * @param {Array<{skills: Array<{skill: string, cases?: object[]}>}>} runs - Parsed run objects.
 * @returns {{schema: string, generated_at: string, skills: object[]}} Benchmark
 *   with one entry per skill, sorted by skill name.
 */
export function aggregate(runs) {
  // skill name -> [{ pass, duration_ms, tokens }]
  const bySkill = new Map();
  for (const run of runs) {
    for (const s of run.skills) {
      let samples = bySkill.get(s.skill);
      if (!samples) {
        samples = [];
        bySkill.set(s.skill, samples);
      }
      // A skill entry without a cases array is treated like one with no cases
      // instead of aborting the whole aggregation with "not iterable".
      const cases = Array.isArray(s.cases) ? s.cases : [];
      for (const c of cases) {
        if (c.error && String(c.error).startsWith('skipped')) continue;
        samples.push({
          pass: !!c.pass,
          duration_ms: c.duration_ms || 0,
          tokens: (c.tokens_in || 0) + (c.tokens_out || 0),
        });
      }
    }
  }

  const skills = [];
  for (const [skill, samples] of bySkill.entries()) {
    const n = samples.length;
    const passes = samples.filter((c) => c.pass).length;
    const durations = samples.map((c) => c.duration_ms);
    // Cases without token counts would drag the mean towards zero, so they are excluded.
    const tokens = samples.map((c) => c.tokens).filter((t) => t > 0);
    skills.push({
      skill,
      n,
      pass_rate: n === 0 ? 0 : passes / n,
      mean_duration_ms: Math.round(mean(durations)),
      stddev_duration_ms: Math.round(stddev(durations)),
      mean_tokens_total: Math.round(mean(tokens)),
      stddev_tokens_total: Math.round(stddev(tokens)),
    });
  }
  skills.sort((a, b) => a.skill.localeCompare(b.skill));
  return { schema: 'kushi.evals.benchmark/v1', generated_at: new Date().toISOString(), skills };
}
|
|
97
|
+
|
|
98
|
+
/**
 * Compares a benchmark against a baseline and lists every regressed metric.
 *
 * A skill is only checked when it appears in both inputs. Reported metrics:
 * - `pass_rate`: dropped by at least `REGRESSION.passRatePpDrop` percentage points;
 * - `mean_duration_ms` / `mean_tokens_total`: current / baseline ratio reached
 *   the matching factor (only checked when the baseline value is positive).
 *
 * @param {{skills: object[]}} bench - Current benchmark.
 * @param {({skills: object[]}|null|undefined)} baseline - Baseline to compare against.
 * @returns {object[]} One record per regressed metric; empty when there is no usable baseline.
 */
export function detectRegressions(bench, baseline) {
  if (!baseline || !Array.isArray(baseline.skills)) return [];
  const byName = new Map(baseline.skills.map((s) => [s.skill, s]));
  // Ratio-based checks, in reporting order.
  const factorChecks = [
    ['mean_duration_ms', REGRESSION.durationFactor],
    ['mean_tokens_total', REGRESSION.tokensFactor],
  ];
  const regs = [];
  for (const cur of bench.skills) {
    const base = byName.get(cur.skill);
    if (!base) continue;

    // Binary floating point turns (1.0 - 0.9) * 100 into 9.999999999999998, which
    // would slip under a ">= 10" threshold. Rounding to six decimals removes that
    // representation noise before the comparison.
    const ppDrop = Math.round((base.pass_rate - cur.pass_rate) * 100 * 1e6) / 1e6;
    if (ppDrop >= REGRESSION.passRatePpDrop) {
      regs.push({
        skill: cur.skill,
        metric: 'pass_rate',
        baseline: base.pass_rate,
        current: cur.pass_rate,
        delta_pp: Number(ppDrop.toFixed(1)),
        threshold_pp: REGRESSION.passRatePpDrop,
      });
    }

    for (const [metric, threshold] of factorChecks) {
      const baseValue = base[metric];
      // A zero or missing baseline gives no meaningful ratio.
      if (!(baseValue > 0)) continue;
      const factor = cur[metric] / baseValue;
      if (factor >= threshold) {
        regs.push({
          skill: cur.skill,
          metric,
          baseline: baseValue,
          current: cur[metric],
          factor: Number(factor.toFixed(2)),
          threshold_factor: threshold,
        });
      }
    }
  }
  return regs;
}
|
|
145
|
+
|
|
146
|
+
/**
 * CLI entry point: loads the requested run files, aggregates them, compares the
 * result with the optional baseline and writes the benchmark.
 *
 * Output goes to the `--bench` file when given, otherwise to stdout. With both
 * `--update-baseline` and `--baseline`, the baseline file is rewritten from the
 * current metrics. The exit code is 2 when no run file was supplied and 1 when
 * at least one regression was flagged.
 *
 * Exit codes are set through `process.exitCode` rather than `process.exit()`,
 * so pending stdout/stderr writes are flushed before the process ends.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));
  if (args.runs.length === 0) {
    console.error('eval-aggregator: at least one --run <file> required');
    process.exitCode = 2;
    return;
  }

  const runs = args.runs.map(loadRun);
  const bench = aggregate(runs);

  // A baseline path that does not exist yet simply means "nothing to compare against".
  let baseline = null;
  if (args.baseline && fs.existsSync(args.baseline)) {
    baseline = JSON.parse(fs.readFileSync(args.baseline, 'utf-8'));
  }
  const regressions = detectRegressions(bench, baseline);
  bench.summary = {
    n_skills: bench.skills.length,
    mean_pass_rate: Number(mean(bench.skills.map((s) => s.pass_rate)).toFixed(3)),
    regressions,
  };

  if (args.bench) {
    fs.mkdirSync(path.dirname(args.bench), { recursive: true });
    fs.writeFileSync(args.bench, JSON.stringify(bench, null, 2));
    console.log(`eval-aggregator: wrote ${args.bench}`);
  } else {
    process.stdout.write(JSON.stringify(bench, null, 2));
  }

  if (args.updateBaseline && args.baseline) {
    fs.mkdirSync(path.dirname(args.baseline), { recursive: true });
    fs.writeFileSync(
      args.baseline,
      JSON.stringify(
        {
          schema: 'kushi.evals.baseline/v1',
          generated_at: bench.generated_at,
          skills: bench.skills,
        },
        null,
        2,
      ),
    );
    console.log(`eval-aggregator: baseline updated -> ${args.baseline}`);
  }

  if (regressions.length > 0) {
    console.error(`eval-aggregator: ${regressions.length} regression(s) flagged:`);
    for (const r of regressions) console.error(' -', JSON.stringify(r));
    // Not process.exit(1): the benchmark JSON written to stdout above may still be in flight.
    process.exitCode = 1;
  }
}
|
|
198
|
+
|
|
199
|
+
// Only run when invoked directly, not when imported by tests.
const isDirect = (() => {
  try {
    const argv1 = path.resolve(process.argv[1] || '');
    // URL pathnames are percent-encoded (a space becomes %20), so decode before
    // comparing with the plain filesystem path in argv[1].
    const pathname = decodeURIComponent(new URL(import.meta.url).pathname);
    // On Windows the pathname looks like "/C:/..."; drop the leading slash.
    const here = path.resolve(pathname.replace(/^\/([A-Za-z]:)/, '$1'));
    if (argv1 === here) return true;
    // argv[1] may name a symlink while import.meta.url normally reflects the
    // resolved target, so also compare against the real path.
    return fs.realpathSync(argv1) === here;
  } catch { return false; }
})();
if (isDirect) {
  main().catch((e) => { console.error(e); process.exit(2); });
}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
// kushi v5.0.3 — eval-aggregator unit tests.
|
|
2
|
+
//
|
|
3
|
+
// Sanity-checks aggregation + regression detection logic. Pure functions, no IO
|
|
4
|
+
// against the real repo.
|
|
5
|
+
|
|
6
|
+
import test from 'node:test';
|
|
7
|
+
import assert from 'node:assert/strict';
|
|
8
|
+
import { aggregate, detectRegressions } from './eval-aggregator.mjs';
|
|
9
|
+
|
|
10
|
+
/**
 * Builds a minimal run object holding a single skill.
 *
 * @param {string} skill - Skill name.
 * @param {object[]} cases - Case results for that skill.
 * @returns {{schema: string, mode: string, skills: object[]}}
 */
function makeRun(skill, cases) {
  const entry = { skill, cases };
  const run = { schema: 'kushi.evals.run/v1', mode: 'test', skills: [entry] };
  return run;
}
|
|
17
|
+
|
|
18
|
+
test('aggregate: computes pass_rate per skill', () => {
  // Two of three cases pass; durations average 150 ms.
  const cases = [
    { pass: true, duration_ms: 100, tokens_in: 10, tokens_out: 5 },
    { pass: false, duration_ms: 200, tokens_in: 10, tokens_out: 5 },
    { pass: true, duration_ms: 150, tokens_in: 10, tokens_out: 5 },
  ];
  const { skills } = aggregate([makeRun('foo', cases)]);
  assert.equal(skills.length, 1);
  const [foo] = skills;
  assert.equal(foo.skill, 'foo');
  assert.equal(foo.n, 3);
  assert.equal(foo.pass_rate.toFixed(2), '0.67');
  assert.equal(foo.mean_duration_ms, 150);
});
|
|
31
|
+
|
|
32
|
+
test('aggregate: ignores skipped cases', () => {
  // The second case is marked skipped and must not count towards n or pass_rate.
  const counted = { pass: true, duration_ms: 100, tokens_in: 1, tokens_out: 1 };
  const skipped = { pass: false, duration_ms: 0, tokens_in: 0, tokens_out: 0, error: 'skipped: llm needs -Live' };
  const [foo] = aggregate([makeRun('foo', [counted, skipped])]).skills;
  assert.equal(foo.n, 1);
  assert.equal(foo.pass_rate, 1);
});
|
|
41
|
+
|
|
42
|
+
test('detectRegressions: pass-rate drop >= 10pp flags', () => {
  // Only the pass rate differs between baseline and current (1.0 -> 0.85).
  const reference = { skill: 'foo', pass_rate: 1.0, mean_duration_ms: 100, mean_tokens_total: 50 };
  const current = { skill: 'foo', pass_rate: 0.85, mean_duration_ms: 100, mean_tokens_total: 50 };
  const regs = detectRegressions({ skills: [current] }, { skills: [reference] });
  assert.equal(regs.length, 1);
  const [only] = regs;
  assert.equal(only.metric, 'pass_rate');
  assert.ok(only.delta_pp >= 10);
});
|
|
50
|
+
|
|
51
|
+
test('detectRegressions: latency factor >=1.5x flags; <1.5x does not', () => {
  // Builds a one-skill benchmark that differs from the baseline only in duration.
  const withDuration = (ms) => ({
    skills: [{ skill: 'foo', pass_rate: 1.0, mean_duration_ms: ms, mean_tokens_total: 50 }],
  });
  const baseline = withDuration(100);
  const fast = withDuration(140);
  const slow = withDuration(160);
  assert.equal(detectRegressions(fast, baseline).length, 0);
  const slowRegs = detectRegressions(slow, baseline);
  assert.equal(slowRegs.length, 1);
  assert.equal(slowRegs[0].metric, 'mean_duration_ms');
});
|
|
59
|
+
|
|
60
|
+
test('detectRegressions: no-op when baseline missing the skill', () => {
  // The baseline only knows 'other', so 'foo' has nothing to be compared with.
  const known = { skill: 'other', pass_rate: 1.0, mean_duration_ms: 100, mean_tokens_total: 50 };
  const unknown = { skill: 'foo', pass_rate: 0.0, mean_duration_ms: 1000, mean_tokens_total: 99999 };
  const regs = detectRegressions({ skills: [unknown] }, { skills: [known] });
  assert.deepEqual(regs, []);
});
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
// kushi v5.0.3 — eval-runner smoke tests.
|
|
2
|
+
//
|
|
3
|
+
// Asserts the runner script + schema + at least the canary evals files are
|
|
4
|
+
// present and parseable. Does NOT exec pwsh (keeps the test suite OS-agnostic
|
|
5
|
+
// and fast — the pwsh integration is covered by self-check D33 + the
|
|
6
|
+
// `npm run eval:canary` smoke).
|
|
7
|
+
|
|
8
|
+
import test from 'node:test';
|
|
9
|
+
import assert from 'node:assert/strict';
|
|
10
|
+
import fs from 'node:fs';
|
|
11
|
+
import path from 'node:path';
|
|
12
|
+
import { fileURLToPath } from 'node:url';
|
|
13
|
+
|
|
14
|
+
const repoRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..');
|
|
15
|
+
|
|
16
|
+
test('eval-runner: run-evals.ps1 ships in plugin/skills/eval/', () => {
  const scriptPath = path.join(repoRoot, 'plugin/skills/eval/run-evals.ps1');
  assert.ok(fs.existsSync(scriptPath), `missing: ${scriptPath}`);
  const source = fs.readFileSync(scriptPath, 'utf-8');
  // Each pattern is a marker the script text must contain, paired with its failure message.
  const expectations = [
    [/param\(/, 'run-evals.ps1 has no [CmdletBinding] param block'],
    [/\$Canary/, 'run-evals.ps1 missing -Canary flag'],
    [/\$All/, 'run-evals.ps1 missing -All flag'],
    [/\$UpdateBaseline/, 'run-evals.ps1 missing -UpdateBaseline flag'],
  ];
  for (const [pattern, message] of expectations) {
    assert.match(source, pattern, message);
  }
});
|
|
25
|
+
|
|
26
|
+
test('eval-runner: evals.schema.json validates structurally', () => {
  const schemaPath = path.join(repoRoot, 'plugin/skills/eval/evals.schema.json');
  assert.ok(fs.existsSync(schemaPath), `missing: ${schemaPath}`);
  const schema = JSON.parse(fs.readFileSync(schemaPath, 'utf-8'));
  assert.equal(schema.title, 'Kushi per-skill evals file');
  // The schema must define a `case` type that requires expected_assertions.
  const { definitions } = schema;
  assert.ok(definitions && definitions.case);
  assert.ok(definitions.case.required.includes('expected_assertions'));
});
|
|
34
|
+
|
|
35
|
+
test('eval-runner: every plugin/skills/<name>/ (except eval/) has evals/evals.json that parses', () => {
  const skillsDir = path.join(repoRoot, 'plugin/skills');
  const missing = [];
  for (const entry of fs.readdirSync(skillsDir, { withFileTypes: true })) {
    // Only skill directories are checked; eval/ holds the runner itself.
    if (!entry.isDirectory() || entry.name === 'eval') continue;
    const skill = entry.name;
    const evalsPath = path.join(skillsDir, skill, 'evals/evals.json');
    if (!fs.existsSync(evalsPath)) {
      // Collected and reported together at the end.
      missing.push(skill);
      continue;
    }
    const parsed = JSON.parse(fs.readFileSync(evalsPath, 'utf-8'));
    assert.equal(parsed.skill, skill, `${skill}: evals.json declares skill '${parsed.skill}'`);
    assert.ok(Array.isArray(parsed.cases) && parsed.cases.length >= 2, `${skill}: needs >=2 cases`);
    for (const c of parsed.cases) {
      const where = `${skill}/${c.id}`;
      assert.ok(c.id && c.name && c.input !== undefined, `${where}: missing core fields`);
      assert.ok(
        Array.isArray(c.expected_assertions) && c.expected_assertions.length >= 1,
        `${where}: needs >=1 assertion`,
      );
      assert.ok(['script', 'llm'].includes(c.grader_type), `${where}: bad grader_type`);
    }
  }
  assert.deepEqual(missing, [], `skills missing evals/evals.json: ${missing.join(', ')}`);
});
|
|
56
|
+
|
|
57
|
+
test('eval-runner: canary subset includes the documented six skills', () => {
  const expected = [
    'ask-project', 'bootstrap-project', 'refresh-project',
    'link-entities', 'build-state', 'self-check',
  ];
  // Keep the skills whose evals file has at least one case flagged canary: true.
  const found = expected.filter((skill) => {
    const evalsPath = path.join(repoRoot, 'plugin/skills', skill, 'evals/evals.json');
    const parsed = JSON.parse(fs.readFileSync(evalsPath, 'utf-8'));
    return parsed.cases.some((c) => c.canary === true);
  });
  assert.deepEqual([...found].sort(), [...expected].sort());
});
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
// Tests for plugin/skills/skill-checker/ — lint catches known-bad, retrofit
|
|
2
|
+
// emits the expected fix plan, apply is idempotent.
|
|
3
|
+
// Per the v5.0.4 task: ≥3 cases. Uses synthetic tmp skills under .testtmp/.
|
|
4
|
+
|
|
5
|
+
import { test } from 'node:test';
|
|
6
|
+
import assert from 'node:assert/strict';
|
|
7
|
+
import { spawnSync } from 'node:child_process';
|
|
8
|
+
import fs from 'node:fs';
|
|
9
|
+
import path from 'node:path';
|
|
10
|
+
import url from 'node:url';
|
|
11
|
+
|
|
12
|
+
const here = path.dirname(url.fileURLToPath(import.meta.url));
|
|
13
|
+
const repoRoot = path.resolve(here, '..');
|
|
14
|
+
const checker = path.join(repoRoot, 'plugin', 'skills', 'skill-checker', 'check-skill.ps1');
|
|
15
|
+
|
|
16
|
+
/**
 * Runs a PowerShell script synchronously through `pwsh -NoProfile -File`.
 *
 * @param {string[]} args - Script path followed by its arguments.
 * @returns {object} The `spawnSync` result, with stdout/stderr decoded as UTF-8 strings.
 */
function pwsh(args) {
  const argv = ['-NoProfile', '-File'].concat(args);
  const options = { encoding: 'utf8' };
  return spawnSync('pwsh', argv, options);
}
|
|
19
|
+
|
|
20
|
+
/**
 * Creates a throwaway skill directory under `<repoRoot>/.testtmp/` containing
 * only a SKILL.md with the given content.
 *
 * @param {string} label - Skill name; also part of the temp directory name.
 * @param {string} mdContent - Text written to SKILL.md.
 * @returns {{root: string, skillDir: string}} The temp root and the skill directory inside it.
 */
function makeBrokenSkill(label, mdContent) {
  // The timestamp keeps directories from separate runs apart.
  const stamp = Date.now();
  const root = path.join(repoRoot, '.testtmp', `skill-checker-${label}-${stamp}`);
  const skillDir = path.join(root, 'plugin', 'skills', label);
  fs.mkdirSync(skillDir, { recursive: true });
  const mdPath = path.join(skillDir, 'SKILL.md');
  fs.writeFileSync(mdPath, mdContent);
  return { root, skillDir };
}
|
|
27
|
+
|
|
28
|
+
test('lint catches missing USE WHEN + missing procedure section', () => {
  // The description lacks "USE WHEN" and the body has no Gotchas/Validation/Checklist section.
  const skillMd = `---
name: "sc-broken-skill"
version: "0.0.1"
description: "Just a skill that does some stuff."
---

# Skill: sc-broken-skill

A skill.
`;
  const { root } = makeBrokenSkill('sc-broken-skill', skillMd);
  try {
    const run = pwsh([checker, '-Skill', 'sc-broken-skill', '-Root', root, '-Json']);
    assert.equal(run.status, 0);
    const { findings } = JSON.parse(run.stdout).skills[0];
    const codes = findings.map((f) => f.code);
    assert.ok(codes.includes('SC3.description-no-use-when'), `expected SC3 in ${codes}`);
    assert.ok(codes.includes('SC5.missing-procedure-section'), `expected SC5 in ${codes}`);
    assert.ok(findings.some((f) => f.kind === 'additive'), 'at least one finding additive');
  } finally {
    fs.rmSync(root, { recursive: true, force: true });
  }
});
|
|
53
|
+
|
|
54
|
+
test('retrofit --apply adds missing section + creates starter evals', () => {
  const skillMd = `---
name: "sc-retrofit-target"
version: "0.0.1"
description: "USE WHEN testing retrofit. DO NOT USE FOR real skills."
---

# Skill: sc-retrofit-target

Body.
`;
  const { root, skillDir } = makeBrokenSkill('sc-retrofit-target', skillMd);
  const mdPath = path.join(skillDir, 'SKILL.md');
  const evalsPath = path.join(skillDir, 'evals', 'evals.json');
  try {
    const before = fs.readFileSync(mdPath, 'utf8');
    const run = pwsh([checker, '-Skill', 'sc-retrofit-target', '-Root', root, '-Retrofit', '-Apply', '-Json']);
    assert.equal(run.status, 0);

    // SKILL.md must have been extended without touching the existing text.
    const after = fs.readFileSync(mdPath, 'utf8');
    assert.ok(after.length > before.length, 'SKILL.md should have grown');
    assert.match(after, /## Validation loop|## Gotchas|## Step checklist/,
      'apply should have appended a procedure section');
    assert.ok(after.startsWith(before), 'apply should be additive (original prefix preserved)');

    // A starter evals file must now exist for the skill.
    assert.ok(fs.existsSync(evalsPath), 'starter evals.json should have been created');
    const evals = JSON.parse(fs.readFileSync(evalsPath, 'utf8'));
    assert.equal(evals.skill, 'sc-retrofit-target');
    assert.ok(evals.cases.length >= 2);
  } finally {
    fs.rmSync(root, { recursive: true, force: true });
  }
});
|
|
84
|
+
|
|
85
|
+
test('apply is idempotent — second run is a no-op', () => {
  const broken = `---
name: "sc-idempotent"
version: "0.0.1"
description: "USE WHEN testing idempotency. DO NOT USE FOR real skills."
---

# Skill: sc-idempotent

Body.
`;
  const { root, skillDir } = makeBrokenSkill('sc-idempotent', broken);
  const mdPath = path.join(skillDir, 'SKILL.md');
  try {
    // The first run must succeed and change the file; otherwise the comparison
    // below would pass even though nothing was ever applied.
    const r1 = pwsh([checker, '-Skill', 'sc-idempotent', '-Root', root, '-Retrofit', '-Apply']);
    assert.equal(r1.status, 0, `first apply failed: ${r1.stderr}\n${r1.stdout}`);
    const afterFirst = fs.readFileSync(mdPath, 'utf8');
    assert.notEqual(afterFirst, broken, 'first apply should have modified SKILL.md');

    const r2 = pwsh([checker, '-Skill', 'sc-idempotent', '-Root', root, '-Retrofit', '-Apply', '-Json']);
    assert.equal(r2.status, 0);
    const afterSecond = fs.readFileSync(mdPath, 'utf8');
    assert.equal(afterFirst, afterSecond, 'second apply should not modify the file');
    const rep = JSON.parse(r2.stdout);
    assert.equal(rep.summary.applied, 0, 'second apply should report 0 fixes');
  } finally {
    fs.rmSync(root, { recursive: true, force: true });
  }
});
|
|
110
|
+
|
|
111
|
+
test('--all lints every skill in the live repo and returns valid JSON', () => {
  const run = pwsh([checker, '-All', '-Json']);
  assert.equal(run.status, 0);
  // The report must list skills and carry numeric summary counters.
  const { skills, summary } = JSON.parse(run.stdout);
  assert.ok(skills.length > 0, 'should find skills');
  assert.equal(typeof summary.total_skills, 'number');
  assert.equal(typeof summary.skills_clean, 'number');
});
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
// Tests for plugin/skills/skill-creator/ — scaffold + optimize-description.
|
|
2
|
+
// Per the v5.0.4 task: ≥3 cases. Uses node:test + node:assert with synthetic
|
|
3
|
+
// tmp dirs under the repo's own .testtmp/ (NEVER /tmp).
|
|
4
|
+
|
|
5
|
+
import { test } from 'node:test';
|
|
6
|
+
import assert from 'node:assert/strict';
|
|
7
|
+
import { spawnSync } from 'node:child_process';
|
|
8
|
+
import fs from 'node:fs';
|
|
9
|
+
import path from 'node:path';
|
|
10
|
+
import url from 'node:url';
|
|
11
|
+
|
|
12
|
+
const here = path.dirname(url.fileURLToPath(import.meta.url));
|
|
13
|
+
const repoRoot = path.resolve(here, '..');
|
|
14
|
+
const scaffold = path.join(repoRoot, 'plugin', 'skills', 'skill-creator', 'scaffold.ps1');
|
|
15
|
+
const checker = path.join(repoRoot, 'plugin', 'skills', 'skill-checker', 'check-skill.ps1');
|
|
16
|
+
const optimize = path.join(repoRoot, 'plugin', 'skills', 'skill-creator', 'optimize-description.ps1');
|
|
17
|
+
|
|
18
|
+
/**
 * Runs a PowerShell script synchronously through `pwsh -NoProfile -File`.
 *
 * @param {string[]} scriptArgs - Script path followed by its arguments.
 * @param {object} [opts] - Extra `spawnSync` options; these override the UTF-8 default encoding.
 * @returns {object} The `spawnSync` result.
 */
function pwsh(scriptArgs, opts = {}) {
  const argv = ['-NoProfile', '-File'].concat(scriptArgs);
  const options = Object.assign({ encoding: 'utf8' }, opts);
  return spawnSync('pwsh', argv, options);
}
|
|
24
|
+
|
|
25
|
+
/**
 * Creates a throwaway root under `<repoRoot>/.testtmp/` with an empty
 * `plugin/skills/` directory inside it.
 *
 * @param {string} label - Included in the directory name.
 * @returns {string} Path of the new temp root.
 */
function makeTmpRoot(label) {
  const stamp = Date.now();
  const dir = path.join(repoRoot, '.testtmp', `skill-creator-${label}-${stamp}`);
  // Per the original note, scaffold resolves its templates relative to its own
  // location, so nothing is copied here; only a writable plugin/skills/
  // destination is needed.
  const skillsDir = path.join(dir, 'plugin', 'skills');
  fs.mkdirSync(skillsDir, { recursive: true });
  return dir;
}
|
|
33
|
+
|
|
34
|
+
test('scaffold creates a valid SKILL.md + evals.json + marker', () => {
  const tmpRoot = makeTmpRoot('scaffold');
  try {
    const name = 'sc-test-fresh';
    const description = 'USE WHEN testing the scaffold. DO NOT USE FOR real skills.';
    const result = pwsh([
      scaffold, '-Name', name, '-Type', 'other',
      '-Description', description,
      '-Root', tmpRoot,
    ]);
    assert.equal(result.status, 0, `scaffold failed: ${result.stderr}\n${result.stdout}`);

    // All three artefacts must exist inside the new skill directory.
    const skillDir = path.join(tmpRoot, 'plugin', 'skills', name);
    const mdPath = path.join(skillDir, 'SKILL.md');
    assert.ok(fs.existsSync(mdPath), 'SKILL.md missing');
    assert.ok(fs.existsSync(path.join(skillDir, 'evals', 'evals.json')), 'evals/evals.json missing');
    assert.ok(fs.existsSync(path.join(skillDir, '.created-by-skill-creator')), 'marker missing');

    const md = fs.readFileSync(mdPath, 'utf8');
    assert.match(md, /USE WHEN/, 'description should contain USE WHEN');
    assert.match(md, new RegExp(`name: "${name}"`));
  } finally {
    fs.rmSync(tmpRoot, { recursive: true, force: true });
  }
});
|
|
53
|
+
|
|
54
|
+
test('scaffold output passes check-skill --lint clean', () => {
  const tmpRoot = makeTmpRoot('lint');
  try {
    const name = 'sc-test-lints-clean';
    const a = pwsh([scaffold, '-Name', name, '-Type', 'other',
      '-Description', 'USE WHEN testing scaffold output conforms. DO NOT USE FOR production.',
      '-Root', tmpRoot]);
    assert.equal(a.status, 0, `scaffold failed: ${a.stderr}`);
    const b = pwsh([checker, '-Skill', name, '-Root', tmpRoot, '-Json']);
    assert.equal(b.status, 0, `check-skill failed: ${b.stderr}\n${b.stdout}`);
    // The Json mode prints a multi-line JSON via ConvertTo-Json; parse the whole stdout.
    const report = JSON.parse(b.stdout);
    assert.equal(report.summary.total_findings, 0,
      `scaffold output had findings: ${JSON.stringify(report.skills[0].findings)}`);
  } finally {
    fs.rmSync(tmpRoot, { recursive: true, force: true });
  }
});
|
|
74
|
+
|
|
75
|
+
test('optimize-description rewrites a known-bad description', () => {
  // The input has no trigger phrasing and contains two marketing adjectives.
  const input = 'Powerful comprehensive skill that pulls some stuff.';
  const run = pwsh([optimize, '-Description', input, '-Quiet']);
  assert.equal(run.status, 0);
  const output = run.stdout.trim();
  assert.match(output, /USE WHEN/, 'must inject USE WHEN');
  assert.match(output, /DO NOT USE/i, 'must inject DO NOT USE');
  assert.doesNotMatch(output, /\bpowerful\b/i, 'must strip marketing word');
  assert.doesNotMatch(output, /\bcomprehensive\b/, 'must strip marketing word (lowercase)');
});
|
|
85
|
+
|
|
86
|
+
test('optimize-description is idempotent on already-good input', () => {
  const input = 'USE WHEN auditing skills. DO NOT USE FOR runtime checks. Validates blueprint conformance.';
  // Runs the optimizer on a description and returns its trimmed stdout.
  const optimizeOnce = (description) => pwsh([optimize, '-Description', description, '-Quiet']).stdout.trim();
  const firstPass = optimizeOnce(input);
  const secondPass = optimizeOnce(firstPass);
  assert.equal(firstPass, secondPass, 'optimizer should be idempotent');
  assert.match(firstPass, /^USE WHEN/, 'should still lead with USE WHEN');
});
|