skilltest 0.6.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +3 -6
- package/README.md +104 -2
- package/dist/index.js +441 -140
- package/dist/index.js.map +1 -1
- package/package.json +4 -3
package/CLAUDE.md
CHANGED
|
@@ -77,6 +77,7 @@ ANTHROPIC_API_KEY=your-key node dist/index.js trigger test-fixtures/sample-skill
|
|
|
77
77
|
- default concurrency is `5`
|
|
78
78
|
- `--concurrency 1` preserves the old sequential behavior
|
|
79
79
|
- trigger RNG-dependent fake-skill setup is precomputed before requests begin, preserving seed determinism
|
|
80
|
+
- Comparative trigger testing is opt-in via `--compare`; the standard fake-skill pool is the default.
|
|
80
81
|
- JSON mode is strict:
|
|
81
82
|
- no spinners
|
|
82
83
|
- no colored output
|
|
@@ -103,11 +104,7 @@ ANTHROPIC_API_KEY=your-key node dist/index.js trigger test-fixtures/sample-skill
|
|
|
103
104
|
- Security heuristics: `src/core/linter/security.ts`
|
|
104
105
|
- Progressive disclosure: `src/core/linter/disclosure.ts`
|
|
105
106
|
- Compatibility hints: `src/core/linter/compat.ts`
|
|
106
|
-
-
|
|
107
|
+
- Plugin loading + validation + rule execution: `src/core/linter/plugin.ts`
|
|
108
|
+
- Trigger fake skill pool + comparative competitor loading + scoring: `src/core/trigger-tester.ts`
|
|
107
109
|
- Eval grading schema: `src/core/grader.ts`
|
|
108
110
|
- Combined quality gate orchestration: `src/core/check-runner.ts`
|
|
109
|
-
|
|
110
|
-
## Future Work (Not Implemented Yet)
|
|
111
|
-
|
|
112
|
-
- Config file support (`.skilltestrc`)
|
|
113
|
-
- Plugin linter rules
|
package/README.md
CHANGED
|
@@ -8,11 +8,15 @@ The testing framework for Agent Skills. Lint, test triggering, and evaluate your
|
|
|
8
8
|
|
|
9
9
|
`skilltest` is a standalone CLI for the Agent Skills ecosystem (spec: https://agentskills.io). Think of it as pytest for skills.
|
|
10
10
|
|
|
11
|
+
The repository itself uses a fast Vitest suite for offline unit and integration
|
|
12
|
+
coverage of the parser, linters, trigger math, config resolution, reporters,
|
|
13
|
+
and linter orchestration.
|
|
14
|
+
|
|
11
15
|
## Demo
|
|
12
16
|
|
|
13
17
|
GIF coming soon.
|
|
14
18
|
|
|
15
|
-

|
|
19
|
+
<!--  -->
|
|
16
20
|
|
|
17
21
|
## Why skilltest?
|
|
18
22
|
|
|
@@ -159,6 +163,72 @@ What it checks:
|
|
|
159
163
|
Flags:
|
|
160
164
|
|
|
161
165
|
- `--html <path>` write a self-contained HTML report
|
|
166
|
+
- `--plugin <path>` load a custom lint plugin file (repeatable)
|
|
167
|
+
|
|
168
|
+
### Plugin Rules
|
|
169
|
+
|
|
170
|
+
You can run custom lint rules alongside the built-in checks. Plugin rules use the
|
|
171
|
+
same `LintContext` and `LintIssue` types as the core linter, and their results
|
|
172
|
+
appear in the same `LintReport`.
|
|
173
|
+
|
|
174
|
+
Config:
|
|
175
|
+
|
|
176
|
+
```json
|
|
177
|
+
{
|
|
178
|
+
"lint": {
|
|
179
|
+
"plugins": ["./my-rules.js"]
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
CLI:
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
skilltest lint ./skill --plugin ./my-rules.js
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
Minimal plugin example:
|
|
191
|
+
|
|
192
|
+
```js
|
|
193
|
+
export default {
|
|
194
|
+
rules: [
|
|
195
|
+
{
|
|
196
|
+
checkId: "custom:no-todo",
|
|
197
|
+
title: "No TODO comments",
|
|
198
|
+
check(context) {
|
|
199
|
+
const body = context.frontmatter.content;
|
|
200
|
+
if (/\bTODO\b/.test(body)) {
|
|
201
|
+
return [
|
|
202
|
+
{
|
|
203
|
+
id: "custom.no-todo",
|
|
204
|
+
checkId: "custom:no-todo",
|
|
205
|
+
title: "No TODO comments",
|
|
206
|
+
status: "warn",
|
|
207
|
+
message: "SKILL.md contains a TODO marker."
|
|
208
|
+
}
|
|
209
|
+
];
|
|
210
|
+
}
|
|
211
|
+
return [
|
|
212
|
+
{
|
|
213
|
+
id: "custom.no-todo",
|
|
214
|
+
checkId: "custom:no-todo",
|
|
215
|
+
title: "No TODO comments",
|
|
216
|
+
status: "pass",
|
|
217
|
+
message: "No TODO markers found."
|
|
218
|
+
}
|
|
219
|
+
];
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
]
|
|
223
|
+
};
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
Notes:
|
|
227
|
+
|
|
228
|
+
- Plugin files are loaded with dynamic `import()`.
|
|
229
|
+
- `.js` and `.mjs` work directly; `.ts` plugins must be precompiled by the user.
|
|
230
|
+
- Plugin rules run after all built-in lint checks, in the order the plugin files are listed.
|
|
231
|
+
- CLI `--plugin` values replace config-file `lint.plugins` values.
|
|
162
232
|
|
|
163
233
|
### `skilltest trigger <path-to-skill>`
|
|
164
234
|
|
|
@@ -171,6 +241,7 @@ Flow:
|
|
|
171
241
|
3. For each query, asks the model to select one skill from a mixed list:
|
|
172
242
|
- your skill under test
|
|
173
243
|
- realistic fake skills
|
|
244
|
+
- optional sibling competitor skills from `--compare`
|
|
174
245
|
4. Computes TP, TN, FP, FN, precision, recall, F1.
|
|
175
246
|
|
|
176
247
|
For reproducible fake-skill sampling, pass `--seed <number>`. When a seed is used,
|
|
@@ -184,6 +255,7 @@ Flags:
|
|
|
184
255
|
- `--model <model>` default: `claude-sonnet-4-5-20250929`
|
|
185
256
|
- `--provider <anthropic|openai>` default: `anthropic`
|
|
186
257
|
- `--queries <path>` use custom queries JSON
|
|
258
|
+
- `--compare <path>` path to a sibling skill directory to use as a competitor (repeatable)
|
|
187
259
|
- `--num-queries <n>` default: `20` (must be even)
|
|
188
260
|
- `--seed <number>` RNG seed for reproducible fake-skill sampling
|
|
189
261
|
- `--concurrency <n>` default: `5`
|
|
@@ -192,6 +264,28 @@ Flags:
|
|
|
192
264
|
- `--api-key <key>` explicit key override
|
|
193
265
|
- `--verbose` show full model decision text
|
|
194
266
|
|
|
267
|
+
### Comparative Trigger Testing
|
|
268
|
+
|
|
269
|
+
Test whether your skill is distinctive enough to be selected over similar real skills:
|
|
270
|
+
|
|
271
|
+
```bash
|
|
272
|
+
skilltest trigger ./my-skill --compare ../similar-skill-1 --compare ../similar-skill-2
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
Config:
|
|
276
|
+
|
|
277
|
+
```json
|
|
278
|
+
{
|
|
279
|
+
"trigger": {
|
|
280
|
+
"compare": ["../similar-skill-1", "../similar-skill-2"]
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
Comparative mode includes the real competitor skills in the candidate list alongside
|
|
286
|
+
fake skills. This reveals confusion between skills with overlapping descriptions that
|
|
287
|
+
standard trigger testing would miss.
|
|
288
|
+
|
|
195
289
|
### `skilltest eval <path-to-skill>`
|
|
196
290
|
|
|
197
291
|
Runs full skill behavior and grades outputs against assertions.
|
|
@@ -234,9 +328,11 @@ Flags:
|
|
|
234
328
|
- `--grader-model <model>` default: same as resolved `--model`
|
|
235
329
|
- `--api-key <key>` explicit key override
|
|
236
330
|
- `--queries <path>` custom trigger queries JSON
|
|
331
|
+
- `--compare <path>` path to a sibling skill directory to use as a competitor (repeatable)
|
|
237
332
|
- `--num-queries <n>` default: `20` (must be even)
|
|
238
333
|
- `--seed <number>` RNG seed for reproducible trigger sampling
|
|
239
334
|
- `--prompts <path>` custom eval prompts JSON
|
|
335
|
+
- `--plugin <path>` load a custom lint plugin file (repeatable)
|
|
240
336
|
- `--concurrency <n>` default: `5` (`1` keeps the old sequential `check` behavior)
|
|
241
337
|
- `--html <path>` write a self-contained HTML report
|
|
242
338
|
- `--min-f1 <n>` default: `0.8`
|
|
@@ -375,6 +471,8 @@ jobs:
|
|
|
375
471
|
with:
|
|
376
472
|
node-version: "20"
|
|
377
473
|
- run: npm ci
|
|
474
|
+
- run: npm run lint
|
|
475
|
+
- run: npm run test
|
|
378
476
|
- run: npm run build
|
|
379
477
|
- run: npx skilltest lint path/to/skill --json
|
|
380
478
|
```
|
|
@@ -410,11 +508,15 @@ jobs:
|
|
|
410
508
|
```bash
|
|
411
509
|
npm install
|
|
412
510
|
npm run lint
|
|
511
|
+
npm run test
|
|
413
512
|
npm run build
|
|
414
513
|
node dist/index.js --help
|
|
415
514
|
```
|
|
416
515
|
|
|
417
|
-
|
|
516
|
+
`npm test` runs the Vitest suite. The tests are offline and do not call model
|
|
517
|
+
providers.
|
|
518
|
+
|
|
519
|
+
Manual CLI smoke tests:
|
|
418
520
|
|
|
419
521
|
```bash
|
|
420
522
|
node dist/index.js lint test-fixtures/sample-skill/
|