skilltest 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -2
- package/dist/index.js +67 -46
- package/dist/index.js.map +1 -1
- package/package.json +4 -3
package/README.md
CHANGED
|
@@ -8,11 +8,15 @@ The testing framework for Agent Skills. Lint, test triggering, and evaluate your
|
|
|
8
8
|
|
|
9
9
|
`skilltest` is a standalone CLI for the Agent Skills ecosystem (spec: https://agentskills.io). Think of it as pytest for skills.
|
|
10
10
|
|
|
11
|
+
The repository itself uses a fast Vitest suite for offline unit and integration
|
|
12
|
+
coverage of the parser, linters, trigger math, config resolution, reporters,
|
|
13
|
+
and linter orchestration.
|
|
14
|
+
|
|
11
15
|
## Demo
|
|
12
16
|
|
|
13
17
|
GIF coming soon.
|
|
14
18
|
|
|
15
|
-

|
|
19
|
+
<!--  -->
|
|
16
20
|
|
|
17
21
|
## Why skilltest?
|
|
18
22
|
|
|
@@ -375,6 +379,8 @@ jobs:
|
|
|
375
379
|
with:
|
|
376
380
|
node-version: "20"
|
|
377
381
|
- run: npm ci
|
|
382
|
+
- run: npm run lint
|
|
383
|
+
- run: npm run test
|
|
378
384
|
- run: npm run build
|
|
379
385
|
- run: npx skilltest lint path/to/skill --json
|
|
380
386
|
```
|
|
@@ -410,11 +416,15 @@ jobs:
|
|
|
410
416
|
```bash
|
|
411
417
|
npm install
|
|
412
418
|
npm run lint
|
|
419
|
+
npm run test
|
|
413
420
|
npm run build
|
|
414
421
|
node dist/index.js --help
|
|
415
422
|
```
|
|
416
423
|
|
|
417
|
-
|
|
424
|
+
`npm test` runs the Vitest suite. The tests are offline and do not call model
|
|
425
|
+
providers.
|
|
426
|
+
|
|
427
|
+
Manual CLI smoke tests:
|
|
418
428
|
|
|
419
429
|
```bash
|
|
420
430
|
node dist/index.js lint test-fixtures/sample-skill/
|
package/dist/index.js
CHANGED
|
@@ -581,24 +581,6 @@ function runContentChecks(context) {
|
|
|
581
581
|
message: "No obvious vague placeholder phrasing found."
|
|
582
582
|
});
|
|
583
583
|
}
|
|
584
|
-
if (context.frontmatter.rawFrontmatter && /[<>]/.test(context.frontmatter.rawFrontmatter)) {
|
|
585
|
-
issues.push({
|
|
586
|
-
id: "content.frontmatter-angle-brackets",
|
|
587
|
-
checkId: "content:angle-brackets",
|
|
588
|
-
title: "Frontmatter Angle Brackets",
|
|
589
|
-
status: "warn",
|
|
590
|
-
message: "Frontmatter contains angle bracket characters (< or >), which can be misinterpreted in some agents.",
|
|
591
|
-
suggestion: "Remove XML-like tags from frontmatter values when possible."
|
|
592
|
-
});
|
|
593
|
-
} else {
|
|
594
|
-
issues.push({
|
|
595
|
-
id: "content.frontmatter-angle-brackets",
|
|
596
|
-
checkId: "content:angle-brackets",
|
|
597
|
-
title: "Frontmatter Angle Brackets",
|
|
598
|
-
status: "pass",
|
|
599
|
-
message: "No angle bracket tokens detected in frontmatter."
|
|
600
|
-
});
|
|
601
|
-
}
|
|
602
584
|
const secretsIssue = buildSecretsIssue(context);
|
|
603
585
|
if (secretsIssue) {
|
|
604
586
|
issues.push(secretsIssue);
|
|
@@ -951,6 +933,24 @@ function runFrontmatterChecks(context) {
|
|
|
951
933
|
message: "license field is present."
|
|
952
934
|
});
|
|
953
935
|
}
|
|
936
|
+
if (context.frontmatter.rawFrontmatter && /[<>]/.test(context.frontmatter.rawFrontmatter)) {
|
|
937
|
+
issues.push({
|
|
938
|
+
id: "frontmatter.angle-brackets",
|
|
939
|
+
checkId: "frontmatter:angle-brackets",
|
|
940
|
+
title: "Frontmatter Angle Brackets",
|
|
941
|
+
status: "warn",
|
|
942
|
+
message: "Frontmatter contains angle bracket characters (< or >), which can be misinterpreted in some agents.",
|
|
943
|
+
suggestion: "Remove XML-like tags from frontmatter values when possible."
|
|
944
|
+
});
|
|
945
|
+
} else {
|
|
946
|
+
issues.push({
|
|
947
|
+
id: "frontmatter.angle-brackets",
|
|
948
|
+
checkId: "frontmatter:angle-brackets",
|
|
949
|
+
title: "Frontmatter Angle Brackets",
|
|
950
|
+
status: "pass",
|
|
951
|
+
message: "No angle bracket tokens detected in frontmatter."
|
|
952
|
+
});
|
|
953
|
+
}
|
|
954
954
|
if (description && description.trim() !== "" && !descriptionLooksActionable(description)) {
|
|
955
955
|
issues.push({
|
|
956
956
|
id: "frontmatter.description.triggerability",
|
|
@@ -2314,12 +2314,13 @@ function extractJsonObject(raw) {
|
|
|
2314
2314
|
}
|
|
2315
2315
|
throw new Error("Grader did not return a JSON object.");
|
|
2316
2316
|
}
|
|
2317
|
-
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
|
|
2321
|
-
|
|
2322
|
-
|
|
2317
|
+
var DEFAULT_ASSERTIONS = [
|
|
2318
|
+
"The response follows the skill instructions faithfully.",
|
|
2319
|
+
"The response is well-structured and actionable.",
|
|
2320
|
+
"The response addresses the user prompt directly."
|
|
2321
|
+
];
|
|
2322
|
+
function buildGraderPrompts(options) {
|
|
2323
|
+
const assertions = options.assertions && options.assertions.length > 0 ? options.assertions : DEFAULT_ASSERTIONS;
|
|
2323
2324
|
const systemPrompt = [
|
|
2324
2325
|
"You are a strict evaluator for agent skill outputs.",
|
|
2325
2326
|
"Assess each assertion and return JSON only.",
|
|
@@ -2336,15 +2337,26 @@ async function gradeResponse(options) {
|
|
|
2336
2337
|
options.modelResponse,
|
|
2337
2338
|
"",
|
|
2338
2339
|
"Assertions to evaluate:",
|
|
2339
|
-
|
|
2340
|
+
assertions.map((assertion, index) => `${index + 1}. ${assertion}`).join("\n")
|
|
2340
2341
|
].join("\n");
|
|
2341
|
-
|
|
2342
|
+
return {
|
|
2343
|
+
assertions,
|
|
2344
|
+
systemPrompt,
|
|
2345
|
+
userPrompt
|
|
2346
|
+
};
|
|
2347
|
+
}
|
|
2348
|
+
function parseGraderOutput(raw) {
|
|
2342
2349
|
const parsed = graderOutputSchema.safeParse(extractJsonObject(raw));
|
|
2343
2350
|
if (!parsed.success) {
|
|
2344
2351
|
throw new Error(`Failed to parse grader output: ${parsed.error.issues[0]?.message ?? "invalid grader JSON"}`);
|
|
2345
2352
|
}
|
|
2346
2353
|
return parsed.data.assertions;
|
|
2347
2354
|
}
|
|
2355
|
+
async function gradeResponse(options) {
|
|
2356
|
+
const prompts = buildGraderPrompts(options);
|
|
2357
|
+
const raw = await options.provider.sendMessage(prompts.systemPrompt, prompts.userPrompt, { model: options.model });
|
|
2358
|
+
return parseGraderOutput(raw);
|
|
2359
|
+
}
|
|
2348
2360
|
|
|
2349
2361
|
// src/utils/concurrency.ts
|
|
2350
2362
|
async function pMap(items, fn, concurrency) {
|
|
@@ -2499,6 +2511,7 @@ var triggerQuerySchema = z4.object({
|
|
|
2499
2511
|
should_trigger: z4.boolean()
|
|
2500
2512
|
});
|
|
2501
2513
|
var triggerQueryArraySchema = z4.array(triggerQuerySchema);
|
|
2514
|
+
var triggerNumQueriesSchema = z4.number().int().min(2).refine((value) => value % 2 === 0, "numQueries must be an even number.");
|
|
2502
2515
|
var FAKE_SKILLS = [
|
|
2503
2516
|
{ name: "code-review", description: "Reviews code changes for bugs, regressions, and maintainability issues." },
|
|
2504
2517
|
{ name: "api-tester", description: "Designs and runs REST API tests, validating status codes and response shapes." },
|
|
@@ -2539,6 +2552,9 @@ function shuffle(values, rng) {
|
|
|
2539
2552
|
function sample(values, count, rng) {
|
|
2540
2553
|
return shuffle(values, rng).slice(0, Math.max(0, Math.min(count, values.length)));
|
|
2541
2554
|
}
|
|
2555
|
+
function validateNumQueries(numQueries) {
|
|
2556
|
+
return triggerNumQueriesSchema.parse(numQueries);
|
|
2557
|
+
}
|
|
2542
2558
|
function parseJsonArrayFromModelOutput(raw) {
|
|
2543
2559
|
const trimmed = raw.trim();
|
|
2544
2560
|
if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
|
|
@@ -2553,6 +2569,7 @@ function parseJsonArrayFromModelOutput(raw) {
|
|
|
2553
2569
|
throw new Error("Model did not return a JSON array.");
|
|
2554
2570
|
}
|
|
2555
2571
|
async function generateQueriesWithModel(skill, provider, model, numQueries) {
|
|
2572
|
+
validateNumQueries(numQueries);
|
|
2556
2573
|
const shouldTriggerCount = Math.floor(numQueries / 2);
|
|
2557
2574
|
const shouldNotTriggerCount = numQueries - shouldTriggerCount;
|
|
2558
2575
|
const systemPrompt = [
|
|
@@ -2597,6 +2614,29 @@ function parseDecision(rawResponse, skillNames) {
|
|
|
2597
2614
|
}
|
|
2598
2615
|
return "unrecognized";
|
|
2599
2616
|
}
|
|
2617
|
+
function prepareTriggerQueries(skill, queries, seed) {
|
|
2618
|
+
const rng = createRng(seed);
|
|
2619
|
+
return queries.map((testQuery) => {
|
|
2620
|
+
const fakeCount = 5 + Math.floor(rng() * 5);
|
|
2621
|
+
const fakeSkills = sample(FAKE_SKILLS, fakeCount, rng);
|
|
2622
|
+
const allSkills = shuffle(
|
|
2623
|
+
[
|
|
2624
|
+
...fakeSkills,
|
|
2625
|
+
{
|
|
2626
|
+
name: skill.frontmatter.name,
|
|
2627
|
+
description: skill.frontmatter.description
|
|
2628
|
+
}
|
|
2629
|
+
],
|
|
2630
|
+
rng
|
|
2631
|
+
);
|
|
2632
|
+
return {
|
|
2633
|
+
testQuery,
|
|
2634
|
+
fakeSkills,
|
|
2635
|
+
allSkills,
|
|
2636
|
+
skillListText: allSkills.map((entry) => `- ${entry.name}: ${entry.description}`).join("\n")
|
|
2637
|
+
};
|
|
2638
|
+
});
|
|
2639
|
+
}
|
|
2600
2640
|
function calculateMetrics(skillName, cases) {
|
|
2601
2641
|
let truePositives = 0;
|
|
2602
2642
|
let trueNegatives = 0;
|
|
@@ -2649,28 +2689,9 @@ function buildSuggestions(metrics) {
|
|
|
2649
2689
|
return suggestions;
|
|
2650
2690
|
}
|
|
2651
2691
|
async function runTriggerTest(skill, options) {
|
|
2652
|
-
const rng = createRng(options.seed);
|
|
2653
2692
|
const queries = options.queries && options.queries.length > 0 ? triggerQueryArraySchema.parse(options.queries) : await generateQueriesWithModel(skill, options.provider, options.model, options.numQueries);
|
|
2654
2693
|
const skillName = skill.frontmatter.name;
|
|
2655
|
-
const preparedQueries = queries.
|
|
2656
|
-
const fakeCount = 5 + Math.floor(rng() * 5);
|
|
2657
|
-
const fakeSkills = sample(FAKE_SKILLS, fakeCount, rng);
|
|
2658
|
-
const allSkills = shuffle([
|
|
2659
|
-
...fakeSkills,
|
|
2660
|
-
{
|
|
2661
|
-
name: skill.frontmatter.name,
|
|
2662
|
-
description: skill.frontmatter.description
|
|
2663
|
-
}
|
|
2664
|
-
], rng);
|
|
2665
|
-
const skillListText = allSkills.map((entry) => `- ${entry.name}: ${entry.description}`).join("\n");
|
|
2666
|
-
return {
|
|
2667
|
-
testQuery,
|
|
2668
|
-
fakeCount,
|
|
2669
|
-
fakeSkills,
|
|
2670
|
-
allSkills,
|
|
2671
|
-
skillListText
|
|
2672
|
-
};
|
|
2673
|
-
});
|
|
2694
|
+
const preparedQueries = prepareTriggerQueries(skill, queries, options.seed);
|
|
2674
2695
|
const systemPrompt = [
|
|
2675
2696
|
"You are selecting one skill to activate for a user query.",
|
|
2676
2697
|
"Choose the single best matching skill name from the provided list, or 'none' if no skill is a good fit.",
|