skilltest 0.6.0 → 0.7.0

This diff compares the contents of two publicly released versions of a package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.
package/README.md CHANGED
@@ -8,11 +8,15 @@ The testing framework for Agent Skills. Lint, test triggering, and evaluate your
8
8
 
9
9
  `skilltest` is a standalone CLI for the Agent Skills ecosystem (spec: https://agentskills.io). Think of it as pytest for skills.
10
10
 
11
+ The repository itself uses a fast Vitest suite for offline unit and integration
12
+ coverage of the parser, linters, trigger math, config resolution, reporters,
13
+ and linter orchestration.
14
+
11
15
  ## Demo
12
16
 
13
17
  GIF coming soon.
14
18
 
15
- ![skilltest demo placeholder](https://via.placeholder.com/1200x420?text=skilltest+demo+gif+coming+soon)
19
+ <!-- ![skilltest demo placeholder](https://via.placeholder.com/1200x420?text=skilltest+demo+gif+coming+soon) -->
16
20
 
17
21
  ## Why skilltest?
18
22
 
@@ -375,6 +379,8 @@ jobs:
375
379
  with:
376
380
  node-version: "20"
377
381
  - run: npm ci
382
+ - run: npm run lint
383
+ - run: npm run test
378
384
  - run: npm run build
379
385
  - run: npx skilltest lint path/to/skill --json
380
386
  ```
@@ -410,11 +416,15 @@ jobs:
410
416
  ```bash
411
417
  npm install
412
418
  npm run lint
419
+ npm run test
413
420
  npm run build
414
421
  node dist/index.js --help
415
422
  ```
416
423
 
417
- Smoke tests:
424
+ `npm test` runs the Vitest suite. The tests are offline and do not call model
425
+ providers.
426
+
427
+ Manual CLI smoke tests:
418
428
 
419
429
  ```bash
420
430
  node dist/index.js lint test-fixtures/sample-skill/
package/dist/index.js CHANGED
@@ -581,24 +581,6 @@ function runContentChecks(context) {
581
581
  message: "No obvious vague placeholder phrasing found."
582
582
  });
583
583
  }
584
- if (context.frontmatter.rawFrontmatter && /[<>]/.test(context.frontmatter.rawFrontmatter)) {
585
- issues.push({
586
- id: "content.frontmatter-angle-brackets",
587
- checkId: "content:angle-brackets",
588
- title: "Frontmatter Angle Brackets",
589
- status: "warn",
590
- message: "Frontmatter contains angle bracket characters (< or >), which can be misinterpreted in some agents.",
591
- suggestion: "Remove XML-like tags from frontmatter values when possible."
592
- });
593
- } else {
594
- issues.push({
595
- id: "content.frontmatter-angle-brackets",
596
- checkId: "content:angle-brackets",
597
- title: "Frontmatter Angle Brackets",
598
- status: "pass",
599
- message: "No angle bracket tokens detected in frontmatter."
600
- });
601
- }
602
584
  const secretsIssue = buildSecretsIssue(context);
603
585
  if (secretsIssue) {
604
586
  issues.push(secretsIssue);
@@ -951,6 +933,24 @@ function runFrontmatterChecks(context) {
951
933
  message: "license field is present."
952
934
  });
953
935
  }
936
+ if (context.frontmatter.rawFrontmatter && /[<>]/.test(context.frontmatter.rawFrontmatter)) {
937
+ issues.push({
938
+ id: "frontmatter.angle-brackets",
939
+ checkId: "frontmatter:angle-brackets",
940
+ title: "Frontmatter Angle Brackets",
941
+ status: "warn",
942
+ message: "Frontmatter contains angle bracket characters (< or >), which can be misinterpreted in some agents.",
943
+ suggestion: "Remove XML-like tags from frontmatter values when possible."
944
+ });
945
+ } else {
946
+ issues.push({
947
+ id: "frontmatter.angle-brackets",
948
+ checkId: "frontmatter:angle-brackets",
949
+ title: "Frontmatter Angle Brackets",
950
+ status: "pass",
951
+ message: "No angle bracket tokens detected in frontmatter."
952
+ });
953
+ }
954
954
  if (description && description.trim() !== "" && !descriptionLooksActionable(description)) {
955
955
  issues.push({
956
956
  id: "frontmatter.description.triggerability",
@@ -2314,12 +2314,13 @@ function extractJsonObject(raw) {
2314
2314
  }
2315
2315
  throw new Error("Grader did not return a JSON object.");
2316
2316
  }
2317
- async function gradeResponse(options) {
2318
- const assertionList = options.assertions && options.assertions.length > 0 ? options.assertions : [
2319
- "The response follows the skill instructions faithfully.",
2320
- "The response is well-structured and actionable.",
2321
- "The response addresses the user prompt directly."
2322
- ];
2317
+ var DEFAULT_ASSERTIONS = [
2318
+ "The response follows the skill instructions faithfully.",
2319
+ "The response is well-structured and actionable.",
2320
+ "The response addresses the user prompt directly."
2321
+ ];
2322
+ function buildGraderPrompts(options) {
2323
+ const assertions = options.assertions && options.assertions.length > 0 ? options.assertions : DEFAULT_ASSERTIONS;
2323
2324
  const systemPrompt = [
2324
2325
  "You are a strict evaluator for agent skill outputs.",
2325
2326
  "Assess each assertion and return JSON only.",
@@ -2336,15 +2337,26 @@ async function gradeResponse(options) {
2336
2337
  options.modelResponse,
2337
2338
  "",
2338
2339
  "Assertions to evaluate:",
2339
- assertionList.map((assertion, index) => `${index + 1}. ${assertion}`).join("\n")
2340
+ assertions.map((assertion, index) => `${index + 1}. ${assertion}`).join("\n")
2340
2341
  ].join("\n");
2341
- const raw = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
2342
+ return {
2343
+ assertions,
2344
+ systemPrompt,
2345
+ userPrompt
2346
+ };
2347
+ }
2348
+ function parseGraderOutput(raw) {
2342
2349
  const parsed = graderOutputSchema.safeParse(extractJsonObject(raw));
2343
2350
  if (!parsed.success) {
2344
2351
  throw new Error(`Failed to parse grader output: ${parsed.error.issues[0]?.message ?? "invalid grader JSON"}`);
2345
2352
  }
2346
2353
  return parsed.data.assertions;
2347
2354
  }
2355
+ async function gradeResponse(options) {
2356
+ const prompts = buildGraderPrompts(options);
2357
+ const raw = await options.provider.sendMessage(prompts.systemPrompt, prompts.userPrompt, { model: options.model });
2358
+ return parseGraderOutput(raw);
2359
+ }
2348
2360
 
2349
2361
  // src/utils/concurrency.ts
2350
2362
  async function pMap(items, fn, concurrency) {
@@ -2499,6 +2511,7 @@ var triggerQuerySchema = z4.object({
2499
2511
  should_trigger: z4.boolean()
2500
2512
  });
2501
2513
  var triggerQueryArraySchema = z4.array(triggerQuerySchema);
2514
+ var triggerNumQueriesSchema = z4.number().int().min(2).refine((value) => value % 2 === 0, "numQueries must be an even number.");
2502
2515
  var FAKE_SKILLS = [
2503
2516
  { name: "code-review", description: "Reviews code changes for bugs, regressions, and maintainability issues." },
2504
2517
  { name: "api-tester", description: "Designs and runs REST API tests, validating status codes and response shapes." },
@@ -2539,6 +2552,9 @@ function shuffle(values, rng) {
2539
2552
  function sample(values, count, rng) {
2540
2553
  return shuffle(values, rng).slice(0, Math.max(0, Math.min(count, values.length)));
2541
2554
  }
2555
+ function validateNumQueries(numQueries) {
2556
+ return triggerNumQueriesSchema.parse(numQueries);
2557
+ }
2542
2558
  function parseJsonArrayFromModelOutput(raw) {
2543
2559
  const trimmed = raw.trim();
2544
2560
  if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
@@ -2553,6 +2569,7 @@ function parseJsonArrayFromModelOutput(raw) {
2553
2569
  throw new Error("Model did not return a JSON array.");
2554
2570
  }
2555
2571
  async function generateQueriesWithModel(skill, provider, model, numQueries) {
2572
+ validateNumQueries(numQueries);
2556
2573
  const shouldTriggerCount = Math.floor(numQueries / 2);
2557
2574
  const shouldNotTriggerCount = numQueries - shouldTriggerCount;
2558
2575
  const systemPrompt = [
@@ -2597,6 +2614,29 @@ function parseDecision(rawResponse, skillNames) {
2597
2614
  }
2598
2615
  return "unrecognized";
2599
2616
  }
2617
+ function prepareTriggerQueries(skill, queries, seed) {
2618
+ const rng = createRng(seed);
2619
+ return queries.map((testQuery) => {
2620
+ const fakeCount = 5 + Math.floor(rng() * 5);
2621
+ const fakeSkills = sample(FAKE_SKILLS, fakeCount, rng);
2622
+ const allSkills = shuffle(
2623
+ [
2624
+ ...fakeSkills,
2625
+ {
2626
+ name: skill.frontmatter.name,
2627
+ description: skill.frontmatter.description
2628
+ }
2629
+ ],
2630
+ rng
2631
+ );
2632
+ return {
2633
+ testQuery,
2634
+ fakeSkills,
2635
+ allSkills,
2636
+ skillListText: allSkills.map((entry) => `- ${entry.name}: ${entry.description}`).join("\n")
2637
+ };
2638
+ });
2639
+ }
2600
2640
  function calculateMetrics(skillName, cases) {
2601
2641
  let truePositives = 0;
2602
2642
  let trueNegatives = 0;
@@ -2649,28 +2689,9 @@ function buildSuggestions(metrics) {
2649
2689
  return suggestions;
2650
2690
  }
2651
2691
  async function runTriggerTest(skill, options) {
2652
- const rng = createRng(options.seed);
2653
2692
  const queries = options.queries && options.queries.length > 0 ? triggerQueryArraySchema.parse(options.queries) : await generateQueriesWithModel(skill, options.provider, options.model, options.numQueries);
2654
2693
  const skillName = skill.frontmatter.name;
2655
- const preparedQueries = queries.map((testQuery) => {
2656
- const fakeCount = 5 + Math.floor(rng() * 5);
2657
- const fakeSkills = sample(FAKE_SKILLS, fakeCount, rng);
2658
- const allSkills = shuffle([
2659
- ...fakeSkills,
2660
- {
2661
- name: skill.frontmatter.name,
2662
- description: skill.frontmatter.description
2663
- }
2664
- ], rng);
2665
- const skillListText = allSkills.map((entry) => `- ${entry.name}: ${entry.description}`).join("\n");
2666
- return {
2667
- testQuery,
2668
- fakeCount,
2669
- fakeSkills,
2670
- allSkills,
2671
- skillListText
2672
- };
2673
- });
2694
+ const preparedQueries = prepareTriggerQueries(skill, queries, options.seed);
2674
2695
  const systemPrompt = [
2675
2696
  "You are selecting one skill to activate for a user query.",
2676
2697
  "Choose the single best matching skill name from the provided list, or 'none' if no skill is a good fit.",