skilltest 0.6.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,13 +1,13 @@
1
1
  #!/usr/bin/env node
2
2
 
3
3
  // src/index.ts
4
- import fs11 from "node:fs";
5
- import path6 from "node:path";
4
+ import fs12 from "node:fs";
5
+ import path7 from "node:path";
6
6
  import { fileURLToPath } from "node:url";
7
7
  import { Command } from "commander";
8
8
 
9
9
  // src/commands/lint.ts
10
- import fs6 from "node:fs/promises";
10
+ import fs7 from "node:fs/promises";
11
11
  import { z as z6 } from "zod";
12
12
 
13
13
  // src/core/skill-parser.ts
@@ -581,24 +581,6 @@ function runContentChecks(context) {
581
581
  message: "No obvious vague placeholder phrasing found."
582
582
  });
583
583
  }
584
- if (context.frontmatter.rawFrontmatter && /[<>]/.test(context.frontmatter.rawFrontmatter)) {
585
- issues.push({
586
- id: "content.frontmatter-angle-brackets",
587
- checkId: "content:angle-brackets",
588
- title: "Frontmatter Angle Brackets",
589
- status: "warn",
590
- message: "Frontmatter contains angle bracket characters (< or >), which can be misinterpreted in some agents.",
591
- suggestion: "Remove XML-like tags from frontmatter values when possible."
592
- });
593
- } else {
594
- issues.push({
595
- id: "content.frontmatter-angle-brackets",
596
- checkId: "content:angle-brackets",
597
- title: "Frontmatter Angle Brackets",
598
- status: "pass",
599
- message: "No angle bracket tokens detected in frontmatter."
600
- });
601
- }
602
584
  const secretsIssue = buildSecretsIssue(context);
603
585
  if (secretsIssue) {
604
586
  issues.push(secretsIssue);
@@ -951,6 +933,24 @@ function runFrontmatterChecks(context) {
951
933
  message: "license field is present."
952
934
  });
953
935
  }
936
+ if (context.frontmatter.rawFrontmatter && /[<>]/.test(context.frontmatter.rawFrontmatter)) {
937
+ issues.push({
938
+ id: "frontmatter.angle-brackets",
939
+ checkId: "frontmatter:angle-brackets",
940
+ title: "Frontmatter Angle Brackets",
941
+ status: "warn",
942
+ message: "Frontmatter contains angle bracket characters (< or >), which can be misinterpreted in some agents.",
943
+ suggestion: "Remove XML-like tags from frontmatter values when possible."
944
+ });
945
+ } else {
946
+ issues.push({
947
+ id: "frontmatter.angle-brackets",
948
+ checkId: "frontmatter:angle-brackets",
949
+ title: "Frontmatter Angle Brackets",
950
+ status: "pass",
951
+ message: "No angle bracket tokens detected in frontmatter."
952
+ });
953
+ }
954
954
  if (description && description.trim() !== "" && !descriptionLooksActionable(description)) {
955
955
  issues.push({
956
956
  id: "frontmatter.description.triggerability",
@@ -972,6 +972,116 @@ function runFrontmatterChecks(context) {
972
972
  return issues;
973
973
  }
974
974
 
975
+ // src/core/linter/plugin.ts
976
+ import fs4 from "node:fs/promises";
977
+ import path4 from "node:path";
978
+ import { pathToFileURL } from "node:url";
979
+ function normalizeRuleCheckId(checkId) {
980
+ return checkId.includes(":") ? checkId : `plugin:${checkId}`;
981
+ }
982
+ function buildPluginValidationError(filePath, message) {
983
+ return new Error(`Invalid lint plugin at ${filePath}: ${message}`);
984
+ }
985
+ function validatePluginCandidate(candidate, filePath, exportName) {
986
+ if (!candidate || typeof candidate !== "object" || !("rules" in candidate)) {
987
+ throw buildPluginValidationError(filePath, `${exportName} export must be an object with a rules array.`);
988
+ }
989
+ const rules = candidate.rules;
990
+ if (!Array.isArray(rules)) {
991
+ throw buildPluginValidationError(filePath, `${exportName} export must include a rules array.`);
992
+ }
993
+ return {
994
+ rules: rules.map((rule, index) => {
995
+ if (!rule || typeof rule !== "object") {
996
+ throw buildPluginValidationError(filePath, `rule at index ${index} must be an object.`);
997
+ }
998
+ const checkId = rule.checkId;
999
+ if (typeof checkId !== "string" || checkId.trim() === "") {
1000
+ throw buildPluginValidationError(filePath, `rule at index ${index} must have a non-empty string checkId.`);
1001
+ }
1002
+ const title = rule.title;
1003
+ if (typeof title !== "string" || title.trim() === "") {
1004
+ throw buildPluginValidationError(filePath, `rule at index ${index} must have a non-empty string title.`);
1005
+ }
1006
+ const check = rule.check;
1007
+ if (typeof check !== "function") {
1008
+ throw buildPluginValidationError(filePath, `rule '${checkId}' must have a check function.`);
1009
+ }
1010
+ return {
1011
+ checkId: normalizeRuleCheckId(checkId),
1012
+ title,
1013
+ check
1014
+ };
1015
+ })
1016
+ };
1017
+ }
1018
+ async function loadPlugin(filePath) {
1019
+ const absolutePath = path4.resolve(filePath);
1020
+ try {
1021
+ await fs4.access(absolutePath);
1022
+ } catch {
1023
+ throw new Error(`Failed to load lint plugin at ${absolutePath}: file does not exist.`);
1024
+ }
1025
+ let loadedModule;
1026
+ try {
1027
+ loadedModule = await import(pathToFileURL(absolutePath).href);
1028
+ } catch (error) {
1029
+ const message = error instanceof Error ? error.message : String(error);
1030
+ throw new Error(`Failed to load lint plugin at ${absolutePath}: ${message}`);
1031
+ }
1032
+ const validationErrors = [];
1033
+ for (const [exportName, candidate] of [
1034
+ ["default", loadedModule.default],
1035
+ ["plugin", loadedModule.plugin]
1036
+ ]) {
1037
+ if (candidate === void 0) {
1038
+ continue;
1039
+ }
1040
+ try {
1041
+ return validatePluginCandidate(candidate, absolutePath, exportName);
1042
+ } catch (error) {
1043
+ validationErrors.push(error instanceof Error ? error.message : String(error));
1044
+ }
1045
+ }
1046
+ if (validationErrors.length > 0) {
1047
+ throw new Error(validationErrors.join(" "));
1048
+ }
1049
+ throw buildPluginValidationError(
1050
+ absolutePath,
1051
+ "expected a default export or named export 'plugin' containing a rules array."
1052
+ );
1053
+ }
1054
+ function buildRuleExecutionError(rule, error) {
1055
+ const message = error instanceof Error ? error.message : String(error);
1056
+ return {
1057
+ id: `plugin.load-error.${rule.checkId.replace(/[^A-Za-z0-9]+/g, "-").replace(/^-+|-+$/g, "").toLowerCase()}`,
1058
+ checkId: "plugin:load-error",
1059
+ title: "Plugin Rule Error",
1060
+ status: "fail",
1061
+ message: `Plugin rule '${rule.checkId}' failed: ${message}`
1062
+ };
1063
+ }
1064
+ async function runPluginRules(plugin, context) {
1065
+ const issues = [];
1066
+ for (const rule of plugin.rules) {
1067
+ try {
1068
+ const result = await rule.check(context);
1069
+ if (!Array.isArray(result)) {
1070
+ throw new Error("check function must return an array of lint issues.");
1071
+ }
1072
+ issues.push(
1073
+ ...result.map((issue) => ({
1074
+ ...issue,
1075
+ checkId: rule.checkId
1076
+ }))
1077
+ );
1078
+ } catch (error) {
1079
+ issues.push(buildRuleExecutionError(rule, error));
1080
+ }
1081
+ }
1082
+ return issues;
1083
+ }
1084
+
975
1085
  // src/core/linter/security.ts
976
1086
  var DANGEROUS_COMMAND_PATTERNS = [
977
1087
  {
@@ -1179,8 +1289,8 @@ function runSecurityChecks(context) {
1179
1289
  }
1180
1290
 
1181
1291
  // src/core/linter/structure.ts
1182
- import fs4 from "node:fs/promises";
1183
- import path4 from "node:path";
1292
+ import fs5 from "node:fs/promises";
1293
+ import path5 from "node:path";
1184
1294
  function hasTableOfContents(content) {
1185
1295
  if (/^#{1,6}\s+table of contents\b/im.test(content)) {
1186
1296
  return true;
@@ -1221,21 +1331,21 @@ async function runStructureChecks(context) {
1221
1331
  message: `SKILL.md length is ${context.skill.lineCount} lines.`
1222
1332
  });
1223
1333
  }
1224
- const referencesDir = path4.join(context.skill.skillRoot, "references");
1334
+ const referencesDir = path5.join(context.skill.skillRoot, "references");
1225
1335
  if (await pathExists(referencesDir)) {
1226
1336
  const files = await listFilesRecursive(referencesDir);
1227
1337
  let oversizedWithoutToc = 0;
1228
1338
  for (const file of files) {
1229
- const raw = await fs4.readFile(file, "utf8");
1339
+ const raw = await fs5.readFile(file, "utf8");
1230
1340
  const lineCount = raw === "" ? 0 : raw.split(/\r?\n/).length;
1231
1341
  if (lineCount > 300 && !hasTableOfContents(raw)) {
1232
1342
  oversizedWithoutToc += 1;
1233
1343
  issues.push({
1234
- id: `structure.references.toc.${toPosixPath(path4.relative(context.skill.skillRoot, file))}`,
1344
+ id: `structure.references.toc.${toPosixPath(path5.relative(context.skill.skillRoot, file))}`,
1235
1345
  checkId: "structure:toc",
1236
1346
  title: "Reference File Navigation",
1237
1347
  status: "warn",
1238
- message: `${toPosixPath(path4.relative(context.skill.skillRoot, file))} is ${lineCount} lines and has no table of contents.`,
1348
+ message: `${toPosixPath(path5.relative(context.skill.skillRoot, file))} is ${lineCount} lines and has no table of contents.`,
1239
1349
  suggestion: "Add a table of contents for long reference files."
1240
1350
  });
1241
1351
  }
@@ -1265,7 +1375,7 @@ async function runStructureChecks(context) {
1265
1375
  other: []
1266
1376
  };
1267
1377
  for (const reference of references) {
1268
- const resolved = path4.resolve(context.skill.skillRoot, reference);
1378
+ const resolved = path5.resolve(context.skill.skillRoot, reference);
1269
1379
  if (!await pathExists(resolved)) {
1270
1380
  const kind = classifyReferencePath(reference);
1271
1381
  missingByType[kind].push(reference);
@@ -1362,6 +1472,10 @@ async function runLinter(inputPath, options = {}) {
1362
1472
  issues.push(...runSecurityChecks(context));
1363
1473
  issues.push(...await runDisclosureChecks(context));
1364
1474
  issues.push(...runCompatibilityChecks(context));
1475
+ for (const pluginPath of options.plugins ?? []) {
1476
+ const plugin = await loadPlugin(pluginPath);
1477
+ issues.push(...await runPluginRules(plugin, context));
1478
+ }
1365
1479
  const filteredIssues = issues.filter((issue) => !suppressedCheckIds.has(issue.checkId));
1366
1480
  return {
1367
1481
  target: inputPath,
@@ -1525,10 +1639,10 @@ function renderLintIssueList(report) {
1525
1639
  const info = skippedSecurityPatterns > 0 ? `<p class="info-line">Skipped security patterns in examples/comments: ${escapeHtml(skippedSecurityPatterns)}</p>` : "";
1526
1640
  return `<div class="row-list">${rows}</div>${info}`;
1527
1641
  }
1528
- function renderTriggerCaseRow(testCase) {
1642
+ function renderTriggerCaseRow(testCase, showSelectedCompetitor) {
1529
1643
  const details = testCase.rawModelResponse ? renderDetails("Model response", renderPreBlock(testCase.rawModelResponse)) : "";
1530
1644
  return `
1531
- <div class="row">
1645
+ <div class="row${testCase.selectedCompetitor ? " competitor-selected" : ""}">
1532
1646
  <div class="row-header">
1533
1647
  <div>
1534
1648
  <div class="row-title">${escapeHtml(testCase.query)}</div>
@@ -1540,12 +1654,29 @@ function renderTriggerCaseRow(testCase) {
1540
1654
  </div>
1541
1655
  ${renderDefinitionList([
1542
1656
  { label: "Expected", value: testCase.expected },
1543
- { label: "Actual", value: testCase.actual }
1657
+ { label: "Actual", value: testCase.actual },
1658
+ ...showSelectedCompetitor ? [{ label: "Selected competitor", value: testCase.selectedCompetitor ?? "none" }] : []
1544
1659
  ])}
1545
1660
  ${details}
1546
1661
  </div>
1547
1662
  `;
1548
1663
  }
1664
+ function renderCompetitorSkillsSection(result) {
1665
+ if (!result.competitors || result.competitors.length === 0) {
1666
+ return "";
1667
+ }
1668
+ return renderSectionCard(
1669
+ "Competitor Skills",
1670
+ `<div class="row-list">${result.competitors.map(
1671
+ (competitor) => renderMessageRow(
1672
+ "warn",
1673
+ competitor.name,
1674
+ competitor.description,
1675
+ renderDefinitionList([{ label: "Source", value: competitor.sourcePath }])
1676
+ )
1677
+ ).join("")}</div>`
1678
+ );
1679
+ }
1549
1680
  function promptStatus(promptResult) {
1550
1681
  if (promptResult.totalAssertions === 0) {
1551
1682
  return "skip";
@@ -1638,6 +1769,7 @@ function renderHtmlDocument(title, body) {
1638
1769
  --pass: #22c55e;
1639
1770
  --warn: #eab308;
1640
1771
  --fail: #ef4444;
1772
+ --competitor: #f97316;
1641
1773
  --skip: #6b7280;
1642
1774
  --shadow: 0 10px 30px rgba(15, 23, 42, 0.08);
1643
1775
  }
@@ -1786,6 +1918,11 @@ function renderHtmlDocument(title, body) {
1786
1918
  background: var(--surface-muted);
1787
1919
  }
1788
1920
 
1921
+ .row.competitor-selected {
1922
+ border-color: rgba(249, 115, 22, 0.45);
1923
+ background: rgba(249, 115, 22, 0.08);
1924
+ }
1925
+
1789
1926
  .row-header {
1790
1927
  display: flex;
1791
1928
  justify-content: space-between;
@@ -1965,6 +2102,7 @@ function renderTriggerHtml(result) {
1965
2102
  const target = resolveOptionalTarget(htmlResult, result.skillName);
1966
2103
  const matchedCount = result.cases.filter((testCase) => testCase.matched).length;
1967
2104
  const matchRate = result.cases.length === 0 ? 0 : matchedCount / result.cases.length;
2105
+ const hasCompetitors = Boolean(result.competitors && result.competitors.length > 0);
1968
2106
  const body = [
1969
2107
  renderHeaderCard(
1970
2108
  "trigger",
@@ -1980,10 +2118,15 @@ function renderTriggerHtml(result) {
1980
2118
  { label: "Provider", value: result.provider },
1981
2119
  { label: "Model", value: result.model },
1982
2120
  { label: "Seed", value: result.seed !== void 0 ? String(result.seed) : "none" },
2121
+ ...hasCompetitors ? [{ label: "Competitors", value: String(result.competitors?.length ?? 0) }] : [],
1983
2122
  { label: "Queries", value: String(result.queries.length) }
1984
2123
  ]
1985
2124
  ),
1986
- renderSectionCard("Trigger Cases", `<div class="row-list">${result.cases.map((testCase) => renderTriggerCaseRow(testCase)).join("")}</div>`),
2125
+ renderCompetitorSkillsSection(result),
2126
+ renderSectionCard(
2127
+ "Trigger Cases",
2128
+ `<div class="row-list">${result.cases.map((testCase) => renderTriggerCaseRow(testCase, hasCompetitors)).join("")}</div>`
2129
+ ),
1987
2130
  renderSectionCard(
1988
2131
  "Suggestions",
1989
2132
  `<ul>${result.suggestions.map((suggestion) => `<li>${escapeHtml(suggestion)}</li>`).join("")}</ul>`
@@ -2023,7 +2166,8 @@ function renderEvalHtml(result) {
2023
2166
  }
2024
2167
  function renderCheckHtml(result) {
2025
2168
  const skillName = result.trigger?.skillName ?? result.eval?.skillName ?? result.target;
2026
- const triggerBody = result.trigger ? `<div class="row-list">${result.trigger.cases.map((testCase) => renderTriggerCaseRow(testCase)).join("")}</div>
2169
+ const triggerBody = result.trigger ? `${renderCompetitorSkillsSection(result.trigger)}
2170
+ <div class="row-list">${result.trigger.cases.map((testCase) => renderTriggerCaseRow(testCase, Boolean(result.trigger?.competitors?.length))).join("")}</div>
2027
2171
  <div class="card" style="margin-top: 16px;">
2028
2172
  <h2>Trigger Suggestions</h2>
2029
2173
  <ul>${result.trigger.suggestions.map((suggestion) => `<li>${escapeHtml(suggestion)}</li>`).join("")}</ul>
@@ -2123,46 +2267,47 @@ function countSkippedSecurityPatterns2(issues) {
2123
2267
  return total + (issue.skippedPatterns?.length ?? 0);
2124
2268
  }, 0);
2125
2269
  }
2270
+ function formatPercent2(value) {
2271
+ return `${(value * 100).toFixed(1)}%`;
2272
+ }
2126
2273
  function renderLintReport(report, enableColor) {
2127
2274
  const c = getChalkInstance(enableColor);
2128
2275
  const { passed, warnings, failures, total } = report.summary;
2129
2276
  const headerLines = [
2130
- `\u250C\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510`,
2131
- `\u2502 skilltest lint \u2502`,
2132
- `\u251C\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524`,
2133
- `\u2502 target: ${report.target}`,
2134
- `\u2502 summary: ${passed}/${total} checks passed, ${warnings} warnings, ${failures} failures`,
2135
- `\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518`
2277
+ "skilltest lint",
2278
+ `target: ${report.target}`,
2279
+ `summary: ${passed}/${total} checks passed, ${warnings} warnings, ${failures} failures`
2136
2280
  ];
2137
2281
  const renderedIssues = report.issues.map((issue) => renderIssueLine(issue, c)).join("\n");
2138
2282
  const skippedSecurityPatterns = countSkippedSecurityPatterns2(report.issues);
2139
2283
  const infoLine = skippedSecurityPatterns > 0 ? `
2140
- ${c.cyan("\u2139")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)` : "";
2284
+ ${c.cyan("INFO")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)` : "";
2141
2285
  return `${headerLines.join("\n")}
2142
2286
  ${renderedIssues}${infoLine}`;
2143
2287
  }
2144
- function formatPercent2(value) {
2145
- return `${(value * 100).toFixed(1)}%`;
2146
- }
2147
2288
  function renderTriggerReport(result, enableColor, verbose) {
2148
2289
  const c = getChalkInstance(enableColor);
2149
- const lines = [];
2150
- lines.push("\u250C\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510");
2151
- lines.push("\u2502 skilltest trigger \u2502");
2152
- lines.push("\u251C\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524");
2153
- lines.push(`\u2502 skill: ${result.skillName}`);
2154
- lines.push(`\u2502 provider/model: ${result.provider}/${result.model}`);
2290
+ const lines = [
2291
+ "skilltest trigger",
2292
+ `skill: ${result.skillName}`,
2293
+ `provider/model: ${result.provider}/${result.model}`
2294
+ ];
2295
+ if (result.competitors && result.competitors.length > 0) {
2296
+ lines.push(`competitors: ${result.competitors.map((competitor) => competitor.name).join(", ")}`);
2297
+ }
2155
2298
  lines.push(
2156
- `\u2502 precision: ${formatPercent2(result.metrics.precision)} recall: ${formatPercent2(result.metrics.recall)} f1: ${formatPercent2(result.metrics.f1)}`
2299
+ `precision: ${formatPercent2(result.metrics.precision)} recall: ${formatPercent2(result.metrics.recall)} f1: ${formatPercent2(result.metrics.f1)}`
2157
2300
  );
2158
2301
  lines.push(
2159
- `\u2502 TP ${result.metrics.truePositives} TN ${result.metrics.trueNegatives} FP ${result.metrics.falsePositives} FN ${result.metrics.falseNegatives}`
2302
+ `TP ${result.metrics.truePositives} TN ${result.metrics.trueNegatives} FP ${result.metrics.falsePositives} FN ${result.metrics.falseNegatives}`
2160
2303
  );
2161
- lines.push("\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518");
2162
2304
  for (const [index, testCase] of result.cases.entries()) {
2163
2305
  const status = testCase.matched ? c.green("PASS") : c.red("FAIL");
2164
2306
  lines.push(`${index + 1}. ${status} query: ${testCase.query}`);
2165
2307
  lines.push(` expected: ${testCase.expected} | actual: ${testCase.actual}`);
2308
+ if (verbose && testCase.selectedCompetitor) {
2309
+ lines.push(` competitor selected: ${testCase.selectedCompetitor}`);
2310
+ }
2166
2311
  if (verbose && testCase.rawModelResponse) {
2167
2312
  lines.push(` model: ${testCase.rawModelResponse.replace(/\s+/g, " ").trim()}`);
2168
2313
  }
@@ -2175,15 +2320,13 @@ function renderTriggerReport(result, enableColor, verbose) {
2175
2320
  }
2176
2321
  function renderEvalReport(result, enableColor, verbose) {
2177
2322
  const c = getChalkInstance(enableColor);
2178
- const lines = [];
2179
- lines.push("\u250C\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510");
2180
- lines.push("\u2502 skilltest eval \u2502");
2181
- lines.push("\u251C\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524");
2182
- lines.push(`\u2502 skill: ${result.skillName}`);
2183
- lines.push(`\u2502 provider/model: ${result.provider}/${result.model}`);
2184
- lines.push(`\u2502 grader model: ${result.graderModel}`);
2185
- lines.push(`\u2502 assertions passed: ${result.summary.passedAssertions}/${result.summary.totalAssertions}`);
2186
- lines.push("\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518");
2323
+ const lines = [
2324
+ "skilltest eval",
2325
+ `skill: ${result.skillName}`,
2326
+ `provider/model: ${result.provider}/${result.model}`,
2327
+ `grader model: ${result.graderModel}`,
2328
+ `assertions passed: ${result.summary.passedAssertions}/${result.summary.totalAssertions}`
2329
+ ];
2187
2330
  for (const [index, promptResult] of result.results.entries()) {
2188
2331
  lines.push(`${index + 1}. prompt: ${promptResult.prompt}`);
2189
2332
  lines.push(` response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`);
@@ -2229,7 +2372,7 @@ function renderCheckReport(result, enableColor, verbose) {
2229
2372
  }
2230
2373
  const skippedSecurityPatterns = countSkippedSecurityPatterns2(result.lint.issues);
2231
2374
  if (skippedSecurityPatterns > 0) {
2232
- lines.push(` ${c.cyan("\u2139")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)`);
2375
+ lines.push(` ${c.cyan("INFO")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)`);
2233
2376
  }
2234
2377
  lines.push("");
2235
2378
  lines.push("Trigger");
@@ -2240,11 +2383,17 @@ function renderCheckReport(result, enableColor, verbose) {
2240
2383
  lines.push(
2241
2384
  ` TP ${result.trigger.metrics.truePositives} TN ${result.trigger.metrics.trueNegatives} FP ${result.trigger.metrics.falsePositives} FN ${result.trigger.metrics.falseNegatives}`
2242
2385
  );
2386
+ if (result.trigger.competitors && result.trigger.competitors.length > 0) {
2387
+ lines.push(` competitors: ${result.trigger.competitors.map((competitor) => competitor.name).join(", ")}`);
2388
+ }
2243
2389
  const triggerCases = verbose ? result.trigger.cases : result.trigger.cases.filter((testCase) => !testCase.matched);
2244
2390
  for (const testCase of triggerCases) {
2245
2391
  const status = testCase.matched ? c.green("PASS") : c.red("FAIL");
2246
2392
  lines.push(` - ${status} ${testCase.query}`);
2247
2393
  lines.push(` expected=${testCase.expected} actual=${testCase.actual}`);
2394
+ if (testCase.selectedCompetitor) {
2395
+ lines.push(` competitor selected=${testCase.selectedCompetitor}`);
2396
+ }
2248
2397
  }
2249
2398
  } else {
2250
2399
  lines.push(`- ${triggerGate} ${result.triggerSkippedReason ?? "Skipped."}`);
@@ -2286,7 +2435,7 @@ function renderCheckReport(result, enableColor, verbose) {
2286
2435
  }
2287
2436
 
2288
2437
  // src/commands/common.ts
2289
- import fs5 from "node:fs/promises";
2438
+ import fs6 from "node:fs/promises";
2290
2439
  import { z as z5 } from "zod";
2291
2440
 
2292
2441
  // src/core/eval-runner.ts
@@ -2314,12 +2463,13 @@ function extractJsonObject(raw) {
2314
2463
  }
2315
2464
  throw new Error("Grader did not return a JSON object.");
2316
2465
  }
2317
- async function gradeResponse(options) {
2318
- const assertionList = options.assertions && options.assertions.length > 0 ? options.assertions : [
2319
- "The response follows the skill instructions faithfully.",
2320
- "The response is well-structured and actionable.",
2321
- "The response addresses the user prompt directly."
2322
- ];
2466
+ var DEFAULT_ASSERTIONS = [
2467
+ "The response follows the skill instructions faithfully.",
2468
+ "The response is well-structured and actionable.",
2469
+ "The response addresses the user prompt directly."
2470
+ ];
2471
+ function buildGraderPrompts(options) {
2472
+ const assertions = options.assertions && options.assertions.length > 0 ? options.assertions : DEFAULT_ASSERTIONS;
2323
2473
  const systemPrompt = [
2324
2474
  "You are a strict evaluator for agent skill outputs.",
2325
2475
  "Assess each assertion and return JSON only.",
@@ -2336,15 +2486,26 @@ async function gradeResponse(options) {
2336
2486
  options.modelResponse,
2337
2487
  "",
2338
2488
  "Assertions to evaluate:",
2339
- assertionList.map((assertion, index) => `${index + 1}. ${assertion}`).join("\n")
2489
+ assertions.map((assertion, index) => `${index + 1}. ${assertion}`).join("\n")
2340
2490
  ].join("\n");
2341
- const raw = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
2491
+ return {
2492
+ assertions,
2493
+ systemPrompt,
2494
+ userPrompt
2495
+ };
2496
+ }
2497
+ function parseGraderOutput(raw) {
2342
2498
  const parsed = graderOutputSchema.safeParse(extractJsonObject(raw));
2343
2499
  if (!parsed.success) {
2344
2500
  throw new Error(`Failed to parse grader output: ${parsed.error.issues[0]?.message ?? "invalid grader JSON"}`);
2345
2501
  }
2346
2502
  return parsed.data.assertions;
2347
2503
  }
2504
+ async function gradeResponse(options) {
2505
+ const prompts = buildGraderPrompts(options);
2506
+ const raw = await options.provider.sendMessage(prompts.systemPrompt, prompts.userPrompt, { model: options.model });
2507
+ return parseGraderOutput(raw);
2508
+ }
2348
2509
 
2349
2510
  // src/utils/concurrency.ts
2350
2511
  async function pMap(items, fn, concurrency) {
@@ -2499,6 +2660,7 @@ var triggerQuerySchema = z4.object({
2499
2660
  should_trigger: z4.boolean()
2500
2661
  });
2501
2662
  var triggerQueryArraySchema = z4.array(triggerQuerySchema);
2663
+ var triggerNumQueriesSchema = z4.number().int().min(2).refine((value) => value % 2 === 0, "numQueries must be an even number.");
2502
2664
  var FAKE_SKILLS = [
2503
2665
  { name: "code-review", description: "Reviews code changes for bugs, regressions, and maintainability issues." },
2504
2666
  { name: "api-tester", description: "Designs and runs REST API tests, validating status codes and response shapes." },
@@ -2539,6 +2701,9 @@ function shuffle(values, rng) {
2539
2701
  function sample(values, count, rng) {
2540
2702
  return shuffle(values, rng).slice(0, Math.max(0, Math.min(count, values.length)));
2541
2703
  }
2704
+ function validateNumQueries(numQueries) {
2705
+ return triggerNumQueriesSchema.parse(numQueries);
2706
+ }
2542
2707
  function parseJsonArrayFromModelOutput(raw) {
2543
2708
  const trimmed = raw.trim();
2544
2709
  if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
@@ -2552,7 +2717,8 @@ function parseJsonArrayFromModelOutput(raw) {
2552
2717
  }
2553
2718
  throw new Error("Model did not return a JSON array.");
2554
2719
  }
2555
- async function generateQueriesWithModel(skill, provider, model, numQueries) {
2720
+ async function generateQueriesWithModel(skill, provider, model, numQueries, competitors) {
2721
+ validateNumQueries(numQueries);
2556
2722
  const shouldTriggerCount = Math.floor(numQueries / 2);
2557
2723
  const shouldNotTriggerCount = numQueries - shouldTriggerCount;
2558
2724
  const systemPrompt = [
@@ -2564,6 +2730,15 @@ async function generateQueriesWithModel(skill, provider, model, numQueries) {
2564
2730
  const userPrompt = [
2565
2731
  `Skill name: ${skill.frontmatter.name}`,
2566
2732
  `Skill description: ${skill.frontmatter.description}`,
2733
+ ...competitors && competitors.length > 0 ? [
2734
+ "",
2735
+ "Competitor skills in the same domain:",
2736
+ ...competitors.map((competitor) => `- ${competitor.name}: ${competitor.description}`),
2737
+ "",
2738
+ "Generate queries that test whether the target skill triggers correctly even when these similar skills exist.",
2739
+ "Positive queries should clearly belong to the target skill, not the competitors.",
2740
+ "Negative queries should belong to a competitor or to no skill at all."
2741
+ ] : [],
2567
2742
  `Generate ${numQueries} prompts total.`,
2568
2743
  `Exactly ${shouldTriggerCount} should have should_trigger=true.`,
2569
2744
  `Exactly ${shouldNotTriggerCount} should have should_trigger=false.`,
@@ -2597,6 +2772,46 @@ function parseDecision(rawResponse, skillNames) {
2597
2772
  }
2598
2773
  return "unrecognized";
2599
2774
  }
2775
+ function prepareTriggerQueries(skill, queries, seed, competitors) {
2776
+ const rng = createRng(seed);
2777
+ const competitorCandidates = (competitors ?? []).map((competitor) => ({
2778
+ name: competitor.name,
2779
+ description: competitor.description
2780
+ }));
2781
+ return queries.map((testQuery) => {
2782
+ const usingCompetitors = competitorCandidates.length > 0;
2783
+ const fakeCount = usingCompetitors ? testQuery.should_trigger ? 2 + Math.floor(rng() * 3) : 3 + Math.floor(rng() * 3) : 5 + Math.floor(rng() * 5);
2784
+ const fakeSkills = sample(FAKE_SKILLS, fakeCount, rng);
2785
+ const allSkills = usingCompetitors ? shuffle(
2786
+ [
2787
+ ...competitorCandidates,
2788
+ ...fakeSkills,
2789
+ ...testQuery.should_trigger ? [
2790
+ {
2791
+ name: skill.frontmatter.name,
2792
+ description: skill.frontmatter.description
2793
+ }
2794
+ ] : []
2795
+ ],
2796
+ rng
2797
+ ) : shuffle(
2798
+ [
2799
+ ...fakeSkills,
2800
+ {
2801
+ name: skill.frontmatter.name,
2802
+ description: skill.frontmatter.description
2803
+ }
2804
+ ],
2805
+ rng
2806
+ );
2807
+ return {
2808
+ testQuery,
2809
+ fakeSkills,
2810
+ allSkills,
2811
+ skillListText: allSkills.map((entry) => `- ${entry.name}: ${entry.description}`).join("\n")
2812
+ };
2813
+ });
2814
+ }
2600
2815
  function calculateMetrics(skillName, cases) {
2601
2816
  let truePositives = 0;
2602
2817
  let trueNegatives = 0;
@@ -2633,44 +2848,82 @@ function calculateMetrics(skillName, cases) {
2633
2848
  f1
2634
2849
  };
2635
2850
  }
2636
- function buildSuggestions(metrics) {
2851
+ function assertCompetitorNamesDistinct(skillName, competitors) {
2852
+ for (const competitor of competitors) {
2853
+ if (competitor.name === skillName) {
2854
+ throw new Error(`Competitor skill '${competitor.name}' has the same name as the skill under test.`);
2855
+ }
2856
+ }
2857
+ }
2858
+ function buildTriggerCaseResult(options) {
2859
+ const expected = options.testQuery.should_trigger ? options.skillName : "none";
2860
+ const matched = options.testQuery.should_trigger ? options.decision === options.skillName : options.decision !== options.skillName;
2861
+ const selectedCompetitor = options.competitorNames?.includes(options.decision) ? options.decision : void 0;
2862
+ return {
2863
+ query: options.testQuery.query,
2864
+ shouldTrigger: options.testQuery.should_trigger,
2865
+ expected,
2866
+ actual: options.decision,
2867
+ matched,
2868
+ selectedCompetitor,
2869
+ rawModelResponse: options.rawModelResponse
2870
+ };
2871
+ }
2872
+ function buildSuggestions(skillName, metrics, cases, competitors) {
2637
2873
  const suggestions = [];
2638
2874
  if (metrics.falseNegatives > 0) {
2639
2875
  suggestions.push(
2640
2876
  "False negatives found: clarify capability keywords and add explicit 'use when ...' phrasing in description."
2641
2877
  );
2878
+ if (competitors && competitors.length > 0) {
2879
+ const competitorCounts = /* @__PURE__ */ new Map();
2880
+ for (const testCase of cases) {
2881
+ if (!testCase.shouldTrigger || testCase.actual === skillName || !testCase.selectedCompetitor) {
2882
+ continue;
2883
+ }
2884
+ competitorCounts.set(testCase.selectedCompetitor, (competitorCounts.get(testCase.selectedCompetitor) ?? 0) + 1);
2885
+ }
2886
+ for (const [competitorName, count] of competitorCounts.entries()) {
2887
+ suggestions.push(
2888
+ `Skill '${competitorName}' was selected instead of '${skillName}' for ${count} quer${count === 1 ? "y" : "ies"}. Differentiate your description from '${competitorName}'.`
2889
+ );
2890
+ }
2891
+ }
2642
2892
  }
2643
2893
  if (metrics.falsePositives > 0) {
2644
2894
  suggestions.push("False positives found: narrow scope boundaries and add explicit non-goals in description.");
2895
+ if (competitors && competitors.length > 0) {
2896
+ suggestions.push(
2897
+ `With competitor skills present, ${metrics.falsePositives} negative quer${metrics.falsePositives === 1 ? "y still" : "ies still"} triggered '${skillName}'. Narrow your description's scope boundaries.`
2898
+ );
2899
+ }
2645
2900
  }
2646
2901
  if (suggestions.length === 0) {
2647
2902
  suggestions.push("Trigger behavior looks clean on this sample. Keep monitoring with domain-specific custom queries.");
2648
2903
  }
2649
2904
  return suggestions;
2650
2905
  }
2906
+ async function loadCompetitorSkills(comparePaths) {
2907
+ const competitors = [];
2908
+ for (const comparePath of comparePaths) {
2909
+ const parsed = await parseSkillStrict(comparePath);
2910
+ competitors.push({
2911
+ name: parsed.frontmatter.name,
2912
+ description: parsed.frontmatter.description,
2913
+ sourcePath: comparePath
2914
+ });
2915
+ }
2916
+ return competitors;
2917
+ }
2651
2918
  async function runTriggerTest(skill, options) {
2652
- const rng = createRng(options.seed);
2653
- const queries = options.queries && options.queries.length > 0 ? triggerQueryArraySchema.parse(options.queries) : await generateQueriesWithModel(skill, options.provider, options.model, options.numQueries);
2919
+ const competitors = options.compare && options.compare.length > 0 ? await loadCompetitorSkills(options.compare) : void 0;
2920
+ if (competitors && competitors.length > 0) {
2921
+ assertCompetitorNamesDistinct(skill.frontmatter.name, competitors);
2922
+ }
2923
+ const queries = options.queries && options.queries.length > 0 ? triggerQueryArraySchema.parse(options.queries) : await generateQueriesWithModel(skill, options.provider, options.model, options.numQueries, competitors);
2654
2924
  const skillName = skill.frontmatter.name;
2655
- const preparedQueries = queries.map((testQuery) => {
2656
- const fakeCount = 5 + Math.floor(rng() * 5);
2657
- const fakeSkills = sample(FAKE_SKILLS, fakeCount, rng);
2658
- const allSkills = shuffle([
2659
- ...fakeSkills,
2660
- {
2661
- name: skill.frontmatter.name,
2662
- description: skill.frontmatter.description
2663
- }
2664
- ], rng);
2665
- const skillListText = allSkills.map((entry) => `- ${entry.name}: ${entry.description}`).join("\n");
2666
- return {
2667
- testQuery,
2668
- fakeCount,
2669
- fakeSkills,
2670
- allSkills,
2671
- skillListText
2672
- };
2673
- });
2925
+ const preparedQueries = prepareTriggerQueries(skill, queries, options.seed, competitors);
2926
+ const competitorNames = competitors?.map((competitor) => competitor.name) ?? [];
2674
2927
  const systemPrompt = [
2675
2928
  "You are selecting one skill to activate for a user query.",
2676
2929
  "Choose the single best matching skill name from the provided list, or 'none' if no skill is a good fit.",
@@ -2683,18 +2936,15 @@ async function runTriggerTest(skill, options) {
2683
2936
  const rawResponse = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
2684
2937
  const decision = parseDecision(
2685
2938
  rawResponse,
2686
- allSkills.map((entry) => entry.name)
2939
+ Array.from(/* @__PURE__ */ new Set([skillName, ...allSkills.map((entry) => entry.name)]))
2687
2940
  );
2688
- const expected = testQuery.should_trigger ? skillName : "none";
2689
- const matched = testQuery.should_trigger ? decision === skillName : decision !== skillName;
2690
- return {
2691
- query: testQuery.query,
2692
- shouldTrigger: testQuery.should_trigger,
2693
- expected,
2694
- actual: decision,
2695
- matched,
2941
+ return buildTriggerCaseResult({
2942
+ testQuery,
2943
+ skillName,
2944
+ decision,
2945
+ competitorNames,
2696
2946
  rawModelResponse: options.verbose ? rawResponse : void 0
2697
- };
2947
+ });
2698
2948
  },
2699
2949
  options.concurrency ?? 5
2700
2950
  );
@@ -2704,10 +2954,11 @@ async function runTriggerTest(skill, options) {
2704
2954
  model: options.model,
2705
2955
  provider: options.provider.name,
2706
2956
  seed: options.seed,
2957
+ competitors,
2707
2958
  queries,
2708
2959
  cases: results,
2709
2960
  metrics,
2710
- suggestions: buildSuggestions(metrics)
2961
+ suggestions: buildSuggestions(skillName, metrics, results, competitors)
2711
2962
  };
2712
2963
  }
2713
2964
 
@@ -2822,10 +3073,10 @@ async function loadConfiguredEvalPrompts(command) {
2822
3073
  if (!promptFile && assertionsFile) {
2823
3074
  throw new Error("Config field eval.assertionsFile requires eval.promptFile.");
2824
3075
  }
2825
- const promptRaw = await fs5.readFile(promptFile, "utf8");
3076
+ const promptRaw = await fs6.readFile(promptFile, "utf8");
2826
3077
  let prompts = normalizeEvalPrompts(parseJsonIfPossible(promptRaw), promptFile);
2827
3078
  if (assertionsFile) {
2828
- const assertionsRaw = await fs5.readFile(assertionsFile, "utf8");
3079
+ const assertionsRaw = await fs6.readFile(assertionsFile, "utf8");
2829
3080
  const assertions = normalizeAssertions(parseJsonIfPossible(assertionsRaw), assertionsFile);
2830
3081
  prompts = prompts.map((prompt) => ({
2831
3082
  prompt: prompt.prompt,
@@ -2864,18 +3115,22 @@ function writeError(error, asJson) {
2864
3115
 
2865
3116
  // src/commands/lint.ts
2866
3117
  var lintCliSchema = z6.object({
2867
- html: z6.string().optional()
3118
+ html: z6.string().optional(),
3119
+ plugin: z6.array(z6.string().min(1)).optional()
2868
3120
  });
3121
+ function collectPluginPaths(value, previous = []) {
3122
+ return [...previous, value];
3123
+ }
2869
3124
  async function handleLintCommand(targetPath, options) {
2870
3125
  try {
2871
- const report = await runLinter(targetPath, { suppress: options.suppress });
3126
+ const report = await runLinter(targetPath, { suppress: options.suppress, plugins: options.plugins });
2872
3127
  if (options.json) {
2873
3128
  writeResult(report, true);
2874
3129
  } else {
2875
3130
  writeResult(renderLintReport(report, options.color), false);
2876
3131
  }
2877
3132
  if (options.html) {
2878
- await fs6.writeFile(options.html, renderLintHtml(report), "utf8");
3133
+ await fs7.writeFile(options.html, renderLintHtml(report), "utf8");
2879
3134
  }
2880
3135
  if (lintFails(report, options.failOn)) {
2881
3136
  process.exitCode = 1;
@@ -2886,7 +3141,7 @@ async function handleLintCommand(targetPath, options) {
2886
3141
  }
2887
3142
  }
2888
3143
  function registerLintCommand(program) {
2889
- program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--html <path>", "Write an HTML report to the given file path").action(async (targetPath, _commandOptions, command) => {
3144
+ program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--html <path>", "Write an HTML report to the given file path").option("--plugin <path>", "Load a custom lint plugin file", collectPluginPaths, []).action(async (targetPath, _commandOptions, command) => {
2890
3145
  const globalOptions = getGlobalCliOptions(command);
2891
3146
  const config = getResolvedConfig(command);
2892
3147
  const parsedCli = lintCliSchema.safeParse(command.opts());
@@ -2899,30 +3154,33 @@ function registerLintCommand(program) {
2899
3154
  ...globalOptions,
2900
3155
  failOn: config.lint.failOn,
2901
3156
  suppress: config.lint.suppress,
3157
+ plugins: config.lint.plugins,
2902
3158
  html: parsedCli.data.html
2903
3159
  });
2904
3160
  });
2905
3161
  }
2906
3162
 
2907
3163
  // src/commands/trigger.ts
2908
- import fs8 from "node:fs/promises";
3164
+ import fs9 from "node:fs/promises";
2909
3165
  import ora from "ora";
2910
3166
  import { z as z8 } from "zod";
2911
3167
 
2912
3168
  // src/utils/config.ts
2913
- import fs7 from "node:fs/promises";
2914
- import path5 from "node:path";
3169
+ import fs8 from "node:fs/promises";
3170
+ import path6 from "node:path";
2915
3171
  import { z as z7 } from "zod";
2916
3172
  var providerNameSchema = z7.enum(["anthropic", "openai"]);
2917
3173
  var lintFailOnSchema = z7.enum(["error", "warn"]);
2918
3174
  var lintConfigSchema = z7.object({
2919
3175
  failOn: lintFailOnSchema.optional(),
2920
- suppress: z7.array(z7.string().min(1)).optional()
3176
+ suppress: z7.array(z7.string().min(1)).optional(),
3177
+ plugins: z7.array(z7.string().min(1)).optional()
2921
3178
  }).strict();
2922
3179
  var triggerConfigSchema = z7.object({
2923
3180
  numQueries: z7.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
2924
3181
  threshold: z7.number().min(0).max(1).optional(),
2925
- seed: z7.number().int().optional()
3182
+ seed: z7.number().int().optional(),
3183
+ compare: z7.array(z7.string().min(1)).optional()
2926
3184
  }).strict().partial();
2927
3185
  var evalConfigSchema = z7.object({
2928
3186
  numRuns: z7.number().int().min(1).optional(),
@@ -2946,12 +3204,14 @@ var resolvedSkilltestConfigSchema = z7.object({
2946
3204
  concurrency: z7.number().int().min(1),
2947
3205
  lint: z7.object({
2948
3206
  failOn: lintFailOnSchema,
2949
- suppress: z7.array(z7.string().min(1))
3207
+ suppress: z7.array(z7.string().min(1)),
3208
+ plugins: z7.array(z7.string().min(1))
2950
3209
  }),
2951
3210
  trigger: z7.object({
2952
3211
  numQueries: z7.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
2953
3212
  threshold: z7.number().min(0).max(1),
2954
- seed: z7.number().int().optional()
3213
+ seed: z7.number().int().optional(),
3214
+ compare: z7.array(z7.string().min(1))
2955
3215
  }),
2956
3216
  eval: z7.object({
2957
3217
  numRuns: z7.number().int().min(1),
@@ -2967,11 +3227,13 @@ var DEFAULT_SKILLTEST_CONFIG = {
2967
3227
  concurrency: 5,
2968
3228
  lint: {
2969
3229
  failOn: "error",
2970
- suppress: []
3230
+ suppress: [],
3231
+ plugins: []
2971
3232
  },
2972
3233
  trigger: {
2973
3234
  numQueries: 20,
2974
- threshold: 0.8
3235
+ threshold: 0.8,
3236
+ compare: []
2975
3237
  },
2976
3238
  eval: {
2977
3239
  numRuns: 5,
@@ -2993,7 +3255,7 @@ function buildConfigValidationError(error, sourceLabel) {
2993
3255
  async function readJsonObject(filePath, label) {
2994
3256
  let raw;
2995
3257
  try {
2996
- raw = await fs7.readFile(filePath, "utf8");
3258
+ raw = await fs8.readFile(filePath, "utf8");
2997
3259
  } catch (error) {
2998
3260
  const message = error instanceof Error ? error.message : String(error);
2999
3261
  throw new Error(`Failed to read ${label}: ${message}`);
@@ -3017,13 +3279,13 @@ async function loadConfigFromJsonFile(filePath) {
3017
3279
  return {
3018
3280
  configFile: parsed.data,
3019
3281
  sourcePath: filePath,
3020
- sourceDirectory: path5.dirname(filePath)
3282
+ sourceDirectory: path6.dirname(filePath)
3021
3283
  };
3022
3284
  }
3023
3285
  async function loadConfigFromNearestPackageJson(startDirectory) {
3024
- let currentDirectory = path5.resolve(startDirectory);
3286
+ let currentDirectory = path6.resolve(startDirectory);
3025
3287
  while (true) {
3026
- const packageJsonPath = path5.join(currentDirectory, "package.json");
3288
+ const packageJsonPath = path6.join(currentDirectory, "package.json");
3027
3289
  if (await pathExists(packageJsonPath)) {
3028
3290
  const raw = await readJsonObject(packageJsonPath, packageJsonPath);
3029
3291
  const packageJsonSchema = z7.object({
@@ -3042,7 +3304,7 @@ async function loadConfigFromNearestPackageJson(startDirectory) {
3042
3304
  sourceDirectory: currentDirectory
3043
3305
  };
3044
3306
  }
3045
- const parentDirectory = path5.dirname(currentDirectory);
3307
+ const parentDirectory = path6.dirname(currentDirectory);
3046
3308
  if (parentDirectory === currentDirectory) {
3047
3309
  return null;
3048
3310
  }
@@ -3055,7 +3317,7 @@ async function resolveSkillDirectoryConfig(targetPath) {
3055
3317
  }
3056
3318
  try {
3057
3319
  const { skillRoot } = await resolveSkillPath(targetPath);
3058
- return loadConfigFromJsonFile(path5.join(skillRoot, ".skilltestrc"));
3320
+ return loadConfigFromJsonFile(path6.join(skillRoot, ".skilltestrc"));
3059
3321
  } catch {
3060
3322
  return null;
3061
3323
  }
@@ -3064,7 +3326,13 @@ function resolveConfigRelativePath(baseDirectory, value) {
3064
3326
  if (!value) {
3065
3327
  return void 0;
3066
3328
  }
3067
- return path5.resolve(baseDirectory, value);
3329
+ return path6.resolve(baseDirectory, value);
3330
+ }
3331
+ function resolveConfigRelativePaths(baseDirectory, values) {
3332
+ if (!values || values.length === 0) {
3333
+ return [];
3334
+ }
3335
+ return values.map((value) => path6.resolve(baseDirectory, value));
3068
3336
  }
3069
3337
  function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = process.cwd()) {
3070
3338
  const merged = {
@@ -3074,12 +3342,20 @@ function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = proce
3074
3342
  concurrency: cliFlags.concurrency ?? configFile.concurrency ?? DEFAULT_SKILLTEST_CONFIG.concurrency,
3075
3343
  lint: {
3076
3344
  failOn: cliFlags.lint?.failOn ?? configFile.lint?.failOn ?? DEFAULT_SKILLTEST_CONFIG.lint.failOn,
3077
- suppress: cliFlags.lint?.suppress ?? configFile.lint?.suppress ?? DEFAULT_SKILLTEST_CONFIG.lint.suppress
3345
+ suppress: cliFlags.lint?.suppress ?? configFile.lint?.suppress ?? DEFAULT_SKILLTEST_CONFIG.lint.suppress,
3346
+ plugins: resolveConfigRelativePaths(
3347
+ baseDirectory,
3348
+ cliFlags.lint?.plugins ?? configFile.lint?.plugins ?? DEFAULT_SKILLTEST_CONFIG.lint.plugins
3349
+ )
3078
3350
  },
3079
3351
  trigger: {
3080
3352
  numQueries: cliFlags.trigger?.numQueries ?? configFile.trigger?.numQueries ?? DEFAULT_SKILLTEST_CONFIG.trigger.numQueries,
3081
3353
  threshold: cliFlags.trigger?.threshold ?? configFile.trigger?.threshold ?? DEFAULT_SKILLTEST_CONFIG.trigger.threshold,
3082
- seed: cliFlags.trigger?.seed ?? configFile.trigger?.seed
3354
+ seed: cliFlags.trigger?.seed ?? configFile.trigger?.seed,
3355
+ compare: resolveConfigRelativePaths(
3356
+ baseDirectory,
3357
+ cliFlags.trigger?.compare ?? configFile.trigger?.compare ?? DEFAULT_SKILLTEST_CONFIG.trigger.compare
3358
+ )
3083
3359
  },
3084
3360
  eval: {
3085
3361
  numRuns: cliFlags.eval?.numRuns ?? configFile.eval?.numRuns ?? DEFAULT_SKILLTEST_CONFIG.eval.numRuns,
@@ -3124,6 +3400,18 @@ function extractCliConfigOverrides(command) {
3124
3400
  numQueries: getTypedOptionValue(command, "numQueries")
3125
3401
  };
3126
3402
  }
3403
+ if ((command.name() === "trigger" || command.name() === "check") && command.getOptionValueSource("compare") === "cli") {
3404
+ overrides.trigger = {
3405
+ ...overrides.trigger,
3406
+ compare: getTypedOptionValue(command, "compare")
3407
+ };
3408
+ }
3409
+ if ((command.name() === "lint" || command.name() === "check") && command.getOptionValueSource("plugin") === "cli") {
3410
+ overrides.lint = {
3411
+ ...overrides.lint,
3412
+ plugins: getTypedOptionValue(command, "plugin")
3413
+ };
3414
+ }
3127
3415
  if (command.name() === "check" && command.getOptionValueSource("minF1") === "cli") {
3128
3416
  overrides.trigger = {
3129
3417
  ...overrides.trigger,
@@ -3151,7 +3439,7 @@ async function resolveConfigContext(targetPath, cliFlags) {
3151
3439
  config: mergeConfigLayers(skillDirectoryConfig.configFile, cliFlags, skillDirectoryConfig.sourceDirectory)
3152
3440
  };
3153
3441
  }
3154
- const cwdConfigPath = path5.join(cwd, ".skilltestrc");
3442
+ const cwdConfigPath = path6.join(cwd, ".skilltestrc");
3155
3443
  const cwdConfig = await loadConfigFromJsonFile(cwdConfigPath);
3156
3444
  if (cwdConfig) {
3157
3445
  return {
@@ -3372,6 +3660,7 @@ function createProvider(providerName, apiKeyOverride) {
3372
3660
  var triggerCliSchema = z8.object({
3373
3661
  queries: z8.string().optional(),
3374
3662
  saveQueries: z8.string().optional(),
3663
+ compare: z8.array(z8.string().min(1)).optional(),
3375
3664
  seed: z8.number().int().optional(),
3376
3665
  concurrency: z8.number().int().min(1).optional(),
3377
3666
  html: z8.string().optional(),
@@ -3420,6 +3709,7 @@ async function handleTriggerCommand(targetPath, options) {
3420
3709
  provider,
3421
3710
  queries,
3422
3711
  numQueries: options.numQueries,
3712
+ compare: options.compare,
3423
3713
  seed: options.seed,
3424
3714
  concurrency: options.concurrency,
3425
3715
  verbose: options.verbose
@@ -3438,7 +3728,7 @@ async function handleTriggerCommand(targetPath, options) {
3438
3728
  ...result,
3439
3729
  target: targetPath
3440
3730
  };
3441
- await fs8.writeFile(options.html, renderTriggerHtml(htmlResult), "utf8");
3731
+ await fs9.writeFile(options.html, renderTriggerHtml(htmlResult), "utf8");
3442
3732
  }
3443
3733
  } catch (error) {
3444
3734
  spinner?.stop();
@@ -3447,7 +3737,7 @@ async function handleTriggerCommand(targetPath, options) {
3447
3737
  }
3448
3738
  }
3449
3739
  function registerTriggerCommand(program) {
3450
- program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--concurrency <n>", "Maximum in-flight trigger requests", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
3740
+ program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--compare <path...>", "Path(s) to sibling skill directories to include as competitors").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--concurrency <n>", "Maximum in-flight trigger requests", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
3451
3741
  const globalOptions = getGlobalCliOptions(command);
3452
3742
  const config = getResolvedConfig(command);
3453
3743
  const parsedCli = triggerCliSchema.safeParse(command.opts());
@@ -3462,6 +3752,7 @@ function registerTriggerCommand(program) {
3462
3752
  provider: config.provider,
3463
3753
  queries: parsedCli.data.queries,
3464
3754
  numQueries: config.trigger.numQueries,
3755
+ compare: config.trigger.compare,
3465
3756
  saveQueries: parsedCli.data.saveQueries,
3466
3757
  seed: parsedCli.data.seed ?? config.trigger.seed,
3467
3758
  concurrency: config.concurrency,
@@ -3473,7 +3764,7 @@ function registerTriggerCommand(program) {
3473
3764
  }
3474
3765
 
3475
3766
  // src/commands/eval.ts
3476
- import fs9 from "node:fs/promises";
3767
+ import fs10 from "node:fs/promises";
3477
3768
  import ora2 from "ora";
3478
3769
  import { z as z9 } from "zod";
3479
3770
  var evalCliSchema = z9.object({
@@ -3540,7 +3831,7 @@ async function handleEvalCommand(targetPath, options, command) {
3540
3831
  ...result,
3541
3832
  target: targetPath
3542
3833
  };
3543
- await fs9.writeFile(options.html, renderEvalHtml(htmlResult), "utf8");
3834
+ await fs10.writeFile(options.html, renderEvalHtml(htmlResult), "utf8");
3544
3835
  }
3545
3836
  } catch (error) {
3546
3837
  spinner?.stop();
@@ -3579,7 +3870,7 @@ function registerEvalCommand(program) {
3579
3870
  }
3580
3871
 
3581
3872
  // src/commands/check.ts
3582
- import fs10 from "node:fs/promises";
3873
+ import fs11 from "node:fs/promises";
3583
3874
  import ora3 from "ora";
3584
3875
  import { z as z10 } from "zod";
3585
3876
 
@@ -3592,7 +3883,7 @@ function calculateEvalAssertPassRate(result) {
3592
3883
  }
3593
3884
  async function runCheck(inputPath, options) {
3594
3885
  options.onStage?.("lint");
3595
- const lint = await runLinter(inputPath, { suppress: options.lintSuppress });
3886
+ const lint = await runLinter(inputPath, { suppress: options.lintSuppress, plugins: options.lintPlugins });
3596
3887
  const lintPassed = !lintFails(lint, options.lintFailOn);
3597
3888
  let trigger = null;
3598
3889
  let evalResult = null;
@@ -3616,6 +3907,7 @@ async function runCheck(inputPath, options) {
3616
3907
  provider: options.provider,
3617
3908
  model: options.model,
3618
3909
  queries: options.queries,
3910
+ compare: options.compare,
3619
3911
  numQueries: options.numQueries,
3620
3912
  seed: options.triggerSeed,
3621
3913
  concurrency: options.concurrency,
@@ -3677,8 +3969,10 @@ var checkCliSchema = z10.object({
3677
3969
  graderModel: z10.string().optional(),
3678
3970
  apiKey: z10.string().optional(),
3679
3971
  queries: z10.string().optional(),
3972
+ compare: z10.array(z10.string().min(1)).optional(),
3680
3973
  seed: z10.number().int().optional(),
3681
3974
  prompts: z10.string().optional(),
3975
+ plugin: z10.array(z10.string().min(1)).optional(),
3682
3976
  concurrency: z10.number().int().min(1).optional(),
3683
3977
  html: z10.string().optional(),
3684
3978
  saveResults: z10.string().optional(),
@@ -3687,6 +3981,9 @@ var checkCliSchema = z10.object({
3687
3981
  });
3688
3982
  var DEFAULT_ANTHROPIC_MODEL3 = "claude-sonnet-4-5-20250929";
3689
3983
  var DEFAULT_OPENAI_MODEL3 = "gpt-4.1-mini";
3984
+ function collectPluginPaths2(value, previous = []) {
3985
+ return [...previous, value];
3986
+ }
3690
3987
  function resolveModel3(provider, model) {
3691
3988
  if (provider === "openai" && model === DEFAULT_ANTHROPIC_MODEL3) {
3692
3989
  return DEFAULT_OPENAI_MODEL3;
@@ -3737,7 +4034,9 @@ async function handleCheckCommand(targetPath, options, command) {
3737
4034
  graderModel,
3738
4035
  lintFailOn: options.lintFailOn,
3739
4036
  lintSuppress: options.lintSuppress,
4037
+ lintPlugins: options.lintPlugins,
3740
4038
  queries,
4039
+ compare: options.compare,
3741
4040
  numQueries: options.numQueries,
3742
4041
  triggerSeed: options.triggerSeed,
3743
4042
  prompts,
@@ -3773,7 +4072,7 @@ async function handleCheckCommand(targetPath, options, command) {
3773
4072
  );
3774
4073
  }
3775
4074
  if (options.html) {
3776
- await fs10.writeFile(options.html, renderCheckHtml(result), "utf8");
4075
+ await fs11.writeFile(options.html, renderCheckHtml(result), "utf8");
3777
4076
  }
3778
4077
  process.exitCode = result.gates.overallPassed ? 0 : 1;
3779
4078
  } catch (error) {
@@ -3783,7 +4082,7 @@ async function handleCheckCommand(targetPath, options, command) {
3783
4082
  }
3784
4083
  }
3785
4084
  function registerCheckCommand(program) {
3786
- program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
4085
+ program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--compare <path...>", "Path(s) to sibling skill directories to include as competitors").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--plugin <path>", "Load a custom lint plugin file", collectPluginPaths2, []).option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
3787
4086
  const globalOptions = getGlobalCliOptions(command);
3788
4087
  const config = getResolvedConfig(command);
3789
4088
  const parsedCli = checkCliSchema.safeParse(command.opts());
@@ -3801,6 +4100,7 @@ function registerCheckCommand(program) {
3801
4100
  graderModel: parsedCli.data.graderModel,
3802
4101
  apiKey: parsedCli.data.apiKey,
3803
4102
  queries: parsedCli.data.queries,
4103
+ compare: config.trigger.compare,
3804
4104
  numQueries: config.trigger.numQueries,
3805
4105
  prompts: parsedCli.data.prompts,
3806
4106
  minF1: config.trigger.threshold,
@@ -3810,6 +4110,7 @@ function registerCheckCommand(program) {
3810
4110
  html: parsedCli.data.html,
3811
4111
  lintFailOn: config.lint.failOn,
3812
4112
  lintSuppress: config.lint.suppress,
4113
+ lintPlugins: config.lint.plugins,
3813
4114
  triggerSeed: parsedCli.data.seed ?? config.trigger.seed,
3814
4115
  saveResults: parsedCli.data.saveResults,
3815
4116
  continueOnLintFail: Boolean(parsedCli.data.continueOnLintFail),
@@ -3824,8 +4125,8 @@ function registerCheckCommand(program) {
3824
4125
  function resolveVersion() {
3825
4126
  try {
3826
4127
  const currentFilePath = fileURLToPath(import.meta.url);
3827
- const packageJsonPath = path6.resolve(path6.dirname(currentFilePath), "..", "package.json");
3828
- const raw = fs11.readFileSync(packageJsonPath, "utf8");
4128
+ const packageJsonPath = path7.resolve(path7.dirname(currentFilePath), "..", "package.json");
4129
+ const raw = fs12.readFileSync(packageJsonPath, "utf8");
3829
4130
  const parsed = JSON.parse(raw);
3830
4131
  return parsed.version ?? "0.0.0";
3831
4132
  } catch {