skilltest 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +3 -6
- package/README.md +92 -0
- package/dist/index.js +378 -98
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/index.ts
|
|
4
|
-
import
|
|
5
|
-
import
|
|
4
|
+
import fs12 from "node:fs";
|
|
5
|
+
import path7 from "node:path";
|
|
6
6
|
import { fileURLToPath } from "node:url";
|
|
7
7
|
import { Command } from "commander";
|
|
8
8
|
|
|
9
9
|
// src/commands/lint.ts
|
|
10
|
-
import
|
|
10
|
+
import fs7 from "node:fs/promises";
|
|
11
11
|
import { z as z6 } from "zod";
|
|
12
12
|
|
|
13
13
|
// src/core/skill-parser.ts
|
|
@@ -972,6 +972,116 @@ function runFrontmatterChecks(context) {
|
|
|
972
972
|
return issues;
|
|
973
973
|
}
|
|
974
974
|
|
|
975
|
+
// src/core/linter/plugin.ts
|
|
976
|
+
import fs4 from "node:fs/promises";
|
|
977
|
+
import path4 from "node:path";
|
|
978
|
+
import { pathToFileURL } from "node:url";
|
|
979
|
+
/**
 * Produces the namespaced check id used for a plugin rule.
 *
 * Surrounding whitespace is stripped first so the resulting id matches what a
 * user would type when referring to the rule (for example in a suppress list).
 * An id that already contains a ":" is treated as namespaced and returned as
 * is; a bare id is prefixed with "plugin:".
 *
 * @param {string} checkId - Rule id as declared by the plugin.
 * @returns {string} The trimmed, namespaced check id.
 */
function normalizeRuleCheckId(checkId) {
  const trimmed = checkId.trim();
  return trimmed.includes(":") ? trimmed : `plugin:${trimmed}`;
}
|
|
982
|
+
/**
 * Creates (but does not throw) the error used to report a malformed lint plugin.
 *
 * @param {string} filePath - Location of the plugin file being validated.
 * @param {string} message - Description of what is wrong with the plugin.
 * @returns {Error} Error whose message names the plugin file and the problem.
 */
function buildPluginValidationError(filePath, message) {
  const description = `Invalid lint plugin at ${filePath}: ${message}`;
  return new Error(description);
}
|
|
985
|
+
/**
 * Validates one export of a plugin module and returns a sanitized plugin.
 *
 * The candidate must be an object with a `rules` array. Every rule must be an
 * object with a non-empty string `checkId`, a non-empty string `title`, and a
 * `check` function. Only those three fields are copied to the result, and the
 * check id is passed through `normalizeRuleCheckId`.
 *
 * @param {unknown} candidate - Value taken from the plugin module.
 * @param {string} filePath - Plugin file path, used in error messages.
 * @param {string} exportName - Name of the export being validated, used in error messages.
 * @returns {{ rules: Array<{ checkId: string, title: string, check: Function }> }}
 * @throws {Error} Built by `buildPluginValidationError` when any requirement is not met.
 */
function validatePluginCandidate(candidate, filePath, exportName) {
  const invalid = (reason) => buildPluginValidationError(filePath, reason);
  const isFilledString = (value) => typeof value === "string" && value.trim() !== "";

  const isObject = Boolean(candidate) && typeof candidate === "object";
  if (!isObject || !("rules" in candidate)) {
    throw invalid(`${exportName} export must be an object with a rules array.`);
  }

  const ruleList = candidate.rules;
  if (!Array.isArray(ruleList)) {
    throw invalid(`${exportName} export must include a rules array.`);
  }

  // Checks run in a fixed order so the first problem found is the one reported.
  const sanitizeRule = (rule, index) => {
    if (rule === null || typeof rule !== "object") {
      throw invalid(`rule at index ${index} must be an object.`);
    }
    const checkId = rule.checkId;
    if (!isFilledString(checkId)) {
      throw invalid(`rule at index ${index} must have a non-empty string checkId.`);
    }
    const title = rule.title;
    if (!isFilledString(title)) {
      throw invalid(`rule at index ${index} must have a non-empty string title.`);
    }
    const check = rule.check;
    if (typeof check !== "function") {
      throw invalid(`rule '${checkId}' must have a check function.`);
    }
    return { checkId: normalizeRuleCheckId(checkId), title, check };
  };

  return { rules: ruleList.map(sanitizeRule) };
}
|
|
1018
|
+
/**
 * Loads a lint plugin module from disk and returns its validated rules.
 *
 * The path is resolved to an absolute path, checked with `fs.access`, and then
 * loaded through a dynamic `import()` of its file URL. The module's `default`
 * export is validated first, then the named export `plugin`; the first one
 * that validates is returned.
 *
 * @param {string} filePath - Path to the plugin file (relative paths are resolved).
 * @returns {Promise<{ rules: Array<{ checkId: string, title: string, check: Function }> }>}
 * @throws {Error} When the file cannot be accessed or imported (the underlying
 *   error is attached as `cause`), when every present export fails validation
 *   (the individual messages are joined with a space), or when neither export
 *   is present.
 */
async function loadPlugin(filePath) {
  const absolutePath = path4.resolve(filePath);
  try {
    await fs4.access(absolutePath);
  } catch (error) {
    // Keep the original failure reachable: access() can fail for reasons other
    // than a missing file, and the message alone cannot tell them apart.
    throw new Error(`Failed to load lint plugin at ${absolutePath}: file does not exist.`, { cause: error });
  }
  let loadedModule;
  try {
    loadedModule = await import(pathToFileURL(absolutePath).href);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    // The cause preserves the stack of the error raised while evaluating the plugin.
    throw new Error(`Failed to load lint plugin at ${absolutePath}: ${message}`, { cause: error });
  }
  const validationErrors = [];
  for (const [exportName, candidate] of [
    ["default", loadedModule.default],
    ["plugin", loadedModule.plugin]
  ]) {
    if (candidate === undefined) {
      continue;
    }
    try {
      return validatePluginCandidate(candidate, absolutePath, exportName);
    } catch (error) {
      // Remember why this export was rejected and try the next one.
      validationErrors.push(error instanceof Error ? error.message : String(error));
    }
  }
  if (validationErrors.length > 0) {
    throw new Error(validationErrors.join(" "));
  }
  throw buildPluginValidationError(
    absolutePath,
    "expected a default export or named export 'plugin' containing a rules array."
  );
}
|
|
1054
|
+
/**
 * Converts a failure raised while running a plugin rule into a lint issue.
 *
 * The issue id ends with a slug of the rule's check id: runs of characters
 * other than ASCII letters and digits become "-", leading and trailing dashes
 * are removed, and the result is lower-cased.
 *
 * @param {{ checkId: string }} rule - The rule whose check failed.
 * @param {unknown} error - The thrown value; non-Error values are stringified.
 * @returns {{ id: string, checkId: string, title: string, status: string, message: string }}
 */
function buildRuleExecutionError(rule, error) {
  let reason;
  if (error instanceof Error) {
    reason = error.message;
  } else {
    reason = String(error);
  }
  const slug = rule.checkId
    .replace(/[^A-Za-z0-9]+/g, "-")
    .replace(/^-+|-+$/g, "")
    .toLowerCase();
  return {
    id: `plugin.load-error.${slug}`,
    checkId: "plugin:load-error",
    title: "Plugin Rule Error",
    status: "fail",
    message: `Plugin rule '${rule.checkId}' failed: ${reason}`
  };
}
|
|
1064
|
+
/**
 * Runs every rule of a plugin, one after another, and gathers the issues.
 *
 * Each issue returned by a rule is copied with its `checkId` replaced by the
 * rule's own check id. If a rule throws, rejects, or returns something that is
 * not an array, that failure is turned into a single issue through
 * `buildRuleExecutionError` and the remaining rules still run.
 *
 * @param {{ rules: Array<{ checkId: string, check: Function }> }} plugin - Validated plugin.
 * @param {object} context - Lint context handed to each rule's `check`.
 * @returns {Promise<object[]>} All issues, in rule order.
 */
async function runPluginRules(plugin, context) {
  // Resolves to the issues for a single rule; never rejects for rule failures.
  const executeRule = async (rule) => {
    try {
      const outcome = await rule.check(context);
      if (!Array.isArray(outcome)) {
        throw new Error("check function must return an array of lint issues.");
      }
      return outcome.map((entry) => ({ ...entry, checkId: rule.checkId }));
    } catch (failure) {
      return [buildRuleExecutionError(rule, failure)];
    }
  };

  const collected = [];
  for (const rule of plugin.rules) {
    const ruleIssues = await executeRule(rule);
    collected.push(...ruleIssues);
  }
  return collected;
}
|
|
1084
|
+
|
|
975
1085
|
// src/core/linter/security.ts
|
|
976
1086
|
var DANGEROUS_COMMAND_PATTERNS = [
|
|
977
1087
|
{
|
|
@@ -1179,8 +1289,8 @@ function runSecurityChecks(context) {
|
|
|
1179
1289
|
}
|
|
1180
1290
|
|
|
1181
1291
|
// src/core/linter/structure.ts
|
|
1182
|
-
import
|
|
1183
|
-
import
|
|
1292
|
+
import fs5 from "node:fs/promises";
|
|
1293
|
+
import path5 from "node:path";
|
|
1184
1294
|
function hasTableOfContents(content) {
|
|
1185
1295
|
if (/^#{1,6}\s+table of contents\b/im.test(content)) {
|
|
1186
1296
|
return true;
|
|
@@ -1221,21 +1331,21 @@ async function runStructureChecks(context) {
|
|
|
1221
1331
|
message: `SKILL.md length is ${context.skill.lineCount} lines.`
|
|
1222
1332
|
});
|
|
1223
1333
|
}
|
|
1224
|
-
const referencesDir =
|
|
1334
|
+
const referencesDir = path5.join(context.skill.skillRoot, "references");
|
|
1225
1335
|
if (await pathExists(referencesDir)) {
|
|
1226
1336
|
const files = await listFilesRecursive(referencesDir);
|
|
1227
1337
|
let oversizedWithoutToc = 0;
|
|
1228
1338
|
for (const file of files) {
|
|
1229
|
-
const raw = await
|
|
1339
|
+
const raw = await fs5.readFile(file, "utf8");
|
|
1230
1340
|
const lineCount = raw === "" ? 0 : raw.split(/\r?\n/).length;
|
|
1231
1341
|
if (lineCount > 300 && !hasTableOfContents(raw)) {
|
|
1232
1342
|
oversizedWithoutToc += 1;
|
|
1233
1343
|
issues.push({
|
|
1234
|
-
id: `structure.references.toc.${toPosixPath(
|
|
1344
|
+
id: `structure.references.toc.${toPosixPath(path5.relative(context.skill.skillRoot, file))}`,
|
|
1235
1345
|
checkId: "structure:toc",
|
|
1236
1346
|
title: "Reference File Navigation",
|
|
1237
1347
|
status: "warn",
|
|
1238
|
-
message: `${toPosixPath(
|
|
1348
|
+
message: `${toPosixPath(path5.relative(context.skill.skillRoot, file))} is ${lineCount} lines and has no table of contents.`,
|
|
1239
1349
|
suggestion: "Add a table of contents for long reference files."
|
|
1240
1350
|
});
|
|
1241
1351
|
}
|
|
@@ -1265,7 +1375,7 @@ async function runStructureChecks(context) {
|
|
|
1265
1375
|
other: []
|
|
1266
1376
|
};
|
|
1267
1377
|
for (const reference of references) {
|
|
1268
|
-
const resolved =
|
|
1378
|
+
const resolved = path5.resolve(context.skill.skillRoot, reference);
|
|
1269
1379
|
if (!await pathExists(resolved)) {
|
|
1270
1380
|
const kind = classifyReferencePath(reference);
|
|
1271
1381
|
missingByType[kind].push(reference);
|
|
@@ -1362,6 +1472,10 @@ async function runLinter(inputPath, options = {}) {
|
|
|
1362
1472
|
issues.push(...runSecurityChecks(context));
|
|
1363
1473
|
issues.push(...await runDisclosureChecks(context));
|
|
1364
1474
|
issues.push(...runCompatibilityChecks(context));
|
|
1475
|
+
for (const pluginPath of options.plugins ?? []) {
|
|
1476
|
+
const plugin = await loadPlugin(pluginPath);
|
|
1477
|
+
issues.push(...await runPluginRules(plugin, context));
|
|
1478
|
+
}
|
|
1365
1479
|
const filteredIssues = issues.filter((issue) => !suppressedCheckIds.has(issue.checkId));
|
|
1366
1480
|
return {
|
|
1367
1481
|
target: inputPath,
|
|
@@ -1525,10 +1639,10 @@ function renderLintIssueList(report) {
|
|
|
1525
1639
|
const info = skippedSecurityPatterns > 0 ? `<p class="info-line">Skipped security patterns in examples/comments: ${escapeHtml(skippedSecurityPatterns)}</p>` : "";
|
|
1526
1640
|
return `<div class="row-list">${rows}</div>${info}`;
|
|
1527
1641
|
}
|
|
1528
|
-
function renderTriggerCaseRow(testCase) {
|
|
1642
|
+
function renderTriggerCaseRow(testCase, showSelectedCompetitor) {
|
|
1529
1643
|
const details = testCase.rawModelResponse ? renderDetails("Model response", renderPreBlock(testCase.rawModelResponse)) : "";
|
|
1530
1644
|
return `
|
|
1531
|
-
<div class="row">
|
|
1645
|
+
<div class="row${testCase.selectedCompetitor ? " competitor-selected" : ""}">
|
|
1532
1646
|
<div class="row-header">
|
|
1533
1647
|
<div>
|
|
1534
1648
|
<div class="row-title">${escapeHtml(testCase.query)}</div>
|
|
@@ -1540,12 +1654,29 @@ function renderTriggerCaseRow(testCase) {
|
|
|
1540
1654
|
</div>
|
|
1541
1655
|
${renderDefinitionList([
|
|
1542
1656
|
{ label: "Expected", value: testCase.expected },
|
|
1543
|
-
{ label: "Actual", value: testCase.actual }
|
|
1657
|
+
{ label: "Actual", value: testCase.actual },
|
|
1658
|
+
...showSelectedCompetitor ? [{ label: "Selected competitor", value: testCase.selectedCompetitor ?? "none" }] : []
|
|
1544
1659
|
])}
|
|
1545
1660
|
${details}
|
|
1546
1661
|
</div>
|
|
1547
1662
|
`;
|
|
1548
1663
|
}
|
|
1664
|
+
/**
 * Renders the "Competitor Skills" card of a trigger report.
 *
 * Returns an empty string when the result has no competitors. Otherwise each
 * competitor becomes a "warn" message row showing its name, description and
 * source path, and the rows are wrapped in a section card.
 *
 * @param {{ competitors?: Array<{ name: string, description: string, sourcePath: string }> }} result
 * @returns {string} HTML for the section, or "" when there is nothing to show.
 */
function renderCompetitorSkillsSection(result) {
  const competitorList = result.competitors;
  if (!competitorList || competitorList.length === 0) {
    return "";
  }
  const rows = competitorList.map((competitor) => {
    return renderMessageRow(
      "warn",
      competitor.name,
      competitor.description,
      renderDefinitionList([{ label: "Source", value: competitor.sourcePath }])
    );
  });
  const content = `<div class="row-list">${rows.join("")}</div>`;
  return renderSectionCard("Competitor Skills", content);
}
|
|
1549
1680
|
function promptStatus(promptResult) {
|
|
1550
1681
|
if (promptResult.totalAssertions === 0) {
|
|
1551
1682
|
return "skip";
|
|
@@ -1638,6 +1769,7 @@ function renderHtmlDocument(title, body) {
|
|
|
1638
1769
|
--pass: #22c55e;
|
|
1639
1770
|
--warn: #eab308;
|
|
1640
1771
|
--fail: #ef4444;
|
|
1772
|
+
--competitor: #f97316;
|
|
1641
1773
|
--skip: #6b7280;
|
|
1642
1774
|
--shadow: 0 10px 30px rgba(15, 23, 42, 0.08);
|
|
1643
1775
|
}
|
|
@@ -1786,6 +1918,11 @@ function renderHtmlDocument(title, body) {
|
|
|
1786
1918
|
background: var(--surface-muted);
|
|
1787
1919
|
}
|
|
1788
1920
|
|
|
1921
|
+
.row.competitor-selected {
|
|
1922
|
+
border-color: rgba(249, 115, 22, 0.45);
|
|
1923
|
+
background: rgba(249, 115, 22, 0.08);
|
|
1924
|
+
}
|
|
1925
|
+
|
|
1789
1926
|
.row-header {
|
|
1790
1927
|
display: flex;
|
|
1791
1928
|
justify-content: space-between;
|
|
@@ -1965,6 +2102,7 @@ function renderTriggerHtml(result) {
|
|
|
1965
2102
|
const target = resolveOptionalTarget(htmlResult, result.skillName);
|
|
1966
2103
|
const matchedCount = result.cases.filter((testCase) => testCase.matched).length;
|
|
1967
2104
|
const matchRate = result.cases.length === 0 ? 0 : matchedCount / result.cases.length;
|
|
2105
|
+
const hasCompetitors = Boolean(result.competitors && result.competitors.length > 0);
|
|
1968
2106
|
const body = [
|
|
1969
2107
|
renderHeaderCard(
|
|
1970
2108
|
"trigger",
|
|
@@ -1980,10 +2118,15 @@ function renderTriggerHtml(result) {
|
|
|
1980
2118
|
{ label: "Provider", value: result.provider },
|
|
1981
2119
|
{ label: "Model", value: result.model },
|
|
1982
2120
|
{ label: "Seed", value: result.seed !== void 0 ? String(result.seed) : "none" },
|
|
2121
|
+
...hasCompetitors ? [{ label: "Competitors", value: String(result.competitors?.length ?? 0) }] : [],
|
|
1983
2122
|
{ label: "Queries", value: String(result.queries.length) }
|
|
1984
2123
|
]
|
|
1985
2124
|
),
|
|
1986
|
-
|
|
2125
|
+
renderCompetitorSkillsSection(result),
|
|
2126
|
+
renderSectionCard(
|
|
2127
|
+
"Trigger Cases",
|
|
2128
|
+
`<div class="row-list">${result.cases.map((testCase) => renderTriggerCaseRow(testCase, hasCompetitors)).join("")}</div>`
|
|
2129
|
+
),
|
|
1987
2130
|
renderSectionCard(
|
|
1988
2131
|
"Suggestions",
|
|
1989
2132
|
`<ul>${result.suggestions.map((suggestion) => `<li>${escapeHtml(suggestion)}</li>`).join("")}</ul>`
|
|
@@ -2023,7 +2166,8 @@ function renderEvalHtml(result) {
|
|
|
2023
2166
|
}
|
|
2024
2167
|
function renderCheckHtml(result) {
|
|
2025
2168
|
const skillName = result.trigger?.skillName ?? result.eval?.skillName ?? result.target;
|
|
2026
|
-
const triggerBody = result.trigger ?
|
|
2169
|
+
const triggerBody = result.trigger ? `${renderCompetitorSkillsSection(result.trigger)}
|
|
2170
|
+
<div class="row-list">${result.trigger.cases.map((testCase) => renderTriggerCaseRow(testCase, Boolean(result.trigger?.competitors?.length))).join("")}</div>
|
|
2027
2171
|
<div class="card" style="margin-top: 16px;">
|
|
2028
2172
|
<h2>Trigger Suggestions</h2>
|
|
2029
2173
|
<ul>${result.trigger.suggestions.map((suggestion) => `<li>${escapeHtml(suggestion)}</li>`).join("")}</ul>
|
|
@@ -2123,46 +2267,47 @@ function countSkippedSecurityPatterns2(issues) {
|
|
|
2123
2267
|
return total + (issue.skippedPatterns?.length ?? 0);
|
|
2124
2268
|
}, 0);
|
|
2125
2269
|
}
|
|
2270
|
+
/**
 * Formats a ratio as a percentage with one decimal place.
 *
 * @param {number} value - Ratio to format (0.5 means 50%).
 * @returns {string} For example "50.0%".
 */
function formatPercent2(value) {
  const scaled = value * 100;
  return `${scaled.toFixed(1)}%`;
}
|
|
2126
2273
|
function renderLintReport(report, enableColor) {
|
|
2127
2274
|
const c = getChalkInstance(enableColor);
|
|
2128
2275
|
const { passed, warnings, failures, total } = report.summary;
|
|
2129
2276
|
const headerLines = [
|
|
2130
|
-
|
|
2131
|
-
|
|
2132
|
-
|
|
2133
|
-
`\u2502 target: ${report.target}`,
|
|
2134
|
-
`\u2502 summary: ${passed}/${total} checks passed, ${warnings} warnings, ${failures} failures`,
|
|
2135
|
-
`\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518`
|
|
2277
|
+
"skilltest lint",
|
|
2278
|
+
`target: ${report.target}`,
|
|
2279
|
+
`summary: ${passed}/${total} checks passed, ${warnings} warnings, ${failures} failures`
|
|
2136
2280
|
];
|
|
2137
2281
|
const renderedIssues = report.issues.map((issue) => renderIssueLine(issue, c)).join("\n");
|
|
2138
2282
|
const skippedSecurityPatterns = countSkippedSecurityPatterns2(report.issues);
|
|
2139
2283
|
const infoLine = skippedSecurityPatterns > 0 ? `
|
|
2140
|
-
${c.cyan("
|
|
2284
|
+
${c.cyan("INFO")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)` : "";
|
|
2141
2285
|
return `${headerLines.join("\n")}
|
|
2142
2286
|
${renderedIssues}${infoLine}`;
|
|
2143
2287
|
}
|
|
2144
|
-
function formatPercent2(value) {
|
|
2145
|
-
return `${(value * 100).toFixed(1)}%`;
|
|
2146
|
-
}
|
|
2147
2288
|
function renderTriggerReport(result, enableColor, verbose) {
|
|
2148
2289
|
const c = getChalkInstance(enableColor);
|
|
2149
|
-
const lines = [
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
|
|
2153
|
-
|
|
2154
|
-
|
|
2290
|
+
const lines = [
|
|
2291
|
+
"skilltest trigger",
|
|
2292
|
+
`skill: ${result.skillName}`,
|
|
2293
|
+
`provider/model: ${result.provider}/${result.model}`
|
|
2294
|
+
];
|
|
2295
|
+
if (result.competitors && result.competitors.length > 0) {
|
|
2296
|
+
lines.push(`competitors: ${result.competitors.map((competitor) => competitor.name).join(", ")}`);
|
|
2297
|
+
}
|
|
2155
2298
|
lines.push(
|
|
2156
|
-
|
|
2299
|
+
`precision: ${formatPercent2(result.metrics.precision)} recall: ${formatPercent2(result.metrics.recall)} f1: ${formatPercent2(result.metrics.f1)}`
|
|
2157
2300
|
);
|
|
2158
2301
|
lines.push(
|
|
2159
|
-
|
|
2302
|
+
`TP ${result.metrics.truePositives} TN ${result.metrics.trueNegatives} FP ${result.metrics.falsePositives} FN ${result.metrics.falseNegatives}`
|
|
2160
2303
|
);
|
|
2161
|
-
lines.push("\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518");
|
|
2162
2304
|
for (const [index, testCase] of result.cases.entries()) {
|
|
2163
2305
|
const status = testCase.matched ? c.green("PASS") : c.red("FAIL");
|
|
2164
2306
|
lines.push(`${index + 1}. ${status} query: ${testCase.query}`);
|
|
2165
2307
|
lines.push(` expected: ${testCase.expected} | actual: ${testCase.actual}`);
|
|
2308
|
+
if (verbose && testCase.selectedCompetitor) {
|
|
2309
|
+
lines.push(` competitor selected: ${testCase.selectedCompetitor}`);
|
|
2310
|
+
}
|
|
2166
2311
|
if (verbose && testCase.rawModelResponse) {
|
|
2167
2312
|
lines.push(` model: ${testCase.rawModelResponse.replace(/\s+/g, " ").trim()}`);
|
|
2168
2313
|
}
|
|
@@ -2175,15 +2320,13 @@ function renderTriggerReport(result, enableColor, verbose) {
|
|
|
2175
2320
|
}
|
|
2176
2321
|
function renderEvalReport(result, enableColor, verbose) {
|
|
2177
2322
|
const c = getChalkInstance(enableColor);
|
|
2178
|
-
const lines = [
|
|
2179
|
-
|
|
2180
|
-
|
|
2181
|
-
|
|
2182
|
-
|
|
2183
|
-
|
|
2184
|
-
|
|
2185
|
-
lines.push(`\u2502 assertions passed: ${result.summary.passedAssertions}/${result.summary.totalAssertions}`);
|
|
2186
|
-
lines.push("\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518");
|
|
2323
|
+
const lines = [
|
|
2324
|
+
"skilltest eval",
|
|
2325
|
+
`skill: ${result.skillName}`,
|
|
2326
|
+
`provider/model: ${result.provider}/${result.model}`,
|
|
2327
|
+
`grader model: ${result.graderModel}`,
|
|
2328
|
+
`assertions passed: ${result.summary.passedAssertions}/${result.summary.totalAssertions}`
|
|
2329
|
+
];
|
|
2187
2330
|
for (const [index, promptResult] of result.results.entries()) {
|
|
2188
2331
|
lines.push(`${index + 1}. prompt: ${promptResult.prompt}`);
|
|
2189
2332
|
lines.push(` response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`);
|
|
@@ -2229,7 +2372,7 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2229
2372
|
}
|
|
2230
2373
|
const skippedSecurityPatterns = countSkippedSecurityPatterns2(result.lint.issues);
|
|
2231
2374
|
if (skippedSecurityPatterns > 0) {
|
|
2232
|
-
lines.push(` ${c.cyan("
|
|
2375
|
+
lines.push(` ${c.cyan("INFO")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)`);
|
|
2233
2376
|
}
|
|
2234
2377
|
lines.push("");
|
|
2235
2378
|
lines.push("Trigger");
|
|
@@ -2240,11 +2383,17 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2240
2383
|
lines.push(
|
|
2241
2384
|
` TP ${result.trigger.metrics.truePositives} TN ${result.trigger.metrics.trueNegatives} FP ${result.trigger.metrics.falsePositives} FN ${result.trigger.metrics.falseNegatives}`
|
|
2242
2385
|
);
|
|
2386
|
+
if (result.trigger.competitors && result.trigger.competitors.length > 0) {
|
|
2387
|
+
lines.push(` competitors: ${result.trigger.competitors.map((competitor) => competitor.name).join(", ")}`);
|
|
2388
|
+
}
|
|
2243
2389
|
const triggerCases = verbose ? result.trigger.cases : result.trigger.cases.filter((testCase) => !testCase.matched);
|
|
2244
2390
|
for (const testCase of triggerCases) {
|
|
2245
2391
|
const status = testCase.matched ? c.green("PASS") : c.red("FAIL");
|
|
2246
2392
|
lines.push(` - ${status} ${testCase.query}`);
|
|
2247
2393
|
lines.push(` expected=${testCase.expected} actual=${testCase.actual}`);
|
|
2394
|
+
if (testCase.selectedCompetitor) {
|
|
2395
|
+
lines.push(` competitor selected=${testCase.selectedCompetitor}`);
|
|
2396
|
+
}
|
|
2248
2397
|
}
|
|
2249
2398
|
} else {
|
|
2250
2399
|
lines.push(`- ${triggerGate} ${result.triggerSkippedReason ?? "Skipped."}`);
|
|
@@ -2286,7 +2435,7 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2286
2435
|
}
|
|
2287
2436
|
|
|
2288
2437
|
// src/commands/common.ts
|
|
2289
|
-
import
|
|
2438
|
+
import fs6 from "node:fs/promises";
|
|
2290
2439
|
import { z as z5 } from "zod";
|
|
2291
2440
|
|
|
2292
2441
|
// src/core/eval-runner.ts
|
|
@@ -2568,7 +2717,7 @@ function parseJsonArrayFromModelOutput(raw) {
|
|
|
2568
2717
|
}
|
|
2569
2718
|
throw new Error("Model did not return a JSON array.");
|
|
2570
2719
|
}
|
|
2571
|
-
async function generateQueriesWithModel(skill, provider, model, numQueries) {
|
|
2720
|
+
async function generateQueriesWithModel(skill, provider, model, numQueries, competitors) {
|
|
2572
2721
|
validateNumQueries(numQueries);
|
|
2573
2722
|
const shouldTriggerCount = Math.floor(numQueries / 2);
|
|
2574
2723
|
const shouldNotTriggerCount = numQueries - shouldTriggerCount;
|
|
@@ -2581,6 +2730,15 @@ async function generateQueriesWithModel(skill, provider, model, numQueries) {
|
|
|
2581
2730
|
const userPrompt = [
|
|
2582
2731
|
`Skill name: ${skill.frontmatter.name}`,
|
|
2583
2732
|
`Skill description: ${skill.frontmatter.description}`,
|
|
2733
|
+
...competitors && competitors.length > 0 ? [
|
|
2734
|
+
"",
|
|
2735
|
+
"Competitor skills in the same domain:",
|
|
2736
|
+
...competitors.map((competitor) => `- ${competitor.name}: ${competitor.description}`),
|
|
2737
|
+
"",
|
|
2738
|
+
"Generate queries that test whether the target skill triggers correctly even when these similar skills exist.",
|
|
2739
|
+
"Positive queries should clearly belong to the target skill, not the competitors.",
|
|
2740
|
+
"Negative queries should belong to a competitor or to no skill at all."
|
|
2741
|
+
] : [],
|
|
2584
2742
|
`Generate ${numQueries} prompts total.`,
|
|
2585
2743
|
`Exactly ${shouldTriggerCount} should have should_trigger=true.`,
|
|
2586
2744
|
`Exactly ${shouldNotTriggerCount} should have should_trigger=false.`,
|
|
@@ -2614,12 +2772,29 @@ function parseDecision(rawResponse, skillNames) {
|
|
|
2614
2772
|
}
|
|
2615
2773
|
return "unrecognized";
|
|
2616
2774
|
}
|
|
2617
|
-
function prepareTriggerQueries(skill, queries, seed) {
|
|
2775
|
+
function prepareTriggerQueries(skill, queries, seed, competitors) {
|
|
2618
2776
|
const rng = createRng(seed);
|
|
2777
|
+
const competitorCandidates = (competitors ?? []).map((competitor) => ({
|
|
2778
|
+
name: competitor.name,
|
|
2779
|
+
description: competitor.description
|
|
2780
|
+
}));
|
|
2619
2781
|
return queries.map((testQuery) => {
|
|
2620
|
-
const
|
|
2782
|
+
const usingCompetitors = competitorCandidates.length > 0;
|
|
2783
|
+
const fakeCount = usingCompetitors ? testQuery.should_trigger ? 2 + Math.floor(rng() * 3) : 3 + Math.floor(rng() * 3) : 5 + Math.floor(rng() * 5);
|
|
2621
2784
|
const fakeSkills = sample(FAKE_SKILLS, fakeCount, rng);
|
|
2622
|
-
const allSkills = shuffle(
|
|
2785
|
+
const allSkills = usingCompetitors ? shuffle(
|
|
2786
|
+
[
|
|
2787
|
+
...competitorCandidates,
|
|
2788
|
+
...fakeSkills,
|
|
2789
|
+
...testQuery.should_trigger ? [
|
|
2790
|
+
{
|
|
2791
|
+
name: skill.frontmatter.name,
|
|
2792
|
+
description: skill.frontmatter.description
|
|
2793
|
+
}
|
|
2794
|
+
] : []
|
|
2795
|
+
],
|
|
2796
|
+
rng
|
|
2797
|
+
) : shuffle(
|
|
2623
2798
|
[
|
|
2624
2799
|
...fakeSkills,
|
|
2625
2800
|
{
|
|
@@ -2673,25 +2848,82 @@ function calculateMetrics(skillName, cases) {
|
|
|
2673
2848
|
f1
|
|
2674
2849
|
};
|
|
2675
2850
|
}
|
|
2676
|
-
function
|
|
2851
|
+
/**
 * Ensures no competitor skill has the same name as the skill under test.
 *
 * @param {string} skillName - Name of the skill under test.
 * @param {Array<{ name: string }>} competitors - Competitor skills to check.
 * @throws {Error} For the first competitor whose name equals `skillName`.
 */
function assertCompetitorNamesDistinct(skillName, competitors) {
  const clash = competitors.find((competitor) => competitor.name === skillName);
  if (clash !== undefined) {
    throw new Error(`Competitor skill '${clash.name}' has the same name as the skill under test.`);
  }
}
|
|
2858
|
+
/**
 * Builds the result record for one trigger test query.
 *
 * A query that should trigger matches only when the decision is the skill under
 * test; a query that should not trigger matches whenever the decision is
 * anything else. `selectedCompetitor` is set to the decision when the decision
 * is one of the competitor names, and is otherwise undefined.
 *
 * @param {object} options
 * @param {{ query: string, should_trigger: boolean }} options.testQuery - Query being evaluated.
 * @param {string} options.skillName - Name of the skill under test.
 * @param {string} options.decision - Skill name chosen for the query.
 * @param {string[]} [options.competitorNames] - Names of the competitor skills.
 * @param {string} [options.rawModelResponse] - Raw model output, passed through unchanged.
 * @returns {object} The case record (query, shouldTrigger, expected, actual, matched, selectedCompetitor, rawModelResponse).
 */
function buildTriggerCaseResult(options) {
  const { testQuery, skillName, decision } = options;
  const shouldTrigger = testQuery.should_trigger;
  const choseTarget = decision === skillName;

  let expected;
  let matched;
  if (shouldTrigger) {
    expected = skillName;
    matched = choseTarget;
  } else {
    expected = "none";
    matched = !choseTarget;
  }

  const choseCompetitor = options.competitorNames?.includes(decision);
  return {
    query: testQuery.query,
    shouldTrigger,
    expected,
    actual: decision,
    matched,
    selectedCompetitor: choseCompetitor ? decision : undefined,
    rawModelResponse: options.rawModelResponse
  };
}
|
|
2872
|
+
/**
 * Turns trigger-test metrics into human-readable improvement suggestions.
 *
 * - With false negatives: a general hint, plus (when competitors were supplied)
 *   one line per competitor that was selected on a query that should have
 *   triggered the skill under test, with the number of such queries.
 * - With false positives: a general hint, plus (when competitors were supplied)
 *   a line stating how many negative queries still triggered the skill.
 * - With neither: a single "looks clean" message.
 *
 * @param {string} skillName - Name of the skill under test.
 * @param {{ falseNegatives: number, falsePositives: number }} metrics - Aggregated trigger metrics.
 * @param {Array<{ shouldTrigger: boolean, actual: string, selectedCompetitor?: string }>} cases - Per-query results.
 * @param {Array<object>} [competitors] - Competitor skills used in the run, if any.
 * @returns {string[]} Suggestions in the order described above.
 */
function buildSuggestions(skillName, metrics, cases, competitors) {
  const usingCompetitors = () => Boolean(competitors && competitors.length > 0);
  const advice = [];

  if (metrics.falseNegatives > 0) {
    advice.push(
      "False negatives found: clarify capability keywords and add explicit 'use when ...' phrasing in description."
    );
    if (usingCompetitors()) {
      // Tally, in first-seen order, which competitor was chosen on queries the target should have won.
      const lostTo = new Map();
      const stolenCases = cases.filter(
        (testCase) => testCase.shouldTrigger && testCase.actual !== skillName && testCase.selectedCompetitor
      );
      for (const { selectedCompetitor } of stolenCases) {
        const seenSoFar = lostTo.get(selectedCompetitor) ?? 0;
        lostTo.set(selectedCompetitor, seenSoFar + 1);
      }
      lostTo.forEach((count, competitorName) => {
        advice.push(
          `Skill '${competitorName}' was selected instead of '${skillName}' for ${count} quer${count === 1 ? "y" : "ies"}. Differentiate your description from '${competitorName}'.`
        );
      });
    }
  }

  if (metrics.falsePositives > 0) {
    advice.push("False positives found: narrow scope boundaries and add explicit non-goals in description.");
    if (usingCompetitors()) {
      advice.push(
        `With competitor skills present, ${metrics.falsePositives} negative quer${metrics.falsePositives === 1 ? "y still" : "ies still"} triggered '${skillName}'. Narrow your description's scope boundaries.`
      );
    }
  }

  if (advice.length === 0) {
    advice.push("Trigger behavior looks clean on this sample. Keep monitoring with domain-specific custom queries.");
  }
  return advice;
}
|
|
2906
|
+
/**
 * Parses each competitor skill and extracts the fields used by trigger tests.
 *
 * Paths are parsed one at a time, in the order given, with `parseSkillStrict`;
 * a failure on any path rejects the whole call.
 *
 * @param {string[]} comparePaths - Paths of the competitor skills.
 * @returns {Promise<Array<{ name: string, description: string, sourcePath: string }>>}
 */
async function loadCompetitorSkills(comparePaths) {
  const loaded = [];
  for (const sourcePath of comparePaths) {
    const { frontmatter } = await parseSkillStrict(sourcePath);
    const entry = {
      name: frontmatter.name,
      description: frontmatter.description,
      sourcePath
    };
    loaded.push(entry);
  }
  return loaded;
}
|
|
2691
2918
|
async function runTriggerTest(skill, options) {
|
|
2692
|
-
const
|
|
2919
|
+
const competitors = options.compare && options.compare.length > 0 ? await loadCompetitorSkills(options.compare) : void 0;
|
|
2920
|
+
if (competitors && competitors.length > 0) {
|
|
2921
|
+
assertCompetitorNamesDistinct(skill.frontmatter.name, competitors);
|
|
2922
|
+
}
|
|
2923
|
+
const queries = options.queries && options.queries.length > 0 ? triggerQueryArraySchema.parse(options.queries) : await generateQueriesWithModel(skill, options.provider, options.model, options.numQueries, competitors);
|
|
2693
2924
|
const skillName = skill.frontmatter.name;
|
|
2694
|
-
const preparedQueries = prepareTriggerQueries(skill, queries, options.seed);
|
|
2925
|
+
const preparedQueries = prepareTriggerQueries(skill, queries, options.seed, competitors);
|
|
2926
|
+
const competitorNames = competitors?.map((competitor) => competitor.name) ?? [];
|
|
2695
2927
|
const systemPrompt = [
|
|
2696
2928
|
"You are selecting one skill to activate for a user query.",
|
|
2697
2929
|
"Choose the single best matching skill name from the provided list, or 'none' if no skill is a good fit.",
|
|
@@ -2704,18 +2936,15 @@ async function runTriggerTest(skill, options) {
|
|
|
2704
2936
|
const rawResponse = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
|
|
2705
2937
|
const decision = parseDecision(
|
|
2706
2938
|
rawResponse,
|
|
2707
|
-
allSkills.map((entry) => entry.name)
|
|
2939
|
+
Array.from(/* @__PURE__ */ new Set([skillName, ...allSkills.map((entry) => entry.name)]))
|
|
2708
2940
|
);
|
|
2709
|
-
|
|
2710
|
-
|
|
2711
|
-
|
|
2712
|
-
|
|
2713
|
-
|
|
2714
|
-
expected,
|
|
2715
|
-
actual: decision,
|
|
2716
|
-
matched,
|
|
2941
|
+
return buildTriggerCaseResult({
|
|
2942
|
+
testQuery,
|
|
2943
|
+
skillName,
|
|
2944
|
+
decision,
|
|
2945
|
+
competitorNames,
|
|
2717
2946
|
rawModelResponse: options.verbose ? rawResponse : void 0
|
|
2718
|
-
};
|
|
2947
|
+
});
|
|
2719
2948
|
},
|
|
2720
2949
|
options.concurrency ?? 5
|
|
2721
2950
|
);
|
|
@@ -2725,10 +2954,11 @@ async function runTriggerTest(skill, options) {
|
|
|
2725
2954
|
model: options.model,
|
|
2726
2955
|
provider: options.provider.name,
|
|
2727
2956
|
seed: options.seed,
|
|
2957
|
+
competitors,
|
|
2728
2958
|
queries,
|
|
2729
2959
|
cases: results,
|
|
2730
2960
|
metrics,
|
|
2731
|
-
suggestions: buildSuggestions(metrics)
|
|
2961
|
+
suggestions: buildSuggestions(skillName, metrics, results, competitors)
|
|
2732
2962
|
};
|
|
2733
2963
|
}
|
|
2734
2964
|
|
|
@@ -2843,10 +3073,10 @@ async function loadConfiguredEvalPrompts(command) {
|
|
|
2843
3073
|
if (!promptFile && assertionsFile) {
|
|
2844
3074
|
throw new Error("Config field eval.assertionsFile requires eval.promptFile.");
|
|
2845
3075
|
}
|
|
2846
|
-
const promptRaw = await
|
|
3076
|
+
const promptRaw = await fs6.readFile(promptFile, "utf8");
|
|
2847
3077
|
let prompts = normalizeEvalPrompts(parseJsonIfPossible(promptRaw), promptFile);
|
|
2848
3078
|
if (assertionsFile) {
|
|
2849
|
-
const assertionsRaw = await
|
|
3079
|
+
const assertionsRaw = await fs6.readFile(assertionsFile, "utf8");
|
|
2850
3080
|
const assertions = normalizeAssertions(parseJsonIfPossible(assertionsRaw), assertionsFile);
|
|
2851
3081
|
prompts = prompts.map((prompt) => ({
|
|
2852
3082
|
prompt: prompt.prompt,
|
|
@@ -2885,18 +3115,22 @@ function writeError(error, asJson) {
|
|
|
2885
3115
|
|
|
2886
3116
|
// src/commands/lint.ts
|
|
2887
3117
|
var lintCliSchema = z6.object({
|
|
2888
|
-
html: z6.string().optional()
|
|
3118
|
+
html: z6.string().optional(),
|
|
3119
|
+
plugin: z6.array(z6.string().min(1)).optional()
|
|
2889
3120
|
});
|
|
3121
|
+
function collectPluginPaths(value, previous = []) {
|
|
3122
|
+
return [...previous, value];
|
|
3123
|
+
}
|
|
2890
3124
|
async function handleLintCommand(targetPath, options) {
|
|
2891
3125
|
try {
|
|
2892
|
-
const report = await runLinter(targetPath, { suppress: options.suppress });
|
|
3126
|
+
const report = await runLinter(targetPath, { suppress: options.suppress, plugins: options.plugins });
|
|
2893
3127
|
if (options.json) {
|
|
2894
3128
|
writeResult(report, true);
|
|
2895
3129
|
} else {
|
|
2896
3130
|
writeResult(renderLintReport(report, options.color), false);
|
|
2897
3131
|
}
|
|
2898
3132
|
if (options.html) {
|
|
2899
|
-
await
|
|
3133
|
+
await fs7.writeFile(options.html, renderLintHtml(report), "utf8");
|
|
2900
3134
|
}
|
|
2901
3135
|
if (lintFails(report, options.failOn)) {
|
|
2902
3136
|
process.exitCode = 1;
|
|
@@ -2907,7 +3141,7 @@ async function handleLintCommand(targetPath, options) {
|
|
|
2907
3141
|
}
|
|
2908
3142
|
}
|
|
2909
3143
|
function registerLintCommand(program) {
|
|
2910
|
-
program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--html <path>", "Write an HTML report to the given file path").action(async (targetPath, _commandOptions, command) => {
|
|
3144
|
+
program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--html <path>", "Write an HTML report to the given file path").option("--plugin <path>", "Load a custom lint plugin file", collectPluginPaths, []).action(async (targetPath, _commandOptions, command) => {
|
|
2911
3145
|
const globalOptions = getGlobalCliOptions(command);
|
|
2912
3146
|
const config = getResolvedConfig(command);
|
|
2913
3147
|
const parsedCli = lintCliSchema.safeParse(command.opts());
|
|
@@ -2920,30 +3154,33 @@ function registerLintCommand(program) {
|
|
|
2920
3154
|
...globalOptions,
|
|
2921
3155
|
failOn: config.lint.failOn,
|
|
2922
3156
|
suppress: config.lint.suppress,
|
|
3157
|
+
plugins: config.lint.plugins,
|
|
2923
3158
|
html: parsedCli.data.html
|
|
2924
3159
|
});
|
|
2925
3160
|
});
|
|
2926
3161
|
}
|
|
2927
3162
|
|
|
2928
3163
|
// src/commands/trigger.ts
|
|
2929
|
-
import
|
|
3164
|
+
import fs9 from "node:fs/promises";
|
|
2930
3165
|
import ora from "ora";
|
|
2931
3166
|
import { z as z8 } from "zod";
|
|
2932
3167
|
|
|
2933
3168
|
// src/utils/config.ts
|
|
2934
|
-
import
|
|
2935
|
-
import
|
|
3169
|
+
import fs8 from "node:fs/promises";
|
|
3170
|
+
import path6 from "node:path";
|
|
2936
3171
|
import { z as z7 } from "zod";
|
|
2937
3172
|
var providerNameSchema = z7.enum(["anthropic", "openai"]);
|
|
2938
3173
|
var lintFailOnSchema = z7.enum(["error", "warn"]);
|
|
2939
3174
|
var lintConfigSchema = z7.object({
|
|
2940
3175
|
failOn: lintFailOnSchema.optional(),
|
|
2941
|
-
suppress: z7.array(z7.string().min(1)).optional()
|
|
3176
|
+
suppress: z7.array(z7.string().min(1)).optional(),
|
|
3177
|
+
plugins: z7.array(z7.string().min(1)).optional()
|
|
2942
3178
|
}).strict();
|
|
2943
3179
|
var triggerConfigSchema = z7.object({
|
|
2944
3180
|
numQueries: z7.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
|
|
2945
3181
|
threshold: z7.number().min(0).max(1).optional(),
|
|
2946
|
-
seed: z7.number().int().optional()
|
|
3182
|
+
seed: z7.number().int().optional(),
|
|
3183
|
+
compare: z7.array(z7.string().min(1)).optional()
|
|
2947
3184
|
}).strict().partial();
|
|
2948
3185
|
var evalConfigSchema = z7.object({
|
|
2949
3186
|
numRuns: z7.number().int().min(1).optional(),
|
|
@@ -2967,12 +3204,14 @@ var resolvedSkilltestConfigSchema = z7.object({
|
|
|
2967
3204
|
concurrency: z7.number().int().min(1),
|
|
2968
3205
|
lint: z7.object({
|
|
2969
3206
|
failOn: lintFailOnSchema,
|
|
2970
|
-
suppress: z7.array(z7.string().min(1))
|
|
3207
|
+
suppress: z7.array(z7.string().min(1)),
|
|
3208
|
+
plugins: z7.array(z7.string().min(1))
|
|
2971
3209
|
}),
|
|
2972
3210
|
trigger: z7.object({
|
|
2973
3211
|
numQueries: z7.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
|
|
2974
3212
|
threshold: z7.number().min(0).max(1),
|
|
2975
|
-
seed: z7.number().int().optional()
|
|
3213
|
+
seed: z7.number().int().optional(),
|
|
3214
|
+
compare: z7.array(z7.string().min(1))
|
|
2976
3215
|
}),
|
|
2977
3216
|
eval: z7.object({
|
|
2978
3217
|
numRuns: z7.number().int().min(1),
|
|
@@ -2988,11 +3227,13 @@ var DEFAULT_SKILLTEST_CONFIG = {
|
|
|
2988
3227
|
concurrency: 5,
|
|
2989
3228
|
lint: {
|
|
2990
3229
|
failOn: "error",
|
|
2991
|
-
suppress: []
|
|
3230
|
+
suppress: [],
|
|
3231
|
+
plugins: []
|
|
2992
3232
|
},
|
|
2993
3233
|
trigger: {
|
|
2994
3234
|
numQueries: 20,
|
|
2995
|
-
threshold: 0.8
|
|
3235
|
+
threshold: 0.8,
|
|
3236
|
+
compare: []
|
|
2996
3237
|
},
|
|
2997
3238
|
eval: {
|
|
2998
3239
|
numRuns: 5,
|
|
@@ -3014,7 +3255,7 @@ function buildConfigValidationError(error, sourceLabel) {
|
|
|
3014
3255
|
async function readJsonObject(filePath, label) {
|
|
3015
3256
|
let raw;
|
|
3016
3257
|
try {
|
|
3017
|
-
raw = await
|
|
3258
|
+
raw = await fs8.readFile(filePath, "utf8");
|
|
3018
3259
|
} catch (error) {
|
|
3019
3260
|
const message = error instanceof Error ? error.message : String(error);
|
|
3020
3261
|
throw new Error(`Failed to read ${label}: ${message}`);
|
|
@@ -3038,13 +3279,13 @@ async function loadConfigFromJsonFile(filePath) {
|
|
|
3038
3279
|
return {
|
|
3039
3280
|
configFile: parsed.data,
|
|
3040
3281
|
sourcePath: filePath,
|
|
3041
|
-
sourceDirectory:
|
|
3282
|
+
sourceDirectory: path6.dirname(filePath)
|
|
3042
3283
|
};
|
|
3043
3284
|
}
|
|
3044
3285
|
async function loadConfigFromNearestPackageJson(startDirectory) {
|
|
3045
|
-
let currentDirectory =
|
|
3286
|
+
let currentDirectory = path6.resolve(startDirectory);
|
|
3046
3287
|
while (true) {
|
|
3047
|
-
const packageJsonPath =
|
|
3288
|
+
const packageJsonPath = path6.join(currentDirectory, "package.json");
|
|
3048
3289
|
if (await pathExists(packageJsonPath)) {
|
|
3049
3290
|
const raw = await readJsonObject(packageJsonPath, packageJsonPath);
|
|
3050
3291
|
const packageJsonSchema = z7.object({
|
|
@@ -3063,7 +3304,7 @@ async function loadConfigFromNearestPackageJson(startDirectory) {
|
|
|
3063
3304
|
sourceDirectory: currentDirectory
|
|
3064
3305
|
};
|
|
3065
3306
|
}
|
|
3066
|
-
const parentDirectory =
|
|
3307
|
+
const parentDirectory = path6.dirname(currentDirectory);
|
|
3067
3308
|
if (parentDirectory === currentDirectory) {
|
|
3068
3309
|
return null;
|
|
3069
3310
|
}
|
|
@@ -3076,7 +3317,7 @@ async function resolveSkillDirectoryConfig(targetPath) {
|
|
|
3076
3317
|
}
|
|
3077
3318
|
try {
|
|
3078
3319
|
const { skillRoot } = await resolveSkillPath(targetPath);
|
|
3079
|
-
return loadConfigFromJsonFile(
|
|
3320
|
+
return loadConfigFromJsonFile(path6.join(skillRoot, ".skilltestrc"));
|
|
3080
3321
|
} catch {
|
|
3081
3322
|
return null;
|
|
3082
3323
|
}
|
|
@@ -3085,7 +3326,13 @@ function resolveConfigRelativePath(baseDirectory, value) {
|
|
|
3085
3326
|
if (!value) {
|
|
3086
3327
|
return void 0;
|
|
3087
3328
|
}
|
|
3088
|
-
return
|
|
3329
|
+
return path6.resolve(baseDirectory, value);
|
|
3330
|
+
}
|
|
3331
|
+
function resolveConfigRelativePaths(baseDirectory, values) {
|
|
3332
|
+
if (!values || values.length === 0) {
|
|
3333
|
+
return [];
|
|
3334
|
+
}
|
|
3335
|
+
return values.map((value) => path6.resolve(baseDirectory, value));
|
|
3089
3336
|
}
|
|
3090
3337
|
function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = process.cwd()) {
|
|
3091
3338
|
const merged = {
|
|
@@ -3095,12 +3342,20 @@ function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = proce
|
|
|
3095
3342
|
concurrency: cliFlags.concurrency ?? configFile.concurrency ?? DEFAULT_SKILLTEST_CONFIG.concurrency,
|
|
3096
3343
|
lint: {
|
|
3097
3344
|
failOn: cliFlags.lint?.failOn ?? configFile.lint?.failOn ?? DEFAULT_SKILLTEST_CONFIG.lint.failOn,
|
|
3098
|
-
suppress: cliFlags.lint?.suppress ?? configFile.lint?.suppress ?? DEFAULT_SKILLTEST_CONFIG.lint.suppress
|
|
3345
|
+
suppress: cliFlags.lint?.suppress ?? configFile.lint?.suppress ?? DEFAULT_SKILLTEST_CONFIG.lint.suppress,
|
|
3346
|
+
plugins: resolveConfigRelativePaths(
|
|
3347
|
+
baseDirectory,
|
|
3348
|
+
cliFlags.lint?.plugins ?? configFile.lint?.plugins ?? DEFAULT_SKILLTEST_CONFIG.lint.plugins
|
|
3349
|
+
)
|
|
3099
3350
|
},
|
|
3100
3351
|
trigger: {
|
|
3101
3352
|
numQueries: cliFlags.trigger?.numQueries ?? configFile.trigger?.numQueries ?? DEFAULT_SKILLTEST_CONFIG.trigger.numQueries,
|
|
3102
3353
|
threshold: cliFlags.trigger?.threshold ?? configFile.trigger?.threshold ?? DEFAULT_SKILLTEST_CONFIG.trigger.threshold,
|
|
3103
|
-
seed: cliFlags.trigger?.seed ?? configFile.trigger?.seed
|
|
3354
|
+
seed: cliFlags.trigger?.seed ?? configFile.trigger?.seed,
|
|
3355
|
+
compare: resolveConfigRelativePaths(
|
|
3356
|
+
baseDirectory,
|
|
3357
|
+
cliFlags.trigger?.compare ?? configFile.trigger?.compare ?? DEFAULT_SKILLTEST_CONFIG.trigger.compare
|
|
3358
|
+
)
|
|
3104
3359
|
},
|
|
3105
3360
|
eval: {
|
|
3106
3361
|
numRuns: cliFlags.eval?.numRuns ?? configFile.eval?.numRuns ?? DEFAULT_SKILLTEST_CONFIG.eval.numRuns,
|
|
@@ -3145,6 +3400,18 @@ function extractCliConfigOverrides(command) {
|
|
|
3145
3400
|
numQueries: getTypedOptionValue(command, "numQueries")
|
|
3146
3401
|
};
|
|
3147
3402
|
}
|
|
3403
|
+
if ((command.name() === "trigger" || command.name() === "check") && command.getOptionValueSource("compare") === "cli") {
|
|
3404
|
+
overrides.trigger = {
|
|
3405
|
+
...overrides.trigger,
|
|
3406
|
+
compare: getTypedOptionValue(command, "compare")
|
|
3407
|
+
};
|
|
3408
|
+
}
|
|
3409
|
+
if ((command.name() === "lint" || command.name() === "check") && command.getOptionValueSource("plugin") === "cli") {
|
|
3410
|
+
overrides.lint = {
|
|
3411
|
+
...overrides.lint,
|
|
3412
|
+
plugins: getTypedOptionValue(command, "plugin")
|
|
3413
|
+
};
|
|
3414
|
+
}
|
|
3148
3415
|
if (command.name() === "check" && command.getOptionValueSource("minF1") === "cli") {
|
|
3149
3416
|
overrides.trigger = {
|
|
3150
3417
|
...overrides.trigger,
|
|
@@ -3172,7 +3439,7 @@ async function resolveConfigContext(targetPath, cliFlags) {
|
|
|
3172
3439
|
config: mergeConfigLayers(skillDirectoryConfig.configFile, cliFlags, skillDirectoryConfig.sourceDirectory)
|
|
3173
3440
|
};
|
|
3174
3441
|
}
|
|
3175
|
-
const cwdConfigPath =
|
|
3442
|
+
const cwdConfigPath = path6.join(cwd, ".skilltestrc");
|
|
3176
3443
|
const cwdConfig = await loadConfigFromJsonFile(cwdConfigPath);
|
|
3177
3444
|
if (cwdConfig) {
|
|
3178
3445
|
return {
|
|
@@ -3393,6 +3660,7 @@ function createProvider(providerName, apiKeyOverride) {
|
|
|
3393
3660
|
var triggerCliSchema = z8.object({
|
|
3394
3661
|
queries: z8.string().optional(),
|
|
3395
3662
|
saveQueries: z8.string().optional(),
|
|
3663
|
+
compare: z8.array(z8.string().min(1)).optional(),
|
|
3396
3664
|
seed: z8.number().int().optional(),
|
|
3397
3665
|
concurrency: z8.number().int().min(1).optional(),
|
|
3398
3666
|
html: z8.string().optional(),
|
|
@@ -3441,6 +3709,7 @@ async function handleTriggerCommand(targetPath, options) {
|
|
|
3441
3709
|
provider,
|
|
3442
3710
|
queries,
|
|
3443
3711
|
numQueries: options.numQueries,
|
|
3712
|
+
compare: options.compare,
|
|
3444
3713
|
seed: options.seed,
|
|
3445
3714
|
concurrency: options.concurrency,
|
|
3446
3715
|
verbose: options.verbose
|
|
@@ -3459,7 +3728,7 @@ async function handleTriggerCommand(targetPath, options) {
|
|
|
3459
3728
|
...result,
|
|
3460
3729
|
target: targetPath
|
|
3461
3730
|
};
|
|
3462
|
-
await
|
|
3731
|
+
await fs9.writeFile(options.html, renderTriggerHtml(htmlResult), "utf8");
|
|
3463
3732
|
}
|
|
3464
3733
|
} catch (error) {
|
|
3465
3734
|
spinner?.stop();
|
|
@@ -3468,7 +3737,7 @@ async function handleTriggerCommand(targetPath, options) {
|
|
|
3468
3737
|
}
|
|
3469
3738
|
}
|
|
3470
3739
|
function registerTriggerCommand(program) {
|
|
3471
|
-
program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--concurrency <n>", "Maximum in-flight trigger requests", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
|
|
3740
|
+
program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--compare <path...>", "Path(s) to sibling skill directories to include as competitors").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--concurrency <n>", "Maximum in-flight trigger requests", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
|
|
3472
3741
|
const globalOptions = getGlobalCliOptions(command);
|
|
3473
3742
|
const config = getResolvedConfig(command);
|
|
3474
3743
|
const parsedCli = triggerCliSchema.safeParse(command.opts());
|
|
@@ -3483,6 +3752,7 @@ function registerTriggerCommand(program) {
|
|
|
3483
3752
|
provider: config.provider,
|
|
3484
3753
|
queries: parsedCli.data.queries,
|
|
3485
3754
|
numQueries: config.trigger.numQueries,
|
|
3755
|
+
compare: config.trigger.compare,
|
|
3486
3756
|
saveQueries: parsedCli.data.saveQueries,
|
|
3487
3757
|
seed: parsedCli.data.seed ?? config.trigger.seed,
|
|
3488
3758
|
concurrency: config.concurrency,
|
|
@@ -3494,7 +3764,7 @@ function registerTriggerCommand(program) {
|
|
|
3494
3764
|
}
|
|
3495
3765
|
|
|
3496
3766
|
// src/commands/eval.ts
|
|
3497
|
-
import
|
|
3767
|
+
import fs10 from "node:fs/promises";
|
|
3498
3768
|
import ora2 from "ora";
|
|
3499
3769
|
import { z as z9 } from "zod";
|
|
3500
3770
|
var evalCliSchema = z9.object({
|
|
@@ -3561,7 +3831,7 @@ async function handleEvalCommand(targetPath, options, command) {
|
|
|
3561
3831
|
...result,
|
|
3562
3832
|
target: targetPath
|
|
3563
3833
|
};
|
|
3564
|
-
await
|
|
3834
|
+
await fs10.writeFile(options.html, renderEvalHtml(htmlResult), "utf8");
|
|
3565
3835
|
}
|
|
3566
3836
|
} catch (error) {
|
|
3567
3837
|
spinner?.stop();
|
|
@@ -3600,7 +3870,7 @@ function registerEvalCommand(program) {
|
|
|
3600
3870
|
}
|
|
3601
3871
|
|
|
3602
3872
|
// src/commands/check.ts
|
|
3603
|
-
import
|
|
3873
|
+
import fs11 from "node:fs/promises";
|
|
3604
3874
|
import ora3 from "ora";
|
|
3605
3875
|
import { z as z10 } from "zod";
|
|
3606
3876
|
|
|
@@ -3613,7 +3883,7 @@ function calculateEvalAssertPassRate(result) {
|
|
|
3613
3883
|
}
|
|
3614
3884
|
async function runCheck(inputPath, options) {
|
|
3615
3885
|
options.onStage?.("lint");
|
|
3616
|
-
const lint = await runLinter(inputPath, { suppress: options.lintSuppress });
|
|
3886
|
+
const lint = await runLinter(inputPath, { suppress: options.lintSuppress, plugins: options.lintPlugins });
|
|
3617
3887
|
const lintPassed = !lintFails(lint, options.lintFailOn);
|
|
3618
3888
|
let trigger = null;
|
|
3619
3889
|
let evalResult = null;
|
|
@@ -3637,6 +3907,7 @@ async function runCheck(inputPath, options) {
|
|
|
3637
3907
|
provider: options.provider,
|
|
3638
3908
|
model: options.model,
|
|
3639
3909
|
queries: options.queries,
|
|
3910
|
+
compare: options.compare,
|
|
3640
3911
|
numQueries: options.numQueries,
|
|
3641
3912
|
seed: options.triggerSeed,
|
|
3642
3913
|
concurrency: options.concurrency,
|
|
@@ -3698,8 +3969,10 @@ var checkCliSchema = z10.object({
|
|
|
3698
3969
|
graderModel: z10.string().optional(),
|
|
3699
3970
|
apiKey: z10.string().optional(),
|
|
3700
3971
|
queries: z10.string().optional(),
|
|
3972
|
+
compare: z10.array(z10.string().min(1)).optional(),
|
|
3701
3973
|
seed: z10.number().int().optional(),
|
|
3702
3974
|
prompts: z10.string().optional(),
|
|
3975
|
+
plugin: z10.array(z10.string().min(1)).optional(),
|
|
3703
3976
|
concurrency: z10.number().int().min(1).optional(),
|
|
3704
3977
|
html: z10.string().optional(),
|
|
3705
3978
|
saveResults: z10.string().optional(),
|
|
@@ -3708,6 +3981,9 @@ var checkCliSchema = z10.object({
|
|
|
3708
3981
|
});
|
|
3709
3982
|
var DEFAULT_ANTHROPIC_MODEL3 = "claude-sonnet-4-5-20250929";
|
|
3710
3983
|
var DEFAULT_OPENAI_MODEL3 = "gpt-4.1-mini";
|
|
3984
|
+
function collectPluginPaths2(value, previous = []) {
|
|
3985
|
+
return [...previous, value];
|
|
3986
|
+
}
|
|
3711
3987
|
function resolveModel3(provider, model) {
|
|
3712
3988
|
if (provider === "openai" && model === DEFAULT_ANTHROPIC_MODEL3) {
|
|
3713
3989
|
return DEFAULT_OPENAI_MODEL3;
|
|
@@ -3758,7 +4034,9 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
3758
4034
|
graderModel,
|
|
3759
4035
|
lintFailOn: options.lintFailOn,
|
|
3760
4036
|
lintSuppress: options.lintSuppress,
|
|
4037
|
+
lintPlugins: options.lintPlugins,
|
|
3761
4038
|
queries,
|
|
4039
|
+
compare: options.compare,
|
|
3762
4040
|
numQueries: options.numQueries,
|
|
3763
4041
|
triggerSeed: options.triggerSeed,
|
|
3764
4042
|
prompts,
|
|
@@ -3794,7 +4072,7 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
3794
4072
|
);
|
|
3795
4073
|
}
|
|
3796
4074
|
if (options.html) {
|
|
3797
|
-
await
|
|
4075
|
+
await fs11.writeFile(options.html, renderCheckHtml(result), "utf8");
|
|
3798
4076
|
}
|
|
3799
4077
|
process.exitCode = result.gates.overallPassed ? 0 : 1;
|
|
3800
4078
|
} catch (error) {
|
|
@@ -3804,7 +4082,7 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
3804
4082
|
}
|
|
3805
4083
|
}
|
|
3806
4084
|
function registerCheckCommand(program) {
|
|
3807
|
-
program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
|
|
4085
|
+
program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--compare <path...>", "Path(s) to sibling skill directories to include as competitors").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--plugin <path>", "Load a custom lint plugin file", collectPluginPaths2, []).option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
|
|
3808
4086
|
const globalOptions = getGlobalCliOptions(command);
|
|
3809
4087
|
const config = getResolvedConfig(command);
|
|
3810
4088
|
const parsedCli = checkCliSchema.safeParse(command.opts());
|
|
@@ -3822,6 +4100,7 @@ function registerCheckCommand(program) {
|
|
|
3822
4100
|
graderModel: parsedCli.data.graderModel,
|
|
3823
4101
|
apiKey: parsedCli.data.apiKey,
|
|
3824
4102
|
queries: parsedCli.data.queries,
|
|
4103
|
+
compare: config.trigger.compare,
|
|
3825
4104
|
numQueries: config.trigger.numQueries,
|
|
3826
4105
|
prompts: parsedCli.data.prompts,
|
|
3827
4106
|
minF1: config.trigger.threshold,
|
|
@@ -3831,6 +4110,7 @@ function registerCheckCommand(program) {
|
|
|
3831
4110
|
html: parsedCli.data.html,
|
|
3832
4111
|
lintFailOn: config.lint.failOn,
|
|
3833
4112
|
lintSuppress: config.lint.suppress,
|
|
4113
|
+
lintPlugins: config.lint.plugins,
|
|
3834
4114
|
triggerSeed: parsedCli.data.seed ?? config.trigger.seed,
|
|
3835
4115
|
saveResults: parsedCli.data.saveResults,
|
|
3836
4116
|
continueOnLintFail: Boolean(parsedCli.data.continueOnLintFail),
|
|
@@ -3845,8 +4125,8 @@ function registerCheckCommand(program) {
|
|
|
3845
4125
|
function resolveVersion() {
|
|
3846
4126
|
try {
|
|
3847
4127
|
const currentFilePath = fileURLToPath(import.meta.url);
|
|
3848
|
-
const packageJsonPath =
|
|
3849
|
-
const raw =
|
|
4128
|
+
const packageJsonPath = path7.resolve(path7.dirname(currentFilePath), "..", "package.json");
|
|
4129
|
+
const raw = fs12.readFileSync(packageJsonPath, "utf8");
|
|
3850
4130
|
const parsed = JSON.parse(raw);
|
|
3851
4131
|
return parsed.version ?? "0.0.0";
|
|
3852
4132
|
} catch {
|