skilltest 0.6.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +3 -6
- package/README.md +104 -2
- package/dist/index.js +441 -140
- package/dist/index.js.map +1 -1
- package/package.json +4 -3
package/dist/index.js
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/index.ts
|
|
4
|
-
import
|
|
5
|
-
import
|
|
4
|
+
import fs12 from "node:fs";
|
|
5
|
+
import path7 from "node:path";
|
|
6
6
|
import { fileURLToPath } from "node:url";
|
|
7
7
|
import { Command } from "commander";
|
|
8
8
|
|
|
9
9
|
// src/commands/lint.ts
|
|
10
|
-
import
|
|
10
|
+
import fs7 from "node:fs/promises";
|
|
11
11
|
import { z as z6 } from "zod";
|
|
12
12
|
|
|
13
13
|
// src/core/skill-parser.ts
|
|
@@ -581,24 +581,6 @@ function runContentChecks(context) {
|
|
|
581
581
|
message: "No obvious vague placeholder phrasing found."
|
|
582
582
|
});
|
|
583
583
|
}
|
|
584
|
-
if (context.frontmatter.rawFrontmatter && /[<>]/.test(context.frontmatter.rawFrontmatter)) {
|
|
585
|
-
issues.push({
|
|
586
|
-
id: "content.frontmatter-angle-brackets",
|
|
587
|
-
checkId: "content:angle-brackets",
|
|
588
|
-
title: "Frontmatter Angle Brackets",
|
|
589
|
-
status: "warn",
|
|
590
|
-
message: "Frontmatter contains angle bracket characters (< or >), which can be misinterpreted in some agents.",
|
|
591
|
-
suggestion: "Remove XML-like tags from frontmatter values when possible."
|
|
592
|
-
});
|
|
593
|
-
} else {
|
|
594
|
-
issues.push({
|
|
595
|
-
id: "content.frontmatter-angle-brackets",
|
|
596
|
-
checkId: "content:angle-brackets",
|
|
597
|
-
title: "Frontmatter Angle Brackets",
|
|
598
|
-
status: "pass",
|
|
599
|
-
message: "No angle bracket tokens detected in frontmatter."
|
|
600
|
-
});
|
|
601
|
-
}
|
|
602
584
|
const secretsIssue = buildSecretsIssue(context);
|
|
603
585
|
if (secretsIssue) {
|
|
604
586
|
issues.push(secretsIssue);
|
|
@@ -951,6 +933,24 @@ function runFrontmatterChecks(context) {
|
|
|
951
933
|
message: "license field is present."
|
|
952
934
|
});
|
|
953
935
|
}
|
|
936
|
+
if (context.frontmatter.rawFrontmatter && /[<>]/.test(context.frontmatter.rawFrontmatter)) {
|
|
937
|
+
issues.push({
|
|
938
|
+
id: "frontmatter.angle-brackets",
|
|
939
|
+
checkId: "frontmatter:angle-brackets",
|
|
940
|
+
title: "Frontmatter Angle Brackets",
|
|
941
|
+
status: "warn",
|
|
942
|
+
message: "Frontmatter contains angle bracket characters (< or >), which can be misinterpreted in some agents.",
|
|
943
|
+
suggestion: "Remove XML-like tags from frontmatter values when possible."
|
|
944
|
+
});
|
|
945
|
+
} else {
|
|
946
|
+
issues.push({
|
|
947
|
+
id: "frontmatter.angle-brackets",
|
|
948
|
+
checkId: "frontmatter:angle-brackets",
|
|
949
|
+
title: "Frontmatter Angle Brackets",
|
|
950
|
+
status: "pass",
|
|
951
|
+
message: "No angle bracket tokens detected in frontmatter."
|
|
952
|
+
});
|
|
953
|
+
}
|
|
954
954
|
if (description && description.trim() !== "" && !descriptionLooksActionable(description)) {
|
|
955
955
|
issues.push({
|
|
956
956
|
id: "frontmatter.description.triggerability",
|
|
@@ -972,6 +972,116 @@ function runFrontmatterChecks(context) {
|
|
|
972
972
|
return issues;
|
|
973
973
|
}
|
|
974
974
|
|
|
975
|
+
// src/core/linter/plugin.ts
|
|
976
|
+
import fs4 from "node:fs/promises";
|
|
977
|
+
import path4 from "node:path";
|
|
978
|
+
import { pathToFileURL } from "node:url";
|
|
979
|
+
/**
 * Namespaces a plugin rule's check id.
 *
 * Ids that already contain a ":" are returned unchanged; any other id is
 * prefixed with "plugin:".
 *
 * @param {string} checkId - Check id as declared by the plugin rule.
 * @returns {string} The id, guaranteed to contain a ":" separator.
 */
function normalizeRuleCheckId(checkId) {
  if (checkId.includes(":")) {
    return checkId;
  }
  return `plugin:${checkId}`;
}
|
|
982
|
+
/**
 * Creates (does not throw) the error used for every plugin-shape violation.
 *
 * @param {string} filePath - Path of the plugin file being validated.
 * @param {string} message - Description of what is wrong with the plugin.
 * @returns {Error} Error whose message names the plugin file and the problem.
 */
function buildPluginValidationError(filePath, message) {
  const description = `Invalid lint plugin at ${filePath}: ${message}`;
  return new Error(description);
}
|
|
985
|
+
/**
 * Validates one module export as a lint plugin and returns a normalized copy.
 *
 * The candidate must be an object with a `rules` array. Every rule must be an
 * object with a non-blank string `checkId`, a non-blank string `title` and a
 * `check` function. Only those three fields are kept on the returned rules,
 * and each `checkId` is passed through `normalizeRuleCheckId`.
 *
 * @param {unknown} candidate - Value of the module export under inspection.
 * @param {string} filePath - Plugin file path, used in error messages.
 * @param {string} exportName - Name of the export, used in error messages.
 * @returns {{rules: Array<{checkId: string, title: string, check: Function}>}}
 * @throws {Error} When the candidate or any of its rules is malformed.
 */
function validatePluginCandidate(candidate, filePath, exportName) {
  const invalid = (message) => buildPluginValidationError(filePath, message);
  const isBlank = (value) => typeof value !== "string" || value.trim() === "";

  const looksLikePlugin = Boolean(candidate) && typeof candidate === "object" && "rules" in candidate;
  if (!looksLikePlugin) {
    throw invalid(`${exportName} export must be an object with a rules array.`);
  }

  const { rules } = candidate;
  if (!Array.isArray(rules)) {
    throw invalid(`${exportName} export must include a rules array.`);
  }

  // Fields are read and checked one at a time so the first problem found on a
  // rule is the one that gets reported.
  const toValidatedRule = (rule, index) => {
    if (!rule || typeof rule !== "object") {
      throw invalid(`rule at index ${index} must be an object.`);
    }
    const { checkId } = rule;
    if (isBlank(checkId)) {
      throw invalid(`rule at index ${index} must have a non-empty string checkId.`);
    }
    const { title } = rule;
    if (isBlank(title)) {
      throw invalid(`rule at index ${index} must have a non-empty string title.`);
    }
    const { check } = rule;
    if (typeof check !== "function") {
      throw invalid(`rule '${checkId}' must have a check function.`);
    }
    return { checkId: normalizeRuleCheckId(checkId), title, check };
  };

  return { rules: rules.map(toValidatedRule) };
}
|
|
1018
|
+
/**
 * Loads a lint plugin module from disk and returns its validated rules.
 *
 * The path is resolved to an absolute path, checked with `fs.access`, and
 * imported via a file URL. The `default` export is tried first, then the
 * named `plugin` export; the first one that passes
 * `validatePluginCandidate` is returned.
 *
 * @param {string} filePath - Path to the plugin module (relative or absolute).
 * @returns {Promise<{rules: Array<object>}>} The validated plugin.
 * @throws {Error} When the file cannot be accessed, the import fails, or no
 *   export is a valid plugin. Access and import failures carry the original
 *   error as `cause`.
 */
async function loadPlugin(filePath) {
  const absolutePath = path4.resolve(filePath);
  try {
    await fs4.access(absolutePath);
  } catch (error) {
    const code = error && typeof error === "object" ? error.code : void 0;
    // Only "no such file" style failures are reported as a missing file;
    // anything else (e.g. EACCES) keeps its own message so the user is not
    // sent looking for a file that is actually there.
    const isMissing = code === void 0 || code === "ENOENT" || code === "ENOTDIR";
    const reason = isMissing ? "file does not exist." : `file is not accessible (${error instanceof Error ? error.message : String(error)}).`;
    throw new Error(`Failed to load lint plugin at ${absolutePath}: ${reason}`, { cause: error });
  }
  let loadedModule;
  try {
    loadedModule = await import(pathToFileURL(absolutePath).href);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    // Keep the original error reachable so the plugin author can see its stack.
    throw new Error(`Failed to load lint plugin at ${absolutePath}: ${message}`, { cause: error });
  }
  const validationErrors = [];
  const candidates = [
    ["default", loadedModule.default],
    ["plugin", loadedModule.plugin]
  ];
  for (const [exportName, candidate] of candidates) {
    if (candidate === void 0) {
      continue;
    }
    try {
      return validatePluginCandidate(candidate, absolutePath, exportName);
    } catch (error) {
      // Remember why this export was rejected and fall through to the next one.
      validationErrors.push(error instanceof Error ? error.message : String(error));
    }
  }
  if (validationErrors.length > 0) {
    throw new Error(validationErrors.join(" "));
  }
  throw buildPluginValidationError(
    absolutePath,
    "expected a default export or named export 'plugin' containing a rules array."
  );
}
|
|
1054
|
+
/**
 * Converts a failure raised while running a plugin rule into a failing lint
 * issue, so one broken rule is reported instead of aborting the run.
 *
 * @param {{checkId: string}} rule - The rule whose check failed.
 * @param {unknown} error - The thrown value; non-Error values are stringified.
 * @returns {{id: string, checkId: string, title: string, status: string, message: string}}
 */
function buildRuleExecutionError(rule, error) {
  const reason = error instanceof Error ? error.message : String(error);
  // Slug for the issue id: non-alphanumeric runs collapse to "-", edge dashes
  // are trimmed, and the result is lower-cased.
  const slug = rule.checkId
    .replace(/[^A-Za-z0-9]+/g, "-")
    .replace(/^-+|-+$/g, "")
    .toLowerCase();
  return {
    id: `plugin.load-error.${slug}`,
    checkId: "plugin:load-error",
    title: "Plugin Rule Error",
    status: "fail",
    message: `Plugin rule '${rule.checkId}' failed: ${reason}`
  };
}
|
|
1064
|
+
/**
 * Runs every rule of a plugin against the lint context and collects the
 * issues they report.
 *
 * Each returned issue has its `checkId` overwritten with the rule's id. A
 * rule that throws, returns a non-array, or returns an array containing a
 * non-object entry contributes one issue built by `buildRuleExecutionError`
 * in place of its results; the remaining rules still run.
 *
 * @param {{rules: Array<{checkId: string, check: Function}>}} plugin - Validated plugin.
 * @param {object} context - Lint context passed to each rule's `check`.
 * @returns {Promise<Array<object>>} Issues from all rules, in rule order.
 */
async function runPluginRules(plugin, context) {
  const issues = [];
  for (const rule of plugin.rules) {
    try {
      const result = await rule.check(context);
      if (!Array.isArray(result)) {
        throw new Error("check function must return an array of lint issues.");
      }
      // Array.from also visits holes in sparse arrays, so an entry that is
      // missing, null or a primitive is rejected here instead of being spread
      // into a malformed issue.
      const ruleIssues = Array.from(result, (issue, index) => {
        if (!issue || typeof issue !== "object") {
          throw new Error(`check function returned a non-object lint issue at index ${index}.`);
        }
        return {
          ...issue,
          checkId: rule.checkId
        };
      });
      issues.push(...ruleIssues);
    } catch (error) {
      issues.push(buildRuleExecutionError(rule, error));
    }
  }
  return issues;
}
|
|
1084
|
+
|
|
975
1085
|
// src/core/linter/security.ts
|
|
976
1086
|
var DANGEROUS_COMMAND_PATTERNS = [
|
|
977
1087
|
{
|
|
@@ -1179,8 +1289,8 @@ function runSecurityChecks(context) {
|
|
|
1179
1289
|
}
|
|
1180
1290
|
|
|
1181
1291
|
// src/core/linter/structure.ts
|
|
1182
|
-
import
|
|
1183
|
-
import
|
|
1292
|
+
import fs5 from "node:fs/promises";
|
|
1293
|
+
import path5 from "node:path";
|
|
1184
1294
|
function hasTableOfContents(content) {
|
|
1185
1295
|
if (/^#{1,6}\s+table of contents\b/im.test(content)) {
|
|
1186
1296
|
return true;
|
|
@@ -1221,21 +1331,21 @@ async function runStructureChecks(context) {
|
|
|
1221
1331
|
message: `SKILL.md length is ${context.skill.lineCount} lines.`
|
|
1222
1332
|
});
|
|
1223
1333
|
}
|
|
1224
|
-
const referencesDir =
|
|
1334
|
+
const referencesDir = path5.join(context.skill.skillRoot, "references");
|
|
1225
1335
|
if (await pathExists(referencesDir)) {
|
|
1226
1336
|
const files = await listFilesRecursive(referencesDir);
|
|
1227
1337
|
let oversizedWithoutToc = 0;
|
|
1228
1338
|
for (const file of files) {
|
|
1229
|
-
const raw = await
|
|
1339
|
+
const raw = await fs5.readFile(file, "utf8");
|
|
1230
1340
|
const lineCount = raw === "" ? 0 : raw.split(/\r?\n/).length;
|
|
1231
1341
|
if (lineCount > 300 && !hasTableOfContents(raw)) {
|
|
1232
1342
|
oversizedWithoutToc += 1;
|
|
1233
1343
|
issues.push({
|
|
1234
|
-
id: `structure.references.toc.${toPosixPath(
|
|
1344
|
+
id: `structure.references.toc.${toPosixPath(path5.relative(context.skill.skillRoot, file))}`,
|
|
1235
1345
|
checkId: "structure:toc",
|
|
1236
1346
|
title: "Reference File Navigation",
|
|
1237
1347
|
status: "warn",
|
|
1238
|
-
message: `${toPosixPath(
|
|
1348
|
+
message: `${toPosixPath(path5.relative(context.skill.skillRoot, file))} is ${lineCount} lines and has no table of contents.`,
|
|
1239
1349
|
suggestion: "Add a table of contents for long reference files."
|
|
1240
1350
|
});
|
|
1241
1351
|
}
|
|
@@ -1265,7 +1375,7 @@ async function runStructureChecks(context) {
|
|
|
1265
1375
|
other: []
|
|
1266
1376
|
};
|
|
1267
1377
|
for (const reference of references) {
|
|
1268
|
-
const resolved =
|
|
1378
|
+
const resolved = path5.resolve(context.skill.skillRoot, reference);
|
|
1269
1379
|
if (!await pathExists(resolved)) {
|
|
1270
1380
|
const kind = classifyReferencePath(reference);
|
|
1271
1381
|
missingByType[kind].push(reference);
|
|
@@ -1362,6 +1472,10 @@ async function runLinter(inputPath, options = {}) {
|
|
|
1362
1472
|
issues.push(...runSecurityChecks(context));
|
|
1363
1473
|
issues.push(...await runDisclosureChecks(context));
|
|
1364
1474
|
issues.push(...runCompatibilityChecks(context));
|
|
1475
|
+
for (const pluginPath of options.plugins ?? []) {
|
|
1476
|
+
const plugin = await loadPlugin(pluginPath);
|
|
1477
|
+
issues.push(...await runPluginRules(plugin, context));
|
|
1478
|
+
}
|
|
1365
1479
|
const filteredIssues = issues.filter((issue) => !suppressedCheckIds.has(issue.checkId));
|
|
1366
1480
|
return {
|
|
1367
1481
|
target: inputPath,
|
|
@@ -1525,10 +1639,10 @@ function renderLintIssueList(report) {
|
|
|
1525
1639
|
const info = skippedSecurityPatterns > 0 ? `<p class="info-line">Skipped security patterns in examples/comments: ${escapeHtml(skippedSecurityPatterns)}</p>` : "";
|
|
1526
1640
|
return `<div class="row-list">${rows}</div>${info}`;
|
|
1527
1641
|
}
|
|
1528
|
-
function renderTriggerCaseRow(testCase) {
|
|
1642
|
+
function renderTriggerCaseRow(testCase, showSelectedCompetitor) {
|
|
1529
1643
|
const details = testCase.rawModelResponse ? renderDetails("Model response", renderPreBlock(testCase.rawModelResponse)) : "";
|
|
1530
1644
|
return `
|
|
1531
|
-
<div class="row">
|
|
1645
|
+
<div class="row${testCase.selectedCompetitor ? " competitor-selected" : ""}">
|
|
1532
1646
|
<div class="row-header">
|
|
1533
1647
|
<div>
|
|
1534
1648
|
<div class="row-title">${escapeHtml(testCase.query)}</div>
|
|
@@ -1540,12 +1654,29 @@ function renderTriggerCaseRow(testCase) {
|
|
|
1540
1654
|
</div>
|
|
1541
1655
|
${renderDefinitionList([
|
|
1542
1656
|
{ label: "Expected", value: testCase.expected },
|
|
1543
|
-
{ label: "Actual", value: testCase.actual }
|
|
1657
|
+
{ label: "Actual", value: testCase.actual },
|
|
1658
|
+
...showSelectedCompetitor ? [{ label: "Selected competitor", value: testCase.selectedCompetitor ?? "none" }] : []
|
|
1544
1659
|
])}
|
|
1545
1660
|
${details}
|
|
1546
1661
|
</div>
|
|
1547
1662
|
`;
|
|
1548
1663
|
}
|
|
1664
|
+
/**
 * Renders the "Competitor Skills" section card of the trigger HTML report.
 *
 * @param {{competitors?: Array<{name: string, description: string, sourcePath: string}>}} result
 *   Trigger result; `competitors` may be missing or empty.
 * @returns {string} Section HTML, or "" when there are no competitors.
 */
function renderCompetitorSkillsSection(result) {
  const competitors = result.competitors;
  if (!competitors || competitors.length === 0) {
    return "";
  }
  // One "warn" row per competitor, with its source path as a definition list.
  const rows = competitors.map((competitor) => {
    const sourceInfo = renderDefinitionList([{ label: "Source", value: competitor.sourcePath }]);
    return renderMessageRow("warn", competitor.name, competitor.description, sourceInfo);
  });
  return renderSectionCard("Competitor Skills", `<div class="row-list">${rows.join("")}</div>`);
}
|
|
1549
1680
|
function promptStatus(promptResult) {
|
|
1550
1681
|
if (promptResult.totalAssertions === 0) {
|
|
1551
1682
|
return "skip";
|
|
@@ -1638,6 +1769,7 @@ function renderHtmlDocument(title, body) {
|
|
|
1638
1769
|
--pass: #22c55e;
|
|
1639
1770
|
--warn: #eab308;
|
|
1640
1771
|
--fail: #ef4444;
|
|
1772
|
+
--competitor: #f97316;
|
|
1641
1773
|
--skip: #6b7280;
|
|
1642
1774
|
--shadow: 0 10px 30px rgba(15, 23, 42, 0.08);
|
|
1643
1775
|
}
|
|
@@ -1786,6 +1918,11 @@ function renderHtmlDocument(title, body) {
|
|
|
1786
1918
|
background: var(--surface-muted);
|
|
1787
1919
|
}
|
|
1788
1920
|
|
|
1921
|
+
.row.competitor-selected {
|
|
1922
|
+
border-color: rgba(249, 115, 22, 0.45);
|
|
1923
|
+
background: rgba(249, 115, 22, 0.08);
|
|
1924
|
+
}
|
|
1925
|
+
|
|
1789
1926
|
.row-header {
|
|
1790
1927
|
display: flex;
|
|
1791
1928
|
justify-content: space-between;
|
|
@@ -1965,6 +2102,7 @@ function renderTriggerHtml(result) {
|
|
|
1965
2102
|
const target = resolveOptionalTarget(htmlResult, result.skillName);
|
|
1966
2103
|
const matchedCount = result.cases.filter((testCase) => testCase.matched).length;
|
|
1967
2104
|
const matchRate = result.cases.length === 0 ? 0 : matchedCount / result.cases.length;
|
|
2105
|
+
const hasCompetitors = Boolean(result.competitors && result.competitors.length > 0);
|
|
1968
2106
|
const body = [
|
|
1969
2107
|
renderHeaderCard(
|
|
1970
2108
|
"trigger",
|
|
@@ -1980,10 +2118,15 @@ function renderTriggerHtml(result) {
|
|
|
1980
2118
|
{ label: "Provider", value: result.provider },
|
|
1981
2119
|
{ label: "Model", value: result.model },
|
|
1982
2120
|
{ label: "Seed", value: result.seed !== void 0 ? String(result.seed) : "none" },
|
|
2121
|
+
...hasCompetitors ? [{ label: "Competitors", value: String(result.competitors?.length ?? 0) }] : [],
|
|
1983
2122
|
{ label: "Queries", value: String(result.queries.length) }
|
|
1984
2123
|
]
|
|
1985
2124
|
),
|
|
1986
|
-
|
|
2125
|
+
renderCompetitorSkillsSection(result),
|
|
2126
|
+
renderSectionCard(
|
|
2127
|
+
"Trigger Cases",
|
|
2128
|
+
`<div class="row-list">${result.cases.map((testCase) => renderTriggerCaseRow(testCase, hasCompetitors)).join("")}</div>`
|
|
2129
|
+
),
|
|
1987
2130
|
renderSectionCard(
|
|
1988
2131
|
"Suggestions",
|
|
1989
2132
|
`<ul>${result.suggestions.map((suggestion) => `<li>${escapeHtml(suggestion)}</li>`).join("")}</ul>`
|
|
@@ -2023,7 +2166,8 @@ function renderEvalHtml(result) {
|
|
|
2023
2166
|
}
|
|
2024
2167
|
function renderCheckHtml(result) {
|
|
2025
2168
|
const skillName = result.trigger?.skillName ?? result.eval?.skillName ?? result.target;
|
|
2026
|
-
const triggerBody = result.trigger ?
|
|
2169
|
+
const triggerBody = result.trigger ? `${renderCompetitorSkillsSection(result.trigger)}
|
|
2170
|
+
<div class="row-list">${result.trigger.cases.map((testCase) => renderTriggerCaseRow(testCase, Boolean(result.trigger?.competitors?.length))).join("")}</div>
|
|
2027
2171
|
<div class="card" style="margin-top: 16px;">
|
|
2028
2172
|
<h2>Trigger Suggestions</h2>
|
|
2029
2173
|
<ul>${result.trigger.suggestions.map((suggestion) => `<li>${escapeHtml(suggestion)}</li>`).join("")}</ul>
|
|
@@ -2123,46 +2267,47 @@ function countSkippedSecurityPatterns2(issues) {
|
|
|
2123
2267
|
return total + (issue.skippedPatterns?.length ?? 0);
|
|
2124
2268
|
}, 0);
|
|
2125
2269
|
}
|
|
2270
|
+
/**
 * Formats a ratio as a percentage with one decimal place.
 *
 * @param {number} value - Ratio to format (for example 0.5 for "50.0%").
 * @returns {string} The value times 100, fixed to one decimal, followed by "%".
 */
function formatPercent2(value) {
  const scaled = value * 100;
  return `${scaled.toFixed(1)}%`;
}
|
|
2126
2273
|
/**
 * Renders a lint report as plain text for the terminal.
 *
 * Output is a three-line header (tool name, target, summary counts), then
 * one line per issue, then an INFO line when any security patterns were
 * skipped because they appeared in code examples or comments.
 *
 * @param {{target: string, summary: {passed: number, warnings: number, failures: number, total: number}, issues: Array<object>}} report
 * @param {boolean} enableColor - Passed to `getChalkInstance`.
 * @returns {string} The rendered report.
 */
function renderLintReport(report, enableColor) {
  const c = getChalkInstance(enableColor);
  const { passed, warnings, failures, total } = report.summary;
  const header = [
    "skilltest lint",
    `target: ${report.target}`,
    `summary: ${passed}/${total} checks passed, ${warnings} warnings, ${failures} failures`
  ].join("\n");
  const renderedIssues = report.issues.map((issue) => renderIssueLine(issue, c)).join("\n");
  const skippedCount = countSkippedSecurityPatterns2(report.issues);
  let infoLine = "";
  if (skippedCount > 0) {
    infoLine = `\n${c.cyan("INFO")} ${skippedCount} security pattern(s) found in code examples/comments (not flagged)`;
  }
  return `${header}\n${renderedIssues}${infoLine}`;
}
|
|
2144
|
-
function formatPercent2(value) {
|
|
2145
|
-
return `${(value * 100).toFixed(1)}%`;
|
|
2146
|
-
}
|
|
2147
2288
|
function renderTriggerReport(result, enableColor, verbose) {
|
|
2148
2289
|
const c = getChalkInstance(enableColor);
|
|
2149
|
-
const lines = [
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
|
|
2153
|
-
|
|
2154
|
-
|
|
2290
|
+
const lines = [
|
|
2291
|
+
"skilltest trigger",
|
|
2292
|
+
`skill: ${result.skillName}`,
|
|
2293
|
+
`provider/model: ${result.provider}/${result.model}`
|
|
2294
|
+
];
|
|
2295
|
+
if (result.competitors && result.competitors.length > 0) {
|
|
2296
|
+
lines.push(`competitors: ${result.competitors.map((competitor) => competitor.name).join(", ")}`);
|
|
2297
|
+
}
|
|
2155
2298
|
lines.push(
|
|
2156
|
-
|
|
2299
|
+
`precision: ${formatPercent2(result.metrics.precision)} recall: ${formatPercent2(result.metrics.recall)} f1: ${formatPercent2(result.metrics.f1)}`
|
|
2157
2300
|
);
|
|
2158
2301
|
lines.push(
|
|
2159
|
-
|
|
2302
|
+
`TP ${result.metrics.truePositives} TN ${result.metrics.trueNegatives} FP ${result.metrics.falsePositives} FN ${result.metrics.falseNegatives}`
|
|
2160
2303
|
);
|
|
2161
|
-
lines.push("\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518");
|
|
2162
2304
|
for (const [index, testCase] of result.cases.entries()) {
|
|
2163
2305
|
const status = testCase.matched ? c.green("PASS") : c.red("FAIL");
|
|
2164
2306
|
lines.push(`${index + 1}. ${status} query: ${testCase.query}`);
|
|
2165
2307
|
lines.push(` expected: ${testCase.expected} | actual: ${testCase.actual}`);
|
|
2308
|
+
if (verbose && testCase.selectedCompetitor) {
|
|
2309
|
+
lines.push(` competitor selected: ${testCase.selectedCompetitor}`);
|
|
2310
|
+
}
|
|
2166
2311
|
if (verbose && testCase.rawModelResponse) {
|
|
2167
2312
|
lines.push(` model: ${testCase.rawModelResponse.replace(/\s+/g, " ").trim()}`);
|
|
2168
2313
|
}
|
|
@@ -2175,15 +2320,13 @@ function renderTriggerReport(result, enableColor, verbose) {
|
|
|
2175
2320
|
}
|
|
2176
2321
|
function renderEvalReport(result, enableColor, verbose) {
|
|
2177
2322
|
const c = getChalkInstance(enableColor);
|
|
2178
|
-
const lines = [
|
|
2179
|
-
|
|
2180
|
-
|
|
2181
|
-
|
|
2182
|
-
|
|
2183
|
-
|
|
2184
|
-
|
|
2185
|
-
lines.push(`\u2502 assertions passed: ${result.summary.passedAssertions}/${result.summary.totalAssertions}`);
|
|
2186
|
-
lines.push("\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518");
|
|
2323
|
+
const lines = [
|
|
2324
|
+
"skilltest eval",
|
|
2325
|
+
`skill: ${result.skillName}`,
|
|
2326
|
+
`provider/model: ${result.provider}/${result.model}`,
|
|
2327
|
+
`grader model: ${result.graderModel}`,
|
|
2328
|
+
`assertions passed: ${result.summary.passedAssertions}/${result.summary.totalAssertions}`
|
|
2329
|
+
];
|
|
2187
2330
|
for (const [index, promptResult] of result.results.entries()) {
|
|
2188
2331
|
lines.push(`${index + 1}. prompt: ${promptResult.prompt}`);
|
|
2189
2332
|
lines.push(` response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`);
|
|
@@ -2229,7 +2372,7 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2229
2372
|
}
|
|
2230
2373
|
const skippedSecurityPatterns = countSkippedSecurityPatterns2(result.lint.issues);
|
|
2231
2374
|
if (skippedSecurityPatterns > 0) {
|
|
2232
|
-
lines.push(` ${c.cyan("
|
|
2375
|
+
lines.push(` ${c.cyan("INFO")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)`);
|
|
2233
2376
|
}
|
|
2234
2377
|
lines.push("");
|
|
2235
2378
|
lines.push("Trigger");
|
|
@@ -2240,11 +2383,17 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2240
2383
|
lines.push(
|
|
2241
2384
|
` TP ${result.trigger.metrics.truePositives} TN ${result.trigger.metrics.trueNegatives} FP ${result.trigger.metrics.falsePositives} FN ${result.trigger.metrics.falseNegatives}`
|
|
2242
2385
|
);
|
|
2386
|
+
if (result.trigger.competitors && result.trigger.competitors.length > 0) {
|
|
2387
|
+
lines.push(` competitors: ${result.trigger.competitors.map((competitor) => competitor.name).join(", ")}`);
|
|
2388
|
+
}
|
|
2243
2389
|
const triggerCases = verbose ? result.trigger.cases : result.trigger.cases.filter((testCase) => !testCase.matched);
|
|
2244
2390
|
for (const testCase of triggerCases) {
|
|
2245
2391
|
const status = testCase.matched ? c.green("PASS") : c.red("FAIL");
|
|
2246
2392
|
lines.push(` - ${status} ${testCase.query}`);
|
|
2247
2393
|
lines.push(` expected=${testCase.expected} actual=${testCase.actual}`);
|
|
2394
|
+
if (testCase.selectedCompetitor) {
|
|
2395
|
+
lines.push(` competitor selected=${testCase.selectedCompetitor}`);
|
|
2396
|
+
}
|
|
2248
2397
|
}
|
|
2249
2398
|
} else {
|
|
2250
2399
|
lines.push(`- ${triggerGate} ${result.triggerSkippedReason ?? "Skipped."}`);
|
|
@@ -2286,7 +2435,7 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2286
2435
|
}
|
|
2287
2436
|
|
|
2288
2437
|
// src/commands/common.ts
|
|
2289
|
-
import
|
|
2438
|
+
import fs6 from "node:fs/promises";
|
|
2290
2439
|
import { z as z5 } from "zod";
|
|
2291
2440
|
|
|
2292
2441
|
// src/core/eval-runner.ts
|
|
@@ -2314,12 +2463,13 @@ function extractJsonObject(raw) {
|
|
|
2314
2463
|
}
|
|
2315
2464
|
throw new Error("Grader did not return a JSON object.");
|
|
2316
2465
|
}
|
|
2317
|
-
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
|
|
2321
|
-
|
|
2322
|
-
|
|
2466
|
+
var DEFAULT_ASSERTIONS = [
|
|
2467
|
+
"The response follows the skill instructions faithfully.",
|
|
2468
|
+
"The response is well-structured and actionable.",
|
|
2469
|
+
"The response addresses the user prompt directly."
|
|
2470
|
+
];
|
|
2471
|
+
function buildGraderPrompts(options) {
|
|
2472
|
+
const assertions = options.assertions && options.assertions.length > 0 ? options.assertions : DEFAULT_ASSERTIONS;
|
|
2323
2473
|
const systemPrompt = [
|
|
2324
2474
|
"You are a strict evaluator for agent skill outputs.",
|
|
2325
2475
|
"Assess each assertion and return JSON only.",
|
|
@@ -2336,15 +2486,26 @@ async function gradeResponse(options) {
|
|
|
2336
2486
|
options.modelResponse,
|
|
2337
2487
|
"",
|
|
2338
2488
|
"Assertions to evaluate:",
|
|
2339
|
-
|
|
2489
|
+
assertions.map((assertion, index) => `${index + 1}. ${assertion}`).join("\n")
|
|
2340
2490
|
].join("\n");
|
|
2341
|
-
|
|
2491
|
+
return {
|
|
2492
|
+
assertions,
|
|
2493
|
+
systemPrompt,
|
|
2494
|
+
userPrompt
|
|
2495
|
+
};
|
|
2496
|
+
}
|
|
2497
|
+
/**
 * Extracts the graded assertions from the grader model's raw reply.
 *
 * The reply is passed through `extractJsonObject` and validated against
 * `graderOutputSchema`.
 *
 * @param {string} raw - Raw text returned by the grader model.
 * @returns {Array<object>} The `assertions` array of the validated output.
 * @throws {Error} When validation fails; the message carries the first
 *   schema issue, or "invalid grader JSON" when none is available.
 */
function parseGraderOutput(raw) {
  const outcome = graderOutputSchema.safeParse(extractJsonObject(raw));
  if (outcome.success) {
    return outcome.data.assertions;
  }
  const reason = outcome.error.issues[0]?.message ?? "invalid grader JSON";
  throw new Error(`Failed to parse grader output: ${reason}`);
}
|
|
2504
|
+
/**
 * Grades a model response: builds the grader prompts, sends them to the
 * provider with the requested model, and parses the reply.
 *
 * @param {object} options - Forwarded to `buildGraderPrompts`; `provider`
 *   (with a `sendMessage` method) and `model` are also read here.
 * @returns {Promise<Array<object>>} Whatever `parseGraderOutput` returns for
 *   the provider's reply.
 */
async function gradeResponse(options) {
  const { systemPrompt, userPrompt } = buildGraderPrompts(options);
  const { provider, model } = options;
  const graderReply = await provider.sendMessage(systemPrompt, userPrompt, { model });
  return parseGraderOutput(graderReply);
}
|
|
2348
2509
|
|
|
2349
2510
|
// src/utils/concurrency.ts
|
|
2350
2511
|
async function pMap(items, fn, concurrency) {
|
|
@@ -2499,6 +2660,7 @@ var triggerQuerySchema = z4.object({
|
|
|
2499
2660
|
should_trigger: z4.boolean()
|
|
2500
2661
|
});
|
|
2501
2662
|
var triggerQueryArraySchema = z4.array(triggerQuerySchema);
|
|
2663
|
+
var triggerNumQueriesSchema = z4.number().int().min(2).refine((value) => value % 2 === 0, "numQueries must be an even number.");
|
|
2502
2664
|
var FAKE_SKILLS = [
|
|
2503
2665
|
{ name: "code-review", description: "Reviews code changes for bugs, regressions, and maintainability issues." },
|
|
2504
2666
|
{ name: "api-tester", description: "Designs and runs REST API tests, validating status codes and response shapes." },
|
|
@@ -2539,6 +2701,9 @@ function shuffle(values, rng) {
|
|
|
2539
2701
|
function sample(values, count, rng) {
|
|
2540
2702
|
return shuffle(values, rng).slice(0, Math.max(0, Math.min(count, values.length)));
|
|
2541
2703
|
}
|
|
2704
|
+
/**
 * Validates the requested number of trigger queries against
 * `triggerNumQueriesSchema`.
 *
 * @param {unknown} numQueries - Requested query count.
 * @returns {number} The value returned by the schema's `parse`.
 * @throws Whatever `triggerNumQueriesSchema.parse` throws for invalid input.
 */
function validateNumQueries(numQueries) {
  const validated = triggerNumQueriesSchema.parse(numQueries);
  return validated;
}
|
|
2542
2707
|
function parseJsonArrayFromModelOutput(raw) {
|
|
2543
2708
|
const trimmed = raw.trim();
|
|
2544
2709
|
if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
|
|
@@ -2552,7 +2717,8 @@ function parseJsonArrayFromModelOutput(raw) {
|
|
|
2552
2717
|
}
|
|
2553
2718
|
throw new Error("Model did not return a JSON array.");
|
|
2554
2719
|
}
|
|
2555
|
-
async function generateQueriesWithModel(skill, provider, model, numQueries) {
|
|
2720
|
+
async function generateQueriesWithModel(skill, provider, model, numQueries, competitors) {
|
|
2721
|
+
validateNumQueries(numQueries);
|
|
2556
2722
|
const shouldTriggerCount = Math.floor(numQueries / 2);
|
|
2557
2723
|
const shouldNotTriggerCount = numQueries - shouldTriggerCount;
|
|
2558
2724
|
const systemPrompt = [
|
|
@@ -2564,6 +2730,15 @@ async function generateQueriesWithModel(skill, provider, model, numQueries) {
|
|
|
2564
2730
|
const userPrompt = [
|
|
2565
2731
|
`Skill name: ${skill.frontmatter.name}`,
|
|
2566
2732
|
`Skill description: ${skill.frontmatter.description}`,
|
|
2733
|
+
...competitors && competitors.length > 0 ? [
|
|
2734
|
+
"",
|
|
2735
|
+
"Competitor skills in the same domain:",
|
|
2736
|
+
...competitors.map((competitor) => `- ${competitor.name}: ${competitor.description}`),
|
|
2737
|
+
"",
|
|
2738
|
+
"Generate queries that test whether the target skill triggers correctly even when these similar skills exist.",
|
|
2739
|
+
"Positive queries should clearly belong to the target skill, not the competitors.",
|
|
2740
|
+
"Negative queries should belong to a competitor or to no skill at all."
|
|
2741
|
+
] : [],
|
|
2567
2742
|
`Generate ${numQueries} prompts total.`,
|
|
2568
2743
|
`Exactly ${shouldTriggerCount} should have should_trigger=true.`,
|
|
2569
2744
|
`Exactly ${shouldNotTriggerCount} should have should_trigger=false.`,
|
|
@@ -2597,6 +2772,46 @@ function parseDecision(rawResponse, skillNames) {
|
|
|
2597
2772
|
}
|
|
2598
2773
|
return "unrecognized";
|
|
2599
2774
|
}
|
|
2775
|
+
/**
 * Builds, for every test query, the list of skills that will be shown to the
 * model alongside that query.
 *
 * Without competitors the list is 5-9 sampled fake skills plus the target
 * skill. With competitors it is all competitors plus 2-4 fake skills and the
 * target (queries that should trigger), or 3-5 fake skills and no target
 * (queries that should not). Every list is shuffled with an RNG created from
 * `seed`.
 *
 * NOTE(review): in competitor mode the target skill is left out of the list
 * for negative queries, so the model is never offered it there — confirm
 * this is intended.
 *
 * @param {{frontmatter: {name: string, description: string}}} skill - Skill under test.
 * @param {Array<{query: string, should_trigger: boolean}>} queries - Test queries.
 * @param {*} seed - Passed to `createRng`.
 * @param {Array<{name: string, description: string}>} [competitors] - Optional competitor skills.
 * @returns {Array<{testQuery: object, fakeSkills: Array<object>, allSkills: Array<object>, skillListText: string}>}
 */
function prepareTriggerQueries(skill, queries, seed, competitors) {
  const rng = createRng(seed);
  // Keep only name/description so extra competitor fields never reach the prompt.
  const competitorCandidates = (competitors ?? []).map(({ name, description }) => ({ name, description }));
  const usingCompetitors = competitorCandidates.length > 0;

  // A fresh entry per query, created only when the target is actually listed.
  const targetEntry = () => ({
    name: skill.frontmatter.name,
    description: skill.frontmatter.description
  });

  return queries.map((testQuery) => {
    // Exactly one rng() draw decides how many fake skills to include.
    let fakeCount;
    if (!usingCompetitors) {
      fakeCount = 5 + Math.floor(rng() * 5);
    } else if (testQuery.should_trigger) {
      fakeCount = 2 + Math.floor(rng() * 3);
    } else {
      fakeCount = 3 + Math.floor(rng() * 3);
    }
    const fakeSkills = sample(FAKE_SKILLS, fakeCount, rng);

    let unshuffled;
    if (usingCompetitors) {
      unshuffled = [...competitorCandidates, ...fakeSkills];
      if (testQuery.should_trigger) {
        unshuffled.push(targetEntry());
      }
    } else {
      unshuffled = [...fakeSkills, targetEntry()];
    }
    const allSkills = shuffle(unshuffled, rng);

    const skillListText = allSkills.map((entry) => `- ${entry.name}: ${entry.description}`).join("\n");
    return { testQuery, fakeSkills, allSkills, skillListText };
  });
}
|
|
2600
2815
|
function calculateMetrics(skillName, cases) {
|
|
2601
2816
|
let truePositives = 0;
|
|
2602
2817
|
let trueNegatives = 0;
|
|
@@ -2633,44 +2848,82 @@ function calculateMetrics(skillName, cases) {
|
|
|
2633
2848
|
f1
|
|
2634
2849
|
};
|
|
2635
2850
|
}
|
|
2636
|
-
function
|
|
2851
|
+
/**
 * Ensures no competitor skill has the same name as the skill under test.
 *
 * @param {string} skillName - Name of the skill under test.
 * @param {Array<{name: string}>} competitors - Competitor skills to check.
 * @returns {void}
 * @throws {Error} For the first competitor whose name equals `skillName`.
 */
function assertCompetitorNamesDistinct(skillName, competitors) {
  const clash = competitors.find((competitor) => competitor.name === skillName);
  if (clash) {
    throw new Error(`Competitor skill '${clash.name}' has the same name as the skill under test.`);
  }
}
|
|
2858
|
+
/**
 * Builds the result record for one trigger test case from the model's
 * decision.
 *
 * A query that should trigger matches only when the decision is the skill
 * under test; a query that should not trigger matches whenever the decision
 * is anything else. `selectedCompetitor` is the decision when it is one of
 * `competitorNames`, otherwise undefined.
 *
 * @param {{testQuery: {query: string, should_trigger: boolean}, skillName: string, decision: string, competitorNames?: string[], rawModelResponse?: string}} options
 * @returns {{query: string, shouldTrigger: boolean, expected: string, actual: string, matched: boolean, selectedCompetitor: (string|undefined), rawModelResponse: (string|undefined)}}
 */
function buildTriggerCaseResult(options) {
  const { testQuery, skillName, decision, competitorNames, rawModelResponse } = options;
  const shouldTrigger = testQuery.should_trigger;
  const choseTarget = decision === skillName;
  const choseCompetitor = competitorNames?.includes(decision);
  return {
    query: testQuery.query,
    shouldTrigger,
    expected: shouldTrigger ? skillName : "none",
    actual: decision,
    matched: shouldTrigger ? choseTarget : !choseTarget,
    selectedCompetitor: choseCompetitor ? decision : void 0,
    rawModelResponse
  };
}
|
|
2872
|
+
/**
 * Turn trigger-test metrics into human-readable improvement suggestions.
 *
 * @param {string} skillName - Name of the skill under test.
 * @param {{ falseNegatives: number, falsePositives: number }} metrics - Aggregate counts.
 * @param {Iterable<{ shouldTrigger: boolean, actual: string, selectedCompetitor?: string }>} cases
 *   Per-query results; only read when competitors are present and false negatives exist.
 * @param {Array<object>} [competitors] - Competitor skills included in the run, if any.
 * @returns {string[]} Suggestions in order: false-negative advice, per-competitor
 *   notes, false-positive advice; a single "looks clean" message when none apply.
 */
function buildSuggestions(skillName, metrics, cases, competitors) {
  const suggestions = [];
  // Evaluated lazily so `competitors` is only inspected inside the branches that need it.
  const competitorsPresent = () => Boolean(competitors && competitors.length > 0);

  if (metrics.falseNegatives > 0) {
    suggestions.push(
      "False negatives found: clarify capability keywords and add explicit 'use when ...' phrasing in description."
    );
    if (competitorsPresent()) {
      // Count, per competitor, the positive queries where it was picked instead of this skill.
      const lostTo = new Map();
      for (const { shouldTrigger, actual, selectedCompetitor } of cases) {
        if (shouldTrigger && actual !== skillName && selectedCompetitor) {
          lostTo.set(selectedCompetitor, (lostTo.get(selectedCompetitor) ?? 0) + 1);
        }
      }
      // Map iteration follows insertion order, so notes appear in first-seen order.
      lostTo.forEach((count, competitorName) => {
        suggestions.push(
          `Skill '${competitorName}' was selected instead of '${skillName}' for ${count} quer${count === 1 ? "y" : "ies"}. Differentiate your description from '${competitorName}'.`
        );
      });
    }
  }

  if (metrics.falsePositives > 0) {
    suggestions.push("False positives found: narrow scope boundaries and add explicit non-goals in description.");
    if (competitorsPresent()) {
      suggestions.push(
        `With competitor skills present, ${metrics.falsePositives} negative quer${metrics.falsePositives === 1 ? "y still" : "ies still"} triggered '${skillName}'. Narrow your description's scope boundaries.`
      );
    }
  }

  if (suggestions.length > 0) {
    return suggestions;
  }
  suggestions.push("Trigger behavior looks clean on this sample. Keep monitoring with domain-specific custom queries.");
  return suggestions;
}
|
|
2906
|
+
/**
 * Load the name and description of each competitor skill.
 *
 * Paths are parsed one at a time, in the order given, so the returned list
 * follows the input order and the first parse failure rejects the call.
 *
 * @param {Iterable<string>} comparePaths - Paths handed to `parseSkillStrict`.
 * @returns {Promise<Array<{ name: string, description: string, sourcePath: string }>>}
 *   One entry per input path; `sourcePath` is the path exactly as supplied.
 */
async function loadCompetitorSkills(comparePaths) {
  const loaded = [];
  for (const sourcePath of comparePaths) {
    const { frontmatter } = await parseSkillStrict(sourcePath);
    const { name, description } = frontmatter;
    loaded.push({ name, description, sourcePath });
  }
  return loaded;
}
|
|
2651
2918
|
async function runTriggerTest(skill, options) {
|
|
2652
|
-
const
|
|
2653
|
-
|
|
2919
|
+
const competitors = options.compare && options.compare.length > 0 ? await loadCompetitorSkills(options.compare) : void 0;
|
|
2920
|
+
if (competitors && competitors.length > 0) {
|
|
2921
|
+
assertCompetitorNamesDistinct(skill.frontmatter.name, competitors);
|
|
2922
|
+
}
|
|
2923
|
+
const queries = options.queries && options.queries.length > 0 ? triggerQueryArraySchema.parse(options.queries) : await generateQueriesWithModel(skill, options.provider, options.model, options.numQueries, competitors);
|
|
2654
2924
|
const skillName = skill.frontmatter.name;
|
|
2655
|
-
const preparedQueries = queries.
|
|
2656
|
-
|
|
2657
|
-
const fakeSkills = sample(FAKE_SKILLS, fakeCount, rng);
|
|
2658
|
-
const allSkills = shuffle([
|
|
2659
|
-
...fakeSkills,
|
|
2660
|
-
{
|
|
2661
|
-
name: skill.frontmatter.name,
|
|
2662
|
-
description: skill.frontmatter.description
|
|
2663
|
-
}
|
|
2664
|
-
], rng);
|
|
2665
|
-
const skillListText = allSkills.map((entry) => `- ${entry.name}: ${entry.description}`).join("\n");
|
|
2666
|
-
return {
|
|
2667
|
-
testQuery,
|
|
2668
|
-
fakeCount,
|
|
2669
|
-
fakeSkills,
|
|
2670
|
-
allSkills,
|
|
2671
|
-
skillListText
|
|
2672
|
-
};
|
|
2673
|
-
});
|
|
2925
|
+
const preparedQueries = prepareTriggerQueries(skill, queries, options.seed, competitors);
|
|
2926
|
+
const competitorNames = competitors?.map((competitor) => competitor.name) ?? [];
|
|
2674
2927
|
const systemPrompt = [
|
|
2675
2928
|
"You are selecting one skill to activate for a user query.",
|
|
2676
2929
|
"Choose the single best matching skill name from the provided list, or 'none' if no skill is a good fit.",
|
|
@@ -2683,18 +2936,15 @@ async function runTriggerTest(skill, options) {
|
|
|
2683
2936
|
const rawResponse = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
|
|
2684
2937
|
const decision = parseDecision(
|
|
2685
2938
|
rawResponse,
|
|
2686
|
-
allSkills.map((entry) => entry.name)
|
|
2939
|
+
Array.from(/* @__PURE__ */ new Set([skillName, ...allSkills.map((entry) => entry.name)]))
|
|
2687
2940
|
);
|
|
2688
|
-
|
|
2689
|
-
|
|
2690
|
-
|
|
2691
|
-
|
|
2692
|
-
|
|
2693
|
-
expected,
|
|
2694
|
-
actual: decision,
|
|
2695
|
-
matched,
|
|
2941
|
+
return buildTriggerCaseResult({
|
|
2942
|
+
testQuery,
|
|
2943
|
+
skillName,
|
|
2944
|
+
decision,
|
|
2945
|
+
competitorNames,
|
|
2696
2946
|
rawModelResponse: options.verbose ? rawResponse : void 0
|
|
2697
|
-
};
|
|
2947
|
+
});
|
|
2698
2948
|
},
|
|
2699
2949
|
options.concurrency ?? 5
|
|
2700
2950
|
);
|
|
@@ -2704,10 +2954,11 @@ async function runTriggerTest(skill, options) {
|
|
|
2704
2954
|
model: options.model,
|
|
2705
2955
|
provider: options.provider.name,
|
|
2706
2956
|
seed: options.seed,
|
|
2957
|
+
competitors,
|
|
2707
2958
|
queries,
|
|
2708
2959
|
cases: results,
|
|
2709
2960
|
metrics,
|
|
2710
|
-
suggestions: buildSuggestions(metrics)
|
|
2961
|
+
suggestions: buildSuggestions(skillName, metrics, results, competitors)
|
|
2711
2962
|
};
|
|
2712
2963
|
}
|
|
2713
2964
|
|
|
@@ -2822,10 +3073,10 @@ async function loadConfiguredEvalPrompts(command) {
|
|
|
2822
3073
|
if (!promptFile && assertionsFile) {
|
|
2823
3074
|
throw new Error("Config field eval.assertionsFile requires eval.promptFile.");
|
|
2824
3075
|
}
|
|
2825
|
-
const promptRaw = await
|
|
3076
|
+
const promptRaw = await fs6.readFile(promptFile, "utf8");
|
|
2826
3077
|
let prompts = normalizeEvalPrompts(parseJsonIfPossible(promptRaw), promptFile);
|
|
2827
3078
|
if (assertionsFile) {
|
|
2828
|
-
const assertionsRaw = await
|
|
3079
|
+
const assertionsRaw = await fs6.readFile(assertionsFile, "utf8");
|
|
2829
3080
|
const assertions = normalizeAssertions(parseJsonIfPossible(assertionsRaw), assertionsFile);
|
|
2830
3081
|
prompts = prompts.map((prompt) => ({
|
|
2831
3082
|
prompt: prompt.prompt,
|
|
@@ -2864,18 +3115,22 @@ function writeError(error, asJson) {
|
|
|
2864
3115
|
|
|
2865
3116
|
// src/commands/lint.ts
|
|
2866
3117
|
var lintCliSchema = z6.object({
|
|
2867
|
-
html: z6.string().optional()
|
|
3118
|
+
html: z6.string().optional(),
|
|
3119
|
+
plugin: z6.array(z6.string().min(1)).optional()
|
|
2868
3120
|
});
|
|
3121
|
+
/**
 * Accumulator passed as the option processor for the repeatable `--plugin`
 * flag of the `lint` command: appends the new value to those collected so far.
 *
 * @param {string} value - Plugin path from the current flag occurrence.
 * @param {Iterable<string>} [previous=[]] - Values collected so far.
 * @returns {string[]} A new array; `previous` is never mutated.
 */
function collectPluginPaths(value, previous = []) {
  const collected = Array.from(previous);
  collected.push(value);
  return collected;
}
|
|
2869
3124
|
async function handleLintCommand(targetPath, options) {
|
|
2870
3125
|
try {
|
|
2871
|
-
const report = await runLinter(targetPath, { suppress: options.suppress });
|
|
3126
|
+
const report = await runLinter(targetPath, { suppress: options.suppress, plugins: options.plugins });
|
|
2872
3127
|
if (options.json) {
|
|
2873
3128
|
writeResult(report, true);
|
|
2874
3129
|
} else {
|
|
2875
3130
|
writeResult(renderLintReport(report, options.color), false);
|
|
2876
3131
|
}
|
|
2877
3132
|
if (options.html) {
|
|
2878
|
-
await
|
|
3133
|
+
await fs7.writeFile(options.html, renderLintHtml(report), "utf8");
|
|
2879
3134
|
}
|
|
2880
3135
|
if (lintFails(report, options.failOn)) {
|
|
2881
3136
|
process.exitCode = 1;
|
|
@@ -2886,7 +3141,7 @@ async function handleLintCommand(targetPath, options) {
|
|
|
2886
3141
|
}
|
|
2887
3142
|
}
|
|
2888
3143
|
function registerLintCommand(program) {
|
|
2889
|
-
program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--html <path>", "Write an HTML report to the given file path").action(async (targetPath, _commandOptions, command) => {
|
|
3144
|
+
program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--html <path>", "Write an HTML report to the given file path").option("--plugin <path>", "Load a custom lint plugin file", collectPluginPaths, []).action(async (targetPath, _commandOptions, command) => {
|
|
2890
3145
|
const globalOptions = getGlobalCliOptions(command);
|
|
2891
3146
|
const config = getResolvedConfig(command);
|
|
2892
3147
|
const parsedCli = lintCliSchema.safeParse(command.opts());
|
|
@@ -2899,30 +3154,33 @@ function registerLintCommand(program) {
|
|
|
2899
3154
|
...globalOptions,
|
|
2900
3155
|
failOn: config.lint.failOn,
|
|
2901
3156
|
suppress: config.lint.suppress,
|
|
3157
|
+
plugins: config.lint.plugins,
|
|
2902
3158
|
html: parsedCli.data.html
|
|
2903
3159
|
});
|
|
2904
3160
|
});
|
|
2905
3161
|
}
|
|
2906
3162
|
|
|
2907
3163
|
// src/commands/trigger.ts
|
|
2908
|
-
import
|
|
3164
|
+
import fs9 from "node:fs/promises";
|
|
2909
3165
|
import ora from "ora";
|
|
2910
3166
|
import { z as z8 } from "zod";
|
|
2911
3167
|
|
|
2912
3168
|
// src/utils/config.ts
|
|
2913
|
-
import
|
|
2914
|
-
import
|
|
3169
|
+
import fs8 from "node:fs/promises";
|
|
3170
|
+
import path6 from "node:path";
|
|
2915
3171
|
import { z as z7 } from "zod";
|
|
2916
3172
|
var providerNameSchema = z7.enum(["anthropic", "openai"]);
|
|
2917
3173
|
var lintFailOnSchema = z7.enum(["error", "warn"]);
|
|
2918
3174
|
var lintConfigSchema = z7.object({
|
|
2919
3175
|
failOn: lintFailOnSchema.optional(),
|
|
2920
|
-
suppress: z7.array(z7.string().min(1)).optional()
|
|
3176
|
+
suppress: z7.array(z7.string().min(1)).optional(),
|
|
3177
|
+
plugins: z7.array(z7.string().min(1)).optional()
|
|
2921
3178
|
}).strict();
|
|
2922
3179
|
var triggerConfigSchema = z7.object({
|
|
2923
3180
|
numQueries: z7.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
|
|
2924
3181
|
threshold: z7.number().min(0).max(1).optional(),
|
|
2925
|
-
seed: z7.number().int().optional()
|
|
3182
|
+
seed: z7.number().int().optional(),
|
|
3183
|
+
compare: z7.array(z7.string().min(1)).optional()
|
|
2926
3184
|
}).strict().partial();
|
|
2927
3185
|
var evalConfigSchema = z7.object({
|
|
2928
3186
|
numRuns: z7.number().int().min(1).optional(),
|
|
@@ -2946,12 +3204,14 @@ var resolvedSkilltestConfigSchema = z7.object({
|
|
|
2946
3204
|
concurrency: z7.number().int().min(1),
|
|
2947
3205
|
lint: z7.object({
|
|
2948
3206
|
failOn: lintFailOnSchema,
|
|
2949
|
-
suppress: z7.array(z7.string().min(1))
|
|
3207
|
+
suppress: z7.array(z7.string().min(1)),
|
|
3208
|
+
plugins: z7.array(z7.string().min(1))
|
|
2950
3209
|
}),
|
|
2951
3210
|
trigger: z7.object({
|
|
2952
3211
|
numQueries: z7.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
|
|
2953
3212
|
threshold: z7.number().min(0).max(1),
|
|
2954
|
-
seed: z7.number().int().optional()
|
|
3213
|
+
seed: z7.number().int().optional(),
|
|
3214
|
+
compare: z7.array(z7.string().min(1))
|
|
2955
3215
|
}),
|
|
2956
3216
|
eval: z7.object({
|
|
2957
3217
|
numRuns: z7.number().int().min(1),
|
|
@@ -2967,11 +3227,13 @@ var DEFAULT_SKILLTEST_CONFIG = {
|
|
|
2967
3227
|
concurrency: 5,
|
|
2968
3228
|
lint: {
|
|
2969
3229
|
failOn: "error",
|
|
2970
|
-
suppress: []
|
|
3230
|
+
suppress: [],
|
|
3231
|
+
plugins: []
|
|
2971
3232
|
},
|
|
2972
3233
|
trigger: {
|
|
2973
3234
|
numQueries: 20,
|
|
2974
|
-
threshold: 0.8
|
|
3235
|
+
threshold: 0.8,
|
|
3236
|
+
compare: []
|
|
2975
3237
|
},
|
|
2976
3238
|
eval: {
|
|
2977
3239
|
numRuns: 5,
|
|
@@ -2993,7 +3255,7 @@ function buildConfigValidationError(error, sourceLabel) {
|
|
|
2993
3255
|
async function readJsonObject(filePath, label) {
|
|
2994
3256
|
let raw;
|
|
2995
3257
|
try {
|
|
2996
|
-
raw = await
|
|
3258
|
+
raw = await fs8.readFile(filePath, "utf8");
|
|
2997
3259
|
} catch (error) {
|
|
2998
3260
|
const message = error instanceof Error ? error.message : String(error);
|
|
2999
3261
|
throw new Error(`Failed to read ${label}: ${message}`);
|
|
@@ -3017,13 +3279,13 @@ async function loadConfigFromJsonFile(filePath) {
|
|
|
3017
3279
|
return {
|
|
3018
3280
|
configFile: parsed.data,
|
|
3019
3281
|
sourcePath: filePath,
|
|
3020
|
-
sourceDirectory:
|
|
3282
|
+
sourceDirectory: path6.dirname(filePath)
|
|
3021
3283
|
};
|
|
3022
3284
|
}
|
|
3023
3285
|
async function loadConfigFromNearestPackageJson(startDirectory) {
|
|
3024
|
-
let currentDirectory =
|
|
3286
|
+
let currentDirectory = path6.resolve(startDirectory);
|
|
3025
3287
|
while (true) {
|
|
3026
|
-
const packageJsonPath =
|
|
3288
|
+
const packageJsonPath = path6.join(currentDirectory, "package.json");
|
|
3027
3289
|
if (await pathExists(packageJsonPath)) {
|
|
3028
3290
|
const raw = await readJsonObject(packageJsonPath, packageJsonPath);
|
|
3029
3291
|
const packageJsonSchema = z7.object({
|
|
@@ -3042,7 +3304,7 @@ async function loadConfigFromNearestPackageJson(startDirectory) {
|
|
|
3042
3304
|
sourceDirectory: currentDirectory
|
|
3043
3305
|
};
|
|
3044
3306
|
}
|
|
3045
|
-
const parentDirectory =
|
|
3307
|
+
const parentDirectory = path6.dirname(currentDirectory);
|
|
3046
3308
|
if (parentDirectory === currentDirectory) {
|
|
3047
3309
|
return null;
|
|
3048
3310
|
}
|
|
@@ -3055,7 +3317,7 @@ async function resolveSkillDirectoryConfig(targetPath) {
|
|
|
3055
3317
|
}
|
|
3056
3318
|
try {
|
|
3057
3319
|
const { skillRoot } = await resolveSkillPath(targetPath);
|
|
3058
|
-
return loadConfigFromJsonFile(
|
|
3320
|
+
return loadConfigFromJsonFile(path6.join(skillRoot, ".skilltestrc"));
|
|
3059
3321
|
} catch {
|
|
3060
3322
|
return null;
|
|
3061
3323
|
}
|
|
@@ -3064,7 +3326,13 @@ function resolveConfigRelativePath(baseDirectory, value) {
|
|
|
3064
3326
|
if (!value) {
|
|
3065
3327
|
return void 0;
|
|
3066
3328
|
}
|
|
3067
|
-
return
|
|
3329
|
+
return path6.resolve(baseDirectory, value);
|
|
3330
|
+
}
|
|
3331
|
+
/**
 * Resolve a list of config-supplied paths against a base directory.
 *
 * @param {string} baseDirectory - Directory each entry is resolved against.
 * @param {string[]} [values] - Path entries; may be missing or empty.
 * @returns {string[]} `path6.resolve(baseDirectory, entry)` for every entry, or
 *   an empty array when `values` is falsy or has length 0.
 */
function resolveConfigRelativePaths(baseDirectory, values) {
  const hasEntries = Boolean(values) && values.length !== 0;
  return hasEntries ? values.map((entry) => path6.resolve(baseDirectory, entry)) : [];
}
|
|
3069
3337
|
function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = process.cwd()) {
|
|
3070
3338
|
const merged = {
|
|
@@ -3074,12 +3342,20 @@ function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = proce
|
|
|
3074
3342
|
concurrency: cliFlags.concurrency ?? configFile.concurrency ?? DEFAULT_SKILLTEST_CONFIG.concurrency,
|
|
3075
3343
|
lint: {
|
|
3076
3344
|
failOn: cliFlags.lint?.failOn ?? configFile.lint?.failOn ?? DEFAULT_SKILLTEST_CONFIG.lint.failOn,
|
|
3077
|
-
suppress: cliFlags.lint?.suppress ?? configFile.lint?.suppress ?? DEFAULT_SKILLTEST_CONFIG.lint.suppress
|
|
3345
|
+
suppress: cliFlags.lint?.suppress ?? configFile.lint?.suppress ?? DEFAULT_SKILLTEST_CONFIG.lint.suppress,
|
|
3346
|
+
plugins: resolveConfigRelativePaths(
|
|
3347
|
+
baseDirectory,
|
|
3348
|
+
cliFlags.lint?.plugins ?? configFile.lint?.plugins ?? DEFAULT_SKILLTEST_CONFIG.lint.plugins
|
|
3349
|
+
)
|
|
3078
3350
|
},
|
|
3079
3351
|
trigger: {
|
|
3080
3352
|
numQueries: cliFlags.trigger?.numQueries ?? configFile.trigger?.numQueries ?? DEFAULT_SKILLTEST_CONFIG.trigger.numQueries,
|
|
3081
3353
|
threshold: cliFlags.trigger?.threshold ?? configFile.trigger?.threshold ?? DEFAULT_SKILLTEST_CONFIG.trigger.threshold,
|
|
3082
|
-
seed: cliFlags.trigger?.seed ?? configFile.trigger?.seed
|
|
3354
|
+
seed: cliFlags.trigger?.seed ?? configFile.trigger?.seed,
|
|
3355
|
+
compare: resolveConfigRelativePaths(
|
|
3356
|
+
baseDirectory,
|
|
3357
|
+
cliFlags.trigger?.compare ?? configFile.trigger?.compare ?? DEFAULT_SKILLTEST_CONFIG.trigger.compare
|
|
3358
|
+
)
|
|
3083
3359
|
},
|
|
3084
3360
|
eval: {
|
|
3085
3361
|
numRuns: cliFlags.eval?.numRuns ?? configFile.eval?.numRuns ?? DEFAULT_SKILLTEST_CONFIG.eval.numRuns,
|
|
@@ -3124,6 +3400,18 @@ function extractCliConfigOverrides(command) {
|
|
|
3124
3400
|
numQueries: getTypedOptionValue(command, "numQueries")
|
|
3125
3401
|
};
|
|
3126
3402
|
}
|
|
3403
|
+
if ((command.name() === "trigger" || command.name() === "check") && command.getOptionValueSource("compare") === "cli") {
|
|
3404
|
+
overrides.trigger = {
|
|
3405
|
+
...overrides.trigger,
|
|
3406
|
+
compare: getTypedOptionValue(command, "compare")
|
|
3407
|
+
};
|
|
3408
|
+
}
|
|
3409
|
+
if ((command.name() === "lint" || command.name() === "check") && command.getOptionValueSource("plugin") === "cli") {
|
|
3410
|
+
overrides.lint = {
|
|
3411
|
+
...overrides.lint,
|
|
3412
|
+
plugins: getTypedOptionValue(command, "plugin")
|
|
3413
|
+
};
|
|
3414
|
+
}
|
|
3127
3415
|
if (command.name() === "check" && command.getOptionValueSource("minF1") === "cli") {
|
|
3128
3416
|
overrides.trigger = {
|
|
3129
3417
|
...overrides.trigger,
|
|
@@ -3151,7 +3439,7 @@ async function resolveConfigContext(targetPath, cliFlags) {
|
|
|
3151
3439
|
config: mergeConfigLayers(skillDirectoryConfig.configFile, cliFlags, skillDirectoryConfig.sourceDirectory)
|
|
3152
3440
|
};
|
|
3153
3441
|
}
|
|
3154
|
-
const cwdConfigPath =
|
|
3442
|
+
const cwdConfigPath = path6.join(cwd, ".skilltestrc");
|
|
3155
3443
|
const cwdConfig = await loadConfigFromJsonFile(cwdConfigPath);
|
|
3156
3444
|
if (cwdConfig) {
|
|
3157
3445
|
return {
|
|
@@ -3372,6 +3660,7 @@ function createProvider(providerName, apiKeyOverride) {
|
|
|
3372
3660
|
var triggerCliSchema = z8.object({
|
|
3373
3661
|
queries: z8.string().optional(),
|
|
3374
3662
|
saveQueries: z8.string().optional(),
|
|
3663
|
+
compare: z8.array(z8.string().min(1)).optional(),
|
|
3375
3664
|
seed: z8.number().int().optional(),
|
|
3376
3665
|
concurrency: z8.number().int().min(1).optional(),
|
|
3377
3666
|
html: z8.string().optional(),
|
|
@@ -3420,6 +3709,7 @@ async function handleTriggerCommand(targetPath, options) {
|
|
|
3420
3709
|
provider,
|
|
3421
3710
|
queries,
|
|
3422
3711
|
numQueries: options.numQueries,
|
|
3712
|
+
compare: options.compare,
|
|
3423
3713
|
seed: options.seed,
|
|
3424
3714
|
concurrency: options.concurrency,
|
|
3425
3715
|
verbose: options.verbose
|
|
@@ -3438,7 +3728,7 @@ async function handleTriggerCommand(targetPath, options) {
|
|
|
3438
3728
|
...result,
|
|
3439
3729
|
target: targetPath
|
|
3440
3730
|
};
|
|
3441
|
-
await
|
|
3731
|
+
await fs9.writeFile(options.html, renderTriggerHtml(htmlResult), "utf8");
|
|
3442
3732
|
}
|
|
3443
3733
|
} catch (error) {
|
|
3444
3734
|
spinner?.stop();
|
|
@@ -3447,7 +3737,7 @@ async function handleTriggerCommand(targetPath, options) {
|
|
|
3447
3737
|
}
|
|
3448
3738
|
}
|
|
3449
3739
|
function registerTriggerCommand(program) {
|
|
3450
|
-
program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--concurrency <n>", "Maximum in-flight trigger requests", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
|
|
3740
|
+
program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--compare <path...>", "Path(s) to sibling skill directories to include as competitors").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--concurrency <n>", "Maximum in-flight trigger requests", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
|
|
3451
3741
|
const globalOptions = getGlobalCliOptions(command);
|
|
3452
3742
|
const config = getResolvedConfig(command);
|
|
3453
3743
|
const parsedCli = triggerCliSchema.safeParse(command.opts());
|
|
@@ -3462,6 +3752,7 @@ function registerTriggerCommand(program) {
|
|
|
3462
3752
|
provider: config.provider,
|
|
3463
3753
|
queries: parsedCli.data.queries,
|
|
3464
3754
|
numQueries: config.trigger.numQueries,
|
|
3755
|
+
compare: config.trigger.compare,
|
|
3465
3756
|
saveQueries: parsedCli.data.saveQueries,
|
|
3466
3757
|
seed: parsedCli.data.seed ?? config.trigger.seed,
|
|
3467
3758
|
concurrency: config.concurrency,
|
|
@@ -3473,7 +3764,7 @@ function registerTriggerCommand(program) {
|
|
|
3473
3764
|
}
|
|
3474
3765
|
|
|
3475
3766
|
// src/commands/eval.ts
|
|
3476
|
-
import
|
|
3767
|
+
import fs10 from "node:fs/promises";
|
|
3477
3768
|
import ora2 from "ora";
|
|
3478
3769
|
import { z as z9 } from "zod";
|
|
3479
3770
|
var evalCliSchema = z9.object({
|
|
@@ -3540,7 +3831,7 @@ async function handleEvalCommand(targetPath, options, command) {
|
|
|
3540
3831
|
...result,
|
|
3541
3832
|
target: targetPath
|
|
3542
3833
|
};
|
|
3543
|
-
await
|
|
3834
|
+
await fs10.writeFile(options.html, renderEvalHtml(htmlResult), "utf8");
|
|
3544
3835
|
}
|
|
3545
3836
|
} catch (error) {
|
|
3546
3837
|
spinner?.stop();
|
|
@@ -3579,7 +3870,7 @@ function registerEvalCommand(program) {
|
|
|
3579
3870
|
}
|
|
3580
3871
|
|
|
3581
3872
|
// src/commands/check.ts
|
|
3582
|
-
import
|
|
3873
|
+
import fs11 from "node:fs/promises";
|
|
3583
3874
|
import ora3 from "ora";
|
|
3584
3875
|
import { z as z10 } from "zod";
|
|
3585
3876
|
|
|
@@ -3592,7 +3883,7 @@ function calculateEvalAssertPassRate(result) {
|
|
|
3592
3883
|
}
|
|
3593
3884
|
async function runCheck(inputPath, options) {
|
|
3594
3885
|
options.onStage?.("lint");
|
|
3595
|
-
const lint = await runLinter(inputPath, { suppress: options.lintSuppress });
|
|
3886
|
+
const lint = await runLinter(inputPath, { suppress: options.lintSuppress, plugins: options.lintPlugins });
|
|
3596
3887
|
const lintPassed = !lintFails(lint, options.lintFailOn);
|
|
3597
3888
|
let trigger = null;
|
|
3598
3889
|
let evalResult = null;
|
|
@@ -3616,6 +3907,7 @@ async function runCheck(inputPath, options) {
|
|
|
3616
3907
|
provider: options.provider,
|
|
3617
3908
|
model: options.model,
|
|
3618
3909
|
queries: options.queries,
|
|
3910
|
+
compare: options.compare,
|
|
3619
3911
|
numQueries: options.numQueries,
|
|
3620
3912
|
seed: options.triggerSeed,
|
|
3621
3913
|
concurrency: options.concurrency,
|
|
@@ -3677,8 +3969,10 @@ var checkCliSchema = z10.object({
|
|
|
3677
3969
|
graderModel: z10.string().optional(),
|
|
3678
3970
|
apiKey: z10.string().optional(),
|
|
3679
3971
|
queries: z10.string().optional(),
|
|
3972
|
+
compare: z10.array(z10.string().min(1)).optional(),
|
|
3680
3973
|
seed: z10.number().int().optional(),
|
|
3681
3974
|
prompts: z10.string().optional(),
|
|
3975
|
+
plugin: z10.array(z10.string().min(1)).optional(),
|
|
3682
3976
|
concurrency: z10.number().int().min(1).optional(),
|
|
3683
3977
|
html: z10.string().optional(),
|
|
3684
3978
|
saveResults: z10.string().optional(),
|
|
@@ -3687,6 +3981,9 @@ var checkCliSchema = z10.object({
|
|
|
3687
3981
|
});
|
|
3688
3982
|
var DEFAULT_ANTHROPIC_MODEL3 = "claude-sonnet-4-5-20250929";
|
|
3689
3983
|
var DEFAULT_OPENAI_MODEL3 = "gpt-4.1-mini";
|
|
3984
|
+
/**
 * Accumulator passed as the option processor for the repeatable `--plugin`
 * flag of the `check` command: appends the new value to those collected so far.
 *
 * @param {string} value - Plugin path from the current flag occurrence.
 * @param {Iterable<string>} [previous=[]] - Values collected so far.
 * @returns {string[]} A new array; `previous` is never mutated.
 */
function collectPluginPaths2(value, previous = []) {
  const collected = Array.from(previous);
  collected.push(value);
  return collected;
}
|
|
3690
3987
|
function resolveModel3(provider, model) {
|
|
3691
3988
|
if (provider === "openai" && model === DEFAULT_ANTHROPIC_MODEL3) {
|
|
3692
3989
|
return DEFAULT_OPENAI_MODEL3;
|
|
@@ -3737,7 +4034,9 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
3737
4034
|
graderModel,
|
|
3738
4035
|
lintFailOn: options.lintFailOn,
|
|
3739
4036
|
lintSuppress: options.lintSuppress,
|
|
4037
|
+
lintPlugins: options.lintPlugins,
|
|
3740
4038
|
queries,
|
|
4039
|
+
compare: options.compare,
|
|
3741
4040
|
numQueries: options.numQueries,
|
|
3742
4041
|
triggerSeed: options.triggerSeed,
|
|
3743
4042
|
prompts,
|
|
@@ -3773,7 +4072,7 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
3773
4072
|
);
|
|
3774
4073
|
}
|
|
3775
4074
|
if (options.html) {
|
|
3776
|
-
await
|
|
4075
|
+
await fs11.writeFile(options.html, renderCheckHtml(result), "utf8");
|
|
3777
4076
|
}
|
|
3778
4077
|
process.exitCode = result.gates.overallPassed ? 0 : 1;
|
|
3779
4078
|
} catch (error) {
|
|
@@ -3783,7 +4082,7 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
3783
4082
|
}
|
|
3784
4083
|
}
|
|
3785
4084
|
function registerCheckCommand(program) {
|
|
3786
|
-
program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
|
|
4085
|
+
program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--compare <path...>", "Path(s) to sibling skill directories to include as competitors").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--plugin <path>", "Load a custom lint plugin file", collectPluginPaths2, []).option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
|
|
3787
4086
|
const globalOptions = getGlobalCliOptions(command);
|
|
3788
4087
|
const config = getResolvedConfig(command);
|
|
3789
4088
|
const parsedCli = checkCliSchema.safeParse(command.opts());
|
|
@@ -3801,6 +4100,7 @@ function registerCheckCommand(program) {
|
|
|
3801
4100
|
graderModel: parsedCli.data.graderModel,
|
|
3802
4101
|
apiKey: parsedCli.data.apiKey,
|
|
3803
4102
|
queries: parsedCli.data.queries,
|
|
4103
|
+
compare: config.trigger.compare,
|
|
3804
4104
|
numQueries: config.trigger.numQueries,
|
|
3805
4105
|
prompts: parsedCli.data.prompts,
|
|
3806
4106
|
minF1: config.trigger.threshold,
|
|
@@ -3810,6 +4110,7 @@ function registerCheckCommand(program) {
|
|
|
3810
4110
|
html: parsedCli.data.html,
|
|
3811
4111
|
lintFailOn: config.lint.failOn,
|
|
3812
4112
|
lintSuppress: config.lint.suppress,
|
|
4113
|
+
lintPlugins: config.lint.plugins,
|
|
3813
4114
|
triggerSeed: parsedCli.data.seed ?? config.trigger.seed,
|
|
3814
4115
|
saveResults: parsedCli.data.saveResults,
|
|
3815
4116
|
continueOnLintFail: Boolean(parsedCli.data.continueOnLintFail),
|
|
@@ -3824,8 +4125,8 @@ function registerCheckCommand(program) {
|
|
|
3824
4125
|
function resolveVersion() {
|
|
3825
4126
|
try {
|
|
3826
4127
|
const currentFilePath = fileURLToPath(import.meta.url);
|
|
3827
|
-
const packageJsonPath =
|
|
3828
|
-
const raw =
|
|
4128
|
+
const packageJsonPath = path7.resolve(path7.dirname(currentFilePath), "..", "package.json");
|
|
4129
|
+
const raw = fs12.readFileSync(packageJsonPath, "utf8");
|
|
3829
4130
|
const parsed = JSON.parse(raw);
|
|
3830
4131
|
return parsed.version ?? "0.0.0";
|
|
3831
4132
|
} catch {
|