skilltest 0.7.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +11 -7
- package/README.md +267 -12
- package/dist/index.js +1699 -173
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/index.ts
|
|
4
|
-
import
|
|
5
|
-
import
|
|
4
|
+
import fs13 from "node:fs";
|
|
5
|
+
import path8 from "node:path";
|
|
6
6
|
import { fileURLToPath } from "node:url";
|
|
7
7
|
import { Command } from "commander";
|
|
8
8
|
|
|
9
9
|
// src/commands/lint.ts
|
|
10
|
-
import
|
|
10
|
+
import fs7 from "node:fs/promises";
|
|
11
11
|
import { z as z6 } from "zod";
|
|
12
12
|
|
|
13
13
|
// src/core/skill-parser.ts
|
|
@@ -100,7 +100,10 @@ function parseFrontmatter(rawSkill) {
|
|
|
100
100
|
}
|
|
101
101
|
/**
 * Reads the skill file found at `inputPath` and parses it strictly.
 *
 * Loading is delegated to loadSkillFile; the raw text plus the resolved
 * skill root and skill file are then handed to parseSkillDocumentStrict.
 *
 * @param {string} inputPath - Path passed through to loadSkillFile.
 * @returns {Promise<object>} Whatever parseSkillDocumentStrict returns.
 */
async function parseSkillStrict(inputPath) {
  const { raw, skillRoot, skillFile } = await loadSkillFile(inputPath);
  return parseSkillDocumentStrict(raw, skillRoot, skillFile);
}
|
|
105
|
+
function parseSkillDocumentStrict(rawSkill, skillRoot, skillFile) {
|
|
106
|
+
const parsedFrontmatter = parseFrontmatter(rawSkill);
|
|
104
107
|
if (!parsedFrontmatter.hasFrontmatter) {
|
|
105
108
|
throw new Error("SKILL.md is missing YAML frontmatter.");
|
|
106
109
|
}
|
|
@@ -113,9 +116,9 @@ async function parseSkillStrict(inputPath) {
|
|
|
113
116
|
throw new Error(`Invalid frontmatter field '${issue.path.join(".")}': ${issue.message}`);
|
|
114
117
|
}
|
|
115
118
|
return {
|
|
116
|
-
skillRoot
|
|
117
|
-
skillFile
|
|
118
|
-
raw:
|
|
119
|
+
skillRoot,
|
|
120
|
+
skillFile,
|
|
121
|
+
raw: rawSkill,
|
|
119
122
|
content: parsedFrontmatter.content,
|
|
120
123
|
frontmatterRaw: parsedFrontmatter.rawFrontmatter,
|
|
121
124
|
frontmatter: validation.data
|
|
@@ -972,6 +975,116 @@ function runFrontmatterChecks(context) {
|
|
|
972
975
|
return issues;
|
|
973
976
|
}
|
|
974
977
|
|
|
978
|
+
// src/core/linter/plugin.ts
|
|
979
|
+
import fs4 from "node:fs/promises";
|
|
980
|
+
import path4 from "node:path";
|
|
981
|
+
import { pathToFileURL } from "node:url";
|
|
982
|
+
/**
 * Ensures a plugin rule's check id carries a namespace.
 *
 * Ids that already contain a ":" are returned untouched; anything else is
 * prefixed with "plugin:".
 *
 * @param {string} checkId - Check id as declared by the plugin rule.
 * @returns {string} The namespaced check id.
 */
function normalizeRuleCheckId(checkId) {
  if (checkId.includes(":")) {
    return checkId;
  }
  return `plugin:${checkId}`;
}
|
|
985
|
+
/**
 * Creates (does not throw) the Error used for every plugin validation failure.
 *
 * @param {string} filePath - Plugin path to mention in the message.
 * @param {string} message - Description of what is wrong with the plugin.
 * @returns {Error} Error whose message names the plugin file and the problem.
 */
function buildPluginValidationError(filePath, message) {
  const description = `Invalid lint plugin at ${filePath}: ${message}`;
  return new Error(description);
}
|
|
988
|
+
/**
 * Validates one module export as a lint plugin and returns a sanitized copy.
 *
 * The candidate must be an object with a `rules` array. Every rule must be an
 * object with a non-empty string `checkId`, a non-empty string `title`, and a
 * `check` function. Only those three fields are kept on the returned rules,
 * and each `checkId` is passed through normalizeRuleCheckId.
 *
 * @param {unknown} candidate - Value of the export being inspected.
 * @param {string} filePath - Plugin path, used in error messages.
 * @param {string} exportName - Export name, used in error messages.
 * @returns {{rules: Array<{checkId: string, title: string, check: Function}>}}
 * @throws {Error} On the first validation failure (built by buildPluginValidationError).
 */
function validatePluginCandidate(candidate, filePath, exportName) {
  const isNonEmptyString = (value) => typeof value === "string" && value.trim() !== "";

  const isPluginShaped = candidate !== null && typeof candidate === "object" && "rules" in candidate;
  if (!isPluginShaped) {
    throw buildPluginValidationError(filePath, `${exportName} export must be an object with a rules array.`);
  }

  const { rules } = candidate;
  if (!Array.isArray(rules)) {
    throw buildPluginValidationError(filePath, `${exportName} export must include a rules array.`);
  }

  // Fields are read and checked one at a time so the first bad field decides
  // which error is reported.
  const toValidatedRule = (rule, index) => {
    if (rule === null || typeof rule !== "object") {
      throw buildPluginValidationError(filePath, `rule at index ${index} must be an object.`);
    }
    const checkId = rule.checkId;
    if (!isNonEmptyString(checkId)) {
      throw buildPluginValidationError(filePath, `rule at index ${index} must have a non-empty string checkId.`);
    }
    const title = rule.title;
    if (!isNonEmptyString(title)) {
      throw buildPluginValidationError(filePath, `rule at index ${index} must have a non-empty string title.`);
    }
    const check = rule.check;
    if (typeof check !== "function") {
      throw buildPluginValidationError(filePath, `rule '${checkId}' must have a check function.`);
    }
    return { checkId: normalizeRuleCheckId(checkId), title, check };
  };

  return { rules: rules.map(toValidatedRule) };
}
|
|
1021
|
+
/**
 * Loads a lint plugin module from disk and returns its validated rule set.
 *
 * The path is resolved to an absolute path, checked with fs.access, and then
 * imported through a file URL. The module's `default` export is tried first,
 * then the named `plugin` export; the first one accepted by
 * validatePluginCandidate is returned.
 *
 * @param {string} filePath - Path to the plugin module.
 * @returns {Promise<{rules: Array<{checkId: string, title: string, check: Function}>}>}
 * @throws {Error} When the file is missing or inaccessible, when importing it
 *   fails (the original error is attached as `cause`), or when neither export
 *   is a valid plugin.
 */
async function loadPlugin(filePath) {
  const absolutePath = path4.resolve(filePath);
  try {
    await fs4.access(absolutePath);
  } catch (error) {
    // Only say "does not exist" when the OS reports exactly that; other
    // failures (e.g. an unsearchable parent directory) keep their real reason.
    const code = error instanceof Error ? error.code : void 0;
    const isMissing = code === void 0 || code === "ENOENT" || code === "ENOTDIR";
    const reason = isMissing
      ? "file does not exist."
      : `file is not accessible: ${error instanceof Error ? error.message : String(error)}`;
    throw new Error(`Failed to load lint plugin at ${absolutePath}: ${reason}`, { cause: error });
  }
  let loadedModule;
  try {
    loadedModule = await import(pathToFileURL(absolutePath).href);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    // Keep the original error reachable so plugin authors can see its stack.
    throw new Error(`Failed to load lint plugin at ${absolutePath}: ${message}`, { cause: error });
  }
  const validationErrors = [];
  for (const [exportName, candidate] of [
    ["default", loadedModule.default],
    ["plugin", loadedModule.plugin]
  ]) {
    if (candidate === void 0) {
      continue;
    }
    try {
      return validatePluginCandidate(candidate, absolutePath, exportName);
    } catch (error) {
      // Remember why this export was rejected; the other export may still be valid.
      validationErrors.push(error instanceof Error ? error.message : String(error));
    }
  }
  if (validationErrors.length > 0) {
    throw new Error(validationErrors.join(" "));
  }
  throw buildPluginValidationError(
    absolutePath,
    "expected a default export or named export 'plugin' containing a rules array."
  );
}
|
|
1057
|
+
/**
 * Converts a failure raised while running a plugin rule into a failing lint issue.
 *
 * The issue id ends in a slug of the rule's check id: runs of characters
 * outside A-Z, a-z and 0-9 become "-", leading and trailing dashes are
 * removed, and the result is lower-cased.
 *
 * @param {{checkId: string}} rule - Rule whose check failed.
 * @param {unknown} error - Thrown value; non-Error values are stringified.
 * @returns {{id: string, checkId: string, title: string, status: string, message: string}}
 */
function buildRuleExecutionError(rule, error) {
  const reason = error instanceof Error ? error.message : String(error);
  const dashed = rule.checkId.replace(/[^A-Za-z0-9]+/g, "-");
  const slug = dashed.replace(/^-+|-+$/g, "").toLowerCase();
  return {
    id: `plugin.load-error.${slug}`,
    checkId: "plugin:load-error",
    title: "Plugin Rule Error",
    status: "fail",
    message: `Plugin rule '${rule.checkId}' failed: ${reason}`
  };
}
|
|
1067
|
+
/**
 * Runs every rule of a plugin, one after another, against the lint context.
 *
 * Each issue a rule returns is copied with its `checkId` overwritten by the
 * rule's own check id. A rule that throws, rejects, or returns something other
 * than an array contributes one issue built by buildRuleExecutionError and
 * does not stop the remaining rules.
 *
 * @param {{rules: Array<{checkId: string, check: Function}>}} plugin - Validated plugin.
 * @param {object} context - Lint context passed to each rule's check function.
 * @returns {Promise<object[]>} All collected issues, in rule order.
 */
async function runPluginRules(plugin, context) {
  const collected = [];
  for (const rule of plugin.rules) {
    try {
      const produced = await rule.check(context);
      if (!Array.isArray(produced)) {
        throw new Error("check function must return an array of lint issues.");
      }
      // Build the whole batch first so a failure while copying adds nothing partial.
      const tagged = produced.map((issue) => ({ ...issue, checkId: rule.checkId }));
      for (const entry of tagged) {
        collected.push(entry);
      }
    } catch (error) {
      collected.push(buildRuleExecutionError(rule, error));
    }
  }
  return collected;
}
|
|
1087
|
+
|
|
975
1088
|
// src/core/linter/security.ts
|
|
976
1089
|
var DANGEROUS_COMMAND_PATTERNS = [
|
|
977
1090
|
{
|
|
@@ -1179,8 +1292,8 @@ function runSecurityChecks(context) {
|
|
|
1179
1292
|
}
|
|
1180
1293
|
|
|
1181
1294
|
// src/core/linter/structure.ts
|
|
1182
|
-
import
|
|
1183
|
-
import
|
|
1295
|
+
import fs5 from "node:fs/promises";
|
|
1296
|
+
import path5 from "node:path";
|
|
1184
1297
|
function hasTableOfContents(content) {
|
|
1185
1298
|
if (/^#{1,6}\s+table of contents\b/im.test(content)) {
|
|
1186
1299
|
return true;
|
|
@@ -1221,21 +1334,21 @@ async function runStructureChecks(context) {
|
|
|
1221
1334
|
message: `SKILL.md length is ${context.skill.lineCount} lines.`
|
|
1222
1335
|
});
|
|
1223
1336
|
}
|
|
1224
|
-
const referencesDir =
|
|
1337
|
+
const referencesDir = path5.join(context.skill.skillRoot, "references");
|
|
1225
1338
|
if (await pathExists(referencesDir)) {
|
|
1226
1339
|
const files = await listFilesRecursive(referencesDir);
|
|
1227
1340
|
let oversizedWithoutToc = 0;
|
|
1228
1341
|
for (const file of files) {
|
|
1229
|
-
const raw = await
|
|
1342
|
+
const raw = await fs5.readFile(file, "utf8");
|
|
1230
1343
|
const lineCount = raw === "" ? 0 : raw.split(/\r?\n/).length;
|
|
1231
1344
|
if (lineCount > 300 && !hasTableOfContents(raw)) {
|
|
1232
1345
|
oversizedWithoutToc += 1;
|
|
1233
1346
|
issues.push({
|
|
1234
|
-
id: `structure.references.toc.${toPosixPath(
|
|
1347
|
+
id: `structure.references.toc.${toPosixPath(path5.relative(context.skill.skillRoot, file))}`,
|
|
1235
1348
|
checkId: "structure:toc",
|
|
1236
1349
|
title: "Reference File Navigation",
|
|
1237
1350
|
status: "warn",
|
|
1238
|
-
message: `${toPosixPath(
|
|
1351
|
+
message: `${toPosixPath(path5.relative(context.skill.skillRoot, file))} is ${lineCount} lines and has no table of contents.`,
|
|
1239
1352
|
suggestion: "Add a table of contents for long reference files."
|
|
1240
1353
|
});
|
|
1241
1354
|
}
|
|
@@ -1265,7 +1378,7 @@ async function runStructureChecks(context) {
|
|
|
1265
1378
|
other: []
|
|
1266
1379
|
};
|
|
1267
1380
|
for (const reference of references) {
|
|
1268
|
-
const resolved =
|
|
1381
|
+
const resolved = path5.resolve(context.skill.skillRoot, reference);
|
|
1269
1382
|
if (!await pathExists(resolved)) {
|
|
1270
1383
|
const kind = classifyReferencePath(reference);
|
|
1271
1384
|
missingByType[kind].push(reference);
|
|
@@ -1362,6 +1475,10 @@ async function runLinter(inputPath, options = {}) {
|
|
|
1362
1475
|
issues.push(...runSecurityChecks(context));
|
|
1363
1476
|
issues.push(...await runDisclosureChecks(context));
|
|
1364
1477
|
issues.push(...runCompatibilityChecks(context));
|
|
1478
|
+
for (const pluginPath of options.plugins ?? []) {
|
|
1479
|
+
const plugin = await loadPlugin(pluginPath);
|
|
1480
|
+
issues.push(...await runPluginRules(plugin, context));
|
|
1481
|
+
}
|
|
1365
1482
|
const filteredIssues = issues.filter((issue) => !suppressedCheckIds.has(issue.checkId));
|
|
1366
1483
|
return {
|
|
1367
1484
|
target: inputPath,
|
|
@@ -1401,6 +1518,9 @@ function badgeLabel(status) {
|
|
|
1401
1518
|
/**
 * Renders a status badge `<span>`.
 *
 * The status is used as a CSS class next to "badge"; the visible text comes
 * from badgeLabel.
 *
 * @param {string} status - Status key (also used as the CSS class name).
 * @returns {string} HTML markup for the badge.
 */
function renderBadge(status) {
  const label = badgeLabel(status);
  return `<span class="badge ${status}">${label}</span>`;
}
|
|
1521
|
+
/**
 * Renders a "meta-badge" `<span>` containing the HTML-escaped label.
 *
 * @param {string} label - Text to show; passed through escapeHtml.
 * @returns {string} HTML markup for the badge.
 */
function renderMetaBadge(label) {
  const safeLabel = escapeHtml(label);
  return `<span class="meta-badge">${safeLabel}</span>`;
}
|
|
1404
1524
|
function renderStatCards(stats) {
|
|
1405
1525
|
return `<div class="stats-grid">${stats.map(
|
|
1406
1526
|
(stat) => `
|
|
@@ -1525,10 +1645,10 @@ function renderLintIssueList(report) {
|
|
|
1525
1645
|
const info = skippedSecurityPatterns > 0 ? `<p class="info-line">Skipped security patterns in examples/comments: ${escapeHtml(skippedSecurityPatterns)}</p>` : "";
|
|
1526
1646
|
return `<div class="row-list">${rows}</div>${info}`;
|
|
1527
1647
|
}
|
|
1528
|
-
function renderTriggerCaseRow(testCase) {
|
|
1648
|
+
function renderTriggerCaseRow(testCase, showSelectedCompetitor) {
|
|
1529
1649
|
const details = testCase.rawModelResponse ? renderDetails("Model response", renderPreBlock(testCase.rawModelResponse)) : "";
|
|
1530
1650
|
return `
|
|
1531
|
-
<div class="row">
|
|
1651
|
+
<div class="row${testCase.selectedCompetitor ? " competitor-selected" : ""}">
|
|
1532
1652
|
<div class="row-header">
|
|
1533
1653
|
<div>
|
|
1534
1654
|
<div class="row-title">${escapeHtml(testCase.query)}</div>
|
|
@@ -1540,12 +1660,29 @@ function renderTriggerCaseRow(testCase) {
|
|
|
1540
1660
|
</div>
|
|
1541
1661
|
${renderDefinitionList([
|
|
1542
1662
|
{ label: "Expected", value: testCase.expected },
|
|
1543
|
-
{ label: "Actual", value: testCase.actual }
|
|
1663
|
+
{ label: "Actual", value: testCase.actual },
|
|
1664
|
+
...showSelectedCompetitor ? [{ label: "Selected competitor", value: testCase.selectedCompetitor ?? "none" }] : []
|
|
1544
1665
|
])}
|
|
1545
1666
|
${details}
|
|
1546
1667
|
</div>
|
|
1547
1668
|
`;
|
|
1548
1669
|
}
|
|
1670
|
+
/**
 * Renders the "Competitor Skills" section card of a trigger report.
 *
 * Each competitor becomes a "warn" message row showing its name, description,
 * and source path.
 *
 * @param {{competitors?: Array<{name: string, description: string, sourcePath: string}>}} result
 *   Trigger result that may list competitor skills.
 * @returns {string} Section HTML, or "" when there are no competitors.
 */
function renderCompetitorSkillsSection(result) {
  const hasCompetitors = Boolean(result.competitors) && result.competitors.length !== 0;
  if (!hasCompetitors) {
    return "";
  }
  const rows = result.competitors.map((competitor) => {
    return renderMessageRow(
      "warn",
      competitor.name,
      competitor.description,
      renderDefinitionList([{ label: "Source", value: competitor.sourcePath }])
    );
  });
  const listMarkup = `<div class="row-list">${rows.join("")}</div>`;
  return renderSectionCard("Competitor Skills", listMarkup);
}
|
|
1549
1686
|
function promptStatus(promptResult) {
|
|
1550
1687
|
if (promptResult.totalAssertions === 0) {
|
|
1551
1688
|
return "skip";
|
|
@@ -1559,10 +1696,37 @@ function promptStatus(promptResult) {
|
|
|
1559
1696
|
return "warn";
|
|
1560
1697
|
}
|
|
1561
1698
|
/**
 * Renders one graded assertion as a `<details class="detail-block">` element.
 *
 * The `<summary>` holds a pass/fail badge chosen from `assertion.passed`, a
 * "Tool" meta badge only when `assertion.source === "tool"`, and the
 * HTML-escaped assertion text. The detail body is `assertion.evidence` passed
 * through renderPreBlock.
 *
 * @param {{passed: boolean, source?: string, assertion: string, evidence: string}} assertion
 * @returns {string} HTML markup for the assertion row.
 */
function renderAssertionRow(assertion) {
|
|
1562
|
-
return
|
|
1563
|
-
|
|
1564
|
-
|
|
1565
|
-
|
|
1699
|
+
return `
|
|
1700
|
+
<details class="detail-block">
|
|
1701
|
+
<summary>
|
|
1702
|
+
${renderBadge(assertion.passed ? "pass" : "fail")}
|
|
1703
|
+
${assertion.source === "tool" ? renderMetaBadge("Tool") : ""}
|
|
1704
|
+
<span>${escapeHtml(assertion.assertion)}</span>
|
|
1705
|
+
</summary>
|
|
1706
|
+
<div class="detail-content">${renderPreBlock(assertion.evidence)}</div>
|
|
1707
|
+
</details>
|
|
1708
|
+
`;
|
|
1709
|
+
}
|
|
1710
|
+
/**
 * Renders the tool calls recorded for one eval prompt result.
 *
 * Returns "" when `promptResult.toolCalls` is missing or empty. Otherwise each
 * call becomes a `.tool-call` block showing the escaped tool name, its turn
 * index, a "Tool Call" meta badge, the JSON-stringified arguments, and the
 * response wrapped by renderDetails under the label "Mock response". The
 * blocks are wrapped in a `.tool-call-list` container and passed to
 * renderDetails with the label "Tool Calls".
 *
 * @param {{toolCalls?: Array<{name: string, turnIndex: number, arguments: unknown, response: string}>}} promptResult
 * @returns {string} HTML markup, or "" when there is nothing to show.
 */
function renderToolCallsSection(promptResult) {
|
|
1711
|
+
if (!promptResult.toolCalls || promptResult.toolCalls.length === 0) {
|
|
1712
|
+
return "";
|
|
1713
|
+
}
|
|
1714
|
+
const toolRows = promptResult.toolCalls.map(
|
|
1715
|
+
(toolCall) => `
|
|
1716
|
+
<div class="tool-call">
|
|
1717
|
+
<div class="row-header">
|
|
1718
|
+
<div>
|
|
1719
|
+
<div class="row-title">${escapeHtml(toolCall.name)}</div>
|
|
1720
|
+
<div class="row-subtitle">${escapeHtml(`turn ${toolCall.turnIndex}`)}</div>
|
|
1721
|
+
</div>
|
|
1722
|
+
${renderMetaBadge("Tool Call")}
|
|
1723
|
+
</div>
|
|
1724
|
+
${renderDefinitionList([{ label: "Arguments", value: JSON.stringify(toolCall.arguments) }])}
|
|
1725
|
+
${renderDetails("Mock response", renderPreBlock(toolCall.response))}
|
|
1726
|
+
</div>
|
|
1727
|
+
`
|
|
1728
|
+
).join("");
|
|
1729
|
+
return renderDetails("Tool Calls", `<div class="tool-call-list">${toolRows}</div>`);
|
|
1566
1730
|
}
|
|
1567
1731
|
function renderEvalPromptRow(promptResult) {
|
|
1568
1732
|
const assertionDetails = promptResult.assertions.map((assertion) => renderAssertionRow(assertion)).join("");
|
|
@@ -1581,9 +1745,12 @@ function renderEvalPromptRow(promptResult) {
|
|
|
1581
1745
|
<div class="row-body">${escapeHtml(promptResult.responseSummary)}</div>
|
|
1582
1746
|
${renderDefinitionList([
|
|
1583
1747
|
{ label: "Passed assertions", value: String(promptResult.passedAssertions) },
|
|
1584
|
-
{ label: "Total assertions", value: String(promptResult.totalAssertions) }
|
|
1748
|
+
{ label: "Total assertions", value: String(promptResult.totalAssertions) },
|
|
1749
|
+
...promptResult.toolCalls ? [{ label: "Tool calls", value: String(promptResult.toolCalls.length) }] : [],
|
|
1750
|
+
...promptResult.loopIterations !== void 0 ? [{ label: "Loop iterations", value: String(promptResult.loopIterations) }] : []
|
|
1585
1751
|
])}
|
|
1586
1752
|
${renderDetails("Assertion evidence", assertionDetails || `<p>No assertions.</p>`)}
|
|
1753
|
+
${renderToolCallsSection(promptResult)}
|
|
1587
1754
|
${responseDetails}
|
|
1588
1755
|
</div>
|
|
1589
1756
|
`;
|
|
@@ -1638,6 +1805,7 @@ function renderHtmlDocument(title, body) {
|
|
|
1638
1805
|
--pass: #22c55e;
|
|
1639
1806
|
--warn: #eab308;
|
|
1640
1807
|
--fail: #ef4444;
|
|
1808
|
+
--competitor: #f97316;
|
|
1641
1809
|
--skip: #6b7280;
|
|
1642
1810
|
--shadow: 0 10px 30px rgba(15, 23, 42, 0.08);
|
|
1643
1811
|
}
|
|
@@ -1786,6 +1954,11 @@ function renderHtmlDocument(title, body) {
|
|
|
1786
1954
|
background: var(--surface-muted);
|
|
1787
1955
|
}
|
|
1788
1956
|
|
|
1957
|
+
.row.competitor-selected {
|
|
1958
|
+
border-color: rgba(249, 115, 22, 0.45);
|
|
1959
|
+
background: rgba(249, 115, 22, 0.08);
|
|
1960
|
+
}
|
|
1961
|
+
|
|
1789
1962
|
.row-header {
|
|
1790
1963
|
display: flex;
|
|
1791
1964
|
justify-content: space-between;
|
|
@@ -1844,6 +2017,20 @@ function renderHtmlDocument(title, body) {
|
|
|
1844
2017
|
background: rgba(107, 114, 128, 0.14);
|
|
1845
2018
|
}
|
|
1846
2019
|
|
|
2020
|
+
.meta-badge {
|
|
2021
|
+
display: inline-flex;
|
|
2022
|
+
align-items: center;
|
|
2023
|
+
justify-content: center;
|
|
2024
|
+
padding: 3px 10px;
|
|
2025
|
+
border-radius: 999px;
|
|
2026
|
+
border: 1px solid rgba(17, 24, 39, 0.16);
|
|
2027
|
+
background: rgba(17, 24, 39, 0.06);
|
|
2028
|
+
color: var(--text);
|
|
2029
|
+
font-size: 0.76rem;
|
|
2030
|
+
font-weight: 700;
|
|
2031
|
+
white-space: nowrap;
|
|
2032
|
+
}
|
|
2033
|
+
|
|
1847
2034
|
details {
|
|
1848
2035
|
margin-top: 10px;
|
|
1849
2036
|
}
|
|
@@ -1858,6 +2045,13 @@ function renderHtmlDocument(title, body) {
|
|
|
1858
2045
|
padding-top: 10px;
|
|
1859
2046
|
}
|
|
1860
2047
|
|
|
2048
|
+
.detail-block summary {
|
|
2049
|
+
display: flex;
|
|
2050
|
+
align-items: center;
|
|
2051
|
+
gap: 8px;
|
|
2052
|
+
flex-wrap: wrap;
|
|
2053
|
+
}
|
|
2054
|
+
|
|
1861
2055
|
.detail-content p {
|
|
1862
2056
|
margin: 0;
|
|
1863
2057
|
}
|
|
@@ -1908,6 +2102,18 @@ function renderHtmlDocument(title, body) {
|
|
|
1908
2102
|
overflow-wrap: anywhere;
|
|
1909
2103
|
}
|
|
1910
2104
|
|
|
2105
|
+
.tool-call-list {
|
|
2106
|
+
display: grid;
|
|
2107
|
+
gap: 12px;
|
|
2108
|
+
}
|
|
2109
|
+
|
|
2110
|
+
.tool-call {
|
|
2111
|
+
border: 1px solid var(--border);
|
|
2112
|
+
border-radius: 12px;
|
|
2113
|
+
padding: 14px;
|
|
2114
|
+
background: #fffaf0;
|
|
2115
|
+
}
|
|
2116
|
+
|
|
1911
2117
|
ul {
|
|
1912
2118
|
margin: 0;
|
|
1913
2119
|
padding-left: 20px;
|
|
@@ -1965,6 +2171,7 @@ function renderTriggerHtml(result) {
|
|
|
1965
2171
|
const target = resolveOptionalTarget(htmlResult, result.skillName);
|
|
1966
2172
|
const matchedCount = result.cases.filter((testCase) => testCase.matched).length;
|
|
1967
2173
|
const matchRate = result.cases.length === 0 ? 0 : matchedCount / result.cases.length;
|
|
2174
|
+
const hasCompetitors = Boolean(result.competitors && result.competitors.length > 0);
|
|
1968
2175
|
const body = [
|
|
1969
2176
|
renderHeaderCard(
|
|
1970
2177
|
"trigger",
|
|
@@ -1980,10 +2187,15 @@ function renderTriggerHtml(result) {
|
|
|
1980
2187
|
{ label: "Provider", value: result.provider },
|
|
1981
2188
|
{ label: "Model", value: result.model },
|
|
1982
2189
|
{ label: "Seed", value: result.seed !== void 0 ? String(result.seed) : "none" },
|
|
2190
|
+
...hasCompetitors ? [{ label: "Competitors", value: String(result.competitors?.length ?? 0) }] : [],
|
|
1983
2191
|
{ label: "Queries", value: String(result.queries.length) }
|
|
1984
2192
|
]
|
|
1985
2193
|
),
|
|
1986
|
-
|
|
2194
|
+
renderCompetitorSkillsSection(result),
|
|
2195
|
+
renderSectionCard(
|
|
2196
|
+
"Trigger Cases",
|
|
2197
|
+
`<div class="row-list">${result.cases.map((testCase) => renderTriggerCaseRow(testCase, hasCompetitors)).join("")}</div>`
|
|
2198
|
+
),
|
|
1987
2199
|
renderSectionCard(
|
|
1988
2200
|
"Suggestions",
|
|
1989
2201
|
`<ul>${result.suggestions.map((suggestion) => `<li>${escapeHtml(suggestion)}</li>`).join("")}</ul>`
|
|
@@ -2023,7 +2235,8 @@ function renderEvalHtml(result) {
|
|
|
2023
2235
|
}
|
|
2024
2236
|
function renderCheckHtml(result) {
|
|
2025
2237
|
const skillName = result.trigger?.skillName ?? result.eval?.skillName ?? result.target;
|
|
2026
|
-
const triggerBody = result.trigger ?
|
|
2238
|
+
const triggerBody = result.trigger ? `${renderCompetitorSkillsSection(result.trigger)}
|
|
2239
|
+
<div class="row-list">${result.trigger.cases.map((testCase) => renderTriggerCaseRow(testCase, Boolean(result.trigger?.competitors?.length))).join("")}</div>
|
|
2027
2240
|
<div class="card" style="margin-top: 16px;">
|
|
2028
2241
|
<h2>Trigger Suggestions</h2>
|
|
2029
2242
|
<ul>${result.trigger.suggestions.map((suggestion) => `<li>${escapeHtml(suggestion)}</li>`).join("")}</ul>
|
|
@@ -2123,46 +2336,111 @@ function countSkippedSecurityPatterns2(issues) {
|
|
|
2123
2336
|
return total + (issue.skippedPatterns?.length ?? 0);
|
|
2124
2337
|
}, 0);
|
|
2125
2338
|
}
|
|
2339
|
+
/**
 * Formats a ratio as a percentage with one decimal place.
 *
 * @param {number} value - Ratio to format (0.5 means 50%).
 * @returns {string} For example "50.0%".
 */
function formatPercent2(value) {
  const scaled = value * 100;
  return `${scaled.toFixed(1)}%`;
}
|
|
2342
|
+
/**
 * Formats a number with a fixed number of decimals and an explicit "+" for
 * positive values. Negative values keep the "-" produced by toFixed; zero gets
 * no sign.
 *
 * @param {number} value - Number to format.
 * @param {number} [digits=4] - Number of decimal places.
 * @returns {string} For example "+1.5000" or "-2.0".
 */
function formatSignedNumber(value, digits = 4) {
  const fixed = value.toFixed(digits);
  if (value > 0) {
    return `+${fixed}`;
  }
  return fixed;
}
|
|
2346
|
+
/**
 * Computes a line-level diff between two texts and returns only the changes.
 *
 * Both texts are split on LF or CRLF. A longest-common-subsequence table is
 * filled from the end of both line lists, then walked from the start: equal
 * lines are skipped, and for differing lines the side whose removal keeps the
 * longer common subsequence is emitted. On a tie the old line is emitted first.
 *
 * @param {string} beforeText - Original text.
 * @param {string} afterText - Updated text.
 * @returns {Array<{type: "-" | "+", line: string}>} Removed ("-") and added
 *   ("+") lines in document order; empty when the line lists are identical.
 */
function diffChangedLines(beforeText, afterText) {
  const oldLines = beforeText.split(/\r?\n/);
  const newLines = afterText.split(/\r?\n/);
  const oldCount = oldLines.length;
  const newCount = newLines.length;

  // lcs[r][c] is the LCS length of oldLines[r..] and newLines[c..]; the extra
  // last row and column stay 0 and act as the base case.
  const lcs = [];
  for (let r = 0; r <= oldCount; r += 1) {
    lcs.push(new Array(newCount + 1).fill(0));
  }
  for (let r = oldCount - 1; r >= 0; r -= 1) {
    for (let c = newCount - 1; c >= 0; c -= 1) {
      lcs[r][c] = oldLines[r] === newLines[c]
        ? lcs[r + 1][c + 1] + 1
        : Math.max(lcs[r + 1][c], lcs[r][c + 1]);
    }
  }

  const changes = [];
  let row = 0;
  let col = 0;
  while (row < oldCount || col < newCount) {
    const bothRemain = row < oldCount && col < newCount;
    if (bothRemain && oldLines[row] === newLines[col]) {
      row += 1;
      col += 1;
      continue;
    }
    // With one side exhausted, drain the other; otherwise follow the table.
    const dropOld = bothRemain ? lcs[row + 1][col] >= lcs[row][col + 1] : row < oldCount;
    if (dropOld) {
      changes.push({ type: "-", line: oldLines[row] });
      row += 1;
    } else {
      changes.push({ type: "+", line: newLines[col] });
      col += 1;
    }
  }
  return changes;
}
|
|
2388
|
+
/**
 * Builds a short, printable preview of the changed lines between two texts.
 *
 * At most `maxLines` changed lines from diffChangedLines are rendered, each
 * with its "-" or "+" marker. A trailing line reports how many more changed
 * lines were left out, and a single placeholder line is returned when nothing
 * changed.
 *
 * @param {string} beforeText - Original text.
 * @param {string} afterText - Updated text.
 * @param {number} [maxLines=40] - Maximum number of changed lines to render.
 * @returns {string[]} Preview lines ready to be joined by the caller.
 */
function renderDiffPreview(beforeText, afterText, maxLines = 40) {
  const changes = diffChangedLines(beforeText, afterText);
  const totalChanges = changes.length;
  if (totalChanges === 0) {
    return [" (no content changes)"];
  }
  const preview = [];
  for (const change of changes.slice(0, maxLines)) {
    preview.push(` ${change.type} ${change.line}`);
  }
  if (totalChanges > maxLines) {
    const hiddenCount = totalChanges - maxLines;
    preview.push(` ... ${hiddenCount} more changed line(s)`);
  }
  return preview;
}
|
|
2399
|
+
/**
 * Summarizes tool calls as "name xCount" pairs joined by ", ".
 *
 * Names appear in the order they were first seen.
 *
 * @param {Iterable<{name: string}>} toolCalls - Recorded tool calls.
 * @returns {string} For example "search x2, read x1"; "" for no calls.
 */
function summarizeToolCalls(toolCalls) {
  const tally = new Map();
  for (const { name } of toolCalls) {
    const seenSoFar = tally.has(name) ? tally.get(name) ?? 0 : 0;
    tally.set(name, seenSoFar + 1);
  }
  const parts = [];
  for (const [toolName, occurrences] of tally) {
    parts.push(`${toolName} x${occurrences}`);
  }
  return parts.join(", ");
}
|
|
2126
2406
|
/**
 * Renders a lint report as plain terminal text.
 *
 * The output starts with three header lines (tool name, target, summary
 * counts), followed by one rendered line per issue from renderIssueLine. When
 * countSkippedSecurityPatterns2 reports a positive count, an INFO line with
 * that count is appended.
 *
 * @param {{target: string, summary: {passed: number, warnings: number, failures: number, total: number}, issues: object[]}} report
 * @param {boolean} enableColor - Passed to getChalkInstance.
 * @returns {string} The full report text.
 */
function renderLintReport(report, enableColor) {
  const chalk = getChalkInstance(enableColor);
  const summary = report.summary;
  const header = [
    "skilltest lint",
    `target: ${report.target}`,
    `summary: ${summary.passed}/${summary.total} checks passed, ${summary.warnings} warnings, ${summary.failures} failures`
  ].join("\n");
  const issueLines = report.issues.map((issue) => renderIssueLine(issue, chalk)).join("\n");
  const skippedCount = countSkippedSecurityPatterns2(report.issues);
  let infoLine = "";
  if (skippedCount > 0) {
    infoLine = `\n${chalk.cyan("INFO")} ${skippedCount} security pattern(s) found in code examples/comments (not flagged)`;
  }
  return `${header}\n${issueLines}${infoLine}`;
}
|
|
2144
|
-
function formatPercent2(value) {
|
|
2145
|
-
return `${(value * 100).toFixed(1)}%`;
|
|
2146
|
-
}
|
|
2147
2421
|
function renderTriggerReport(result, enableColor, verbose) {
|
|
2148
2422
|
const c = getChalkInstance(enableColor);
|
|
2149
|
-
const lines = [
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
|
|
2153
|
-
|
|
2154
|
-
|
|
2423
|
+
const lines = [
|
|
2424
|
+
"skilltest trigger",
|
|
2425
|
+
`skill: ${result.skillName}`,
|
|
2426
|
+
`provider/model: ${result.provider}/${result.model}`
|
|
2427
|
+
];
|
|
2428
|
+
if (result.competitors && result.competitors.length > 0) {
|
|
2429
|
+
lines.push(`competitors: ${result.competitors.map((competitor) => competitor.name).join(", ")}`);
|
|
2430
|
+
}
|
|
2155
2431
|
lines.push(
|
|
2156
|
-
|
|
2432
|
+
`precision: ${formatPercent2(result.metrics.precision)} recall: ${formatPercent2(result.metrics.recall)} f1: ${formatPercent2(result.metrics.f1)}`
|
|
2157
2433
|
);
|
|
2158
2434
|
lines.push(
|
|
2159
|
-
|
|
2435
|
+
`TP ${result.metrics.truePositives} TN ${result.metrics.trueNegatives} FP ${result.metrics.falsePositives} FN ${result.metrics.falseNegatives}`
|
|
2160
2436
|
);
|
|
2161
|
-
lines.push("\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518");
|
|
2162
2437
|
for (const [index, testCase] of result.cases.entries()) {
|
|
2163
2438
|
const status = testCase.matched ? c.green("PASS") : c.red("FAIL");
|
|
2164
2439
|
lines.push(`${index + 1}. ${status} query: ${testCase.query}`);
|
|
2165
2440
|
lines.push(` expected: ${testCase.expected} | actual: ${testCase.actual}`);
|
|
2441
|
+
if (verbose && testCase.selectedCompetitor) {
|
|
2442
|
+
lines.push(` competitor selected: ${testCase.selectedCompetitor}`);
|
|
2443
|
+
}
|
|
2166
2444
|
if (verbose && testCase.rawModelResponse) {
|
|
2167
2445
|
lines.push(` model: ${testCase.rawModelResponse.replace(/\s+/g, " ").trim()}`);
|
|
2168
2446
|
}
|
|
@@ -2175,24 +2453,35 @@ function renderTriggerReport(result, enableColor, verbose) {
|
|
|
2175
2453
|
}
|
|
2176
2454
|
function renderEvalReport(result, enableColor, verbose) {
|
|
2177
2455
|
const c = getChalkInstance(enableColor);
|
|
2178
|
-
const lines = [
|
|
2179
|
-
|
|
2180
|
-
|
|
2181
|
-
|
|
2182
|
-
|
|
2183
|
-
|
|
2184
|
-
|
|
2185
|
-
lines.push(`\u2502 assertions passed: ${result.summary.passedAssertions}/${result.summary.totalAssertions}`);
|
|
2186
|
-
lines.push("\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518");
|
|
2456
|
+
const lines = [
|
|
2457
|
+
"skilltest eval",
|
|
2458
|
+
`skill: ${result.skillName}`,
|
|
2459
|
+
`provider/model: ${result.provider}/${result.model}`,
|
|
2460
|
+
`grader model: ${result.graderModel}`,
|
|
2461
|
+
`assertions passed: ${result.summary.passedAssertions}/${result.summary.totalAssertions}`
|
|
2462
|
+
];
|
|
2187
2463
|
for (const [index, promptResult] of result.results.entries()) {
|
|
2188
2464
|
lines.push(`${index + 1}. prompt: ${promptResult.prompt}`);
|
|
2189
2465
|
lines.push(` response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`);
|
|
2466
|
+
if (promptResult.toolCalls) {
|
|
2467
|
+
lines.push(` Tools: ${promptResult.toolCalls.length} calls (${summarizeToolCalls(promptResult.toolCalls)})`);
|
|
2468
|
+
if (promptResult.loopIterations !== void 0) {
|
|
2469
|
+
lines.push(` loop iterations: ${promptResult.loopIterations}`);
|
|
2470
|
+
}
|
|
2471
|
+
}
|
|
2190
2472
|
for (const assertion of promptResult.assertions) {
|
|
2191
2473
|
const status = assertion.passed ? c.green("PASS") : c.red("FAIL");
|
|
2192
2474
|
lines.push(` ${status} ${assertion.assertion}`);
|
|
2193
2475
|
lines.push(` evidence: ${assertion.evidence}`);
|
|
2194
2476
|
}
|
|
2195
2477
|
if (verbose) {
|
|
2478
|
+
if (promptResult.toolCalls) {
|
|
2479
|
+
for (const toolCall of promptResult.toolCalls) {
|
|
2480
|
+
lines.push(` tool ${toolCall.turnIndex}: ${toolCall.name}`);
|
|
2481
|
+
lines.push(` arguments: ${JSON.stringify(toolCall.arguments)}`);
|
|
2482
|
+
lines.push(` response: ${toolCall.response}`);
|
|
2483
|
+
}
|
|
2484
|
+
}
|
|
2196
2485
|
lines.push(` full response: ${promptResult.response}`);
|
|
2197
2486
|
}
|
|
2198
2487
|
}
|
|
@@ -2229,7 +2518,7 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2229
2518
|
}
|
|
2230
2519
|
const skippedSecurityPatterns = countSkippedSecurityPatterns2(result.lint.issues);
|
|
2231
2520
|
if (skippedSecurityPatterns > 0) {
|
|
2232
|
-
lines.push(` ${c.cyan("
|
|
2521
|
+
lines.push(` ${c.cyan("INFO")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)`);
|
|
2233
2522
|
}
|
|
2234
2523
|
lines.push("");
|
|
2235
2524
|
lines.push("Trigger");
|
|
@@ -2240,11 +2529,17 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2240
2529
|
lines.push(
|
|
2241
2530
|
` TP ${result.trigger.metrics.truePositives} TN ${result.trigger.metrics.trueNegatives} FP ${result.trigger.metrics.falsePositives} FN ${result.trigger.metrics.falseNegatives}`
|
|
2242
2531
|
);
|
|
2532
|
+
if (result.trigger.competitors && result.trigger.competitors.length > 0) {
|
|
2533
|
+
lines.push(` competitors: ${result.trigger.competitors.map((competitor) => competitor.name).join(", ")}`);
|
|
2534
|
+
}
|
|
2243
2535
|
const triggerCases = verbose ? result.trigger.cases : result.trigger.cases.filter((testCase) => !testCase.matched);
|
|
2244
2536
|
for (const testCase of triggerCases) {
|
|
2245
2537
|
const status = testCase.matched ? c.green("PASS") : c.red("FAIL");
|
|
2246
2538
|
lines.push(` - ${status} ${testCase.query}`);
|
|
2247
2539
|
lines.push(` expected=${testCase.expected} actual=${testCase.actual}`);
|
|
2540
|
+
if (testCase.selectedCompetitor) {
|
|
2541
|
+
lines.push(` competitor selected=${testCase.selectedCompetitor}`);
|
|
2542
|
+
}
|
|
2248
2543
|
}
|
|
2249
2544
|
} else {
|
|
2250
2545
|
lines.push(`- ${triggerGate} ${result.triggerSkippedReason ?? "Skipped."}`);
|
|
@@ -2263,6 +2558,12 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2263
2558
|
}
|
|
2264
2559
|
lines.push(` - prompt: ${promptResult.prompt}`);
|
|
2265
2560
|
lines.push(` response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`);
|
|
2561
|
+
if (promptResult.toolCalls) {
|
|
2562
|
+
lines.push(` Tools: ${promptResult.toolCalls.length} calls (${summarizeToolCalls(promptResult.toolCalls)})`);
|
|
2563
|
+
if (promptResult.loopIterations !== void 0) {
|
|
2564
|
+
lines.push(` loop iterations: ${promptResult.loopIterations}`);
|
|
2565
|
+
}
|
|
2566
|
+
}
|
|
2266
2567
|
const assertionsToRender = verbose ? promptResult.assertions : failedAssertions;
|
|
2267
2568
|
for (const assertion of assertionsToRender) {
|
|
2268
2569
|
const assertionStatus = assertion.passed ? c.green("PASS") : c.red("FAIL");
|
|
@@ -2270,6 +2571,13 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2270
2571
|
lines.push(` evidence: ${assertion.evidence}`);
|
|
2271
2572
|
}
|
|
2272
2573
|
if (verbose) {
|
|
2574
|
+
if (promptResult.toolCalls) {
|
|
2575
|
+
for (const toolCall of promptResult.toolCalls) {
|
|
2576
|
+
lines.push(` tool ${toolCall.turnIndex}: ${toolCall.name}`);
|
|
2577
|
+
lines.push(` arguments: ${JSON.stringify(toolCall.arguments)}`);
|
|
2578
|
+
lines.push(` response: ${toolCall.response}`);
|
|
2579
|
+
}
|
|
2580
|
+
}
|
|
2273
2581
|
lines.push(` full response: ${promptResult.response}`);
|
|
2274
2582
|
}
|
|
2275
2583
|
}
|
|
@@ -2284,9 +2592,76 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2284
2592
|
lines.push(`- overall: ${overallGate}`);
|
|
2285
2593
|
return lines.join("\n");
|
|
2286
2594
|
}
|
|
2595
|
+
function renderImproveReport(result, enableColor, verbose = false) {
|
|
2596
|
+
const c = getChalkInstance(enableColor);
|
|
2597
|
+
const lines = [
|
|
2598
|
+
"skilltest improve",
|
|
2599
|
+
`target: ${result.target}`,
|
|
2600
|
+
`provider/model: ${result.provider}/${result.model}`,
|
|
2601
|
+
`thresholds: min-f1=${result.thresholds.minF1.toFixed(2)} min-assert-pass-rate=${result.thresholds.minAssertPassRate.toFixed(2)}`
|
|
2602
|
+
];
|
|
2603
|
+
const statusLabel = result.blockedReason ? c.red("BLOCKED") : result.applied ? c.green("APPLIED") : c.green("VERIFIED");
|
|
2604
|
+
lines.push(`status: ${statusLabel}`);
|
|
2605
|
+
if (result.candidate) {
|
|
2606
|
+
lines.push("");
|
|
2607
|
+
lines.push("Change Summary");
|
|
2608
|
+
for (const item of result.candidate.changeSummary) {
|
|
2609
|
+
lines.push(`- ${item}`);
|
|
2610
|
+
}
|
|
2611
|
+
lines.push("");
|
|
2612
|
+
lines.push("Targeted Problems");
|
|
2613
|
+
for (const item of result.candidate.targetedProblems) {
|
|
2614
|
+
lines.push(`- ${item}`);
|
|
2615
|
+
}
|
|
2616
|
+
}
|
|
2617
|
+
if (result.delta && result.verification) {
|
|
2618
|
+
lines.push("");
|
|
2619
|
+
lines.push("Before / After");
|
|
2620
|
+
lines.push(
|
|
2621
|
+
`- lint failures: ${result.delta.lintFailures.before} -> ${result.delta.lintFailures.after} (${formatSignedNumber(result.delta.lintFailures.delta, 0)})`
|
|
2622
|
+
);
|
|
2623
|
+
lines.push(
|
|
2624
|
+
`- lint warnings: ${result.delta.lintWarnings.before} -> ${result.delta.lintWarnings.after} (${formatSignedNumber(result.delta.lintWarnings.delta, 0)})`
|
|
2625
|
+
);
|
|
2626
|
+
lines.push(
|
|
2627
|
+
`- trigger f1: ${formatPercent2(result.delta.triggerF1.before)} -> ${formatPercent2(result.delta.triggerF1.after)} (${formatSignedNumber(result.delta.triggerF1.delta)})`
|
|
2628
|
+
);
|
|
2629
|
+
lines.push(
|
|
2630
|
+
`- eval assertion pass rate: ${formatPercent2(result.delta.evalAssertPassRate.before)} -> ${formatPercent2(result.delta.evalAssertPassRate.after)} (${formatSignedNumber(result.delta.evalAssertPassRate.delta)})`
|
|
2631
|
+
);
|
|
2632
|
+
lines.push(
|
|
2633
|
+
`- overall gate: ${result.delta.overallPassed.before ? c.green("PASS") : c.red("FAIL")} -> ${result.delta.overallPassed.after ? c.green("PASS") : c.red("FAIL")}`
|
|
2634
|
+
);
|
|
2635
|
+
}
|
|
2636
|
+
if (result.outputPath) {
|
|
2637
|
+
lines.push("");
|
|
2638
|
+
lines.push(`output: ${result.outputPath}`);
|
|
2639
|
+
}
|
|
2640
|
+
if (result.blockedReason) {
|
|
2641
|
+
lines.push("");
|
|
2642
|
+
lines.push("Blocked");
|
|
2643
|
+
lines.push(`- ${result.blockedReason}`);
|
|
2644
|
+
}
|
|
2645
|
+
if (result.candidate) {
|
|
2646
|
+
lines.push("");
|
|
2647
|
+
lines.push("Diff Preview");
|
|
2648
|
+
lines.push(...renderDiffPreview(result.originalRaw, result.candidate.raw));
|
|
2649
|
+
}
|
|
2650
|
+
if (verbose) {
|
|
2651
|
+
lines.push("");
|
|
2652
|
+
lines.push("Baseline");
|
|
2653
|
+
lines.push(renderCheckReport(result.baseline, enableColor, true));
|
|
2654
|
+
if (result.verification) {
|
|
2655
|
+
lines.push("");
|
|
2656
|
+
lines.push("Verification");
|
|
2657
|
+
lines.push(renderCheckReport(result.verification, enableColor, true));
|
|
2658
|
+
}
|
|
2659
|
+
}
|
|
2660
|
+
return lines.join("\n");
|
|
2661
|
+
}
|
|
2287
2662
|
|
|
2288
2663
|
// src/commands/common.ts
|
|
2289
|
-
import
|
|
2664
|
+
import fs6 from "node:fs/promises";
|
|
2290
2665
|
import { z as z5 } from "zod";
|
|
2291
2666
|
|
|
2292
2667
|
// src/core/eval-runner.ts
|
|
@@ -2355,7 +2730,10 @@ function parseGraderOutput(raw) {
|
|
|
2355
2730
|
async function gradeResponse(options) {
|
|
2356
2731
|
const prompts = buildGraderPrompts(options);
|
|
2357
2732
|
const raw = await options.provider.sendMessage(prompts.systemPrompt, prompts.userPrompt, { model: options.model });
|
|
2358
|
-
return parseGraderOutput(raw)
|
|
2733
|
+
return parseGraderOutput(raw).map((assertion) => ({
|
|
2734
|
+
...assertion,
|
|
2735
|
+
source: "grader"
|
|
2736
|
+
}));
|
|
2359
2737
|
}
|
|
2360
2738
|
|
|
2361
2739
|
// src/utils/concurrency.ts
|
|
@@ -2410,12 +2788,290 @@ async function pMap(items, fn, concurrency) {
|
|
|
2410
2788
|
});
|
|
2411
2789
|
}
|
|
2412
2790
|
|
|
2791
|
+
// src/core/tool-environment.ts
|
|
2792
|
+
function isPlainObject(value) {
|
|
2793
|
+
return value !== null && typeof value === "object" && !Array.isArray(value);
|
|
2794
|
+
}
|
|
2795
|
+
function deepEqual(left, right) {
|
|
2796
|
+
if (Array.isArray(left) && Array.isArray(right)) {
|
|
2797
|
+
if (left.length !== right.length) {
|
|
2798
|
+
return false;
|
|
2799
|
+
}
|
|
2800
|
+
return left.every((item, index) => deepEqual(item, right[index]));
|
|
2801
|
+
}
|
|
2802
|
+
if (isPlainObject(left) && isPlainObject(right)) {
|
|
2803
|
+
const leftKeys = Object.keys(left);
|
|
2804
|
+
const rightKeys = Object.keys(right);
|
|
2805
|
+
if (leftKeys.length !== rightKeys.length) {
|
|
2806
|
+
return false;
|
|
2807
|
+
}
|
|
2808
|
+
return leftKeys.every((key) => deepEqual(left[key], right[key]));
|
|
2809
|
+
}
|
|
2810
|
+
return left === right;
|
|
2811
|
+
}
|
|
2812
|
+
function matchesArgumentSubset(actual, expected) {
|
|
2813
|
+
if (Array.isArray(expected)) {
|
|
2814
|
+
if (!Array.isArray(actual) || actual.length !== expected.length) {
|
|
2815
|
+
return false;
|
|
2816
|
+
}
|
|
2817
|
+
return expected.every((value, index) => matchesArgumentSubset(actual[index], value));
|
|
2818
|
+
}
|
|
2819
|
+
if (isPlainObject(expected)) {
|
|
2820
|
+
if (!isPlainObject(actual)) {
|
|
2821
|
+
return false;
|
|
2822
|
+
}
|
|
2823
|
+
return Object.entries(expected).every(([key, value]) => matchesArgumentSubset(actual[key], value));
|
|
2824
|
+
}
|
|
2825
|
+
return deepEqual(actual, expected);
|
|
2826
|
+
}
|
|
2827
|
+
function parseResponsePattern(pattern) {
|
|
2828
|
+
if (pattern === "*") {
|
|
2829
|
+
return null;
|
|
2830
|
+
}
|
|
2831
|
+
try {
|
|
2832
|
+
const parsed = JSON.parse(pattern);
|
|
2833
|
+
return isPlainObject(parsed) ? parsed : null;
|
|
2834
|
+
} catch {
|
|
2835
|
+
return null;
|
|
2836
|
+
}
|
|
2837
|
+
}
|
|
2838
|
+
function renderFallbackResponse(tool, args) {
|
|
2839
|
+
return `[mock] No mock response configured for tool '${tool.name}' with arguments: ${JSON.stringify(args)}`;
|
|
2840
|
+
}
|
|
2841
|
+
function resolveToolResponse(tool, args) {
|
|
2842
|
+
const exactMatchKey = JSON.stringify(args);
|
|
2843
|
+
const exactMatch = tool.responses[exactMatchKey];
|
|
2844
|
+
if (exactMatch !== void 0) {
|
|
2845
|
+
return exactMatch;
|
|
2846
|
+
}
|
|
2847
|
+
let bestPartialMatch = null;
|
|
2848
|
+
for (const [pattern, response] of Object.entries(tool.responses)) {
|
|
2849
|
+
if (pattern === "*") {
|
|
2850
|
+
continue;
|
|
2851
|
+
}
|
|
2852
|
+
const parsedPattern = parseResponsePattern(pattern);
|
|
2853
|
+
if (!parsedPattern) {
|
|
2854
|
+
continue;
|
|
2855
|
+
}
|
|
2856
|
+
if (!matchesArgumentSubset(args, parsedPattern)) {
|
|
2857
|
+
continue;
|
|
2858
|
+
}
|
|
2859
|
+
const specificity = Object.keys(parsedPattern).length;
|
|
2860
|
+
if (!bestPartialMatch || specificity > bestPartialMatch.specificity) {
|
|
2861
|
+
bestPartialMatch = { specificity, response };
|
|
2862
|
+
}
|
|
2863
|
+
}
|
|
2864
|
+
if (bestPartialMatch) {
|
|
2865
|
+
return bestPartialMatch.response;
|
|
2866
|
+
}
|
|
2867
|
+
const wildcardMatch = tool.responses["*"];
|
|
2868
|
+
if (wildcardMatch !== void 0) {
|
|
2869
|
+
return wildcardMatch;
|
|
2870
|
+
}
|
|
2871
|
+
return renderFallbackResponse(tool, args);
|
|
2872
|
+
}
|
|
2873
|
+
function toProviderToolDefinitions(mockTools) {
|
|
2874
|
+
return mockTools.map((tool) => {
|
|
2875
|
+
const parameters = tool.parameters ?? [];
|
|
2876
|
+
return {
|
|
2877
|
+
name: tool.name,
|
|
2878
|
+
description: tool.description,
|
|
2879
|
+
parameters: {
|
|
2880
|
+
type: "object",
|
|
2881
|
+
properties: Object.fromEntries(
|
|
2882
|
+
parameters.map((parameter) => [
|
|
2883
|
+
parameter.name,
|
|
2884
|
+
{
|
|
2885
|
+
type: parameter.type,
|
|
2886
|
+
description: parameter.description
|
|
2887
|
+
}
|
|
2888
|
+
])
|
|
2889
|
+
),
|
|
2890
|
+
required: parameters.filter((parameter) => parameter.required).map((parameter) => parameter.name)
|
|
2891
|
+
}
|
|
2892
|
+
};
|
|
2893
|
+
});
|
|
2894
|
+
}
|
|
2895
|
+
function toAssistantConversationBlocks(response) {
|
|
2896
|
+
const contentBlocks = [];
|
|
2897
|
+
if (response.textContent.trim().length > 0) {
|
|
2898
|
+
contentBlocks.push({
|
|
2899
|
+
type: "text",
|
|
2900
|
+
text: response.textContent
|
|
2901
|
+
});
|
|
2902
|
+
}
|
|
2903
|
+
for (const block of response.toolUseBlocks) {
|
|
2904
|
+
contentBlocks.push({
|
|
2905
|
+
type: "tool_use",
|
|
2906
|
+
id: block.id,
|
|
2907
|
+
name: block.name,
|
|
2908
|
+
input: block.arguments
|
|
2909
|
+
});
|
|
2910
|
+
}
|
|
2911
|
+
return contentBlocks.length === 0 ? [] : [
|
|
2912
|
+
{
|
|
2913
|
+
role: "assistant",
|
|
2914
|
+
content: contentBlocks
|
|
2915
|
+
}
|
|
2916
|
+
];
|
|
2917
|
+
}
|
|
2918
|
+
async function runWithTools(options) {
|
|
2919
|
+
const maxIterations = options.maxIterations ?? 10;
|
|
2920
|
+
const toolsByName = new Map(options.tools.map((tool) => [tool.name, tool]));
|
|
2921
|
+
const providerTools = toProviderToolDefinitions(options.tools);
|
|
2922
|
+
const messages = [{ role: "user", content: options.userMessage }];
|
|
2923
|
+
const toolCalls = [];
|
|
2924
|
+
let finalResponse = "";
|
|
2925
|
+
let loopIterations = 0;
|
|
2926
|
+
while (loopIterations < maxIterations) {
|
|
2927
|
+
loopIterations += 1;
|
|
2928
|
+
const response = await options.provider.sendWithTools(options.systemPrompt, messages, {
|
|
2929
|
+
model: options.model,
|
|
2930
|
+
tools: providerTools
|
|
2931
|
+
});
|
|
2932
|
+
if (response.textContent.trim().length > 0) {
|
|
2933
|
+
finalResponse = response.textContent;
|
|
2934
|
+
}
|
|
2935
|
+
if (response.toolUseBlocks.length === 0) {
|
|
2936
|
+
return {
|
|
2937
|
+
finalResponse,
|
|
2938
|
+
toolCalls,
|
|
2939
|
+
loopIterations
|
|
2940
|
+
};
|
|
2941
|
+
}
|
|
2942
|
+
messages.push(...toAssistantConversationBlocks(response));
|
|
2943
|
+
const toolResultBlocks = [];
|
|
2944
|
+
for (const toolUse of response.toolUseBlocks) {
|
|
2945
|
+
const tool = toolsByName.get(toolUse.name);
|
|
2946
|
+
const resolvedResponse = tool ? resolveToolResponse(tool, toolUse.arguments) : `[mock] No tool named '${toolUse.name}' is registered.`;
|
|
2947
|
+
toolCalls.push({
|
|
2948
|
+
name: toolUse.name,
|
|
2949
|
+
arguments: toolUse.arguments,
|
|
2950
|
+
response: resolvedResponse,
|
|
2951
|
+
turnIndex: loopIterations
|
|
2952
|
+
});
|
|
2953
|
+
toolResultBlocks.push({
|
|
2954
|
+
type: "tool_result",
|
|
2955
|
+
tool_use_id: toolUse.id,
|
|
2956
|
+
content: resolvedResponse
|
|
2957
|
+
});
|
|
2958
|
+
}
|
|
2959
|
+
messages.push({
|
|
2960
|
+
role: "user",
|
|
2961
|
+
content: toolResultBlocks
|
|
2962
|
+
});
|
|
2963
|
+
}
|
|
2964
|
+
const terminationNote = `[skilltest: tool loop terminated after ${maxIterations} iterations]`;
|
|
2965
|
+
finalResponse = finalResponse ? `${finalResponse}
|
|
2966
|
+
|
|
2967
|
+
${terminationNote}` : terminationNote;
|
|
2968
|
+
return {
|
|
2969
|
+
finalResponse,
|
|
2970
|
+
toolCalls,
|
|
2971
|
+
loopIterations
|
|
2972
|
+
};
|
|
2973
|
+
}
|
|
2974
|
+
|
|
2413
2975
|
// src/core/eval-runner.ts
|
|
2976
|
+
var toolParameterSchema = z3.object({
|
|
2977
|
+
name: z3.string().min(1),
|
|
2978
|
+
type: z3.enum(["string", "number", "boolean", "object", "array"]),
|
|
2979
|
+
description: z3.string().min(1),
|
|
2980
|
+
required: z3.boolean().optional()
|
|
2981
|
+
});
|
|
2982
|
+
var mockToolDefinitionSchema = z3.object({
|
|
2983
|
+
name: z3.string().min(1),
|
|
2984
|
+
description: z3.string().min(1),
|
|
2985
|
+
parameters: z3.array(toolParameterSchema).optional(),
|
|
2986
|
+
responses: z3.record(z3.string())
|
|
2987
|
+
});
|
|
2988
|
+
var toolAssertionSchema = z3.object({
|
|
2989
|
+
type: z3.enum(["tool_called", "tool_not_called", "tool_call_order", "tool_argument_match"]),
|
|
2990
|
+
toolName: z3.string().min(1).optional(),
|
|
2991
|
+
toolNames: z3.array(z3.string().min(1)).optional(),
|
|
2992
|
+
expectedArgs: z3.record(z3.unknown()).optional(),
|
|
2993
|
+
description: z3.string().min(1)
|
|
2994
|
+
}).superRefine((value, context) => {
|
|
2995
|
+
if ((value.type === "tool_called" || value.type === "tool_not_called" || value.type === "tool_argument_match") && !value.toolName) {
|
|
2996
|
+
context.addIssue({
|
|
2997
|
+
code: z3.ZodIssueCode.custom,
|
|
2998
|
+
message: `${value.type} requires toolName.`
|
|
2999
|
+
});
|
|
3000
|
+
}
|
|
3001
|
+
if (value.type === "tool_call_order" && (!value.toolNames || value.toolNames.length === 0)) {
|
|
3002
|
+
context.addIssue({
|
|
3003
|
+
code: z3.ZodIssueCode.custom,
|
|
3004
|
+
message: "tool_call_order requires toolNames."
|
|
3005
|
+
});
|
|
3006
|
+
}
|
|
3007
|
+
if (value.type === "tool_argument_match" && !value.expectedArgs) {
|
|
3008
|
+
context.addIssue({
|
|
3009
|
+
code: z3.ZodIssueCode.custom,
|
|
3010
|
+
message: "tool_argument_match requires expectedArgs."
|
|
3011
|
+
});
|
|
3012
|
+
}
|
|
3013
|
+
});
|
|
2414
3014
|
var evalPromptSchema = z3.object({
|
|
2415
3015
|
prompt: z3.string().min(1),
|
|
2416
|
-
assertions: z3.array(z3.string().min(1)).optional()
|
|
3016
|
+
assertions: z3.array(z3.string().min(1)).optional(),
|
|
3017
|
+
tools: z3.array(mockToolDefinitionSchema).optional(),
|
|
3018
|
+
toolAssertions: z3.array(toolAssertionSchema).optional()
|
|
2417
3019
|
});
|
|
2418
3020
|
var evalPromptArraySchema = z3.array(evalPromptSchema);
|
|
3021
|
+
function formatExpectedOrder(toolNames) {
|
|
3022
|
+
return `[${toolNames.join(", ")}]`;
|
|
3023
|
+
}
|
|
3024
|
+
function formatActualOrder(toolCalls, toolNames) {
|
|
3025
|
+
const relevantNames = new Set(toolNames);
|
|
3026
|
+
const actualOrder = toolCalls.filter((toolCall) => relevantNames.has(toolCall.name)).map((toolCall) => toolCall.name);
|
|
3027
|
+
return `[${actualOrder.join(", ")}]`;
|
|
3028
|
+
}
|
|
3029
|
+
function evaluateToolAssertions(toolAssertions, toolCalls) {
|
|
3030
|
+
return toolAssertions.map((toolAssertion) => {
|
|
3031
|
+
if (toolAssertion.type === "tool_called") {
|
|
3032
|
+
const matchingCalls = toolCalls.filter((toolCall) => toolCall.name === toolAssertion.toolName);
|
|
3033
|
+
return {
|
|
3034
|
+
assertion: toolAssertion.description,
|
|
3035
|
+
passed: matchingCalls.length > 0,
|
|
3036
|
+
evidence: matchingCalls.length > 0 ? `Tool '${toolAssertion.toolName}' was called ${matchingCalls.length} time${matchingCalls.length === 1 ? "" : "s"}.` : `Tool '${toolAssertion.toolName}' was not called.`,
|
|
3037
|
+
source: "tool"
|
|
3038
|
+
};
|
|
3039
|
+
}
|
|
3040
|
+
if (toolAssertion.type === "tool_not_called") {
|
|
3041
|
+
const matchingCalls = toolCalls.filter((toolCall) => toolCall.name === toolAssertion.toolName);
|
|
3042
|
+
return {
|
|
3043
|
+
assertion: toolAssertion.description,
|
|
3044
|
+
passed: matchingCalls.length === 0,
|
|
3045
|
+
evidence: matchingCalls.length === 0 ? `Tool '${toolAssertion.toolName}' was not called.` : `Tool '${toolAssertion.toolName}' was called ${matchingCalls.length} time${matchingCalls.length === 1 ? "" : "s"}.`,
|
|
3046
|
+
source: "tool"
|
|
3047
|
+
};
|
|
3048
|
+
}
|
|
3049
|
+
if (toolAssertion.type === "tool_call_order") {
|
|
3050
|
+
const expectedOrder = toolAssertion.toolNames ?? [];
|
|
3051
|
+
let nextExpectedIndex = 0;
|
|
3052
|
+
for (const toolCall of toolCalls) {
|
|
3053
|
+
if (toolCall.name === expectedOrder[nextExpectedIndex]) {
|
|
3054
|
+
nextExpectedIndex += 1;
|
|
3055
|
+
}
|
|
3056
|
+
}
|
|
3057
|
+
return {
|
|
3058
|
+
assertion: toolAssertion.description,
|
|
3059
|
+
passed: nextExpectedIndex === expectedOrder.length,
|
|
3060
|
+
evidence: nextExpectedIndex === expectedOrder.length ? `Observed tool call order ${formatExpectedOrder(expectedOrder)}.` : `Expected call order ${formatExpectedOrder(expectedOrder)} but got ${formatActualOrder(toolCalls, expectedOrder)}.`,
|
|
3061
|
+
source: "tool"
|
|
3062
|
+
};
|
|
3063
|
+
}
|
|
3064
|
+
const matchingCall = toolCalls.find(
|
|
3065
|
+
(toolCall) => toolCall.name === toolAssertion.toolName && matchesArgumentSubset(toolCall.arguments, toolAssertion.expectedArgs ?? {})
|
|
3066
|
+
);
|
|
3067
|
+
return {
|
|
3068
|
+
assertion: toolAssertion.description,
|
|
3069
|
+
passed: Boolean(matchingCall),
|
|
3070
|
+
evidence: matchingCall ? `Tool '${toolAssertion.toolName}' was called with arguments matching ${JSON.stringify(toolAssertion.expectedArgs ?? {})}.` : `No '${toolAssertion.toolName}' call matched ${JSON.stringify(toolAssertion.expectedArgs ?? {})}.`,
|
|
3071
|
+
source: "tool"
|
|
3072
|
+
};
|
|
3073
|
+
});
|
|
3074
|
+
}
|
|
2419
3075
|
function extractJsonArray(raw) {
|
|
2420
3076
|
const trimmed = raw.trim();
|
|
2421
3077
|
if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
|
|
@@ -2442,6 +3098,7 @@ async function generatePrompts(skill, provider, model, count) {
|
|
|
2442
3098
|
skill.content,
|
|
2443
3099
|
"",
|
|
2444
3100
|
`Generate ${count} prompts that stress the main capabilities and likely edge cases.`,
|
|
3101
|
+
// Tool-aware prompts require user-defined mock responses and are not auto-generated.
|
|
2445
3102
|
"Each prompt should include 2-4 assertions."
|
|
2446
3103
|
].join("\n");
|
|
2447
3104
|
const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
|
|
@@ -2465,7 +3122,24 @@ async function runEval(skill, options) {
|
|
|
2465
3122
|
const results = await pMap(
|
|
2466
3123
|
prompts,
|
|
2467
3124
|
async (evalPrompt) => {
|
|
2468
|
-
|
|
3125
|
+
let response;
|
|
3126
|
+
let toolCalls;
|
|
3127
|
+
let loopIterations;
|
|
3128
|
+
if (evalPrompt.tools && evalPrompt.tools.length > 0) {
|
|
3129
|
+
const toolRun = await runWithTools({
|
|
3130
|
+
provider: options.provider,
|
|
3131
|
+
model: options.model,
|
|
3132
|
+
systemPrompt,
|
|
3133
|
+
userMessage: evalPrompt.prompt,
|
|
3134
|
+
tools: evalPrompt.tools,
|
|
3135
|
+
maxIterations: options.maxToolIterations
|
|
3136
|
+
});
|
|
3137
|
+
response = toolRun.finalResponse;
|
|
3138
|
+
toolCalls = toolRun.toolCalls;
|
|
3139
|
+
loopIterations = toolRun.loopIterations;
|
|
3140
|
+
} else {
|
|
3141
|
+
response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
|
|
3142
|
+
}
|
|
2469
3143
|
const gradedAssertions = await gradeResponse({
|
|
2470
3144
|
provider: options.provider,
|
|
2471
3145
|
model: options.graderModel,
|
|
@@ -2475,14 +3149,18 @@ async function runEval(skill, options) {
|
|
|
2475
3149
|
modelResponse: response,
|
|
2476
3150
|
assertions: evalPrompt.assertions
|
|
2477
3151
|
});
|
|
2478
|
-
const
|
|
3152
|
+
const structuralAssertions = evalPrompt.toolAssertions && evalPrompt.toolAssertions.length > 0 ? evaluateToolAssertions(evalPrompt.toolAssertions, toolCalls ?? []) : [];
|
|
3153
|
+
const assertions = [...gradedAssertions, ...structuralAssertions];
|
|
3154
|
+
const passedAssertions2 = assertions.filter((assertion) => assertion.passed).length;
|
|
2479
3155
|
return {
|
|
2480
3156
|
prompt: evalPrompt.prompt,
|
|
2481
|
-
assertions
|
|
3157
|
+
assertions,
|
|
2482
3158
|
responseSummary: response.slice(0, 200),
|
|
2483
3159
|
response,
|
|
2484
3160
|
passedAssertions: passedAssertions2,
|
|
2485
|
-
totalAssertions:
|
|
3161
|
+
totalAssertions: assertions.length,
|
|
3162
|
+
...toolCalls ? { toolCalls } : {},
|
|
3163
|
+
...loopIterations !== void 0 ? { loopIterations } : {}
|
|
2486
3164
|
};
|
|
2487
3165
|
},
|
|
2488
3166
|
options.concurrency ?? 5
|
|
@@ -2568,7 +3246,7 @@ function parseJsonArrayFromModelOutput(raw) {
|
|
|
2568
3246
|
}
|
|
2569
3247
|
throw new Error("Model did not return a JSON array.");
|
|
2570
3248
|
}
|
|
2571
|
-
async function generateQueriesWithModel(skill, provider, model, numQueries) {
|
|
3249
|
+
async function generateQueriesWithModel(skill, provider, model, numQueries, competitors) {
|
|
2572
3250
|
validateNumQueries(numQueries);
|
|
2573
3251
|
const shouldTriggerCount = Math.floor(numQueries / 2);
|
|
2574
3252
|
const shouldNotTriggerCount = numQueries - shouldTriggerCount;
|
|
@@ -2581,6 +3259,15 @@ async function generateQueriesWithModel(skill, provider, model, numQueries) {
|
|
|
2581
3259
|
const userPrompt = [
|
|
2582
3260
|
`Skill name: ${skill.frontmatter.name}`,
|
|
2583
3261
|
`Skill description: ${skill.frontmatter.description}`,
|
|
3262
|
+
...competitors && competitors.length > 0 ? [
|
|
3263
|
+
"",
|
|
3264
|
+
"Competitor skills in the same domain:",
|
|
3265
|
+
...competitors.map((competitor) => `- ${competitor.name}: ${competitor.description}`),
|
|
3266
|
+
"",
|
|
3267
|
+
"Generate queries that test whether the target skill triggers correctly even when these similar skills exist.",
|
|
3268
|
+
"Positive queries should clearly belong to the target skill, not the competitors.",
|
|
3269
|
+
"Negative queries should belong to a competitor or to no skill at all."
|
|
3270
|
+
] : [],
|
|
2584
3271
|
`Generate ${numQueries} prompts total.`,
|
|
2585
3272
|
`Exactly ${shouldTriggerCount} should have should_trigger=true.`,
|
|
2586
3273
|
`Exactly ${shouldNotTriggerCount} should have should_trigger=false.`,
|
|
@@ -2614,16 +3301,33 @@ function parseDecision(rawResponse, skillNames) {
|
|
|
2614
3301
|
}
|
|
2615
3302
|
return "unrecognized";
|
|
2616
3303
|
}
|
|
2617
|
-
function prepareTriggerQueries(skill, queries, seed) {
|
|
3304
|
+
function prepareTriggerQueries(skill, queries, seed, competitors) {
|
|
2618
3305
|
const rng = createRng(seed);
|
|
3306
|
+
const competitorCandidates = (competitors ?? []).map((competitor) => ({
|
|
3307
|
+
name: competitor.name,
|
|
3308
|
+
description: competitor.description
|
|
3309
|
+
}));
|
|
2619
3310
|
return queries.map((testQuery) => {
|
|
2620
|
-
const
|
|
3311
|
+
const usingCompetitors = competitorCandidates.length > 0;
|
|
3312
|
+
const fakeCount = usingCompetitors ? testQuery.should_trigger ? 2 + Math.floor(rng() * 3) : 3 + Math.floor(rng() * 3) : 5 + Math.floor(rng() * 5);
|
|
2621
3313
|
const fakeSkills = sample(FAKE_SKILLS, fakeCount, rng);
|
|
2622
|
-
const allSkills = shuffle(
|
|
3314
|
+
const allSkills = usingCompetitors ? shuffle(
|
|
2623
3315
|
[
|
|
3316
|
+
...competitorCandidates,
|
|
2624
3317
|
...fakeSkills,
|
|
2625
|
-
|
|
2626
|
-
|
|
3318
|
+
...testQuery.should_trigger ? [
|
|
3319
|
+
{
|
|
3320
|
+
name: skill.frontmatter.name,
|
|
3321
|
+
description: skill.frontmatter.description
|
|
3322
|
+
}
|
|
3323
|
+
] : []
|
|
3324
|
+
],
|
|
3325
|
+
rng
|
|
3326
|
+
) : shuffle(
|
|
3327
|
+
[
|
|
3328
|
+
...fakeSkills,
|
|
3329
|
+
{
|
|
3330
|
+
name: skill.frontmatter.name,
|
|
2627
3331
|
description: skill.frontmatter.description
|
|
2628
3332
|
}
|
|
2629
3333
|
],
|
|
@@ -2673,25 +3377,82 @@ function calculateMetrics(skillName, cases) {
|
|
|
2673
3377
|
f1
|
|
2674
3378
|
};
|
|
2675
3379
|
}
|
|
2676
|
-
function
|
|
3380
|
+
function assertCompetitorNamesDistinct(skillName, competitors) {
|
|
3381
|
+
for (const competitor of competitors) {
|
|
3382
|
+
if (competitor.name === skillName) {
|
|
3383
|
+
throw new Error(`Competitor skill '${competitor.name}' has the same name as the skill under test.`);
|
|
3384
|
+
}
|
|
3385
|
+
}
|
|
3386
|
+
}
|
|
3387
|
+
function buildTriggerCaseResult(options) {
|
|
3388
|
+
const expected = options.testQuery.should_trigger ? options.skillName : "none";
|
|
3389
|
+
const matched = options.testQuery.should_trigger ? options.decision === options.skillName : options.decision !== options.skillName;
|
|
3390
|
+
const selectedCompetitor = options.competitorNames?.includes(options.decision) ? options.decision : void 0;
|
|
3391
|
+
return {
|
|
3392
|
+
query: options.testQuery.query,
|
|
3393
|
+
shouldTrigger: options.testQuery.should_trigger,
|
|
3394
|
+
expected,
|
|
3395
|
+
actual: options.decision,
|
|
3396
|
+
matched,
|
|
3397
|
+
selectedCompetitor,
|
|
3398
|
+
rawModelResponse: options.rawModelResponse
|
|
3399
|
+
};
|
|
3400
|
+
}
|
|
3401
|
+
function buildSuggestions(skillName, metrics, cases, competitors) {
|
|
2677
3402
|
const suggestions = [];
|
|
2678
3403
|
if (metrics.falseNegatives > 0) {
|
|
2679
3404
|
suggestions.push(
|
|
2680
3405
|
"False negatives found: clarify capability keywords and add explicit 'use when ...' phrasing in description."
|
|
2681
3406
|
);
|
|
3407
|
+
if (competitors && competitors.length > 0) {
|
|
3408
|
+
const competitorCounts = /* @__PURE__ */ new Map();
|
|
3409
|
+
for (const testCase of cases) {
|
|
3410
|
+
if (!testCase.shouldTrigger || testCase.actual === skillName || !testCase.selectedCompetitor) {
|
|
3411
|
+
continue;
|
|
3412
|
+
}
|
|
3413
|
+
competitorCounts.set(testCase.selectedCompetitor, (competitorCounts.get(testCase.selectedCompetitor) ?? 0) + 1);
|
|
3414
|
+
}
|
|
3415
|
+
for (const [competitorName, count] of competitorCounts.entries()) {
|
|
3416
|
+
suggestions.push(
|
|
3417
|
+
`Skill '${competitorName}' was selected instead of '${skillName}' for ${count} quer${count === 1 ? "y" : "ies"}. Differentiate your description from '${competitorName}'.`
|
|
3418
|
+
);
|
|
3419
|
+
}
|
|
3420
|
+
}
|
|
2682
3421
|
}
|
|
2683
3422
|
if (metrics.falsePositives > 0) {
|
|
2684
3423
|
suggestions.push("False positives found: narrow scope boundaries and add explicit non-goals in description.");
|
|
3424
|
+
if (competitors && competitors.length > 0) {
|
|
3425
|
+
suggestions.push(
|
|
3426
|
+
`With competitor skills present, ${metrics.falsePositives} negative quer${metrics.falsePositives === 1 ? "y still" : "ies still"} triggered '${skillName}'. Narrow your description's scope boundaries.`
|
|
3427
|
+
);
|
|
3428
|
+
}
|
|
2685
3429
|
}
|
|
2686
3430
|
if (suggestions.length === 0) {
|
|
2687
3431
|
suggestions.push("Trigger behavior looks clean on this sample. Keep monitoring with domain-specific custom queries.");
|
|
2688
3432
|
}
|
|
2689
3433
|
return suggestions;
|
|
2690
3434
|
}
|
|
3435
|
+
async function loadCompetitorSkills(comparePaths) {
|
|
3436
|
+
const competitors = [];
|
|
3437
|
+
for (const comparePath of comparePaths) {
|
|
3438
|
+
const parsed = await parseSkillStrict(comparePath);
|
|
3439
|
+
competitors.push({
|
|
3440
|
+
name: parsed.frontmatter.name,
|
|
3441
|
+
description: parsed.frontmatter.description,
|
|
3442
|
+
sourcePath: comparePath
|
|
3443
|
+
});
|
|
3444
|
+
}
|
|
3445
|
+
return competitors;
|
|
3446
|
+
}
|
|
2691
3447
|
async function runTriggerTest(skill, options) {
|
|
2692
|
-
const
|
|
3448
|
+
const competitors = options.compare && options.compare.length > 0 ? await loadCompetitorSkills(options.compare) : void 0;
|
|
3449
|
+
if (competitors && competitors.length > 0) {
|
|
3450
|
+
assertCompetitorNamesDistinct(skill.frontmatter.name, competitors);
|
|
3451
|
+
}
|
|
3452
|
+
const queries = options.queries && options.queries.length > 0 ? triggerQueryArraySchema.parse(options.queries) : await generateQueriesWithModel(skill, options.provider, options.model, options.numQueries, competitors);
|
|
2693
3453
|
const skillName = skill.frontmatter.name;
|
|
2694
|
-
const preparedQueries = prepareTriggerQueries(skill, queries, options.seed);
|
|
3454
|
+
const preparedQueries = prepareTriggerQueries(skill, queries, options.seed, competitors);
|
|
3455
|
+
const competitorNames = competitors?.map((competitor) => competitor.name) ?? [];
|
|
2695
3456
|
const systemPrompt = [
|
|
2696
3457
|
"You are selecting one skill to activate for a user query.",
|
|
2697
3458
|
"Choose the single best matching skill name from the provided list, or 'none' if no skill is a good fit.",
|
|
@@ -2704,18 +3465,15 @@ async function runTriggerTest(skill, options) {
|
|
|
2704
3465
|
const rawResponse = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
|
|
2705
3466
|
const decision = parseDecision(
|
|
2706
3467
|
rawResponse,
|
|
2707
|
-
allSkills.map((entry) => entry.name)
|
|
3468
|
+
Array.from(/* @__PURE__ */ new Set([skillName, ...allSkills.map((entry) => entry.name)]))
|
|
2708
3469
|
);
|
|
2709
|
-
|
|
2710
|
-
|
|
2711
|
-
|
|
2712
|
-
|
|
2713
|
-
|
|
2714
|
-
expected,
|
|
2715
|
-
actual: decision,
|
|
2716
|
-
matched,
|
|
3470
|
+
return buildTriggerCaseResult({
|
|
3471
|
+
testQuery,
|
|
3472
|
+
skillName,
|
|
3473
|
+
decision,
|
|
3474
|
+
competitorNames,
|
|
2717
3475
|
rawModelResponse: options.verbose ? rawResponse : void 0
|
|
2718
|
-
};
|
|
3476
|
+
});
|
|
2719
3477
|
},
|
|
2720
3478
|
options.concurrency ?? 5
|
|
2721
3479
|
);
|
|
@@ -2725,10 +3483,11 @@ async function runTriggerTest(skill, options) {
|
|
|
2725
3483
|
model: options.model,
|
|
2726
3484
|
provider: options.provider.name,
|
|
2727
3485
|
seed: options.seed,
|
|
3486
|
+
competitors,
|
|
2728
3487
|
queries,
|
|
2729
3488
|
cases: results,
|
|
2730
3489
|
metrics,
|
|
2731
|
-
suggestions: buildSuggestions(metrics)
|
|
3490
|
+
suggestions: buildSuggestions(skillName, metrics, results, competitors)
|
|
2732
3491
|
};
|
|
2733
3492
|
}
|
|
2734
3493
|
|
|
@@ -2739,10 +3498,7 @@ function renderJson(value) {
|
|
|
2739
3498
|
|
|
2740
3499
|
// src/commands/common.ts
|
|
2741
3500
|
var executionContextByCommand = /* @__PURE__ */ new WeakMap();
|
|
2742
|
-
var singleEvalPromptSchema =
|
|
2743
|
-
prompt: z5.string().min(1),
|
|
2744
|
-
assertions: z5.array(z5.string().min(1)).optional()
|
|
2745
|
-
});
|
|
3501
|
+
var singleEvalPromptSchema = evalPromptSchema;
|
|
2746
3502
|
var promptStringArraySchema = z5.array(z5.string().min(1));
|
|
2747
3503
|
var assertionsObjectSchema = z5.object({
|
|
2748
3504
|
assertions: z5.array(z5.string().min(1))
|
|
@@ -2777,6 +3533,22 @@ function normalizeEvalPrompts(value, sourceLabel) {
|
|
|
2777
3533
|
function parseAssertionsFromText(raw) {
|
|
2778
3534
|
return raw.split(/\r?\n/).map((line) => line.trim().replace(/^[-*]\s+/, "").replace(/^\d+\.\s+/, "")).filter((line) => line.length > 0);
|
|
2779
3535
|
}
|
|
3536
|
+
function cloneEvalPrompt(prompt) {
|
|
3537
|
+
return {
|
|
3538
|
+
prompt: prompt.prompt,
|
|
3539
|
+
assertions: prompt.assertions ? [...prompt.assertions] : void 0,
|
|
3540
|
+
tools: prompt.tools ? prompt.tools.map((tool) => ({
|
|
3541
|
+
...tool,
|
|
3542
|
+
parameters: tool.parameters ? tool.parameters.map((parameter) => ({ ...parameter })) : void 0,
|
|
3543
|
+
responses: { ...tool.responses }
|
|
3544
|
+
})) : void 0,
|
|
3545
|
+
toolAssertions: prompt.toolAssertions ? prompt.toolAssertions.map((toolAssertion) => ({
|
|
3546
|
+
...toolAssertion,
|
|
3547
|
+
toolNames: toolAssertion.toolNames ? [...toolAssertion.toolNames] : void 0,
|
|
3548
|
+
expectedArgs: toolAssertion.expectedArgs ? { ...toolAssertion.expectedArgs } : void 0
|
|
3549
|
+
})) : void 0
|
|
3550
|
+
};
|
|
3551
|
+
}
|
|
2780
3552
|
function normalizeAssertions(value, sourceLabel) {
|
|
2781
3553
|
const assertionArray = z5.array(z5.string().min(1)).safeParse(value);
|
|
2782
3554
|
if (assertionArray.success) {
|
|
@@ -2843,23 +3615,20 @@ async function loadConfiguredEvalPrompts(command) {
|
|
|
2843
3615
|
if (!promptFile && assertionsFile) {
|
|
2844
3616
|
throw new Error("Config field eval.assertionsFile requires eval.promptFile.");
|
|
2845
3617
|
}
|
|
2846
|
-
const promptRaw = await
|
|
3618
|
+
const promptRaw = await fs6.readFile(promptFile, "utf8");
|
|
2847
3619
|
let prompts = normalizeEvalPrompts(parseJsonIfPossible(promptRaw), promptFile);
|
|
2848
3620
|
if (assertionsFile) {
|
|
2849
|
-
const assertionsRaw = await
|
|
3621
|
+
const assertionsRaw = await fs6.readFile(assertionsFile, "utf8");
|
|
2850
3622
|
const assertions = normalizeAssertions(parseJsonIfPossible(assertionsRaw), assertionsFile);
|
|
2851
3623
|
prompts = prompts.map((prompt) => ({
|
|
2852
|
-
prompt
|
|
3624
|
+
...cloneEvalPrompt(prompt),
|
|
2853
3625
|
assertions: [...assertions]
|
|
2854
3626
|
}));
|
|
2855
3627
|
}
|
|
2856
3628
|
const numRunsWasExplicit = context.configFile?.eval?.numRuns !== void 0;
|
|
2857
3629
|
if (numRunsWasExplicit && prompts.length === 1 && context.config.eval.numRuns > 1) {
|
|
2858
3630
|
const promptTemplate = prompts[0];
|
|
2859
|
-
prompts = Array.from({ length: context.config.eval.numRuns }, () => (
|
|
2860
|
-
prompt: promptTemplate.prompt,
|
|
2861
|
-
assertions: promptTemplate.assertions ? [...promptTemplate.assertions] : void 0
|
|
2862
|
-
}));
|
|
3631
|
+
prompts = Array.from({ length: context.config.eval.numRuns }, () => cloneEvalPrompt(promptTemplate));
|
|
2863
3632
|
}
|
|
2864
3633
|
return prompts;
|
|
2865
3634
|
}
|
|
@@ -2885,18 +3654,22 @@ function writeError(error, asJson) {
|
|
|
2885
3654
|
|
|
2886
3655
|
// src/commands/lint.ts
|
|
2887
3656
|
var lintCliSchema = z6.object({
|
|
2888
|
-
html: z6.string().optional()
|
|
3657
|
+
html: z6.string().optional(),
|
|
3658
|
+
plugin: z6.array(z6.string().min(1)).optional()
|
|
2889
3659
|
});
|
|
3660
|
+
function collectPluginPaths(value, previous = []) {
|
|
3661
|
+
return [...previous, value];
|
|
3662
|
+
}
|
|
2890
3663
|
async function handleLintCommand(targetPath, options) {
|
|
2891
3664
|
try {
|
|
2892
|
-
const report = await runLinter(targetPath, { suppress: options.suppress });
|
|
3665
|
+
const report = await runLinter(targetPath, { suppress: options.suppress, plugins: options.plugins });
|
|
2893
3666
|
if (options.json) {
|
|
2894
3667
|
writeResult(report, true);
|
|
2895
3668
|
} else {
|
|
2896
3669
|
writeResult(renderLintReport(report, options.color), false);
|
|
2897
3670
|
}
|
|
2898
3671
|
if (options.html) {
|
|
2899
|
-
await
|
|
3672
|
+
await fs7.writeFile(options.html, renderLintHtml(report), "utf8");
|
|
2900
3673
|
}
|
|
2901
3674
|
if (lintFails(report, options.failOn)) {
|
|
2902
3675
|
process.exitCode = 1;
|
|
@@ -2907,7 +3680,7 @@ async function handleLintCommand(targetPath, options) {
|
|
|
2907
3680
|
}
|
|
2908
3681
|
}
|
|
2909
3682
|
function registerLintCommand(program) {
|
|
2910
|
-
program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--html <path>", "Write an HTML report to the given file path").action(async (targetPath, _commandOptions, command) => {
|
|
3683
|
+
program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--html <path>", "Write an HTML report to the given file path").option("--plugin <path>", "Load a custom lint plugin file", collectPluginPaths, []).action(async (targetPath, _commandOptions, command) => {
|
|
2911
3684
|
const globalOptions = getGlobalCliOptions(command);
|
|
2912
3685
|
const config = getResolvedConfig(command);
|
|
2913
3686
|
const parsedCli = lintCliSchema.safeParse(command.opts());
|
|
@@ -2920,36 +3693,40 @@ function registerLintCommand(program) {
|
|
|
2920
3693
|
...globalOptions,
|
|
2921
3694
|
failOn: config.lint.failOn,
|
|
2922
3695
|
suppress: config.lint.suppress,
|
|
3696
|
+
plugins: config.lint.plugins,
|
|
2923
3697
|
html: parsedCli.data.html
|
|
2924
3698
|
});
|
|
2925
3699
|
});
|
|
2926
3700
|
}
|
|
2927
3701
|
|
|
2928
3702
|
// src/commands/trigger.ts
|
|
2929
|
-
import
|
|
3703
|
+
import fs9 from "node:fs/promises";
|
|
2930
3704
|
import ora from "ora";
|
|
2931
3705
|
import { z as z8 } from "zod";
|
|
2932
3706
|
|
|
2933
3707
|
// src/utils/config.ts
|
|
2934
|
-
import
|
|
2935
|
-
import
|
|
3708
|
+
import fs8 from "node:fs/promises";
|
|
3709
|
+
import path6 from "node:path";
|
|
2936
3710
|
import { z as z7 } from "zod";
|
|
2937
3711
|
var providerNameSchema = z7.enum(["anthropic", "openai"]);
|
|
2938
3712
|
var lintFailOnSchema = z7.enum(["error", "warn"]);
|
|
2939
3713
|
var lintConfigSchema = z7.object({
|
|
2940
3714
|
failOn: lintFailOnSchema.optional(),
|
|
2941
|
-
suppress: z7.array(z7.string().min(1)).optional()
|
|
3715
|
+
suppress: z7.array(z7.string().min(1)).optional(),
|
|
3716
|
+
plugins: z7.array(z7.string().min(1)).optional()
|
|
2942
3717
|
}).strict();
|
|
2943
3718
|
var triggerConfigSchema = z7.object({
|
|
2944
3719
|
numQueries: z7.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
|
|
2945
3720
|
threshold: z7.number().min(0).max(1).optional(),
|
|
2946
|
-
seed: z7.number().int().optional()
|
|
3721
|
+
seed: z7.number().int().optional(),
|
|
3722
|
+
compare: z7.array(z7.string().min(1)).optional()
|
|
2947
3723
|
}).strict().partial();
|
|
2948
3724
|
var evalConfigSchema = z7.object({
|
|
2949
3725
|
numRuns: z7.number().int().min(1).optional(),
|
|
2950
3726
|
threshold: z7.number().min(0).max(1).optional(),
|
|
2951
3727
|
promptFile: z7.string().min(1).optional(),
|
|
2952
|
-
assertionsFile: z7.string().min(1).optional()
|
|
3728
|
+
assertionsFile: z7.string().min(1).optional(),
|
|
3729
|
+
maxToolIterations: z7.number().int().min(1).max(50).optional()
|
|
2953
3730
|
}).strict().partial();
|
|
2954
3731
|
var skilltestConfigSchema = z7.object({
|
|
2955
3732
|
provider: providerNameSchema.optional(),
|
|
@@ -2967,18 +3744,21 @@ var resolvedSkilltestConfigSchema = z7.object({
|
|
|
2967
3744
|
concurrency: z7.number().int().min(1),
|
|
2968
3745
|
lint: z7.object({
|
|
2969
3746
|
failOn: lintFailOnSchema,
|
|
2970
|
-
suppress: z7.array(z7.string().min(1))
|
|
3747
|
+
suppress: z7.array(z7.string().min(1)),
|
|
3748
|
+
plugins: z7.array(z7.string().min(1))
|
|
2971
3749
|
}),
|
|
2972
3750
|
trigger: z7.object({
|
|
2973
3751
|
numQueries: z7.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
|
|
2974
3752
|
threshold: z7.number().min(0).max(1),
|
|
2975
|
-
seed: z7.number().int().optional()
|
|
3753
|
+
seed: z7.number().int().optional(),
|
|
3754
|
+
compare: z7.array(z7.string().min(1))
|
|
2976
3755
|
}),
|
|
2977
3756
|
eval: z7.object({
|
|
2978
3757
|
numRuns: z7.number().int().min(1),
|
|
2979
3758
|
threshold: z7.number().min(0).max(1),
|
|
2980
3759
|
promptFile: z7.string().min(1).optional(),
|
|
2981
|
-
assertionsFile: z7.string().min(1).optional()
|
|
3760
|
+
assertionsFile: z7.string().min(1).optional(),
|
|
3761
|
+
maxToolIterations: z7.number().int().min(1).max(50)
|
|
2982
3762
|
})
|
|
2983
3763
|
});
|
|
2984
3764
|
var DEFAULT_SKILLTEST_CONFIG = {
|
|
@@ -2988,15 +3768,18 @@ var DEFAULT_SKILLTEST_CONFIG = {
|
|
|
2988
3768
|
concurrency: 5,
|
|
2989
3769
|
lint: {
|
|
2990
3770
|
failOn: "error",
|
|
2991
|
-
suppress: []
|
|
3771
|
+
suppress: [],
|
|
3772
|
+
plugins: []
|
|
2992
3773
|
},
|
|
2993
3774
|
trigger: {
|
|
2994
3775
|
numQueries: 20,
|
|
2995
|
-
threshold: 0.8
|
|
3776
|
+
threshold: 0.8,
|
|
3777
|
+
compare: []
|
|
2996
3778
|
},
|
|
2997
3779
|
eval: {
|
|
2998
3780
|
numRuns: 5,
|
|
2999
|
-
threshold: 0.9
|
|
3781
|
+
threshold: 0.9,
|
|
3782
|
+
maxToolIterations: 10
|
|
3000
3783
|
}
|
|
3001
3784
|
};
|
|
3002
3785
|
function formatIssuePath(issuePath) {
|
|
@@ -3014,7 +3797,7 @@ function buildConfigValidationError(error, sourceLabel) {
|
|
|
3014
3797
|
async function readJsonObject(filePath, label) {
|
|
3015
3798
|
let raw;
|
|
3016
3799
|
try {
|
|
3017
|
-
raw = await
|
|
3800
|
+
raw = await fs8.readFile(filePath, "utf8");
|
|
3018
3801
|
} catch (error) {
|
|
3019
3802
|
const message = error instanceof Error ? error.message : String(error);
|
|
3020
3803
|
throw new Error(`Failed to read ${label}: ${message}`);
|
|
@@ -3038,13 +3821,13 @@ async function loadConfigFromJsonFile(filePath) {
|
|
|
3038
3821
|
return {
|
|
3039
3822
|
configFile: parsed.data,
|
|
3040
3823
|
sourcePath: filePath,
|
|
3041
|
-
sourceDirectory:
|
|
3824
|
+
sourceDirectory: path6.dirname(filePath)
|
|
3042
3825
|
};
|
|
3043
3826
|
}
|
|
3044
3827
|
async function loadConfigFromNearestPackageJson(startDirectory) {
|
|
3045
|
-
let currentDirectory =
|
|
3828
|
+
let currentDirectory = path6.resolve(startDirectory);
|
|
3046
3829
|
while (true) {
|
|
3047
|
-
const packageJsonPath =
|
|
3830
|
+
const packageJsonPath = path6.join(currentDirectory, "package.json");
|
|
3048
3831
|
if (await pathExists(packageJsonPath)) {
|
|
3049
3832
|
const raw = await readJsonObject(packageJsonPath, packageJsonPath);
|
|
3050
3833
|
const packageJsonSchema = z7.object({
|
|
@@ -3063,7 +3846,7 @@ async function loadConfigFromNearestPackageJson(startDirectory) {
|
|
|
3063
3846
|
sourceDirectory: currentDirectory
|
|
3064
3847
|
};
|
|
3065
3848
|
}
|
|
3066
|
-
const parentDirectory =
|
|
3849
|
+
const parentDirectory = path6.dirname(currentDirectory);
|
|
3067
3850
|
if (parentDirectory === currentDirectory) {
|
|
3068
3851
|
return null;
|
|
3069
3852
|
}
|
|
@@ -3076,7 +3859,7 @@ async function resolveSkillDirectoryConfig(targetPath) {
|
|
|
3076
3859
|
}
|
|
3077
3860
|
try {
|
|
3078
3861
|
const { skillRoot } = await resolveSkillPath(targetPath);
|
|
3079
|
-
return loadConfigFromJsonFile(
|
|
3862
|
+
return loadConfigFromJsonFile(path6.join(skillRoot, ".skilltestrc"));
|
|
3080
3863
|
} catch {
|
|
3081
3864
|
return null;
|
|
3082
3865
|
}
|
|
@@ -3085,7 +3868,13 @@ function resolveConfigRelativePath(baseDirectory, value) {
|
|
|
3085
3868
|
if (!value) {
|
|
3086
3869
|
return void 0;
|
|
3087
3870
|
}
|
|
3088
|
-
return
|
|
3871
|
+
return path6.resolve(baseDirectory, value);
|
|
3872
|
+
}
|
|
3873
|
+
function resolveConfigRelativePaths(baseDirectory, values) {
|
|
3874
|
+
if (!values || values.length === 0) {
|
|
3875
|
+
return [];
|
|
3876
|
+
}
|
|
3877
|
+
return values.map((value) => path6.resolve(baseDirectory, value));
|
|
3089
3878
|
}
|
|
3090
3879
|
function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = process.cwd()) {
|
|
3091
3880
|
const merged = {
|
|
@@ -3095,12 +3884,20 @@ function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = proce
|
|
|
3095
3884
|
concurrency: cliFlags.concurrency ?? configFile.concurrency ?? DEFAULT_SKILLTEST_CONFIG.concurrency,
|
|
3096
3885
|
lint: {
|
|
3097
3886
|
failOn: cliFlags.lint?.failOn ?? configFile.lint?.failOn ?? DEFAULT_SKILLTEST_CONFIG.lint.failOn,
|
|
3098
|
-
suppress: cliFlags.lint?.suppress ?? configFile.lint?.suppress ?? DEFAULT_SKILLTEST_CONFIG.lint.suppress
|
|
3887
|
+
suppress: cliFlags.lint?.suppress ?? configFile.lint?.suppress ?? DEFAULT_SKILLTEST_CONFIG.lint.suppress,
|
|
3888
|
+
plugins: resolveConfigRelativePaths(
|
|
3889
|
+
baseDirectory,
|
|
3890
|
+
cliFlags.lint?.plugins ?? configFile.lint?.plugins ?? DEFAULT_SKILLTEST_CONFIG.lint.plugins
|
|
3891
|
+
)
|
|
3099
3892
|
},
|
|
3100
3893
|
trigger: {
|
|
3101
3894
|
numQueries: cliFlags.trigger?.numQueries ?? configFile.trigger?.numQueries ?? DEFAULT_SKILLTEST_CONFIG.trigger.numQueries,
|
|
3102
3895
|
threshold: cliFlags.trigger?.threshold ?? configFile.trigger?.threshold ?? DEFAULT_SKILLTEST_CONFIG.trigger.threshold,
|
|
3103
|
-
seed: cliFlags.trigger?.seed ?? configFile.trigger?.seed
|
|
3896
|
+
seed: cliFlags.trigger?.seed ?? configFile.trigger?.seed,
|
|
3897
|
+
compare: resolveConfigRelativePaths(
|
|
3898
|
+
baseDirectory,
|
|
3899
|
+
cliFlags.trigger?.compare ?? configFile.trigger?.compare ?? DEFAULT_SKILLTEST_CONFIG.trigger.compare
|
|
3900
|
+
)
|
|
3104
3901
|
},
|
|
3105
3902
|
eval: {
|
|
3106
3903
|
numRuns: cliFlags.eval?.numRuns ?? configFile.eval?.numRuns ?? DEFAULT_SKILLTEST_CONFIG.eval.numRuns,
|
|
@@ -3112,7 +3909,8 @@ function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = proce
|
|
|
3112
3909
|
assertionsFile: resolveConfigRelativePath(
|
|
3113
3910
|
baseDirectory,
|
|
3114
3911
|
cliFlags.eval?.assertionsFile ?? configFile.eval?.assertionsFile ?? DEFAULT_SKILLTEST_CONFIG.eval.assertionsFile
|
|
3115
|
-
)
|
|
3912
|
+
),
|
|
3913
|
+
maxToolIterations: cliFlags.eval?.maxToolIterations ?? configFile.eval?.maxToolIterations ?? DEFAULT_SKILLTEST_CONFIG.eval.maxToolIterations
|
|
3116
3914
|
}
|
|
3117
3915
|
};
|
|
3118
3916
|
return resolvedSkilltestConfigSchema.parse(merged);
|
|
@@ -3136,22 +3934,34 @@ function extractCliConfigOverrides(command) {
|
|
|
3136
3934
|
if (command.getOptionValueSource("model") === "cli") {
|
|
3137
3935
|
overrides.model = getTypedOptionValue(command, "model");
|
|
3138
3936
|
}
|
|
3139
|
-
if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check") && command.getOptionValueSource("concurrency") === "cli") {
|
|
3937
|
+
if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("concurrency") === "cli") {
|
|
3140
3938
|
overrides.concurrency = getTypedOptionValue(command, "concurrency");
|
|
3141
3939
|
}
|
|
3142
|
-
if ((command.name() === "trigger" || command.name() === "check") && command.getOptionValueSource("numQueries") === "cli") {
|
|
3940
|
+
if ((command.name() === "trigger" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("numQueries") === "cli") {
|
|
3143
3941
|
overrides.trigger = {
|
|
3144
3942
|
...overrides.trigger,
|
|
3145
3943
|
numQueries: getTypedOptionValue(command, "numQueries")
|
|
3146
3944
|
};
|
|
3147
3945
|
}
|
|
3148
|
-
if (command.name() === "check" && command.getOptionValueSource("
|
|
3946
|
+
if ((command.name() === "trigger" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("compare") === "cli") {
|
|
3947
|
+
overrides.trigger = {
|
|
3948
|
+
...overrides.trigger,
|
|
3949
|
+
compare: getTypedOptionValue(command, "compare")
|
|
3950
|
+
};
|
|
3951
|
+
}
|
|
3952
|
+
if ((command.name() === "lint" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("plugin") === "cli") {
|
|
3953
|
+
overrides.lint = {
|
|
3954
|
+
...overrides.lint,
|
|
3955
|
+
plugins: getTypedOptionValue(command, "plugin")
|
|
3956
|
+
};
|
|
3957
|
+
}
|
|
3958
|
+
if ((command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("minF1") === "cli") {
|
|
3149
3959
|
overrides.trigger = {
|
|
3150
3960
|
...overrides.trigger,
|
|
3151
3961
|
threshold: getTypedOptionValue(command, "minF1")
|
|
3152
3962
|
};
|
|
3153
3963
|
}
|
|
3154
|
-
if (command.name() === "check" && command.getOptionValueSource("minAssertPassRate") === "cli") {
|
|
3964
|
+
if ((command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("minAssertPassRate") === "cli") {
|
|
3155
3965
|
overrides.eval = {
|
|
3156
3966
|
...overrides.eval,
|
|
3157
3967
|
threshold: getTypedOptionValue(command, "minAssertPassRate")
|
|
@@ -3172,7 +3982,7 @@ async function resolveConfigContext(targetPath, cliFlags) {
|
|
|
3172
3982
|
config: mergeConfigLayers(skillDirectoryConfig.configFile, cliFlags, skillDirectoryConfig.sourceDirectory)
|
|
3173
3983
|
};
|
|
3174
3984
|
}
|
|
3175
|
-
const cwdConfigPath =
|
|
3985
|
+
const cwdConfigPath = path6.join(cwd, ".skilltestrc");
|
|
3176
3986
|
const cwdConfig = await loadConfigFromJsonFile(cwdConfigPath);
|
|
3177
3987
|
if (cwdConfig) {
|
|
3178
3988
|
return {
|
|
@@ -3216,6 +4026,12 @@ function resolveApiKey(provider, override) {
|
|
|
3216
4026
|
|
|
3217
4027
|
// src/providers/anthropic.ts
|
|
3218
4028
|
import Anthropic from "@anthropic-ai/sdk";
|
|
4029
|
+
function isAnthropicTextBlock(block) {
|
|
4030
|
+
return block.type === "text";
|
|
4031
|
+
}
|
|
4032
|
+
function isAnthropicToolUseBlock(block) {
|
|
4033
|
+
return block.type === "tool_use";
|
|
4034
|
+
}
|
|
3219
4035
|
function wait(ms) {
|
|
3220
4036
|
return new Promise((resolve) => {
|
|
3221
4037
|
setTimeout(resolve, ms);
|
|
@@ -3241,27 +4057,11 @@ var AnthropicProvider = class {
|
|
|
3241
4057
|
constructor(apiKey) {
|
|
3242
4058
|
this.client = new Anthropic({ apiKey });
|
|
3243
4059
|
}
|
|
3244
|
-
async
|
|
4060
|
+
async createMessage(request) {
|
|
3245
4061
|
let lastError;
|
|
3246
4062
|
for (let attempt = 0; attempt < 3; attempt += 1) {
|
|
3247
4063
|
try {
|
|
3248
|
-
|
|
3249
|
-
model: options.model,
|
|
3250
|
-
max_tokens: 2048,
|
|
3251
|
-
system: systemPrompt,
|
|
3252
|
-
messages: [
|
|
3253
|
-
{
|
|
3254
|
-
role: "user",
|
|
3255
|
-
content: userMessage
|
|
3256
|
-
}
|
|
3257
|
-
]
|
|
3258
|
-
});
|
|
3259
|
-
const textBlocks = response.content.filter((block) => block.type === "text");
|
|
3260
|
-
const text = textBlocks.map((block) => block.text).join("\n").trim();
|
|
3261
|
-
if (text.length === 0) {
|
|
3262
|
-
throw new Error("Model returned an empty response.");
|
|
3263
|
-
}
|
|
3264
|
-
return text;
|
|
4064
|
+
return await this.client.messages.create(request);
|
|
3265
4065
|
} catch (error) {
|
|
3266
4066
|
lastError = error;
|
|
3267
4067
|
if (!isRateLimitError(error) || attempt === 2) {
|
|
@@ -3276,6 +4076,55 @@ var AnthropicProvider = class {
|
|
|
3276
4076
|
}
|
|
3277
4077
|
throw new Error("Anthropic API call failed with an unknown error.");
|
|
3278
4078
|
}
|
|
4079
|
+
toAnthropicMessages(messages) {
|
|
4080
|
+
return messages.map((message) => ({
|
|
4081
|
+
role: message.role,
|
|
4082
|
+
content: message.content
|
|
4083
|
+
}));
|
|
4084
|
+
}
|
|
4085
|
+
async sendMessage(systemPrompt, userMessage, options) {
|
|
4086
|
+
const response = await this.createMessage({
|
|
4087
|
+
model: options.model,
|
|
4088
|
+
max_tokens: 2048,
|
|
4089
|
+
system: systemPrompt,
|
|
4090
|
+
messages: [
|
|
4091
|
+
{
|
|
4092
|
+
role: "user",
|
|
4093
|
+
content: userMessage
|
|
4094
|
+
}
|
|
4095
|
+
]
|
|
4096
|
+
});
|
|
4097
|
+
const textBlocks = response.content.filter(isAnthropicTextBlock);
|
|
4098
|
+
const text = textBlocks.map((block) => block.text).join("\n").trim();
|
|
4099
|
+
if (text.length === 0) {
|
|
4100
|
+
throw new Error("Model returned an empty response.");
|
|
4101
|
+
}
|
|
4102
|
+
return text;
|
|
4103
|
+
}
|
|
4104
|
+
async sendWithTools(systemPrompt, messages, options) {
|
|
4105
|
+
const response = await this.createMessage({
|
|
4106
|
+
model: options.model,
|
|
4107
|
+
max_tokens: 2048,
|
|
4108
|
+
system: systemPrompt,
|
|
4109
|
+
messages: this.toAnthropicMessages(messages),
|
|
4110
|
+
tools: options.tools.map((tool) => ({
|
|
4111
|
+
name: tool.name,
|
|
4112
|
+
description: tool.description,
|
|
4113
|
+
input_schema: tool.parameters ?? { type: "object", properties: {} }
|
|
4114
|
+
}))
|
|
4115
|
+
});
|
|
4116
|
+
const textContent = response.content.filter(isAnthropicTextBlock).map((block) => block.text).join("\n").trim();
|
|
4117
|
+
const toolUseBlocks = response.content.filter(isAnthropicToolUseBlock).map((block) => ({
|
|
4118
|
+
id: block.id,
|
|
4119
|
+
name: block.name,
|
|
4120
|
+
arguments: block.input
|
|
4121
|
+
}));
|
|
4122
|
+
return {
|
|
4123
|
+
textContent,
|
|
4124
|
+
toolUseBlocks,
|
|
4125
|
+
stopReason: response.stop_reason ?? "end_turn"
|
|
4126
|
+
};
|
|
4127
|
+
}
|
|
3279
4128
|
};
|
|
3280
4129
|
|
|
3281
4130
|
// src/providers/openai.ts
|
|
@@ -3312,6 +4161,71 @@ function extractTextContent(content) {
|
|
|
3312
4161
|
const text = content.map((item) => item.type === "text" || !item.type ? item.text ?? "" : "").join("\n").trim();
|
|
3313
4162
|
return text;
|
|
3314
4163
|
}
|
|
4164
|
+
function parseToolArguments(raw, toolName) {
|
|
4165
|
+
if (!raw || raw.trim() === "") {
|
|
4166
|
+
return {};
|
|
4167
|
+
}
|
|
4168
|
+
try {
|
|
4169
|
+
const parsed = JSON.parse(raw);
|
|
4170
|
+
if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
|
|
4171
|
+
throw new Error("Tool arguments must be a JSON object.");
|
|
4172
|
+
}
|
|
4173
|
+
return parsed;
|
|
4174
|
+
} catch (error) {
|
|
4175
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
4176
|
+
throw new Error(`OpenAI tool call arguments for '${toolName}' were not valid JSON: ${message}`);
|
|
4177
|
+
}
|
|
4178
|
+
}
|
|
4179
|
+
function getBlockText(blocks) {
|
|
4180
|
+
return blocks.filter((block) => block.type === "text").map((block) => String(block.text ?? "")).join("\n").trim();
|
|
4181
|
+
}
|
|
4182
|
+
function mapAssistantBlocksToMessage(blocks) {
|
|
4183
|
+
const textContent = getBlockText(blocks);
|
|
4184
|
+
const toolCalls = blocks.filter((block) => block.type === "tool_use").map((block) => ({
|
|
4185
|
+
id: String(block.id ?? ""),
|
|
4186
|
+
type: "function",
|
|
4187
|
+
function: {
|
|
4188
|
+
name: String(block.name ?? ""),
|
|
4189
|
+
arguments: JSON.stringify(block.input ?? {})
|
|
4190
|
+
}
|
|
4191
|
+
}));
|
|
4192
|
+
return {
|
|
4193
|
+
role: "assistant",
|
|
4194
|
+
content: textContent.length > 0 ? textContent : null,
|
|
4195
|
+
...toolCalls.length > 0 ? { tool_calls: toolCalls } : {}
|
|
4196
|
+
};
|
|
4197
|
+
}
|
|
4198
|
+
function mapUserBlocksToMessages(blocks) {
|
|
4199
|
+
const toolResults = blocks.filter((block) => block.type === "tool_result");
|
|
4200
|
+
if (toolResults.length > 0) {
|
|
4201
|
+
return toolResults.map((block) => ({
|
|
4202
|
+
role: "tool",
|
|
4203
|
+
tool_call_id: String(block.tool_use_id ?? ""),
|
|
4204
|
+
content: String(block.content ?? "")
|
|
4205
|
+
}));
|
|
4206
|
+
}
|
|
4207
|
+
const textContent = getBlockText(blocks);
|
|
4208
|
+
return [
|
|
4209
|
+
{
|
|
4210
|
+
role: "user",
|
|
4211
|
+
content: textContent
|
|
4212
|
+
}
|
|
4213
|
+
];
|
|
4214
|
+
}
|
|
4215
|
+
function mapConversationBlockToMessages(block) {
|
|
4216
|
+
if (typeof block.content === "string") {
|
|
4217
|
+
return [
|
|
4218
|
+
{
|
|
4219
|
+
role: block.role,
|
|
4220
|
+
content: block.content
|
|
4221
|
+
}
|
|
4222
|
+
];
|
|
4223
|
+
}
|
|
4224
|
+
if (block.role === "assistant") {
|
|
4225
|
+
return [mapAssistantBlocksToMessage(block.content)];
|
|
4226
|
+
}
|
|
4227
|
+
return mapUserBlocksToMessages(block.content);
|
|
4228
|
+
}
|
|
3315
4229
|
var OpenAIProvider = class {
|
|
3316
4230
|
name = "openai";
|
|
3317
4231
|
apiKey;
|
|
@@ -3340,30 +4254,12 @@ var OpenAIProvider = class {
|
|
|
3340
4254
|
this.client = new OpenAIConstructor({ apiKey: this.apiKey });
|
|
3341
4255
|
return this.client;
|
|
3342
4256
|
}
|
|
3343
|
-
async
|
|
4257
|
+
async createCompletion(input) {
|
|
3344
4258
|
const client = await this.ensureClient();
|
|
3345
4259
|
let lastError;
|
|
3346
4260
|
for (let attempt = 0; attempt < 3; attempt += 1) {
|
|
3347
4261
|
try {
|
|
3348
|
-
|
|
3349
|
-
model: options.model,
|
|
3350
|
-
max_tokens: 2048,
|
|
3351
|
-
messages: [
|
|
3352
|
-
{
|
|
3353
|
-
role: "system",
|
|
3354
|
-
content: systemPrompt
|
|
3355
|
-
},
|
|
3356
|
-
{
|
|
3357
|
-
role: "user",
|
|
3358
|
-
content: userMessage
|
|
3359
|
-
}
|
|
3360
|
-
]
|
|
3361
|
-
});
|
|
3362
|
-
const text = (response.choices ?? []).map((choice) => extractTextContent(choice.message?.content)).join("\n").trim();
|
|
3363
|
-
if (text.length === 0) {
|
|
3364
|
-
throw new Error("Model returned an empty response.");
|
|
3365
|
-
}
|
|
3366
|
-
return text;
|
|
4262
|
+
return await client.chat.completions.create(input);
|
|
3367
4263
|
} catch (error) {
|
|
3368
4264
|
lastError = error;
|
|
3369
4265
|
if (!isRetriableError(error) || attempt === 2) {
|
|
@@ -3378,6 +4274,57 @@ var OpenAIProvider = class {
|
|
|
3378
4274
|
}
|
|
3379
4275
|
throw new Error("OpenAI API call failed with an unknown error.");
|
|
3380
4276
|
}
|
|
4277
|
+
toOpenAiMessages(systemPrompt, messages) {
|
|
4278
|
+
return [
|
|
4279
|
+
{
|
|
4280
|
+
role: "system",
|
|
4281
|
+
content: systemPrompt
|
|
4282
|
+
},
|
|
4283
|
+
...messages.flatMap((message) => mapConversationBlockToMessages(message))
|
|
4284
|
+
];
|
|
4285
|
+
}
|
|
4286
|
+
async sendMessage(systemPrompt, userMessage, options) {
|
|
4287
|
+
const response = await this.createCompletion({
|
|
4288
|
+
model: options.model,
|
|
4289
|
+
max_tokens: 2048,
|
|
4290
|
+
messages: this.toOpenAiMessages(systemPrompt, [{ role: "user", content: userMessage }])
|
|
4291
|
+
});
|
|
4292
|
+
const text = (response.choices ?? []).map((choice) => extractTextContent(choice.message?.content)).join("\n").trim();
|
|
4293
|
+
if (text.length === 0) {
|
|
4294
|
+
throw new Error("Model returned an empty response.");
|
|
4295
|
+
}
|
|
4296
|
+
return text;
|
|
4297
|
+
}
|
|
4298
|
+
async sendWithTools(systemPrompt, messages, options) {
|
|
4299
|
+
const response = await this.createCompletion({
|
|
4300
|
+
model: options.model,
|
|
4301
|
+
max_tokens: 2048,
|
|
4302
|
+
messages: this.toOpenAiMessages(systemPrompt, messages),
|
|
4303
|
+
tools: options.tools.map((tool) => ({
|
|
4304
|
+
type: "function",
|
|
4305
|
+
function: {
|
|
4306
|
+
name: tool.name,
|
|
4307
|
+
description: tool.description,
|
|
4308
|
+
parameters: tool.parameters
|
|
4309
|
+
}
|
|
4310
|
+
}))
|
|
4311
|
+
});
|
|
4312
|
+
const choice = response.choices?.[0];
|
|
4313
|
+
const message = choice?.message;
|
|
4314
|
+
const toolUseBlocks = (message?.tool_calls ?? []).map((toolCall, index) => {
|
|
4315
|
+
const toolName = toolCall.function?.name ?? `tool-${index + 1}`;
|
|
4316
|
+
return {
|
|
4317
|
+
id: toolCall.id ?? `${toolName}-${index + 1}`,
|
|
4318
|
+
name: toolName,
|
|
4319
|
+
arguments: parseToolArguments(toolCall.function?.arguments, toolName)
|
|
4320
|
+
};
|
|
4321
|
+
});
|
|
4322
|
+
return {
|
|
4323
|
+
textContent: extractTextContent(message?.content),
|
|
4324
|
+
toolUseBlocks,
|
|
4325
|
+
stopReason: choice?.finish_reason === "stop" ? "end_turn" : choice?.finish_reason === "tool_calls" ? "tool_use" : choice?.finish_reason ?? "end_turn"
|
|
4326
|
+
};
|
|
4327
|
+
}
|
|
3381
4328
|
};
|
|
3382
4329
|
|
|
3383
4330
|
// src/providers/index.ts
|
|
@@ -3393,6 +4340,7 @@ function createProvider(providerName, apiKeyOverride) {
|
|
|
3393
4340
|
var triggerCliSchema = z8.object({
|
|
3394
4341
|
queries: z8.string().optional(),
|
|
3395
4342
|
saveQueries: z8.string().optional(),
|
|
4343
|
+
compare: z8.array(z8.string().min(1)).optional(),
|
|
3396
4344
|
seed: z8.number().int().optional(),
|
|
3397
4345
|
concurrency: z8.number().int().min(1).optional(),
|
|
3398
4346
|
html: z8.string().optional(),
|
|
@@ -3441,6 +4389,7 @@ async function handleTriggerCommand(targetPath, options) {
|
|
|
3441
4389
|
provider,
|
|
3442
4390
|
queries,
|
|
3443
4391
|
numQueries: options.numQueries,
|
|
4392
|
+
compare: options.compare,
|
|
3444
4393
|
seed: options.seed,
|
|
3445
4394
|
concurrency: options.concurrency,
|
|
3446
4395
|
verbose: options.verbose
|
|
@@ -3459,7 +4408,7 @@ async function handleTriggerCommand(targetPath, options) {
|
|
|
3459
4408
|
...result,
|
|
3460
4409
|
target: targetPath
|
|
3461
4410
|
};
|
|
3462
|
-
await
|
|
4411
|
+
await fs9.writeFile(options.html, renderTriggerHtml(htmlResult), "utf8");
|
|
3463
4412
|
}
|
|
3464
4413
|
} catch (error) {
|
|
3465
4414
|
spinner?.stop();
|
|
@@ -3468,7 +4417,7 @@ async function handleTriggerCommand(targetPath, options) {
|
|
|
3468
4417
|
}
|
|
3469
4418
|
}
|
|
3470
4419
|
function registerTriggerCommand(program) {
|
|
3471
|
-
program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--concurrency <n>", "Maximum in-flight trigger requests", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
|
|
4420
|
+
program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--compare <path...>", "Path(s) to sibling skill directories to include as competitors").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--concurrency <n>", "Maximum in-flight trigger requests", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
|
|
3472
4421
|
const globalOptions = getGlobalCliOptions(command);
|
|
3473
4422
|
const config = getResolvedConfig(command);
|
|
3474
4423
|
const parsedCli = triggerCliSchema.safeParse(command.opts());
|
|
@@ -3483,6 +4432,7 @@ function registerTriggerCommand(program) {
|
|
|
3483
4432
|
provider: config.provider,
|
|
3484
4433
|
queries: parsedCli.data.queries,
|
|
3485
4434
|
numQueries: config.trigger.numQueries,
|
|
4435
|
+
compare: config.trigger.compare,
|
|
3486
4436
|
saveQueries: parsedCli.data.saveQueries,
|
|
3487
4437
|
seed: parsedCli.data.seed ?? config.trigger.seed,
|
|
3488
4438
|
concurrency: config.concurrency,
|
|
@@ -3494,7 +4444,7 @@ function registerTriggerCommand(program) {
|
|
|
3494
4444
|
}
|
|
3495
4445
|
|
|
3496
4446
|
// src/commands/eval.ts
|
|
3497
|
-
import
|
|
4447
|
+
import fs10 from "node:fs/promises";
|
|
3498
4448
|
import ora2 from "ora";
|
|
3499
4449
|
import { z as z9 } from "zod";
|
|
3500
4450
|
var evalCliSchema = z9.object({
|
|
@@ -3545,7 +4495,8 @@ async function handleEvalCommand(targetPath, options, command) {
|
|
|
3545
4495
|
graderModel,
|
|
3546
4496
|
numRuns: options.numRuns,
|
|
3547
4497
|
concurrency: options.concurrency,
|
|
3548
|
-
prompts
|
|
4498
|
+
prompts,
|
|
4499
|
+
maxToolIterations: options.maxToolIterations
|
|
3549
4500
|
});
|
|
3550
4501
|
if (options.saveResults) {
|
|
3551
4502
|
await writeJsonFile(options.saveResults, result);
|
|
@@ -3561,7 +4512,7 @@ async function handleEvalCommand(targetPath, options, command) {
|
|
|
3561
4512
|
...result,
|
|
3562
4513
|
target: targetPath
|
|
3563
4514
|
};
|
|
3564
|
-
await
|
|
4515
|
+
await fs10.writeFile(options.html, renderEvalHtml(htmlResult), "utf8");
|
|
3565
4516
|
}
|
|
3566
4517
|
} catch (error) {
|
|
3567
4518
|
spinner?.stop();
|
|
@@ -3592,7 +4543,8 @@ function registerEvalCommand(program) {
|
|
|
3592
4543
|
verbose: Boolean(parsedCli.data.verbose),
|
|
3593
4544
|
apiKey: parsedCli.data.apiKey,
|
|
3594
4545
|
numRuns: config.eval.numRuns,
|
|
3595
|
-
concurrency: config.concurrency
|
|
4546
|
+
concurrency: config.concurrency,
|
|
4547
|
+
maxToolIterations: config.eval.maxToolIterations
|
|
3596
4548
|
},
|
|
3597
4549
|
command
|
|
3598
4550
|
);
|
|
@@ -3600,7 +4552,7 @@ function registerEvalCommand(program) {
|
|
|
3600
4552
|
}
|
|
3601
4553
|
|
|
3602
4554
|
// src/commands/check.ts
|
|
3603
|
-
import
|
|
4555
|
+
import fs11 from "node:fs/promises";
|
|
3604
4556
|
import ora3 from "ora";
|
|
3605
4557
|
import { z as z10 } from "zod";
|
|
3606
4558
|
|
|
@@ -3613,7 +4565,7 @@ function calculateEvalAssertPassRate(result) {
|
|
|
3613
4565
|
}
|
|
3614
4566
|
async function runCheck(inputPath, options) {
|
|
3615
4567
|
options.onStage?.("lint");
|
|
3616
|
-
const lint = await runLinter(inputPath, { suppress: options.lintSuppress });
|
|
4568
|
+
const lint = await runLinter(inputPath, { suppress: options.lintSuppress, plugins: options.lintPlugins });
|
|
3617
4569
|
const lintPassed = !lintFails(lint, options.lintFailOn);
|
|
3618
4570
|
let trigger = null;
|
|
3619
4571
|
let evalResult = null;
|
|
@@ -3637,6 +4589,7 @@ async function runCheck(inputPath, options) {
|
|
|
3637
4589
|
provider: options.provider,
|
|
3638
4590
|
model: options.model,
|
|
3639
4591
|
queries: options.queries,
|
|
4592
|
+
compare: options.compare,
|
|
3640
4593
|
numQueries: options.numQueries,
|
|
3641
4594
|
seed: options.triggerSeed,
|
|
3642
4595
|
concurrency: options.concurrency,
|
|
@@ -3648,7 +4601,8 @@ async function runCheck(inputPath, options) {
|
|
|
3648
4601
|
graderModel: options.graderModel,
|
|
3649
4602
|
numRuns: options.evalNumRuns,
|
|
3650
4603
|
prompts: options.prompts,
|
|
3651
|
-
concurrency: options.concurrency
|
|
4604
|
+
concurrency: options.concurrency,
|
|
4605
|
+
maxToolIterations: options.evalMaxToolIterations
|
|
3652
4606
|
};
|
|
3653
4607
|
if ((options.concurrency ?? 5) === 1) {
|
|
3654
4608
|
options.onStage?.("trigger");
|
|
@@ -3698,8 +4652,10 @@ var checkCliSchema = z10.object({
|
|
|
3698
4652
|
graderModel: z10.string().optional(),
|
|
3699
4653
|
apiKey: z10.string().optional(),
|
|
3700
4654
|
queries: z10.string().optional(),
|
|
4655
|
+
compare: z10.array(z10.string().min(1)).optional(),
|
|
3701
4656
|
seed: z10.number().int().optional(),
|
|
3702
4657
|
prompts: z10.string().optional(),
|
|
4658
|
+
plugin: z10.array(z10.string().min(1)).optional(),
|
|
3703
4659
|
concurrency: z10.number().int().min(1).optional(),
|
|
3704
4660
|
html: z10.string().optional(),
|
|
3705
4661
|
saveResults: z10.string().optional(),
|
|
@@ -3708,6 +4664,9 @@ var checkCliSchema = z10.object({
|
|
|
3708
4664
|
});
|
|
3709
4665
|
var DEFAULT_ANTHROPIC_MODEL3 = "claude-sonnet-4-5-20250929";
|
|
3710
4666
|
var DEFAULT_OPENAI_MODEL3 = "gpt-4.1-mini";
|
|
4667
|
+
/**
 * Option accumulator for the repeatable `--plugin` flag: returns a new list
 * containing everything gathered so far followed by the latest value.
 *
 * @param {string} value - Plugin path supplied by the current occurrence.
 * @param {string[]} [previous=[]] - Paths gathered from earlier occurrences.
 * @returns {string[]} A fresh array; `previous` is not mutated.
 */
function collectPluginPaths2(value, previous = []) {
  const collected = Array.from(previous);
  collected.push(value);
  return collected;
}
|
|
3711
4670
|
function resolveModel3(provider, model) {
|
|
3712
4671
|
if (provider === "openai" && model === DEFAULT_ANTHROPIC_MODEL3) {
|
|
3713
4672
|
return DEFAULT_OPENAI_MODEL3;
|
|
@@ -3758,11 +4717,14 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
3758
4717
|
graderModel,
|
|
3759
4718
|
lintFailOn: options.lintFailOn,
|
|
3760
4719
|
lintSuppress: options.lintSuppress,
|
|
4720
|
+
lintPlugins: options.lintPlugins,
|
|
3761
4721
|
queries,
|
|
4722
|
+
compare: options.compare,
|
|
3762
4723
|
numQueries: options.numQueries,
|
|
3763
4724
|
triggerSeed: options.triggerSeed,
|
|
3764
4725
|
prompts,
|
|
3765
4726
|
evalNumRuns: options.numRuns,
|
|
4727
|
+
evalMaxToolIterations: options.maxToolIterations,
|
|
3766
4728
|
concurrency: options.concurrency,
|
|
3767
4729
|
minF1: options.minF1,
|
|
3768
4730
|
minAssertPassRate: options.minAssertPassRate,
|
|
@@ -3794,7 +4756,7 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
3794
4756
|
);
|
|
3795
4757
|
}
|
|
3796
4758
|
if (options.html) {
|
|
3797
|
-
await
|
|
4759
|
+
await fs11.writeFile(options.html, renderCheckHtml(result), "utf8");
|
|
3798
4760
|
}
|
|
3799
4761
|
process.exitCode = result.gates.overallPassed ? 0 : 1;
|
|
3800
4762
|
} catch (error) {
|
|
@@ -3804,7 +4766,7 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
3804
4766
|
}
|
|
3805
4767
|
}
|
|
3806
4768
|
function registerCheckCommand(program) {
|
|
3807
|
-
program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
|
|
4769
|
+
program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--compare <path...>", "Path(s) to sibling skill directories to include as competitors").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--plugin <path>", "Load a custom lint plugin file", collectPluginPaths2, []).option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
|
|
3808
4770
|
const globalOptions = getGlobalCliOptions(command);
|
|
3809
4771
|
const config = getResolvedConfig(command);
|
|
3810
4772
|
const parsedCli = checkCliSchema.safeParse(command.opts());
|
|
@@ -3822,15 +4784,18 @@ function registerCheckCommand(program) {
|
|
|
3822
4784
|
graderModel: parsedCli.data.graderModel,
|
|
3823
4785
|
apiKey: parsedCli.data.apiKey,
|
|
3824
4786
|
queries: parsedCli.data.queries,
|
|
4787
|
+
compare: config.trigger.compare,
|
|
3825
4788
|
numQueries: config.trigger.numQueries,
|
|
3826
4789
|
prompts: parsedCli.data.prompts,
|
|
3827
4790
|
minF1: config.trigger.threshold,
|
|
3828
4791
|
minAssertPassRate: config.eval.threshold,
|
|
3829
4792
|
numRuns: config.eval.numRuns,
|
|
4793
|
+
maxToolIterations: config.eval.maxToolIterations,
|
|
3830
4794
|
concurrency: config.concurrency,
|
|
3831
4795
|
html: parsedCli.data.html,
|
|
3832
4796
|
lintFailOn: config.lint.failOn,
|
|
3833
4797
|
lintSuppress: config.lint.suppress,
|
|
4798
|
+
lintPlugins: config.lint.plugins,
|
|
3834
4799
|
triggerSeed: parsedCli.data.seed ?? config.trigger.seed,
|
|
3835
4800
|
saveResults: parsedCli.data.saveResults,
|
|
3836
4801
|
continueOnLintFail: Boolean(parsedCli.data.continueOnLintFail),
|
|
@@ -3841,12 +4806,572 @@ function registerCheckCommand(program) {
|
|
|
3841
4806
|
});
|
|
3842
4807
|
}
|
|
3843
4808
|
|
|
4809
|
+
// src/commands/improve.ts
|
|
4810
|
+
import ora4 from "ora";
|
|
4811
|
+
import { z as z12 } from "zod";
|
|
4812
|
+
|
|
4813
|
+
// src/core/improver.ts
|
|
4814
|
+
import fs12 from "node:fs/promises";
|
|
4815
|
+
import os from "node:os";
|
|
4816
|
+
import path7 from "node:path";
|
|
4817
|
+
import yaml2 from "js-yaml";
|
|
4818
|
+
import { z as z11 } from "zod";
|
|
4819
|
+
// Shape that a parsed rewrite reply must satisfy; requestRewrite validates the
// extracted JSON object against this schema before returning it.
var improveRewriteSchema = z11.object({
  // Key/value map of frontmatter fields; values are not constrained here.
  frontmatter: z11.record(z11.unknown()),
  // Non-empty string (the system prompt asks for the markdown body only).
  content: z11.string().min(1),
  // At least one non-empty entry.
  changeSummary: z11.array(z11.string().min(1)).min(1),
  // At least one non-empty entry.
  targetedProblems: z11.array(z11.string().min(1)).min(1)
});
|
|
4825
|
+
/**
 * Share of eval assertions that passed.
 *
 * @param {object|null|undefined} result - Eval result carrying
 *   `summary.passedAssertions` and `summary.totalAssertions`.
 * @returns {number} passed / total, or 0 when there is no result or the
 *   total is exactly 0.
 */
function calculateEvalAssertPassRate2(result) {
  if (result) {
    const { passedAssertions, totalAssertions } = result.summary;
    if (totalAssertions !== 0) {
      return passedAssertions / totalAssertions;
    }
  }
  return 0;
}
|
|
4831
|
+
/**
 * Pull a JSON object out of a model reply that may wrap it in extra text.
 *
 * The span from the first "{" to the last "}" of the trimmed reply is parsed;
 * a reply that is already a bare object is simply the case where that span is
 * the whole string.
 *
 * @param {string} raw - Raw model reply.
 * @returns {*} The value produced by JSON.parse for the located span.
 * @throws {Error} When no "{" ... "}" span exists.
 * @throws {SyntaxError} When the located span is not valid JSON.
 */
function extractJsonObject2(raw) {
  const text = raw.trim();
  const openIndex = text.indexOf("{");
  const closeIndex = text.lastIndexOf("}");
  if (openIndex < 0 || closeIndex <= openIndex) {
    throw new Error("Improver did not return a JSON object.");
  }
  return JSON.parse(text.slice(openIndex, closeIndex + 1));
}
|
|
4843
|
+
/**
 * Copy frontmatter into a new object whose keys start with `name`,
 * `description`, `license` (each only if present as an own property),
 * followed by every remaining key in its original enumeration order.
 *
 * @param {Record<string, unknown>} frontmatter - Source frontmatter map.
 * @returns {Record<string, unknown>} Reordered shallow copy.
 */
function orderFrontmatter(frontmatter) {
  const owns = (target, key) => Object.prototype.hasOwnProperty.call(target, key);
  const result = {};
  const leadingKeys = ["name", "description", "license"].filter((key) => owns(frontmatter, key));
  leadingKeys.forEach((key) => {
    result[key] = frontmatter[key];
  });
  Object.entries(frontmatter).forEach(([key, value]) => {
    if (owns(result, key)) {
      return;
    }
    result[key] = value;
  });
  return result;
}
|
|
4857
|
+
/**
 * Pick the line terminator to use when re-serializing a document.
 *
 * @param {string} raw - Original document text.
 * @returns {"\r\n"|"\n"} CRLF if the text contains at least one CRLF pair,
 *   otherwise LF.
 */
function detectLineEnding(raw) {
  if (raw.indexOf("\r\n") !== -1) {
    return "\r\n";
  }
  return "\n";
}
|
|
4860
|
+
/**
 * Serialize frontmatter and a markdown body into a full SKILL.md document:
 * a `---` fenced YAML block, a blank line, the trimmed body, and a final
 * line terminator.
 *
 * @param {Record<string, unknown>} frontmatter - Frontmatter fields; passed
 *   through orderFrontmatter before being dumped as YAML.
 * @param {string} content - Markdown body without frontmatter fences.
 * @param {string} lineEnding - Terminator to use throughout ("\n" or "\r\n").
 * @returns {string} The assembled document.
 * @throws {Error} When the body is empty after trimming.
 */
function buildSkillMarkdown(frontmatter, content, lineEnding) {
  const normalizedBody = content.trim();
  if (normalizedBody.length === 0) {
    throw new Error("Candidate rewrite produced an empty SKILL.md body.");
  }
  // Treat an existing CRLF as a single terminator. Replacing only "\n" would
  // turn "\r\n" into "\r\r\n" for CRLF targets and leave stray "\r" behind
  // for LF targets.
  const toTargetEol = (text) => text.replace(/\r?\n/g, lineEnding);
  const frontmatterBlock = toTargetEol(
    yaml2.dump(orderFrontmatter(frontmatter), {
      // NOTE(review): js-yaml appears to fall back to its default width when
      // lineWidth is falsy; if "never wrap" was intended this may need -1 -
      // confirm against the js-yaml dump options.
      lineWidth: 0,
      noRefs: true,
      sortKeys: false
    })
  );
  return `---${lineEnding}${frontmatterBlock}---${lineEnding}${lineEnding}${toTargetEol(normalizedBody)}${lineEnding}`;
}
|
|
4872
|
+
/**
 * Check every relative file reference found in a candidate document.
 *
 * @param {string} raw - Candidate SKILL.md text, scanned with
 *   extractRelativeFileReferences.
 * @param {string} skillRoot - Directory the references are resolved against.
 * @returns {Promise<void>} Resolves when every reference stays under
 *   skillRoot and exists according to pathExists.
 * @throws {Error} On the first reference that leaves the root or is missing.
 */
async function validateRelativeReferences(raw, skillRoot) {
  for (const reference of extractRelativeFileReferences(raw)) {
    const resolved = path7.resolve(skillRoot, reference);
    const relativeToRoot = path7.relative(skillRoot, resolved);
    // Only a leading ".." path *segment* leaves the root. A plain prefix test
    // would also reject in-root names that merely begin with two dots, such
    // as "..notes/readme.md". An empty relative path (the root itself) and
    // ordinary child paths match none of these conditions.
    const escapesRoot =
      relativeToRoot === ".." ||
      relativeToRoot.startsWith(`..${path7.sep}`) ||
      path7.isAbsolute(relativeToRoot);
    if (escapesRoot) {
      throw new Error(`Candidate rewrite introduced an out-of-root reference: ${reference}`);
    }
    if (!await pathExists(resolved)) {
      throw new Error(`Candidate rewrite introduced a broken relative reference: ${reference}`);
    }
  }
}
|
|
4885
|
+
/**
 * Turn a model-proposed rewrite into a validated candidate document.
 *
 * The rewrite may not rename the skill or change an existing license. Its
 * frontmatter is layered over the current frontmatter, with the current name
 * (and license, when one exists) always winning. The assembled document is
 * then strictly parsed and its relative references are checked.
 *
 * @param {object} skill - Parsed skill (`frontmatter`, `raw`, `skillRoot`, `skillFile`).
 * @param {object} rewrite - Rewrite with `frontmatter`, `content`,
 *   `changeSummary` and `targetedProblems`.
 * @returns {Promise<object>} Candidate with merged frontmatter, trimmed
 *   content, the serialized `raw` text and the rewrite's summaries.
 * @throws {Error} On a rename or license change, or when a validation step throws.
 */
async function buildCandidate(skill, rewrite) {
  const current = skill.frontmatter;
  const proposed = rewrite.frontmatter;

  const isRename = typeof proposed.name === "string" && proposed.name !== current.name;
  if (isRename) {
    throw new Error(`Candidate rewrite attempted to rename skill '${current.name}' to '${proposed.name}'.`);
  }

  const isRelicense = current.license && typeof proposed.license === "string" && proposed.license !== current.license;
  if (isRelicense) {
    throw new Error(
      `Candidate rewrite attempted to change license '${current.license}' to '${proposed.license}'.`
    );
  }

  // Proposed fields override current ones, then the protected fields are
  // pinned back to their current values.
  const mergedFrontmatter = { ...current, ...proposed };
  mergedFrontmatter.name = current.name;
  if (current.license) {
    mergedFrontmatter.license = current.license;
  }

  const raw = buildSkillMarkdown(mergedFrontmatter, rewrite.content, detectLineEnding(skill.raw));
  // Both calls are used only for their throwing behavior.
  parseSkillDocumentStrict(raw, skill.skillRoot, skill.skillFile);
  await validateRelativeReferences(raw, skill.skillRoot);

  const { changeSummary, targetedProblems } = rewrite;
  return {
    frontmatter: mergedFrontmatter,
    content: rewrite.content.trim(),
    raw,
    changeSummary,
    targetedProblems
  };
}
|
|
4911
|
+
/**
 * Condense a check result into the brief handed to the rewrite model.
 *
 * @param {object} result - Check result with `lint.issues` and optional
 *   `trigger` / `eval` sections.
 * @returns {{lintIssues: object[], triggerFailures: object[], evalFailures: object[], triggerSuggestions: *[]}}
 *   Non-passing lint issues, unmatched trigger cases, failed eval assertions
 *   and the trigger suggestions (an empty array when a section is absent).
 */
function extractActionableIssues(result) {
  const lintIssues = [];
  for (const issue of result.lint.issues) {
    if (issue.status === "pass") {
      continue;
    }
    lintIssues.push({
      checkId: issue.checkId,
      title: issue.title,
      // Anything that is neither "pass" nor "warn" is reported as "fail".
      status: issue.status === "warn" ? "warn" : "fail",
      message: issue.message,
      suggestion: issue.suggestion,
      startLine: issue.startLine,
      endLine: issue.endLine
    });
  }

  const triggerFailures = [];
  if (result.trigger != null) {
    for (const testCase of result.trigger.cases) {
      if (testCase.matched) {
        continue;
      }
      const { query, expected, actual, selectedCompetitor, rawModelResponse } = testCase;
      triggerFailures.push({ query, expected, actual, selectedCompetitor, rawModelResponse });
    }
  }

  const evalFailures = [];
  if (result.eval != null) {
    for (const promptResult of result.eval.results) {
      for (const assertion of promptResult.assertions) {
        if (assertion.passed) {
          continue;
        }
        const knownSource = assertion.source === "grader" || assertion.source === "tool";
        evalFailures.push({
          prompt: promptResult.prompt,
          assertion: assertion.assertion,
          evidence: assertion.evidence,
          source: knownSource ? assertion.source : "unknown"
        });
      }
    }
  }

  return {
    lintIssues,
    triggerFailures,
    evalFailures,
    triggerSuggestions: result.trigger?.suggestions ?? []
  };
}
|
|
4943
|
+
/**
 * Tell whether a brief contains anything for the rewrite model to fix.
 *
 * @param {object} brief - Brief with `lintIssues`, `triggerFailures`,
 *   `evalFailures` and `triggerSuggestions` lists.
 * @returns {boolean} True when at least one of the four lists is non-empty.
 */
function hasActionableProblems(brief) {
  const sections = [
    brief.lintIssues,
    brief.triggerFailures,
    brief.evalFailures,
    brief.triggerSuggestions
  ];
  return sections.some((section) => section.length > 0);
}
|
|
4946
|
+
/**
 * Recursively list the regular files under a skill directory.
 *
 * @param {string} skillRoot - Directory to scan.
 * @param {string} [baseDir=skillRoot] - Directory the returned paths are made
 *   relative to. Callers normally omit it; the recursion passes the original
 *   root down so nested files keep their directory prefix.
 * @returns {Promise<string[]>} Sorted paths relative to `baseDir`, using "/"
 *   as the separator. Entries that are neither directories nor regular files
 *   are ignored.
 */
async function listSkillFiles(skillRoot, baseDir = skillRoot) {
  const entries = await fs12.readdir(skillRoot, { withFileTypes: true });
  const files = [];
  for (const entry of entries) {
    const absolutePath = path7.join(skillRoot, entry.name);
    if (entry.isDirectory()) {
      // Relativize against the top-level root, not the subdirectory being
      // scanned, so "scripts/run.sh" is not reported as "run.sh".
      files.push(...await listSkillFiles(absolutePath, baseDir));
      continue;
    }
    if (entry.isFile()) {
      files.push(path7.relative(baseDir, absolutePath).split(path7.sep).join("/"));
    }
  }
  return files.sort();
}
|
|
4961
|
+
/**
 * Ask the model for a rewritten skill and validate the reply.
 *
 * Builds a system prompt with the rewrite rules and a user prompt containing
 * the baseline metrics, the files under the skill root, the current SKILL.md
 * and the actionable brief, then sends both through `provider.sendMessage`.
 *
 * @param {object} skill - Parsed skill (`frontmatter`, `raw`, `skillRoot`, `skillFile`).
 * @param {object} baseline - Baseline check result (lint summary, optional trigger/eval).
 * @param {object} brief - Actionable problems, embedded as pretty-printed JSON.
 * @param {object} provider - Object exposing `sendMessage(system, user, options)`.
 * @param {string} model - Model identifier forwarded to the provider.
 * @returns {Promise<object>} The reply parsed against improveRewriteSchema.
 * @throws {Error} When the reply holds no JSON object or fails schema validation.
 */
async function requestRewrite(skill, baseline, brief, provider, model) {
  const fileBullets = (await listSkillFiles(skill.skillRoot)).map((file) => `- ${file}`);

  const { name: skillName, license } = skill.frontmatter;
  const licenseRule = license
    ? `Keep the license exactly '${license}'.`
    : "Do not remove any valid existing frontmatter fields.";
  const systemPrompt = [
    "You rewrite Agent Skill files to improve measured quality.",
    "Return JSON only.",
    "Required format:",
    '{"frontmatter": {...}, "content": "...", "changeSummary": ["..."], "targetedProblems": ["..."]}',
    "The content field must contain only the markdown body of SKILL.md, without YAML frontmatter fences.",
    `Keep the skill name exactly '${skillName}'.`,
    licenseRule,
    "Do not invent new scripts, assets, references, APIs, or tools.",
    "Only reference files that already exist under the skill root.",
    "Optimize for trigger clarity, explicit scope boundaries, concrete examples, safety guidance, and tool usage instructions."
  ].join(" ");

  const triggerF1 = baseline.trigger?.metrics.f1 ?? 0;
  const evalPassRate = calculateEvalAssertPassRate2(baseline.eval);
  const metricLines = [
    `Skill file: ${skill.skillFile}`,
    `Current trigger F1: ${triggerF1.toFixed(4)}`,
    `Current eval assertion pass rate: ${evalPassRate.toFixed(4)}`,
    `Lint failures: ${baseline.lint.summary.failures}`,
    `Lint warnings: ${baseline.lint.summary.warnings}`
  ];
  const userPrompt = [
    ...metricLines,
    "",
    "Available files under the skill root:",
    ...fileBullets,
    "",
    "Current SKILL.md:",
    "```markdown",
    skill.raw,
    "```",
    "",
    "Actionable problems to fix:",
    JSON.stringify(brief, null, 2),
    "",
    "Rewrite the skill to address only these evidenced problems. Keep the instructions tight and practical."
  ].join("\n");

  const reply = await provider.sendMessage(systemPrompt, userPrompt, { model });
  const outcome = improveRewriteSchema.safeParse(extractJsonObject2(reply));
  if (outcome.success) {
    return outcome.data;
  }
  throw new Error(`Failed to parse improve output: ${outcome.error.issues[0]?.message ?? "invalid improve JSON"}`);
}
|
|
5004
|
+
/**
 * Create a throwaway copy of a skill directory whose SKILL.md is replaced by
 * the candidate text.
 *
 * @param {string} skillRoot - Directory to copy recursively.
 * @param {string} candidateRaw - Text written to `SKILL.md` inside the copy.
 * @returns {Promise<{tempRoot: string, skillPath: string}>} `tempRoot` is the
 *   fresh temp directory (the caller removes it); `skillPath` is the copied
 *   skill directory inside it, named after `skillRoot`'s basename.
 * @throws Rethrows any copy/write failure after removing the temp directory.
 */
async function createVerificationDirectory(skillRoot, candidateRaw) {
  const tempRoot = await fs12.mkdtemp(path7.join(os.tmpdir(), "skilltest-improve-"));
  try {
    const tempSkillRoot = path7.join(tempRoot, path7.basename(skillRoot));
    await fs12.cp(skillRoot, tempSkillRoot, { recursive: true });
    await fs12.writeFile(path7.join(tempSkillRoot, "SKILL.md"), candidateRaw, "utf8");
    return {
      tempRoot,
      skillPath: tempSkillRoot
    };
  } catch (error) {
    // The caller never learns tempRoot when this function throws, so it could
    // not clean up. Removal is best-effort so the original failure is the one
    // that surfaces.
    await fs12.rm(tempRoot, { recursive: true, force: true }).catch(() => undefined);
    throw error;
  }
}
|
|
5014
|
+
/**
 * Compare a verification run against the baseline run.
 *
 * Lint deltas are `before - after` (positive means fewer findings); trigger
 * F1 and eval pass-rate deltas are `after - before` (positive means higher).
 * A missing trigger section counts as F1 0.
 *
 * @param {object} baseline - Check result for the original skill.
 * @param {object} verification - Check result for the candidate.
 * @returns {object} Per-metric before/after/delta records, the before/after
 *   gate outcome, and the `improved` / `hasRegression` flags.
 */
function buildDelta(baseline, verification) {
  const snapshot = (run) => ({
    f1: run.trigger?.metrics.f1 ?? 0,
    passRate: calculateEvalAssertPassRate2(run.eval),
    failures: run.lint.summary.failures,
    warnings: run.lint.summary.warnings,
    gatesPassed: run.gates.overallPassed
  });
  const before = snapshot(baseline);
  const after = snapshot(verification);

  const lintFailures = { before: before.failures, after: after.failures, delta: before.failures - after.failures };
  const lintWarnings = { before: before.warnings, after: after.warnings, delta: before.warnings - after.warnings };
  const triggerF1 = { before: before.f1, after: after.f1, delta: after.f1 - before.f1 };
  const evalAssertPassRate = { before: before.passRate, after: after.passRate, delta: after.passRate - before.passRate };

  const hasRegression =
    after.failures > before.failures ||
    after.warnings > before.warnings ||
    after.f1 < before.f1 ||
    after.passRate < before.passRate;

  // A change in the overall gate outcome decides on its own; otherwise any
  // single metric moving in the good direction counts as an improvement.
  let improved;
  if (after.gatesPassed !== before.gatesPassed) {
    improved = after.gatesPassed;
  } else {
    improved = [lintFailures, lintWarnings, triggerF1, evalAssertPassRate].some((metric) => metric.delta > 0);
  }

  return {
    lintFailures,
    lintWarnings,
    triggerF1,
    evalAssertPassRate,
    overallPassed: {
      before: before.gatesPassed,
      after: after.gatesPassed
    },
    improved,
    hasRegression
  };
}
|
|
5054
|
+
/**
 * Return a shallow copy of a check result with its `target` replaced.
 * runImprove uses it to label the verification run with the path the user
 * passed rather than the temporary copy that was actually checked.
 *
 * @param {object} result - Check result to copy.
 * @param {string} target - Value to store under `target`.
 * @returns {object} New object; `result` is not mutated.
 */
function normalizeVerificationTarget(result, target) {
  const relabeled = { ...result };
  relabeled.target = target;
  return relabeled;
}
|
|
5060
|
+
/**
 * Decide whether a verified candidate may be written, and if not, why.
 *
 * Checks run in priority order: regression, then lack of improvement, then
 * failing quality gates. The first one that applies supplies the message.
 *
 * @param {{hasRegression: boolean, improved: boolean}} delta - Output of buildDelta.
 * @param {object} verification - Check result for the candidate (`gates.overallPassed`).
 * @returns {string|undefined} Blocking message, or undefined when nothing blocks.
 */
function buildBlockingReason(delta, verification) {
  // Predicates are lazy so later ones are not evaluated once an earlier one hits.
  const blockers = [
    [() => delta.hasRegression, "Candidate rewrite regressed one or more quality metrics on the frozen test set."],
    [() => !delta.improved, "Candidate rewrite did not produce a measurable improvement on the frozen test set."],
    [() => !verification.gates.overallPassed, "Candidate rewrite improved the skill but still failed the configured quality gates."]
  ];
  const firstHit = blockers.find(([applies]) => applies());
  return firstHit ? firstHit[1] : void 0;
}
|
|
5072
|
+
/**
 * Write candidate text to a file, creating parent directories as needed.
 *
 * @param {string} outputPath - Destination path; resolved to an absolute path first.
 * @param {string} raw - Text to write as UTF-8.
 * @returns {Promise<string>} The absolute path that was written.
 */
async function maybeWriteOutput(outputPath, raw) {
  const destination = path7.resolve(outputPath);
  const parentDirectory = path7.dirname(destination);
  await fs12.mkdir(parentDirectory, { recursive: true });
  await fs12.writeFile(destination, raw, "utf8");
  return destination;
}
|
|
5078
|
+
/**
 * Run one improve cycle: baseline check, model rewrite, validation,
 * verification against the same frozen inputs, and an optional write.
 *
 * Stages are announced through `options.onStage` ("baseline", "generate",
 * "validate", "verify", "write"). The candidate is written only when
 * buildBlockingReason reports nothing: to `options.outputPath` if set, and
 * over the original skill file if `options.apply` is set.
 *
 * @param {string} inputPath - Path to the skill, as given by the caller.
 * @param {object} options - Provider, model, lint/trigger/eval settings,
 *   thresholds, `apply`, `outputPath`, `verbose` and optional `onStage`.
 * @returns {Promise<object>} Result with target, provider name, model,
 *   originalRaw, thresholds, baseline, candidate, verification, delta,
 *   applied and, when relevant, outputPath / blockedReason.
 */
async function runImprove(inputPath, options) {
  // Options shared by the baseline and verification checks; the four frozen
  // inputs (queries/prompts and their counts) are supplied per call.
  const checkOptionsWith = (inputs) => ({
    provider: options.provider,
    model: options.model,
    graderModel: options.model,
    lintFailOn: options.lintFailOn,
    lintSuppress: options.lintSuppress,
    lintPlugins: options.lintPlugins,
    compare: options.compare,
    numQueries: inputs.numQueries,
    triggerSeed: options.triggerSeed,
    queries: inputs.queries,
    evalNumRuns: inputs.evalNumRuns,
    prompts: inputs.prompts,
    evalMaxToolIterations: options.evalMaxToolIterations,
    concurrency: options.concurrency,
    minF1: options.minF1,
    minAssertPassRate: options.minAssertPassRate,
    continueOnLintFail: true,
    verbose: options.verbose
  });

  options.onStage?.("baseline");
  const baseline = await runCheck(
    inputPath,
    checkOptionsWith({
      numQueries: options.numQueries,
      queries: options.queries,
      evalNumRuns: options.evalNumRuns,
      prompts: options.prompts
    })
  );

  // Every exit path returns the same leading fields; `outcome` contributes
  // `applied` and the optional trailing fields.
  const report = (originalRaw, candidate, verification, delta, outcome) => ({
    target: inputPath,
    provider: options.provider.name,
    model: options.model,
    originalRaw,
    thresholds: {
      minF1: options.minF1,
      minAssertPassRate: options.minAssertPassRate
    },
    baseline,
    candidate,
    verification,
    delta,
    ...outcome
  });

  if (!baseline.trigger || !baseline.eval) {
    return report("", null, null, null, {
      applied: false,
      blockedReason: baseline.triggerSkippedReason ?? baseline.evalSkippedReason ?? "Improve requires a strictly parseable skill so trigger and eval can be frozen."
    });
  }

  const skill = await parseSkillStrict(inputPath);
  const brief = extractActionableIssues(baseline);
  if (!hasActionableProblems(brief)) {
    return report(skill.raw, null, null, null, {
      applied: false,
      blockedReason: "No actionable failures, warnings, or mismatches were found to improve."
    });
  }

  options.onStage?.("generate");
  const rewrite = await requestRewrite(skill, baseline, brief, options.provider, options.model);

  options.onStage?.("validate");
  const candidate = await buildCandidate(skill, rewrite);
  if (candidate.raw === skill.raw) {
    return report(skill.raw, candidate, null, null, {
      applied: false,
      blockedReason: "Candidate rewrite produced no changes."
    });
  }

  options.onStage?.("verify");
  const workspace = await createVerificationDirectory(skill.skillRoot, candidate.raw);
  let verification;
  try {
    // Re-run the check on the temp copy with the baseline's queries and
    // prompts so both runs see the same inputs.
    const rerun = await runCheck(
      workspace.skillPath,
      checkOptionsWith({
        numQueries: baseline.trigger.queries.length,
        queries: baseline.trigger.queries,
        evalNumRuns: baseline.eval.prompts.length,
        prompts: baseline.eval.prompts
      })
    );
    verification = normalizeVerificationTarget(rerun, inputPath);
  } finally {
    await fs12.rm(workspace.tempRoot, { recursive: true, force: true });
  }

  const delta = buildDelta(baseline, verification);
  const blockedReason = buildBlockingReason(delta, verification);
  const mayWrite = !blockedReason;
  let applied = false;
  let outputPath;
  if (mayWrite && options.outputPath) {
    options.onStage?.("write");
    outputPath = await maybeWriteOutput(options.outputPath, candidate.raw);
  }
  if (mayWrite && options.apply) {
    options.onStage?.("write");
    await fs12.writeFile(skill.skillFile, candidate.raw, "utf8");
    applied = true;
  }

  return report(skill.raw, candidate, verification, delta, {
    applied,
    ...outputPath ? { outputPath } : {},
    ...blockedReason ? { blockedReason } : {}
  });
}
|
|
5223
|
+
|
|
5224
|
+
// src/commands/improve.ts
|
|
5225
|
+
// Validation schema for the improve command's CLI flags; every field is
// optional. NOTE(review): presumably applied to command.opts() the way the
// sibling commands use their schemas - the call site is not shown here.
var improveCliSchema = z12.object({
  apiKey: z12.string().optional(),
  queries: z12.string().optional(),
  // List of non-empty strings.
  compare: z12.array(z12.string().min(1)).optional(),
  // Integer.
  seed: z12.number().int().optional(),
  prompts: z12.string().optional(),
  // List of non-empty strings.
  plugin: z12.array(z12.string().min(1)).optional(),
  // Integer, at least 1.
  concurrency: z12.number().int().min(1).optional(),
  output: z12.string().optional(),
  saveResults: z12.string().optional(),
  apply: z12.boolean().optional(),
  verbose: z12.boolean().optional()
});
|
|
5238
|
+
// Model identifiers used by resolveModel4: when the provider is "openai" and
// the model equals the Anthropic default, the OpenAI default is used instead.
var DEFAULT_ANTHROPIC_MODEL4 = "claude-sonnet-4-5-20250929";
var DEFAULT_OPENAI_MODEL4 = "gpt-4.1-mini";
|
|
5240
|
+
/**
 * Accumulate repeated `--plugin` values into a list without mutating the
 * list passed in.
 *
 * @param {string} value - Plugin path from the current flag occurrence.
 * @param {string[]} [previous=[]] - Paths collected so far.
 * @returns {string[]} New array: the earlier paths followed by `value`.
 */
function collectPluginPaths3(value, previous = []) {
  const paths = [];
  for (const earlier of previous) {
    paths.push(earlier);
  }
  paths.push(value);
  return paths;
}
|
|
5243
|
+
/**
 * Pick the model to run with for the given provider.
 *
 * @param {string} provider - Provider name.
 * @param {string} model - Requested model identifier.
 * @returns {string} DEFAULT_OPENAI_MODEL4 when the provider is "openai" and
 *   the model is still DEFAULT_ANTHROPIC_MODEL4; otherwise `model` unchanged.
 */
function resolveModel4(provider, model) {
  const isOpenAiOnAnthropicDefault = provider === "openai" && model === DEFAULT_ANTHROPIC_MODEL4;
  return isOpenAiOnAnthropicDefault ? DEFAULT_OPENAI_MODEL4 : model;
}
|
|
5249
|
+
/**
 * Executes one `improve` run and reports its outcome.
 *
 * Loads optional trigger queries and eval prompts, calls `runImprove`, optionally
 * saves the result JSON, and writes either the raw result (JSON mode) or a
 * rendered report. A spinner is shown only when JSON output is off and stdout
 * is a TTY.
 *
 * Sets `process.exitCode` to 1 when the result carries a `blockedReason`,
 * 0 otherwise, and 2 when anything inside the run throws (the error is passed
 * to `writeError` rather than rethrown).
 *
 * @param {string} targetPath - Forwarded to `runImprove` as the skill target.
 * @param {object} options - Merged CLI/config options for the run.
 * @param {object} command - Command object, forwarded to `loadConfiguredEvalPrompts`.
 * @returns {Promise<void>}
 */
async function handleImproveCommand(targetPath, options, command) {
  let spinner = null;
  if (!options.json && process.stdout.isTTY) {
    spinner = ora4("Preparing improvement run...").start();
  }

  // Updates the spinner label; a no-op when no spinner is active.
  const showStatus = (text) => {
    if (spinner) {
      spinner.text = text;
    }
  };

  // Maps pipeline stage names reported by runImprove onto spinner labels.
  const reportStage = (stage) => {
    if (!spinner) {
      return;
    }
    switch (stage) {
      case "baseline":
        spinner.text = "Running baseline check...";
        break;
      case "generate":
        spinner.text = "Generating candidate rewrite...";
        break;
      case "validate":
        spinner.text = "Validating candidate rewrite...";
        break;
      case "verify":
        spinner.text = "Verifying candidate against frozen test inputs...";
        break;
      case "write":
        spinner.text = options.apply ? "Writing improved SKILL.md..." : "Writing candidate output...";
        break;
      default:
        // Unknown stages leave the current label in place.
        break;
    }
  };

  try {
    showStatus("Initializing model provider...");
    const provider = createProvider(options.provider, options.apiKey);

    let queries;
    if (options.queries) {
      showStatus("Loading frozen trigger queries...");
      queries = await loadTriggerQueriesFile(options.queries);
    }

    let prompts;
    if (options.prompts) {
      showStatus("Loading eval prompts...");
      prompts = await loadEvalPromptsJson(options.prompts);
    } else {
      prompts = await loadConfiguredEvalPrompts(command);
    }

    const model = resolveModel4(options.provider, options.model);
    const improveOptions = {
      provider,
      model,
      lintFailOn: options.lintFailOn,
      lintSuppress: options.lintSuppress,
      lintPlugins: options.lintPlugins,
      compare: options.compare,
      numQueries: options.numQueries,
      triggerSeed: options.triggerSeed,
      queries,
      prompts,
      evalNumRuns: options.numRuns,
      evalMaxToolIterations: options.maxToolIterations,
      minF1: options.minF1,
      minAssertPassRate: options.minAssertPassRate,
      concurrency: options.concurrency,
      apply: options.apply,
      outputPath: options.output,
      verbose: options.verbose,
      onStage: reportStage
    };
    const result = await runImprove(targetPath, improveOptions);

    if (options.saveResults) {
      await writeJsonFile(options.saveResults, result);
    }

    // Stop the spinner before printing so it does not interleave with the output.
    spinner?.stop();
    if (options.json) {
      writeResult(result, true);
    } else {
      const report = renderImproveReport(result, options.color, options.verbose);
      writeResult(report, false);
    }

    if (result.blockedReason) {
      process.exitCode = 1;
    } else {
      process.exitCode = 0;
    }
  } catch (error) {
    spinner?.stop();
    writeError(error, options.json);
    process.exitCode = 2;
  }
}
|
|
5325
|
+
/**
 * Registers the `improve` subcommand on the given program.
 *
 * The action validates the command's own options against `improveCliSchema`.
 * On failure it reports the first issue message (or a generic fallback) via
 * `writeError` and sets `process.exitCode` to 2. On success it merges global
 * options, resolved config values and the parsed CLI values, then delegates
 * to `handleImproveCommand`.
 *
 * @param {object} program - Fluent command builder exposing `command`,
 *   `description`, `argument`, `option` and `action`.
 * @returns {void}
 */
function registerImproveCommand(program) {
  const toInteger = (value) => Number.parseInt(value, 10);
  const toFloat = (value) => Number.parseFloat(value);

  // Each entry is the exact argument list for one `.option(...)` call, in registration order.
  const optionSpecs = [
    ["--provider <provider>", "LLM provider: anthropic|openai"],
    ["--model <model>", "Model for baseline, rewrite, and verification runs"],
    ["--api-key <key>", "API key override"],
    ["--queries <path>", "Path to custom trigger queries JSON"],
    ["--compare <path...>", "Path(s) to sibling skill directories to include as competitors"],
    ["--num-queries <n>", "Number of auto-generated trigger queries", toInteger],
    ["--seed <number>", "RNG seed for reproducible trigger results", toInteger],
    ["--prompts <path>", "Path to eval prompts JSON"],
    ["--plugin <path>", "Load a custom lint plugin file", collectPluginPaths3, []],
    ["--concurrency <n>", "Maximum in-flight trigger/eval tasks", toInteger],
    ["--output <path>", "Write the verified candidate SKILL.md to a separate file"],
    ["--save-results <path>", "Save the full improve result JSON"],
    ["--min-f1 <n>", "Minimum required trigger F1 score (0-1)", toFloat],
    ["--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", toFloat],
    ["--apply", "Apply the verified rewrite to the source SKILL.md"],
    ["--verbose", "Include detailed baseline and verification reports"]
  ];

  // Action handler: validate CLI options, then hand the merged options to the runner.
  const runImproveAction = async (targetPath, _commandOptions, command) => {
    const globalOptions = getGlobalCliOptions(command);
    const config = getResolvedConfig(command);
    const parsedCli = improveCliSchema.safeParse(command.opts());
    if (!parsedCli.success) {
      const message = parsedCli.error.issues[0]?.message ?? "Invalid improve options.";
      writeError(new Error(message), globalOptions.json);
      process.exitCode = 2;
      return;
    }

    const cli = parsedCli.data;
    const mergedOptions = {
      ...globalOptions,
      provider: config.provider,
      model: config.model,
      apiKey: cli.apiKey,
      queries: cli.queries,
      compare: config.trigger.compare,
      numQueries: config.trigger.numQueries,
      prompts: cli.prompts,
      minF1: config.trigger.threshold,
      minAssertPassRate: config.eval.threshold,
      numRuns: config.eval.numRuns,
      maxToolIterations: config.eval.maxToolIterations,
      concurrency: config.concurrency,
      lintFailOn: config.lint.failOn,
      lintSuppress: config.lint.suppress,
      lintPlugins: config.lint.plugins,
      // An explicit --seed wins over the configured trigger seed.
      triggerSeed: cli.seed ?? config.trigger.seed,
      output: cli.output,
      saveResults: cli.saveResults,
      apply: Boolean(cli.apply),
      verbose: Boolean(cli.verbose)
    };
    await handleImproveCommand(targetPath, mergedOptions, command);
  };

  const improveCommand = program
    .command("improve")
    .description("Rewrite SKILL.md, verify it on frozen test inputs, and optionally apply it.")
    .argument("<path-to-skill>", "Path to SKILL.md or skill directory");

  // Thread the builder's return value through so the chain is identical to a literal `.option().option()...`.
  optionSpecs
    .reduce((builder, spec) => builder.option(...spec), improveCommand)
    .action(runImproveAction);
}
|
|
5368
|
+
|
|
3844
5369
|
// src/index.ts
|
|
3845
5370
|
function resolveVersion() {
|
|
3846
5371
|
try {
|
|
3847
5372
|
const currentFilePath = fileURLToPath(import.meta.url);
|
|
3848
|
-
const packageJsonPath =
|
|
3849
|
-
const raw =
|
|
5373
|
+
const packageJsonPath = path8.resolve(path8.dirname(currentFilePath), "..", "package.json");
|
|
5374
|
+
const raw = fs13.readFileSync(packageJsonPath, "utf8");
|
|
3850
5375
|
const parsed = JSON.parse(raw);
|
|
3851
5376
|
return parsed.version ?? "0.0.0";
|
|
3852
5377
|
} catch {
|
|
@@ -3879,6 +5404,7 @@ async function run(argv) {
|
|
|
3879
5404
|
registerTriggerCommand(program);
|
|
3880
5405
|
registerEvalCommand(program);
|
|
3881
5406
|
registerCheckCommand(program);
|
|
5407
|
+
registerImproveCommand(program);
|
|
3882
5408
|
try {
|
|
3883
5409
|
await program.parseAsync(argv);
|
|
3884
5410
|
} catch (error) {
|