skilltest 0.8.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +8 -1
- package/README.md +175 -12
- package/dist/index.js +1766 -89
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/index.ts
|
|
4
|
-
import
|
|
5
|
-
import
|
|
4
|
+
import fs14 from "node:fs";
|
|
5
|
+
import path9 from "node:path";
|
|
6
6
|
import { fileURLToPath } from "node:url";
|
|
7
7
|
import { Command } from "commander";
|
|
8
8
|
|
|
@@ -100,7 +100,10 @@ function parseFrontmatter(rawSkill) {
|
|
|
100
100
|
}
|
|
101
101
|
/**
 * Loads a skill from `inputPath` and parses it in strict mode.
 *
 * The file is read through `loadSkillFile`; its raw text, root directory and
 * file path are then handed to `parseSkillDocumentStrict`, whose result (or
 * thrown error) is passed straight through to the caller.
 *
 * @param {string} inputPath - Path handed to `loadSkillFile`.
 * @returns {Promise<object>} Whatever `parseSkillDocumentStrict` returns.
 */
async function parseSkillStrict(inputPath) {
  const { raw, skillRoot, skillFile } = await loadSkillFile(inputPath);
  return parseSkillDocumentStrict(raw, skillRoot, skillFile);
}
|
|
105
|
+
function parseSkillDocumentStrict(rawSkill, skillRoot, skillFile) {
|
|
106
|
+
const parsedFrontmatter = parseFrontmatter(rawSkill);
|
|
104
107
|
if (!parsedFrontmatter.hasFrontmatter) {
|
|
105
108
|
throw new Error("SKILL.md is missing YAML frontmatter.");
|
|
106
109
|
}
|
|
@@ -113,9 +116,9 @@ async function parseSkillStrict(inputPath) {
|
|
|
113
116
|
throw new Error(`Invalid frontmatter field '${issue.path.join(".")}': ${issue.message}`);
|
|
114
117
|
}
|
|
115
118
|
return {
|
|
116
|
-
skillRoot
|
|
117
|
-
skillFile
|
|
118
|
-
raw:
|
|
119
|
+
skillRoot,
|
|
120
|
+
skillFile,
|
|
121
|
+
raw: rawSkill,
|
|
119
122
|
content: parsedFrontmatter.content,
|
|
120
123
|
frontmatterRaw: parsedFrontmatter.rawFrontmatter,
|
|
121
124
|
frontmatter: validation.data
|
|
@@ -1515,6 +1518,9 @@ function badgeLabel(status) {
|
|
|
1515
1518
|
/**
 * Renders a status badge `<span>`.
 *
 * The status doubles as a CSS class name and is inserted as-is; the visible
 * text comes from `badgeLabel(status)`.
 *
 * @param {string} status - Status key, used verbatim as a class name.
 * @returns {string} HTML markup for the badge.
 */
function renderBadge(status) {
  const cssClasses = `badge ${status}`;
  const label = badgeLabel(status);
  return `<span class="${cssClasses}">${label}</span>`;
}
|
|
1521
|
+
/**
 * Renders a neutral "meta" badge (styled by the `.meta-badge` CSS class).
 *
 * @param {string} label - Badge text; HTML-escaped before insertion.
 * @returns {string} HTML markup for the badge.
 */
function renderMetaBadge(label) {
  const safeLabel = escapeHtml(label);
  return `<span class="meta-badge">${safeLabel}</span>`;
}
|
|
1518
1524
|
function renderStatCards(stats) {
|
|
1519
1525
|
return `<div class="stats-grid">${stats.map(
|
|
1520
1526
|
(stat) => `
|
|
@@ -1690,10 +1696,37 @@ function promptStatus(promptResult) {
|
|
|
1690
1696
|
return "warn";
|
|
1691
1697
|
}
|
|
1692
1698
|
/**
 * Renders one assertion as a collapsible `<details>` block.
 *
 * The summary line shows a pass/fail badge, a "Tool" meta badge when the
 * assertion's `source` is `"tool"`, and the escaped assertion text. The
 * expanded body shows the assertion evidence in a pre-formatted block.
 *
 * @param {{passed: boolean, source?: string, assertion: string, evidence: string}} assertion
 * @returns {string} HTML markup for the row.
 */
function renderAssertionRow(assertion) {
  const statusBadge = renderBadge(assertion.passed ? "pass" : "fail");
  const sourceBadge = assertion.source === "tool" ? renderMetaBadge("Tool") : "";
  const assertionText = escapeHtml(assertion.assertion);
  const evidenceBlock = renderPreBlock(assertion.evidence);
  return `
    <details class="detail-block">
      <summary>
        ${statusBadge}
        ${sourceBadge}
        <span>${assertionText}</span>
      </summary>
      <div class="detail-content">${evidenceBlock}</div>
    </details>
  `;
}
|
|
1710
|
+
/**
 * Renders the collapsible "Tool Calls" section for one prompt result.
 *
 * Each recorded call is shown with its tool name, the turn it happened in,
 * its JSON-serialized arguments, and the mock response that was returned.
 *
 * @param {{toolCalls?: Array<{name: string, turnIndex: number, arguments: unknown, response: string}>}} promptResult
 * @returns {string} HTML markup, or an empty string when there are no tool calls.
 */
function renderToolCallsSection(promptResult) {
  const calls = promptResult.toolCalls;
  if (!calls || calls.length === 0) {
    return "";
  }

  const renderCall = (toolCall) => {
    const title = escapeHtml(toolCall.name);
    const subtitle = escapeHtml(`turn ${toolCall.turnIndex}`);
    const argumentList = renderDefinitionList([{ label: "Arguments", value: JSON.stringify(toolCall.arguments) }]);
    const mockResponse = renderDetails("Mock response", renderPreBlock(toolCall.response));
    return `
      <div class="tool-call">
        <div class="row-header">
          <div>
            <div class="row-title">${title}</div>
            <div class="row-subtitle">${subtitle}</div>
          </div>
          ${renderMetaBadge("Tool Call")}
        </div>
        ${argumentList}
        ${mockResponse}
      </div>
    `;
  };

  const toolRows = calls.map((toolCall) => renderCall(toolCall)).join("");
  return renderDetails("Tool Calls", `<div class="tool-call-list">${toolRows}</div>`);
}
|
|
1698
1731
|
function renderEvalPromptRow(promptResult) {
|
|
1699
1732
|
const assertionDetails = promptResult.assertions.map((assertion) => renderAssertionRow(assertion)).join("");
|
|
@@ -1712,9 +1745,12 @@ function renderEvalPromptRow(promptResult) {
|
|
|
1712
1745
|
<div class="row-body">${escapeHtml(promptResult.responseSummary)}</div>
|
|
1713
1746
|
${renderDefinitionList([
|
|
1714
1747
|
{ label: "Passed assertions", value: String(promptResult.passedAssertions) },
|
|
1715
|
-
{ label: "Total assertions", value: String(promptResult.totalAssertions) }
|
|
1748
|
+
{ label: "Total assertions", value: String(promptResult.totalAssertions) },
|
|
1749
|
+
...promptResult.toolCalls ? [{ label: "Tool calls", value: String(promptResult.toolCalls.length) }] : [],
|
|
1750
|
+
...promptResult.loopIterations !== void 0 ? [{ label: "Loop iterations", value: String(promptResult.loopIterations) }] : []
|
|
1716
1751
|
])}
|
|
1717
1752
|
${renderDetails("Assertion evidence", assertionDetails || `<p>No assertions.</p>`)}
|
|
1753
|
+
${renderToolCallsSection(promptResult)}
|
|
1718
1754
|
${responseDetails}
|
|
1719
1755
|
</div>
|
|
1720
1756
|
`;
|
|
@@ -1981,6 +2017,20 @@ function renderHtmlDocument(title, body) {
|
|
|
1981
2017
|
background: rgba(107, 114, 128, 0.14);
|
|
1982
2018
|
}
|
|
1983
2019
|
|
|
2020
|
+
.meta-badge {
|
|
2021
|
+
display: inline-flex;
|
|
2022
|
+
align-items: center;
|
|
2023
|
+
justify-content: center;
|
|
2024
|
+
padding: 3px 10px;
|
|
2025
|
+
border-radius: 999px;
|
|
2026
|
+
border: 1px solid rgba(17, 24, 39, 0.16);
|
|
2027
|
+
background: rgba(17, 24, 39, 0.06);
|
|
2028
|
+
color: var(--text);
|
|
2029
|
+
font-size: 0.76rem;
|
|
2030
|
+
font-weight: 700;
|
|
2031
|
+
white-space: nowrap;
|
|
2032
|
+
}
|
|
2033
|
+
|
|
1984
2034
|
details {
|
|
1985
2035
|
margin-top: 10px;
|
|
1986
2036
|
}
|
|
@@ -1995,6 +2045,13 @@ function renderHtmlDocument(title, body) {
|
|
|
1995
2045
|
padding-top: 10px;
|
|
1996
2046
|
}
|
|
1997
2047
|
|
|
2048
|
+
.detail-block summary {
|
|
2049
|
+
display: flex;
|
|
2050
|
+
align-items: center;
|
|
2051
|
+
gap: 8px;
|
|
2052
|
+
flex-wrap: wrap;
|
|
2053
|
+
}
|
|
2054
|
+
|
|
1998
2055
|
.detail-content p {
|
|
1999
2056
|
margin: 0;
|
|
2000
2057
|
}
|
|
@@ -2045,6 +2102,18 @@ function renderHtmlDocument(title, body) {
|
|
|
2045
2102
|
overflow-wrap: anywhere;
|
|
2046
2103
|
}
|
|
2047
2104
|
|
|
2105
|
+
.tool-call-list {
|
|
2106
|
+
display: grid;
|
|
2107
|
+
gap: 12px;
|
|
2108
|
+
}
|
|
2109
|
+
|
|
2110
|
+
.tool-call {
|
|
2111
|
+
border: 1px solid var(--border);
|
|
2112
|
+
border-radius: 12px;
|
|
2113
|
+
padding: 14px;
|
|
2114
|
+
background: #fffaf0;
|
|
2115
|
+
}
|
|
2116
|
+
|
|
2048
2117
|
ul {
|
|
2049
2118
|
margin: 0;
|
|
2050
2119
|
padding-left: 20px;
|
|
@@ -2246,6 +2315,76 @@ function renderCheckHtml(result) {
|
|
|
2246
2315
|
);
|
|
2247
2316
|
return renderHtmlDocument(`skilltest check - ${skillName}`, [header, lintSection, triggerSection, evalSection, qualityGate].join(""));
|
|
2248
2317
|
}
|
|
2318
|
+
/**
 * Renders the routing confusion matrix as an HTML table.
 *
 * Rows are the target skills; columns are every skill plus a trailing
 * "none" column. Each cell shows `result.matrixPct[row][column]` (0 when the
 * entry is missing). The diagonal is tinted green, off-diagonal shares above
 * 15% red, and shares above 5% yellow.
 *
 * @param {{skills: string[], matrixPct: Record<string, Record<string, number>>}} result
 * @returns {string} Self-contained markup (inline `<style>` plus the table).
 */
function renderRouteMatrix(result) {
  const columns = [...result.skills, "none"];

  // Background tint for a single cell; the diagonal wins over the thresholds.
  const cellStyle = (share, onDiagonal) => {
    if (onDiagonal) {
      return "background:rgba(34,197,94,0.18);";
    }
    if (share > 0.15) {
      return "background:rgba(239,68,68,0.18);";
    }
    if (share > 0.05) {
      return "background:rgba(234,179,8,0.12);";
    }
    return "";
  };

  const headerCells = columns.map((column) => `<th>${escapeHtml(column)}</th>`).join("");

  const bodyRows = result.skills.map((target) => {
    const cells = columns.map((column) => {
      const share = result.matrixPct[target]?.[column] ?? 0;
      const style = cellStyle(share, column === target);
      return `<td style="${style}">${escapeHtml(formatPercent(share))}</td>`;
    }).join("");
    return `<tr><th>${escapeHtml(target)}</th>${cells}</tr>`;
  }).join("");

  return `<style>.rt{border-collapse:collapse;font-size:.85rem;width:100%}.rt th,.rt td{border:1px solid #d4d4d8;padding:8px 12px;text-align:center}.rt thead th{background:#fafafa;font-weight:700}</style><div style="overflow-x:auto"><table class="rt"><thead><tr><th></th>${headerCells}</tr></thead><tbody>${bodyRows}</tbody></table></div>`;
}
|
|
2332
|
+
/**
 * Builds the full HTML report for the `route` command.
 *
 * Sections, in order: header card (accuracy / conflict stats plus provider
 * metadata), routing matrix, per-skill metrics, conflicts (only when any
 * exist) and suggestions.
 *
 * @param {object} result - Routing result; this function reads `skills`,
 *   `skillDir`, `overallAccuracy`, `conflicts`, `numQueriesPerSkill`,
 *   `provider`, `model`, `seed`, `perSkillMetrics`, `matrixPct` (via
 *   `renderRouteMatrix`) and `suggestions`.
 * @returns {string} Complete HTML document.
 */
function renderRouteHtml(result) {
  // Accuracy and F1 share the same 0.8 pass threshold.
  const scoreStatus = (score) => (score >= 0.8 ? "pass" : "warn");
  const conflictCount = result.conflicts.length;

  const summaryStats = [
    { label: "Overall accuracy", value: formatPercent(result.overallAccuracy), status: scoreStatus(result.overallAccuracy) },
    { label: "Conflicts", value: String(conflictCount), status: conflictCount === 0 ? "pass" : "warn" },
    { label: "Skills", value: String(result.skills.length) },
    { label: "Queries/skill", value: String(result.numQueriesPerSkill) }
  ];
  const runMetadata = [
    { label: "Provider", value: result.provider },
    { label: "Model", value: result.model },
    { label: "Seed", value: result.seed !== void 0 ? String(result.seed) : "none" }
  ];
  const header = renderHeaderCard(
    "route",
    `Routing Report \u2014 ${result.skills.length} skills`,
    result.skillDir,
    summaryStats,
    runMetadata
  );

  const matrixSection = renderSectionCard("Routing Matrix", renderRouteMatrix(result));

  const renderMetricRow = (m) => {
    const details = renderDefinitionList([
      { label: "Queries", value: String(m.queriesTotal) },
      { label: "Correct", value: String(m.correct) },
      { label: "Precision", value: formatPercent(m.precision) },
      { label: "Recall", value: formatPercent(m.recall) }
    ]);
    return renderMessageRow(
      scoreStatus(m.f1),
      m.skill,
      `F1: ${formatPercent(m.f1)} precision: ${formatPercent(m.precision)} recall: ${formatPercent(m.recall)}`,
      details
    );
  };
  const metricsRows = result.perSkillMetrics.map((m) => renderMetricRow(m)).join("");
  const metricsSection = renderSectionCard("Per-Skill Metrics", `<div class="row-list">${metricsRows}</div>`);

  // NOTE(review): skill names are pre-escaped here but passed raw (`m.skill`)
  // for the metric rows above. Depending on whether renderMessageRow escapes
  // its arguments, one of the two call sites is either double-escaping or
  // under-escaping - confirm against renderMessageRow.
  const renderConflictRow = (conflict) => {
    const skillA = escapeHtml(conflict.skillA);
    const skillB = escapeHtml(conflict.skillB);
    return renderMessageRow(
      "warn",
      `${skillA} \u2194 ${skillB}`,
      `${formatPercent(conflict.bleedAtoB)} of ${skillA} queries routed to ${skillB}; ${formatPercent(conflict.bleedBtoA)} the other way`
    );
  };
  let conflictsSection = "";
  if (conflictCount > 0) {
    const conflictRows = result.conflicts.map((conflict) => renderConflictRow(conflict)).join("");
    conflictsSection = renderSectionCard("Conflicts", `<div class="row-list">${conflictRows}</div>`);
  }

  const suggestionItems = result.suggestions.map((s) => `<li>${escapeHtml(s)}</li>`).join("");
  const suggestionsSection = renderSectionCard("Suggestions", `<ul>${suggestionItems}</ul>`);

  const body = [header, matrixSection, metricsSection, conflictsSection, suggestionsSection].join("");
  return renderHtmlDocument(`skilltest route \u2014 ${result.skillDir}`, body);
}
|
|
2249
2388
|
|
|
2250
2389
|
// src/reporters/terminal.ts
|
|
2251
2390
|
import { Chalk } from "chalk";
|
|
@@ -2270,6 +2409,70 @@ function countSkippedSecurityPatterns2(issues) {
|
|
|
2270
2409
|
/**
 * Formats a fraction as a percentage with one decimal place.
 *
 * @param {number} value - Fraction, e.g. `0.5` for 50%.
 * @returns {string} Percentage text such as `"50.0%"`.
 */
function formatPercent2(value) {
  const percentage = value * 100;
  return percentage.toFixed(1) + "%";
}
|
|
2412
|
+
/**
 * Formats a number with a fixed number of decimals and an explicit `+` for
 * strictly positive values (negative values keep their own `-`, zero gets no
 * sign).
 *
 * @param {number} value - Number to format.
 * @param {number} [digits=4] - Decimal places passed to `toFixed`.
 * @returns {string} Signed, fixed-point text such as `"+0.2500"`.
 */
function formatSignedNumber(value, digits = 4) {
  const fixed = value.toFixed(digits);
  if (value > 0) {
    return `+${fixed}`;
  }
  return fixed;
}
|
|
2416
|
+
/**
 * Computes a line-level diff between two texts and returns only the lines
 * that changed.
 *
 * Both texts are split on `\n` / `\r\n`. A longest-common-subsequence table
 * is filled from the end of both line lists, then walked from the start:
 * matching lines are skipped, and at each mismatch the walk drops a "before"
 * line when doing so keeps at least as long a common subsequence as dropping
 * an "after" line (so removals are emitted before additions on ties).
 *
 * @param {string} beforeText - Original text.
 * @param {string} afterText - Updated text.
 * @returns {Array<{type: "-" | "+", line: string}>} Removed (`-`) and added
 *   (`+`) lines in walk order; empty when the line lists are identical.
 */
function diffChangedLines(beforeText, afterText) {
  const oldLines = beforeText.split(/\r?\n/);
  const newLines = afterText.split(/\r?\n/);
  const oldCount = oldLines.length;
  const newCount = newLines.length;

  // lcs[i][j] = LCS length of oldLines[i..] and newLines[j..]. The extra
  // last row/column stays 0 and serves as the base case.
  const lcs = Array.from({ length: oldCount + 1 }, () => new Array(newCount + 1).fill(0));
  for (let i = oldCount - 1; i >= 0; i -= 1) {
    for (let j = newCount - 1; j >= 0; j -= 1) {
      lcs[i][j] = oldLines[i] === newLines[j]
        ? lcs[i + 1][j + 1] + 1
        : Math.max(lcs[i + 1][j], lcs[i][j + 1]);
    }
  }

  const changes = [];
  let i = 0;
  let j = 0;
  while (i < oldCount || j < newCount) {
    const bothRemain = i < oldCount && j < newCount;
    if (bothRemain && oldLines[i] === newLines[j]) {
      // Unchanged line: advance on both sides without recording anything.
      i += 1;
      j += 1;
    } else if (j >= newCount || (bothRemain && lcs[i + 1][j] >= lcs[i][j + 1])) {
      // Only old lines remain, or removing keeps the LCS at least as long.
      changes.push({ type: "-", line: oldLines[i] ?? "" });
      i += 1;
    } else {
      changes.push({ type: "+", line: newLines[j] ?? "" });
      j += 1;
    }
  }
  return changes;
}
|
|
2458
|
+
/**
 * Produces terminal lines previewing the changed lines between two texts.
 *
 * At most `maxLines` changed lines are listed; when more exist, a trailing
 * line reports how many were omitted.
 *
 * @param {string} beforeText - Original text.
 * @param {string} afterText - Updated text.
 * @param {number} [maxLines=40] - Maximum number of changed lines to list.
 * @returns {string[]} Preview lines, or a single "(no content changes)" line.
 */
function renderDiffPreview(beforeText, afterText, maxLines = 40) {
  const changes = diffChangedLines(beforeText, afterText);
  if (changes.length === 0) {
    return [" (no content changes)"];
  }

  const preview = [];
  for (const { type, line } of changes.slice(0, maxLines)) {
    preview.push(` ${type} ${line}`);
  }

  const omitted = changes.length - maxLines;
  if (omitted > 0) {
    preview.push(` ... ${omitted} more changed line(s)`);
  }
  return preview;
}
|
|
2469
|
+
/**
 * Summarizes tool calls as `"name xCount"` pairs joined by `", "`.
 *
 * Tools are listed in the order each name was first seen.
 *
 * @param {Iterable<{name: string}>} toolCalls - Recorded tool calls.
 * @returns {string} Summary text; empty string when there are no calls.
 */
function summarizeToolCalls(toolCalls) {
  // A Map keeps first-seen order for any tool name.
  const tally = /* @__PURE__ */ new Map();
  for (const { name } of toolCalls) {
    const seenSoFar = tally.get(name) ?? 0;
    tally.set(name, seenSoFar + 1);
  }
  const parts = [];
  for (const [name, count] of tally) {
    parts.push(`${name} x${count}`);
  }
  return parts.join(", ");
}
|
|
2273
2476
|
function renderLintReport(report, enableColor) {
|
|
2274
2477
|
const c = getChalkInstance(enableColor);
|
|
2275
2478
|
const { passed, warnings, failures, total } = report.summary;
|
|
@@ -2330,12 +2533,25 @@ function renderEvalReport(result, enableColor, verbose) {
|
|
|
2330
2533
|
for (const [index, promptResult] of result.results.entries()) {
|
|
2331
2534
|
lines.push(`${index + 1}. prompt: ${promptResult.prompt}`);
|
|
2332
2535
|
lines.push(` response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`);
|
|
2536
|
+
if (promptResult.toolCalls) {
|
|
2537
|
+
lines.push(` Tools: ${promptResult.toolCalls.length} calls (${summarizeToolCalls(promptResult.toolCalls)})`);
|
|
2538
|
+
if (promptResult.loopIterations !== void 0) {
|
|
2539
|
+
lines.push(` loop iterations: ${promptResult.loopIterations}`);
|
|
2540
|
+
}
|
|
2541
|
+
}
|
|
2333
2542
|
for (const assertion of promptResult.assertions) {
|
|
2334
2543
|
const status = assertion.passed ? c.green("PASS") : c.red("FAIL");
|
|
2335
2544
|
lines.push(` ${status} ${assertion.assertion}`);
|
|
2336
2545
|
lines.push(` evidence: ${assertion.evidence}`);
|
|
2337
2546
|
}
|
|
2338
2547
|
if (verbose) {
|
|
2548
|
+
if (promptResult.toolCalls) {
|
|
2549
|
+
for (const toolCall of promptResult.toolCalls) {
|
|
2550
|
+
lines.push(` tool ${toolCall.turnIndex}: ${toolCall.name}`);
|
|
2551
|
+
lines.push(` arguments: ${JSON.stringify(toolCall.arguments)}`);
|
|
2552
|
+
lines.push(` response: ${toolCall.response}`);
|
|
2553
|
+
}
|
|
2554
|
+
}
|
|
2339
2555
|
lines.push(` full response: ${promptResult.response}`);
|
|
2340
2556
|
}
|
|
2341
2557
|
}
|
|
@@ -2412,6 +2628,12 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2412
2628
|
}
|
|
2413
2629
|
lines.push(` - prompt: ${promptResult.prompt}`);
|
|
2414
2630
|
lines.push(` response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`);
|
|
2631
|
+
if (promptResult.toolCalls) {
|
|
2632
|
+
lines.push(` Tools: ${promptResult.toolCalls.length} calls (${summarizeToolCalls(promptResult.toolCalls)})`);
|
|
2633
|
+
if (promptResult.loopIterations !== void 0) {
|
|
2634
|
+
lines.push(` loop iterations: ${promptResult.loopIterations}`);
|
|
2635
|
+
}
|
|
2636
|
+
}
|
|
2415
2637
|
const assertionsToRender = verbose ? promptResult.assertions : failedAssertions;
|
|
2416
2638
|
for (const assertion of assertionsToRender) {
|
|
2417
2639
|
const assertionStatus = assertion.passed ? c.green("PASS") : c.red("FAIL");
|
|
@@ -2419,6 +2641,13 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2419
2641
|
lines.push(` evidence: ${assertion.evidence}`);
|
|
2420
2642
|
}
|
|
2421
2643
|
if (verbose) {
|
|
2644
|
+
if (promptResult.toolCalls) {
|
|
2645
|
+
for (const toolCall of promptResult.toolCalls) {
|
|
2646
|
+
lines.push(` tool ${toolCall.turnIndex}: ${toolCall.name}`);
|
|
2647
|
+
lines.push(` arguments: ${JSON.stringify(toolCall.arguments)}`);
|
|
2648
|
+
lines.push(` response: ${toolCall.response}`);
|
|
2649
|
+
}
|
|
2650
|
+
}
|
|
2422
2651
|
lines.push(` full response: ${promptResult.response}`);
|
|
2423
2652
|
}
|
|
2424
2653
|
}
|
|
@@ -2433,6 +2662,137 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2433
2662
|
lines.push(`- overall: ${overallGate}`);
|
|
2434
2663
|
return lines.join("\n");
|
|
2435
2664
|
}
|
|
2665
|
+
/**
 * Renders the terminal report for the `improve` command.
 *
 * Output order: run header and status, the candidate's change summary and
 * targeted problems, before/after metric deltas (only when both `delta` and
 * `verification` are present), the output path, the blocked reason, a diff
 * preview of the candidate against the original, and - in verbose mode - the
 * full baseline and verification check reports.
 *
 * @param {object} result - Improve result as produced by the improve command.
 * @param {boolean} enableColor - Forwarded to `getChalkInstance`.
 * @param {boolean} [verbose=false] - Append full check reports when true.
 * @returns {string} Newline-joined report text.
 */
function renderImproveReport(result, enableColor, verbose = false) {
  const c = getChalkInstance(enableColor);
  const gateLabel = (passed) => (passed ? c.green("PASS") : c.red("FAIL"));

  const lines = [
    "skilltest improve",
    `target: ${result.target}`,
    `provider/model: ${result.provider}/${result.model}`,
    `thresholds: min-f1=${result.thresholds.minF1.toFixed(2)} min-assert-pass-rate=${result.thresholds.minAssertPassRate.toFixed(2)}`
  ];
  // Every section is preceded by a blank separator line.
  const startSection = (title) => {
    lines.push("", title);
  };
  const pushBullets = (items) => {
    for (const item of items) {
      lines.push(`- ${item}`);
    }
  };

  let statusLabel;
  if (result.blockedReason) {
    statusLabel = c.red("BLOCKED");
  } else if (result.applied) {
    statusLabel = c.green("APPLIED");
  } else {
    statusLabel = c.green("VERIFIED");
  }
  lines.push(`status: ${statusLabel}`);

  if (result.candidate) {
    startSection("Change Summary");
    pushBullets(result.candidate.changeSummary);
    startSection("Targeted Problems");
    pushBullets(result.candidate.targetedProblems);
  }

  if (result.delta && result.verification) {
    const { lintFailures, lintWarnings, triggerF1, evalAssertPassRate, overallPassed } = result.delta;
    startSection("Before / After");
    lines.push(
      `- lint failures: ${lintFailures.before} -> ${lintFailures.after} (${formatSignedNumber(lintFailures.delta, 0)})`,
      `- lint warnings: ${lintWarnings.before} -> ${lintWarnings.after} (${formatSignedNumber(lintWarnings.delta, 0)})`,
      `- trigger f1: ${formatPercent2(triggerF1.before)} -> ${formatPercent2(triggerF1.after)} (${formatSignedNumber(triggerF1.delta)})`,
      `- eval assertion pass rate: ${formatPercent2(evalAssertPassRate.before)} -> ${formatPercent2(evalAssertPassRate.after)} (${formatSignedNumber(evalAssertPassRate.delta)})`,
      `- overall gate: ${gateLabel(overallPassed.before)} -> ${gateLabel(overallPassed.after)}`
    );
  }

  if (result.outputPath) {
    lines.push("", `output: ${result.outputPath}`);
  }

  if (result.blockedReason) {
    startSection("Blocked");
    lines.push(`- ${result.blockedReason}`);
  }

  if (result.candidate) {
    startSection("Diff Preview");
    lines.push(...renderDiffPreview(result.originalRaw, result.candidate.raw));
  }

  if (verbose) {
    startSection("Baseline");
    lines.push(renderCheckReport(result.baseline, enableColor, true));
    if (result.verification) {
      startSection("Verification");
      lines.push(renderCheckReport(result.verification, enableColor, true));
    }
  }

  return lines.join("\n");
}
|
|
2732
|
+
/**
 * Renders the terminal report for the `route` command.
 *
 * Output order: run header, per-skill metrics, the routing matrix (rows are
 * target skills, columns are the skill each query was routed to, plus
 * "none"), detected conflicts, overall accuracy, suggestions and - in verbose
 * mode - every individual test case.
 *
 * The matrix header and every matrix row share one label-gutter formatter so
 * that column headers line up with their cells; labels that would overflow
 * the gutter are truncated the same way column headers already are.
 *
 * @param {object} result - Routing result; reads `skillDir`, `provider`,
 *   `model`, `skills`, `numQueriesPerSkill`, `perSkillMetrics`, `matrixPct`,
 *   `conflicts`, `overallAccuracy`, `suggestions` and (verbose only) `cases`.
 * @param {boolean} enableColor - Forwarded to `getChalkInstance`.
 * @param {boolean} verbose - Append the per-case listing when truthy.
 * @returns {string} Newline-joined report text.
 */
function renderRouteReport(result, enableColor, verbose) {
  const c = getChalkInstance(enableColor);
  const lines = [
    "skilltest route",
    `directory: ${result.skillDir}`,
    `provider/model: ${result.provider}/${result.model}`,
    `skills: ${result.skills.length} queries per skill: ${result.numQueriesPerSkill}`
  ];

  lines.push("");
  lines.push("Per-skill metrics:");
  for (const m of result.perSkillMetrics) {
    const badge = m.f1 >= 0.8 ? c.green("PASS") : c.yellow("WARN");
    lines.push(
      ` ${m.skill.padEnd(24)} F1: ${formatPercent2(m.f1).padEnd(7)} precision: ${formatPercent2(m.precision).padEnd(7)} recall: ${formatPercent2(m.recall)} [${badge}]`
    );
  }

  lines.push("");
  lines.push("Routing matrix (% of row queries routed to column):");
  const colHeaders = [...result.skills, "none"];
  const colWidth = 10;
  const rowLabelWidth = 24;
  // Single source of truth for the label gutter. Previously the header was
  // indented *in addition to* the gutter while rows were indented *inside*
  // it, which shifted every column header right of its cells; over-long
  // skill names also pushed a row's cells out of their columns.
  const formatRowLabel = (label) => (" " + label).slice(0, rowLabelWidth - 1).padEnd(rowLabelWidth);
  const headerRow = formatRowLabel("") + colHeaders.map((h) => h.slice(0, colWidth - 1).padEnd(colWidth)).join("");
  lines.push(headerRow);
  for (const targetSkill of result.skills) {
    const cells = colHeaders.map((col) => {
      const pct = result.matrixPct[targetSkill]?.[col] ?? 0;
      // Pad before coloring so ANSI escape codes do not count toward width.
      const formatted = formatPercent2(pct).padEnd(colWidth);
      if (col === targetSkill) return c.green(formatted);
      if (pct > 0.1) return c.yellow(formatted);
      return formatted;
    }).join("");
    lines.push(formatRowLabel(targetSkill) + cells);
  }

  if (result.conflicts.length > 0) {
    lines.push("");
    lines.push("Conflicts detected:");
    for (const conflict of result.conflicts) {
      lines.push(
        ` ${conflict.skillA} <-> ${conflict.skillB} ${formatPercent2(conflict.bleedAtoB)} / ${formatPercent2(conflict.bleedBtoA)} bleed [${c.yellow("WARN")}]`
      );
    }
  }

  lines.push("");
  lines.push(`Overall accuracy: ${formatPercent2(result.overallAccuracy)}`);
  lines.push("");
  lines.push("Suggestions:");
  for (const suggestion of result.suggestions) {
    lines.push(`- ${suggestion}`);
  }

  if (verbose) {
    lines.push("");
    lines.push("Cases:");
    for (const [index, testCase] of result.cases.entries()) {
      const status = testCase.correct ? c.green("PASS") : c.red("FAIL");
      lines.push(` ${index + 1}. ${status} [${testCase.targetSkill}] ${testCase.query}`);
      lines.push(` routed to: ${testCase.actualSkill}`);
      if (testCase.rawModelResponse) {
        // Collapse whitespace so a multi-line model reply stays on one line.
        lines.push(` model: ${testCase.rawModelResponse.replace(/\s+/g, " ").trim()}`);
      }
    }
  }
  return lines.join("\n");
}
|
|
2436
2796
|
|
|
2437
2797
|
// src/commands/common.ts
|
|
2438
2798
|
import fs6 from "node:fs/promises";
|
|
@@ -2504,7 +2864,10 @@ function parseGraderOutput(raw) {
|
|
|
2504
2864
|
/**
 * Grades a model response by sending grader prompts to the provider and
 * parsing the reply into assertion results.
 *
 * Every parsed assertion is tagged with `source: "grader"` (overriding any
 * `source` the parser produced).
 *
 * @param {object} options - Passed to `buildGraderPrompts`; `options.provider`
 *   must expose `sendMessage(systemPrompt, userPrompt, { model })` and
 *   `options.model` selects the grading model.
 * @returns {Promise<object[]>} Parsed assertion results.
 */
async function gradeResponse(options) {
  const { systemPrompt, userPrompt } = buildGraderPrompts(options);
  const raw = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
  const graded = parseGraderOutput(raw);
  return graded.map((assertion) => Object.assign({}, assertion, { source: "grader" }));
}
|
|
2509
2872
|
|
|
2510
2873
|
// src/utils/concurrency.ts
|
|
@@ -2559,12 +2922,290 @@ async function pMap(items, fn, concurrency) {
|
|
|
2559
2922
|
});
|
|
2560
2923
|
}
|
|
2561
2924
|
|
|
2925
|
+
// src/core/tool-environment.ts
|
|
2926
|
+
/**
 * Tells whether `value` is a non-null, non-array object.
 *
 * Note that this accepts any object (class instances, Maps, Dates, ...); it
 * only rules out `null`, arrays and primitives.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} True for non-null, non-array objects.
 */
function isPlainObject(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
2929
|
+
/**
 * Structural equality for JSON-like values.
 *
 * Arrays are equal when they have the same length and pairwise-equal items.
 * Objects (per `isPlainObject`) are equal when they have the same own
 * enumerable keys and pairwise-equal values. Everything else falls back to
 * strict equality (`===`).
 *
 * @param {unknown} left - First value.
 * @param {unknown} right - Second value.
 * @returns {boolean} True when the two values are structurally equal.
 */
function deepEqual(left, right) {
  if (Array.isArray(left) && Array.isArray(right)) {
    if (left.length !== right.length) {
      return false;
    }
    return left.every((item, index) => deepEqual(item, right[index]));
  }
  if (isPlainObject(left) && isPlainObject(right)) {
    const leftKeys = Object.keys(left);
    const rightKeys = Object.keys(right);
    if (leftKeys.length !== rightKeys.length) {
      return false;
    }
    // Equal key counts are not enough: require each left key to be an own
    // key of `right`. Otherwise `{ a: undefined }` would equal
    // `{ b: undefined }`, because a missing key also reads as `undefined`.
    return leftKeys.every(
      (key) => Object.prototype.hasOwnProperty.call(right, key) && deepEqual(left[key], right[key])
    );
  }
  return left === right;
}
|
|
2946
|
+
/**
 * Checks whether `actual` satisfies the `expected` pattern.
 *
 * - Expected arrays must match position by position and have the same
 *   length as the actual array.
 * - Expected objects only constrain the keys they list; extra keys on
 *   `actual` are ignored (subset match), recursively.
 * - Any other expected value is compared with `deepEqual`.
 *
 * @param {unknown} actual - Arguments the model actually supplied.
 * @param {unknown} expected - Pattern to match against.
 * @returns {boolean} True when `actual` satisfies `expected`.
 */
function matchesArgumentSubset(actual, expected) {
  if (Array.isArray(expected)) {
    return (
      Array.isArray(actual) &&
      actual.length === expected.length &&
      expected.every((item, position) => matchesArgumentSubset(actual[position], item))
    );
  }
  if (!isPlainObject(expected)) {
    return deepEqual(actual, expected);
  }
  if (!isPlainObject(actual)) {
    return false;
  }
  for (const key of Object.keys(expected)) {
    if (!matchesArgumentSubset(actual[key], expected[key])) {
      return false;
    }
  }
  return true;
}
|
|
2961
|
+
/**
 * Parses a mock-response key into an argument pattern object.
 *
 * @param {string} pattern - Key from a tool's `responses` map.
 * @returns {object | null} The parsed object, or `null` when the key is the
 *   `"*"` wildcard, is not valid JSON, or parses to something other than a
 *   non-array object.
 */
function parseResponsePattern(pattern) {
  if (pattern === "*") {
    return null;
  }
  try {
    const candidate = JSON.parse(pattern);
    if (isPlainObject(candidate)) {
      return candidate;
    }
    return null;
  } catch {
    // Keys that are not JSON are simply not partial-match patterns.
    return null;
  }
}
|
|
2972
|
+
/**
 * Builds the placeholder response used when a mock tool has no configured
 * response for the given arguments.
 *
 * @param {{name: string}} tool - Mock tool definition.
 * @param {unknown} args - Arguments the tool was called with.
 * @returns {string} Human-readable `[mock] ...` message.
 */
function renderFallbackResponse(tool, args) {
  const { name } = tool;
  const serializedArgs = JSON.stringify(args);
  return `[mock] No mock response configured for tool '${name}' with arguments: ${serializedArgs}`;
}
|
|
2975
|
+
/**
 * Picks the mock response for a tool call.
 *
 * Resolution order:
 *   1. a `responses` key equal to `JSON.stringify(args)` (exact match);
 *   2. the JSON-object key that `args` satisfies as a subset and that lists
 *      the most top-level keys (the first such key wins ties);
 *   3. the `"*"` wildcard key;
 *   4. a generated `[mock]` fallback message.
 *
 * @param {{name: string, responses: Record<string, string>}} tool - Mock tool.
 * @param {unknown} args - Arguments supplied by the model.
 * @returns {string} The selected response text.
 */
function resolveToolResponse(tool, args) {
  const { responses } = tool;

  const exact = responses[JSON.stringify(args)];
  if (exact !== void 0) {
    return exact;
  }

  let best = null;
  for (const pattern of Object.keys(responses)) {
    // The wildcard is handled last; non-object keys never partially match.
    const parsed = pattern === "*" ? null : parseResponsePattern(pattern);
    if (!parsed || !matchesArgumentSubset(args, parsed)) {
      continue;
    }
    const specificity = Object.keys(parsed).length;
    if (best === null || specificity > best.specificity) {
      best = { specificity, response: responses[pattern] };
    }
  }
  if (best !== null) {
    return best.response;
  }

  const wildcard = responses["*"];
  return wildcard !== void 0 ? wildcard : renderFallbackResponse(tool, args);
}
|
|
3007
|
+
/**
 * Converts mock tool definitions into provider-facing tool definitions whose
 * `parameters` field is a JSON-Schema-style object description.
 *
 * @param {Array<{name: string, description: string, parameters?: Array<{name: string, type: string, description: string, required?: boolean}>}>} mockTools
 * @returns {Array<{name: string, description: string, parameters: {type: "object", properties: object, required: string[]}}>}
 */
function toProviderToolDefinitions(mockTools) {
  const toPropertyEntry = ({ name, type, description }) => [name, { type, description }];

  const toDefinition = (tool) => {
    // Tools declared without parameters get an empty object schema.
    const declared = tool.parameters ?? [];
    const requiredNames = declared
      .filter((parameter) => parameter.required)
      .map((parameter) => parameter.name);
    return {
      name: tool.name,
      description: tool.description,
      parameters: {
        type: "object",
        properties: Object.fromEntries(declared.map(toPropertyEntry)),
        required: requiredNames
      }
    };
  };

  return mockTools.map((tool) => toDefinition(tool));
}
|
|
3029
|
+
/**
 * Converts a provider response into the assistant message(s) to append to
 * the conversation history.
 *
 * The message carries a text block when the response has non-blank text,
 * followed by one `tool_use` block per requested tool call.
 *
 * @param {{textContent: string, toolUseBlocks: Array<{id: string, name: string, arguments: unknown}>}} response
 * @returns {Array<{role: "assistant", content: object[]}>} A one-element
 *   array, or an empty array when the response has neither text nor tool use.
 */
function toAssistantConversationBlocks(response) {
  const hasText = response.textContent.trim().length > 0;
  const textBlocks = hasText ? [{ type: "text", text: response.textContent }] : [];
  const toolBlocks = [...response.toolUseBlocks].map((block) => ({
    type: "tool_use",
    id: block.id,
    name: block.name,
    input: block.arguments
  }));

  const content = [...textBlocks, ...toolBlocks];
  if (content.length === 0) {
    return [];
  }
  return [{ role: "assistant", content }];
}
|
|
3052
|
+
/**
 * Drives a model/tool conversation against mock tools until the model stops
 * requesting tools or the iteration cap is hit.
 *
 * Each turn sends the conversation so far to `provider.sendWithTools`. If
 * the reply requests tools, every request is answered from the mock
 * definitions (or with a "[mock] No tool named ..." message for unknown
 * tools), recorded in `toolCalls`, and fed back as a user message of
 * `tool_result` blocks. When the cap is reached, a termination note is
 * appended to the last non-blank text the model produced.
 *
 * @param {object} options
 * @param {object} options.provider - Must expose `sendWithTools(systemPrompt, messages, { model, tools })`.
 * @param {string} options.model - Model identifier forwarded to the provider.
 * @param {string} options.systemPrompt - System prompt for every turn.
 * @param {string} options.userMessage - Initial user message.
 * @param {Array<object>} options.tools - Mock tool definitions.
 * @param {number} [options.maxIterations=10] - Maximum number of provider turns.
 * @returns {Promise<{finalResponse: string, toolCalls: object[], loopIterations: number}>}
 */
async function runWithTools(options) {
  const maxIterations = options.maxIterations ?? 10;
  const registry = new Map(options.tools.map((tool) => [tool.name, tool]));
  const providerTools = toProviderToolDefinitions(options.tools);
  const messages = [{ role: "user", content: options.userMessage }];
  const toolCalls = [];
  let finalResponse = "";
  let loopIterations = 0;

  // Answers one tool request from the mock definitions and records the call.
  const answerToolUse = (toolUse, turnIndex) => {
    const tool = registry.get(toolUse.name);
    const resolvedResponse = tool
      ? resolveToolResponse(tool, toolUse.arguments)
      : `[mock] No tool named '${toolUse.name}' is registered.`;
    toolCalls.push({
      name: toolUse.name,
      arguments: toolUse.arguments,
      response: resolvedResponse,
      turnIndex
    });
    return {
      type: "tool_result",
      tool_use_id: toolUse.id,
      content: resolvedResponse
    };
  };

  for (let turn = 0; turn < maxIterations; turn += 1) {
    loopIterations = turn + 1;
    const response = await options.provider.sendWithTools(options.systemPrompt, messages, {
      model: options.model,
      tools: providerTools
    });

    // Keep the latest non-blank text; a blank turn does not erase earlier text.
    if (response.textContent.trim().length > 0) {
      finalResponse = response.textContent;
    }
    if (response.toolUseBlocks.length === 0) {
      return { finalResponse, toolCalls, loopIterations };
    }

    messages.push(...toAssistantConversationBlocks(response));
    const toolResultBlocks = [];
    for (const toolUse of response.toolUseBlocks) {
      toolResultBlocks.push(answerToolUse(toolUse, loopIterations));
    }
    messages.push({ role: "user", content: toolResultBlocks });
  }

  const terminationNote = `[skilltest: tool loop terminated after ${maxIterations} iterations]`;
  finalResponse = finalResponse ? `${finalResponse}\n\n${terminationNote}` : terminationNote;
  return { finalResponse, toolCalls, loopIterations };
}
|
|
3108
|
+
|
|
2562
3109
|
// src/core/eval-runner.ts
|
|
3110
|
+
// Schema for one declared parameter of a mock tool.
var toolParameterSchema = z3.object({
  // Parameter name; must be non-empty.
  name: z3.string().min(1),
  // One of the five accepted type names.
  type: z3.enum(["string", "number", "boolean", "object", "array"]),
  // Description; must be non-empty.
  description: z3.string().min(1),
  // Optional flag marking the parameter as required.
  required: z3.boolean().optional()
});
|
|
3116
|
+
// Schema for a mock tool attached to an eval prompt.
var mockToolDefinitionSchema = z3.object({
  // Tool name; must be non-empty.
  name: z3.string().min(1),
  // Tool description; must be non-empty.
  description: z3.string().min(1),
  // Optional list of declared parameters.
  parameters: z3.array(toolParameterSchema).optional(),
  // Record of string keys to string responses.
  responses: z3.record(z3.string())
});
|
|
3122
|
+
// Schema for a structural assertion about tool calls. The base object keeps
// every per-type field optional; the refinement below enforces which fields
// each assertion type actually requires.
var toolAssertionSchema = z3.object({
  type: z3.enum(["tool_called", "tool_not_called", "tool_call_order", "tool_argument_match"]),
  toolName: z3.string().min(1).optional(),
  toolNames: z3.array(z3.string().min(1)).optional(),
  expectedArgs: z3.record(z3.unknown()).optional(),
  description: z3.string().min(1)
}).superRefine((value, context) => {
  const reportIssue = (message) => {
    context.addIssue({
      code: z3.ZodIssueCode.custom,
      message
    });
  };

  // Every type except tool_call_order targets a single named tool.
  const needsToolName = ["tool_called", "tool_not_called", "tool_argument_match"].includes(value.type);
  if (needsToolName && !value.toolName) {
    reportIssue(`${value.type} requires toolName.`);
  }

  const lacksToolNames = !value.toolNames || value.toolNames.length === 0;
  if (value.type === "tool_call_order" && lacksToolNames) {
    reportIssue("tool_call_order requires toolNames.");
  }

  if (value.type === "tool_argument_match" && !value.expectedArgs) {
    reportIssue("tool_argument_match requires expectedArgs.");
  }
});
|
|
2563
3148
|
var evalPromptSchema = z3.object({
|
|
2564
3149
|
prompt: z3.string().min(1),
|
|
2565
|
-
assertions: z3.array(z3.string().min(1)).optional()
|
|
3150
|
+
assertions: z3.array(z3.string().min(1)).optional(),
|
|
3151
|
+
tools: z3.array(mockToolDefinitionSchema).optional(),
|
|
3152
|
+
toolAssertions: z3.array(toolAssertionSchema).optional()
|
|
2566
3153
|
});
|
|
2567
3154
|
var evalPromptArraySchema = z3.array(evalPromptSchema);
|
|
3155
|
+
/**
 * Renders a list of tool names as a bracketed, comma-separated string.
 *
 * @param toolNames Array of tool names.
 * @returns A string such as `[a, b]`, or `[]` for an empty list.
 */
function formatExpectedOrder(toolNames) {
  const joined = toolNames.join(", ");
  return "[" + joined + "]";
}
|
|
3158
|
+
/**
 * Renders the observed call sequence, limited to the tools named in
 * `toolNames`, as a bracketed, comma-separated string.
 *
 * @param toolCalls Recorded tool calls; only each call's `name` is read.
 * @param toolNames Names that should appear in the output.
 * @returns A string such as `[a, b, a]`, in call order.
 */
function formatActualOrder(toolCalls, toolNames) {
  const wanted = new Set(toolNames);
  const observed = [];
  for (const call of toolCalls) {
    if (wanted.has(call.name)) {
      observed.push(call.name);
    }
  }
  return `[${observed.join(", ")}]`;
}
|
|
3163
|
+
/**
 * Checks each structural tool assertion against the recorded tool calls.
 *
 * @param toolAssertions Assertions with `type`, `description` and, depending on
 *                       the type, `toolName`, `toolNames` or `expectedArgs`.
 * @param toolCalls Recorded calls; each call's `name` and `arguments` are read.
 * @returns One `{ assertion, passed, evidence, source: "tool" }` object per assertion, in order.
 */
function evaluateToolAssertions(toolAssertions, toolCalls) {
  const countCalls = (toolName) => toolCalls.filter((call) => call.name === toolName).length;
  const describeCallCount = (toolName, count) =>
    count === 0
      ? `Tool '${toolName}' was not called.`
      : `Tool '${toolName}' was called ${count} time${count === 1 ? "" : "s"}.`;
  const toResult = (assertion, passed, evidence) => ({ assertion, passed, evidence, source: "tool" });

  return toolAssertions.map((toolAssertion) => {
    const { type, toolName, description } = toolAssertion;

    switch (type) {
      case "tool_called": {
        const callCount = countCalls(toolName);
        return toResult(description, callCount > 0, describeCallCount(toolName, callCount));
      }
      case "tool_not_called": {
        const callCount = countCalls(toolName);
        return toResult(description, callCount === 0, describeCallCount(toolName, callCount));
      }
      case "tool_call_order": {
        // Subsequence match: other calls may sit between the expected ones.
        const expectedOrder = toolAssertion.toolNames ?? [];
        const matchedCount = toolCalls.reduce(
          (matched, call) => (call.name === expectedOrder[matched] ? matched + 1 : matched),
          0
        );
        const inOrder = matchedCount === expectedOrder.length;
        const evidence = inOrder
          ? `Observed tool call order ${formatExpectedOrder(expectedOrder)}.`
          : `Expected call order ${formatExpectedOrder(expectedOrder)} but got ${formatActualOrder(toolCalls, expectedOrder)}.`;
        return toResult(description, inOrder, evidence);
      }
      default: {
        // Any remaining type is handled as an argument match.
        const matchingCall = toolCalls.find(
          (call) => call.name === toolName && matchesArgumentSubset(call.arguments, toolAssertion.expectedArgs ?? {})
        );
        const expectedJson = JSON.stringify(toolAssertion.expectedArgs ?? {});
        const evidence = matchingCall
          ? `Tool '${toolName}' was called with arguments matching ${expectedJson}.`
          : `No '${toolName}' call matched ${expectedJson}.`;
        return toResult(description, Boolean(matchingCall), evidence);
      }
    }
  });
}
|
|
2568
3209
|
function extractJsonArray(raw) {
|
|
2569
3210
|
const trimmed = raw.trim();
|
|
2570
3211
|
if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
|
|
@@ -2591,6 +3232,7 @@ async function generatePrompts(skill, provider, model, count) {
|
|
|
2591
3232
|
skill.content,
|
|
2592
3233
|
"",
|
|
2593
3234
|
`Generate ${count} prompts that stress the main capabilities and likely edge cases.`,
|
|
3235
|
+
// Tool-aware prompts require user-defined mock responses and are not auto-generated.
|
|
2594
3236
|
"Each prompt should include 2-4 assertions."
|
|
2595
3237
|
].join("\n");
|
|
2596
3238
|
const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
|
|
@@ -2614,7 +3256,24 @@ async function runEval(skill, options) {
|
|
|
2614
3256
|
const results = await pMap(
|
|
2615
3257
|
prompts,
|
|
2616
3258
|
async (evalPrompt) => {
|
|
2617
|
-
|
|
3259
|
+
let response;
|
|
3260
|
+
let toolCalls;
|
|
3261
|
+
let loopIterations;
|
|
3262
|
+
if (evalPrompt.tools && evalPrompt.tools.length > 0) {
|
|
3263
|
+
const toolRun = await runWithTools({
|
|
3264
|
+
provider: options.provider,
|
|
3265
|
+
model: options.model,
|
|
3266
|
+
systemPrompt,
|
|
3267
|
+
userMessage: evalPrompt.prompt,
|
|
3268
|
+
tools: evalPrompt.tools,
|
|
3269
|
+
maxIterations: options.maxToolIterations
|
|
3270
|
+
});
|
|
3271
|
+
response = toolRun.finalResponse;
|
|
3272
|
+
toolCalls = toolRun.toolCalls;
|
|
3273
|
+
loopIterations = toolRun.loopIterations;
|
|
3274
|
+
} else {
|
|
3275
|
+
response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
|
|
3276
|
+
}
|
|
2618
3277
|
const gradedAssertions = await gradeResponse({
|
|
2619
3278
|
provider: options.provider,
|
|
2620
3279
|
model: options.graderModel,
|
|
@@ -2624,14 +3283,18 @@ async function runEval(skill, options) {
|
|
|
2624
3283
|
modelResponse: response,
|
|
2625
3284
|
assertions: evalPrompt.assertions
|
|
2626
3285
|
});
|
|
2627
|
-
const
|
|
3286
|
+
const structuralAssertions = evalPrompt.toolAssertions && evalPrompt.toolAssertions.length > 0 ? evaluateToolAssertions(evalPrompt.toolAssertions, toolCalls ?? []) : [];
|
|
3287
|
+
const assertions = [...gradedAssertions, ...structuralAssertions];
|
|
3288
|
+
const passedAssertions2 = assertions.filter((assertion) => assertion.passed).length;
|
|
2628
3289
|
return {
|
|
2629
3290
|
prompt: evalPrompt.prompt,
|
|
2630
|
-
assertions
|
|
3291
|
+
assertions,
|
|
2631
3292
|
responseSummary: response.slice(0, 200),
|
|
2632
3293
|
response,
|
|
2633
3294
|
passedAssertions: passedAssertions2,
|
|
2634
|
-
totalAssertions:
|
|
3295
|
+
totalAssertions: assertions.length,
|
|
3296
|
+
...toolCalls ? { toolCalls } : {},
|
|
3297
|
+
...loopIterations !== void 0 ? { loopIterations } : {}
|
|
2635
3298
|
};
|
|
2636
3299
|
},
|
|
2637
3300
|
options.concurrency ?? 5
|
|
@@ -2969,10 +3632,7 @@ function renderJson(value) {
|
|
|
2969
3632
|
|
|
2970
3633
|
// src/commands/common.ts
|
|
2971
3634
|
var executionContextByCommand = /* @__PURE__ */ new WeakMap();
|
|
2972
|
-
var singleEvalPromptSchema =
|
|
2973
|
-
prompt: z5.string().min(1),
|
|
2974
|
-
assertions: z5.array(z5.string().min(1)).optional()
|
|
2975
|
-
});
|
|
3635
|
+
var singleEvalPromptSchema = evalPromptSchema;
|
|
2976
3636
|
var promptStringArraySchema = z5.array(z5.string().min(1));
|
|
2977
3637
|
var assertionsObjectSchema = z5.object({
|
|
2978
3638
|
assertions: z5.array(z5.string().min(1))
|
|
@@ -3007,6 +3667,22 @@ function normalizeEvalPrompts(value, sourceLabel) {
|
|
|
3007
3667
|
function parseAssertionsFromText(raw) {
|
|
3008
3668
|
return raw.split(/\r?\n/).map((line) => line.trim().replace(/^[-*]\s+/, "").replace(/^\d+\.\s+/, "")).filter((line) => line.length > 0);
|
|
3009
3669
|
}
|
|
3670
|
+
/**
 * Copies an eval prompt so its arrays and one-level objects are not shared
 * with the source. Missing optional fields are kept as explicit `undefined`.
 *
 * Values nested inside `expectedArgs`, `responses` and individual parameters
 * are copied shallowly and so remain shared with the source.
 *
 * @param prompt Eval prompt with `prompt` and optional `assertions`, `tools`, `toolAssertions`.
 * @returns A new eval prompt object.
 */
function cloneEvalPrompt(prompt) {
  const copyList = (items) => (items ? [...items] : void 0);

  const copyTool = (tool) => ({
    ...tool,
    parameters: tool.parameters ? tool.parameters.map((parameter) => ({ ...parameter })) : void 0,
    responses: { ...tool.responses }
  });

  const copyToolAssertion = (toolAssertion) => ({
    ...toolAssertion,
    toolNames: copyList(toolAssertion.toolNames),
    expectedArgs: toolAssertion.expectedArgs ? { ...toolAssertion.expectedArgs } : void 0
  });

  const { assertions, tools, toolAssertions } = prompt;
  return {
    prompt: prompt.prompt,
    assertions: copyList(assertions),
    tools: tools ? tools.map((tool) => copyTool(tool)) : void 0,
    toolAssertions: toolAssertions ? toolAssertions.map((entry) => copyToolAssertion(entry)) : void 0
  };
}
|
|
3010
3686
|
function normalizeAssertions(value, sourceLabel) {
|
|
3011
3687
|
const assertionArray = z5.array(z5.string().min(1)).safeParse(value);
|
|
3012
3688
|
if (assertionArray.success) {
|
|
@@ -3079,17 +3755,14 @@ async function loadConfiguredEvalPrompts(command) {
|
|
|
3079
3755
|
const assertionsRaw = await fs6.readFile(assertionsFile, "utf8");
|
|
3080
3756
|
const assertions = normalizeAssertions(parseJsonIfPossible(assertionsRaw), assertionsFile);
|
|
3081
3757
|
prompts = prompts.map((prompt) => ({
|
|
3082
|
-
prompt
|
|
3758
|
+
...cloneEvalPrompt(prompt),
|
|
3083
3759
|
assertions: [...assertions]
|
|
3084
3760
|
}));
|
|
3085
3761
|
}
|
|
3086
3762
|
const numRunsWasExplicit = context.configFile?.eval?.numRuns !== void 0;
|
|
3087
3763
|
if (numRunsWasExplicit && prompts.length === 1 && context.config.eval.numRuns > 1) {
|
|
3088
3764
|
const promptTemplate = prompts[0];
|
|
3089
|
-
prompts = Array.from({ length: context.config.eval.numRuns }, () => (
|
|
3090
|
-
prompt: promptTemplate.prompt,
|
|
3091
|
-
assertions: promptTemplate.assertions ? [...promptTemplate.assertions] : void 0
|
|
3092
|
-
}));
|
|
3765
|
+
prompts = Array.from({ length: context.config.eval.numRuns }, () => cloneEvalPrompt(promptTemplate));
|
|
3093
3766
|
}
|
|
3094
3767
|
return prompts;
|
|
3095
3768
|
}
|
|
@@ -3186,7 +3859,8 @@ var evalConfigSchema = z7.object({
|
|
|
3186
3859
|
numRuns: z7.number().int().min(1).optional(),
|
|
3187
3860
|
threshold: z7.number().min(0).max(1).optional(),
|
|
3188
3861
|
promptFile: z7.string().min(1).optional(),
|
|
3189
|
-
assertionsFile: z7.string().min(1).optional()
|
|
3862
|
+
assertionsFile: z7.string().min(1).optional(),
|
|
3863
|
+
maxToolIterations: z7.number().int().min(1).max(50).optional()
|
|
3190
3864
|
}).strict().partial();
|
|
3191
3865
|
var skilltestConfigSchema = z7.object({
|
|
3192
3866
|
provider: providerNameSchema.optional(),
|
|
@@ -3217,7 +3891,8 @@ var resolvedSkilltestConfigSchema = z7.object({
|
|
|
3217
3891
|
numRuns: z7.number().int().min(1),
|
|
3218
3892
|
threshold: z7.number().min(0).max(1),
|
|
3219
3893
|
promptFile: z7.string().min(1).optional(),
|
|
3220
|
-
assertionsFile: z7.string().min(1).optional()
|
|
3894
|
+
assertionsFile: z7.string().min(1).optional(),
|
|
3895
|
+
maxToolIterations: z7.number().int().min(1).max(50)
|
|
3221
3896
|
})
|
|
3222
3897
|
});
|
|
3223
3898
|
var DEFAULT_SKILLTEST_CONFIG = {
|
|
@@ -3237,7 +3912,8 @@ var DEFAULT_SKILLTEST_CONFIG = {
|
|
|
3237
3912
|
},
|
|
3238
3913
|
eval: {
|
|
3239
3914
|
numRuns: 5,
|
|
3240
|
-
threshold: 0.9
|
|
3915
|
+
threshold: 0.9,
|
|
3916
|
+
maxToolIterations: 10
|
|
3241
3917
|
}
|
|
3242
3918
|
};
|
|
3243
3919
|
function formatIssuePath(issuePath) {
|
|
@@ -3367,7 +4043,8 @@ function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = proce
|
|
|
3367
4043
|
assertionsFile: resolveConfigRelativePath(
|
|
3368
4044
|
baseDirectory,
|
|
3369
4045
|
cliFlags.eval?.assertionsFile ?? configFile.eval?.assertionsFile ?? DEFAULT_SKILLTEST_CONFIG.eval.assertionsFile
|
|
3370
|
-
)
|
|
4046
|
+
),
|
|
4047
|
+
maxToolIterations: cliFlags.eval?.maxToolIterations ?? configFile.eval?.maxToolIterations ?? DEFAULT_SKILLTEST_CONFIG.eval.maxToolIterations
|
|
3371
4048
|
}
|
|
3372
4049
|
};
|
|
3373
4050
|
return resolvedSkilltestConfigSchema.parse(merged);
|
|
@@ -3391,34 +4068,34 @@ function extractCliConfigOverrides(command) {
|
|
|
3391
4068
|
if (command.getOptionValueSource("model") === "cli") {
|
|
3392
4069
|
overrides.model = getTypedOptionValue(command, "model");
|
|
3393
4070
|
}
|
|
3394
|
-
if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check") && command.getOptionValueSource("concurrency") === "cli") {
|
|
4071
|
+
if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check" || command.name() === "improve" || command.name() === "route") && command.getOptionValueSource("concurrency") === "cli") {
|
|
3395
4072
|
overrides.concurrency = getTypedOptionValue(command, "concurrency");
|
|
3396
4073
|
}
|
|
3397
|
-
if ((command.name() === "trigger" || command.name() === "check") && command.getOptionValueSource("numQueries") === "cli") {
|
|
4074
|
+
if ((command.name() === "trigger" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("numQueries") === "cli") {
|
|
3398
4075
|
overrides.trigger = {
|
|
3399
4076
|
...overrides.trigger,
|
|
3400
4077
|
numQueries: getTypedOptionValue(command, "numQueries")
|
|
3401
4078
|
};
|
|
3402
4079
|
}
|
|
3403
|
-
if ((command.name() === "trigger" || command.name() === "check") && command.getOptionValueSource("compare") === "cli") {
|
|
4080
|
+
if ((command.name() === "trigger" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("compare") === "cli") {
|
|
3404
4081
|
overrides.trigger = {
|
|
3405
4082
|
...overrides.trigger,
|
|
3406
4083
|
compare: getTypedOptionValue(command, "compare")
|
|
3407
4084
|
};
|
|
3408
4085
|
}
|
|
3409
|
-
if ((command.name() === "lint" || command.name() === "check") && command.getOptionValueSource("plugin") === "cli") {
|
|
4086
|
+
if ((command.name() === "lint" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("plugin") === "cli") {
|
|
3410
4087
|
overrides.lint = {
|
|
3411
4088
|
...overrides.lint,
|
|
3412
4089
|
plugins: getTypedOptionValue(command, "plugin")
|
|
3413
4090
|
};
|
|
3414
4091
|
}
|
|
3415
|
-
if (command.name() === "check" && command.getOptionValueSource("minF1") === "cli") {
|
|
4092
|
+
if ((command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("minF1") === "cli") {
|
|
3416
4093
|
overrides.trigger = {
|
|
3417
4094
|
...overrides.trigger,
|
|
3418
4095
|
threshold: getTypedOptionValue(command, "minF1")
|
|
3419
4096
|
};
|
|
3420
4097
|
}
|
|
3421
|
-
if (command.name() === "check" && command.getOptionValueSource("minAssertPassRate") === "cli") {
|
|
4098
|
+
if ((command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("minAssertPassRate") === "cli") {
|
|
3422
4099
|
overrides.eval = {
|
|
3423
4100
|
...overrides.eval,
|
|
3424
4101
|
threshold: getTypedOptionValue(command, "minAssertPassRate")
|
|
@@ -3483,6 +4160,12 @@ function resolveApiKey(provider, override) {
|
|
|
3483
4160
|
|
|
3484
4161
|
// src/providers/anthropic.ts
|
|
3485
4162
|
import Anthropic from "@anthropic-ai/sdk";
|
|
4163
|
+
/**
 * Reports whether a content block is a text block.
 *
 * @param block Content block with a `type` property.
 * @returns `true` when `block.type` is `"text"`.
 */
function isAnthropicTextBlock(block) {
  const kind = block.type;
  return kind === "text";
}
|
|
4166
|
+
/**
 * Reports whether a content block is a tool-use block.
 *
 * @param block Content block with a `type` property.
 * @returns `true` when `block.type` is `"tool_use"`.
 */
function isAnthropicToolUseBlock(block) {
  const kind = block.type;
  return kind === "tool_use";
}
|
|
3486
4169
|
function wait(ms) {
|
|
3487
4170
|
return new Promise((resolve) => {
|
|
3488
4171
|
setTimeout(resolve, ms);
|
|
@@ -3508,27 +4191,11 @@ var AnthropicProvider = class {
|
|
|
3508
4191
|
constructor(apiKey) {
|
|
3509
4192
|
this.client = new Anthropic({ apiKey });
|
|
3510
4193
|
}
|
|
3511
|
-
async
|
|
4194
|
+
async createMessage(request) {
|
|
3512
4195
|
let lastError;
|
|
3513
4196
|
for (let attempt = 0; attempt < 3; attempt += 1) {
|
|
3514
4197
|
try {
|
|
3515
|
-
|
|
3516
|
-
model: options.model,
|
|
3517
|
-
max_tokens: 2048,
|
|
3518
|
-
system: systemPrompt,
|
|
3519
|
-
messages: [
|
|
3520
|
-
{
|
|
3521
|
-
role: "user",
|
|
3522
|
-
content: userMessage
|
|
3523
|
-
}
|
|
3524
|
-
]
|
|
3525
|
-
});
|
|
3526
|
-
const textBlocks = response.content.filter((block) => block.type === "text");
|
|
3527
|
-
const text = textBlocks.map((block) => block.text).join("\n").trim();
|
|
3528
|
-
if (text.length === 0) {
|
|
3529
|
-
throw new Error("Model returned an empty response.");
|
|
3530
|
-
}
|
|
3531
|
-
return text;
|
|
4198
|
+
return await this.client.messages.create(request);
|
|
3532
4199
|
} catch (error) {
|
|
3533
4200
|
lastError = error;
|
|
3534
4201
|
if (!isRateLimitError(error) || attempt === 2) {
|
|
@@ -3543,6 +4210,55 @@ var AnthropicProvider = class {
|
|
|
3543
4210
|
}
|
|
3544
4211
|
throw new Error("Anthropic API call failed with an unknown error.");
|
|
3545
4212
|
}
|
|
4213
|
+
toAnthropicMessages(messages) {
|
|
4214
|
+
return messages.map((message) => ({
|
|
4215
|
+
role: message.role,
|
|
4216
|
+
content: message.content
|
|
4217
|
+
}));
|
|
4218
|
+
}
|
|
4219
|
+
async sendMessage(systemPrompt, userMessage, options) {
|
|
4220
|
+
const response = await this.createMessage({
|
|
4221
|
+
model: options.model,
|
|
4222
|
+
max_tokens: 2048,
|
|
4223
|
+
system: systemPrompt,
|
|
4224
|
+
messages: [
|
|
4225
|
+
{
|
|
4226
|
+
role: "user",
|
|
4227
|
+
content: userMessage
|
|
4228
|
+
}
|
|
4229
|
+
]
|
|
4230
|
+
});
|
|
4231
|
+
const textBlocks = response.content.filter(isAnthropicTextBlock);
|
|
4232
|
+
const text = textBlocks.map((block) => block.text).join("\n").trim();
|
|
4233
|
+
if (text.length === 0) {
|
|
4234
|
+
throw new Error("Model returned an empty response.");
|
|
4235
|
+
}
|
|
4236
|
+
return text;
|
|
4237
|
+
}
|
|
4238
|
+
async sendWithTools(systemPrompt, messages, options) {
|
|
4239
|
+
const response = await this.createMessage({
|
|
4240
|
+
model: options.model,
|
|
4241
|
+
max_tokens: 2048,
|
|
4242
|
+
system: systemPrompt,
|
|
4243
|
+
messages: this.toAnthropicMessages(messages),
|
|
4244
|
+
tools: options.tools.map((tool) => ({
|
|
4245
|
+
name: tool.name,
|
|
4246
|
+
description: tool.description,
|
|
4247
|
+
input_schema: tool.parameters ?? { type: "object", properties: {} }
|
|
4248
|
+
}))
|
|
4249
|
+
});
|
|
4250
|
+
const textContent = response.content.filter(isAnthropicTextBlock).map((block) => block.text).join("\n").trim();
|
|
4251
|
+
const toolUseBlocks = response.content.filter(isAnthropicToolUseBlock).map((block) => ({
|
|
4252
|
+
id: block.id,
|
|
4253
|
+
name: block.name,
|
|
4254
|
+
arguments: block.input
|
|
4255
|
+
}));
|
|
4256
|
+
return {
|
|
4257
|
+
textContent,
|
|
4258
|
+
toolUseBlocks,
|
|
4259
|
+
stopReason: response.stop_reason ?? "end_turn"
|
|
4260
|
+
};
|
|
4261
|
+
}
|
|
3546
4262
|
};
|
|
3547
4263
|
|
|
3548
4264
|
// src/providers/openai.ts
|
|
@@ -3579,17 +4295,82 @@ function extractTextContent(content) {
|
|
|
3579
4295
|
const text = content.map((item) => item.type === "text" || !item.type ? item.text ?? "" : "").join("\n").trim();
|
|
3580
4296
|
return text;
|
|
3581
4297
|
}
|
|
3582
|
-
|
|
3583
|
-
|
|
3584
|
-
|
|
3585
|
-
client;
|
|
3586
|
-
constructor(apiKey) {
|
|
3587
|
-
this.apiKey = apiKey;
|
|
3588
|
-
this.client = null;
|
|
4298
|
+
/**
 * Parses the JSON-encoded arguments string of a tool call.
 *
 * @param raw JSON text of the arguments; nullish, empty or blank input yields `{}`.
 * @param toolName Tool name, used only in the error message.
 * @returns The parsed arguments object.
 * @throws {Error} When `raw` is not valid JSON or does not decode to a plain
 *         (non-array, non-null) object. The underlying error is attached as `cause`.
 */
function parseToolArguments(raw, toolName) {
  if (!raw || raw.trim() === "") {
    return {};
  }
  try {
    const parsed = JSON.parse(raw);
    if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
      throw new Error("Tool arguments must be a JSON object.");
    }
    return parsed;
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    // Keep the original failure reachable for debugging instead of flattening it to text only.
    throw new Error(`OpenAI tool call arguments for '${toolName}' were not valid JSON: ${message}`, { cause: error });
  }
}
|
|
4313
|
+
/**
 * Joins the text of all `"text"` blocks with newlines and trims the result.
 *
 * @param blocks Array of content blocks; blocks of other types are ignored.
 * @returns The combined text. A text block without `text` contributes an empty string.
 */
function getBlockText(blocks) {
  const pieces = [];
  for (const block of blocks) {
    if (block.type === "text") {
      pieces.push(String(block.text ?? ""));
    }
  }
  return pieces.join("\n").trim();
}
|
|
4316
|
+
/**
 * Builds a single OpenAI-style assistant message from conversation content blocks.
 *
 * @param blocks Array of content blocks (`text` and `tool_use` entries are read).
 * @returns `{ role: "assistant", content, tool_calls? }`, where `content` is `null`
 *          when there is no text and `tool_calls` appears only if a tool_use block exists.
 */
function mapAssistantBlocksToMessage(blocks) {
  const text = getBlockText(blocks);

  const toolCalls = [];
  for (const block of blocks) {
    if (block.type !== "tool_use") {
      continue;
    }
    toolCalls.push({
      id: String(block.id ?? ""),
      type: "function",
      function: {
        name: String(block.name ?? ""),
        // Arguments are sent as a JSON string in this message shape.
        arguments: JSON.stringify(block.input ?? {})
      }
    });
  }

  const message = {
    role: "assistant",
    content: text.length > 0 ? text : null
  };
  if (toolCalls.length > 0) {
    message.tool_calls = toolCalls;
  }
  return message;
}
|
|
4332
|
+
/**
 * Converts the content blocks of a user turn into OpenAI-style messages.
 *
 * If any `tool_result` blocks exist, one `role: "tool"` message is returned
 * per result and other blocks are ignored. Otherwise a single user message
 * carrying the combined text is returned.
 *
 * @param blocks Array of content blocks.
 * @returns Array of messages.
 */
function mapUserBlocksToMessages(blocks) {
  const toolMessages = [];
  for (const block of blocks) {
    if (block.type === "tool_result") {
      toolMessages.push({
        role: "tool",
        tool_call_id: String(block.tool_use_id ?? ""),
        content: String(block.content ?? "")
      });
    }
  }
  if (toolMessages.length > 0) {
    return toolMessages;
  }
  return [{ role: "user", content: getBlockText(blocks) }];
}
|
|
4349
|
+
/**
 * Converts one conversation entry into zero or more OpenAI-style messages.
 *
 * @param block Entry with `role` and `content` (a string or an array of content blocks).
 * @returns Array of messages: string content passes through as one message;
 *          block content is mapped by role.
 */
function mapConversationBlockToMessages(block) {
  const { role, content } = block;
  if (typeof content === "string") {
    return [{ role, content }];
  }
  return role === "assistant"
    ? [mapAssistantBlocksToMessage(content)]
    : mapUserBlocksToMessages(content);
}
|
|
4363
|
+
var OpenAIProvider = class {
|
|
4364
|
+
name = "openai";
|
|
4365
|
+
apiKey;
|
|
4366
|
+
client;
|
|
4367
|
+
constructor(apiKey) {
|
|
4368
|
+
this.apiKey = apiKey;
|
|
4369
|
+
this.client = null;
|
|
4370
|
+
}
|
|
4371
|
+
async ensureClient() {
|
|
4372
|
+
if (this.client) {
|
|
4373
|
+
return this.client;
|
|
3593
4374
|
}
|
|
3594
4375
|
let openAiModule;
|
|
3595
4376
|
try {
|
|
@@ -3607,30 +4388,12 @@ var OpenAIProvider = class {
|
|
|
3607
4388
|
this.client = new OpenAIConstructor({ apiKey: this.apiKey });
|
|
3608
4389
|
return this.client;
|
|
3609
4390
|
}
|
|
3610
|
-
async
|
|
4391
|
+
async createCompletion(input) {
|
|
3611
4392
|
const client = await this.ensureClient();
|
|
3612
4393
|
let lastError;
|
|
3613
4394
|
for (let attempt = 0; attempt < 3; attempt += 1) {
|
|
3614
4395
|
try {
|
|
3615
|
-
|
|
3616
|
-
model: options.model,
|
|
3617
|
-
max_tokens: 2048,
|
|
3618
|
-
messages: [
|
|
3619
|
-
{
|
|
3620
|
-
role: "system",
|
|
3621
|
-
content: systemPrompt
|
|
3622
|
-
},
|
|
3623
|
-
{
|
|
3624
|
-
role: "user",
|
|
3625
|
-
content: userMessage
|
|
3626
|
-
}
|
|
3627
|
-
]
|
|
3628
|
-
});
|
|
3629
|
-
const text = (response.choices ?? []).map((choice) => extractTextContent(choice.message?.content)).join("\n").trim();
|
|
3630
|
-
if (text.length === 0) {
|
|
3631
|
-
throw new Error("Model returned an empty response.");
|
|
3632
|
-
}
|
|
3633
|
-
return text;
|
|
4396
|
+
return await client.chat.completions.create(input);
|
|
3634
4397
|
} catch (error) {
|
|
3635
4398
|
lastError = error;
|
|
3636
4399
|
if (!isRetriableError(error) || attempt === 2) {
|
|
@@ -3645,6 +4408,57 @@ var OpenAIProvider = class {
|
|
|
3645
4408
|
}
|
|
3646
4409
|
throw new Error("OpenAI API call failed with an unknown error.");
|
|
3647
4410
|
}
|
|
4411
|
+
toOpenAiMessages(systemPrompt, messages) {
|
|
4412
|
+
return [
|
|
4413
|
+
{
|
|
4414
|
+
role: "system",
|
|
4415
|
+
content: systemPrompt
|
|
4416
|
+
},
|
|
4417
|
+
...messages.flatMap((message) => mapConversationBlockToMessages(message))
|
|
4418
|
+
];
|
|
4419
|
+
}
|
|
4420
|
+
async sendMessage(systemPrompt, userMessage, options) {
|
|
4421
|
+
const response = await this.createCompletion({
|
|
4422
|
+
model: options.model,
|
|
4423
|
+
max_tokens: 2048,
|
|
4424
|
+
messages: this.toOpenAiMessages(systemPrompt, [{ role: "user", content: userMessage }])
|
|
4425
|
+
});
|
|
4426
|
+
const text = (response.choices ?? []).map((choice) => extractTextContent(choice.message?.content)).join("\n").trim();
|
|
4427
|
+
if (text.length === 0) {
|
|
4428
|
+
throw new Error("Model returned an empty response.");
|
|
4429
|
+
}
|
|
4430
|
+
return text;
|
|
4431
|
+
}
|
|
4432
|
+
async sendWithTools(systemPrompt, messages, options) {
|
|
4433
|
+
const response = await this.createCompletion({
|
|
4434
|
+
model: options.model,
|
|
4435
|
+
max_tokens: 2048,
|
|
4436
|
+
messages: this.toOpenAiMessages(systemPrompt, messages),
|
|
4437
|
+
tools: options.tools.map((tool) => ({
|
|
4438
|
+
type: "function",
|
|
4439
|
+
function: {
|
|
4440
|
+
name: tool.name,
|
|
4441
|
+
description: tool.description,
|
|
4442
|
+
parameters: tool.parameters
|
|
4443
|
+
}
|
|
4444
|
+
}))
|
|
4445
|
+
});
|
|
4446
|
+
const choice = response.choices?.[0];
|
|
4447
|
+
const message = choice?.message;
|
|
4448
|
+
const toolUseBlocks = (message?.tool_calls ?? []).map((toolCall, index) => {
|
|
4449
|
+
const toolName = toolCall.function?.name ?? `tool-${index + 1}`;
|
|
4450
|
+
return {
|
|
4451
|
+
id: toolCall.id ?? `${toolName}-${index + 1}`,
|
|
4452
|
+
name: toolName,
|
|
4453
|
+
arguments: parseToolArguments(toolCall.function?.arguments, toolName)
|
|
4454
|
+
};
|
|
4455
|
+
});
|
|
4456
|
+
return {
|
|
4457
|
+
textContent: extractTextContent(message?.content),
|
|
4458
|
+
toolUseBlocks,
|
|
4459
|
+
stopReason: choice?.finish_reason === "stop" ? "end_turn" : choice?.finish_reason === "tool_calls" ? "tool_use" : choice?.finish_reason ?? "end_turn"
|
|
4460
|
+
};
|
|
4461
|
+
}
|
|
3648
4462
|
};
|
|
3649
4463
|
|
|
3650
4464
|
// src/providers/index.ts
|
|
@@ -3815,7 +4629,8 @@ async function handleEvalCommand(targetPath, options, command) {
|
|
|
3815
4629
|
graderModel,
|
|
3816
4630
|
numRuns: options.numRuns,
|
|
3817
4631
|
concurrency: options.concurrency,
|
|
3818
|
-
prompts
|
|
4632
|
+
prompts,
|
|
4633
|
+
maxToolIterations: options.maxToolIterations
|
|
3819
4634
|
});
|
|
3820
4635
|
if (options.saveResults) {
|
|
3821
4636
|
await writeJsonFile(options.saveResults, result);
|
|
@@ -3862,7 +4677,8 @@ function registerEvalCommand(program) {
|
|
|
3862
4677
|
verbose: Boolean(parsedCli.data.verbose),
|
|
3863
4678
|
apiKey: parsedCli.data.apiKey,
|
|
3864
4679
|
numRuns: config.eval.numRuns,
|
|
3865
|
-
concurrency: config.concurrency
|
|
4680
|
+
concurrency: config.concurrency,
|
|
4681
|
+
maxToolIterations: config.eval.maxToolIterations
|
|
3866
4682
|
},
|
|
3867
4683
|
command
|
|
3868
4684
|
);
|
|
@@ -3919,7 +4735,8 @@ async function runCheck(inputPath, options) {
|
|
|
3919
4735
|
graderModel: options.graderModel,
|
|
3920
4736
|
numRuns: options.evalNumRuns,
|
|
3921
4737
|
prompts: options.prompts,
|
|
3922
|
-
concurrency: options.concurrency
|
|
4738
|
+
concurrency: options.concurrency,
|
|
4739
|
+
maxToolIterations: options.evalMaxToolIterations
|
|
3923
4740
|
};
|
|
3924
4741
|
if ((options.concurrency ?? 5) === 1) {
|
|
3925
4742
|
options.onStage?.("trigger");
|
|
@@ -4041,6 +4858,7 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
4041
4858
|
triggerSeed: options.triggerSeed,
|
|
4042
4859
|
prompts,
|
|
4043
4860
|
evalNumRuns: options.numRuns,
|
|
4861
|
+
evalMaxToolIterations: options.maxToolIterations,
|
|
4044
4862
|
concurrency: options.concurrency,
|
|
4045
4863
|
minF1: options.minF1,
|
|
4046
4864
|
minAssertPassRate: options.minAssertPassRate,
|
|
@@ -4106,6 +4924,7 @@ function registerCheckCommand(program) {
|
|
|
4106
4924
|
minF1: config.trigger.threshold,
|
|
4107
4925
|
minAssertPassRate: config.eval.threshold,
|
|
4108
4926
|
numRuns: config.eval.numRuns,
|
|
4927
|
+
maxToolIterations: config.eval.maxToolIterations,
|
|
4109
4928
|
concurrency: config.concurrency,
|
|
4110
4929
|
html: parsedCli.data.html,
|
|
4111
4930
|
lintFailOn: config.lint.failOn,
|
|
@@ -4121,12 +4940,868 @@ function registerCheckCommand(program) {
|
|
|
4121
4940
|
});
|
|
4122
4941
|
}
|
|
4123
4942
|
|
|
4943
|
+
// src/commands/improve.ts
|
|
4944
|
+
import ora4 from "ora";
|
|
4945
|
+
import { z as z12 } from "zod";
|
|
4946
|
+
|
|
4947
|
+
// src/core/improver.ts
|
|
4948
|
+
import fs12 from "node:fs/promises";
|
|
4949
|
+
import os from "node:os";
|
|
4950
|
+
import path7 from "node:path";
|
|
4951
|
+
import yaml2 from "js-yaml";
|
|
4952
|
+
import { z as z11 } from "zod";
|
|
4953
|
+
// Schema for the JSON object the improver is expected to return.
var improveRewriteSchema = z11.object({
  // Frontmatter as a record of arbitrary values.
  frontmatter: z11.record(z11.unknown()),
  // Body text; must be non-empty.
  content: z11.string().min(1),
  // At least one non-empty change summary line.
  changeSummary: z11.array(z11.string().min(1)).min(1),
  // At least one non-empty targeted problem.
  targetedProblems: z11.array(z11.string().min(1)).min(1)
});
|
|
4959
|
+
/**
 * Computes the fraction of assertions that passed in an eval result.
 *
 * @param result Eval result with `summary.passedAssertions` and
 *               `summary.totalAssertions`, or a falsy value.
 * @returns `passed / total`, or 0 when `result` is falsy or there are no assertions.
 */
function calculateEvalAssertPassRate2(result) {
  const total = result ? result.summary.totalAssertions : 0;
  // Guard the division: no result or no assertions counts as a 0 pass rate.
  return total === 0 ? 0 : result.summary.passedAssertions / total;
}
|
|
4965
|
+
/**
 * Extracts and parses a JSON object from raw model output.
 *
 * The trimmed text is parsed directly when it starts with `{` and ends with
 * `}`; otherwise the span from the first `{` to the last `}` is parsed.
 *
 * @param raw Raw text that should contain a JSON object.
 * @returns The parsed value.
 * @throws {Error} When no `{ ... }` span exists; JSON.parse errors propagate unchanged.
 */
function extractJsonObject2(raw) {
  const text = raw.trim();
  const isBareObject = text.startsWith("{") && text.endsWith("}");
  if (isBareObject) {
    return JSON.parse(text);
  }

  const open = text.indexOf("{");
  const close = text.lastIndexOf("}");
  if (open < 0 || close <= open) {
    throw new Error("Improver did not return a JSON object.");
  }
  return JSON.parse(text.slice(open, close + 1));
}
|
|
4977
|
+
/**
 * Returns a copy of the frontmatter with `name`, `description` and `license`
 * first (in that order, when present), followed by every other key in its
 * original enumeration order.
 *
 * @param {Record<string, unknown>} frontmatter
 * @returns {Record<string, unknown>} New object; the input is not modified.
 */
function orderFrontmatter(frontmatter) {
  const leadingKeys = ["name", "description", "license"];
  const ordered = {};
  leadingKeys.forEach((key) => {
    if (Object.prototype.hasOwnProperty.call(frontmatter, key)) {
      ordered[key] = frontmatter[key];
    }
  });
  Object.entries(frontmatter).forEach(([key, value]) => {
    // Leading keys reaching this point were already copied above.
    if (leadingKeys.includes(key)) {
      return;
    }
    ordered[key] = value;
  });
  return ordered;
}
|
|
4991
|
+
/**
 * Picks the line ending to use when regenerating a document.
 *
 * @param {string} raw - Existing document text.
 * @returns {"\r\n"|"\n"} CRLF if the text contains any CRLF, otherwise LF.
 */
function detectLineEnding(raw) {
  if (raw.includes("\r\n")) {
    return "\r\n";
  }
  return "\n";
}
|
|
4994
|
+
/**
 * Assembles a SKILL.md document from frontmatter and a markdown body.
 *
 * The frontmatter is passed through `orderFrontmatter` and dumped as YAML
 * between `---` fences; the body is trimmed and followed by one trailing
 * line ending.
 *
 * @param {Record<string, unknown>} frontmatter - Frontmatter fields to dump.
 * @param {string} content - Markdown body (without frontmatter fences).
 * @param {string} lineEnding - Line ending applied to the whole document.
 * @returns {string} The complete document text.
 * @throws {Error} When the body is empty after trimming.
 */
function buildSkillMarkdown(frontmatter, content, lineEnding) {
  const normalizedBody = content.trim();
  if (normalizedBody.length === 0) {
    throw new Error("Candidate rewrite produced an empty SKILL.md body.");
  }
  // Match an optional preceding "\r" so text that already uses CRLF is not
  // turned into "\r\r\n" (CRLF target) or left with mixed endings (LF target).
  const applyLineEnding = (text) => text.replace(/\r?\n/g, lineEnding);
  const frontmatterBlock = applyLineEnding(
    yaml2.dump(orderFrontmatter(frontmatter), {
      lineWidth: 0,
      noRefs: true,
      sortKeys: false
    })
  );
  return `---${lineEnding}${frontmatterBlock}---${lineEnding}${lineEnding}${applyLineEnding(normalizedBody)}${lineEnding}`;
}
|
|
5006
|
+
/**
 * Checks every relative file reference found in a candidate document.
 *
 * Each reference returned by `extractRelativeFileReferences` is resolved
 * against the skill root. It must stay inside that root and must exist
 * according to `pathExists`.
 *
 * @param {string} raw - Candidate SKILL.md text.
 * @param {string} skillRoot - Directory references are resolved against.
 * @returns {Promise<void>}
 * @throws {Error} On the first reference that escapes the root or is missing.
 */
async function validateRelativeReferences(raw, skillRoot) {
  for (const reference of extractRelativeFileReferences(raw)) {
    const resolved = path7.resolve(skillRoot, reference);
    const relativeToRoot = path7.relative(skillRoot, resolved);
    // Only a leading ".." path *segment* leaves the root. A bare prefix test
    // would also reject in-root entries whose name starts with two dots
    // (for example "..notes.md").
    const climbsOut = relativeToRoot === ".." || relativeToRoot.startsWith(`..${path7.sep}`);
    // An absolute result means the target could not be expressed relative
    // to the root at all, so it is treated as escaping.
    const escapesRoot = climbsOut || path7.isAbsolute(relativeToRoot);
    if (escapesRoot) {
      throw new Error(`Candidate rewrite introduced an out-of-root reference: ${reference}`);
    }
    if (!await pathExists(resolved)) {
      throw new Error(`Candidate rewrite introduced a broken relative reference: ${reference}`);
    }
  }
}
|
|
5019
|
+
/**
 * Turns a validated rewrite into a candidate SKILL.md.
 *
 * The rewrite may not change the skill name, nor an existing license. Its
 * frontmatter is layered over the current frontmatter with the original
 * name (and license, when one exists) forced back in. The assembled document
 * is then checked with `parseSkillDocumentStrict` and
 * `validateRelativeReferences`.
 *
 * @param {object} skill - Parsed current skill (`frontmatter`, `raw`, `skillRoot`, `skillFile`).
 * @param {object} rewrite - Rewrite payload (`frontmatter`, `content`, `changeSummary`, `targetedProblems`).
 * @returns {Promise<object>} Candidate with merged frontmatter, trimmed content and raw text.
 * @throws {Error} On a rename or license change, or when validation fails.
 */
async function buildCandidate(skill, rewrite) {
  const current = skill.frontmatter;
  const proposed = rewrite.frontmatter;

  const renamesSkill = typeof proposed.name === "string" && proposed.name !== current.name;
  if (renamesSkill) {
    throw new Error(`Candidate rewrite attempted to rename skill '${current.name}' to '${proposed.name}'.`);
  }

  const changesLicense = current.license && typeof proposed.license === "string" && proposed.license !== current.license;
  if (changesLicense) {
    throw new Error(
      `Candidate rewrite attempted to change license '${current.license}' to '${proposed.license}'.`
    );
  }

  // Later sources win, so the original name always overrides the rewrite.
  const mergedFrontmatter = Object.assign({}, current, proposed, { name: current.name });
  if (current.license) {
    mergedFrontmatter.license = current.license;
  }

  const raw = buildSkillMarkdown(mergedFrontmatter, rewrite.content, detectLineEnding(skill.raw));
  // Called for its validation side effect; the parsed result is not used.
  parseSkillDocumentStrict(raw, skill.skillRoot, skill.skillFile);
  await validateRelativeReferences(raw, skill.skillRoot);

  return {
    frontmatter: mergedFrontmatter,
    content: rewrite.content.trim(),
    raw,
    changeSummary: rewrite.changeSummary,
    targetedProblems: rewrite.targetedProblems
  };
}
|
|
5045
|
+
/**
 * Condenses a check result into the list of problems handed to the rewriter.
 *
 * @param {object} result - Check result with `lint` and optional `trigger` / `eval` sections.
 * @returns {{lintIssues: object[], triggerFailures: object[], evalFailures: object[], triggerSuggestions: Array}}
 *   Non-passing lint issues, unmatched trigger cases, failed eval assertions
 *   and the trigger suggestions. A missing section yields an empty list.
 */
function extractActionableIssues(result) {
  const lintIssues = [];
  for (const issue of result.lint.issues) {
    if (issue.status === "pass") {
      continue;
    }
    lintIssues.push({
      checkId: issue.checkId,
      title: issue.title,
      // Every non-pass status other than "warn" is reported as "fail".
      status: issue.status === "warn" ? "warn" : "fail",
      message: issue.message,
      suggestion: issue.suggestion,
      startLine: issue.startLine,
      endLine: issue.endLine
    });
  }

  const triggerFailures = [];
  if (result.trigger != null) {
    for (const testCase of result.trigger.cases) {
      if (testCase.matched) {
        continue;
      }
      triggerFailures.push({
        query: testCase.query,
        expected: testCase.expected,
        actual: testCase.actual,
        selectedCompetitor: testCase.selectedCompetitor,
        rawModelResponse: testCase.rawModelResponse
      });
    }
  }

  const evalFailures = [];
  if (result.eval != null) {
    for (const promptResult of result.eval.results) {
      for (const assertion of promptResult.assertions) {
        if (assertion.passed) {
          continue;
        }
        const knownSource = assertion.source === "grader" || assertion.source === "tool";
        evalFailures.push({
          prompt: promptResult.prompt,
          assertion: assertion.assertion,
          evidence: assertion.evidence,
          source: knownSource ? assertion.source : "unknown"
        });
      }
    }
  }

  return {
    lintIssues,
    triggerFailures,
    evalFailures,
    triggerSuggestions: result.trigger?.suggestions ?? []
  };
}
|
|
5077
|
+
/**
 * Reports whether a brief contains anything to act on.
 *
 * @param {object} brief - Object with `lintIssues`, `triggerFailures`, `evalFailures` and `triggerSuggestions` arrays.
 * @returns {boolean} True when at least one of the four lists is non-empty.
 */
function hasActionableProblems(brief) {
  const problemLists = [
    brief.lintIssues,
    brief.triggerFailures,
    brief.evalFailures,
    brief.triggerSuggestions
  ];
  return problemLists.some((list) => list.length > 0);
}
|
|
5080
|
+
/**
 * Recursively lists the regular files under a directory.
 *
 * Paths are returned relative to `skillRoot`, use "/" as separator, and
 * are sorted. Entries that are neither directories nor regular files are
 * ignored.
 *
 * @param {string} skillRoot - Directory to walk.
 * @returns {Promise<string[]>} Sorted relative file paths.
 */
async function listSkillFiles(skillRoot) {
  const entries = await fs12.readdir(skillRoot, { withFileTypes: true });
  const files = [];
  for (const entry of entries) {
    if (entry.isDirectory()) {
      // The recursive call reports paths relative to the subdirectory, so
      // the directory name is prepended to keep each path relative to this
      // level. Without it "scripts/run.js" would be listed as "run.js".
      const nested = await listSkillFiles(path7.join(skillRoot, entry.name));
      for (const nestedFile of nested) {
        files.push(`${entry.name}/${nestedFile}`);
      }
      continue;
    }
    if (entry.isFile()) {
      files.push(entry.name);
    }
  }
  return files.sort();
}
|
|
5095
|
+
/**
 * Asks the model for a rewritten skill and validates the reply.
 *
 * The system prompt fixes the JSON reply format and the rewrite rules. The
 * user prompt carries the baseline metrics, the files available under the
 * skill root, the current SKILL.md and the actionable brief.
 *
 * @param {object} skill - Parsed current skill.
 * @param {object} baseline - Baseline check result.
 * @param {object} brief - Actionable problems, serialised into the prompt.
 * @param {object} provider - Provider exposing `sendMessage(system, user, options)`.
 * @param {string} model - Model identifier passed to the provider.
 * @returns {Promise<object>} The reply, validated against `improveRewriteSchema`.
 * @throws {Error} When the reply is not a JSON object or fails the schema.
 */
async function requestRewrite(skill, baseline, brief, provider, model) {
  const availableFiles = await listSkillFiles(skill.skillRoot);

  const licenseRule = skill.frontmatter.license
    ? `Keep the license exactly '${skill.frontmatter.license}'.`
    : "Do not remove any valid existing frontmatter fields.";
  const systemLines = [
    "You rewrite Agent Skill files to improve measured quality.",
    "Return JSON only.",
    "Required format:",
    '{"frontmatter": {...}, "content": "...", "changeSummary": ["..."], "targetedProblems": ["..."]}',
    "The content field must contain only the markdown body of SKILL.md, without YAML frontmatter fences.",
    `Keep the skill name exactly '${skill.frontmatter.name}'.`,
    licenseRule,
    "Do not invent new scripts, assets, references, APIs, or tools.",
    "Only reference files that already exist under the skill root.",
    "Optimize for trigger clarity, explicit scope boundaries, concrete examples, safety guidance, and tool usage instructions."
  ];
  const systemPrompt = systemLines.join(" ");

  // A missing trigger section is reported to the model as an F1 of 0.
  const baselineTriggerF1 = baseline.trigger?.metrics.f1 ?? 0;
  const baselineEvalPassRate = calculateEvalAssertPassRate2(baseline.eval);
  const fileBullets = availableFiles.map((file) => `- ${file}`);
  const userLines = [
    `Skill file: ${skill.skillFile}`,
    `Current trigger F1: ${baselineTriggerF1.toFixed(4)}`,
    `Current eval assertion pass rate: ${baselineEvalPassRate.toFixed(4)}`,
    `Lint failures: ${baseline.lint.summary.failures}`,
    `Lint warnings: ${baseline.lint.summary.warnings}`,
    "",
    "Available files under the skill root:",
    ...fileBullets,
    "",
    "Current SKILL.md:",
    "```markdown",
    skill.raw,
    "```",
    "",
    "Actionable problems to fix:",
    JSON.stringify(brief, null, 2),
    "",
    "Rewrite the skill to address only these evidenced problems. Keep the instructions tight and practical."
  ];
  const userPrompt = userLines.join("\n");

  const reply = await provider.sendMessage(systemPrompt, userPrompt, { model });
  const validation = improveRewriteSchema.safeParse(extractJsonObject2(reply));
  if (validation.success) {
    return validation.data;
  }
  throw new Error(`Failed to parse improve output: ${validation.error.issues[0]?.message ?? "invalid improve JSON"}`);
}
|
|
5138
|
+
/**
 * Creates a throwaway copy of the skill with the candidate SKILL.md in place.
 *
 * The skill directory is copied into a fresh temp directory under its
 * original base name, then SKILL.md in the copy is overwritten.
 *
 * @param {string} skillRoot - Skill directory to copy.
 * @param {string} candidateRaw - Candidate SKILL.md text to write.
 * @returns {Promise<{tempRoot: string, skillPath: string}>} The temp
 *   directory (to remove when done) and the copied skill directory.
 * @throws Re-throws any filesystem error after removing the temp directory.
 */
async function createVerificationDirectory(skillRoot, candidateRaw) {
  const tempRoot = await fs12.mkdtemp(path7.join(os.tmpdir(), "skilltest-improve-"));
  try {
    const tempSkillRoot = path7.join(tempRoot, path7.basename(skillRoot));
    await fs12.cp(skillRoot, tempSkillRoot, { recursive: true });
    await fs12.writeFile(path7.join(tempSkillRoot, "SKILL.md"), candidateRaw, "utf8");
    return {
      tempRoot,
      skillPath: tempSkillRoot
    };
  } catch (error) {
    // The caller only learns tempRoot on success, so a failure here would
    // otherwise leave the directory behind. Cleanup is best-effort so the
    // original error is the one that propagates.
    await fs12.rm(tempRoot, { recursive: true, force: true }).catch(() => {});
    throw error;
  }
}
|
|
5148
|
+
/**
 * Compares a verification run against the baseline run.
 *
 * Lint deltas are `before - after` (positive means fewer issues). Trigger
 * F1 and eval pass-rate deltas are `after - before` (positive means higher).
 * A missing trigger section counts as F1 0.
 *
 * `hasRegression` is true when any of the four metrics got worse.
 * `improved` follows the gate outcome when that flipped between runs;
 * otherwise it is true when any metric got better.
 *
 * @param {object} baseline - Check result before the rewrite.
 * @param {object} verification - Check result for the candidate.
 * @returns {object} Before/after/delta per metric, plus `overallPassed`, `improved` and `hasRegression`.
 */
function buildDelta(baseline, verification) {
  const f1Before = baseline.trigger?.metrics.f1 ?? 0;
  const f1After = verification.trigger?.metrics.f1 ?? 0;
  const evalBefore = calculateEvalAssertPassRate2(baseline.eval);
  const evalAfter = calculateEvalAssertPassRate2(verification.eval);
  const lintBefore = baseline.lint.summary;
  const lintAfter = verification.lint.summary;
  const passedBefore = baseline.gates.overallPassed;
  const passedAfter = verification.gates.overallPassed;

  const lintFailures = {
    before: lintBefore.failures,
    after: lintAfter.failures,
    delta: lintBefore.failures - lintAfter.failures
  };
  const lintWarnings = {
    before: lintBefore.warnings,
    after: lintAfter.warnings,
    delta: lintBefore.warnings - lintAfter.warnings
  };
  const triggerF1 = {
    before: f1Before,
    after: f1After,
    delta: f1After - f1Before
  };
  const evalAssertPassRate = {
    before: evalBefore,
    after: evalAfter,
    delta: evalAfter - evalBefore
  };

  const hasRegression = lintAfter.failures > lintBefore.failures
    || lintAfter.warnings > lintBefore.warnings
    || f1After < f1Before
    || evalAfter < evalBefore;

  let improved;
  if (passedAfter !== passedBefore) {
    improved = passedAfter;
  } else {
    improved = lintFailures.delta > 0
      || lintWarnings.delta > 0
      || triggerF1.delta > 0
      || evalAssertPassRate.delta > 0;
  }

  return {
    lintFailures,
    lintWarnings,
    triggerF1,
    evalAssertPassRate,
    overallPassed: {
      before: passedBefore,
      after: passedAfter
    },
    improved,
    hasRegression
  };
}
|
|
5188
|
+
/**
 * Returns a shallow copy of a check result with its `target` replaced.
 *
 * @param {object} result - Check result to copy.
 * @param {*} target - Value to store under `target`.
 * @returns {object} New object; `result` is not modified.
 */
function normalizeVerificationTarget(result, target) {
  return Object.assign({}, result, { target });
}
|
|
5194
|
+
/**
 * Explains why a verified candidate should not be written, if it should not.
 *
 * Checked in order: regression, no measurable improvement, quality gates
 * still failing.
 *
 * @param {object} delta - Comparison exposing `hasRegression` and `improved`.
 * @param {object} verification - Verification result exposing `gates.overallPassed`.
 * @returns {string|undefined} The reason, or undefined when nothing blocks.
 */
function buildBlockingReason(delta, verification) {
  return delta.hasRegression
    ? "Candidate rewrite regressed one or more quality metrics on the frozen test set."
    : !delta.improved
      ? "Candidate rewrite did not produce a measurable improvement on the frozen test set."
      : !verification.gates.overallPassed
        ? "Candidate rewrite improved the skill but still failed the configured quality gates."
        : undefined;
}
|
|
5206
|
+
/**
 * Writes text to a file, creating missing parent directories first.
 *
 * @param {string} outputPath - Destination path; resolved to an absolute path.
 * @param {string} raw - Text to write as UTF-8.
 * @returns {Promise<string>} The absolute path that was written.
 */
async function maybeWriteOutput(outputPath, raw) {
  const destination = path7.resolve(outputPath);
  const parentDirectory = path7.dirname(destination);
  await fs12.mkdir(parentDirectory, { recursive: true });
  await fs12.writeFile(destination, raw, "utf8");
  return destination;
}
|
|
5212
|
+
/**
 * Runs the improve pipeline: baseline check, rewrite request, candidate
 * validation, verification check, and optional write-back.
 *
 * Stages are announced through `options.onStage` as "baseline", "generate",
 * "validate", "verify" and "write". The verification check re-uses the
 * queries and prompts returned by the baseline check and runs against a
 * temporary copy of the skill, which is removed afterwards.
 *
 * The candidate is written (to `options.outputPath` and/or over the source
 * SKILL.md when `options.apply` is set) only when no blocking reason exists.
 *
 * @param {string} inputPath - Path to the skill, as given by the caller.
 * @param {object} options - Provider, model, thresholds and check settings.
 * @returns {Promise<object>} Improve result; `blockedReason` is present
 *   whenever the candidate was not eligible to be written.
 */
async function runImprove(inputPath, options) {
  // Options shared by both check runs; only the test inputs differ.
  const checkOptionsFor = (testInputs) => ({
    provider: options.provider,
    model: options.model,
    graderModel: options.model,
    lintFailOn: options.lintFailOn,
    lintSuppress: options.lintSuppress,
    lintPlugins: options.lintPlugins,
    compare: options.compare,
    numQueries: testInputs.numQueries,
    triggerSeed: options.triggerSeed,
    queries: testInputs.queries,
    evalNumRuns: testInputs.evalNumRuns,
    prompts: testInputs.prompts,
    evalMaxToolIterations: options.evalMaxToolIterations,
    concurrency: options.concurrency,
    minF1: options.minF1,
    minAssertPassRate: options.minAssertPassRate,
    continueOnLintFail: true,
    verbose: options.verbose
  });

  options.onStage?.("baseline");
  const baseline = await runCheck(inputPath, checkOptionsFor({
    numQueries: options.numQueries,
    queries: options.queries,
    evalNumRuns: options.evalNumRuns,
    prompts: options.prompts
  }));

  // Every exit path returns the same shape; only these fields vary.
  const buildResult = ({
    originalRaw,
    candidate = null,
    verification = null,
    delta = null,
    applied = false,
    outputPath,
    blockedReason
  }) => ({
    target: inputPath,
    provider: options.provider.name,
    model: options.model,
    originalRaw,
    thresholds: {
      minF1: options.minF1,
      minAssertPassRate: options.minAssertPassRate
    },
    baseline,
    candidate,
    verification,
    delta,
    applied,
    ...(outputPath !== undefined ? { outputPath } : {}),
    ...(blockedReason !== undefined ? { blockedReason } : {})
  });

  if (!baseline.trigger || !baseline.eval) {
    return buildResult({
      originalRaw: "",
      blockedReason: baseline.triggerSkippedReason ?? baseline.evalSkippedReason ?? "Improve requires a strictly parseable skill so trigger and eval can be frozen."
    });
  }

  const skill = await parseSkillStrict(inputPath);
  const brief = extractActionableIssues(baseline);
  if (!hasActionableProblems(brief)) {
    return buildResult({
      originalRaw: skill.raw,
      blockedReason: "No actionable failures, warnings, or mismatches were found to improve."
    });
  }

  options.onStage?.("generate");
  const rewrite = await requestRewrite(skill, baseline, brief, options.provider, options.model);

  options.onStage?.("validate");
  const candidate = await buildCandidate(skill, rewrite);
  if (candidate.raw === skill.raw) {
    return buildResult({
      originalRaw: skill.raw,
      candidate,
      blockedReason: "Candidate rewrite produced no changes."
    });
  }

  options.onStage?.("verify");
  const verificationDirectory = await createVerificationDirectory(skill.skillRoot, candidate.raw);
  let verification;
  try {
    const verificationRun = await runCheck(verificationDirectory.skillPath, checkOptionsFor({
      numQueries: baseline.trigger.queries.length,
      queries: baseline.trigger.queries,
      evalNumRuns: baseline.eval.prompts.length,
      prompts: baseline.eval.prompts
    }));
    // Report the caller's path rather than the temporary copy.
    verification = normalizeVerificationTarget(verificationRun, inputPath);
  } finally {
    await fs12.rm(verificationDirectory.tempRoot, { recursive: true, force: true });
  }

  const delta = buildDelta(baseline, verification);
  const blockedReason = buildBlockingReason(delta, verification);
  let applied = false;
  let outputPath;
  if (!blockedReason) {
    if (options.outputPath) {
      options.onStage?.("write");
      outputPath = await maybeWriteOutput(options.outputPath, candidate.raw);
    }
    if (options.apply) {
      options.onStage?.("write");
      await fs12.writeFile(skill.skillFile, candidate.raw, "utf8");
      applied = true;
    }
  }

  return buildResult({
    originalRaw: skill.raw,
    candidate,
    verification,
    delta,
    applied,
    outputPath,
    blockedReason
  });
}
|
|
5357
|
+
|
|
5358
|
+
// src/commands/improve.ts
|
|
5359
|
+
// Schema applied to the raw option values of the `improve` command.
// Every field is optional.
var improveCliSchema = z12.object({
  apiKey: z12.string().optional(),
  queries: z12.string().optional(),
  // Non-empty strings only.
  compare: z12.array(z12.string().min(1)).optional(),
  seed: z12.number().int().optional(),
  prompts: z12.string().optional(),
  // Non-empty strings only.
  plugin: z12.array(z12.string().min(1)).optional(),
  // Integer, at least 1.
  concurrency: z12.number().int().min(1).optional(),
  output: z12.string().optional(),
  saveResults: z12.string().optional(),
  apply: z12.boolean().optional(),
  verbose: z12.boolean().optional()
});
|
|
5372
|
+
// Model identifiers compared and returned by resolveModel4.
var DEFAULT_ANTHROPIC_MODEL4 = "claude-sonnet-4-5-20250929";
var DEFAULT_OPENAI_MODEL4 = "gpt-4.1-mini";
|
|
5374
|
+
/**
 * Accumulator for the repeatable `--plugin` option.
 *
 * @param {string} value - Newly supplied value.
 * @param {string[]} [previous=[]] - Values collected so far.
 * @returns {string[]} New array with `value` appended; `previous` is not modified.
 */
function collectPluginPaths3(value, previous = []) {
  const collected = Array.from(previous);
  collected.push(value);
  return collected;
}
|
|
5377
|
+
/**
 * Swaps in the OpenAI default model when the OpenAI provider is paired with
 * the Anthropic default model; every other combination is returned as given.
 *
 * @param {string} provider - Provider name.
 * @param {string} model - Configured model identifier.
 * @returns {string} Model identifier to use.
 */
function resolveModel4(provider, model) {
  const needsOpenAiDefault = provider === "openai" && model === DEFAULT_ANTHROPIC_MODEL4;
  return needsOpenAiDefault ? DEFAULT_OPENAI_MODEL4 : model;
}
|
|
5383
|
+
/**
 * Executes the `improve` command and reports its outcome.
 *
 * A progress spinner is shown only when JSON output is off and stdout is a
 * TTY. The exit code is 1 when the result carries a `blockedReason`, 0 when
 * it does not, and 2 when anything throws.
 *
 * @param {string} targetPath - Path to the skill.
 * @param {object} options - Resolved command options.
 * @param {object} command - Command instance, used to load configured eval prompts.
 * @returns {Promise<void>}
 */
async function handleImproveCommand(targetPath, options, command) {
  const interactive = !options.json && process.stdout.isTTY;
  const spinner = interactive ? ora4("Preparing improvement run...").start() : null;
  const setStatus = (text) => {
    if (spinner) {
      spinner.text = text;
    }
  };

  try {
    setStatus("Initializing model provider...");
    const provider = createProvider(options.provider, options.apiKey);

    let queries = undefined;
    if (options.queries) {
      setStatus("Loading frozen trigger queries...");
      queries = await loadTriggerQueriesFile(options.queries);
    }

    let prompts = undefined;
    if (options.prompts) {
      setStatus("Loading eval prompts...");
      prompts = await loadEvalPromptsJson(options.prompts);
    } else {
      prompts = await loadConfiguredEvalPrompts(command);
    }

    const model = resolveModel4(options.provider, options.model);
    const result = await runImprove(targetPath, {
      provider,
      model,
      lintFailOn: options.lintFailOn,
      lintSuppress: options.lintSuppress,
      lintPlugins: options.lintPlugins,
      compare: options.compare,
      numQueries: options.numQueries,
      triggerSeed: options.triggerSeed,
      queries,
      prompts,
      evalNumRuns: options.numRuns,
      evalMaxToolIterations: options.maxToolIterations,
      minF1: options.minF1,
      minAssertPassRate: options.minAssertPassRate,
      concurrency: options.concurrency,
      apply: options.apply,
      outputPath: options.output,
      verbose: options.verbose,
      onStage: (stage) => {
        // Unknown stage names leave the spinner text unchanged.
        switch (stage) {
          case "baseline":
            setStatus("Running baseline check...");
            break;
          case "generate":
            setStatus("Generating candidate rewrite...");
            break;
          case "validate":
            setStatus("Validating candidate rewrite...");
            break;
          case "verify":
            setStatus("Verifying candidate against frozen test inputs...");
            break;
          case "write":
            setStatus(options.apply ? "Writing improved SKILL.md..." : "Writing candidate output...");
            break;
          default:
            break;
        }
      }
    });

    if (options.saveResults) {
      await writeJsonFile(options.saveResults, result);
    }
    spinner?.stop();

    if (options.json) {
      writeResult(result, true);
    } else {
      writeResult(renderImproveReport(result, options.color, options.verbose), false);
    }
    process.exitCode = result.blockedReason ? 1 : 0;
  } catch (error) {
    spinner?.stop();
    writeError(error, options.json);
    process.exitCode = 2;
  }
}
|
|
5459
|
+
/**
 * Registers the `improve` subcommand on the CLI program.
 *
 * The action merges global options, resolved config and the validated
 * command-line options, then delegates to `handleImproveCommand`. Options
 * that fail `improveCliSchema` are reported and set exit code 2.
 *
 * @param {object} program - Commander program to attach the command to.
 * @returns {void}
 */
function registerImproveCommand(program) {
  const parseInteger = (value) => Number.parseInt(value, 10);
  const parseDecimal = (value) => Number.parseFloat(value);

  const improve = program
    .command("improve")
    .description("Rewrite SKILL.md, verify it on frozen test inputs, and optionally apply it.")
    .argument("<path-to-skill>", "Path to SKILL.md or skill directory");

  improve
    .option("--provider <provider>", "LLM provider: anthropic|openai")
    .option("--model <model>", "Model for baseline, rewrite, and verification runs")
    .option("--api-key <key>", "API key override")
    .option("--queries <path>", "Path to custom trigger queries JSON")
    .option("--compare <path...>", "Path(s) to sibling skill directories to include as competitors")
    .option("--num-queries <n>", "Number of auto-generated trigger queries", parseInteger)
    .option("--seed <number>", "RNG seed for reproducible trigger results", parseInteger)
    .option("--prompts <path>", "Path to eval prompts JSON")
    .option("--plugin <path>", "Load a custom lint plugin file", collectPluginPaths3, [])
    .option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", parseInteger)
    .option("--output <path>", "Write the verified candidate SKILL.md to a separate file")
    .option("--save-results <path>", "Save the full improve result JSON")
    .option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", parseDecimal)
    .option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", parseDecimal)
    .option("--apply", "Apply the verified rewrite to the source SKILL.md")
    .option("--verbose", "Include detailed baseline and verification reports");

  improve.action(async (targetPath, _commandOptions, command) => {
    const globalOptions = getGlobalCliOptions(command);
    const config = getResolvedConfig(command);
    const parsedCli = improveCliSchema.safeParse(command.opts());
    if (!parsedCli.success) {
      writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid improve options."), globalOptions.json);
      process.exitCode = 2;
      return;
    }

    const cli = parsedCli.data;
    const handlerOptions = {
      ...globalOptions,
      provider: config.provider,
      model: config.model,
      apiKey: cli.apiKey,
      queries: cli.queries,
      compare: config.trigger.compare,
      numQueries: config.trigger.numQueries,
      prompts: cli.prompts,
      minF1: config.trigger.threshold,
      minAssertPassRate: config.eval.threshold,
      numRuns: config.eval.numRuns,
      maxToolIterations: config.eval.maxToolIterations,
      concurrency: config.concurrency,
      lintFailOn: config.lint.failOn,
      lintSuppress: config.lint.suppress,
      lintPlugins: config.lint.plugins,
      // A seed given on the command line wins over the configured one.
      triggerSeed: cli.seed ?? config.trigger.seed,
      output: cli.output,
      saveResults: cli.saveResults,
      apply: Boolean(cli.apply),
      verbose: Boolean(cli.verbose)
    };
    await handleImproveCommand(targetPath, handlerOptions, command);
  });
}
|
|
5502
|
+
|
|
5503
|
+
// src/commands/route.ts
|
|
5504
|
+
import fs13 from "node:fs/promises";
|
|
5505
|
+
import ora5 from "ora";
|
|
5506
|
+
import { z as z14 } from "zod";
|
|
5507
|
+
|
|
5508
|
+
// src/core/route-tester.ts
|
|
5509
|
+
import path8 from "node:path";
|
|
5510
|
+
import { z as z13 } from "zod";
|
|
5511
|
+
// Array whose elements are all non-empty strings.
var stringArraySchema = z13.array(z13.string().min(1));
|
|
5512
|
+
/**
 * Parses a JSON array out of a model response.
 *
 * A response that is exactly `[...]` after trimming is parsed as-is.
 * Otherwise the span from the first `[` to the last `]` is parsed, which
 * tolerates surrounding text around the array.
 *
 * @param {string} raw - Raw model output.
 * @returns {*} The value produced by `JSON.parse`.
 * @throws {Error} When no `[ ... ]` span can be located.
 * @throws {SyntaxError} When the located span is not valid JSON.
 */
function parseJsonArrayFromModelOutput2(raw) {
  const text = raw.trim();
  const looksLikeBareArray = text.startsWith("[") && text.endsWith("]");
  if (looksLikeBareArray) {
    return JSON.parse(text);
  }
  const open = text.indexOf("[");
  const close = text.lastIndexOf("]");
  if (open < 0 || close <= open) {
    throw new Error("Model did not return a JSON array.");
  }
  return JSON.parse(text.slice(open, close + 1));
}
|
|
5524
|
+
/**
 * Maps a free-text routing response onto a known skill name.
 *
 * @param {string} rawResponse - Model response to interpret.
 * @param {string[]} skillNames - Candidate skill names, tried in order.
 * @returns {string} "none" when the response starts with "none"
 *   (case-insensitive), otherwise the first skill name that appears in the
 *   response as a whole token, otherwise "unrecognized".
 */
function parseRouteDecision(rawResponse, skillNames) {
  const normalized = rawResponse.trim().toLowerCase();
  if (normalized.startsWith("none")) {
    return "none";
  }
  for (const skillName of skillNames) {
    const escaped = skillName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // "\b" counts "-" as a boundary, so a skill named "pdf" would match
    // inside "pdf-editor" and shadow the longer sibling. The lookarounds
    // require that the name is not adjacent to a word character or hyphen.
    const regex = new RegExp(`(?<![\\w-])${escaped}(?![\\w-])`, "i");
    if (regex.test(rawResponse)) {
      return skillName;
    }
  }
  return "unrecognized";
}
|
|
5538
|
+
/**
 * Finds every file named exactly `SKILL.md` among the paths returned by
 * `listFilesRecursive` for a directory.
 *
 * @param {string} skillDir - Directory to search.
 * @returns {Promise<string[]>} Matching paths, in the order they were listed.
 */
async function discoverSkillPaths(skillDir) {
  const candidates = await listFilesRecursive(skillDir);
  const skillFiles = [];
  for (const filePath of candidates) {
    if (path8.basename(filePath) === "SKILL.md") {
      skillFiles.push(filePath);
    }
  }
  return skillFiles;
}
|
|
5542
|
+
/**
 * Asks the model for user queries that should route to a given skill.
 *
 * @param {object} skill - Parsed skill; its frontmatter `name` and `description` go into the prompt.
 * @param {object} provider - Provider exposing `sendMessage(system, user, options)`.
 * @param {string} model - Model identifier passed to the provider.
 * @param {number} count - Number of queries required.
 * @returns {Promise<string[]>} Exactly `count` queries; extra entries in the reply are dropped.
 * @throws {Error} When the reply is not an array of non-empty strings or has fewer than `count` entries.
 */
async function generatePositiveQueriesForSkill(skill, provider, model, count) {
  const { name: skillName, description } = skill.frontmatter;

  const systemPrompt = [
    "You generate realistic user queries that should trigger a specific agent skill.",
    "Return a JSON array of strings only. No markdown, no comments.",
    "Each string is one realistic user query that clearly belongs to this skill.",
    "Queries should look like real user requests with enough context to drive a routing decision."
  ].join(" ");
  const userPrompt = [
    `Skill name: ${skillName}`,
    `Skill description: ${description}`,
    `Generate exactly ${count} distinct queries that should trigger this skill.`
  ].join("\n");

  const reply = await provider.sendMessage(systemPrompt, userPrompt, { model });
  const validation = stringArraySchema.safeParse(parseJsonArrayFromModelOutput2(reply));
  if (!validation.success) {
    throw new Error(
      `Failed to parse generated queries for skill '${skillName}': ${validation.error.issues[0]?.message ?? "invalid format"}`
    );
  }

  const queries = validation.data;
  if (queries.length < count) {
    throw new Error(
      `Expected ${count} queries for skill '${skillName}', got ${queries.length}.`
    );
  }
  return queries.slice(0, count);
}
|
|
5568
|
+
/**
 * Renders skills as a bullet list, one `- name: description` line each.
 *
 * @param {object[]} skills - Parsed skills with `frontmatter.name` and `frontmatter.description`.
 * @returns {string} Lines joined with "\n"; empty string for an empty list.
 */
function buildSkillListText(skills) {
  const lines = [];
  for (const { frontmatter } of skills) {
    lines.push(`- ${frontmatter.name}: ${frontmatter.description}`);
  }
  return lines.join("\n");
}
|
|
5571
|
+
/**
 * Tallies routing outcomes into a target × actual matrix.
 *
 * Each row has one column per skill plus "none" and "unrecognized". Cases
 * whose `targetSkill` is not in `skillNames` are ignored. `matrixPct`
 * divides every count by `numQueriesPerSkill`, or by 1 when that is not
 * positive.
 *
 * @param {Array<{targetSkill: string, actualSkill: string}>} cases - Routing outcomes.
 * @param {string[]} skillNames - Skills that form the rows.
 * @param {number} numQueriesPerSkill - Queries issued per skill.
 * @returns {{matrix: object, matrixPct: object}} Raw counts and their ratios.
 */
function buildConfusionMatrix(cases, skillNames, numQueriesPerSkill) {
  const columns = [...skillNames, "none", "unrecognized"];

  const matrix = {};
  skillNames.forEach((target) => {
    const row = {};
    columns.forEach((actual) => {
      row[actual] = 0;
    });
    matrix[target] = row;
  });

  cases.forEach((routed) => {
    const row = matrix[routed.targetSkill];
    if (!row) {
      return;
    }
    row[routed.actualSkill] = (row[routed.actualSkill] ?? 0) + 1;
  });

  const divisor = numQueriesPerSkill > 0 ? numQueriesPerSkill : 1;
  const matrixPct = {};
  skillNames.forEach((target) => {
    const ratios = {};
    matrixPct[target] = ratios;
    columns.forEach((actual) => {
      ratios[actual] = (matrix[target][actual] ?? 0) / divisor;
    });
  });

  return { matrix, matrixPct };
}
|
|
5596
|
+
/**
 * Derives precision, recall and F1 for each skill from a confusion matrix.
 *
 * Correct routes are `matrix[skill][skill]`; false positives are the other
 * skills' queries that were routed to this skill. Recall divides by
 * `numQueriesPerSkill`. A zero denominator yields 0 for that metric.
 *
 * @param {string[]} skillNames - Skills to report on.
 * @param {object} matrix - Counts indexed as `matrix[target][actual]`.
 * @param {number} numQueriesPerSkill - Queries issued per skill.
 * @returns {Array<{skill: string, queriesTotal: number, correct: number, precision: number, recall: number, f1: number}>}
 */
function computePerSkillMetrics(skillNames, matrix, numQueriesPerSkill) {
  const metrics = [];
  for (const skill of skillNames) {
    const correct = matrix[skill]?.[skill] ?? 0;

    let falsePositives = 0;
    for (const other of skillNames) {
      if (other !== skill) {
        falsePositives += matrix[other]?.[skill] ?? 0;
      }
    }

    const recall = numQueriesPerSkill === 0 ? 0 : correct / numQueriesPerSkill;
    const routedHere = correct + falsePositives;
    const precision = routedHere === 0 ? 0 : correct / routedHere;
    const f1 = precision + recall === 0 ? 0 : 2 * precision * recall / (precision + recall);

    metrics.push({ skill, queriesTotal: numQueriesPerSkill, correct, precision, recall, f1 });
  }
  return metrics;
}
|
|
5607
|
+
/**
 * Finds skill pairs whose queries bleed into each other.
 *
 * Each unordered pair is examined once. A pair is reported when the larger
 * of its two bleed ratios is strictly greater than the threshold. Missing
 * matrix entries count as 0.
 *
 * @param {string[]} skillNames - Skills to compare pairwise.
 * @param {object} matrixPct - Ratios indexed as `matrixPct[target][actual]`.
 * @param {number} conflictThreshold - Ratio a pair must exceed to be reported.
 * @returns {Array<{skillA: string, skillB: string, bleedAtoB: number, bleedBtoA: number}>}
 */
function detectConflicts(skillNames, matrixPct, conflictThreshold) {
  const conflicts = [];
  skillNames.forEach((skillA, index) => {
    // Pairing only with later names visits each unordered pair once.
    skillNames.slice(index + 1).forEach((skillB) => {
      const bleedAtoB = matrixPct[skillA]?.[skillB] ?? 0;
      const bleedBtoA = matrixPct[skillB]?.[skillA] ?? 0;
      const worstBleed = Math.max(bleedAtoB, bleedBtoA);
      if (worstBleed > conflictThreshold) {
        conflicts.push({ skillA, skillB, bleedAtoB, bleedBtoA });
      }
    });
  });
  return conflicts;
}
|
|
5622
|
+
/**
 * Turns per-skill metrics and detected conflicts into human-readable advice.
 *
 * Low-F1 notes (F1 below 0.7) come first, in metric order, followed by one note
 * per conflict. When neither produces anything, a single "looks clean" message
 * is returned instead.
 *
 * @param {{skill: string, f1: number}[]} perSkillMetrics - Metrics per skill.
 * @param {{skillA: string, skillB: string, bleedAtoB: number, bleedBtoA: number}[]} conflicts
 *   Conflicting skill pairs with their bleed fractions.
 * @returns {string[]} Suggestion lines; never empty.
 */
function buildRouteSuggestions(perSkillMetrics, conflicts) {
  const asPercent = (fraction) => (fraction * 100).toFixed(1);

  const lowF1Notes = perSkillMetrics
    .filter((entry) => entry.f1 < 0.7)
    .map(
      (entry) => `'${entry.skill}' has low F1 (${asPercent(entry.f1)}%) \u2014 consider clarifying its description and scope boundaries.`
    );

  const overlapNotes = conflicts.map(
    ({ skillA, skillB, bleedAtoB, bleedBtoA }) => `'${skillA}' and '${skillB}' overlap: ${asPercent(bleedAtoB)}% of ${skillA} queries routed to ${skillB}, ${asPercent(bleedBtoA)}% the other way \u2014 consider narrowing scope boundaries.`
  );

  const notes = [...lowF1Notes, ...overlapNotes];
  if (notes.length > 0) {
    return notes;
  }
  return ["Routing looks clean. All skills are well-differentiated on this sample."];
}
|
|
5641
|
+
/**
 * Runs a multi-skill routing evaluation over the skills found under `skillDir`.
 *
 * Steps: discover and strictly parse the skills, generate positive queries for
 * each one, ask the provider to route every query against the full skill list,
 * then summarise the outcome as a confusion matrix, per-skill metrics, conflicts
 * and suggestions.
 *
 * @param {string} skillDir - Directory to search for skills; resolved to an absolute path.
 * @param {object} options
 * @param {object} options.provider - Provider exposing `sendMessage(system, user, { model })` and `name`.
 * @param {string} options.model - Model identifier forwarded to the provider.
 * @param {number} [options.numQueriesPerSkill=10] - Queries requested per skill.
 * @param {number} [options.conflictThreshold=0.1] - Bleed fraction above which a pair is a conflict.
 * @param {number} [options.concurrency=5] - Concurrency value handed to `pMap`.
 * @param {number} [options.seed] - Echoed into the result; not otherwise used here.
 * @param {boolean} [options.verbose] - When truthy, each case keeps the raw model response.
 * @returns {Promise<object>} Result with `cases`, `matrix`, `matrixPct`, `perSkillMetrics`,
 *   `conflicts`, `suggestions`, `overallAccuracy` and the run metadata.
 * @throws {Error} When fewer than 2 skills are found, or when two discovered
 *   skills declare the same frontmatter name.
 */
async function runRouteTest(skillDir, options) {
  const numQueriesPerSkill = options.numQueriesPerSkill ?? 10;
  const conflictThreshold = options.conflictThreshold ?? 0.1;
  const concurrency = options.concurrency ?? 5;
  const absoluteSkillDir = path8.resolve(skillDir);
  const skillPaths = await discoverSkillPaths(absoluteSkillDir);
  if (skillPaths.length < 2) {
    throw new Error(
      `Route test requires at least 2 skills. Found ${skillPaths.length} in: ${skillDir}`
    );
  }
  if (skillPaths.length > 20) {
    // Only a heads-up about cost; the run still proceeds.
    process.stderr.write(
      `Warning: ${skillPaths.length} skills found. This will make ${skillPaths.length * numQueriesPerSkill} routing model calls.\n`
    );
  }
  const skills = await Promise.all(skillPaths.map((p) => parseSkillStrict(p)));
  const skillNames = skills.map((s) => s.frontmatter.name);

  // The confusion matrix and every metric derived from it are keyed by skill
  // name. Two skills sharing a name would merge their rows (pushing recall and
  // bleed fractions past 100%) and make the routing prompt ambiguous, so fail
  // loudly instead of reporting corrupted numbers.
  const duplicateNames = [
    ...new Set(skillNames.filter((name, index) => skillNames.indexOf(name) !== index))
  ];
  if (duplicateNames.length > 0) {
    throw new Error(
      `Route test requires unique skill names. Duplicated: ${duplicateNames.join(", ")}`
    );
  }

  const queriesPerSkill = await pMap(
    skills,
    (skill) => generatePositiveQueriesForSkill(skill, options.provider, options.model, numQueriesPerSkill),
    concurrency
  );

  // Flatten to one work item per generated query, tagged with the skill it was written for.
  const workItems = [];
  for (let i = 0; i < skills.length; i++) {
    const targetSkill = skills[i].frontmatter.name;
    for (const query of queriesPerSkill[i]) {
      workItems.push({ query, targetSkill });
    }
  }

  const skillListText = buildSkillListText(skills);
  const systemPrompt = "Select the single best skill for the user's request from the provided list. Respond with only the skill name, or 'none' if nothing fits.";
  const cases = await pMap(
    workItems,
    async ({ query, targetSkill }) => {
      const userPrompt = `Available skills:\n${skillListText}\n\nUser query: ${query}`;
      const rawResponse = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
      const actualSkill = parseRouteDecision(rawResponse, skillNames);
      return {
        query,
        targetSkill,
        actualSkill,
        correct: actualSkill === targetSkill,
        // Raw responses are only retained on request to keep results compact.
        rawModelResponse: options.verbose ? rawResponse : void 0
      };
    },
    concurrency
  );

  const { matrix, matrixPct } = buildConfusionMatrix(cases, skillNames, numQueriesPerSkill);
  const perSkillMetrics = computePerSkillMetrics(skillNames, matrix, numQueriesPerSkill);
  const conflicts = detectConflicts(skillNames, matrixPct, conflictThreshold);
  const correctCount = cases.filter((c) => c.correct).length;
  const overallAccuracy = cases.length === 0 ? 0 : correctCount / cases.length;
  const suggestions = buildRouteSuggestions(perSkillMetrics, conflicts);
  return {
    skillDir: absoluteSkillDir,
    skills: skillNames,
    model: options.model,
    provider: options.provider.name,
    seed: options.seed,
    numQueriesPerSkill,
    cases,
    matrix,
    matrixPct,
    perSkillMetrics,
    conflicts,
    suggestions,
    overallAccuracy
  };
}
|
|
5716
|
+
|
|
5717
|
+
// src/commands/route.ts
|
|
5718
|
+
/**
 * Validation schema for the flags specific to the `route` command.
 * Every field is optional; numeric fields expect values already parsed to numbers.
 */
var routeCliSchema = (() => {
  // Small factories so repeated field shapes are spelled out once.
  const optionalPositiveInt = () => z14.number().int().min(1).optional();
  const optionalString = () => z14.string().optional();

  return z14.object({
    numQueries: optionalPositiveInt(),
    // Fraction in the closed range [0, 1].
    conflictThreshold: z14.number().min(0).max(1).optional(),
    saveQueries: optionalString(),
    seed: z14.number().int().optional(),
    concurrency: optionalPositiveInt(),
    html: optionalString(),
    verbose: z14.boolean().optional(),
    apiKey: optionalString()
  });
})();
|
|
5728
|
+
// Default model identifiers used when reconciling the model with the provider.
var DEFAULT_ANTHROPIC_MODEL5 = "claude-sonnet-4-5-20250929";
var DEFAULT_OPENAI_MODEL5 = "gpt-4.1-mini";

/**
 * Picks the model to use for the given provider.
 *
 * If the provider is "openai" but the model is still the Anthropic default,
 * the OpenAI default is substituted; any other combination is returned as is.
 *
 * @param {string} provider - Provider identifier.
 * @param {string} model - Requested model name.
 * @returns {string} The model to use.
 */
function resolveModel5(provider, model) {
  const isAnthropicDefaultOnOpenAi = provider === "openai" && model === DEFAULT_ANTHROPIC_MODEL5;
  return isAnthropicDefaultOnOpenAi ? DEFAULT_OPENAI_MODEL5 : model;
}
|
|
5736
|
+
/**
 * Executes the `route` command: runs the route test and emits its outputs.
 *
 * Output order: optional saved-queries JSON file, then the report (JSON or
 * rendered text) via `writeResult`, then the optional HTML file. Any error is
 * reported through `writeError` and sets `process.exitCode` to 2.
 *
 * @param {string} skillDir - Directory passed through to `runRouteTest`.
 * @param {object} options - Merged global and route options (`json`, `color`,
 *   `provider`, `model`, `apiKey`, `numQueriesPerSkill`, `conflictThreshold`,
 *   `seed`, `concurrency`, `verbose`, `saveQueries`, `html`).
 * @returns {Promise<void>}
 */
async function handleRouteCommand(skillDir, options) {
  // A spinner is only shown for interactive, non-JSON runs.
  const wantsSpinner = !options.json && process.stdout.isTTY;
  const spinner = wantsSpinner ? ora5("Preparing route evaluation...").start() : null;
  const setStatus = (text) => {
    if (spinner) {
      spinner.text = text;
    }
  };

  try {
    setStatus("Initializing model provider...");
    const provider = createProvider(options.provider, options.apiKey);

    setStatus("Running route simulations...");
    const model = resolveModel5(options.provider, options.model);
    const { numQueriesPerSkill, conflictThreshold, seed, concurrency, verbose } = options;
    const result = await runRouteTest(skillDir, {
      model,
      provider,
      numQueriesPerSkill,
      conflictThreshold,
      seed,
      concurrency,
      verbose
    });

    if (options.saveQueries) {
      // Persist only the query text and its intended target.
      const savedQueries = result.cases.map(({ query, targetSkill }) => ({ query, targetSkill }));
      await writeJsonFile(options.saveQueries, savedQueries);
    }

    // Stop the spinner before anything is printed.
    spinner?.stop();

    if (!options.json) {
      writeResult(renderRouteReport(result, options.color, options.verbose), false);
    } else {
      writeResult(result, true);
    }

    if (options.html) {
      await fs13.writeFile(options.html, renderRouteHtml(result), "utf8");
    }
  } catch (error) {
    spinner?.stop();
    writeError(error, options.json);
    process.exitCode = 2;
  }
}
|
|
5773
|
+
/**
 * Registers the `route` subcommand, its argument, its options and its action
 * handler on the given commander program.
 *
 * The action validates the parsed flags against `routeCliSchema`; on failure it
 * reports the first issue and sets `process.exitCode` to 2, otherwise it
 * delegates to `handleRouteCommand`.
 *
 * @param {object} program - Commander program (or command) to attach to.
 * @returns {void}
 */
function registerRouteCommand(program) {
  const parseInteger = (value) => Number.parseInt(value, 10);
  const parseDecimal = (value) => Number.parseFloat(value);

  // [flags, description, optional value parser], in registration order.
  const optionSpecs = [
    ["--model <model>", "Model to use"],
    ["--provider <provider>", "LLM provider: anthropic|openai"],
    ["--num-queries <n>", "Queries per skill (default: 10)", parseInteger],
    ["--conflict-threshold <n>", "Bleed fraction to flag as conflict (default: 0.1)", parseDecimal],
    ["--seed <number>", "RNG seed for reproducibility metadata", parseInteger],
    ["--concurrency <n>", "Maximum in-flight requests", parseInteger],
    ["--html <path>", "Write an HTML report to the given file path"],
    ["--save-queries <path>", "Save generated queries as JSON"],
    ["--api-key <key>", "API key override"],
    ["--verbose", "Show raw model responses"]
  ];

  const baseCommand = program
    .command("route")
    .description("Validate multi-skill routing across all skills in a directory.")
    .argument("<skillDir>", "Directory containing skill subdirectories with SKILL.md files");

  // Thread each call's return value into the next one, as a fluent chain would.
  const routeCommand = optionSpecs.reduce((cmd, spec) => cmd.option(...spec), baseCommand);

  routeCommand.action(async (skillDir, _commandOptions, command) => {
    const globalOptions = getGlobalCliOptions(command);
    const config = getResolvedConfig(command);
    const parsedCli = routeCliSchema.safeParse(command.opts());
    if (!parsedCli.success) {
      const firstIssue = parsedCli.error.issues[0]?.message ?? "Invalid route options.";
      writeError(new Error(firstIssue), globalOptions.json);
      process.exitCode = 2;
      return;
    }

    const flags = parsedCli.data;
    await handleRouteCommand(skillDir, {
      ...globalOptions,
      model: config.model,
      provider: config.provider,
      numQueriesPerSkill: flags.numQueries ?? 10,
      conflictThreshold: flags.conflictThreshold ?? 0.1,
      saveQueries: flags.saveQueries,
      seed: flags.seed,
      // An explicit CLI value wins over the configured concurrency.
      concurrency: flags.concurrency ?? config.concurrency,
      html: flags.html,
      verbose: Boolean(flags.verbose),
      apiKey: flags.apiKey
    });
  });
}
|
|
5798
|
+
|
|
4124
5799
|
// src/index.ts
|
|
4125
5800
|
function resolveVersion() {
|
|
4126
5801
|
try {
|
|
4127
5802
|
const currentFilePath = fileURLToPath(import.meta.url);
|
|
4128
|
-
const packageJsonPath =
|
|
4129
|
-
const raw =
|
|
5803
|
+
const packageJsonPath = path9.resolve(path9.dirname(currentFilePath), "..", "package.json");
|
|
5804
|
+
const raw = fs14.readFileSync(packageJsonPath, "utf8");
|
|
4130
5805
|
const parsed = JSON.parse(raw);
|
|
4131
5806
|
return parsed.version ?? "0.0.0";
|
|
4132
5807
|
} catch {
|
|
@@ -4159,6 +5834,8 @@ async function run(argv) {
|
|
|
4159
5834
|
registerTriggerCommand(program);
|
|
4160
5835
|
registerEvalCommand(program);
|
|
4161
5836
|
registerCheckCommand(program);
|
|
5837
|
+
registerImproveCommand(program);
|
|
5838
|
+
registerRouteCommand(program);
|
|
4162
5839
|
try {
|
|
4163
5840
|
await program.parseAsync(argv);
|
|
4164
5841
|
} catch (error) {
|