skilltest 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +8 -1
- package/README.md +175 -12
- package/dist/index.js +1325 -79
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/index.ts
|
|
4
|
-
import
|
|
5
|
-
import
|
|
4
|
+
import fs13 from "node:fs";
|
|
5
|
+
import path8 from "node:path";
|
|
6
6
|
import { fileURLToPath } from "node:url";
|
|
7
7
|
import { Command } from "commander";
|
|
8
8
|
|
|
@@ -100,7 +100,10 @@ function parseFrontmatter(rawSkill) {
|
|
|
100
100
|
}
|
|
101
101
|
/**
 * Loads the skill at `inputPath` and parses it in strict mode.
 *
 * File loading is delegated to `loadSkillFile`; parsing and validation are
 * delegated to `parseSkillDocumentStrict`, which receives the raw text
 * together with the skill root and skill file reported by the loader.
 *
 * @param {string} inputPath - Location handed to `loadSkillFile`.
 * @returns {Promise<object>} Whatever `parseSkillDocumentStrict` returns.
 */
async function parseSkillStrict(inputPath) {
  const { raw, skillRoot, skillFile } = await loadSkillFile(inputPath);
  return parseSkillDocumentStrict(raw, skillRoot, skillFile);
}
|
|
105
|
+
function parseSkillDocumentStrict(rawSkill, skillRoot, skillFile) {
|
|
106
|
+
const parsedFrontmatter = parseFrontmatter(rawSkill);
|
|
104
107
|
if (!parsedFrontmatter.hasFrontmatter) {
|
|
105
108
|
throw new Error("SKILL.md is missing YAML frontmatter.");
|
|
106
109
|
}
|
|
@@ -113,9 +116,9 @@ async function parseSkillStrict(inputPath) {
|
|
|
113
116
|
throw new Error(`Invalid frontmatter field '${issue.path.join(".")}': ${issue.message}`);
|
|
114
117
|
}
|
|
115
118
|
return {
|
|
116
|
-
skillRoot
|
|
117
|
-
skillFile
|
|
118
|
-
raw:
|
|
119
|
+
skillRoot,
|
|
120
|
+
skillFile,
|
|
121
|
+
raw: rawSkill,
|
|
119
122
|
content: parsedFrontmatter.content,
|
|
120
123
|
frontmatterRaw: parsedFrontmatter.rawFrontmatter,
|
|
121
124
|
frontmatter: validation.data
|
|
@@ -1515,6 +1518,9 @@ function badgeLabel(status) {
|
|
|
1515
1518
|
/**
 * Renders a status badge `<span>`.
 *
 * The status doubles as a CSS class name (`badge <status>`); the visible
 * text comes from `badgeLabel(status)`.
 *
 * @param {string} status - Status key, used verbatim as a class name.
 * @returns {string} HTML markup for the badge.
 */
function renderBadge(status) {
  const label = badgeLabel(status);
  return `<span class="badge ${status}">${label}</span>`;
}
|
|
1521
|
+
/**
 * Renders a neutral "meta" badge (class `meta-badge`) with an HTML-escaped label.
 *
 * @param {string} label - Text to show; passed through `escapeHtml`.
 * @returns {string} HTML markup for the badge.
 */
function renderMetaBadge(label) {
  const safeLabel = escapeHtml(label);
  return `<span class="meta-badge">${safeLabel}</span>`;
}
|
|
1518
1524
|
function renderStatCards(stats) {
|
|
1519
1525
|
return `<div class="stats-grid">${stats.map(
|
|
1520
1526
|
(stat) => `
|
|
@@ -1690,10 +1696,37 @@ function promptStatus(promptResult) {
|
|
|
1690
1696
|
return "warn";
|
|
1691
1697
|
}
|
|
1692
1698
|
/**
 * Renders one assertion as a collapsible `<details>` row.
 *
 * The summary line shows a pass/fail badge, an extra "Tool" meta badge when
 * the assertion's `source` is `"tool"`, and the escaped assertion text. The
 * expandable body shows the evidence via `renderPreBlock`.
 *
 * @param {{passed: boolean, source?: string, assertion: string, evidence: string}} assertion
 * @returns {string} HTML markup for the row.
 */
function renderAssertionRow(assertion) {
  const statusBadge = renderBadge(assertion.passed ? "pass" : "fail");
  // Only structurally evaluated (tool) assertions get the extra badge.
  const sourceBadge = assertion.source === "tool" ? renderMetaBadge("Tool") : "";
  const title = escapeHtml(assertion.assertion);
  const evidence = renderPreBlock(assertion.evidence);
  return `
    <details class="detail-block">
      <summary>
        ${statusBadge}
        ${sourceBadge}
        <span>${title}</span>
      </summary>
      <div class="detail-content">${evidence}</div>
    </details>
  `;
}
|
|
1710
|
+
/**
 * Renders the collapsible "Tool Calls" section for one prompt result.
 *
 * @param {{toolCalls?: Array<{name: string, turnIndex: number, arguments: unknown, response: string}>}} promptResult
 * @returns {string} HTML markup, or an empty string when `toolCalls` is
 *   missing or empty.
 */
function renderToolCallsSection(promptResult) {
  const calls = promptResult.toolCalls;
  if (!calls || calls.length === 0) {
    return "";
  }

  // One card per recorded call: name, turn, serialized arguments, mock response.
  const renderCall = (toolCall) => {
    const title = escapeHtml(toolCall.name);
    const subtitle = escapeHtml(`turn ${toolCall.turnIndex}`);
    const badge = renderMetaBadge("Tool Call");
    const argumentList = renderDefinitionList([{ label: "Arguments", value: JSON.stringify(toolCall.arguments) }]);
    const mockResponse = renderDetails("Mock response", renderPreBlock(toolCall.response));
    return `
      <div class="tool-call">
        <div class="row-header">
          <div>
            <div class="row-title">${title}</div>
            <div class="row-subtitle">${subtitle}</div>
          </div>
          ${badge}
        </div>
        ${argumentList}
        ${mockResponse}
      </div>
    `;
  };

  const toolRows = calls.map((toolCall) => renderCall(toolCall)).join("");
  return renderDetails("Tool Calls", `<div class="tool-call-list">${toolRows}</div>`);
}
|
|
1698
1731
|
function renderEvalPromptRow(promptResult) {
|
|
1699
1732
|
const assertionDetails = promptResult.assertions.map((assertion) => renderAssertionRow(assertion)).join("");
|
|
@@ -1712,9 +1745,12 @@ function renderEvalPromptRow(promptResult) {
|
|
|
1712
1745
|
<div class="row-body">${escapeHtml(promptResult.responseSummary)}</div>
|
|
1713
1746
|
${renderDefinitionList([
|
|
1714
1747
|
{ label: "Passed assertions", value: String(promptResult.passedAssertions) },
|
|
1715
|
-
{ label: "Total assertions", value: String(promptResult.totalAssertions) }
|
|
1748
|
+
{ label: "Total assertions", value: String(promptResult.totalAssertions) },
|
|
1749
|
+
...promptResult.toolCalls ? [{ label: "Tool calls", value: String(promptResult.toolCalls.length) }] : [],
|
|
1750
|
+
...promptResult.loopIterations !== void 0 ? [{ label: "Loop iterations", value: String(promptResult.loopIterations) }] : []
|
|
1716
1751
|
])}
|
|
1717
1752
|
${renderDetails("Assertion evidence", assertionDetails || `<p>No assertions.</p>`)}
|
|
1753
|
+
${renderToolCallsSection(promptResult)}
|
|
1718
1754
|
${responseDetails}
|
|
1719
1755
|
</div>
|
|
1720
1756
|
`;
|
|
@@ -1981,6 +2017,20 @@ function renderHtmlDocument(title, body) {
|
|
|
1981
2017
|
background: rgba(107, 114, 128, 0.14);
|
|
1982
2018
|
}
|
|
1983
2019
|
|
|
2020
|
+
.meta-badge {
|
|
2021
|
+
display: inline-flex;
|
|
2022
|
+
align-items: center;
|
|
2023
|
+
justify-content: center;
|
|
2024
|
+
padding: 3px 10px;
|
|
2025
|
+
border-radius: 999px;
|
|
2026
|
+
border: 1px solid rgba(17, 24, 39, 0.16);
|
|
2027
|
+
background: rgba(17, 24, 39, 0.06);
|
|
2028
|
+
color: var(--text);
|
|
2029
|
+
font-size: 0.76rem;
|
|
2030
|
+
font-weight: 700;
|
|
2031
|
+
white-space: nowrap;
|
|
2032
|
+
}
|
|
2033
|
+
|
|
1984
2034
|
details {
|
|
1985
2035
|
margin-top: 10px;
|
|
1986
2036
|
}
|
|
@@ -1995,6 +2045,13 @@ function renderHtmlDocument(title, body) {
|
|
|
1995
2045
|
padding-top: 10px;
|
|
1996
2046
|
}
|
|
1997
2047
|
|
|
2048
|
+
.detail-block summary {
|
|
2049
|
+
display: flex;
|
|
2050
|
+
align-items: center;
|
|
2051
|
+
gap: 8px;
|
|
2052
|
+
flex-wrap: wrap;
|
|
2053
|
+
}
|
|
2054
|
+
|
|
1998
2055
|
.detail-content p {
|
|
1999
2056
|
margin: 0;
|
|
2000
2057
|
}
|
|
@@ -2045,6 +2102,18 @@ function renderHtmlDocument(title, body) {
|
|
|
2045
2102
|
overflow-wrap: anywhere;
|
|
2046
2103
|
}
|
|
2047
2104
|
|
|
2105
|
+
.tool-call-list {
|
|
2106
|
+
display: grid;
|
|
2107
|
+
gap: 12px;
|
|
2108
|
+
}
|
|
2109
|
+
|
|
2110
|
+
.tool-call {
|
|
2111
|
+
border: 1px solid var(--border);
|
|
2112
|
+
border-radius: 12px;
|
|
2113
|
+
padding: 14px;
|
|
2114
|
+
background: #fffaf0;
|
|
2115
|
+
}
|
|
2116
|
+
|
|
2048
2117
|
ul {
|
|
2049
2118
|
margin: 0;
|
|
2050
2119
|
padding-left: 20px;
|
|
@@ -2270,6 +2339,70 @@ function countSkippedSecurityPatterns2(issues) {
|
|
|
2270
2339
|
/**
 * Formats a ratio as a percentage with one decimal place.
 *
 * @param {number} value - Ratio to format (e.g. `0.5`).
 * @returns {string} The value times 100, fixed to one decimal, followed by `%`.
 */
function formatPercent2(value) {
  const percent = value * 100;
  return `${percent.toFixed(1)}%`;
}
|
|
2342
|
+
/**
 * Formats a number with an explicit sign and a fixed number of decimals.
 *
 * Positive values get a leading `+`, negative values a leading `-`. A value
 * that rounds to zero at the requested precision is rendered without any
 * sign, so a tiny delta such as `-0.00001` prints as `0.0000` rather than
 * the misleading `-0.0000` (or `+0.0000` for a tiny positive delta).
 *
 * @param {number} value - Number to format.
 * @param {number} [digits=4] - Decimal places passed to `toFixed`.
 * @returns {string} Signed fixed-point representation; `"NaN"` for NaN.
 */
function formatSignedNumber(value, digits = 4) {
  if (Number.isNaN(value)) {
    return "NaN";
  }
  // Format the magnitude first so the sign decision is based on what is
  // actually displayed, not on the unrounded input.
  const magnitude = Math.abs(value).toFixed(digits);
  if (Number(magnitude) === 0) {
    return magnitude;
  }
  return `${value > 0 ? "+" : "-"}${magnitude}`;
}
|
|
2346
|
+
/**
 * Computes the changed lines between two texts using a longest-common-
 * subsequence (LCS) table.
 *
 * Both texts are split on `\n` / `\r\n`. Lines that are part of the LCS are
 * omitted; every other line is reported as removed (`"-"`, from the before
 * text) or added (`"+"`, from the after text), in walk order.
 *
 * Identical inputs return immediately so no table is allocated for them.
 * Otherwise the table needs (before lines + 1) x (after lines + 1) cells.
 *
 * @param {string} beforeText - Original text.
 * @param {string} afterText - Updated text.
 * @returns {Array<{type: "-" | "+", line: string}>} Changed lines only.
 */
function diffChangedLines(beforeText, afterText) {
  const beforeLines = beforeText.split(/\r?\n/);
  const afterLines = afterText.split(/\r?\n/);
  if (beforeText === afterText) {
    return [];
  }

  const beforeCount = beforeLines.length;
  const afterCount = afterLines.length;

  // lcs[i][j] = LCS length of beforeLines[i..] and afterLines[j..]. The extra
  // row and column stay 0 and act as the base case, so no bounds checks or
  // fallbacks are needed when reading i + 1 / j + 1.
  const lcs = Array.from({ length: beforeCount + 1 }, () => new Array(afterCount + 1).fill(0));
  for (let i = beforeCount - 1; i >= 0; i -= 1) {
    for (let j = afterCount - 1; j >= 0; j -= 1) {
      lcs[i][j] = beforeLines[i] === afterLines[j]
        ? lcs[i + 1][j + 1] + 1
        : Math.max(lcs[i + 1][j], lcs[i][j + 1]);
    }
  }

  const changedLines = [];
  let beforeIndex = 0;
  let afterIndex = 0;
  while (beforeIndex < beforeCount && afterIndex < afterCount) {
    if (beforeLines[beforeIndex] === afterLines[afterIndex]) {
      beforeIndex += 1;
      afterIndex += 1;
      continue;
    }
    // Drop whichever side keeps the longer common subsequence reachable;
    // ties favour reporting the removal first.
    if (lcs[beforeIndex + 1][afterIndex] >= lcs[beforeIndex][afterIndex + 1]) {
      changedLines.push({ type: "-", line: beforeLines[beforeIndex] });
      beforeIndex += 1;
    } else {
      changedLines.push({ type: "+", line: afterLines[afterIndex] });
      afterIndex += 1;
    }
  }

  // Whatever is left on either side has no counterpart.
  for (; beforeIndex < beforeCount; beforeIndex += 1) {
    changedLines.push({ type: "-", line: beforeLines[beforeIndex] });
  }
  for (; afterIndex < afterCount; afterIndex += 1) {
    changedLines.push({ type: "+", line: afterLines[afterIndex] });
  }
  return changedLines;
}
|
|
2388
|
+
/**
 * Builds a short textual preview of the changed lines between two texts.
 *
 * @param {string} beforeText - Original text.
 * @param {string} afterText - Updated text.
 * @param {number} [maxLines=40] - Maximum number of changed lines to list.
 * @returns {string[]} One entry per listed change (`<type> <line>`), plus a
 *   trailing "more changed line(s)" entry when the list was truncated, or a
 *   single "(no content changes)" entry when nothing changed.
 */
function renderDiffPreview(beforeText, afterText, maxLines = 40) {
  const changes = diffChangedLines(beforeText, afterText);
  const totalChanges = changes.length;
  if (totalChanges === 0) {
    return [" (no content changes)"];
  }

  const preview = [];
  for (const { type, line } of changes.slice(0, maxLines)) {
    preview.push(` ${type} ${line}`);
  }
  if (totalChanges > maxLines) {
    const hiddenCount = totalChanges - maxLines;
    preview.push(` ... ${hiddenCount} more changed line(s)`);
  }
  return preview;
}
|
|
2399
|
+
/**
 * Summarizes tool calls as `name xCount` pairs joined by commas.
 *
 * Names appear in the order they were first seen.
 *
 * @param {Iterable<{name: string}>} toolCalls - Recorded tool calls.
 * @returns {string} e.g. `"search x2, fetch x1"`; empty string for no calls.
 */
function summarizeToolCalls(toolCalls) {
  const tally = /* @__PURE__ */ new Map();
  for (const { name } of toolCalls) {
    const seenSoFar = tally.get(name) ?? 0;
    tally.set(name, seenSoFar + 1);
  }

  const parts = [];
  for (const [toolName, total] of tally) {
    parts.push(`${toolName} x${total}`);
  }
  return parts.join(", ");
}
|
|
2273
2406
|
function renderLintReport(report, enableColor) {
|
|
2274
2407
|
const c = getChalkInstance(enableColor);
|
|
2275
2408
|
const { passed, warnings, failures, total } = report.summary;
|
|
@@ -2330,12 +2463,25 @@ function renderEvalReport(result, enableColor, verbose) {
|
|
|
2330
2463
|
for (const [index, promptResult] of result.results.entries()) {
|
|
2331
2464
|
lines.push(`${index + 1}. prompt: ${promptResult.prompt}`);
|
|
2332
2465
|
lines.push(` response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`);
|
|
2466
|
+
if (promptResult.toolCalls) {
|
|
2467
|
+
lines.push(` Tools: ${promptResult.toolCalls.length} calls (${summarizeToolCalls(promptResult.toolCalls)})`);
|
|
2468
|
+
if (promptResult.loopIterations !== void 0) {
|
|
2469
|
+
lines.push(` loop iterations: ${promptResult.loopIterations}`);
|
|
2470
|
+
}
|
|
2471
|
+
}
|
|
2333
2472
|
for (const assertion of promptResult.assertions) {
|
|
2334
2473
|
const status = assertion.passed ? c.green("PASS") : c.red("FAIL");
|
|
2335
2474
|
lines.push(` ${status} ${assertion.assertion}`);
|
|
2336
2475
|
lines.push(` evidence: ${assertion.evidence}`);
|
|
2337
2476
|
}
|
|
2338
2477
|
if (verbose) {
|
|
2478
|
+
if (promptResult.toolCalls) {
|
|
2479
|
+
for (const toolCall of promptResult.toolCalls) {
|
|
2480
|
+
lines.push(` tool ${toolCall.turnIndex}: ${toolCall.name}`);
|
|
2481
|
+
lines.push(` arguments: ${JSON.stringify(toolCall.arguments)}`);
|
|
2482
|
+
lines.push(` response: ${toolCall.response}`);
|
|
2483
|
+
}
|
|
2484
|
+
}
|
|
2339
2485
|
lines.push(` full response: ${promptResult.response}`);
|
|
2340
2486
|
}
|
|
2341
2487
|
}
|
|
@@ -2412,6 +2558,12 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2412
2558
|
}
|
|
2413
2559
|
lines.push(` - prompt: ${promptResult.prompt}`);
|
|
2414
2560
|
lines.push(` response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`);
|
|
2561
|
+
if (promptResult.toolCalls) {
|
|
2562
|
+
lines.push(` Tools: ${promptResult.toolCalls.length} calls (${summarizeToolCalls(promptResult.toolCalls)})`);
|
|
2563
|
+
if (promptResult.loopIterations !== void 0) {
|
|
2564
|
+
lines.push(` loop iterations: ${promptResult.loopIterations}`);
|
|
2565
|
+
}
|
|
2566
|
+
}
|
|
2415
2567
|
const assertionsToRender = verbose ? promptResult.assertions : failedAssertions;
|
|
2416
2568
|
for (const assertion of assertionsToRender) {
|
|
2417
2569
|
const assertionStatus = assertion.passed ? c.green("PASS") : c.red("FAIL");
|
|
@@ -2419,6 +2571,13 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2419
2571
|
lines.push(` evidence: ${assertion.evidence}`);
|
|
2420
2572
|
}
|
|
2421
2573
|
if (verbose) {
|
|
2574
|
+
if (promptResult.toolCalls) {
|
|
2575
|
+
for (const toolCall of promptResult.toolCalls) {
|
|
2576
|
+
lines.push(` tool ${toolCall.turnIndex}: ${toolCall.name}`);
|
|
2577
|
+
lines.push(` arguments: ${JSON.stringify(toolCall.arguments)}`);
|
|
2578
|
+
lines.push(` response: ${toolCall.response}`);
|
|
2579
|
+
}
|
|
2580
|
+
}
|
|
2422
2581
|
lines.push(` full response: ${promptResult.response}`);
|
|
2423
2582
|
}
|
|
2424
2583
|
}
|
|
@@ -2433,6 +2592,73 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
2433
2592
|
lines.push(`- overall: ${overallGate}`);
|
|
2434
2593
|
return lines.join("\n");
|
|
2435
2594
|
}
|
|
2595
|
+
/**
 * Renders the plain-text report for an improve run.
 *
 * Sections are emitted in this order, each only when its data is present:
 * header and status, change summary and targeted problems (`candidate`),
 * before/after metrics (`delta` together with `verification`), output path,
 * blocked reason, diff preview (`candidate`), and, in verbose mode, the full
 * baseline and verification check reports.
 *
 * @param {object} result - Improve result; reads `target`, `provider`,
 *   `model`, `thresholds`, `blockedReason`, `applied`, `candidate`, `delta`,
 *   `verification`, `outputPath`, `originalRaw` and `baseline`.
 * @param {boolean} enableColor - Forwarded to `getChalkInstance` and to the
 *   nested check reports.
 * @param {boolean} [verbose=false] - Append the baseline/verification reports.
 * @returns {string} Report lines joined with `\n`.
 */
function renderImproveReport(result, enableColor, verbose = false) {
  const c = getChalkInstance(enableColor);
  const lines = [];
  const startSection = (heading) => {
    lines.push("", heading);
  };
  const pushBullets = (items) => {
    for (const item of items) {
      lines.push(`- ${item}`);
    }
  };
  const gateLabel = (passed) => passed ? c.green("PASS") : c.red("FAIL");

  lines.push(
    "skilltest improve",
    `target: ${result.target}`,
    `provider/model: ${result.provider}/${result.model}`,
    `thresholds: min-f1=${result.thresholds.minF1.toFixed(2)} min-assert-pass-rate=${result.thresholds.minAssertPassRate.toFixed(2)}`
  );

  // A blocked run wins over applied/verified.
  let statusLabel;
  if (result.blockedReason) {
    statusLabel = c.red("BLOCKED");
  } else if (result.applied) {
    statusLabel = c.green("APPLIED");
  } else {
    statusLabel = c.green("VERIFIED");
  }
  lines.push(`status: ${statusLabel}`);

  const { candidate, delta, verification } = result;

  if (candidate) {
    startSection("Change Summary");
    pushBullets(candidate.changeSummary);
    startSection("Targeted Problems");
    pushBullets(candidate.targetedProblems);
  }

  if (delta && verification) {
    startSection("Before / After");
    lines.push(
      `- lint failures: ${delta.lintFailures.before} -> ${delta.lintFailures.after} (${formatSignedNumber(delta.lintFailures.delta, 0)})`
    );
    lines.push(
      `- lint warnings: ${delta.lintWarnings.before} -> ${delta.lintWarnings.after} (${formatSignedNumber(delta.lintWarnings.delta, 0)})`
    );
    lines.push(
      `- trigger f1: ${formatPercent2(delta.triggerF1.before)} -> ${formatPercent2(delta.triggerF1.after)} (${formatSignedNumber(delta.triggerF1.delta)})`
    );
    lines.push(
      `- eval assertion pass rate: ${formatPercent2(delta.evalAssertPassRate.before)} -> ${formatPercent2(delta.evalAssertPassRate.after)} (${formatSignedNumber(delta.evalAssertPassRate.delta)})`
    );
    lines.push(
      `- overall gate: ${gateLabel(delta.overallPassed.before)} -> ${gateLabel(delta.overallPassed.after)}`
    );
  }

  if (result.outputPath) {
    lines.push("", `output: ${result.outputPath}`);
  }

  if (result.blockedReason) {
    startSection("Blocked");
    lines.push(`- ${result.blockedReason}`);
  }

  if (candidate) {
    startSection("Diff Preview");
    lines.push(...renderDiffPreview(result.originalRaw, candidate.raw));
  }

  if (verbose) {
    startSection("Baseline");
    lines.push(renderCheckReport(result.baseline, enableColor, true));
    if (verification) {
      startSection("Verification");
      lines.push(renderCheckReport(verification, enableColor, true));
    }
  }

  return lines.join("\n");
}
|
|
2436
2662
|
|
|
2437
2663
|
// src/commands/common.ts
|
|
2438
2664
|
import fs6 from "node:fs/promises";
|
|
@@ -2504,7 +2730,10 @@ function parseGraderOutput(raw) {
|
|
|
2504
2730
|
/**
 * Sends the grader prompts to the provider and returns the parsed
 * assertions, each tagged with `source: "grader"`.
 *
 * @param {object} options - Grading options; `provider` and `model` are read
 *   here, and the whole object is forwarded to `buildGraderPrompts`.
 * @returns {Promise<Array<object>>} Parsed assertions with `source` set to
 *   `"grader"`.
 */
async function gradeResponse(options) {
  const { systemPrompt, userPrompt } = buildGraderPrompts(options);
  const raw = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
  const graded = parseGraderOutput(raw);
  return graded.map((assertion) => {
    return { ...assertion, source: "grader" };
  });
}
|
|
2509
2738
|
|
|
2510
2739
|
// src/utils/concurrency.ts
|
|
@@ -2559,12 +2788,290 @@ async function pMap(items, fn, concurrency) {
|
|
|
2559
2788
|
});
|
|
2560
2789
|
}
|
|
2561
2790
|
|
|
2791
|
+
// src/core/tool-environment.ts
|
|
2792
|
+
/**
 * Tells whether `value` is a non-null, non-array object.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} `true` for any object other than `null` and arrays.
 */
function isPlainObject(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
2795
|
+
/**
 * Structural equality for JSON-like values.
 *
 * Arrays are equal when they have the same length and pairwise-equal
 * elements. Objects (per `isPlainObject`) are equal when they have the same
 * number of own enumerable keys, every left key is also an own key of the
 * right object, and the values under each key are equal. Everything else is
 * compared with `===`.
 *
 * The own-key check matters: without it `{ a: undefined }` and
 * `{ b: undefined }` would compare equal, because both have one key and
 * reading the missing key on the right yields `undefined` as well.
 *
 * @param {unknown} left
 * @param {unknown} right
 * @returns {boolean} Whether the two values are structurally equal.
 */
function deepEqual(left, right) {
  if (Array.isArray(left) && Array.isArray(right)) {
    if (left.length !== right.length) {
      return false;
    }
    return left.every((item, index) => deepEqual(item, right[index]));
  }
  if (isPlainObject(left) && isPlainObject(right)) {
    const leftKeys = Object.keys(left);
    if (leftKeys.length !== Object.keys(right).length) {
      return false;
    }
    // Same key count + every left key present on the right => same key set.
    return leftKeys.every(
      (key) => Object.prototype.hasOwnProperty.call(right, key) && deepEqual(left[key], right[key])
    );
  }
  return left === right;
}
|
|
2812
|
+
/**
 * Checks whether `actual` satisfies the `expected` argument pattern.
 *
 * - Expected arrays must be matched by an array of the same length whose
 *   elements match position by position.
 * - Expected objects must be matched by an object; only the keys listed in
 *   `expected` are compared, so extra keys on `actual` are ignored.
 * - Any other expected value is compared with `deepEqual`.
 *
 * @param {unknown} actual - Arguments that were actually supplied.
 * @param {unknown} expected - Pattern the arguments must satisfy.
 * @returns {boolean} Whether `actual` matches `expected`.
 */
function matchesArgumentSubset(actual, expected) {
  if (Array.isArray(expected)) {
    const sameShape = Array.isArray(actual) && actual.length === expected.length;
    return sameShape && expected.every((value, index) => matchesArgumentSubset(actual[index], value));
  }

  if (!isPlainObject(expected)) {
    return deepEqual(actual, expected);
  }

  if (!isPlainObject(actual)) {
    return false;
  }
  for (const key of Object.keys(expected)) {
    if (!matchesArgumentSubset(actual[key], expected[key])) {
      return false;
    }
  }
  return true;
}
|
|
2827
|
+
/**
 * Parses a mock-response key into an argument pattern object.
 *
 * @param {string} pattern - A key of a tool's `responses` map.
 * @returns {object | null} The parsed object, or `null` when the key is the
 *   `"*"` wildcard, is not valid JSON, or parses to something other than an
 *   object (per `isPlainObject`).
 */
function parseResponsePattern(pattern) {
  if (pattern === "*") {
    return null;
  }

  let candidate;
  try {
    candidate = JSON.parse(pattern);
  } catch {
    // Keys that are not JSON simply never act as argument patterns.
    return null;
  }
  return isPlainObject(candidate) ? candidate : null;
}
|
|
2838
|
+
/**
 * Builds the placeholder response used when a mock tool has no configured
 * response for the given arguments.
 *
 * @param {{name: string}} tool - Mock tool definition.
 * @param {unknown} args - Arguments of the call; serialized with `JSON.stringify`.
 * @returns {string} A `[mock] ...` message naming the tool and its arguments.
 */
function renderFallbackResponse(tool, args) {
  const toolName = tool.name;
  const serializedArgs = JSON.stringify(args);
  return `[mock] No mock response configured for tool '${toolName}' with arguments: ${serializedArgs}`;
}
|
|
2841
|
+
/**
 * Picks the mock response for a tool call.
 *
 * Resolution order:
 * 1. A response keyed by exactly `JSON.stringify(args)`.
 * 2. The matching JSON-object pattern with the most top-level keys (see
 *    `parseResponsePattern` / `matchesArgumentSubset`); on a tie the pattern
 *    listed first wins.
 * 3. The `"*"` wildcard response.
 * 4. The placeholder from `renderFallbackResponse`.
 *
 * @param {{name: string, responses: Record<string, string>}} tool - Mock tool.
 * @param {unknown} args - Arguments supplied by the model.
 * @returns {string} The resolved response text.
 */
function resolveToolResponse(tool, args) {
  const exactMatch = tool.responses[JSON.stringify(args)];
  if (exactMatch !== undefined) {
    return exactMatch;
  }

  let winner = null;
  for (const pattern of Object.keys(tool.responses)) {
    // The wildcard is handled after all partial matches, never as a pattern.
    const parsedPattern = pattern === "*" ? null : parseResponsePattern(pattern);
    if (parsedPattern && matchesArgumentSubset(args, parsedPattern)) {
      const specificity = Object.keys(parsedPattern).length;
      if (winner === null || specificity > winner.specificity) {
        winner = { specificity, response: tool.responses[pattern] };
      }
    }
  }
  if (winner !== null) {
    return winner.response;
  }

  const wildcardMatch = tool.responses["*"];
  return wildcardMatch !== undefined ? wildcardMatch : renderFallbackResponse(tool, args);
}
|
|
2873
|
+
/**
 * Converts mock tool definitions into provider tool definitions whose
 * `parameters` field is an object schema.
 *
 * Each declared parameter becomes a property with its `type` and
 * `description`; parameters flagged `required` are listed by name under
 * `required`. A tool without `parameters` yields an empty schema.
 *
 * @param {Array<{name: string, description: string, parameters?: Array<{name: string, type: string, description: string, required?: boolean}>}>} mockTools
 * @returns {Array<{name: string, description: string, parameters: {type: "object", properties: object, required: string[]}}>}
 */
function toProviderToolDefinitions(mockTools) {
  const toPropertyEntry = (parameter) => [
    parameter.name,
    { type: parameter.type, description: parameter.description }
  ];

  return mockTools.map((tool) => {
    const declared = tool.parameters ?? [];
    const properties = Object.fromEntries(declared.map((parameter) => toPropertyEntry(parameter)));

    const required = [];
    for (const parameter of declared) {
      if (parameter.required) {
        required.push(parameter.name);
      }
    }

    return {
      name: tool.name,
      description: tool.description,
      parameters: { type: "object", properties, required }
    };
  });
}
|
|
2895
|
+
/**
 * Converts a provider response into the assistant message to append to the
 * conversation.
 *
 * The message content holds an optional text block (only when the text is
 * not blank) followed by one `tool_use` block per requested tool call.
 *
 * @param {{textContent: string, toolUseBlocks: Array<{id: string, name: string, arguments: unknown}>}} response
 * @returns {Array<{role: "assistant", content: object[]}>} A one-element
 *   array, or an empty array when there is neither text nor tool use.
 */
function toAssistantConversationBlocks(response) {
  const hasText = response.textContent.trim().length > 0;
  const textBlocks = hasText ? [{ type: "text", text: response.textContent }] : [];
  const toolBlocks = response.toolUseBlocks.map((block) => ({
    type: "tool_use",
    id: block.id,
    name: block.name,
    input: block.arguments
  }));

  const content = [...textBlocks, ...toolBlocks];
  if (content.length === 0) {
    return [];
  }
  return [{ role: "assistant", content }];
}
|
|
2918
|
+
/**
 * Runs a conversation with mock tools until the model stops requesting tools
 * or the iteration limit is reached.
 *
 * Each turn sends the conversation to `provider.sendWithTools`. Requested
 * tool calls are answered with `resolveToolResponse` (or a "[mock] No tool
 * named ..." message for unregistered tools), recorded in `toolCalls` with
 * the 1-based turn number, and fed back as `tool_result` blocks in a user
 * message. The most recent non-blank text response is kept as the final
 * response; if the limit is hit, a termination note is appended to it.
 *
 * @param {object} options
 * @param {object} options.provider - Must expose `sendWithTools(systemPrompt, messages, { model, tools })`.
 * @param {string} options.model
 * @param {string} options.systemPrompt
 * @param {string} options.userMessage - Initial user message.
 * @param {Array<object>} options.tools - Mock tool definitions.
 * @param {number} [options.maxIterations=10] - Maximum number of provider turns.
 * @returns {Promise<{finalResponse: string, toolCalls: Array<object>, loopIterations: number}>}
 */
async function runWithTools(options) {
  const { provider, model, systemPrompt, tools } = options;
  const maxIterations = options.maxIterations ?? 10;
  const toolsByName = new Map(tools.map((tool) => [tool.name, tool]));
  const providerTools = toProviderToolDefinitions(tools);
  const messages = [{ role: "user", content: options.userMessage }];
  const toolCalls = [];
  let finalResponse = "";
  let loopIterations = 0;

  while (loopIterations < maxIterations) {
    loopIterations += 1;
    const turnIndex = loopIterations;

    const response = await provider.sendWithTools(systemPrompt, messages, {
      model,
      tools: providerTools
    });

    // Blank text never overwrites text captured on an earlier turn.
    if (response.textContent.trim().length > 0) {
      finalResponse = response.textContent;
    }

    const requestedCalls = response.toolUseBlocks;
    if (requestedCalls.length === 0) {
      return { finalResponse, toolCalls, loopIterations };
    }

    messages.push(...toAssistantConversationBlocks(response));

    const toolResultBlocks = [];
    for (const toolUse of requestedCalls) {
      const tool = toolsByName.get(toolUse.name);
      const resolvedResponse = tool
        ? resolveToolResponse(tool, toolUse.arguments)
        : `[mock] No tool named '${toolUse.name}' is registered.`;
      toolCalls.push({
        name: toolUse.name,
        arguments: toolUse.arguments,
        response: resolvedResponse,
        turnIndex
      });
      toolResultBlocks.push({
        type: "tool_result",
        tool_use_id: toolUse.id,
        content: resolvedResponse
      });
    }
    messages.push({ role: "user", content: toolResultBlocks });
  }

  // The model was still requesting tools when the limit was reached.
  const terminationNote = `[skilltest: tool loop terminated after ${maxIterations} iterations]`;
  finalResponse = finalResponse ? `${finalResponse}\n\n${terminationNote}` : terminationNote;
  return { finalResponse, toolCalls, loopIterations };
}
|
|
2974
|
+
|
|
2562
2975
|
// src/core/eval-runner.ts
|
|
2976
|
+
// One declared parameter of a mock tool.
var toolParameterSchema = z3.object({
  name: z3.string().min(1),
  type: z3.enum(["string", "number", "boolean", "object", "array"]),
  description: z3.string().min(1),
  required: z3.boolean().optional()
});

// A mock tool: its declaration plus the canned responses keyed by pattern.
var mockToolDefinitionSchema = z3.object({
  name: z3.string().min(1),
  description: z3.string().min(1),
  parameters: z3.array(toolParameterSchema).optional(),
  responses: z3.record(z3.string())
});

// A structural assertion about recorded tool calls. Which optional fields
// are mandatory depends on `type`; that is enforced in `superRefine` below.
var toolAssertionSchema = z3.object({
  type: z3.enum(["tool_called", "tool_not_called", "tool_call_order", "tool_argument_match"]),
  toolName: z3.string().min(1).optional(),
  toolNames: z3.array(z3.string().min(1)).optional(),
  expectedArgs: z3.record(z3.unknown()).optional(),
  description: z3.string().min(1)
}).superRefine((value, context) => {
  const reportIssue = (message) => {
    context.addIssue({ code: z3.ZodIssueCode.custom, message });
  };
  const { type } = value;

  const needsToolName = type === "tool_called" || type === "tool_not_called" || type === "tool_argument_match";
  if (needsToolName && !value.toolName) {
    reportIssue(`${type} requires toolName.`);
  }

  const missingToolNames = !value.toolNames || value.toolNames.length === 0;
  if (type === "tool_call_order" && missingToolNames) {
    reportIssue("tool_call_order requires toolNames.");
  }

  if (type === "tool_argument_match" && !value.expectedArgs) {
    reportIssue("tool_argument_match requires expectedArgs.");
  }
});

// One eval prompt: the prompt text plus optional graded assertions, mock
// tools and structural tool assertions.
var evalPromptSchema = z3.object({
  prompt: z3.string().min(1),
  assertions: z3.array(z3.string().min(1)).optional(),
  tools: z3.array(mockToolDefinitionSchema).optional(),
  toolAssertions: z3.array(toolAssertionSchema).optional()
});
var evalPromptArraySchema = z3.array(evalPromptSchema);
|
|
3021
|
+
/**
 * Formats a list of tool names as `[a, b, c]`.
 *
 * @param {string[]} toolNames
 * @returns {string} Bracketed, comma-separated list.
 */
function formatExpectedOrder(toolNames) {
  const joined = toolNames.join(", ");
  return `[${joined}]`;
}
|
|
3024
|
+
/**
 * Formats the observed call order, restricted to the tools of interest.
 *
 * @param {Array<{name: string}>} toolCalls - Recorded calls, in call order.
 * @param {string[]} toolNames - Tool names to keep; other calls are dropped.
 * @returns {string} Bracketed, comma-separated list of the kept call names.
 */
function formatActualOrder(toolCalls, toolNames) {
  const relevantNames = new Set(toolNames);
  const observed = [];
  for (const { name } of toolCalls) {
    if (relevantNames.has(name)) {
      observed.push(name);
    }
  }
  return `[${observed.join(", ")}]`;
}
|
|
3029
|
+
/**
 * Evaluates structural tool assertions against the recorded tool calls.
 *
 * Supported assertion types:
 * - `tool_called`: at least one call to `toolName`.
 * - `tool_not_called`: no call to `toolName`.
 * - `tool_call_order`: `toolNames` occurs as an in-order subsequence of the
 *   recorded calls (other calls may be interleaved).
 * - anything else is treated as `tool_argument_match`: some call to
 *   `toolName` whose arguments satisfy `expectedArgs` per
 *   `matchesArgumentSubset`.
 *
 * @param {Array<{type: string, toolName?: string, toolNames?: string[], expectedArgs?: object, description: string}>} toolAssertions
 * @param {Array<{name: string, arguments: unknown}>} toolCalls - Calls in call order.
 * @returns {Array<{assertion: string, passed: boolean, evidence: string, source: "tool"}>}
 */
function evaluateToolAssertions(toolAssertions, toolCalls) {
  const countCallsTo = (toolName) => toolCalls.filter((toolCall) => toolCall.name === toolName).length;
  const calledEvidence = (toolName, count) => `Tool '${toolName}' was called ${count} time${count === 1 ? "" : "s"}.`;
  const notCalledEvidence = (toolName) => `Tool '${toolName}' was not called.`;
  const toResult = (toolAssertion, passed, evidence) => ({
    assertion: toolAssertion.description,
    passed,
    evidence,
    source: "tool"
  });

  return toolAssertions.map((toolAssertion) => {
    const { toolName } = toolAssertion;

    switch (toolAssertion.type) {
      case "tool_called": {
        const callCount = countCallsTo(toolName);
        const passed = callCount > 0;
        return toResult(toolAssertion, passed, passed ? calledEvidence(toolName, callCount) : notCalledEvidence(toolName));
      }
      case "tool_not_called": {
        const callCount = countCallsTo(toolName);
        const passed = callCount === 0;
        return toResult(toolAssertion, passed, passed ? notCalledEvidence(toolName) : calledEvidence(toolName, callCount));
      }
      case "tool_call_order": {
        const expectedOrder = toolAssertion.toolNames ?? [];
        // Advance through the expected names each time the next one is seen.
        const matchedCount = toolCalls.reduce(
          (nextExpectedIndex, toolCall) => toolCall.name === expectedOrder[nextExpectedIndex] ? nextExpectedIndex + 1 : nextExpectedIndex,
          0
        );
        const passed = matchedCount === expectedOrder.length;
        const evidence = passed
          ? `Observed tool call order ${formatExpectedOrder(expectedOrder)}.`
          : `Expected call order ${formatExpectedOrder(expectedOrder)} but got ${formatActualOrder(toolCalls, expectedOrder)}.`;
        return toResult(toolAssertion, passed, evidence);
      }
      default: {
        const expectedArgs = toolAssertion.expectedArgs ?? {};
        const matchingCall = toolCalls.find(
          (toolCall) => toolCall.name === toolName && matchesArgumentSubset(toolCall.arguments, expectedArgs)
        );
        const evidence = matchingCall
          ? `Tool '${toolName}' was called with arguments matching ${JSON.stringify(expectedArgs)}.`
          : `No '${toolName}' call matched ${JSON.stringify(expectedArgs)}.`;
        return toResult(toolAssertion, Boolean(matchingCall), evidence);
      }
    }
  });
}
|
|
2568
3075
|
function extractJsonArray(raw) {
|
|
2569
3076
|
const trimmed = raw.trim();
|
|
2570
3077
|
if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
|
|
@@ -2591,6 +3098,7 @@ async function generatePrompts(skill, provider, model, count) {
|
|
|
2591
3098
|
skill.content,
|
|
2592
3099
|
"",
|
|
2593
3100
|
`Generate ${count} prompts that stress the main capabilities and likely edge cases.`,
|
|
3101
|
+
// Tool-aware prompts require user-defined mock responses and are not auto-generated.
|
|
2594
3102
|
"Each prompt should include 2-4 assertions."
|
|
2595
3103
|
].join("\n");
|
|
2596
3104
|
const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
|
|
@@ -2614,7 +3122,24 @@ async function runEval(skill, options) {
|
|
|
2614
3122
|
const results = await pMap(
|
|
2615
3123
|
prompts,
|
|
2616
3124
|
async (evalPrompt) => {
|
|
2617
|
-
|
|
3125
|
+
let response;
|
|
3126
|
+
let toolCalls;
|
|
3127
|
+
let loopIterations;
|
|
3128
|
+
if (evalPrompt.tools && evalPrompt.tools.length > 0) {
|
|
3129
|
+
const toolRun = await runWithTools({
|
|
3130
|
+
provider: options.provider,
|
|
3131
|
+
model: options.model,
|
|
3132
|
+
systemPrompt,
|
|
3133
|
+
userMessage: evalPrompt.prompt,
|
|
3134
|
+
tools: evalPrompt.tools,
|
|
3135
|
+
maxIterations: options.maxToolIterations
|
|
3136
|
+
});
|
|
3137
|
+
response = toolRun.finalResponse;
|
|
3138
|
+
toolCalls = toolRun.toolCalls;
|
|
3139
|
+
loopIterations = toolRun.loopIterations;
|
|
3140
|
+
} else {
|
|
3141
|
+
response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
|
|
3142
|
+
}
|
|
2618
3143
|
const gradedAssertions = await gradeResponse({
|
|
2619
3144
|
provider: options.provider,
|
|
2620
3145
|
model: options.graderModel,
|
|
@@ -2624,14 +3149,18 @@ async function runEval(skill, options) {
|
|
|
2624
3149
|
modelResponse: response,
|
|
2625
3150
|
assertions: evalPrompt.assertions
|
|
2626
3151
|
});
|
|
2627
|
-
const
|
|
3152
|
+
const structuralAssertions = evalPrompt.toolAssertions && evalPrompt.toolAssertions.length > 0 ? evaluateToolAssertions(evalPrompt.toolAssertions, toolCalls ?? []) : [];
|
|
3153
|
+
const assertions = [...gradedAssertions, ...structuralAssertions];
|
|
3154
|
+
const passedAssertions2 = assertions.filter((assertion) => assertion.passed).length;
|
|
2628
3155
|
return {
|
|
2629
3156
|
prompt: evalPrompt.prompt,
|
|
2630
|
-
assertions
|
|
3157
|
+
assertions,
|
|
2631
3158
|
responseSummary: response.slice(0, 200),
|
|
2632
3159
|
response,
|
|
2633
3160
|
passedAssertions: passedAssertions2,
|
|
2634
|
-
totalAssertions:
|
|
3161
|
+
totalAssertions: assertions.length,
|
|
3162
|
+
...toolCalls ? { toolCalls } : {},
|
|
3163
|
+
...loopIterations !== void 0 ? { loopIterations } : {}
|
|
2635
3164
|
};
|
|
2636
3165
|
},
|
|
2637
3166
|
options.concurrency ?? 5
|
|
@@ -2969,10 +3498,7 @@ function renderJson(value) {
|
|
|
2969
3498
|
|
|
2970
3499
|
// src/commands/common.ts
|
|
2971
3500
|
var executionContextByCommand = /* @__PURE__ */ new WeakMap();
|
|
2972
|
-
var singleEvalPromptSchema =
|
|
2973
|
-
prompt: z5.string().min(1),
|
|
2974
|
-
assertions: z5.array(z5.string().min(1)).optional()
|
|
2975
|
-
});
|
|
3501
|
+
var singleEvalPromptSchema = evalPromptSchema;
|
|
2976
3502
|
var promptStringArraySchema = z5.array(z5.string().min(1));
|
|
2977
3503
|
var assertionsObjectSchema = z5.object({
|
|
2978
3504
|
assertions: z5.array(z5.string().min(1))
|
|
@@ -3007,6 +3533,22 @@ function normalizeEvalPrompts(value, sourceLabel) {
|
|
|
3007
3533
|
/**
 * Turns free-form text into a list of assertion strings, one per line.
 *
 * Each line is trimmed, a leading bullet ("-" or "*") and then a leading
 * numbered-list marker ("1.") are stripped, and lines that end up empty
 * are discarded.
 *
 * @param {string} raw - Text containing one assertion per line.
 * @returns {string[]} The cleaned, non-empty assertion lines in order.
 */
function parseAssertionsFromText(raw) {
  const assertions = [];
  for (const rawLine of raw.split(/\r?\n/)) {
    const withoutBullet = rawLine.trim().replace(/^[-*]\s+/, "");
    const cleaned = withoutBullet.replace(/^\d+\.\s+/, "");
    if (cleaned.length > 0) {
      assertions.push(cleaned);
    }
  }
  return assertions;
}
|
|
3536
|
+
/**
 * Produces a copy of an eval prompt whose arrays and one level of nested
 * objects are fresh, so callers can hand out per-run copies of a template.
 *
 * The copy is not fully deep: values nested inside `expectedArgs`,
 * `responses`, or individual parameters are still shared with the source.
 * Optional members that are absent on the input are present on the result
 * with the value `undefined`.
 *
 * @param {object} prompt - Eval prompt with `prompt` and optional
 *   `assertions`, `tools`, and `toolAssertions`.
 * @returns {object} The copied eval prompt.
 */
function cloneEvalPrompt(prompt) {
  const copyTool = (tool) => {
    const parameters = tool.parameters ? tool.parameters.map((parameter) => ({ ...parameter })) : void 0;
    return {
      ...tool,
      parameters,
      responses: { ...tool.responses }
    };
  };
  const copyToolAssertion = (toolAssertion) => {
    const toolNames = toolAssertion.toolNames ? [...toolAssertion.toolNames] : void 0;
    const expectedArgs = toolAssertion.expectedArgs ? { ...toolAssertion.expectedArgs } : void 0;
    return {
      ...toolAssertion,
      toolNames,
      expectedArgs
    };
  };

  let assertions;
  if (prompt.assertions) {
    assertions = [...prompt.assertions];
  }
  let tools;
  if (prompt.tools) {
    tools = prompt.tools.map((tool) => copyTool(tool));
  }
  let toolAssertions;
  if (prompt.toolAssertions) {
    toolAssertions = prompt.toolAssertions.map((toolAssertion) => copyToolAssertion(toolAssertion));
  }
  return {
    prompt: prompt.prompt,
    assertions,
    tools,
    toolAssertions
  };
}
|
|
3010
3552
|
function normalizeAssertions(value, sourceLabel) {
|
|
3011
3553
|
const assertionArray = z5.array(z5.string().min(1)).safeParse(value);
|
|
3012
3554
|
if (assertionArray.success) {
|
|
@@ -3079,17 +3621,14 @@ async function loadConfiguredEvalPrompts(command) {
|
|
|
3079
3621
|
const assertionsRaw = await fs6.readFile(assertionsFile, "utf8");
|
|
3080
3622
|
const assertions = normalizeAssertions(parseJsonIfPossible(assertionsRaw), assertionsFile);
|
|
3081
3623
|
prompts = prompts.map((prompt) => ({
|
|
3082
|
-
prompt
|
|
3624
|
+
...cloneEvalPrompt(prompt),
|
|
3083
3625
|
assertions: [...assertions]
|
|
3084
3626
|
}));
|
|
3085
3627
|
}
|
|
3086
3628
|
const numRunsWasExplicit = context.configFile?.eval?.numRuns !== void 0;
|
|
3087
3629
|
if (numRunsWasExplicit && prompts.length === 1 && context.config.eval.numRuns > 1) {
|
|
3088
3630
|
const promptTemplate = prompts[0];
|
|
3089
|
-
prompts = Array.from({ length: context.config.eval.numRuns }, () => (
|
|
3090
|
-
prompt: promptTemplate.prompt,
|
|
3091
|
-
assertions: promptTemplate.assertions ? [...promptTemplate.assertions] : void 0
|
|
3092
|
-
}));
|
|
3631
|
+
prompts = Array.from({ length: context.config.eval.numRuns }, () => cloneEvalPrompt(promptTemplate));
|
|
3093
3632
|
}
|
|
3094
3633
|
return prompts;
|
|
3095
3634
|
}
|
|
@@ -3186,7 +3725,8 @@ var evalConfigSchema = z7.object({
|
|
|
3186
3725
|
numRuns: z7.number().int().min(1).optional(),
|
|
3187
3726
|
threshold: z7.number().min(0).max(1).optional(),
|
|
3188
3727
|
promptFile: z7.string().min(1).optional(),
|
|
3189
|
-
assertionsFile: z7.string().min(1).optional()
|
|
3728
|
+
assertionsFile: z7.string().min(1).optional(),
|
|
3729
|
+
maxToolIterations: z7.number().int().min(1).max(50).optional()
|
|
3190
3730
|
}).strict().partial();
|
|
3191
3731
|
var skilltestConfigSchema = z7.object({
|
|
3192
3732
|
provider: providerNameSchema.optional(),
|
|
@@ -3217,7 +3757,8 @@ var resolvedSkilltestConfigSchema = z7.object({
|
|
|
3217
3757
|
numRuns: z7.number().int().min(1),
|
|
3218
3758
|
threshold: z7.number().min(0).max(1),
|
|
3219
3759
|
promptFile: z7.string().min(1).optional(),
|
|
3220
|
-
assertionsFile: z7.string().min(1).optional()
|
|
3760
|
+
assertionsFile: z7.string().min(1).optional(),
|
|
3761
|
+
maxToolIterations: z7.number().int().min(1).max(50)
|
|
3221
3762
|
})
|
|
3222
3763
|
});
|
|
3223
3764
|
var DEFAULT_SKILLTEST_CONFIG = {
|
|
@@ -3237,7 +3778,8 @@ var DEFAULT_SKILLTEST_CONFIG = {
|
|
|
3237
3778
|
},
|
|
3238
3779
|
eval: {
|
|
3239
3780
|
numRuns: 5,
|
|
3240
|
-
threshold: 0.9
|
|
3781
|
+
threshold: 0.9,
|
|
3782
|
+
maxToolIterations: 10
|
|
3241
3783
|
}
|
|
3242
3784
|
};
|
|
3243
3785
|
function formatIssuePath(issuePath) {
|
|
@@ -3367,7 +3909,8 @@ function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = proce
|
|
|
3367
3909
|
assertionsFile: resolveConfigRelativePath(
|
|
3368
3910
|
baseDirectory,
|
|
3369
3911
|
cliFlags.eval?.assertionsFile ?? configFile.eval?.assertionsFile ?? DEFAULT_SKILLTEST_CONFIG.eval.assertionsFile
|
|
3370
|
-
)
|
|
3912
|
+
),
|
|
3913
|
+
maxToolIterations: cliFlags.eval?.maxToolIterations ?? configFile.eval?.maxToolIterations ?? DEFAULT_SKILLTEST_CONFIG.eval.maxToolIterations
|
|
3371
3914
|
}
|
|
3372
3915
|
};
|
|
3373
3916
|
return resolvedSkilltestConfigSchema.parse(merged);
|
|
@@ -3391,34 +3934,34 @@ function extractCliConfigOverrides(command) {
|
|
|
3391
3934
|
if (command.getOptionValueSource("model") === "cli") {
|
|
3392
3935
|
overrides.model = getTypedOptionValue(command, "model");
|
|
3393
3936
|
}
|
|
3394
|
-
if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check") && command.getOptionValueSource("concurrency") === "cli") {
|
|
3937
|
+
if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("concurrency") === "cli") {
|
|
3395
3938
|
overrides.concurrency = getTypedOptionValue(command, "concurrency");
|
|
3396
3939
|
}
|
|
3397
|
-
if ((command.name() === "trigger" || command.name() === "check") && command.getOptionValueSource("numQueries") === "cli") {
|
|
3940
|
+
if ((command.name() === "trigger" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("numQueries") === "cli") {
|
|
3398
3941
|
overrides.trigger = {
|
|
3399
3942
|
...overrides.trigger,
|
|
3400
3943
|
numQueries: getTypedOptionValue(command, "numQueries")
|
|
3401
3944
|
};
|
|
3402
3945
|
}
|
|
3403
|
-
if ((command.name() === "trigger" || command.name() === "check") && command.getOptionValueSource("compare") === "cli") {
|
|
3946
|
+
if ((command.name() === "trigger" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("compare") === "cli") {
|
|
3404
3947
|
overrides.trigger = {
|
|
3405
3948
|
...overrides.trigger,
|
|
3406
3949
|
compare: getTypedOptionValue(command, "compare")
|
|
3407
3950
|
};
|
|
3408
3951
|
}
|
|
3409
|
-
if ((command.name() === "lint" || command.name() === "check") && command.getOptionValueSource("plugin") === "cli") {
|
|
3952
|
+
if ((command.name() === "lint" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("plugin") === "cli") {
|
|
3410
3953
|
overrides.lint = {
|
|
3411
3954
|
...overrides.lint,
|
|
3412
3955
|
plugins: getTypedOptionValue(command, "plugin")
|
|
3413
3956
|
};
|
|
3414
3957
|
}
|
|
3415
|
-
if (command.name() === "check" && command.getOptionValueSource("minF1") === "cli") {
|
|
3958
|
+
if ((command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("minF1") === "cli") {
|
|
3416
3959
|
overrides.trigger = {
|
|
3417
3960
|
...overrides.trigger,
|
|
3418
3961
|
threshold: getTypedOptionValue(command, "minF1")
|
|
3419
3962
|
};
|
|
3420
3963
|
}
|
|
3421
|
-
if (command.name() === "check" && command.getOptionValueSource("minAssertPassRate") === "cli") {
|
|
3964
|
+
if ((command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("minAssertPassRate") === "cli") {
|
|
3422
3965
|
overrides.eval = {
|
|
3423
3966
|
...overrides.eval,
|
|
3424
3967
|
threshold: getTypedOptionValue(command, "minAssertPassRate")
|
|
@@ -3483,6 +4026,12 @@ function resolveApiKey(provider, override) {
|
|
|
3483
4026
|
|
|
3484
4027
|
// src/providers/anthropic.ts
|
|
3485
4028
|
import Anthropic from "@anthropic-ai/sdk";
|
|
4029
|
+
/**
 * Predicate for filtering response content down to text blocks.
 *
 * @param {{ type: string }} block - A content block.
 * @returns {boolean} True when the block's `type` is "text".
 */
function isAnthropicTextBlock(block) {
  const { type } = block;
  return type === "text";
}
|
|
4032
|
+
/**
 * Predicate for filtering response content down to tool-use blocks.
 *
 * @param {{ type: string }} block - A content block.
 * @returns {boolean} True when the block's `type` is "tool_use".
 */
function isAnthropicToolUseBlock(block) {
  const { type } = block;
  return type === "tool_use";
}
|
|
3486
4035
|
function wait(ms) {
|
|
3487
4036
|
return new Promise((resolve) => {
|
|
3488
4037
|
setTimeout(resolve, ms);
|
|
@@ -3508,27 +4057,11 @@ var AnthropicProvider = class {
|
|
|
3508
4057
|
constructor(apiKey) {
|
|
3509
4058
|
this.client = new Anthropic({ apiKey });
|
|
3510
4059
|
}
|
|
3511
|
-
async
|
|
4060
|
+
async createMessage(request) {
|
|
3512
4061
|
let lastError;
|
|
3513
4062
|
for (let attempt = 0; attempt < 3; attempt += 1) {
|
|
3514
4063
|
try {
|
|
3515
|
-
|
|
3516
|
-
model: options.model,
|
|
3517
|
-
max_tokens: 2048,
|
|
3518
|
-
system: systemPrompt,
|
|
3519
|
-
messages: [
|
|
3520
|
-
{
|
|
3521
|
-
role: "user",
|
|
3522
|
-
content: userMessage
|
|
3523
|
-
}
|
|
3524
|
-
]
|
|
3525
|
-
});
|
|
3526
|
-
const textBlocks = response.content.filter((block) => block.type === "text");
|
|
3527
|
-
const text = textBlocks.map((block) => block.text).join("\n").trim();
|
|
3528
|
-
if (text.length === 0) {
|
|
3529
|
-
throw new Error("Model returned an empty response.");
|
|
3530
|
-
}
|
|
3531
|
-
return text;
|
|
4064
|
+
return await this.client.messages.create(request);
|
|
3532
4065
|
} catch (error) {
|
|
3533
4066
|
lastError = error;
|
|
3534
4067
|
if (!isRateLimitError(error) || attempt === 2) {
|
|
@@ -3543,6 +4076,55 @@ var AnthropicProvider = class {
|
|
|
3543
4076
|
}
|
|
3544
4077
|
throw new Error("Anthropic API call failed with an unknown error.");
|
|
3545
4078
|
}
|
|
4079
|
+
toAnthropicMessages(messages) {
|
|
4080
|
+
return messages.map((message) => ({
|
|
4081
|
+
role: message.role,
|
|
4082
|
+
content: message.content
|
|
4083
|
+
}));
|
|
4084
|
+
}
|
|
4085
|
+
async sendMessage(systemPrompt, userMessage, options) {
|
|
4086
|
+
const response = await this.createMessage({
|
|
4087
|
+
model: options.model,
|
|
4088
|
+
max_tokens: 2048,
|
|
4089
|
+
system: systemPrompt,
|
|
4090
|
+
messages: [
|
|
4091
|
+
{
|
|
4092
|
+
role: "user",
|
|
4093
|
+
content: userMessage
|
|
4094
|
+
}
|
|
4095
|
+
]
|
|
4096
|
+
});
|
|
4097
|
+
const textBlocks = response.content.filter(isAnthropicTextBlock);
|
|
4098
|
+
const text = textBlocks.map((block) => block.text).join("\n").trim();
|
|
4099
|
+
if (text.length === 0) {
|
|
4100
|
+
throw new Error("Model returned an empty response.");
|
|
4101
|
+
}
|
|
4102
|
+
return text;
|
|
4103
|
+
}
|
|
4104
|
+
async sendWithTools(systemPrompt, messages, options) {
|
|
4105
|
+
const response = await this.createMessage({
|
|
4106
|
+
model: options.model,
|
|
4107
|
+
max_tokens: 2048,
|
|
4108
|
+
system: systemPrompt,
|
|
4109
|
+
messages: this.toAnthropicMessages(messages),
|
|
4110
|
+
tools: options.tools.map((tool) => ({
|
|
4111
|
+
name: tool.name,
|
|
4112
|
+
description: tool.description,
|
|
4113
|
+
input_schema: tool.parameters ?? { type: "object", properties: {} }
|
|
4114
|
+
}))
|
|
4115
|
+
});
|
|
4116
|
+
const textContent = response.content.filter(isAnthropicTextBlock).map((block) => block.text).join("\n").trim();
|
|
4117
|
+
const toolUseBlocks = response.content.filter(isAnthropicToolUseBlock).map((block) => ({
|
|
4118
|
+
id: block.id,
|
|
4119
|
+
name: block.name,
|
|
4120
|
+
arguments: block.input
|
|
4121
|
+
}));
|
|
4122
|
+
return {
|
|
4123
|
+
textContent,
|
|
4124
|
+
toolUseBlocks,
|
|
4125
|
+
stopReason: response.stop_reason ?? "end_turn"
|
|
4126
|
+
};
|
|
4127
|
+
}
|
|
3546
4128
|
};
|
|
3547
4129
|
|
|
3548
4130
|
// src/providers/openai.ts
|
|
@@ -3579,6 +4161,71 @@ function extractTextContent(content) {
|
|
|
3579
4161
|
const text = content.map((item) => item.type === "text" || !item.type ? item.text ?? "" : "").join("\n").trim();
|
|
3580
4162
|
return text;
|
|
3581
4163
|
}
|
|
4164
|
+
/**
 * Parses the JSON-encoded argument string of a tool call.
 *
 * Syntax errors and shape errors are reported separately. Previously the
 * shape check threw inside the `try`, so a valid JSON array or scalar was
 * reported as "not valid JSON".
 *
 * @param {string | null | undefined} raw - Raw `arguments` string from the
 *   tool call; empty or whitespace-only input means "no arguments".
 * @param {string} toolName - Tool name, used only in error messages.
 * @returns {object} The parsed arguments object (`{}` for empty input).
 * @throws {Error} When `raw` is not valid JSON, or is valid JSON that is
 *   not a plain object (array, null, or scalar).
 */
function parseToolArguments(raw, toolName) {
  if (!raw || raw.trim() === "") {
    return {};
  }
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    throw new Error(`OpenAI tool call arguments for '${toolName}' were not valid JSON: ${message}`, { cause: error });
  }
  // Checked outside the try so this error is not rewrapped as a JSON syntax error.
  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
    throw new Error(`OpenAI tool call arguments for '${toolName}' must be a JSON object.`);
  }
  return parsed;
}
|
|
4179
|
+
/**
 * Concatenates the text of every "text" block in a content-block list.
 *
 * Blocks of other types are ignored, a missing `text` counts as an empty
 * string, pieces are joined with newlines, and the result is trimmed.
 *
 * @param {Array<{ type: string, text?: unknown }>} blocks
 * @returns {string} The combined text (possibly empty).
 */
function getBlockText(blocks) {
  const pieces = blocks.flatMap((block) => (block.type === "text" ? [String(block.text ?? "")] : []));
  return pieces.join("\n").trim();
}
|
|
4182
|
+
/**
 * Converts an assistant turn expressed as content blocks into a single
 * chat-completions assistant message.
 *
 * Text blocks become `content` (or `null` when there is no text). Each
 * "tool_use" block becomes a function tool call whose arguments are the
 * JSON-serialised `input`. `tool_calls` is only present when at least one
 * tool call exists.
 *
 * @param {Array<object>} blocks - Assistant content blocks.
 * @returns {{ role: "assistant", content: string | null, tool_calls?: Array<object> }}
 */
function mapAssistantBlocksToMessage(blocks) {
  const text = getBlockText(blocks);
  const toFunctionCall = (block) => ({
    id: String(block.id ?? ""),
    type: "function",
    function: {
      name: String(block.name ?? ""),
      arguments: JSON.stringify(block.input ?? {})
    }
  });
  const toolCalls = blocks.filter(({ type }) => type === "tool_use").map((block) => toFunctionCall(block));

  const message = {
    role: "assistant",
    content: text.length > 0 ? text : null
  };
  if (toolCalls.length > 0) {
    message.tool_calls = toolCalls;
  }
  return message;
}
|
|
4198
|
+
/**
 * Converts a user turn expressed as content blocks into chat-completions
 * messages.
 *
 * When the turn contains any "tool_result" blocks, one `tool` message is
 * produced per result and other blocks in the turn are not emitted.
 * Otherwise the turn becomes a single user message holding the combined
 * text of its text blocks.
 *
 * @param {Array<object>} blocks - User content blocks.
 * @returns {Array<object>} One or more messages for the request.
 */
function mapUserBlocksToMessages(blocks) {
  const toolMessages = blocks
    .filter(({ type }) => type === "tool_result")
    .map((result) => ({
      role: "tool",
      tool_call_id: String(result.tool_use_id ?? ""),
      content: String(result.content ?? "")
    }));
  if (toolMessages.length > 0) {
    return toolMessages;
  }
  const userMessage = {
    role: "user",
    content: getBlockText(blocks)
  };
  return [userMessage];
}
|
|
4215
|
+
/**
 * Maps one conversation entry to the chat-completions message(s) that
 * represent it.
 *
 * String content passes through unchanged with the entry's role. Block
 * content is delegated by role: assistant entries yield one message, any
 * other role is treated as a user turn.
 *
 * @param {{ role: string, content: string | Array<object> }} block
 * @returns {Array<object>} The message list for this entry.
 */
function mapConversationBlockToMessages(block) {
  const { role, content } = block;
  if (typeof content === "string") {
    return [{ role, content }];
  }
  return role === "assistant" ? [mapAssistantBlocksToMessage(content)] : mapUserBlocksToMessages(content);
}
|
|
3582
4229
|
var OpenAIProvider = class {
|
|
3583
4230
|
name = "openai";
|
|
3584
4231
|
apiKey;
|
|
@@ -3607,30 +4254,12 @@ var OpenAIProvider = class {
|
|
|
3607
4254
|
this.client = new OpenAIConstructor({ apiKey: this.apiKey });
|
|
3608
4255
|
return this.client;
|
|
3609
4256
|
}
|
|
3610
|
-
async
|
|
4257
|
+
async createCompletion(input) {
|
|
3611
4258
|
const client = await this.ensureClient();
|
|
3612
4259
|
let lastError;
|
|
3613
4260
|
for (let attempt = 0; attempt < 3; attempt += 1) {
|
|
3614
4261
|
try {
|
|
3615
|
-
|
|
3616
|
-
model: options.model,
|
|
3617
|
-
max_tokens: 2048,
|
|
3618
|
-
messages: [
|
|
3619
|
-
{
|
|
3620
|
-
role: "system",
|
|
3621
|
-
content: systemPrompt
|
|
3622
|
-
},
|
|
3623
|
-
{
|
|
3624
|
-
role: "user",
|
|
3625
|
-
content: userMessage
|
|
3626
|
-
}
|
|
3627
|
-
]
|
|
3628
|
-
});
|
|
3629
|
-
const text = (response.choices ?? []).map((choice) => extractTextContent(choice.message?.content)).join("\n").trim();
|
|
3630
|
-
if (text.length === 0) {
|
|
3631
|
-
throw new Error("Model returned an empty response.");
|
|
3632
|
-
}
|
|
3633
|
-
return text;
|
|
4262
|
+
return await client.chat.completions.create(input);
|
|
3634
4263
|
} catch (error) {
|
|
3635
4264
|
lastError = error;
|
|
3636
4265
|
if (!isRetriableError(error) || attempt === 2) {
|
|
@@ -3645,6 +4274,57 @@ var OpenAIProvider = class {
|
|
|
3645
4274
|
}
|
|
3646
4275
|
throw new Error("OpenAI API call failed with an unknown error.");
|
|
3647
4276
|
}
|
|
4277
|
+
toOpenAiMessages(systemPrompt, messages) {
|
|
4278
|
+
return [
|
|
4279
|
+
{
|
|
4280
|
+
role: "system",
|
|
4281
|
+
content: systemPrompt
|
|
4282
|
+
},
|
|
4283
|
+
...messages.flatMap((message) => mapConversationBlockToMessages(message))
|
|
4284
|
+
];
|
|
4285
|
+
}
|
|
4286
|
+
async sendMessage(systemPrompt, userMessage, options) {
|
|
4287
|
+
const response = await this.createCompletion({
|
|
4288
|
+
model: options.model,
|
|
4289
|
+
max_tokens: 2048,
|
|
4290
|
+
messages: this.toOpenAiMessages(systemPrompt, [{ role: "user", content: userMessage }])
|
|
4291
|
+
});
|
|
4292
|
+
const text = (response.choices ?? []).map((choice) => extractTextContent(choice.message?.content)).join("\n").trim();
|
|
4293
|
+
if (text.length === 0) {
|
|
4294
|
+
throw new Error("Model returned an empty response.");
|
|
4295
|
+
}
|
|
4296
|
+
return text;
|
|
4297
|
+
}
|
|
4298
|
+
async sendWithTools(systemPrompt, messages, options) {
|
|
4299
|
+
const response = await this.createCompletion({
|
|
4300
|
+
model: options.model,
|
|
4301
|
+
max_tokens: 2048,
|
|
4302
|
+
messages: this.toOpenAiMessages(systemPrompt, messages),
|
|
4303
|
+
tools: options.tools.map((tool) => ({
|
|
4304
|
+
type: "function",
|
|
4305
|
+
function: {
|
|
4306
|
+
name: tool.name,
|
|
4307
|
+
description: tool.description,
|
|
4308
|
+
parameters: tool.parameters
|
|
4309
|
+
}
|
|
4310
|
+
}))
|
|
4311
|
+
});
|
|
4312
|
+
const choice = response.choices?.[0];
|
|
4313
|
+
const message = choice?.message;
|
|
4314
|
+
const toolUseBlocks = (message?.tool_calls ?? []).map((toolCall, index) => {
|
|
4315
|
+
const toolName = toolCall.function?.name ?? `tool-${index + 1}`;
|
|
4316
|
+
return {
|
|
4317
|
+
id: toolCall.id ?? `${toolName}-${index + 1}`,
|
|
4318
|
+
name: toolName,
|
|
4319
|
+
arguments: parseToolArguments(toolCall.function?.arguments, toolName)
|
|
4320
|
+
};
|
|
4321
|
+
});
|
|
4322
|
+
return {
|
|
4323
|
+
textContent: extractTextContent(message?.content),
|
|
4324
|
+
toolUseBlocks,
|
|
4325
|
+
stopReason: choice?.finish_reason === "stop" ? "end_turn" : choice?.finish_reason === "tool_calls" ? "tool_use" : choice?.finish_reason ?? "end_turn"
|
|
4326
|
+
};
|
|
4327
|
+
}
|
|
3648
4328
|
};
|
|
3649
4329
|
|
|
3650
4330
|
// src/providers/index.ts
|
|
@@ -3815,7 +4495,8 @@ async function handleEvalCommand(targetPath, options, command) {
|
|
|
3815
4495
|
graderModel,
|
|
3816
4496
|
numRuns: options.numRuns,
|
|
3817
4497
|
concurrency: options.concurrency,
|
|
3818
|
-
prompts
|
|
4498
|
+
prompts,
|
|
4499
|
+
maxToolIterations: options.maxToolIterations
|
|
3819
4500
|
});
|
|
3820
4501
|
if (options.saveResults) {
|
|
3821
4502
|
await writeJsonFile(options.saveResults, result);
|
|
@@ -3862,7 +4543,8 @@ function registerEvalCommand(program) {
|
|
|
3862
4543
|
verbose: Boolean(parsedCli.data.verbose),
|
|
3863
4544
|
apiKey: parsedCli.data.apiKey,
|
|
3864
4545
|
numRuns: config.eval.numRuns,
|
|
3865
|
-
concurrency: config.concurrency
|
|
4546
|
+
concurrency: config.concurrency,
|
|
4547
|
+
maxToolIterations: config.eval.maxToolIterations
|
|
3866
4548
|
},
|
|
3867
4549
|
command
|
|
3868
4550
|
);
|
|
@@ -3919,7 +4601,8 @@ async function runCheck(inputPath, options) {
|
|
|
3919
4601
|
graderModel: options.graderModel,
|
|
3920
4602
|
numRuns: options.evalNumRuns,
|
|
3921
4603
|
prompts: options.prompts,
|
|
3922
|
-
concurrency: options.concurrency
|
|
4604
|
+
concurrency: options.concurrency,
|
|
4605
|
+
maxToolIterations: options.evalMaxToolIterations
|
|
3923
4606
|
};
|
|
3924
4607
|
if ((options.concurrency ?? 5) === 1) {
|
|
3925
4608
|
options.onStage?.("trigger");
|
|
@@ -4041,6 +4724,7 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
4041
4724
|
triggerSeed: options.triggerSeed,
|
|
4042
4725
|
prompts,
|
|
4043
4726
|
evalNumRuns: options.numRuns,
|
|
4727
|
+
evalMaxToolIterations: options.maxToolIterations,
|
|
4044
4728
|
concurrency: options.concurrency,
|
|
4045
4729
|
minF1: options.minF1,
|
|
4046
4730
|
minAssertPassRate: options.minAssertPassRate,
|
|
@@ -4106,6 +4790,7 @@ function registerCheckCommand(program) {
|
|
|
4106
4790
|
minF1: config.trigger.threshold,
|
|
4107
4791
|
minAssertPassRate: config.eval.threshold,
|
|
4108
4792
|
numRuns: config.eval.numRuns,
|
|
4793
|
+
maxToolIterations: config.eval.maxToolIterations,
|
|
4109
4794
|
concurrency: config.concurrency,
|
|
4110
4795
|
html: parsedCli.data.html,
|
|
4111
4796
|
lintFailOn: config.lint.failOn,
|
|
@@ -4121,12 +4806,572 @@ function registerCheckCommand(program) {
|
|
|
4121
4806
|
});
|
|
4122
4807
|
}
|
|
4123
4808
|
|
|
4809
|
+
// src/commands/improve.ts
|
|
4810
|
+
import ora4 from "ora";
|
|
4811
|
+
import { z as z12 } from "zod";
|
|
4812
|
+
|
|
4813
|
+
// src/core/improver.ts
|
|
4814
|
+
import fs12 from "node:fs/promises";
|
|
4815
|
+
import os from "node:os";
|
|
4816
|
+
import path7 from "node:path";
|
|
4817
|
+
import yaml2 from "js-yaml";
|
|
4818
|
+
import { z as z11 } from "zod";
|
|
4819
|
+
var improveRewriteSchema = z11.object({
|
|
4820
|
+
frontmatter: z11.record(z11.unknown()),
|
|
4821
|
+
content: z11.string().min(1),
|
|
4822
|
+
changeSummary: z11.array(z11.string().min(1)).min(1),
|
|
4823
|
+
targetedProblems: z11.array(z11.string().min(1)).min(1)
|
|
4824
|
+
});
|
|
4825
|
+
/**
 * Computes the fraction of eval assertions that passed.
 *
 * @param {{ summary: { passedAssertions: number, totalAssertions: number } } | null | undefined} result
 *   Eval result, or a falsy value when no eval was run.
 * @returns {number} `passedAssertions / totalAssertions`, or 0 when there
 *   is no result or no assertions were evaluated.
 */
function calculateEvalAssertPassRate2(result) {
  if (!result) {
    return 0;
  }
  const { passedAssertions, totalAssertions } = result.summary;
  return totalAssertions === 0 ? 0 : passedAssertions / totalAssertions;
}
|
|
4831
|
+
/**
 * Extracts and parses the JSON object embedded in a model response.
 *
 * The candidate is the span from the first "{" to the last "}", which
 * tolerates surrounding prose or code fences. A response that is already a
 * bare object is covered by the same rule, so no separate fast path is
 * needed.
 *
 * Malformed JSON is reported through the same descriptive error as a
 * missing object instead of a bare `SyntaxError`; the original error is
 * attached as `cause`.
 *
 * @param {string} raw - Raw model output.
 * @returns {unknown} The parsed JSON value.
 * @throws {Error} When no "{...}" span exists or the span is not valid JSON.
 */
function extractJsonObject2(raw) {
  const trimmed = raw.trim();
  const start = trimmed.indexOf("{");
  const end = trimmed.lastIndexOf("}");
  if (start < 0 || end <= start) {
    throw new Error("Improver did not return a JSON object.");
  }
  try {
    return JSON.parse(trimmed.slice(start, end + 1));
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Improver did not return a JSON object: ${reason}`, { cause: error });
  }
}
|
|
4843
|
+
/**
 * Returns a copy of the frontmatter with `name`, `description`, and
 * `license` first (in that order, when present as own properties),
 * followed by the remaining keys in their original order.
 *
 * @param {object} frontmatter - Frontmatter key/value pairs.
 * @returns {object} A new object with the keys reordered.
 */
function orderFrontmatter(frontmatter) {
  const owns = (target, key) => Object.prototype.hasOwnProperty.call(target, key);
  const ordered = {};
  ["name", "description", "license"]
    .filter((key) => owns(frontmatter, key))
    .forEach((key) => {
      ordered[key] = frontmatter[key];
    });
  Object.entries(frontmatter).forEach(([key, value]) => {
    if (!owns(ordered, key)) {
      ordered[key] = value;
    }
  });
  return ordered;
}
|
|
4857
|
+
/**
 * Picks the line ending to use when re-emitting a document.
 *
 * @param {string} raw - Original document text.
 * @returns {"\r\n" | "\n"} CRLF when the text contains at least one CRLF
 *   sequence, otherwise LF.
 */
function detectLineEnding(raw) {
  if (raw.includes("\r\n")) {
    return "\r\n";
  }
  return "\n";
}
|
|
4860
|
+
/**
 * Serialises frontmatter and a markdown body into a complete SKILL.md
 * document using the requested line ending.
 *
 * Line endings are normalised with `/\r?\n/` so an existing CRLF is
 * replaced as a unit. The previous `/\n/` replacement turned "\r\n" in the
 * body into "\r\r\n" when the target ending was CRLF, and left stray CRLFs
 * in place when it was LF.
 *
 * @param {object} frontmatter - Frontmatter to dump as YAML (key order is
 *   set by `orderFrontmatter`).
 * @param {string} content - Markdown body; surrounding whitespace is trimmed.
 * @param {string} lineEnding - "\n" or "\r\n".
 * @returns {string} The document: fenced frontmatter, a blank line, the
 *   body, and a trailing line ending.
 * @throws {Error} When the body is empty after trimming.
 */
function buildSkillMarkdown(frontmatter, content, lineEnding) {
  const normalizedBody = content.trim();
  if (normalizedBody.length === 0) {
    throw new Error("Candidate rewrite produced an empty SKILL.md body.");
  }
  // A replacer function is used so the line ending is always inserted literally.
  const withLineEnding = (text) => text.replace(/\r?\n/g, () => lineEnding);
  // NOTE(review): js-yaml documents -1 as the "no wrapping" value for lineWidth;
  // confirm that 0 behaves as intended here rather than falling back to the default width.
  const frontmatterBlock = withLineEnding(
    yaml2.dump(orderFrontmatter(frontmatter), {
      lineWidth: 0,
      noRefs: true,
      sortKeys: false
    })
  );
  return `---${lineEnding}${frontmatterBlock}---${lineEnding}${lineEnding}${withLineEnding(normalizedBody)}${lineEnding}`;
}
|
|
4872
|
+
/**
 * Verifies that every relative file reference found in a candidate
 * document stays inside the skill root and points at an existing path.
 *
 * The escape check compares whole path segments. The previous
 * `startsWith("..")` test also rejected legitimate in-root names that
 * merely begin with two dots (for example "..notes.md").
 *
 * @param {string} raw - Candidate SKILL.md text to scan for references.
 * @param {string} skillRoot - Directory that references must resolve within.
 * @returns {Promise<void>} Resolves when all references are valid.
 * @throws {Error} On the first reference that escapes the root or does not
 *   exist.
 */
async function validateRelativeReferences(raw, skillRoot) {
  for (const reference of extractRelativeFileReferences(raw)) {
    const resolved = path7.resolve(skillRoot, reference);
    const relativeToRoot = path7.relative(skillRoot, resolved);
    // Outside the root means the relative path is exactly "..", starts with a
    // ".." segment, or is absolute (e.g. a different drive on Windows).
    const escapesRoot = relativeToRoot === ".." || relativeToRoot.startsWith(`..${path7.sep}`) || path7.isAbsolute(relativeToRoot);
    if (escapesRoot) {
      throw new Error(`Candidate rewrite introduced an out-of-root reference: ${reference}`);
    }
    if (!await pathExists(resolved)) {
      throw new Error(`Candidate rewrite introduced a broken relative reference: ${reference}`);
    }
  }
}
|
|
4885
|
+
/**
 * Turns a model-proposed rewrite into a validated candidate SKILL.md.
 *
 * The rewrite's frontmatter is layered over the existing frontmatter, but
 * the skill name and any existing license are pinned to their current
 * values; a rewrite that supplies a different string for either is
 * rejected. The assembled document is then strictly parsed and its
 * relative references are checked against the skill root.
 *
 * @param {object} skill - Parsed skill (`frontmatter`, `raw`, `skillRoot`,
 *   `skillFile`).
 * @param {object} rewrite - Proposed `frontmatter`, `content`,
 *   `changeSummary`, and `targetedProblems`.
 * @returns {Promise<object>} Candidate with merged `frontmatter`, trimmed
 *   `content`, serialised `raw`, and the rewrite's summaries.
 * @throws {Error} When the rewrite renames the skill, changes the license,
 *   or fails parsing or reference validation.
 */
async function buildCandidate(skill, rewrite) {
  const currentName = skill.frontmatter.name;
  const currentLicense = skill.frontmatter.license;
  const proposedName = rewrite.frontmatter.name;
  const proposedLicense = rewrite.frontmatter.license;

  const renamesSkill = typeof proposedName === "string" && proposedName !== currentName;
  if (renamesSkill) {
    throw new Error(`Candidate rewrite attempted to rename skill '${skill.frontmatter.name}' to '${rewrite.frontmatter.name}'.`);
  }
  const changesLicense = currentLicense && typeof proposedLicense === "string" && proposedLicense !== currentLicense;
  if (changesLicense) {
    throw new Error(
      `Candidate rewrite attempted to change license '${skill.frontmatter.license}' to '${rewrite.frontmatter.license}'.`
    );
  }

  // Pinned values are applied last so the rewrite can never override them.
  const mergedFrontmatter = {
    ...skill.frontmatter,
    ...rewrite.frontmatter,
    name: currentName
  };
  if (currentLicense) {
    mergedFrontmatter.license = currentLicense;
  }

  const raw = buildSkillMarkdown(mergedFrontmatter, rewrite.content, detectLineEnding(skill.raw));
  parseSkillDocumentStrict(raw, skill.skillRoot, skill.skillFile);
  await validateRelativeReferences(raw, skill.skillRoot);

  const { changeSummary, targetedProblems } = rewrite;
  return {
    frontmatter: mergedFrontmatter,
    content: rewrite.content.trim(),
    raw,
    changeSummary,
    targetedProblems
  };
}
|
|
4911
|
+
/**
 * Distils a check result into the brief of problems a rewrite should
 * address.
 *
 * @param {object} result - Check result with `lint.issues` and optional
 *   `trigger` (`cases`, `suggestions`) and `eval` (`results`) sections.
 * @returns {{ lintIssues: object[], triggerFailures: object[], evalFailures: object[], triggerSuggestions: unknown[] }}
 *   - `lintIssues`: every non-passing lint issue; any status other than
 *     "warn" is reported as "fail".
 *   - `triggerFailures`: trigger cases whose `matched` flag is falsy.
 *   - `evalFailures`: failed assertions paired with their prompt; a source
 *     other than "grader" or "tool" is reported as "unknown".
 *   - `triggerSuggestions`: the trigger suggestions, or an empty list.
 */
function extractActionableIssues(result) {
  const toLintIssue = ({ checkId, title, status, message, suggestion, startLine, endLine }) => ({
    checkId,
    title,
    status: status === "warn" ? "warn" : "fail",
    message,
    suggestion,
    startLine,
    endLine
  });
  const toTriggerFailure = ({ query, expected, actual, selectedCompetitor, rawModelResponse }) => ({
    query,
    expected,
    actual,
    selectedCompetitor,
    rawModelResponse
  });
  const normalizeSource = (source) => (source === "grader" || source === "tool" ? source : "unknown");
  const failedAssertionsOf = (promptResult) =>
    promptResult.assertions
      .filter((assertion) => !assertion.passed)
      .map(({ assertion, evidence, source }) => ({
        prompt: promptResult.prompt,
        assertion,
        evidence,
        source: normalizeSource(source)
      }));

  const lintIssues = result.lint.issues.filter(({ status }) => status !== "pass").map((issue) => toLintIssue(issue));
  const unmatchedCases = result.trigger?.cases.filter((testCase) => !testCase.matched);
  const triggerFailures = (unmatchedCases ?? []).map((testCase) => toTriggerFailure(testCase));
  const evalFailures = result.eval?.results.flatMap((promptResult) => failedAssertionsOf(promptResult)) ?? [];
  const triggerSuggestions = result.trigger?.suggestions ?? [];

  return { lintIssues, triggerFailures, evalFailures, triggerSuggestions };
}
|
|
4943
|
+
/**
 * Tells whether a problem brief contains anything to fix.
 *
 * @param {{ lintIssues: unknown[], triggerFailures: unknown[], evalFailures: unknown[], triggerSuggestions: unknown[] }} brief
 * @returns {boolean} True when at least one of the four lists is non-empty.
 */
function hasActionableProblems(brief) {
  const sections = [brief.lintIssues, brief.triggerFailures, brief.evalFailures, brief.triggerSuggestions];
  return sections.some((entries) => entries.length > 0);
}
|
|
4946
|
+
/**
 * Recursively lists every regular file under the skill root.
 *
 * Paths are relative to `skillRoot` itself and use "/" separators on every
 * platform. The previous implementation recursed by calling itself with
 * the subdirectory as the new root, so nested files lost their directory
 * prefix ("scripts/run.sh" was reported as "run.sh").
 *
 * Entries that are neither directories nor regular files are ignored.
 *
 * @param {string} skillRoot - Directory to walk.
 * @returns {Promise<string[]>} Sorted root-relative file paths.
 */
async function listSkillFiles(skillRoot) {
  const files = [];
  // The walk closes over skillRoot so every level is made relative to the same base.
  const walk = async (directory) => {
    const entries = await fs12.readdir(directory, { withFileTypes: true });
    for (const entry of entries) {
      const absolutePath = path7.join(directory, entry.name);
      if (entry.isDirectory()) {
        await walk(absolutePath);
      } else if (entry.isFile()) {
        files.push(path7.relative(skillRoot, absolutePath).split(path7.sep).join("/"));
      }
    }
  };
  await walk(skillRoot);
  return files.sort();
}
|
|
4961
|
+
/**
 * Asks the model for an improved SKILL.md and returns the validated
 * rewrite.
 *
 * The system prompt fixes the JSON output contract and the constraints
 * (name and license preserved, no invented files). The user prompt carries
 * the baseline metrics, the files available under the skill root, the
 * current document, and the problem brief as JSON.
 *
 * @param {object} skill - Parsed skill (`frontmatter`, `raw`, `skillRoot`,
 *   `skillFile`).
 * @param {object} baseline - Baseline check result (lint summary plus
 *   optional trigger and eval sections).
 * @param {object} brief - Actionable problems to include in the prompt.
 * @param {object} provider - Provider exposing `sendMessage`.
 * @param {string} model - Model identifier forwarded to the provider.
 * @returns {Promise<object>} The rewrite matching `improveRewriteSchema`.
 * @throws {Error} When the reply has no JSON object or fails schema
 *   validation.
 */
async function requestRewrite(skill, baseline, brief, provider, model) {
  const availableFiles = await listSkillFiles(skill.skillRoot);

  const licenseRule = skill.frontmatter.license
    ? `Keep the license exactly '${skill.frontmatter.license}'.`
    : "Do not remove any valid existing frontmatter fields.";
  const systemRules = [
    "You rewrite Agent Skill files to improve measured quality.",
    "Return JSON only.",
    "Required format:",
    '{"frontmatter": {...}, "content": "...", "changeSummary": ["..."], "targetedProblems": ["..."]}',
    "The content field must contain only the markdown body of SKILL.md, without YAML frontmatter fences.",
    `Keep the skill name exactly '${skill.frontmatter.name}'.`,
    licenseRule,
    "Do not invent new scripts, assets, references, APIs, or tools.",
    "Only reference files that already exist under the skill root.",
    "Optimize for trigger clarity, explicit scope boundaries, concrete examples, safety guidance, and tool usage instructions."
  ];
  const systemPrompt = systemRules.join(" ");

  // A missing trigger or eval section counts as a score of 0 in the prompt.
  const baselineTriggerF1 = baseline.trigger?.metrics.f1 ?? 0;
  const baselineEvalPassRate = calculateEvalAssertPassRate2(baseline.eval);

  const promptLines = [];
  promptLines.push(
    `Skill file: ${skill.skillFile}`,
    `Current trigger F1: ${baselineTriggerF1.toFixed(4)}`,
    `Current eval assertion pass rate: ${baselineEvalPassRate.toFixed(4)}`,
    `Lint failures: ${baseline.lint.summary.failures}`,
    `Lint warnings: ${baseline.lint.summary.warnings}`,
    ""
  );
  promptLines.push("Available files under the skill root:");
  for (const file of availableFiles) {
    promptLines.push(`- ${file}`);
  }
  promptLines.push("");
  promptLines.push("Current SKILL.md:", "```markdown", skill.raw, "```", "");
  promptLines.push("Actionable problems to fix:", JSON.stringify(brief, null, 2), "");
  promptLines.push("Rewrite the skill to address only these evidenced problems. Keep the instructions tight and practical.");
  const userPrompt = promptLines.join("\n");

  const reply = await provider.sendMessage(systemPrompt, userPrompt, { model });
  const validation = improveRewriteSchema.safeParse(extractJsonObject2(reply));
  if (validation.success) {
    return validation.data;
  }
  throw new Error(`Failed to parse improve output: ${validation.error.issues[0]?.message ?? "invalid improve JSON"}`);
}
|
|
5004
|
+
/**
 * Creates a throwaway copy of a skill directory in which SKILL.md is replaced
 * by a candidate rewrite.
 *
 * @param {string} skillRoot - Directory of the original skill; copied recursively.
 * @param {string} candidateRaw - Full text written to SKILL.md inside the copy.
 * @returns {Promise<{tempRoot: string, skillPath: string}>} `tempRoot` is the
 *   newly created temp directory (the caller is responsible for removing it);
 *   `skillPath` is the copied skill directory inside it.
 * @throws Rethrows any filesystem error from the copy or write steps after
 *   removing the partially populated temp directory.
 */
async function createVerificationDirectory(skillRoot, candidateRaw) {
  const tempRoot = await fs12.mkdtemp(path7.join(os.tmpdir(), "skilltest-improve-"));
  try {
    const tempSkillRoot = path7.join(tempRoot, path7.basename(skillRoot));
    await fs12.cp(skillRoot, tempSkillRoot, { recursive: true });
    await fs12.writeFile(path7.join(tempSkillRoot, "SKILL.md"), candidateRaw, "utf8");
    return {
      tempRoot,
      skillPath: tempSkillRoot
    };
  } catch (error) {
    // The caller only learns tempRoot on success, so a failure here would
    // otherwise leak the directory. Cleanup is best-effort: the original
    // error is the one worth reporting.
    await fs12.rm(tempRoot, { recursive: true, force: true }).catch(() => {});
    throw error;
  }
}
|
|
5014
|
+
/**
 * Compares a baseline check result with a verification check result.
 *
 * Lint deltas are `before - after` (positive means fewer findings); trigger F1
 * and eval pass-rate deltas are `after - before` (positive means a higher
 * score). A missing trigger result counts as an F1 of 0.
 *
 * @param {object} baseline - Check result for the original skill.
 * @param {object} verification - Check result for the candidate skill.
 * @returns {object} Before/after/delta per metric, the gate status before and
 *   after, plus the `improved` and `hasRegression` flags.
 */
function buildDelta(baseline, verification) {
  const f1Before = baseline.trigger?.metrics.f1 ?? 0;
  const f1After = verification.trigger?.metrics.f1 ?? 0;
  const passRateBefore = calculateEvalAssertPassRate2(baseline.eval);
  const passRateAfter = calculateEvalAssertPassRate2(verification.eval);

  const lintBefore = baseline.lint.summary;
  const lintAfter = verification.lint.summary;

  const lintFailures = {
    before: lintBefore.failures,
    after: lintAfter.failures,
    delta: lintBefore.failures - lintAfter.failures
  };
  const lintWarnings = {
    before: lintBefore.warnings,
    after: lintAfter.warnings,
    delta: lintBefore.warnings - lintAfter.warnings
  };
  const triggerF1 = {
    before: f1Before,
    after: f1After,
    delta: f1After - f1Before
  };
  const evalAssertPassRate = {
    before: passRateBefore,
    after: passRateAfter,
    delta: passRateAfter - passRateBefore
  };

  // Any single metric moving the wrong way counts as a regression.
  const hasRegression = [
    lintAfter.failures > lintBefore.failures,
    lintAfter.warnings > lintBefore.warnings,
    f1After < f1Before,
    passRateAfter < passRateBefore
  ].some(Boolean);

  // A change in the overall gate verdict decides the outcome by itself;
  // otherwise any metric moving the right way is enough.
  const gateBefore = baseline.gates.overallPassed;
  const gateAfter = verification.gates.overallPassed;
  let improved;
  if (gateAfter !== gateBefore) {
    improved = gateAfter;
  } else {
    improved = [
      lintFailures.delta > 0,
      lintWarnings.delta > 0,
      triggerF1.delta > 0,
      evalAssertPassRate.delta > 0
    ].some(Boolean);
  }

  return {
    lintFailures,
    lintWarnings,
    triggerF1,
    evalAssertPassRate,
    overallPassed: {
      before: gateBefore,
      after: gateAfter
    },
    improved,
    hasRegression
  };
}
|
|
5054
|
+
/**
 * Returns a shallow copy of a check result whose `target` is replaced.
 *
 * @param {object} result - Check result to copy; it is not mutated.
 * @param {*} target - Value stored under `target` in the copy.
 * @returns {object} The copied result with the new `target`.
 */
function normalizeVerificationTarget(result, target) {
  const retargeted = { ...result };
  retargeted.target = target;
  return retargeted;
}
|
|
5060
|
+
/**
 * Picks the message explaining why a candidate rewrite must not be written.
 *
 * Checks run in priority order: regression, then lack of improvement, then
 * failing quality gates.
 *
 * @param {{hasRegression: *, improved: *}} delta - Flags produced by the metric comparison.
 * @param {{gates: {overallPassed: *}}} verification - Check result of the candidate.
 * @returns {string|undefined} The first applicable reason, or `undefined`
 *   when nothing blocks the candidate.
 */
function buildBlockingReason(delta, verification) {
  // Predicates are functions so later rules are only evaluated when earlier
  // ones did not match.
  const rules = [
    [
      () => delta.hasRegression,
      "Candidate rewrite regressed one or more quality metrics on the frozen test set."
    ],
    [
      () => !delta.improved,
      "Candidate rewrite did not produce a measurable improvement on the frozen test set."
    ],
    [
      () => !verification.gates.overallPassed,
      "Candidate rewrite improved the skill but still failed the configured quality gates."
    ]
  ];
  for (const [applies, reason] of rules) {
    if (applies()) {
      return reason;
    }
  }
  return undefined;
}
|
|
5072
|
+
/**
 * Writes text to a file, creating missing parent directories first.
 *
 * @param {string} outputPath - Destination path; resolved to an absolute path.
 * @param {string} raw - Content written as UTF-8.
 * @returns {Promise<string>} The absolute path that was written.
 */
async function maybeWriteOutput(outputPath, raw) {
  const destination = path7.resolve(outputPath);
  const parentDirectory = path7.dirname(destination);
  await fs12.mkdir(parentDirectory, { recursive: true });
  await fs12.writeFile(destination, raw, { encoding: "utf8" });
  return destination;
}
|
|
5078
|
+
/**
 * Runs the improve pipeline: baseline check, candidate rewrite, verification
 * of the candidate against the baseline's queries and prompts, and optional
 * writing of the verified candidate.
 *
 * Stages are announced through `options.onStage` in this order: "baseline",
 * "generate", "validate", "verify", and "write" (once per write performed).
 *
 * @param {string} inputPath - Path handed to the check and strict-parse steps.
 * @param {object} options - Provider, model, lint/trigger/eval settings,
 *   thresholds, `apply`, `outputPath`, `verbose` and the optional `onStage` callback.
 * @returns {Promise<object>} Result with `target`, `provider`, `model`,
 *   `originalRaw`, `thresholds`, `baseline`, `candidate`, `verification`,
 *   `delta`, `applied`, and, when applicable, `outputPath` / `blockedReason`.
 */
async function runImprove(inputPath, options) {
  const announce = (stage) => options.onStage?.(stage);

  announce("baseline");
  const baseline = await runCheck(inputPath, {
    provider: options.provider,
    model: options.model,
    graderModel: options.model,
    lintFailOn: options.lintFailOn,
    lintSuppress: options.lintSuppress,
    lintPlugins: options.lintPlugins,
    compare: options.compare,
    numQueries: options.numQueries,
    triggerSeed: options.triggerSeed,
    queries: options.queries,
    evalNumRuns: options.evalNumRuns,
    prompts: options.prompts,
    evalMaxToolIterations: options.evalMaxToolIterations,
    concurrency: options.concurrency,
    minF1: options.minF1,
    minAssertPassRate: options.minAssertPassRate,
    continueOnLintFail: true,
    verbose: options.verbose
  });

  // Every exit path reports the same header fields; only the outcome differs.
  const report = (originalRaw, outcome) => ({
    target: inputPath,
    provider: options.provider.name,
    model: options.model,
    originalRaw,
    thresholds: {
      minF1: options.minF1,
      minAssertPassRate: options.minAssertPassRate
    },
    baseline,
    ...outcome
  });

  // Without both a trigger and an eval result there is nothing to verify against.
  if (!(baseline.trigger && baseline.eval)) {
    return report("", {
      candidate: null,
      verification: null,
      delta: null,
      applied: false,
      blockedReason: baseline.triggerSkippedReason ?? baseline.evalSkippedReason ?? "Improve requires a strictly parseable skill so trigger and eval can be frozen."
    });
  }

  const skill = await parseSkillStrict(inputPath);
  const brief = extractActionableIssues(baseline);
  if (!hasActionableProblems(brief)) {
    return report(skill.raw, {
      candidate: null,
      verification: null,
      delta: null,
      applied: false,
      blockedReason: "No actionable failures, warnings, or mismatches were found to improve."
    });
  }

  announce("generate");
  const rewrite = await requestRewrite(skill, baseline, brief, options.provider, options.model);

  announce("validate");
  const candidate = await buildCandidate(skill, rewrite);
  if (candidate.raw === skill.raw) {
    return report(skill.raw, {
      candidate,
      verification: null,
      delta: null,
      applied: false,
      blockedReason: "Candidate rewrite produced no changes."
    });
  }

  announce("verify");
  const { tempRoot, skillPath } = await createVerificationDirectory(skill.skillRoot, candidate.raw);
  let verification;
  try {
    // Reuse the baseline's queries and prompts so both runs see the same inputs.
    const rerun = await runCheck(skillPath, {
      provider: options.provider,
      model: options.model,
      graderModel: options.model,
      lintFailOn: options.lintFailOn,
      lintSuppress: options.lintSuppress,
      lintPlugins: options.lintPlugins,
      compare: options.compare,
      numQueries: baseline.trigger.queries.length,
      triggerSeed: options.triggerSeed,
      queries: baseline.trigger.queries,
      evalNumRuns: baseline.eval.prompts.length,
      prompts: baseline.eval.prompts,
      evalMaxToolIterations: options.evalMaxToolIterations,
      concurrency: options.concurrency,
      minF1: options.minF1,
      minAssertPassRate: options.minAssertPassRate,
      continueOnLintFail: true,
      verbose: options.verbose
    });
    // Report the caller's path rather than the temporary copy.
    verification = normalizeVerificationTarget(rerun, inputPath);
  } finally {
    await fs12.rm(tempRoot, { recursive: true, force: true });
  }

  const delta = buildDelta(baseline, verification);
  const blockedReason = buildBlockingReason(delta, verification);

  let applied = false;
  let outputPath;
  if (!blockedReason) {
    if (options.outputPath) {
      announce("write");
      outputPath = await maybeWriteOutput(options.outputPath, candidate.raw);
    }
    if (options.apply) {
      announce("write");
      await fs12.writeFile(skill.skillFile, candidate.raw, "utf8");
      applied = true;
    }
  }

  const outcome = { candidate, verification, delta, applied };
  if (outputPath) {
    outcome.outputPath = outputPath;
  }
  if (blockedReason) {
    outcome.blockedReason = blockedReason;
  }
  return report(skill.raw, outcome);
}
|
|
5223
|
+
|
|
5224
|
+
// src/commands/improve.ts
|
|
5225
|
+
// Validation schema for the option values read from `command.opts()` by the
// improve command. Every field is optional.
var improveCliSchema = (() => {
  // Factories return a fresh schema per field, exactly as inline definitions would.
  const optionalText = () => z12.string().optional();
  const optionalFlag = () => z12.boolean().optional();
  const optionalNonEmptyList = () => z12.array(z12.string().min(1)).optional();

  return z12.object({
    apiKey: optionalText(),
    queries: optionalText(),
    compare: optionalNonEmptyList(),
    seed: z12.number().int().optional(),
    prompts: optionalText(),
    plugin: optionalNonEmptyList(),
    concurrency: z12.number().int().min(1).optional(),
    output: optionalText(),
    saveResults: optionalText(),
    apply: optionalFlag(),
    verbose: optionalFlag()
  });
})();
|
|
5238
|
+
// Model id that resolveModel4 treats as "the Anthropic default": when the
// provider is "openai" and the model equals this value, it is swapped for
// DEFAULT_OPENAI_MODEL4.
var DEFAULT_ANTHROPIC_MODEL4 = "claude-sonnet-4-5-20250929";
// Model id substituted by resolveModel4 in that case.
var DEFAULT_OPENAI_MODEL4 = "gpt-4.1-mini";
|
|
5240
|
+
/**
 * Accumulator used as the parser for the repeatable `--plugin` option.
 *
 * @param {string} value - Newly supplied value.
 * @param {string[]} [previous=[]] - Values collected so far; left untouched.
 * @returns {string[]} New array with `value` appended.
 */
function collectPluginPaths3(value, previous = []) {
  const collected = [...previous];
  collected.push(value);
  return collected;
}
|
|
5243
|
+
/**
 * Chooses the model id to use for a provider.
 *
 * @param {string} provider - Provider name.
 * @param {string} model - Configured model id.
 * @returns {string} `DEFAULT_OPENAI_MODEL4` when the provider is "openai" and
 *   `model` is still `DEFAULT_ANTHROPIC_MODEL4`; otherwise `model` unchanged.
 */
function resolveModel4(provider, model) {
  const isOpenAi = provider === "openai";
  const stillAnthropicDefault = model === DEFAULT_ANTHROPIC_MODEL4;
  return isOpenAi && stillAnthropicDefault ? DEFAULT_OPENAI_MODEL4 : model;
}
|
|
5249
|
+
/**
 * Executes the improve command: prepares provider, queries and prompts, runs
 * the improve pipeline, and prints (and optionally saves) the result.
 *
 * Sets `process.exitCode` to 0 when the result has no `blockedReason`, 1 when
 * it has one, and 2 when any step throws.
 *
 * @param {string} targetPath - Skill path passed on to the improve pipeline.
 * @param {object} options - Merged global, config and CLI options.
 * @param {object} command - Command object, used to load configured eval prompts.
 * @returns {Promise<void>}
 */
async function handleImproveCommand(targetPath, options, command) {
  // A spinner is only shown for interactive, non-JSON output.
  const showSpinner = !options.json && Boolean(process.stdout.isTTY);
  const spinner = showSpinner ? ora4("Preparing improvement run...").start() : null;
  const setStatus = (text) => {
    if (spinner) {
      spinner.text = text;
    }
  };
  const stageStatus = new Map([
    ["baseline", "Running baseline check..."],
    ["generate", "Generating candidate rewrite..."],
    ["validate", "Validating candidate rewrite..."],
    ["verify", "Verifying candidate against frozen test inputs..."]
  ]);

  try {
    setStatus("Initializing model provider...");
    const provider = createProvider(options.provider, options.apiKey);

    let queries;
    if (options.queries) {
      setStatus("Loading frozen trigger queries...");
      queries = await loadTriggerQueriesFile(options.queries);
    }

    let prompts;
    if (options.prompts) {
      setStatus("Loading eval prompts...");
      prompts = await loadEvalPromptsJson(options.prompts);
    } else {
      prompts = await loadConfiguredEvalPrompts(command);
    }

    const model = resolveModel4(options.provider, options.model);
    const result = await runImprove(targetPath, {
      provider,
      model,
      lintFailOn: options.lintFailOn,
      lintSuppress: options.lintSuppress,
      lintPlugins: options.lintPlugins,
      compare: options.compare,
      numQueries: options.numQueries,
      triggerSeed: options.triggerSeed,
      queries,
      prompts,
      evalNumRuns: options.numRuns,
      evalMaxToolIterations: options.maxToolIterations,
      minF1: options.minF1,
      minAssertPassRate: options.minAssertPassRate,
      concurrency: options.concurrency,
      apply: options.apply,
      outputPath: options.output,
      verbose: options.verbose,
      onStage: (stage) => {
        if (!spinner) {
          return;
        }
        if (stage === "write") {
          spinner.text = options.apply ? "Writing improved SKILL.md..." : "Writing candidate output...";
          return;
        }
        const text = stageStatus.get(stage);
        if (text !== undefined) {
          spinner.text = text;
        }
      }
    });

    if (options.saveResults) {
      await writeJsonFile(options.saveResults, result);
    }
    spinner?.stop();

    const asJson = Boolean(options.json);
    writeResult(asJson ? result : renderImproveReport(result, options.color, options.verbose), asJson);
    process.exitCode = result.blockedReason ? 1 : 0;
  } catch (error) {
    spinner?.stop();
    writeError(error, options.json);
    process.exitCode = 2;
  }
}
|
|
5325
|
+
/**
 * Registers the `improve` subcommand, its options and its action on a program.
 *
 * The action validates the parsed CLI options with `improveCliSchema`. On
 * failure it reports the first issue and sets exit code 2; otherwise it merges
 * global options, resolved config and CLI values and delegates to
 * `handleImproveCommand`.
 *
 * @param {object} program - Command registry exposing a chainable `command()`.
 * @returns {void}
 */
function registerImproveCommand(program) {
  const parseInteger = (value) => Number.parseInt(value, 10);
  const parseDecimal = (value) => Number.parseFloat(value);

  program
    .command("improve")
    .description("Rewrite SKILL.md, verify it on frozen test inputs, and optionally apply it.")
    .argument("<path-to-skill>", "Path to SKILL.md or skill directory")
    .option("--provider <provider>", "LLM provider: anthropic|openai")
    .option("--model <model>", "Model for baseline, rewrite, and verification runs")
    .option("--api-key <key>", "API key override")
    .option("--queries <path>", "Path to custom trigger queries JSON")
    .option("--compare <path...>", "Path(s) to sibling skill directories to include as competitors")
    .option("--num-queries <n>", "Number of auto-generated trigger queries", parseInteger)
    .option("--seed <number>", "RNG seed for reproducible trigger results", parseInteger)
    .option("--prompts <path>", "Path to eval prompts JSON")
    .option("--plugin <path>", "Load a custom lint plugin file", collectPluginPaths3, [])
    .option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", parseInteger)
    .option("--output <path>", "Write the verified candidate SKILL.md to a separate file")
    .option("--save-results <path>", "Save the full improve result JSON")
    .option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", parseDecimal)
    .option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", parseDecimal)
    .option("--apply", "Apply the verified rewrite to the source SKILL.md")
    .option("--verbose", "Include detailed baseline and verification reports")
    .action(async (targetPath, _commandOptions, command) => {
      const globalOptions = getGlobalCliOptions(command);
      const config = getResolvedConfig(command);
      const parsedCli = improveCliSchema.safeParse(command.opts());
      if (!parsedCli.success) {
        const firstIssue = parsedCli.error.issues[0]?.message ?? "Invalid improve options.";
        writeError(new Error(firstIssue), globalOptions.json);
        process.exitCode = 2;
        return;
      }

      const cli = parsedCli.data;
      // Explicit keys come after the spread so they override global options.
      const improveOptions = {
        ...globalOptions,
        provider: config.provider,
        model: config.model,
        apiKey: cli.apiKey,
        queries: cli.queries,
        compare: config.trigger.compare,
        numQueries: config.trigger.numQueries,
        prompts: cli.prompts,
        minF1: config.trigger.threshold,
        minAssertPassRate: config.eval.threshold,
        numRuns: config.eval.numRuns,
        maxToolIterations: config.eval.maxToolIterations,
        concurrency: config.concurrency,
        lintFailOn: config.lint.failOn,
        lintSuppress: config.lint.suppress,
        lintPlugins: config.lint.plugins,
        triggerSeed: cli.seed ?? config.trigger.seed,
        output: cli.output,
        saveResults: cli.saveResults,
        apply: Boolean(cli.apply),
        verbose: Boolean(cli.verbose)
      };
      await handleImproveCommand(targetPath, improveOptions, command);
    });
}
|
|
5368
|
+
|
|
4124
5369
|
// src/index.ts
|
|
4125
5370
|
function resolveVersion() {
|
|
4126
5371
|
try {
|
|
4127
5372
|
const currentFilePath = fileURLToPath(import.meta.url);
|
|
4128
|
-
const packageJsonPath =
|
|
4129
|
-
const raw =
|
|
5373
|
+
const packageJsonPath = path8.resolve(path8.dirname(currentFilePath), "..", "package.json");
|
|
5374
|
+
const raw = fs13.readFileSync(packageJsonPath, "utf8");
|
|
4130
5375
|
const parsed = JSON.parse(raw);
|
|
4131
5376
|
return parsed.version ?? "0.0.0";
|
|
4132
5377
|
} catch {
|
|
@@ -4159,6 +5404,7 @@ async function run(argv) {
|
|
|
4159
5404
|
registerTriggerCommand(program);
|
|
4160
5405
|
registerEvalCommand(program);
|
|
4161
5406
|
registerCheckCommand(program);
|
|
5407
|
+
registerImproveCommand(program);
|
|
4162
5408
|
try {
|
|
4163
5409
|
await program.parseAsync(argv);
|
|
4164
5410
|
} catch (error) {
|