skilltest 0.8.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,8 +1,8 @@
1
1
  #!/usr/bin/env node
2
2
 
3
3
  // src/index.ts
4
- import fs12 from "node:fs";
5
- import path7 from "node:path";
4
+ import fs14 from "node:fs";
5
+ import path9 from "node:path";
6
6
  import { fileURLToPath } from "node:url";
7
7
  import { Command } from "commander";
8
8
 
@@ -100,7 +100,10 @@ function parseFrontmatter(rawSkill) {
100
100
  }
101
101
/**
 * Loads the skill file found at `inputPath` and forwards its raw text,
 * skill root and file path to `parseSkillDocumentStrict`.
 *
 * @param {string} inputPath - Path handed to `loadSkillFile`.
 * @returns {Promise<object>} Whatever `parseSkillDocumentStrict` returns.
 */
async function parseSkillStrict(inputPath) {
  const loaded = await loadSkillFile(inputPath);
  const { raw, skillRoot, skillFile } = loaded;
  return parseSkillDocumentStrict(raw, skillRoot, skillFile);
}
105
+ function parseSkillDocumentStrict(rawSkill, skillRoot, skillFile) {
106
+ const parsedFrontmatter = parseFrontmatter(rawSkill);
104
107
  if (!parsedFrontmatter.hasFrontmatter) {
105
108
  throw new Error("SKILL.md is missing YAML frontmatter.");
106
109
  }
@@ -113,9 +116,9 @@ async function parseSkillStrict(inputPath) {
113
116
  throw new Error(`Invalid frontmatter field '${issue.path.join(".")}': ${issue.message}`);
114
117
  }
115
118
  return {
116
- skillRoot: skillContext.skillRoot,
117
- skillFile: skillContext.skillFile,
118
- raw: skillContext.raw,
119
+ skillRoot,
120
+ skillFile,
121
+ raw: rawSkill,
119
122
  content: parsedFrontmatter.content,
120
123
  frontmatterRaw: parsedFrontmatter.rawFrontmatter,
121
124
  frontmatter: validation.data
@@ -1515,6 +1518,9 @@ function badgeLabel(status) {
1515
1518
  function renderBadge(status) {
1516
1519
  return `<span class="badge ${status}">${badgeLabel(status)}</span>`;
1517
1520
  }
1521
/**
 * Renders a neutral "meta" badge.
 *
 * @param {string} label - Badge text; passed through `escapeHtml` before insertion.
 * @returns {string} A `<span class="meta-badge">` element.
 */
function renderMetaBadge(label) {
  const safeLabel = escapeHtml(label);
  return `<span class="meta-badge">${safeLabel}</span>`;
}
1518
1524
  function renderStatCards(stats) {
1519
1525
  return `<div class="stats-grid">${stats.map(
1520
1526
  (stat) => `
@@ -1690,10 +1696,37 @@ function promptStatus(promptResult) {
1690
1696
  return "warn";
1691
1697
  }
1692
1698
/**
 * Renders one assertion as a collapsible `<details>` block.
 *
 * The summary shows a pass/fail badge, an extra "Tool" meta badge when the
 * assertion's `source` is "tool", and the escaped assertion text. The body
 * holds the evidence rendered through `renderPreBlock`.
 *
 * @param {{passed: boolean, source?: string, assertion: string, evidence: string}} assertion
 * @returns {string} HTML fragment.
 */
function renderAssertionRow(assertion) {
  const statusBadge = renderBadge(assertion.passed ? "pass" : "fail");
  let sourceBadge = "";
  if (assertion.source === "tool") {
    sourceBadge = renderMetaBadge("Tool");
  }
  const assertionText = escapeHtml(assertion.assertion);
  const evidenceBlock = renderPreBlock(assertion.evidence);
  // Leading and trailing empty entries reproduce the newline padding of the markup.
  return [
    "",
    `<details class="detail-block">`,
    `<summary>`,
    statusBadge,
    sourceBadge,
    `<span>${assertionText}</span>`,
    `</summary>`,
    `<div class="detail-content">${evidenceBlock}</div>`,
    `</details>`,
    ""
  ].join("\n");
}
1710
/**
 * Renders the collapsible "Tool Calls" section for one prompt result.
 *
 * @param {{toolCalls?: Array<{name: string, turnIndex: number, arguments: unknown, response: string}>}} promptResult
 * @returns {string} An empty string when there are no recorded tool calls,
 *   otherwise the section markup produced by `renderDetails`.
 */
function renderToolCallsSection(promptResult) {
  const calls = promptResult.toolCalls;
  if (!calls || calls.length === 0) {
    return "";
  }
  const fragments = [];
  for (const call of calls) {
    const title = escapeHtml(call.name);
    const subtitle = escapeHtml(`turn ${call.turnIndex}`);
    const badge = renderMetaBadge("Tool Call");
    const argumentList = renderDefinitionList([{ label: "Arguments", value: JSON.stringify(call.arguments) }]);
    const mockResponse = renderDetails("Mock response", renderPreBlock(call.response));
    // Leading and trailing empty entries reproduce the newline padding of each row.
    fragments.push(
      [
        "",
        `<div class="tool-call">`,
        `<div class="row-header">`,
        `<div>`,
        `<div class="row-title">${title}</div>`,
        `<div class="row-subtitle">${subtitle}</div>`,
        `</div>`,
        badge,
        `</div>`,
        argumentList,
        mockResponse,
        `</div>`,
        ""
      ].join("\n")
    );
  }
  return renderDetails("Tool Calls", `<div class="tool-call-list">${fragments.join("")}</div>`);
}
1698
1731
  function renderEvalPromptRow(promptResult) {
1699
1732
  const assertionDetails = promptResult.assertions.map((assertion) => renderAssertionRow(assertion)).join("");
@@ -1712,9 +1745,12 @@ function renderEvalPromptRow(promptResult) {
1712
1745
  <div class="row-body">${escapeHtml(promptResult.responseSummary)}</div>
1713
1746
  ${renderDefinitionList([
1714
1747
  { label: "Passed assertions", value: String(promptResult.passedAssertions) },
1715
- { label: "Total assertions", value: String(promptResult.totalAssertions) }
1748
+ { label: "Total assertions", value: String(promptResult.totalAssertions) },
1749
+ ...promptResult.toolCalls ? [{ label: "Tool calls", value: String(promptResult.toolCalls.length) }] : [],
1750
+ ...promptResult.loopIterations !== void 0 ? [{ label: "Loop iterations", value: String(promptResult.loopIterations) }] : []
1716
1751
  ])}
1717
1752
  ${renderDetails("Assertion evidence", assertionDetails || `<p>No assertions.</p>`)}
1753
+ ${renderToolCallsSection(promptResult)}
1718
1754
  ${responseDetails}
1719
1755
  </div>
1720
1756
  `;
@@ -1981,6 +2017,20 @@ function renderHtmlDocument(title, body) {
1981
2017
  background: rgba(107, 114, 128, 0.14);
1982
2018
  }
1983
2019
 
2020
+ .meta-badge {
2021
+ display: inline-flex;
2022
+ align-items: center;
2023
+ justify-content: center;
2024
+ padding: 3px 10px;
2025
+ border-radius: 999px;
2026
+ border: 1px solid rgba(17, 24, 39, 0.16);
2027
+ background: rgba(17, 24, 39, 0.06);
2028
+ color: var(--text);
2029
+ font-size: 0.76rem;
2030
+ font-weight: 700;
2031
+ white-space: nowrap;
2032
+ }
2033
+
1984
2034
  details {
1985
2035
  margin-top: 10px;
1986
2036
  }
@@ -1995,6 +2045,13 @@ function renderHtmlDocument(title, body) {
1995
2045
  padding-top: 10px;
1996
2046
  }
1997
2047
 
2048
+ .detail-block summary {
2049
+ display: flex;
2050
+ align-items: center;
2051
+ gap: 8px;
2052
+ flex-wrap: wrap;
2053
+ }
2054
+
1998
2055
  .detail-content p {
1999
2056
  margin: 0;
2000
2057
  }
@@ -2045,6 +2102,18 @@ function renderHtmlDocument(title, body) {
2045
2102
  overflow-wrap: anywhere;
2046
2103
  }
2047
2104
 
2105
+ .tool-call-list {
2106
+ display: grid;
2107
+ gap: 12px;
2108
+ }
2109
+
2110
+ .tool-call {
2111
+ border: 1px solid var(--border);
2112
+ border-radius: 12px;
2113
+ padding: 14px;
2114
+ background: #fffaf0;
2115
+ }
2116
+
2048
2117
  ul {
2049
2118
  margin: 0;
2050
2119
  padding-left: 20px;
@@ -2246,6 +2315,76 @@ function renderCheckHtml(result) {
2246
2315
  );
2247
2316
  return renderHtmlDocument(`skilltest check - ${skillName}`, [header, lintSection, triggerSection, evalSection, qualityGate].join(""));
2248
2317
  }
2318
/**
 * Renders the routing matrix as an HTML table: one row per target skill, one
 * column per skill plus a trailing "none" column. Cell values come from
 * `result.matrixPct[target][column]`, defaulting to 0 when absent.
 *
 * Diagonal cells get a green tint; off-diagonal cells above 0.15 get a red
 * tint and those above 0.05 a yellow tint.
 *
 * @param {{skills: string[], matrixPct: Object<string, Object<string, number>>}} result
 * @returns {string} Inline `<style>` plus the scrollable table markup.
 */
function renderRouteMatrix(result) {
  const columns = [...result.skills, "none"];

  const cellBackground = (share, onDiagonal) => {
    if (onDiagonal) {
      return "background:rgba(34,197,94,0.18);";
    }
    if (share > 0.15) {
      return "background:rgba(239,68,68,0.18);";
    }
    if (share > 0.05) {
      return "background:rgba(234,179,8,0.12);";
    }
    return "";
  };

  let headerCells = "";
  for (const column of columns) {
    headerCells += `<th>${escapeHtml(column)}</th>`;
  }

  let bodyRows = "";
  for (const target of result.skills) {
    let rowCells = "";
    for (const column of columns) {
      const share = result.matrixPct[target]?.[column] ?? 0;
      const style = cellBackground(share, column === target);
      rowCells += `<td style="${style}">${escapeHtml(formatPercent(share))}</td>`;
    }
    bodyRows += `<tr><th>${escapeHtml(target)}</th>${rowCells}</tr>`;
  }

  return `<style>.rt{border-collapse:collapse;font-size:.85rem;width:100%}.rt th,.rt td{border:1px solid #d4d4d8;padding:8px 12px;text-align:center}.rt thead th{background:#fafafa;font-weight:700}</style><div style="overflow-x:auto"><table class="rt"><thead><tr><th></th>${headerCells}</tr></thead><tbody>${bodyRows}</tbody></table></div>`;
}
2332
/**
 * Builds the full HTML document for a `route` run: header card, routing
 * matrix, per-skill metrics, an optional conflicts section, and suggestions.
 *
 * Overall accuracy and each skill's F1 are marked "pass" at 0.8 or above and
 * "warn" otherwise; the conflict count is "pass" only when it is zero.
 *
 * @param {object} result - Route result; reads `skills`, `conflicts`,
 *   `overallAccuracy`, `skillDir`, `numQueriesPerSkill`, `provider`, `model`,
 *   `seed`, `perSkillMetrics`, `matrixPct` (via `renderRouteMatrix`) and `suggestions`.
 * @returns {string} The document produced by `renderHtmlDocument`.
 */
function renderRouteHtml(result) {
  const passOrWarn = (ok) => (ok ? "pass" : "warn");
  const skillTotal = result.skills.length;
  const conflictTotal = result.conflicts.length;

  const summaryStats = [
    { label: "Overall accuracy", value: formatPercent(result.overallAccuracy), status: passOrWarn(result.overallAccuracy >= 0.8) },
    { label: "Conflicts", value: String(conflictTotal), status: passOrWarn(conflictTotal === 0) },
    { label: "Skills", value: String(skillTotal) },
    { label: "Queries/skill", value: String(result.numQueriesPerSkill) }
  ];
  const runDetails = [
    { label: "Provider", value: result.provider },
    { label: "Model", value: result.model },
    { label: "Seed", value: result.seed === undefined ? "none" : String(result.seed) }
  ];
  const header = renderHeaderCard(
    "route",
    `Routing Report \u2014 ${skillTotal} skills`,
    result.skillDir,
    summaryStats,
    runDetails
  );

  const matrixSection = renderSectionCard("Routing Matrix", renderRouteMatrix(result));

  const metricRows = [];
  for (const metric of result.perSkillMetrics) {
    const breakdown = renderDefinitionList([
      { label: "Queries", value: String(metric.queriesTotal) },
      { label: "Correct", value: String(metric.correct) },
      { label: "Precision", value: formatPercent(metric.precision) },
      { label: "Recall", value: formatPercent(metric.recall) }
    ]);
    metricRows.push(
      renderMessageRow(
        passOrWarn(metric.f1 >= 0.8),
        metric.skill,
        `F1: ${formatPercent(metric.f1)} precision: ${formatPercent(metric.precision)} recall: ${formatPercent(metric.recall)}`,
        breakdown
      )
    );
  }
  const metricsSection = renderSectionCard("Per-Skill Metrics", `<div class="row-list">${metricRows.join("")}</div>`);

  let conflictsSection = "";
  if (conflictTotal > 0) {
    // NOTE(review): skill names are escaped here before reaching renderMessageRow,
    // while the metric rows above pass `metric.skill` unescaped. Confirm which
    // form renderMessageRow expects; one of the two call sites is likely off.
    const conflictRows = [];
    for (const conflict of result.conflicts) {
      const nameA = escapeHtml(conflict.skillA);
      const nameB = escapeHtml(conflict.skillB);
      conflictRows.push(
        renderMessageRow(
          "warn",
          `${nameA} \u2194 ${nameB}`,
          `${formatPercent(conflict.bleedAtoB)} of ${nameA} queries routed to ${nameB}; ${formatPercent(conflict.bleedBtoA)} the other way`
        )
      );
    }
    conflictsSection = renderSectionCard("Conflicts", `<div class="row-list">${conflictRows.join("")}</div>`);
  }

  const suggestionItems = result.suggestions.map((suggestion) => `<li>${escapeHtml(suggestion)}</li>`);
  const suggestionsSection = renderSectionCard("Suggestions", `<ul>${suggestionItems.join("")}</ul>`);

  const body = header + matrixSection + metricsSection + conflictsSection + suggestionsSection;
  return renderHtmlDocument(`skilltest route \u2014 ${result.skillDir}`, body);
}
2249
2388
 
2250
2389
  // src/reporters/terminal.ts
2251
2390
  import { Chalk } from "chalk";
@@ -2270,6 +2409,70 @@ function countSkippedSecurityPatterns2(issues) {
2270
2409
  function formatPercent2(value) {
2271
2410
  return `${(value * 100).toFixed(1)}%`;
2272
2411
  }
2412
/**
 * Formats a number with a fixed number of decimals and an explicit "+" for
 * positive values.
 *
 * The sign is derived from the rounded value rather than the raw input, so a
 * delta that rounds to zero is printed as plain zero (e.g. "0.0000") instead
 * of a misleading "+0.0000" or "-0.0000".
 *
 * @param {number} value - Number to format.
 * @param {number} [digits=4] - Decimal places passed to `toFixed`.
 * @returns {string} Signed fixed-point text.
 */
function formatSignedNumber(value, digits = 4) {
  const fixed = value.toFixed(digits);
  const rounded = Number(fixed);
  if (rounded === 0) {
    // Covers both "0.0000" and "-0.0000"; re-format to drop any stray minus sign.
    return (0).toFixed(digits);
  }
  return rounded > 0 ? `+${fixed}` : fixed;
}
2416
/**
 * Computes a line-level diff between two texts and returns only the changed
 * lines, in order: `{ type: "-", line }` for lines present only in
 * `beforeText`, `{ type: "+", line }` for lines present only in `afterText`.
 *
 * Both texts are split on LF or CRLF. A longest-common-subsequence table over
 * the line suffixes decides which side to consume at each mismatch; on a tie
 * the "before" line is emitted first.
 *
 * @param {string} beforeText
 * @param {string} afterText
 * @returns {Array<{type: "-" | "+", line: string}>}
 */
function diffChangedLines(beforeText, afterText) {
  const oldLines = beforeText.split(/\r?\n/);
  const newLines = afterText.split(/\r?\n/);
  const oldCount = oldLines.length;
  const newCount = newLines.length;

  // lcs[i][j] = LCS length of oldLines[i..] and newLines[j..].
  // The extra last row/column stay 0 and act as the base case.
  const lcs = [];
  for (let row = 0; row <= oldCount; row += 1) {
    lcs.push(new Array(newCount + 1).fill(0));
  }
  for (let i = oldCount - 1; i >= 0; i -= 1) {
    const current = lcs[i];
    const below = lcs[i + 1];
    for (let j = newCount - 1; j >= 0; j -= 1) {
      current[j] = oldLines[i] === newLines[j] ? below[j + 1] + 1 : Math.max(below[j], current[j + 1]);
    }
  }

  const changes = [];
  const markRemoved = (line) => changes.push({ type: "-", line });
  const markAdded = (line) => changes.push({ type: "+", line });

  let i = 0;
  let j = 0;
  while (i < oldCount && j < newCount) {
    if (oldLines[i] === newLines[j]) {
      i += 1;
      j += 1;
    } else if (lcs[i + 1][j] >= lcs[i][j + 1]) {
      markRemoved(oldLines[i]);
      i += 1;
    } else {
      markAdded(newLines[j]);
      j += 1;
    }
  }
  // Whatever is left on either side has no counterpart.
  for (; i < oldCount; i += 1) {
    markRemoved(oldLines[i]);
  }
  for (; j < newCount; j += 1) {
    markAdded(newLines[j]);
  }
  return changes;
}
2458
/**
 * Produces a short textual preview of the changed lines between two texts.
 *
 * @param {string} beforeText
 * @param {string} afterText
 * @param {number} [maxLines=40] - Maximum number of changed lines to list.
 * @returns {string[]} A single "(no content changes)" line when nothing
 *   changed; otherwise up to `maxLines` entries prefixed with their "+"/"-"
 *   marker, followed by a count of the omitted lines when truncated.
 */
function renderDiffPreview(beforeText, afterText, maxLines = 40) {
  const changes = diffChangedLines(beforeText, afterText);
  if (changes.length === 0) {
    return [" (no content changes)"];
  }
  const preview = [];
  for (const { type, line } of changes.slice(0, maxLines)) {
    preview.push(` ${type} ${line}`);
  }
  const omitted = changes.length - maxLines;
  if (changes.length > maxLines) {
    preview.push(` ... ${omitted} more changed line(s)`);
  }
  return preview;
}
2469
/**
 * Summarizes tool calls as "name xN" pairs, in order of first appearance,
 * joined with ", ".
 *
 * @param {Iterable<{name: string}>} toolCalls
 * @returns {string} For example "search x2, fetch x1"; empty string for no calls.
 */
function summarizeToolCalls(toolCalls) {
  const tally = new Map();
  for (const { name } of toolCalls) {
    const seen = tally.get(name) ?? 0;
    tally.set(name, seen + 1);
  }
  const parts = [];
  for (const [name, count] of tally) {
    parts.push(`${name} x${count}`);
  }
  return parts.join(", ");
}
2273
2476
  function renderLintReport(report, enableColor) {
2274
2477
  const c = getChalkInstance(enableColor);
2275
2478
  const { passed, warnings, failures, total } = report.summary;
@@ -2330,12 +2533,25 @@ function renderEvalReport(result, enableColor, verbose) {
2330
2533
  for (const [index, promptResult] of result.results.entries()) {
2331
2534
  lines.push(`${index + 1}. prompt: ${promptResult.prompt}`);
2332
2535
  lines.push(` response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`);
2536
+ if (promptResult.toolCalls) {
2537
+ lines.push(` Tools: ${promptResult.toolCalls.length} calls (${summarizeToolCalls(promptResult.toolCalls)})`);
2538
+ if (promptResult.loopIterations !== void 0) {
2539
+ lines.push(` loop iterations: ${promptResult.loopIterations}`);
2540
+ }
2541
+ }
2333
2542
  for (const assertion of promptResult.assertions) {
2334
2543
  const status = assertion.passed ? c.green("PASS") : c.red("FAIL");
2335
2544
  lines.push(` ${status} ${assertion.assertion}`);
2336
2545
  lines.push(` evidence: ${assertion.evidence}`);
2337
2546
  }
2338
2547
  if (verbose) {
2548
+ if (promptResult.toolCalls) {
2549
+ for (const toolCall of promptResult.toolCalls) {
2550
+ lines.push(` tool ${toolCall.turnIndex}: ${toolCall.name}`);
2551
+ lines.push(` arguments: ${JSON.stringify(toolCall.arguments)}`);
2552
+ lines.push(` response: ${toolCall.response}`);
2553
+ }
2554
+ }
2339
2555
  lines.push(` full response: ${promptResult.response}`);
2340
2556
  }
2341
2557
  }
@@ -2412,6 +2628,12 @@ function renderCheckReport(result, enableColor, verbose) {
2412
2628
  }
2413
2629
  lines.push(` - prompt: ${promptResult.prompt}`);
2414
2630
  lines.push(` response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`);
2631
+ if (promptResult.toolCalls) {
2632
+ lines.push(` Tools: ${promptResult.toolCalls.length} calls (${summarizeToolCalls(promptResult.toolCalls)})`);
2633
+ if (promptResult.loopIterations !== void 0) {
2634
+ lines.push(` loop iterations: ${promptResult.loopIterations}`);
2635
+ }
2636
+ }
2415
2637
  const assertionsToRender = verbose ? promptResult.assertions : failedAssertions;
2416
2638
  for (const assertion of assertionsToRender) {
2417
2639
  const assertionStatus = assertion.passed ? c.green("PASS") : c.red("FAIL");
@@ -2419,6 +2641,13 @@ function renderCheckReport(result, enableColor, verbose) {
2419
2641
  lines.push(` evidence: ${assertion.evidence}`);
2420
2642
  }
2421
2643
  if (verbose) {
2644
+ if (promptResult.toolCalls) {
2645
+ for (const toolCall of promptResult.toolCalls) {
2646
+ lines.push(` tool ${toolCall.turnIndex}: ${toolCall.name}`);
2647
+ lines.push(` arguments: ${JSON.stringify(toolCall.arguments)}`);
2648
+ lines.push(` response: ${toolCall.response}`);
2649
+ }
2650
+ }
2422
2651
  lines.push(` full response: ${promptResult.response}`);
2423
2652
  }
2424
2653
  }
@@ -2433,6 +2662,137 @@ function renderCheckReport(result, enableColor, verbose) {
2433
2662
  lines.push(`- overall: ${overallGate}`);
2434
2663
  return lines.join("\n");
2435
2664
  }
2665
/**
 * Renders the terminal report for an `improve` run.
 *
 * Sections, in order: header lines, status, change summary and targeted
 * problems (when a candidate exists), before/after deltas (when both `delta`
 * and `verification` exist), output path, blocked reason, diff preview, and,
 * in verbose mode, the full baseline and verification check reports.
 *
 * @param {object} result - Improve result object.
 * @param {boolean} enableColor - Forwarded to `getChalkInstance` and `renderCheckReport`.
 * @param {boolean} [verbose=false] - Append the nested check reports.
 * @returns {string} Report lines joined with "\n".
 */
function renderImproveReport(result, enableColor, verbose = false) {
  const c = getChalkInstance(enableColor);
  const report = [];
  const startSection = (title) => {
    report.push("");
    report.push(title);
  };
  const gateLabel = (passed) => (passed ? c.green("PASS") : c.red("FAIL"));
  const countLine = (label, metric) =>
    `- ${label}: ${metric.before} -> ${metric.after} (${formatSignedNumber(metric.delta, 0)})`;
  const rateLine = (label, metric) =>
    `- ${label}: ${formatPercent2(metric.before)} -> ${formatPercent2(metric.after)} (${formatSignedNumber(metric.delta)})`;

  report.push("skilltest improve");
  report.push(`target: ${result.target}`);
  report.push(`provider/model: ${result.provider}/${result.model}`);
  report.push(
    `thresholds: min-f1=${result.thresholds.minF1.toFixed(2)} min-assert-pass-rate=${result.thresholds.minAssertPassRate.toFixed(2)}`
  );

  let statusLabel;
  if (result.blockedReason) {
    statusLabel = c.red("BLOCKED");
  } else if (result.applied) {
    statusLabel = c.green("APPLIED");
  } else {
    statusLabel = c.green("VERIFIED");
  }
  report.push(`status: ${statusLabel}`);

  if (result.candidate) {
    startSection("Change Summary");
    for (const change of result.candidate.changeSummary) {
      report.push(`- ${change}`);
    }
    startSection("Targeted Problems");
    for (const problem of result.candidate.targetedProblems) {
      report.push(`- ${problem}`);
    }
  }

  if (result.delta && result.verification) {
    const { delta } = result;
    startSection("Before / After");
    report.push(countLine("lint failures", delta.lintFailures));
    report.push(countLine("lint warnings", delta.lintWarnings));
    report.push(rateLine("trigger f1", delta.triggerF1));
    report.push(rateLine("eval assertion pass rate", delta.evalAssertPassRate));
    report.push(`- overall gate: ${gateLabel(delta.overallPassed.before)} -> ${gateLabel(delta.overallPassed.after)}`);
  }

  if (result.outputPath) {
    report.push("");
    report.push(`output: ${result.outputPath}`);
  }

  if (result.blockedReason) {
    startSection("Blocked");
    report.push(`- ${result.blockedReason}`);
  }

  if (result.candidate) {
    startSection("Diff Preview");
    for (const previewLine of renderDiffPreview(result.originalRaw, result.candidate.raw)) {
      report.push(previewLine);
    }
  }

  if (verbose) {
    startSection("Baseline");
    report.push(renderCheckReport(result.baseline, enableColor, true));
    if (result.verification) {
      startSection("Verification");
      report.push(renderCheckReport(result.verification, enableColor, true));
    }
  }

  return report.join("\n");
}
2732
/**
 * Renders the terminal report for a `route` run: header, per-skill metrics,
 * the routing matrix, detected conflicts, overall accuracy, suggestions and,
 * in verbose mode, every routed test case.
 *
 * A skill is labelled PASS when its F1 is at least 0.8 and WARN otherwise.
 * In the matrix, diagonal cells are green and off-diagonal cells above 0.1
 * are yellow.
 *
 * @param {object} result - Route result object.
 * @param {boolean} enableColor - Forwarded to `getChalkInstance`.
 * @param {boolean} verbose - Include the per-case listing.
 * @returns {string} Report lines joined with "\n".
 */
function renderRouteReport(result, enableColor, verbose) {
  const c = getChalkInstance(enableColor);
  const COLUMN_WIDTH = 10;
  const ROW_LABEL_WIDTH = 24;
  const out = [];

  out.push("skilltest route");
  out.push(`directory: ${result.skillDir}`);
  out.push(`provider/model: ${result.provider}/${result.model}`);
  out.push(`skills: ${result.skills.length} queries per skill: ${result.numQueriesPerSkill}`);

  out.push("", "Per-skill metrics:");
  result.perSkillMetrics.forEach((metric) => {
    const verdict = metric.f1 >= 0.8 ? c.green("PASS") : c.yellow("WARN");
    const f1Text = formatPercent2(metric.f1).padEnd(7);
    const precisionText = formatPercent2(metric.precision).padEnd(7);
    const recallText = formatPercent2(metric.recall);
    out.push(` ${metric.skill.padEnd(24)} F1: ${f1Text} precision: ${precisionText} recall: ${recallText} [${verdict}]`);
  });

  out.push("", "Routing matrix (% of row queries routed to column):");
  const columns = [...result.skills, "none"];
  // Column headers are truncated so that each keeps at least one trailing space.
  const headerCells = columns.map((name) => name.slice(0, COLUMN_WIDTH - 1).padEnd(COLUMN_WIDTH));
  out.push(" " + ("".padEnd(ROW_LABEL_WIDTH) + headerCells.join("")));
  result.skills.forEach((target) => {
    let row = (" " + target).padEnd(ROW_LABEL_WIDTH);
    for (const column of columns) {
      const share = result.matrixPct[target]?.[column] ?? 0;
      // Pad before colouring so escape codes do not affect the width.
      const cell = formatPercent2(share).padEnd(COLUMN_WIDTH);
      if (column === target) {
        row += c.green(cell);
      } else if (share > 0.1) {
        row += c.yellow(cell);
      } else {
        row += cell;
      }
    }
    out.push(row);
  });

  if (result.conflicts.length > 0) {
    out.push("", "Conflicts detected:");
    result.conflicts.forEach((conflict) => {
      const forward = formatPercent2(conflict.bleedAtoB);
      const backward = formatPercent2(conflict.bleedBtoA);
      out.push(` ${conflict.skillA} <-> ${conflict.skillB} ${forward} / ${backward} bleed [${c.yellow("WARN")}]`);
    });
  }

  out.push("", `Overall accuracy: ${formatPercent2(result.overallAccuracy)}`);

  out.push("", "Suggestions:");
  result.suggestions.forEach((suggestion) => {
    out.push(`- ${suggestion}`);
  });

  if (verbose) {
    out.push("", "Cases:");
    result.cases.forEach((testCase, position) => {
      const verdict = testCase.correct ? c.green("PASS") : c.red("FAIL");
      out.push(` ${position + 1}. ${verdict} [${testCase.targetSkill}] ${testCase.query}`);
      out.push(` routed to: ${testCase.actualSkill}`);
      if (testCase.rawModelResponse) {
        out.push(` model: ${testCase.rawModelResponse.replace(/\s+/g, " ").trim()}`);
      }
    });
  }

  return out.join("\n");
}
2436
2796
 
2437
2797
  // src/commands/common.ts
2438
2798
  import fs6 from "node:fs/promises";
@@ -2504,7 +2864,10 @@ function parseGraderOutput(raw) {
2504
2864
/**
 * Sends the grader prompts built from `options` to the provider, parses the
 * reply with `parseGraderOutput`, and tags every resulting assertion with
 * `source: "grader"`.
 *
 * @param {object} options - Reads `provider` and `model`; the whole object is
 *   also passed to `buildGraderPrompts`.
 * @returns {Promise<object[]>} Parsed assertions, each with `source: "grader"`.
 */
async function gradeResponse(options) {
  const { systemPrompt, userPrompt } = buildGraderPrompts(options);
  const reply = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
  const tagged = [];
  for (const graded of parseGraderOutput(reply)) {
    tagged.push({ ...graded, source: "grader" });
  }
  return tagged;
}
2509
2872
 
2510
2873
  // src/utils/concurrency.ts
@@ -2559,12 +2922,290 @@ async function pMap(items, fn, concurrency) {
2559
2922
  });
2560
2923
  }
2561
2924
 
2925
+ // src/core/tool-environment.ts
2926
/**
 * Returns true for any non-null, non-array value whose `typeof` is "object".
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isPlainObject(value) {
  return value !== null && typeof value === "object" && !Array.isArray(value);
}

/**
 * Structural equality for JSON-like values.
 *
 * Arrays must have the same length and pairwise-equal items. Objects must
 * have the same number of own enumerable keys, every key of `left` must also
 * be an own key of `right`, and the corresponding values must be equal.
 * Everything else is compared with `===`.
 *
 * The explicit own-key check matters when values can be `undefined`: without
 * it `{ a: undefined }` and `{ b: undefined }` would compare equal, because a
 * missing key also reads as `undefined`.
 *
 * @param {unknown} left
 * @param {unknown} right
 * @returns {boolean}
 */
function deepEqual(left, right) {
  if (Array.isArray(left) && Array.isArray(right)) {
    if (left.length !== right.length) {
      return false;
    }
    return left.every((item, index) => deepEqual(item, right[index]));
  }
  if (isPlainObject(left) && isPlainObject(right)) {
    const leftKeys = Object.keys(left);
    const rightKeys = Object.keys(right);
    if (leftKeys.length !== rightKeys.length) {
      return false;
    }
    return leftKeys.every(
      (key) => Object.prototype.hasOwnProperty.call(right, key) && deepEqual(left[key], right[key])
    );
  }
  return left === right;
}
2946
/**
 * Checks whether `actual` satisfies the `expected` argument pattern.
 *
 * - Expected arrays must match an actual array of the same length, item by item.
 * - Expected objects are treated as subsets: every expected key must match
 *   the same key on the actual object; extra actual keys are ignored.
 * - Any other expected value is compared with `deepEqual`.
 *
 * @param {unknown} actual
 * @param {unknown} expected
 * @returns {boolean}
 */
function matchesArgumentSubset(actual, expected) {
  if (Array.isArray(expected)) {
    const sameLengthArray = Array.isArray(actual) && actual.length === expected.length;
    if (!sameLengthArray) {
      return false;
    }
    for (let position = 0; position < expected.length; position += 1) {
      if (!matchesArgumentSubset(actual[position], expected[position])) {
        return false;
      }
    }
    return true;
  }
  if (!isPlainObject(expected)) {
    return deepEqual(actual, expected);
  }
  if (!isPlainObject(actual)) {
    return false;
  }
  for (const [key, expectedValue] of Object.entries(expected)) {
    if (!matchesArgumentSubset(actual[key], expectedValue)) {
      return false;
    }
  }
  return true;
}
2961
/**
 * Parses a mock-response key into an argument pattern.
 *
 * @param {string} pattern - Key from a tool's `responses` map.
 * @returns {object | null} The parsed value when the key is JSON that decodes
 *   to a non-array object; `null` for the "*" wildcard, for invalid JSON, and
 *   for JSON that decodes to anything else.
 */
function parseResponsePattern(pattern) {
  if (pattern === "*") {
    return null;
  }
  let decoded;
  try {
    decoded = JSON.parse(pattern);
  } catch {
    // Keys that are not JSON cannot act as partial-match patterns.
    return null;
  }
  if (!isPlainObject(decoded)) {
    return null;
  }
  return decoded;
}
2972
/**
 * Builds the placeholder text returned when a mock tool has no response
 * configured for the given arguments.
 *
 * @param {{name: string}} tool
 * @param {unknown} args - Serialized into the message with `JSON.stringify`.
 * @returns {string}
 */
function renderFallbackResponse(tool, args) {
  const serializedArgs = JSON.stringify(args);
  return `[mock] No mock response configured for tool '${tool.name}' with arguments: ${serializedArgs}`;
}
2975
/**
 * Picks the mock response for a tool call.
 *
 * Lookup order:
 *   1. a `responses` key equal to `JSON.stringify(args)`;
 *   2. the JSON-object key that `args` satisfies as a subset, preferring the
 *      pattern with the most top-level keys (first one wins on a tie);
 *   3. the "*" wildcard key;
 *   4. the generated fallback message.
 *
 * @param {{name: string, responses: Object<string, string>}} tool
 * @param {unknown} args - Arguments supplied by the model.
 * @returns {string}
 */
function resolveToolResponse(tool, args) {
  const { responses } = tool;

  const exact = responses[JSON.stringify(args)];
  if (exact !== undefined) {
    return exact;
  }

  let winner = null;
  for (const [key, candidate] of Object.entries(responses)) {
    // parseResponsePattern yields null for "*" and for non-object / invalid JSON keys.
    const pattern = key === "*" ? null : parseResponsePattern(key);
    if (pattern && matchesArgumentSubset(args, pattern)) {
      const specificity = Object.keys(pattern).length;
      if (winner === null || specificity > winner.specificity) {
        winner = { specificity, response: candidate };
      }
    }
  }
  if (winner !== null) {
    return winner.response;
  }

  const wildcard = responses["*"];
  return wildcard !== undefined ? wildcard : renderFallbackResponse(tool, args);
}
3007
/**
 * Converts mock tool definitions into the provider-facing shape: each tool's
 * parameter list becomes an object schema with `properties` keyed by
 * parameter name and a `required` array of the names flagged as required.
 *
 * @param {Array<{name: string, description: string, parameters?: Array<{name: string, type: string, description: string, required?: boolean}>}>} mockTools
 * @returns {Array<{name: string, description: string, parameters: {type: "object", properties: object, required: string[]}}>}
 */
function toProviderToolDefinitions(mockTools) {
  const definitions = [];
  for (const mockTool of mockTools) {
    const declared = mockTool.parameters ?? [];
    const propertyEntries = [];
    for (const parameter of declared) {
      propertyEntries.push([
        parameter.name,
        { type: parameter.type, description: parameter.description }
      ]);
    }
    const requiredNames = [];
    for (const parameter of declared) {
      if (parameter.required) {
        requiredNames.push(parameter.name);
      }
    }
    definitions.push({
      name: mockTool.name,
      description: mockTool.description,
      parameters: {
        type: "object",
        properties: Object.fromEntries(propertyEntries),
        required: requiredNames
      }
    });
  }
  return definitions;
}
3029
/**
 * Turns a provider response into zero or one assistant message for the
 * conversation history.
 *
 * The message content holds a text block (only when the text is not blank)
 * followed by one `tool_use` block per requested tool call.
 *
 * @param {{textContent: string, toolUseBlocks: Array<{id: string, name: string, arguments: unknown}>}} response
 * @returns {Array<{role: "assistant", content: object[]}>} Empty when the
 *   response carries neither text nor tool calls.
 */
function toAssistantConversationBlocks(response) {
  const hasText = response.textContent.trim().length > 0;
  const textBlocks = hasText ? [{ type: "text", text: response.textContent }] : [];
  const toolBlocks = response.toolUseBlocks.map((toolUse) => ({
    type: "tool_use",
    id: toolUse.id,
    name: toolUse.name,
    input: toolUse.arguments
  }));
  const content = [...textBlocks, ...toolBlocks];
  if (content.length === 0) {
    return [];
  }
  return [{ role: "assistant", content }];
}
3052
/**
 * Drives a tool-use conversation against mock tools.
 *
 * Each iteration calls `provider.sendWithTools` with the accumulated
 * messages. A reply without tool calls ends the run. Otherwise every
 * requested call is answered from the mock definitions (or with a
 * "[mock] No tool named ..." message for unknown tools), recorded, and fed
 * back as `tool_result` blocks in a user message.
 *
 * If the model is still requesting tools after `maxIterations` rounds, the
 * last non-blank text seen is returned with a termination note appended.
 *
 * @param {object} options - Reads `provider`, `model`, `systemPrompt`,
 *   `userMessage`, `tools` and optional `maxIterations` (default 10).
 * @returns {Promise<{finalResponse: string, toolCalls: object[], loopIterations: number}>}
 */
async function runWithTools(options) {
  const iterationLimit = options.maxIterations ?? 10;
  const registry = new Map();
  for (const tool of options.tools) {
    registry.set(tool.name, tool);
  }
  const toolDefinitions = toProviderToolDefinitions(options.tools);
  const conversation = [{ role: "user", content: options.userMessage }];
  const recordedCalls = [];
  let lastText = "";
  let iterationsRun = 0;

  while (iterationsRun < iterationLimit) {
    iterationsRun += 1;
    const reply = await options.provider.sendWithTools(options.systemPrompt, conversation, {
      model: options.model,
      tools: toolDefinitions
    });
    // Keep the most recent non-blank text as the candidate final answer.
    if (reply.textContent.trim().length > 0) {
      lastText = reply.textContent;
    }
    if (reply.toolUseBlocks.length === 0) {
      return { finalResponse: lastText, toolCalls: recordedCalls, loopIterations: iterationsRun };
    }

    for (const assistantMessage of toAssistantConversationBlocks(reply)) {
      conversation.push(assistantMessage);
    }

    const results = [];
    for (const request of reply.toolUseBlocks) {
      const registered = registry.get(request.name);
      let mockReply;
      if (registered) {
        mockReply = resolveToolResponse(registered, request.arguments);
      } else {
        mockReply = `[mock] No tool named '${request.name}' is registered.`;
      }
      recordedCalls.push({
        name: request.name,
        arguments: request.arguments,
        response: mockReply,
        turnIndex: iterationsRun
      });
      results.push({ type: "tool_result", tool_use_id: request.id, content: mockReply });
    }
    conversation.push({ role: "user", content: results });
  }

  // Iteration budget exhausted while the model was still asking for tools.
  const terminationNote = `[skilltest: tool loop terminated after ${iterationLimit} iterations]`;
  const finalResponse = lastText ? `${lastText}\n\n${terminationNote}` : terminationNote;
  return { finalResponse, toolCalls: recordedCalls, loopIterations: iterationsRun };
}
3108
+
2562
3109
  // src/core/eval-runner.ts
3110
// Schema for one parameter of a mock tool.
var toolParameterSchema = z3.object({
  name: z3.string().min(1),
  type: z3.enum(["string", "number", "boolean", "object", "array"]),
  description: z3.string().min(1),
  required: z3.boolean().optional()
});

// Schema for a mock tool: metadata, optional parameters, and canned responses
// keyed by string.
var mockToolDefinitionSchema = z3.object({
  name: z3.string().min(1),
  description: z3.string().min(1),
  parameters: z3.array(toolParameterSchema).optional(),
  responses: z3.record(z3.string())
});

// Schema for a structural tool assertion. Which optional fields are required
// depends on `type`, which is enforced in the refinement below.
var toolAssertionSchema = z3.object({
  type: z3.enum(["tool_called", "tool_not_called", "tool_call_order", "tool_argument_match"]),
  toolName: z3.string().min(1).optional(),
  toolNames: z3.array(z3.string().min(1)).optional(),
  expectedArgs: z3.record(z3.unknown()).optional(),
  description: z3.string().min(1)
}).superRefine((value, context) => {
  const report = (message) => {
    context.addIssue({ code: z3.ZodIssueCode.custom, message });
  };
  const needsToolName = ["tool_called", "tool_not_called", "tool_argument_match"].includes(value.type);
  if (needsToolName && !value.toolName) {
    report(`${value.type} requires toolName.`);
  }
  const orderIsMissing = !value.toolNames || value.toolNames.length === 0;
  if (value.type === "tool_call_order" && orderIsMissing) {
    report("tool_call_order requires toolNames.");
  }
  if (value.type === "tool_argument_match" && !value.expectedArgs) {
    report("tool_argument_match requires expectedArgs.");
  }
});

// Schema for one eval prompt, with optional grader assertions, mock tools and
// structural tool assertions.
var evalPromptSchema = z3.object({
  prompt: z3.string().min(1),
  assertions: z3.array(z3.string().min(1)).optional(),
  tools: z3.array(mockToolDefinitionSchema).optional(),
  toolAssertions: z3.array(toolAssertionSchema).optional()
});

// Schema for the full list of eval prompts.
var evalPromptArraySchema = z3.array(evalPromptSchema);
3155
/**
 * Formats an expected tool order as a bracketed, comma-separated list.
 *
 * @param {string[]} toolNames
 * @returns {string} For example "[search, fetch]".
 */
function formatExpectedOrder(toolNames) {
  const listed = toolNames.join(", ");
  return "[" + listed + "]";
}
3158
/**
 * Formats the observed call order, restricted to the tools named in
 * `toolNames`, as a bracketed, comma-separated list. Repeated calls are kept.
 *
 * @param {Array<{name: string}>} toolCalls - Calls in the order they happened.
 * @param {string[]} toolNames - Tool names of interest.
 * @returns {string} For example "[search, fetch, search]".
 */
function formatActualOrder(toolCalls, toolNames) {
  const wanted = new Set(toolNames);
  const observed = [];
  for (const { name } of toolCalls) {
    if (wanted.has(name)) {
      observed.push(name);
    }
  }
  return `[${observed.join(", ")}]`;
}
3163
/**
 * Evaluates structural tool assertions against the recorded tool calls.
 *
 * - "tool_called" passes when the named tool was called at least once.
 * - "tool_not_called" passes when it was never called.
 * - "tool_call_order" passes when `toolNames` appears as an ordered
 *   subsequence of the calls (other calls may be interleaved).
 * - Any other type is treated as "tool_argument_match": passes when some
 *   call to the named tool has arguments matching `expectedArgs` as a subset.
 *
 * @param {object[]} toolAssertions
 * @param {Array<{name: string, arguments: unknown}>} toolCalls
 * @returns {Array<{assertion: string, passed: boolean, evidence: string, source: "tool"}>}
 */
function evaluateToolAssertions(toolAssertions, toolCalls) {
  const outcome = (toolAssertion, passed, evidence) => ({
    assertion: toolAssertion.description,
    passed,
    evidence,
    source: "tool"
  });
  const countCalls = (toolName) => toolCalls.filter((call) => call.name === toolName).length;
  const calledText = (toolName, count) => `Tool '${toolName}' was called ${count} time${count === 1 ? "" : "s"}.`;
  const notCalledText = (toolName) => `Tool '${toolName}' was not called.`;

  return toolAssertions.map((toolAssertion) => {
    const { toolName } = toolAssertion;
    switch (toolAssertion.type) {
      case "tool_called": {
        const count = countCalls(toolName);
        const wasCalled = count > 0;
        return outcome(toolAssertion, wasCalled, wasCalled ? calledText(toolName, count) : notCalledText(toolName));
      }
      case "tool_not_called": {
        const count = countCalls(toolName);
        const neverCalled = count === 0;
        return outcome(toolAssertion, neverCalled, neverCalled ? notCalledText(toolName) : calledText(toolName, count));
      }
      case "tool_call_order": {
        const expectedOrder = toolAssertion.toolNames ?? [];
        // Walk the calls once, advancing through the expected names as they appear.
        let matched = 0;
        for (const call of toolCalls) {
          if (call.name === expectedOrder[matched]) {
            matched += 1;
          }
        }
        const inOrder = matched === expectedOrder.length;
        const evidence = inOrder
          ? `Observed tool call order ${formatExpectedOrder(expectedOrder)}.`
          : `Expected call order ${formatExpectedOrder(expectedOrder)} but got ${formatActualOrder(toolCalls, expectedOrder)}.`;
        return outcome(toolAssertion, inOrder, evidence);
      }
      default: {
        const expectedArgs = toolAssertion.expectedArgs ?? {};
        const hit = toolCalls.find(
          (call) => call.name === toolName && matchesArgumentSubset(call.arguments, expectedArgs)
        );
        const evidence = hit
          ? `Tool '${toolName}' was called with arguments matching ${JSON.stringify(expectedArgs)}.`
          : `No '${toolName}' call matched ${JSON.stringify(expectedArgs)}.`;
        return outcome(toolAssertion, Boolean(hit), evidence);
      }
    }
  });
}
2568
3209
  function extractJsonArray(raw) {
2569
3210
  const trimmed = raw.trim();
2570
3211
  if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
@@ -2591,6 +3232,7 @@ async function generatePrompts(skill, provider, model, count) {
2591
3232
  skill.content,
2592
3233
  "",
2593
3234
  `Generate ${count} prompts that stress the main capabilities and likely edge cases.`,
3235
+ // Tool-aware prompts require user-defined mock responses and are not auto-generated.
2594
3236
  "Each prompt should include 2-4 assertions."
2595
3237
  ].join("\n");
2596
3238
  const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
@@ -2614,7 +3256,24 @@ async function runEval(skill, options) {
2614
3256
  const results = await pMap(
2615
3257
  prompts,
2616
3258
  async (evalPrompt) => {
2617
- const response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
3259
+ let response;
3260
+ let toolCalls;
3261
+ let loopIterations;
3262
+ if (evalPrompt.tools && evalPrompt.tools.length > 0) {
3263
+ const toolRun = await runWithTools({
3264
+ provider: options.provider,
3265
+ model: options.model,
3266
+ systemPrompt,
3267
+ userMessage: evalPrompt.prompt,
3268
+ tools: evalPrompt.tools,
3269
+ maxIterations: options.maxToolIterations
3270
+ });
3271
+ response = toolRun.finalResponse;
3272
+ toolCalls = toolRun.toolCalls;
3273
+ loopIterations = toolRun.loopIterations;
3274
+ } else {
3275
+ response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
3276
+ }
2618
3277
  const gradedAssertions = await gradeResponse({
2619
3278
  provider: options.provider,
2620
3279
  model: options.graderModel,
@@ -2624,14 +3283,18 @@ async function runEval(skill, options) {
2624
3283
  modelResponse: response,
2625
3284
  assertions: evalPrompt.assertions
2626
3285
  });
2627
- const passedAssertions2 = gradedAssertions.filter((assertion) => assertion.passed).length;
3286
+ const structuralAssertions = evalPrompt.toolAssertions && evalPrompt.toolAssertions.length > 0 ? evaluateToolAssertions(evalPrompt.toolAssertions, toolCalls ?? []) : [];
3287
+ const assertions = [...gradedAssertions, ...structuralAssertions];
3288
+ const passedAssertions2 = assertions.filter((assertion) => assertion.passed).length;
2628
3289
  return {
2629
3290
  prompt: evalPrompt.prompt,
2630
- assertions: gradedAssertions,
3291
+ assertions,
2631
3292
  responseSummary: response.slice(0, 200),
2632
3293
  response,
2633
3294
  passedAssertions: passedAssertions2,
2634
- totalAssertions: gradedAssertions.length
3295
+ totalAssertions: assertions.length,
3296
+ ...toolCalls ? { toolCalls } : {},
3297
+ ...loopIterations !== void 0 ? { loopIterations } : {}
2635
3298
  };
2636
3299
  },
2637
3300
  options.concurrency ?? 5
@@ -2969,10 +3632,7 @@ function renderJson(value) {
2969
3632
 
2970
3633
  // src/commands/common.ts
2971
3634
  var executionContextByCommand = /* @__PURE__ */ new WeakMap();
2972
- var singleEvalPromptSchema = z5.object({
2973
- prompt: z5.string().min(1),
2974
- assertions: z5.array(z5.string().min(1)).optional()
2975
- });
3635
+ var singleEvalPromptSchema = evalPromptSchema;
2976
3636
  var promptStringArraySchema = z5.array(z5.string().min(1));
2977
3637
  var assertionsObjectSchema = z5.object({
2978
3638
  assertions: z5.array(z5.string().min(1))
@@ -3007,6 +3667,22 @@ function normalizeEvalPrompts(value, sourceLabel) {
3007
3667
  function parseAssertionsFromText(raw) {
3008
3668
  return raw.split(/\r?\n/).map((line) => line.trim().replace(/^[-*]\s+/, "").replace(/^\d+\.\s+/, "")).filter((line) => line.length > 0);
3009
3669
  }
3670
/**
 * Returns a copy of an eval prompt whose arrays and one-level-deep objects
 * are fresh instances, so callers can modify the copy's lists without
 * touching the template.
 *
 * Copies are shallow per element: values nested inside a parameter entry,
 * a `responses` map or an `expectedArgs` object are still shared.
 *
 * @param {object} prompt - Eval prompt with `prompt` and optional
 *   `assertions`, `tools` and `toolAssertions`.
 * @returns {object} New prompt object; absent optional fields are `undefined`.
 */
function cloneEvalPrompt(prompt) {
  const copyTool = (tool) => {
    const parameters = tool.parameters ? tool.parameters.map((entry) => ({ ...entry })) : void 0;
    return { ...tool, parameters, responses: { ...tool.responses } };
  };
  const copyToolAssertion = (toolAssertion) => {
    const toolNames = toolAssertion.toolNames ? [...toolAssertion.toolNames] : void 0;
    const expectedArgs = toolAssertion.expectedArgs ? { ...toolAssertion.expectedArgs } : void 0;
    return { ...toolAssertion, toolNames, expectedArgs };
  };

  let assertions = void 0;
  if (prompt.assertions) {
    assertions = [...prompt.assertions];
  }
  let tools = void 0;
  if (prompt.tools) {
    tools = prompt.tools.map(copyTool);
  }
  let toolAssertions = void 0;
  if (prompt.toolAssertions) {
    toolAssertions = prompt.toolAssertions.map(copyToolAssertion);
  }
  return { prompt: prompt.prompt, assertions, tools, toolAssertions };
}
3010
3686
  function normalizeAssertions(value, sourceLabel) {
3011
3687
  const assertionArray = z5.array(z5.string().min(1)).safeParse(value);
3012
3688
  if (assertionArray.success) {
@@ -3079,17 +3755,14 @@ async function loadConfiguredEvalPrompts(command) {
3079
3755
  const assertionsRaw = await fs6.readFile(assertionsFile, "utf8");
3080
3756
  const assertions = normalizeAssertions(parseJsonIfPossible(assertionsRaw), assertionsFile);
3081
3757
  prompts = prompts.map((prompt) => ({
3082
- prompt: prompt.prompt,
3758
+ ...cloneEvalPrompt(prompt),
3083
3759
  assertions: [...assertions]
3084
3760
  }));
3085
3761
  }
3086
3762
  const numRunsWasExplicit = context.configFile?.eval?.numRuns !== void 0;
3087
3763
  if (numRunsWasExplicit && prompts.length === 1 && context.config.eval.numRuns > 1) {
3088
3764
  const promptTemplate = prompts[0];
3089
- prompts = Array.from({ length: context.config.eval.numRuns }, () => ({
3090
- prompt: promptTemplate.prompt,
3091
- assertions: promptTemplate.assertions ? [...promptTemplate.assertions] : void 0
3092
- }));
3765
+ prompts = Array.from({ length: context.config.eval.numRuns }, () => cloneEvalPrompt(promptTemplate));
3093
3766
  }
3094
3767
  return prompts;
3095
3768
  }
@@ -3186,7 +3859,8 @@ var evalConfigSchema = z7.object({
3186
3859
  numRuns: z7.number().int().min(1).optional(),
3187
3860
  threshold: z7.number().min(0).max(1).optional(),
3188
3861
  promptFile: z7.string().min(1).optional(),
3189
- assertionsFile: z7.string().min(1).optional()
3862
+ assertionsFile: z7.string().min(1).optional(),
3863
+ maxToolIterations: z7.number().int().min(1).max(50).optional()
3190
3864
  }).strict().partial();
3191
3865
  var skilltestConfigSchema = z7.object({
3192
3866
  provider: providerNameSchema.optional(),
@@ -3217,7 +3891,8 @@ var resolvedSkilltestConfigSchema = z7.object({
3217
3891
  numRuns: z7.number().int().min(1),
3218
3892
  threshold: z7.number().min(0).max(1),
3219
3893
  promptFile: z7.string().min(1).optional(),
3220
- assertionsFile: z7.string().min(1).optional()
3894
+ assertionsFile: z7.string().min(1).optional(),
3895
+ maxToolIterations: z7.number().int().min(1).max(50)
3221
3896
  })
3222
3897
  });
3223
3898
  var DEFAULT_SKILLTEST_CONFIG = {
@@ -3237,7 +3912,8 @@ var DEFAULT_SKILLTEST_CONFIG = {
3237
3912
  },
3238
3913
  eval: {
3239
3914
  numRuns: 5,
3240
- threshold: 0.9
3915
+ threshold: 0.9,
3916
+ maxToolIterations: 10
3241
3917
  }
3242
3918
  };
3243
3919
  function formatIssuePath(issuePath) {
@@ -3367,7 +4043,8 @@ function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = proce
3367
4043
  assertionsFile: resolveConfigRelativePath(
3368
4044
  baseDirectory,
3369
4045
  cliFlags.eval?.assertionsFile ?? configFile.eval?.assertionsFile ?? DEFAULT_SKILLTEST_CONFIG.eval.assertionsFile
3370
- )
4046
+ ),
4047
+ maxToolIterations: cliFlags.eval?.maxToolIterations ?? configFile.eval?.maxToolIterations ?? DEFAULT_SKILLTEST_CONFIG.eval.maxToolIterations
3371
4048
  }
3372
4049
  };
3373
4050
  return resolvedSkilltestConfigSchema.parse(merged);
@@ -3391,34 +4068,34 @@ function extractCliConfigOverrides(command) {
3391
4068
  if (command.getOptionValueSource("model") === "cli") {
3392
4069
  overrides.model = getTypedOptionValue(command, "model");
3393
4070
  }
3394
- if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check") && command.getOptionValueSource("concurrency") === "cli") {
4071
+ if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check" || command.name() === "improve" || command.name() === "route") && command.getOptionValueSource("concurrency") === "cli") {
3395
4072
  overrides.concurrency = getTypedOptionValue(command, "concurrency");
3396
4073
  }
3397
- if ((command.name() === "trigger" || command.name() === "check") && command.getOptionValueSource("numQueries") === "cli") {
4074
+ if ((command.name() === "trigger" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("numQueries") === "cli") {
3398
4075
  overrides.trigger = {
3399
4076
  ...overrides.trigger,
3400
4077
  numQueries: getTypedOptionValue(command, "numQueries")
3401
4078
  };
3402
4079
  }
3403
- if ((command.name() === "trigger" || command.name() === "check") && command.getOptionValueSource("compare") === "cli") {
4080
+ if ((command.name() === "trigger" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("compare") === "cli") {
3404
4081
  overrides.trigger = {
3405
4082
  ...overrides.trigger,
3406
4083
  compare: getTypedOptionValue(command, "compare")
3407
4084
  };
3408
4085
  }
3409
- if ((command.name() === "lint" || command.name() === "check") && command.getOptionValueSource("plugin") === "cli") {
4086
+ if ((command.name() === "lint" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("plugin") === "cli") {
3410
4087
  overrides.lint = {
3411
4088
  ...overrides.lint,
3412
4089
  plugins: getTypedOptionValue(command, "plugin")
3413
4090
  };
3414
4091
  }
3415
- if (command.name() === "check" && command.getOptionValueSource("minF1") === "cli") {
4092
+ if ((command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("minF1") === "cli") {
3416
4093
  overrides.trigger = {
3417
4094
  ...overrides.trigger,
3418
4095
  threshold: getTypedOptionValue(command, "minF1")
3419
4096
  };
3420
4097
  }
3421
- if (command.name() === "check" && command.getOptionValueSource("minAssertPassRate") === "cli") {
4098
+ if ((command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("minAssertPassRate") === "cli") {
3422
4099
  overrides.eval = {
3423
4100
  ...overrides.eval,
3424
4101
  threshold: getTypedOptionValue(command, "minAssertPassRate")
@@ -3483,6 +4160,12 @@ function resolveApiKey(provider, override) {
3483
4160
 
3484
4161
  // src/providers/anthropic.ts
3485
4162
  import Anthropic from "@anthropic-ai/sdk";
4163
/**
 * Predicate for content blocks whose `type` is `"text"`.
 *
 * @param {{ type: string }} block - Content block to inspect.
 * @returns {boolean} True when the block is a text block.
 */
function isAnthropicTextBlock(block) {
  const { type } = block;
  return type === "text";
}
4166
/**
 * Predicate for content blocks whose `type` is `"tool_use"`.
 *
 * @param {{ type: string }} block - Content block to inspect.
 * @returns {boolean} True when the block is a tool-use block.
 */
function isAnthropicToolUseBlock(block) {
  const { type } = block;
  return type === "tool_use";
}
3486
4169
  function wait(ms) {
3487
4170
  return new Promise((resolve) => {
3488
4171
  setTimeout(resolve, ms);
@@ -3508,27 +4191,11 @@ var AnthropicProvider = class {
3508
4191
  constructor(apiKey) {
3509
4192
  this.client = new Anthropic({ apiKey });
3510
4193
  }
3511
- async sendMessage(systemPrompt, userMessage, options) {
4194
+ async createMessage(request) {
3512
4195
  let lastError;
3513
4196
  for (let attempt = 0; attempt < 3; attempt += 1) {
3514
4197
  try {
3515
- const response = await this.client.messages.create({
3516
- model: options.model,
3517
- max_tokens: 2048,
3518
- system: systemPrompt,
3519
- messages: [
3520
- {
3521
- role: "user",
3522
- content: userMessage
3523
- }
3524
- ]
3525
- });
3526
- const textBlocks = response.content.filter((block) => block.type === "text");
3527
- const text = textBlocks.map((block) => block.text).join("\n").trim();
3528
- if (text.length === 0) {
3529
- throw new Error("Model returned an empty response.");
3530
- }
3531
- return text;
4198
+ return await this.client.messages.create(request);
3532
4199
  } catch (error) {
3533
4200
  lastError = error;
3534
4201
  if (!isRateLimitError(error) || attempt === 2) {
@@ -3543,6 +4210,55 @@ var AnthropicProvider = class {
3543
4210
  }
3544
4211
  throw new Error("Anthropic API call failed with an unknown error.");
3545
4212
  }
4213
+ toAnthropicMessages(messages) {
4214
+ return messages.map((message) => ({
4215
+ role: message.role,
4216
+ content: message.content
4217
+ }));
4218
+ }
4219
+ async sendMessage(systemPrompt, userMessage, options) {
4220
+ const response = await this.createMessage({
4221
+ model: options.model,
4222
+ max_tokens: 2048,
4223
+ system: systemPrompt,
4224
+ messages: [
4225
+ {
4226
+ role: "user",
4227
+ content: userMessage
4228
+ }
4229
+ ]
4230
+ });
4231
+ const textBlocks = response.content.filter(isAnthropicTextBlock);
4232
+ const text = textBlocks.map((block) => block.text).join("\n").trim();
4233
+ if (text.length === 0) {
4234
+ throw new Error("Model returned an empty response.");
4235
+ }
4236
+ return text;
4237
+ }
4238
+ async sendWithTools(systemPrompt, messages, options) {
4239
+ const response = await this.createMessage({
4240
+ model: options.model,
4241
+ max_tokens: 2048,
4242
+ system: systemPrompt,
4243
+ messages: this.toAnthropicMessages(messages),
4244
+ tools: options.tools.map((tool) => ({
4245
+ name: tool.name,
4246
+ description: tool.description,
4247
+ input_schema: tool.parameters ?? { type: "object", properties: {} }
4248
+ }))
4249
+ });
4250
+ const textContent = response.content.filter(isAnthropicTextBlock).map((block) => block.text).join("\n").trim();
4251
+ const toolUseBlocks = response.content.filter(isAnthropicToolUseBlock).map((block) => ({
4252
+ id: block.id,
4253
+ name: block.name,
4254
+ arguments: block.input
4255
+ }));
4256
+ return {
4257
+ textContent,
4258
+ toolUseBlocks,
4259
+ stopReason: response.stop_reason ?? "end_turn"
4260
+ };
4261
+ }
3546
4262
  };
3547
4263
 
3548
4264
  // src/providers/openai.ts
@@ -3579,17 +4295,82 @@ function extractTextContent(content) {
3579
4295
  const text = content.map((item) => item.type === "text" || !item.type ? item.text ?? "" : "").join("\n").trim();
3580
4296
  return text;
3581
4297
  }
3582
- var OpenAIProvider = class {
3583
- name = "openai";
3584
- apiKey;
3585
- client;
3586
- constructor(apiKey) {
3587
- this.apiKey = apiKey;
3588
- this.client = null;
4298
+ function parseToolArguments(raw, toolName) {
4299
+ if (!raw || raw.trim() === "") {
4300
+ return {};
3589
4301
  }
3590
- async ensureClient() {
3591
- if (this.client) {
3592
- return this.client;
4302
+ try {
4303
+ const parsed = JSON.parse(raw);
4304
+ if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
4305
+ throw new Error("Tool arguments must be a JSON object.");
4306
+ }
4307
+ return parsed;
4308
+ } catch (error) {
4309
+ const message = error instanceof Error ? error.message : String(error);
4310
+ throw new Error(`OpenAI tool call arguments for '${toolName}' were not valid JSON: ${message}`);
4311
+ }
4312
+ }
4313
/**
 * Concatenates the text of all `"text"` blocks, one per line, and trims the
 * result. A text block without a `text` value contributes an empty string.
 *
 * @param {Array<{ type: string, text?: unknown }>} blocks
 * @returns {string} Joined, trimmed text (possibly empty).
 */
function getBlockText(blocks) {
  const pieces = [];
  for (const block of blocks.filter((candidate) => candidate.type === "text")) {
    pieces.push(String(block.text ?? ""));
  }
  return pieces.join("\n").trim();
}
4316
/**
 * Converts an assistant turn expressed as content blocks into a single
 * chat-completions style assistant message.
 *
 * Text blocks become `content` (or `null` when there is no text); each
 * `"tool_use"` block becomes a function entry in `tool_calls`, with its
 * `input` serialized as a JSON string. `tool_calls` is omitted when empty.
 *
 * @param {Array<object>} blocks - Assistant content blocks.
 * @returns {{ role: "assistant", content: string | null, tool_calls?: Array<object> }}
 */
function mapAssistantBlocksToMessage(blocks) {
  const text = getBlockText(blocks);
  const message = {
    role: "assistant",
    content: text.length > 0 ? text : null
  };

  const functionCalls = [];
  for (const block of blocks.filter((candidate) => candidate.type === "tool_use")) {
    functionCalls.push({
      id: String(block.id ?? ""),
      type: "function",
      function: {
        name: String(block.name ?? ""),
        arguments: JSON.stringify(block.input ?? {})
      }
    });
  }
  if (functionCalls.length > 0) {
    message.tool_calls = functionCalls;
  }
  return message;
}
4332
/**
 * Converts a user turn expressed as content blocks into chat-completions
 * style messages.
 *
 * If the turn contains any `"tool_result"` blocks, one `role: "tool"`
 * message is produced per result and any text blocks in the same turn are
 * not emitted. Otherwise a single user message with the joined text is
 * returned.
 *
 * @param {Array<object>} blocks - User content blocks.
 * @returns {Array<object>} One or more messages.
 */
function mapUserBlocksToMessages(blocks) {
  const results = blocks.filter((candidate) => candidate.type === "tool_result");
  if (results.length === 0) {
    return [{ role: "user", content: getBlockText(blocks) }];
  }
  return results.map((result) => {
    const toolCallId = String(result.tool_use_id ?? "");
    // NOTE(review): non-string content is coerced with String(); confirm callers only pass strings.
    const content = String(result.content ?? "");
    return { role: "tool", tool_call_id: toolCallId, content };
  });
}
4349
/**
 * Converts one conversation entry into chat-completions style messages.
 *
 * String content is passed through unchanged under the entry's role.
 * Block content is delegated to the assistant or user block mapper
 * depending on the role.
 *
 * @param {{ role: string, content: string | Array<object> }} block
 * @returns {Array<object>} One or more messages.
 */
function mapConversationBlockToMessages(block) {
  const { role, content } = block;
  if (typeof content === "string") {
    return [{ role, content }];
  }
  return role === "assistant" ? [mapAssistantBlocksToMessage(content)] : mapUserBlocksToMessages(content);
}
4363
+ var OpenAIProvider = class {
4364
+ name = "openai";
4365
+ apiKey;
4366
+ client;
4367
+ constructor(apiKey) {
4368
+ this.apiKey = apiKey;
4369
+ this.client = null;
4370
+ }
4371
+ async ensureClient() {
4372
+ if (this.client) {
4373
+ return this.client;
3593
4374
  }
3594
4375
  let openAiModule;
3595
4376
  try {
@@ -3607,30 +4388,12 @@ var OpenAIProvider = class {
3607
4388
  this.client = new OpenAIConstructor({ apiKey: this.apiKey });
3608
4389
  return this.client;
3609
4390
  }
3610
- async sendMessage(systemPrompt, userMessage, options) {
4391
+ async createCompletion(input) {
3611
4392
  const client = await this.ensureClient();
3612
4393
  let lastError;
3613
4394
  for (let attempt = 0; attempt < 3; attempt += 1) {
3614
4395
  try {
3615
- const response = await client.chat.completions.create({
3616
- model: options.model,
3617
- max_tokens: 2048,
3618
- messages: [
3619
- {
3620
- role: "system",
3621
- content: systemPrompt
3622
- },
3623
- {
3624
- role: "user",
3625
- content: userMessage
3626
- }
3627
- ]
3628
- });
3629
- const text = (response.choices ?? []).map((choice) => extractTextContent(choice.message?.content)).join("\n").trim();
3630
- if (text.length === 0) {
3631
- throw new Error("Model returned an empty response.");
3632
- }
3633
- return text;
4396
+ return await client.chat.completions.create(input);
3634
4397
  } catch (error) {
3635
4398
  lastError = error;
3636
4399
  if (!isRetriableError(error) || attempt === 2) {
@@ -3645,6 +4408,57 @@ var OpenAIProvider = class {
3645
4408
  }
3646
4409
  throw new Error("OpenAI API call failed with an unknown error.");
3647
4410
  }
4411
+ toOpenAiMessages(systemPrompt, messages) {
4412
+ return [
4413
+ {
4414
+ role: "system",
4415
+ content: systemPrompt
4416
+ },
4417
+ ...messages.flatMap((message) => mapConversationBlockToMessages(message))
4418
+ ];
4419
+ }
4420
+ async sendMessage(systemPrompt, userMessage, options) {
4421
+ const response = await this.createCompletion({
4422
+ model: options.model,
4423
+ max_tokens: 2048,
4424
+ messages: this.toOpenAiMessages(systemPrompt, [{ role: "user", content: userMessage }])
4425
+ });
4426
+ const text = (response.choices ?? []).map((choice) => extractTextContent(choice.message?.content)).join("\n").trim();
4427
+ if (text.length === 0) {
4428
+ throw new Error("Model returned an empty response.");
4429
+ }
4430
+ return text;
4431
+ }
4432
+ async sendWithTools(systemPrompt, messages, options) {
4433
+ const response = await this.createCompletion({
4434
+ model: options.model,
4435
+ max_tokens: 2048,
4436
+ messages: this.toOpenAiMessages(systemPrompt, messages),
4437
+ tools: options.tools.map((tool) => ({
4438
+ type: "function",
4439
+ function: {
4440
+ name: tool.name,
4441
+ description: tool.description,
4442
+ parameters: tool.parameters
4443
+ }
4444
+ }))
4445
+ });
4446
+ const choice = response.choices?.[0];
4447
+ const message = choice?.message;
4448
+ const toolUseBlocks = (message?.tool_calls ?? []).map((toolCall, index) => {
4449
+ const toolName = toolCall.function?.name ?? `tool-${index + 1}`;
4450
+ return {
4451
+ id: toolCall.id ?? `${toolName}-${index + 1}`,
4452
+ name: toolName,
4453
+ arguments: parseToolArguments(toolCall.function?.arguments, toolName)
4454
+ };
4455
+ });
4456
+ return {
4457
+ textContent: extractTextContent(message?.content),
4458
+ toolUseBlocks,
4459
+ stopReason: choice?.finish_reason === "stop" ? "end_turn" : choice?.finish_reason === "tool_calls" ? "tool_use" : choice?.finish_reason ?? "end_turn"
4460
+ };
4461
+ }
3648
4462
  };
3649
4463
 
3650
4464
  // src/providers/index.ts
@@ -3815,7 +4629,8 @@ async function handleEvalCommand(targetPath, options, command) {
3815
4629
  graderModel,
3816
4630
  numRuns: options.numRuns,
3817
4631
  concurrency: options.concurrency,
3818
- prompts
4632
+ prompts,
4633
+ maxToolIterations: options.maxToolIterations
3819
4634
  });
3820
4635
  if (options.saveResults) {
3821
4636
  await writeJsonFile(options.saveResults, result);
@@ -3862,7 +4677,8 @@ function registerEvalCommand(program) {
3862
4677
  verbose: Boolean(parsedCli.data.verbose),
3863
4678
  apiKey: parsedCli.data.apiKey,
3864
4679
  numRuns: config.eval.numRuns,
3865
- concurrency: config.concurrency
4680
+ concurrency: config.concurrency,
4681
+ maxToolIterations: config.eval.maxToolIterations
3866
4682
  },
3867
4683
  command
3868
4684
  );
@@ -3919,7 +4735,8 @@ async function runCheck(inputPath, options) {
3919
4735
  graderModel: options.graderModel,
3920
4736
  numRuns: options.evalNumRuns,
3921
4737
  prompts: options.prompts,
3922
- concurrency: options.concurrency
4738
+ concurrency: options.concurrency,
4739
+ maxToolIterations: options.evalMaxToolIterations
3923
4740
  };
3924
4741
  if ((options.concurrency ?? 5) === 1) {
3925
4742
  options.onStage?.("trigger");
@@ -4041,6 +4858,7 @@ async function handleCheckCommand(targetPath, options, command) {
4041
4858
  triggerSeed: options.triggerSeed,
4042
4859
  prompts,
4043
4860
  evalNumRuns: options.numRuns,
4861
+ evalMaxToolIterations: options.maxToolIterations,
4044
4862
  concurrency: options.concurrency,
4045
4863
  minF1: options.minF1,
4046
4864
  minAssertPassRate: options.minAssertPassRate,
@@ -4106,6 +4924,7 @@ function registerCheckCommand(program) {
4106
4924
  minF1: config.trigger.threshold,
4107
4925
  minAssertPassRate: config.eval.threshold,
4108
4926
  numRuns: config.eval.numRuns,
4927
+ maxToolIterations: config.eval.maxToolIterations,
4109
4928
  concurrency: config.concurrency,
4110
4929
  html: parsedCli.data.html,
4111
4930
  lintFailOn: config.lint.failOn,
@@ -4121,12 +4940,868 @@ function registerCheckCommand(program) {
4121
4940
  });
4122
4941
  }
4123
4942
 
4943
+ // src/commands/improve.ts
4944
+ import ora4 from "ora";
4945
+ import { z as z12 } from "zod";
4946
+
4947
+ // src/core/improver.ts
4948
+ import fs12 from "node:fs/promises";
4949
+ import os from "node:os";
4950
+ import path7 from "node:path";
4951
+ import yaml2 from "js-yaml";
4952
+ import { z as z11 } from "zod";
4953
+ var improveRewriteSchema = z11.object({
4954
+ frontmatter: z11.record(z11.unknown()),
4955
+ content: z11.string().min(1),
4956
+ changeSummary: z11.array(z11.string().min(1)).min(1),
4957
+ targetedProblems: z11.array(z11.string().min(1)).min(1)
4958
+ });
4959
/**
 * Computes the fraction of eval assertions that passed.
 *
 * @param {{ summary: { passedAssertions: number, totalAssertions: number } } | null | undefined} result
 * @returns {number} `passedAssertions / totalAssertions`, or 0 when there is
 *   no result or no assertions were evaluated.
 */
function calculateEvalAssertPassRate2(result) {
  if (!result) {
    return 0;
  }
  const { passedAssertions, totalAssertions } = result.summary;
  return totalAssertions === 0 ? 0 : passedAssertions / totalAssertions;
}
4965
/**
 * Parses a JSON object out of raw model output.
 *
 * If the trimmed text is wrapped in braces it is parsed as-is; otherwise
 * the span from the first `{` to the last `}` is parsed, which tolerates
 * surrounding prose.
 *
 * @param {string} raw - Raw model output.
 * @returns {unknown} The parsed JSON value.
 * @throws {Error} When no brace-delimited span exists.
 * @throws {SyntaxError} When the selected span is not valid JSON.
 */
function extractJsonObject2(raw) {
  const text = raw.trim();
  const isBareObject = text.startsWith("{") && text.endsWith("}");
  if (isBareObject) {
    return JSON.parse(text);
  }
  const firstBrace = text.indexOf("{");
  const lastBrace = text.lastIndexOf("}");
  if (firstBrace < 0 || lastBrace <= firstBrace) {
    throw new Error("Improver did not return a JSON object.");
  }
  return JSON.parse(text.slice(firstBrace, lastBrace + 1));
}
4977
/**
 * Returns a copy of the frontmatter with `name`, `description` and
 * `license` (when present) placed first, in that order, followed by the
 * remaining keys in their original order.
 *
 * @param {Record<string, unknown>} frontmatter
 * @returns {Record<string, unknown>} New object with reordered keys.
 */
function orderFrontmatter(frontmatter) {
  const owns = (object, key) => Object.prototype.hasOwnProperty.call(object, key);
  const leadingKeys = ["name", "description", "license"].filter((key) => owns(frontmatter, key));
  const trailingKeys = Object.keys(frontmatter).filter((key) => !leadingKeys.includes(key));

  const ordered = {};
  for (const key of [...leadingKeys, ...trailingKeys]) {
    ordered[key] = frontmatter[key];
  }
  return ordered;
}
4991
/**
 * Chooses the line ending to use when rewriting a document.
 *
 * @param {string} raw - Existing document text.
 * @returns {"\r\n" | "\n"} CRLF if the text contains any CRLF, otherwise LF.
 */
function detectLineEnding(raw) {
  const crlf = "\r\n";
  if (raw.includes(crlf)) {
    return crlf;
  }
  return "\n";
}
4994
/**
 * Assembles a complete SKILL.md document from frontmatter and a markdown body.
 *
 * The frontmatter is key-ordered with `orderFrontmatter`, serialized as YAML
 * between `---` fences, followed by a blank line, the trimmed body and a
 * final line ending.
 *
 * Line endings are normalized to `lineEnding`. Both LF and CRLF in the
 * input are matched, so a body that already contains CRLF is not turned
 * into `\r\r\n` when the target ending is CRLF.
 *
 * @param {Record<string, unknown>} frontmatter - Frontmatter fields.
 * @param {string} content - Markdown body without frontmatter fences.
 * @param {string} lineEnding - Line ending to use throughout the document.
 * @returns {string} The full document text.
 * @throws {Error} When the body is empty after trimming.
 */
function buildSkillMarkdown(frontmatter, content, lineEnding) {
  const normalizedBody = content.trim();
  if (normalizedBody.length === 0) {
    throw new Error("Candidate rewrite produced an empty SKILL.md body.");
  }
  // Match an optional CR so existing CRLF sequences are replaced as a unit.
  const anyLineBreak = /\r?\n/g;
  const frontmatterBlock = yaml2.dump(orderFrontmatter(frontmatter), {
    lineWidth: 0,
    noRefs: true,
    sortKeys: false
  }).replace(anyLineBreak, lineEnding);
  const body = normalizedBody.replace(anyLineBreak, lineEnding);
  return `---${lineEnding}${frontmatterBlock}---${lineEnding}${lineEnding}${body}${lineEnding}`;
}
5006
/**
 * Verifies that every relative file reference found in a candidate document
 * stays inside the skill root and points at an existing path.
 *
 * A reference escapes the root only when its path relative to the root is
 * `..`, begins with a `..` path segment, or is absolute. Names that merely
 * start with two dots (for example `..notes/x.md`) are inside the root and
 * are therefore checked for existence rather than rejected.
 *
 * @param {string} raw - Candidate document text to scan for references.
 * @param {string} skillRoot - Directory the references are resolved against.
 * @returns {Promise<void>}
 * @throws {Error} When a reference escapes the root or does not exist.
 */
async function validateRelativeReferences(raw, skillRoot) {
  const parentPrefix = `..${path7.sep}`;
  for (const reference of extractRelativeFileReferences(raw)) {
    const resolved = path7.resolve(skillRoot, reference);
    const relativeToRoot = path7.relative(skillRoot, resolved);
    // Compare whole path segments; an empty relative path is the root itself.
    const escapesRoot = relativeToRoot === ".." || relativeToRoot.startsWith(parentPrefix) || path7.isAbsolute(relativeToRoot);
    if (escapesRoot) {
      throw new Error(`Candidate rewrite introduced an out-of-root reference: ${reference}`);
    }
    if (!await pathExists(resolved)) {
      throw new Error(`Candidate rewrite introduced a broken relative reference: ${reference}`);
    }
  }
}
5019
/**
 * Turns a proposed rewrite into a validated candidate SKILL.md.
 *
 * The rewrite's frontmatter is merged over the existing frontmatter, but
 * the original `name` (and `license`, when the skill has one) always win.
 * The assembled document is then parsed strictly and its relative file
 * references are validated against the skill root.
 *
 * @param {object} skill - Parsed skill (`frontmatter`, `raw`, `skillRoot`, `skillFile`).
 * @param {object} rewrite - Proposed rewrite (`frontmatter`, `content`,
 *   `changeSummary`, `targetedProblems`).
 * @returns {Promise<object>} Candidate with merged frontmatter, trimmed
 *   content, full raw text and the rewrite's summaries.
 * @throws {Error} When the rewrite renames the skill, changes an existing
 *   license, or fails parsing or reference validation.
 */
async function buildCandidate(skill, rewrite) {
  const current = skill.frontmatter;
  const proposed = rewrite.frontmatter;

  const renamesSkill = typeof proposed.name === "string" && proposed.name !== current.name;
  if (renamesSkill) {
    throw new Error(`Candidate rewrite attempted to rename skill '${current.name}' to '${proposed.name}'.`);
  }
  const changesLicense = current.license && typeof proposed.license === "string" && proposed.license !== current.license;
  if (changesLicense) {
    throw new Error(
      `Candidate rewrite attempted to change license '${current.license}' to '${proposed.license}'.`
    );
  }

  // Pinned fields are applied last so the rewrite cannot override them.
  const pinned = { name: current.name };
  if (current.license) {
    pinned.license = current.license;
  }
  const mergedFrontmatter = { ...current, ...proposed, ...pinned };

  const raw = buildSkillMarkdown(mergedFrontmatter, rewrite.content, detectLineEnding(skill.raw));
  parseSkillDocumentStrict(raw, skill.skillRoot, skill.skillFile);
  await validateRelativeReferences(raw, skill.skillRoot);

  return {
    frontmatter: mergedFrontmatter,
    content: rewrite.content.trim(),
    raw,
    changeSummary: rewrite.changeSummary,
    targetedProblems: rewrite.targetedProblems
  };
}
5045
/**
 * Condenses a check result into the problems a rewrite should address.
 *
 * - Lint issues whose status is not `"pass"`; any status other than
 *   `"warn"` is reported as `"fail"`.
 * - Trigger cases that did not match.
 * - Eval assertions that did not pass; a source other than `"grader"` or
 *   `"tool"` is reported as `"unknown"`.
 * - Trigger suggestions, passed through.
 *
 * Trigger and eval sections are optional and yield empty lists when absent.
 *
 * @param {object} result - Check result with `lint` and optional `trigger` / `eval`.
 * @returns {{ lintIssues: object[], triggerFailures: object[], evalFailures: object[], triggerSuggestions: unknown[] }}
 */
function extractActionableIssues(result) {
  const lintIssues = [];
  for (const issue of result.lint.issues) {
    if (issue.status === "pass") {
      continue;
    }
    lintIssues.push({
      checkId: issue.checkId,
      title: issue.title,
      status: issue.status === "warn" ? "warn" : "fail",
      message: issue.message,
      suggestion: issue.suggestion,
      startLine: issue.startLine,
      endLine: issue.endLine
    });
  }

  const triggerFailures = [];
  const trigger = result.trigger;
  if (trigger != null) {
    for (const testCase of trigger.cases) {
      if (testCase.matched) {
        continue;
      }
      const { query, expected, actual, selectedCompetitor, rawModelResponse } = testCase;
      triggerFailures.push({ query, expected, actual, selectedCompetitor, rawModelResponse });
    }
  }

  const evalFailures = [];
  const evaluation = result.eval;
  if (evaluation != null) {
    for (const promptResult of evaluation.results) {
      for (const assertion of promptResult.assertions) {
        if (assertion.passed) {
          continue;
        }
        const knownSource = assertion.source === "grader" || assertion.source === "tool";
        evalFailures.push({
          prompt: promptResult.prompt,
          assertion: assertion.assertion,
          evidence: assertion.evidence,
          source: knownSource ? assertion.source : "unknown"
        });
      }
    }
  }

  return {
    lintIssues,
    triggerFailures,
    evalFailures,
    triggerSuggestions: trigger?.suggestions ?? []
  };
}
5077
/**
 * Reports whether a brief contains anything to act on.
 *
 * @param {{ lintIssues: unknown[], triggerFailures: unknown[], evalFailures: unknown[], triggerSuggestions: unknown[] }} brief
 * @returns {boolean} True when at least one of the four lists is non-empty.
 */
function hasActionableProblems(brief) {
  const lists = [brief.lintIssues, brief.triggerFailures, brief.evalFailures, brief.triggerSuggestions];
  return lists.some((list) => list.length > 0);
}
5080
/**
 * Recursively lists the regular files under a directory.
 *
 * Paths are returned relative to `skillRoot`, use `/` as the separator and
 * are sorted. Files in subdirectories keep their directory prefix
 * (`sub/a.txt`), so each entry can be resolved against `skillRoot`.
 * Entries that are neither directories nor regular files are skipped.
 *
 * @param {string} skillRoot - Directory to walk.
 * @returns {Promise<string[]>} Sorted, `/`-separated relative file paths.
 */
async function listSkillFiles(skillRoot) {
  const entries = await fs12.readdir(skillRoot, { withFileTypes: true });
  const files = [];
  for (const entry of entries) {
    if (entry.isDirectory()) {
      // The recursive call yields paths relative to the subdirectory, so
      // re-anchor them to this level by prefixing the directory name.
      const nested = await listSkillFiles(path7.join(skillRoot, entry.name));
      files.push(...nested.map((file) => `${entry.name}/${file}`));
      continue;
    }
    if (entry.isFile()) {
      files.push(entry.name);
    }
  }
  return files.sort();
}
5095
/**
 * Asks the model for a rewritten skill that addresses the given problems.
 *
 * The system prompt fixes the JSON response format and the rewrite rules.
 * The user prompt carries the baseline metrics, the files available under
 * the skill root, the current SKILL.md and the actionable-problems brief.
 * The reply is parsed as a JSON object and validated against
 * `improveRewriteSchema`.
 *
 * @param {object} skill - Parsed skill (`skillRoot`, `skillFile`, `raw`, `frontmatter`).
 * @param {object} baseline - Baseline check result (`lint`, optional `trigger` / `eval`).
 * @param {object} brief - Actionable problems, embedded as pretty-printed JSON.
 * @param {{ sendMessage: Function }} provider - Model provider.
 * @param {string} model - Model identifier passed to the provider.
 * @returns {Promise<object>} The validated rewrite.
 * @throws {Error} When the reply contains no JSON object or fails validation.
 */
async function requestRewrite(skill, baseline, brief, provider, model) {
  const availableFiles = await listSkillFiles(skill.skillRoot);
  const { name: skillName, license: skillLicense } = skill.frontmatter;

  const licenseRule = skillLicense
    ? `Keep the license exactly '${skillLicense}'.`
    : "Do not remove any valid existing frontmatter fields.";
  const systemLines = [
    "You rewrite Agent Skill files to improve measured quality.",
    "Return JSON only.",
    "Required format:",
    '{"frontmatter": {...}, "content": "...", "changeSummary": ["..."], "targetedProblems": ["..."]}',
    "The content field must contain only the markdown body of SKILL.md, without YAML frontmatter fences.",
    `Keep the skill name exactly '${skillName}'.`,
    licenseRule,
    "Do not invent new scripts, assets, references, APIs, or tools.",
    "Only reference files that already exist under the skill root.",
    "Optimize for trigger clarity, explicit scope boundaries, concrete examples, safety guidance, and tool usage instructions."
  ];
  const systemPrompt = systemLines.join(" ");

  const triggerF1 = baseline.trigger?.metrics.f1 ?? 0;
  const evalPassRate = calculateEvalAssertPassRate2(baseline.eval);
  const lintSummary = baseline.lint.summary;
  const fileBullets = availableFiles.map((file) => `- ${file}`);
  const userLines = [
    `Skill file: ${skill.skillFile}`,
    `Current trigger F1: ${triggerF1.toFixed(4)}`,
    `Current eval assertion pass rate: ${evalPassRate.toFixed(4)}`,
    `Lint failures: ${lintSummary.failures}`,
    `Lint warnings: ${lintSummary.warnings}`,
    "",
    "Available files under the skill root:",
    ...fileBullets,
    "",
    "Current SKILL.md:",
    "```markdown",
    skill.raw,
    "```",
    "",
    "Actionable problems to fix:",
    JSON.stringify(brief, null, 2),
    "",
    "Rewrite the skill to address only these evidenced problems. Keep the instructions tight and practical."
  ];
  const userPrompt = userLines.join("\n");

  const reply = await provider.sendMessage(systemPrompt, userPrompt, { model });
  const validation = improveRewriteSchema.safeParse(extractJsonObject2(reply));
  if (validation.success) {
    return validation.data;
  }
  throw new Error(`Failed to parse improve output: ${validation.error.issues[0]?.message ?? "invalid improve JSON"}`);
}
5138
/**
 * Creates a scratch copy of a skill directory whose SKILL.md is replaced by
 * the candidate text.
 *
 * A fresh directory is created under the OS temp directory; the skill root
 * is copied into it under its own base name, and `SKILL.md` in the copy is
 * overwritten. The original skill directory is not modified.
 *
 * If copying or writing fails, the scratch directory is removed before the
 * error is rethrown so failed attempts do not accumulate in the temp
 * directory. On success the caller owns `tempRoot`.
 *
 * @param {string} skillRoot - Directory of the skill to copy.
 * @param {string} candidateRaw - Full text to write as the copy's SKILL.md.
 * @returns {Promise<{ tempRoot: string, skillPath: string }>} The scratch
 *   root and the path of the copied skill inside it.
 */
async function createVerificationDirectory(skillRoot, candidateRaw) {
  const tempRoot = await fs12.mkdtemp(path7.join(os.tmpdir(), "skilltest-improve-"));
  const tempSkillRoot = path7.join(tempRoot, path7.basename(skillRoot));
  try {
    await fs12.cp(skillRoot, tempSkillRoot, { recursive: true });
    await fs12.writeFile(path7.join(tempSkillRoot, "SKILL.md"), candidateRaw, "utf8");
  } catch (error) {
    // Best-effort cleanup: a failure to remove the scratch directory must
    // not mask the error that caused the setup to fail.
    await fs12.rm(tempRoot, { recursive: true, force: true }).catch(() => {});
    throw error;
  }
  return {
    tempRoot,
    skillPath: tempSkillRoot
  };
}
5148
/**
 * Compares a verification run against the baseline run.
 *
 * Lint deltas are `before - after` (positive means fewer issues); trigger
 * F1 and eval pass-rate deltas are `after - before` (positive means a
 * higher score). Missing trigger metrics count as 0.
 *
 * `hasRegression` is true when any lint count rose or any score fell.
 * `improved` follows the overall gate when its outcome changed between the
 * runs; otherwise it is true when any delta is positive.
 *
 * @param {object} baseline - Check result before the rewrite.
 * @param {object} verification - Check result after the rewrite.
 * @returns {object} Before/after/delta per metric plus `overallPassed`,
 *   `improved` and `hasRegression`.
 */
function buildDelta(baseline, verification) {
  const snapshot = (result) => ({
    lintFailures: result.lint.summary.failures,
    lintWarnings: result.lint.summary.warnings,
    triggerF1: result.trigger?.metrics.f1 ?? 0,
    evalPassRate: calculateEvalAssertPassRate2(result.eval),
    overallPassed: result.gates.overallPassed
  });
  const before = snapshot(baseline);
  const after = snapshot(verification);

  const lintFailuresDelta = before.lintFailures - after.lintFailures;
  const lintWarningsDelta = before.lintWarnings - after.lintWarnings;
  const triggerF1Delta = after.triggerF1 - before.triggerF1;
  const evalPassRateDelta = after.evalPassRate - before.evalPassRate;

  const hasRegression = after.lintFailures > before.lintFailures
    || after.lintWarnings > before.lintWarnings
    || after.triggerF1 < before.triggerF1
    || after.evalPassRate < before.evalPassRate;

  let improved;
  if (after.overallPassed !== before.overallPassed) {
    // A change in the overall gate outcome decides the verdict by itself.
    improved = after.overallPassed;
  } else {
    improved = lintFailuresDelta > 0 || lintWarningsDelta > 0 || triggerF1Delta > 0 || evalPassRateDelta > 0;
  }

  const metric = (previous, current, delta) => ({ before: previous, after: current, delta });
  return {
    lintFailures: metric(before.lintFailures, after.lintFailures, lintFailuresDelta),
    lintWarnings: metric(before.lintWarnings, after.lintWarnings, lintWarningsDelta),
    triggerF1: metric(before.triggerF1, after.triggerF1, triggerF1Delta),
    evalAssertPassRate: metric(before.evalPassRate, after.evalPassRate, evalPassRateDelta),
    overallPassed: {
      before: before.overallPassed,
      after: after.overallPassed
    },
    improved,
    hasRegression
  };
}
5188
/**
 * Returns a shallow copy of a check result with its `target` field replaced.
 *
 * @param {object} result - Check result to copy; it is not modified.
 * @param {string} target - Value to store as `target` on the copy.
 * @returns {object} The copy carrying the new `target`.
 */
function normalizeVerificationTarget(result, target) {
  const retargeted = { ...result };
  retargeted.target = target;
  return retargeted;
}
5194
/**
 * Explains why a candidate rewrite must not be written, if there is a reason.
 *
 * Checks run in priority order: regression, then lack of improvement, then failed quality gates.
 *
 * @param {object} delta - Output of `buildDelta` (reads `hasRegression` and `improved`).
 * @param {object} verification - Verification check result (reads `gates.overallPassed`).
 * @returns {string | undefined} The first applicable reason, or `undefined` when nothing blocks.
 */
function buildBlockingReason(delta, verification) {
  // Predicates are lazy so later conditions are only evaluated when earlier ones pass.
  const rules = [
    [() => delta.hasRegression, "Candidate rewrite regressed one or more quality metrics on the frozen test set."],
    [() => !delta.improved, "Candidate rewrite did not produce a measurable improvement on the frozen test set."],
    [
      () => !verification.gates.overallPassed,
      "Candidate rewrite improved the skill but still failed the configured quality gates."
    ]
  ];
  const firstHit = rules.find(([isBlocked]) => isBlocked());
  return firstHit ? firstHit[1] : void 0;
}
5206
/**
 * Writes text to a file, creating any missing parent directories first.
 *
 * @param {string} outputPath - Destination path; resolved to an absolute path.
 * @param {string} raw - Content to write as UTF-8.
 * @returns {Promise<string>} The absolute path that was written.
 */
async function maybeWriteOutput(outputPath, raw) {
  const destination = path7.resolve(outputPath);
  const parentDirectory = path7.dirname(destination);
  await fs12.mkdir(parentDirectory, { recursive: true });
  await fs12.writeFile(destination, raw, "utf8");
  return destination;
}
5212
/**
 * Runs the improve pipeline for one skill: baseline check, candidate rewrite, verification
 * against the baseline's trigger queries and eval prompts, and optional writing of the result.
 *
 * `options.onStage`, when provided, is called with "baseline", "generate", "validate",
 * "verify" and "write" as the run progresses.
 *
 * The candidate is only written (to `options.outputPath` and/or, with `options.apply`, over the
 * skill's own file) when verification yields no blocking reason.
 *
 * @param {string} inputPath - Path handed to `runCheck` and `parseSkillStrict`.
 * @param {object} options - Provider, model, lint/trigger/eval settings, thresholds and output flags.
 * @returns {Promise<object>} Improve result with `baseline`, `candidate`, `verification`, `delta`,
 *   `applied`, and, when present, `outputPath` and `blockedReason`.
 */
async function runImprove(inputPath, options) {
  const announce = (stage) => options.onStage?.(stage);

  // Baseline and verification share every check setting except the trigger/eval inputs.
  const checkOptionsFor = ({ numQueries, queries, evalNumRuns, prompts }) => ({
    provider: options.provider,
    model: options.model,
    graderModel: options.model,
    lintFailOn: options.lintFailOn,
    lintSuppress: options.lintSuppress,
    lintPlugins: options.lintPlugins,
    compare: options.compare,
    numQueries,
    triggerSeed: options.triggerSeed,
    queries,
    evalNumRuns,
    prompts,
    evalMaxToolIterations: options.evalMaxToolIterations,
    concurrency: options.concurrency,
    minF1: options.minF1,
    minAssertPassRate: options.minAssertPassRate,
    continueOnLintFail: true,
    verbose: options.verbose
  });

  announce("baseline");
  const baseline = await runCheck(inputPath, checkOptionsFor(options));

  // Every exit path reports the same header fields followed by its own outcome.
  const report = (originalRaw, outcome) => ({
    target: inputPath,
    provider: options.provider.name,
    model: options.model,
    originalRaw,
    thresholds: {
      minF1: options.minF1,
      minAssertPassRate: options.minAssertPassRate
    },
    baseline,
    ...outcome
  });
  const blocked = (originalRaw, candidate, blockedReason) =>
    report(originalRaw, {
      candidate,
      verification: null,
      delta: null,
      applied: false,
      blockedReason
    });

  // Without both trigger and eval results there is nothing to freeze for verification.
  if (!baseline.trigger || !baseline.eval) {
    return blocked(
      "",
      null,
      baseline.triggerSkippedReason ?? baseline.evalSkippedReason ?? "Improve requires a strictly parseable skill so trigger and eval can be frozen."
    );
  }

  const skill = await parseSkillStrict(inputPath);
  const brief = extractActionableIssues(baseline);
  if (!hasActionableProblems(brief)) {
    return blocked(skill.raw, null, "No actionable failures, warnings, or mismatches were found to improve.");
  }

  announce("generate");
  const rewrite = await requestRewrite(skill, baseline, brief, options.provider, options.model);

  announce("validate");
  const candidate = await buildCandidate(skill, rewrite);
  if (candidate.raw === skill.raw) {
    return blocked(skill.raw, candidate, "Candidate rewrite produced no changes.");
  }

  announce("verify");
  const verificationDirectory = await createVerificationDirectory(skill.skillRoot, candidate.raw);
  let verification;
  try {
    // Re-run the check on the copy, reusing the exact queries and prompts from the baseline run.
    const frozenOptions = checkOptionsFor({
      numQueries: baseline.trigger.queries.length,
      queries: baseline.trigger.queries,
      evalNumRuns: baseline.eval.prompts.length,
      prompts: baseline.eval.prompts
    });
    const checked = await runCheck(verificationDirectory.skillPath, frozenOptions);
    // Report against the caller's path rather than the temporary copy.
    verification = normalizeVerificationTarget(checked, inputPath);
  } finally {
    await fs12.rm(verificationDirectory.tempRoot, { recursive: true, force: true });
  }

  const delta = buildDelta(baseline, verification);
  const blockedReason = buildBlockingReason(delta, verification);
  let applied = false;
  let outputPath;
  if (!blockedReason) {
    if (options.outputPath) {
      announce("write");
      outputPath = await maybeWriteOutput(options.outputPath, candidate.raw);
    }
    if (options.apply) {
      announce("write");
      await fs12.writeFile(skill.skillFile, candidate.raw, "utf8");
      applied = true;
    }
  }

  return report(skill.raw, {
    candidate,
    verification,
    delta,
    applied,
    ...(outputPath ? { outputPath } : {}),
    ...(blockedReason ? { blockedReason } : {})
  });
}
5357
+
5358
+ // src/commands/improve.ts
5359
// Validation schema for the options of the `improve` command; every field is optional.
var improveCliSchema = (() => {
  const text = () => z12.string().optional();
  const flag = () => z12.boolean().optional();
  const nonEmptyTextList = () => z12.array(z12.string().min(1)).optional();
  return z12.object({
    apiKey: text(),
    queries: text(),
    compare: nonEmptyTextList(),
    seed: z12.number().int().optional(),
    prompts: text(),
    plugin: nonEmptyTextList(),
    concurrency: z12.number().int().min(1).optional(),
    output: text(),
    saveResults: text(),
    apply: flag(),
    verbose: flag()
  });
})();

// Model identifiers consulted by resolveModel4 when swapping defaults between providers.
var DEFAULT_ANTHROPIC_MODEL4 = "claude-sonnet-4-5-20250929";
var DEFAULT_OPENAI_MODEL4 = "gpt-4.1-mini";
5374
/**
 * Accumulator for the repeatable `--plugin` option: appends one value to the values seen so far.
 *
 * @param {string} value - The newly supplied plugin path.
 * @param {string[]} [previous=[]] - Values collected so far; left unmodified.
 * @returns {string[]} A new array ending with `value`.
 */
function collectPluginPaths3(value, previous = []) {
  const collected = [...previous];
  collected.push(value);
  return collected;
}
5377
/**
 * Picks the model to use for a provider.
 *
 * When the provider is "openai" but the model is still the Anthropic default, the OpenAI
 * default is substituted; in every other case the given model is returned unchanged.
 *
 * @param {string} provider - Provider identifier.
 * @param {string} model - Requested model identifier.
 * @returns {string} The model identifier to use.
 */
function resolveModel4(provider, model) {
  const needsOpenAiDefault = provider === "openai" && model === DEFAULT_ANTHROPIC_MODEL4;
  return needsOpenAiDefault ? DEFAULT_OPENAI_MODEL4 : model;
}
5383
/**
 * Executes the `improve` command: loads inputs, runs the improve pipeline, prints the result
 * and sets the process exit code.
 *
 * Exit code is 1 when the result carries a `blockedReason`, 0 when it does not, and 2 when
 * anything throws. A progress spinner is shown only when output is not JSON and stdout is a TTY.
 *
 * @param {string} targetPath - Path to the skill, passed through to `runImprove`.
 * @param {object} options - Resolved command options.
 * @param {object} command - Command instance, used to load configured eval prompts.
 * @returns {Promise<void>}
 */
async function handleImproveCommand(targetPath, options, command) {
  const showSpinner = !options.json && process.stdout.isTTY;
  const spinner = showSpinner ? ora4("Preparing improvement run...").start() : null;
  const status = (text) => {
    if (spinner) {
      spinner.text = text;
    }
  };
  // Spinner caption per pipeline stage; unknown stages leave the caption untouched.
  const stageCaptions = new Map([
    ["baseline", () => "Running baseline check..."],
    ["generate", () => "Generating candidate rewrite..."],
    ["validate", () => "Validating candidate rewrite..."],
    ["verify", () => "Verifying candidate against frozen test inputs..."],
    ["write", () => (options.apply ? "Writing improved SKILL.md..." : "Writing candidate output...")]
  ]);

  try {
    status("Initializing model provider...");
    const provider = createProvider(options.provider, options.apiKey);

    let queries = void 0;
    if (options.queries) {
      status("Loading frozen trigger queries...");
      queries = await loadTriggerQueriesFile(options.queries);
    }

    let prompts = void 0;
    if (options.prompts) {
      status("Loading eval prompts...");
      prompts = await loadEvalPromptsJson(options.prompts);
    } else {
      prompts = await loadConfiguredEvalPrompts(command);
    }

    const model = resolveModel4(options.provider, options.model);
    const result = await runImprove(targetPath, {
      provider,
      model,
      lintFailOn: options.lintFailOn,
      lintSuppress: options.lintSuppress,
      lintPlugins: options.lintPlugins,
      compare: options.compare,
      numQueries: options.numQueries,
      triggerSeed: options.triggerSeed,
      queries,
      prompts,
      evalNumRuns: options.numRuns,
      evalMaxToolIterations: options.maxToolIterations,
      minF1: options.minF1,
      minAssertPassRate: options.minAssertPassRate,
      concurrency: options.concurrency,
      apply: options.apply,
      outputPath: options.output,
      verbose: options.verbose,
      onStage: (stage) => {
        const caption = spinner ? stageCaptions.get(stage) : void 0;
        if (caption) {
          spinner.text = caption();
        }
      }
    });

    if (options.saveResults) {
      await writeJsonFile(options.saveResults, result);
    }
    spinner?.stop();

    const payload = options.json ? result : renderImproveReport(result, options.color, options.verbose);
    writeResult(payload, options.json ? true : false);
    process.exitCode = result.blockedReason ? 1 : 0;
  } catch (error) {
    spinner?.stop();
    writeError(error, options.json);
    process.exitCode = 2;
  }
}
5459
/**
 * Registers the `improve` subcommand, its options and its action handler on the CLI program.
 *
 * The action merges global options, resolved configuration and the validated command
 * options, then delegates to `handleImproveCommand`. Invalid options are reported through
 * `writeError` and set exit code 2.
 *
 * @param {object} program - CLI program to attach the command to.
 * @returns {void}
 */
function registerImproveCommand(program) {
  const asInteger = (value) => Number.parseInt(value, 10);
  const asFloat = (value) => Number.parseFloat(value);

  const runAction = async (targetPath, _commandOptions, command) => {
    const globalOptions = getGlobalCliOptions(command);
    const config = getResolvedConfig(command);
    const parsedCli = improveCliSchema.safeParse(command.opts());
    if (!parsedCli.success) {
      writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid improve options."), globalOptions.json);
      process.exitCode = 2;
      return;
    }
    const cli = parsedCli.data;
    const handlerOptions = {
      ...globalOptions,
      provider: config.provider,
      model: config.model,
      apiKey: cli.apiKey,
      queries: cli.queries,
      compare: config.trigger.compare,
      numQueries: config.trigger.numQueries,
      prompts: cli.prompts,
      minF1: config.trigger.threshold,
      minAssertPassRate: config.eval.threshold,
      numRuns: config.eval.numRuns,
      maxToolIterations: config.eval.maxToolIterations,
      concurrency: config.concurrency,
      lintFailOn: config.lint.failOn,
      lintSuppress: config.lint.suppress,
      lintPlugins: config.lint.plugins,
      // An explicit --seed wins over the configured seed.
      triggerSeed: cli.seed ?? config.trigger.seed,
      output: cli.output,
      saveResults: cli.saveResults,
      apply: Boolean(cli.apply),
      verbose: Boolean(cli.verbose)
    };
    await handleImproveCommand(targetPath, handlerOptions, command);
  };

  program
    .command("improve")
    .description("Rewrite SKILL.md, verify it on frozen test inputs, and optionally apply it.")
    .argument("<path-to-skill>", "Path to SKILL.md or skill directory")
    .option("--provider <provider>", "LLM provider: anthropic|openai")
    .option("--model <model>", "Model for baseline, rewrite, and verification runs")
    .option("--api-key <key>", "API key override")
    .option("--queries <path>", "Path to custom trigger queries JSON")
    .option("--compare <path...>", "Path(s) to sibling skill directories to include as competitors")
    .option("--num-queries <n>", "Number of auto-generated trigger queries", asInteger)
    .option("--seed <number>", "RNG seed for reproducible trigger results", asInteger)
    .option("--prompts <path>", "Path to eval prompts JSON")
    .option("--plugin <path>", "Load a custom lint plugin file", collectPluginPaths3, [])
    .option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", asInteger)
    .option("--output <path>", "Write the verified candidate SKILL.md to a separate file")
    .option("--save-results <path>", "Save the full improve result JSON")
    .option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", asFloat)
    .option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", asFloat)
    .option("--apply", "Apply the verified rewrite to the source SKILL.md")
    .option("--verbose", "Include detailed baseline and verification reports")
    .action(runAction);
}
5502
+
5503
+ // src/commands/route.ts
5504
+ import fs13 from "node:fs/promises";
5505
+ import ora5 from "ora";
5506
+ import { z as z14 } from "zod";
5507
+
5508
+ // src/core/route-tester.ts
5509
+ import path8 from "node:path";
5510
+ import { z as z13 } from "zod";
5511
// Schema for model output that must be an array of non-empty strings.
var stringArraySchema = z13.string().min(1).array();
5512
/**
 * Extracts and parses a JSON array from raw model output.
 *
 * The text between the first "[" and the last "]" (inclusive) is parsed, so any prose or
 * fencing around the array is ignored.
 *
 * @param {string} raw - Raw model response.
 * @returns {*} The value produced by `JSON.parse` for the bracketed span.
 * @throws {Error} When no "[" followed by a later "]" exists.
 * @throws {SyntaxError} When the bracketed span is not valid JSON.
 */
function parseJsonArrayFromModelOutput2(raw) {
  const text = raw.trim();
  const open = text.indexOf("[");
  const close = text.lastIndexOf("]");
  if (open < 0 || close <= open) {
    throw new Error("Model did not return a JSON array.");
  }
  // When the text already starts with "[" and ends with "]" this slice is the whole text.
  return JSON.parse(text.slice(open, close + 1));
}
5524
/**
 * Maps a routing model's free-text answer onto a known skill name, "none" or "unrecognized".
 *
 * Resolution order:
 *  1. The whole answer (ignoring case, surrounding whitespace, quotes/backticks/asterisks and
 *     trailing punctuation) equals a skill name.
 *  2. The answer starts with "none".
 *  3. A skill name appears in the answer as a whole word, trying longer names first.
 *
 * Trying longer names first matters because `\b` treats "-" as a word boundary: with skills
 * "pdf" and "pdf-forms", the answer "pdf-forms" also matches `\bpdf\b`, so the shorter
 * name must not be allowed to win just because it is listed earlier.
 *
 * @param {string} rawResponse - Raw model answer.
 * @param {string[]} skillNames - Known skill names; the array is not modified.
 * @returns {string} A member of `skillNames`, "none", or "unrecognized".
 */
function parseRouteDecision(rawResponse, skillNames) {
  const normalized = rawResponse.trim().toLowerCase();

  // An answer that is exactly a skill name is unambiguous; checking it before the "none"
  // prefix keeps skills whose names merely start with "none" routable.
  const bare = normalized.replace(/^[\s"'`*]+|[\s"'`*.!]+$/g, "");
  const exact = skillNames.find((skillName) => skillName.toLowerCase() === bare);
  if (exact !== undefined) {
    return exact;
  }

  if (normalized.startsWith("none")) {
    return "none";
  }

  // Array sort is stable, so names of equal length keep their original relative order.
  const longestFirst = [...skillNames].sort((a, b) => b.length - a.length);
  for (const skillName of longestFirst) {
    const escaped = skillName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const regex = new RegExp(`\\b${escaped}\\b`, "i");
    if (regex.test(rawResponse)) {
      return skillName;
    }
  }
  return "unrecognized";
}
5538
/**
 * Finds every file named exactly "SKILL.md" among the files listed for a directory.
 *
 * @param {string} skillDir - Directory handed to `listFilesRecursive`.
 * @returns {Promise<string[]>} Matching paths, in the order they were listed.
 */
async function discoverSkillPaths(skillDir) {
  const skillFiles = [];
  for (const filePath of await listFilesRecursive(skillDir)) {
    if (path8.basename(filePath) === "SKILL.md") {
      skillFiles.push(filePath);
    }
  }
  return skillFiles;
}
5542
/**
 * Asks the model for user queries that should route to the given skill.
 *
 * @param {object} skill - Parsed skill; `frontmatter.name` and `frontmatter.description` are used.
 * @param {object} provider - Provider exposing `sendMessage(systemPrompt, userPrompt, { model })`.
 * @param {string} model - Model identifier forwarded to the provider.
 * @param {number} count - Number of queries required.
 * @returns {Promise<string[]>} Exactly `count` queries (extras returned by the model are dropped).
 * @throws {Error} When the response fails schema validation or holds fewer than `count` queries.
 */
async function generatePositiveQueriesForSkill(skill, provider, model, count) {
  const { name, description } = skill.frontmatter;
  const systemPrompt = [
    "You generate realistic user queries that should trigger a specific agent skill.",
    "Return a JSON array of strings only. No markdown, no comments.",
    "Each string is one realistic user query that clearly belongs to this skill.",
    "Queries should look like real user requests with enough context to drive a routing decision."
  ].join(" ");
  const userPrompt = `Skill name: ${name}\nSkill description: ${description}\nGenerate exactly ${count} distinct queries that should trigger this skill.`;

  const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
  const parsed = stringArraySchema.safeParse(parseJsonArrayFromModelOutput2(raw));
  if (!parsed.success) {
    const detail = parsed.error.issues[0]?.message ?? "invalid format";
    throw new Error(`Failed to parse generated queries for skill '${name}': ${detail}`);
  }

  const queries = parsed.data;
  if (queries.length < count) {
    throw new Error(`Expected ${count} queries for skill '${name}', got ${queries.length}.`);
  }
  return queries.slice(0, count);
}
5568
/**
 * Renders skills as a bulleted list, one "- name: description" line per skill.
 *
 * @param {object[]} skills - Parsed skills; `frontmatter.name` and `frontmatter.description` are used.
 * @returns {string} The lines joined with "\n" (empty string for no skills).
 */
function buildSkillListText(skills) {
  const lines = skills.map(({ frontmatter }) => {
    const { name, description } = frontmatter;
    return `- ${name}: ${description}`;
  });
  return lines.join("\n");
}
5571
/**
 * Tallies routing outcomes into a confusion matrix keyed by target skill, then by actual outcome.
 *
 * Each row has a column for every skill name plus "none" and "unrecognized". Cases whose
 * `targetSkill` has no row are ignored. Fractions divide each count by `numQueriesPerSkill`
 * (or by 1 when that is not positive).
 *
 * @param {object[]} cases - Routing cases; `targetSkill` and `actualSkill` are read.
 * @param {string[]} skillNames - Skill names that form the rows.
 * @param {number} numQueriesPerSkill - Divisor used for the fractional matrix.
 * @returns {{ matrix: object, matrixPct: object }} Raw counts and per-row fractions.
 */
function buildConfusionMatrix(cases, skillNames, numQueriesPerSkill) {
  const outcomes = [...skillNames, "none", "unrecognized"];
  const divisor = numQueriesPerSkill > 0 ? numQueriesPerSkill : 1;

  // Start every cell at zero so absent outcomes still show up in the report.
  const matrix = {};
  skillNames.forEach((target) => {
    matrix[target] = {};
    outcomes.forEach((actual) => {
      matrix[target][actual] = 0;
    });
  });

  cases.forEach(({ targetSkill, actualSkill }) => {
    const row = matrix[targetSkill];
    if (row) {
      row[actualSkill] = (row[actualSkill] ?? 0) + 1;
    }
  });

  const matrixPct = {};
  skillNames.forEach((target) => {
    matrixPct[target] = {};
    outcomes.forEach((actual) => {
      matrixPct[target][actual] = (matrix[target][actual] ?? 0) / divisor;
    });
  });

  return { matrix, matrixPct };
}
5596
/**
 * Derives precision, recall and F1 for each skill from a confusion matrix of counts.
 *
 * True positives are a skill's own diagonal cell; false positives are the counts in that
 * skill's column across all other skills' rows. Recall divides by `numQueriesPerSkill`.
 * Any ratio with a zero denominator is reported as 0.
 *
 * @param {string[]} skillNames - Skills to score.
 * @param {object} matrix - Counts keyed by target skill, then by actual outcome.
 * @param {number} numQueriesPerSkill - Number of queries issued per skill.
 * @returns {object[]} One `{ skill, queriesTotal, correct, precision, recall, f1 }` per skill.
 */
function computePerSkillMetrics(skillNames, matrix, numQueriesPerSkill) {
  const ratio = (numerator, denominator) => (denominator === 0 ? 0 : numerator / denominator);

  const metrics = [];
  for (const skill of skillNames) {
    const truePositives = matrix[skill]?.[skill] ?? 0;
    let falsePositives = 0;
    for (const other of skillNames) {
      if (other !== skill) {
        falsePositives = falsePositives + (matrix[other]?.[skill] ?? 0);
      }
    }
    const recall = ratio(truePositives, numQueriesPerSkill);
    const precision = ratio(truePositives, truePositives + falsePositives);
    const f1 = ratio(2 * precision * recall, precision + recall);
    metrics.push({
      skill,
      queriesTotal: numQueriesPerSkill,
      correct: truePositives,
      precision,
      recall,
      f1
    });
  }
  return metrics;
}
5607
/**
 * Lists skill pairs whose queries bleed into each other beyond a threshold.
 *
 * Each unordered pair is examined once; it is reported when the larger of the two
 * directional bleed fractions is strictly greater than `conflictThreshold`.
 *
 * @param {string[]} skillNames - Skills to compare pairwise.
 * @param {object} matrixPct - Fractions keyed by target skill, then by actual outcome.
 * @param {number} conflictThreshold - Bleed fraction above which a pair is a conflict.
 * @returns {object[]} `{ skillA, skillB, bleedAtoB, bleedBtoA }` entries in pair order.
 */
function detectConflicts(skillNames, matrixPct, conflictThreshold) {
  const conflicts = [];
  skillNames.forEach((skillA, index) => {
    // Only look at later names so every pair is visited exactly once.
    for (const skillB of skillNames.slice(index + 1)) {
      const bleedAtoB = matrixPct[skillA]?.[skillB] ?? 0;
      const bleedBtoA = matrixPct[skillB]?.[skillA] ?? 0;
      const worstBleed = Math.max(bleedAtoB, bleedBtoA);
      if (worstBleed > conflictThreshold) {
        conflicts.push({ skillA, skillB, bleedAtoB, bleedBtoA });
      }
    }
  });
  return conflicts;
}
5622
/**
 * Turns per-skill metrics and detected conflicts into human-readable suggestions.
 *
 * Skills with F1 below 0.7 come first, then one line per conflict. When neither produces
 * anything, a single "looks clean" message is returned instead.
 *
 * @param {object[]} perSkillMetrics - Entries with `skill` and `f1`.
 * @param {object[]} conflicts - Entries with `skillA`, `skillB`, `bleedAtoB`, `bleedBtoA`.
 * @returns {string[]} At least one suggestion.
 */
function buildRouteSuggestions(perSkillMetrics, conflicts) {
  const asPercent = (fraction) => (fraction * 100).toFixed(1);

  const lowF1Notes = perSkillMetrics
    .filter((metrics) => metrics.f1 < 0.7)
    .map(
      (metrics) =>
        `'${metrics.skill}' has low F1 (${asPercent(metrics.f1)}%) \u2014 consider clarifying its description and scope boundaries.`
    );
  const overlapNotes = conflicts.map(
    ({ skillA, skillB, bleedAtoB, bleedBtoA }) =>
      `'${skillA}' and '${skillB}' overlap: ${asPercent(bleedAtoB)}% of ${skillA} queries routed to ${skillB}, ${asPercent(bleedBtoA)}% the other way \u2014 consider narrowing scope boundaries.`
  );

  const suggestions = [...lowF1Notes, ...overlapNotes];
  return suggestions.length > 0
    ? suggestions
    : ["Routing looks clean. All skills are well-differentiated on this sample."];
}
5641
/**
 * Runs a multi-skill routing evaluation over every SKILL.md found under a directory.
 *
 * For each skill the model first generates positive queries; each query is then sent back to
 * the model together with the list of all skills, and the chosen skill is compared with the
 * skill the query was generated for.
 *
 * Defaults: 10 queries per skill, conflict threshold 0.1, concurrency 5. A warning is written
 * to stderr when more than 20 skills are found. `options.seed` is only echoed in the result.
 *
 * @param {string} skillDir - Directory to search for skills.
 * @param {object} options - `provider`, `model`, and optional `numQueriesPerSkill`,
 *   `conflictThreshold`, `concurrency`, `seed`, `verbose`.
 * @returns {Promise<object>} Cases, confusion matrices, per-skill metrics, conflicts,
 *   suggestions and overall accuracy.
 * @throws {Error} When fewer than two skills are found.
 */
async function runRouteTest(skillDir, options) {
  const numQueriesPerSkill = options.numQueriesPerSkill ?? 10;
  const conflictThreshold = options.conflictThreshold ?? 0.1;
  const concurrency = options.concurrency ?? 5;

  const absoluteSkillDir = path8.resolve(skillDir);
  const skillPaths = await discoverSkillPaths(absoluteSkillDir);
  const skillCount = skillPaths.length;
  if (skillCount < 2) {
    throw new Error(`Route test requires at least 2 skills. Found ${skillCount} in: ${skillDir}`);
  }
  if (skillCount > 20) {
    const plannedCalls = skillCount * numQueriesPerSkill;
    process.stderr.write(`Warning: ${skillCount} skills found. This will make ${plannedCalls} routing model calls.\n`);
  }

  const skills = await Promise.all(skillPaths.map((skillPath) => parseSkillStrict(skillPath)));
  const skillNames = skills.map((skill) => skill.frontmatter.name);

  const queriesPerSkill = await pMap(
    skills,
    (skill) => generatePositiveQueriesForSkill(skill, options.provider, options.model, numQueriesPerSkill),
    concurrency
  );
  // Flatten to one work item per generated query, remembering which skill it was written for.
  const workItems = skills.flatMap((skill, index) =>
    queriesPerSkill[index].map((query) => ({ query, targetSkill: skill.frontmatter.name }))
  );

  const skillListText = buildSkillListText(skills);
  const systemPrompt = "Select the single best skill for the user's request from the provided list. Respond with only the skill name, or 'none' if nothing fits.";
  const routeOne = async ({ query, targetSkill }) => {
    const userPrompt = `Available skills:\n${skillListText}\n\nUser query: ${query}`;
    const rawResponse = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
    const actualSkill = parseRouteDecision(rawResponse, skillNames);
    return {
      query,
      targetSkill,
      actualSkill,
      correct: actualSkill === targetSkill,
      // Raw responses are only retained in verbose mode.
      rawModelResponse: options.verbose ? rawResponse : void 0
    };
  };
  const cases = await pMap(workItems, routeOne, concurrency);

  const { matrix, matrixPct } = buildConfusionMatrix(cases, skillNames, numQueriesPerSkill);
  const perSkillMetrics = computePerSkillMetrics(skillNames, matrix, numQueriesPerSkill);
  const conflicts = detectConflicts(skillNames, matrixPct, conflictThreshold);
  const correctCount = cases.filter((routed) => routed.correct).length;
  const overallAccuracy = cases.length === 0 ? 0 : correctCount / cases.length;
  const suggestions = buildRouteSuggestions(perSkillMetrics, conflicts);

  return {
    skillDir: absoluteSkillDir,
    skills: skillNames,
    model: options.model,
    provider: options.provider.name,
    seed: options.seed,
    numQueriesPerSkill,
    cases,
    matrix,
    matrixPct,
    perSkillMetrics,
    conflicts,
    suggestions,
    overallAccuracy
  };
}
5716
+
5717
+ // src/commands/route.ts
5718
// Validation schema for the options of the `route` command; every field is optional.
var routeCliSchema = (() => {
  const text = () => z14.string().optional();
  const positiveInteger = () => z14.number().int().min(1).optional();
  return z14.object({
    numQueries: positiveInteger(),
    conflictThreshold: z14.number().min(0).max(1).optional(),
    saveQueries: text(),
    seed: z14.number().int().optional(),
    concurrency: positiveInteger(),
    html: text(),
    verbose: z14.boolean().optional(),
    apiKey: text()
  });
})();

// Model identifiers consulted by resolveModel5 when swapping defaults between providers.
var DEFAULT_ANTHROPIC_MODEL5 = "claude-sonnet-4-5-20250929";
var DEFAULT_OPENAI_MODEL5 = "gpt-4.1-mini";
5730
/**
 * Picks the model to use for a provider.
 *
 * When the provider is "openai" but the model is still the Anthropic default, the OpenAI
 * default is substituted; in every other case the given model is returned unchanged.
 *
 * @param {string} provider - Provider identifier.
 * @param {string} model - Requested model identifier.
 * @returns {string} The model identifier to use.
 */
function resolveModel5(provider, model) {
  const needsOpenAiDefault = provider === "openai" && model === DEFAULT_ANTHROPIC_MODEL5;
  return needsOpenAiDefault ? DEFAULT_OPENAI_MODEL5 : model;
}
5736
/**
 * Executes the `route` command: runs the routing evaluation, optionally saves the generated
 * queries, prints the result and optionally writes an HTML report.
 *
 * Any thrown error is reported through `writeError` and sets exit code 2. A progress spinner
 * is shown only when output is not JSON and stdout is a TTY.
 *
 * @param {string} skillDir - Directory containing the skills to evaluate.
 * @param {object} options - Resolved command options.
 * @returns {Promise<void>}
 */
async function handleRouteCommand(skillDir, options) {
  const showSpinner = !options.json && process.stdout.isTTY;
  const spinner = showSpinner ? ora5("Preparing route evaluation...").start() : null;
  const status = (text) => {
    if (spinner) {
      spinner.text = text;
    }
  };

  try {
    status("Initializing model provider...");
    const provider = createProvider(options.provider, options.apiKey);

    status("Running route simulations...");
    const model = resolveModel5(options.provider, options.model);
    const { numQueriesPerSkill, conflictThreshold, seed, concurrency, verbose } = options;
    const result = await runRouteTest(skillDir, {
      model,
      provider,
      numQueriesPerSkill,
      conflictThreshold,
      seed,
      concurrency,
      verbose
    });

    if (options.saveQueries) {
      // Persist only the query text and the skill it was generated for.
      const savedQueries = result.cases.map(({ query, targetSkill }) => ({ query, targetSkill }));
      await writeJsonFile(options.saveQueries, savedQueries);
    }
    spinner?.stop();

    if (options.json) {
      writeResult(result, true);
    } else {
      const report = renderRouteReport(result, options.color, options.verbose);
      writeResult(report, false);
    }

    if (options.html) {
      const html = renderRouteHtml(result);
      await fs13.writeFile(options.html, html, "utf8");
    }
  } catch (error) {
    spinner?.stop();
    writeError(error, options.json);
    process.exitCode = 2;
  }
}
5773
/**
 * Registers the `route` subcommand, its options and its action handler on the CLI program.
 *
 * The action merges global options, resolved configuration and the validated command
 * options, then delegates to `handleRouteCommand`. Invalid options are reported through
 * `writeError` and set exit code 2.
 *
 * @param {object} program - CLI program to attach the command to.
 * @returns {void}
 */
function registerRouteCommand(program) {
  const asInteger = (value) => Number.parseInt(value, 10);
  const asFloat = (value) => Number.parseFloat(value);

  const runAction = async (skillDir, _commandOptions, command) => {
    const globalOptions = getGlobalCliOptions(command);
    const config = getResolvedConfig(command);
    const parsedCli = routeCliSchema.safeParse(command.opts());
    if (!parsedCli.success) {
      writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid route options."), globalOptions.json);
      process.exitCode = 2;
      return;
    }
    const cli = parsedCli.data;
    await handleRouteCommand(skillDir, {
      ...globalOptions,
      model: config.model,
      provider: config.provider,
      numQueriesPerSkill: cli.numQueries ?? 10,
      conflictThreshold: cli.conflictThreshold ?? 0.1,
      saveQueries: cli.saveQueries,
      seed: cli.seed,
      // An explicit --concurrency wins over the configured value.
      concurrency: cli.concurrency ?? config.concurrency,
      html: cli.html,
      verbose: Boolean(cli.verbose),
      apiKey: cli.apiKey
    });
  };

  program
    .command("route")
    .description("Validate multi-skill routing across all skills in a directory.")
    .argument("<skillDir>", "Directory containing skill subdirectories with SKILL.md files")
    .option("--model <model>", "Model to use")
    .option("--provider <provider>", "LLM provider: anthropic|openai")
    .option("--num-queries <n>", "Queries per skill (default: 10)", asInteger)
    .option("--conflict-threshold <n>", "Bleed fraction to flag as conflict (default: 0.1)", asFloat)
    .option("--seed <number>", "RNG seed for reproducibility metadata", asInteger)
    .option("--concurrency <n>", "Maximum in-flight requests", asInteger)
    .option("--html <path>", "Write an HTML report to the given file path")
    .option("--save-queries <path>", "Save generated queries as JSON")
    .option("--api-key <key>", "API key override")
    .option("--verbose", "Show raw model responses")
    .action(runAction);
}
5798
+
4124
5799
  // src/index.ts
4125
5800
  function resolveVersion() {
4126
5801
  try {
4127
5802
  const currentFilePath = fileURLToPath(import.meta.url);
4128
- const packageJsonPath = path7.resolve(path7.dirname(currentFilePath), "..", "package.json");
4129
- const raw = fs12.readFileSync(packageJsonPath, "utf8");
5803
+ const packageJsonPath = path9.resolve(path9.dirname(currentFilePath), "..", "package.json");
5804
+ const raw = fs14.readFileSync(packageJsonPath, "utf8");
4130
5805
  const parsed = JSON.parse(raw);
4131
5806
  return parsed.version ?? "0.0.0";
4132
5807
  } catch {
@@ -4159,6 +5834,8 @@ async function run(argv) {
4159
5834
  registerTriggerCommand(program);
4160
5835
  registerEvalCommand(program);
4161
5836
  registerCheckCommand(program);
5837
+ registerImproveCommand(program);
5838
+ registerRouteCommand(program);
4162
5839
  try {
4163
5840
  await program.parseAsync(argv);
4164
5841
  } catch (error) {