npm - skilltest - Versions diffs - 0.8.0 → 0.10.0 - Mend

skilltest 0.8.0 → 0.10.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (5)

package/dist/index.js (changed)

@@ -1,8 +1,8 @@
 #!/usr/bin/env node
 // src/index.ts
-import fs12 from "node:fs";
-import path7 from "node:path";
+import fs14 from "node:fs";
+import path9 from "node:path";
 import { fileURLToPath } from "node:url";
 import { Command } from "commander";
@@ -100,7 +100,10 @@ function parseFrontmatter(rawSkill) {
 }
 async function parseSkillStrict(inputPath) {
   const skillContext = await loadSkillFile(inputPath);
-  const parsedFrontmatter = parseFrontmatter(skillContext.raw);
+  return parseSkillDocumentStrict(skillContext.raw, skillContext.skillRoot, skillContext.skillFile);
+}
+function parseSkillDocumentStrict(rawSkill, skillRoot, skillFile) {
+  const parsedFrontmatter = parseFrontmatter(rawSkill);
   if (!parsedFrontmatter.hasFrontmatter) {
     throw new Error("SKILL.md is missing YAML frontmatter.");
   }
@@ -113,9 +116,9 @@ async function parseSkillStrict(inputPath) {
     throw new Error(`Invalid frontmatter field '${issue.path.join(".")}': ${issue.message}`);
   }
   return {
-    skillRoot: skillContext.skillRoot,
-    skillFile: skillContext.skillFile,
-    raw: skillContext.raw,
+    skillRoot,
+    skillFile,
+    raw: rawSkill,
     content: parsedFrontmatter.content,
     frontmatterRaw: parsedFrontmatter.rawFrontmatter,
     frontmatter: validation.data
@@ -1515,6 +1518,9 @@ function badgeLabel(status) {
 function renderBadge(status) {
   return `<span class="badge ${status}">${badgeLabel(status)}</span>`;
 }
+function renderMetaBadge(label) {
+  return `<span class="meta-badge">${escapeHtml(label)}</span>`;
+}
 function renderStatCards(stats) {
   return `<div class="stats-grid">${stats.map(
     (stat) => `
@@ -1690,10 +1696,37 @@ function promptStatus(promptResult) {
   return "warn";
 }
 function renderAssertionRow(assertion) {
-  return renderDetails(
-    `${badgeLabel(assertion.passed ? "pass" : "fail")} ${assertion.assertion}`,
-    renderPreBlock(assertion.evidence)
-  );
+  return `
+    <details class="detail-block">
+      <summary>
+        ${renderBadge(assertion.passed ? "pass" : "fail")}
+        ${assertion.source === "tool" ? renderMetaBadge("Tool") : ""}
+        <span>${escapeHtml(assertion.assertion)}</span>
+      </summary>
+      <div class="detail-content">${renderPreBlock(assertion.evidence)}</div>
+    </details>
+  `;
+}
+function renderToolCallsSection(promptResult) {
+  if (!promptResult.toolCalls || promptResult.toolCalls.length === 0) {
+    return "";
+  }
+  const toolRows = promptResult.toolCalls.map(
+    (toolCall) => `
+        <div class="tool-call">
+          <div class="row-header">
+            <div>
+              <div class="row-title">${escapeHtml(toolCall.name)}</div>
+              <div class="row-subtitle">${escapeHtml(`turn ${toolCall.turnIndex}`)}</div>
+            </div>
+            ${renderMetaBadge("Tool Call")}
+          </div>
+          ${renderDefinitionList([{ label: "Arguments", value: JSON.stringify(toolCall.arguments) }])}
+          ${renderDetails("Mock response", renderPreBlock(toolCall.response))}
+        </div>
+      `
+  ).join("");
+  return renderDetails("Tool Calls", `<div class="tool-call-list">${toolRows}</div>`);
 }
 function renderEvalPromptRow(promptResult) {
   const assertionDetails = promptResult.assertions.map((assertion) => renderAssertionRow(assertion)).join("");
@@ -1712,9 +1745,12 @@ function renderEvalPromptRow(promptResult) {
       <div class="row-body">${escapeHtml(promptResult.responseSummary)}</div>
       ${renderDefinitionList([
     { label: "Passed assertions", value: String(promptResult.passedAssertions) },
-    { label: "Total assertions", value: String(promptResult.totalAssertions) }
+    { label: "Total assertions", value: String(promptResult.totalAssertions) },
+    ...promptResult.toolCalls ? [{ label: "Tool calls", value: String(promptResult.toolCalls.length) }] : [],
+    ...promptResult.loopIterations !== void 0 ? [{ label: "Loop iterations", value: String(promptResult.loopIterations) }] : []
   ])}
       ${renderDetails("Assertion evidence", assertionDetails || `<p>No assertions.</p>`)}
+      ${renderToolCallsSection(promptResult)}
       ${responseDetails}
     </div>
   `;
@@ -1981,6 +2017,20 @@ function renderHtmlDocument(title, body) {
         background: rgba(107, 114, 128, 0.14);
       }
+      .meta-badge {
+        display: inline-flex;
+        align-items: center;
+        justify-content: center;
+        padding: 3px 10px;
+        border-radius: 999px;
+        border: 1px solid rgba(17, 24, 39, 0.16);
+        background: rgba(17, 24, 39, 0.06);
+        color: var(--text);
+        font-size: 0.76rem;
+        font-weight: 700;
+        white-space: nowrap;
+      }
       details {
         margin-top: 10px;
       }
@@ -1995,6 +2045,13 @@ function renderHtmlDocument(title, body) {
         padding-top: 10px;
       }
+      .detail-block summary {
+        display: flex;
+        align-items: center;
+        gap: 8px;
+        flex-wrap: wrap;
+      }
       .detail-content p {
         margin: 0;
       }
@@ -2045,6 +2102,18 @@ function renderHtmlDocument(title, body) {
         overflow-wrap: anywhere;
       }
+      .tool-call-list {
+        display: grid;
+        gap: 12px;
+      }
+      .tool-call {
+        border: 1px solid var(--border);
+        border-radius: 12px;
+        padding: 14px;
+        background: #fffaf0;
+      }
       ul {
         margin: 0;
         padding-left: 20px;
@@ -2246,6 +2315,76 @@ function renderCheckHtml(result) {
   );
   return renderHtmlDocument(`skilltest check - ${skillName}`, [header, lintSection, triggerSection, evalSection, qualityGate].join(""));
 }
+function renderRouteMatrix(result) {
+  const cols = [...result.skills, "none"];
+  const headerCells = cols.map((col) => `<th>${escapeHtml(col)}</th>`).join("");
+  const rows = result.skills.map((target) => {
+    const cells = cols.map((col) => {
+      const pct = result.matrixPct[target]?.[col] ?? 0;
+      const isDiag = col === target;
+      const bg = isDiag ? "background:rgba(34,197,94,0.18);" : pct > 0.15 ? "background:rgba(239,68,68,0.18);" : pct > 0.05 ? "background:rgba(234,179,8,0.12);" : "";
+      return `<td style="${bg}">${escapeHtml(formatPercent(pct))}</td>`;
+    }).join("");
+    return `<tr><th>${escapeHtml(target)}</th>${cells}</tr>`;
+  }).join("");
+  return `<style>.rt{border-collapse:collapse;font-size:.85rem;width:100%}.rt th,.rt td{border:1px solid #d4d4d8;padding:8px 12px;text-align:center}.rt thead th{background:#fafafa;font-weight:700}</style><div style="overflow-x:auto"><table class="rt"><thead><tr><th></th>${headerCells}</tr></thead><tbody>${rows}</tbody></table></div>`;
+}
+function renderRouteHtml(result) {
+  const conflictCount = result.conflicts.length;
+  const overallStatus = result.overallAccuracy >= 0.8 ? "pass" : "warn";
+  const conflictStatus = conflictCount === 0 ? "pass" : "warn";
+  const header = renderHeaderCard(
+    "route",
+    `Routing Report \u2014 ${result.skills.length} skills`,
+    result.skillDir,
+    [
+      { label: "Overall accuracy", value: formatPercent(result.overallAccuracy), status: overallStatus },
+      { label: "Conflicts", value: String(conflictCount), status: conflictStatus },
+      { label: "Skills", value: String(result.skills.length) },
+      { label: "Queries/skill", value: String(result.numQueriesPerSkill) }
+    ],
+    [
+      { label: "Provider", value: result.provider },
+      { label: "Model", value: result.model },
+      { label: "Seed", value: result.seed !== void 0 ? String(result.seed) : "none" }
+    ]
+  );
+  const matrixSection = renderSectionCard("Routing Matrix", renderRouteMatrix(result));
+  const metricsRows = result.perSkillMetrics.map((m) => {
+    const status = m.f1 >= 0.8 ? "pass" : "warn";
+    return renderMessageRow(
+      status,
+      m.skill,
+      `F1: ${formatPercent(m.f1)}  precision: ${formatPercent(m.precision)}  recall: ${formatPercent(m.recall)}`,
+      renderDefinitionList([
+        { label: "Queries", value: String(m.queriesTotal) },
+        { label: "Correct", value: String(m.correct) },
+        { label: "Precision", value: formatPercent(m.precision) },
+        { label: "Recall", value: formatPercent(m.recall) }
+      ])
+    );
+  }).join("");
+  const metricsSection = renderSectionCard("Per-Skill Metrics", `<div class="row-list">${metricsRows}</div>`);
+  let conflictsSection = "";
+  if (result.conflicts.length > 0) {
+    const conflictRows = result.conflicts.map(
+      (conflict) => renderMessageRow(
+        "warn",
+        `${escapeHtml(conflict.skillA)} \u2194 ${escapeHtml(conflict.skillB)}`,
+        `${formatPercent(conflict.bleedAtoB)} of ${escapeHtml(conflict.skillA)} queries routed to ${escapeHtml(conflict.skillB)}; ${formatPercent(conflict.bleedBtoA)} the other way`
+      )
+    ).join("");
+    conflictsSection = renderSectionCard("Conflicts", `<div class="row-list">${conflictRows}</div>`);
+  }
+  const suggestionsSection = renderSectionCard(
+    "Suggestions",
+    `<ul>${result.suggestions.map((s) => `<li>${escapeHtml(s)}</li>`).join("")}</ul>`
+  );
+  return renderHtmlDocument(
+    `skilltest route \u2014 ${result.skillDir}`,
+    [header, matrixSection, metricsSection, conflictsSection, suggestionsSection].join("")
+  );
+}
 // src/reporters/terminal.ts
 import { Chalk } from "chalk";
@@ -2270,6 +2409,70 @@ function countSkippedSecurityPatterns2(issues) {
 function formatPercent2(value) {
   return `${(value * 100).toFixed(1)}%`;
 }
+function formatSignedNumber(value, digits = 4) {
+  const prefix = value > 0 ? "+" : "";
+  return `${prefix}${value.toFixed(digits)}`;
+}
+function diffChangedLines(beforeText, afterText) {
+  const beforeLines = beforeText.split(/\r?\n/);
+  const afterLines = afterText.split(/\r?\n/);
+  const dp = Array.from({ length: beforeLines.length + 1 }, () => Array(afterLines.length + 1).fill(0));
+  for (let beforeIndex2 = beforeLines.length - 1; beforeIndex2 >= 0; beforeIndex2 -= 1) {
+    for (let afterIndex2 = afterLines.length - 1; afterIndex2 >= 0; afterIndex2 -= 1) {
+      if (beforeLines[beforeIndex2] === afterLines[afterIndex2]) {
+        dp[beforeIndex2][afterIndex2] = 1 + (dp[beforeIndex2 + 1][afterIndex2 + 1] ?? 0);
+      } else {
+        dp[beforeIndex2][afterIndex2] = Math.max(dp[beforeIndex2 + 1][afterIndex2] ?? 0, dp[beforeIndex2][afterIndex2 + 1] ?? 0);
+      }
+    }
+  }
+  const changedLines = [];
+  let beforeIndex = 0;
+  let afterIndex = 0;
+  while (beforeIndex < beforeLines.length && afterIndex < afterLines.length) {
+    if (beforeLines[beforeIndex] === afterLines[afterIndex]) {
+      beforeIndex += 1;
+      afterIndex += 1;
+      continue;
+    }
+    const skipBefore = dp[beforeIndex + 1][afterIndex] ?? 0;
+    const skipAfter = dp[beforeIndex][afterIndex + 1] ?? 0;
+    if (skipBefore >= skipAfter) {
+      changedLines.push({ type: "-", line: beforeLines[beforeIndex] ?? "" });
+      beforeIndex += 1;
+    } else {
+      changedLines.push({ type: "+", line: afterLines[afterIndex] ?? "" });
+      afterIndex += 1;
+    }
+  }
+  while (beforeIndex < beforeLines.length) {
+    changedLines.push({ type: "-", line: beforeLines[beforeIndex] ?? "" });
+    beforeIndex += 1;
+  }
+  while (afterIndex < afterLines.length) {
+    changedLines.push({ type: "+", line: afterLines[afterIndex] ?? "" });
+    afterIndex += 1;
+  }
+  return changedLines;
+}
+function renderDiffPreview(beforeText, afterText, maxLines = 40) {
+  const changedLines = diffChangedLines(beforeText, afterText);
+  if (changedLines.length === 0) {
+    return ["  (no content changes)"];
+  }
+  const previewLines = changedLines.slice(0, maxLines).map((entry) => `  ${entry.type} ${entry.line}`);
+  if (changedLines.length > maxLines) {
+    previewLines.push(`  ... ${changedLines.length - maxLines} more changed line(s)`);
+  }
+  return previewLines;
+}
+function summarizeToolCalls(toolCalls) {
+  const counts = /* @__PURE__ */ new Map();
+  for (const toolCall of toolCalls) {
+    counts.set(toolCall.name, (counts.get(toolCall.name) ?? 0) + 1);
+  }
+  return Array.from(counts.entries()).map(([name, count]) => `${name} x${count}`).join(", ");
+}
 function renderLintReport(report, enableColor) {
   const c = getChalkInstance(enableColor);
   const { passed, warnings, failures, total } = report.summary;
@@ -2330,12 +2533,25 @@ function renderEvalReport(result, enableColor, verbose) {
   for (const [index, promptResult] of result.results.entries()) {
     lines.push(`${index + 1}. prompt: ${promptResult.prompt}`);
     lines.push(`   response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`);
+    if (promptResult.toolCalls) {
+      lines.push(`   Tools: ${promptResult.toolCalls.length} calls (${summarizeToolCalls(promptResult.toolCalls)})`);
+      if (promptResult.loopIterations !== void 0) {
+        lines.push(`   loop iterations: ${promptResult.loopIterations}`);
+      }
+    }
     for (const assertion of promptResult.assertions) {
       const status = assertion.passed ? c.green("PASS") : c.red("FAIL");
       lines.push(`   ${status} ${assertion.assertion}`);
       lines.push(`      evidence: ${assertion.evidence}`);
     }
     if (verbose) {
+      if (promptResult.toolCalls) {
+        for (const toolCall of promptResult.toolCalls) {
+          lines.push(`   tool ${toolCall.turnIndex}: ${toolCall.name}`);
+          lines.push(`      arguments: ${JSON.stringify(toolCall.arguments)}`);
+          lines.push(`      response: ${toolCall.response}`);
+        }
+      }
       lines.push(`   full response: ${promptResult.response}`);
     }
   }
@@ -2412,6 +2628,12 @@ function renderCheckReport(result, enableColor, verbose) {
       }
       lines.push(`  - prompt: ${promptResult.prompt}`);
       lines.push(`    response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`);
+      if (promptResult.toolCalls) {
+        lines.push(`    Tools: ${promptResult.toolCalls.length} calls (${summarizeToolCalls(promptResult.toolCalls)})`);
+        if (promptResult.loopIterations !== void 0) {
+          lines.push(`    loop iterations: ${promptResult.loopIterations}`);
+        }
+      }
       const assertionsToRender = verbose ? promptResult.assertions : failedAssertions;
       for (const assertion of assertionsToRender) {
         const assertionStatus = assertion.passed ? c.green("PASS") : c.red("FAIL");
@@ -2419,6 +2641,13 @@ function renderCheckReport(result, enableColor, verbose) {
         lines.push(`      evidence: ${assertion.evidence}`);
       }
       if (verbose) {
+        if (promptResult.toolCalls) {
+          for (const toolCall of promptResult.toolCalls) {
+            lines.push(`    tool ${toolCall.turnIndex}: ${toolCall.name}`);
+            lines.push(`      arguments: ${JSON.stringify(toolCall.arguments)}`);
+            lines.push(`      response: ${toolCall.response}`);
+          }
+        }
         lines.push(`    full response: ${promptResult.response}`);
       }
     }
@@ -2433,6 +2662,137 @@ function renderCheckReport(result, enableColor, verbose) {
   lines.push(`- overall: ${overallGate}`);
   return lines.join("\n");
 }
+function renderImproveReport(result, enableColor, verbose = false) {
+  const c = getChalkInstance(enableColor);
+  const lines = [
+    "skilltest improve",
+    `target: ${result.target}`,
+    `provider/model: ${result.provider}/${result.model}`,
+    `thresholds: min-f1=${result.thresholds.minF1.toFixed(2)} min-assert-pass-rate=${result.thresholds.minAssertPassRate.toFixed(2)}`
+  ];
+  const statusLabel = result.blockedReason ? c.red("BLOCKED") : result.applied ? c.green("APPLIED") : c.green("VERIFIED");
+  lines.push(`status: ${statusLabel}`);
+  if (result.candidate) {
+    lines.push("");
+    lines.push("Change Summary");
+    for (const item of result.candidate.changeSummary) {
+      lines.push(`- ${item}`);
+    }
+    lines.push("");
+    lines.push("Targeted Problems");
+    for (const item of result.candidate.targetedProblems) {
+      lines.push(`- ${item}`);
+    }
+  }
+  if (result.delta && result.verification) {
+    lines.push("");
+    lines.push("Before / After");
+    lines.push(
+      `- lint failures: ${result.delta.lintFailures.before} -> ${result.delta.lintFailures.after} (${formatSignedNumber(result.delta.lintFailures.delta, 0)})`
+    );
+    lines.push(
+      `- lint warnings: ${result.delta.lintWarnings.before} -> ${result.delta.lintWarnings.after} (${formatSignedNumber(result.delta.lintWarnings.delta, 0)})`
+    );
+    lines.push(
+      `- trigger f1: ${formatPercent2(result.delta.triggerF1.before)} -> ${formatPercent2(result.delta.triggerF1.after)} (${formatSignedNumber(result.delta.triggerF1.delta)})`
+    );
+    lines.push(
+      `- eval assertion pass rate: ${formatPercent2(result.delta.evalAssertPassRate.before)} -> ${formatPercent2(result.delta.evalAssertPassRate.after)} (${formatSignedNumber(result.delta.evalAssertPassRate.delta)})`
+    );
+    lines.push(
+      `- overall gate: ${result.delta.overallPassed.before ? c.green("PASS") : c.red("FAIL")} -> ${result.delta.overallPassed.after ? c.green("PASS") : c.red("FAIL")}`
+    );
+  }
+  if (result.outputPath) {
+    lines.push("");
+    lines.push(`output: ${result.outputPath}`);
+  }
+  if (result.blockedReason) {
+    lines.push("");
+    lines.push("Blocked");
+    lines.push(`- ${result.blockedReason}`);
+  }
+  if (result.candidate) {
+    lines.push("");
+    lines.push("Diff Preview");
+    lines.push(...renderDiffPreview(result.originalRaw, result.candidate.raw));
+  }
+  if (verbose) {
+    lines.push("");
+    lines.push("Baseline");
+    lines.push(renderCheckReport(result.baseline, enableColor, true));
+    if (result.verification) {
+      lines.push("");
+      lines.push("Verification");
+      lines.push(renderCheckReport(result.verification, enableColor, true));
+    }
+  }
+  return lines.join("\n");
+}
+function renderRouteReport(result, enableColor, verbose) {
+  const c = getChalkInstance(enableColor);
+  const lines = [
+    "skilltest route",
+    `directory: ${result.skillDir}`,
+    `provider/model: ${result.provider}/${result.model}`,
+    `skills: ${result.skills.length}  queries per skill: ${result.numQueriesPerSkill}`
+  ];
+  lines.push("");
+  lines.push("Per-skill metrics:");
+  for (const m of result.perSkillMetrics) {
+    const badge = m.f1 >= 0.8 ? c.green("PASS") : c.yellow("WARN");
+    lines.push(
+      `  ${m.skill.padEnd(24)} F1: ${formatPercent2(m.f1).padEnd(7)}  precision: ${formatPercent2(m.precision).padEnd(7)}  recall: ${formatPercent2(m.recall)}  [${badge}]`
+    );
+  }
+  lines.push("");
+  lines.push("Routing matrix (% of row queries routed to column):");
+  const colHeaders = [...result.skills, "none"];
+  const colWidth = 10;
+  const rowLabelWidth = 24;
+  const headerRow = "".padEnd(rowLabelWidth) + colHeaders.map((h) => h.slice(0, colWidth - 1).padEnd(colWidth)).join("");
+  lines.push("  " + headerRow);
+  for (const targetSkill of result.skills) {
+    const rowLabel = ("  " + targetSkill).padEnd(rowLabelWidth);
+    const cells = colHeaders.map((col) => {
+      const pct = result.matrixPct[targetSkill]?.[col] ?? 0;
+      const formatted = formatPercent2(pct).padEnd(colWidth);
+      if (col === targetSkill) return c.green(formatted);
+      if (pct > 0.1) return c.yellow(formatted);
+      return formatted;
+    }).join("");
+    lines.push(rowLabel + cells);
+  }
+  if (result.conflicts.length > 0) {
+    lines.push("");
+    lines.push("Conflicts detected:");
+    for (const conflict of result.conflicts) {
+      lines.push(
+        `  ${conflict.skillA} <-> ${conflict.skillB}  ${formatPercent2(conflict.bleedAtoB)} / ${formatPercent2(conflict.bleedBtoA)} bleed  [${c.yellow("WARN")}]`
+      );
+    }
+  }
+  lines.push("");
+  lines.push(`Overall accuracy: ${formatPercent2(result.overallAccuracy)}`);
+  lines.push("");
+  lines.push("Suggestions:");
+  for (const suggestion of result.suggestions) {
+    lines.push(`- ${suggestion}`);
+  }
+  if (verbose) {
+    lines.push("");
+    lines.push("Cases:");
+    for (const [index, testCase] of result.cases.entries()) {
+      const status = testCase.correct ? c.green("PASS") : c.red("FAIL");
+      lines.push(`  ${index + 1}. ${status} [${testCase.targetSkill}] ${testCase.query}`);
+      lines.push(`     routed to: ${testCase.actualSkill}`);
+      if (testCase.rawModelResponse) {
+        lines.push(`     model: ${testCase.rawModelResponse.replace(/\s+/g, " ").trim()}`);
+      }
+    }
+  }
+  return lines.join("\n");
+}
 // src/commands/common.ts
 import fs6 from "node:fs/promises";
@@ -2504,7 +2864,10 @@ function parseGraderOutput(raw) {
 async function gradeResponse(options) {
   const prompts = buildGraderPrompts(options);
   const raw = await options.provider.sendMessage(prompts.systemPrompt, prompts.userPrompt, { model: options.model });
-  return parseGraderOutput(raw);
+  return parseGraderOutput(raw).map((assertion) => ({
+    ...assertion,
+    source: "grader"
+  }));
 }
 // src/utils/concurrency.ts
@@ -2559,12 +2922,290 @@ async function pMap(items, fn, concurrency) {
   });
 }
+// src/core/tool-environment.ts
+function isPlainObject(value) {
+  return value !== null && typeof value === "object" && !Array.isArray(value);
+}
+function deepEqual(left, right) {
+  if (Array.isArray(left) && Array.isArray(right)) {
+    if (left.length !== right.length) {
+      return false;
+    }
+    return left.every((item, index) => deepEqual(item, right[index]));
+  }
+  if (isPlainObject(left) && isPlainObject(right)) {
+    const leftKeys = Object.keys(left);
+    const rightKeys = Object.keys(right);
+    if (leftKeys.length !== rightKeys.length) {
+      return false;
+    }
+    return leftKeys.every((key) => deepEqual(left[key], right[key]));
+  }
+  return left === right;
+}
+function matchesArgumentSubset(actual, expected) {
+  if (Array.isArray(expected)) {
+    if (!Array.isArray(actual) || actual.length !== expected.length) {
+      return false;
+    }
+    return expected.every((value, index) => matchesArgumentSubset(actual[index], value));
+  }
+  if (isPlainObject(expected)) {
+    if (!isPlainObject(actual)) {
+      return false;
+    }
+    return Object.entries(expected).every(([key, value]) => matchesArgumentSubset(actual[key], value));
+  }
+  return deepEqual(actual, expected);
+}
+function parseResponsePattern(pattern) {
+  if (pattern === "*") {
+    return null;
+  }
+  try {
+    const parsed = JSON.parse(pattern);
+    return isPlainObject(parsed) ? parsed : null;
+  } catch {
+    return null;
+  }
+}
+function renderFallbackResponse(tool, args) {
+  return `[mock] No mock response configured for tool '${tool.name}' with arguments: ${JSON.stringify(args)}`;
+}
+function resolveToolResponse(tool, args) {
+  const exactMatchKey = JSON.stringify(args);
+  const exactMatch = tool.responses[exactMatchKey];
+  if (exactMatch !== void 0) {
+    return exactMatch;
+  }
+  let bestPartialMatch = null;
+  for (const [pattern, response] of Object.entries(tool.responses)) {
+    if (pattern === "*") {
+      continue;
+    }
+    const parsedPattern = parseResponsePattern(pattern);
+    if (!parsedPattern) {
+      continue;
+    }
+    if (!matchesArgumentSubset(args, parsedPattern)) {
+      continue;
+    }
+    const specificity = Object.keys(parsedPattern).length;
+    if (!bestPartialMatch || specificity > bestPartialMatch.specificity) {
+      bestPartialMatch = { specificity, response };
+    }
+  }
+  if (bestPartialMatch) {
+    return bestPartialMatch.response;
+  }
+  const wildcardMatch = tool.responses["*"];
+  if (wildcardMatch !== void 0) {
+    return wildcardMatch;
+  }
+  return renderFallbackResponse(tool, args);
+}
+function toProviderToolDefinitions(mockTools) {
+  return mockTools.map((tool) => {
+    const parameters = tool.parameters ?? [];
+    return {
+      name: tool.name,
+      description: tool.description,
+      parameters: {
+        type: "object",
+        properties: Object.fromEntries(
+          parameters.map((parameter) => [
+            parameter.name,
+            {
+              type: parameter.type,
+              description: parameter.description
+            }
+          ])
+        ),
+        required: parameters.filter((parameter) => parameter.required).map((parameter) => parameter.name)
+      }
+    };
+  });
+}
+function toAssistantConversationBlocks(response) {
+  const contentBlocks = [];
+  if (response.textContent.trim().length > 0) {
+    contentBlocks.push({
+      type: "text",
+      text: response.textContent
+    });
+  }
+  for (const block of response.toolUseBlocks) {
+    contentBlocks.push({
+      type: "tool_use",
+      id: block.id,
+      name: block.name,
+      input: block.arguments
+    });
+  }
+  return contentBlocks.length === 0 ? [] : [
+    {
+      role: "assistant",
+      content: contentBlocks
+    }
+  ];
+}
+async function runWithTools(options) {
+  const maxIterations = options.maxIterations ?? 10;
+  const toolsByName = new Map(options.tools.map((tool) => [tool.name, tool]));
+  const providerTools = toProviderToolDefinitions(options.tools);
+  const messages = [{ role: "user", content: options.userMessage }];
+  const toolCalls = [];
+  let finalResponse = "";
+  let loopIterations = 0;
+  while (loopIterations < maxIterations) {
+    loopIterations += 1;
+    const response = await options.provider.sendWithTools(options.systemPrompt, messages, {
+      model: options.model,
+      tools: providerTools
+    });
+    if (response.textContent.trim().length > 0) {
+      finalResponse = response.textContent;
+    }
+    if (response.toolUseBlocks.length === 0) {
+      return {
+        finalResponse,
+        toolCalls,
+        loopIterations
+      };
+    }
+    messages.push(...toAssistantConversationBlocks(response));
+    const toolResultBlocks = [];
+    for (const toolUse of response.toolUseBlocks) {
+      const tool = toolsByName.get(toolUse.name);
+      const resolvedResponse = tool ? resolveToolResponse(tool, toolUse.arguments) : `[mock] No tool named '${toolUse.name}' is registered.`;
+      toolCalls.push({
+        name: toolUse.name,
+        arguments: toolUse.arguments,
+        response: resolvedResponse,
+        turnIndex: loopIterations
+      });
+      toolResultBlocks.push({
+        type: "tool_result",
+        tool_use_id: toolUse.id,
+        content: resolvedResponse
+      });
+    }
+    messages.push({
+      role: "user",
+      content: toolResultBlocks
+    });
+  }
+  const terminationNote = `[skilltest: tool loop terminated after ${maxIterations} iterations]`;
+  finalResponse = finalResponse ? `${finalResponse}
+${terminationNote}` : terminationNote;
+  return {
+    finalResponse,
+    toolCalls,
+    loopIterations
+  };
+}
 // src/core/eval-runner.ts
+var toolParameterSchema = z3.object({
+  name: z3.string().min(1),
+  type: z3.enum(["string", "number", "boolean", "object", "array"]),
+  description: z3.string().min(1),
+  required: z3.boolean().optional()
+});
+var mockToolDefinitionSchema = z3.object({
+  name: z3.string().min(1),
+  description: z3.string().min(1),
+  parameters: z3.array(toolParameterSchema).optional(),
+  responses: z3.record(z3.string())
+});
+var toolAssertionSchema = z3.object({
+  type: z3.enum(["tool_called", "tool_not_called", "tool_call_order", "tool_argument_match"]),
+  toolName: z3.string().min(1).optional(),
+  toolNames: z3.array(z3.string().min(1)).optional(),
+  expectedArgs: z3.record(z3.unknown()).optional(),
+  description: z3.string().min(1)
+}).superRefine((value, context) => {
+  if ((value.type === "tool_called" || value.type === "tool_not_called" || value.type === "tool_argument_match") && !value.toolName) {
+    context.addIssue({
+      code: z3.ZodIssueCode.custom,
+      message: `${value.type} requires toolName.`
+    });
+  }
+  if (value.type === "tool_call_order" && (!value.toolNames || value.toolNames.length === 0)) {
+    context.addIssue({
+      code: z3.ZodIssueCode.custom,
+      message: "tool_call_order requires toolNames."
+    });
+  }
+  if (value.type === "tool_argument_match" && !value.expectedArgs) {
+    context.addIssue({
+      code: z3.ZodIssueCode.custom,
+      message: "tool_argument_match requires expectedArgs."
+    });
+  }
+});
 var evalPromptSchema = z3.object({
   prompt: z3.string().min(1),
-  assertions: z3.array(z3.string().min(1)).optional()
+  assertions: z3.array(z3.string().min(1)).optional(),
+  tools: z3.array(mockToolDefinitionSchema).optional(),
+  toolAssertions: z3.array(toolAssertionSchema).optional()
 });
 var evalPromptArraySchema = z3.array(evalPromptSchema);
+function formatExpectedOrder(toolNames) {
+  return `[${toolNames.join(", ")}]`;
+}
+function formatActualOrder(toolCalls, toolNames) {
+  const relevantNames = new Set(toolNames);
+  const actualOrder = toolCalls.filter((toolCall) => relevantNames.has(toolCall.name)).map((toolCall) => toolCall.name);
+  return `[${actualOrder.join(", ")}]`;
+}
+function evaluateToolAssertions(toolAssertions, toolCalls) {
+  return toolAssertions.map((toolAssertion) => {
+    if (toolAssertion.type === "tool_called") {
+      const matchingCalls = toolCalls.filter((toolCall) => toolCall.name === toolAssertion.toolName);
+      return {
+        assertion: toolAssertion.description,
+        passed: matchingCalls.length > 0,
+        evidence: matchingCalls.length > 0 ? `Tool '${toolAssertion.toolName}' was called ${matchingCalls.length} time${matchingCalls.length === 1 ? "" : "s"}.` : `Tool '${toolAssertion.toolName}' was not called.`,
+        source: "tool"
+      };
+    }
+    if (toolAssertion.type === "tool_not_called") {
+      const matchingCalls = toolCalls.filter((toolCall) => toolCall.name === toolAssertion.toolName);
+      return {
+        assertion: toolAssertion.description,
+        passed: matchingCalls.length === 0,
+        evidence: matchingCalls.length === 0 ? `Tool '${toolAssertion.toolName}' was not called.` : `Tool '${toolAssertion.toolName}' was called ${matchingCalls.length} time${matchingCalls.length === 1 ? "" : "s"}.`,
+        source: "tool"
+      };
+    }
+    if (toolAssertion.type === "tool_call_order") {
+      const expectedOrder = toolAssertion.toolNames ?? [];
+      let nextExpectedIndex = 0;
+      for (const toolCall of toolCalls) {
+        if (toolCall.name === expectedOrder[nextExpectedIndex]) {
+          nextExpectedIndex += 1;
+        }
+      }
+      return {
+        assertion: toolAssertion.description,
+        passed: nextExpectedIndex === expectedOrder.length,
+        evidence: nextExpectedIndex === expectedOrder.length ? `Observed tool call order ${formatExpectedOrder(expectedOrder)}.` : `Expected call order ${formatExpectedOrder(expectedOrder)} but got ${formatActualOrder(toolCalls, expectedOrder)}.`,
+        source: "tool"
+      };
+    }
+    const matchingCall = toolCalls.find(
+      (toolCall) => toolCall.name === toolAssertion.toolName && matchesArgumentSubset(toolCall.arguments, toolAssertion.expectedArgs ?? {})
+    );
+    return {
+      assertion: toolAssertion.description,
+      passed: Boolean(matchingCall),
+      evidence: matchingCall ? `Tool '${toolAssertion.toolName}' was called with arguments matching ${JSON.stringify(toolAssertion.expectedArgs ?? {})}.` : `No '${toolAssertion.toolName}' call matched ${JSON.stringify(toolAssertion.expectedArgs ?? {})}.`,
+      source: "tool"
+    };
+  });
+}
 function extractJsonArray(raw) {
   const trimmed = raw.trim();
   if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
@@ -2591,6 +3232,7 @@ async function generatePrompts(skill, provider, model, count) {
     skill.content,
     "",
     `Generate ${count} prompts that stress the main capabilities and likely edge cases.`,
+    // Tool-aware prompts require user-defined mock responses and are not auto-generated.
     "Each prompt should include 2-4 assertions."
   ].join("\n");
   const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
@@ -2614,7 +3256,24 @@ async function runEval(skill, options) {
   const results = await pMap(
     prompts,
     async (evalPrompt) => {
-      const response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
+      let response;
+      let toolCalls;
+      let loopIterations;
+      if (evalPrompt.tools && evalPrompt.tools.length > 0) {
+        const toolRun = await runWithTools({
+          provider: options.provider,
+          model: options.model,
+          systemPrompt,
+          userMessage: evalPrompt.prompt,
+          tools: evalPrompt.tools,
+          maxIterations: options.maxToolIterations
+        });
+        response = toolRun.finalResponse;
+        toolCalls = toolRun.toolCalls;
+        loopIterations = toolRun.loopIterations;
+      } else {
+        response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
+      }
       const gradedAssertions = await gradeResponse({
         provider: options.provider,
         model: options.graderModel,
@@ -2624,14 +3283,18 @@ async function runEval(skill, options) {
         modelResponse: response,
         assertions: evalPrompt.assertions
       });
-      const passedAssertions2 = gradedAssertions.filter((assertion) => assertion.passed).length;
+      const structuralAssertions = evalPrompt.toolAssertions && evalPrompt.toolAssertions.length > 0 ? evaluateToolAssertions(evalPrompt.toolAssertions, toolCalls ?? []) : [];
+      const assertions = [...gradedAssertions, ...structuralAssertions];
+      const passedAssertions2 = assertions.filter((assertion) => assertion.passed).length;
       return {
         prompt: evalPrompt.prompt,
-        assertions: gradedAssertions,
+        assertions,
         responseSummary: response.slice(0, 200),
         response,
         passedAssertions: passedAssertions2,
-        totalAssertions: gradedAssertions.length
+        totalAssertions: assertions.length,
+        ...toolCalls ? { toolCalls } : {},
+        ...loopIterations !== void 0 ? { loopIterations } : {}
       };
     },
     options.concurrency ?? 5
@@ -2969,10 +3632,7 @@ function renderJson(value) {
 // src/commands/common.ts
 var executionContextByCommand = /* @__PURE__ */ new WeakMap();
-var singleEvalPromptSchema = z5.object({
-  prompt: z5.string().min(1),
-  assertions: z5.array(z5.string().min(1)).optional()
-});
+var singleEvalPromptSchema = evalPromptSchema;
 var promptStringArraySchema = z5.array(z5.string().min(1));
 var assertionsObjectSchema = z5.object({
   assertions: z5.array(z5.string().min(1))
@@ -3007,6 +3667,22 @@ function normalizeEvalPrompts(value, sourceLabel) {
 // Splits free-form text into one assertion per line, removing a leading
 // "-"/"*" bullet and then a leading "1." style number, and dropping lines
 // that end up empty.
 function parseAssertionsFromText(raw) {
   const assertions = [];
   for (const rawLine of raw.split(/\r?\n/)) {
     const withoutBullet = rawLine.trim().replace(/^[-*]\s+/, "");
     const withoutNumber = withoutBullet.replace(/^\d+\.\s+/, "");
     if (withoutNumber.length > 0) {
       assertions.push(withoutNumber);
     }
   }
   return assertions;
 }
+// Returns a copy of an eval prompt in which the assertions array, each tool
+// (including its parameters and responses) and each tool assertion (including
+// toolNames and expectedArgs) are fresh containers. Each container is copied
+// one level deep; absent optional fields come back as undefined.
+function cloneEvalPrompt(prompt) {
+  const copyTool = (tool) => {
+    const parameters = tool.parameters ? tool.parameters.map((parameter) => ({ ...parameter })) : void 0;
+    return { ...tool, parameters, responses: { ...tool.responses } };
+  };
+  const copyToolAssertion = (toolAssertion) => {
+    const toolNames = toolAssertion.toolNames ? [...toolAssertion.toolNames] : void 0;
+    const expectedArgs = toolAssertion.expectedArgs ? { ...toolAssertion.expectedArgs } : void 0;
+    return { ...toolAssertion, toolNames, expectedArgs };
+  };
+  const { assertions, tools, toolAssertions } = prompt;
+  return {
+    prompt: prompt.prompt,
+    assertions: assertions ? [...assertions] : void 0,
+    tools: tools ? tools.map((tool) => copyTool(tool)) : void 0,
+    toolAssertions: toolAssertions ? toolAssertions.map((toolAssertion) => copyToolAssertion(toolAssertion)) : void 0
+  };
+}
 function normalizeAssertions(value, sourceLabel) {
   const assertionArray = z5.array(z5.string().min(1)).safeParse(value);
   if (assertionArray.success) {
@@ -3079,17 +3755,14 @@ async function loadConfiguredEvalPrompts(command) {
     const assertionsRaw = await fs6.readFile(assertionsFile, "utf8");
     const assertions = normalizeAssertions(parseJsonIfPossible(assertionsRaw), assertionsFile);
     prompts = prompts.map((prompt) => ({
-      prompt: prompt.prompt,
+      ...cloneEvalPrompt(prompt),
       assertions: [...assertions]
     }));
   }
   const numRunsWasExplicit = context.configFile?.eval?.numRuns !== void 0;
   if (numRunsWasExplicit && prompts.length === 1 && context.config.eval.numRuns > 1) {
     const promptTemplate = prompts[0];
-    prompts = Array.from({ length: context.config.eval.numRuns }, () => ({
-      prompt: promptTemplate.prompt,
-      assertions: promptTemplate.assertions ? [...promptTemplate.assertions] : void 0
-    }));
+    prompts = Array.from({ length: context.config.eval.numRuns }, () => cloneEvalPrompt(promptTemplate));
   }
   return prompts;
 }
@@ -3186,7 +3859,8 @@ var evalConfigSchema = z7.object({
   numRuns: z7.number().int().min(1).optional(),
   threshold: z7.number().min(0).max(1).optional(),
   promptFile: z7.string().min(1).optional(),
-  assertionsFile: z7.string().min(1).optional()
+  assertionsFile: z7.string().min(1).optional(),
+  maxToolIterations: z7.number().int().min(1).max(50).optional()
 }).strict().partial();
 var skilltestConfigSchema = z7.object({
   provider: providerNameSchema.optional(),
@@ -3217,7 +3891,8 @@ var resolvedSkilltestConfigSchema = z7.object({
     numRuns: z7.number().int().min(1),
     threshold: z7.number().min(0).max(1),
     promptFile: z7.string().min(1).optional(),
-    assertionsFile: z7.string().min(1).optional()
+    assertionsFile: z7.string().min(1).optional(),
+    maxToolIterations: z7.number().int().min(1).max(50)
   })
 });
 var DEFAULT_SKILLTEST_CONFIG = {
@@ -3237,7 +3912,8 @@ var DEFAULT_SKILLTEST_CONFIG = {
   },
   eval: {
     numRuns: 5,
-    threshold: 0.9
+    threshold: 0.9,
+    maxToolIterations: 10
   }
 };
 function formatIssuePath(issuePath) {
@@ -3367,7 +4043,8 @@ function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = proce
       assertionsFile: resolveConfigRelativePath(
         baseDirectory,
         cliFlags.eval?.assertionsFile ?? configFile.eval?.assertionsFile ?? DEFAULT_SKILLTEST_CONFIG.eval.assertionsFile
-      )
+      ),
+      maxToolIterations: cliFlags.eval?.maxToolIterations ?? configFile.eval?.maxToolIterations ?? DEFAULT_SKILLTEST_CONFIG.eval.maxToolIterations
     }
   };
   return resolvedSkilltestConfigSchema.parse(merged);
@@ -3391,34 +4068,34 @@ function extractCliConfigOverrides(command) {
   if (command.getOptionValueSource("model") === "cli") {
     overrides.model = getTypedOptionValue(command, "model");
   }
-  if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check") && command.getOptionValueSource("concurrency") === "cli") {
+  if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check" || command.name() === "improve" || command.name() === "route") && command.getOptionValueSource("concurrency") === "cli") {
     overrides.concurrency = getTypedOptionValue(command, "concurrency");
   }
-  if ((command.name() === "trigger" || command.name() === "check") && command.getOptionValueSource("numQueries") === "cli") {
+  if ((command.name() === "trigger" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("numQueries") === "cli") {
     overrides.trigger = {
       ...overrides.trigger,
       numQueries: getTypedOptionValue(command, "numQueries")
     };
   }
-  if ((command.name() === "trigger" || command.name() === "check") && command.getOptionValueSource("compare") === "cli") {
+  if ((command.name() === "trigger" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("compare") === "cli") {
     overrides.trigger = {
       ...overrides.trigger,
       compare: getTypedOptionValue(command, "compare")
     };
   }
-  if ((command.name() === "lint" || command.name() === "check") && command.getOptionValueSource("plugin") === "cli") {
+  if ((command.name() === "lint" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("plugin") === "cli") {
     overrides.lint = {
       ...overrides.lint,
       plugins: getTypedOptionValue(command, "plugin")
     };
   }
-  if (command.name() === "check" && command.getOptionValueSource("minF1") === "cli") {
+  if ((command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("minF1") === "cli") {
     overrides.trigger = {
       ...overrides.trigger,
       threshold: getTypedOptionValue(command, "minF1")
     };
   }
-  if (command.name() === "check" && command.getOptionValueSource("minAssertPassRate") === "cli") {
+  if ((command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("minAssertPassRate") === "cli") {
     overrides.eval = {
       ...overrides.eval,
       threshold: getTypedOptionValue(command, "minAssertPassRate")
@@ -3483,6 +4160,12 @@ function resolveApiKey(provider, override) {
 // src/providers/anthropic.ts
 import Anthropic from "@anthropic-ai/sdk";
+// Predicate: true when a response content block has type "text".
+function isAnthropicTextBlock(block) {
+  const { type } = block;
+  return type === "text";
+}
+// Predicate: true when a response content block has type "tool_use".
+function isAnthropicToolUseBlock(block) {
+  const { type } = block;
+  return type === "tool_use";
+}
 function wait(ms) {
   return new Promise((resolve) => {
     setTimeout(resolve, ms);
@@ -3508,27 +4191,11 @@ var AnthropicProvider = class {
   constructor(apiKey) {
     this.client = new Anthropic({ apiKey });
   }
-  async sendMessage(systemPrompt, userMessage, options) {
+  async createMessage(request) {
     let lastError;
     for (let attempt = 0; attempt < 3; attempt += 1) {
       try {
-        const response = await this.client.messages.create({
-          model: options.model,
-          max_tokens: 2048,
-          system: systemPrompt,
-          messages: [
-            {
-              role: "user",
-              content: userMessage
-            }
-          ]
-        });
-        const textBlocks = response.content.filter((block) => block.type === "text");
-        const text = textBlocks.map((block) => block.text).join("\n").trim();
-        if (text.length === 0) {
-          throw new Error("Model returned an empty response.");
-        }
-        return text;
+        return await this.client.messages.create(request);
       } catch (error) {
         lastError = error;
         if (!isRateLimitError(error) || attempt === 2) {
@@ -3543,6 +4210,55 @@ var AnthropicProvider = class {
     }
     throw new Error("Anthropic API call failed with an unknown error.");
   }
+  // Projects each conversation message down to the { role, content } pair
+  // that is passed as the `messages` field of the request.
+  toAnthropicMessages(messages) {
+    return messages.map(({ role, content }) => ({ role, content }));
+  }
+  // Sends a single user message with the given system prompt and returns the
+  // text of the reply: all text blocks joined with newlines and trimmed.
+  // Throws when the reply contains no text.
+  async sendMessage(systemPrompt, userMessage, options) {
+    const userTurn = { role: "user", content: userMessage };
+    const response = await this.createMessage({
+      model: options.model,
+      max_tokens: 2048,
+      system: systemPrompt,
+      messages: [userTurn]
+    });
+    const pieces = [];
+    for (const block of response.content) {
+      if (isAnthropicTextBlock(block)) {
+        pieces.push(block.text);
+      }
+    }
+    const text = pieces.join("\n").trim();
+    if (text.length > 0) {
+      return text;
+    }
+    throw new Error("Model returned an empty response.");
+  }
+  // Sends one turn of a tool-enabled conversation through createMessage.
+  //
+  // Each entry of options.tools becomes { name, description, input_schema };
+  // a tool without `parameters` gets an empty object schema.
+  //
+  // Returns { textContent, toolUseBlocks, stopReason }:
+  //   - textContent: text blocks joined with "\n" and trimmed (may be "").
+  //   - toolUseBlocks: one { id, name, arguments } per tool_use block, where
+  //     `arguments` is the block's `input`.
+  //   - stopReason: the response's stop_reason, or "end_turn" when it is
+  //     null/undefined.
+  async sendWithTools(systemPrompt, messages, options) {
+    const response = await this.createMessage({
+      model: options.model,
+      max_tokens: 2048,
+      system: systemPrompt,
+      messages: this.toAnthropicMessages(messages),
+      tools: options.tools.map((tool) => ({
+        name: tool.name,
+        description: tool.description,
+        input_schema: tool.parameters ?? { type: "object", properties: {} }
+      }))
+    });
+    const textContent = response.content.filter(isAnthropicTextBlock).map((block) => block.text).join("\n").trim();
+    const toolUseBlocks = response.content.filter(isAnthropicToolUseBlock).map((block) => ({
+      id: block.id,
+      name: block.name,
+      arguments: block.input
+    }));
+    return {
+      textContent,
+      toolUseBlocks,
+      stopReason: response.stop_reason ?? "end_turn"
+    };
+  }
 };
 // src/providers/openai.ts
@@ -3579,17 +4295,82 @@ function extractTextContent(content) {
   const text = content.map((item) => item.type === "text" || !item.type ? item.text ?? "" : "").join("\n").trim();
   return text;
 }
-var OpenAIProvider = class {
-  name = "openai";
-  apiKey;
-  client;
-  constructor(apiKey) {
-    this.apiKey = apiKey;
-    this.client = null;
+// Parses the JSON-encoded argument string of a tool call.
+// A missing, empty or whitespace-only string yields {}. Input that is not
+// valid JSON, or that parses to something other than a plain object (null,
+// a primitive or an array), throws an Error whose message names the tool and
+// carries the underlying reason.
+function parseToolArguments(raw, toolName) {
+  if (!raw || raw.trim() === "") {
+    return {};
   }
-  async ensureClient() {
-    if (this.client) {
-      return this.client;
+  try {
+    const parsed = JSON.parse(raw);
+    if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
+      throw new Error("Tool arguments must be a JSON object.");
+    }
+    return parsed;
+  } catch (error) {
+    // Both JSON syntax errors and the non-object error above land here and
+    // are re-thrown under the same "not valid JSON" wording.
+    const message = error instanceof Error ? error.message : String(error);
+    throw new Error(`OpenAI tool call arguments for '${toolName}' were not valid JSON: ${message}`);
+  }
+}
+// Joins the text of every "text" block with newlines and trims the result.
+// A text block without a `text` field contributes an empty string.
+function getBlockText(blocks) {
+  const pieces = [];
+  for (const block of blocks) {
+    if (block.type === "text") {
+      pieces.push(String(block.text ?? ""));
+    }
+  }
+  return pieces.join("\n").trim();
+}
+// Converts the content blocks of one assistant turn into a single
+// chat-completions assistant message. Text blocks become `content` (null when
+// there is no text); tool_use blocks become `tool_calls` entries whose
+// arguments are the JSON-serialized block input. `tool_calls` is omitted when
+// there are no tool_use blocks.
+function mapAssistantBlocksToMessage(blocks) {
+  const text = getBlockText(blocks);
+  const toolCalls = [];
+  for (const block of blocks) {
+    if (block.type !== "tool_use") {
+      continue;
+    }
+    toolCalls.push({
+      id: String(block.id ?? ""),
+      type: "function",
+      function: {
+        name: String(block.name ?? ""),
+        arguments: JSON.stringify(block.input ?? {})
+      }
+    });
+  }
+  const message = {
+    role: "assistant",
+    content: text.length > 0 ? text : null
+  };
+  if (toolCalls.length > 0) {
+    message.tool_calls = toolCalls;
+  }
+  return message;
+}
+// Converts the content blocks of one user turn into chat-completions
+// messages. When the turn contains tool_result blocks, only those are
+// emitted, one "tool" message each; otherwise the turn's text is returned as
+// a single "user" message.
+function mapUserBlocksToMessages(blocks) {
+  const toolMessages = [];
+  for (const block of blocks) {
+    if (block.type === "tool_result") {
+      toolMessages.push({
+        role: "tool",
+        tool_call_id: String(block.tool_use_id ?? ""),
+        content: String(block.content ?? "")
+      });
+    }
+  }
+  if (toolMessages.length > 0) {
+    return toolMessages;
+  }
+  return [{ role: "user", content: getBlockText(blocks) }];
+}
+// Converts one conversation entry into zero or more chat-completions
+// messages. String content passes through unchanged under the entry's role;
+// block-array content is delegated to the assistant or user block mapper.
+function mapConversationBlockToMessages(block) {
+  const { role, content } = block;
+  if (typeof content !== "string") {
+    return role === "assistant" ? [mapAssistantBlocksToMessage(content)] : mapUserBlocksToMessages(content);
+  }
+  return [{ role, content }];
+}
+var OpenAIProvider = class {
+  name = "openai";
+  apiKey;
+  client;
+  constructor(apiKey) {
+    this.apiKey = apiKey;
+    this.client = null;
+  }
+  async ensureClient() {
+    if (this.client) {
+      return this.client;
     }
     let openAiModule;
     try {
@@ -3607,30 +4388,12 @@ var OpenAIProvider = class {
     this.client = new OpenAIConstructor({ apiKey: this.apiKey });
     return this.client;
   }
-  async sendMessage(systemPrompt, userMessage, options) {
+  async createCompletion(input) {
     const client = await this.ensureClient();
     let lastError;
     for (let attempt = 0; attempt < 3; attempt += 1) {
       try {
-        const response = await client.chat.completions.create({
-          model: options.model,
-          max_tokens: 2048,
-          messages: [
-            {
-              role: "system",
-              content: systemPrompt
-            },
-            {
-              role: "user",
-              content: userMessage
-            }
-          ]
-        });
-        const text = (response.choices ?? []).map((choice) => extractTextContent(choice.message?.content)).join("\n").trim();
-        if (text.length === 0) {
-          throw new Error("Model returned an empty response.");
-        }
-        return text;
+        return await client.chat.completions.create(input);
       } catch (error) {
         lastError = error;
         if (!isRetriableError(error) || attempt === 2) {
@@ -3645,6 +4408,57 @@ var OpenAIProvider = class {
     }
     throw new Error("OpenAI API call failed with an unknown error.");
   }
+  // Builds the chat-completions message list: the system prompt first,
+  // followed by every conversation entry expanded through
+  // mapConversationBlockToMessages.
+  toOpenAiMessages(systemPrompt, messages) {
+    const converted = [{ role: "system", content: systemPrompt }];
+    for (const message of messages) {
+      converted.push(...mapConversationBlockToMessages(message));
+    }
+    return converted;
+  }
+  // Sends a single user message with the given system prompt and returns the
+  // reply text: the text of every choice joined with newlines and trimmed.
+  // Throws when no text comes back.
+  async sendMessage(systemPrompt, userMessage, options) {
+    const conversation = [{ role: "user", content: userMessage }];
+    const response = await this.createCompletion({
+      model: options.model,
+      max_tokens: 2048,
+      messages: this.toOpenAiMessages(systemPrompt, conversation)
+    });
+    const choices = response.choices ?? [];
+    const choiceTexts = choices.map((choice) => extractTextContent(choice.message?.content));
+    const text = choiceTexts.join("\n").trim();
+    if (text.length > 0) {
+      return text;
+    }
+    throw new Error("Model returned an empty response.");
+  }
+  // Sends one turn of a tool-enabled conversation through createCompletion.
+  //
+  // Each entry of options.tools is sent as a "function" tool with its name,
+  // description and parameters. Only the first choice of the response is read.
+  //
+  // Returns { textContent, toolUseBlocks, stopReason }:
+  //   - textContent: text extracted from the first choice's message content.
+  //   - toolUseBlocks: one { id, name, arguments } per tool call. A missing
+  //     function name falls back to "tool-<n>" and a missing id to
+  //     "<name>-<n>" (n is 1-based). Arguments are parsed with
+  //     parseToolArguments, which throws on malformed JSON.
+  //   - stopReason: finish_reason "stop" is reported as "end_turn" and
+  //     "tool_calls" as "tool_use"; any other value passes through, and a
+  //     missing value becomes "end_turn".
+  async sendWithTools(systemPrompt, messages, options) {
+    const response = await this.createCompletion({
+      model: options.model,
+      max_tokens: 2048,
+      messages: this.toOpenAiMessages(systemPrompt, messages),
+      tools: options.tools.map((tool) => ({
+        type: "function",
+        function: {
+          name: tool.name,
+          description: tool.description,
+          parameters: tool.parameters
+        }
+      }))
+    });
+    const choice = response.choices?.[0];
+    const message = choice?.message;
+    const toolUseBlocks = (message?.tool_calls ?? []).map((toolCall, index) => {
+      const toolName = toolCall.function?.name ?? `tool-${index + 1}`;
+      return {
+        id: toolCall.id ?? `${toolName}-${index + 1}`,
+        name: toolName,
+        arguments: parseToolArguments(toolCall.function?.arguments, toolName)
+      };
+    });
+    return {
+      textContent: extractTextContent(message?.content),
+      toolUseBlocks,
+      stopReason: choice?.finish_reason === "stop" ? "end_turn" : choice?.finish_reason === "tool_calls" ? "tool_use" : choice?.finish_reason ?? "end_turn"
+    };
+  }
 };
 // src/providers/index.ts
@@ -3815,7 +4629,8 @@ async function handleEvalCommand(targetPath, options, command) {
       graderModel,
       numRuns: options.numRuns,
       concurrency: options.concurrency,
-      prompts
+      prompts,
+      maxToolIterations: options.maxToolIterations
     });
     if (options.saveResults) {
       await writeJsonFile(options.saveResults, result);
@@ -3862,7 +4677,8 @@ function registerEvalCommand(program) {
         verbose: Boolean(parsedCli.data.verbose),
         apiKey: parsedCli.data.apiKey,
         numRuns: config.eval.numRuns,
-        concurrency: config.concurrency
+        concurrency: config.concurrency,
+        maxToolIterations: config.eval.maxToolIterations
       },
       command
     );
@@ -3919,7 +4735,8 @@ async function runCheck(inputPath, options) {
         graderModel: options.graderModel,
         numRuns: options.evalNumRuns,
         prompts: options.prompts,
-        concurrency: options.concurrency
+        concurrency: options.concurrency,
+        maxToolIterations: options.evalMaxToolIterations
       };
       if ((options.concurrency ?? 5) === 1) {
         options.onStage?.("trigger");
@@ -4041,6 +4858,7 @@ async function handleCheckCommand(targetPath, options, command) {
       triggerSeed: options.triggerSeed,
       prompts,
       evalNumRuns: options.numRuns,
+      evalMaxToolIterations: options.maxToolIterations,
       concurrency: options.concurrency,
       minF1: options.minF1,
       minAssertPassRate: options.minAssertPassRate,
@@ -4106,6 +4924,7 @@ function registerCheckCommand(program) {
         minF1: config.trigger.threshold,
         minAssertPassRate: config.eval.threshold,
         numRuns: config.eval.numRuns,
+        maxToolIterations: config.eval.maxToolIterations,
         concurrency: config.concurrency,
         html: parsedCli.data.html,
         lintFailOn: config.lint.failOn,
@@ -4121,12 +4940,868 @@ function registerCheckCommand(program) {
   });
 }
+// src/commands/improve.ts
+import ora4 from "ora";
+import { z as z12 } from "zod";
+// src/core/improver.ts
+import fs12 from "node:fs/promises";
+import os from "node:os";
+import path7 from "node:path";
+import yaml2 from "js-yaml";
+import { z as z11 } from "zod";
+// Shape of the JSON object the rewrite model must return: a frontmatter
+// record, a non-empty markdown body, and non-empty lists of non-empty strings
+// describing what changed and which problems were targeted.
+var improveRewriteSchema = z11.object({
+  frontmatter: z11.record(z11.unknown()),
+  content: z11.string().min(1),
+  changeSummary: z11.array(z11.string().min(1)).min(1),
+  targetedProblems: z11.array(z11.string().min(1)).min(1)
+});
+// Fraction of eval assertions that passed. Returns 0 when there is no eval
+// result or the result contains zero assertions.
+function calculateEvalAssertPassRate2(result) {
+  const total = result ? result.summary.totalAssertions : 0;
+  return total === 0 ? 0 : result.summary.passedAssertions / total;
+}
+// Parses a JSON object out of a model reply. If the trimmed reply is already
+// wrapped in braces it is parsed as-is; otherwise the span from the first "{"
+// to the last "}" is parsed. Throws when no such span exists; JSON.parse
+// errors propagate unchanged.
+function extractJsonObject2(raw) {
+  const text = raw.trim();
+  const isBareObject = text.startsWith("{") && text.endsWith("}");
+  if (isBareObject) {
+    return JSON.parse(text);
+  }
+  const open = text.indexOf("{");
+  const close = text.lastIndexOf("}");
+  if (open < 0 || close <= open) {
+    throw new Error("Improver did not return a JSON object.");
+  }
+  return JSON.parse(text.slice(open, close + 1));
+}
+// Returns a copy of the frontmatter with "name", "description" and "license"
+// first (in that order, when present) followed by the remaining keys in their
+// original order.
+function orderFrontmatter(frontmatter) {
+  const leadingKeys = ["name", "description", "license"].filter(
+    (key) => Object.prototype.hasOwnProperty.call(frontmatter, key)
+  );
+  const remainingKeys = Object.keys(frontmatter).filter((key) => !leadingKeys.includes(key));
+  const ordered = {};
+  for (const key of [...leadingKeys, ...remainingKeys]) {
+    ordered[key] = frontmatter[key];
+  }
+  return ordered;
+}
+// Picks the line ending to write back: CRLF if the text contains any CRLF
+// sequence, LF otherwise.
+function detectLineEnding(raw) {
+  if (raw.indexOf("\r\n") !== -1) {
+    return "\r\n";
+  }
+  return "\n";
+}
+// Serializes frontmatter and body into a complete SKILL.md document:
+// a "---" fence, the YAML frontmatter (key order from orderFrontmatter),
+// a closing "---" fence, a blank line, the trimmed body, and a final newline.
+//
+// Every line break in the output uses `lineEnding`. Existing CRLF sequences
+// are matched as a unit (/\r?\n/), so a body that already contains CRLF is
+// not turned into "\r\r\n" when lineEnding is CRLF, and does not keep stray
+// "\r" characters when lineEnding is LF.
+//
+// Throws when the body is empty after trimming.
+function buildSkillMarkdown(frontmatter, content, lineEnding) {
+  const normalizedBody = content.trim();
+  if (normalizedBody.length === 0) {
+    throw new Error("Candidate rewrite produced an empty SKILL.md body.");
+  }
+  const frontmatterBlock = yaml2.dump(orderFrontmatter(frontmatter), {
+    lineWidth: 0,
+    noRefs: true,
+    sortKeys: false
+  }).replace(/\r?\n/g, lineEnding);
+  const body = normalizedBody.replace(/\r?\n/g, lineEnding);
+  return `---${lineEnding}${frontmatterBlock}---${lineEnding}${lineEnding}${body}${lineEnding}`;
+}
+// Checks every relative file reference found in the candidate document.
+// Throws when a reference resolves outside skillRoot or points at a path
+// that does not exist.
+//
+// A reference escapes the root only when its root-relative path is exactly
+// ".." or begins with ".." followed by a path separator (or is absolute).
+// A bare startsWith("..") test would also reject legitimate in-root names
+// such as "..notes/file.md".
+async function validateRelativeReferences(raw, skillRoot) {
+  for (const reference of extractRelativeFileReferences(raw)) {
+    const resolved = path7.resolve(skillRoot, reference);
+    const relativeToRoot = path7.relative(skillRoot, resolved);
+    const escapesRoot = relativeToRoot === ".." || relativeToRoot.startsWith(`..${path7.sep}`) || path7.isAbsolute(relativeToRoot);
+    if (escapesRoot) {
+      throw new Error(`Candidate rewrite introduced an out-of-root reference: ${reference}`);
+    }
+    if (!await pathExists(resolved)) {
+      throw new Error(`Candidate rewrite introduced a broken relative reference: ${reference}`);
+    }
+  }
+}
+// Turns a model-proposed rewrite into a validated candidate SKILL.md.
+//
+// Rejects (throws) a rewrite that supplies a different string `name`, or a
+// different string `license` when the original skill has one. The candidate
+// frontmatter is the original overlaid with the rewrite's fields, with the
+// original name (and license, if any) forced back on top. The document is
+// serialized with the original file's line ending, strictly re-parsed, and
+// its relative file references are checked against the skill root; either
+// check may throw.
+//
+// Returns { frontmatter, content, raw, changeSummary, targetedProblems }.
+async function buildCandidate(skill, rewrite) {
+  if (typeof rewrite.frontmatter.name === "string" && rewrite.frontmatter.name !== skill.frontmatter.name) {
+    throw new Error(`Candidate rewrite attempted to rename skill '${skill.frontmatter.name}' to '${rewrite.frontmatter.name}'.`);
+  }
+  if (skill.frontmatter.license && typeof rewrite.frontmatter.license === "string" && rewrite.frontmatter.license !== skill.frontmatter.license) {
+    throw new Error(
+      `Candidate rewrite attempted to change license '${skill.frontmatter.license}' to '${rewrite.frontmatter.license}'.`
+    );
+  }
+  // Later spreads win: rewrite fields override the original, then name and
+  // license are pinned to the original values.
+  const mergedFrontmatter = {
+    ...skill.frontmatter,
+    ...rewrite.frontmatter,
+    name: skill.frontmatter.name,
+    ...skill.frontmatter.license ? { license: skill.frontmatter.license } : {}
+  };
+  const raw = buildSkillMarkdown(mergedFrontmatter, rewrite.content, detectLineEnding(skill.raw));
+  // Called for validation only; the parsed result is discarded.
+  parseSkillDocumentStrict(raw, skill.skillRoot, skill.skillFile);
+  await validateRelativeReferences(raw, skill.skillRoot);
+  return {
+    frontmatter: mergedFrontmatter,
+    content: rewrite.content.trim(),
+    raw,
+    changeSummary: rewrite.changeSummary,
+    targetedProblems: rewrite.targetedProblems
+  };
+}
+// Condenses a check result into the problems a rewrite should address.
+//
+// Returns:
+//   - lintIssues: every lint issue whose status is not "pass"; the status is
+//     normalized so that anything other than "warn" is reported as "fail".
+//   - triggerFailures: trigger cases that did not match.
+//   - evalFailures: one entry per failed assertion, tagged with its prompt;
+//     `source` is kept when it is "grader" or "tool", otherwise "unknown".
+//   - triggerSuggestions: suggestions from the trigger run.
+// The trigger and eval derived lists are empty when that stage has no result.
+function extractActionableIssues(result) {
+  const lintIssues = result.lint.issues.filter((issue) => issue.status !== "pass").map((issue) => ({
+    checkId: issue.checkId,
+    title: issue.title,
+    status: issue.status === "warn" ? "warn" : "fail",
+    message: issue.message,
+    suggestion: issue.suggestion,
+    startLine: issue.startLine,
+    endLine: issue.endLine
+  }));
+  const triggerFailures = result.trigger?.cases.filter((testCase) => !testCase.matched).map((testCase) => ({
+    query: testCase.query,
+    expected: testCase.expected,
+    actual: testCase.actual,
+    selectedCompetitor: testCase.selectedCompetitor,
+    rawModelResponse: testCase.rawModelResponse
+  })) ?? [];
+  const evalFailures = result.eval?.results.flatMap(
+    (promptResult) => promptResult.assertions.filter((assertion) => !assertion.passed).map((assertion) => ({
+      prompt: promptResult.prompt,
+      assertion: assertion.assertion,
+      evidence: assertion.evidence,
+      source: assertion.source === "grader" || assertion.source === "tool" ? assertion.source : "unknown"
+    }))
+  ) ?? [];
+  return {
+    lintIssues,
+    triggerFailures,
+    evalFailures,
+    triggerSuggestions: result.trigger?.suggestions ?? []
+  };
+}
+// True when the brief contains at least one lint issue, trigger failure,
+// eval failure or trigger suggestion.
+function hasActionableProblems(brief) {
+  const problemLists = [brief.lintIssues, brief.triggerFailures, brief.evalFailures, brief.triggerSuggestions];
+  return problemLists.some((list) => list.length > 0);
+}
+// Recursively lists every regular file under skillRoot.
+//
+// Returns a sorted array of paths relative to skillRoot, always using "/" as
+// the separator (e.g. "scripts/lib/util.js"). Entries that are neither
+// directories nor regular files are skipped.
+//
+// All paths are relativized against the ORIGINAL root. Recursing by calling
+// this function on a sub-directory would relativize against that
+// sub-directory and drop the directory prefix of nested files.
+async function listSkillFiles(skillRoot) {
+  const files = [];
+  const walk = async (directory) => {
+    const entries = await fs12.readdir(directory, { withFileTypes: true });
+    for (const entry of entries) {
+      const absolutePath = path7.join(directory, entry.name);
+      if (entry.isDirectory()) {
+        await walk(absolutePath);
+        continue;
+      }
+      if (entry.isFile()) {
+        files.push(path7.relative(skillRoot, absolutePath).split(path7.sep).join("/"));
+      }
+    }
+  };
+  await walk(skillRoot);
+  return files.sort();
+}
+// Asks the model for an improved SKILL.md and returns the validated rewrite.
+//
+// The system prompt (sentences joined with spaces) fixes the JSON reply
+// format and the constraints on name, license and file references. The user
+// prompt (lines joined with newlines) carries the baseline metrics, the list
+// of files under the skill root, the current SKILL.md, and the actionable
+// problems as pretty-printed JSON.
+//
+// Returns the parsed { frontmatter, content, changeSummary, targetedProblems }.
+// Throws when the reply contains no JSON object, is not valid JSON, or does
+// not satisfy improveRewriteSchema.
+async function requestRewrite(skill, baseline, brief, provider, model) {
+  const availableFiles = await listSkillFiles(skill.skillRoot);
+  const systemPrompt = [
+    "You rewrite Agent Skill files to improve measured quality.",
+    "Return JSON only.",
+    "Required format:",
+    '{"frontmatter": {...}, "content": "...", "changeSummary": ["..."], "targetedProblems": ["..."]}',
+    "The content field must contain only the markdown body of SKILL.md, without YAML frontmatter fences.",
+    `Keep the skill name exactly '${skill.frontmatter.name}'.`,
+    skill.frontmatter.license ? `Keep the license exactly '${skill.frontmatter.license}'.` : "Do not remove any valid existing frontmatter fields.",
+    "Do not invent new scripts, assets, references, APIs, or tools.",
+    "Only reference files that already exist under the skill root.",
+    "Optimize for trigger clarity, explicit scope boundaries, concrete examples, safety guidance, and tool usage instructions."
+  ].join(" ");
+  // A missing trigger result is reported to the model as an F1 of 0.
+  const baselineTriggerF1 = baseline.trigger?.metrics.f1 ?? 0;
+  const baselineEvalPassRate = calculateEvalAssertPassRate2(baseline.eval);
+  const userPrompt = [
+    `Skill file: ${skill.skillFile}`,
+    `Current trigger F1: ${baselineTriggerF1.toFixed(4)}`,
+    `Current eval assertion pass rate: ${baselineEvalPassRate.toFixed(4)}`,
+    `Lint failures: ${baseline.lint.summary.failures}`,
+    `Lint warnings: ${baseline.lint.summary.warnings}`,
+    "",
+    "Available files under the skill root:",
+    ...availableFiles.map((file) => `- ${file}`),
+    "",
+    "Current SKILL.md:",
+    "```markdown",
+    skill.raw,
+    "```",
+    "",
+    "Actionable problems to fix:",
+    JSON.stringify(brief, null, 2),
+    "",
+    "Rewrite the skill to address only these evidenced problems. Keep the instructions tight and practical."
+  ].join("\n");
+  const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
+  // extractJsonObject2 can throw before schema validation runs.
+  const parsed = improveRewriteSchema.safeParse(extractJsonObject2(raw));
+  if (!parsed.success) {
+    throw new Error(`Failed to parse improve output: ${parsed.error.issues[0]?.message ?? "invalid improve JSON"}`);
+  }
+  return parsed.data;
+}
+// Creates a throwaway copy of the skill directory with SKILL.md replaced by
+// the candidate text, so the candidate can be checked without touching the
+// original files.
+//
+// Returns { tempRoot, skillPath }: tempRoot is the freshly created temp
+// directory and skillPath is the copied skill directory inside it, which
+// keeps the original directory's base name.
+//
+// If the copy or the write fails, the temp directory is removed before the
+// error is re-thrown, because the caller never receives tempRoot and could
+// not clean it up itself.
+async function createVerificationDirectory(skillRoot, candidateRaw) {
+  const tempRoot = await fs12.mkdtemp(path7.join(os.tmpdir(), "skilltest-improve-"));
+  try {
+    const tempSkillRoot = path7.join(tempRoot, path7.basename(skillRoot));
+    await fs12.cp(skillRoot, tempSkillRoot, { recursive: true });
+    await fs12.writeFile(path7.join(tempSkillRoot, "SKILL.md"), candidateRaw, "utf8");
+    return {
+      tempRoot,
+      skillPath: tempSkillRoot
+    };
+  } catch (error) {
+    await fs12.rm(tempRoot, { recursive: true, force: true });
+    throw error;
+  }
+}
+// Compares the baseline check result with the verification result of the
+// candidate and summarizes the movement of each quality metric.
+//
+// Sign conventions: lint deltas are before - after (positive means fewer
+// issues); trigger F1 and eval pass-rate deltas are after - before (positive
+// means a higher score). A missing trigger result counts as F1 0.
+//
+// hasRegression: any of the four metrics got worse.
+// improved: if the overall gate outcome differs between the two runs, it is
+//   the verification's gate outcome; otherwise it is true when at least one
+//   metric moved in the better direction.
+function buildDelta(baseline, verification) {
+  const baselineTriggerF1 = baseline.trigger?.metrics.f1 ?? 0;
+  const verificationTriggerF1 = verification.trigger?.metrics.f1 ?? 0;
+  const baselineEvalPassRate = calculateEvalAssertPassRate2(baseline.eval);
+  const verificationEvalPassRate = calculateEvalAssertPassRate2(verification.eval);
+  const lintFailuresDelta = baseline.lint.summary.failures - verification.lint.summary.failures;
+  const lintWarningsDelta = baseline.lint.summary.warnings - verification.lint.summary.warnings;
+  const triggerF1Delta = verificationTriggerF1 - baselineTriggerF1;
+  const evalPassRateDelta = verificationEvalPassRate - baselineEvalPassRate;
+  const hasRegression = verification.lint.summary.failures > baseline.lint.summary.failures || verification.lint.summary.warnings > baseline.lint.summary.warnings || verificationTriggerF1 < baselineTriggerF1 || verificationEvalPassRate < baselineEvalPassRate;
+  const improved = verification.gates.overallPassed !== baseline.gates.overallPassed ? verification.gates.overallPassed : lintFailuresDelta > 0 || lintWarningsDelta > 0 || triggerF1Delta > 0 || evalPassRateDelta > 0;
+  return {
+    lintFailures: {
+      before: baseline.lint.summary.failures,
+      after: verification.lint.summary.failures,
+      delta: lintFailuresDelta
+    },
+    lintWarnings: {
+      before: baseline.lint.summary.warnings,
+      after: verification.lint.summary.warnings,
+      delta: lintWarningsDelta
+    },
+    triggerF1: {
+      before: baselineTriggerF1,
+      after: verificationTriggerF1,
+      delta: triggerF1Delta
+    },
+    evalAssertPassRate: {
+      before: baselineEvalPassRate,
+      after: verificationEvalPassRate,
+      delta: evalPassRateDelta
+    },
+    overallPassed: {
+      before: baseline.gates.overallPassed,
+      after: verification.gates.overallPassed
+    },
+    improved,
+    hasRegression
+  };
+}
+// Returns a shallow copy of the result with its `target` field replaced by
+// the given value; the input object is left untouched.
+function normalizeVerificationTarget(result, target) {
+  const normalized = { ...result };
+  normalized.target = target;
+  return normalized;
+}
+// Explains why a candidate must not be applied, or returns undefined when
+// nothing blocks it. Checks run in priority order: regression first, then
+// lack of improvement, then failing quality gates.
+function buildBlockingReason(delta, verification) {
+  const { hasRegression, improved } = delta;
+  if (hasRegression) {
+    return "Candidate rewrite regressed one or more quality metrics on the frozen test set.";
+  }
+  if (!improved) {
+    return "Candidate rewrite did not produce a measurable improvement on the frozen test set.";
+  }
+  return verification.gates.overallPassed ? void 0 : "Candidate rewrite improved the skill but still failed the configured quality gates.";
+}
+// Writes the document to outputPath as UTF-8, creating missing parent
+// directories first, and returns the absolute path that was written.
+async function maybeWriteOutput(outputPath, raw) {
+  const target = path7.resolve(outputPath);
+  const parentDirectory = path7.dirname(target);
+  await fs12.mkdir(parentDirectory, { recursive: true });
+  await fs12.writeFile(target, raw, "utf8");
+  return target;
+}
+async function runImprove(inputPath, options) {
+  options.onStage?.("baseline");
+  const baseline = await runCheck(inputPath, {
+    provider: options.provider,
+    model: options.model,
+    graderModel: options.model,
+    lintFailOn: options.lintFailOn,
+    lintSuppress: options.lintSuppress,
+    lintPlugins: options.lintPlugins,
+    compare: options.compare,
+    numQueries: options.numQueries,
+    triggerSeed: options.triggerSeed,
+    queries: options.queries,
+    evalNumRuns: options.evalNumRuns,
+    prompts: options.prompts,
+    evalMaxToolIterations: options.evalMaxToolIterations,
+    concurrency: options.concurrency,
+    minF1: options.minF1,
+    minAssertPassRate: options.minAssertPassRate,
+    continueOnLintFail: true,
+    verbose: options.verbose
+  });
+  if (!baseline.trigger || !baseline.eval) {
+    return {
+      target: inputPath,
+      provider: options.provider.name,
+      model: options.model,
+      originalRaw: "",
+      thresholds: {
+        minF1: options.minF1,
+        minAssertPassRate: options.minAssertPassRate
+      },
+      baseline,
+      candidate: null,
+      verification: null,
+      delta: null,
+      applied: false,
+      blockedReason: baseline.triggerSkippedReason ?? baseline.evalSkippedReason ?? "Improve requires a strictly parseable skill so trigger and eval can be frozen."
+    };
+  }
+  const skill = await parseSkillStrict(inputPath);
+  const brief = extractActionableIssues(baseline);
+  if (!hasActionableProblems(brief)) {
+    return {
+      target: inputPath,
+      provider: options.provider.name,
+      model: options.model,
+      originalRaw: skill.raw,
+      thresholds: {
+        minF1: options.minF1,
+        minAssertPassRate: options.minAssertPassRate
+      },
+      baseline,
+      candidate: null,
+      verification: null,
+      delta: null,
+      applied: false,
+      blockedReason: "No actionable failures, warnings, or mismatches were found to improve."
+    };
+  }
+  options.onStage?.("generate");
+  const rewrite = await requestRewrite(skill, baseline, brief, options.provider, options.model);
+  options.onStage?.("validate");
+  const candidate = await buildCandidate(skill, rewrite);
+  if (candidate.raw === skill.raw) {
+    return {
+      target: inputPath,
+      provider: options.provider.name,
+      model: options.model,
+      originalRaw: skill.raw,
+      thresholds: {
+        minF1: options.minF1,
+        minAssertPassRate: options.minAssertPassRate
+      },
+      baseline,
+      candidate,
+      verification: null,
+      delta: null,
+      applied: false,
+      blockedReason: "Candidate rewrite produced no changes."
+    };
+  }
+  options.onStage?.("verify");
+  const verificationDirectory = await createVerificationDirectory(skill.skillRoot, candidate.raw);
+  let verification;
+  try {
+    verification = normalizeVerificationTarget(
+      await runCheck(verificationDirectory.skillPath, {
+        provider: options.provider,
+        model: options.model,
+        graderModel: options.model,
+        lintFailOn: options.lintFailOn,
+        lintSuppress: options.lintSuppress,
+        lintPlugins: options.lintPlugins,
+        compare: options.compare,
+        numQueries: baseline.trigger.queries.length,
+        triggerSeed: options.triggerSeed,
+        queries: baseline.trigger.queries,
+        evalNumRuns: baseline.eval.prompts.length,
+        prompts: baseline.eval.prompts,
+        evalMaxToolIterations: options.evalMaxToolIterations,
+        concurrency: options.concurrency,
+        minF1: options.minF1,
+        minAssertPassRate: options.minAssertPassRate,
+        continueOnLintFail: true,
+        verbose: options.verbose
+      }),
+      inputPath
+    );
+  } finally {
+    await fs12.rm(verificationDirectory.tempRoot, { recursive: true, force: true });
+  }
+  const delta = buildDelta(baseline, verification);
+  const blockedReason = buildBlockingReason(delta, verification);
+  let applied = false;
+  let outputPath;
+  if (!blockedReason) {
+    if (options.outputPath) {
+      options.onStage?.("write");
+      outputPath = await maybeWriteOutput(options.outputPath, candidate.raw);
+    }
+    if (options.apply) {
+      options.onStage?.("write");
+      await fs12.writeFile(skill.skillFile, candidate.raw, "utf8");
+      applied = true;
+    }
+  }
+  return {
+    target: inputPath,
+    provider: options.provider.name,
+    model: options.model,
+    originalRaw: skill.raw,
+    thresholds: {
+      minF1: options.minF1,
+      minAssertPassRate: options.minAssertPassRate
+    },
+    baseline,
+    candidate,
+    verification,
+    delta,
+    applied,
+    ...outputPath ? { outputPath } : {},
+    ...blockedReason ? { blockedReason } : {}
+  };
+}
+// src/commands/improve.ts
+// Validates the option values read from `command.opts()` for the `improve`
+// command. Every field is optional. `seed` and `concurrency` must be integers
+// (`concurrency` at least 1); `compare` and `plugin` entries must be
+// non-empty strings.
+var improveCliSchema = z12.object({
+  apiKey: z12.string().optional(),
+  queries: z12.string().optional(),
+  compare: z12.array(z12.string().min(1)).optional(),
+  seed: z12.number().int().optional(),
+  prompts: z12.string().optional(),
+  plugin: z12.array(z12.string().min(1)).optional(),
+  concurrency: z12.number().int().min(1).optional(),
+  output: z12.string().optional(),
+  saveResults: z12.string().optional(),
+  apply: z12.boolean().optional(),
+  verbose: z12.boolean().optional()
+});
+// Model id that resolveModel4 replaces with DEFAULT_OPENAI_MODEL4 when the provider is "openai".
+var DEFAULT_ANTHROPIC_MODEL4 = "claude-sonnet-4-5-20250929";
+// Model id resolveModel4 returns in that case.
+var DEFAULT_OPENAI_MODEL4 = "gpt-4.1-mini";
+function collectPluginPaths3(value, previous = []) {
+  return [...previous, value];
+}
+function resolveModel4(provider, model) {
+  if (provider === "openai" && model === DEFAULT_ANTHROPIC_MODEL4) {
+    return DEFAULT_OPENAI_MODEL4;
+  }
+  return model;
+}
+async function handleImproveCommand(targetPath, options, command) {
+  const spinner = options.json || !process.stdout.isTTY ? null : ora4("Preparing improvement run...").start();
+  try {
+    if (spinner) {
+      spinner.text = "Initializing model provider...";
+    }
+    const provider = createProvider(options.provider, options.apiKey);
+    let queries = void 0;
+    if (options.queries) {
+      if (spinner) {
+        spinner.text = "Loading frozen trigger queries...";
+      }
+      queries = await loadTriggerQueriesFile(options.queries);
+    }
+    let prompts = void 0;
+    if (options.prompts) {
+      if (spinner) {
+        spinner.text = "Loading eval prompts...";
+      }
+      prompts = await loadEvalPromptsJson(options.prompts);
+    } else {
+      prompts = await loadConfiguredEvalPrompts(command);
+    }
+    const model = resolveModel4(options.provider, options.model);
+    const result = await runImprove(targetPath, {
+      provider,
+      model,
+      lintFailOn: options.lintFailOn,
+      lintSuppress: options.lintSuppress,
+      lintPlugins: options.lintPlugins,
+      compare: options.compare,
+      numQueries: options.numQueries,
+      triggerSeed: options.triggerSeed,
+      queries,
+      prompts,
+      evalNumRuns: options.numRuns,
+      evalMaxToolIterations: options.maxToolIterations,
+      minF1: options.minF1,
+      minAssertPassRate: options.minAssertPassRate,
+      concurrency: options.concurrency,
+      apply: options.apply,
+      outputPath: options.output,
+      verbose: options.verbose,
+      onStage: (stage) => {
+        if (!spinner) {
+          return;
+        }
+        if (stage === "baseline") {
+          spinner.text = "Running baseline check...";
+        } else if (stage === "generate") {
+          spinner.text = "Generating candidate rewrite...";
+        } else if (stage === "validate") {
+          spinner.text = "Validating candidate rewrite...";
+        } else if (stage === "verify") {
+          spinner.text = "Verifying candidate against frozen test inputs...";
+        } else if (stage === "write") {
+          spinner.text = options.apply ? "Writing improved SKILL.md..." : "Writing candidate output...";
+        }
+      }
+    });
+    if (options.saveResults) {
+      await writeJsonFile(options.saveResults, result);
+    }
+    spinner?.stop();
+    if (options.json) {
+      writeResult(result, true);
+    } else {
+      writeResult(renderImproveReport(result, options.color, options.verbose), false);
+    }
+    process.exitCode = result.blockedReason ? 1 : 0;
+  } catch (error) {
+    spinner?.stop();
+    writeError(error, options.json);
+    process.exitCode = 2;
+  }
+}
+function registerImproveCommand(program) {
+  program.command("improve").description("Rewrite SKILL.md, verify it on frozen test inputs, and optionally apply it.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for baseline, rewrite, and verification runs").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--compare <path...>", "Path(s) to sibling skill directories to include as competitors").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible trigger results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--plugin <path>", "Load a custom lint plugin file", collectPluginPaths3, []).option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", (value) => Number.parseInt(value, 10)).option("--output <path>", "Write the verified candidate SKILL.md to a separate file").option("--save-results <path>", "Save the full improve result JSON").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option(
+    "--min-assert-pass-rate <n>",
+    "Minimum required eval assertion pass rate (0-1)",
+    (value) => Number.parseFloat(value)
+  ).option("--apply", "Apply the verified rewrite to the source SKILL.md").option("--verbose", "Include detailed baseline and verification reports").action(async (targetPath, _commandOptions, command) => {
+    const globalOptions = getGlobalCliOptions(command);
+    const config = getResolvedConfig(command);
+    const parsedCli = improveCliSchema.safeParse(command.opts());
+    if (!parsedCli.success) {
+      writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid improve options."), globalOptions.json);
+      process.exitCode = 2;
+      return;
+    }
+    await handleImproveCommand(
+      targetPath,
+      {
+        ...globalOptions,
+        provider: config.provider,
+        model: config.model,
+        apiKey: parsedCli.data.apiKey,
+        queries: parsedCli.data.queries,
+        compare: config.trigger.compare,
+        numQueries: config.trigger.numQueries,
+        prompts: parsedCli.data.prompts,
+        minF1: config.trigger.threshold,
+        minAssertPassRate: config.eval.threshold,
+        numRuns: config.eval.numRuns,
+        maxToolIterations: config.eval.maxToolIterations,
+        concurrency: config.concurrency,
+        lintFailOn: config.lint.failOn,
+        lintSuppress: config.lint.suppress,
+        lintPlugins: config.lint.plugins,
+        triggerSeed: parsedCli.data.seed ?? config.trigger.seed,
+        output: parsedCli.data.output,
+        saveResults: parsedCli.data.saveResults,
+        apply: Boolean(parsedCli.data.apply),
+        verbose: Boolean(parsedCli.data.verbose)
+      },
+      command
+    );
+  });
+}
+// src/commands/route.ts
+import fs13 from "node:fs/promises";
+import ora5 from "ora";
+import { z as z14 } from "zod";
+// src/core/route-tester.ts
+import path8 from "node:path";
+import { z as z13 } from "zod";
+// Schema for model-generated query lists: an array whose entries are strings of length >= 1.
+var stringArraySchema = z13.array(z13.string().min(1));
+function parseJsonArrayFromModelOutput2(raw) {
+  const trimmed = raw.trim();
+  if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
+    return JSON.parse(trimmed);
+  }
+  const start = trimmed.indexOf("[");
+  const end = trimmed.lastIndexOf("]");
+  if (start >= 0 && end > start) {
+    return JSON.parse(trimmed.slice(start, end + 1));
+  }
+  throw new Error("Model did not return a JSON array.");
+}
+function parseRouteDecision(rawResponse, skillNames) {
+  const normalized = rawResponse.trim().toLowerCase();
+  if (normalized === "none" || normalized.startsWith("none")) {
+    return "none";
+  }
+  for (const skillName of skillNames) {
+    const escaped = skillName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+    const regex = new RegExp(`\\b${escaped}\\b`, "i");
+    if (regex.test(rawResponse)) {
+      return skillName;
+    }
+  }
+  return "unrecognized";
+}
+async function discoverSkillPaths(skillDir) {
+  const allFiles = await listFilesRecursive(skillDir);
+  return allFiles.filter((f) => path8.basename(f) === "SKILL.md");
+}
+async function generatePositiveQueriesForSkill(skill, provider, model, count) {
+  const systemPrompt = [
+    "You generate realistic user queries that should trigger a specific agent skill.",
+    "Return a JSON array of strings only. No markdown, no comments.",
+    "Each string is one realistic user query that clearly belongs to this skill.",
+    "Queries should look like real user requests with enough context to drive a routing decision."
+  ].join(" ");
+  const userPrompt = [
+    `Skill name: ${skill.frontmatter.name}`,
+    `Skill description: ${skill.frontmatter.description}`,
+    `Generate exactly ${count} distinct queries that should trigger this skill.`
+  ].join("\n");
+  const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
+  const parsed = stringArraySchema.safeParse(parseJsonArrayFromModelOutput2(raw));
+  if (!parsed.success) {
+    throw new Error(
+      `Failed to parse generated queries for skill '${skill.frontmatter.name}': ${parsed.error.issues[0]?.message ?? "invalid format"}`
+    );
+  }
+  if (parsed.data.length < count) {
+    throw new Error(
+      `Expected ${count} queries for skill '${skill.frontmatter.name}', got ${parsed.data.length}.`
+    );
+  }
+  return parsed.data.slice(0, count);
+}
+function buildSkillListText(skills) {
+  return skills.map((s) => `- ${s.frontmatter.name}: ${s.frontmatter.description}`).join("\n");
+}
+function buildConfusionMatrix(cases, skillNames, numQueriesPerSkill) {
+  const allActualValues = [...skillNames, "none", "unrecognized"];
+  const matrix = {};
+  for (const target of skillNames) {
+    matrix[target] = {};
+    for (const actual of allActualValues) {
+      matrix[target][actual] = 0;
+    }
+  }
+  for (const c of cases) {
+    const row = matrix[c.targetSkill];
+    if (row) {
+      row[c.actualSkill] = (row[c.actualSkill] ?? 0) + 1;
+    }
+  }
+  const matrixPct = {};
+  const divisor = numQueriesPerSkill > 0 ? numQueriesPerSkill : 1;
+  for (const target of skillNames) {
+    matrixPct[target] = {};
+    for (const actual of allActualValues) {
+      matrixPct[target][actual] = (matrix[target][actual] ?? 0) / divisor;
+    }
+  }
+  return { matrix, matrixPct };
+}
+function computePerSkillMetrics(skillNames, matrix, numQueriesPerSkill) {
+  return skillNames.map((skill) => {
+    const tp = matrix[skill]?.[skill] ?? 0;
+    const fp = skillNames.filter((s) => s !== skill).reduce((sum, other) => sum + (matrix[other]?.[skill] ?? 0), 0);
+    const recall = numQueriesPerSkill === 0 ? 0 : tp / numQueriesPerSkill;
+    const precDenom = tp + fp;
+    const precision = precDenom === 0 ? 0 : tp / precDenom;
+    const f1 = precision + recall === 0 ? 0 : 2 * precision * recall / (precision + recall);
+    return { skill, queriesTotal: numQueriesPerSkill, correct: tp, precision, recall, f1 };
+  });
+}
+function detectConflicts(skillNames, matrixPct, conflictThreshold) {
+  const conflicts = [];
+  for (let i = 0; i < skillNames.length; i++) {
+    for (let j = i + 1; j < skillNames.length; j++) {
+      const skillA = skillNames[i];
+      const skillB = skillNames[j];
+      const bleedAtoB = matrixPct[skillA]?.[skillB] ?? 0;
+      const bleedBtoA = matrixPct[skillB]?.[skillA] ?? 0;
+      if (Math.max(bleedAtoB, bleedBtoA) > conflictThreshold) {
+        conflicts.push({ skillA, skillB, bleedAtoB, bleedBtoA });
+      }
+    }
+  }
+  return conflicts;
+}
+function buildRouteSuggestions(perSkillMetrics, conflicts) {
+  const suggestions = [];
+  for (const metrics of perSkillMetrics) {
+    if (metrics.f1 < 0.7) {
+      suggestions.push(
+        `'${metrics.skill}' has low F1 (${(metrics.f1 * 100).toFixed(1)}%) \u2014 consider clarifying its description and scope boundaries.`
+      );
+    }
+  }
+  for (const conflict of conflicts) {
+    suggestions.push(
+      `'${conflict.skillA}' and '${conflict.skillB}' overlap: ${(conflict.bleedAtoB * 100).toFixed(1)}% of ${conflict.skillA} queries routed to ${conflict.skillB}, ${(conflict.bleedBtoA * 100).toFixed(1)}% the other way \u2014 consider narrowing scope boundaries.`
+    );
+  }
+  if (suggestions.length === 0) {
+    suggestions.push("Routing looks clean. All skills are well-differentiated on this sample.");
+  }
+  return suggestions;
+}
+async function runRouteTest(skillDir, options) {
+  const numQueriesPerSkill = options.numQueriesPerSkill ?? 10;
+  const conflictThreshold = options.conflictThreshold ?? 0.1;
+  const concurrency = options.concurrency ?? 5;
+  const absoluteSkillDir = path8.resolve(skillDir);
+  const skillPaths = await discoverSkillPaths(absoluteSkillDir);
+  if (skillPaths.length < 2) {
+    throw new Error(
+      `Route test requires at least 2 skills. Found ${skillPaths.length} in: ${skillDir}`
+    );
+  }
+  if (skillPaths.length > 20) {
+    process.stderr.write(
+      `Warning: ${skillPaths.length} skills found. This will make ${skillPaths.length * numQueriesPerSkill} routing model calls.
+`
+    );
+  }
+  const skills = await Promise.all(skillPaths.map((p) => parseSkillStrict(p)));
+  const skillNames = skills.map((s) => s.frontmatter.name);
+  const queriesPerSkill = await pMap(
+    skills,
+    (skill) => generatePositiveQueriesForSkill(skill, options.provider, options.model, numQueriesPerSkill),
+    concurrency
+  );
+  const workItems = [];
+  for (let i = 0; i < skills.length; i++) {
+    const skill = skills[i];
+    const queries = queriesPerSkill[i];
+    for (const query of queries) {
+      workItems.push({ query, targetSkill: skill.frontmatter.name });
+    }
+  }
+  const skillListText = buildSkillListText(skills);
+  const systemPrompt = "Select the single best skill for the user's request from the provided list. Respond with only the skill name, or 'none' if nothing fits.";
+  const cases = await pMap(
+    workItems,
+    async ({ query, targetSkill }) => {
+      const userPrompt = `Available skills:
+${skillListText}
+User query: ${query}`;
+      const rawResponse = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
+      const actualSkill = parseRouteDecision(rawResponse, skillNames);
+      return {
+        query,
+        targetSkill,
+        actualSkill,
+        correct: actualSkill === targetSkill,
+        rawModelResponse: options.verbose ? rawResponse : void 0
+      };
+    },
+    concurrency
+  );
+  const { matrix, matrixPct } = buildConfusionMatrix(cases, skillNames, numQueriesPerSkill);
+  const perSkillMetrics = computePerSkillMetrics(skillNames, matrix, numQueriesPerSkill);
+  const conflicts = detectConflicts(skillNames, matrixPct, conflictThreshold);
+  const correctCount = cases.filter((c) => c.correct).length;
+  const overallAccuracy = cases.length === 0 ? 0 : correctCount / cases.length;
+  const suggestions = buildRouteSuggestions(perSkillMetrics, conflicts);
+  return {
+    skillDir: absoluteSkillDir,
+    skills: skillNames,
+    model: options.model,
+    provider: options.provider.name,
+    seed: options.seed,
+    numQueriesPerSkill,
+    cases,
+    matrix,
+    matrixPct,
+    perSkillMetrics,
+    conflicts,
+    suggestions,
+    overallAccuracy
+  };
+}
+// src/commands/route.ts
+// Validates the option values read from `command.opts()` for the `route`
+// command. Every field is optional. `numQueries` and `concurrency` must be
+// integers of at least 1, `seed` must be an integer, and `conflictThreshold`
+// must lie between 0 and 1 inclusive.
+var routeCliSchema = z14.object({
+  numQueries: z14.number().int().min(1).optional(),
+  conflictThreshold: z14.number().min(0).max(1).optional(),
+  saveQueries: z14.string().optional(),
+  seed: z14.number().int().optional(),
+  concurrency: z14.number().int().min(1).optional(),
+  html: z14.string().optional(),
+  verbose: z14.boolean().optional(),
+  apiKey: z14.string().optional()
+});
+// Model id that resolveModel5 replaces with DEFAULT_OPENAI_MODEL5 when the provider is "openai".
+var DEFAULT_ANTHROPIC_MODEL5 = "claude-sonnet-4-5-20250929";
+// Model id resolveModel5 returns in that case.
+var DEFAULT_OPENAI_MODEL5 = "gpt-4.1-mini";
+function resolveModel5(provider, model) {
+  if (provider === "openai" && model === DEFAULT_ANTHROPIC_MODEL5) {
+    return DEFAULT_OPENAI_MODEL5;
+  }
+  return model;
+}
+async function handleRouteCommand(skillDir, options) {
+  const spinner = options.json || !process.stdout.isTTY ? null : ora5("Preparing route evaluation...").start();
+  try {
+    if (spinner) spinner.text = "Initializing model provider...";
+    const provider = createProvider(options.provider, options.apiKey);
+    if (spinner) spinner.text = "Running route simulations...";
+    const model = resolveModel5(options.provider, options.model);
+    const result = await runRouteTest(skillDir, {
+      model,
+      provider,
+      numQueriesPerSkill: options.numQueriesPerSkill,
+      conflictThreshold: options.conflictThreshold,
+      seed: options.seed,
+      concurrency: options.concurrency,
+      verbose: options.verbose
+    });
+    if (options.saveQueries) {
+      await writeJsonFile(
+        options.saveQueries,
+        result.cases.map((c) => ({ query: c.query, targetSkill: c.targetSkill }))
+      );
+    }
+    spinner?.stop();
+    if (options.json) {
+      writeResult(result, true);
+    } else {
+      writeResult(renderRouteReport(result, options.color, options.verbose), false);
+    }
+    if (options.html) {
+      await fs13.writeFile(options.html, renderRouteHtml(result), "utf8");
+    }
+  } catch (error) {
+    spinner?.stop();
+    writeError(error, options.json);
+    process.exitCode = 2;
+  }
+}
+function registerRouteCommand(program) {
+  program.command("route").description("Validate multi-skill routing across all skills in a directory.").argument("<skillDir>", "Directory containing skill subdirectories with SKILL.md files").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--num-queries <n>", "Queries per skill (default: 10)", (value) => Number.parseInt(value, 10)).option("--conflict-threshold <n>", "Bleed fraction to flag as conflict (default: 0.1)", (value) => Number.parseFloat(value)).option("--seed <number>", "RNG seed for reproducibility metadata", (value) => Number.parseInt(value, 10)).option("--concurrency <n>", "Maximum in-flight requests", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--save-queries <path>", "Save generated queries as JSON").option("--api-key <key>", "API key override").option("--verbose", "Show raw model responses").action(async (skillDir, _commandOptions, command) => {
+    const globalOptions = getGlobalCliOptions(command);
+    const config = getResolvedConfig(command);
+    const parsedCli = routeCliSchema.safeParse(command.opts());
+    if (!parsedCli.success) {
+      writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid route options."), globalOptions.json);
+      process.exitCode = 2;
+      return;
+    }
+    await handleRouteCommand(skillDir, {
+      ...globalOptions,
+      model: config.model,
+      provider: config.provider,
+      numQueriesPerSkill: parsedCli.data.numQueries ?? 10,
+      conflictThreshold: parsedCli.data.conflictThreshold ?? 0.1,
+      saveQueries: parsedCli.data.saveQueries,
+      seed: parsedCli.data.seed,
+      concurrency: parsedCli.data.concurrency ?? config.concurrency,
+      html: parsedCli.data.html,
+      verbose: Boolean(parsedCli.data.verbose),
+      apiKey: parsedCli.data.apiKey
+    });
+  });
+}
 // src/index.ts
 function resolveVersion() {
   try {
     const currentFilePath = fileURLToPath(import.meta.url);
-    const packageJsonPath = path7.resolve(path7.dirname(currentFilePath), "..", "package.json");
-    const raw = fs12.readFileSync(packageJsonPath, "utf8");
+    const packageJsonPath = path9.resolve(path9.dirname(currentFilePath), "..", "package.json");
+    const raw = fs14.readFileSync(packageJsonPath, "utf8");
     const parsed = JSON.parse(raw);
     return parsed.version ?? "0.0.0";
   } catch {
@@ -4159,6 +5834,8 @@ async function run(argv) {
   registerTriggerCommand(program);
   registerEvalCommand(program);
   registerCheckCommand(program);
+  registerImproveCommand(program);
+  registerRouteCommand(program);
   try {
     await program.parseAsync(argv);
   } catch (error) {