npm - skilltest - Versions diffs - 0.7.0 → 0.9.0 - Mend

skilltest 0.7.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/dist/index.js CHANGED Viewed

@@ -1,13 +1,13 @@
 #!/usr/bin/env node
 // src/index.ts
-import fs11 from "node:fs";
-import path6 from "node:path";
+import fs13 from "node:fs";
+import path8 from "node:path";
 import { fileURLToPath } from "node:url";
 import { Command } from "commander";
 // src/commands/lint.ts
-import fs6 from "node:fs/promises";
+import fs7 from "node:fs/promises";
 import { z as z6 } from "zod";
 // src/core/skill-parser.ts
@@ -100,7 +100,10 @@ function parseFrontmatter(rawSkill) {
 }
 async function parseSkillStrict(inputPath) {
   const skillContext = await loadSkillFile(inputPath);
-  const parsedFrontmatter = parseFrontmatter(skillContext.raw);
+  return parseSkillDocumentStrict(skillContext.raw, skillContext.skillRoot, skillContext.skillFile);
+}
+function parseSkillDocumentStrict(rawSkill, skillRoot, skillFile) {
+  const parsedFrontmatter = parseFrontmatter(rawSkill);
   if (!parsedFrontmatter.hasFrontmatter) {
     throw new Error("SKILL.md is missing YAML frontmatter.");
   }
@@ -113,9 +116,9 @@ async function parseSkillStrict(inputPath) {
     throw new Error(`Invalid frontmatter field '${issue.path.join(".")}': ${issue.message}`);
   }
   return {
-    skillRoot: skillContext.skillRoot,
-    skillFile: skillContext.skillFile,
-    raw: skillContext.raw,
+    skillRoot,
+    skillFile,
+    raw: rawSkill,
     content: parsedFrontmatter.content,
     frontmatterRaw: parsedFrontmatter.rawFrontmatter,
     frontmatter: validation.data
@@ -972,6 +975,116 @@ function runFrontmatterChecks(context) {
   return issues;
 }
+// src/core/linter/plugin.ts
+import fs4 from "node:fs/promises";
+import path4 from "node:path";
+import { pathToFileURL } from "node:url";
+function normalizeRuleCheckId(checkId) { // Namespaces a plugin rule id; ids that already contain a colon are returned unchanged.
+  return checkId.includes(":") ? checkId : `plugin:${checkId}`; // Bare ids get the plugin: prefix.
+}
+function buildPluginValidationError(filePath, message) { // Builds and returns (does not throw) an Error whose message names the plugin file and the problem.
+  return new Error(`Invalid lint plugin at ${filePath}: ${message}`);
+}
+function validatePluginCandidate(candidate, filePath, exportName) { // Validates one module export as a lint plugin and returns a normalized object with a rules array; throws on the first violation found.
+  if (!candidate || typeof candidate !== "object" || !("rules" in candidate)) { // Rejects falsy values, non-objects, and objects without a rules property.
+    throw buildPluginValidationError(filePath, `${exportName} export must be an object with a rules array.`);
+  }
+  const rules = candidate.rules;
+  if (!Array.isArray(rules)) { // The rules property must be a real array.
+    throw buildPluginValidationError(filePath, `${exportName} export must include a rules array.`);
+  }
+  return {
+    rules: rules.map((rule, index) => { // Each rule is validated in order; a throw here rejects the whole candidate.
+      if (!rule || typeof rule !== "object") {
+        throw buildPluginValidationError(filePath, `rule at index ${index} must be an object.`);
+      }
+      const checkId = rule.checkId;
+      if (typeof checkId !== "string" || checkId.trim() === "") { // Whitespace-only ids are rejected; the value itself is kept untrimmed.
+        throw buildPluginValidationError(filePath, `rule at index ${index} must have a non-empty string checkId.`);
+      }
+      const title = rule.title;
+      if (typeof title !== "string" || title.trim() === "") { // Same non-empty string requirement as checkId.
+        throw buildPluginValidationError(filePath, `rule at index ${index} must have a non-empty string title.`);
+      }
+      const check = rule.check;
+      if (typeof check !== "function") {
+        throw buildPluginValidationError(filePath, `rule '${checkId}' must have a check function.`);
+      }
+      return { // Only these three fields are kept; any other properties on the rule are dropped.
+        checkId: normalizeRuleCheckId(checkId), // Stored in normalized form, see normalizeRuleCheckId.
+        title,
+        check
+      };
+    })
+  };
+}
+async function loadPlugin(filePath) { // Imports a lint plugin module and resolves to the first export (default, then plugin) that passes validation; rejects with an Error otherwise.
+  const absolutePath = path4.resolve(filePath); // All later messages and the import use this resolved path.
+  try {
+    await fs4.access(absolutePath); // Checked up front so an inaccessible file gets its own message instead of a raw import error.
+  } catch { // NOTE(review): every access failure is reported as a missing file, whatever the underlying cause - confirm this is intended.
+    throw new Error(`Failed to load lint plugin at ${absolutePath}: file does not exist.`);
+  }
+  let loadedModule;
+  try {
+    loadedModule = await import(pathToFileURL(absolutePath).href); // The path is converted to a file URL string before the dynamic import.
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error); // Non-Error throwables are stringified.
+    throw new Error(`Failed to load lint plugin at ${absolutePath}: ${message}`);
+  }
+  const validationErrors = [];
+  for (const [exportName, candidate] of [ // Exports are tried in this fixed order.
+    ["default", loadedModule.default],
+    ["plugin", loadedModule.plugin]
+  ]) {
+    if (candidate === void 0) { // An export that is not defined is skipped without recording an error.
+      continue;
+    }
+    try {
+      return validatePluginCandidate(candidate, absolutePath, exportName); // The first export that validates wins.
+    } catch (error) {
+      validationErrors.push(error instanceof Error ? error.message : String(error)); // Remember the failure and try the next export.
+    }
+  }
+  if (validationErrors.length > 0) { // At least one export was defined but none validated.
+    throw new Error(validationErrors.join(" ")); // Messages are joined with a single space.
+  }
+  throw buildPluginValidationError( // Reached only when neither export is defined.
+    absolutePath,
+    "expected a default export or named export 'plugin' containing a rules array."
+  );
+}
+function buildRuleExecutionError(rule, error) { // Converts something thrown while running a plugin rule into a failing lint issue object.
+  const message = error instanceof Error ? error.message : String(error); // Non-Error throwables are stringified.
+  return { // The id suffix below is the rule checkId slugified: non-alphanumeric runs become hyphens, edge hyphens are trimmed, result is lowercased.
+    id: `plugin.load-error.${rule.checkId.replace(/[^A-Za-z0-9]+/g, "-").replace(/^-+|-+$/g, "").toLowerCase()}`,
+    checkId: "plugin:load-error", // The same checkId is used for every rule, whichever one failed.
+    title: "Plugin Rule Error",
+    status: "fail",
+    message: `Plugin rule '${rule.checkId}' failed: ${message}`
+  };
+}
+async function runPluginRules(plugin, context) { // Runs each rule of the plugin in order, one at a time, and resolves to the combined list of issues.
+  const issues = [];
+  for (const rule of plugin.rules) {
+    try {
+      const result = await rule.check(context); // Awaiting lets check be either synchronous or asynchronous.
+      if (!Array.isArray(result)) { // Thrown inside the try so it is reported through the catch below.
+        throw new Error("check function must return an array of lint issues.");
+      }
+      issues.push(
+        ...result.map((issue) => ({
+          ...issue,
+          checkId: rule.checkId // Overrides whatever checkId the returned issue carried.
+        }))
+      );
+    } catch (error) { // A failing rule contributes one error issue; the remaining rules still run.
+      issues.push(buildRuleExecutionError(rule, error));
+    }
+  }
+  return issues;
+}
 // src/core/linter/security.ts
 var DANGEROUS_COMMAND_PATTERNS = [
   {
@@ -1179,8 +1292,8 @@ function runSecurityChecks(context) {
 }
 // src/core/linter/structure.ts
-import fs4 from "node:fs/promises";
-import path4 from "node:path";
+import fs5 from "node:fs/promises";
+import path5 from "node:path";
 function hasTableOfContents(content) {
   if (/^#{1,6}\s+table of contents\b/im.test(content)) {
     return true;
@@ -1221,21 +1334,21 @@ async function runStructureChecks(context) {
       message: `SKILL.md length is ${context.skill.lineCount} lines.`
     });
   }
-  const referencesDir = path4.join(context.skill.skillRoot, "references");
+  const referencesDir = path5.join(context.skill.skillRoot, "references");
   if (await pathExists(referencesDir)) {
     const files = await listFilesRecursive(referencesDir);
     let oversizedWithoutToc = 0;
     for (const file of files) {
-      const raw = await fs4.readFile(file, "utf8");
+      const raw = await fs5.readFile(file, "utf8");
       const lineCount = raw === "" ? 0 : raw.split(/\r?\n/).length;
       if (lineCount > 300 && !hasTableOfContents(raw)) {
         oversizedWithoutToc += 1;
         issues.push({
-          id: `structure.references.toc.${toPosixPath(path4.relative(context.skill.skillRoot, file))}`,
+          id: `structure.references.toc.${toPosixPath(path5.relative(context.skill.skillRoot, file))}`,
           checkId: "structure:toc",
           title: "Reference File Navigation",
           status: "warn",
-          message: `${toPosixPath(path4.relative(context.skill.skillRoot, file))} is ${lineCount} lines and has no table of contents.`,
+          message: `${toPosixPath(path5.relative(context.skill.skillRoot, file))} is ${lineCount} lines and has no table of contents.`,
           suggestion: "Add a table of contents for long reference files."
         });
       }
@@ -1265,7 +1378,7 @@ async function runStructureChecks(context) {
     other: []
   };
   for (const reference of references) {
-    const resolved = path4.resolve(context.skill.skillRoot, reference);
+    const resolved = path5.resolve(context.skill.skillRoot, reference);
     if (!await pathExists(resolved)) {
       const kind = classifyReferencePath(reference);
       missingByType[kind].push(reference);
@@ -1362,6 +1475,10 @@ async function runLinter(inputPath, options = {}) {
   issues.push(...runSecurityChecks(context));
   issues.push(...await runDisclosureChecks(context));
   issues.push(...runCompatibilityChecks(context));
+  for (const pluginPath of options.plugins ?? []) {
+    const plugin = await loadPlugin(pluginPath);
+    issues.push(...await runPluginRules(plugin, context));
+  }
   const filteredIssues = issues.filter((issue) => !suppressedCheckIds.has(issue.checkId));
   return {
     target: inputPath,
@@ -1401,6 +1518,9 @@ function badgeLabel(status) {
 function renderBadge(status) {
   return `<span class="badge ${status}">${badgeLabel(status)}</span>`;
 }
+function renderMetaBadge(label) { // Returns a span with class meta-badge wrapping the label.
+  return `<span class="meta-badge">${escapeHtml(label)}</span>`; // The label is passed through escapeHtml before interpolation.
+}
 function renderStatCards(stats) {
   return `<div class="stats-grid">${stats.map(
     (stat) => `
@@ -1525,10 +1645,10 @@ function renderLintIssueList(report) {
   const info = skippedSecurityPatterns > 0 ? `<p class="info-line">Skipped security patterns in examples/comments: ${escapeHtml(skippedSecurityPatterns)}</p>` : "";
   return `<div class="row-list">${rows}</div>${info}`;
 }
-function renderTriggerCaseRow(testCase) {
+function renderTriggerCaseRow(testCase, showSelectedCompetitor) {
   const details = testCase.rawModelResponse ? renderDetails("Model response", renderPreBlock(testCase.rawModelResponse)) : "";
   return `
-    <div class="row">
+    <div class="row${testCase.selectedCompetitor ? " competitor-selected" : ""}">
       <div class="row-header">
         <div>
           <div class="row-title">${escapeHtml(testCase.query)}</div>
@@ -1540,12 +1660,29 @@ function renderTriggerCaseRow(testCase) {
       </div>
       ${renderDefinitionList([
     { label: "Expected", value: testCase.expected },
-    { label: "Actual", value: testCase.actual }
+    { label: "Actual", value: testCase.actual },
+    ...showSelectedCompetitor ? [{ label: "Selected competitor", value: testCase.selectedCompetitor ?? "none" }] : []
   ])}
       ${details}
     </div>
   `;
 }
+function renderCompetitorSkillsSection(result) { // Renders the Competitor Skills section card for a trigger result.
+  if (!result.competitors || result.competitors.length === 0) { // A missing or empty competitors list renders nothing.
+    return "";
+  }
+  return renderSectionCard( // Body is one renderMessageRow per competitor (warn, name, description, Source definition list), joined with no separator.
+    "Competitor Skills",
+    `<div class="row-list">${result.competitors.map(
+      (competitor) => renderMessageRow(
+        "warn",
+        competitor.name,
+        competitor.description,
+        renderDefinitionList([{ label: "Source", value: competitor.sourcePath }])
+      )
+    ).join("")}</div>`
+  );
+}
 function promptStatus(promptResult) {
   if (promptResult.totalAssertions === 0) {
     return "skip";
@@ -1559,10 +1696,37 @@ function promptStatus(promptResult) {
   return "warn";
 }
 function renderAssertionRow(assertion) {
-  return renderDetails(
-    `${badgeLabel(assertion.passed ? "pass" : "fail")} ${assertion.assertion}`,
-    renderPreBlock(assertion.evidence)
-  );
+  return `
+    <details class="detail-block">
+      <summary>
+        ${renderBadge(assertion.passed ? "pass" : "fail")}
+        ${assertion.source === "tool" ? renderMetaBadge("Tool") : ""}
+        <span>${escapeHtml(assertion.assertion)}</span>
+      </summary>
+      <div class="detail-content">${renderPreBlock(assertion.evidence)}</div>
+    </details>
+  `;
+}
+function renderToolCallsSection(promptResult) { // Renders the Tool Calls section for one prompt result.
+  if (!promptResult.toolCalls || promptResult.toolCalls.length === 0) { // A missing or empty toolCalls list renders nothing.
+    return "";
+  }
+  const toolRows = promptResult.toolCalls.map( // One tool-call card per call: escaped name, turn index, JSON-stringified arguments, and the response under a Mock response details block.
+    (toolCall) => `
+        <div class="tool-call">
+          <div class="row-header">
+            <div>
+              <div class="row-title">${escapeHtml(toolCall.name)}</div>
+              <div class="row-subtitle">${escapeHtml(`turn ${toolCall.turnIndex}`)}</div>
+            </div>
+            ${renderMetaBadge("Tool Call")}
+          </div>
+          ${renderDefinitionList([{ label: "Arguments", value: JSON.stringify(toolCall.arguments) }])}
+          ${renderDetails("Mock response", renderPreBlock(toolCall.response))}
+        </div>
+      `
+  ).join("");
+  return renderDetails("Tool Calls", `<div class="tool-call-list">${toolRows}</div>`); // All cards go into a single renderDetails section titled Tool Calls.
 }
 function renderEvalPromptRow(promptResult) {
   const assertionDetails = promptResult.assertions.map((assertion) => renderAssertionRow(assertion)).join("");
@@ -1581,9 +1745,12 @@ function renderEvalPromptRow(promptResult) {
       <div class="row-body">${escapeHtml(promptResult.responseSummary)}</div>
       ${renderDefinitionList([
     { label: "Passed assertions", value: String(promptResult.passedAssertions) },
-    { label: "Total assertions", value: String(promptResult.totalAssertions) }
+    { label: "Total assertions", value: String(promptResult.totalAssertions) },
+    ...promptResult.toolCalls ? [{ label: "Tool calls", value: String(promptResult.toolCalls.length) }] : [],
+    ...promptResult.loopIterations !== void 0 ? [{ label: "Loop iterations", value: String(promptResult.loopIterations) }] : []
   ])}
       ${renderDetails("Assertion evidence", assertionDetails || `<p>No assertions.</p>`)}
+      ${renderToolCallsSection(promptResult)}
       ${responseDetails}
     </div>
   `;
@@ -1638,6 +1805,7 @@ function renderHtmlDocument(title, body) {
         --pass: #22c55e;
         --warn: #eab308;
         --fail: #ef4444;
+        --competitor: #f97316;
         --skip: #6b7280;
         --shadow: 0 10px 30px rgba(15, 23, 42, 0.08);
       }
@@ -1786,6 +1954,11 @@ function renderHtmlDocument(title, body) {
         background: var(--surface-muted);
       }
+      .row.competitor-selected {
+        border-color: rgba(249, 115, 22, 0.45);
+        background: rgba(249, 115, 22, 0.08);
+      }
       .row-header {
         display: flex;
         justify-content: space-between;
@@ -1844,6 +2017,20 @@ function renderHtmlDocument(title, body) {
         background: rgba(107, 114, 128, 0.14);
       }
+      .meta-badge {
+        display: inline-flex;
+        align-items: center;
+        justify-content: center;
+        padding: 3px 10px;
+        border-radius: 999px;
+        border: 1px solid rgba(17, 24, 39, 0.16);
+        background: rgba(17, 24, 39, 0.06);
+        color: var(--text);
+        font-size: 0.76rem;
+        font-weight: 700;
+        white-space: nowrap;
+      }
       details {
         margin-top: 10px;
       }
@@ -1858,6 +2045,13 @@ function renderHtmlDocument(title, body) {
         padding-top: 10px;
       }
+      .detail-block summary {
+        display: flex;
+        align-items: center;
+        gap: 8px;
+        flex-wrap: wrap;
+      }
       .detail-content p {
         margin: 0;
       }
@@ -1908,6 +2102,18 @@ function renderHtmlDocument(title, body) {
         overflow-wrap: anywhere;
       }
+      .tool-call-list {
+        display: grid;
+        gap: 12px;
+      }
+      .tool-call {
+        border: 1px solid var(--border);
+        border-radius: 12px;
+        padding: 14px;
+        background: #fffaf0;
+      }
       ul {
         margin: 0;
         padding-left: 20px;
@@ -1965,6 +2171,7 @@ function renderTriggerHtml(result) {
   const target = resolveOptionalTarget(htmlResult, result.skillName);
   const matchedCount = result.cases.filter((testCase) => testCase.matched).length;
   const matchRate = result.cases.length === 0 ? 0 : matchedCount / result.cases.length;
+  const hasCompetitors = Boolean(result.competitors && result.competitors.length > 0);
   const body = [
     renderHeaderCard(
       "trigger",
@@ -1980,10 +2187,15 @@ function renderTriggerHtml(result) {
         { label: "Provider", value: result.provider },
         { label: "Model", value: result.model },
         { label: "Seed", value: result.seed !== void 0 ? String(result.seed) : "none" },
+        ...hasCompetitors ? [{ label: "Competitors", value: String(result.competitors?.length ?? 0) }] : [],
         { label: "Queries", value: String(result.queries.length) }
       ]
     ),
-    renderSectionCard("Trigger Cases", `<div class="row-list">${result.cases.map((testCase) => renderTriggerCaseRow(testCase)).join("")}</div>`),
+    renderCompetitorSkillsSection(result),
+    renderSectionCard(
+      "Trigger Cases",
+      `<div class="row-list">${result.cases.map((testCase) => renderTriggerCaseRow(testCase, hasCompetitors)).join("")}</div>`
+    ),
     renderSectionCard(
       "Suggestions",
       `<ul>${result.suggestions.map((suggestion) => `<li>${escapeHtml(suggestion)}</li>`).join("")}</ul>`
@@ -2023,7 +2235,8 @@ function renderEvalHtml(result) {
 }
 function renderCheckHtml(result) {
   const skillName = result.trigger?.skillName ?? result.eval?.skillName ?? result.target;
-  const triggerBody = result.trigger ? `<div class="row-list">${result.trigger.cases.map((testCase) => renderTriggerCaseRow(testCase)).join("")}</div>
+  const triggerBody = result.trigger ? `${renderCompetitorSkillsSection(result.trigger)}
+       <div class="row-list">${result.trigger.cases.map((testCase) => renderTriggerCaseRow(testCase, Boolean(result.trigger?.competitors?.length))).join("")}</div>
        <div class="card" style="margin-top: 16px;">
          <h2>Trigger Suggestions</h2>
          <ul>${result.trigger.suggestions.map((suggestion) => `<li>${escapeHtml(suggestion)}</li>`).join("")}</ul>
@@ -2123,46 +2336,111 @@ function countSkippedSecurityPatterns2(issues) {
     return total + (issue.skippedPatterns?.length ?? 0);
   }, 0);
 }
+function formatPercent2(value) { // Formats a ratio as a percentage string with one decimal place, e.g. 0.5 becomes 50.0%.
+  return `${(value * 100).toFixed(1)}%`;
+}
+function formatSignedNumber(value, digits = 4) { // Formats a number with a fixed count of decimals (default 4) and an explicit plus sign for positive values.
+  const prefix = value > 0 ? "+" : ""; // Zero and negatives get no prefix; any minus sign comes from toFixed.
+  return `${prefix}${value.toFixed(digits)}`;
+}
+function diffChangedLines(beforeText, afterText) { // Line-level diff of two texts; returns an ordered list of objects with a type (minus for removed, plus for added) and the line. Unchanged lines are omitted.
+  const beforeLines = beforeText.split(/\r?\n/); // Both LF and CRLF line endings are accepted.
+  const afterLines = afterText.split(/\r?\n/);
+  const dp = Array.from({ length: beforeLines.length + 1 }, () => Array(afterLines.length + 1).fill(0)); // Zero-filled table with one extra row and column; each cell ends up holding the longest-common-subsequence length of the two remaining suffixes.
+  for (let beforeIndex2 = beforeLines.length - 1; beforeIndex2 >= 0; beforeIndex2 -= 1) { // Filled from the end backwards so each cell only reads cells already computed.
+    for (let afterIndex2 = afterLines.length - 1; afterIndex2 >= 0; afterIndex2 -= 1) {
+      if (beforeLines[beforeIndex2] === afterLines[afterIndex2]) {
+        dp[beforeIndex2][afterIndex2] = 1 + (dp[beforeIndex2 + 1][afterIndex2 + 1] ?? 0); // Equal lines extend the common subsequence by one.
+      } else {
+        dp[beforeIndex2][afterIndex2] = Math.max(dp[beforeIndex2 + 1][afterIndex2] ?? 0, dp[beforeIndex2][afterIndex2 + 1] ?? 0); // Otherwise keep the better of skipping a line on either side.
+      }
+    }
+  }
+  const changedLines = [];
+  let beforeIndex = 0;
+  let afterIndex = 0;
+  while (beforeIndex < beforeLines.length && afterIndex < afterLines.length) { // Forward walk over both texts, guided by the table.
+    if (beforeLines[beforeIndex] === afterLines[afterIndex]) { // Common line: advance both sides and emit nothing.
+      beforeIndex += 1;
+      afterIndex += 1;
+      continue;
+    }
+    const skipBefore = dp[beforeIndex + 1][afterIndex] ?? 0;
+    const skipAfter = dp[beforeIndex][afterIndex + 1] ?? 0;
+    if (skipBefore >= skipAfter) { // On a tie the removal is emitted before the addition.
+      changedLines.push({ type: "-", line: beforeLines[beforeIndex] ?? "" });
+      beforeIndex += 1;
+    } else {
+      changedLines.push({ type: "+", line: afterLines[afterIndex] ?? "" });
+      afterIndex += 1;
+    }
+  }
+  while (beforeIndex < beforeLines.length) { // Whatever is left on the before side was removed.
+    changedLines.push({ type: "-", line: beforeLines[beforeIndex] ?? "" });
+    beforeIndex += 1;
+  }
+  while (afterIndex < afterLines.length) { // Whatever is left on the after side was added.
+    changedLines.push({ type: "+", line: afterLines[afterIndex] ?? "" });
+    afterIndex += 1;
+  }
+  return changedLines;
+}
+function renderDiffPreview(beforeText, afterText, maxLines = 40) { // Returns an array of display lines previewing the changed lines between two texts, capped at maxLines entries (default 40).
+  const changedLines = diffChangedLines(beforeText, afterText);
+  if (changedLines.length === 0) { // Identical line content yields a single placeholder line.
+    return ["  (no content changes)"];
+  }
+  const previewLines = changedLines.slice(0, maxLines).map((entry) => `  ${entry.type} ${entry.line}`); // Each entry becomes: two-space indent, type marker, space, line text.
+  if (changedLines.length > maxLines) { // When truncated, a final line reports how many changed lines were left out.
+    previewLines.push(`  ... ${changedLines.length - maxLines} more changed line(s)`);
+  }
+  return previewLines;
+}
+function summarizeToolCalls(toolCalls) { // Counts calls per tool name and returns a comma-separated summary of name and count pairs; an empty list gives an empty string.
+  const counts = /* @__PURE__ */ new Map(); // Keyed by tool name; a Map iterates in insertion order, so names appear in order of first use.
+  for (const toolCall of toolCalls) {
+    counts.set(toolCall.name, (counts.get(toolCall.name) ?? 0) + 1); // A name not seen before starts from zero.
+  }
+  return Array.from(counts.entries()).map(([name, count]) => `${name} x${count}`).join(", ");
+}
 function renderLintReport(report, enableColor) {
   const c = getChalkInstance(enableColor);
   const { passed, warnings, failures, total } = report.summary;
   const headerLines = [
-    `\u250C\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510`,
-    `\u2502 skilltest lint                                                \u2502`,
-    `\u251C\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524`,
-    `\u2502 target: ${report.target}`,
-    `\u2502 summary: ${passed}/${total} checks passed, ${warnings} warnings, ${failures} failures`,
-    `\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518`
+    "skilltest lint",
+    `target: ${report.target}`,
+    `summary: ${passed}/${total} checks passed, ${warnings} warnings, ${failures} failures`
   ];
   const renderedIssues = report.issues.map((issue) => renderIssueLine(issue, c)).join("\n");
   const skippedSecurityPatterns = countSkippedSecurityPatterns2(report.issues);
   const infoLine = skippedSecurityPatterns > 0 ? `
-  ${c.cyan("\u2139")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)` : "";
+  ${c.cyan("INFO")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)` : "";
   return `${headerLines.join("\n")}
 ${renderedIssues}${infoLine}`;
 }
-function formatPercent2(value) {
-  return `${(value * 100).toFixed(1)}%`;
-}
 function renderTriggerReport(result, enableColor, verbose) {
   const c = getChalkInstance(enableColor);
-  const lines = [];
-  lines.push("\u250C\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510");
-  lines.push("\u2502 skilltest trigger                                             \u2502");
-  lines.push("\u251C\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524");
-  lines.push(`\u2502 skill: ${result.skillName}`);
-  lines.push(`\u2502 provider/model: ${result.provider}/${result.model}`);
+  const lines = [
+    "skilltest trigger",
+    `skill: ${result.skillName}`,
+    `provider/model: ${result.provider}/${result.model}`
+  ];
+  if (result.competitors && result.competitors.length > 0) {
+    lines.push(`competitors: ${result.competitors.map((competitor) => competitor.name).join(", ")}`);
+  }
   lines.push(
-    `\u2502 precision: ${formatPercent2(result.metrics.precision)}  recall: ${formatPercent2(result.metrics.recall)}  f1: ${formatPercent2(result.metrics.f1)}`
+    `precision: ${formatPercent2(result.metrics.precision)}  recall: ${formatPercent2(result.metrics.recall)}  f1: ${formatPercent2(result.metrics.f1)}`
   );
   lines.push(
-    `\u2502 TP ${result.metrics.truePositives}  TN ${result.metrics.trueNegatives}  FP ${result.metrics.falsePositives}  FN ${result.metrics.falseNegatives}`
+    `TP ${result.metrics.truePositives}  TN ${result.metrics.trueNegatives}  FP ${result.metrics.falsePositives}  FN ${result.metrics.falseNegatives}`
   );
-  lines.push("\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518");
   for (const [index, testCase] of result.cases.entries()) {
     const status = testCase.matched ? c.green("PASS") : c.red("FAIL");
     lines.push(`${index + 1}. ${status} query: ${testCase.query}`);
     lines.push(`   expected: ${testCase.expected} | actual: ${testCase.actual}`);
+    if (verbose && testCase.selectedCompetitor) {
+      lines.push(`   competitor selected: ${testCase.selectedCompetitor}`);
+    }
     if (verbose && testCase.rawModelResponse) {
       lines.push(`   model: ${testCase.rawModelResponse.replace(/\s+/g, " ").trim()}`);
     }
@@ -2175,24 +2453,35 @@ function renderTriggerReport(result, enableColor, verbose) {
 }
 function renderEvalReport(result, enableColor, verbose) {
   const c = getChalkInstance(enableColor);
-  const lines = [];
-  lines.push("\u250C\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510");
-  lines.push("\u2502 skilltest eval                                                \u2502");
-  lines.push("\u251C\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524");
-  lines.push(`\u2502 skill: ${result.skillName}`);
-  lines.push(`\u2502 provider/model: ${result.provider}/${result.model}`);
-  lines.push(`\u2502 grader model: ${result.graderModel}`);
-  lines.push(`\u2502 assertions passed: ${result.summary.passedAssertions}/${result.summary.totalAssertions}`);
-  lines.push("\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518");
+  const lines = [
+    "skilltest eval",
+    `skill: ${result.skillName}`,
+    `provider/model: ${result.provider}/${result.model}`,
+    `grader model: ${result.graderModel}`,
+    `assertions passed: ${result.summary.passedAssertions}/${result.summary.totalAssertions}`
+  ];
   for (const [index, promptResult] of result.results.entries()) {
     lines.push(`${index + 1}. prompt: ${promptResult.prompt}`);
     lines.push(`   response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`);
+    if (promptResult.toolCalls) {
+      lines.push(`   Tools: ${promptResult.toolCalls.length} calls (${summarizeToolCalls(promptResult.toolCalls)})`);
+      if (promptResult.loopIterations !== void 0) {
+        lines.push(`   loop iterations: ${promptResult.loopIterations}`);
+      }
+    }
     for (const assertion of promptResult.assertions) {
       const status = assertion.passed ? c.green("PASS") : c.red("FAIL");
       lines.push(`   ${status} ${assertion.assertion}`);
       lines.push(`      evidence: ${assertion.evidence}`);
     }
     if (verbose) {
+      if (promptResult.toolCalls) {
+        for (const toolCall of promptResult.toolCalls) {
+          lines.push(`   tool ${toolCall.turnIndex}: ${toolCall.name}`);
+          lines.push(`      arguments: ${JSON.stringify(toolCall.arguments)}`);
+          lines.push(`      response: ${toolCall.response}`);
+        }
+      }
       lines.push(`   full response: ${promptResult.response}`);
     }
   }
@@ -2229,7 +2518,7 @@ function renderCheckReport(result, enableColor, verbose) {
   }
   const skippedSecurityPatterns = countSkippedSecurityPatterns2(result.lint.issues);
   if (skippedSecurityPatterns > 0) {
-    lines.push(`  ${c.cyan("\u2139")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)`);
+    lines.push(`  ${c.cyan("INFO")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)`);
   }
   lines.push("");
   lines.push("Trigger");
@@ -2240,11 +2529,17 @@ function renderCheckReport(result, enableColor, verbose) {
     lines.push(
       `  TP ${result.trigger.metrics.truePositives} TN ${result.trigger.metrics.trueNegatives} FP ${result.trigger.metrics.falsePositives} FN ${result.trigger.metrics.falseNegatives}`
     );
+    if (result.trigger.competitors && result.trigger.competitors.length > 0) {
+      lines.push(`  competitors: ${result.trigger.competitors.map((competitor) => competitor.name).join(", ")}`);
+    }
     const triggerCases = verbose ? result.trigger.cases : result.trigger.cases.filter((testCase) => !testCase.matched);
     for (const testCase of triggerCases) {
       const status = testCase.matched ? c.green("PASS") : c.red("FAIL");
       lines.push(`  - ${status} ${testCase.query}`);
       lines.push(`    expected=${testCase.expected} actual=${testCase.actual}`);
+      if (testCase.selectedCompetitor) {
+        lines.push(`    competitor selected=${testCase.selectedCompetitor}`);
+      }
     }
   } else {
     lines.push(`- ${triggerGate} ${result.triggerSkippedReason ?? "Skipped."}`);
@@ -2263,6 +2558,12 @@ function renderCheckReport(result, enableColor, verbose) {
       }
       lines.push(`  - prompt: ${promptResult.prompt}`);
       lines.push(`    response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`);
+      if (promptResult.toolCalls) {
+        lines.push(`    Tools: ${promptResult.toolCalls.length} calls (${summarizeToolCalls(promptResult.toolCalls)})`);
+        if (promptResult.loopIterations !== void 0) {
+          lines.push(`    loop iterations: ${promptResult.loopIterations}`);
+        }
+      }
       const assertionsToRender = verbose ? promptResult.assertions : failedAssertions;
       for (const assertion of assertionsToRender) {
         const assertionStatus = assertion.passed ? c.green("PASS") : c.red("FAIL");
@@ -2270,6 +2571,13 @@ function renderCheckReport(result, enableColor, verbose) {
         lines.push(`      evidence: ${assertion.evidence}`);
       }
       if (verbose) {
+        if (promptResult.toolCalls) {
+          for (const toolCall of promptResult.toolCalls) {
+            lines.push(`    tool ${toolCall.turnIndex}: ${toolCall.name}`);
+            lines.push(`      arguments: ${JSON.stringify(toolCall.arguments)}`);
+            lines.push(`      response: ${toolCall.response}`);
+          }
+        }
         lines.push(`    full response: ${promptResult.response}`);
       }
     }
@@ -2284,9 +2592,76 @@ function renderCheckReport(result, enableColor, verbose) {
   lines.push(`- overall: ${overallGate}`);
   return lines.join("\n");
 }
+/**
+ * Build the plain-text report for `skilltest improve`.
+ *
+ * Sections appear in a fixed order: header and status, the candidate's change
+ * summary and targeted problems, before/after metrics (only when both a delta
+ * and a verification run are present), the output path, the blocked reason,
+ * a diff preview of the candidate and, in verbose mode, the full baseline and
+ * verification check reports.
+ *
+ * @param result Improve result to render.
+ * @param enableColor Passed to getChalkInstance and to the nested check reports.
+ * @param verbose When true, appends the verbose baseline/verification reports.
+ * @returns The report lines joined with "\n".
+ */
+function renderImproveReport(result, enableColor, verbose = false) {
+  const c = getChalkInstance(enableColor);
+  const report = [];
+  const emit = (...entries) => {
+    report.push(...entries);
+  };
+  const emitBullets = (items) => {
+    for (const item of items) {
+      emit(`- ${item}`);
+    }
+  };
+  const gate = (passed) => passed ? c.green("PASS") : c.red("FAIL");
+  // Lint metrics are raw counts; the signed delta is formatted with 0 as second argument.
+  const countLine = (label, metric) => `- ${label}: ${metric.before} -> ${metric.after} (${formatSignedNumber(metric.delta, 0)})`;
+  // Rate metrics go through formatPercent2; the signed delta uses the formatter's default.
+  const rateLine = (label, metric) => `- ${label}: ${formatPercent2(metric.before)} -> ${formatPercent2(metric.after)} (${formatSignedNumber(metric.delta)})`;
+  const { thresholds, candidate, delta, verification, blockedReason } = result;
+  emit(
+    "skilltest improve",
+    `target: ${result.target}`,
+    `provider/model: ${result.provider}/${result.model}`,
+    `thresholds: min-f1=${thresholds.minF1.toFixed(2)} min-assert-pass-rate=${thresholds.minAssertPassRate.toFixed(2)}`
+  );
+  let statusLabel;
+  if (blockedReason) {
+    statusLabel = c.red("BLOCKED");
+  } else if (result.applied) {
+    statusLabel = c.green("APPLIED");
+  } else {
+    statusLabel = c.green("VERIFIED");
+  }
+  emit(`status: ${statusLabel}`);
+  if (candidate) {
+    emit("", "Change Summary");
+    emitBullets(candidate.changeSummary);
+    emit("", "Targeted Problems");
+    emitBullets(candidate.targetedProblems);
+  }
+  if (delta && verification) {
+    emit(
+      "",
+      "Before / After",
+      countLine("lint failures", delta.lintFailures),
+      countLine("lint warnings", delta.lintWarnings),
+      rateLine("trigger f1", delta.triggerF1),
+      rateLine("eval assertion pass rate", delta.evalAssertPassRate),
+      `- overall gate: ${gate(delta.overallPassed.before)} -> ${gate(delta.overallPassed.after)}`
+    );
+  }
+  if (result.outputPath) {
+    emit("", `output: ${result.outputPath}`);
+  }
+  if (blockedReason) {
+    emit("", "Blocked", `- ${blockedReason}`);
+  }
+  if (candidate) {
+    emit("", "Diff Preview", ...renderDiffPreview(result.originalRaw, candidate.raw));
+  }
+  if (verbose) {
+    emit("", "Baseline", renderCheckReport(result.baseline, enableColor, true));
+    if (verification) {
+      emit("", "Verification", renderCheckReport(verification, enableColor, true));
+    }
+  }
+  return report.join("\n");
+}
 // src/commands/common.ts
-import fs5 from "node:fs/promises";
+import fs6 from "node:fs/promises";
 import { z as z5 } from "zod";
 // src/core/eval-runner.ts
@@ -2355,7 +2730,10 @@ function parseGraderOutput(raw) {
 async function gradeResponse(options) {
   const prompts = buildGraderPrompts(options);
   const raw = await options.provider.sendMessage(prompts.systemPrompt, prompts.userPrompt, { model: options.model });
-  return parseGraderOutput(raw);
+  return parseGraderOutput(raw).map((assertion) => ({
+    ...assertion,
+    source: "grader"
+  }));
 }
 // src/utils/concurrency.ts
@@ -2410,12 +2788,290 @@ async function pMap(items, fn, concurrency) {
   });
 }
+// src/core/tool-environment.ts
+/**
+ * True for any non-null, non-array value whose typeof is "object".
+ */
+function isPlainObject(value) {
+  return value !== null && typeof value === "object" && !Array.isArray(value);
+}
+/**
+ * Structural equality for JSON-like values.
+ *
+ * Arrays are compared element by element (same length, same order). Objects
+ * are compared by their own enumerable keys: both sides must have the same
+ * number of keys, every key of `left` must also be an own key of `right`, and
+ * the values must be deeply equal. Everything else falls back to `===`.
+ *
+ * @param left First value.
+ * @param right Second value.
+ * @returns true when the two values are structurally equal.
+ */
+function deepEqual(left, right) {
+  if (Array.isArray(left) && Array.isArray(right)) {
+    return left.length === right.length && left.every((item, index) => deepEqual(item, right[index]));
+  }
+  if (isPlainObject(left) && isPlainObject(right)) {
+    const leftKeys = Object.keys(left);
+    if (leftKeys.length !== Object.keys(right).length) {
+      return false;
+    }
+    // Require the key to exist on `right`; otherwise a missing key would read
+    // as undefined and compare equal to an explicit undefined on `left`
+    // (e.g. { a: undefined } vs { b: undefined }).
+    return leftKeys.every(
+      (key) => Object.prototype.hasOwnProperty.call(right, key) && deepEqual(left[key], right[key])
+    );
+  }
+  return left === right;
+}
+/**
+ * Check whether `actual` satisfies the `expected` pattern.
+ *
+ * - An expected array requires an actual array of the same length whose
+ *   elements match position by position.
+ * - An expected object requires an actual object; only the keys present in
+ *   `expected` are checked, so extra keys on `actual` are allowed.
+ * - Any other expected value is compared with deepEqual.
+ */
+function matchesArgumentSubset(actual, expected) {
+  if (Array.isArray(expected)) {
+    const sameShape = Array.isArray(actual) && actual.length === expected.length;
+    return sameShape && expected.every((value, index) => matchesArgumentSubset(actual[index], value));
+  }
+  if (!isPlainObject(expected)) {
+    return deepEqual(actual, expected);
+  }
+  if (!isPlainObject(actual)) {
+    return false;
+  }
+  for (const [key, value] of Object.entries(expected)) {
+    if (!matchesArgumentSubset(actual[key], value)) {
+      return false;
+    }
+  }
+  return true;
+}
+/**
+ * Interpret a mock-response key as a JSON object pattern.
+ *
+ * @param pattern Key from a tool's `responses` map.
+ * @returns The parsed object, or null when the key is the "*" wildcard, is not
+ *   valid JSON, or parses to something other than an object.
+ */
+function parseResponsePattern(pattern) {
+  if (pattern === "*") {
+    return null;
+  }
+  let candidate;
+  try {
+    candidate = JSON.parse(pattern);
+  } catch {
+    // Keys that are not JSON simply cannot act as partial-match patterns.
+    return null;
+  }
+  if (!isPlainObject(candidate)) {
+    return null;
+  }
+  return candidate;
+}
+/**
+ * Message returned to the model when a mock tool has no configured response
+ * for the given arguments.
+ */
+function renderFallbackResponse(tool, args) {
+  const toolName = tool.name;
+  const serializedArgs = JSON.stringify(args);
+  return `[mock] No mock response configured for tool '${toolName}' with arguments: ${serializedArgs}`;
+}
+/**
+ * Pick the mock response for a tool call.
+ *
+ * Resolution order:
+ *   1. a key equal to JSON.stringify(args);
+ *   2. the JSON object pattern that `args` satisfies with the most top-level
+ *      keys (the first such pattern wins ties);
+ *   3. the "*" wildcard entry;
+ *   4. a generated "[mock] No mock response configured..." message.
+ */
+function resolveToolResponse(tool, args) {
+  const { responses } = tool;
+  const exact = responses[JSON.stringify(args)];
+  if (exact !== void 0) {
+    return exact;
+  }
+  const partialMatches = [];
+  for (const [pattern, response] of Object.entries(responses)) {
+    const parsedPattern = pattern === "*" ? null : parseResponsePattern(pattern);
+    if (parsedPattern && matchesArgumentSubset(args, parsedPattern)) {
+      partialMatches.push({ specificity: Object.keys(parsedPattern).length, response });
+    }
+  }
+  if (partialMatches.length > 0) {
+    // Strictly-greater comparison keeps the earliest pattern on equal specificity.
+    const best = partialMatches.reduce(
+      (top, candidate) => candidate.specificity > top.specificity ? candidate : top,
+      partialMatches[0]
+    );
+    return best.response;
+  }
+  const wildcard = responses["*"];
+  return wildcard !== void 0 ? wildcard : renderFallbackResponse(tool, args);
+}
+/**
+ * Convert mock tool definitions into the provider-facing shape: name,
+ * description and an object-typed parameter schema with `properties` and
+ * `required` derived from the declared parameters.
+ */
+function toProviderToolDefinitions(mockTools) {
+  const toProperty = ({ name, type, description }) => [name, { type, description }];
+  const toSchema = (declared) => ({
+    type: "object",
+    properties: Object.fromEntries(declared.map(toProperty)),
+    required: declared.filter((parameter) => parameter.required).map((parameter) => parameter.name)
+  });
+  return mockTools.map(({ name, description, parameters }) => ({
+    name,
+    description,
+    // Tools without declared parameters get an empty object schema.
+    parameters: toSchema(parameters ?? [])
+  }));
+}
+/**
+ * Turn a provider response into the assistant message(s) to append to the
+ * conversation: an optional text block followed by one tool_use block per
+ * requested tool call. Returns an empty array when there is nothing to record.
+ */
+function toAssistantConversationBlocks(response) {
+  const hasText = response.textContent.trim().length > 0;
+  const textBlocks = hasText ? [{ type: "text", text: response.textContent }] : [];
+  const toolUseBlocks = response.toolUseBlocks.map((block) => ({
+    type: "tool_use",
+    id: block.id,
+    name: block.name,
+    input: block.arguments
+  }));
+  const content = [...textBlocks, ...toolUseBlocks];
+  if (content.length === 0) {
+    return [];
+  }
+  return [{ role: "assistant", content }];
+}
+/**
+ * Drive a provider through a mock tool-use conversation.
+ *
+ * Each iteration sends the conversation to `options.provider.sendWithTools`.
+ * When the reply requests no tools the loop ends and the latest non-blank text
+ * is returned. Otherwise every requested tool call is answered from the mock
+ * definitions, recorded, and fed back as a user message containing
+ * tool_result blocks. After `options.maxIterations` iterations (default 10)
+ * the loop stops and a termination note is appended to the response.
+ *
+ * @returns {Promise<{finalResponse: string, toolCalls: Array, loopIterations: number}>}
+ */
+async function runWithTools(options) {
+  const maxIterations = options.maxIterations ?? 10;
+  const toolsByName = new Map(options.tools.map((tool) => [tool.name, tool]));
+  const providerTools = toProviderToolDefinitions(options.tools);
+  const messages = [{ role: "user", content: options.userMessage }];
+  const toolCalls = [];
+  let finalResponse = "";
+  let loopIterations = 0;
+  const answerToolUse = (toolUse) => {
+    const tool = toolsByName.get(toolUse.name);
+    return tool ? resolveToolResponse(tool, toolUse.arguments) : `[mock] No tool named '${toolUse.name}' is registered.`;
+  };
+  while (loopIterations < maxIterations) {
+    loopIterations += 1;
+    const response = await options.provider.sendWithTools(options.systemPrompt, messages, {
+      model: options.model,
+      tools: providerTools
+    });
+    // Keep the most recent non-blank text; blank turns do not overwrite it.
+    if (response.textContent.trim().length > 0) {
+      finalResponse = response.textContent;
+    }
+    if (response.toolUseBlocks.length === 0) {
+      return { finalResponse, toolCalls, loopIterations };
+    }
+    messages.push(...toAssistantConversationBlocks(response));
+    const turnRecords = response.toolUseBlocks.map((toolUse) => ({
+      toolUseId: toolUse.id,
+      call: {
+        name: toolUse.name,
+        arguments: toolUse.arguments,
+        response: answerToolUse(toolUse),
+        turnIndex: loopIterations
+      }
+    }));
+    toolCalls.push(...turnRecords.map((record) => record.call));
+    messages.push({
+      role: "user",
+      content: turnRecords.map((record) => ({
+        type: "tool_result",
+        tool_use_id: record.toolUseId,
+        content: record.call.response
+      }))
+    });
+  }
+  // Iteration budget exhausted while the model was still requesting tools.
+  const terminationNote = `[skilltest: tool loop terminated after ${maxIterations} iterations]`;
+  finalResponse = finalResponse ? `${finalResponse}\n${terminationNote}` : terminationNote;
+  return { finalResponse, toolCalls, loopIterations };
+}
 // src/core/eval-runner.ts
+var toolParameterSchema = z3.object({ // Shape of one declared parameter of a mock tool.
+  name: z3.string().min(1), // non-empty parameter name
+  type: z3.enum(["string", "number", "boolean", "object", "array"]), // must be one of these five type names
+  description: z3.string().min(1), // non-empty description
+  required: z3.boolean().optional() // optional flag; may be omitted
+});
+var mockToolDefinitionSchema = z3.object({ // Shape of a mock tool that an eval prompt can declare under `tools`.
+  name: z3.string().min(1), // non-empty tool name
+  description: z3.string().min(1), // non-empty description
+  parameters: z3.array(toolParameterSchema).optional(), // optional list of declared parameters
+  responses: z3.record(z3.string()) // string-valued map; resolveToolResponse looks up serialized arguments, JSON object patterns and "*"
+});
+// Structural assertion about the tool calls recorded for an eval prompt.
+// Which optional field is mandatory depends on `type`, so those rules are
+// checked in superRefine rather than in the object shape itself.
+var toolAssertionSchema = z3.object({
+  type: z3.enum(["tool_called", "tool_not_called", "tool_call_order", "tool_argument_match"]),
+  toolName: z3.string().min(1).optional(),
+  toolNames: z3.array(z3.string().min(1)).optional(),
+  expectedArgs: z3.record(z3.unknown()).optional(),
+  description: z3.string().min(1)
+}).superRefine((value, context) => {
+  const { type, toolName, toolNames, expectedArgs } = value;
+  const reportIssue = (message) => {
+    context.addIssue({
+      code: z3.ZodIssueCode.custom,
+      message
+    });
+  };
+  const typesNeedingToolName = ["tool_called", "tool_not_called", "tool_argument_match"];
+  if (typesNeedingToolName.includes(type) && !toolName) {
+    reportIssue(`${type} requires toolName.`);
+  }
+  const hasToolNames = Boolean(toolNames && toolNames.length > 0);
+  if (type === "tool_call_order" && !hasToolNames) {
+    reportIssue("tool_call_order requires toolNames.");
+  }
+  if (type === "tool_argument_match" && !expectedArgs) {
+    reportIssue("tool_argument_match requires expectedArgs.");
+  }
+});
 var evalPromptSchema = z3.object({
   prompt: z3.string().min(1),
-  assertions: z3.array(z3.string().min(1)).optional()
+  assertions: z3.array(z3.string().min(1)).optional(),
+  tools: z3.array(mockToolDefinitionSchema).optional(),
+  toolAssertions: z3.array(toolAssertionSchema).optional()
 });
 var evalPromptArraySchema = z3.array(evalPromptSchema);
+/**
+ * Render a list of tool names as "[a, b, c]" for evidence messages.
+ */
+function formatExpectedOrder(toolNames) {
+  const joined = toolNames.join(", ");
+  return `[${joined}]`;
+}
+/**
+ * Render the observed call order as "[a, b]", keeping only calls to tools
+ * named in `toolNames` and preserving the order in which they were made.
+ */
+function formatActualOrder(toolCalls, toolNames) {
+  const relevantNames = new Set(toolNames);
+  const observed = [];
+  for (const { name } of toolCalls) {
+    if (relevantNames.has(name)) {
+      observed.push(name);
+    }
+  }
+  return `[${observed.join(", ")}]`;
+}
+/**
+ * Evaluate structural tool assertions against the recorded tool calls.
+ *
+ * - tool_called / tool_not_called: count calls to `toolName`.
+ * - tool_call_order: `toolNames` must occur as a subsequence of the recorded
+ *   calls (other calls may be interleaved).
+ * - anything else (tool_argument_match): some call to `toolName` must have
+ *   arguments matching `expectedArgs` via matchesArgumentSubset.
+ *
+ * @returns One `{ assertion, passed, evidence, source: "tool" }` per assertion.
+ */
+function evaluateToolAssertions(toolAssertions, toolCalls) {
+  const countCallsTo = (toolName) => toolCalls.filter((toolCall) => toolCall.name === toolName).length;
+  const describeCallCount = (toolName, count) => count === 0 ? `Tool '${toolName}' was not called.` : `Tool '${toolName}' was called ${count} time${count === 1 ? "" : "s"}.`;
+  const toResult = (toolAssertion, passed, evidence) => ({
+    assertion: toolAssertion.description,
+    passed,
+    evidence,
+    source: "tool"
+  });
+  return toolAssertions.map((toolAssertion) => {
+    switch (toolAssertion.type) {
+      case "tool_called": {
+        const count = countCallsTo(toolAssertion.toolName);
+        return toResult(toolAssertion, count > 0, describeCallCount(toolAssertion.toolName, count));
+      }
+      case "tool_not_called": {
+        const count = countCallsTo(toolAssertion.toolName);
+        return toResult(toolAssertion, count === 0, describeCallCount(toolAssertion.toolName, count));
+      }
+      case "tool_call_order": {
+        const expectedOrder = toolAssertion.toolNames ?? [];
+        // Advance through the expected names each time the next one is observed.
+        const matchedCount = toolCalls.reduce(
+          (matched, toolCall) => toolCall.name === expectedOrder[matched] ? matched + 1 : matched,
+          0
+        );
+        const inOrder = matchedCount === expectedOrder.length;
+        const evidence = inOrder ? `Observed tool call order ${formatExpectedOrder(expectedOrder)}.` : `Expected call order ${formatExpectedOrder(expectedOrder)} but got ${formatActualOrder(toolCalls, expectedOrder)}.`;
+        return toResult(toolAssertion, inOrder, evidence);
+      }
+      default: {
+        const expectedArgs = toolAssertion.expectedArgs ?? {};
+        const found = toolCalls.some(
+          (toolCall) => toolCall.name === toolAssertion.toolName && matchesArgumentSubset(toolCall.arguments, expectedArgs)
+        );
+        const evidence = found ? `Tool '${toolAssertion.toolName}' was called with arguments matching ${JSON.stringify(expectedArgs)}.` : `No '${toolAssertion.toolName}' call matched ${JSON.stringify(expectedArgs)}.`;
+        return toResult(toolAssertion, found, evidence);
+      }
+    }
+  });
+}
 function extractJsonArray(raw) {
   const trimmed = raw.trim();
   if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
@@ -2442,6 +3098,7 @@ async function generatePrompts(skill, provider, model, count) {
     skill.content,
     "",
     `Generate ${count} prompts that stress the main capabilities and likely edge cases.`,
+    // Tool-aware prompts require user-defined mock responses and are not auto-generated.
     "Each prompt should include 2-4 assertions."
   ].join("\n");
   const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
@@ -2465,7 +3122,24 @@ async function runEval(skill, options) {
   const results = await pMap(
     prompts,
     async (evalPrompt) => {
-      const response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
+      let response;
+      let toolCalls;
+      let loopIterations;
+      if (evalPrompt.tools && evalPrompt.tools.length > 0) {
+        const toolRun = await runWithTools({
+          provider: options.provider,
+          model: options.model,
+          systemPrompt,
+          userMessage: evalPrompt.prompt,
+          tools: evalPrompt.tools,
+          maxIterations: options.maxToolIterations
+        });
+        response = toolRun.finalResponse;
+        toolCalls = toolRun.toolCalls;
+        loopIterations = toolRun.loopIterations;
+      } else {
+        response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
+      }
       const gradedAssertions = await gradeResponse({
         provider: options.provider,
         model: options.graderModel,
@@ -2475,14 +3149,18 @@ async function runEval(skill, options) {
         modelResponse: response,
         assertions: evalPrompt.assertions
       });
-      const passedAssertions2 = gradedAssertions.filter((assertion) => assertion.passed).length;
+      const structuralAssertions = evalPrompt.toolAssertions && evalPrompt.toolAssertions.length > 0 ? evaluateToolAssertions(evalPrompt.toolAssertions, toolCalls ?? []) : [];
+      const assertions = [...gradedAssertions, ...structuralAssertions];
+      const passedAssertions2 = assertions.filter((assertion) => assertion.passed).length;
       return {
         prompt: evalPrompt.prompt,
-        assertions: gradedAssertions,
+        assertions,
         responseSummary: response.slice(0, 200),
         response,
         passedAssertions: passedAssertions2,
-        totalAssertions: gradedAssertions.length
+        totalAssertions: assertions.length,
+        ...toolCalls ? { toolCalls } : {},
+        ...loopIterations !== void 0 ? { loopIterations } : {}
       };
     },
     options.concurrency ?? 5
@@ -2568,7 +3246,7 @@ function parseJsonArrayFromModelOutput(raw) {
   }
   throw new Error("Model did not return a JSON array.");
 }
-async function generateQueriesWithModel(skill, provider, model, numQueries) {
+async function generateQueriesWithModel(skill, provider, model, numQueries, competitors) {
   validateNumQueries(numQueries);
   const shouldTriggerCount = Math.floor(numQueries / 2);
   const shouldNotTriggerCount = numQueries - shouldTriggerCount;
@@ -2581,6 +3259,15 @@ async function generateQueriesWithModel(skill, provider, model, numQueries) {
   const userPrompt = [
     `Skill name: ${skill.frontmatter.name}`,
     `Skill description: ${skill.frontmatter.description}`,
+    ...competitors && competitors.length > 0 ? [
+      "",
+      "Competitor skills in the same domain:",
+      ...competitors.map((competitor) => `- ${competitor.name}: ${competitor.description}`),
+      "",
+      "Generate queries that test whether the target skill triggers correctly even when these similar skills exist.",
+      "Positive queries should clearly belong to the target skill, not the competitors.",
+      "Negative queries should belong to a competitor or to no skill at all."
+    ] : [],
     `Generate ${numQueries} prompts total.`,
     `Exactly ${shouldTriggerCount} should have should_trigger=true.`,
     `Exactly ${shouldNotTriggerCount} should have should_trigger=false.`,
@@ -2614,16 +3301,33 @@ function parseDecision(rawResponse, skillNames) {
   }
   return "unrecognized";
 }
-function prepareTriggerQueries(skill, queries, seed) {
+function prepareTriggerQueries(skill, queries, seed, competitors) {
   const rng = createRng(seed);
+  const competitorCandidates = (competitors ?? []).map((competitor) => ({
+    name: competitor.name,
+    description: competitor.description
+  }));
   return queries.map((testQuery) => {
-    const fakeCount = 5 + Math.floor(rng() * 5);
+    const usingCompetitors = competitorCandidates.length > 0;
+    const fakeCount = usingCompetitors ? testQuery.should_trigger ? 2 + Math.floor(rng() * 3) : 3 + Math.floor(rng() * 3) : 5 + Math.floor(rng() * 5);
     const fakeSkills = sample(FAKE_SKILLS, fakeCount, rng);
-    const allSkills = shuffle(
+    const allSkills = usingCompetitors ? shuffle(
       [
+        ...competitorCandidates,
         ...fakeSkills,
-        {
-          name: skill.frontmatter.name,
+        ...testQuery.should_trigger ? [
+          {
+            name: skill.frontmatter.name,
+            description: skill.frontmatter.description
+          }
+        ] : []
+      ],
+      rng
+    ) : shuffle(
+      [
+        ...fakeSkills,
+        {
+          name: skill.frontmatter.name,
           description: skill.frontmatter.description
         }
       ],
@@ -2673,25 +3377,82 @@ function calculateMetrics(skillName, cases) {
     f1
   };
 }
-function buildSuggestions(metrics) {
+/**
+ * Guard against a competitor skill sharing its name with the skill under test.
+ *
+ * @throws {Error} For the first competitor whose name equals `skillName`.
+ */
+function assertCompetitorNamesDistinct(skillName, competitors) {
+  const clash = competitors.find((competitor) => competitor.name === skillName);
+  if (clash !== void 0) {
+    throw new Error(`Competitor skill '${clash.name}' has the same name as the skill under test.`);
+  }
+}
+/**
+ * Assemble the per-query trigger result.
+ *
+ * A should-trigger query matches only when the decision is the skill under
+ * test; a should-not-trigger query matches whenever the decision is anything
+ * else. `selectedCompetitor` is set only when the decision names one of
+ * `options.competitorNames`.
+ */
+function buildTriggerCaseResult(options) {
+  const { testQuery, skillName, decision, competitorNames, rawModelResponse } = options;
+  const choseTarget = decision === skillName;
+  const choseCompetitor = competitorNames?.includes(decision);
+  return {
+    query: testQuery.query,
+    shouldTrigger: testQuery.should_trigger,
+    expected: testQuery.should_trigger ? skillName : "none",
+    actual: decision,
+    matched: testQuery.should_trigger ? choseTarget : !choseTarget,
+    selectedCompetitor: choseCompetitor ? decision : void 0,
+    rawModelResponse
+  };
+}
+function buildSuggestions(skillName, metrics, cases, competitors) {
   const suggestions = [];
   if (metrics.falseNegatives > 0) {
     suggestions.push(
       "False negatives found: clarify capability keywords and add explicit 'use when ...' phrasing in description."
     );
+    if (competitors && competitors.length > 0) {
+      const competitorCounts = /* @__PURE__ */ new Map();
+      for (const testCase of cases) {
+        if (!testCase.shouldTrigger || testCase.actual === skillName || !testCase.selectedCompetitor) {
+          continue;
+        }
+        competitorCounts.set(testCase.selectedCompetitor, (competitorCounts.get(testCase.selectedCompetitor) ?? 0) + 1);
+      }
+      for (const [competitorName, count] of competitorCounts.entries()) {
+        suggestions.push(
+          `Skill '${competitorName}' was selected instead of '${skillName}' for ${count} quer${count === 1 ? "y" : "ies"}. Differentiate your description from '${competitorName}'.`
+        );
+      }
+    }
   }
   if (metrics.falsePositives > 0) {
     suggestions.push("False positives found: narrow scope boundaries and add explicit non-goals in description.");
+    if (competitors && competitors.length > 0) {
+      suggestions.push(
+        `With competitor skills present, ${metrics.falsePositives} negative quer${metrics.falsePositives === 1 ? "y still" : "ies still"} triggered '${skillName}'. Narrow your description's scope boundaries.`
+      );
+    }
   }
   if (suggestions.length === 0) {
     suggestions.push("Trigger behavior looks clean on this sample. Keep monitoring with domain-specific custom queries.");
   }
   return suggestions;
 }
+/**
+ * Parse each path in `comparePaths` with parseSkillStrict, one after another,
+ * and collect the competitor's name, description and the path it came from.
+ * A parse failure for any path rejects the whole call.
+ */
+async function loadCompetitorSkills(comparePaths) {
+  const competitors = [];
+  for (const sourcePath of comparePaths) {
+    const { frontmatter } = await parseSkillStrict(sourcePath);
+    const { name, description } = frontmatter;
+    competitors.push({ name, description, sourcePath });
+  }
+  return competitors;
+}
 async function runTriggerTest(skill, options) {
-  const queries = options.queries && options.queries.length > 0 ? triggerQueryArraySchema.parse(options.queries) : await generateQueriesWithModel(skill, options.provider, options.model, options.numQueries);
+  const competitors = options.compare && options.compare.length > 0 ? await loadCompetitorSkills(options.compare) : void 0;
+  if (competitors && competitors.length > 0) {
+    assertCompetitorNamesDistinct(skill.frontmatter.name, competitors);
+  }
+  const queries = options.queries && options.queries.length > 0 ? triggerQueryArraySchema.parse(options.queries) : await generateQueriesWithModel(skill, options.provider, options.model, options.numQueries, competitors);
   const skillName = skill.frontmatter.name;
-  const preparedQueries = prepareTriggerQueries(skill, queries, options.seed);
+  const preparedQueries = prepareTriggerQueries(skill, queries, options.seed, competitors);
+  const competitorNames = competitors?.map((competitor) => competitor.name) ?? [];
   const systemPrompt = [
     "You are selecting one skill to activate for a user query.",
     "Choose the single best matching skill name from the provided list, or 'none' if no skill is a good fit.",
@@ -2704,18 +3465,15 @@ async function runTriggerTest(skill, options) {
       const rawResponse = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
       const decision = parseDecision(
         rawResponse,
-        allSkills.map((entry) => entry.name)
+        Array.from(/* @__PURE__ */ new Set([skillName, ...allSkills.map((entry) => entry.name)]))
       );
-      const expected = testQuery.should_trigger ? skillName : "none";
-      const matched = testQuery.should_trigger ? decision === skillName : decision !== skillName;
-      return {
-        query: testQuery.query,
-        shouldTrigger: testQuery.should_trigger,
-        expected,
-        actual: decision,
-        matched,
+      return buildTriggerCaseResult({
+        testQuery,
+        skillName,
+        decision,
+        competitorNames,
         rawModelResponse: options.verbose ? rawResponse : void 0
-      };
+      });
     },
     options.concurrency ?? 5
   );
@@ -2725,10 +3483,11 @@ async function runTriggerTest(skill, options) {
     model: options.model,
     provider: options.provider.name,
     seed: options.seed,
+    competitors,
     queries,
     cases: results,
     metrics,
-    suggestions: buildSuggestions(metrics)
+    suggestions: buildSuggestions(skillName, metrics, results, competitors)
   };
 }
@@ -2739,10 +3498,7 @@ function renderJson(value) {
 // src/commands/common.ts
 var executionContextByCommand = /* @__PURE__ */ new WeakMap();
-var singleEvalPromptSchema = z5.object({
-  prompt: z5.string().min(1),
-  assertions: z5.array(z5.string().min(1)).optional()
-});
+var singleEvalPromptSchema = evalPromptSchema;
 var promptStringArraySchema = z5.array(z5.string().min(1));
 var assertionsObjectSchema = z5.object({
   assertions: z5.array(z5.string().min(1))
@@ -2777,6 +3533,22 @@ function normalizeEvalPrompts(value, sourceLabel) {
 function parseAssertionsFromText(raw) {
   return raw.split(/\r?\n/).map((line) => line.trim().replace(/^[-*]\s+/, "").replace(/^\d+\.\s+/, "")).filter((line) => line.length > 0);
 }
+/**
+ * Copy an eval prompt so the copy's arrays and per-tool/per-assertion objects
+ * are not shared with the original. Nested values inside `expectedArgs` and
+ * individual parameters are copied one level deep only.
+ */
+function cloneEvalPrompt(prompt) {
+  const copyTool = (tool) => ({
+    ...tool,
+    parameters: tool.parameters ? tool.parameters.map((parameter) => ({ ...parameter })) : void 0,
+    responses: { ...tool.responses }
+  });
+  const copyToolAssertion = (toolAssertion) => ({
+    ...toolAssertion,
+    toolNames: toolAssertion.toolNames ? [...toolAssertion.toolNames] : void 0,
+    expectedArgs: toolAssertion.expectedArgs ? { ...toolAssertion.expectedArgs } : void 0
+  });
+  let assertions;
+  if (prompt.assertions) {
+    assertions = [...prompt.assertions];
+  }
+  let tools;
+  if (prompt.tools) {
+    tools = prompt.tools.map((tool) => copyTool(tool));
+  }
+  let toolAssertions;
+  if (prompt.toolAssertions) {
+    toolAssertions = prompt.toolAssertions.map((toolAssertion) => copyToolAssertion(toolAssertion));
+  }
+  return { prompt: prompt.prompt, assertions, tools, toolAssertions };
+}
 function normalizeAssertions(value, sourceLabel) {
   const assertionArray = z5.array(z5.string().min(1)).safeParse(value);
   if (assertionArray.success) {
@@ -2843,23 +3615,20 @@ async function loadConfiguredEvalPrompts(command) {
   if (!promptFile && assertionsFile) {
     throw new Error("Config field eval.assertionsFile requires eval.promptFile.");
   }
-  const promptRaw = await fs5.readFile(promptFile, "utf8");
+  const promptRaw = await fs6.readFile(promptFile, "utf8");
   let prompts = normalizeEvalPrompts(parseJsonIfPossible(promptRaw), promptFile);
   if (assertionsFile) {
-    const assertionsRaw = await fs5.readFile(assertionsFile, "utf8");
+    const assertionsRaw = await fs6.readFile(assertionsFile, "utf8");
     const assertions = normalizeAssertions(parseJsonIfPossible(assertionsRaw), assertionsFile);
     prompts = prompts.map((prompt) => ({
-      prompt: prompt.prompt,
+      ...cloneEvalPrompt(prompt),
       assertions: [...assertions]
     }));
   }
   const numRunsWasExplicit = context.configFile?.eval?.numRuns !== void 0;
   if (numRunsWasExplicit && prompts.length === 1 && context.config.eval.numRuns > 1) {
     const promptTemplate = prompts[0];
-    prompts = Array.from({ length: context.config.eval.numRuns }, () => ({
-      prompt: promptTemplate.prompt,
-      assertions: promptTemplate.assertions ? [...promptTemplate.assertions] : void 0
-    }));
+    prompts = Array.from({ length: context.config.eval.numRuns }, () => cloneEvalPrompt(promptTemplate));
   }
   return prompts;
 }
@@ -2885,18 +3654,22 @@ function writeError(error, asJson) {
 // src/commands/lint.ts
 var lintCliSchema = z6.object({
-  html: z6.string().optional()
+  html: z6.string().optional(),
+  plugin: z6.array(z6.string().min(1)).optional()
 });
+/**
+ * Accumulator for the repeatable `--plugin <path>` option: returns a new
+ * array with `value` appended, leaving `previous` untouched.
+ */
+function collectPluginPaths(value, previous = []) {
+  const collected = Array.from(previous);
+  collected.push(value);
+  return collected;
+}
 async function handleLintCommand(targetPath, options) {
   try {
-    const report = await runLinter(targetPath, { suppress: options.suppress });
+    const report = await runLinter(targetPath, { suppress: options.suppress, plugins: options.plugins });
     if (options.json) {
       writeResult(report, true);
     } else {
       writeResult(renderLintReport(report, options.color), false);
     }
     if (options.html) {
-      await fs6.writeFile(options.html, renderLintHtml(report), "utf8");
+      await fs7.writeFile(options.html, renderLintHtml(report), "utf8");
     }
     if (lintFails(report, options.failOn)) {
       process.exitCode = 1;
@@ -2907,7 +3680,7 @@ async function handleLintCommand(targetPath, options) {
   }
 }
 function registerLintCommand(program) {
-  program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--html <path>", "Write an HTML report to the given file path").action(async (targetPath, _commandOptions, command) => {
+  program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--html <path>", "Write an HTML report to the given file path").option("--plugin <path>", "Load a custom lint plugin file", collectPluginPaths, []).action(async (targetPath, _commandOptions, command) => {
     const globalOptions = getGlobalCliOptions(command);
     const config = getResolvedConfig(command);
     const parsedCli = lintCliSchema.safeParse(command.opts());
@@ -2920,36 +3693,40 @@ function registerLintCommand(program) {
       ...globalOptions,
       failOn: config.lint.failOn,
       suppress: config.lint.suppress,
+      plugins: config.lint.plugins,
       html: parsedCli.data.html
     });
   });
 }
 // src/commands/trigger.ts
-import fs8 from "node:fs/promises";
+import fs9 from "node:fs/promises";
 import ora from "ora";
 import { z as z8 } from "zod";
 // src/utils/config.ts
-import fs7 from "node:fs/promises";
-import path5 from "node:path";
+import fs8 from "node:fs/promises";
+import path6 from "node:path";
 import { z as z7 } from "zod";
 var providerNameSchema = z7.enum(["anthropic", "openai"]);
 var lintFailOnSchema = z7.enum(["error", "warn"]);
 var lintConfigSchema = z7.object({
   failOn: lintFailOnSchema.optional(),
-  suppress: z7.array(z7.string().min(1)).optional()
+  suppress: z7.array(z7.string().min(1)).optional(),
+  plugins: z7.array(z7.string().min(1)).optional()
 }).strict();
 var triggerConfigSchema = z7.object({
   numQueries: z7.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
   threshold: z7.number().min(0).max(1).optional(),
-  seed: z7.number().int().optional()
+  seed: z7.number().int().optional(),
+  compare: z7.array(z7.string().min(1)).optional()
 }).strict().partial();
 var evalConfigSchema = z7.object({
   numRuns: z7.number().int().min(1).optional(),
   threshold: z7.number().min(0).max(1).optional(),
   promptFile: z7.string().min(1).optional(),
-  assertionsFile: z7.string().min(1).optional()
+  assertionsFile: z7.string().min(1).optional(),
+  maxToolIterations: z7.number().int().min(1).max(50).optional()
 }).strict().partial();
 var skilltestConfigSchema = z7.object({
   provider: providerNameSchema.optional(),
@@ -2967,18 +3744,21 @@ var resolvedSkilltestConfigSchema = z7.object({
   concurrency: z7.number().int().min(1),
   lint: z7.object({
     failOn: lintFailOnSchema,
-    suppress: z7.array(z7.string().min(1))
+    suppress: z7.array(z7.string().min(1)),
+    plugins: z7.array(z7.string().min(1))
   }),
   trigger: z7.object({
     numQueries: z7.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
     threshold: z7.number().min(0).max(1),
-    seed: z7.number().int().optional()
+    seed: z7.number().int().optional(),
+    compare: z7.array(z7.string().min(1))
   }),
   eval: z7.object({
     numRuns: z7.number().int().min(1),
     threshold: z7.number().min(0).max(1),
     promptFile: z7.string().min(1).optional(),
-    assertionsFile: z7.string().min(1).optional()
+    assertionsFile: z7.string().min(1).optional(),
+    maxToolIterations: z7.number().int().min(1).max(50)
   })
 });
 var DEFAULT_SKILLTEST_CONFIG = {
@@ -2988,15 +3768,18 @@ var DEFAULT_SKILLTEST_CONFIG = {
   concurrency: 5,
   lint: {
     failOn: "error",
-    suppress: []
+    suppress: [],
+    plugins: []
   },
   trigger: {
     numQueries: 20,
-    threshold: 0.8
+    threshold: 0.8,
+    compare: []
   },
   eval: {
     numRuns: 5,
-    threshold: 0.9
+    threshold: 0.9,
+    maxToolIterations: 10
   }
 };
 function formatIssuePath(issuePath) {
@@ -3014,7 +3797,7 @@ function buildConfigValidationError(error, sourceLabel) {
 async function readJsonObject(filePath, label) {
   let raw;
   try {
-    raw = await fs7.readFile(filePath, "utf8");
+    raw = await fs8.readFile(filePath, "utf8");
   } catch (error) {
     const message = error instanceof Error ? error.message : String(error);
     throw new Error(`Failed to read ${label}: ${message}`);
@@ -3038,13 +3821,13 @@ async function loadConfigFromJsonFile(filePath) {
   return {
     configFile: parsed.data,
     sourcePath: filePath,
-    sourceDirectory: path5.dirname(filePath)
+    sourceDirectory: path6.dirname(filePath)
   };
 }
 async function loadConfigFromNearestPackageJson(startDirectory) {
-  let currentDirectory = path5.resolve(startDirectory);
+  let currentDirectory = path6.resolve(startDirectory);
   while (true) {
-    const packageJsonPath = path5.join(currentDirectory, "package.json");
+    const packageJsonPath = path6.join(currentDirectory, "package.json");
     if (await pathExists(packageJsonPath)) {
       const raw = await readJsonObject(packageJsonPath, packageJsonPath);
       const packageJsonSchema = z7.object({
@@ -3063,7 +3846,7 @@ async function loadConfigFromNearestPackageJson(startDirectory) {
         sourceDirectory: currentDirectory
       };
     }
-    const parentDirectory = path5.dirname(currentDirectory);
+    const parentDirectory = path6.dirname(currentDirectory);
     if (parentDirectory === currentDirectory) {
       return null;
     }
@@ -3076,7 +3859,7 @@ async function resolveSkillDirectoryConfig(targetPath) {
   }
   try {
     const { skillRoot } = await resolveSkillPath(targetPath);
-    return loadConfigFromJsonFile(path5.join(skillRoot, ".skilltestrc"));
+    return loadConfigFromJsonFile(path6.join(skillRoot, ".skilltestrc"));
   } catch {
     return null;
   }
@@ -3085,7 +3868,13 @@ function resolveConfigRelativePath(baseDirectory, value) {
   if (!value) {
     return void 0;
   }
-  return path5.resolve(baseDirectory, value);
+  return path6.resolve(baseDirectory, value);
+}
+function resolveConfigRelativePaths(baseDirectory, values) {
+  if (!values || values.length === 0) {
+    return [];
+  }
+  return values.map((value) => path6.resolve(baseDirectory, value));
 }
 function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = process.cwd()) {
   const merged = {
@@ -3095,12 +3884,20 @@ function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = proce
     concurrency: cliFlags.concurrency ?? configFile.concurrency ?? DEFAULT_SKILLTEST_CONFIG.concurrency,
     lint: {
       failOn: cliFlags.lint?.failOn ?? configFile.lint?.failOn ?? DEFAULT_SKILLTEST_CONFIG.lint.failOn,
-      suppress: cliFlags.lint?.suppress ?? configFile.lint?.suppress ?? DEFAULT_SKILLTEST_CONFIG.lint.suppress
+      suppress: cliFlags.lint?.suppress ?? configFile.lint?.suppress ?? DEFAULT_SKILLTEST_CONFIG.lint.suppress,
+      plugins: resolveConfigRelativePaths(
+        baseDirectory,
+        cliFlags.lint?.plugins ?? configFile.lint?.plugins ?? DEFAULT_SKILLTEST_CONFIG.lint.plugins
+      )
     },
     trigger: {
       numQueries: cliFlags.trigger?.numQueries ?? configFile.trigger?.numQueries ?? DEFAULT_SKILLTEST_CONFIG.trigger.numQueries,
       threshold: cliFlags.trigger?.threshold ?? configFile.trigger?.threshold ?? DEFAULT_SKILLTEST_CONFIG.trigger.threshold,
-      seed: cliFlags.trigger?.seed ?? configFile.trigger?.seed
+      seed: cliFlags.trigger?.seed ?? configFile.trigger?.seed,
+      compare: resolveConfigRelativePaths(
+        baseDirectory,
+        cliFlags.trigger?.compare ?? configFile.trigger?.compare ?? DEFAULT_SKILLTEST_CONFIG.trigger.compare
+      )
     },
     eval: {
       numRuns: cliFlags.eval?.numRuns ?? configFile.eval?.numRuns ?? DEFAULT_SKILLTEST_CONFIG.eval.numRuns,
@@ -3112,7 +3909,8 @@ function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = proce
       assertionsFile: resolveConfigRelativePath(
         baseDirectory,
         cliFlags.eval?.assertionsFile ?? configFile.eval?.assertionsFile ?? DEFAULT_SKILLTEST_CONFIG.eval.assertionsFile
-      )
+      ),
+      maxToolIterations: cliFlags.eval?.maxToolIterations ?? configFile.eval?.maxToolIterations ?? DEFAULT_SKILLTEST_CONFIG.eval.maxToolIterations
     }
   };
   return resolvedSkilltestConfigSchema.parse(merged);
@@ -3136,22 +3934,34 @@ function extractCliConfigOverrides(command) {
   if (command.getOptionValueSource("model") === "cli") {
     overrides.model = getTypedOptionValue(command, "model");
   }
-  if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check") && command.getOptionValueSource("concurrency") === "cli") {
+  if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("concurrency") === "cli") {
     overrides.concurrency = getTypedOptionValue(command, "concurrency");
   }
-  if ((command.name() === "trigger" || command.name() === "check") && command.getOptionValueSource("numQueries") === "cli") {
+  if ((command.name() === "trigger" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("numQueries") === "cli") {
     overrides.trigger = {
       ...overrides.trigger,
       numQueries: getTypedOptionValue(command, "numQueries")
     };
   }
-  if (command.name() === "check" && command.getOptionValueSource("minF1") === "cli") {
+  if ((command.name() === "trigger" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("compare") === "cli") {
+    overrides.trigger = {
+      ...overrides.trigger,
+      compare: getTypedOptionValue(command, "compare")
+    };
+  }
+  if ((command.name() === "lint" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("plugin") === "cli") {
+    overrides.lint = {
+      ...overrides.lint,
+      plugins: getTypedOptionValue(command, "plugin")
+    };
+  }
+  if ((command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("minF1") === "cli") {
     overrides.trigger = {
       ...overrides.trigger,
       threshold: getTypedOptionValue(command, "minF1")
     };
   }
-  if (command.name() === "check" && command.getOptionValueSource("minAssertPassRate") === "cli") {
+  if ((command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("minAssertPassRate") === "cli") {
     overrides.eval = {
       ...overrides.eval,
       threshold: getTypedOptionValue(command, "minAssertPassRate")
@@ -3172,7 +3982,7 @@ async function resolveConfigContext(targetPath, cliFlags) {
       config: mergeConfigLayers(skillDirectoryConfig.configFile, cliFlags, skillDirectoryConfig.sourceDirectory)
     };
   }
-  const cwdConfigPath = path5.join(cwd, ".skilltestrc");
+  const cwdConfigPath = path6.join(cwd, ".skilltestrc");
   const cwdConfig = await loadConfigFromJsonFile(cwdConfigPath);
   if (cwdConfig) {
     return {
@@ -3216,6 +4026,12 @@ function resolveApiKey(provider, override) {
 // src/providers/anthropic.ts
 import Anthropic from "@anthropic-ai/sdk";
+function isAnthropicTextBlock(block) {
+  return block.type === "text";
+}
+function isAnthropicToolUseBlock(block) {
+  return block.type === "tool_use";
+}
 function wait(ms) {
   return new Promise((resolve) => {
     setTimeout(resolve, ms);
@@ -3241,27 +4057,11 @@ var AnthropicProvider = class {
   constructor(apiKey) {
     this.client = new Anthropic({ apiKey });
   }
-  async sendMessage(systemPrompt, userMessage, options) {
+  async createMessage(request) {
     let lastError;
     for (let attempt = 0; attempt < 3; attempt += 1) {
       try {
-        const response = await this.client.messages.create({
-          model: options.model,
-          max_tokens: 2048,
-          system: systemPrompt,
-          messages: [
-            {
-              role: "user",
-              content: userMessage
-            }
-          ]
-        });
-        const textBlocks = response.content.filter((block) => block.type === "text");
-        const text = textBlocks.map((block) => block.text).join("\n").trim();
-        if (text.length === 0) {
-          throw new Error("Model returned an empty response.");
-        }
-        return text;
+        return await this.client.messages.create(request);
       } catch (error) {
         lastError = error;
         if (!isRateLimitError(error) || attempt === 2) {
@@ -3276,6 +4076,55 @@ var AnthropicProvider = class {
     }
     throw new Error("Anthropic API call failed with an unknown error.");
   }
+  toAnthropicMessages(messages) {
+    return messages.map((message) => ({
+      role: message.role,
+      content: message.content
+    }));
+  }
+  async sendMessage(systemPrompt, userMessage, options) {
+    const response = await this.createMessage({
+      model: options.model,
+      max_tokens: 2048,
+      system: systemPrompt,
+      messages: [
+        {
+          role: "user",
+          content: userMessage
+        }
+      ]
+    });
+    const textBlocks = response.content.filter(isAnthropicTextBlock);
+    const text = textBlocks.map((block) => block.text).join("\n").trim();
+    if (text.length === 0) {
+      throw new Error("Model returned an empty response.");
+    }
+    return text;
+  }
+  async sendWithTools(systemPrompt, messages, options) {
+    const response = await this.createMessage({
+      model: options.model,
+      max_tokens: 2048,
+      system: systemPrompt,
+      messages: this.toAnthropicMessages(messages),
+      tools: options.tools.map((tool) => ({
+        name: tool.name,
+        description: tool.description,
+        input_schema: tool.parameters ?? { type: "object", properties: {} }
+      }))
+    });
+    const textContent = response.content.filter(isAnthropicTextBlock).map((block) => block.text).join("\n").trim();
+    const toolUseBlocks = response.content.filter(isAnthropicToolUseBlock).map((block) => ({
+      id: block.id,
+      name: block.name,
+      arguments: block.input
+    }));
+    return {
+      textContent,
+      toolUseBlocks,
+      stopReason: response.stop_reason ?? "end_turn"
+    };
+  }
 };
 // src/providers/openai.ts
@@ -3312,6 +4161,71 @@ function extractTextContent(content) {
   const text = content.map((item) => item.type === "text" || !item.type ? item.text ?? "" : "").join("\n").trim();
   return text;
 }
+function parseToolArguments(raw, toolName) {
+  if (!raw || raw.trim() === "") {
+    return {};
+  }
+  try {
+    const parsed = JSON.parse(raw);
+    if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
+      throw new Error("Tool arguments must be a JSON object.");
+    }
+    return parsed;
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error);
+    throw new Error(`OpenAI tool call arguments for '${toolName}' were not valid JSON: ${message}`);
+  }
+}
+function getBlockText(blocks) {
+  return blocks.filter((block) => block.type === "text").map((block) => String(block.text ?? "")).join("\n").trim();
+}
+function mapAssistantBlocksToMessage(blocks) {
+  const textContent = getBlockText(blocks);
+  const toolCalls = blocks.filter((block) => block.type === "tool_use").map((block) => ({
+    id: String(block.id ?? ""),
+    type: "function",
+    function: {
+      name: String(block.name ?? ""),
+      arguments: JSON.stringify(block.input ?? {})
+    }
+  }));
+  return {
+    role: "assistant",
+    content: textContent.length > 0 ? textContent : null,
+    ...toolCalls.length > 0 ? { tool_calls: toolCalls } : {}
+  };
+}
+function mapUserBlocksToMessages(blocks) {
+  const toolResults = blocks.filter((block) => block.type === "tool_result");
+  if (toolResults.length > 0) {
+    return toolResults.map((block) => ({
+      role: "tool",
+      tool_call_id: String(block.tool_use_id ?? ""),
+      content: String(block.content ?? "")
+    }));
+  }
+  const textContent = getBlockText(blocks);
+  return [
+    {
+      role: "user",
+      content: textContent
+    }
+  ];
+}
+function mapConversationBlockToMessages(block) {
+  if (typeof block.content === "string") {
+    return [
+      {
+        role: block.role,
+        content: block.content
+      }
+    ];
+  }
+  if (block.role === "assistant") {
+    return [mapAssistantBlocksToMessage(block.content)];
+  }
+  return mapUserBlocksToMessages(block.content);
+}
 var OpenAIProvider = class {
   name = "openai";
   apiKey;
@@ -3340,30 +4254,12 @@ var OpenAIProvider = class {
     this.client = new OpenAIConstructor({ apiKey: this.apiKey });
     return this.client;
   }
-  async sendMessage(systemPrompt, userMessage, options) {
+  async createCompletion(input) {
     const client = await this.ensureClient();
     let lastError;
     for (let attempt = 0; attempt < 3; attempt += 1) {
       try {
-        const response = await client.chat.completions.create({
-          model: options.model,
-          max_tokens: 2048,
-          messages: [
-            {
-              role: "system",
-              content: systemPrompt
-            },
-            {
-              role: "user",
-              content: userMessage
-            }
-          ]
-        });
-        const text = (response.choices ?? []).map((choice) => extractTextContent(choice.message?.content)).join("\n").trim();
-        if (text.length === 0) {
-          throw new Error("Model returned an empty response.");
-        }
-        return text;
+        return await client.chat.completions.create(input);
       } catch (error) {
         lastError = error;
         if (!isRetriableError(error) || attempt === 2) {
@@ -3378,6 +4274,57 @@ var OpenAIProvider = class {
     }
     throw new Error("OpenAI API call failed with an unknown error.");
   }
+  toOpenAiMessages(systemPrompt, messages) {
+    return [
+      {
+        role: "system",
+        content: systemPrompt
+      },
+      ...messages.flatMap((message) => mapConversationBlockToMessages(message))
+    ];
+  }
+  async sendMessage(systemPrompt, userMessage, options) {
+    const response = await this.createCompletion({
+      model: options.model,
+      max_tokens: 2048,
+      messages: this.toOpenAiMessages(systemPrompt, [{ role: "user", content: userMessage }])
+    });
+    const text = (response.choices ?? []).map((choice) => extractTextContent(choice.message?.content)).join("\n").trim();
+    if (text.length === 0) {
+      throw new Error("Model returned an empty response.");
+    }
+    return text;
+  }
+  async sendWithTools(systemPrompt, messages, options) {
+    const response = await this.createCompletion({
+      model: options.model,
+      max_tokens: 2048,
+      messages: this.toOpenAiMessages(systemPrompt, messages),
+      tools: options.tools.map((tool) => ({
+        type: "function",
+        function: {
+          name: tool.name,
+          description: tool.description,
+          parameters: tool.parameters
+        }
+      }))
+    });
+    const choice = response.choices?.[0];
+    const message = choice?.message;
+    const toolUseBlocks = (message?.tool_calls ?? []).map((toolCall, index) => {
+      const toolName = toolCall.function?.name ?? `tool-${index + 1}`;
+      return {
+        id: toolCall.id ?? `${toolName}-${index + 1}`,
+        name: toolName,
+        arguments: parseToolArguments(toolCall.function?.arguments, toolName)
+      };
+    });
+    return {
+      textContent: extractTextContent(message?.content),
+      toolUseBlocks,
+      stopReason: choice?.finish_reason === "stop" ? "end_turn" : choice?.finish_reason === "tool_calls" ? "tool_use" : choice?.finish_reason ?? "end_turn"
+    };
+  }
 };
 // src/providers/index.ts
@@ -3393,6 +4340,7 @@ function createProvider(providerName, apiKeyOverride) {
 var triggerCliSchema = z8.object({
   queries: z8.string().optional(),
   saveQueries: z8.string().optional(),
+  compare: z8.array(z8.string().min(1)).optional(),
   seed: z8.number().int().optional(),
   concurrency: z8.number().int().min(1).optional(),
   html: z8.string().optional(),
@@ -3441,6 +4389,7 @@ async function handleTriggerCommand(targetPath, options) {
       provider,
       queries,
       numQueries: options.numQueries,
+      compare: options.compare,
       seed: options.seed,
       concurrency: options.concurrency,
       verbose: options.verbose
@@ -3459,7 +4408,7 @@ async function handleTriggerCommand(targetPath, options) {
         ...result,
         target: targetPath
       };
-      await fs8.writeFile(options.html, renderTriggerHtml(htmlResult), "utf8");
+      await fs9.writeFile(options.html, renderTriggerHtml(htmlResult), "utf8");
     }
   } catch (error) {
     spinner?.stop();
@@ -3468,7 +4417,7 @@ async function handleTriggerCommand(targetPath, options) {
   }
 }
 function registerTriggerCommand(program) {
-  program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--concurrency <n>", "Maximum in-flight trigger requests", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
+  program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--compare <path...>", "Path(s) to sibling skill directories to include as competitors").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--concurrency <n>", "Maximum in-flight trigger requests", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
     const globalOptions = getGlobalCliOptions(command);
     const config = getResolvedConfig(command);
     const parsedCli = triggerCliSchema.safeParse(command.opts());
@@ -3483,6 +4432,7 @@ function registerTriggerCommand(program) {
       provider: config.provider,
       queries: parsedCli.data.queries,
       numQueries: config.trigger.numQueries,
+      compare: config.trigger.compare,
       saveQueries: parsedCli.data.saveQueries,
       seed: parsedCli.data.seed ?? config.trigger.seed,
       concurrency: config.concurrency,
@@ -3494,7 +4444,7 @@ function registerTriggerCommand(program) {
 }
 // src/commands/eval.ts
-import fs9 from "node:fs/promises";
+import fs10 from "node:fs/promises";
 import ora2 from "ora";
 import { z as z9 } from "zod";
 var evalCliSchema = z9.object({
@@ -3545,7 +4495,8 @@ async function handleEvalCommand(targetPath, options, command) {
       graderModel,
       numRuns: options.numRuns,
       concurrency: options.concurrency,
-      prompts
+      prompts,
+      maxToolIterations: options.maxToolIterations
     });
     if (options.saveResults) {
       await writeJsonFile(options.saveResults, result);
@@ -3561,7 +4512,7 @@ async function handleEvalCommand(targetPath, options, command) {
         ...result,
         target: targetPath
       };
-      await fs9.writeFile(options.html, renderEvalHtml(htmlResult), "utf8");
+      await fs10.writeFile(options.html, renderEvalHtml(htmlResult), "utf8");
     }
   } catch (error) {
     spinner?.stop();
@@ -3592,7 +4543,8 @@ function registerEvalCommand(program) {
         verbose: Boolean(parsedCli.data.verbose),
         apiKey: parsedCli.data.apiKey,
         numRuns: config.eval.numRuns,
-        concurrency: config.concurrency
+        concurrency: config.concurrency,
+        maxToolIterations: config.eval.maxToolIterations
       },
       command
     );
@@ -3600,7 +4552,7 @@ function registerEvalCommand(program) {
 }
 // src/commands/check.ts
-import fs10 from "node:fs/promises";
+import fs11 from "node:fs/promises";
 import ora3 from "ora";
 import { z as z10 } from "zod";
@@ -3613,7 +4565,7 @@ function calculateEvalAssertPassRate(result) {
 }
 async function runCheck(inputPath, options) {
   options.onStage?.("lint");
-  const lint = await runLinter(inputPath, { suppress: options.lintSuppress });
+  const lint = await runLinter(inputPath, { suppress: options.lintSuppress, plugins: options.lintPlugins });
   const lintPassed = !lintFails(lint, options.lintFailOn);
   let trigger = null;
   let evalResult = null;
@@ -3637,6 +4589,7 @@ async function runCheck(inputPath, options) {
         provider: options.provider,
         model: options.model,
         queries: options.queries,
+        compare: options.compare,
         numQueries: options.numQueries,
         seed: options.triggerSeed,
         concurrency: options.concurrency,
@@ -3648,7 +4601,8 @@ async function runCheck(inputPath, options) {
         graderModel: options.graderModel,
         numRuns: options.evalNumRuns,
         prompts: options.prompts,
-        concurrency: options.concurrency
+        concurrency: options.concurrency,
+        maxToolIterations: options.evalMaxToolIterations
       };
       if ((options.concurrency ?? 5) === 1) {
         options.onStage?.("trigger");
@@ -3698,8 +4652,10 @@ var checkCliSchema = z10.object({
   graderModel: z10.string().optional(),
   apiKey: z10.string().optional(),
   queries: z10.string().optional(),
+  compare: z10.array(z10.string().min(1)).optional(),
   seed: z10.number().int().optional(),
   prompts: z10.string().optional(),
+  plugin: z10.array(z10.string().min(1)).optional(),
   concurrency: z10.number().int().min(1).optional(),
   html: z10.string().optional(),
   saveResults: z10.string().optional(),
@@ -3708,6 +4664,9 @@ var checkCliSchema = z10.object({
 });
 var DEFAULT_ANTHROPIC_MODEL3 = "claude-sonnet-4-5-20250929";
 var DEFAULT_OPENAI_MODEL3 = "gpt-4.1-mini";
+function collectPluginPaths2(value, previous = []) {
+  return [...previous, value];
+}
 function resolveModel3(provider, model) {
   if (provider === "openai" && model === DEFAULT_ANTHROPIC_MODEL3) {
     return DEFAULT_OPENAI_MODEL3;
@@ -3758,11 +4717,14 @@ async function handleCheckCommand(targetPath, options, command) {
       graderModel,
       lintFailOn: options.lintFailOn,
       lintSuppress: options.lintSuppress,
+      lintPlugins: options.lintPlugins,
       queries,
+      compare: options.compare,
       numQueries: options.numQueries,
       triggerSeed: options.triggerSeed,
       prompts,
       evalNumRuns: options.numRuns,
+      evalMaxToolIterations: options.maxToolIterations,
       concurrency: options.concurrency,
       minF1: options.minF1,
       minAssertPassRate: options.minAssertPassRate,
@@ -3794,7 +4756,7 @@ async function handleCheckCommand(targetPath, options, command) {
       );
     }
     if (options.html) {
-      await fs10.writeFile(options.html, renderCheckHtml(result), "utf8");
+      await fs11.writeFile(options.html, renderCheckHtml(result), "utf8");
     }
     process.exitCode = result.gates.overallPassed ? 0 : 1;
   } catch (error) {
@@ -3804,7 +4766,7 @@ async function handleCheckCommand(targetPath, options, command) {
   }
 }
 function registerCheckCommand(program) {
-  program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
+  program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--compare <path...>", "Path(s) to sibling skill directories to include as competitors").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--plugin <path>", "Load a custom lint plugin file", collectPluginPaths2, []).option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
     const globalOptions = getGlobalCliOptions(command);
     const config = getResolvedConfig(command);
     const parsedCli = checkCliSchema.safeParse(command.opts());
@@ -3822,15 +4784,18 @@ function registerCheckCommand(program) {
         graderModel: parsedCli.data.graderModel,
         apiKey: parsedCli.data.apiKey,
         queries: parsedCli.data.queries,
+        compare: config.trigger.compare,
         numQueries: config.trigger.numQueries,
         prompts: parsedCli.data.prompts,
         minF1: config.trigger.threshold,
         minAssertPassRate: config.eval.threshold,
         numRuns: config.eval.numRuns,
+        maxToolIterations: config.eval.maxToolIterations,
         concurrency: config.concurrency,
         html: parsedCli.data.html,
         lintFailOn: config.lint.failOn,
         lintSuppress: config.lint.suppress,
+        lintPlugins: config.lint.plugins,
         triggerSeed: parsedCli.data.seed ?? config.trigger.seed,
         saveResults: parsedCli.data.saveResults,
         continueOnLintFail: Boolean(parsedCli.data.continueOnLintFail),
@@ -3841,12 +4806,572 @@ function registerCheckCommand(program) {
   });
 }
+// src/commands/improve.ts
+import ora4 from "ora";
+import { z as z12 } from "zod";
+// src/core/improver.ts
+import fs12 from "node:fs/promises";
+import os from "node:os";
+import path7 from "node:path";
+import yaml2 from "js-yaml";
+import { z as z11 } from "zod";
+// Shape that the model's rewrite response must satisfy once it has been parsed
+// as JSON: a free-form frontmatter record, a non-empty markdown body, and at
+// least one non-empty entry in both the change summary and the list of
+// targeted problems.
+var improveRewriteSchema = z11.object({
+  frontmatter: z11.record(z11.unknown()),
+  content: z11.string().min(1),
+  changeSummary: z11.array(z11.string().min(1)).min(1),
+  targetedProblems: z11.array(z11.string().min(1)).min(1)
+});
+/**
+ * Compute the share of eval assertions that passed.
+ * @param {object|null|undefined} result - Eval result exposing `summary.totalAssertions` and `summary.passedAssertions`.
+ * @returns {number} `passedAssertions / totalAssertions`, or 0 when there is no result or no assertions were recorded.
+ */
+function calculateEvalAssertPassRate2(result) {
+  const total = result ? result.summary.totalAssertions : 0;
+  if (total === 0) {
+    return 0;
+  }
+  return result.summary.passedAssertions / total;
+}
+/**
+ * Pull a JSON object out of a model response that may wrap it in other text.
+ * The span from the first "{" to the last "}" of the trimmed input is parsed.
+ * @param {string} raw - Raw model output.
+ * @returns {*} The value produced by JSON.parse for that span.
+ * @throws {Error} When no "{" ... "}" span exists; JSON.parse errors propagate unchanged.
+ */
+function extractJsonObject2(raw) {
+  const text = raw.trim();
+  const open = text.indexOf("{");
+  const close = text.lastIndexOf("}");
+  if (open < 0 || close <= open) {
+    throw new Error("Improver did not return a JSON object.");
+  }
+  // When the text already starts with "{" and ends with "}" this slice is the whole string.
+  return JSON.parse(text.slice(open, close + 1));
+}
+/**
+ * Return a copy of the frontmatter with `name`, `description` and `license`
+ * (when present as own properties) placed first, followed by every other key
+ * in its original enumeration order.
+ * @param {object} frontmatter - Frontmatter key/value record.
+ * @returns {object} New object holding the same entries in the preferred order.
+ */
+function orderFrontmatter(frontmatter) {
+  const owns = (target, key) => Object.prototype.hasOwnProperty.call(target, key);
+  const ordered = {};
+  ["name", "description", "license"].filter((key) => owns(frontmatter, key)).forEach((key) => {
+    ordered[key] = frontmatter[key];
+  });
+  Object.keys(frontmatter).forEach((key) => {
+    if (!owns(ordered, key)) {
+      ordered[key] = frontmatter[key];
+    }
+  });
+  return ordered;
+}
+/**
+ * Pick the line ending to use when re-serializing a document.
+ * @param {string} raw - Document text.
+ * @returns {string} "\r\n" if the text contains at least one CRLF pair, otherwise "\n".
+ */
+function detectLineEnding(raw) {
+  if (raw.indexOf("\r\n") !== -1) {
+    return "\r\n";
+  }
+  return "\n";
+}
+/**
+ * Serialize frontmatter and body into the full text of a SKILL.md document:
+ * a "---" fence, the YAML dump of the ordered frontmatter, a closing "---"
+ * fence, a blank line, the trimmed body and one trailing line ending.
+ * @param {object} frontmatter - Frontmatter record to dump as YAML.
+ * @param {string} content - Markdown body; surrounding whitespace is trimmed.
+ * @param {string} lineEnding - Line ending applied to the whole document ("\n" or "\r\n").
+ * @returns {string} The assembled document.
+ * @throws {Error} When the body is empty after trimming.
+ */
+function buildSkillMarkdown(frontmatter, content, lineEnding) {
+  const normalizedBody = content.trim();
+  if (normalizedBody.length === 0) {
+    throw new Error("Candidate rewrite produced an empty SKILL.md body.");
+  }
+  // NOTE(review): js-yaml documents -1 as the "unlimited" lineWidth; confirm that 0
+  // really disables folding rather than falling back to the default width.
+  const frontmatterBlock = yaml2.dump(orderFrontmatter(frontmatter), {
+    lineWidth: 0,
+    noRefs: true,
+    sortKeys: false
+  }).replace(/\n/g, lineEnding);
+  // Consume an optional "\r" together with each "\n": a body that already uses CRLF
+  // must not become "\r\r\n" for CRLF output, nor keep stray "\r" for LF output.
+  const body = normalizedBody.replace(/\r?\n/g, lineEnding);
+  return `---${lineEnding}${frontmatterBlock}---${lineEnding}${lineEnding}${body}${lineEnding}`;
+}
+/**
+ * Check every relative file reference found in a candidate document.
+ * @param {string} raw - Candidate SKILL.md text.
+ * @param {string} skillRoot - Directory the references are resolved against.
+ * @returns {Promise<void>} Resolves when every reference stays under the root and exists.
+ * @throws {Error} On the first reference that leaves the root or does not exist.
+ */
+async function validateRelativeReferences(raw, skillRoot) {
+  for (const reference of extractRelativeFileReferences(raw)) {
+    const resolved = path7.resolve(skillRoot, reference);
+    const relativeToRoot = path7.relative(skillRoot, resolved);
+    // Only a leading ".." path *segment* leaves the root. A plain startsWith("..")
+    // would also reject in-root entries whose name merely begins with two dots.
+    const escapesRoot = relativeToRoot === ".." || relativeToRoot.startsWith(`..${path7.sep}`) || path7.isAbsolute(relativeToRoot);
+    if (escapesRoot) {
+      throw new Error(`Candidate rewrite introduced an out-of-root reference: ${reference}`);
+    }
+    if (!await pathExists(resolved)) {
+      throw new Error(`Candidate rewrite introduced a broken relative reference: ${reference}`);
+    }
+  }
+}
+/**
+ * Turn a model rewrite into a validated candidate document.
+ * Rejects rewrites that change the skill name or an existing license, merges
+ * the proposed frontmatter over the current one (name and existing license are
+ * pinned to their current values), serializes the result, then runs the strict
+ * parser and the relative-reference check on it.
+ * @param {object} skill - Parsed skill (`frontmatter`, `raw`, `skillRoot`, `skillFile`).
+ * @param {object} rewrite - Rewrite with `frontmatter`, `content`, `changeSummary`, `targetedProblems`.
+ * @returns {Promise<object>} Candidate with `frontmatter`, `content`, `raw`, `changeSummary`, `targetedProblems`.
+ * @throws {Error} On a rename, a license change, or any validation failure.
+ */
+async function buildCandidate(skill, rewrite) {
+  const current = skill.frontmatter;
+  const proposed = rewrite.frontmatter;
+  const renamesSkill = typeof proposed.name === "string" && proposed.name !== current.name;
+  if (renamesSkill) {
+    throw new Error(`Candidate rewrite attempted to rename skill '${current.name}' to '${proposed.name}'.`);
+  }
+  const changesLicense = current.license && typeof proposed.license === "string" && proposed.license !== current.license;
+  if (changesLicense) {
+    throw new Error(`Candidate rewrite attempted to change license '${current.license}' to '${proposed.license}'.`);
+  }
+  const mergedFrontmatter = { ...current, ...proposed };
+  // Re-pin the protected fields after the merge so non-string overrides cannot win.
+  mergedFrontmatter.name = current.name;
+  if (current.license) {
+    mergedFrontmatter.license = current.license;
+  }
+  const lineEnding = detectLineEnding(skill.raw);
+  const raw = buildSkillMarkdown(mergedFrontmatter, rewrite.content, lineEnding);
+  // Called only for its validation errors; the parsed result is discarded.
+  parseSkillDocumentStrict(raw, skill.skillRoot, skill.skillFile);
+  await validateRelativeReferences(raw, skill.skillRoot);
+  const { changeSummary, targetedProblems } = rewrite;
+  return {
+    frontmatter: mergedFrontmatter,
+    content: rewrite.content.trim(),
+    raw,
+    changeSummary,
+    targetedProblems
+  };
+}
+/**
+ * Condense a check result into the list of problems handed to the rewriter.
+ * @param {object} result - Check result with `lint.issues` and optional `trigger` / `eval` sections.
+ * @returns {{lintIssues: object[], triggerFailures: object[], evalFailures: object[], triggerSuggestions: *[]}}
+ *   Non-passing lint issues, unmatched trigger cases, failed eval assertions and
+ *   the trigger suggestions (empty arrays when a section is absent).
+ */
+function extractActionableIssues(result) {
+  const lintIssues = [];
+  for (const issue of result.lint.issues) {
+    if (issue.status === "pass") {
+      continue;
+    }
+    lintIssues.push({
+      checkId: issue.checkId,
+      title: issue.title,
+      // Every non-passing status other than "warn" is reported as "fail".
+      status: issue.status === "warn" ? "warn" : "fail",
+      message: issue.message,
+      suggestion: issue.suggestion,
+      startLine: issue.startLine,
+      endLine: issue.endLine
+    });
+  }
+  const triggerFailures = [];
+  if (result.trigger != null) {
+    for (const testCase of result.trigger.cases) {
+      if (testCase.matched) {
+        continue;
+      }
+      const { query, expected, actual, selectedCompetitor, rawModelResponse } = testCase;
+      triggerFailures.push({ query, expected, actual, selectedCompetitor, rawModelResponse });
+    }
+  }
+  const evalFailures = [];
+  if (result.eval != null) {
+    for (const promptResult of result.eval.results) {
+      for (const assertion of promptResult.assertions) {
+        if (assertion.passed) {
+          continue;
+        }
+        const knownSource = assertion.source === "grader" || assertion.source === "tool";
+        evalFailures.push({
+          prompt: promptResult.prompt,
+          assertion: assertion.assertion,
+          evidence: assertion.evidence,
+          source: knownSource ? assertion.source : "unknown"
+        });
+      }
+    }
+  }
+  const triggerSuggestions = result.trigger?.suggestions ?? [];
+  return { lintIssues, triggerFailures, evalFailures, triggerSuggestions };
+}
+/**
+ * Tell whether a brief lists anything to work on.
+ * @param {object} brief - Brief with `lintIssues`, `triggerFailures`, `evalFailures`, `triggerSuggestions` arrays.
+ * @returns {boolean} True when at least one of the four lists is non-empty.
+ */
+function hasActionableProblems(brief) {
+  const lists = [brief.lintIssues, brief.triggerFailures, brief.evalFailures, brief.triggerSuggestions];
+  return lists.some((list) => list.length > 0);
+}
+/**
+ * Recursively list the regular files under a directory.
+ * @param {string} skillRoot - Directory to walk.
+ * @returns {Promise<string[]>} Sorted paths relative to `skillRoot`, always using
+ *   "/" as the separator (for example "references/guide.md").
+ */
+async function listSkillFiles(skillRoot) {
+  const entries = await fs12.readdir(skillRoot, { withFileTypes: true });
+  const files = [];
+  for (const entry of entries) {
+    if (entry.isDirectory()) {
+      // The recursive call reports paths relative to the subdirectory, so the
+      // directory name must be put back in front to stay relative to skillRoot.
+      const nested = await listSkillFiles(path7.join(skillRoot, entry.name));
+      for (const file of nested) {
+        files.push(`${entry.name}/${file}`);
+      }
+      continue;
+    }
+    if (entry.isFile()) {
+      files.push(entry.name);
+    }
+  }
+  return files.sort();
+}
+/**
+ * Ask the model for a rewritten skill and validate the shape of its answer.
+ * The system prompt carries the output contract and guard rails; the user
+ * prompt carries the baseline metrics, the file listing, the current SKILL.md
+ * and the problem brief as pretty-printed JSON.
+ * @param {object} skill - Parsed skill (`skillRoot`, `skillFile`, `raw`, `frontmatter`).
+ * @param {object} baseline - Baseline check result (`lint.summary`, optional `trigger`, `eval`).
+ * @param {object} brief - Problems to fix.
+ * @param {object} provider - Object exposing `sendMessage(systemPrompt, userPrompt, { model })`.
+ * @param {string} model - Model identifier forwarded to the provider.
+ * @returns {Promise<object>} Rewrite data accepted by `improveRewriteSchema`.
+ * @throws {Error} When the reply contains no JSON object or fails schema validation.
+ */
+async function requestRewrite(skill, baseline, brief, provider, model) {
+  const availableFiles = await listSkillFiles(skill.skillRoot);
+  const { name, license } = skill.frontmatter;
+  const licenseRule = license ? `Keep the license exactly '${license}'.` : "Do not remove any valid existing frontmatter fields.";
+  const systemRules = [
+    "You rewrite Agent Skill files to improve measured quality.",
+    "Return JSON only.",
+    "Required format:",
+    '{"frontmatter": {...}, "content": "...", "changeSummary": ["..."], "targetedProblems": ["..."]}',
+    "The content field must contain only the markdown body of SKILL.md, without YAML frontmatter fences.",
+    `Keep the skill name exactly '${name}'.`,
+    licenseRule,
+    "Do not invent new scripts, assets, references, APIs, or tools.",
+    "Only reference files that already exist under the skill root.",
+    "Optimize for trigger clarity, explicit scope boundaries, concrete examples, safety guidance, and tool usage instructions."
+  ];
+  const systemPrompt = systemRules.join(" ");
+  // A missing trigger section is reported to the model as an F1 of 0.
+  const triggerF1 = baseline.trigger?.metrics.f1 ?? 0;
+  const evalPassRate = calculateEvalAssertPassRate2(baseline.eval);
+  const lintSummary = baseline.lint.summary;
+  const fileBullets = availableFiles.map((file) => `- ${file}`);
+  const userLines = [
+    `Skill file: ${skill.skillFile}`,
+    `Current trigger F1: ${triggerF1.toFixed(4)}`,
+    `Current eval assertion pass rate: ${evalPassRate.toFixed(4)}`,
+    `Lint failures: ${lintSummary.failures}`,
+    `Lint warnings: ${lintSummary.warnings}`,
+    "",
+    "Available files under the skill root:"
+  ].concat(fileBullets, [
+    "",
+    "Current SKILL.md:",
+    "```markdown",
+    skill.raw,
+    "```",
+    "",
+    "Actionable problems to fix:",
+    JSON.stringify(brief, null, 2),
+    "",
+    "Rewrite the skill to address only these evidenced problems. Keep the instructions tight and practical."
+  ]);
+  const userPrompt = userLines.join("\n");
+  const reply = await provider.sendMessage(systemPrompt, userPrompt, { model });
+  const validation = improveRewriteSchema.safeParse(extractJsonObject2(reply));
+  if (validation.success) {
+    return validation.data;
+  }
+  throw new Error(`Failed to parse improve output: ${validation.error.issues[0]?.message ?? "invalid improve JSON"}`);
+}
+/**
+ * Create a throwaway copy of the skill directory whose SKILL.md holds the
+ * candidate text.
+ * @param {string} skillRoot - Skill directory to copy.
+ * @param {string} candidateRaw - Text written to SKILL.md inside the copy.
+ * @returns {Promise<{tempRoot: string, skillPath: string}>} `tempRoot` is the new
+ *   temporary directory (the caller removes it); `skillPath` is the copied skill
+ *   directory inside it, named like the original.
+ * @throws Rethrows any copy or write error after removing the temporary directory.
+ */
+async function createVerificationDirectory(skillRoot, candidateRaw) {
+  const tempRoot = await fs12.mkdtemp(path7.join(os.tmpdir(), "skilltest-improve-"));
+  try {
+    const tempSkillRoot = path7.join(tempRoot, path7.basename(skillRoot));
+    await fs12.cp(skillRoot, tempSkillRoot, { recursive: true });
+    await fs12.writeFile(path7.join(tempSkillRoot, "SKILL.md"), candidateRaw, "utf8");
+    return {
+      tempRoot,
+      skillPath: tempSkillRoot
+    };
+  } catch (error) {
+    // The caller never receives tempRoot on failure, so clean it up here.
+    // Cleanup is best-effort: its own failure must not hide the original error.
+    await fs12.rm(tempRoot, { recursive: true, force: true }).catch(() => {
+    });
+    throw error;
+  }
+}
+/**
+ * Compare a baseline check result with the verification result of a candidate.
+ * Lint deltas are `before - after` (positive means fewer problems); score deltas
+ * are `after - before` (positive means a higher score). A missing trigger
+ * section counts as an F1 of 0.
+ * @param {object} baseline - Check result before the rewrite.
+ * @param {object} verification - Check result for the candidate.
+ * @returns {object} Before/after/delta per metric, the overall gate status before
+ *   and after, plus `improved` and `hasRegression` flags.
+ */
+function buildDelta(baseline, verification) {
+  const before = {
+    failures: baseline.lint.summary.failures,
+    warnings: baseline.lint.summary.warnings,
+    f1: baseline.trigger?.metrics.f1 ?? 0
+  };
+  const after = {
+    failures: verification.lint.summary.failures,
+    warnings: verification.lint.summary.warnings,
+    f1: verification.trigger?.metrics.f1 ?? 0
+  };
+  before.passRate = calculateEvalAssertPassRate2(baseline.eval);
+  after.passRate = calculateEvalAssertPassRate2(verification.eval);
+  const gains = [
+    before.failures - after.failures,
+    before.warnings - after.warnings,
+    after.f1 - before.f1,
+    after.passRate - before.passRate
+  ];
+  const [lintFailuresDelta, lintWarningsDelta, triggerF1Delta, evalPassRateDelta] = gains;
+  // Any metric moving the wrong way is a regression, even if the others improved.
+  const hasRegression = gains.some((gain) => gain < 0);
+  const passedBefore = baseline.gates.overallPassed;
+  const passedAfter = verification.gates.overallPassed;
+  // A flip of the overall gate decides on its own; otherwise any positive gain counts.
+  let improved;
+  if (passedAfter !== passedBefore) {
+    improved = passedAfter;
+  } else {
+    improved = gains.some((gain) => gain > 0);
+  }
+  return {
+    lintFailures: { before: before.failures, after: after.failures, delta: lintFailuresDelta },
+    lintWarnings: { before: before.warnings, after: after.warnings, delta: lintWarningsDelta },
+    triggerF1: { before: before.f1, after: after.f1, delta: triggerF1Delta },
+    evalAssertPassRate: { before: before.passRate, after: after.passRate, delta: evalPassRateDelta },
+    overallPassed: { before: passedBefore, after: passedAfter },
+    improved,
+    hasRegression
+  };
+}
+/**
+ * Return a shallow copy of a check result with its `target` replaced.
+ * @param {object} result - Check result to copy; it is not modified.
+ * @param {*} target - Value stored as `target` on the copy.
+ * @returns {object} The copy.
+ */
+function normalizeVerificationTarget(result, target) {
+  const normalized = { ...result };
+  normalized.target = target;
+  return normalized;
+}
+/**
+ * Explain why a candidate must not be written, if there is such a reason.
+ * Checked in order: regression, no measurable improvement, quality gates still failing.
+ * @param {object} delta - Delta with `hasRegression` and `improved` flags.
+ * @param {object} verification - Verification result exposing `gates.overallPassed`.
+ * @returns {string|undefined} The blocking message, or undefined when nothing blocks.
+ */
+function buildBlockingReason(delta, verification) {
+  const { hasRegression, improved } = delta;
+  if (hasRegression) {
+    return "Candidate rewrite regressed one or more quality metrics on the frozen test set.";
+  }
+  if (!improved) {
+    return "Candidate rewrite did not produce a measurable improvement on the frozen test set.";
+  }
+  return verification.gates.overallPassed ? undefined : "Candidate rewrite improved the skill but still failed the configured quality gates.";
+}
+/**
+ * Write text to a file, creating missing parent directories first.
+ * @param {string} outputPath - Destination path; resolved to an absolute path.
+ * @param {string} raw - Text to write as UTF-8.
+ * @returns {Promise<string>} The absolute path that was written.
+ */
+async function maybeWriteOutput(outputPath, raw) {
+  const destination = path7.resolve(outputPath);
+  const parentDirectory = path7.dirname(destination);
+  await fs12.mkdir(parentDirectory, { recursive: true });
+  await fs12.writeFile(destination, raw, "utf8");
+  return destination;
+}
+/**
+ * Run the improve pipeline for one skill:
+ * 1. baseline check; 2. model rewrite; 3. candidate validation; 4. check of the
+ * candidate in a temporary copy using the baseline's queries and prompts;
+ * 5. optional writes when nothing blocks the candidate.
+ * Stops early with a `blockedReason` when the baseline has no trigger/eval
+ * section, when there is nothing to fix, or when the candidate text equals
+ * the current text.
+ * @param {string} inputPath - Path handed to the check and the strict parser.
+ * @param {object} options - Provider, model, lint/trigger/eval settings, thresholds,
+ *   `apply`, `outputPath`, `verbose` and an optional `onStage(stage)` callback.
+ * @returns {Promise<object>} Result with target, provider name, model, original text,
+ *   thresholds, baseline, candidate, verification, delta, `applied`, and, when
+ *   set, `outputPath` and `blockedReason`.
+ */
+async function runImprove(inputPath, options) {
+  const announce = (stage) => options.onStage?.(stage);
+  // Both check runs share every setting except the trigger/eval inputs.
+  const checkOptionsFor = (inputs) => ({
+    provider: options.provider,
+    model: options.model,
+    graderModel: options.model,
+    lintFailOn: options.lintFailOn,
+    lintSuppress: options.lintSuppress,
+    lintPlugins: options.lintPlugins,
+    compare: options.compare,
+    numQueries: inputs.numQueries,
+    triggerSeed: options.triggerSeed,
+    queries: inputs.queries,
+    evalNumRuns: inputs.evalNumRuns,
+    prompts: inputs.prompts,
+    evalMaxToolIterations: options.evalMaxToolIterations,
+    concurrency: options.concurrency,
+    minF1: options.minF1,
+    minAssertPassRate: options.minAssertPassRate,
+    continueOnLintFail: true,
+    verbose: options.verbose
+  });
+  announce("baseline");
+  const baseline = await runCheck(inputPath, checkOptionsFor({
+    numQueries: options.numQueries,
+    queries: options.queries,
+    evalNumRuns: options.evalNumRuns,
+    prompts: options.prompts
+  }));
+  // Common head of every result object; `outcome` supplies the remaining keys in order.
+  const report = (originalRaw, outcome) => ({
+    target: inputPath,
+    provider: options.provider.name,
+    model: options.model,
+    originalRaw,
+    thresholds: {
+      minF1: options.minF1,
+      minAssertPassRate: options.minAssertPassRate
+    },
+    baseline,
+    ...outcome
+  });
+  if (!baseline.trigger || !baseline.eval) {
+    return report("", {
+      candidate: null,
+      verification: null,
+      delta: null,
+      applied: false,
+      blockedReason: baseline.triggerSkippedReason ?? baseline.evalSkippedReason ?? "Improve requires a strictly parseable skill so trigger and eval can be frozen."
+    });
+  }
+  const skill = await parseSkillStrict(inputPath);
+  const brief = extractActionableIssues(baseline);
+  if (!hasActionableProblems(brief)) {
+    return report(skill.raw, {
+      candidate: null,
+      verification: null,
+      delta: null,
+      applied: false,
+      blockedReason: "No actionable failures, warnings, or mismatches were found to improve."
+    });
+  }
+  announce("generate");
+  const rewrite = await requestRewrite(skill, baseline, brief, options.provider, options.model);
+  announce("validate");
+  const candidate = await buildCandidate(skill, rewrite);
+  if (candidate.raw === skill.raw) {
+    return report(skill.raw, {
+      candidate,
+      verification: null,
+      delta: null,
+      applied: false,
+      blockedReason: "Candidate rewrite produced no changes."
+    });
+  }
+  announce("verify");
+  const verificationDirectory = await createVerificationDirectory(skill.skillRoot, candidate.raw);
+  let verification;
+  try {
+    // Re-use the baseline's queries and prompts so both runs see the same inputs.
+    const frozenOptions = checkOptionsFor({
+      numQueries: baseline.trigger.queries.length,
+      queries: baseline.trigger.queries,
+      evalNumRuns: baseline.eval.prompts.length,
+      prompts: baseline.eval.prompts
+    });
+    const checked = await runCheck(verificationDirectory.skillPath, frozenOptions);
+    // Report against the caller's path rather than the temporary copy.
+    verification = normalizeVerificationTarget(checked, inputPath);
+  } finally {
+    await fs12.rm(verificationDirectory.tempRoot, { recursive: true, force: true });
+  }
+  const delta = buildDelta(baseline, verification);
+  const blockedReason = buildBlockingReason(delta, verification);
+  const mayWrite = !blockedReason;
+  let outputPath;
+  if (mayWrite && options.outputPath) {
+    announce("write");
+    outputPath = await maybeWriteOutput(options.outputPath, candidate.raw);
+  }
+  let applied = false;
+  if (mayWrite && options.apply) {
+    announce("write");
+    await fs12.writeFile(skill.skillFile, candidate.raw, "utf8");
+    applied = true;
+  }
+  const outcome = { candidate, verification, delta, applied };
+  if (outputPath) {
+    outcome.outputPath = outputPath;
+  }
+  if (blockedReason) {
+    outcome.blockedReason = blockedReason;
+  }
+  return report(skill.raw, outcome);
+}
+// src/commands/improve.ts
+// Validates the option object read from the command line for `improve`.
+// Every field is optional; `compare` and `plugin` are lists of non-empty
+// strings, `seed` must be an integer and `concurrency` an integer >= 1.
+var improveCliSchema = z12.object({
+  apiKey: z12.string().optional(),
+  queries: z12.string().optional(),
+  compare: z12.array(z12.string().min(1)).optional(),
+  seed: z12.number().int().optional(),
+  prompts: z12.string().optional(),
+  plugin: z12.array(z12.string().min(1)).optional(),
+  concurrency: z12.number().int().min(1).optional(),
+  output: z12.string().optional(),
+  saveResults: z12.string().optional(),
+  apply: z12.boolean().optional(),
+  verbose: z12.boolean().optional()
+});
+// Model identifiers compared and substituted by resolveModel4.
+var DEFAULT_ANTHROPIC_MODEL4 = "claude-sonnet-4-5-20250929";
+var DEFAULT_OPENAI_MODEL4 = "gpt-4.1-mini";
+/**
+ * Accumulator for the repeatable `--plugin` option.
+ * @param {string} value - Newly supplied plugin path.
+ * @param {string[]} [previous=[]] - Paths collected so far; left unmodified.
+ * @returns {string[]} New array with `value` appended.
+ */
+function collectPluginPaths3(value, previous = []) {
+  const collected = Array.from(previous);
+  collected.push(value);
+  return collected;
+}
+/**
+ * Choose the model to use for a provider.
+ * @param {string} provider - Provider name.
+ * @param {string} model - Configured model identifier.
+ * @returns {string} DEFAULT_OPENAI_MODEL4 when the provider is "openai" and the
+ *   model is still DEFAULT_ANTHROPIC_MODEL4; otherwise `model` unchanged.
+ */
+function resolveModel4(provider, model) {
+  const stillAnthropicDefault = model === DEFAULT_ANTHROPIC_MODEL4;
+  return provider === "openai" && stillAnthropicDefault ? DEFAULT_OPENAI_MODEL4 : model;
+}
+/**
+ * Execute the `improve` command: load inputs, run the pipeline, print the
+ * result and set the process exit code (0 when nothing blocked the candidate,
+ * 1 when the result carries a `blockedReason`, 2 when an error was thrown).
+ * A spinner is shown only for non-JSON output on a TTY.
+ * @param {string} targetPath - Path argument given on the command line.
+ * @param {object} options - Merged global, config and CLI options.
+ * @param {object} command - Command object, used to load configured eval prompts.
+ * @returns {Promise<void>}
+ */
+async function handleImproveCommand(targetPath, options, command) {
+  const spinner = options.json || !process.stdout.isTTY ? null : ora4("Preparing improvement run...").start();
+  const setStatus = (text) => {
+    if (spinner) {
+      spinner.text = text;
+    }
+  };
+  // Spinner text per pipeline stage; stages not listed leave the text untouched.
+  const stageStatus = new Map([
+    ["baseline", () => "Running baseline check..."],
+    ["generate", () => "Generating candidate rewrite..."],
+    ["validate", () => "Validating candidate rewrite..."],
+    ["verify", () => "Verifying candidate against frozen test inputs..."],
+    ["write", () => options.apply ? "Writing improved SKILL.md..." : "Writing candidate output..."]
+  ]);
+  try {
+    setStatus("Initializing model provider...");
+    const provider = createProvider(options.provider, options.apiKey);
+    let queries;
+    if (options.queries) {
+      setStatus("Loading frozen trigger queries...");
+      queries = await loadTriggerQueriesFile(options.queries);
+    }
+    let prompts;
+    if (options.prompts) {
+      setStatus("Loading eval prompts...");
+      prompts = await loadEvalPromptsJson(options.prompts);
+    } else {
+      prompts = await loadConfiguredEvalPrompts(command);
+    }
+    const model = resolveModel4(options.provider, options.model);
+    const result = await runImprove(targetPath, {
+      provider,
+      model,
+      lintFailOn: options.lintFailOn,
+      lintSuppress: options.lintSuppress,
+      lintPlugins: options.lintPlugins,
+      compare: options.compare,
+      numQueries: options.numQueries,
+      triggerSeed: options.triggerSeed,
+      queries,
+      prompts,
+      evalNumRuns: options.numRuns,
+      evalMaxToolIterations: options.maxToolIterations,
+      minF1: options.minF1,
+      minAssertPassRate: options.minAssertPassRate,
+      concurrency: options.concurrency,
+      apply: options.apply,
+      outputPath: options.output,
+      verbose: options.verbose,
+      onStage: (stage) => {
+        const describe = stageStatus.get(stage);
+        if (spinner && describe) {
+          spinner.text = describe();
+        }
+      }
+    });
+    if (options.saveResults) {
+      await writeJsonFile(options.saveResults, result);
+    }
+    spinner?.stop();
+    if (options.json) {
+      writeResult(result, true);
+    } else {
+      const rendered = renderImproveReport(result, options.color, options.verbose);
+      writeResult(rendered, false);
+    }
+    process.exitCode = result.blockedReason ? 1 : 0;
+  } catch (error) {
+    spinner?.stop();
+    writeError(error, options.json);
+    process.exitCode = 2;
+  }
+}
+/**
+ * Register the `improve` sub-command with its options and action handler.
+ * The action validates the parsed CLI options, merges them with the global
+ * options and the resolved config, and delegates to handleImproveCommand.
+ * Invalid CLI options are reported and set the exit code to 2.
+ * @param {object} program - Root command the sub-command is attached to.
+ * @returns {void}
+ */
+function registerImproveCommand(program) {
+  const toInteger = (value) => Number.parseInt(value, 10);
+  const toFloat = (value) => Number.parseFloat(value);
+  const runAction = async (targetPath, _commandOptions, command) => {
+    const globalOptions = getGlobalCliOptions(command);
+    const config = getResolvedConfig(command);
+    const parsedCli = improveCliSchema.safeParse(command.opts());
+    if (!parsedCli.success) {
+      writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid improve options."), globalOptions.json);
+      process.exitCode = 2;
+      return;
+    }
+    const cli = parsedCli.data;
+    const handlerOptions = {
+      ...globalOptions,
+      provider: config.provider,
+      model: config.model,
+      apiKey: cli.apiKey,
+      queries: cli.queries,
+      compare: config.trigger.compare,
+      numQueries: config.trigger.numQueries,
+      prompts: cli.prompts,
+      minF1: config.trigger.threshold,
+      minAssertPassRate: config.eval.threshold,
+      numRuns: config.eval.numRuns,
+      maxToolIterations: config.eval.maxToolIterations,
+      concurrency: config.concurrency,
+      lintFailOn: config.lint.failOn,
+      lintSuppress: config.lint.suppress,
+      lintPlugins: config.lint.plugins,
+      // An explicit --seed wins over the configured seed.
+      triggerSeed: cli.seed ?? config.trigger.seed,
+      output: cli.output,
+      saveResults: cli.saveResults,
+      apply: Boolean(cli.apply),
+      verbose: Boolean(cli.verbose)
+    };
+    await handleImproveCommand(targetPath, handlerOptions, command);
+  };
+  program
+    .command("improve")
+    .description("Rewrite SKILL.md, verify it on frozen test inputs, and optionally apply it.")
+    .argument("<path-to-skill>", "Path to SKILL.md or skill directory")
+    .option("--provider <provider>", "LLM provider: anthropic|openai")
+    .option("--model <model>", "Model for baseline, rewrite, and verification runs")
+    .option("--api-key <key>", "API key override")
+    .option("--queries <path>", "Path to custom trigger queries JSON")
+    .option("--compare <path...>", "Path(s) to sibling skill directories to include as competitors")
+    .option("--num-queries <n>", "Number of auto-generated trigger queries", toInteger)
+    .option("--seed <number>", "RNG seed for reproducible trigger results", toInteger)
+    .option("--prompts <path>", "Path to eval prompts JSON")
+    .option("--plugin <path>", "Load a custom lint plugin file", collectPluginPaths3, [])
+    .option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", toInteger)
+    .option("--output <path>", "Write the verified candidate SKILL.md to a separate file")
+    .option("--save-results <path>", "Save the full improve result JSON")
+    .option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", toFloat)
+    .option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", toFloat)
+    .option("--apply", "Apply the verified rewrite to the source SKILL.md")
+    .option("--verbose", "Include detailed baseline and verification reports")
+    .action(runAction);
+}
 // src/index.ts
 function resolveVersion() {
   try {
     const currentFilePath = fileURLToPath(import.meta.url);
-    const packageJsonPath = path6.resolve(path6.dirname(currentFilePath), "..", "package.json");
-    const raw = fs11.readFileSync(packageJsonPath, "utf8");
+    const packageJsonPath = path8.resolve(path8.dirname(currentFilePath), "..", "package.json");
+    const raw = fs13.readFileSync(packageJsonPath, "utf8");
     const parsed = JSON.parse(raw);
     return parsed.version ?? "0.0.0";
   } catch {
@@ -3879,6 +5404,7 @@ async function run(argv) {
   registerTriggerCommand(program);
   registerEvalCommand(program);
   registerCheckCommand(program);
+  registerImproveCommand(program);
   try {
     await program.parseAsync(argv);
   } catch (error) {