npm - skilltest - Versions diffs - 0.9.0 → 0.10.0 - Mend

skilltest 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.js CHANGED Viewed

@@ -1,8 +1,8 @@
 #!/usr/bin/env node
 // src/index.ts
-import fs13 from "node:fs";
-import path8 from "node:path";
+import fs14 from "node:fs";
+import path9 from "node:path";
 import { fileURLToPath } from "node:url";
 import { Command } from "commander";
@@ -2315,6 +2315,76 @@ function renderCheckHtml(result) {
   );
   return renderHtmlDocument(`skilltest check - ${skillName}`, [header, lintSection, triggerSection, evalSection, qualityGate].join(""));
 }
+/**
+ * Render the routing confusion matrix as an HTML table fragment.
+ *
+ * Rows are the target skills; columns are every skill plus a trailing "none"
+ * column. The diagonal cell is tinted green, off-diagonal cells above 15% red,
+ * and off-diagonal cells above 5% yellow.
+ *
+ * @param result Route result; reads `skills` and `matrixPct`.
+ * @returns {string} A `<style>` element followed by a scrollable table wrapper.
+ */
+function renderRouteMatrix(result) {
+  const columns = [...result.skills, "none"];
+  // Background rule for one cell; the diagonal wins over the bleed thresholds.
+  const backgroundFor = (target, col, pct) => {
+    if (col === target) return "background:rgba(34,197,94,0.18);";
+    if (pct > 0.15) return "background:rgba(239,68,68,0.18);";
+    if (pct > 0.05) return "background:rgba(234,179,8,0.12);";
+    return "";
+  };
+  const headerCells = columns.map((col) => `<th>${escapeHtml(col)}</th>`).join("");
+  const bodyRows = [];
+  for (const target of result.skills) {
+    let cells = "";
+    for (const col of columns) {
+      const pct = result.matrixPct[target]?.[col] ?? 0;
+      cells += `<td style="${backgroundFor(target, col, pct)}">${escapeHtml(formatPercent(pct))}</td>`;
+    }
+    bodyRows.push(`<tr><th>${escapeHtml(target)}</th>${cells}</tr>`);
+  }
+  const styleBlock = `<style>.rt{border-collapse:collapse;font-size:.85rem;width:100%}.rt th,.rt td{border:1px solid #d4d4d8;padding:8px 12px;text-align:center}.rt thead th{background:#fafafa;font-weight:700}</style>`;
+  const table = `<table class="rt"><thead><tr><th></th>${headerCells}</tr></thead><tbody>${bodyRows.join("")}</tbody></table>`;
+  return `${styleBlock}<div style="overflow-x:auto">${table}</div>`;
+}
+/**
+ * Build the full HTML report for a route-test result.
+ *
+ * Sections are concatenated in this order: header card, routing matrix,
+ * per-skill metrics, conflicts (only when there are any), and suggestions.
+ *
+ * @param result Route result; reads `skills`, `skillDir`, `overallAccuracy`,
+ *   `conflicts`, `numQueriesPerSkill`, `provider`, `model`, `seed`,
+ *   `perSkillMetrics`, `matrixPct` and `suggestions`.
+ * @returns {string} Whatever `renderHtmlDocument` returns for the assembled sections.
+ */
+function renderRouteHtml(result) {
+  const conflictCount = result.conflicts.length;
+  // Overall accuracy below 80% and any conflict are both reported as "warn".
+  const overallStatus = result.overallAccuracy >= 0.8 ? "pass" : "warn";
+  const conflictStatus = conflictCount === 0 ? "pass" : "warn";
+  const header = renderHeaderCard(
+    "route",
+    `Routing Report \u2014 ${result.skills.length} skills`,
+    result.skillDir,
+    [
+      { label: "Overall accuracy", value: formatPercent(result.overallAccuracy), status: overallStatus },
+      { label: "Conflicts", value: String(conflictCount), status: conflictStatus },
+      { label: "Skills", value: String(result.skills.length) },
+      { label: "Queries/skill", value: String(result.numQueriesPerSkill) }
+    ],
+    [
+      { label: "Provider", value: result.provider },
+      { label: "Model", value: result.model },
+      { label: "Seed", value: result.seed !== void 0 ? String(result.seed) : "none" }
+    ]
+  );
+  const matrixSection = renderSectionCard("Routing Matrix", renderRouteMatrix(result));
+  // One message row per skill; an F1 below 80% is shown as "warn".
+  // NOTE(review): `m.skill` is passed to renderMessageRow unescaped here, while the
+  // conflict rows below pass values through escapeHtml first. One of the two is
+  // presumably wrong (double-escaped or unescaped) - confirm against renderMessageRow.
+  const metricsRows = result.perSkillMetrics.map((m) => {
+    const status = m.f1 >= 0.8 ? "pass" : "warn";
+    return renderMessageRow(
+      status,
+      m.skill,
+      `F1: ${formatPercent(m.f1)}  precision: ${formatPercent(m.precision)}  recall: ${formatPercent(m.recall)}`,
+      renderDefinitionList([
+        { label: "Queries", value: String(m.queriesTotal) },
+        { label: "Correct", value: String(m.correct) },
+        { label: "Precision", value: formatPercent(m.precision) },
+        { label: "Recall", value: formatPercent(m.recall) }
+      ])
+    );
+  }).join("");
+  const metricsSection = renderSectionCard("Per-Skill Metrics", `<div class="row-list">${metricsRows}</div>`);
+  // The conflicts section stays an empty string (and so vanishes) when there are none.
+  let conflictsSection = "";
+  if (result.conflicts.length > 0) {
+    const conflictRows = result.conflicts.map(
+      (conflict) => renderMessageRow(
+        "warn",
+        `${escapeHtml(conflict.skillA)} \u2194 ${escapeHtml(conflict.skillB)}`,
+        `${formatPercent(conflict.bleedAtoB)} of ${escapeHtml(conflict.skillA)} queries routed to ${escapeHtml(conflict.skillB)}; ${formatPercent(conflict.bleedBtoA)} the other way`
+      )
+    ).join("");
+    conflictsSection = renderSectionCard("Conflicts", `<div class="row-list">${conflictRows}</div>`);
+  }
+  const suggestionsSection = renderSectionCard(
+    "Suggestions",
+    `<ul>${result.suggestions.map((s) => `<li>${escapeHtml(s)}</li>`).join("")}</ul>`
+  );
+  return renderHtmlDocument(
+    `skilltest route \u2014 ${result.skillDir}`,
+    [header, matrixSection, metricsSection, conflictsSection, suggestionsSection].join("")
+  );
+}
 // src/reporters/terminal.ts
 import { Chalk } from "chalk";
@@ -2659,6 +2729,70 @@ function renderImproveReport(result, enableColor, verbose = false) {
   }
   return lines.join("\n");
 }
+/**
+ * Render a route-test result as plain text for the terminal.
+ *
+ * Output order: run summary, per-skill metrics, routing matrix, conflicts
+ * (if any), overall accuracy, suggestions, and - when `verbose` is set -
+ * every individual test case.
+ *
+ * @param result Route result produced by the route test.
+ * @param enableColor Passed to `getChalkInstance` to obtain the colorizer.
+ * @param {boolean} [verbose=false] Append the per-case listing.
+ * @returns {string} The report, lines joined with "\n".
+ */
+function renderRouteReport(result, enableColor, verbose = false) {
+  const c = getChalkInstance(enableColor);
+  const lines = [
+    "skilltest route",
+    `directory: ${result.skillDir}`,
+    `provider/model: ${result.provider}/${result.model}`,
+    `skills: ${result.skills.length}  queries per skill: ${result.numQueriesPerSkill}`
+  ];
+  lines.push("");
+  lines.push("Per-skill metrics:");
+  for (const m of result.perSkillMetrics) {
+    // An F1 below 80% is flagged as WARN.
+    const badge = m.f1 >= 0.8 ? c.green("PASS") : c.yellow("WARN");
+    lines.push(
+      `  ${m.skill.padEnd(24)} F1: ${formatPercent2(m.f1).padEnd(7)}  precision: ${formatPercent2(m.precision).padEnd(7)}  recall: ${formatPercent2(m.recall)}  [${badge}]`
+    );
+  }
+  lines.push("");
+  lines.push("Routing matrix (% of row queries routed to column):");
+  const colHeaders = [...result.skills, "none"];
+  const colWidth = 10;
+  const rowLabelWidth = 24;
+  const rowIndent = "  ";
+  // Column headers are truncated so that each one fits its fixed-width column.
+  const headerRow = "".padEnd(rowLabelWidth) + colHeaders.map((h) => h.slice(0, colWidth - 1).padEnd(colWidth)).join("");
+  lines.push(rowIndent + headerRow);
+  for (const targetSkill of result.skills) {
+    // The indent is applied outside the padded label, exactly as for the header
+    // row, so that data cells line up under their column headers.
+    const rowLabel = rowIndent + targetSkill.padEnd(rowLabelWidth);
+    const cells = colHeaders.map((col) => {
+      const pct = result.matrixPct[targetSkill]?.[col] ?? 0;
+      // Pad before coloring so escape codes do not count toward the column width.
+      const formatted = formatPercent2(pct).padEnd(colWidth);
+      if (col === targetSkill) return c.green(formatted);
+      if (pct > 0.1) return c.yellow(formatted);
+      return formatted;
+    }).join("");
+    lines.push(rowLabel + cells);
+  }
+  if (result.conflicts.length > 0) {
+    lines.push("");
+    lines.push("Conflicts detected:");
+    for (const conflict of result.conflicts) {
+      lines.push(
+        `  ${conflict.skillA} <-> ${conflict.skillB}  ${formatPercent2(conflict.bleedAtoB)} / ${formatPercent2(conflict.bleedBtoA)} bleed  [${c.yellow("WARN")}]`
+      );
+    }
+  }
+  lines.push("");
+  lines.push(`Overall accuracy: ${formatPercent2(result.overallAccuracy)}`);
+  lines.push("");
+  lines.push("Suggestions:");
+  for (const suggestion of result.suggestions) {
+    lines.push(`- ${suggestion}`);
+  }
+  if (verbose) {
+    lines.push("");
+    lines.push("Cases:");
+    for (const [index, testCase] of result.cases.entries()) {
+      const status = testCase.correct ? c.green("PASS") : c.red("FAIL");
+      lines.push(`  ${index + 1}. ${status} [${testCase.targetSkill}] ${testCase.query}`);
+      lines.push(`     routed to: ${testCase.actualSkill}`);
+      if (testCase.rawModelResponse) {
+        // Collapse whitespace so a multi-line model reply stays on one line.
+        lines.push(`     model: ${testCase.rawModelResponse.replace(/\s+/g, " ").trim()}`);
+      }
+    }
+  }
+  return lines.join("\n");
+}
 // src/commands/common.ts
 import fs6 from "node:fs/promises";
@@ -3934,7 +4068,7 @@ function extractCliConfigOverrides(command) {
   if (command.getOptionValueSource("model") === "cli") {
     overrides.model = getTypedOptionValue(command, "model");
   }
-  if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("concurrency") === "cli") {
+  if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check" || command.name() === "improve" || command.name() === "route") && command.getOptionValueSource("concurrency") === "cli") {
     overrides.concurrency = getTypedOptionValue(command, "concurrency");
   }
   if ((command.name() === "trigger" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("numQueries") === "cli") {
@@ -5366,12 +5500,308 @@ function registerImproveCommand(program) {
   });
 }
+// src/commands/route.ts
+import fs13 from "node:fs/promises";
+import ora5 from "ora";
+import { z as z14 } from "zod";
+// src/core/route-tester.ts
+import path8 from "node:path";
+import { z as z13 } from "zod";
+// Zod schema for a generated query list: an array of non-empty strings.
+var stringArraySchema = z13.array(z13.string().min(1));
+/**
+ * Extract and parse the JSON array contained in a model reply.
+ *
+ * The span from the first "[" to the last "]" of the trimmed reply is handed
+ * to `JSON.parse`, so any prose around the array is ignored.
+ *
+ * @param {string} raw Raw model output.
+ * @returns {*} The value produced by `JSON.parse` for the bracketed span.
+ * @throws {Error} When no "[" ... "]" span exists.
+ * @throws {SyntaxError} When the bracketed span is not valid JSON.
+ */
+function parseJsonArrayFromModelOutput2(raw) {
+  const text = raw.trim();
+  const open = text.indexOf("[");
+  const close = text.lastIndexOf("]");
+  if (open < 0 || close <= open) {
+    throw new Error("Model did not return a JSON array.");
+  }
+  // A reply that is exactly "[...]" yields open === 0 and close === length - 1,
+  // so the whole text is parsed without needing a separate fast path.
+  return JSON.parse(text.slice(open, close + 1));
+}
+/**
+ * Map a raw routing reply from the model onto a skill name.
+ *
+ * Resolution order:
+ *  1. the reply (ignoring case and wrapping quotes/punctuation) equals a skill name;
+ *  2. the reply begins with the standalone word "none";
+ *  3. a skill name appears in the reply as a standalone token, longest name first.
+ *
+ * @param {string} rawResponse Raw model output.
+ * @param {string[]} skillNames Known skill names.
+ * @returns {string} The matched skill name, "none", or "unrecognized".
+ */
+function parseRouteDecision(rawResponse, skillNames) {
+  const normalized = rawResponse.trim().toLowerCase();
+  // Models often wrap the answer in quotes/backticks or add trailing punctuation.
+  const bare = normalized.replace(/^["'`*\s]+|["'`*\s.!,;:]+$/g, "");
+  const exact = skillNames.find((name) => {
+    const lowered = name.toLowerCase();
+    return lowered === normalized || lowered === bare;
+  });
+  if (exact !== void 0) {
+    return exact;
+  }
+  // Require "none" to be a whole token so that e.g. "nonesuch" is not read as "none".
+  if (/^none(?![\w-])/.test(bare)) {
+    return "none";
+  }
+  // Longest names first: with skills "pdf" and "pdf-editor", the reply
+  // "use pdf-editor" must resolve to "pdf-editor", not to its prefix "pdf".
+  const longestFirst = [...skillNames].sort((a, b) => b.length - a.length);
+  for (const skillName of longestFirst) {
+    const escaped = skillName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+    // Lookarounds instead of \b: hyphens count as part of a name, and names that
+    // start or end with a non-word character can still match.
+    const regex = new RegExp(`(?<![\\w-])${escaped}(?![\\w-])`, "i");
+    if (regex.test(rawResponse)) {
+      return skillName;
+    }
+  }
+  return "unrecognized";
+}
+/**
+ * Find every SKILL.md file beneath a directory.
+ *
+ * @param {string} skillDir Directory handed to `listFilesRecursive`.
+ * @returns {Promise<string[]>} The listed paths whose base name is exactly "SKILL.md".
+ */
+async function discoverSkillPaths(skillDir) {
+  const skillFiles = [];
+  for (const filePath of await listFilesRecursive(skillDir)) {
+    if (path8.basename(filePath) === "SKILL.md") {
+      skillFiles.push(filePath);
+    }
+  }
+  return skillFiles;
+}
+/**
+ * Ask the model for queries that should route to the given skill.
+ *
+ * @param skill Parsed skill; reads `frontmatter.name` and `frontmatter.description`.
+ * @param provider Object exposing `sendMessage(systemPrompt, userPrompt, { model })`.
+ * @param model Model identifier forwarded to the provider.
+ * @param {number} count Number of queries required.
+ * @returns {Promise<string[]>} Exactly `count` queries (surplus ones are dropped).
+ * @throws {Error} When the reply is not a JSON array of non-empty strings, or
+ *   holds fewer than `count` entries. Every failure names the skill concerned.
+ */
+async function generatePositiveQueriesForSkill(skill, provider, model, count) {
+  const systemPrompt = [
+    "You generate realistic user queries that should trigger a specific agent skill.",
+    "Return a JSON array of strings only. No markdown, no comments.",
+    "Each string is one realistic user query that clearly belongs to this skill.",
+    "Queries should look like real user requests with enough context to drive a routing decision."
+  ].join(" ");
+  const userPrompt = [
+    `Skill name: ${skill.frontmatter.name}`,
+    `Skill description: ${skill.frontmatter.description}`,
+    `Generate exactly ${count} distinct queries that should trigger this skill.`
+  ].join("\n");
+  const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
+  let candidate;
+  try {
+    candidate = parseJsonArrayFromModelOutput2(raw);
+  } catch (error) {
+    // Without this, a malformed reply surfaces as a bare JSON error that does
+    // not say which skill's generation failed.
+    const reason = error instanceof Error ? error.message : String(error);
+    throw new Error(
+      `Failed to parse generated queries for skill '${skill.frontmatter.name}': ${reason}`,
+      { cause: error }
+    );
+  }
+  const parsed = stringArraySchema.safeParse(candidate);
+  if (!parsed.success) {
+    throw new Error(
+      `Failed to parse generated queries for skill '${skill.frontmatter.name}': ${parsed.error.issues[0]?.message ?? "invalid format"}`
+    );
+  }
+  if (parsed.data.length < count) {
+    throw new Error(
+      `Expected ${count} queries for skill '${skill.frontmatter.name}', got ${parsed.data.length}.`
+    );
+  }
+  // The model may over-deliver; keep only the requested number.
+  return parsed.data.slice(0, count);
+}
+/**
+ * Format skills as a bulleted "- name: description" list, one skill per line.
+ *
+ * @param skills Parsed skills; reads `frontmatter.name` and `frontmatter.description`.
+ * @returns {string} The lines joined with "\n" (empty string for no skills).
+ */
+function buildSkillListText(skills) {
+  const bullets = [];
+  for (const { frontmatter } of skills) {
+    bullets.push(`- ${frontmatter.name}: ${frontmatter.description}`);
+  }
+  return bullets.join("\n");
+}
+/**
+ * Tally routing outcomes into a confusion matrix and its per-row fractions.
+ *
+ * Both matrices are keyed `[targetSkill][outcome]`, where outcome is any skill
+ * name, "none", or "unrecognized". Cases whose target is not in `skillNames`
+ * are ignored.
+ *
+ * @param cases Routing cases; reads `targetSkill` and `actualSkill`.
+ * @param {string[]} skillNames Skills that form the rows.
+ * @param {number} numQueriesPerSkill Row divisor; values <= 0 are treated as 1.
+ * @returns {{matrix: object, matrixPct: object}} Raw counts and counts / divisor.
+ */
+function buildConfusionMatrix(cases, skillNames, numQueriesPerSkill) {
+  const outcomes = [...skillNames, "none", "unrecognized"];
+  // Start every row with explicit zeroes so unseen outcomes are still present.
+  const matrix = {};
+  skillNames.forEach((target) => {
+    const counts = {};
+    outcomes.forEach((outcome) => {
+      counts[outcome] = 0;
+    });
+    matrix[target] = counts;
+  });
+  cases.forEach(({ targetSkill, actualSkill }) => {
+    const counts = matrix[targetSkill];
+    if (!counts) {
+      return;
+    }
+    counts[actualSkill] = (counts[actualSkill] ?? 0) + 1;
+  });
+  const denominator = numQueriesPerSkill > 0 ? numQueriesPerSkill : 1;
+  const matrixPct = {};
+  skillNames.forEach((target) => {
+    const shares = {};
+    outcomes.forEach((outcome) => {
+      shares[outcome] = (matrix[target][outcome] ?? 0) / denominator;
+    });
+    matrixPct[target] = shares;
+  });
+  return { matrix, matrixPct };
+}
+/**
+ * Derive precision, recall and F1 for each skill from the count matrix.
+ *
+ * True positives are the diagonal cell; false positives are the other skills'
+ * queries that were routed to this skill. Recall divides by
+ * `numQueriesPerSkill`. Any ratio with a zero denominator is reported as 0.
+ *
+ * @param {string[]} skillNames Skills to score.
+ * @param matrix Counts keyed `[targetSkill][actualSkill]`.
+ * @param {number} numQueriesPerSkill Queries issued per skill.
+ * @returns {Array<{skill: string, queriesTotal: number, correct: number, precision: number, recall: number, f1: number}>}
+ */
+function computePerSkillMetrics(skillNames, matrix, numQueriesPerSkill) {
+  const metrics = [];
+  for (const skill of skillNames) {
+    const tp = matrix[skill]?.[skill] ?? 0;
+    let fp = 0;
+    for (const other of skillNames) {
+      if (other === skill) {
+        continue;
+      }
+      fp += matrix[other]?.[skill] ?? 0;
+    }
+    let recall = 0;
+    if (numQueriesPerSkill !== 0) {
+      recall = tp / numQueriesPerSkill;
+    }
+    const routedHere = tp + fp;
+    let precision = 0;
+    if (routedHere !== 0) {
+      precision = tp / routedHere;
+    }
+    let f1 = 0;
+    if (precision + recall !== 0) {
+      f1 = 2 * precision * recall / (precision + recall);
+    }
+    metrics.push({ skill, queriesTotal: numQueriesPerSkill, correct: tp, precision, recall, f1 });
+  }
+  return metrics;
+}
+/**
+ * List skill pairs whose queries bleed into each other beyond a threshold.
+ *
+ * Each unordered pair is examined once, in `skillNames` order. A pair is a
+ * conflict when the larger of its two bleed fractions is strictly greater than
+ * `conflictThreshold`.
+ *
+ * @param {string[]} skillNames Skills to compare pairwise.
+ * @param matrixPct Fractions keyed `[targetSkill][actualSkill]`; missing cells count as 0.
+ * @param {number} conflictThreshold Exclusive lower bound for a conflict.
+ * @returns {Array<{skillA: string, skillB: string, bleedAtoB: number, bleedBtoA: number}>}
+ */
+function detectConflicts(skillNames, matrixPct, conflictThreshold) {
+  const shareOf = (from, to) => matrixPct[from]?.[to] ?? 0;
+  const found = [];
+  for (const [index, skillA] of skillNames.entries()) {
+    for (const skillB of skillNames.slice(index + 1)) {
+      const bleedAtoB = shareOf(skillA, skillB);
+      const bleedBtoA = shareOf(skillB, skillA);
+      const worst = Math.max(bleedAtoB, bleedBtoA);
+      if (worst > conflictThreshold) {
+        found.push({ skillA, skillB, bleedAtoB, bleedBtoA });
+      }
+    }
+  }
+  return found;
+}
+/**
+ * Turn metrics and conflicts into human-readable suggestions.
+ *
+ * Skills with F1 below 0.7 come first, followed by one entry per conflict.
+ * When nothing is flagged, a single "looks clean" message is returned.
+ *
+ * @param perSkillMetrics Per-skill metrics; reads `skill` and `f1`.
+ * @param conflicts Conflicts; reads `skillA`, `skillB`, `bleedAtoB`, `bleedBtoA`.
+ * @returns {string[]} At least one suggestion.
+ */
+function buildRouteSuggestions(perSkillMetrics, conflicts) {
+  const asPercent = (fraction) => (fraction * 100).toFixed(1);
+  const lowF1Notes = perSkillMetrics.filter(({ f1 }) => f1 < 0.7).map(
+    ({ skill, f1 }) => `'${skill}' has low F1 (${asPercent(f1)}%) \u2014 consider clarifying its description and scope boundaries.`
+  );
+  const overlapNotes = conflicts.map(
+    ({ skillA, skillB, bleedAtoB, bleedBtoA }) => `'${skillA}' and '${skillB}' overlap: ${asPercent(bleedAtoB)}% of ${skillA} queries routed to ${skillB}, ${asPercent(bleedBtoA)}% the other way \u2014 consider narrowing scope boundaries.`
+  );
+  const suggestions = [...lowF1Notes, ...overlapNotes];
+  if (suggestions.length > 0) {
+    return suggestions;
+  }
+  return ["Routing looks clean. All skills are well-differentiated on this sample."];
+}
+/**
+ * Run the full routing evaluation for every SKILL.md found under `skillDir`.
+ *
+ * Steps: discover and parse the skills, have the model generate queries for
+ * each one, ask the model to route every query against the full skill list,
+ * then aggregate the outcomes into a confusion matrix, per-skill metrics,
+ * conflicts and suggestions.
+ *
+ * @param {string} skillDir Directory to search (resolved to an absolute path).
+ * @param options Reads `provider`, `model`, `numQueriesPerSkill` (default 10),
+ *   `conflictThreshold` (default 0.1), `concurrency` (default 5), `seed` and `verbose`.
+ * @returns {Promise<object>} The route result consumed by the reporters.
+ * @throws {Error} When fewer than two SKILL.md files are found.
+ */
+async function runRouteTest(skillDir, options) {
+  const numQueriesPerSkill = options.numQueriesPerSkill ?? 10;
+  const conflictThreshold = options.conflictThreshold ?? 0.1;
+  const concurrency = options.concurrency ?? 5;
+  const absoluteSkillDir = path8.resolve(skillDir);
+  const skillPaths = await discoverSkillPaths(absoluteSkillDir);
+  if (skillPaths.length < 2) {
+    throw new Error(
+      `Route test requires at least 2 skills. Found ${skillPaths.length} in: ${skillDir}`
+    );
+  }
+  // Large skill sets only trigger a warning on stderr; the run still proceeds.
+  if (skillPaths.length > 20) {
+    process.stderr.write(
+      `Warning: ${skillPaths.length} skills found. This will make ${skillPaths.length * numQueriesPerSkill} routing model calls.
+`
+    );
+  }
+  const skills = await Promise.all(skillPaths.map((p) => parseSkillStrict(p)));
+  // NOTE(review): names are not checked for uniqueness; two skills sharing a
+  // frontmatter name would share one matrix row - confirm whether that can occur.
+  const skillNames = skills.map((s) => s.frontmatter.name);
+  const queriesPerSkill = await pMap(
+    skills,
+    (skill) => generatePositiveQueriesForSkill(skill, options.provider, options.model, numQueriesPerSkill),
+    concurrency
+  );
+  // Flatten into one work item per (query, expected skill) pair.
+  const workItems = [];
+  for (let i = 0; i < skills.length; i++) {
+    const skill = skills[i];
+    const queries = queriesPerSkill[i];
+    for (const query of queries) {
+      workItems.push({ query, targetSkill: skill.frontmatter.name });
+    }
+  }
+  const skillListText = buildSkillListText(skills);
+  const systemPrompt = "Select the single best skill for the user's request from the provided list. Respond with only the skill name, or 'none' if nothing fits.";
+  const cases = await pMap(
+    workItems,
+    async ({ query, targetSkill }) => {
+      const userPrompt = `Available skills:
+${skillListText}
+User query: ${query}`;
+      const rawResponse = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
+      const actualSkill = parseRouteDecision(rawResponse, skillNames);
+      return {
+        query,
+        targetSkill,
+        actualSkill,
+        correct: actualSkill === targetSkill,
+        // Raw replies are only retained in verbose mode.
+        rawModelResponse: options.verbose ? rawResponse : void 0
+      };
+    },
+    concurrency
+  );
+  const { matrix, matrixPct } = buildConfusionMatrix(cases, skillNames, numQueriesPerSkill);
+  const perSkillMetrics = computePerSkillMetrics(skillNames, matrix, numQueriesPerSkill);
+  const conflicts = detectConflicts(skillNames, matrixPct, conflictThreshold);
+  const correctCount = cases.filter((c) => c.correct).length;
+  const overallAccuracy = cases.length === 0 ? 0 : correctCount / cases.length;
+  const suggestions = buildRouteSuggestions(perSkillMetrics, conflicts);
+  return {
+    skillDir: absoluteSkillDir,
+    skills: skillNames,
+    model: options.model,
+    provider: options.provider.name,
+    // The seed is echoed back in the result; nothing in this function reads it.
+    seed: options.seed,
+    numQueriesPerSkill,
+    cases,
+    matrix,
+    matrixPct,
+    perSkillMetrics,
+    conflicts,
+    suggestions,
+    overallAccuracy
+  };
+}
+// src/commands/route.ts
+// Validation schema for the options of the `route` command; every field is optional.
+var routeCliSchema = z14.object({
+  numQueries: z14.number().int().min(1).optional(),
+  // The conflict threshold is a fraction between 0 and 1 inclusive.
+  conflictThreshold: z14.number().min(0).max(1).optional(),
+  saveQueries: z14.string().optional(),
+  seed: z14.number().int().optional(),
+  concurrency: z14.number().int().min(1).optional(),
+  html: z14.string().optional(),
+  verbose: z14.boolean().optional(),
+  apiKey: z14.string().optional()
+});
+// Model identifiers used when swapping the default model for the OpenAI provider.
+var DEFAULT_ANTHROPIC_MODEL5 = "claude-sonnet-4-5-20250929";
+var DEFAULT_OPENAI_MODEL5 = "gpt-4.1-mini";
+/**
+ * Pick the model to use for a provider.
+ *
+ * When the provider is "openai" but the model is still the Anthropic default,
+ * the OpenAI default is substituted; any other combination is returned as given.
+ *
+ * @param {string} provider Provider name.
+ * @param {string} model Configured model identifier.
+ * @returns {string} The model identifier to use.
+ */
+function resolveModel5(provider, model) {
+  const isAnthropicDefault = model === DEFAULT_ANTHROPIC_MODEL5;
+  return provider === "openai" && isAnthropicDefault ? DEFAULT_OPENAI_MODEL5 : model;
+}
+/**
+ * Execute the `route` command: run the route test, then emit its outputs.
+ *
+ * Any failure is reported through `writeError` and sets `process.exitCode`
+ * to 2 instead of propagating to the caller.
+ *
+ * @param {string} skillDir Directory containing the skills to evaluate.
+ * @param options Resolved command options; reads `json`, `color`, `provider`,
+ *   `apiKey`, `model`, `numQueriesPerSkill`, `conflictThreshold`, `seed`,
+ *   `concurrency`, `verbose`, `saveQueries` and `html`.
+ * @returns {Promise<void>}
+ */
+async function handleRouteCommand(skillDir, options) {
+  // No spinner for JSON output or when stdout is not a TTY.
+  const spinner = options.json || !process.stdout.isTTY ? null : ora5("Preparing route evaluation...").start();
+  try {
+    if (spinner) spinner.text = "Initializing model provider...";
+    const provider = createProvider(options.provider, options.apiKey);
+    if (spinner) spinner.text = "Running route simulations...";
+    const model = resolveModel5(options.provider, options.model);
+    const result = await runRouteTest(skillDir, {
+      model,
+      provider,
+      numQueriesPerSkill: options.numQueriesPerSkill,
+      conflictThreshold: options.conflictThreshold,
+      seed: options.seed,
+      concurrency: options.concurrency,
+      verbose: options.verbose
+    });
+    // Persist only the generated queries and their expected skills, not the outcomes.
+    if (options.saveQueries) {
+      await writeJsonFile(
+        options.saveQueries,
+        result.cases.map((c) => ({ query: c.query, targetSkill: c.targetSkill }))
+      );
+    }
+    spinner?.stop();
+    if (options.json) {
+      writeResult(result, true);
+    } else {
+      writeResult(renderRouteReport(result, options.color, options.verbose), false);
+    }
+    // The HTML report is written in addition to, not instead of, the output above.
+    if (options.html) {
+      await fs13.writeFile(options.html, renderRouteHtml(result), "utf8");
+    }
+  } catch (error) {
+    spinner?.stop();
+    writeError(error, options.json);
+    process.exitCode = 2;
+  }
+}
+/**
+ * Register the `route` subcommand and its options on the given program.
+ *
+ * The action validates the parsed options against `routeCliSchema`; on failure
+ * it reports the first issue and sets `process.exitCode` to 2. Otherwise it
+ * merges global options, resolved config and CLI values, and delegates to
+ * `handleRouteCommand`.
+ *
+ * @param program Command object exposing the chained `command()` / `option()` / `action()` API.
+ */
+function registerRouteCommand(program) {
+  program.command("route").description("Validate multi-skill routing across all skills in a directory.").argument("<skillDir>", "Directory containing skill subdirectories with SKILL.md files").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--num-queries <n>", "Queries per skill (default: 10)", (value) => Number.parseInt(value, 10)).option("--conflict-threshold <n>", "Bleed fraction to flag as conflict (default: 0.1)", (value) => Number.parseFloat(value)).option("--seed <number>", "RNG seed for reproducibility metadata", (value) => Number.parseInt(value, 10)).option("--concurrency <n>", "Maximum in-flight requests", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--save-queries <path>", "Save generated queries as JSON").option("--api-key <key>", "API key override").option("--verbose", "Show raw model responses").action(async (skillDir, _commandOptions, command) => {
+    const globalOptions = getGlobalCliOptions(command);
+    const config = getResolvedConfig(command);
+    // Options are re-read through command.opts() rather than the action's second argument.
+    const parsedCli = routeCliSchema.safeParse(command.opts());
+    if (!parsedCli.success) {
+      writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid route options."), globalOptions.json);
+      process.exitCode = 2;
+      return;
+    }
+    await handleRouteCommand(skillDir, {
+      ...globalOptions,
+      model: config.model,
+      provider: config.provider,
+      // Query count and threshold come from the CLI only, falling back to fixed defaults.
+      numQueriesPerSkill: parsedCli.data.numQueries ?? 10,
+      conflictThreshold: parsedCli.data.conflictThreshold ?? 0.1,
+      saveQueries: parsedCli.data.saveQueries,
+      seed: parsedCli.data.seed,
+      // Concurrency falls back to the resolved config when not given on the CLI.
+      concurrency: parsedCli.data.concurrency ?? config.concurrency,
+      html: parsedCli.data.html,
+      verbose: Boolean(parsedCli.data.verbose),
+      apiKey: parsedCli.data.apiKey
+    });
+  });
+}
 // src/index.ts
 function resolveVersion() {
   try {
     const currentFilePath = fileURLToPath(import.meta.url);
-    const packageJsonPath = path8.resolve(path8.dirname(currentFilePath), "..", "package.json");
-    const raw = fs13.readFileSync(packageJsonPath, "utf8");
+    const packageJsonPath = path9.resolve(path9.dirname(currentFilePath), "..", "package.json");
+    const raw = fs14.readFileSync(packageJsonPath, "utf8");
     const parsed = JSON.parse(raw);
     return parsed.version ?? "0.0.0";
   } catch {
@@ -5405,6 +5835,7 @@ async function run(argv) {
   registerEvalCommand(program);
   registerCheckCommand(program);
   registerImproveCommand(program);
+  registerRouteCommand(program);
   try {
     await program.parseAsync(argv);
   } catch (error) {