skilltest 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +436 -5
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/index.ts
|
|
4
|
-
import
|
|
5
|
-
import
|
|
4
|
+
import fs14 from "node:fs";
|
|
5
|
+
import path9 from "node:path";
|
|
6
6
|
import { fileURLToPath } from "node:url";
|
|
7
7
|
import { Command } from "commander";
|
|
8
8
|
|
|
@@ -2315,6 +2315,76 @@ function renderCheckHtml(result) {
|
|
|
2315
2315
|
);
|
|
2316
2316
|
return renderHtmlDocument(`skilltest check - ${skillName}`, [header, lintSection, triggerSection, evalSection, qualityGate].join(""));
|
|
2317
2317
|
}
|
|
2318
|
+
/**
 * Builds the HTML table that visualises the routing matrix.
 *
 * Rows are the target skills; columns are the target skills followed by a
 * literal "none" column. Each cell shows `result.matrixPct[row][column]`
 * (0 when missing) formatted through `formatPercent`.
 *
 * @param {object} result - Route result; reads `skills` and `matrixPct`.
 * @returns {string} Inline `<style>` plus a scrollable `<table class="rt">`.
 */
function renderRouteMatrix(result) {
  const columnNames = result.skills.concat(["none"]);

  // Diagonal cells are always green; off-diagonal cells are tinted by how
  // much traffic leaked into them (red above 15%, yellow above 5%).
  const pickBackground = (isDiagonal, share) => {
    if (isDiagonal) {
      return "background:rgba(34,197,94,0.18);";
    }
    if (share > 0.15) {
      return "background:rgba(239,68,68,0.18);";
    }
    if (share > 0.05) {
      return "background:rgba(234,179,8,0.12);";
    }
    return "";
  };

  let headerHtml = "";
  for (const name of columnNames) {
    headerHtml += `<th>${escapeHtml(name)}</th>`;
  }

  let bodyHtml = "";
  for (const rowSkill of result.skills) {
    let cellHtml = "";
    for (const name of columnNames) {
      const share = result.matrixPct[rowSkill]?.[name] ?? 0;
      const background = pickBackground(name === rowSkill, share);
      cellHtml += `<td style="${background}">${escapeHtml(formatPercent(share))}</td>`;
    }
    bodyHtml += `<tr><th>${escapeHtml(rowSkill)}</th>${cellHtml}</tr>`;
  }

  return `<style>.rt{border-collapse:collapse;font-size:.85rem;width:100%}.rt th,.rt td{border:1px solid #d4d4d8;padding:8px 12px;text-align:center}.rt thead th{background:#fafafa;font-weight:700}</style><div style="overflow-x:auto"><table class="rt"><thead><tr><th></th>${headerHtml}</tr></thead><tbody>${bodyHtml}</tbody></table></div>`;
}
|
|
2332
|
+
/**
 * Assembles the HTML report for a `skilltest route` run.
 *
 * Sections are emitted in this order: header card, routing matrix, per-skill
 * metrics, conflicts (only when at least one conflict exists), suggestions.
 *
 * @param {object} result - Route result; reads `skills`, `skillDir`,
 *   `overallAccuracy`, `numQueriesPerSkill`, `provider`, `model`, `seed`,
 *   `perSkillMetrics`, `conflicts`, `suggestions` and (via the matrix) `matrixPct`.
 * @returns {*} The value produced by `renderHtmlDocument` for the joined sections.
 */
function renderRouteHtml(result) {
  const skillCount = result.skills.length;
  const conflictCount = result.conflicts.length;

  // Headline figures shown in the header card; both thresholds are "pass"/"warn" only.
  const summaryStats = [
    {
      label: "Overall accuracy",
      value: formatPercent(result.overallAccuracy),
      status: result.overallAccuracy >= 0.8 ? "pass" : "warn"
    },
    {
      label: "Conflicts",
      value: String(conflictCount),
      status: conflictCount === 0 ? "pass" : "warn"
    },
    { label: "Skills", value: String(skillCount) },
    { label: "Queries/skill", value: String(result.numQueriesPerSkill) }
  ];
  const runDetails = [
    { label: "Provider", value: result.provider },
    { label: "Model", value: result.model },
    { label: "Seed", value: result.seed === undefined ? "none" : String(result.seed) }
  ];
  const header = renderHeaderCard(
    "route",
    `Routing Report \u2014 ${skillCount} skills`,
    result.skillDir,
    summaryStats,
    runDetails
  );

  const matrixSection = renderSectionCard("Routing Matrix", renderRouteMatrix(result));

  // NOTE(review): skill names are passed to renderMessageRow unescaped here but
  // escaped in the conflicts section below — confirm which one the helper expects.
  let metricsRows = "";
  for (const metric of result.perSkillMetrics) {
    const details = renderDefinitionList([
      { label: "Queries", value: String(metric.queriesTotal) },
      { label: "Correct", value: String(metric.correct) },
      { label: "Precision", value: formatPercent(metric.precision) },
      { label: "Recall", value: formatPercent(metric.recall) }
    ]);
    metricsRows += renderMessageRow(
      metric.f1 >= 0.8 ? "pass" : "warn",
      metric.skill,
      `F1: ${formatPercent(metric.f1)} precision: ${formatPercent(metric.precision)} recall: ${formatPercent(metric.recall)}`,
      details
    );
  }
  const metricsSection = renderSectionCard("Per-Skill Metrics", `<div class="row-list">${metricsRows}</div>`);

  // The conflicts card is omitted entirely (empty string) when nothing was flagged.
  let conflictsSection = "";
  if (conflictCount > 0) {
    let conflictRows = "";
    for (const conflict of result.conflicts) {
      const nameA = escapeHtml(conflict.skillA);
      const nameB = escapeHtml(conflict.skillB);
      conflictRows += renderMessageRow(
        "warn",
        `${nameA} \u2194 ${nameB}`,
        `${formatPercent(conflict.bleedAtoB)} of ${nameA} queries routed to ${nameB}; ${formatPercent(conflict.bleedBtoA)} the other way`
      );
    }
    conflictsSection = renderSectionCard("Conflicts", `<div class="row-list">${conflictRows}</div>`);
  }

  let suggestionItems = "";
  for (const suggestion of result.suggestions) {
    suggestionItems += `<li>${escapeHtml(suggestion)}</li>`;
  }
  const suggestionsSection = renderSectionCard("Suggestions", `<ul>${suggestionItems}</ul>`);

  const sections = [header, matrixSection, metricsSection, conflictsSection, suggestionsSection];
  return renderHtmlDocument(`skilltest route \u2014 ${result.skillDir}`, sections.join(""));
}
|
|
2318
2388
|
|
|
2319
2389
|
// src/reporters/terminal.ts
|
|
2320
2390
|
import { Chalk } from "chalk";
|
|
@@ -2659,6 +2729,70 @@ function renderImproveReport(result, enableColor, verbose = false) {
|
|
|
2659
2729
|
}
|
|
2660
2730
|
return lines.join("\n");
|
|
2661
2731
|
}
|
|
2732
|
+
/**
 * Renders the plain-text (terminal) report for a `skilltest route` run.
 *
 * Output order: run header, per-skill metrics, routing matrix, conflicts (only
 * when present), overall accuracy, suggestions, and — when `verbose` is truthy —
 * one entry per routed test case.
 *
 * @param {object} result - Route result produced by the route tester.
 * @param {*} enableColor - Forwarded to `getChalkInstance`.
 * @param {boolean} [verbose] - When truthy, append the per-case listing.
 * @returns {string} Report lines joined with "\n".
 */
function renderRouteReport(result, enableColor, verbose) {
  const paint = getChalkInstance(enableColor);
  const labelWidth = 24;
  const cellWidth = 10;
  const out = [];

  out.push(
    "skilltest route",
    `directory: ${result.skillDir}`,
    `provider/model: ${result.provider}/${result.model}`,
    `skills: ${result.skills.length} queries per skill: ${result.numQueriesPerSkill}`,
    "",
    "Per-skill metrics:"
  );
  result.perSkillMetrics.forEach((metric) => {
    const badge = metric.f1 >= 0.8 ? paint.green("PASS") : paint.yellow("WARN");
    const name = metric.skill.padEnd(24);
    const f1 = formatPercent2(metric.f1).padEnd(7);
    const precision = formatPercent2(metric.precision).padEnd(7);
    const recall = formatPercent2(metric.recall);
    out.push(` ${name} F1: ${f1} precision: ${precision} recall: ${recall} [${badge}]`);
  });

  out.push("", "Routing matrix (% of row queries routed to column):");
  const columns = result.skills.concat(["none"]);
  // Column headings are truncated to leave at least one space between columns.
  let heading = " ".repeat(labelWidth);
  for (const column of columns) {
    heading += column.slice(0, cellWidth - 1).padEnd(cellWidth);
  }
  out.push(` ${heading}`);
  for (const rowSkill of result.skills) {
    let line = ` ${rowSkill}`.padEnd(labelWidth);
    for (const column of columns) {
      const share = result.matrixPct[rowSkill]?.[column] ?? 0;
      const text = formatPercent2(share).padEnd(cellWidth);
      if (column === rowSkill) {
        // Diagonal: queries that reached their intended skill.
        line += paint.green(text);
      } else if (share > 0.1) {
        line += paint.yellow(text);
      } else {
        line += text;
      }
    }
    out.push(line);
  }

  if (result.conflicts.length > 0) {
    out.push("", "Conflicts detected:");
    result.conflicts.forEach((conflict) => {
      const aToB = formatPercent2(conflict.bleedAtoB);
      const bToA = formatPercent2(conflict.bleedBtoA);
      out.push(` ${conflict.skillA} <-> ${conflict.skillB} ${aToB} / ${bToA} bleed [${paint.yellow("WARN")}]`);
    });
  }

  out.push("", `Overall accuracy: ${formatPercent2(result.overallAccuracy)}`, "", "Suggestions:");
  result.suggestions.forEach((suggestion) => {
    out.push(`- ${suggestion}`);
  });

  if (verbose) {
    out.push("", "Cases:");
    result.cases.forEach((testCase, position) => {
      const verdict = testCase.correct ? paint.green("PASS") : paint.red("FAIL");
      out.push(` ${position + 1}. ${verdict} [${testCase.targetSkill}] ${testCase.query}`);
      out.push(` routed to: ${testCase.actualSkill}`);
      if (testCase.rawModelResponse) {
        // Collapse all whitespace runs so a multi-line model reply stays on one line.
        out.push(` model: ${testCase.rawModelResponse.replace(/\s+/g, " ").trim()}`);
      }
    });
  }

  return out.join("\n");
}
|
|
2662
2796
|
|
|
2663
2797
|
// src/commands/common.ts
|
|
2664
2798
|
import fs6 from "node:fs/promises";
|
|
@@ -3934,7 +4068,7 @@ function extractCliConfigOverrides(command) {
|
|
|
3934
4068
|
if (command.getOptionValueSource("model") === "cli") {
|
|
3935
4069
|
overrides.model = getTypedOptionValue(command, "model");
|
|
3936
4070
|
}
|
|
3937
|
-
if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("concurrency") === "cli") {
|
|
4071
|
+
if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check" || command.name() === "improve" || command.name() === "route") && command.getOptionValueSource("concurrency") === "cli") {
|
|
3938
4072
|
overrides.concurrency = getTypedOptionValue(command, "concurrency");
|
|
3939
4073
|
}
|
|
3940
4074
|
if ((command.name() === "trigger" || command.name() === "check" || command.name() === "improve") && command.getOptionValueSource("numQueries") === "cli") {
|
|
@@ -5366,12 +5500,308 @@ function registerImproveCommand(program) {
|
|
|
5366
5500
|
});
|
|
5367
5501
|
}
|
|
5368
5502
|
|
|
5503
|
+
// src/commands/route.ts
|
|
5504
|
+
import fs13 from "node:fs/promises";
|
|
5505
|
+
import ora5 from "ora";
|
|
5506
|
+
import { z as z14 } from "zod";
|
|
5507
|
+
|
|
5508
|
+
// src/core/route-tester.ts
|
|
5509
|
+
import path8 from "node:path";
|
|
5510
|
+
import { z as z13 } from "zod";
|
|
5511
|
+
var stringArraySchema = z13.array(z13.string().min(1));
|
|
5512
|
+
/**
 * Extracts and parses a JSON array from raw model output.
 *
 * The text between the first "[" and the last "]" (inclusive) is handed to
 * `JSON.parse`, which lets surrounding prose or code fences be ignored.
 *
 * @param {string} raw - Raw model response.
 * @returns {*} The value parsed from the bracketed span.
 * @throws {Error} "Model did not return a JSON array." when no "[" ... "]" span exists.
 * @throws {SyntaxError} When the bracketed span is not valid JSON.
 */
function parseJsonArrayFromModelOutput2(raw) {
  const text = raw.trim();
  const open = text.indexOf("[");
  const close = text.lastIndexOf("]");
  if (open < 0 || close <= open) {
    throw new Error("Model did not return a JSON array.");
  }
  // When the text already starts with "[" and ends with "]" this slice is the whole text.
  return JSON.parse(text.slice(open, close + 1));
}
|
|
5524
|
+
/**
 * Maps a raw routing response from the model onto a known skill name.
 *
 * Resolution order:
 *   1. The whole response (ignoring case, surrounding quotes/backticks/asterisks
 *      and trailing periods) equals a skill name -> that skill.
 *   2. The response begins with the standalone word "none" -> "none".
 *   3. A skill name appears somewhere in the response as a whole word -> that
 *      skill, trying longer names first.
 *   4. Otherwise -> "unrecognized".
 *
 * @param {string} rawResponse - Raw text returned by the model.
 * @param {string[]} skillNames - Known skill names.
 * @returns {string} One of `skillNames`, "none", or "unrecognized".
 */
function parseRouteDecision(rawResponse, skillNames) {
  const normalized = rawResponse.trim().toLowerCase();
  const bare = normalized.replace(/^["'`*\s]+|["'`*.\s]+$/g, "");

  // An exact answer wins outright. This also keeps names that merely begin
  // with "none" (e.g. "nonempty-check") from being mistaken for "none".
  const exactMatch = skillNames.find((name) => name.toLowerCase() === bare);
  if (exactMatch !== undefined) {
    return exactMatch;
  }

  // "none" must be a standalone leading word, not a prefix of a longer token.
  if (/^none(?![\w-])/.test(bare)) {
    return "none";
  }

  // `\b` also matches at hyphens, so "foo" is found inside "foo-bar". Scanning
  // longer names first prevents a short name from shadowing a longer one that
  // contains it. The sort is stable, so equal-length names keep list order.
  const longestFirst = [...skillNames].sort((a, b) => b.length - a.length);
  for (const skillName of longestFirst) {
    const escaped = skillName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const regex = new RegExp(`\\b${escaped}\\b`, "i");
    if (regex.test(rawResponse)) {
      return skillName;
    }
  }
  return "unrecognized";
}
|
|
5538
|
+
/**
 * Finds every file named exactly "SKILL.md" among the paths returned by
 * `listFilesRecursive(skillDir)`.
 *
 * @param {string} skillDir - Directory to search.
 * @returns {Promise<string[]>} Matching paths, in the order they were listed.
 */
async function discoverSkillPaths(skillDir) {
  const skillFiles = [];
  for (const filePath of await listFilesRecursive(skillDir)) {
    if (path8.basename(filePath) === "SKILL.md") {
      skillFiles.push(filePath);
    }
  }
  return skillFiles;
}
|
|
5542
|
+
/**
 * Asks the model for `count` user queries that should route to `skill`.
 *
 * @param {object} skill - Parsed skill; reads `frontmatter.name` and `frontmatter.description`.
 * @param {object} provider - Object exposing `sendMessage(systemPrompt, userPrompt, { model })`.
 * @param {string} model - Model identifier forwarded to the provider.
 * @param {number} count - Number of queries required.
 * @returns {Promise<string[]>} Exactly `count` queries (surplus ones are dropped).
 * @throws {Error} When the response contains no parsable JSON array, when the
 *   array fails the string-array schema, or when fewer than `count` queries
 *   come back. Every message names the skill.
 */
async function generatePositiveQueriesForSkill(skill, provider, model, count) {
  const { name, description } = skill.frontmatter;
  const systemPrompt = [
    "You generate realistic user queries that should trigger a specific agent skill.",
    "Return a JSON array of strings only. No markdown, no comments.",
    "Each string is one realistic user query that clearly belongs to this skill.",
    "Queries should look like real user requests with enough context to drive a routing decision."
  ].join(" ");
  const userPrompt = [
    `Skill name: ${name}`,
    `Skill description: ${description}`,
    `Generate exactly ${count} distinct queries that should trigger this skill.`
  ].join("\n");
  const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });

  // Extraction can throw (no array found, or invalid JSON). Re-throw with the
  // skill name so the failure is attributable when many skills are generated
  // concurrently; the original error is preserved as `cause`.
  let candidate;
  try {
    candidate = parseJsonArrayFromModelOutput2(raw);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to parse generated queries for skill '${name}': ${reason}`, { cause: error });
  }

  const parsed = stringArraySchema.safeParse(candidate);
  if (!parsed.success) {
    throw new Error(
      `Failed to parse generated queries for skill '${name}': ${parsed.error.issues[0]?.message ?? "invalid format"}`
    );
  }
  if (parsed.data.length < count) {
    throw new Error(
      `Expected ${count} queries for skill '${name}', got ${parsed.data.length}.`
    );
  }
  // The model may over-deliver; keep only the first `count`.
  return parsed.data.slice(0, count);
}
|
|
5568
|
+
/**
 * Formats skills as a bullet list, one "- name: description" line per skill.
 *
 * @param {Array<{frontmatter: {name: string, description: string}}>} skills
 * @returns {string} Lines joined with "\n" (empty string for no skills).
 */
function buildSkillListText(skills) {
  const bullets = [];
  for (const { frontmatter } of skills) {
    bullets.push(`- ${frontmatter.name}: ${frontmatter.description}`);
  }
  return bullets.join("\n");
}
|
|
5571
|
+
/**
 * Tallies routed cases into a confusion matrix and its per-row fractions.
 *
 * `matrix[target][actual]` counts the cases aimed at `target` that were routed
 * to `actual`. Every row is pre-filled with zeros for each skill plus "none"
 * and "unrecognized". Cases whose target has no row are ignored.
 * `matrixPct` divides each count by `numQueriesPerSkill` (or by 1 when that is
 * not positive).
 *
 * @param {Array<{targetSkill: string, actualSkill: string}>} cases
 * @param {string[]} skillNames
 * @param {number} numQueriesPerSkill
 * @returns {{matrix: object, matrixPct: object}}
 */
function buildConfusionMatrix(cases, skillNames, numQueriesPerSkill) {
  const outcomes = skillNames.concat(["none", "unrecognized"]);

  const matrix = {};
  skillNames.forEach((target) => {
    const counts = {};
    outcomes.forEach((outcome) => {
      counts[outcome] = 0;
    });
    matrix[target] = counts;
  });

  for (const { targetSkill, actualSkill } of cases) {
    const counts = matrix[targetSkill];
    if (!counts) {
      continue;
    }
    counts[actualSkill] = (counts[actualSkill] ?? 0) + 1;
  }

  // Guard against a zero divisor so the fractions stay finite.
  const divisor = numQueriesPerSkill > 0 ? numQueriesPerSkill : 1;
  const matrixPct = {};
  skillNames.forEach((target) => {
    const shares = {};
    outcomes.forEach((outcome) => {
      shares[outcome] = (matrix[target][outcome] ?? 0) / divisor;
    });
    matrixPct[target] = shares;
  });

  return { matrix, matrixPct };
}
|
|
5596
|
+
/**
 * Derives precision, recall and F1 for each skill from the count matrix.
 *
 * For a skill S: true positives are `matrix[S][S]`; false positives are the
 * counts other skills' rows hold in column S; recall divides true positives by
 * `numQueriesPerSkill`. Any metric whose denominator is zero is reported as 0.
 *
 * @param {string[]} skillNames
 * @param {object} matrix - Count matrix keyed `[target][actual]`.
 * @param {number} numQueriesPerSkill
 * @returns {Array<{skill: string, queriesTotal: number, correct: number,
 *   precision: number, recall: number, f1: number}>}
 */
function computePerSkillMetrics(skillNames, matrix, numQueriesPerSkill) {
  const metrics = [];
  for (const skill of skillNames) {
    const tp = matrix[skill]?.[skill] ?? 0;

    let fp = 0;
    for (const other of skillNames) {
      if (other !== skill) {
        fp += matrix[other]?.[skill] ?? 0;
      }
    }

    let recall = 0;
    if (numQueriesPerSkill !== 0) {
      recall = tp / numQueriesPerSkill;
    }

    const predicted = tp + fp;
    let precision = 0;
    if (predicted !== 0) {
      precision = tp / predicted;
    }

    let f1 = 0;
    if (precision + recall !== 0) {
      f1 = 2 * precision * recall / (precision + recall);
    }

    metrics.push({ skill, queriesTotal: numQueriesPerSkill, correct: tp, precision, recall, f1 });
  }
  return metrics;
}
|
|
5607
|
+
/**
 * Lists skill pairs whose mutual misrouting exceeds a threshold.
 *
 * Each unordered pair (A before B in `skillNames`) is reported once when the
 * larger of `matrixPct[A][B]` and `matrixPct[B][A]` is strictly greater than
 * `conflictThreshold`. Missing entries count as 0.
 *
 * @param {string[]} skillNames
 * @param {object} matrixPct - Fraction matrix keyed `[target][actual]`.
 * @param {number} conflictThreshold
 * @returns {Array<{skillA: string, skillB: string, bleedAtoB: number, bleedBtoA: number}>}
 */
function detectConflicts(skillNames, matrixPct, conflictThreshold) {
  return skillNames.flatMap((skillA, position) =>
    skillNames.slice(position + 1).flatMap((skillB) => {
      const bleedAtoB = matrixPct[skillA]?.[skillB] ?? 0;
      const bleedBtoA = matrixPct[skillB]?.[skillA] ?? 0;
      const worstBleed = Math.max(bleedAtoB, bleedBtoA);
      return worstBleed > conflictThreshold ? [{ skillA, skillB, bleedAtoB, bleedBtoA }] : [];
    })
  );
}
|
|
5622
|
+
/**
 * Produces human-readable suggestions from metrics and detected conflicts.
 *
 * One line is emitted per skill with F1 below 0.7, followed by one line per
 * conflict. When neither yields anything, a single "looks clean" line is returned.
 *
 * @param {Array<{skill: string, f1: number}>} perSkillMetrics
 * @param {Array<{skillA: string, skillB: string, bleedAtoB: number, bleedBtoA: number}>} conflicts
 * @returns {string[]} At least one suggestion.
 */
function buildRouteSuggestions(perSkillMetrics, conflicts) {
  const asPercent = (fraction) => (fraction * 100).toFixed(1);

  const lowScoreNotes = perSkillMetrics
    .filter(({ f1 }) => f1 < 0.7)
    .map(
      ({ skill, f1 }) =>
        `'${skill}' has low F1 (${asPercent(f1)}%) \u2014 consider clarifying its description and scope boundaries.`
    );

  const overlapNotes = conflicts.map(
    ({ skillA, skillB, bleedAtoB, bleedBtoA }) =>
      `'${skillA}' and '${skillB}' overlap: ${asPercent(bleedAtoB)}% of ${skillA} queries routed to ${skillB}, ${asPercent(bleedBtoA)}% the other way \u2014 consider narrowing scope boundaries.`
  );

  const suggestions = [...lowScoreNotes, ...overlapNotes];
  return suggestions.length > 0
    ? suggestions
    : ["Routing looks clean. All skills are well-differentiated on this sample."];
}
|
|
5641
|
+
/**
 * Runs a multi-skill routing evaluation over every SKILL.md under `skillDir`.
 *
 * Steps: discover and parse skills, generate positive queries per skill, ask
 * the model to route each query among all skills, then summarise the outcome
 * as a confusion matrix, per-skill metrics, conflicts and suggestions.
 *
 * @param {string} skillDir - Directory to scan (resolved to an absolute path).
 * @param {object} options - Reads `provider`, `model`, `numQueriesPerSkill`
 *   (default 10), `conflictThreshold` (default 0.1), `concurrency` (default 5),
 *   `seed` (echoed into the result only) and `verbose` (keeps raw model replies).
 * @returns {Promise<object>} The route result.
 * @throws {Error} When fewer than 2 skills are found.
 */
async function runRouteTest(skillDir, options) {
  const numQueriesPerSkill = options.numQueriesPerSkill ?? 10;
  const conflictThreshold = options.conflictThreshold ?? 0.1;
  const concurrency = options.concurrency ?? 5;
  const absoluteSkillDir = path8.resolve(skillDir);

  const skillPaths = await discoverSkillPaths(absoluteSkillDir);
  const skillCount = skillPaths.length;
  if (skillCount < 2) {
    throw new Error(`Route test requires at least 2 skills. Found ${skillCount} in: ${skillDir}`);
  }
  if (skillCount > 20) {
    // Advisory only: the run continues, the user is just told the call volume.
    process.stderr.write(
      `Warning: ${skillCount} skills found. This will make ${skillCount * numQueriesPerSkill} routing model calls.\n`
    );
  }

  const skills = await Promise.all(skillPaths.map((skillPath) => parseSkillStrict(skillPath)));
  const skillNames = skills.map(({ frontmatter }) => frontmatter.name);

  const queriesPerSkill = await pMap(
    skills,
    (skill) => generatePositiveQueriesForSkill(skill, options.provider, options.model, numQueriesPerSkill),
    concurrency
  );

  // One work item per generated query, grouped by skill in discovery order.
  const workItems = skills.flatMap((skill, position) =>
    queriesPerSkill[position].map((query) => ({ query, targetSkill: skill.frontmatter.name }))
  );

  const skillListText = buildSkillListText(skills);
  const systemPrompt = "Select the single best skill for the user's request from the provided list. Respond with only the skill name, or 'none' if nothing fits.";

  const routeOne = async ({ query, targetSkill }) => {
    const userPrompt = `Available skills:\n${skillListText}\n\nUser query: ${query}`;
    const rawResponse = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
    const actualSkill = parseRouteDecision(rawResponse, skillNames);
    return {
      query,
      targetSkill,
      actualSkill,
      correct: actualSkill === targetSkill,
      // Raw replies are retained only in verbose mode.
      rawModelResponse: options.verbose ? rawResponse : undefined
    };
  };
  const cases = await pMap(workItems, routeOne, concurrency);

  const { matrix, matrixPct } = buildConfusionMatrix(cases, skillNames, numQueriesPerSkill);
  const perSkillMetrics = computePerSkillMetrics(skillNames, matrix, numQueriesPerSkill);
  const conflicts = detectConflicts(skillNames, matrixPct, conflictThreshold);

  let correctCount = 0;
  for (const routed of cases) {
    if (routed.correct) {
      correctCount += 1;
    }
  }
  const overallAccuracy = cases.length === 0 ? 0 : correctCount / cases.length;
  const suggestions = buildRouteSuggestions(perSkillMetrics, conflicts);

  return {
    skillDir: absoluteSkillDir,
    skills: skillNames,
    model: options.model,
    provider: options.provider.name,
    seed: options.seed,
    numQueriesPerSkill,
    cases,
    matrix,
    matrixPct,
    perSkillMetrics,
    conflicts,
    suggestions,
    overallAccuracy
  };
}
|
|
5716
|
+
|
|
5717
|
+
// src/commands/route.ts
|
|
5718
|
+
// Validation schema for the options of the `route` command. Every field is
// optional; numeric fields arrive already converted by the option parsers.
var routeCliSchema = (() => {
  const optionalText = () => z14.string().optional();
  const optionalPositiveInt = () => z14.number().int().min(1).optional();
  return z14.object({
    numQueries: optionalPositiveInt(),
    // A fraction in the closed range [0, 1].
    conflictThreshold: z14.number().min(0).max(1).optional(),
    saveQueries: optionalText(),
    seed: z14.number().int().optional(),
    concurrency: optionalPositiveInt(),
    html: optionalText(),
    verbose: z14.boolean().optional(),
    apiKey: optionalText()
  });
})();
|
|
5728
|
+
// Default model identifiers used when reconciling provider and model.
var DEFAULT_ANTHROPIC_MODEL5 = "claude-sonnet-4-5-20250929";
var DEFAULT_OPENAI_MODEL5 = "gpt-4.1-mini";

/**
 * Picks the model to use for a provider.
 *
 * When the provider is "openai" but the model is still the Anthropic default,
 * the OpenAI default is substituted; any other combination is returned as-is.
 *
 * @param {string} provider - Provider name.
 * @param {string} model - Configured model identifier.
 * @returns {string} The model identifier to use.
 */
function resolveModel5(provider, model) {
  const isAnthropicDefaultOnOpenAi = provider === "openai" && model === DEFAULT_ANTHROPIC_MODEL5;
  return isAnthropicDefaultOnOpenAi ? DEFAULT_OPENAI_MODEL5 : model;
}
|
|
5736
|
+
/**
 * Executes the `route` command: runs the evaluation, optionally saves the
 * generated queries, prints the result (JSON or text) and optionally writes an
 * HTML report.
 *
 * A spinner is shown only for non-JSON output on a TTY. Any error is reported
 * through `writeError` and sets `process.exitCode` to 2 instead of propagating.
 *
 * @param {string} skillDir - Directory containing the skills.
 * @param {object} options - Merged global and route options.
 * @returns {Promise<void>}
 */
async function handleRouteCommand(skillDir, options) {
  let spinner = null;
  if (!options.json && process.stdout.isTTY) {
    spinner = ora5("Preparing route evaluation...").start();
  }
  const announce = (text) => {
    if (spinner) {
      spinner.text = text;
    }
  };

  try {
    announce("Initializing model provider...");
    const provider = createProvider(options.provider, options.apiKey);

    announce("Running route simulations...");
    const result = await runRouteTest(skillDir, {
      model: resolveModel5(options.provider, options.model),
      provider,
      numQueriesPerSkill: options.numQueriesPerSkill,
      conflictThreshold: options.conflictThreshold,
      seed: options.seed,
      concurrency: options.concurrency,
      verbose: options.verbose
    });

    if (options.saveQueries) {
      // Only the query text and its intended skill are persisted.
      const savedQueries = result.cases.map(({ query, targetSkill }) => ({ query, targetSkill }));
      await writeJsonFile(options.saveQueries, savedQueries);
    }

    // Stop the spinner before printing so it cannot interleave with the output.
    spinner?.stop();
    if (!options.json) {
      writeResult(renderRouteReport(result, options.color, options.verbose), false);
    } else {
      writeResult(result, true);
    }

    if (options.html) {
      await fs13.writeFile(options.html, renderRouteHtml(result), "utf8");
    }
  } catch (error) {
    spinner?.stop();
    writeError(error, options.json);
    process.exitCode = 2;
  }
}
|
|
5773
|
+
/**
 * Registers the `route` subcommand and its options on the CLI program.
 *
 * The action validates the parsed options against `routeCliSchema`. On failure
 * it reports the first issue and sets `process.exitCode` to 2; otherwise it
 * delegates to `handleRouteCommand` with global options, resolved config and
 * route-specific options merged together.
 *
 * @param {object} program - The root command object.
 * @returns {void}
 */
function registerRouteCommand(program) {
  // Option value parsers: turn the raw CLI string into a number.
  const toInteger = (value) => Number.parseInt(value, 10);
  const toFloat = (value) => Number.parseFloat(value);

  const routeCommand = program
    .command("route")
    .description("Validate multi-skill routing across all skills in a directory.")
    .argument("<skillDir>", "Directory containing skill subdirectories with SKILL.md files")
    .option("--model <model>", "Model to use")
    .option("--provider <provider>", "LLM provider: anthropic|openai")
    .option("--num-queries <n>", "Queries per skill (default: 10)", toInteger)
    .option("--conflict-threshold <n>", "Bleed fraction to flag as conflict (default: 0.1)", toFloat)
    .option("--seed <number>", "RNG seed for reproducibility metadata", toInteger)
    .option("--concurrency <n>", "Maximum in-flight requests", toInteger)
    .option("--html <path>", "Write an HTML report to the given file path")
    .option("--save-queries <path>", "Save generated queries as JSON")
    .option("--api-key <key>", "API key override")
    .option("--verbose", "Show raw model responses");

  routeCommand.action(async (skillDir, _commandOptions, command) => {
    const globalOptions = getGlobalCliOptions(command);
    const config = getResolvedConfig(command);

    const parsedCli = routeCliSchema.safeParse(command.opts());
    if (!parsedCli.success) {
      const firstIssue = parsedCli.error.issues[0]?.message ?? "Invalid route options.";
      writeError(new Error(firstIssue), globalOptions.json);
      process.exitCode = 2;
      return;
    }

    const cli = parsedCli.data;
    await handleRouteCommand(skillDir, {
      ...globalOptions,
      model: config.model,
      provider: config.provider,
      numQueriesPerSkill: cli.numQueries ?? 10,
      conflictThreshold: cli.conflictThreshold ?? 0.1,
      saveQueries: cli.saveQueries,
      seed: cli.seed,
      // An explicit CLI value wins; otherwise fall back to the resolved config.
      concurrency: cli.concurrency ?? config.concurrency,
      html: cli.html,
      verbose: Boolean(cli.verbose),
      apiKey: cli.apiKey
    });
  });
}
|
|
5798
|
+
|
|
5369
5799
|
// src/index.ts
|
|
5370
5800
|
function resolveVersion() {
|
|
5371
5801
|
try {
|
|
5372
5802
|
const currentFilePath = fileURLToPath(import.meta.url);
|
|
5373
|
-
const packageJsonPath =
|
|
5374
|
-
const raw =
|
|
5803
|
+
const packageJsonPath = path9.resolve(path9.dirname(currentFilePath), "..", "package.json");
|
|
5804
|
+
const raw = fs14.readFileSync(packageJsonPath, "utf8");
|
|
5375
5805
|
const parsed = JSON.parse(raw);
|
|
5376
5806
|
return parsed.version ?? "0.0.0";
|
|
5377
5807
|
} catch {
|
|
@@ -5405,6 +5835,7 @@ async function run(argv) {
|
|
|
5405
5835
|
registerEvalCommand(program);
|
|
5406
5836
|
registerCheckCommand(program);
|
|
5407
5837
|
registerImproveCommand(program);
|
|
5838
|
+
registerRouteCommand(program);
|
|
5408
5839
|
try {
|
|
5409
5840
|
await program.parseAsync(argv);
|
|
5410
5841
|
} catch (error) {
|