vskill 0.2.75 → 0.2.77
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -0
- package/dist/commands/eval/__tests__/coverage.test.js +1 -1
- package/dist/commands/eval/__tests__/coverage.test.js.map +1 -1
- package/dist/commands/eval/serve.js +10 -4
- package/dist/commands/eval/serve.js.map +1 -1
- package/dist/eval/__tests__/benchmark.test.js +2 -2
- package/dist/eval/__tests__/benchmark.test.js.map +1 -1
- package/dist/eval/__tests__/llm.test.js +7 -7
- package/dist/eval/__tests__/llm.test.js.map +1 -1
- package/dist/eval/__tests__/mcp-detector.test.d.ts +1 -0
- package/dist/eval/__tests__/mcp-detector.test.js +125 -0
- package/dist/eval/__tests__/mcp-detector.test.js.map +1 -0
- package/dist/eval/__tests__/skill-scanner.test.js +2 -1
- package/dist/eval/__tests__/skill-scanner.test.js.map +1 -1
- package/dist/eval/benchmark-history.d.ts +31 -4
- package/dist/eval/benchmark-history.js +83 -4
- package/dist/eval/benchmark-history.js.map +1 -1
- package/dist/eval/benchmark.d.ts +32 -0
- package/dist/eval/benchmark.js.map +1 -1
- package/dist/eval/llm.js +2 -2
- package/dist/eval/llm.js.map +1 -1
- package/dist/eval/mcp-detector.d.ts +13 -0
- package/dist/eval/mcp-detector.js +123 -0
- package/dist/eval/mcp-detector.js.map +1 -0
- package/dist/eval/skill-scanner.js +14 -2
- package/dist/eval/skill-scanner.js.map +1 -1
- package/dist/eval-server/api-routes.js +183 -117
- package/dist/eval-server/api-routes.js.map +1 -1
- package/dist/eval-server/benchmark-runner.d.ts +16 -0
- package/dist/eval-server/benchmark-runner.js +114 -0
- package/dist/eval-server/benchmark-runner.js.map +1 -0
- package/dist/eval-server/eval-server.js +4 -0
- package/dist/eval-server/eval-server.js.map +1 -1
- package/dist/eval-server/improve-routes.d.ts +2 -0
- package/dist/eval-server/improve-routes.js +77 -0
- package/dist/eval-server/improve-routes.js.map +1 -0
- package/dist/eval-server/model-compare-routes.d.ts +2 -0
- package/dist/eval-server/model-compare-routes.js +119 -0
- package/dist/eval-server/model-compare-routes.js.map +1 -0
- package/dist/eval-server/skill-resolver.d.ts +1 -0
- package/dist/eval-server/skill-resolver.js +24 -0
- package/dist/eval-server/skill-resolver.js.map +1 -0
- package/dist/eval-ui/assets/index-CFJYUUyc.js +73 -0
- package/dist/eval-ui/assets/index-DSspu3L6.css +1 -0
- package/dist/eval-ui/index.html +2 -2
- package/dist/installer/canonical.js +1 -1
- package/dist/installer/canonical.js.map +1 -1
- package/dist/installer/canonical.test.js +8 -0
- package/dist/installer/canonical.test.js.map +1 -1
- package/package.json +1 -1
- package/dist/eval-ui/assets/index-BsNUxjb1.js +0 -70
- package/dist/eval-ui/assets/index-D5mEzX7i.css +0 -1
|
@@ -8,6 +8,8 @@ import { fileURLToPath } from "node:url";
|
|
|
8
8
|
import { Router } from "./router.js";
|
|
9
9
|
import { sendJson } from "./router.js";
|
|
10
10
|
import { registerRoutes } from "./api-routes.js";
|
|
11
|
+
import { registerImproveRoutes } from "./improve-routes.js";
|
|
12
|
+
import { registerModelCompareRoutes } from "./model-compare-routes.js";
|
|
11
13
|
const __filename = fileURLToPath(import.meta.url);
|
|
12
14
|
const __dirname = path.dirname(__filename);
|
|
13
15
|
const MIME_TYPES = {
|
|
@@ -28,6 +30,8 @@ export async function startEvalServer(opts) {
|
|
|
28
30
|
const { port, root } = opts;
|
|
29
31
|
// Register API routes
|
|
30
32
|
registerRoutes(router, root, opts.projectName);
|
|
33
|
+
registerImproveRoutes(router, root);
|
|
34
|
+
registerModelCompareRoutes(router, root);
|
|
31
35
|
// Static asset directory
|
|
32
36
|
const staticDir = path.resolve(__dirname, "../eval-ui");
|
|
33
37
|
const server = http.createServer(async (req, res) => {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"eval-server.js","sourceRoot":"","sources":["../../src/eval-server/eval-server.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,gDAAgD;AAChD,8EAA8E;AAE9E,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACrC,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AACvC,OAAO,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;
|
|
1
|
+
{"version":3,"file":"eval-server.js","sourceRoot":"","sources":["../../src/eval-server/eval-server.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,gDAAgD;AAChD,8EAA8E;AAE9E,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACrC,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AACvC,OAAO,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AACjD,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,EAAE,0BAA0B,EAAE,MAAM,2BAA2B,CAAC;AAEvE,MAAM,UAAU,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAClD,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;AAE3C,MAAM,UAAU,GAA2B;IACzC,OAAO,EAAE,WAAW;IACpB,KAAK,EAAE,wBAAwB;IAC/B,MAAM,EAAE,UAAU;IAClB,OAAO,EAAE,kBAAkB;IAC3B,MAAM,EAAE,WAAW;IACnB,MAAM,EAAE,eAAe;IACvB,MAAM,EAAE,cAAc;IACtB,OAAO,EAAE,WAAW;IACpB,QAAQ,EAAE,YAAY;IACtB,MAAM,EAAE,UAAU;IAClB,MAAM,EAAE,kBAAkB;CAC3B,CAAC;AAQF,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,IAAuB;IAC3D,MAAM,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;IAC5B,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,IAAI,CAAC;IAE5B,sBAAsB;IACtB,cAAc,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;IAC/C,qBAAqB,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IACpC,0BAA0B,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAEzC,yBAAyB;IACzB,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,YAAY,CAAC,CAAC;IAExD,MAAM,MAAM,GAAG,IAAI,CAAC,YAAY,CAAC,KAAK,EAAE,GAAG,EAAE,GAAG,EAAE,EAAE;QAClD,wBAAwB;QACxB,IAAI,GAAG,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC;YAC7B,MAAM,SAAS,GAAG,GAAG,CAAC,GAAG,EAAE,UAAU,CAAC,OAAO,CAAC,CAAC;YAC/C,IAAI,SAAS,IAAK,MAAc,CAAC,OAAO,EAAE,CAAC;gBACxC,MAAc,CAAC,OAAO,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;gBAClC,OAAO;YACT,CAAC;YACD,GAAG,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YACnB,GAAG,CAAC,GAAG,EAAE,CAAC;YACV,OAAO;QACT,CAAC;QAED,uBAAuB;QACvB,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;QAC9C,IAAI,OAAO;YAAE,OAAO;QAEpB,6BAA6B;QAC7B,IAAI,GAAG,CAAC,GAAG,EAAE,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;YACjC,QAAQ,CAAC,GAAG,EAAE,EAAE,KAAK,EAAE,WAAW,EAAE,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;YAChD,OAAO;QACT,CAAC;QAED,qBAAq
B;QACrB,MAAM,WAAW,CAAC,GAAG,EAAE,GAAG,EAAE,SAAS,CAAC,CAAC;IACzC,CAAC,CAAC,CAAC;IAEH,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;QAC7B,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,GAAG,EAAE;YACvB,OAAO,CAAC,GAAG,CAAC,uCAAuC,IAAI,IAAI,CAAC,CAAC;YAC7D,OAAO,CAAC,MAAM,CAAC,CAAC;QAClB,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AAED,KAAK,UAAU,WAAW,CACxB,GAAyB,EACzB,GAAwB,EACxB,SAAiB;IAEjB,IAAI,OAAO,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,GAAG,IAAI,GAAG,EAAE,kBAAkB,CAAC,CAAC,QAAQ,CAAC;IACnE,IAAI,OAAO,KAAK,GAAG;QAAE,OAAO,GAAG,aAAa,CAAC;IAE7C,mCAAmC;IACnC,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,OAAO,CAAC,eAAe,EAAE,EAAE,CAAC,CAAC;IACtE,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;IAEhD,kCAAkC;IAClC,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QACpC,GAAG,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;QACnB,GAAG,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC;QACrB,OAAO;IACT,CAAC;IAED,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;QACnC,IAAI,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC;YAClB,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;YACnC,MAAM,WAAW,GAAG,UAAU,CAAC,GAAG,CAAC,IAAI,0BAA0B,CAAC;YAClE,GAAG,CAAC,SAAS,CAAC,GAAG,EAAE,EAAE,cAAc,EAAE,WAAW,EAAE,CAAC,CAAC;YACpD,EAAE,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACxC,OAAO;QACT,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,gCAAgC;IAClC,CAAC;IAED,0DAA0D;IAC1D,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,YAAY,CAAC,CAAC;IACrD,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;QACpD,GAAG,CAAC,SAAS,CAAC,GAAG,EAAE,EAAE,cAAc,EAAE,WAAW,EAAE,CAAC,CAAC;QACpD,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IACnB,CAAC;IAAC,MAAM,CAAC;QACP,GAAG,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;QACnB,GAAG,CAAC,GAAG,CAAC,+CAA+C,CAAC,CAAC;IAC3D,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// improve-routes.ts -- AI skill improvement + apply endpoints
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
import { readFileSync, writeFileSync, existsSync } from "node:fs";
|
|
5
|
+
import { join } from "node:path";
|
|
6
|
+
import { sendJson, readBody } from "./router.js";
|
|
7
|
+
import { resolveSkillDir } from "./skill-resolver.js";
|
|
8
|
+
import { readBenchmark } from "../eval/benchmark.js";
|
|
9
|
+
import { createLlmClient } from "../eval/llm.js";
|
|
10
|
+
// Upper bound on how many failed benchmark cases are quoted in the prompt.
const MAX_FAILED_CASES_IN_PROMPT = 10;
// Marker the model is asked to emit between the rewritten SKILL.md and its rationale.
const IMPROVE_REASONING_MARKER = "---REASONING---";

/**
 * True when `value` is usable as ONE path segment coming from the URL.
 * Rejects ".." and anything containing a path separator or NUL so that a
 * request cannot steer file reads/writes outside the directories that
 * `resolveSkillDir` builds from `root`.
 */
function isPlainSkillSegment(value) {
    return typeof value === "string" && value !== ".." && !/[\\\/\0]/.test(value);
}

/** Human-readable message for anything that can be thrown (not only Error instances). */
function improveErrorMessage(err) {
    return err instanceof Error ? err.message : String(err);
}

/**
 * Read the request body and normalise it to an object.
 * Replies with HTTP 400 and resolves to `null` when reading/parsing fails, so
 * callers only need `if (body === null) return;`.
 */
async function readImproveRequestBody(req, res) {
    try {
        const parsed = await readBody(req);
        // An empty or non-object body is treated as "no fields supplied".
        return parsed !== null && typeof parsed === "object" ? parsed : {};
    }
    catch (err) {
        sendJson(res, { error: `Invalid request body: ${improveErrorMessage(err)}` }, 400, req);
        return null;
    }
}

/**
 * Build the "Recent Benchmark Failures" prompt section from a benchmark record.
 * Returns "" when there is no benchmark or it contains no failed cases.
 * Missing `cases` / `assertions` arrays are tolerated instead of failing the request.
 */
function describeBenchmarkFailures(benchmark) {
    const cases = benchmark && Array.isArray(benchmark.cases) ? benchmark.cases : [];
    const failedCases = cases
        .filter((c) => c.status === "fail")
        .slice(0, MAX_FAILED_CASES_IN_PROMPT);
    if (failedCases.length === 0) {
        return "";
    }
    const sections = failedCases.map((c) => {
        const assertions = Array.isArray(c.assertions) ? c.assertions : [];
        const failedAssertions = assertions
            .filter((a) => !a.pass)
            .map((a) => ` - ${a.text}: ${a.reasoning}`)
            .join("\n");
        return `Case "${c.eval_name}":\n${failedAssertions}`;
    });
    return `\n\n## Recent Benchmark Failures (${failedCases.length} cases)\n${sections.join("\n\n")}`;
}

/**
 * Split raw model output into the improved document and the explanation.
 * Only the FIRST marker is treated as the separator, so an explanation that
 * itself mentions the marker is kept whole.
 */
function splitImprovedOutput(text) {
    const raw = typeof text === "string" ? text : "";
    const markerAt = raw.indexOf(IMPROVE_REASONING_MARKER);
    if (markerAt === -1) {
        return { improved: raw.trim(), reasoning: "Improvements applied." };
    }
    return {
        improved: raw.slice(0, markerAt).trim(),
        reasoning: raw.slice(markerAt + IMPROVE_REASONING_MARKER.length).trim(),
    };
}

/**
 * Register the AI-assisted SKILL.md endpoints on `router`:
 *
 *   POST /api/skills/:plugin/:skill/improve
 *        Sends the current SKILL.md (plus failed benchmark cases, if any) to an
 *        LLM and replies with `{ original, improved, reasoning }`. Nothing is
 *        written to disk.
 *   POST /api/skills/:plugin/:skill/apply-improvement
 *        Overwrites SKILL.md with the request's `content` string and replies
 *        with `{ ok: true }`.
 *
 * @param router Object exposing `post(path, handler)`; handlers are invoked as
 *               `(req, res, params)`.
 * @param root   Search root handed to `resolveSkillDir`.
 */
export function registerImproveRoutes(router, root) {
    // POST /api/skills/:plugin/:skill/improve
    router.post("/api/skills/:plugin/:skill/improve", async (req, res, params) => {
        if (!isPlainSkillSegment(params.plugin) || !isPlainSkillSegment(params.skill)) {
            sendJson(res, { error: "Invalid plugin or skill name" }, 400, req);
            return;
        }
        const skillDir = resolveSkillDir(root, params.plugin, params.skill);
        const skillMdPath = join(skillDir, "SKILL.md");
        if (!existsSync(skillMdPath)) {
            sendJson(res, { error: "SKILL.md not found" }, 404, req);
            return;
        }
        const body = await readImproveRequestBody(req, res);
        if (body === null) {
            return; // 400 already sent
        }
        try {
            const original = readFileSync(skillMdPath, "utf-8");
            // Gather recent failures from latest benchmark
            const failureContext = describeBenchmarkFailures(await readBenchmark(skillDir));
            const systemPrompt = [
                "You are an expert AI skill engineer. Your task is to improve a SKILL.md file based on its current content and any benchmark failures provided. Return ONLY the improved SKILL.md content — no explanations, no code fences, no preamble. The output should be valid SKILL.md that can be written directly to disk.",
                "",
                "After the improved content, on a new line, write \"---REASONING---\" followed by a brief explanation of what you changed and why.",
            ].join("\n");
            const userPrompt = `## Current SKILL.md\n${original}${failureContext}\n\nPlease improve this SKILL.md to address the failures and improve overall quality. Return the full improved content followed by ---REASONING--- and your explanation.`;
            const client = createLlmClient({
                provider: body.provider,
                model: body.model,
            });
            const result = await client.generate(systemPrompt, userPrompt);
            // Parse improved content and reasoning
            const { improved, reasoning } = splitImprovedOutput(result.text);
            if (!improved) {
                // An empty document can never be applied; surface it as a failure
                // instead of presenting a blank "improvement".
                throw new Error("model returned no SKILL.md content");
            }
            sendJson(res, { original, improved, reasoning }, 200, req);
        }
        catch (err) {
            sendJson(res, { error: `Improvement failed: ${improveErrorMessage(err)}` }, 500, req);
        }
    });
    // POST /api/skills/:plugin/:skill/apply-improvement
    router.post("/api/skills/:plugin/:skill/apply-improvement", async (req, res, params) => {
        if (!isPlainSkillSegment(params.plugin) || !isPlainSkillSegment(params.skill)) {
            sendJson(res, { error: "Invalid plugin or skill name" }, 400, req);
            return;
        }
        const skillDir = resolveSkillDir(root, params.plugin, params.skill);
        const skillMdPath = join(skillDir, "SKILL.md");
        const body = await readImproveRequestBody(req, res);
        if (body === null) {
            return; // 400 already sent
        }
        if (typeof body.content !== "string" || !body.content.trim()) {
            sendJson(res, { error: "Content is required and must be non-empty" }, 400, req);
            return;
        }
        try {
            writeFileSync(skillMdPath, body.content, "utf-8");
            sendJson(res, { ok: true }, 200, req);
        }
        catch (err) {
            sendJson(res, { error: `Failed to write SKILL.md: ${improveErrorMessage(err)}` }, 500, req);
        }
    });
}
|
|
77
|
+
//# sourceMappingURL=improve-routes.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"improve-routes.js","sourceRoot":"","sources":["../../src/eval-server/improve-routes.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,8DAA8D;AAC9D,8EAA8E;AAE9E,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AAClE,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AACjD,OAAO,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AAGjD,MAAM,UAAU,qBAAqB,CAAC,MAAc,EAAE,IAAY;IAChE,0CAA0C;IAC1C,MAAM,CAAC,IAAI,CAAC,oCAAoC,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,EAAE,EAAE;QAC3E,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;QACpE,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;QAE/C,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;YAC7B,QAAQ,CAAC,GAAG,EAAE,EAAE,KAAK,EAAE,oBAAoB,EAAE,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;YACzD,OAAO;QACT,CAAC;QAED,MAAM,IAAI,GAAG,CAAC,MAAM,QAAQ,CAAC,GAAG,CAAC,CAGhC,CAAC;QAEF,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,YAAY,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;YAEpD,+CAA+C;YAC/C,IAAI,cAAc,GAAG,EAAE,CAAC;YACxB,MAAM,SAAS,GAAG,MAAM,aAAa,CAAC,QAAQ,CAAC,CAAC;YAChD,IAAI,SAAS,EAAE,CAAC;gBACd,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK;qBAChC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC;qBAClC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBAEhB,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC3B,MAAM,QAAQ,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;wBACrC,MAAM,gBAAgB,GAAG,CAAC,CAAC,UAAU;6BAClC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;6BACtB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,SAAS,EAAE,CAAC;6BAC3C,IAAI,CAAC,IAAI,CAAC,CAAC;wBACd,OAAO,SAAS,CAAC,CAAC,SAAS,OAAO,gBAAgB,EAAE,CAAC;oBACvD,CAAC,CAAC,CAAC;oBACH,cAAc,GAAG,qCAAqC,WAAW,CAAC,MAAM,YAAY,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC;gBAC9G,CAAC;YACH,CAAC;YAED,MAAM,YAAY,GAAG;;gIAEqG,CAAC;YAE3H,MAAM,UAAU,GAAG,wBAAwB,QAAQ,GAAG,cAAc,0KAA0K,CAAC;YAE/O,MAAM,MAAM,GAAG,eAAe,CAAC;gBAC7B,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,KAAK,EAAE,IAAI,CAAC,KAAK;aAClB,CAAC,C
AAC;YAEH,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,YAAY,EAAE,UAAU,CAAC,CAAC;YAE/D,uCAAuC;YACvC,MAAM,KAAK,GAAG,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,iBAAiB,CAAC,CAAC;YACnD,MAAM,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YACjC,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,uBAAuB,CAAC;YAE/E,QAAQ,CAAC,GAAG,EAAE,EAAE,QAAQ,EAAE,QAAQ,EAAE,SAAS,EAAE,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;QAC7D,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,QAAQ,CACN,GAAG,EACH,EAAE,KAAK,EAAE,uBAAwB,GAAa,CAAC,OAAO,EAAE,EAAE,EAC1D,GAAG,EACH,GAAG,CACJ,CAAC;QACJ,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,oDAAoD;IACpD,MAAM,CAAC,IAAI,CAAC,8CAA8C,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,EAAE,EAAE;QACrF,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;QACpE,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;QAE/C,MAAM,IAAI,GAAG,CAAC,MAAM,QAAQ,CAAC,GAAG,CAAC,CAAyB,CAAC;QAE3D,IAAI,CAAC,IAAI,CAAC,OAAO,IAAI,OAAO,IAAI,CAAC,OAAO,KAAK,QAAQ,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;YAC9E,QAAQ,CAAC,GAAG,EAAE,EAAE,KAAK,EAAE,2CAA2C,EAAE,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;YAChF,OAAO;QACT,CAAC;QAED,IAAI,CAAC;YACH,aAAa,CAAC,WAAW,EAAE,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;YAClD,QAAQ,CAAC,GAAG,EAAE,EAAE,EAAE,EAAE,IAAI,EAAE,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;QACxC,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,QAAQ,CACN,GAAG,EACH,EAAE,KAAK,EAAE,6BAA8B,GAAa,CAAC,OAAO,EAAE,EAAE,EAChE,GAAG,EACH,GAAG,CACJ,CAAC;QACJ,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// model-compare-routes.ts -- per-test-case model A/B comparison (SSE)
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
import { readFileSync, existsSync } from "node:fs";
|
|
5
|
+
import { join } from "node:path";
|
|
6
|
+
import { readBody } from "./router.js";
|
|
7
|
+
import { resolveSkillDir } from "./skill-resolver.js";
|
|
8
|
+
import { initSSE, sendSSE, sendSSEDone } from "./sse-helpers.js";
|
|
9
|
+
import { loadAndValidateEvals } from "../eval/schema.js";
|
|
10
|
+
import { createLlmClient } from "../eval/llm.js";
|
|
11
|
+
import { judgeAssertion } from "../eval/judge.js";
|
|
12
|
+
/**
 * True when `value` is usable as ONE path segment coming from the URL.
 * Rejects ".." and anything containing a path separator or NUL so the request
 * cannot point skill resolution outside the directories built from `root`.
 */
function isPlainCompareSegment(value) {
    return typeof value === "string" && value !== ".." && !/[\\\/\0]/.test(value);
}

/** Fraction of judged assertions that passed; 0 when there are none. */
function comparePassRate(assertions) {
    return assertions.length > 0
        ? assertions.filter((a) => a.pass).length / assertions.length
        : 0;
}

/**
 * Generate one model's answer for `evalCase` and judge every assertion with
 * that same client.
 *
 * @param modelSpec    Value forwarded unchanged to `createLlmClient`.
 * @param systemPrompt System prompt shared by both models.
 * @param evalCase     Eval case providing `prompt` and `assertions`.
 * @param isAborted    Returns true once the client connection has closed.
 * @returns `{ model, output, durationMs, tokens, assertions }`, or `null` when
 *          the request was aborted while judging.
 */
async function runComparedModel(modelSpec, systemPrompt, evalCase, isAborted) {
    const client = createLlmClient(modelSpec);
    const result = await client.generate(systemPrompt, evalCase.prompt);
    // Token total is only reported when both counters are available.
    const tokens = result.inputTokens != null && result.outputTokens != null
        ? result.inputTokens + result.outputTokens
        : null;
    const assertions = [];
    for (const assertion of evalCase.assertions) {
        if (isAborted()) {
            return null;
        }
        assertions.push(await judgeAssertion(result.text, assertion, client));
    }
    if (isAborted()) {
        return null;
    }
    return {
        model: client.model,
        output: result.text,
        durationMs: result.durationMs,
        tokens,
        assertions,
    };
}

/**
 * Register POST /api/skills/:plugin/:skill/compare-models.
 *
 * The handler runs a single eval case (`body.eval_id`) against `body.modelA`
 * and then `body.modelB`, streaming progress as server-sent events:
 * `model_a_start`, `model_a_result`, `model_b_start`, `model_b_result`, and a
 * final done event carrying both results. Failures are reported through the
 * done event as `{ error }`. Work stops silently once the client disconnects.
 *
 * @param router Object exposing `post(path, handler)`; handlers are invoked as
 *               `(req, res, params)`.
 * @param root   Search root handed to `resolveSkillDir`.
 */
export function registerModelCompareRoutes(router, root) {
    router.post("/api/skills/:plugin/:skill/compare-models", async (req, res, params) => {
        let aborted = false;
        res.on("close", () => { aborted = true; });
        const isAborted = () => aborted;
        // Read the body defensively: a rejection here must still be reported
        // over the SSE stream rather than escaping the handler.
        let body;
        let bodyFailed = false;
        let bodyError;
        try {
            body = await readBody(req);
        }
        catch (err) {
            bodyFailed = true;
            bodyError = err;
        }
        initSSE(res, req);
        try {
            if (bodyFailed) {
                throw bodyError;
            }
            if (body === null || typeof body !== "object") {
                throw new Error("Request body must be a JSON object");
            }
            if (!isPlainCompareSegment(params.plugin) || !isPlainCompareSegment(params.skill)) {
                throw new Error("Invalid plugin or skill name");
            }
            const skillDir = resolveSkillDir(root, params.plugin, params.skill);
            // Load eval case
            const evals = loadAndValidateEvals(skillDir);
            const evalCase = evals.evals.find((e) => e.id === body.eval_id);
            if (!evalCase) {
                sendSSE(res, "error", { error: `Eval case #${body.eval_id} not found` });
                sendSSEDone(res, { error: `Eval case #${body.eval_id} not found` });
                return;
            }
            // Read SKILL.md for system prompt
            const skillMdPath = join(skillDir, "SKILL.md");
            const skillContent = existsSync(skillMdPath) ? readFileSync(skillMdPath, "utf-8") : "";
            const systemPrompt = skillContent
                ? `You are an AI assistant enhanced with the following skill:\n\n${skillContent}`
                : "You are a helpful AI assistant.";
            // Run Model A
            if (aborted)
                return;
            sendSSE(res, "model_a_start", { model: body.modelA });
            const summaryA = await runComparedModel(body.modelA, systemPrompt, evalCase, isAborted);
            if (summaryA === null)
                return;
            sendSSE(res, "model_a_result", { ...summaryA, passRate: comparePassRate(summaryA.assertions) });
            // Run Model B
            if (aborted)
                return;
            sendSSE(res, "model_b_start", { model: body.modelB });
            const summaryB = await runComparedModel(body.modelB, systemPrompt, evalCase, isAborted);
            if (summaryB === null)
                return;
            sendSSE(res, "model_b_result", { ...summaryB, passRate: comparePassRate(summaryB.assertions) });
            // Done — results are ephemeral (not saved to history)
            sendSSEDone(res, {
                eval_id: body.eval_id,
                eval_name: evalCase.name,
                modelA: summaryA,
                modelB: summaryB,
            });
        }
        catch (err) {
            if (!aborted) {
                sendSSEDone(res, { error: err instanceof Error ? err.message : String(err) });
            }
        }
    });
}
|
|
119
|
+
//# sourceMappingURL=model-compare-routes.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"model-compare-routes.js","sourceRoot":"","sources":["../../src/eval-server/model-compare-routes.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,sEAAsE;AACtE,8EAA8E;AAE9E,OAAO,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AACvC,OAAO,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAC;AACtD,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AACjE,OAAO,EAAE,oBAAoB,EAAE,MAAM,mBAAmB,CAAC;AACzD,OAAO,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AAEjD,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC;AAOlD,MAAM,UAAU,0BAA0B,CAAC,MAAc,EAAE,IAAY;IACrE,MAAM,CAAC,IAAI,CAAC,2CAA2C,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,EAAE,EAAE;QAClF,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;QACpE,IAAI,OAAO,GAAG,KAAK,CAAC;QACpB,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE,GAAG,OAAO,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QAE3C,MAAM,IAAI,GAAG,CAAC,MAAM,QAAQ,CAAC,GAAG,CAAC,CAIhC,CAAC;QAEF,OAAO,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;QAElB,IAAI,CAAC;YACH,iBAAiB;YACjB,MAAM,KAAK,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;YAC7C,MAAM,QAAQ,GAAG,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,IAAI,CAAC,OAAO,CAAC,CAAC;YAChE,IAAI,CAAC,QAAQ,EAAE,CAAC;gBACd,OAAO,CAAC,GAAG,EAAE,OAAO,EAAE,EAAE,KAAK,EAAE,cAAc,IAAI,CAAC,OAAO,YAAY,EAAE,CAAC,CAAC;gBACzE,WAAW,CAAC,GAAG,EAAE,EAAE,KAAK,EAAE,cAAc,IAAI,CAAC,OAAO,YAAY,EAAE,CAAC,CAAC;gBACpE,OAAO;YACT,CAAC;YAED,kCAAkC;YAClC,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;YAC/C,MAAM,YAAY,GAAG,UAAU,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YACvF,MAAM,YAAY,GAAG,YAAY;gBAC/B,CAAC,CAAC,iEAAiE,YAAY,EAAE;gBACjF,CAAC,CAAC,iCAAiC,CAAC;YAEtC,cAAc;YACd,IAAI,OAAO;gBAAE,OAAO;YACpB,OAAO,CAAC,GAAG,EAAE,eAAe,EAAE,EAAE,KAAK,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;YAEtD,MAAM,OAAO,GAAG,eAAe,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YAC7C,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,QAAQ,CAAC,YAAY,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;YACtE,MAAM,YAAY,GAAG,OAAO,CAAC,WAAW,IAAI,IAAI,IAAI,OAAO,CAAC,YAAY
,IAAI,IAAI;gBAC9E,CAAC,CAAC,OAAO,CAAC,WAAW,GAAG,OAAO,CAAC,YAAY;gBAC5C,CAAC,CAAC,IAAI,CAAC;YAET,+BAA+B;YAC/B,MAAM,WAAW,GAAG,EAAE,CAAC;YACvB,KAAK,MAAM,SAAS,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC;gBAC5C,IAAI,OAAO;oBAAE,OAAO;gBACpB,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,OAAO,CAAC,IAAI,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;gBACtE,WAAW,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YAC3B,CAAC;YAED,IAAI,OAAO;gBAAE,OAAO;YACpB,OAAO,CAAC,GAAG,EAAE,gBAAgB,EAAE;gBAC7B,KAAK,EAAE,OAAO,CAAC,KAAK;gBACpB,MAAM,EAAE,OAAO,CAAC,IAAI;gBACpB,UAAU,EAAE,OAAO,CAAC,UAAU;gBAC9B,MAAM,EAAE,YAAY;gBACpB,UAAU,EAAE,WAAW;gBACvB,QAAQ,EAAE,WAAW,CAAC,MAAM,GAAG,CAAC;oBAC9B,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,GAAG,WAAW,CAAC,MAAM;oBAC/D,CAAC,CAAC,CAAC;aACN,CAAC,CAAC;YAEH,cAAc;YACd,IAAI,OAAO;gBAAE,OAAO;YACpB,OAAO,CAAC,GAAG,EAAE,eAAe,EAAE,EAAE,KAAK,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;YAEtD,MAAM,OAAO,GAAG,eAAe,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YAC7C,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,QAAQ,CAAC,YAAY,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;YACtE,MAAM,YAAY,GAAG,OAAO,CAAC,WAAW,IAAI,IAAI,IAAI,OAAO,CAAC,YAAY,IAAI,IAAI;gBAC9E,CAAC,CAAC,OAAO,CAAC,WAAW,GAAG,OAAO,CAAC,YAAY;gBAC5C,CAAC,CAAC,IAAI,CAAC;YAET,+BAA+B;YAC/B,MAAM,WAAW,GAAG,EAAE,CAAC;YACvB,KAAK,MAAM,SAAS,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC;gBAC5C,IAAI,OAAO;oBAAE,OAAO;gBACpB,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,OAAO,CAAC,IAAI,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;gBACtE,WAAW,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YAC3B,CAAC;YAED,IAAI,OAAO;gBAAE,OAAO;YACpB,OAAO,CAAC,GAAG,EAAE,gBAAgB,EAAE;gBAC7B,KAAK,EAAE,OAAO,CAAC,KAAK;gBACpB,MAAM,EAAE,OAAO,CAAC,IAAI;gBACpB,UAAU,EAAE,OAAO,CAAC,UAAU;gBAC9B,MAAM,EAAE,YAAY;gBACpB,UAAU,EAAE,WAAW;gBACvB,QAAQ,EAAE,WAAW,CAAC,MAAM,GAAG,CAAC;oBAC9B,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,GAAG,WAAW,CAAC,MAAM;oBAC/D,CAAC,CAAC,CAAC;aACN,CAAC,CAAC;YAEH,sDAAsD;YACtD,WAAW,CAAC,GAAG,EAAE;gBACf,OAAO,EAAE,IAAI,CAAC,OAAO;gBACrB,SAAS,EAAE,QAAQ,CAAC,IAAI;gBACxB,MAAM,EAAE;oBACN,KAAK,EAAE,OAAO,CAAC,KAAK;oBACpB,MAAM,EAAE,OAAO,CAAC,IAAI;oBACpB,UAAU,EAAE
,OAAO,CAAC,UAAU;oBAC9B,MAAM,EAAE,YAAY;oBACpB,UAAU,EAAE,WAAW;iBACxB;gBACD,MAAM,EAAE;oBACN,KAAK,EAAE,OAAO,CAAC,KAAK;oBACpB,MAAM,EAAE,OAAO,CAAC,IAAI;oBACpB,UAAU,EAAE,OAAO,CAAC,UAAU;oBAC9B,MAAM,EAAE,YAAY;oBACpB,UAAU,EAAE,WAAW;iBACxB;aACF,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,IAAI,CAAC,OAAO,EAAE,CAAC;gBACb,WAAW,CAAC,GAAG,EAAE,EAAE,KAAK,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAChF,CAAC;QACH,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Resolve the directory that holds a skill's files.
 *
 * Candidates are tried in order and the first match wins:
 *   1. `root` itself, when its base name equals `skill` and it contains SKILL.md
 *   2. `{root}/{plugin}/skills/{skill}`
 *   3. `{root}/plugins/{plugin}/skills/{skill}`
 *   4. `{root}/skills/{skill}`
 * When none exists, candidate 2 is returned even though it is absent on disk,
 * so callers must check for existence themselves.
 *
 * `plugin` and `skill` are joined into the path as given; the implementation
 * does not verify that the result stays inside `root`.
 */
export declare function resolveSkillDir(root: string, plugin: string, skill: string): string;
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// skill-resolver.ts -- shared skill directory resolution
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
import { existsSync } from "node:fs";
|
|
5
|
+
import { join, basename } from "node:path";
|
|
6
|
+
/**
 * Resolve the directory that holds a skill's files.
 *
 * Lookup order (first hit wins):
 *   1. self layout   - `root` is the skill directory: its base name equals
 *                      `skill` and it contains SKILL.md
 *   2. direct layout - {root}/{plugin}/skills/{skill}
 *   3. nested layout - {root}/plugins/{plugin}/skills/{skill}
 *   4. root layout   - {root}/skills/{skill}
 *
 * If nothing exists the direct-layout path is returned anyway, so the caller
 * is responsible for checking existence. `plugin` and `skill` are joined
 * as-is; no containment check against `root` is performed here.
 */
export function resolveSkillDir(root, plugin, skill) {
    const rootIsTheSkill = basename(root) === skill && existsSync(join(root, "SKILL.md"));
    if (rootIsTheSkill) {
        return root;
    }
    const direct = join(root, plugin, "skills", skill);
    const candidates = [
        direct,
        join(root, "plugins", plugin, "skills", skill),
        join(root, "skills", skill),
    ];
    for (const dir of candidates) {
        if (existsSync(dir)) {
            return dir;
        }
    }
    // Nothing on disk: fall back to the direct layout path.
    return direct;
}
|
|
24
|
+
//# sourceMappingURL=skill-resolver.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"skill-resolver.js","sourceRoot":"","sources":["../../src/eval-server/skill-resolver.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,yDAAyD;AACzD,8EAA8E;AAE9E,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAE3C,MAAM,UAAU,eAAe,CAAC,IAAY,EAAE,MAAc,EAAE,KAAa;IACzE,8DAA8D;IAC9D,IAAI,QAAQ,CAAC,IAAI,CAAC,KAAK,KAAK,IAAI,UAAU,CAAC,IAAI,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;QAAE,OAAO,IAAI,CAAC;IAEhF,qDAAqD;IACrD,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC;IACvD,IAAI,UAAU,CAAC,UAAU,CAAC;QAAE,OAAO,UAAU,CAAC;IAE9C,sEAAsE;IACtE,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,SAAS,EAAE,MAAM,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC;IAClE,IAAI,UAAU,CAAC,UAAU,CAAC;QAAE,OAAO,UAAU,CAAC;IAE9C,0CAA0C;IAC1C,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC;IAC7C,IAAI,UAAU,CAAC,QAAQ,CAAC;QAAE,OAAO,QAAQ,CAAC;IAE1C,OAAO,UAAU,CAAC;AACpB,CAAC"}
|