vskill 0.2.27 → 0.2.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/__tests__/eval-router.test.d.ts +1 -0
- package/dist/commands/__tests__/eval-router.test.js +60 -0
- package/dist/commands/__tests__/eval-router.test.js.map +1 -0
- package/dist/commands/add.js +166 -24
- package/dist/commands/add.js.map +1 -1
- package/dist/commands/add.test.js +96 -1
- package/dist/commands/add.test.js.map +1 -1
- package/dist/commands/eval/__tests__/coverage.test.d.ts +1 -0
- package/dist/commands/eval/__tests__/coverage.test.js +122 -0
- package/dist/commands/eval/__tests__/coverage.test.js.map +1 -0
- package/dist/commands/eval/__tests__/generate-all.test.d.ts +1 -0
- package/dist/commands/eval/__tests__/generate-all.test.js +133 -0
- package/dist/commands/eval/__tests__/generate-all.test.js.map +1 -0
- package/dist/commands/eval/__tests__/init.test.d.ts +1 -0
- package/dist/commands/eval/__tests__/init.test.js +116 -0
- package/dist/commands/eval/__tests__/init.test.js.map +1 -0
- package/dist/commands/eval/__tests__/run.test.d.ts +1 -0
- package/dist/commands/eval/__tests__/run.test.js +149 -0
- package/dist/commands/eval/__tests__/run.test.js.map +1 -0
- package/dist/commands/eval/coverage.d.ts +1 -0
- package/dist/commands/eval/coverage.js +79 -0
- package/dist/commands/eval/coverage.js.map +1 -0
- package/dist/commands/eval/generate-all.d.ts +1 -0
- package/dist/commands/eval/generate-all.js +64 -0
- package/dist/commands/eval/generate-all.js.map +1 -0
- package/dist/commands/eval/init.d.ts +1 -0
- package/dist/commands/eval/init.js +38 -0
- package/dist/commands/eval/init.js.map +1 -0
- package/dist/commands/eval/run.d.ts +1 -0
- package/dist/commands/eval/run.js +107 -0
- package/dist/commands/eval/run.js.map +1 -0
- package/dist/commands/eval.d.ts +4 -0
- package/dist/commands/eval.js +48 -0
- package/dist/commands/eval.js.map +1 -0
- package/dist/eval/__tests__/benchmark.test.d.ts +1 -0
- package/dist/eval/__tests__/benchmark.test.js +65 -0
- package/dist/eval/__tests__/benchmark.test.js.map +1 -0
- package/dist/eval/__tests__/judge.test.d.ts +1 -0
- package/dist/eval/__tests__/judge.test.js +45 -0
- package/dist/eval/__tests__/judge.test.js.map +1 -0
- package/dist/eval/__tests__/llm.test.d.ts +1 -0
- package/dist/eval/__tests__/llm.test.js +85 -0
- package/dist/eval/__tests__/llm.test.js.map +1 -0
- package/dist/eval/__tests__/prompt-builder.test.d.ts +1 -0
- package/dist/eval/__tests__/prompt-builder.test.js +72 -0
- package/dist/eval/__tests__/prompt-builder.test.js.map +1 -0
- package/dist/eval/__tests__/schema.test.d.ts +1 -0
- package/dist/eval/__tests__/schema.test.js +209 -0
- package/dist/eval/__tests__/schema.test.js.map +1 -0
- package/dist/eval/__tests__/skill-scanner.test.d.ts +1 -0
- package/dist/eval/__tests__/skill-scanner.test.js +78 -0
- package/dist/eval/__tests__/skill-scanner.test.js.map +1 -0
- package/dist/eval/benchmark.d.ts +22 -0
- package/dist/eval/benchmark.js +24 -0
- package/dist/eval/benchmark.js.map +1 -0
- package/dist/eval/judge.d.ts +9 -0
- package/dist/eval/judge.js +40 -0
- package/dist/eval/judge.js.map +1 -0
- package/dist/eval/llm.d.ts +5 -0
- package/dist/eval/llm.js +34 -0
- package/dist/eval/llm.js.map +1 -0
- package/dist/eval/prompt-builder.d.ts +3 -0
- package/dist/eval/prompt-builder.js +155 -0
- package/dist/eval/prompt-builder.js.map +1 -0
- package/dist/eval/schema.d.ts +26 -0
- package/dist/eval/schema.js +128 -0
- package/dist/eval/schema.js.map +1 -0
- package/dist/eval/skill-scanner.d.ts +8 -0
- package/dist/eval/skill-scanner.js +44 -0
- package/dist/eval/skill-scanner.js.map +1 -0
- package/dist/index.js +9 -0
- package/dist/index.js.map +1 -1
- package/dist/marketplace/index.d.ts +2 -2
- package/dist/marketplace/index.js +1 -1
- package/dist/marketplace/index.js.map +1 -1
- package/dist/marketplace/marketplace.d.ts +13 -0
- package/dist/marketplace/marketplace.js +35 -0
- package/dist/marketplace/marketplace.js.map +1 -1
- package/package.json +2 -1
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"init.js","sourceRoot":"","sources":["../../../src/commands/eval/init.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,gEAAgE;AAChE,8EAA8E;AAE9E,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAC7E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,EAAE,mBAAmB,EAAE,mBAAmB,EAAE,MAAM,8BAA8B,CAAC;AACxF,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,EAAE,MAAM,uBAAuB,CAAC;AAEhE,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,QAAgB,EAChB,KAAc;IAEd,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IAC/C,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACzC,MAAM,SAAS,GAAG,IAAI,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;IAE/C,wBAAwB;IACxB,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;QAC7B,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,yBAAyB,WAAW,EAAE,CAAC,CAAC,CAAC;QAC3D,OAAO;IACT,CAAC;IAED,4BAA4B;IAC5B,IAAI,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC;QACpC,OAAO,CAAC,GAAG,CACT,MAAM,CAAC,qDAAqD,CAAC,CAC9D,CAAC;QACF,OAAO;IACT,CAAC;IAED,MAAM,YAAY,GAAG,YAAY,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;IACxD,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;IAEjD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,QAAQ,CAC/B,qFAAqF,EACrF,MAAM,CACP,CAAC;QAEF,MAAM,SAAS,GAAG,mBAAmB,CAAC,GAAG,CAAC,CAAC;QAE3C,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QACzC,aAAa,CAAC,SAAS,EAAE,IAAI,CAAC,SAAS,CAAC,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;QAEtE,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,WAAW,SAAS,EAAE,CAAC,CAAC,CAAC;QAC3C,OAAO,CAAC,GAAG,CACT,GAAG,CAAC,KAAK,SAAS,CAAC,KAAK,CAAC,MAAM,gBAAgB,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC,aAAa,CAAC,CAC9H,CAAC;IACJ,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,OAAO,CAAC,KAAK,CACX,GAAG,CAAC,4BAA4B,CAAC,GAAG,GAAG,CAAE,GAAa,CAAC,OAAO,CAAC,CAChE,CAAC;IACJ,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/** Executes the eval cases for the skill located at `skillDir` and grades their assertions. */
export declare function runEvalRun(skillDir: string): Promise<void>;
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// vskill eval run -- execute eval cases and grade assertions
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
import { loadAndValidateEvals, EvalValidationError } from "../../eval/schema.js";
|
|
5
|
+
import { createLlmClient } from "../../eval/llm.js";
|
|
6
|
+
import { judgeAssertion } from "../../eval/judge.js";
|
|
7
|
+
import { writeBenchmark } from "../../eval/benchmark.js";
|
|
8
|
+
import { green, red, yellow, bold, dim, table } from "../../utils/output.js";
|
|
9
|
+
/**
 * Runs every eval case returned by `loadAndValidateEvals(skillDir)`, grades
 * each assertion with `judgeAssertion`, prints a results table plus a summary
 * line, and hands the collected results to `writeBenchmark`.
 *
 * When loading/validating the evals throws, an error is printed and the
 * process exits with code 1. A failure inside a single eval case is recorded
 * as an "error" case and the remaining cases still run.
 *
 * @param {string} skillDir - Skill directory; error and success messages
 *   reference `<skillDir>/evals/evals.json` and `<skillDir>/evals/benchmark.json`.
 * @returns {Promise<void>}
 */
export async function runEvalRun(skillDir) {
    // Thrown values are not guaranteed to be Error instances, so never assume
    // that `.message` exists (calling `.slice` on undefined would abort the run).
    const describeError = (thrown) => typeof thrown?.message === "string" ? thrown.message : String(thrown);
    // Load and validate evals.json
    let evalsFile;
    try {
        evalsFile = loadAndValidateEvals(skillDir);
    }
    catch (err) {
        if (err instanceof EvalValidationError) {
            const firstMsg = err.errors[0]?.message || "";
            if (firstMsg.includes("No evals.json")) {
                console.error(red(`No evals.json found at ${skillDir}/evals/evals.json`));
            }
            else {
                console.error(red(`Invalid evals.json: ${err.message}`));
            }
        }
        else {
            console.error(red(`Error loading evals: ${describeError(err)}`));
        }
        process.exit(1);
        // Only reached when process.exit is stubbed (e.g. in tests).
        return;
    }
    const client = createLlmClient();
    const model = client.model;
    const benchmarkCases = [];
    const tableRows = [];
    for (const evalCase of evalsFile.evals) {
        try {
            // Step 1: Send prompt to LLM
            const output = await client.generate("You are an AI skill being evaluated. Respond to the prompt as the skill would.", evalCase.prompt);
            // Step 2: Judge each assertion (sequentially, one judge call per assertion)
            const assertionResults = [];
            let passCount = 0;
            for (const assertion of evalCase.assertions) {
                const result = await judgeAssertion(output, assertion, client);
                assertionResults.push(result);
                if (result.pass) {
                    passCount++;
                }
                // Keep the TEXT column narrow: at most 60 characters including the ellipsis.
                const truncatedText = assertion.text.length > 60
                    ? assertion.text.slice(0, 57) + "..."
                    : assertion.text;
                tableRows.push([
                    evalCase.name,
                    assertion.id,
                    truncatedText,
                    result.pass ? green("PASS") : red("FAIL"),
                ]);
            }
            // NOTE(review): a case with zero assertions gets pass_rate 0 but
            // status "pass" (0 === 0) -- confirm this is the intended outcome.
            const passRate = evalCase.assertions.length > 0
                ? passCount / evalCase.assertions.length
                : 0;
            const allPassed = passCount === evalCase.assertions.length;
            benchmarkCases.push({
                eval_id: evalCase.id,
                eval_name: evalCase.name,
                status: allPassed ? "pass" : "fail",
                error_message: null,
                pass_rate: passRate,
                assertions: assertionResults,
            });
        }
        catch (err) {
            // Mark case as error, continue with remaining
            const message = describeError(err);
            benchmarkCases.push({
                eval_id: evalCase.id,
                eval_name: evalCase.name,
                status: "error",
                error_message: message,
                pass_rate: 0,
                assertions: [],
            });
            tableRows.push([
                evalCase.name,
                "-",
                dim("Error: " + message.slice(0, 50)),
                yellow("ERROR"),
            ]);
        }
    }
    // Print results table
    const headers = ["EVAL", "ASSERTION", "TEXT", "STATUS"];
    console.log(bold(`\nEval Results: ${evalsFile.skill_name}\n`));
    console.log(table(headers, tableRows));
    // Compute summary
    const passed = benchmarkCases.filter((c) => c.status === "pass").length;
    const failed = benchmarkCases.filter((c) => c.status === "fail").length;
    const errors = benchmarkCases.filter((c) => c.status === "error").length;
    // Join only the parts that apply so the leading blank line survives and no
    // doubled/trailing spaces appear when a count is zero.
    const summaryParts = [green(`${passed} passed`)];
    if (failed > 0) {
        summaryParts.push(red(`${failed} failed`));
    }
    if (errors > 0) {
        summaryParts.push(yellow(`${errors} errors`));
    }
    console.log(`\n${summaryParts.join(" ")}`);
    // Write benchmark.json
    const benchmark = {
        timestamp: new Date().toISOString(),
        model,
        skill_name: evalsFile.skill_name,
        cases: benchmarkCases,
    };
    await writeBenchmark(skillDir, benchmark);
    console.log(dim(`\nBenchmark written to ${skillDir}/evals/benchmark.json`));
}
|
|
107
|
+
//# sourceMappingURL=run.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"run.js","sourceRoot":"","sources":["../../../src/commands/eval/run.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,6DAA6D;AAC7D,8EAA8E;AAE9E,OAAO,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AACjF,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,EAAE,cAAc,EAAE,MAAM,yBAAyB,CAAC;AAEzD,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,uBAAuB,CAAC;AAE7E,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,QAAgB;IAC/C,+BAA+B;IAC/B,IAAI,SAAS,CAAC;IACd,IAAI,CAAC;QACH,SAAS,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IAC7C,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,IAAI,GAAG,YAAY,mBAAmB,EAAE,CAAC;YACvC,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,OAAO,IAAI,EAAE,CAAC;YAC9C,IAAI,QAAQ,CAAC,QAAQ,CAAC,eAAe,CAAC,EAAE,CAAC;gBACvC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,0BAA0B,QAAQ,mBAAmB,CAAC,CAAC,CAAC;YAC5E,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,uBAAuB,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YAC3D,CAAC;QACH,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,wBAAyB,GAAa,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QACvE,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChB,OAAO;IACT,CAAC;IAED,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;IACjC,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC;IAC3B,MAAM,cAAc,GAAoB,EAAE,CAAC;IAC3C,MAAM,SAAS,GAAe,EAAE,CAAC;IAEjC,KAAK,MAAM,QAAQ,IAAI,SAAS,CAAC,KAAK,EAAE,CAAC;QACvC,IAAI,CAAC;YACH,6BAA6B;YAC7B,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAClC,gFAAgF,EAChF,QAAQ,CAAC,MAAM,CAChB,CAAC;YAEF,+BAA+B;YAC/B,MAAM,gBAAgB,GAAG,EAAE,CAAC;YAC5B,IAAI,SAAS,GAAG,CAAC,CAAC;YAElB,KAAK,MAAM,SAAS,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC;gBAC5C,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,MAAM,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;gBAC/D,gBAAgB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBAC9B,IAAI,MAAM,CAAC,IAAI;oBAAE,SAAS,EAAE,CAAC;gBAE7B,MAAM,aAAa,GACjB,SAAS,CAAC,IAAI,CAAC,MAAM,GAAG,EAAE;oBACxB,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,KAAK;oBACrC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC;gBAErB,SAAS,CAAC,IAAI,CAAC;oBACb,QAAQ,CAAC,IAAI;oBACb,SAAS,CAAC,EAAE;oBACZ,aAAa;oBACb,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,
KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC;iBAC1C,CAAC,CAAC;YACL,CAAC;YAED,MAAM,QAAQ,GAAG,QAAQ,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC;gBAC7C,CAAC,CAAC,SAAS,GAAG,QAAQ,CAAC,UAAU,CAAC,MAAM;gBACxC,CAAC,CAAC,CAAC,CAAC;YACN,MAAM,SAAS,GAAG,SAAS,KAAK,QAAQ,CAAC,UAAU,CAAC,MAAM,CAAC;YAE3D,cAAc,CAAC,IAAI,CAAC;gBAClB,OAAO,EAAE,QAAQ,CAAC,EAAE;gBACpB,SAAS,EAAE,QAAQ,CAAC,IAAI;gBACxB,MAAM,EAAE,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;gBACnC,aAAa,EAAE,IAAI;gBACnB,SAAS,EAAE,QAAQ;gBACnB,UAAU,EAAE,gBAAgB;aAC7B,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,8CAA8C;YAC9C,cAAc,CAAC,IAAI,CAAC;gBAClB,OAAO,EAAE,QAAQ,CAAC,EAAE;gBACpB,SAAS,EAAE,QAAQ,CAAC,IAAI;gBACxB,MAAM,EAAE,OAAO;gBACf,aAAa,EAAG,GAAa,CAAC,OAAO;gBACrC,SAAS,EAAE,CAAC;gBACZ,UAAU,EAAE,EAAE;aACf,CAAC,CAAC;YAEH,SAAS,CAAC,IAAI,CAAC;gBACb,QAAQ,CAAC,IAAI;gBACb,GAAG;gBACH,GAAG,CAAC,SAAS,GAAI,GAAa,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBACpD,MAAM,CAAC,OAAO,CAAC;aAChB,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,sBAAsB;IACtB,MAAM,OAAO,GAAG,CAAC,MAAM,EAAE,WAAW,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC;IACxD,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,mBAAmB,SAAS,CAAC,UAAU,IAAI,CAAC,CAAC,CAAC;IAC/D,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC,CAAC;IAEvC,kBAAkB;IAClB,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,MAAM,CAAC;IACxE,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,MAAM,CAAC;IACxE,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,OAAO,CAAC,CAAC,MAAM,CAAC;IACzE,OAAO,CAAC,GAAG,CACT,KAAK,KAAK,CAAC,GAAG,MAAM,SAAS,CAAC,IAAI,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,MAAM,SAAS,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,MAAM,SAAS,CAAC,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,EAAE,CACrI,CAAC;IAEF,uBAAuB;IACvB,MAAM,SAAS,GAAoB;QACjC,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,KAAK;QACL,UAAU,EAAE,SAAS,CAAC,UAAU;QAChC,KAAK,EAAE,cAAc;KACtB,CAAC;IAEF,MAAM,cAAc,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;IAC1C,
OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,0BAA0B,QAAQ,uBAAuB,CAAC,CAAC,CAAC;AAC9E,CAAC"}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// vskill eval -- subcommand router
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
import { join, resolve } from "node:path";
|
|
5
|
+
import { red, dim } from "../utils/output.js";
|
|
6
|
+
/**
 * Router for `vskill eval <subcommand>`: resolves the plugins root, then
 * lazily imports and invokes the handler for the requested subcommand.
 *
 * @param {string} subcommand - "init", "run", "coverage" or "generate-all".
 * @param {string} [target] - `<plugin>/<skill>`; required by "init" and "run".
 * @param {{ root?: string, force?: boolean }} [opts] - `root` replaces the
 *   default "plugins" directory (both are resolved to absolute paths);
 *   `force` is forwarded, coerced to boolean, to "init" and "generate-all".
 * @returns {Promise<*>} Whatever the selected handler returns; `undefined`
 *   for a usage error or an unknown subcommand.
 */
export async function evalCommand(subcommand, target, opts = {}) {
    const root = opts.root ? resolve(opts.root) : resolve("plugins");
    switch (subcommand) {
        case "init": {
            if (!target) {
                console.error(red("Usage: vskill eval init <plugin>/<skill>"));
                process.exit(1);
                // Stop here even if process.exit is stubbed, so a missing
                // target never reaches resolveSkillDir.
                return;
            }
            const skillDir = resolveSkillDir(root, target);
            const { runEvalInit } = await import("./eval/init.js");
            return runEvalInit(skillDir, !!opts.force);
        }
        case "run": {
            if (!target) {
                console.error(red("Usage: vskill eval run <plugin>/<skill>"));
                process.exit(1);
                // Same guard as for "init".
                return;
            }
            const skillDir = resolveSkillDir(root, target);
            const { runEvalRun } = await import("./eval/run.js");
            return runEvalRun(skillDir);
        }
        case "coverage": {
            const { runEvalCoverage } = await import("./eval/coverage.js");
            return runEvalCoverage(root);
        }
        case "generate-all": {
            const { runEvalGenerateAll } = await import("./eval/generate-all.js");
            return runEvalGenerateAll(root, !!opts.force);
        }
        default:
            // NOTE(review): an unknown subcommand is only reported; the exit
            // status is left untouched -- confirm whether it should be non-zero.
            console.error(red(`Unknown subcommand: "${subcommand}"\n`) +
                dim("Available: init, run, coverage, generate-all"));
    }
}
|
|
40
|
+
/**
 * Maps a `<plugin>/<skill>` target onto `<root>/<plugin>/skills/<skill>`.
 *
 * Prints an error and exits with code 1 unless the target consists of exactly
 * two segments that are each non-empty and neither "." nor "..". Rejecting
 * those segments keeps inputs such as "plugin/" or "../x" from resolving to a
 * directory other than `<root>/<plugin>/skills/<skill>`.
 *
 * @param {string} root - Directory that contains the plugins.
 * @param {string} target - Target in `<plugin>/<skill>` form.
 * @returns {string} Path of the skill directory.
 */
function resolveSkillDir(root, target) {
    const parts = target.split("/");
    const isUsableSegment = (segment) => segment !== "" && segment !== "." && segment !== "..";
    if (parts.length !== 2 || !parts.every(isUsableSegment)) {
        console.error(red(`Invalid target "${target}". Expected format: <plugin>/<skill>`));
        process.exit(1);
    }
    return join(root, parts[0], "skills", parts[1]);
}
|
|
48
|
+
//# sourceMappingURL=eval.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval.js","sourceRoot":"","sources":["../../src/commands/eval.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,mCAAmC;AACnC,8EAA8E;AAE9E,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,oBAAoB,CAAC;AAE9C,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,UAAkB,EAClB,MAAe,EACf,OAA2C,EAAE;IAE7C,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;IAEjE,QAAQ,UAAU,EAAE,CAAC;QACnB,KAAK,MAAM,CAAC,CAAC,CAAC;YACZ,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,0CAA0C,CAAC,CAAC,CAAC;gBAC/D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC/C,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,gBAAgB,CAAC,CAAC;YACvD,OAAO,WAAW,CAAC,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAC7C,CAAC;QAED,KAAK,KAAK,CAAC,CAAC,CAAC;YACX,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,yCAAyC,CAAC,CAAC,CAAC;gBAC9D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC/C,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,eAAe,CAAC,CAAC;YACrD,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC;QAC9B,CAAC;QAED,KAAK,UAAU,CAAC,CAAC,CAAC;YAChB,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;YAC/D,OAAO,eAAe,CAAC,IAAI,CAAC,CAAC;QAC/B,CAAC;QAED,KAAK,cAAc,CAAC,CAAC,CAAC;YACpB,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,wBAAwB,CAAC,CAAC;YACtE,OAAO,kBAAkB,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAChD,CAAC;QAED;YACE,OAAO,CAAC,KAAK,CACX,GAAG,CAAC,wBAAwB,UAAU,KAAK,CAAC;gBAC1C,GAAG,CAAC,8CAA8C,CAAC,CACtD,CAAC;IACN,CAAC;AACH,CAAC;AAED,SAAS,eAAe,CAAC,IAAY,EAAE,MAAc;IACnD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAChC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,KAAK,CACX,GAAG,CAAC,mBAAmB,MAAM,sCAAsC,CAAC,CACrE,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IACD,OAAO,IAAI,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;AAClD,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import { describe, it, expect, beforeEach, afterEach } from "vitest";
|
|
2
|
+
import { mkdirSync, writeFileSync, rmSync } from "node:fs";
|
|
3
|
+
import { join } from "node:path";
|
|
4
|
+
import { tmpdir } from "node:os";
|
|
5
|
+
import { writeBenchmark, readBenchmark } from "../benchmark.js";
|
|
6
|
+
// ---------------------------------------------------------------------------
|
|
7
|
+
// Helpers
|
|
8
|
+
// ---------------------------------------------------------------------------
|
|
9
|
+
// Scratch skill directory; assigned in beforeEach and removed in afterEach.
let testDir;
// Fixture: one passing eval case that carries a single passing assertion.
const SAMPLE_BENCHMARK = {
    timestamp: "2026-03-01T00:00:00.000Z",
    model: "claude-sonnet-4-20250514",
    skill_name: "test-skill",
    cases: [
        {
            eval_id: 1,
            eval_name: "Basic test",
            status: "pass",
            error_message: null,
            pass_rate: 1.0,
            assertions: [
                { id: "a1", text: "Check result", pass: true, reasoning: "Looks good" },
            ],
        },
    ],
};
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
describe("benchmark", () => {
    // Path of the evals/ folder inside the current scratch directory.
    const evalsDir = () => join(testDir, "evals");
    // Recursive, error-tolerant removal shared by teardown and the "missing" test.
    const removeTree = (dir) => {
        rmSync(dir, { recursive: true, force: true });
    };
    beforeEach(() => {
        testDir = join(tmpdir(), `vskill-bench-${Date.now()}`);
        mkdirSync(evalsDir(), { recursive: true });
    });
    afterEach(() => {
        removeTree(testDir);
    });
    it("writes benchmark.json with all required fields", async () => {
        await writeBenchmark(testDir, SAMPLE_BENCHMARK);
        const loaded = await readBenchmark(testDir);
        expect(loaded).not.toBeNull();
        expect(loaded.timestamp).toBe("2026-03-01T00:00:00.000Z");
        expect(loaded.model).toBe("claude-sonnet-4-20250514");
        expect(loaded.skill_name).toBe("test-skill");
        expect(loaded.cases).toHaveLength(1);
        expect(loaded.cases[0].assertions).toHaveLength(1);
    });
    it("reads benchmark.json and returns typed result", async () => {
        const benchmarkPath = join(testDir, "evals", "benchmark.json");
        writeFileSync(benchmarkPath, JSON.stringify(SAMPLE_BENCHMARK));
        const loaded = await readBenchmark(testDir);
        expect(loaded.skill_name).toBe("test-skill");
        expect(loaded.cases[0].pass_rate).toBe(1.0);
    });
    it("returns null for missing benchmark.json", async () => {
        // Drop the evals/ folder so there is nothing to read.
        removeTree(evalsDir());
        const loaded = await readBenchmark(testDir);
        expect(loaded).toBeNull();
    });
});
|
|
65
|
+
//# sourceMappingURL=benchmark.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"benchmark.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/benchmark.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,QAAQ,CAAC;AACrE,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AAC3D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AACjC,OAAO,EAAE,cAAc,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAGhE,8EAA8E;AAC9E,UAAU;AACV,8EAA8E;AAE9E,IAAI,OAAe,CAAC;AAEpB,MAAM,gBAAgB,GAAoB;IACxC,SAAS,EAAE,0BAA0B;IACrC,KAAK,EAAE,0BAA0B;IACjC,UAAU,EAAE,YAAY;IACxB,KAAK,EAAE;QACL;YACE,OAAO,EAAE,CAAC;YACV,SAAS,EAAE,YAAY;YACvB,MAAM,EAAE,MAAM;YACd,aAAa,EAAE,IAAI;YACnB,SAAS,EAAE,GAAG;YACd,UAAU,EAAE;gBACV;oBACE,EAAE,EAAE,IAAI;oBACR,IAAI,EAAE,cAAc;oBACpB,IAAI,EAAE,IAAI;oBACV,SAAS,EAAE,YAAY;iBACxB;aACF;SACF;KACF;CACF,CAAC;AAEF,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,QAAQ,CAAC,WAAW,EAAE,GAAG,EAAE;IACzB,UAAU,CAAC,GAAG,EAAE;QACd,OAAO,GAAG,IAAI,CAAC,MAAM,EAAE,EAAE,gBAAgB,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QACvD,SAAS,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzD,CAAC,CAAC,CAAC;IAEH,SAAS,CAAC,GAAG,EAAE;QACb,MAAM,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IACpD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;QAC9D,MAAM,cAAc,CAAC,OAAO,EAAE,gBAAgB,CAAC,CAAC;QAEhD,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;QAC9B,MAAM,CAAC,MAAO,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;QAC3D,MAAM,CAAC,MAAO,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;QACvD,MAAM,CAAC,MAAO,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC9C,MAAM,CAAC,MAAO,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACtC,MAAM,CAAC,MAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;IACtD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE;QAC7D,aAAa,CACX,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,gBAAgB,CAAC,EACxC,IAAI,CAAC,SAAS,CAAC,gBAAgB,CAAC,CACjC,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,OAA
O,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAO,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC9C,MAAM,CAAC,MAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC/C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;QACvD,MAAM,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;QAEjE,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,CAAC,QAAQ,EAAE,CAAC;IAC5B,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { describe, it, expect, vi } from "vitest";
|
|
2
|
+
import { judgeAssertion } from "../judge.js";
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
// Helpers
|
|
5
|
+
// ---------------------------------------------------------------------------
|
|
6
|
+
/**
 * Builds a stub LLM client for the judge tests.
 *
 * @param {string} response - Value the mocked `generate` resolves to.
 * @returns {{ generate: Function, model: string }} Client stub with the fixed model name "test-model".
 */
function mockClient(response) {
    const generate = vi.fn().mockResolvedValue(response);
    return { generate, model: "test-model" };
}
|
|
9
|
+
// Assertion fixture handed to the judge in every test below.
const ASSERTION = { id: "assert-1", text: "Output mentions a file path", type: "boolean" };
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
describe("judgeAssertion", () => {
    // Runs the judge on `output` with a client stub that answers `rawReply`.
    const judgeWith = (rawReply, output) => judgeAssertion(output, ASSERTION, mockClient(rawReply));
    it("returns pass result when LLM judge says pass", async () => {
        const verdict = { pass: true, reasoning: "output contains file path" };
        const graded = await judgeWith(JSON.stringify(verdict), "The report has been saved to reports/q1.csv");
        expect(graded.pass).toBe(true);
        expect(graded.reasoning).toBe("output contains file path");
        expect(graded.id).toBe("assert-1");
        expect(graded.text).toBe("Output mentions a file path");
    });
    it("returns fail result when LLM judge says fail", async () => {
        const verdict = { pass: false, reasoning: "no file path found in output" };
        const graded = await judgeWith(JSON.stringify(verdict), "Hello world");
        expect(graded.pass).toBe(false);
        expect(graded.reasoning).toBe("no file path found in output");
    });
    it("throws on malformed judge response", async () => {
        const pending = judgeWith("This is not JSON", "some output");
        await expect(pending).rejects.toThrow(/invalid judge output/i);
    });
    it("handles JSON wrapped in code fence", async () => {
        const fenced = '```json\n{"pass": true, "reasoning": "looks good"}\n```';
        const graded = await judgeWith(fenced, "some output");
        expect(graded.pass).toBe(true);
    });
});
|
|
45
|
+
//# sourceMappingURL=judge.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"judge.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/judge.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAc,MAAM,QAAQ,CAAC;AAG9D,OAAO,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAE7C,8EAA8E;AAC9E,UAAU;AACV,8EAA8E;AAE9E,SAAS,UAAU,CAAC,QAAgB;IAClC,OAAO,EAAE,QAAQ,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,QAAQ,CAAC,EAAE,KAAK,EAAE,YAAY,EAAE,CAAC;AAChF,CAAC;AAED,MAAM,SAAS,GAAc;IAC3B,EAAE,EAAE,UAAU;IACd,IAAI,EAAE,6BAA6B;IACnC,IAAI,EAAE,SAAS;CAChB,CAAC;AAEF,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;IAC9B,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;QAC5D,MAAM,MAAM,GAAG,UAAU,CACvB,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,2BAA2B,EAAE,CAAC,CACvE,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,cAAc,CACjC,6CAA6C,EAC7C,SAAS,EACT,MAAM,CACP,CAAC;QAEF,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC/B,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,2BAA2B,CAAC,CAAC;QAC3D,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACnC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,6BAA6B,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;QAC5D,MAAM,MAAM,GAAG,UAAU,CACvB,IAAI,CAAC,SAAS,CAAC;YACb,IAAI,EAAE,KAAK;YACX,SAAS,EAAE,8BAA8B;SAC1C,CAAC,CACH,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;QAEtE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAChC,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,8BAA8B,CAAC,CAAC;IAChE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;QAClD,MAAM,MAAM,GAAG,UAAU,CAAC,kBAAkB,CAAC,CAAC;QAE9C,MAAM,MAAM,CACV,cAAc,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,CAAC,CACjD,CAAC,OAAO,CAAC,OAAO,CAAC,uBAAuB,CAAC,CAAC;IAC7C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;QAClD,MAAM,MAAM,GAAG,UAAU,CACvB,yDAAyD,CAC1D,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;QACtE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACjC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
|
|
2
|
+
// ---------------------------------------------------------------------------
|
|
3
|
+
// Mocks
|
|
4
|
+
// ---------------------------------------------------------------------------
|
|
5
|
+
// Shared spy standing in for `messages.create` on the mocked SDK client.
const mockCreate = vi.hoisted(() => vi.fn());
vi.mock("@anthropic-ai/sdk", () => ({
    default: class MockAnthropic {
        messages = { create: mockCreate };
    },
}));
// ---------------------------------------------------------------------------
// Import module under test AFTER mocks
// ---------------------------------------------------------------------------
const { createLlmClient } = await import("../llm.js");
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
describe("createLlmClient", () => {
    // Snapshot of the environment, restored after every test.
    const origEnv = { ...process.env };
    // Makes the mocked SDK answer with a single text content block.
    const replyWithText = (text) => {
        mockCreate.mockResolvedValue({
            content: [{ type: "text", text }],
        });
    };
    // Asserts that the SDK request (first argument) contained `fields`.
    const expectRequestContaining = (fields) => {
        expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining(fields), expect.anything());
    };
    beforeEach(() => {
        vi.resetAllMocks();
        process.env.ANTHROPIC_API_KEY = "test-key";
        delete process.env.VSKILL_EVAL_MODEL;
    });
    afterEach(() => {
        process.env = { ...origEnv };
    });
    it("returns text content on successful generate call", async () => {
        replyWithText("Generated response");
        const llm = createLlmClient();
        const reply = await llm.generate("system prompt", "user prompt");
        expect(reply).toBe("Generated response");
        expect(mockCreate).toHaveBeenCalledOnce();
    });
    it("uses default model claude-sonnet-4-20250514 when env not set", async () => {
        replyWithText("ok");
        const llm = createLlmClient();
        await llm.generate("sys", "usr");
        expectRequestContaining({ model: "claude-sonnet-4-20250514" });
    });
    it("uses custom model from VSKILL_EVAL_MODEL env var", async () => {
        process.env.VSKILL_EVAL_MODEL = "claude-opus-4-20250514";
        replyWithText("ok");
        const llm = createLlmClient();
        await llm.generate("sys", "usr");
        expectRequestContaining({ model: "claude-opus-4-20250514" });
    });
    it("propagates network error from SDK", async () => {
        mockCreate.mockRejectedValue(new Error("Connection timeout"));
        const llm = createLlmClient();
        await expect(llm.generate("sys", "usr")).rejects.toThrow("Connection timeout");
    });
    it("passes system and user prompts correctly", async () => {
        replyWithText("ok");
        const llm = createLlmClient();
        await llm.generate("my system prompt", "my user prompt");
        expectRequestContaining({
            system: "my system prompt",
            messages: [{ role: "user", content: "my user prompt" }],
            max_tokens: 4096,
        });
    });
    it("throws when ANTHROPIC_API_KEY is not set", () => {
        delete process.env.ANTHROPIC_API_KEY;
        expect(() => createLlmClient()).toThrow("ANTHROPIC_API_KEY is not set");
    });
    it("exposes model name on the client", () => {
        const llm = createLlmClient();
        expect(llm.model).toBe("claude-sonnet-4-20250514");
    });
    it("exposes custom model name when VSKILL_EVAL_MODEL is set", () => {
        process.env.VSKILL_EVAL_MODEL = "claude-opus-4-20250514";
        const llm = createLlmClient();
        expect(llm.model).toBe("claude-opus-4-20250514");
    });
});
|
|
85
|
+
//# sourceMappingURL=llm.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"llm.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/llm.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,QAAQ,CAAC;AAEzE,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,MAAM,UAAU,GAAG,EAAE,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;AAE7C,EAAE,CAAC,IAAI,CAAC,mBAAmB,EAAE,GAAG,EAAE,CAAC,CAAC;IAClC,OAAO,EAAE,MAAM,aAAa;QAC1B,QAAQ,GAAG,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC;KACnC;CACF,CAAC,CAAC,CAAC;AAEJ,8EAA8E;AAC9E,uCAAuC;AACvC,8EAA8E;AAE9E,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,WAAW,CAAC,CAAC;AAEtD,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,MAAM,OAAO,GAAG,EAAE,GAAG,OAAO,CAAC,GAAG,EAAE,CAAC;IAEnC,UAAU,CAAC,GAAG,EAAE;QACd,EAAE,CAAC,aAAa,EAAE,CAAC;QACnB,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,UAAU,CAAC;QAC3C,OAAO,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,SAAS,CAAC,GAAG,EAAE;QACb,OAAO,CAAC,GAAG,GAAG,EAAE,GAAG,OAAO,EAAE,CAAC;IAC/B,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;QAChE,UAAU,CAAC,iBAAiB,CAAC;YAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,oBAAoB,EAAE,CAAC;SACxD,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,eAAe,EAAE,aAAa,CAAC,CAAC;QAErE,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;QAC1C,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,EAAE,CAAC;IAC5C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,8DAA8D,EAAE,KAAK,IAAI,EAAE;QAC5E,UAAU,CAAC,iBAAiB,CAAC;YAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;SACxC,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;QAEpC,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC,EAAE,KAAK,EAAE,0BAA0B,EAAE,CAAC,EAC9D,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;QAChE,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,wBAAwB,CAAC;QACzD,UAAU,CAAC,iBAAiB,CAAC;YAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;SACxC,CAAC,CAAC;QAEH,MAAM,
MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;QAEpC,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC,EAAE,KAAK,EAAE,wBAAwB,EAAE,CAAC,EAC5D,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,mCAAmC,EAAE,KAAK,IAAI,EAAE;QACjD,UAAU,CAAC,iBAAiB,CAAC,IAAI,KAAK,CAAC,oBAAoB,CAAC,CAAC,CAAC;QAE9D,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CACzD,oBAAoB,CACrB,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;QACxD,UAAU,CAAC,iBAAiB,CAAC;YAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;SACxC,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,kBAAkB,EAAE,gBAAgB,CAAC,CAAC;QAE5D,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC;YACtB,MAAM,EAAE,kBAAkB;YAC1B,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,EAAE,CAAC;YACvD,UAAU,EAAE,IAAI;SACjB,CAAC,EACF,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;QAClD,OAAO,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;QACrC,MAAM,CAAC,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC,CAAC,OAAO,CAAC,8BAA8B,CAAC,CAAC;IAC1E,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kCAAkC,EAAE,GAAG,EAAE;QAC1C,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;IACxD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yDAAyD,EAAE,GAAG,EAAE;QACjE,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,wBAAwB,CAAC;QACzD,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;IACtD,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { buildEvalInitPrompt, parseGeneratedEvals, } from "../prompt-builder.js";
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
// buildEvalInitPrompt
|
|
5
|
+
// ---------------------------------------------------------------------------
|
|
6
|
+
describe("buildEvalInitPrompt", () => {
|
|
7
|
+
const skillContent = "# My Skill\nThis skill does amazing things.";
|
|
8
|
+
it("includes skill content in the prompt", () => {
|
|
9
|
+
const prompt = buildEvalInitPrompt(skillContent);
|
|
10
|
+
expect(prompt).toContain(skillContent);
|
|
11
|
+
});
|
|
12
|
+
it("includes schema reference fields", () => {
|
|
13
|
+
const prompt = buildEvalInitPrompt(skillContent);
|
|
14
|
+
expect(prompt).toContain("skill_name");
|
|
15
|
+
expect(prompt).toContain("assertions");
|
|
16
|
+
expect(prompt).toContain("expected_output");
|
|
17
|
+
});
|
|
18
|
+
it("includes social-media-posting example", () => {
|
|
19
|
+
const prompt = buildEvalInitPrompt(skillContent);
|
|
20
|
+
expect(prompt).toContain("social-media-posting");
|
|
21
|
+
});
|
|
22
|
+
it("includes best practices section", () => {
|
|
23
|
+
const prompt = buildEvalInitPrompt(skillContent);
|
|
24
|
+
expect(prompt).toContain("Best Practices");
|
|
25
|
+
expect(prompt).toContain("objectively verifiable");
|
|
26
|
+
});
|
|
27
|
+
});
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
// parseGeneratedEvals
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
describe("parseGeneratedEvals", () => {
|
|
32
|
+
it("extracts JSON from markdown code fence", () => {
|
|
33
|
+
const raw = `Here is the evals.json:
|
|
34
|
+
|
|
35
|
+
\`\`\`json
|
|
36
|
+
{
|
|
37
|
+
"skill_name": "test-skill",
|
|
38
|
+
"evals": [
|
|
39
|
+
{
|
|
40
|
+
"id": 1,
|
|
41
|
+
"name": "Basic test",
|
|
42
|
+
"prompt": "Test prompt",
|
|
43
|
+
"expected_output": "Expected output",
|
|
44
|
+
"files": [],
|
|
45
|
+
"assertions": [
|
|
46
|
+
{ "id": "a1", "text": "Check result", "type": "boolean" }
|
|
47
|
+
]
|
|
48
|
+
}
|
|
49
|
+
]
|
|
50
|
+
}
|
|
51
|
+
\`\`\`
|
|
52
|
+
|
|
53
|
+
That's the evals file.`;
|
|
54
|
+
const result = parseGeneratedEvals(raw);
|
|
55
|
+
expect(result.skill_name).toBe("test-skill");
|
|
56
|
+
expect(result.evals).toHaveLength(1);
|
|
57
|
+
expect(result.evals[0].assertions).toHaveLength(1);
|
|
58
|
+
});
|
|
59
|
+
it("throws when no code block is found", () => {
|
|
60
|
+
const raw = "Just some text without any JSON code block.";
|
|
61
|
+
expect(() => parseGeneratedEvals(raw)).toThrow(/code block/i);
|
|
62
|
+
});
|
|
63
|
+
it("throws when JSON inside fence is invalid", () => {
|
|
64
|
+
const raw = "```json\n{ invalid json }\n```";
|
|
65
|
+
expect(() => parseGeneratedEvals(raw)).toThrow();
|
|
66
|
+
});
|
|
67
|
+
it("validates extracted JSON against schema", () => {
|
|
68
|
+
const raw = '```json\n{ "skill_name": "test" }\n```';
|
|
69
|
+
expect(() => parseGeneratedEvals(raw)).toThrow(); // missing evals array
|
|
70
|
+
});
|
|
71
|
+
});
|
|
72
|
+
//# sourceMappingURL=prompt-builder.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompt-builder.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/prompt-builder.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EACL,mBAAmB,EACnB,mBAAmB,GACpB,MAAM,sBAAsB,CAAC;AAE9B,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;IACnC,MAAM,YAAY,GAAG,6CAA6C,CAAC;IAEnE,EAAE,CAAC,sCAAsC,EAAE,GAAG,EAAE;QAC9C,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;IACzC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kCAAkC,EAAE,GAAG,EAAE;QAC1C,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;IAC9C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;QAC/C,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,sBAAsB,CAAC,CAAC;IACnD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,iCAAiC,EAAE,GAAG,EAAE;QACzC,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,gBAAgB,CAAC,CAAC;QAC3C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,wBAAwB,CAAC,CAAC;IACrD,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;IACnC,EAAE,CAAC,wCAAwC,EAAE,GAAG,EAAE;QAChD,MAAM,GAAG,GAAG;;;;;;;;;;;;;;;;;;;;uBAoBO,CAAC;QAEpB,MAAM,MAAM,GAAG,mBAAmB,CAAC,GAAG,CAAC,CAAC;QACxC,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC7C,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;IACrD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,GAAG,GAAG,6CAA6C,CAAC;QAC1D,MAAM,CAAC,GAAG,EAAE,CAAC,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC;IAChE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;QAClD,MAAM,GAAG,GAAG,gCAAgC,CAAC;QAC7C,MAAM,CAAC,GAAG,EAAE,CAAC,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,EA
AE,CAAC;IACnD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yCAAyC,EAAE,GAAG,EAAE;QACjD,MAAM,GAAG,GAAG,wCAAwC,CAAC;QACrD,MAAM,CAAC,GAAG,EAAE,CAAC,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,sBAAsB;IAC1E,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|