vskill 0.2.27 → 0.2.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. package/dist/commands/__tests__/eval-router.test.d.ts +1 -0
  2. package/dist/commands/__tests__/eval-router.test.js +60 -0
  3. package/dist/commands/__tests__/eval-router.test.js.map +1 -0
  4. package/dist/commands/add.js +166 -24
  5. package/dist/commands/add.js.map +1 -1
  6. package/dist/commands/add.test.js +96 -1
  7. package/dist/commands/add.test.js.map +1 -1
  8. package/dist/commands/eval/__tests__/coverage.test.d.ts +1 -0
  9. package/dist/commands/eval/__tests__/coverage.test.js +122 -0
  10. package/dist/commands/eval/__tests__/coverage.test.js.map +1 -0
  11. package/dist/commands/eval/__tests__/generate-all.test.d.ts +1 -0
  12. package/dist/commands/eval/__tests__/generate-all.test.js +133 -0
  13. package/dist/commands/eval/__tests__/generate-all.test.js.map +1 -0
  14. package/dist/commands/eval/__tests__/init.test.d.ts +1 -0
  15. package/dist/commands/eval/__tests__/init.test.js +116 -0
  16. package/dist/commands/eval/__tests__/init.test.js.map +1 -0
  17. package/dist/commands/eval/__tests__/run.test.d.ts +1 -0
  18. package/dist/commands/eval/__tests__/run.test.js +149 -0
  19. package/dist/commands/eval/__tests__/run.test.js.map +1 -0
  20. package/dist/commands/eval/coverage.d.ts +1 -0
  21. package/dist/commands/eval/coverage.js +79 -0
  22. package/dist/commands/eval/coverage.js.map +1 -0
  23. package/dist/commands/eval/generate-all.d.ts +1 -0
  24. package/dist/commands/eval/generate-all.js +64 -0
  25. package/dist/commands/eval/generate-all.js.map +1 -0
  26. package/dist/commands/eval/init.d.ts +1 -0
  27. package/dist/commands/eval/init.js +38 -0
  28. package/dist/commands/eval/init.js.map +1 -0
  29. package/dist/commands/eval/run.d.ts +1 -0
  30. package/dist/commands/eval/run.js +107 -0
  31. package/dist/commands/eval/run.js.map +1 -0
  32. package/dist/commands/eval.d.ts +4 -0
  33. package/dist/commands/eval.js +48 -0
  34. package/dist/commands/eval.js.map +1 -0
  35. package/dist/eval/__tests__/benchmark.test.d.ts +1 -0
  36. package/dist/eval/__tests__/benchmark.test.js +65 -0
  37. package/dist/eval/__tests__/benchmark.test.js.map +1 -0
  38. package/dist/eval/__tests__/judge.test.d.ts +1 -0
  39. package/dist/eval/__tests__/judge.test.js +45 -0
  40. package/dist/eval/__tests__/judge.test.js.map +1 -0
  41. package/dist/eval/__tests__/llm.test.d.ts +1 -0
  42. package/dist/eval/__tests__/llm.test.js +85 -0
  43. package/dist/eval/__tests__/llm.test.js.map +1 -0
  44. package/dist/eval/__tests__/prompt-builder.test.d.ts +1 -0
  45. package/dist/eval/__tests__/prompt-builder.test.js +72 -0
  46. package/dist/eval/__tests__/prompt-builder.test.js.map +1 -0
  47. package/dist/eval/__tests__/schema.test.d.ts +1 -0
  48. package/dist/eval/__tests__/schema.test.js +209 -0
  49. package/dist/eval/__tests__/schema.test.js.map +1 -0
  50. package/dist/eval/__tests__/skill-scanner.test.d.ts +1 -0
  51. package/dist/eval/__tests__/skill-scanner.test.js +78 -0
  52. package/dist/eval/__tests__/skill-scanner.test.js.map +1 -0
  53. package/dist/eval/benchmark.d.ts +22 -0
  54. package/dist/eval/benchmark.js +24 -0
  55. package/dist/eval/benchmark.js.map +1 -0
  56. package/dist/eval/judge.d.ts +9 -0
  57. package/dist/eval/judge.js +40 -0
  58. package/dist/eval/judge.js.map +1 -0
  59. package/dist/eval/llm.d.ts +5 -0
  60. package/dist/eval/llm.js +34 -0
  61. package/dist/eval/llm.js.map +1 -0
  62. package/dist/eval/prompt-builder.d.ts +3 -0
  63. package/dist/eval/prompt-builder.js +155 -0
  64. package/dist/eval/prompt-builder.js.map +1 -0
  65. package/dist/eval/schema.d.ts +26 -0
  66. package/dist/eval/schema.js +128 -0
  67. package/dist/eval/schema.js.map +1 -0
  68. package/dist/eval/skill-scanner.d.ts +8 -0
  69. package/dist/eval/skill-scanner.js +44 -0
  70. package/dist/eval/skill-scanner.js.map +1 -0
  71. package/dist/index.js +9 -0
  72. package/dist/index.js.map +1 -1
  73. package/dist/marketplace/index.d.ts +2 -2
  74. package/dist/marketplace/index.js +1 -1
  75. package/dist/marketplace/index.js.map +1 -1
  76. package/dist/marketplace/marketplace.d.ts +13 -0
  77. package/dist/marketplace/marketplace.js +35 -0
  78. package/dist/marketplace/marketplace.js.map +1 -1
  79. package/package.json +2 -1
@@ -0,0 +1 @@
1
+ {"version":3,"file":"init.js","sourceRoot":"","sources":["../../../src/commands/eval/init.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,gEAAgE;AAChE,8EAA8E;AAE9E,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAC7E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,EAAE,mBAAmB,EAAE,mBAAmB,EAAE,MAAM,8BAA8B,CAAC;AACxF,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,EAAE,MAAM,uBAAuB,CAAC;AAEhE,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,QAAgB,EAChB,KAAc;IAEd,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IAC/C,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACzC,MAAM,SAAS,GAAG,IAAI,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;IAE/C,wBAAwB;IACxB,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;QAC7B,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,yBAAyB,WAAW,EAAE,CAAC,CAAC,CAAC;QAC3D,OAAO;IACT,CAAC;IAED,4BAA4B;IAC5B,IAAI,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC;QACpC,OAAO,CAAC,GAAG,CACT,MAAM,CAAC,qDAAqD,CAAC,CAC9D,CAAC;QACF,OAAO;IACT,CAAC;IAED,MAAM,YAAY,GAAG,YAAY,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;IACxD,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;IAEjD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,QAAQ,CAC/B,qFAAqF,EACrF,MAAM,CACP,CAAC;QAEF,MAAM,SAAS,GAAG,mBAAmB,CAAC,GAAG,CAAC,CAAC;QAE3C,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QACzC,aAAa,CAAC,SAAS,EAAE,IAAI,CAAC,SAAS,CAAC,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;QAEtE,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,WAAW,SAAS,EAAE,CAAC,CAAC,CAAC;QAC3C,OAAO,CAAC,GAAG,CACT,GAAG,CAAC,KAAK,SAAS,CAAC,KAAK,CAAC,MAAM,gBAAgB,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC,aAAa,CAAC,CAC9H,CAAC;IACJ,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,OAAO,CAAC,KAAK,CACX,GAAG,CAAC,4BAA4B,CAAC,GAAG,GAAG,CAAE,GAAa,CAAC,OAAO,CAAC,CAChE,CAAC;IACJ,CAAC;AACH,CAAC"}
@@ -0,0 +1 @@
1
+ export declare function runEvalRun(skillDir: string): Promise<void>;
@@ -0,0 +1,107 @@
1
+ // ---------------------------------------------------------------------------
2
+ // vskill eval run -- execute eval cases and grade assertions
3
+ // ---------------------------------------------------------------------------
4
+ import { loadAndValidateEvals, EvalValidationError } from "../../eval/schema.js";
5
+ import { createLlmClient } from "../../eval/llm.js";
6
+ import { judgeAssertion } from "../../eval/judge.js";
7
+ import { writeBenchmark } from "../../eval/benchmark.js";
8
+ import { green, red, yellow, bold, dim, table } from "../../utils/output.js";
9
/**
 * Run every eval case of a skill, grade each assertion with the LLM judge,
 * print a results table plus a one-line summary, and persist the run through
 * `writeBenchmark`.
 *
 * A case whose generation or grading throws is recorded with status "error"
 * and the remaining cases still run. If the evals file cannot be loaded the
 * problem is reported on stderr and the process exits with code 1.
 *
 * @param {string} skillDir - Skill directory handed to `loadAndValidateEvals`
 *   and `writeBenchmark` (the messages refer to `<skillDir>/evals/`).
 * @returns {Promise<void>}
 */
export async function runEvalRun(skillDir) {
    // Load and validate evals.json
    let evalsFile;
    try {
        evalsFile = loadAndValidateEvals(skillDir);
    }
    catch (err) {
        if (err instanceof EvalValidationError) {
            const firstMsg = err.errors[0]?.message || "";
            if (firstMsg.includes("No evals.json")) {
                console.error(red(`No evals.json found at ${skillDir}/evals/evals.json`));
            }
            else {
                console.error(red(`Invalid evals.json: ${err.message}`));
            }
        }
        else {
            console.error(red(`Error loading evals: ${describeError(err)}`));
        }
        process.exit(1);
        // Only reached when process.exit is stubbed (e.g. in tests).
        return;
    }
    const client = createLlmClient();
    const model = client.model;
    const benchmarkCases = [];
    const tableRows = [];
    for (const evalCase of evalsFile.evals) {
        try {
            // Step 1: Send prompt to LLM
            const output = await client.generate("You are an AI skill being evaluated. Respond to the prompt as the skill would.", evalCase.prompt);
            // Step 2: Judge each assertion (sequentially, one judge call per assertion)
            const assertionResults = [];
            let passCount = 0;
            for (const assertion of evalCase.assertions) {
                const result = await judgeAssertion(output, assertion, client);
                assertionResults.push(result);
                if (result.pass) {
                    passCount++;
                }
                // Keep the TEXT column narrow: at most 60 characters including the ellipsis.
                const truncatedText = assertion.text.length > 60
                    ? `${assertion.text.slice(0, 57)}...`
                    : assertion.text;
                tableRows.push([
                    evalCase.name,
                    assertion.id,
                    truncatedText,
                    result.pass ? green("PASS") : red("FAIL"),
                ]);
            }
            const total = evalCase.assertions.length;
            // NOTE(review): a case with zero assertions is reported as "pass" with
            // pass_rate 0 -- presumably the schema forbids empty assertion lists; confirm.
            const passRate = total > 0 ? passCount / total : 0;
            const allPassed = passCount === total;
            benchmarkCases.push({
                eval_id: evalCase.id,
                eval_name: evalCase.name,
                status: allPassed ? "pass" : "fail",
                error_message: null,
                pass_rate: passRate,
                assertions: assertionResults,
            });
        }
        catch (err) {
            // Mark case as error, continue with remaining. The message is normalised
            // first so a non-Error throwable cannot make this handler itself throw.
            const message = describeError(err);
            benchmarkCases.push({
                eval_id: evalCase.id,
                eval_name: evalCase.name,
                status: "error",
                error_message: message,
                pass_rate: 0,
                assertions: [],
            });
            tableRows.push([
                evalCase.name,
                "-",
                dim(`Error: ${message.slice(0, 50)}`),
                yellow("ERROR"),
            ]);
        }
    }
    // Print results table
    const headers = ["EVAL", "ASSERTION", "TEXT", "STATUS"];
    console.log(bold(`\nEval Results: ${evalsFile.skill_name}\n`));
    console.log(table(headers, tableRows));
    // Compute summary
    const passed = benchmarkCases.filter((c) => c.status === "pass").length;
    const failed = benchmarkCases.filter((c) => c.status === "fail").length;
    const errors = benchmarkCases.filter((c) => c.status === "error").length;
    // Only non-zero failure/error counts are shown; joining the parts avoids
    // stray double spaces and keeps the leading blank line.
    const summary = [green(`${passed} passed`)];
    if (failed > 0) {
        summary.push(red(`${failed} failed`));
    }
    if (errors > 0) {
        summary.push(yellow(`${errors} errors`));
    }
    console.log(`\n${summary.join(" ")}`);
    // Write benchmark.json
    const benchmark = {
        timestamp: new Date().toISOString(),
        model,
        skill_name: evalsFile.skill_name,
        cases: benchmarkCases,
    };
    await writeBenchmark(skillDir, benchmark);
    console.log(dim(`\nBenchmark written to ${skillDir}/evals/benchmark.json`));
}
/**
 * Turn any thrown value into a printable message.
 *
 * @param {unknown} err - Value caught by a `catch` clause.
 * @returns {string} `err.message` when it is a string, otherwise `String(err)`.
 */
function describeError(err) {
    return typeof err?.message === "string" ? err.message : String(err);
}
107
+ //# sourceMappingURL=run.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"run.js","sourceRoot":"","sources":["../../../src/commands/eval/run.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,6DAA6D;AAC7D,8EAA8E;AAE9E,OAAO,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AACjF,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,EAAE,cAAc,EAAE,MAAM,yBAAyB,CAAC;AAEzD,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,uBAAuB,CAAC;AAE7E,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,QAAgB;IAC/C,+BAA+B;IAC/B,IAAI,SAAS,CAAC;IACd,IAAI,CAAC;QACH,SAAS,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IAC7C,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,IAAI,GAAG,YAAY,mBAAmB,EAAE,CAAC;YACvC,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,OAAO,IAAI,EAAE,CAAC;YAC9C,IAAI,QAAQ,CAAC,QAAQ,CAAC,eAAe,CAAC,EAAE,CAAC;gBACvC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,0BAA0B,QAAQ,mBAAmB,CAAC,CAAC,CAAC;YAC5E,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,uBAAuB,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YAC3D,CAAC;QACH,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,wBAAyB,GAAa,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QACvE,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChB,OAAO;IACT,CAAC;IAED,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;IACjC,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC;IAC3B,MAAM,cAAc,GAAoB,EAAE,CAAC;IAC3C,MAAM,SAAS,GAAe,EAAE,CAAC;IAEjC,KAAK,MAAM,QAAQ,IAAI,SAAS,CAAC,KAAK,EAAE,CAAC;QACvC,IAAI,CAAC;YACH,6BAA6B;YAC7B,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAClC,gFAAgF,EAChF,QAAQ,CAAC,MAAM,CAChB,CAAC;YAEF,+BAA+B;YAC/B,MAAM,gBAAgB,GAAG,EAAE,CAAC;YAC5B,IAAI,SAAS,GAAG,CAAC,CAAC;YAElB,KAAK,MAAM,SAAS,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC;gBAC5C,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,MAAM,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;gBAC/D,gBAAgB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBAC9B,IAAI,MAAM,CAAC,IAAI;oBAAE,SAAS,EAAE,CAAC;gBAE7B,MAAM,aAAa,GACjB,SAAS,CAAC,IAAI,CAAC,MAAM,GAAG,EAAE;oBACxB,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,KAAK;oBACrC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC;gBAErB,SAAS,CAAC,IAAI,CAAC;oBACb,QAAQ,CAAC,IAAI;oBACb,SAAS,CAAC,EAAE;oBACZ,aAAa;oBACb,MAAM,CAAC,IAAI,CAAC,CAAC,CAA
C,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC;iBAC1C,CAAC,CAAC;YACL,CAAC;YAED,MAAM,QAAQ,GAAG,QAAQ,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC;gBAC7C,CAAC,CAAC,SAAS,GAAG,QAAQ,CAAC,UAAU,CAAC,MAAM;gBACxC,CAAC,CAAC,CAAC,CAAC;YACN,MAAM,SAAS,GAAG,SAAS,KAAK,QAAQ,CAAC,UAAU,CAAC,MAAM,CAAC;YAE3D,cAAc,CAAC,IAAI,CAAC;gBAClB,OAAO,EAAE,QAAQ,CAAC,EAAE;gBACpB,SAAS,EAAE,QAAQ,CAAC,IAAI;gBACxB,MAAM,EAAE,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;gBACnC,aAAa,EAAE,IAAI;gBACnB,SAAS,EAAE,QAAQ;gBACnB,UAAU,EAAE,gBAAgB;aAC7B,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,8CAA8C;YAC9C,cAAc,CAAC,IAAI,CAAC;gBAClB,OAAO,EAAE,QAAQ,CAAC,EAAE;gBACpB,SAAS,EAAE,QAAQ,CAAC,IAAI;gBACxB,MAAM,EAAE,OAAO;gBACf,aAAa,EAAG,GAAa,CAAC,OAAO;gBACrC,SAAS,EAAE,CAAC;gBACZ,UAAU,EAAE,EAAE;aACf,CAAC,CAAC;YAEH,SAAS,CAAC,IAAI,CAAC;gBACb,QAAQ,CAAC,IAAI;gBACb,GAAG;gBACH,GAAG,CAAC,SAAS,GAAI,GAAa,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBACpD,MAAM,CAAC,OAAO,CAAC;aAChB,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,sBAAsB;IACtB,MAAM,OAAO,GAAG,CAAC,MAAM,EAAE,WAAW,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC;IACxD,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,mBAAmB,SAAS,CAAC,UAAU,IAAI,CAAC,CAAC,CAAC;IAC/D,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC,CAAC;IAEvC,kBAAkB;IAClB,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,MAAM,CAAC;IACxE,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,MAAM,CAAC;IACxE,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,OAAO,CAAC,CAAC,MAAM,CAAC;IACzE,OAAO,CAAC,GAAG,CACT,KAAK,KAAK,CAAC,GAAG,MAAM,SAAS,CAAC,IAAI,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,MAAM,SAAS,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,MAAM,SAAS,CAAC,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,EAAE,CACrI,CAAC;IAEF,uBAAuB;IACvB,MAAM,SAAS,GAAoB;QACjC,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,KAAK;QACL,UAAU,EAAE,SAAS,CAAC,UAAU;QAChC,KAAK,EAAE,cAAc;KACtB,CAAC;IAEF,MAAM,cAAc,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;IAC1
C,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,0BAA0B,QAAQ,uBAAuB,CAAC,CAAC,CAAC;AAC9E,CAAC"}
@@ -0,0 +1,4 @@
1
+ export declare function evalCommand(subcommand: string, target?: string, opts?: {
2
+ force?: boolean;
3
+ root?: string;
4
+ }): Promise<void>;
@@ -0,0 +1,48 @@
1
+ // ---------------------------------------------------------------------------
2
+ // vskill eval -- subcommand router
3
+ // ---------------------------------------------------------------------------
4
+ import { join, resolve } from "node:path";
5
+ import { red, dim } from "../utils/output.js";
6
/**
 * Route `vskill eval <subcommand>` to its implementation.
 *
 * Subcommand modules are loaded lazily with dynamic `import()` so only the
 * selected one is evaluated.
 *
 * @param {string} subcommand - One of "init", "run", "coverage", "generate-all".
 * @param {string} [target] - `<plugin>/<skill>`; required by "init" and "run".
 * @param {{ force?: boolean, root?: string }} [opts] - `root` overrides the
 *   plugins root (default: `plugins` resolved against the working directory);
 *   `force` is forwarded to "init" and "generate-all".
 * @returns {Promise<void>}
 */
export async function evalCommand(subcommand, target, opts = {}) {
    const root = opts.root ? resolve(opts.root) : resolve("plugins");
    switch (subcommand) {
        case "init": {
            if (!target) {
                console.error(red("Usage: vskill eval init <plugin>/<skill>"));
                process.exit(1);
                // Only reached when process.exit is stubbed; never continue without a target.
                return;
            }
            const skillDir = resolveSkillDir(root, target);
            const { runEvalInit } = await import("./eval/init.js");
            return runEvalInit(skillDir, !!opts.force);
        }
        case "run": {
            if (!target) {
                console.error(red("Usage: vskill eval run <plugin>/<skill>"));
                process.exit(1);
                // Only reached when process.exit is stubbed; never continue without a target.
                return;
            }
            const skillDir = resolveSkillDir(root, target);
            const { runEvalRun } = await import("./eval/run.js");
            return runEvalRun(skillDir);
        }
        case "coverage": {
            const { runEvalCoverage } = await import("./eval/coverage.js");
            return runEvalCoverage(root);
        }
        case "generate-all": {
            const { runEvalGenerateAll } = await import("./eval/generate-all.js");
            return runEvalGenerateAll(root, !!opts.force);
        }
        default:
            // Unknown subcommands are reported but do not terminate the process.
            console.error(red(`Unknown subcommand: "${subcommand}"\n`) +
                dim("Available: init, run, coverage, generate-all"));
    }
}
40
/**
 * Map a `<plugin>/<skill>` target onto `<root>/<plugin>/skills/<skill>`.
 *
 * The target must consist of exactly two non-empty segments, neither of which
 * is `.` or `..`, so the result cannot point outside `root` or at the plugin /
 * skills directory itself. Anything else is reported on stderr and the process
 * exits with code 1.
 *
 * @param {string} root - Absolute plugins root directory.
 * @param {string} target - Skill reference in `<plugin>/<skill>` form.
 * @returns {string} Path of the skill directory.
 */
function resolveSkillDir(root, target) {
    const parts = target.split("/");
    const isPlainName = (segment) => segment !== "" && segment !== "." && segment !== "..";
    if (parts.length !== 2 || !parts.every(isPlainName)) {
        console.error(red(`Invalid target "${target}". Expected format: <plugin>/<skill>`));
        process.exit(1);
    }
    return join(root, parts[0], "skills", parts[1]);
}
48
+ //# sourceMappingURL=eval.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval.js","sourceRoot":"","sources":["../../src/commands/eval.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,mCAAmC;AACnC,8EAA8E;AAE9E,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,oBAAoB,CAAC;AAE9C,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,UAAkB,EAClB,MAAe,EACf,OAA2C,EAAE;IAE7C,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;IAEjE,QAAQ,UAAU,EAAE,CAAC;QACnB,KAAK,MAAM,CAAC,CAAC,CAAC;YACZ,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,0CAA0C,CAAC,CAAC,CAAC;gBAC/D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC/C,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,gBAAgB,CAAC,CAAC;YACvD,OAAO,WAAW,CAAC,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAC7C,CAAC;QAED,KAAK,KAAK,CAAC,CAAC,CAAC;YACX,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,yCAAyC,CAAC,CAAC,CAAC;gBAC9D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC/C,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,eAAe,CAAC,CAAC;YACrD,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC;QAC9B,CAAC;QAED,KAAK,UAAU,CAAC,CAAC,CAAC;YAChB,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;YAC/D,OAAO,eAAe,CAAC,IAAI,CAAC,CAAC;QAC/B,CAAC;QAED,KAAK,cAAc,CAAC,CAAC,CAAC;YACpB,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,wBAAwB,CAAC,CAAC;YACtE,OAAO,kBAAkB,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAChD,CAAC;QAED;YACE,OAAO,CAAC,KAAK,CACX,GAAG,CAAC,wBAAwB,UAAU,KAAK,CAAC;gBAC1C,GAAG,CAAC,8CAA8C,CAAC,CACtD,CAAC;IACN,CAAC;AACH,CAAC;AAED,SAAS,eAAe,CAAC,IAAY,EAAE,MAAc;IACnD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAChC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,KAAK,CACX,GAAG,CAAC,mBAAmB,MAAM,sCAAsC,CAAC,CACrE,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IACD,OAAO,IAAI,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;AAClD,CAAC"}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,65 @@
1
+ import { describe, it, expect, beforeEach, afterEach } from "vitest";
2
+ import { mkdirSync, writeFileSync, rmSync } from "node:fs";
3
+ import { join } from "node:path";
4
+ import { tmpdir } from "node:os";
5
+ import { writeBenchmark, readBenchmark } from "../benchmark.js";
6
+ // ---------------------------------------------------------------------------
7
+ // Helpers
8
+ // ---------------------------------------------------------------------------
9
+ let testDir;
10
+ const SAMPLE_BENCHMARK = {
11
+ timestamp: "2026-03-01T00:00:00.000Z",
12
+ model: "claude-sonnet-4-20250514",
13
+ skill_name: "test-skill",
14
+ cases: [
15
+ {
16
+ eval_id: 1,
17
+ eval_name: "Basic test",
18
+ status: "pass",
19
+ error_message: null,
20
+ pass_rate: 1.0,
21
+ assertions: [
22
+ {
23
+ id: "a1",
24
+ text: "Check result",
25
+ pass: true,
26
+ reasoning: "Looks good",
27
+ },
28
+ ],
29
+ },
30
+ ],
31
+ };
32
+ // ---------------------------------------------------------------------------
33
+ // Tests
34
+ // ---------------------------------------------------------------------------
35
describe("benchmark", () => {
    // Each test gets its own scratch skill directory containing an evals/ folder.
    beforeEach(() => {
        testDir = join(tmpdir(), `vskill-bench-${Date.now()}`);
        const evalsDir = join(testDir, "evals");
        mkdirSync(evalsDir, { recursive: true });
    });
    afterEach(() => {
        rmSync(testDir, { recursive: true, force: true });
    });
    it("writes benchmark.json with all required fields", async () => {
        await writeBenchmark(testDir, SAMPLE_BENCHMARK);
        const loaded = await readBenchmark(testDir);
        expect(loaded).not.toBeNull();
        expect(loaded.timestamp).toBe("2026-03-01T00:00:00.000Z");
        expect(loaded.model).toBe("claude-sonnet-4-20250514");
        expect(loaded.skill_name).toBe("test-skill");
        expect(loaded.cases).toHaveLength(1);
        const [firstCase] = loaded.cases;
        expect(firstCase.assertions).toHaveLength(1);
    });
    it("reads benchmark.json and returns typed result", async () => {
        // Bypass writeBenchmark: put the file on disk by hand, then read it back.
        const benchmarkPath = join(testDir, "evals", "benchmark.json");
        writeFileSync(benchmarkPath, JSON.stringify(SAMPLE_BENCHMARK));
        const loaded = await readBenchmark(testDir);
        expect(loaded.skill_name).toBe("test-skill");
        expect(loaded.cases[0].pass_rate).toBe(1.0);
    });
    it("returns null for missing benchmark.json", async () => {
        // Remove the evals/ folder created by beforeEach so nothing can be read.
        rmSync(join(testDir, "evals"), { recursive: true, force: true });
        const loaded = await readBenchmark(testDir);
        expect(loaded).toBeNull();
    });
});
65
+ //# sourceMappingURL=benchmark.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"benchmark.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/benchmark.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,QAAQ,CAAC;AACrE,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AAC3D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AACjC,OAAO,EAAE,cAAc,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAGhE,8EAA8E;AAC9E,UAAU;AACV,8EAA8E;AAE9E,IAAI,OAAe,CAAC;AAEpB,MAAM,gBAAgB,GAAoB;IACxC,SAAS,EAAE,0BAA0B;IACrC,KAAK,EAAE,0BAA0B;IACjC,UAAU,EAAE,YAAY;IACxB,KAAK,EAAE;QACL;YACE,OAAO,EAAE,CAAC;YACV,SAAS,EAAE,YAAY;YACvB,MAAM,EAAE,MAAM;YACd,aAAa,EAAE,IAAI;YACnB,SAAS,EAAE,GAAG;YACd,UAAU,EAAE;gBACV;oBACE,EAAE,EAAE,IAAI;oBACR,IAAI,EAAE,cAAc;oBACpB,IAAI,EAAE,IAAI;oBACV,SAAS,EAAE,YAAY;iBACxB;aACF;SACF;KACF;CACF,CAAC;AAEF,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,QAAQ,CAAC,WAAW,EAAE,GAAG,EAAE;IACzB,UAAU,CAAC,GAAG,EAAE;QACd,OAAO,GAAG,IAAI,CAAC,MAAM,EAAE,EAAE,gBAAgB,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QACvD,SAAS,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzD,CAAC,CAAC,CAAC;IAEH,SAAS,CAAC,GAAG,EAAE;QACb,MAAM,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IACpD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;QAC9D,MAAM,cAAc,CAAC,OAAO,EAAE,gBAAgB,CAAC,CAAC;QAEhD,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;QAC9B,MAAM,CAAC,MAAO,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;QAC3D,MAAM,CAAC,MAAO,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;QACvD,MAAM,CAAC,MAAO,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC9C,MAAM,CAAC,MAAO,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACtC,MAAM,CAAC,MAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;IACtD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE;QAC7D,aAAa,CACX,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,gBAAgB,CAAC,EACxC,IAAI,CAAC,SAAS,CAAC,gBAAgB,CAAC,CACjC,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,O
AAO,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAO,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC9C,MAAM,CAAC,MAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC/C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;QACvD,MAAM,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;QAEjE,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,CAAC,QAAQ,EAAE,CAAC;IAC5B,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,45 @@
1
+ import { describe, it, expect, vi } from "vitest";
2
+ import { judgeAssertion } from "../judge.js";
3
+ // ---------------------------------------------------------------------------
4
+ // Helpers
5
+ // ---------------------------------------------------------------------------
6
/**
 * Build a stand-in LLM client whose `generate` always resolves to `response`.
 */
function mockClient(response) {
    const generate = vi.fn().mockResolvedValue(response);
    return { generate, model: "test-model" };
}
9
+ const ASSERTION = {
10
+ id: "assert-1",
11
+ text: "Output mentions a file path",
12
+ type: "boolean",
13
+ };
14
+ // ---------------------------------------------------------------------------
15
+ // Tests
16
+ // ---------------------------------------------------------------------------
17
describe("judgeAssertion", () => {
    it("returns pass result when LLM judge says pass", async () => {
        const verdict = { pass: true, reasoning: "output contains file path" };
        const client = mockClient(JSON.stringify(verdict));
        const graded = await judgeAssertion("The report has been saved to reports/q1.csv", ASSERTION, client);
        expect(graded.pass).toBe(true);
        expect(graded.reasoning).toBe("output contains file path");
        // id and text are carried over from the assertion being judged.
        expect(graded.id).toBe("assert-1");
        expect(graded.text).toBe("Output mentions a file path");
    });
    it("returns fail result when LLM judge says fail", async () => {
        const verdict = { pass: false, reasoning: "no file path found in output" };
        const client = mockClient(JSON.stringify(verdict));
        const graded = await judgeAssertion("Hello world", ASSERTION, client);
        expect(graded.pass).toBe(false);
        expect(graded.reasoning).toBe("no file path found in output");
    });
    it("throws on malformed judge response", async () => {
        const client = mockClient("This is not JSON");
        const pending = judgeAssertion("some output", ASSERTION, client);
        await expect(pending).rejects.toThrow(/invalid judge output/i);
    });
    it("handles JSON wrapped in code fence", async () => {
        const fenced = '```json\n{"pass": true, "reasoning": "looks good"}\n```';
        const client = mockClient(fenced);
        const graded = await judgeAssertion("some output", ASSERTION, client);
        expect(graded.pass).toBe(true);
    });
});
45
+ //# sourceMappingURL=judge.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"judge.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/judge.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAc,MAAM,QAAQ,CAAC;AAG9D,OAAO,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAE7C,8EAA8E;AAC9E,UAAU;AACV,8EAA8E;AAE9E,SAAS,UAAU,CAAC,QAAgB;IAClC,OAAO,EAAE,QAAQ,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,QAAQ,CAAC,EAAE,KAAK,EAAE,YAAY,EAAE,CAAC;AAChF,CAAC;AAED,MAAM,SAAS,GAAc;IAC3B,EAAE,EAAE,UAAU;IACd,IAAI,EAAE,6BAA6B;IACnC,IAAI,EAAE,SAAS;CAChB,CAAC;AAEF,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;IAC9B,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;QAC5D,MAAM,MAAM,GAAG,UAAU,CACvB,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,2BAA2B,EAAE,CAAC,CACvE,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,cAAc,CACjC,6CAA6C,EAC7C,SAAS,EACT,MAAM,CACP,CAAC;QAEF,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC/B,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,2BAA2B,CAAC,CAAC;QAC3D,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACnC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,6BAA6B,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;QAC5D,MAAM,MAAM,GAAG,UAAU,CACvB,IAAI,CAAC,SAAS,CAAC;YACb,IAAI,EAAE,KAAK;YACX,SAAS,EAAE,8BAA8B;SAC1C,CAAC,CACH,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;QAEtE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAChC,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,8BAA8B,CAAC,CAAC;IAChE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;QAClD,MAAM,MAAM,GAAG,UAAU,CAAC,kBAAkB,CAAC,CAAC;QAE9C,MAAM,MAAM,CACV,cAAc,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,CAAC,CACjD,CAAC,OAAO,CAAC,OAAO,CAAC,uBAAuB,CAAC,CAAC;IAC7C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;QAClD,MAAM,MAAM,GAAG,UAAU,CACvB,yDAAyD,CAC1D,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;QACtE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACjC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,85 @@
1
+ import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
2
+ // ---------------------------------------------------------------------------
3
+ // Mocks
4
+ // ---------------------------------------------------------------------------
5
+ const mockCreate = vi.hoisted(() => vi.fn());
6
+ vi.mock("@anthropic-ai/sdk", () => ({
7
+ default: class MockAnthropic {
8
+ messages = { create: mockCreate };
9
+ },
10
+ }));
11
+ // ---------------------------------------------------------------------------
12
+ // Import module under test AFTER mocks
13
+ // ---------------------------------------------------------------------------
14
+ const { createLlmClient } = await import("../llm.js");
15
+ // ---------------------------------------------------------------------------
16
+ // Tests
17
+ // ---------------------------------------------------------------------------
18
describe("createLlmClient", () => {
    // Snapshot of the environment, restored after every test.
    const envSnapshot = { ...process.env };
    // Make the mocked SDK answer with a single text block.
    const replyWith = (text) => {
        mockCreate.mockResolvedValue({
            content: [{ type: "text", text }],
        });
    };
    beforeEach(() => {
        vi.resetAllMocks();
        process.env.ANTHROPIC_API_KEY = "test-key";
        delete process.env.VSKILL_EVAL_MODEL;
    });
    afterEach(() => {
        process.env = { ...envSnapshot };
    });
    it("returns text content on successful generate call", async () => {
        replyWith("Generated response");
        const llm = createLlmClient();
        const text = await llm.generate("system prompt", "user prompt");
        expect(text).toBe("Generated response");
        expect(mockCreate).toHaveBeenCalledOnce();
    });
    it("uses default model claude-sonnet-4-20250514 when env not set", async () => {
        replyWith("ok");
        const llm = createLlmClient();
        await llm.generate("sys", "usr");
        const expectedRequest = expect.objectContaining({ model: "claude-sonnet-4-20250514" });
        expect(mockCreate).toHaveBeenCalledWith(expectedRequest, expect.anything());
    });
    it("uses custom model from VSKILL_EVAL_MODEL env var", async () => {
        process.env.VSKILL_EVAL_MODEL = "claude-opus-4-20250514";
        replyWith("ok");
        const llm = createLlmClient();
        await llm.generate("sys", "usr");
        const expectedRequest = expect.objectContaining({ model: "claude-opus-4-20250514" });
        expect(mockCreate).toHaveBeenCalledWith(expectedRequest, expect.anything());
    });
    it("propagates network error from SDK", async () => {
        mockCreate.mockRejectedValue(new Error("Connection timeout"));
        const llm = createLlmClient();
        const pending = llm.generate("sys", "usr");
        await expect(pending).rejects.toThrow("Connection timeout");
    });
    it("passes system and user prompts correctly", async () => {
        replyWith("ok");
        const llm = createLlmClient();
        await llm.generate("my system prompt", "my user prompt");
        const expectedRequest = expect.objectContaining({
            system: "my system prompt",
            messages: [{ role: "user", content: "my user prompt" }],
            max_tokens: 4096,
        });
        expect(mockCreate).toHaveBeenCalledWith(expectedRequest, expect.anything());
    });
    it("throws when ANTHROPIC_API_KEY is not set", () => {
        delete process.env.ANTHROPIC_API_KEY;
        const build = () => createLlmClient();
        expect(build).toThrow("ANTHROPIC_API_KEY is not set");
    });
    it("exposes model name on the client", () => {
        const llm = createLlmClient();
        expect(llm.model).toBe("claude-sonnet-4-20250514");
    });
    it("exposes custom model name when VSKILL_EVAL_MODEL is set", () => {
        process.env.VSKILL_EVAL_MODEL = "claude-opus-4-20250514";
        const llm = createLlmClient();
        expect(llm.model).toBe("claude-opus-4-20250514");
    });
});
85
+ //# sourceMappingURL=llm.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"llm.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/llm.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,QAAQ,CAAC;AAEzE,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,MAAM,UAAU,GAAG,EAAE,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;AAE7C,EAAE,CAAC,IAAI,CAAC,mBAAmB,EAAE,GAAG,EAAE,CAAC,CAAC;IAClC,OAAO,EAAE,MAAM,aAAa;QAC1B,QAAQ,GAAG,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC;KACnC;CACF,CAAC,CAAC,CAAC;AAEJ,8EAA8E;AAC9E,uCAAuC;AACvC,8EAA8E;AAE9E,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,WAAW,CAAC,CAAC;AAEtD,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,MAAM,OAAO,GAAG,EAAE,GAAG,OAAO,CAAC,GAAG,EAAE,CAAC;IAEnC,UAAU,CAAC,GAAG,EAAE;QACd,EAAE,CAAC,aAAa,EAAE,CAAC;QACnB,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,UAAU,CAAC;QAC3C,OAAO,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,SAAS,CAAC,GAAG,EAAE;QACb,OAAO,CAAC,GAAG,GAAG,EAAE,GAAG,OAAO,EAAE,CAAC;IAC/B,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;QAChE,UAAU,CAAC,iBAAiB,CAAC;YAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,oBAAoB,EAAE,CAAC;SACxD,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,eAAe,EAAE,aAAa,CAAC,CAAC;QAErE,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;QAC1C,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,EAAE,CAAC;IAC5C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,8DAA8D,EAAE,KAAK,IAAI,EAAE;QAC5E,UAAU,CAAC,iBAAiB,CAAC;YAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;SACxC,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;QAEpC,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC,EAAE,KAAK,EAAE,0BAA0B,EAAE,CAAC,EAC9D,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;QAChE,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,wBAAwB,CAAC;QACzD,UAAU,CAAC,iBAAiB,CAAC;YAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;SACxC,CAAC,CAAC;QAEH,MAA
M,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;QAEpC,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC,EAAE,KAAK,EAAE,wBAAwB,EAAE,CAAC,EAC5D,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,mCAAmC,EAAE,KAAK,IAAI,EAAE;QACjD,UAAU,CAAC,iBAAiB,CAAC,IAAI,KAAK,CAAC,oBAAoB,CAAC,CAAC,CAAC;QAE9D,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CACzD,oBAAoB,CACrB,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;QACxD,UAAU,CAAC,iBAAiB,CAAC;YAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;SACxC,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,kBAAkB,EAAE,gBAAgB,CAAC,CAAC;QAE5D,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC;YACtB,MAAM,EAAE,kBAAkB;YAC1B,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,EAAE,CAAC;YACvD,UAAU,EAAE,IAAI;SACjB,CAAC,EACF,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;QAClD,OAAO,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;QACrC,MAAM,CAAC,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC,CAAC,OAAO,CAAC,8BAA8B,CAAC,CAAC;IAC1E,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kCAAkC,EAAE,GAAG,EAAE;QAC1C,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;IACxD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yDAAyD,EAAE,GAAG,EAAE;QACjE,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,wBAAwB,CAAC;QACzD,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;IACtD,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,72 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { buildEvalInitPrompt, parseGeneratedEvals, } from "../prompt-builder.js";
3
+ // ---------------------------------------------------------------------------
4
+ // buildEvalInitPrompt
5
+ // ---------------------------------------------------------------------------
6
+ describe("buildEvalInitPrompt", () => {
7
+ const skillContent = "# My Skill\nThis skill does amazing things.";
8
+ it("includes skill content in the prompt", () => {
9
+ const prompt = buildEvalInitPrompt(skillContent);
10
+ expect(prompt).toContain(skillContent);
11
+ });
12
+ it("includes schema reference fields", () => {
13
+ const prompt = buildEvalInitPrompt(skillContent);
14
+ expect(prompt).toContain("skill_name");
15
+ expect(prompt).toContain("assertions");
16
+ expect(prompt).toContain("expected_output");
17
+ });
18
+ it("includes social-media-posting example", () => {
19
+ const prompt = buildEvalInitPrompt(skillContent);
20
+ expect(prompt).toContain("social-media-posting");
21
+ });
22
+ it("includes best practices section", () => {
23
+ const prompt = buildEvalInitPrompt(skillContent);
24
+ expect(prompt).toContain("Best Practices");
25
+ expect(prompt).toContain("objectively verifiable");
26
+ });
27
+ });
28
+ // ---------------------------------------------------------------------------
29
+ // parseGeneratedEvals
30
+ // ---------------------------------------------------------------------------
31
+ describe("parseGeneratedEvals", () => {
32
+ it("extracts JSON from markdown code fence", () => {
33
+ const raw = `Here is the evals.json:
34
+
35
+ \`\`\`json
36
+ {
37
+ "skill_name": "test-skill",
38
+ "evals": [
39
+ {
40
+ "id": 1,
41
+ "name": "Basic test",
42
+ "prompt": "Test prompt",
43
+ "expected_output": "Expected output",
44
+ "files": [],
45
+ "assertions": [
46
+ { "id": "a1", "text": "Check result", "type": "boolean" }
47
+ ]
48
+ }
49
+ ]
50
+ }
51
+ \`\`\`
52
+
53
+ That's the evals file.`;
54
+ const result = parseGeneratedEvals(raw);
55
+ expect(result.skill_name).toBe("test-skill");
56
+ expect(result.evals).toHaveLength(1);
57
+ expect(result.evals[0].assertions).toHaveLength(1);
58
+ });
59
+ it("throws when no code block is found", () => {
60
+ const raw = "Just some text without any JSON code block.";
61
+ expect(() => parseGeneratedEvals(raw)).toThrow(/code block/i);
62
+ });
63
+ it("throws when JSON inside fence is invalid", () => {
64
+ const raw = "```json\n{ invalid json }\n```";
65
+ expect(() => parseGeneratedEvals(raw)).toThrow();
66
+ });
67
+ it("validates extracted JSON against schema", () => {
68
+ const raw = '```json\n{ "skill_name": "test" }\n```';
69
+ expect(() => parseGeneratedEvals(raw)).toThrow(); // missing evals array
70
+ });
71
+ });
72
+ //# sourceMappingURL=prompt-builder.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prompt-builder.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/prompt-builder.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EACL,mBAAmB,EACnB,mBAAmB,GACpB,MAAM,sBAAsB,CAAC;AAE9B,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;IACnC,MAAM,YAAY,GAAG,6CAA6C,CAAC;IAEnE,EAAE,CAAC,sCAAsC,EAAE,GAAG,EAAE;QAC9C,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;IACzC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kCAAkC,EAAE,GAAG,EAAE;QAC1C,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;IAC9C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;QAC/C,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,sBAAsB,CAAC,CAAC;IACnD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,iCAAiC,EAAE,GAAG,EAAE;QACzC,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;QACjD,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,gBAAgB,CAAC,CAAC;QAC3C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,wBAAwB,CAAC,CAAC;IACrD,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;IACnC,EAAE,CAAC,wCAAwC,EAAE,GAAG,EAAE;QAChD,MAAM,GAAG,GAAG;;;;;;;;;;;;;;;;;;;;uBAoBO,CAAC;QAEpB,MAAM,MAAM,GAAG,mBAAmB,CAAC,GAAG,CAAC,CAAC;QACxC,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC7C,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;IACrD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,GAAG,GAAG,6CAA6C,CAAC;QAC1D,MAAM,CAAC,GAAG,EAAE,CAAC,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC;IAChE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;QAClD,MAAM,GAAG,GAAG,gCAAgC,CAAC;QAC7C,MAAM,CAAC,GAAG,EAAE,CAAC,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,
EAAE,CAAC;IACnD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yCAAyC,EAAE,GAAG,EAAE;QACjD,MAAM,GAAG,GAAG,wCAAwC,CAAC;QACrD,MAAM,CAAC,GAAG,EAAE,CAAC,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,sBAAsB;IAC1E,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1 @@
1
+ export {};