vskill 0.2.26 → 0.2.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/dist/commands/__tests__/eval-router.test.d.ts +1 -0
  2. package/dist/commands/__tests__/eval-router.test.js +60 -0
  3. package/dist/commands/__tests__/eval-router.test.js.map +1 -0
  4. package/dist/commands/add.js +113 -19
  5. package/dist/commands/add.js.map +1 -1
  6. package/dist/commands/eval/__tests__/coverage.test.d.ts +1 -0
  7. package/dist/commands/eval/__tests__/coverage.test.js +122 -0
  8. package/dist/commands/eval/__tests__/coverage.test.js.map +1 -0
  9. package/dist/commands/eval/__tests__/generate-all.test.d.ts +1 -0
  10. package/dist/commands/eval/__tests__/generate-all.test.js +133 -0
  11. package/dist/commands/eval/__tests__/generate-all.test.js.map +1 -0
  12. package/dist/commands/eval/__tests__/init.test.d.ts +1 -0
  13. package/dist/commands/eval/__tests__/init.test.js +116 -0
  14. package/dist/commands/eval/__tests__/init.test.js.map +1 -0
  15. package/dist/commands/eval/__tests__/run.test.d.ts +1 -0
  16. package/dist/commands/eval/__tests__/run.test.js +149 -0
  17. package/dist/commands/eval/__tests__/run.test.js.map +1 -0
  18. package/dist/commands/eval/coverage.d.ts +1 -0
  19. package/dist/commands/eval/coverage.js +79 -0
  20. package/dist/commands/eval/coverage.js.map +1 -0
  21. package/dist/commands/eval/generate-all.d.ts +1 -0
  22. package/dist/commands/eval/generate-all.js +64 -0
  23. package/dist/commands/eval/generate-all.js.map +1 -0
  24. package/dist/commands/eval/init.d.ts +1 -0
  25. package/dist/commands/eval/init.js +38 -0
  26. package/dist/commands/eval/init.js.map +1 -0
  27. package/dist/commands/eval/run.d.ts +1 -0
  28. package/dist/commands/eval/run.js +107 -0
  29. package/dist/commands/eval/run.js.map +1 -0
  30. package/dist/commands/eval.d.ts +4 -0
  31. package/dist/commands/eval.js +48 -0
  32. package/dist/commands/eval.js.map +1 -0
  33. package/dist/eval/__tests__/benchmark.test.d.ts +1 -0
  34. package/dist/eval/__tests__/benchmark.test.js +65 -0
  35. package/dist/eval/__tests__/benchmark.test.js.map +1 -0
  36. package/dist/eval/__tests__/judge.test.d.ts +1 -0
  37. package/dist/eval/__tests__/judge.test.js +45 -0
  38. package/dist/eval/__tests__/judge.test.js.map +1 -0
  39. package/dist/eval/__tests__/llm.test.d.ts +1 -0
  40. package/dist/eval/__tests__/llm.test.js +85 -0
  41. package/dist/eval/__tests__/llm.test.js.map +1 -0
  42. package/dist/eval/__tests__/prompt-builder.test.d.ts +1 -0
  43. package/dist/eval/__tests__/prompt-builder.test.js +72 -0
  44. package/dist/eval/__tests__/prompt-builder.test.js.map +1 -0
  45. package/dist/eval/__tests__/schema.test.d.ts +1 -0
  46. package/dist/eval/__tests__/schema.test.js +209 -0
  47. package/dist/eval/__tests__/schema.test.js.map +1 -0
  48. package/dist/eval/__tests__/skill-scanner.test.d.ts +1 -0
  49. package/dist/eval/__tests__/skill-scanner.test.js +78 -0
  50. package/dist/eval/__tests__/skill-scanner.test.js.map +1 -0
  51. package/dist/eval/benchmark.d.ts +22 -0
  52. package/dist/eval/benchmark.js +24 -0
  53. package/dist/eval/benchmark.js.map +1 -0
  54. package/dist/eval/judge.d.ts +9 -0
  55. package/dist/eval/judge.js +40 -0
  56. package/dist/eval/judge.js.map +1 -0
  57. package/dist/eval/llm.d.ts +5 -0
  58. package/dist/eval/llm.js +34 -0
  59. package/dist/eval/llm.js.map +1 -0
  60. package/dist/eval/prompt-builder.d.ts +3 -0
  61. package/dist/eval/prompt-builder.js +155 -0
  62. package/dist/eval/prompt-builder.js.map +1 -0
  63. package/dist/eval/schema.d.ts +26 -0
  64. package/dist/eval/schema.js +128 -0
  65. package/dist/eval/schema.js.map +1 -0
  66. package/dist/eval/skill-scanner.d.ts +8 -0
  67. package/dist/eval/skill-scanner.js +44 -0
  68. package/dist/eval/skill-scanner.js.map +1 -0
  69. package/dist/index.js +9 -0
  70. package/dist/index.js.map +1 -1
  71. package/dist/marketplace/index.d.ts +2 -2
  72. package/dist/marketplace/index.js +1 -1
  73. package/dist/marketplace/index.js.map +1 -1
  74. package/dist/marketplace/marketplace.d.ts +13 -0
  75. package/dist/marketplace/marketplace.js +35 -0
  76. package/dist/marketplace/marketplace.js.map +1 -1
  77. package/package.json +2 -1
@@ -0,0 +1,78 @@
1
+ import { describe, it, expect, beforeEach, afterEach } from "vitest";
2
+ import { mkdirSync, writeFileSync, rmSync } from "node:fs";
3
+ import { join } from "node:path";
4
+ import { tmpdir } from "node:os";
5
+ import { scanSkills } from "../skill-scanner.js";
6
+ // ---------------------------------------------------------------------------
7
+ // Test helpers
8
+ // ---------------------------------------------------------------------------
9
+ let testDir;
10
/**
 * Materializes a fake skill on disk at `<testDir>/<plugin>/skills/<skill>`.
 *
 * A `SKILL.md` stub is always written. When `opts.evals` is truthy an
 * `evals/evals.json` fixture is added, and when `opts.benchmark` is truthy an
 * `evals/benchmark.json` fixture is added.
 *
 * @param {string} plugin - Plugin directory name under the test root.
 * @param {string} skill - Skill directory name (also used as `skill_name`).
 * @param {{ evals?: boolean, benchmark?: boolean }} [opts] - Optional fixtures to create.
 */
function createSkill(plugin, skill, opts = {}) {
  const root = join(testDir, plugin, "skills", skill);
  mkdirSync(root, { recursive: true });
  writeFileSync(join(root, "SKILL.md"), `# ${skill}`);

  // Optional fixtures, all of which live in the skill's evals/ folder.
  const fixtures = [
    [opts.evals, "evals.json", { skill_name: skill, evals: [] }],
    [opts.benchmark, "benchmark.json", { timestamp: "2026-03-01T00:00:00Z" }],
  ];
  for (const [wanted, fileName, payload] of fixtures) {
    if (!wanted) {
      continue;
    }
    const fixtureDir = join(root, "evals");
    mkdirSync(fixtureDir, { recursive: true });
    writeFileSync(join(fixtureDir, fileName), JSON.stringify(payload));
  }
}
25
+ // ---------------------------------------------------------------------------
26
+ // Tests
27
+ // ---------------------------------------------------------------------------
28
// Suite for scanSkills: each test builds a throwaway plugin tree with
// createSkill() and checks what the scanner reports for it.
describe("scanSkills", () => {
  beforeEach(() => {
    // Date.now() alone can repeat across parallel workers/processes that use
    // the same "vskill-test-" prefix, which would make suites share (and
    // delete) each other's directory. Add the pid and a random suffix so every
    // test gets a root of its own.
    const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2)}`;
    testDir = join(tmpdir(), `vskill-test-${uniqueSuffix}`);
    mkdirSync(testDir, { recursive: true });
  });
  afterEach(() => {
    // force: true keeps cleanup from failing if the directory is already gone.
    rmSync(testDir, { recursive: true, force: true });
  });
  it("discovers skills in plugins directory", async () => {
    createSkill("marketing", "social-media-posting");
    createSkill("devtools", "code-review");
    const skills = await scanSkills(testDir);
    expect(skills).toHaveLength(2);
    // Sort so the assertion does not depend on directory listing order.
    const names = skills.map((s) => s.skill).sort();
    expect(names).toEqual(["code-review", "social-media-posting"]);
  });
  it("sets hasEvals=true when evals.json exists", async () => {
    createSkill("marketing", "social-media-posting", { evals: true });
    const skills = await scanSkills(testDir);
    expect(skills[0].hasEvals).toBe(true);
  });
  it("sets hasEvals=false when evals.json is absent", async () => {
    createSkill("marketing", "social-media-posting");
    const skills = await scanSkills(testDir);
    expect(skills[0].hasEvals).toBe(false);
  });
  it("sets hasBenchmark=true when benchmark.json exists", async () => {
    createSkill("marketing", "social-media-posting", {
      evals: true,
      benchmark: true,
    });
    const skills = await scanSkills(testDir);
    expect(skills[0].hasBenchmark).toBe(true);
  });
  it("sets hasBenchmark=false when benchmark.json is absent", async () => {
    createSkill("marketing", "social-media-posting", { evals: true });
    const skills = await scanSkills(testDir);
    expect(skills[0].hasBenchmark).toBe(false);
  });
  it("returns correct plugin and skill names", async () => {
    createSkill("marketing", "social-media-posting");
    const skills = await scanSkills(testDir);
    expect(skills[0].plugin).toBe("marketing");
    expect(skills[0].skill).toBe("social-media-posting");
  });
  it("returns empty array for empty root", async () => {
    const skills = await scanSkills(testDir);
    expect(skills).toEqual([]);
  });
});
78
+ //# sourceMappingURL=skill-scanner.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"skill-scanner.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/skill-scanner.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,QAAQ,CAAC;AACrE,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AAC3D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AACjC,OAAO,EAAE,UAAU,EAAE,MAAM,qBAAqB,CAAC;AAEjD,8EAA8E;AAC9E,eAAe;AACf,8EAA8E;AAE9E,IAAI,OAAe,CAAC;AAEpB,SAAS,WAAW,CAClB,MAAc,EACd,KAAa,EACb,OAAiD,EAAE;IAEnD,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC;IACxD,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzC,aAAa,CAAC,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,EAAE,KAAK,KAAK,EAAE,CAAC,CAAC;IAExD,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;QACf,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QACzC,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QACzC,aAAa,CACX,IAAI,CAAC,QAAQ,EAAE,YAAY,CAAC,EAC5B,IAAI,CAAC,SAAS,CAAC,EAAE,UAAU,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC,CACjD,CAAC;IACJ,CAAC;IAED,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;QACnB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QACzC,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QACzC,aAAa,CACX,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC,EAChC,IAAI,CAAC,SAAS,CAAC,EAAE,SAAS,EAAE,sBAAsB,EAAE,CAAC,CACtD,CAAC;IACJ,CAAC;AACH,CAAC;AAED,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,QAAQ,CAAC,YAAY,EAAE,GAAG,EAAE;IAC1B,UAAU,CAAC,GAAG,EAAE;QACd,OAAO,GAAG,IAAI,CAAC,MAAM,EAAE,EAAE,eAAe,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QACtD,SAAS,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC1C,CAAC,CAAC,CAAC;IAEH,SAAS,CAAC,GAAG,EAAE;QACb,MAAM,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IACpD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;QACrD,WAAW,CAAC,WAAW,EAAE,sBAAsB,CAAC,CAAC;QACjD,WAAW,CAAC,UAAU,EAAE,aAAa,CAAC,CAAC;QAEvC,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QAEzC,MAAM,CAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE
,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC;QAChD,MAAM,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC,aAAa,EAAE,sBAAsB,CAAC,CAAC,CAAC;IACjE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;QACzD,WAAW,CAAC,WAAW,EAAE,sBAAsB,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;QAElE,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QAEzC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACxC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE;QAC7D,WAAW,CAAC,WAAW,EAAE,sBAAsB,CAAC,CAAC;QAEjD,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QAEzC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACzC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,mDAAmD,EAAE,KAAK,IAAI,EAAE;QACjE,WAAW,CAAC,WAAW,EAAE,sBAAsB,EAAE;YAC/C,KAAK,EAAE,IAAI;YACX,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QAEzC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC5C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uDAAuD,EAAE,KAAK,IAAI,EAAE;QACrE,WAAW,CAAC,WAAW,EAAE,sBAAsB,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;QAElE,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QAEzC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAC7C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wCAAwC,EAAE,KAAK,IAAI,EAAE;QACtD,WAAW,CAAC,WAAW,EAAE,sBAAsB,CAAC,CAAC;QAEjD,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QAEzC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QAC3C,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,sBAAsB,CAAC,CAAC;IACvD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;QAClD,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QACzC,MAAM,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;IAC7B,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1,22 @@
1
+ export interface BenchmarkAssertionResult {
2
+ id: string;
3
+ text: string;
4
+ pass: boolean;
5
+ reasoning: string;
6
+ }
7
+ export interface BenchmarkCase {
8
+ eval_id: number;
9
+ eval_name: string;
10
+ status: "pass" | "fail" | "error";
11
+ error_message: string | null;
12
+ pass_rate: number;
13
+ assertions: BenchmarkAssertionResult[];
14
+ }
15
+ export interface BenchmarkResult {
16
+ timestamp: string;
17
+ model: string;
18
+ skill_name: string;
19
+ cases: BenchmarkCase[];
20
+ }
21
+ export declare function writeBenchmark(skillDir: string, result: BenchmarkResult): Promise<void>;
22
+ export declare function readBenchmark(skillDir: string): Promise<BenchmarkResult | null>;
@@ -0,0 +1,24 @@
1
+ // ---------------------------------------------------------------------------
2
+ // benchmark.json read/write
3
+ // ---------------------------------------------------------------------------
4
+ import { writeFileSync, readFileSync, existsSync, mkdirSync } from "node:fs";
5
+ import { join } from "node:path";
6
/**
 * Persists a benchmark result as pretty-printed (2-space) JSON at
 * `<skillDir>/evals/benchmark.json`, creating the `evals` directory first
 * if it does not exist yet.
 *
 * @param {string} skillDir - Skill directory that owns the `evals` folder.
 * @param {object} result - Benchmark payload to serialize.
 * @returns {Promise<void>}
 */
export async function writeBenchmark(skillDir, result) {
  const targetDir = join(skillDir, "evals");
  mkdirSync(targetDir, { recursive: true });
  const serialized = JSON.stringify(result, null, 2);
  writeFileSync(join(targetDir, "benchmark.json"), serialized, "utf-8");
}
12
/**
 * Loads `<skillDir>/evals/benchmark.json`.
 *
 * @param {string} skillDir - Skill directory that owns the `evals` folder.
 * @returns {Promise<object | null>} The parsed JSON value, or `null` when the
 *   file is missing, unreadable, or not valid JSON.
 */
export async function readBenchmark(skillDir) {
  const benchmarkPath = join(skillDir, "evals", "benchmark.json");
  if (existsSync(benchmarkPath)) {
    try {
      return JSON.parse(readFileSync(benchmarkPath, "utf-8"));
    }
    catch {
      // Best effort: an unreadable or malformed file is reported as "no benchmark".
    }
  }
  return null;
}
24
+ //# sourceMappingURL=benchmark.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"benchmark.js","sourceRoot":"","sources":["../../src/eval/benchmark.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAE9E,OAAO,EAAE,aAAa,EAAE,YAAY,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAC7E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAyBjC,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,QAAgB,EAChB,MAAuB;IAEvB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACzC,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;IAClD,aAAa,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;AACpE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,QAAgB;IAEhB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,gBAAgB,CAAC,CAAC;IAC3D,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;QAAE,OAAO,IAAI,CAAC;IAEvC,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAC5C,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAoB,CAAC;IAC5C,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
@@ -0,0 +1,9 @@
1
+ import type { Assertion } from "./schema.js";
2
+ import type { LlmClient } from "./llm.js";
3
+ export interface AssertionResult {
4
+ id: string;
5
+ text: string;
6
+ pass: boolean;
7
+ reasoning: string;
8
+ }
9
+ export declare function judgeAssertion(output: string, assertion: Assertion, client: LlmClient): Promise<AssertionResult>;
@@ -0,0 +1,40 @@
1
+ // ---------------------------------------------------------------------------
2
+ // Assertion-level LLM judge
3
+ // ---------------------------------------------------------------------------
4
// System prompt that pins the judge to a strict { pass, reasoning } JSON reply.
const JUDGE_SYSTEM = `You are a binary assertion evaluator. Given an LLM output and an assertion, determine if the output satisfies the assertion. Respond with ONLY a JSON object: { "pass": boolean, "reasoning": "brief explanation" }`;

/**
 * Asks the LLM judge whether `output` satisfies a single assertion.
 *
 * @param {string} output - The LLM output under evaluation.
 * @param {{ id: string, text: string }} assertion - Assertion to verify.
 * @param {{ generate(systemPrompt: string, userPrompt: string): Promise<string> }} client
 *   LLM client used to run the judge prompt.
 * @returns {Promise<{ id: string, text: string, pass: boolean, reasoning: string }>}
 *   The assertion's id/text together with the judge's verdict.
 * @throws {Error} "Invalid judge output: ..." when the reply contains no JSON
 *   object with a boolean `pass` field.
 */
export async function judgeAssertion(output, assertion, client) {
  // Template content stays at column 0: its whitespace is part of the prompt.
  const userPrompt = `## LLM Output
${output}

## Assertion to Verify
${assertion.text}

Does the LLM output satisfy this assertion? Respond with JSON only: { "pass": boolean, "reasoning": "..." }`;
  const raw = await client.generate(JUDGE_SYSTEM, userPrompt);
  const parsed = parseJudgeResponse(raw);
  return {
    id: assertion.id,
    text: assertion.text,
    pass: parsed.pass,
    reasoning: parsed.reasoning,
  };
}

/**
 * Extracts the `{ pass, reasoning }` verdict from a judge reply.
 *
 * Candidates are tried in order: the whole reply, the content of every
 * ``` / ```json fence (with or without newlines around the payload), and
 * finally the span from the first `{` to the last `}` so that a verdict
 * wrapped in prose is still accepted.
 *
 * @param {string} raw - Raw text returned by the judge model.
 * @returns {{ pass: boolean, reasoning: string }} `reasoning` is "" when the
 *   model did not supply a string.
 * @throws {Error} When no candidate parses to an object with a boolean `pass`;
 *   the last underlying failure is attached as `cause`.
 */
function parseJudgeResponse(raw) {
  const candidates = [raw];
  for (const fence of raw.matchAll(/```(?:json)?\s*([\s\S]*?)```/g)) {
    candidates.push(fence[1]);
  }
  const firstBrace = raw.indexOf("{");
  const lastBrace = raw.lastIndexOf("}");
  if (firstBrace !== -1 && lastBrace > firstBrace) {
    candidates.push(raw.slice(firstBrace, lastBrace + 1));
  }

  let lastError;
  for (const candidate of candidates) {
    try {
      const parsed = JSON.parse(candidate);
      if (parsed !== null && typeof parsed === "object" && typeof parsed.pass === "boolean") {
        return {
          pass: parsed.pass,
          reasoning: typeof parsed.reasoning === "string" ? parsed.reasoning : "",
        };
      }
      lastError = new Error("missing pass field");
    }
    catch (err) {
      // Remember why this candidate failed and fall through to the next one.
      lastError = err;
    }
  }
  throw new Error(
    `Invalid judge output: expected JSON with { pass, reasoning }, got: ${raw.slice(0, 100)}`,
    { cause: lastError },
  );
}
40
+ //# sourceMappingURL=judge.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"judge.js","sourceRoot":"","sources":["../../src/eval/judge.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAY9E,MAAM,YAAY,GAAG,qNAAqN,CAAC;AAE3O,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,MAAc,EACd,SAAoB,EACpB,MAAiB;IAEjB,MAAM,UAAU,GAAG;EACnB,MAAM;;;EAGN,SAAS,CAAC,IAAI;;4GAE4F,CAAC;IAE3G,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,YAAY,EAAE,UAAU,CAAC,CAAC;IAE5D,MAAM,MAAM,GAAG,kBAAkB,CAAC,GAAG,CAAC,CAAC;IAEvC,OAAO;QACL,EAAE,EAAE,SAAS,CAAC,EAAE;QAChB,IAAI,EAAE,SAAS,CAAC,IAAI;QACpB,IAAI,EAAE,MAAM,CAAC,IAAI;QACjB,SAAS,EAAE,MAAM,CAAC,SAAS;KAC5B,CAAC;AACJ,CAAC;AAED,SAAS,kBAAkB,CAAC,GAAW;IACrC,4CAA4C;IAC5C,MAAM,UAAU,GAAG,GAAG,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;IACjE,MAAM,OAAO,GAAG,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;IAEjD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QACnC,IAAI,OAAO,MAAM,CAAC,IAAI,KAAK,SAAS,EAAE,CAAC;YACrC,MAAM,IAAI,KAAK,CAAC,oBAAoB,CAAC,CAAC;QACxC,CAAC;QACD,OAAO;YACL,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,SAAS,EAAE,OAAO,MAAM,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE;SACxE,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,sEAAsE,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAC1F,CAAC;IACJ,CAAC;AACH,CAAC"}
@@ -0,0 +1,5 @@
1
+ export interface LlmClient {
2
+ generate(systemPrompt: string, userPrompt: string): Promise<string>;
3
+ readonly model: string;
4
+ }
5
+ export declare function createLlmClient(): LlmClient;
@@ -0,0 +1,34 @@
1
+ // ---------------------------------------------------------------------------
2
+ // Anthropic LLM client wrapper for eval commands
3
+ // ---------------------------------------------------------------------------
4
+ import Anthropic from "@anthropic-ai/sdk";
5
// Model used when VSKILL_EVAL_MODEL is unset or empty.
const DEFAULT_MODEL = "claude-sonnet-4-20250514";

/**
 * Builds the LLM client used by the eval commands.
 *
 * Reads the API key from `ANTHROPIC_API_KEY` and the model name from
 * `VSKILL_EVAL_MODEL` (falling back to DEFAULT_MODEL).
 *
 * @returns {{ model: string, generate(systemPrompt: string, userPrompt: string): Promise<string> }}
 *   `generate` resolves to the text of the first text block in the response,
 *   or "" when the response contains none.
 * @throws {Error} When `ANTHROPIC_API_KEY` is not set.
 */
export function createLlmClient() {
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    throw new Error("ANTHROPIC_API_KEY is not set. Export it before running eval commands:\n export ANTHROPIC_API_KEY=sk-ant-...");
  }
  const anthropic = new Anthropic({ apiKey });
  const model = process.env.VSKILL_EVAL_MODEL || DEFAULT_MODEL;

  async function generate(systemPrompt, userPrompt) {
    // Abort the request if it has not completed after 120 seconds.
    const abort = new AbortController();
    const timer = setTimeout(() => abort.abort(), 120_000);
    try {
      const request = {
        model,
        system: systemPrompt,
        messages: [{ role: "user", content: userPrompt }],
        max_tokens: 4096,
      };
      const response = await anthropic.messages.create(request, { signal: abort.signal });
      const firstText = response.content.find((block) => block.type === "text");
      if (firstText && "text" in firstText) {
        return firstText.text;
      }
      return "";
    }
    finally {
      // Always release the timer so it cannot keep the process alive.
      clearTimeout(timer);
    }
  }

  return { model, generate };
}
34
+ //# sourceMappingURL=llm.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"llm.js","sourceRoot":"","sources":["../../src/eval/llm.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,iDAAiD;AACjD,8EAA8E;AAE9E,OAAO,SAAS,MAAM,mBAAmB,CAAC;AAE1C,MAAM,aAAa,GAAG,0BAA0B,CAAC;AAOjD,MAAM,UAAU,eAAe;IAC7B,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;IAC7C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CACb,8GAA8G,CAC/G,CAAC;IACJ,CAAC;IAED,MAAM,MAAM,GAAG,IAAI,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;IACzC,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,iBAAiB,IAAI,aAAa,CAAC;IAE7D,OAAO;QACL,KAAK;QACL,KAAK,CAAC,QAAQ,CAAC,YAAoB,EAAE,UAAkB;YACrD,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;YACzC,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,OAAO,CAAC,CAAC;YAC9D,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,MAAM,CAC3C;oBACE,KAAK;oBACL,MAAM,EAAE,YAAY;oBACpB,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE,CAAC;oBACjD,UAAU,EAAE,IAAI;iBACjB,EACD,EAAE,MAAM,EAAE,UAAU,CAAC,MAAM,EAAE,CAC9B,CAAC;gBAEF,MAAM,SAAS,GAAG,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,CAAC,CAAC;gBAClE,OAAO,SAAS,IAAI,MAAM,IAAI,SAAS,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC;YAChE,CAAC;oBAAS,CAAC;gBACT,YAAY,CAAC,OAAO,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;KACF,CAAC;AACJ,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { EvalsFile } from "./schema.js";
2
+ export declare function buildEvalInitPrompt(skillContent: string): string;
3
+ export declare function parseGeneratedEvals(raw: string): EvalsFile;
@@ -0,0 +1,155 @@
1
+ // ---------------------------------------------------------------------------
2
+ // Eval generation prompt assembly + response parsing
3
+ // ---------------------------------------------------------------------------
4
+ // ---------------------------------------------------------------------------
5
+ // Schema reference (embedded)
6
+ // ---------------------------------------------------------------------------
7
+ const SCHEMA_REFERENCE = `
8
+ ## evals.json Schema
9
+
10
+ The file MUST be valid JSON with this structure:
11
+
12
+ {
13
+ "skill_name": "<string, required - the skill identifier>",
14
+ "evals": [
15
+ {
16
+ "id": <number, required - unique integer per eval case>,
17
+ "name": "<string, required - descriptive name for this test case>",
18
+ "prompt": "<string, required - the user prompt to send to the LLM>",
19
+ "expected_output": "<string, required - description of what correct output looks like>",
20
+ "files": ["<optional array of file paths relevant to this eval>"],
21
+ "assertions": [
22
+ {
23
+ "id": "<string, required - unique within this eval case, e.g. 'assert-1'>",
24
+ "text": "<string, required - the assertion to verify against the output>",
25
+ "type": "boolean"
26
+ }
27
+ ]
28
+ }
29
+ ]
30
+ }
31
+
32
+ Every eval case MUST have at least 1 assertion. Assertion IDs must be unique within each eval case.
33
+ `;
34
+ // ---------------------------------------------------------------------------
35
+ // Example (embedded from social-media-posting)
36
+ // ---------------------------------------------------------------------------
37
+ const EXAMPLE_EVALS = `
38
+ ## Example: social-media-posting evals.json
39
+
40
+ {
41
+ "skill_name": "social-media-posting",
42
+ "evals": [
43
+ {
44
+ "id": 1,
45
+ "name": "LinkedIn announcement post",
46
+ "prompt": "Write a LinkedIn post announcing our new AI-powered analytics dashboard. Target audience: B2B SaaS founders. Tone: professional but excited. Include a call to action to sign up for the beta at analytics.example.com/beta",
47
+ "expected_output": "A professional LinkedIn post with product announcement, value proposition, and CTA",
48
+ "files": [],
49
+ "assertions": [
50
+ { "id": "assert-1", "text": "Post mentions the AI-powered analytics dashboard by name", "type": "boolean" },
51
+ { "id": "assert-2", "text": "Post includes the beta signup URL analytics.example.com/beta", "type": "boolean" },
52
+ { "id": "assert-3", "text": "Post uses a professional tone appropriate for B2B SaaS audience", "type": "boolean" },
53
+ { "id": "assert-4", "text": "Post includes a clear call to action", "type": "boolean" }
54
+ ]
55
+ }
56
+ ]
57
+ }
58
+ `;
59
+ // ---------------------------------------------------------------------------
60
+ // Best practices (embedded)
61
+ // ---------------------------------------------------------------------------
62
+ const BEST_PRACTICES = `
63
+ ## Best Practices for Eval Generation
64
+
65
+ 1. **Realistic prompts with substantive detail**: Include specific details like file paths, column names, audience types, or configuration values. Avoid generic prompts like "do something with this skill."
66
+
67
+ 2. **Objectively verifiable assertions**: Each assertion should be checkable by an LLM judge with a clear yes/no answer. "The output mentions X" is verifiable. "The output is good" is not.
68
+
69
+ 3. **Descriptive assertion names/IDs**: Assertion IDs should read clearly in a benchmark viewer. Use descriptive IDs like "mentions-file-path" or "includes-cta-url".
70
+
71
+ 4. **Skip assertions for purely subjective qualities**: Don't assert on tone, creativity, or style unless there's an objective proxy (e.g., "uses formal language" instead of "sounds professional").
72
+
73
+ 5. **Generate 2-3 test cases**: Each representing a different realistic usage scenario for this skill. Cover the primary use case and at least one edge case or variation.
74
+ `;
75
+ // ---------------------------------------------------------------------------
76
+ // Prompt builder
77
+ // ---------------------------------------------------------------------------
78
/**
 * Assembles the prompt that asks the LLM to generate an evals.json for a skill.
 *
 * The prompt is a sequence of sections separated by one blank line: the task
 * statement, the skill's SKILL.md content, the embedded schema reference,
 * example and best practices, and the final output instructions.
 *
 * @param {string} skillContent - Raw SKILL.md content of the skill.
 * @returns {string} The complete prompt text.
 */
export function buildEvalInitPrompt(skillContent) {
  const sections = [
    "You are an expert eval generator for AI skills. Your task is to create an evals.json file for the skill described below.",
    "## Skill Content (SKILL.md)",
    // Template literal keeps the original string coercion of the argument.
    `${skillContent}`,
    `${SCHEMA_REFERENCE}`,
    `${EXAMPLE_EVALS}`,
    `${BEST_PRACTICES}`,
    "## Instructions",
    "Generate a complete evals.json for this skill. Output ONLY the JSON inside a ```json code fence. Generate 2-3 eval cases with realistic, specific prompts and objectively verifiable assertions. Each case must have at least 2 assertions.",
  ];
  return sections.join("\n\n");
}
95
+ // ---------------------------------------------------------------------------
96
+ // Response parser
97
+ // ---------------------------------------------------------------------------
98
/**
 * Parses the LLM reply of the eval-generation prompt into a normalized
 * evals structure.
 *
 * The reply must contain a ``` / ```json code fence holding a JSON document.
 * After validation the result is normalized: `files` defaults to `[]` and an
 * assertion's `type` defaults to "boolean".
 *
 * @param {string} raw - Raw LLM response text.
 * @returns {{ skill_name: string, evals: Array<object> }} Normalized evals file content.
 * @throws {Error} "No JSON code block found ..." when no fence is present,
 *   "Invalid JSON in code block: ..." when the fenced text is not JSON, and
 *   "Invalid evals structure: ..." when required fields are missing.
 */
export function parseGeneratedEvals(raw) {
  // Extract JSON from ```json ... ``` code fence
  const match = raw.match(/```(?:json)?\s*\n([\s\S]*?)\n```/);
  if (!match) {
    throw new Error("No JSON code block found in LLM response. Expected ```json ... ``` fence.");
  }
  let parsed;
  try {
    parsed = JSON.parse(match[1]);
  }
  catch (err) {
    // Keep the original SyntaxError reachable for debugging.
    throw new Error(`Invalid JSON in code block: ${err.message}`, { cause: err });
  }
  // Validate before touching nested fields so malformed replies fail with a
  // descriptive error instead of a TypeError.
  validateParsedEvals(parsed);
  // Normalize
  return {
    skill_name: parsed.skill_name,
    evals: parsed.evals.map((e) => ({
      id: e.id,
      name: e.name,
      prompt: e.prompt,
      expected_output: e.expected_output,
      files: Array.isArray(e.files) ? e.files : [],
      assertions: e.assertions.map((a) => ({
        id: a.id,
        text: a.text,
        type: a.type || "boolean",
      })),
    })),
  };
}

/** True for non-null, non-array objects (i.e. a JSON object). */
function isJsonObject(value) {
  return value !== null && typeof value === "object" && !Array.isArray(value);
}

/**
 * Checks the structure of a parsed evals document and throws one error that
 * lists every problem found.
 *
 * @param {unknown} parsed - Value produced by JSON.parse.
 * @throws {Error} "Invalid evals structure: ..." describing all violations.
 */
function validateParsedEvals(parsed) {
  // JSON such as `null`, `42` or `[...]` has no fields to inspect.
  if (!isJsonObject(parsed)) {
    throw new Error("Invalid evals structure: expected a JSON object");
  }
  const errors = [];
  if (typeof parsed.skill_name !== "string" || !parsed.skill_name) {
    errors.push("missing skill_name");
  }
  if (!Array.isArray(parsed.evals) || parsed.evals.length === 0) {
    errors.push("missing or empty evals array");
  }
  if (Array.isArray(parsed.evals)) {
    for (let i = 0; i < parsed.evals.length; i++) {
      const e = parsed.evals[i];
      if (!isJsonObject(e)) {
        errors.push(`evals[${i}] must be an object`);
        continue;
      }
      // The on-disk validator requires a numeric id, so reject it here rather
      // than writing a file that fails validation later.
      if (typeof e.id !== "number") {
        errors.push(`evals[${i}] missing numeric id`);
      }
      if (!e.prompt) {
        errors.push(`evals[${i}] missing prompt`);
      }
      if (!e.name) {
        errors.push(`evals[${i}] missing name`);
      }
      if (!Array.isArray(e.assertions) || e.assertions.length === 0) {
        errors.push(`evals[${i}] missing or empty assertions`);
        continue;
      }
      for (let j = 0; j < e.assertions.length; j++) {
        const a = e.assertions[j];
        // The judge prompt is built from assertion.text, so it must be present.
        if (!isJsonObject(a) || typeof a.text !== "string" || !a.text) {
          errors.push(`evals[${i}].assertions[${j}] missing text`);
        }
      }
    }
  }
  if (errors.length > 0) {
    throw new Error(`Invalid evals structure: ${errors.join(", ")}`);
  }
}
155
+ //# sourceMappingURL=prompt-builder.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prompt-builder.js","sourceRoot":"","sources":["../../src/eval/prompt-builder.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,qDAAqD;AACrD,8EAA8E;AAI9E,8EAA8E;AAC9E,8BAA8B;AAC9B,8EAA8E;AAE9E,MAAM,gBAAgB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;CA0BxB,CAAC;AAEF,8EAA8E;AAC9E,+CAA+C;AAC/C,8EAA8E;AAE9E,MAAM,aAAa,GAAG;;;;;;;;;;;;;;;;;;;;;CAqBrB,CAAC;AAEF,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAE9E,MAAM,cAAc,GAAG;;;;;;;;;;;;CAYtB,CAAC;AAEF,8EAA8E;AAC9E,iBAAiB;AACjB,8EAA8E;AAE9E,MAAM,UAAU,mBAAmB,CAAC,YAAoB;IACtD,OAAO;;;;EAIP,YAAY;;EAEZ,gBAAgB;;EAEhB,aAAa;;EAEb,cAAc;;;;+OAI+N,CAAC;AAChP,CAAC;AAED,8EAA8E;AAC9E,kBAAkB;AAClB,8EAA8E;AAE9E,MAAM,UAAU,mBAAmB,CAAC,GAAW;IAC7C,+CAA+C;IAC/C,MAAM,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;IAC5D,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CACb,2EAA2E,CAC5E,CAAC;IACJ,CAAC;IAED,MAAM,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IACzB,IAAI,MAAW,CAAC;IAChB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IAC/B,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,MAAM,IAAI,KAAK,CACb,+BAAgC,GAAa,CAAC,OAAO,EAAE,CACxD,CAAC;IACJ,CAAC;IAED,0BAA0B;IAC1B,mBAAmB,CAAC,MAAM,CAAC,CAAC;IAE5B,YAAY;IACZ,OAAO;QACL,UAAU,EAAE,MAAM,CAAC,UAAU;QAC7B,KAAK,EAAE,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC;YACnC,EAAE,EAAE,CAAC,CAAC,EAAE;YACR,IAAI,EAAE,CAAC,CAAC,IAAI;YACZ,MAAM,EAAE,CAAC,CAAC,MAAM;YAChB,eAAe,EAAE,CAAC,CAAC,eAAe;YAClC,KAAK,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE;YAC5C,UAAU,EAAE,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC;gBACxC,EAAE,EAAE,CAAC,CAAC,EAAE;gBACR,IAAI,EAAE,CAAC,CAAC,IAAI;gBACZ,IAAI,EAAE,CAAC,CAAC,IAAI,IAAI,SAAS;aAC1B,CAAC,CAAC;SACJ,CAAC,CAAC;KACJ,CAAC;AACJ,CAAC;AAED,SAAS,mBAAmB,CAAC,MAAW;IACtC,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,IAAI,OAAO,MAAM,CAAC,UAAU,KAAK,QAAQ,IAAI,CAAC,MAAM,CAAC,UAAU,EAAE,CAAC;QAChE,MAAM,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;IACpC,CAAC;IACD,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC9D,MAAM,CAA
C,IAAI,CAAC,8BAA8B,CAAC,CAAC;IAC9C,CAAC;IAED,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC;QAChC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7C,MAAM,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;YAC1B,IAAI,CAAC,CAAC,CAAC,MAAM;gBAAE,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,kBAAkB,CAAC,CAAC;YACzD,IAAI,CAAC,CAAC,CAAC,IAAI;gBAAE,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,gBAAgB,CAAC,CAAC;YACrD,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC9D,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,+BAA+B,CAAC,CAAC;YACzD,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtB,MAAM,IAAI,KAAK,CAAC,4BAA4B,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACnE,CAAC;AACH,CAAC"}
@@ -0,0 +1,26 @@
1
+ export interface Assertion {
2
+ id: string;
3
+ text: string;
4
+ type: "boolean";
5
+ }
6
+ export interface EvalCase {
7
+ id: number;
8
+ name: string;
9
+ prompt: string;
10
+ expected_output: string;
11
+ files: string[];
12
+ assertions: Assertion[];
13
+ }
14
+ export interface EvalsFile {
15
+ skill_name: string;
16
+ evals: EvalCase[];
17
+ }
18
+ export interface ValidationError {
19
+ path: string;
20
+ message: string;
21
+ }
22
+ export declare class EvalValidationError extends Error {
23
+ errors: ValidationError[];
24
+ constructor(errors: ValidationError[]);
25
+ }
26
+ export declare function loadAndValidateEvals(skillDir: string): EvalsFile;
@@ -0,0 +1,128 @@
1
+ // ---------------------------------------------------------------------------
2
+ // evals.json schema validation
3
+ // ---------------------------------------------------------------------------
4
+ import { readFileSync, existsSync } from "node:fs";
5
+ import { join } from "node:path";
6
/**
 * Error raised when `evals/evals.json` is missing, cannot be parsed, or fails
 * schema validation.
 *
 * Every problem found is kept on `errors` as a `{ path, message }` entry, and
 * the error message is those entries rendered as `path: message`, joined
 * with "; ".
 */
export class EvalValidationError extends Error {
    errors;
    /**
     * @param {{ path: string, message: string }[]} errors - Problems to report.
     */
    constructor(errors) {
        const msg = errors.map((e) => `${e.path}: ${e.message}`).join("; ");
        super(`Eval validation failed: ${msg}`);
        this.name = "EvalValidationError";
        this.errors = errors;
    }
}
// ---------------------------------------------------------------------------
// Validator
// ---------------------------------------------------------------------------
/**
 * Returns `value` unchanged when it is a non-null object, and an empty object
 * otherwise.
 *
 * JSON allows `null` anywhere a value may appear. Reading a property of
 * `null` throws a TypeError, so every untrusted node is passed through this
 * before its fields are inspected; a `null` node then fails the same
 * "required field" checks that a number or string in that position fails.
 */
function asRecord(value) {
    return value !== null && typeof value === "object" ? value : {};
}
/** True only for strings that contain at least one character. */
function isNonEmptyString(value) {
    return typeof value === "string" && value !== "";
}
/**
 * Validates one entry of the `evals` array and appends any problems to
 * `errors`.
 *
 * @param {unknown} rawCase - The entry exactly as parsed from JSON.
 * @param {string} prefix - Path prefix for reported errors, e.g. `evals[0]`.
 * @param {{ path: string, message: string }[]} errors - Collector, mutated in place.
 */
function validateEvalCase(rawCase, prefix, errors) {
    const evalCase = asRecord(rawCase);
    if (typeof evalCase.id !== "number") {
        errors.push({ path: `${prefix}.id`, message: "required number field" });
    }
    for (const field of ["name", "prompt", "expected_output"]) {
        if (!isNonEmptyString(evalCase[field])) {
            errors.push({
                path: `${prefix}.${field}`,
                message: "required string field",
            });
        }
    }
    if (!Array.isArray(evalCase.assertions)) {
        errors.push({
            path: `${prefix}.assertions`,
            message: "required array field",
        });
        return;
    }
    if (evalCase.assertions.length === 0) {
        errors.push({
            path: `${prefix}.assertions`,
            message: "must have at least 1 assertion",
        });
        return;
    }
    // Assertion IDs only have to be unique within their own eval case.
    const seenIds = new Set();
    for (const rawAssertion of evalCase.assertions) {
        const assertion = asRecord(rawAssertion);
        if (!isNonEmptyString(assertion.id)) {
            errors.push({
                path: `${prefix}.assertions[].id`,
                message: "required string field",
            });
        }
        if (!isNonEmptyString(assertion.text)) {
            errors.push({
                path: `${prefix}.assertions[].text`,
                message: "required string field",
            });
        }
        if (assertion.id && seenIds.has(assertion.id)) {
            errors.push({
                path: `${prefix}.assertions`,
                message: `duplicate assertion ID: ${assertion.id}`,
            });
        }
        seenIds.add(assertion.id);
    }
}
/**
 * Loads `<skillDir>/evals/evals.json`, validates its structure, and returns a
 * normalized copy.
 *
 * Normalization keeps only the known fields, sets `files` to [] when it is
 * not an array, and sets an assertion's `type` to "boolean" when it is
 * missing.
 *
 * @param {string} skillDir - Directory that contains the `evals/` folder.
 * @returns {{ skill_name: string, evals: object[] }} The normalized file.
 * @throws {EvalValidationError} When the file does not exist, is not valid
 *   JSON, or fails validation. A `null` document, eval case or assertion is
 *   reported this way too rather than surfacing as a TypeError.
 */
export function loadAndValidateEvals(skillDir) {
    const filePath = join(skillDir, "evals", "evals.json");
    if (!existsSync(filePath)) {
        throw new EvalValidationError([
            { path: filePath, message: "No evals.json found" },
        ]);
    }
    const raw = readFileSync(filePath, "utf-8");
    let parsed;
    try {
        parsed = JSON.parse(raw);
    }
    catch (err) {
        throw new EvalValidationError([
            {
                path: filePath,
                message: `JSON parse error: ${err.message}`,
            },
        ]);
    }
    // A document consisting of just `null` parses successfully; guard it so
    // the field checks below report it instead of crashing.
    const root = asRecord(parsed);
    const errors = [];
    if (!isNonEmptyString(root.skill_name)) {
        errors.push({ path: "skill_name", message: "required string field" });
    }
    if (!Array.isArray(root.evals)) {
        // Nothing further can be checked without the array, so fail right away.
        errors.push({ path: "evals", message: "required array field" });
        throw new EvalValidationError(errors);
    }
    for (const [index, rawCase] of root.evals.entries()) {
        validateEvalCase(rawCase, `evals[${index}]`, errors);
    }
    if (errors.length > 0) {
        throw new EvalValidationError(errors);
    }
    // Normalize: default files to []
    const evals = root.evals.map((e) => ({
        id: e.id,
        name: e.name,
        prompt: e.prompt,
        expected_output: e.expected_output,
        files: Array.isArray(e.files) ? e.files : [],
        assertions: e.assertions.map((a) => ({
            id: a.id,
            text: a.text,
            type: a.type || "boolean",
        })),
    }));
    return { skill_name: root.skill_name, evals };
}
128
+ //# sourceMappingURL=schema.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"schema.js","sourceRoot":"","sources":["../../src/eval/schema.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,+BAA+B;AAC/B,8EAA8E;AAE9E,OAAO,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AA+BjC,MAAM,OAAO,mBAAoB,SAAQ,KAAK;IAC5C,MAAM,CAAoB;IAE1B,YAAY,MAAyB;QACnC,MAAM,GAAG,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACpE,KAAK,CAAC,2BAA2B,GAAG,EAAE,CAAC,CAAC;QACxC,IAAI,CAAC,IAAI,GAAG,qBAAqB,CAAC;QAClC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACvB,CAAC;CACF;AAED,8EAA8E;AAC9E,YAAY;AACZ,8EAA8E;AAE9E,MAAM,UAAU,oBAAoB,CAAC,QAAgB;IACnD,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC;IAEvD,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,mBAAmB,CAAC;YAC5B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,qBAAqB,EAAE;SACnD,CAAC,CAAC;IACL,CAAC;IAED,MAAM,GAAG,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAE5C,IAAI,MAAW,CAAC;IAChB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAC3B,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,MAAM,SAAS,GAAG,GAAkB,CAAC;QACrC,MAAM,IAAI,mBAAmB,CAAC;YAC5B;gBACE,IAAI,EAAE,QAAQ;gBACd,OAAO,EAAE,qBAAqB,SAAS,CAAC,OAAO,EAAE;aAClD;SACF,CAAC,CAAC;IACL,CAAC;IAED,MAAM,MAAM,GAAsB,EAAE,CAAC;IAErC,IAAI,OAAO,MAAM,CAAC,UAAU,KAAK,QAAQ,IAAI,CAAC,MAAM,CAAC,UAAU,EAAE,CAAC;QAChE,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,YAAY,EAAE,OAAO,EAAE,uBAAuB,EAAE,CAAC,CAAC;IACxE,CAAC;IAED,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC;QACjC,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,OAAO,EAAE,sBAAsB,EAAE,CAAC,CAAC;QAChE,MAAM,IAAI,mBAAmB,CAAC,MAAM,CAAC,CAAC;IACxC,CAAC;IAED,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC7C,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACjC,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC;QAE7B,IAAI,OAAO,QAAQ,CAAC,EAAE,KAAK,QAAQ,EAAE,CAAC;YACpC,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,GAAG,MAAM,KAAK,EAAE,OAAO,EAAE,uBAAuB,EAAE,CAAC,CAAC;QAC1E,CAAC;QACD,IAAI,OAAO,QAAQ,CAAC,IAAI,KAAK,
QAAQ,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;YACxD,MAAM,CAAC,IAAI,CAAC;gBACV,IAAI,EAAE,GAAG,MAAM,OAAO;gBACtB,OAAO,EAAE,uBAAuB;aACjC,CAAC,CAAC;QACL,CAAC;QACD,IAAI,OAAO,QAAQ,CAAC,MAAM,KAAK,QAAQ,IAAI,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC;YAC5D,MAAM,CAAC,IAAI,CAAC;gBACV,IAAI,EAAE,GAAG,MAAM,SAAS;gBACxB,OAAO,EAAE,uBAAuB;aACjC,CAAC,CAAC;QACL,CAAC;QACD,IACE,OAAO,QAAQ,CAAC,eAAe,KAAK,QAAQ;YAC5C,CAAC,QAAQ,CAAC,eAAe,EACzB,CAAC;YACD,MAAM,CAAC,IAAI,CAAC;gBACV,IAAI,EAAE,GAAG,MAAM,kBAAkB;gBACjC,OAAO,EAAE,uBAAuB;aACjC,CAAC,CAAC;QACL,CAAC;QAED,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;YACxC,MAAM,CAAC,IAAI,CAAC;gBACV,IAAI,EAAE,GAAG,MAAM,aAAa;gBAC5B,OAAO,EAAE,sBAAsB;aAChC,CAAC,CAAC;YACH,SAAS;QACX,CAAC;QAED,IAAI,QAAQ,CAAC,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACrC,MAAM,CAAC,IAAI,CAAC;gBACV,IAAI,EAAE,GAAG,MAAM,aAAa;gBAC5B,OAAO,EAAE,gCAAgC;aAC1C,CAAC,CAAC;YACH,SAAS;QACX,CAAC;QAED,oCAAoC;QACpC,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;QAClC,KAAK,MAAM,SAAS,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC;YAC5C,IAAI,OAAO,SAAS,CAAC,EAAE,KAAK,QAAQ,IAAI,CAAC,SAAS,CAAC,EAAE,EAAE,CAAC;gBACtD,MAAM,CAAC,IAAI,CAAC;oBACV,IAAI,EAAE,GAAG,MAAM,kBAAkB;oBACjC,OAAO,EAAE,uBAAuB;iBACjC,CAAC,CAAC;YACL,CAAC;YACD,IAAI,OAAO,SAAS,CAAC,IAAI,KAAK,QAAQ,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;gBAC1D,MAAM,CAAC,IAAI,CAAC;oBACV,IAAI,EAAE,GAAG,MAAM,oBAAoB;oBACnC,OAAO,EAAE,uBAAuB;iBACjC,CAAC,CAAC;YACL,CAAC;YACD,IAAI,SAAS,CAAC,EAAE,IAAI,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,CAAC,EAAE,CAAC;gBAC9C,MAAM,CAAC,IAAI,CAAC;oBACV,IAAI,EAAE,GAAG,MAAM,aAAa;oBAC5B,OAAO,EAAE,2BAA2B,SAAS,CAAC,EAAE,EAAE;iBACnD,CAAC,CAAC;YACL,CAAC;YACD,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC;QAC5B,CAAC;IACH,CAAC;IAED,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtB,MAAM,IAAI,mBAAmB,CAAC,MAAM,CAAC,CAAC;IACxC,CAAC;IAED,iCAAiC;IACjC,MAAM,KAAK,GAAe,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC;QACtD,EAAE,EAAE,CAAC,CAAC,EAAE;QACR,IAAI,EAAE,CAAC,CAAC,IAAI;QACZ,MAAM,EAAE,CAAC,CAAC,MAAM;QAChB,eAAe,EAAE,CAAC,CAAC,eAAe;QAClC,KAAK,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,
CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE;QAC5C,UAAU,EAAE,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC;YACxC,EAAE,EAAE,CAAC,CAAC,EAAE;YACR,IAAI,EAAE,CAAC,CAAC,IAAI;YACZ,IAAI,EAAE,CAAC,CAAC,IAAI,IAAI,SAAS;SAC1B,CAAC,CAAC;KACJ,CAAC,CAAC,CAAC;IAEJ,OAAO,EAAE,UAAU,EAAE,MAAM,CAAC,UAAU,EAAE,KAAK,EAAE,CAAC;AAClD,CAAC"}
@@ -0,0 +1,8 @@
1
/**
 * One skill found by `scanSkills`.
 *
 * `plugin` and `skill` are the directory names under the scan root, and `dir`
 * is the joined path `<root>/<plugin>/skills/<skill>`. `hasEvals` and
 * `hasBenchmark` record whether `evals/evals.json` and
 * `evals/benchmark.json` exist inside that directory.
 */
+ export interface SkillInfo {
2
+ plugin: string;
3
+ skill: string;
4
+ dir: string;
5
+ hasEvals: boolean;
6
+ hasBenchmark: boolean;
7
+ }
8
/**
 * Scans `<root>/<plugin>/skills/<skill>/` directories and resolves with one
 * `SkillInfo` per skill directory that contains a SKILL.md file. Resolves
 * with an empty array when `root` does not exist or cannot be read.
 */
+ export declare function scanSkills(root: string): Promise<SkillInfo[]>;
@@ -0,0 +1,44 @@
1
+ // ---------------------------------------------------------------------------
2
+ // Filesystem scanner for plugin skills
3
+ // ---------------------------------------------------------------------------
4
+ import { readdirSync, existsSync } from "node:fs";
5
+ import { join } from "node:path";
6
/**
 * Finds every skill under `<root>/<plugin>/skills/<skill>/`.
 *
 * A directory counts as a skill only when it contains a SKILL.md file. For
 * each one, the result records whether `evals/evals.json` and
 * `evals/benchmark.json` exist alongside it.
 *
 * The scan is best-effort: a missing root, or a directory that cannot be
 * listed, contributes no entries instead of raising an error.
 *
 * @param {string} root - Directory whose immediate subdirectories are plugins.
 * @returns {Promise<{ plugin: string, skill: string, dir: string, hasEvals: boolean, hasBenchmark: boolean }[]>}
 */
export async function scanSkills(root) {
    // Names of the immediate subdirectories of `dir`; throws if `dir` is unreadable.
    const listSubdirectories = (dir) => {
        const names = [];
        for (const entry of readdirSync(dir, { withFileTypes: true })) {
            if (entry.isDirectory()) {
                names.push(entry.name);
            }
        }
        return names;
    };
    const found = [];
    if (!existsSync(root)) {
        return found;
    }
    let pluginNames;
    try {
        pluginNames = listSubdirectories(root);
    }
    catch {
        return found;
    }
    for (const plugin of pluginNames) {
        const skillsDir = join(root, plugin, "skills");
        if (!existsSync(skillsDir)) {
            continue;
        }
        let skillNames;
        try {
            skillNames = listSubdirectories(skillsDir);
        }
        catch {
            // Unreadable skills folder: skip this plugin, keep scanning the rest.
            continue;
        }
        for (const skill of skillNames) {
            const dir = join(skillsDir, skill);
            if (!existsSync(join(dir, "SKILL.md"))) {
                continue;
            }
            found.push({
                plugin,
                skill,
                dir,
                hasEvals: existsSync(join(dir, "evals", "evals.json")),
                hasBenchmark: existsSync(join(dir, "evals", "benchmark.json")),
            });
        }
    }
    return found;
}
44
+ //# sourceMappingURL=skill-scanner.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"skill-scanner.js","sourceRoot":"","sources":["../../src/eval/skill-scanner.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,uCAAuC;AACvC,8EAA8E;AAE9E,OAAO,EAAE,WAAW,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AAClD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAUjC,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,IAAY;IAC3C,MAAM,MAAM,GAAgB,EAAE,CAAC;IAE/B,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC;QAAE,OAAO,MAAM,CAAC;IAErC,IAAI,OAAiB,CAAC;IACtB,IAAI,CAAC;QACH,OAAO,GAAG,WAAW,CAAC,IAAI,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC;aACjD,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC;aAC9B,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IACxB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC;QAC/C,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;YAAE,SAAS;QAErC,IAAI,SAAmB,CAAC;QACxB,IAAI,CAAC;YACH,SAAS,GAAG,WAAW,CAAC,SAAS,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC;iBACxD,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC;iBAC9B,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QACxB,CAAC;QAAC,MAAM,CAAC;YACP,SAAS;QACX,CAAC;QAED,KAAK,MAAM,KAAK,IAAI,SAAS,EAAE,CAAC;YAC9B,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,EAAE,KAAK,CAAC,CAAC;YACxC,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;YAE3C,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC;gBAAE,SAAS;YAEnC,MAAM,QAAQ,GAAG,UAAU,CAAC,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC,CAAC;YACnE,MAAM,YAAY,GAAG,UAAU,CAC7B,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,gBAAgB,CAAC,CAC1C,CAAC;YAEF,MAAM,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,KAAK,EAAE,GAAG,EAAE,QAAQ,EAAE,QAAQ,EAAE,YAAY,EAAE,CAAC,CAAC;QACxE,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
package/dist/index.js CHANGED
@@ -131,5 +131,14 @@ program
131
131
  const { blocklistCommand } = await import("./commands/blocklist.js");
132
132
  await blocklistCommand(subcommand || "list", name);
133
133
  });
134
// Register the `eval` command. Its handler module is imported only when the
// command actually runs, and an omitted subcommand falls back to "coverage".
program
    .command("eval [subcommand] [target]")
    .description("Eval commands: init, run, coverage, generate-all")
    .option("--force", "Overwrite existing evals.json")
    .option("--root <path>", "Root directory for skill plugins (default: plugins/)")
    .action(async (subcommand, target, opts) => {
        const { evalCommand: runEval } = await import("./commands/eval.js");
        const selected = subcommand || "coverage";
        await runEval(selected, target, opts);
    });
134
143
  program.parse();
135
144
  //# sourceMappingURL=index.js.map