vskill 0.2.55 → 0.2.56

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/README.md +6 -3
  2. package/dist/commands/eval/__tests__/run.test.js +7 -2
  3. package/dist/commands/eval/__tests__/run.test.js.map +1 -1
  4. package/dist/commands/eval/run.js +24 -3
  5. package/dist/commands/eval/run.js.map +1 -1
  6. package/dist/commands/eval/serve.d.ts +1 -0
  7. package/dist/commands/eval/serve.js +51 -0
  8. package/dist/commands/eval/serve.js.map +1 -0
  9. package/dist/commands/eval.d.ts +1 -0
  10. package/dist/commands/eval.js +6 -1
  11. package/dist/commands/eval.js.map +1 -1
  12. package/dist/eval/__tests__/activation-tester.test.d.ts +1 -0
  13. package/dist/eval/__tests__/activation-tester.test.js +94 -0
  14. package/dist/eval/__tests__/activation-tester.test.js.map +1 -0
  15. package/dist/eval/__tests__/benchmark-history.test.d.ts +1 -0
  16. package/dist/eval/__tests__/benchmark-history.test.js +200 -0
  17. package/dist/eval/__tests__/benchmark-history.test.js.map +1 -0
  18. package/dist/eval/__tests__/comparator.test.d.ts +1 -0
  19. package/dist/eval/__tests__/comparator.test.js +136 -0
  20. package/dist/eval/__tests__/comparator.test.js.map +1 -0
  21. package/dist/eval/__tests__/llm.test.js +161 -44
  22. package/dist/eval/__tests__/llm.test.js.map +1 -1
  23. package/dist/eval/__tests__/verdict.test.d.ts +1 -0
  24. package/dist/eval/__tests__/verdict.test.js +47 -0
  25. package/dist/eval/__tests__/verdict.test.js.map +1 -0
  26. package/dist/eval/activation-tester.d.ts +25 -0
  27. package/dist/eval/activation-tester.js +89 -0
  28. package/dist/eval/activation-tester.js.map +1 -0
  29. package/dist/eval/benchmark-history.d.ts +23 -0
  30. package/dist/eval/benchmark-history.js +108 -0
  31. package/dist/eval/benchmark-history.js.map +1 -0
  32. package/dist/eval/comparator.d.ts +29 -0
  33. package/dist/eval/comparator.js +100 -0
  34. package/dist/eval/comparator.js.map +1 -0
  35. package/dist/eval/llm.js +119 -6
  36. package/dist/eval/llm.js.map +1 -1
  37. package/dist/eval/verdict.d.ts +3 -0
  38. package/dist/eval/verdict.js +28 -0
  39. package/dist/eval/verdict.js.map +1 -0
  40. package/dist/eval-server/api-routes.d.ts +2 -0
  41. package/dist/eval-server/api-routes.js +425 -0
  42. package/dist/eval-server/api-routes.js.map +1 -0
  43. package/dist/eval-server/eval-server.d.ts +6 -0
  44. package/dist/eval-server/eval-server.js +102 -0
  45. package/dist/eval-server/eval-server.js.map +1 -0
  46. package/dist/eval-server/router.d.ts +14 -0
  47. package/dist/eval-server/router.js +117 -0
  48. package/dist/eval-server/router.js.map +1 -0
  49. package/dist/eval-server/sse-helpers.d.ts +4 -0
  50. package/dist/eval-server/sse-helpers.js +24 -0
  51. package/dist/eval-server/sse-helpers.js.map +1 -0
  52. package/dist/eval-ui/assets/index-BYpLv_X1.css +1 -0
  53. package/dist/eval-ui/assets/index-Od6Ch9-a.js +70 -0
  54. package/dist/eval-ui/index.html +13 -0
  55. package/dist/index.js +2 -1
  56. package/dist/index.js.map +1 -1
  57. package/package.json +15 -2
@@ -0,0 +1,136 @@
1
+ import { describe, it, expect, vi } from "vitest";
2
+ import { generateComparisonOutputs, scoreComparison, runComparison, } from "../comparator.js";
3
+ function mockClient(responses) {
4
+ let callIndex = 0;
5
+ return {
6
+ model: "test-model",
7
+ generate: vi.fn(async () => responses[callIndex++] ?? ""),
8
+ };
9
+ }
10
+ describe("generateComparisonOutputs", () => {
11
+ it("generates skill and baseline outputs sequentially", async () => {
12
+ const client = mockClient(["skill response", "baseline response"]);
13
+ const result = await generateComparisonOutputs("test prompt", "# Skill Content", client);
14
+ expect(result.skillOutput).toBe("skill response");
15
+ expect(result.baselineOutput).toBe("baseline response");
16
+ expect(result.skillDurationMs).toBeGreaterThanOrEqual(0);
17
+ expect(result.baselineDurationMs).toBeGreaterThanOrEqual(0);
18
+ expect(client.generate).toHaveBeenCalledTimes(2);
19
+ // First call should include skill content
20
+ const firstCall = client.generate.mock.calls[0];
21
+ expect(firstCall[0]).toContain("Skill Content");
22
+ // Second call should be generic
23
+ const secondCall = client.generate.mock.calls[1];
24
+ expect(secondCall[0]).toContain("helpful AI assistant");
25
+ });
26
+ });
27
+ describe("scoreComparison", () => {
28
+ it("parses JSON scores from LLM response", async () => {
29
+ const client = mockClient([
30
+ JSON.stringify({
31
+ content_score_a: 4,
32
+ structure_score_a: 3,
33
+ content_score_b: 5,
34
+ structure_score_b: 4,
35
+ winner: "second",
36
+ reasoning: "B is better",
37
+ }),
38
+ ]);
39
+ const result = await scoreComparison("output A", "output B", "prompt", client);
40
+ expect(result.contentScoreA).toBe(4);
41
+ expect(result.structureScoreA).toBe(3);
42
+ expect(result.contentScoreB).toBe(5);
43
+ expect(result.structureScoreB).toBe(4);
44
+ expect(result.winner).toBe("second");
45
+ });
46
+ it("parses JSON from code fence", async () => {
47
+ const client = mockClient([
48
+ '```json\n{"content_score_a": 3, "structure_score_a": 3, "content_score_b": 3, "structure_score_b": 3, "winner": "tie"}\n```',
49
+ ]);
50
+ const result = await scoreComparison("A", "B", "p", client);
51
+ expect(result.winner).toBe("tie");
52
+ expect(result.contentScoreA).toBe(3);
53
+ });
54
+ it("clamps scores to 1-5 range", async () => {
55
+ const client = mockClient([
56
+ JSON.stringify({
57
+ content_score_a: 0,
58
+ structure_score_a: 10,
59
+ content_score_b: -1,
60
+ structure_score_b: 6,
61
+ winner: "first",
62
+ }),
63
+ ]);
64
+ const result = await scoreComparison("A", "B", "p", client);
65
+ expect(result.contentScoreA).toBe(1);
66
+ expect(result.structureScoreA).toBe(5);
67
+ expect(result.contentScoreB).toBe(1);
68
+ expect(result.structureScoreB).toBe(5);
69
+ });
70
+ it("defaults invalid winner to tie", async () => {
71
+ const client = mockClient([
72
+ JSON.stringify({
73
+ content_score_a: 3,
74
+ structure_score_a: 3,
75
+ content_score_b: 3,
76
+ structure_score_b: 3,
77
+ winner: "invalid",
78
+ }),
79
+ ]);
80
+ const result = await scoreComparison("A", "B", "p", client);
81
+ expect(result.winner).toBe("tie");
82
+ });
83
+ });
84
+ describe("runComparison", () => {
85
+ it("maps scores back to skill/baseline correctly", async () => {
86
+ // Mock: first two calls = skill + baseline outputs, third = scoring
87
+ const client = mockClient([
88
+ "skill output here",
89
+ "baseline output here",
90
+ JSON.stringify({
91
+ content_score_a: 4,
92
+ structure_score_a: 5,
93
+ content_score_b: 2,
94
+ structure_score_b: 3,
95
+ winner: "first",
96
+ reasoning: "A is better",
97
+ }),
98
+ ]);
99
+ // Fix randomness for deterministic test
100
+ vi.spyOn(Math, "random").mockReturnValue(0.3); // < 0.5 → skill is A
101
+ const result = await runComparison("test prompt", "skill content", client);
102
+ expect(result.prompt).toBe("test prompt");
103
+ expect(result.skillOutput).toBe("skill output here");
104
+ expect(result.baselineOutput).toBe("baseline output here");
105
+ // skill is A, so scores map directly
106
+ expect(result.skillContentScore).toBe(4);
107
+ expect(result.skillStructureScore).toBe(5);
108
+ expect(result.baselineContentScore).toBe(2);
109
+ expect(result.baselineStructureScore).toBe(3);
110
+ expect(result.winner).toBe("skill");
111
+ vi.restoreAllMocks();
112
+ });
113
+ it("maps scores correctly when baseline is A", async () => {
114
+ const client = mockClient([
115
+ "skill out",
116
+ "baseline out",
117
+ JSON.stringify({
118
+ content_score_a: 2,
119
+ structure_score_a: 2,
120
+ content_score_b: 4,
121
+ structure_score_b: 4,
122
+ winner: "second",
123
+ }),
124
+ ]);
125
+ // > 0.5 → skill is B
126
+ vi.spyOn(Math, "random").mockReturnValue(0.7);
127
+ const result = await runComparison("p", "s", client);
128
+ // skill is B → scores.contentScoreB is skill
129
+ expect(result.skillContentScore).toBe(4);
130
+ expect(result.baselineContentScore).toBe(2);
131
+ // winner "second" = B = skill
132
+ expect(result.winner).toBe("skill");
133
+ vi.restoreAllMocks();
134
+ });
135
+ });
136
+ //# sourceMappingURL=comparator.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"comparator.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/comparator.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAClD,OAAO,EACL,yBAAyB,EACzB,eAAe,EACf,aAAa,GACd,MAAM,kBAAkB,CAAC;AAG1B,SAAS,UAAU,CAAC,SAAmB;IACrC,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,OAAO;QACL,KAAK,EAAE,YAAY;QACnB,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC,KAAK,IAAI,EAAE,CAAC,SAAS,CAAC,SAAS,EAAE,CAAC,IAAI,EAAE,CAAC;KAC1D,CAAC;AACJ,CAAC;AAED,QAAQ,CAAC,2BAA2B,EAAE,GAAG,EAAE;IACzC,EAAE,CAAC,mDAAmD,EAAE,KAAK,IAAI,EAAE;QACjE,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,gBAAgB,EAAE,mBAAmB,CAAC,CAAC,CAAC;QACnE,MAAM,MAAM,GAAG,MAAM,yBAAyB,CAAC,aAAa,EAAE,iBAAiB,EAAE,MAAM,CAAC,CAAC;QAEzF,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;QAClD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QACxD,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QACzD,MAAM,CAAC,MAAM,CAAC,kBAAkB,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,qBAAqB,CAAC,CAAC,CAAC,CAAC;QAEjD,0CAA0C;QAC1C,MAAM,SAAS,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACzD,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;QAEhD,gCAAgC;QAChC,MAAM,UAAU,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC1D,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,sBAAsB,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;QACpD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,QAAQ;gBAChB,SAAS,EAAE,aAAa;aACzB,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,UAAU,EAAE,UAAU,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC;QAC/E,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,M
AAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6BAA6B,EAAE,KAAK,IAAI,EAAE;QAC3C,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,6HAA6H;SAC9H,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAClC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,4BAA4B,EAAE,KAAK,IAAI,EAAE;QAC1C,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,EAAE;gBACrB,eAAe,EAAE,CAAC,CAAC;gBACnB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,OAAO;aAChB,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACzC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gCAAgC,EAAE,KAAK,IAAI,EAAE;QAC9C,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,SAAS;aAClB,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACpC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,eAAe,EAAE,GAAG,EAAE;IAC7B,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;QAC5D,oEAAoE;QACpE,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,mBAAmB;YACnB,sBAAsB;YACtB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,OAAO;gBACf,SAAS,EAAE,aAAa;aACzB,CAAC;SACH,CAAC,CAAC;QAEH,wCAAwC;QACxC,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,qBAAqB;QAEpE,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,aAAa,EAAE,eAAe,E
AAE,MAAM,CAAC,CAAC;QAE3E,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC1C,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QACrD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,sBAAsB,CAAC,CAAC;QAC3D,qCAAqC;QACrC,MAAM,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACzC,MAAM,CAAC,MAAM,CAAC,mBAAmB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3C,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,sBAAsB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC9C,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAEpC,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;QACxD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,WAAW;YACX,cAAc;YACd,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,QAAQ;aACjB,CAAC;SACH,CAAC,CAAC;QAEH,qBAAqB;QACrB,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QAE9C,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QACrD,6CAA6C;QAC7C,MAAM,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACzC,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC5C,8BAA8B;QAC9B,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAEpC,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -3,11 +3,18 @@ import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
3
3
  // Mocks
4
4
  // ---------------------------------------------------------------------------
5
5
  const mockCreate = vi.hoisted(() => vi.fn());
6
+ const mockExecFile = vi.hoisted(() => vi.fn());
6
7
  vi.mock("@anthropic-ai/sdk", () => ({
7
8
  default: class MockAnthropic {
8
9
  messages = { create: mockCreate };
9
10
  },
10
11
  }));
12
+ vi.mock("node:child_process", () => ({
13
+ execFile: mockExecFile,
14
+ }));
15
+ vi.mock("node:util", () => ({
16
+ promisify: (fn) => fn,
17
+ }));
11
18
  // ---------------------------------------------------------------------------
12
19
  // Import module under test AFTER mocks
13
20
  // ---------------------------------------------------------------------------
@@ -19,67 +26,177 @@ describe("createLlmClient", () => {
19
26
  const origEnv = { ...process.env };
20
27
  beforeEach(() => {
21
28
  vi.resetAllMocks();
22
- process.env.ANTHROPIC_API_KEY = "test-key";
29
+ delete process.env.VSKILL_EVAL_PROVIDER;
23
30
  delete process.env.VSKILL_EVAL_MODEL;
31
+ delete process.env.ANTHROPIC_API_KEY;
32
+ delete process.env.OLLAMA_BASE_URL;
33
+ delete process.env.CLAUDECODE;
24
34
  });
25
35
  afterEach(() => {
26
36
  process.env = { ...origEnv };
27
37
  });
28
- it("returns text content on successful generate call", async () => {
29
- mockCreate.mockResolvedValue({
30
- content: [{ type: "text", text: "Generated response" }],
31
- });
38
+ // -------------------------------------------------------------------------
39
+ // Auto-detection
40
+ // -------------------------------------------------------------------------
41
+ it("defaults to claude-cli from a plain terminal", () => {
32
42
  const client = createLlmClient();
33
- const result = await client.generate("system prompt", "user prompt");
34
- expect(result).toBe("Generated response");
35
- expect(mockCreate).toHaveBeenCalledOnce();
43
+ expect(client.model).toBe("claude-sonnet");
36
44
  });
37
- it("uses default model claude-sonnet-4-20250514 when env not set", async () => {
38
- mockCreate.mockResolvedValue({
39
- content: [{ type: "text", text: "ok" }],
40
- });
45
+ it("auto-detects ollama inside Claude Code session", () => {
46
+ process.env.CLAUDECODE = "1";
41
47
  const client = createLlmClient();
42
- await client.generate("sys", "usr");
43
- expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ model: "claude-sonnet-4-20250514" }), expect.anything());
48
+ expect(client.model).toBe("llama3.1:8b");
44
49
  });
45
- it("uses custom model from VSKILL_EVAL_MODEL env var", async () => {
46
- process.env.VSKILL_EVAL_MODEL = "claude-opus-4-20250514";
47
- mockCreate.mockResolvedValue({
48
- content: [{ type: "text", text: "ok" }],
49
- });
50
+ it("auto-detects anthropic when ANTHROPIC_API_KEY is set", () => {
51
+ process.env.ANTHROPIC_API_KEY = "test-key";
50
52
  const client = createLlmClient();
51
- await client.generate("sys", "usr");
52
- expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ model: "claude-opus-4-20250514" }), expect.anything());
53
+ expect(client.model).toBe("claude-sonnet-4-20250514");
53
54
  });
54
- it("propagates network error from SDK", async () => {
55
- mockCreate.mockRejectedValue(new Error("Connection timeout"));
55
+ it("CLAUDECODE takes priority over ANTHROPIC_API_KEY for auto-detection", () => {
56
+ process.env.CLAUDECODE = "1";
57
+ process.env.ANTHROPIC_API_KEY = "test-key";
56
58
  const client = createLlmClient();
57
- await expect(client.generate("sys", "usr")).rejects.toThrow("Connection timeout");
59
+ expect(client.model).toBe("llama3.1:8b");
58
60
  });
59
- it("passes system and user prompts correctly", async () => {
60
- mockCreate.mockResolvedValue({
61
- content: [{ type: "text", text: "ok" }],
62
- });
61
+ it("explicit VSKILL_EVAL_PROVIDER overrides auto-detection", () => {
62
+ process.env.VSKILL_EVAL_PROVIDER = "ollama";
63
+ process.env.ANTHROPIC_API_KEY = "test-key";
63
64
  const client = createLlmClient();
64
- await client.generate("my system prompt", "my user prompt");
65
- expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({
66
- system: "my system prompt",
67
- messages: [{ role: "user", content: "my user prompt" }],
68
- max_tokens: 4096,
69
- }), expect.anything());
65
+ expect(client.model).toBe("llama3.1:8b");
70
66
  });
71
- it("throws when ANTHROPIC_API_KEY is not set", () => {
72
- delete process.env.ANTHROPIC_API_KEY;
73
- expect(() => createLlmClient()).toThrow("ANTHROPIC_API_KEY is not set");
67
+ it("throws on unknown provider", () => {
68
+ process.env.VSKILL_EVAL_PROVIDER = "gpt-magic";
69
+ expect(() => createLlmClient()).toThrow('Unknown VSKILL_EVAL_PROVIDER: "gpt-magic"');
74
70
  });
75
- it("exposes model name on the client", () => {
76
- const client = createLlmClient();
77
- expect(client.model).toBe("claude-sonnet-4-20250514");
71
+ // -------------------------------------------------------------------------
72
+ // Anthropic provider
73
+ // -------------------------------------------------------------------------
74
+ describe("anthropic provider", () => {
75
+ beforeEach(() => {
76
+ process.env.VSKILL_EVAL_PROVIDER = "anthropic";
77
+ process.env.ANTHROPIC_API_KEY = "test-key";
78
+ });
79
+ it("returns text content on successful generate call", async () => {
80
+ mockCreate.mockResolvedValue({
81
+ content: [{ type: "text", text: "Generated response" }],
82
+ });
83
+ const client = createLlmClient();
84
+ const result = await client.generate("system prompt", "user prompt");
85
+ expect(result).toBe("Generated response");
86
+ expect(mockCreate).toHaveBeenCalledOnce();
87
+ });
88
+ it("uses default model claude-sonnet-4-20250514", async () => {
89
+ mockCreate.mockResolvedValue({
90
+ content: [{ type: "text", text: "ok" }],
91
+ });
92
+ const client = createLlmClient();
93
+ await client.generate("sys", "usr");
94
+ expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ model: "claude-sonnet-4-20250514" }), expect.anything());
95
+ });
96
+ it("uses custom model from VSKILL_EVAL_MODEL", async () => {
97
+ process.env.VSKILL_EVAL_MODEL = "claude-opus-4-20250514";
98
+ mockCreate.mockResolvedValue({
99
+ content: [{ type: "text", text: "ok" }],
100
+ });
101
+ const client = createLlmClient();
102
+ await client.generate("sys", "usr");
103
+ expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ model: "claude-opus-4-20250514" }), expect.anything());
104
+ });
105
+ it("propagates network error from SDK", async () => {
106
+ mockCreate.mockRejectedValue(new Error("Connection timeout"));
107
+ const client = createLlmClient();
108
+ await expect(client.generate("sys", "usr")).rejects.toThrow("Connection timeout");
109
+ });
110
+ it("passes system and user prompts correctly", async () => {
111
+ mockCreate.mockResolvedValue({
112
+ content: [{ type: "text", text: "ok" }],
113
+ });
114
+ const client = createLlmClient();
115
+ await client.generate("my system prompt", "my user prompt");
116
+ expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({
117
+ system: "my system prompt",
118
+ messages: [{ role: "user", content: "my user prompt" }],
119
+ max_tokens: 4096,
120
+ }), expect.anything());
121
+ });
122
+ it("throws when ANTHROPIC_API_KEY is not set", () => {
123
+ delete process.env.ANTHROPIC_API_KEY;
124
+ expect(() => createLlmClient()).toThrow("ANTHROPIC_API_KEY is not set");
125
+ });
78
126
  });
79
- it("exposes custom model name when VSKILL_EVAL_MODEL is set", () => {
80
- process.env.VSKILL_EVAL_MODEL = "claude-opus-4-20250514";
81
- const client = createLlmClient();
82
- expect(client.model).toBe("claude-opus-4-20250514");
127
+ // -------------------------------------------------------------------------
128
+ // Claude CLI provider
129
+ // -------------------------------------------------------------------------
130
+ describe("claude-cli provider", () => {
131
+ beforeEach(() => {
132
+ process.env.VSKILL_EVAL_PROVIDER = "claude-cli";
133
+ });
134
+ it("calls claude CLI with --model flag", async () => {
135
+ mockExecFile.mockResolvedValue({ stdout: "CLI response\n" });
136
+ const client = createLlmClient();
137
+ const result = await client.generate("system prompt", "user prompt");
138
+ expect(result).toBe("CLI response");
139
+ expect(mockExecFile).toHaveBeenCalledWith("claude", ["-p", "system prompt\n\nuser prompt", "--model", "sonnet", "--no-input"], expect.objectContaining({ timeout: 120_000 }));
140
+ });
141
+ it("defaults to sonnet model", () => {
142
+ const client = createLlmClient();
143
+ expect(client.model).toBe("claude-sonnet");
144
+ });
145
+ it("passes custom model from VSKILL_EVAL_MODEL", async () => {
146
+ process.env.VSKILL_EVAL_MODEL = "opus";
147
+ mockExecFile.mockResolvedValue({ stdout: "ok\n" });
148
+ const client = createLlmClient();
149
+ expect(client.model).toBe("claude-opus");
150
+ await client.generate("sys", "usr");
151
+ expect(mockExecFile).toHaveBeenCalledWith("claude", expect.arrayContaining(["--model", "opus"]), expect.anything());
152
+ });
153
+ it("throws helpful error when claude CLI not found", async () => {
154
+ const err = new Error("ENOENT");
155
+ err.code = "ENOENT";
156
+ mockExecFile.mockRejectedValue(err);
157
+ const client = createLlmClient();
158
+ await expect(client.generate("sys", "usr")).rejects.toThrow("Claude CLI not found");
159
+ });
160
+ it("throws when explicitly selected inside Claude Code session", () => {
161
+ process.env.CLAUDECODE = "1";
162
+ expect(() => createLlmClient()).toThrow("Cannot use claude-cli provider inside a Claude Code session");
163
+ });
164
+ });
165
+ // -------------------------------------------------------------------------
166
+ // Ollama provider
167
+ // -------------------------------------------------------------------------
168
+ describe("ollama provider", () => {
169
+ beforeEach(() => {
170
+ process.env.VSKILL_EVAL_PROVIDER = "ollama";
171
+ });
172
+ it("uses default model llama3.1:8b", () => {
173
+ const client = createLlmClient();
174
+ expect(client.model).toBe("llama3.1:8b");
175
+ });
176
+ it("uses custom model from VSKILL_EVAL_MODEL", () => {
177
+ process.env.VSKILL_EVAL_MODEL = "qwen2.5:32b";
178
+ const client = createLlmClient();
179
+ expect(client.model).toBe("qwen2.5:32b");
180
+ });
181
+ it("calls Ollama HTTP API with correct payload", async () => {
182
+ const mockFetch = vi.spyOn(globalThis, "fetch").mockResolvedValue(new Response(JSON.stringify({ response: "Ollama reply" }), { status: 200 }));
183
+ const client = createLlmClient();
184
+ const result = await client.generate("system prompt", "user prompt");
185
+ expect(result).toBe("Ollama reply");
186
+ expect(mockFetch).toHaveBeenCalledWith("http://localhost:11434/api/generate", expect.objectContaining({
187
+ method: "POST",
188
+ body: expect.stringContaining('"model":"llama3.1:8b"'),
189
+ }));
190
+ mockFetch.mockRestore();
191
+ });
192
+ it("uses custom base URL from OLLAMA_BASE_URL", async () => {
193
+ process.env.OLLAMA_BASE_URL = "http://gpu-server:11434";
194
+ const mockFetch = vi.spyOn(globalThis, "fetch").mockResolvedValue(new Response(JSON.stringify({ response: "ok" }), { status: 200 }));
195
+ const client = createLlmClient();
196
+ await client.generate("sys", "usr");
197
+ expect(mockFetch).toHaveBeenCalledWith("http://gpu-server:11434/api/generate", expect.anything());
198
+ mockFetch.mockRestore();
199
+ });
83
200
  });
84
201
  });
85
202
  //# sourceMappingURL=llm.test.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"llm.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/llm.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,QAAQ,CAAC;AAEzE,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,MAAM,UAAU,GAAG,EAAE,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;AAE7C,EAAE,CAAC,IAAI,CAAC,mBAAmB,EAAE,GAAG,EAAE,CAAC,CAAC;IAClC,OAAO,EAAE,MAAM,aAAa;QAC1B,QAAQ,GAAG,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC;KACnC;CACF,CAAC,CAAC,CAAC;AAEJ,8EAA8E;AAC9E,uCAAuC;AACvC,8EAA8E;AAE9E,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,WAAW,CAAC,CAAC;AAEtD,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,MAAM,OAAO,GAAG,EAAE,GAAG,OAAO,CAAC,GAAG,EAAE,CAAC;IAEnC,UAAU,CAAC,GAAG,EAAE;QACd,EAAE,CAAC,aAAa,EAAE,CAAC;QACnB,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,UAAU,CAAC;QAC3C,OAAO,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,SAAS,CAAC,GAAG,EAAE;QACb,OAAO,CAAC,GAAG,GAAG,EAAE,GAAG,OAAO,EAAE,CAAC;IAC/B,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;QAChE,UAAU,CAAC,iBAAiB,CAAC;YAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,oBAAoB,EAAE,CAAC;SACxD,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,eAAe,EAAE,aAAa,CAAC,CAAC;QAErE,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;QAC1C,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,EAAE,CAAC;IAC5C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,8DAA8D,EAAE,KAAK,IAAI,EAAE;QAC5E,UAAU,CAAC,iBAAiB,CAAC;YAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;SACxC,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;QAEpC,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC,EAAE,KAAK,EAAE,0BAA0B,EAAE,CAAC,EAC9D,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;QAChE,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,wBAAwB,CAAC;QACzD,UAAU,CAAC,iBAAiB,CAAC;YAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;SACxC,CAAC,CAAC;QAEH,MAA
M,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;QAEpC,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC,EAAE,KAAK,EAAE,wBAAwB,EAAE,CAAC,EAC5D,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,mCAAmC,EAAE,KAAK,IAAI,EAAE;QACjD,UAAU,CAAC,iBAAiB,CAAC,IAAI,KAAK,CAAC,oBAAoB,CAAC,CAAC,CAAC;QAE9D,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CACzD,oBAAoB,CACrB,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;QACxD,UAAU,CAAC,iBAAiB,CAAC;YAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;SACxC,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,kBAAkB,EAAE,gBAAgB,CAAC,CAAC;QAE5D,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC;YACtB,MAAM,EAAE,kBAAkB;YAC1B,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,EAAE,CAAC;YACvD,UAAU,EAAE,IAAI;SACjB,CAAC,EACF,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;QAClD,OAAO,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;QACrC,MAAM,CAAC,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC,CAAC,OAAO,CAAC,8BAA8B,CAAC,CAAC;IAC1E,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kCAAkC,EAAE,GAAG,EAAE;QAC1C,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;IACxD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yDAAyD,EAAE,GAAG,EAAE;QACjE,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,wBAAwB,CAAC;QACzD,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;IACtD,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
1
+ {"version":3,"file":"llm.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/llm.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,QAAQ,CAAC;AAEzE,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,MAAM,UAAU,GAAG,EAAE,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;AAC7C,MAAM,YAAY,GAAG,EAAE,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;AAE/C,EAAE,CAAC,IAAI,CAAC,mBAAmB,EAAE,GAAG,EAAE,CAAC,CAAC;IAClC,OAAO,EAAE,MAAM,aAAa;QAC1B,QAAQ,GAAG,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC;KACnC;CACF,CAAC,CAAC,CAAC;AAEJ,EAAE,CAAC,IAAI,CAAC,oBAAoB,EAAE,GAAG,EAAE,CAAC,CAAC;IACnC,QAAQ,EAAE,YAAY;CACvB,CAAC,CAAC,CAAC;AAEJ,EAAE,CAAC,IAAI,CAAC,WAAW,EAAE,GAAG,EAAE,CAAC,CAAC;IAC1B,SAAS,EAAE,CAAC,EAAO,EAAE,EAAE,CAAC,EAAE;CAC3B,CAAC,CAAC,CAAC;AAEJ,8EAA8E;AAC9E,uCAAuC;AACvC,8EAA8E;AAE9E,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,WAAW,CAAC,CAAC;AAEtD,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,MAAM,OAAO,GAAG,EAAE,GAAG,OAAO,CAAC,GAAG,EAAE,CAAC;IAEnC,UAAU,CAAC,GAAG,EAAE;QACd,EAAE,CAAC,aAAa,EAAE,CAAC;QACnB,OAAO,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC;QACxC,OAAO,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;QACrC,OAAO,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;QACrC,OAAO,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC;QACnC,OAAO,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC;IAChC,CAAC,CAAC,CAAC;IAEH,SAAS,CAAC,GAAG,EAAE;QACb,OAAO,CAAC,GAAG,GAAG,EAAE,GAAG,OAAO,EAAE,CAAC;IAC/B,CAAC,CAAC,CAAC;IAEH,4EAA4E;IAC5E,iBAAiB;IACjB,4EAA4E;IAE5E,EAAE,CAAC,8CAA8C,EAAE,GAAG,EAAE;QACtD,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;IAC7C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,GAAG,EAAE;QACxD,OAAO,CAAC,GAAG,CAAC,UAAU,GAAG,GAAG,CAAC;QAC7B,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,sDAAsD,EAAE,GAAG,EAAE;QAC9D,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,UAAU,CAAC;QAC3C,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,0BAA0B
,CAAC,CAAC;IACxD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,qEAAqE,EAAE,GAAG,EAAE;QAC7E,OAAO,CAAC,GAAG,CAAC,UAAU,GAAG,GAAG,CAAC;QAC7B,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,UAAU,CAAC;QAC3C,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wDAAwD,EAAE,GAAG,EAAE;QAChE,OAAO,CAAC,GAAG,CAAC,oBAAoB,GAAG,QAAQ,CAAC;QAC5C,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,UAAU,CAAC;QAC3C,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,4BAA4B,EAAE,GAAG,EAAE;QACpC,OAAO,CAAC,GAAG,CAAC,oBAAoB,GAAG,WAAW,CAAC;QAC/C,MAAM,CAAC,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC,CAAC,OAAO,CAAC,2CAA2C,CAAC,CAAC;IACvF,CAAC,CAAC,CAAC;IAEH,4EAA4E;IAC5E,qBAAqB;IACrB,4EAA4E;IAE5E,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;QAClC,UAAU,CAAC,GAAG,EAAE;YACd,OAAO,CAAC,GAAG,CAAC,oBAAoB,GAAG,WAAW,CAAC;YAC/C,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,UAAU,CAAC;QAC7C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;YAChE,UAAU,CAAC,iBAAiB,CAAC;gBAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,oBAAoB,EAAE,CAAC;aACxD,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,eAAe,EAAE,aAAa,CAAC,CAAC;YAErE,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;YAC1C,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,EAAE,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,UAAU,CAAC,iBAAiB,CAAC;gBAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;aACxC,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAEpC,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC,EAAE,KAAK,EAAE,0BAA0B,EAAE,CAAC,EAC9D,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;YACxD,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,wBAAwB,CAAC;YACzD,UAAU,CAAC,iBAAiB,CAAC;gBAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;aACxC,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,
CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAEpC,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC,EAAE,KAAK,EAAE,wBAAwB,EAAE,CAAC,EAC5D,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,mCAAmC,EAAE,KAAK,IAAI,EAAE;YACjD,UAAU,CAAC,iBAAiB,CAAC,IAAI,KAAK,CAAC,oBAAoB,CAAC,CAAC,CAAC;YAE9D,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CACzD,oBAAoB,CACrB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;YACxD,UAAU,CAAC,iBAAiB,CAAC;gBAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;aACxC,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,kBAAkB,EAAE,gBAAgB,CAAC,CAAC;YAE5D,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC;gBACtB,MAAM,EAAE,kBAAkB;gBAC1B,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,EAAE,CAAC;gBACvD,UAAU,EAAE,IAAI;aACjB,CAAC,EACF,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,OAAO,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;YACrC,MAAM,CAAC,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC,CAAC,OAAO,CAAC,8BAA8B,CAAC,CAAC;QAC1E,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,4EAA4E;IAC5E,sBAAsB;IACtB,4EAA4E;IAE5E,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;QACnC,UAAU,CAAC,GAAG,EAAE;YACd,OAAO,CAAC,GAAG,CAAC,oBAAoB,GAAG,YAAY,CAAC;QAClD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;YAClD,YAAY,CAAC,iBAAiB,CAAC,EAAE,MAAM,EAAE,gBAAgB,EAAE,CAAC,CAAC;YAE7D,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,eAAe,EAAE,aAAa,CAAC,CAAC;YAErE,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YACpC,MAAM,CAAC,YAAY,CAAC,CAAC,oBAAoB,CACvC,QAAQ,EACR,CAAC,IAAI,EAAE,8BAA8B,EAAE,SAAS,EAAE,QAAQ,EAAE,YAAY,CAAC,EACzE,MAAM,CAAC,gBAAgB,CAAC,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC,CAC9C,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0BAA0B,EAAE,GAAG,EAAE;YAClC,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QAC7C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4CAA4C,EAAE,
KAAK,IAAI,EAAE;YAC1D,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,MAAM,CAAC;YACvC,YAAY,CAAC,iBAAiB,CAAC,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;YAEnD,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;YACzC,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAEpC,MAAM,CAAC,YAAY,CAAC,CAAC,oBAAoB,CACvC,QAAQ,EACR,MAAM,CAAC,eAAe,CAAC,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC,EAC3C,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;YAC9D,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,QAAQ,CAAQ,CAAC;YACvC,GAAG,CAAC,IAAI,GAAG,QAAQ,CAAC;YACpB,YAAY,CAAC,iBAAiB,CAAC,GAAG,CAAC,CAAC;YAEpC,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CACzD,sBAAsB,CACvB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4DAA4D,EAAE,GAAG,EAAE;YACpE,OAAO,CAAC,GAAG,CAAC,UAAU,GAAG,GAAG,CAAC;YAC7B,MAAM,CAAC,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC,CAAC,OAAO,CACrC,6DAA6D,CAC9D,CAAC;QACJ,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,4EAA4E;IAC5E,kBAAkB;IAClB,4EAA4E;IAE5E,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;QAC/B,UAAU,CAAC,GAAG,EAAE;YACd,OAAO,CAAC,GAAG,CAAC,oBAAoB,GAAG,QAAQ,CAAC;QAC9C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gCAAgC,EAAE,GAAG,EAAE;YACxC,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC3C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,aAAa,CAAC;YAC9C,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC3C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;YAC1D,MAAM,SAAS,GAAG,EAAE,CAAC,KAAK,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC,iBAAiB,CAC/D,IAAI,QAAQ,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,cAAc,EAAE,CAAC,EAAE,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAC5E,CAAC;YAEF,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,eAAe,EAAE,aAAa,CAAC,CAAC;YAErE,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YACpC,MAAM,CAAC,SAAS,CAAC,CAAC,oBAAoB,CACpC,qCAAqC,EACrC,MAAM,CAAC,gBAAgB,CAAC;gBA
CtB,MAAM,EAAE,MAAM;gBACd,IAAI,EAAE,MAAM,CAAC,gBAAgB,CAAC,uBAAuB,CAAC;aACvD,CAAC,CACH,CAAC;YAEF,SAAS,CAAC,WAAW,EAAE,CAAC;QAC1B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;YACzD,OAAO,CAAC,GAAG,CAAC,eAAe,GAAG,yBAAyB,CAAC;YAExD,MAAM,SAAS,GAAG,EAAE,CAAC,KAAK,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC,iBAAiB,CAC/D,IAAI,QAAQ,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,EAAE,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAClE,CAAC;YAEF,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAEpC,MAAM,CAAC,SAAS,CAAC,CAAC,oBAAoB,CACpC,sCAAsC,EACtC,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;YAEF,SAAS,CAAC,WAAW,EAAE,CAAC;QAC1B,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,47 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { computeVerdict, verdictColor } from "../verdict.js";
3
describe("computeVerdict", () => {
    // Asserts that every argument triple yields `verdict`. Judging by the test
    // titles the triple is [passRate, skillRubric, baselineRubric].
    const expectAll = (verdict, triples) => {
        triples.forEach(([passRate, skillRubric, baselineRubric]) => {
            expect(computeVerdict(passRate, skillRubric, baselineRubric)).toBe(verdict);
        });
    };

    it("returns EFFECTIVE when passRate >= 0.8 and skill rubric > baseline + 1", () => {
        expectAll("EFFECTIVE", [
            [0.85, 4.5, 3.0],
            [0.8, 4.0, 2.5],
            [1.0, 5.0, 1.0],
        ]);
    });

    it("returns MARGINAL when passRate >= 0.6 and skill rubric > baseline (but not EFFECTIVE)", () => {
        expectAll("MARGINAL", [
            [0.7, 3.5, 3.0],
            [0.6, 2.5, 2.0],
            // High pass rate, but the rubric is only slightly better → MARGINAL
            [0.85, 3.5, 3.0],
        ]);
    });

    it("returns INEFFECTIVE when passRate >= 0.4 (but not MARGINAL)", () => {
        expectAll("INEFFECTIVE", [
            [0.5, 2.5, 3.0],
            [0.45, 3.0, 3.0],
            [0.4, 1.0, 5.0],
        ]);
    });

    it("returns DEGRADING when passRate < 0.4", () => {
        expectAll("DEGRADING", [
            [0.3, 2.0, 3.0],
            [0.1, 1.0, 1.0],
            [0.0, 0.0, 0.0],
            [0.39, 5.0, 1.0],
        ]);
    });

    it("handles boundary values correctly", () => {
        const boundaryCases = [
            // Exactly 0.8 pass rate, rubric difference just over 1 (1.1) → EFFECTIVE
            { args: [0.8, 4.0, 2.9], verdict: "EFFECTIVE" },
            // 0.8 pass rate, rubric difference exactly 1 → not EFFECTIVE (needs > 1)
            { args: [0.8, 4.0, 3.0], verdict: "MARGINAL" },
            // Exactly 0.6 pass rate, skill > baseline → MARGINAL
            { args: [0.6, 3.1, 3.0], verdict: "MARGINAL" },
            // Exactly 0.6 pass rate, skill = baseline → INEFFECTIVE
            { args: [0.6, 3.0, 3.0], verdict: "INEFFECTIVE" },
            // Exactly 0.4 pass rate → INEFFECTIVE
            { args: [0.4, 3.0, 3.0], verdict: "INEFFECTIVE" },
        ];
        for (const { args, verdict } of boundaryCases) {
            expect(computeVerdict(...args)).toBe(verdict);
        }
    });
});
39
describe("verdictColor", () => {
    it("returns correct colors for each verdict", () => {
        // Verdict → expected colour name, checked in declaration order.
        const expectedColors = [
            ["EFFECTIVE", "green"],
            ["MARGINAL", "yellow"],
            ["INEFFECTIVE", "orange"],
            ["DEGRADING", "red"],
        ];
        for (const [verdict, color] of expectedColors) {
            expect(verdictColor(verdict)).toBe(color);
        }
    });
});
47
+ //# sourceMappingURL=verdict.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"verdict.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/verdict.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,eAAe,CAAC;AAE7D,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;IAC9B,EAAE,CAAC,wEAAwE,EAAE,GAAG,EAAE;QAChF,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACzD,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACzD,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uFAAuF,EAAE,GAAG,EAAE;QAC/F,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACxD,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACxD,4DAA4D;QAC5D,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6DAA6D,EAAE,GAAG,EAAE;QACrE,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC3D,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC3D,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC7D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;QAC/C,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACzD,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACzD,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACxD,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IAC3D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,uDAAuD;QACvD,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACxD,qEAAqE;QACrE,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACvD,qDAAqD;QACrD,MAAM,CAAC,cAAc,CAAC,G
AAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACvD,wDAAwD;QACxD,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC1D,sCAAsC;QACtC,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC5D,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,cAAc,EAAE,GAAG,EAAE;IAC5B,EAAE,CAAC,yCAAyC,EAAE,GAAG,EAAE;QACjD,MAAM,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAChD,MAAM,CAAC,YAAY,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAChD,MAAM,CAAC,YAAY,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACnD,MAAM,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAChD,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1,25 @@
1
+ import type { LlmClient } from "./llm.js";
2
/**
 * One labelled test case for testActivation: a user prompt plus whether the
 * skill is expected to activate for it.
 */
+ export interface ActivationPrompt {
3
+ prompt: string;
4
+ expected: "should_activate" | "should_not_activate";
5
+ }
6
/**
 * Outcome recorded by testActivation for a single prompt.
 *
 * `prompt` and `expected` are copied from the input case. `activate`,
 * `confidence` and `reasoning` come from the model's JSON reply; when the call
 * or the JSON parsing fails they are set to `false`, "low" and an
 * "Error: ..." message respectively. `confidence` also falls back to "low"
 * when the reply carries an unrecognised value.
 *
 * `classification` compares `expected` with `activate`:
 * TP = expected to activate and did, FN = expected to activate but did not,
 * TN = expected not to activate and did not, FP = any other combination.
 */
+ export interface ActivationResult {
7
+ prompt: string;
8
+ expected: "should_activate" | "should_not_activate";
9
+ activate: boolean;
10
+ confidence: "high" | "medium" | "low";
11
+ reasoning: string;
12
+ classification: "TP" | "TN" | "FP" | "FN";
13
+ }
14
/**
 * Aggregate returned by testActivation.
 *
 * `tp`/`tn`/`fp`/`fn` count the results with the matching classification and
 * `total` is the number of results. The ratios are:
 *   precision   = tp / (tp + fp)
 *   recall      = tp / (tp + fn)
 *   reliability = (tp + tn) / total
 * Each ratio is 0 when its denominator is 0.
 */
+ export interface ActivationSummary {
15
+ results: ActivationResult[];
16
+ precision: number;
17
+ recall: number;
18
+ reliability: number;
19
+ total: number;
20
+ tp: number;
21
+ tn: number;
22
+ fp: number;
23
+ fn: number;
24
+ }
25
/**
 * Asks `client` once per prompt whether `skillDescription` would be triggered
 * by that prompt, classifies each answer against the prompt's `expected`
 * label, and resolves with the aggregated summary.
 *
 * @param skillDescription - Skill description text embedded in each request.
 * @param prompts - Labelled prompts, evaluated one after another.
 * @param client - LLM client whose `generate(systemPrompt, userPrompt)` is awaited.
 * @param onResult - Optional callback invoked with each result as it is produced.
 */
+ export declare function testActivation(skillDescription: string, prompts: ActivationPrompt[], client: LlmClient, onResult?: (result: ActivationResult) => void): Promise<ActivationSummary>;
@@ -0,0 +1,89 @@
1
+ // ---------------------------------------------------------------------------
2
+ // activation-tester.ts -- test SKILL.md description auto-activation quality
3
+ // ---------------------------------------------------------------------------
4
// System prompt passed as the first argument to client.generate() for every
// activation check; it asks the model to reply with a JSON object containing
// `activate`, `confidence` and `reasoning`.
+ const ACTIVATION_SYSTEM_PROMPT = `You are evaluating whether a user prompt would trigger an AI skill based on its description.
5
+
6
+ Given the skill description and a user prompt, determine:
7
+ 1. Would this prompt trigger this skill? (yes/no)
8
+ 2. How confident are you? (high/medium/low)
9
+ 3. Brief reasoning
10
+
11
+ Respond with ONLY valid JSON:
12
+ {
13
+ "activate": true/false,
14
+ "confidence": "high" | "medium" | "low",
15
+ "reasoning": "brief explanation"
16
+ }`;
17
+ export async function testActivation(skillDescription, prompts, client, onResult) {
18
+ const results = [];
19
+ for (const p of prompts) {
20
+ const userPrompt = `## Skill Description
21
+ ${skillDescription}
22
+
23
+ ## User Prompt
24
+ ${p.prompt}
25
+
26
+ Would this user prompt trigger this skill?`;
27
+ try {
28
+ const response = await client.generate(ACTIVATION_SYSTEM_PROMPT, userPrompt);
29
+ const jsonMatch = response.match(/```(?:json)?\s*([\s\S]*?)```/) || [null, response];
30
+ const json = JSON.parse(jsonMatch[1].trim());
31
+ const activate = !!json.activate;
32
+ const confidence = ["high", "medium", "low"].includes(json.confidence)
33
+ ? json.confidence
34
+ : "low";
35
+ const classification = classifyResult(p.expected, activate);
36
+ const result = {
37
+ prompt: p.prompt,
38
+ expected: p.expected,
39
+ activate,
40
+ confidence,
41
+ reasoning: String(json.reasoning || ""),
42
+ classification,
43
+ };
44
+ results.push(result);
45
+ onResult?.(result);
46
+ }
47
+ catch (err) {
48
+ const result = {
49
+ prompt: p.prompt,
50
+ expected: p.expected,
51
+ activate: false,
52
+ confidence: "low",
53
+ reasoning: `Error: ${err instanceof Error ? err.message : String(err)}`,
54
+ classification: p.expected === "should_activate" ? "FN" : "TN",
55
+ };
56
+ results.push(result);
57
+ onResult?.(result);
58
+ }
59
+ }
60
+ return computeSummary(results);
61
+ }
62
+ function classifyResult(expected, actual) {
63
+ if (expected === "should_activate" && actual)
64
+ return "TP";
65
+ if (expected === "should_activate" && !actual)
66
+ return "FN";
67
+ if (expected === "should_not_activate" && !actual)
68
+ return "TN";
69
+ return "FP";
70
+ }
71
+ function computeSummary(results) {
72
+ const tp = results.filter((r) => r.classification === "TP").length;
73
+ const tn = results.filter((r) => r.classification === "TN").length;
74
+ const fp = results.filter((r) => r.classification === "FP").length;
75
+ const fn = results.filter((r) => r.classification === "FN").length;
76
+ const total = results.length;
77
+ return {
78
+ results,
79
+ precision: tp + fp > 0 ? tp / (tp + fp) : 0,
80
+ recall: tp + fn > 0 ? tp / (tp + fn) : 0,
81
+ reliability: total > 0 ? (tp + tn) / total : 0,
82
+ total,
83
+ tp,
84
+ tn,
85
+ fp,
86
+ fn,
87
+ };
88
+ }
89
+ //# sourceMappingURL=activation-tester.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"activation-tester.js","sourceRoot":"","sources":["../../src/eval/activation-tester.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,4EAA4E;AAC5E,8EAA8E;AA8B9E,MAAM,wBAAwB,GAAG;;;;;;;;;;;;EAY/B,CAAC;AAEH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,gBAAwB,EACxB,OAA2B,EAC3B,MAAiB,EACjB,QAA6C;IAE7C,MAAM,OAAO,GAAuB,EAAE,CAAC;IAEvC,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACxB,MAAM,UAAU,GAAG;EACrB,gBAAgB;;;EAGhB,CAAC,CAAC,MAAM;;2CAEiC,CAAC;QAExC,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,wBAAwB,EAAE,UAAU,CAAC,CAAC;YAC7E,MAAM,SAAS,GAAG,QAAQ,CAAC,KAAK,CAAC,8BAA8B,CAAC,IAAI,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;YACrF,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,CAAC,CAAC;YAE9C,MAAM,QAAQ,GAAG,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC;YACjC,MAAM,UAAU,GAAG,CAAC,MAAM,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,UAAU,CAAC;gBACpE,CAAC,CAAE,IAAI,CAAC,UAAwC;gBAChD,CAAC,CAAC,KAAK,CAAC;YAEV,MAAM,cAAc,GAAG,cAAc,CAAC,CAAC,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;YAE5D,MAAM,MAAM,GAAqB;gBAC/B,MAAM,EAAE,CAAC,CAAC,MAAM;gBAChB,QAAQ,EAAE,CAAC,CAAC,QAAQ;gBACpB,QAAQ;gBACR,UAAU;gBACV,SAAS,EAAE,MAAM,CAAC,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC;gBACvC,cAAc;aACf,CAAC;YACF,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACrB,QAAQ,EAAE,CAAC,MAAM,CAAC,CAAC;QACrB,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,MAAM,MAAM,GAAqB;gBAC/B,MAAM,EAAE,CAAC,CAAC,MAAM;gBAChB,QAAQ,EAAE,CAAC,CAAC,QAAQ;gBACpB,QAAQ,EAAE,KAAK;gBACf,UAAU,EAAE,KAAK;gBACjB,SAAS,EAAE,UAAU,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE;gBACvE,cAAc,EAAE,CAAC,CAAC,QAAQ,KAAK,iBAAiB,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI;aAC/D,CAAC;YACF,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACrB,QAAQ,EAAE,CAAC,MAAM,CAAC,CAAC;QACrB,CAAC;IACH,CAAC;IAED,OAAO,cAAc,CAAC,OAAO,CAAC,CAAC;AACjC,CAAC;AAED,SAAS,cAAc,CACrB,QAAmD,EACnD,MAAe;IAEf,IAAI,QAAQ,KAAK,iBAAiB,IAAI,MAAM;QAAE,OAAO,IAAI,CAAC;IAC1D,IAAI,QAAQ,KAAK,iBAAiB,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAC;IAC3D,IAAI,QAAQ,KAAK,qBAAqB,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAC;IAC/D,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SA
AS,cAAc,CAAC,OAA2B;IACjD,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC;IACnE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC;IACnE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC;IACnE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC;IACnE,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC;IAE7B,OAAO;QACL,OAAO;QACP,SAAS,EAAE,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAC3C,MAAM,EAAE,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QACxC,WAAW,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAC9C,KAAK;QACL,EAAE;QACF,EAAE;QACF,EAAE;QACF,EAAE;KACH,CAAC;AACJ,CAAC"}
@@ -0,0 +1,23 @@
1
+ import type { BenchmarkResult } from "./benchmark.js";
2
/**
 * Summary record for one stored history entry, as returned by listHistory.
 *
 * NOTE(review): only the declaration is visible here. `passRate` is presumably
 * a 0-1 fraction and `filename` the stored file's name; confirm against the
 * implementation in benchmark-history.js.
 */
+ export interface HistorySummary {
3
+ timestamp: string;
4
+ filename: string;
5
+ model: string;
6
+ skillName: string;
7
+ passRate: number;
8
+ type: "benchmark" | "comparison";
9
+ }
10
/**
 * One assertion whose status differs between two benchmark results, as
 * returned by computeRegressions.
 *
 * NOTE(review): presumably `previousStatus`/`currentStatus` are pass flags and
 * `change` is "regression" for pass → fail and "improvement" for fail → pass;
 * confirm against the implementation.
 */
+ export interface RegressionEntry {
11
+ assertionId: string;
12
+ evalId: number;
13
+ evalName: string;
14
+ previousStatus: boolean;
15
+ currentStatus: boolean;
16
+ change: "regression" | "improvement";
17
+ }
18
/**
 * Records a benchmark result (optionally tagged as "benchmark" or
 * "comparison") for the skill located at `skillDir`.
 *
 * NOTE(review): the resolved string is presumably the path or name of the
 * written entry; confirm against the implementation.
 */
+ export declare function writeHistoryEntry(skillDir: string, result: BenchmarkResult & {
19
+ type?: "benchmark" | "comparison";
20
+ }): Promise<string>;
21
/**
 * Resolves with a summary of each history entry for the skill at `skillDir`.
 * NOTE(review): ordering of the returned array is not visible from the declaration.
 */
+ export declare function listHistory(skillDir: string): Promise<HistorySummary[]>;
22
/**
 * Resolves with the benchmark result identified by `timestamp` for the skill
 * at `skillDir`, or `null`.
 * NOTE(review): `null` presumably means no entry exists for that timestamp; confirm.
 */
+ export declare function readHistoryEntry(skillDir: string, timestamp: string): Promise<BenchmarkResult | null>;
23
/**
 * Compares `current` against `previous` and returns the entries that changed.
 * NOTE(review): presumably assertions are matched by eval and assertion id; confirm.
 */
+ export declare function computeRegressions(current: BenchmarkResult, previous: BenchmarkResult): RegressionEntry[];