vskill 0.2.55 → 0.2.57
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -3
- package/dist/commands/eval/__tests__/run.test.js +7 -2
- package/dist/commands/eval/__tests__/run.test.js.map +1 -1
- package/dist/commands/eval/run.js +24 -3
- package/dist/commands/eval/run.js.map +1 -1
- package/dist/commands/eval/serve.d.ts +1 -0
- package/dist/commands/eval/serve.js +51 -0
- package/dist/commands/eval/serve.js.map +1 -0
- package/dist/commands/eval.d.ts +1 -0
- package/dist/commands/eval.js +19 -3
- package/dist/commands/eval.js.map +1 -1
- package/dist/eval/__tests__/activation-tester.test.d.ts +1 -0
- package/dist/eval/__tests__/activation-tester.test.js +94 -0
- package/dist/eval/__tests__/activation-tester.test.js.map +1 -0
- package/dist/eval/__tests__/benchmark-history.test.d.ts +1 -0
- package/dist/eval/__tests__/benchmark-history.test.js +200 -0
- package/dist/eval/__tests__/benchmark-history.test.js.map +1 -0
- package/dist/eval/__tests__/comparator.test.d.ts +1 -0
- package/dist/eval/__tests__/comparator.test.js +136 -0
- package/dist/eval/__tests__/comparator.test.js.map +1 -0
- package/dist/eval/__tests__/llm.test.js +161 -44
- package/dist/eval/__tests__/llm.test.js.map +1 -1
- package/dist/eval/__tests__/skill-scanner.test.js +40 -1
- package/dist/eval/__tests__/skill-scanner.test.js.map +1 -1
- package/dist/eval/__tests__/verdict.test.d.ts +1 -0
- package/dist/eval/__tests__/verdict.test.js +47 -0
- package/dist/eval/__tests__/verdict.test.js.map +1 -0
- package/dist/eval/activation-tester.d.ts +25 -0
- package/dist/eval/activation-tester.js +89 -0
- package/dist/eval/activation-tester.js.map +1 -0
- package/dist/eval/benchmark-history.d.ts +23 -0
- package/dist/eval/benchmark-history.js +108 -0
- package/dist/eval/benchmark-history.js.map +1 -0
- package/dist/eval/comparator.d.ts +29 -0
- package/dist/eval/comparator.js +100 -0
- package/dist/eval/comparator.js.map +1 -0
- package/dist/eval/llm.js +119 -6
- package/dist/eval/llm.js.map +1 -1
- package/dist/eval/skill-scanner.js +35 -26
- package/dist/eval/skill-scanner.js.map +1 -1
- package/dist/eval/verdict.d.ts +3 -0
- package/dist/eval/verdict.js +28 -0
- package/dist/eval/verdict.js.map +1 -0
- package/dist/eval-server/api-routes.d.ts +2 -0
- package/dist/eval-server/api-routes.js +425 -0
- package/dist/eval-server/api-routes.js.map +1 -0
- package/dist/eval-server/eval-server.d.ts +6 -0
- package/dist/eval-server/eval-server.js +102 -0
- package/dist/eval-server/eval-server.js.map +1 -0
- package/dist/eval-server/router.d.ts +14 -0
- package/dist/eval-server/router.js +117 -0
- package/dist/eval-server/router.js.map +1 -0
- package/dist/eval-server/sse-helpers.d.ts +4 -0
- package/dist/eval-server/sse-helpers.js +24 -0
- package/dist/eval-server/sse-helpers.js.map +1 -0
- package/dist/eval-ui/assets/index-BYpLv_X1.css +1 -0
- package/dist/eval-ui/assets/index-Od6Ch9-a.js +70 -0
- package/dist/eval-ui/index.html +13 -0
- package/dist/index.js +3 -2
- package/dist/index.js.map +1 -1
- package/package.json +15 -2
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import { describe, it, expect, vi } from "vitest";
|
|
2
|
+
import { generateComparisonOutputs, scoreComparison, runComparison, } from "../comparator.js";
|
|
3
|
+
function mockClient(responses) {
|
|
4
|
+
let callIndex = 0;
|
|
5
|
+
return {
|
|
6
|
+
model: "test-model",
|
|
7
|
+
generate: vi.fn(async () => responses[callIndex++] ?? ""),
|
|
8
|
+
};
|
|
9
|
+
}
|
|
10
|
+
describe("generateComparisonOutputs", () => {
|
|
11
|
+
it("generates skill and baseline outputs sequentially", async () => {
|
|
12
|
+
const client = mockClient(["skill response", "baseline response"]);
|
|
13
|
+
const result = await generateComparisonOutputs("test prompt", "# Skill Content", client);
|
|
14
|
+
expect(result.skillOutput).toBe("skill response");
|
|
15
|
+
expect(result.baselineOutput).toBe("baseline response");
|
|
16
|
+
expect(result.skillDurationMs).toBeGreaterThanOrEqual(0);
|
|
17
|
+
expect(result.baselineDurationMs).toBeGreaterThanOrEqual(0);
|
|
18
|
+
expect(client.generate).toHaveBeenCalledTimes(2);
|
|
19
|
+
// First call should include skill content
|
|
20
|
+
const firstCall = client.generate.mock.calls[0];
|
|
21
|
+
expect(firstCall[0]).toContain("Skill Content");
|
|
22
|
+
// Second call should be generic
|
|
23
|
+
const secondCall = client.generate.mock.calls[1];
|
|
24
|
+
expect(secondCall[0]).toContain("helpful AI assistant");
|
|
25
|
+
});
|
|
26
|
+
});
|
|
27
|
+
describe("scoreComparison", () => {
|
|
28
|
+
it("parses JSON scores from LLM response", async () => {
|
|
29
|
+
const client = mockClient([
|
|
30
|
+
JSON.stringify({
|
|
31
|
+
content_score_a: 4,
|
|
32
|
+
structure_score_a: 3,
|
|
33
|
+
content_score_b: 5,
|
|
34
|
+
structure_score_b: 4,
|
|
35
|
+
winner: "second",
|
|
36
|
+
reasoning: "B is better",
|
|
37
|
+
}),
|
|
38
|
+
]);
|
|
39
|
+
const result = await scoreComparison("output A", "output B", "prompt", client);
|
|
40
|
+
expect(result.contentScoreA).toBe(4);
|
|
41
|
+
expect(result.structureScoreA).toBe(3);
|
|
42
|
+
expect(result.contentScoreB).toBe(5);
|
|
43
|
+
expect(result.structureScoreB).toBe(4);
|
|
44
|
+
expect(result.winner).toBe("second");
|
|
45
|
+
});
|
|
46
|
+
it("parses JSON from code fence", async () => {
|
|
47
|
+
const client = mockClient([
|
|
48
|
+
'```json\n{"content_score_a": 3, "structure_score_a": 3, "content_score_b": 3, "structure_score_b": 3, "winner": "tie"}\n```',
|
|
49
|
+
]);
|
|
50
|
+
const result = await scoreComparison("A", "B", "p", client);
|
|
51
|
+
expect(result.winner).toBe("tie");
|
|
52
|
+
expect(result.contentScoreA).toBe(3);
|
|
53
|
+
});
|
|
54
|
+
it("clamps scores to 1-5 range", async () => {
|
|
55
|
+
const client = mockClient([
|
|
56
|
+
JSON.stringify({
|
|
57
|
+
content_score_a: 0,
|
|
58
|
+
structure_score_a: 10,
|
|
59
|
+
content_score_b: -1,
|
|
60
|
+
structure_score_b: 6,
|
|
61
|
+
winner: "first",
|
|
62
|
+
}),
|
|
63
|
+
]);
|
|
64
|
+
const result = await scoreComparison("A", "B", "p", client);
|
|
65
|
+
expect(result.contentScoreA).toBe(1);
|
|
66
|
+
expect(result.structureScoreA).toBe(5);
|
|
67
|
+
expect(result.contentScoreB).toBe(1);
|
|
68
|
+
expect(result.structureScoreB).toBe(5);
|
|
69
|
+
});
|
|
70
|
+
it("defaults invalid winner to tie", async () => {
|
|
71
|
+
const client = mockClient([
|
|
72
|
+
JSON.stringify({
|
|
73
|
+
content_score_a: 3,
|
|
74
|
+
structure_score_a: 3,
|
|
75
|
+
content_score_b: 3,
|
|
76
|
+
structure_score_b: 3,
|
|
77
|
+
winner: "invalid",
|
|
78
|
+
}),
|
|
79
|
+
]);
|
|
80
|
+
const result = await scoreComparison("A", "B", "p", client);
|
|
81
|
+
expect(result.winner).toBe("tie");
|
|
82
|
+
});
|
|
83
|
+
});
|
|
84
|
+
describe("runComparison", () => {
|
|
85
|
+
it("maps scores back to skill/baseline correctly", async () => {
|
|
86
|
+
// Mock: first two calls = skill + baseline outputs, third = scoring
|
|
87
|
+
const client = mockClient([
|
|
88
|
+
"skill output here",
|
|
89
|
+
"baseline output here",
|
|
90
|
+
JSON.stringify({
|
|
91
|
+
content_score_a: 4,
|
|
92
|
+
structure_score_a: 5,
|
|
93
|
+
content_score_b: 2,
|
|
94
|
+
structure_score_b: 3,
|
|
95
|
+
winner: "first",
|
|
96
|
+
reasoning: "A is better",
|
|
97
|
+
}),
|
|
98
|
+
]);
|
|
99
|
+
// Fix randomness for deterministic test
|
|
100
|
+
vi.spyOn(Math, "random").mockReturnValue(0.3); // < 0.5 → skill is A
|
|
101
|
+
const result = await runComparison("test prompt", "skill content", client);
|
|
102
|
+
expect(result.prompt).toBe("test prompt");
|
|
103
|
+
expect(result.skillOutput).toBe("skill output here");
|
|
104
|
+
expect(result.baselineOutput).toBe("baseline output here");
|
|
105
|
+
// skill is A, so scores map directly
|
|
106
|
+
expect(result.skillContentScore).toBe(4);
|
|
107
|
+
expect(result.skillStructureScore).toBe(5);
|
|
108
|
+
expect(result.baselineContentScore).toBe(2);
|
|
109
|
+
expect(result.baselineStructureScore).toBe(3);
|
|
110
|
+
expect(result.winner).toBe("skill");
|
|
111
|
+
vi.restoreAllMocks();
|
|
112
|
+
});
|
|
113
|
+
it("maps scores correctly when baseline is A", async () => {
|
|
114
|
+
const client = mockClient([
|
|
115
|
+
"skill out",
|
|
116
|
+
"baseline out",
|
|
117
|
+
JSON.stringify({
|
|
118
|
+
content_score_a: 2,
|
|
119
|
+
structure_score_a: 2,
|
|
120
|
+
content_score_b: 4,
|
|
121
|
+
structure_score_b: 4,
|
|
122
|
+
winner: "second",
|
|
123
|
+
}),
|
|
124
|
+
]);
|
|
125
|
+
// > 0.5 → skill is B
|
|
126
|
+
vi.spyOn(Math, "random").mockReturnValue(0.7);
|
|
127
|
+
const result = await runComparison("p", "s", client);
|
|
128
|
+
// skill is B → scores.contentScoreB is skill
|
|
129
|
+
expect(result.skillContentScore).toBe(4);
|
|
130
|
+
expect(result.baselineContentScore).toBe(2);
|
|
131
|
+
// winner "second" = B = skill
|
|
132
|
+
expect(result.winner).toBe("skill");
|
|
133
|
+
vi.restoreAllMocks();
|
|
134
|
+
});
|
|
135
|
+
});
|
|
136
|
+
//# sourceMappingURL=comparator.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"comparator.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/comparator.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAClD,OAAO,EACL,yBAAyB,EACzB,eAAe,EACf,aAAa,GACd,MAAM,kBAAkB,CAAC;AAG1B,SAAS,UAAU,CAAC,SAAmB;IACrC,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,OAAO;QACL,KAAK,EAAE,YAAY;QACnB,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC,KAAK,IAAI,EAAE,CAAC,SAAS,CAAC,SAAS,EAAE,CAAC,IAAI,EAAE,CAAC;KAC1D,CAAC;AACJ,CAAC;AAED,QAAQ,CAAC,2BAA2B,EAAE,GAAG,EAAE;IACzC,EAAE,CAAC,mDAAmD,EAAE,KAAK,IAAI,EAAE;QACjE,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,gBAAgB,EAAE,mBAAmB,CAAC,CAAC,CAAC;QACnE,MAAM,MAAM,GAAG,MAAM,yBAAyB,CAAC,aAAa,EAAE,iBAAiB,EAAE,MAAM,CAAC,CAAC;QAEzF,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;QAClD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QACxD,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QACzD,MAAM,CAAC,MAAM,CAAC,kBAAkB,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,qBAAqB,CAAC,CAAC,CAAC,CAAC;QAEjD,0CAA0C;QAC1C,MAAM,SAAS,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACzD,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;QAEhD,gCAAgC;QAChC,MAAM,UAAU,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC1D,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,sBAAsB,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;QACpD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,QAAQ;gBAChB,SAAS,EAAE,aAAa;aACzB,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,UAAU,EAAE,UAAU,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC;QAC/E,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6BAA6B,EAAE,KAAK,IAAI,EAAE;QAC3C,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,6HAA6H;SAC9H,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAClC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,4BAA4B,EAAE,KAAK,IAAI,EAAE;QAC1C,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,EAAE;gBACrB,eAAe,EAAE,CAAC,CAAC;gBACnB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,OAAO;aAChB,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACzC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gCAAgC,EAAE,KAAK,IAAI,EAAE;QAC9C,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,SAAS;aAClB,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACpC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,eAAe,EAAE,GAAG,EAAE;IAC7B,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;QAC5D,oEAAoE;QACpE,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,mBAAmB;YACnB,sBAAsB;YACtB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,OAAO;gBACf,SAAS,EAAE,aAAa;aACzB,CAAC;SACH,CAAC,CAAC;QAEH,wCAAwC;QACxC,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,qBAAqB;QAEpE,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,aAAa,EAAE,eAAe,EAAE,MAAM,CAAC,CAAC;QAE3E,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC1C,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QACrD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,sBAAsB,CAAC,CAAC;QAC3D,qCAAqC;QACrC,MAAM,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACzC,MAAM,CAAC,MAAM,CAAC,mBAAmB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3C,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,sBAAsB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC9C,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAEpC,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;QACxD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,WAAW;YACX,cAAc;YACd,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,QAAQ;aACjB,CAAC;SACH,CAAC,CAAC;QAEH,qBAAqB;QACrB,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QAE9C,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QACrD,6CAA6C;QAC7C,MAAM,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACzC,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC5C,8BAA8B;QAC9B,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAEpC,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -3,11 +3,18 @@ import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
|
|
|
3
3
|
// Mocks
|
|
4
4
|
// ---------------------------------------------------------------------------
|
|
5
5
|
const mockCreate = vi.hoisted(() => vi.fn());
|
|
6
|
+
const mockExecFile = vi.hoisted(() => vi.fn());
|
|
6
7
|
vi.mock("@anthropic-ai/sdk", () => ({
|
|
7
8
|
default: class MockAnthropic {
|
|
8
9
|
messages = { create: mockCreate };
|
|
9
10
|
},
|
|
10
11
|
}));
|
|
12
|
+
vi.mock("node:child_process", () => ({
|
|
13
|
+
execFile: mockExecFile,
|
|
14
|
+
}));
|
|
15
|
+
vi.mock("node:util", () => ({
|
|
16
|
+
promisify: (fn) => fn,
|
|
17
|
+
}));
|
|
11
18
|
// ---------------------------------------------------------------------------
|
|
12
19
|
// Import module under test AFTER mocks
|
|
13
20
|
// ---------------------------------------------------------------------------
|
|
@@ -19,67 +26,177 @@ describe("createLlmClient", () => {
|
|
|
19
26
|
const origEnv = { ...process.env };
|
|
20
27
|
beforeEach(() => {
|
|
21
28
|
vi.resetAllMocks();
|
|
22
|
-
process.env.
|
|
29
|
+
delete process.env.VSKILL_EVAL_PROVIDER;
|
|
23
30
|
delete process.env.VSKILL_EVAL_MODEL;
|
|
31
|
+
delete process.env.ANTHROPIC_API_KEY;
|
|
32
|
+
delete process.env.OLLAMA_BASE_URL;
|
|
33
|
+
delete process.env.CLAUDECODE;
|
|
24
34
|
});
|
|
25
35
|
afterEach(() => {
|
|
26
36
|
process.env = { ...origEnv };
|
|
27
37
|
});
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
38
|
+
// -------------------------------------------------------------------------
|
|
39
|
+
// Auto-detection
|
|
40
|
+
// -------------------------------------------------------------------------
|
|
41
|
+
it("defaults to claude-cli from a plain terminal", () => {
|
|
32
42
|
const client = createLlmClient();
|
|
33
|
-
|
|
34
|
-
expect(result).toBe("Generated response");
|
|
35
|
-
expect(mockCreate).toHaveBeenCalledOnce();
|
|
43
|
+
expect(client.model).toBe("claude-sonnet");
|
|
36
44
|
});
|
|
37
|
-
it("
|
|
38
|
-
|
|
39
|
-
content: [{ type: "text", text: "ok" }],
|
|
40
|
-
});
|
|
45
|
+
it("auto-detects ollama inside Claude Code session", () => {
|
|
46
|
+
process.env.CLAUDECODE = "1";
|
|
41
47
|
const client = createLlmClient();
|
|
42
|
-
|
|
43
|
-
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ model: "claude-sonnet-4-20250514" }), expect.anything());
|
|
48
|
+
expect(client.model).toBe("llama3.1:8b");
|
|
44
49
|
});
|
|
45
|
-
it("
|
|
46
|
-
process.env.
|
|
47
|
-
mockCreate.mockResolvedValue({
|
|
48
|
-
content: [{ type: "text", text: "ok" }],
|
|
49
|
-
});
|
|
50
|
+
it("auto-detects anthropic when ANTHROPIC_API_KEY is set", () => {
|
|
51
|
+
process.env.ANTHROPIC_API_KEY = "test-key";
|
|
50
52
|
const client = createLlmClient();
|
|
51
|
-
|
|
52
|
-
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ model: "claude-opus-4-20250514" }), expect.anything());
|
|
53
|
+
expect(client.model).toBe("claude-sonnet-4-20250514");
|
|
53
54
|
});
|
|
54
|
-
it("
|
|
55
|
-
|
|
55
|
+
it("CLAUDECODE takes priority over ANTHROPIC_API_KEY for auto-detection", () => {
|
|
56
|
+
process.env.CLAUDECODE = "1";
|
|
57
|
+
process.env.ANTHROPIC_API_KEY = "test-key";
|
|
56
58
|
const client = createLlmClient();
|
|
57
|
-
|
|
59
|
+
expect(client.model).toBe("llama3.1:8b");
|
|
58
60
|
});
|
|
59
|
-
it("
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
});
|
|
61
|
+
it("explicit VSKILL_EVAL_PROVIDER overrides auto-detection", () => {
|
|
62
|
+
process.env.VSKILL_EVAL_PROVIDER = "ollama";
|
|
63
|
+
process.env.ANTHROPIC_API_KEY = "test-key";
|
|
63
64
|
const client = createLlmClient();
|
|
64
|
-
|
|
65
|
-
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({
|
|
66
|
-
system: "my system prompt",
|
|
67
|
-
messages: [{ role: "user", content: "my user prompt" }],
|
|
68
|
-
max_tokens: 4096,
|
|
69
|
-
}), expect.anything());
|
|
65
|
+
expect(client.model).toBe("llama3.1:8b");
|
|
70
66
|
});
|
|
71
|
-
it("throws
|
|
72
|
-
|
|
73
|
-
expect(() => createLlmClient()).toThrow(
|
|
67
|
+
it("throws on unknown provider", () => {
|
|
68
|
+
process.env.VSKILL_EVAL_PROVIDER = "gpt-magic";
|
|
69
|
+
expect(() => createLlmClient()).toThrow('Unknown VSKILL_EVAL_PROVIDER: "gpt-magic"');
|
|
74
70
|
});
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
71
|
+
// -------------------------------------------------------------------------
|
|
72
|
+
// Anthropic provider
|
|
73
|
+
// -------------------------------------------------------------------------
|
|
74
|
+
describe("anthropic provider", () => {
|
|
75
|
+
beforeEach(() => {
|
|
76
|
+
process.env.VSKILL_EVAL_PROVIDER = "anthropic";
|
|
77
|
+
process.env.ANTHROPIC_API_KEY = "test-key";
|
|
78
|
+
});
|
|
79
|
+
it("returns text content on successful generate call", async () => {
|
|
80
|
+
mockCreate.mockResolvedValue({
|
|
81
|
+
content: [{ type: "text", text: "Generated response" }],
|
|
82
|
+
});
|
|
83
|
+
const client = createLlmClient();
|
|
84
|
+
const result = await client.generate("system prompt", "user prompt");
|
|
85
|
+
expect(result).toBe("Generated response");
|
|
86
|
+
expect(mockCreate).toHaveBeenCalledOnce();
|
|
87
|
+
});
|
|
88
|
+
it("uses default model claude-sonnet-4-20250514", async () => {
|
|
89
|
+
mockCreate.mockResolvedValue({
|
|
90
|
+
content: [{ type: "text", text: "ok" }],
|
|
91
|
+
});
|
|
92
|
+
const client = createLlmClient();
|
|
93
|
+
await client.generate("sys", "usr");
|
|
94
|
+
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ model: "claude-sonnet-4-20250514" }), expect.anything());
|
|
95
|
+
});
|
|
96
|
+
it("uses custom model from VSKILL_EVAL_MODEL", async () => {
|
|
97
|
+
process.env.VSKILL_EVAL_MODEL = "claude-opus-4-20250514";
|
|
98
|
+
mockCreate.mockResolvedValue({
|
|
99
|
+
content: [{ type: "text", text: "ok" }],
|
|
100
|
+
});
|
|
101
|
+
const client = createLlmClient();
|
|
102
|
+
await client.generate("sys", "usr");
|
|
103
|
+
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ model: "claude-opus-4-20250514" }), expect.anything());
|
|
104
|
+
});
|
|
105
|
+
it("propagates network error from SDK", async () => {
|
|
106
|
+
mockCreate.mockRejectedValue(new Error("Connection timeout"));
|
|
107
|
+
const client = createLlmClient();
|
|
108
|
+
await expect(client.generate("sys", "usr")).rejects.toThrow("Connection timeout");
|
|
109
|
+
});
|
|
110
|
+
it("passes system and user prompts correctly", async () => {
|
|
111
|
+
mockCreate.mockResolvedValue({
|
|
112
|
+
content: [{ type: "text", text: "ok" }],
|
|
113
|
+
});
|
|
114
|
+
const client = createLlmClient();
|
|
115
|
+
await client.generate("my system prompt", "my user prompt");
|
|
116
|
+
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({
|
|
117
|
+
system: "my system prompt",
|
|
118
|
+
messages: [{ role: "user", content: "my user prompt" }],
|
|
119
|
+
max_tokens: 4096,
|
|
120
|
+
}), expect.anything());
|
|
121
|
+
});
|
|
122
|
+
it("throws when ANTHROPIC_API_KEY is not set", () => {
|
|
123
|
+
delete process.env.ANTHROPIC_API_KEY;
|
|
124
|
+
expect(() => createLlmClient()).toThrow("ANTHROPIC_API_KEY is not set");
|
|
125
|
+
});
|
|
78
126
|
});
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
127
|
+
// -------------------------------------------------------------------------
|
|
128
|
+
// Claude CLI provider
|
|
129
|
+
// -------------------------------------------------------------------------
|
|
130
|
+
describe("claude-cli provider", () => {
|
|
131
|
+
beforeEach(() => {
|
|
132
|
+
process.env.VSKILL_EVAL_PROVIDER = "claude-cli";
|
|
133
|
+
});
|
|
134
|
+
it("calls claude CLI with --model flag", async () => {
|
|
135
|
+
mockExecFile.mockResolvedValue({ stdout: "CLI response\n" });
|
|
136
|
+
const client = createLlmClient();
|
|
137
|
+
const result = await client.generate("system prompt", "user prompt");
|
|
138
|
+
expect(result).toBe("CLI response");
|
|
139
|
+
expect(mockExecFile).toHaveBeenCalledWith("claude", ["-p", "system prompt\n\nuser prompt", "--model", "sonnet", "--no-input"], expect.objectContaining({ timeout: 120_000 }));
|
|
140
|
+
});
|
|
141
|
+
it("defaults to sonnet model", () => {
|
|
142
|
+
const client = createLlmClient();
|
|
143
|
+
expect(client.model).toBe("claude-sonnet");
|
|
144
|
+
});
|
|
145
|
+
it("passes custom model from VSKILL_EVAL_MODEL", async () => {
|
|
146
|
+
process.env.VSKILL_EVAL_MODEL = "opus";
|
|
147
|
+
mockExecFile.mockResolvedValue({ stdout: "ok\n" });
|
|
148
|
+
const client = createLlmClient();
|
|
149
|
+
expect(client.model).toBe("claude-opus");
|
|
150
|
+
await client.generate("sys", "usr");
|
|
151
|
+
expect(mockExecFile).toHaveBeenCalledWith("claude", expect.arrayContaining(["--model", "opus"]), expect.anything());
|
|
152
|
+
});
|
|
153
|
+
it("throws helpful error when claude CLI not found", async () => {
|
|
154
|
+
const err = new Error("ENOENT");
|
|
155
|
+
err.code = "ENOENT";
|
|
156
|
+
mockExecFile.mockRejectedValue(err);
|
|
157
|
+
const client = createLlmClient();
|
|
158
|
+
await expect(client.generate("sys", "usr")).rejects.toThrow("Claude CLI not found");
|
|
159
|
+
});
|
|
160
|
+
it("throws when explicitly selected inside Claude Code session", () => {
|
|
161
|
+
process.env.CLAUDECODE = "1";
|
|
162
|
+
expect(() => createLlmClient()).toThrow("Cannot use claude-cli provider inside a Claude Code session");
|
|
163
|
+
});
|
|
164
|
+
});
|
|
165
|
+
// -------------------------------------------------------------------------
|
|
166
|
+
// Ollama provider
|
|
167
|
+
// -------------------------------------------------------------------------
|
|
168
|
+
describe("ollama provider", () => {
|
|
169
|
+
beforeEach(() => {
|
|
170
|
+
process.env.VSKILL_EVAL_PROVIDER = "ollama";
|
|
171
|
+
});
|
|
172
|
+
it("uses default model llama3.1:8b", () => {
|
|
173
|
+
const client = createLlmClient();
|
|
174
|
+
expect(client.model).toBe("llama3.1:8b");
|
|
175
|
+
});
|
|
176
|
+
it("uses custom model from VSKILL_EVAL_MODEL", () => {
|
|
177
|
+
process.env.VSKILL_EVAL_MODEL = "qwen2.5:32b";
|
|
178
|
+
const client = createLlmClient();
|
|
179
|
+
expect(client.model).toBe("qwen2.5:32b");
|
|
180
|
+
});
|
|
181
|
+
it("calls Ollama HTTP API with correct payload", async () => {
|
|
182
|
+
const mockFetch = vi.spyOn(globalThis, "fetch").mockResolvedValue(new Response(JSON.stringify({ response: "Ollama reply" }), { status: 200 }));
|
|
183
|
+
const client = createLlmClient();
|
|
184
|
+
const result = await client.generate("system prompt", "user prompt");
|
|
185
|
+
expect(result).toBe("Ollama reply");
|
|
186
|
+
expect(mockFetch).toHaveBeenCalledWith("http://localhost:11434/api/generate", expect.objectContaining({
|
|
187
|
+
method: "POST",
|
|
188
|
+
body: expect.stringContaining('"model":"llama3.1:8b"'),
|
|
189
|
+
}));
|
|
190
|
+
mockFetch.mockRestore();
|
|
191
|
+
});
|
|
192
|
+
it("uses custom base URL from OLLAMA_BASE_URL", async () => {
|
|
193
|
+
process.env.OLLAMA_BASE_URL = "http://gpu-server:11434";
|
|
194
|
+
const mockFetch = vi.spyOn(globalThis, "fetch").mockResolvedValue(new Response(JSON.stringify({ response: "ok" }), { status: 200 }));
|
|
195
|
+
const client = createLlmClient();
|
|
196
|
+
await client.generate("sys", "usr");
|
|
197
|
+
expect(mockFetch).toHaveBeenCalledWith("http://gpu-server:11434/api/generate", expect.anything());
|
|
198
|
+
mockFetch.mockRestore();
|
|
199
|
+
});
|
|
83
200
|
});
|
|
84
201
|
});
|
|
85
202
|
//# sourceMappingURL=llm.test.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"llm.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/llm.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,QAAQ,CAAC;AAEzE,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,MAAM,UAAU,GAAG,EAAE,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"llm.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/llm.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,QAAQ,CAAC;AAEzE,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,MAAM,UAAU,GAAG,EAAE,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;AAC7C,MAAM,YAAY,GAAG,EAAE,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;AAE/C,EAAE,CAAC,IAAI,CAAC,mBAAmB,EAAE,GAAG,EAAE,CAAC,CAAC;IAClC,OAAO,EAAE,MAAM,aAAa;QAC1B,QAAQ,GAAG,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC;KACnC;CACF,CAAC,CAAC,CAAC;AAEJ,EAAE,CAAC,IAAI,CAAC,oBAAoB,EAAE,GAAG,EAAE,CAAC,CAAC;IACnC,QAAQ,EAAE,YAAY;CACvB,CAAC,CAAC,CAAC;AAEJ,EAAE,CAAC,IAAI,CAAC,WAAW,EAAE,GAAG,EAAE,CAAC,CAAC;IAC1B,SAAS,EAAE,CAAC,EAAO,EAAE,EAAE,CAAC,EAAE;CAC3B,CAAC,CAAC,CAAC;AAEJ,8EAA8E;AAC9E,uCAAuC;AACvC,8EAA8E;AAE9E,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,WAAW,CAAC,CAAC;AAEtD,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,MAAM,OAAO,GAAG,EAAE,GAAG,OAAO,CAAC,GAAG,EAAE,CAAC;IAEnC,UAAU,CAAC,GAAG,EAAE;QACd,EAAE,CAAC,aAAa,EAAE,CAAC;QACnB,OAAO,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC;QACxC,OAAO,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;QACrC,OAAO,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;QACrC,OAAO,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC;QACnC,OAAO,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC;IAChC,CAAC,CAAC,CAAC;IAEH,SAAS,CAAC,GAAG,EAAE;QACb,OAAO,CAAC,GAAG,GAAG,EAAE,GAAG,OAAO,EAAE,CAAC;IAC/B,CAAC,CAAC,CAAC;IAEH,4EAA4E;IAC5E,iBAAiB;IACjB,4EAA4E;IAE5E,EAAE,CAAC,8CAA8C,EAAE,GAAG,EAAE;QACtD,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;IAC7C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,GAAG,EAAE;QACxD,OAAO,CAAC,GAAG,CAAC,UAAU,GAAG,GAAG,CAAC;QAC7B,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,sDAAsD,EAAE,GAAG,EAAE;QAC9D,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,UAAU,CAAC;QAC3C,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;IACxD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,qEAAqE,EAAE,GAAG,EAAE;QAC7E,OAAO,CAAC,GAAG,CAAC,UAAU,GAAG,GAAG,CAAC;QAC7B,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,UAAU,CAAC;QAC3C,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wDAAwD,EAAE,GAAG,EAAE;QAChE,OAAO,CAAC,GAAG,CAAC,oBAAoB,GAAG,QAAQ,CAAC;QAC5C,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,UAAU,CAAC;QAC3C,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,4BAA4B,EAAE,GAAG,EAAE;QACpC,OAAO,CAAC,GAAG,CAAC,oBAAoB,GAAG,WAAW,CAAC;QAC/C,MAAM,CAAC,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC,CAAC,OAAO,CAAC,2CAA2C,CAAC,CAAC;IACvF,CAAC,CAAC,CAAC;IAEH,4EAA4E;IAC5E,qBAAqB;IACrB,4EAA4E;IAE5E,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;QAClC,UAAU,CAAC,GAAG,EAAE;YACd,OAAO,CAAC,GAAG,CAAC,oBAAoB,GAAG,WAAW,CAAC;YAC/C,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,UAAU,CAAC;QAC7C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;YAChE,UAAU,CAAC,iBAAiB,CAAC;gBAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,oBAAoB,EAAE,CAAC;aACxD,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,eAAe,EAAE,aAAa,CAAC,CAAC;YAErE,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;YAC1C,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,EAAE,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,UAAU,CAAC,iBAAiB,CAAC;gBAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;aACxC,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAEpC,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC,EAAE,KAAK,EAAE,0BAA0B,EAAE,CAAC,EAC9D,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;YACxD,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,wBAAwB,CAAC;YACzD,UAAU,CAAC,iBAAiB,CAAC;gBAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;aACxC,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAEpC,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC,EAAE,KAAK,EAAE,wBAAwB,EAAE,CAAC,EAC5D,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,mCAAmC,EAAE,KAAK,IAAI,EAAE;YACjD,UAAU,CAAC,iBAAiB,CAAC,IAAI,KAAK,CAAC,oBAAoB,CAAC,CAAC,CAAC;YAE9D,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CACzD,oBAAoB,CACrB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;YACxD,UAAU,CAAC,iBAAiB,CAAC;gBAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;aACxC,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,kBAAkB,EAAE,gBAAgB,CAAC,CAAC;YAE5D,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC;gBACtB,MAAM,EAAE,kBAAkB;gBAC1B,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,EAAE,CAAC;gBACvD,UAAU,EAAE,IAAI;aACjB,CAAC,EACF,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,OAAO,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;YACrC,MAAM,CAAC,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC,CAAC,OAAO,CAAC,8BAA8B,CAAC,CAAC;QAC1E,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,4EAA4E;IAC5E,sBAAsB;IACtB,4EAA4E;IAE5E,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;QACnC,UAAU,CAAC,GAAG,EAAE;YACd,OAAO,CAAC,GAAG,CAAC,oBAAoB,GAAG,YAAY,CAAC;QAClD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;YAClD,YAAY,CAAC,iBAAiB,CAAC,EAAE,MAAM,EAAE,gBAAgB,EAAE,CAAC,CAAC;YAE7D,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,eAAe,EAAE,aAAa,CAAC,CAAC;YAErE,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YACpC,MAAM,CAAC,YAAY,CAAC,CAAC,oBAAoB,CACvC,QAAQ,EACR,CAAC,IAAI,EAAE,8BAA8B,EAAE,SAAS,EAAE,QAAQ,EAAE,YAAY,CAAC,EACzE,MAAM,CAAC,gBAAgB,CAAC,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC,CAC9C,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0BAA0B,EAAE,GAAG,EAAE;YAClC,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QAC7C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;YAC1D,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,MAAM,CAAC;YACvC,YAAY,CAAC,iBAAiB,CAAC,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;YAEnD,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;YACzC,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAEpC,MAAM,CAAC,YAAY,CAAC,CAAC,oBAAoB,CACvC,QAAQ,EACR,MAAM,CAAC,eAAe,CAAC,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC,EAC3C,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;YAC9D,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,QAAQ,CAAQ,CAAC;YACvC,GAAG,CAAC,IAAI,GAAG,QAAQ,CAAC;YACpB,YAAY,CAAC,iBAAiB,CAAC,GAAG,CAAC,CAAC;YAEpC,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CACzD,sBAAsB,CACvB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4DAA4D,EAAE,GAAG,EAAE;YACpE,OAAO,CAAC,GAAG,CAAC,UAAU,GAAG,GAAG,CAAC;YAC7B,MAAM,CAAC,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC,CAAC,OAAO,CACrC,6DAA6D,CAC9D,CAAC;QACJ,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,4EAA4E;IAC5E,kBAAkB;IAClB,4EAA4E;IAE5E,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;QAC/B,UAAU,CAAC,GAAG,EAAE;YACd,OAAO,CAAC,GAAG,CAAC,oBAAoB,GAAG,QAAQ,CAAC;QAC9C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gCAAgC,EAAE,GAAG,EAAE;YACxC,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC3C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,aAAa,CAAC;YAC9C,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC3C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;YAC1D,MAAM,SAAS,GAAG,EAAE,CAAC,KAAK,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC,iBAAiB,CAC/D,IAAI,QAAQ,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,cAAc,EAAE,CAAC,EAAE,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAC5E,CAAC;YAEF,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,eAAe,EAAE,aAAa,CAAC,CAAC;YAErE,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YACpC,MAAM,CAAC,SAAS,CAAC,CAAC,oBAAoB,CACpC,qCAAqC,EACrC,MAAM,CAAC,gBAAgB,CAAC;gBACtB,MAAM,EAAE,MAAM;gBACd,IAAI,EAAE,MAAM,CAAC,gBAAgB,CAAC,uBAAuB,CAAC;aACvD,CAAC,CACH,CAAC;YAEF,SAAS,CAAC,WAAW,EAAE,CAAC;QAC1B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;YACzD,OAAO,CAAC,GAAG,CAAC,eAAe,GAAG,yBAAyB,CAAC;YAExD,MAAM,SAAS,GAAG,EAAE,CAAC,KAAK,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC,iBAAiB,CAC/D,IAAI,QAAQ,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,EAAE,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAClE,CAAC;YAEF,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAEpC,MAAM,CAAC,SAAS,CAAC,CAAC,oBAAoB,CACpC,sCAAsC,EACtC,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;YAEF,SAAS,CAAC,WAAW,EAAE,CAAC;QAC1B,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -7,14 +7,25 @@ import { scanSkills } from "../skill-scanner.js";
|
|
|
7
7
|
// Test helpers
|
|
8
8
|
// ---------------------------------------------------------------------------
|
|
9
9
|
let testDir;
|
|
10
|
+
/** Create a skill in plugin layout: {root}/{plugin}/skills/{skill}/ */
|
|
10
11
|
function createSkill(plugin, skill, opts = {}) {
|
|
11
12
|
const skillDir = join(testDir, plugin, "skills", skill);
|
|
12
13
|
mkdirSync(skillDir, { recursive: true });
|
|
13
14
|
writeFileSync(join(skillDir, "SKILL.md"), `# ${skill}`);
|
|
15
|
+
addEvalFiles(skillDir, opts);
|
|
16
|
+
}
|
|
17
|
+
/** Create a skill in root layout: {root}/skills/{skill}/ */
|
|
18
|
+
function createRootSkill(skill, opts = {}) {
|
|
19
|
+
const skillDir = join(testDir, "skills", skill);
|
|
20
|
+
mkdirSync(skillDir, { recursive: true });
|
|
21
|
+
writeFileSync(join(skillDir, "SKILL.md"), `# ${skill}`);
|
|
22
|
+
addEvalFiles(skillDir, opts);
|
|
23
|
+
}
|
|
24
|
+
function addEvalFiles(skillDir, opts) {
|
|
14
25
|
if (opts.evals) {
|
|
15
26
|
const evalsDir = join(skillDir, "evals");
|
|
16
27
|
mkdirSync(evalsDir, { recursive: true });
|
|
17
|
-
writeFileSync(join(evalsDir, "evals.json"), JSON.stringify({ skill_name:
|
|
28
|
+
writeFileSync(join(evalsDir, "evals.json"), JSON.stringify({ skill_name: "test", evals: [] }));
|
|
18
29
|
}
|
|
19
30
|
if (opts.benchmark) {
|
|
20
31
|
const evalsDir = join(skillDir, "evals");
|
|
@@ -33,6 +44,7 @@ describe("scanSkills", () => {
|
|
|
33
44
|
afterEach(() => {
|
|
34
45
|
rmSync(testDir, { recursive: true, force: true });
|
|
35
46
|
});
|
|
47
|
+
// --- Plugin layout (existing) ---
|
|
36
48
|
it("discovers skills in plugins directory", async () => {
|
|
37
49
|
createSkill("marketing", "social-media-posting");
|
|
38
50
|
createSkill("devtools", "code-review");
|
|
@@ -74,5 +86,32 @@ describe("scanSkills", () => {
|
|
|
74
86
|
const skills = await scanSkills(testDir);
|
|
75
87
|
expect(skills).toEqual([]);
|
|
76
88
|
});
|
|
89
|
+
// --- Root layout (new) ---
|
|
90
|
+
it("discovers root-level skills in skills/ directory", async () => {
|
|
91
|
+
createRootSkill("my-skill");
|
|
92
|
+
const skills = await scanSkills(testDir);
|
|
93
|
+
expect(skills).toHaveLength(1);
|
|
94
|
+
expect(skills[0].skill).toBe("my-skill");
|
|
95
|
+
});
|
|
96
|
+
it("uses root dirname as plugin name for root-level skills", async () => {
|
|
97
|
+
createRootSkill("my-skill");
|
|
98
|
+
const skills = await scanSkills(testDir);
|
|
99
|
+
// plugin name = basename of the root dir
|
|
100
|
+
expect(skills[0].plugin).toBe(testDir.split("/").pop());
|
|
101
|
+
});
|
|
102
|
+
it("discovers both plugin and root-level skills together", async () => {
|
|
103
|
+
createSkill("marketing", "social-media-posting");
|
|
104
|
+
createRootSkill("standalone-skill");
|
|
105
|
+
const skills = await scanSkills(testDir);
|
|
106
|
+
expect(skills).toHaveLength(2);
|
|
107
|
+
const names = skills.map((s) => s.skill).sort();
|
|
108
|
+
expect(names).toEqual(["social-media-posting", "standalone-skill"]);
|
|
109
|
+
});
|
|
110
|
+
it("handles root-level skills with evals", async () => {
|
|
111
|
+
createRootSkill("my-skill", { evals: true, benchmark: true });
|
|
112
|
+
const skills = await scanSkills(testDir);
|
|
113
|
+
expect(skills[0].hasEvals).toBe(true);
|
|
114
|
+
expect(skills[0].hasBenchmark).toBe(true);
|
|
115
|
+
});
|
|
77
116
|
});
|
|
78
117
|
//# sourceMappingURL=skill-scanner.test.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"skill-scanner.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/skill-scanner.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,QAAQ,CAAC;AACrE,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AAC3D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AACjC,OAAO,EAAE,UAAU,EAAE,MAAM,qBAAqB,CAAC;AAEjD,8EAA8E;AAC9E,eAAe;AACf,8EAA8E;AAE9E,IAAI,OAAe,CAAC;AAEpB,SAAS,WAAW,CAClB,MAAc,EACd,KAAa,EACb,OAAiD,EAAE;IAEnD,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC;IACxD,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzC,aAAa,CAAC,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,EAAE,KAAK,KAAK,EAAE,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"skill-scanner.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/skill-scanner.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,QAAQ,CAAC;AACrE,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AAC3D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AACjC,OAAO,EAAE,UAAU,EAAE,MAAM,qBAAqB,CAAC;AAEjD,8EAA8E;AAC9E,eAAe;AACf,8EAA8E;AAE9E,IAAI,OAAe,CAAC;AAEpB,uEAAuE;AACvE,SAAS,WAAW,CAClB,MAAc,EACd,KAAa,EACb,OAAiD,EAAE;IAEnD,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC;IACxD,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzC,aAAa,CAAC,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,EAAE,KAAK,KAAK,EAAE,CAAC,CAAC;IACxD,YAAY,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC;AAC/B,CAAC;AAED,4DAA4D;AAC5D,SAAS,eAAe,CACtB,KAAa,EACb,OAAiD,EAAE;IAEnD,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC;IAChD,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzC,aAAa,CAAC,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,EAAE,KAAK,KAAK,EAAE,CAAC,CAAC;IACxD,YAAY,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC;AAC/B,CAAC;AAED,SAAS,YAAY,CACnB,QAAgB,EAChB,IAA8C;IAE9C,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;QACf,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QACzC,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QACzC,aAAa,CACX,IAAI,CAAC,QAAQ,EAAE,YAAY,CAAC,EAC5B,IAAI,CAAC,SAAS,CAAC,EAAE,UAAU,EAAE,MAAM,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC,CAClD,CAAC;IACJ,CAAC;IAED,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;QACnB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QACzC,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QACzC,aAAa,CACX,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC,EAChC,IAAI,CAAC,SAAS,CAAC,EAAE,SAAS,EAAE,sBAAsB,EAAE,CAAC,CACtD,CAAC;IACJ,CAAC;AACH,CAAC;AAED,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,QAAQ,CAAC,YAAY,EAAE,GAAG,EAAE;IAC1B,UAAU,CAAC,GAAG,EAAE;QACd,OAAO,GAAG,IAAI,CAAC,MAAM,EAAE,EAAE,eAAe,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QACtD,SAAS,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC1C,CAAC,CAAC,CAAC;IAEH,SAAS,CAAC,GAAG,EAAE;QACb,MAAM,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IACpD,CAAC,CAAC,CAAC;IAEH,mCAAmC;IAEnC,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;QACrD,WAAW,CAAC,WAAW,EAAE,sBAAsB,CAAC,CAAC;QACjD,WAAW,CAAC,UAAU,EAAE,aAAa,CAAC,CAAC;QAEvC,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QAEzC,MAAM,CAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC;QAChD,MAAM,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC,aAAa,EAAE,sBAAsB,CAAC,CAAC,CAAC;IACjE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;QACzD,WAAW,CAAC,WAAW,EAAE,sBAAsB,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;QAElE,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QAEzC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACxC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE;QAC7D,WAAW,CAAC,WAAW,EAAE,sBAAsB,CAAC,CAAC;QAEjD,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QAEzC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACzC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,mDAAmD,EAAE,KAAK,IAAI,EAAE;QACjE,WAAW,CAAC,WAAW,EAAE,sBAAsB,EAAE;YAC/C,KAAK,EAAE,IAAI;YACX,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QAEzC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC5C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uDAAuD,EAAE,KAAK,IAAI,EAAE;QACrE,WAAW,CAAC,WAAW,EAAE,sBAAsB,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;QAElE,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QAEzC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAC7C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wCAAwC,EAAE,KAAK,IAAI,EAAE;QACtD,WAAW,CAAC,WAAW,EAAE,sBAAsB,CAAC,CAAC;QAEjD,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QAEzC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QAC3C,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,sBAAsB,CAAC,CAAC;IACvD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;QAClD,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QACzC,MAAM,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;IAC7B,CAAC,CAAC,CAAC;IAEH,4BAA4B;IAE5B,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;QAChE,eAAe,CAAC,UAAU,CAAC,CAAC;QAE5B,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QAEzC,MAAM,CAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAC/B,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wDAAwD,EAAE,KAAK,IAAI,EAAE;QACtE,eAAe,CAAC,UAAU,CAAC,CAAC;QAE5B,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QAEzC,yCAAyC;QACzC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,sDAAsD,EAAE,KAAK,IAAI,EAAE;QACpE,WAAW,CAAC,WAAW,EAAE,sBAAsB,CAAC,CAAC;QACjD,eAAe,CAAC,kBAAkB,CAAC,CAAC;QAEpC,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QAEzC,MAAM,CAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC;QAChD,MAAM,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC,sBAAsB,EAAE,kBAAkB,CAAC,CAAC,CAAC;IACtE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;QACpD,eAAe,CAAC,UAAU,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAE9D,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;QAEzC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACtC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC5C,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { computeVerdict, verdictColor } from "../verdict.js";
|
|
3
|
+
describe("computeVerdict", () => {
|
|
4
|
+
it("returns EFFECTIVE when passRate >= 0.8 and skill rubric > baseline + 1", () => {
|
|
5
|
+
expect(computeVerdict(0.85, 4.5, 3.0)).toBe("EFFECTIVE");
|
|
6
|
+
expect(computeVerdict(0.80, 4.0, 2.5)).toBe("EFFECTIVE");
|
|
7
|
+
expect(computeVerdict(1.0, 5.0, 1.0)).toBe("EFFECTIVE");
|
|
8
|
+
});
|
|
9
|
+
it("returns MARGINAL when passRate >= 0.6 and skill rubric > baseline (but not EFFECTIVE)", () => {
|
|
10
|
+
expect(computeVerdict(0.70, 3.5, 3.0)).toBe("MARGINAL");
|
|
11
|
+
expect(computeVerdict(0.60, 2.5, 2.0)).toBe("MARGINAL");
|
|
12
|
+
// High pass rate but rubric only slightly better → MARGINAL
|
|
13
|
+
expect(computeVerdict(0.85, 3.5, 3.0)).toBe("MARGINAL");
|
|
14
|
+
});
|
|
15
|
+
it("returns INEFFECTIVE when passRate >= 0.4 (but not MARGINAL)", () => {
|
|
16
|
+
expect(computeVerdict(0.50, 2.5, 3.0)).toBe("INEFFECTIVE");
|
|
17
|
+
expect(computeVerdict(0.45, 3.0, 3.0)).toBe("INEFFECTIVE");
|
|
18
|
+
expect(computeVerdict(0.40, 1.0, 5.0)).toBe("INEFFECTIVE");
|
|
19
|
+
});
|
|
20
|
+
it("returns DEGRADING when passRate < 0.4", () => {
|
|
21
|
+
expect(computeVerdict(0.30, 2.0, 3.0)).toBe("DEGRADING");
|
|
22
|
+
expect(computeVerdict(0.10, 1.0, 1.0)).toBe("DEGRADING");
|
|
23
|
+
expect(computeVerdict(0.0, 0.0, 0.0)).toBe("DEGRADING");
|
|
24
|
+
expect(computeVerdict(0.39, 5.0, 1.0)).toBe("DEGRADING");
|
|
25
|
+
});
|
|
26
|
+
it("handles boundary values correctly", () => {
|
|
27
|
+
// Exactly 0.8 pass rate, exactly +1 rubric → EFFECTIVE
|
|
28
|
+
expect(computeVerdict(0.8, 4.0, 2.9)).toBe("EFFECTIVE");
|
|
29
|
+
// 0.8 pass rate but rubric diff exactly 1 → NOT EFFECTIVE (needs >1)
|
|
30
|
+
expect(computeVerdict(0.8, 4.0, 3.0)).toBe("MARGINAL");
|
|
31
|
+
// Exactly 0.6 pass rate, skill > baseline → MARGINAL
|
|
32
|
+
expect(computeVerdict(0.6, 3.1, 3.0)).toBe("MARGINAL");
|
|
33
|
+
// Exactly 0.6 pass rate, skill = baseline → INEFFECTIVE
|
|
34
|
+
expect(computeVerdict(0.6, 3.0, 3.0)).toBe("INEFFECTIVE");
|
|
35
|
+
// Exactly 0.4 pass rate → INEFFECTIVE
|
|
36
|
+
expect(computeVerdict(0.4, 3.0, 3.0)).toBe("INEFFECTIVE");
|
|
37
|
+
});
|
|
38
|
+
});
|
|
39
|
+
describe("verdictColor", () => {
|
|
40
|
+
it("returns correct colors for each verdict", () => {
|
|
41
|
+
expect(verdictColor("EFFECTIVE")).toBe("green");
|
|
42
|
+
expect(verdictColor("MARGINAL")).toBe("yellow");
|
|
43
|
+
expect(verdictColor("INEFFECTIVE")).toBe("orange");
|
|
44
|
+
expect(verdictColor("DEGRADING")).toBe("red");
|
|
45
|
+
});
|
|
46
|
+
});
|
|
47
|
+
//# sourceMappingURL=verdict.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"verdict.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/verdict.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,eAAe,CAAC;AAE7D,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;IAC9B,EAAE,CAAC,wEAAwE,EAAE,GAAG,EAAE;QAChF,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACzD,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACzD,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uFAAuF,EAAE,GAAG,EAAE;QAC/F,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACxD,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACxD,4DAA4D;QAC5D,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6DAA6D,EAAE,GAAG,EAAE;QACrE,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC3D,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC3D,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC7D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;QAC/C,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACzD,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACzD,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACxD,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IAC3D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,uDAAuD;QACvD,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACxD,qEAAqE;QACrE,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACvD,qDAAqD;QACrD,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACvD,wDAAwD;QACxD,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC1D,sCAAsC;QACtC,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC5D,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,cAAc,EAAE,GAAG,EAAE;IAC5B,EAAE,CAAC,yCAAyC,EAAE,GAAG,EAAE;QACjD,MAAM,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAChD,MAAM,CAAC,YAAY,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAChD,MAAM,CAAC,YAAY,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACnD,MAAM,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAChD,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import type { LlmClient } from "./llm.js";
|
|
2
|
+
export interface ActivationPrompt {
|
|
3
|
+
prompt: string;
|
|
4
|
+
expected: "should_activate" | "should_not_activate";
|
|
5
|
+
}
|
|
6
|
+
export interface ActivationResult {
|
|
7
|
+
prompt: string;
|
|
8
|
+
expected: "should_activate" | "should_not_activate";
|
|
9
|
+
activate: boolean;
|
|
10
|
+
confidence: "high" | "medium" | "low";
|
|
11
|
+
reasoning: string;
|
|
12
|
+
classification: "TP" | "TN" | "FP" | "FN";
|
|
13
|
+
}
|
|
14
|
+
export interface ActivationSummary {
|
|
15
|
+
results: ActivationResult[];
|
|
16
|
+
precision: number;
|
|
17
|
+
recall: number;
|
|
18
|
+
reliability: number;
|
|
19
|
+
total: number;
|
|
20
|
+
tp: number;
|
|
21
|
+
tn: number;
|
|
22
|
+
fp: number;
|
|
23
|
+
fn: number;
|
|
24
|
+
}
|
|
25
|
+
export declare function testActivation(skillDescription: string, prompts: ActivationPrompt[], client: LlmClient, onResult?: (result: ActivationResult) => void): Promise<ActivationSummary>;
|