vskill 0.2.55 → 0.2.56
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -3
- package/dist/commands/eval/__tests__/run.test.js +7 -2
- package/dist/commands/eval/__tests__/run.test.js.map +1 -1
- package/dist/commands/eval/run.js +24 -3
- package/dist/commands/eval/run.js.map +1 -1
- package/dist/commands/eval/serve.d.ts +1 -0
- package/dist/commands/eval/serve.js +51 -0
- package/dist/commands/eval/serve.js.map +1 -0
- package/dist/commands/eval.d.ts +1 -0
- package/dist/commands/eval.js +6 -1
- package/dist/commands/eval.js.map +1 -1
- package/dist/eval/__tests__/activation-tester.test.d.ts +1 -0
- package/dist/eval/__tests__/activation-tester.test.js +94 -0
- package/dist/eval/__tests__/activation-tester.test.js.map +1 -0
- package/dist/eval/__tests__/benchmark-history.test.d.ts +1 -0
- package/dist/eval/__tests__/benchmark-history.test.js +200 -0
- package/dist/eval/__tests__/benchmark-history.test.js.map +1 -0
- package/dist/eval/__tests__/comparator.test.d.ts +1 -0
- package/dist/eval/__tests__/comparator.test.js +136 -0
- package/dist/eval/__tests__/comparator.test.js.map +1 -0
- package/dist/eval/__tests__/llm.test.js +161 -44
- package/dist/eval/__tests__/llm.test.js.map +1 -1
- package/dist/eval/__tests__/verdict.test.d.ts +1 -0
- package/dist/eval/__tests__/verdict.test.js +47 -0
- package/dist/eval/__tests__/verdict.test.js.map +1 -0
- package/dist/eval/activation-tester.d.ts +25 -0
- package/dist/eval/activation-tester.js +89 -0
- package/dist/eval/activation-tester.js.map +1 -0
- package/dist/eval/benchmark-history.d.ts +23 -0
- package/dist/eval/benchmark-history.js +108 -0
- package/dist/eval/benchmark-history.js.map +1 -0
- package/dist/eval/comparator.d.ts +29 -0
- package/dist/eval/comparator.js +100 -0
- package/dist/eval/comparator.js.map +1 -0
- package/dist/eval/llm.js +119 -6
- package/dist/eval/llm.js.map +1 -1
- package/dist/eval/verdict.d.ts +3 -0
- package/dist/eval/verdict.js +28 -0
- package/dist/eval/verdict.js.map +1 -0
- package/dist/eval-server/api-routes.d.ts +2 -0
- package/dist/eval-server/api-routes.js +425 -0
- package/dist/eval-server/api-routes.js.map +1 -0
- package/dist/eval-server/eval-server.d.ts +6 -0
- package/dist/eval-server/eval-server.js +102 -0
- package/dist/eval-server/eval-server.js.map +1 -0
- package/dist/eval-server/router.d.ts +14 -0
- package/dist/eval-server/router.js +117 -0
- package/dist/eval-server/router.js.map +1 -0
- package/dist/eval-server/sse-helpers.d.ts +4 -0
- package/dist/eval-server/sse-helpers.js +24 -0
- package/dist/eval-server/sse-helpers.js.map +1 -0
- package/dist/eval-ui/assets/index-BYpLv_X1.css +1 -0
- package/dist/eval-ui/assets/index-Od6Ch9-a.js +70 -0
- package/dist/eval-ui/index.html +13 -0
- package/dist/index.js +2 -1
- package/dist/index.js.map +1 -1
- package/package.json +15 -2
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import { describe, it, expect, vi } from "vitest";
|
|
2
|
+
import { generateComparisonOutputs, scoreComparison, runComparison, } from "../comparator.js";
|
|
3
|
+
function mockClient(responses) {
|
|
4
|
+
let callIndex = 0;
|
|
5
|
+
return {
|
|
6
|
+
model: "test-model",
|
|
7
|
+
generate: vi.fn(async () => responses[callIndex++] ?? ""),
|
|
8
|
+
};
|
|
9
|
+
}
|
|
10
|
+
describe("generateComparisonOutputs", () => {
|
|
11
|
+
it("generates skill and baseline outputs sequentially", async () => {
|
|
12
|
+
const client = mockClient(["skill response", "baseline response"]);
|
|
13
|
+
const result = await generateComparisonOutputs("test prompt", "# Skill Content", client);
|
|
14
|
+
expect(result.skillOutput).toBe("skill response");
|
|
15
|
+
expect(result.baselineOutput).toBe("baseline response");
|
|
16
|
+
expect(result.skillDurationMs).toBeGreaterThanOrEqual(0);
|
|
17
|
+
expect(result.baselineDurationMs).toBeGreaterThanOrEqual(0);
|
|
18
|
+
expect(client.generate).toHaveBeenCalledTimes(2);
|
|
19
|
+
// First call should include skill content
|
|
20
|
+
const firstCall = client.generate.mock.calls[0];
|
|
21
|
+
expect(firstCall[0]).toContain("Skill Content");
|
|
22
|
+
// Second call should be generic
|
|
23
|
+
const secondCall = client.generate.mock.calls[1];
|
|
24
|
+
expect(secondCall[0]).toContain("helpful AI assistant");
|
|
25
|
+
});
|
|
26
|
+
});
|
|
27
|
+
describe("scoreComparison", () => {
|
|
28
|
+
it("parses JSON scores from LLM response", async () => {
|
|
29
|
+
const client = mockClient([
|
|
30
|
+
JSON.stringify({
|
|
31
|
+
content_score_a: 4,
|
|
32
|
+
structure_score_a: 3,
|
|
33
|
+
content_score_b: 5,
|
|
34
|
+
structure_score_b: 4,
|
|
35
|
+
winner: "second",
|
|
36
|
+
reasoning: "B is better",
|
|
37
|
+
}),
|
|
38
|
+
]);
|
|
39
|
+
const result = await scoreComparison("output A", "output B", "prompt", client);
|
|
40
|
+
expect(result.contentScoreA).toBe(4);
|
|
41
|
+
expect(result.structureScoreA).toBe(3);
|
|
42
|
+
expect(result.contentScoreB).toBe(5);
|
|
43
|
+
expect(result.structureScoreB).toBe(4);
|
|
44
|
+
expect(result.winner).toBe("second");
|
|
45
|
+
});
|
|
46
|
+
it("parses JSON from code fence", async () => {
|
|
47
|
+
const client = mockClient([
|
|
48
|
+
'```json\n{"content_score_a": 3, "structure_score_a": 3, "content_score_b": 3, "structure_score_b": 3, "winner": "tie"}\n```',
|
|
49
|
+
]);
|
|
50
|
+
const result = await scoreComparison("A", "B", "p", client);
|
|
51
|
+
expect(result.winner).toBe("tie");
|
|
52
|
+
expect(result.contentScoreA).toBe(3);
|
|
53
|
+
});
|
|
54
|
+
it("clamps scores to 1-5 range", async () => {
|
|
55
|
+
const client = mockClient([
|
|
56
|
+
JSON.stringify({
|
|
57
|
+
content_score_a: 0,
|
|
58
|
+
structure_score_a: 10,
|
|
59
|
+
content_score_b: -1,
|
|
60
|
+
structure_score_b: 6,
|
|
61
|
+
winner: "first",
|
|
62
|
+
}),
|
|
63
|
+
]);
|
|
64
|
+
const result = await scoreComparison("A", "B", "p", client);
|
|
65
|
+
expect(result.contentScoreA).toBe(1);
|
|
66
|
+
expect(result.structureScoreA).toBe(5);
|
|
67
|
+
expect(result.contentScoreB).toBe(1);
|
|
68
|
+
expect(result.structureScoreB).toBe(5);
|
|
69
|
+
});
|
|
70
|
+
it("defaults invalid winner to tie", async () => {
|
|
71
|
+
const client = mockClient([
|
|
72
|
+
JSON.stringify({
|
|
73
|
+
content_score_a: 3,
|
|
74
|
+
structure_score_a: 3,
|
|
75
|
+
content_score_b: 3,
|
|
76
|
+
structure_score_b: 3,
|
|
77
|
+
winner: "invalid",
|
|
78
|
+
}),
|
|
79
|
+
]);
|
|
80
|
+
const result = await scoreComparison("A", "B", "p", client);
|
|
81
|
+
expect(result.winner).toBe("tie");
|
|
82
|
+
});
|
|
83
|
+
});
|
|
84
|
+
describe("runComparison", () => {
|
|
85
|
+
it("maps scores back to skill/baseline correctly", async () => {
|
|
86
|
+
// Mock: first two calls = skill + baseline outputs, third = scoring
|
|
87
|
+
const client = mockClient([
|
|
88
|
+
"skill output here",
|
|
89
|
+
"baseline output here",
|
|
90
|
+
JSON.stringify({
|
|
91
|
+
content_score_a: 4,
|
|
92
|
+
structure_score_a: 5,
|
|
93
|
+
content_score_b: 2,
|
|
94
|
+
structure_score_b: 3,
|
|
95
|
+
winner: "first",
|
|
96
|
+
reasoning: "A is better",
|
|
97
|
+
}),
|
|
98
|
+
]);
|
|
99
|
+
// Fix randomness for deterministic test
|
|
100
|
+
vi.spyOn(Math, "random").mockReturnValue(0.3); // < 0.5 → skill is A
|
|
101
|
+
const result = await runComparison("test prompt", "skill content", client);
|
|
102
|
+
expect(result.prompt).toBe("test prompt");
|
|
103
|
+
expect(result.skillOutput).toBe("skill output here");
|
|
104
|
+
expect(result.baselineOutput).toBe("baseline output here");
|
|
105
|
+
// skill is A, so scores map directly
|
|
106
|
+
expect(result.skillContentScore).toBe(4);
|
|
107
|
+
expect(result.skillStructureScore).toBe(5);
|
|
108
|
+
expect(result.baselineContentScore).toBe(2);
|
|
109
|
+
expect(result.baselineStructureScore).toBe(3);
|
|
110
|
+
expect(result.winner).toBe("skill");
|
|
111
|
+
vi.restoreAllMocks();
|
|
112
|
+
});
|
|
113
|
+
it("maps scores correctly when baseline is A", async () => {
|
|
114
|
+
const client = mockClient([
|
|
115
|
+
"skill out",
|
|
116
|
+
"baseline out",
|
|
117
|
+
JSON.stringify({
|
|
118
|
+
content_score_a: 2,
|
|
119
|
+
structure_score_a: 2,
|
|
120
|
+
content_score_b: 4,
|
|
121
|
+
structure_score_b: 4,
|
|
122
|
+
winner: "second",
|
|
123
|
+
}),
|
|
124
|
+
]);
|
|
125
|
+
// > 0.5 → skill is B
|
|
126
|
+
vi.spyOn(Math, "random").mockReturnValue(0.7);
|
|
127
|
+
const result = await runComparison("p", "s", client);
|
|
128
|
+
// skill is B → scores.contentScoreB is the skill's score
|
|
129
|
+
expect(result.skillContentScore).toBe(4);
|
|
130
|
+
expect(result.baselineContentScore).toBe(2);
|
|
131
|
+
// winner "second" = B = skill
|
|
132
|
+
expect(result.winner).toBe("skill");
|
|
133
|
+
vi.restoreAllMocks();
|
|
134
|
+
});
|
|
135
|
+
});
|
|
136
|
+
//# sourceMappingURL=comparator.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"comparator.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/comparator.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAClD,OAAO,EACL,yBAAyB,EACzB,eAAe,EACf,aAAa,GACd,MAAM,kBAAkB,CAAC;AAG1B,SAAS,UAAU,CAAC,SAAmB;IACrC,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,OAAO;QACL,KAAK,EAAE,YAAY;QACnB,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC,KAAK,IAAI,EAAE,CAAC,SAAS,CAAC,SAAS,EAAE,CAAC,IAAI,EAAE,CAAC;KAC1D,CAAC;AACJ,CAAC;AAED,QAAQ,CAAC,2BAA2B,EAAE,GAAG,EAAE;IACzC,EAAE,CAAC,mDAAmD,EAAE,KAAK,IAAI,EAAE;QACjE,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,gBAAgB,EAAE,mBAAmB,CAAC,CAAC,CAAC;QACnE,MAAM,MAAM,GAAG,MAAM,yBAAyB,CAAC,aAAa,EAAE,iBAAiB,EAAE,MAAM,CAAC,CAAC;QAEzF,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;QAClD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QACxD,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QACzD,MAAM,CAAC,MAAM,CAAC,kBAAkB,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,qBAAqB,CAAC,CAAC,CAAC,CAAC;QAEjD,0CAA0C;QAC1C,MAAM,SAAS,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACzD,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;QAEhD,gCAAgC;QAChC,MAAM,UAAU,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC1D,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,sBAAsB,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;QACpD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,QAAQ;gBAChB,SAAS,EAAE,aAAa;aACzB,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,UAAU,EAAE,UAAU,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC;QAC/E,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAA
M,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6BAA6B,EAAE,KAAK,IAAI,EAAE;QAC3C,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,6HAA6H;SAC9H,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAClC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,4BAA4B,EAAE,KAAK,IAAI,EAAE;QAC1C,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,EAAE;gBACrB,eAAe,EAAE,CAAC,CAAC;gBACnB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,OAAO;aAChB,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACzC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gCAAgC,EAAE,KAAK,IAAI,EAAE;QAC9C,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,SAAS;aAClB,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACpC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,eAAe,EAAE,GAAG,EAAE;IAC7B,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;QAC5D,oEAAoE;QACpE,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,mBAAmB;YACnB,sBAAsB;YACtB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,OAAO;gBACf,SAAS,EAAE,aAAa;aACzB,CAAC;SACH,CAAC,CAAC;QAEH,wCAAwC;QACxC,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,qBAAqB;QAEpE,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,aAAa,EAAE,eAAe,EAA
E,MAAM,CAAC,CAAC;QAE3E,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC1C,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QACrD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,sBAAsB,CAAC,CAAC;QAC3D,qCAAqC;QACrC,MAAM,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACzC,MAAM,CAAC,MAAM,CAAC,mBAAmB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3C,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,sBAAsB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC9C,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAEpC,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;QACxD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,WAAW;YACX,cAAc;YACd,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,QAAQ;aACjB,CAAC;SACH,CAAC,CAAC;QAEH,qBAAqB;QACrB,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QAE9C,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QACrD,6CAA6C;QAC7C,MAAM,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACzC,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC5C,8BAA8B;QAC9B,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAEpC,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -3,11 +3,18 @@ import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
|
|
|
3
3
|
// Mocks
|
|
4
4
|
// ---------------------------------------------------------------------------
|
|
5
5
|
const mockCreate = vi.hoisted(() => vi.fn());
|
|
6
|
+
const mockExecFile = vi.hoisted(() => vi.fn());
|
|
6
7
|
vi.mock("@anthropic-ai/sdk", () => ({
|
|
7
8
|
default: class MockAnthropic {
|
|
8
9
|
messages = { create: mockCreate };
|
|
9
10
|
},
|
|
10
11
|
}));
|
|
12
|
+
vi.mock("node:child_process", () => ({
|
|
13
|
+
execFile: mockExecFile,
|
|
14
|
+
}));
|
|
15
|
+
vi.mock("node:util", () => ({
|
|
16
|
+
promisify: (fn) => fn,
|
|
17
|
+
}));
|
|
11
18
|
// ---------------------------------------------------------------------------
|
|
12
19
|
// Import module under test AFTER mocks
|
|
13
20
|
// ---------------------------------------------------------------------------
|
|
@@ -19,67 +26,177 @@ describe("createLlmClient", () => {
|
|
|
19
26
|
const origEnv = { ...process.env };
|
|
20
27
|
beforeEach(() => {
|
|
21
28
|
vi.resetAllMocks();
|
|
22
|
-
process.env.
|
|
29
|
+
delete process.env.VSKILL_EVAL_PROVIDER;
|
|
23
30
|
delete process.env.VSKILL_EVAL_MODEL;
|
|
31
|
+
delete process.env.ANTHROPIC_API_KEY;
|
|
32
|
+
delete process.env.OLLAMA_BASE_URL;
|
|
33
|
+
delete process.env.CLAUDECODE;
|
|
24
34
|
});
|
|
25
35
|
afterEach(() => {
|
|
26
36
|
process.env = { ...origEnv };
|
|
27
37
|
});
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
38
|
+
// -------------------------------------------------------------------------
|
|
39
|
+
// Auto-detection
|
|
40
|
+
// -------------------------------------------------------------------------
|
|
41
|
+
it("defaults to claude-cli from a plain terminal", () => {
|
|
32
42
|
const client = createLlmClient();
|
|
33
|
-
|
|
34
|
-
expect(result).toBe("Generated response");
|
|
35
|
-
expect(mockCreate).toHaveBeenCalledOnce();
|
|
43
|
+
expect(client.model).toBe("claude-sonnet");
|
|
36
44
|
});
|
|
37
|
-
it("
|
|
38
|
-
|
|
39
|
-
content: [{ type: "text", text: "ok" }],
|
|
40
|
-
});
|
|
45
|
+
it("auto-detects ollama inside Claude Code session", () => {
|
|
46
|
+
process.env.CLAUDECODE = "1";
|
|
41
47
|
const client = createLlmClient();
|
|
42
|
-
|
|
43
|
-
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ model: "claude-sonnet-4-20250514" }), expect.anything());
|
|
48
|
+
expect(client.model).toBe("llama3.1:8b");
|
|
44
49
|
});
|
|
45
|
-
it("
|
|
46
|
-
process.env.
|
|
47
|
-
mockCreate.mockResolvedValue({
|
|
48
|
-
content: [{ type: "text", text: "ok" }],
|
|
49
|
-
});
|
|
50
|
+
it("auto-detects anthropic when ANTHROPIC_API_KEY is set", () => {
|
|
51
|
+
process.env.ANTHROPIC_API_KEY = "test-key";
|
|
50
52
|
const client = createLlmClient();
|
|
51
|
-
|
|
52
|
-
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ model: "claude-opus-4-20250514" }), expect.anything());
|
|
53
|
+
expect(client.model).toBe("claude-sonnet-4-20250514");
|
|
53
54
|
});
|
|
54
|
-
it("
|
|
55
|
-
|
|
55
|
+
it("CLAUDECODE takes priority over ANTHROPIC_API_KEY for auto-detection", () => {
|
|
56
|
+
process.env.CLAUDECODE = "1";
|
|
57
|
+
process.env.ANTHROPIC_API_KEY = "test-key";
|
|
56
58
|
const client = createLlmClient();
|
|
57
|
-
|
|
59
|
+
expect(client.model).toBe("llama3.1:8b");
|
|
58
60
|
});
|
|
59
|
-
it("
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
});
|
|
61
|
+
it("explicit VSKILL_EVAL_PROVIDER overrides auto-detection", () => {
|
|
62
|
+
process.env.VSKILL_EVAL_PROVIDER = "ollama";
|
|
63
|
+
process.env.ANTHROPIC_API_KEY = "test-key";
|
|
63
64
|
const client = createLlmClient();
|
|
64
|
-
|
|
65
|
-
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({
|
|
66
|
-
system: "my system prompt",
|
|
67
|
-
messages: [{ role: "user", content: "my user prompt" }],
|
|
68
|
-
max_tokens: 4096,
|
|
69
|
-
}), expect.anything());
|
|
65
|
+
expect(client.model).toBe("llama3.1:8b");
|
|
70
66
|
});
|
|
71
|
-
it("throws
|
|
72
|
-
|
|
73
|
-
expect(() => createLlmClient()).toThrow(
|
|
67
|
+
it("throws on unknown provider", () => {
|
|
68
|
+
process.env.VSKILL_EVAL_PROVIDER = "gpt-magic";
|
|
69
|
+
expect(() => createLlmClient()).toThrow('Unknown VSKILL_EVAL_PROVIDER: "gpt-magic"');
|
|
74
70
|
});
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
71
|
+
// -------------------------------------------------------------------------
|
|
72
|
+
// Anthropic provider
|
|
73
|
+
// -------------------------------------------------------------------------
|
|
74
|
+
describe("anthropic provider", () => {
|
|
75
|
+
beforeEach(() => {
|
|
76
|
+
process.env.VSKILL_EVAL_PROVIDER = "anthropic";
|
|
77
|
+
process.env.ANTHROPIC_API_KEY = "test-key";
|
|
78
|
+
});
|
|
79
|
+
it("returns text content on successful generate call", async () => {
|
|
80
|
+
mockCreate.mockResolvedValue({
|
|
81
|
+
content: [{ type: "text", text: "Generated response" }],
|
|
82
|
+
});
|
|
83
|
+
const client = createLlmClient();
|
|
84
|
+
const result = await client.generate("system prompt", "user prompt");
|
|
85
|
+
expect(result).toBe("Generated response");
|
|
86
|
+
expect(mockCreate).toHaveBeenCalledOnce();
|
|
87
|
+
});
|
|
88
|
+
it("uses default model claude-sonnet-4-20250514", async () => {
|
|
89
|
+
mockCreate.mockResolvedValue({
|
|
90
|
+
content: [{ type: "text", text: "ok" }],
|
|
91
|
+
});
|
|
92
|
+
const client = createLlmClient();
|
|
93
|
+
await client.generate("sys", "usr");
|
|
94
|
+
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ model: "claude-sonnet-4-20250514" }), expect.anything());
|
|
95
|
+
});
|
|
96
|
+
it("uses custom model from VSKILL_EVAL_MODEL", async () => {
|
|
97
|
+
process.env.VSKILL_EVAL_MODEL = "claude-opus-4-20250514";
|
|
98
|
+
mockCreate.mockResolvedValue({
|
|
99
|
+
content: [{ type: "text", text: "ok" }],
|
|
100
|
+
});
|
|
101
|
+
const client = createLlmClient();
|
|
102
|
+
await client.generate("sys", "usr");
|
|
103
|
+
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ model: "claude-opus-4-20250514" }), expect.anything());
|
|
104
|
+
});
|
|
105
|
+
it("propagates network error from SDK", async () => {
|
|
106
|
+
mockCreate.mockRejectedValue(new Error("Connection timeout"));
|
|
107
|
+
const client = createLlmClient();
|
|
108
|
+
await expect(client.generate("sys", "usr")).rejects.toThrow("Connection timeout");
|
|
109
|
+
});
|
|
110
|
+
it("passes system and user prompts correctly", async () => {
|
|
111
|
+
mockCreate.mockResolvedValue({
|
|
112
|
+
content: [{ type: "text", text: "ok" }],
|
|
113
|
+
});
|
|
114
|
+
const client = createLlmClient();
|
|
115
|
+
await client.generate("my system prompt", "my user prompt");
|
|
116
|
+
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({
|
|
117
|
+
system: "my system prompt",
|
|
118
|
+
messages: [{ role: "user", content: "my user prompt" }],
|
|
119
|
+
max_tokens: 4096,
|
|
120
|
+
}), expect.anything());
|
|
121
|
+
});
|
|
122
|
+
it("throws when ANTHROPIC_API_KEY is not set", () => {
|
|
123
|
+
delete process.env.ANTHROPIC_API_KEY;
|
|
124
|
+
expect(() => createLlmClient()).toThrow("ANTHROPIC_API_KEY is not set");
|
|
125
|
+
});
|
|
78
126
|
});
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
127
|
+
// -------------------------------------------------------------------------
|
|
128
|
+
// Claude CLI provider
|
|
129
|
+
// -------------------------------------------------------------------------
|
|
130
|
+
describe("claude-cli provider", () => {
|
|
131
|
+
beforeEach(() => {
|
|
132
|
+
process.env.VSKILL_EVAL_PROVIDER = "claude-cli";
|
|
133
|
+
});
|
|
134
|
+
it("calls claude CLI with --model flag", async () => {
|
|
135
|
+
mockExecFile.mockResolvedValue({ stdout: "CLI response\n" });
|
|
136
|
+
const client = createLlmClient();
|
|
137
|
+
const result = await client.generate("system prompt", "user prompt");
|
|
138
|
+
expect(result).toBe("CLI response");
|
|
139
|
+
expect(mockExecFile).toHaveBeenCalledWith("claude", ["-p", "system prompt\n\nuser prompt", "--model", "sonnet", "--no-input"], expect.objectContaining({ timeout: 120_000 }));
|
|
140
|
+
});
|
|
141
|
+
it("defaults to sonnet model", () => {
|
|
142
|
+
const client = createLlmClient();
|
|
143
|
+
expect(client.model).toBe("claude-sonnet");
|
|
144
|
+
});
|
|
145
|
+
it("passes custom model from VSKILL_EVAL_MODEL", async () => {
|
|
146
|
+
process.env.VSKILL_EVAL_MODEL = "opus";
|
|
147
|
+
mockExecFile.mockResolvedValue({ stdout: "ok\n" });
|
|
148
|
+
const client = createLlmClient();
|
|
149
|
+
expect(client.model).toBe("claude-opus");
|
|
150
|
+
await client.generate("sys", "usr");
|
|
151
|
+
expect(mockExecFile).toHaveBeenCalledWith("claude", expect.arrayContaining(["--model", "opus"]), expect.anything());
|
|
152
|
+
});
|
|
153
|
+
it("throws helpful error when claude CLI not found", async () => {
|
|
154
|
+
const err = new Error("ENOENT");
|
|
155
|
+
err.code = "ENOENT";
|
|
156
|
+
mockExecFile.mockRejectedValue(err);
|
|
157
|
+
const client = createLlmClient();
|
|
158
|
+
await expect(client.generate("sys", "usr")).rejects.toThrow("Claude CLI not found");
|
|
159
|
+
});
|
|
160
|
+
it("throws when explicitly selected inside Claude Code session", () => {
|
|
161
|
+
process.env.CLAUDECODE = "1";
|
|
162
|
+
expect(() => createLlmClient()).toThrow("Cannot use claude-cli provider inside a Claude Code session");
|
|
163
|
+
});
|
|
164
|
+
});
|
|
165
|
+
// -------------------------------------------------------------------------
|
|
166
|
+
// Ollama provider
|
|
167
|
+
// -------------------------------------------------------------------------
|
|
168
|
+
describe("ollama provider", () => {
|
|
169
|
+
beforeEach(() => {
|
|
170
|
+
process.env.VSKILL_EVAL_PROVIDER = "ollama";
|
|
171
|
+
});
|
|
172
|
+
it("uses default model llama3.1:8b", () => {
|
|
173
|
+
const client = createLlmClient();
|
|
174
|
+
expect(client.model).toBe("llama3.1:8b");
|
|
175
|
+
});
|
|
176
|
+
it("uses custom model from VSKILL_EVAL_MODEL", () => {
|
|
177
|
+
process.env.VSKILL_EVAL_MODEL = "qwen2.5:32b";
|
|
178
|
+
const client = createLlmClient();
|
|
179
|
+
expect(client.model).toBe("qwen2.5:32b");
|
|
180
|
+
});
|
|
181
|
+
it("calls Ollama HTTP API with correct payload", async () => {
|
|
182
|
+
const mockFetch = vi.spyOn(globalThis, "fetch").mockResolvedValue(new Response(JSON.stringify({ response: "Ollama reply" }), { status: 200 }));
|
|
183
|
+
const client = createLlmClient();
|
|
184
|
+
const result = await client.generate("system prompt", "user prompt");
|
|
185
|
+
expect(result).toBe("Ollama reply");
|
|
186
|
+
expect(mockFetch).toHaveBeenCalledWith("http://localhost:11434/api/generate", expect.objectContaining({
|
|
187
|
+
method: "POST",
|
|
188
|
+
body: expect.stringContaining('"model":"llama3.1:8b"'),
|
|
189
|
+
}));
|
|
190
|
+
mockFetch.mockRestore();
|
|
191
|
+
});
|
|
192
|
+
it("uses custom base URL from OLLAMA_BASE_URL", async () => {
|
|
193
|
+
process.env.OLLAMA_BASE_URL = "http://gpu-server:11434";
|
|
194
|
+
const mockFetch = vi.spyOn(globalThis, "fetch").mockResolvedValue(new Response(JSON.stringify({ response: "ok" }), { status: 200 }));
|
|
195
|
+
const client = createLlmClient();
|
|
196
|
+
await client.generate("sys", "usr");
|
|
197
|
+
expect(mockFetch).toHaveBeenCalledWith("http://gpu-server:11434/api/generate", expect.anything());
|
|
198
|
+
mockFetch.mockRestore();
|
|
199
|
+
});
|
|
83
200
|
});
|
|
84
201
|
});
|
|
85
202
|
//# sourceMappingURL=llm.test.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"llm.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/llm.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,QAAQ,CAAC;AAEzE,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,MAAM,UAAU,GAAG,EAAE,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"llm.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/llm.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,QAAQ,CAAC;AAEzE,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,MAAM,UAAU,GAAG,EAAE,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;AAC7C,MAAM,YAAY,GAAG,EAAE,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;AAE/C,EAAE,CAAC,IAAI,CAAC,mBAAmB,EAAE,GAAG,EAAE,CAAC,CAAC;IAClC,OAAO,EAAE,MAAM,aAAa;QAC1B,QAAQ,GAAG,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC;KACnC;CACF,CAAC,CAAC,CAAC;AAEJ,EAAE,CAAC,IAAI,CAAC,oBAAoB,EAAE,GAAG,EAAE,CAAC,CAAC;IACnC,QAAQ,EAAE,YAAY;CACvB,CAAC,CAAC,CAAC;AAEJ,EAAE,CAAC,IAAI,CAAC,WAAW,EAAE,GAAG,EAAE,CAAC,CAAC;IAC1B,SAAS,EAAE,CAAC,EAAO,EAAE,EAAE,CAAC,EAAE;CAC3B,CAAC,CAAC,CAAC;AAEJ,8EAA8E;AAC9E,uCAAuC;AACvC,8EAA8E;AAE9E,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,WAAW,CAAC,CAAC;AAEtD,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,MAAM,OAAO,GAAG,EAAE,GAAG,OAAO,CAAC,GAAG,EAAE,CAAC;IAEnC,UAAU,CAAC,GAAG,EAAE;QACd,EAAE,CAAC,aAAa,EAAE,CAAC;QACnB,OAAO,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC;QACxC,OAAO,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;QACrC,OAAO,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;QACrC,OAAO,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC;QACnC,OAAO,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC;IAChC,CAAC,CAAC,CAAC;IAEH,SAAS,CAAC,GAAG,EAAE;QACb,OAAO,CAAC,GAAG,GAAG,EAAE,GAAG,OAAO,EAAE,CAAC;IAC/B,CAAC,CAAC,CAAC;IAEH,4EAA4E;IAC5E,iBAAiB;IACjB,4EAA4E;IAE5E,EAAE,CAAC,8CAA8C,EAAE,GAAG,EAAE;QACtD,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;IAC7C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,GAAG,EAAE;QACxD,OAAO,CAAC,GAAG,CAAC,UAAU,GAAG,GAAG,CAAC;QAC7B,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,sDAAsD,EAAE,GAAG,EAAE;QAC9D,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,UAAU,CAAC;QAC3C,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,0BAA0B,C
AAC,CAAC;IACxD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,qEAAqE,EAAE,GAAG,EAAE;QAC7E,OAAO,CAAC,GAAG,CAAC,UAAU,GAAG,GAAG,CAAC;QAC7B,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,UAAU,CAAC;QAC3C,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wDAAwD,EAAE,GAAG,EAAE;QAChE,OAAO,CAAC,GAAG,CAAC,oBAAoB,GAAG,QAAQ,CAAC;QAC5C,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,UAAU,CAAC;QAC3C,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;QACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,4BAA4B,EAAE,GAAG,EAAE;QACpC,OAAO,CAAC,GAAG,CAAC,oBAAoB,GAAG,WAAW,CAAC;QAC/C,MAAM,CAAC,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC,CAAC,OAAO,CAAC,2CAA2C,CAAC,CAAC;IACvF,CAAC,CAAC,CAAC;IAEH,4EAA4E;IAC5E,qBAAqB;IACrB,4EAA4E;IAE5E,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;QAClC,UAAU,CAAC,GAAG,EAAE;YACd,OAAO,CAAC,GAAG,CAAC,oBAAoB,GAAG,WAAW,CAAC;YAC/C,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,UAAU,CAAC;QAC7C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;YAChE,UAAU,CAAC,iBAAiB,CAAC;gBAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,oBAAoB,EAAE,CAAC;aACxD,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,eAAe,EAAE,aAAa,CAAC,CAAC;YAErE,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;YAC1C,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,EAAE,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,UAAU,CAAC,iBAAiB,CAAC;gBAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;aACxC,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAEpC,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC,EAAE,KAAK,EAAE,0BAA0B,EAAE,CAAC,EAC9D,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;YACxD,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,wBAAwB,CAAC;YACzD,UAAU,CAAC,iBAAiB,CAAC;gBAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;aACxC,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CA
AC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAEpC,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC,EAAE,KAAK,EAAE,wBAAwB,EAAE,CAAC,EAC5D,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,mCAAmC,EAAE,KAAK,IAAI,EAAE;YACjD,UAAU,CAAC,iBAAiB,CAAC,IAAI,KAAK,CAAC,oBAAoB,CAAC,CAAC,CAAC;YAE9D,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CACzD,oBAAoB,CACrB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;YACxD,UAAU,CAAC,iBAAiB,CAAC;gBAC3B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;aACxC,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,kBAAkB,EAAE,gBAAgB,CAAC,CAAC;YAE5D,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC;gBACtB,MAAM,EAAE,kBAAkB;gBAC1B,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,EAAE,CAAC;gBACvD,UAAU,EAAE,IAAI;aACjB,CAAC,EACF,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,OAAO,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;YACrC,MAAM,CAAC,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC,CAAC,OAAO,CAAC,8BAA8B,CAAC,CAAC;QAC1E,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,4EAA4E;IAC5E,sBAAsB;IACtB,4EAA4E;IAE5E,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;QACnC,UAAU,CAAC,GAAG,EAAE;YACd,OAAO,CAAC,GAAG,CAAC,oBAAoB,GAAG,YAAY,CAAC;QAClD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;YAClD,YAAY,CAAC,iBAAiB,CAAC,EAAE,MAAM,EAAE,gBAAgB,EAAE,CAAC,CAAC;YAE7D,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,eAAe,EAAE,aAAa,CAAC,CAAC;YAErE,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YACpC,MAAM,CAAC,YAAY,CAAC,CAAC,oBAAoB,CACvC,QAAQ,EACR,CAAC,IAAI,EAAE,8BAA8B,EAAE,SAAS,EAAE,QAAQ,EAAE,YAAY,CAAC,EACzE,MAAM,CAAC,gBAAgB,CAAC,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC,CAC9C,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0BAA0B,EAAE,GAAG,EAAE;YAClC,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QAC7C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4CAA4C,EAAE,KA
AK,IAAI,EAAE;YAC1D,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,MAAM,CAAC;YACvC,YAAY,CAAC,iBAAiB,CAAC,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;YAEnD,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;YACzC,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAEpC,MAAM,CAAC,YAAY,CAAC,CAAC,oBAAoB,CACvC,QAAQ,EACR,MAAM,CAAC,eAAe,CAAC,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC,EAC3C,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;YAC9D,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,QAAQ,CAAQ,CAAC;YACvC,GAAG,CAAC,IAAI,GAAG,QAAQ,CAAC;YACpB,YAAY,CAAC,iBAAiB,CAAC,GAAG,CAAC,CAAC;YAEpC,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CACzD,sBAAsB,CACvB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4DAA4D,EAAE,GAAG,EAAE;YACpE,OAAO,CAAC,GAAG,CAAC,UAAU,GAAG,GAAG,CAAC;YAC7B,MAAM,CAAC,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC,CAAC,OAAO,CACrC,6DAA6D,CAC9D,CAAC;QACJ,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,4EAA4E;IAC5E,kBAAkB;IAClB,4EAA4E;IAE5E,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;QAC/B,UAAU,CAAC,GAAG,EAAE;YACd,OAAO,CAAC,GAAG,CAAC,oBAAoB,GAAG,QAAQ,CAAC;QAC9C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gCAAgC,EAAE,GAAG,EAAE;YACxC,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC3C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,aAAa,CAAC;YAC9C,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC3C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;YAC1D,MAAM,SAAS,GAAG,EAAE,CAAC,KAAK,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC,iBAAiB,CAC/D,IAAI,QAAQ,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,cAAc,EAAE,CAAC,EAAE,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAC5E,CAAC;YAEF,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,eAAe,EAAE,aAAa,CAAC,CAAC;YAErE,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YACpC,MAAM,CAAC,SAAS,CAAC,CAAC,oBAAoB,CACpC,qCAAqC,EACrC,MAAM,CAAC,gBAAgB,CAAC;gBACt
B,MAAM,EAAE,MAAM;gBACd,IAAI,EAAE,MAAM,CAAC,gBAAgB,CAAC,uBAAuB,CAAC;aACvD,CAAC,CACH,CAAC;YAEF,SAAS,CAAC,WAAW,EAAE,CAAC;QAC1B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;YACzD,OAAO,CAAC,GAAG,CAAC,eAAe,GAAG,yBAAyB,CAAC;YAExD,MAAM,SAAS,GAAG,EAAE,CAAC,KAAK,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC,iBAAiB,CAC/D,IAAI,QAAQ,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,EAAE,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAClE,CAAC;YAEF,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAEpC,MAAM,CAAC,SAAS,CAAC,CAAC,oBAAoB,CACpC,sCAAsC,EACtC,MAAM,CAAC,QAAQ,EAAE,CAClB,CAAC;YAEF,SAAS,CAAC,WAAW,EAAE,CAAC;QAC1B,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { computeVerdict, verdictColor } from "../verdict.js";
|
|
3
|
+
describe("computeVerdict", () => {
    /**
     * Checks that every argument triple yields `verdict`, in the order listed.
     * Each triple is passed positionally to computeVerdict; going by the test
     * titles the order is (passRate, skill rubric, baseline rubric).
     */
    const expectAll = (verdict, triples) => {
        for (const triple of triples) {
            expect(computeVerdict(...triple)).toBe(verdict);
        }
    };

    it("returns EFFECTIVE when passRate >= 0.8 and skill rubric > baseline + 1", () => {
        expectAll("EFFECTIVE", [
            [0.85, 4.5, 3.0],
            [0.80, 4.0, 2.5],
            [1.0, 5.0, 1.0],
        ]);
    });

    it("returns MARGINAL when passRate >= 0.6 and skill rubric > baseline (but not EFFECTIVE)", () => {
        expectAll("MARGINAL", [
            [0.70, 3.5, 3.0],
            [0.60, 2.5, 2.0],
            // High pass rate, but the rubric gap (0.5) is too small for EFFECTIVE.
            [0.85, 3.5, 3.0],
        ]);
    });

    it("returns INEFFECTIVE when passRate >= 0.4 (but not MARGINAL)", () => {
        expectAll("INEFFECTIVE", [
            [0.50, 2.5, 3.0],
            [0.45, 3.0, 3.0],
            [0.40, 1.0, 5.0],
        ]);
    });

    it("returns DEGRADING when passRate < 0.4", () => {
        expectAll("DEGRADING", [
            [0.30, 2.0, 3.0],
            [0.10, 1.0, 1.0],
            [0.0, 0.0, 0.0],
            [0.39, 5.0, 1.0],
        ]);
    });

    it("handles boundary values correctly", () => {
        const boundaryCases = [
            // Pass rate exactly 0.8, rubric gap just above 1 (4.0 vs 2.9) → EFFECTIVE
            { args: [0.8, 4.0, 2.9], verdict: "EFFECTIVE" },
            // Pass rate 0.8 but rubric gap exactly 1 → not EFFECTIVE (gap must exceed 1)
            { args: [0.8, 4.0, 3.0], verdict: "MARGINAL" },
            // Pass rate exactly 0.6 with skill above baseline → MARGINAL
            { args: [0.6, 3.1, 3.0], verdict: "MARGINAL" },
            // Pass rate exactly 0.6 with skill equal to baseline → INEFFECTIVE
            { args: [0.6, 3.0, 3.0], verdict: "INEFFECTIVE" },
            // Pass rate exactly 0.4 → INEFFECTIVE
            { args: [0.4, 3.0, 3.0], verdict: "INEFFECTIVE" },
        ];
        for (const { args, verdict } of boundaryCases) {
            expect(computeVerdict(...args)).toBe(verdict);
        }
    });
});
|
|
39
|
+
describe("verdictColor", () => {
    it("returns correct colors for each verdict", () => {
        // Each pair is [verdict, expected color name]; checked in this order.
        const expectedColors = [
            ["EFFECTIVE", "green"],
            ["MARGINAL", "yellow"],
            ["INEFFECTIVE", "orange"],
            ["DEGRADING", "red"],
        ];
        for (const [verdict, color] of expectedColors) {
            expect(verdictColor(verdict)).toBe(color);
        }
    });
});
|
|
47
|
+
//# sourceMappingURL=verdict.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"verdict.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/verdict.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,eAAe,CAAC;AAE7D,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;IAC9B,EAAE,CAAC,wEAAwE,EAAE,GAAG,EAAE;QAChF,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACzD,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACzD,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uFAAuF,EAAE,GAAG,EAAE;QAC/F,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACxD,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACxD,4DAA4D;QAC5D,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6DAA6D,EAAE,GAAG,EAAE;QACrE,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC3D,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC3D,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC7D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;QAC/C,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACzD,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACzD,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACxD,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IAC3D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,uDAAuD;QACvD,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACxD,qEAAqE;QACrE,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACvD,qDAAqD;QACrD,MAAM,CAAC,cAAc,CAAC,GAA
G,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACvD,wDAAwD;QACxD,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC1D,sCAAsC;QACtC,MAAM,CAAC,cAAc,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC5D,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,cAAc,EAAE,GAAG,EAAE;IAC5B,EAAE,CAAC,yCAAyC,EAAE,GAAG,EAAE;QACjD,MAAM,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAChD,MAAM,CAAC,YAAY,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAChD,MAAM,CAAC,YAAY,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACnD,MAAM,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAChD,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import type { LlmClient } from "./llm.js";
|
|
2
|
+
/** One labelled test case for `testActivation`: a user prompt plus the outcome it should produce. */
export interface ActivationPrompt {
|
|
3
|
+
/** User prompt text; it is embedded under the "## User Prompt" heading of the request sent to the model. */
prompt: string;
|
|
4
|
+
/** Ground-truth label the model's answer is compared against to derive TP/TN/FP/FN. */
expected: "should_activate" | "should_not_activate";
|
|
5
|
+
}
|
|
6
|
+
/** Outcome of evaluating a single `ActivationPrompt`. */
export interface ActivationResult {
|
|
7
|
+
/** Copied from the input `ActivationPrompt`. */
prompt: string;
|
|
8
|
+
/** Copied from the input `ActivationPrompt`. */
expected: "should_activate" | "should_not_activate";
|
|
9
|
+
/** Whether the model judged that the prompt triggers the skill; `false` when the request or response parsing failed. */
activate: boolean;
|
|
10
|
+
/** Confidence reported by the model; falls back to "low" when the reported value is not one of the three labels or when evaluation failed. */
confidence: "high" | "medium" | "low";
|
|
11
|
+
/** Model-provided explanation (empty string when absent), or `Error: <message>` when evaluation failed. */
reasoning: string;
|
|
12
|
+
/** Confusion-matrix cell obtained by comparing `expected` with `activate`. */
classification: "TP" | "TN" | "FP" | "FN";
|
|
13
|
+
}
|
|
14
|
+
/** Aggregate returned by `testActivation`: the per-prompt results plus confusion-matrix metrics. */
export interface ActivationSummary {
|
|
15
|
+
/** Per-prompt results, in the order the prompts were supplied. */
results: ActivationResult[];
|
|
16
|
+
/** tp / (tp + fp); 0 when there are no TP or FP results. */
precision: number;
|
|
17
|
+
/** tp / (tp + fn); 0 when there are no TP or FN results. */
recall: number;
|
|
18
|
+
/** (tp + tn) / total, i.e. the share of correctly classified prompts; 0 when there are no results. */
reliability: number;
|
|
19
|
+
/** Number of entries in `results`. */
total: number;
|
|
20
|
+
/** Count of results classified "TP". */
tp: number;
|
|
21
|
+
/** Count of results classified "TN". */
tn: number;
|
|
22
|
+
/** Count of results classified "FP". */
fp: number;
|
|
23
|
+
/** Count of results classified "FN". */
fn: number;
|
|
24
|
+
}
|
|
25
|
+
/**
 * Asks the model, one prompt at a time and in order, whether each prompt would
 * trigger the skill described by `skillDescription`, then aggregates the answers.
 *
 * Errors thrown while generating or parsing a response are recorded as a
 * non-activating, low-confidence result instead of rejecting.
 *
 * @param skillDescription - Text embedded under the "## Skill Description" heading of each request.
 * @param prompts - Labelled prompts to evaluate.
 * @param client - Its `generate(systemPrompt, userPrompt)` is awaited once per prompt.
 * @param onResult - Optional callback invoked with each result as it is produced.
 * @returns The per-prompt results together with precision, recall and reliability.
 */
export declare function testActivation(skillDescription: string, prompts: ActivationPrompt[], client: LlmClient, onResult?: (result: ActivationResult) => void): Promise<ActivationSummary>;
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// activation-tester.ts -- test SKILL.md description auto-activation quality
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
const ACTIVATION_SYSTEM_PROMPT = `You are evaluating whether a user prompt would trigger an AI skill based on its description.
|
|
5
|
+
|
|
6
|
+
Given the skill description and a user prompt, determine:
|
|
7
|
+
1. Would this prompt trigger this skill? (yes/no)
|
|
8
|
+
2. How confident are you? (high/medium/low)
|
|
9
|
+
3. Brief reasoning
|
|
10
|
+
|
|
11
|
+
Respond with ONLY valid JSON:
|
|
12
|
+
{
|
|
13
|
+
"activate": true/false,
|
|
14
|
+
"confidence": "high" | "medium" | "low",
|
|
15
|
+
"reasoning": "brief explanation"
|
|
16
|
+
}`;
|
|
17
|
+
export async function testActivation(skillDescription, prompts, client, onResult) {
|
|
18
|
+
const results = [];
|
|
19
|
+
for (const p of prompts) {
|
|
20
|
+
const userPrompt = `## Skill Description
|
|
21
|
+
${skillDescription}
|
|
22
|
+
|
|
23
|
+
## User Prompt
|
|
24
|
+
${p.prompt}
|
|
25
|
+
|
|
26
|
+
Would this user prompt trigger this skill?`;
|
|
27
|
+
try {
|
|
28
|
+
const response = await client.generate(ACTIVATION_SYSTEM_PROMPT, userPrompt);
|
|
29
|
+
const jsonMatch = response.match(/```(?:json)?\s*([\s\S]*?)```/) || [null, response];
|
|
30
|
+
const json = JSON.parse(jsonMatch[1].trim());
|
|
31
|
+
const activate = !!json.activate;
|
|
32
|
+
const confidence = ["high", "medium", "low"].includes(json.confidence)
|
|
33
|
+
? json.confidence
|
|
34
|
+
: "low";
|
|
35
|
+
const classification = classifyResult(p.expected, activate);
|
|
36
|
+
const result = {
|
|
37
|
+
prompt: p.prompt,
|
|
38
|
+
expected: p.expected,
|
|
39
|
+
activate,
|
|
40
|
+
confidence,
|
|
41
|
+
reasoning: String(json.reasoning || ""),
|
|
42
|
+
classification,
|
|
43
|
+
};
|
|
44
|
+
results.push(result);
|
|
45
|
+
onResult?.(result);
|
|
46
|
+
}
|
|
47
|
+
catch (err) {
|
|
48
|
+
const result = {
|
|
49
|
+
prompt: p.prompt,
|
|
50
|
+
expected: p.expected,
|
|
51
|
+
activate: false,
|
|
52
|
+
confidence: "low",
|
|
53
|
+
reasoning: `Error: ${err instanceof Error ? err.message : String(err)}`,
|
|
54
|
+
classification: p.expected === "should_activate" ? "FN" : "TN",
|
|
55
|
+
};
|
|
56
|
+
results.push(result);
|
|
57
|
+
onResult?.(result);
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
return computeSummary(results);
|
|
61
|
+
}
|
|
62
|
+
function classifyResult(expected, actual) {
|
|
63
|
+
if (expected === "should_activate" && actual)
|
|
64
|
+
return "TP";
|
|
65
|
+
if (expected === "should_activate" && !actual)
|
|
66
|
+
return "FN";
|
|
67
|
+
if (expected === "should_not_activate" && !actual)
|
|
68
|
+
return "TN";
|
|
69
|
+
return "FP";
|
|
70
|
+
}
|
|
71
|
+
function computeSummary(results) {
|
|
72
|
+
const tp = results.filter((r) => r.classification === "TP").length;
|
|
73
|
+
const tn = results.filter((r) => r.classification === "TN").length;
|
|
74
|
+
const fp = results.filter((r) => r.classification === "FP").length;
|
|
75
|
+
const fn = results.filter((r) => r.classification === "FN").length;
|
|
76
|
+
const total = results.length;
|
|
77
|
+
return {
|
|
78
|
+
results,
|
|
79
|
+
precision: tp + fp > 0 ? tp / (tp + fp) : 0,
|
|
80
|
+
recall: tp + fn > 0 ? tp / (tp + fn) : 0,
|
|
81
|
+
reliability: total > 0 ? (tp + tn) / total : 0,
|
|
82
|
+
total,
|
|
83
|
+
tp,
|
|
84
|
+
tn,
|
|
85
|
+
fp,
|
|
86
|
+
fn,
|
|
87
|
+
};
|
|
88
|
+
}
|
|
89
|
+
//# sourceMappingURL=activation-tester.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"activation-tester.js","sourceRoot":"","sources":["../../src/eval/activation-tester.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,4EAA4E;AAC5E,8EAA8E;AA8B9E,MAAM,wBAAwB,GAAG;;;;;;;;;;;;EAY/B,CAAC;AAEH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,gBAAwB,EACxB,OAA2B,EAC3B,MAAiB,EACjB,QAA6C;IAE7C,MAAM,OAAO,GAAuB,EAAE,CAAC;IAEvC,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACxB,MAAM,UAAU,GAAG;EACrB,gBAAgB;;;EAGhB,CAAC,CAAC,MAAM;;2CAEiC,CAAC;QAExC,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,wBAAwB,EAAE,UAAU,CAAC,CAAC;YAC7E,MAAM,SAAS,GAAG,QAAQ,CAAC,KAAK,CAAC,8BAA8B,CAAC,IAAI,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;YACrF,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,CAAC,CAAC;YAE9C,MAAM,QAAQ,GAAG,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC;YACjC,MAAM,UAAU,GAAG,CAAC,MAAM,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,UAAU,CAAC;gBACpE,CAAC,CAAE,IAAI,CAAC,UAAwC;gBAChD,CAAC,CAAC,KAAK,CAAC;YAEV,MAAM,cAAc,GAAG,cAAc,CAAC,CAAC,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;YAE5D,MAAM,MAAM,GAAqB;gBAC/B,MAAM,EAAE,CAAC,CAAC,MAAM;gBAChB,QAAQ,EAAE,CAAC,CAAC,QAAQ;gBACpB,QAAQ;gBACR,UAAU;gBACV,SAAS,EAAE,MAAM,CAAC,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC;gBACvC,cAAc;aACf,CAAC;YACF,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACrB,QAAQ,EAAE,CAAC,MAAM,CAAC,CAAC;QACrB,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,MAAM,MAAM,GAAqB;gBAC/B,MAAM,EAAE,CAAC,CAAC,MAAM;gBAChB,QAAQ,EAAE,CAAC,CAAC,QAAQ;gBACpB,QAAQ,EAAE,KAAK;gBACf,UAAU,EAAE,KAAK;gBACjB,SAAS,EAAE,UAAU,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE;gBACvE,cAAc,EAAE,CAAC,CAAC,QAAQ,KAAK,iBAAiB,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI;aAC/D,CAAC;YACF,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACrB,QAAQ,EAAE,CAAC,MAAM,CAAC,CAAC;QACrB,CAAC;IACH,CAAC;IAED,OAAO,cAAc,CAAC,OAAO,CAAC,CAAC;AACjC,CAAC;AAED,SAAS,cAAc,CACrB,QAAmD,EACnD,MAAe;IAEf,IAAI,QAAQ,KAAK,iBAAiB,IAAI,MAAM;QAAE,OAAO,IAAI,CAAC;IAC1D,IAAI,QAAQ,KAAK,iBAAiB,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAC;IAC3D,IAAI,QAAQ,KAAK,qBAAqB,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAC;IAC/D,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS
,cAAc,CAAC,OAA2B;IACjD,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC;IACnE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC;IACnE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC;IACnE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC;IACnE,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC;IAE7B,OAAO;QACL,OAAO;QACP,SAAS,EAAE,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAC3C,MAAM,EAAE,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QACxC,WAAW,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAC9C,KAAK;QACL,EAAE;QACF,EAAE;QACF,EAAE;QACF,EAAE;KACH,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import type { BenchmarkResult } from "./benchmark.js";
|
|
2
|
+
/**
 * Element type of the list resolved by `listHistory`.
 * NOTE(review): the implementation (benchmark-history.js) is not visible here,
 * so the meaning and format of individual fields should be confirmed there.
 */
export interface HistorySummary {
|
|
3
|
+
// NOTE(review): presumably the same value accepted by readHistoryEntry's `timestamp` argument — confirm.
timestamp: string;
|
|
4
|
+
filename: string;
|
|
5
|
+
model: string;
|
|
6
|
+
skillName: string;
|
|
7
|
+
// NOTE(review): presumably a fraction in [0, 1] — confirm the scale.
passRate: number;
|
|
8
|
+
/** Same two kinds accepted by the optional `type` field of writeHistoryEntry's `result` argument. */
type: "benchmark" | "comparison";
|
|
9
|
+
}
|
|
10
|
+
/**
 * Element type of the array returned by `computeRegressions`.
 * NOTE(review): field semantics are not visible from this declaration file;
 * confirm against benchmark-history.js.
 */
export interface RegressionEntry {
|
|
11
|
+
assertionId: string;
|
|
12
|
+
evalId: number;
|
|
13
|
+
evalName: string;
|
|
14
|
+
// NOTE(review): presumably true means the assertion passed in the previous result — confirm.
previousStatus: boolean;
|
|
15
|
+
// NOTE(review): presumably true means the assertion passed in the current result — confirm.
currentStatus: boolean;
|
|
16
|
+
/** Direction of the status change between the two results. */
change: "regression" | "improvement";
|
|
17
|
+
}
|
|
18
|
+
/**
 * Takes a skill directory and a benchmark result (optionally tagged with its
 * kind) and resolves to a string.
 * NOTE(review): presumably persists the result under `skillDir` and resolves to
 * the written file's path or name — confirm against the implementation.
 */
export declare function writeHistoryEntry(skillDir: string, result: BenchmarkResult & {
|
|
19
|
+
/** Optional kind tag for the entry. */
type?: "benchmark" | "comparison";
|
|
20
|
+
}): Promise<string>;
|
|
21
|
+
/**
 * Resolves to the history summaries for the given skill directory.
 * NOTE(review): ordering and behaviour for a directory without history are not
 * visible from this declaration — confirm against the implementation.
 */
export declare function listHistory(skillDir: string): Promise<HistorySummary[]>;
|
|
22
|
+
/**
 * Resolves to the benchmark result identified by `timestamp` within `skillDir`, or `null`.
 * NOTE(review): presumably `null` means no matching (or readable) entry exists — confirm.
 */
export declare function readHistoryEntry(skillDir: string, timestamp: string): Promise<BenchmarkResult | null>;
|
|
23
|
+
/**
 * Synchronously derives a list of `RegressionEntry` items from two benchmark results.
 * NOTE(review): presumably only assertions whose status differs between
 * `previous` and `current` are reported — confirm against the implementation.
 */
export declare function computeRegressions(current: BenchmarkResult, previous: BenchmarkResult): RegressionEntry[];
|