vskill 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205) hide show
  1. package/dist/agents/agents-registry.test.d.ts +1 -0
  2. package/dist/agents/agents-registry.test.js +248 -0
  3. package/dist/agents/agents-registry.test.js.map +1 -0
  4. package/dist/api/client.test.d.ts +1 -0
  5. package/dist/api/client.test.js +428 -0
  6. package/dist/api/client.test.js.map +1 -0
  7. package/dist/audit/audit-integration.test.d.ts +1 -0
  8. package/dist/audit/audit-integration.test.js +92 -0
  9. package/dist/audit/audit-integration.test.js.map +1 -0
  10. package/dist/audit/audit-llm.test.d.ts +1 -0
  11. package/dist/audit/audit-llm.test.js +110 -0
  12. package/dist/audit/audit-llm.test.js.map +1 -0
  13. package/dist/audit/audit-patterns.test.d.ts +1 -0
  14. package/dist/audit/audit-patterns.test.js +91 -0
  15. package/dist/audit/audit-patterns.test.js.map +1 -0
  16. package/dist/audit/audit-scanner.test.d.ts +1 -0
  17. package/dist/audit/audit-scanner.test.js +112 -0
  18. package/dist/audit/audit-scanner.test.js.map +1 -0
  19. package/dist/audit/audit-types.test.d.ts +1 -0
  20. package/dist/audit/audit-types.test.js +140 -0
  21. package/dist/audit/audit-types.test.js.map +1 -0
  22. package/dist/audit/config.test.d.ts +1 -0
  23. package/dist/audit/config.test.js +44 -0
  24. package/dist/audit/config.test.js.map +1 -0
  25. package/dist/audit/file-discovery.test.d.ts +1 -0
  26. package/dist/audit/file-discovery.test.js +120 -0
  27. package/dist/audit/file-discovery.test.js.map +1 -0
  28. package/dist/audit/fix-suggestions.test.d.ts +1 -0
  29. package/dist/audit/fix-suggestions.test.js +35 -0
  30. package/dist/audit/fix-suggestions.test.js.map +1 -0
  31. package/dist/audit/formatters/json-formatter.test.d.ts +1 -0
  32. package/dist/audit/formatters/json-formatter.test.js +49 -0
  33. package/dist/audit/formatters/json-formatter.test.js.map +1 -0
  34. package/dist/audit/formatters/report-formatter.test.d.ts +1 -0
  35. package/dist/audit/formatters/report-formatter.test.js +51 -0
  36. package/dist/audit/formatters/report-formatter.test.js.map +1 -0
  37. package/dist/audit/formatters/sarif-formatter.test.d.ts +1 -0
  38. package/dist/audit/formatters/sarif-formatter.test.js +71 -0
  39. package/dist/audit/formatters/sarif-formatter.test.js.map +1 -0
  40. package/dist/audit/formatters/terminal-formatter.test.d.ts +1 -0
  41. package/dist/audit/formatters/terminal-formatter.test.js +51 -0
  42. package/dist/audit/formatters/terminal-formatter.test.js.map +1 -0
  43. package/dist/blocklist/blocklist-e2e.test.d.ts +1 -0
  44. package/dist/blocklist/blocklist-e2e.test.js +346 -0
  45. package/dist/blocklist/blocklist-e2e.test.js.map +1 -0
  46. package/dist/blocklist/blocklist.test.d.ts +1 -0
  47. package/dist/blocklist/blocklist.test.js +259 -0
  48. package/dist/blocklist/blocklist.test.js.map +1 -0
  49. package/dist/commands/__tests__/eval-router.test.d.ts +1 -0
  50. package/dist/commands/__tests__/eval-router.test.js +60 -0
  51. package/dist/commands/__tests__/eval-router.test.js.map +1 -0
  52. package/dist/commands/__tests__/eval-serve.test.d.ts +1 -0
  53. package/dist/commands/__tests__/eval-serve.test.js +23 -0
  54. package/dist/commands/__tests__/eval-serve.test.js.map +1 -0
  55. package/dist/commands/add-blocklist-e2e.test.d.ts +1 -0
  56. package/dist/commands/add-blocklist-e2e.test.js +397 -0
  57. package/dist/commands/add-blocklist-e2e.test.js.map +1 -0
  58. package/dist/commands/add-wizard.test.d.ts +1 -0
  59. package/dist/commands/add-wizard.test.js +392 -0
  60. package/dist/commands/add-wizard.test.js.map +1 -0
  61. package/dist/commands/add.test.d.ts +1 -0
  62. package/dist/commands/add.test.js +2365 -0
  63. package/dist/commands/add.test.js.map +1 -0
  64. package/dist/commands/audit.test.d.ts +1 -0
  65. package/dist/commands/audit.test.js +79 -0
  66. package/dist/commands/audit.test.js.map +1 -0
  67. package/dist/commands/blocklist.test.d.ts +1 -0
  68. package/dist/commands/blocklist.test.js +158 -0
  69. package/dist/commands/blocklist.test.js.map +1 -0
  70. package/dist/commands/eval/__tests__/coverage.test.d.ts +1 -0
  71. package/dist/commands/eval/__tests__/coverage.test.js +122 -0
  72. package/dist/commands/eval/__tests__/coverage.test.js.map +1 -0
  73. package/dist/commands/eval/__tests__/generate-all.test.d.ts +1 -0
  74. package/dist/commands/eval/__tests__/generate-all.test.js +133 -0
  75. package/dist/commands/eval/__tests__/generate-all.test.js.map +1 -0
  76. package/dist/commands/eval/__tests__/init.test.d.ts +1 -0
  77. package/dist/commands/eval/__tests__/init.test.js +116 -0
  78. package/dist/commands/eval/__tests__/init.test.js.map +1 -0
  79. package/dist/commands/eval/__tests__/run.test.d.ts +1 -0
  80. package/dist/commands/eval/__tests__/run.test.js +186 -0
  81. package/dist/commands/eval/__tests__/run.test.js.map +1 -0
  82. package/dist/commands/find.test.d.ts +1 -0
  83. package/dist/commands/find.test.js +481 -0
  84. package/dist/commands/find.test.js.map +1 -0
  85. package/dist/commands/marketplace.test.d.ts +1 -0
  86. package/dist/commands/marketplace.test.js +129 -0
  87. package/dist/commands/marketplace.test.js.map +1 -0
  88. package/dist/commands/remove.test.d.ts +1 -0
  89. package/dist/commands/remove.test.js +164 -0
  90. package/dist/commands/remove.test.js.map +1 -0
  91. package/dist/commands/should-skip.test.d.ts +1 -0
  92. package/dist/commands/should-skip.test.js +56 -0
  93. package/dist/commands/should-skip.test.js.map +1 -0
  94. package/dist/commands/submit.test.d.ts +1 -0
  95. package/dist/commands/submit.test.js +83 -0
  96. package/dist/commands/submit.test.js.map +1 -0
  97. package/dist/commands/update.test.d.ts +1 -0
  98. package/dist/commands/update.test.js +250 -0
  99. package/dist/commands/update.test.js.map +1 -0
  100. package/dist/discovery/github-tree.test.d.ts +1 -0
  101. package/dist/discovery/github-tree.test.js +372 -0
  102. package/dist/discovery/github-tree.test.js.map +1 -0
  103. package/dist/eval/__tests__/activation-tester.test.d.ts +1 -0
  104. package/dist/eval/__tests__/activation-tester.test.js +203 -0
  105. package/dist/eval/__tests__/activation-tester.test.js.map +1 -0
  106. package/dist/eval/__tests__/benchmark-history.test.d.ts +1 -0
  107. package/dist/eval/__tests__/benchmark-history.test.js +422 -0
  108. package/dist/eval/__tests__/benchmark-history.test.js.map +1 -0
  109. package/dist/eval/__tests__/benchmark.test.d.ts +1 -0
  110. package/dist/eval/__tests__/benchmark.test.js +94 -0
  111. package/dist/eval/__tests__/benchmark.test.js.map +1 -0
  112. package/dist/eval/__tests__/comparator.test.d.ts +1 -0
  113. package/dist/eval/__tests__/comparator.test.js +282 -0
  114. package/dist/eval/__tests__/comparator.test.js.map +1 -0
  115. package/dist/eval/__tests__/judge.test.d.ts +1 -0
  116. package/dist/eval/__tests__/judge.test.js +122 -0
  117. package/dist/eval/__tests__/judge.test.js.map +1 -0
  118. package/dist/eval/__tests__/llm.test.d.ts +1 -0
  119. package/dist/eval/__tests__/llm.test.js +543 -0
  120. package/dist/eval/__tests__/llm.test.js.map +1 -0
  121. package/dist/eval/__tests__/mcp-detector.test.d.ts +1 -0
  122. package/dist/eval/__tests__/mcp-detector.test.js +180 -0
  123. package/dist/eval/__tests__/mcp-detector.test.js.map +1 -0
  124. package/dist/eval/__tests__/prompt-builder.test.d.ts +1 -0
  125. package/dist/eval/__tests__/prompt-builder.test.js +142 -0
  126. package/dist/eval/__tests__/prompt-builder.test.js.map +1 -0
  127. package/dist/eval/__tests__/schema.test.d.ts +1 -0
  128. package/dist/eval/__tests__/schema.test.js +247 -0
  129. package/dist/eval/__tests__/schema.test.js.map +1 -0
  130. package/dist/eval/__tests__/skill-scanner.test.d.ts +1 -0
  131. package/dist/eval/__tests__/skill-scanner.test.js +228 -0
  132. package/dist/eval/__tests__/skill-scanner.test.js.map +1 -0
  133. package/dist/eval/__tests__/verdict.test.d.ts +1 -0
  134. package/dist/eval/__tests__/verdict.test.js +47 -0
  135. package/dist/eval/__tests__/verdict.test.js.map +1 -0
  136. package/dist/eval-server/__tests__/benchmark-runner.test.d.ts +1 -0
  137. package/dist/eval-server/__tests__/benchmark-runner.test.js +301 -0
  138. package/dist/eval-server/__tests__/benchmark-runner.test.js.map +1 -0
  139. package/dist/eval-server/__tests__/comparison-sse-events.test.d.ts +1 -0
  140. package/dist/eval-server/__tests__/comparison-sse-events.test.js +278 -0
  141. package/dist/eval-server/__tests__/comparison-sse-events.test.js.map +1 -0
  142. package/dist/eval-server/__tests__/sse-helpers.test.d.ts +1 -0
  143. package/dist/eval-server/__tests__/sse-helpers.test.js +128 -0
  144. package/dist/eval-server/__tests__/sse-helpers.test.js.map +1 -0
  145. package/dist/installer/canonical.test.d.ts +1 -0
  146. package/dist/installer/canonical.test.js +264 -0
  147. package/dist/installer/canonical.test.js.map +1 -0
  148. package/dist/lockfile/lockfile.test.d.ts +1 -0
  149. package/dist/lockfile/lockfile.test.js +204 -0
  150. package/dist/lockfile/lockfile.test.js.map +1 -0
  151. package/dist/lockfile/project-root.test.d.ts +1 -0
  152. package/dist/lockfile/project-root.test.js +49 -0
  153. package/dist/lockfile/project-root.test.js.map +1 -0
  154. package/dist/marketplace/marketplace.test.d.ts +1 -0
  155. package/dist/marketplace/marketplace.test.js +312 -0
  156. package/dist/marketplace/marketplace.test.js.map +1 -0
  157. package/dist/resolvers/source-resolver.test.d.ts +1 -0
  158. package/dist/resolvers/source-resolver.test.js +104 -0
  159. package/dist/resolvers/source-resolver.test.js.map +1 -0
  160. package/dist/resolvers/url-resolver.test.d.ts +1 -0
  161. package/dist/resolvers/url-resolver.test.js +49 -0
  162. package/dist/resolvers/url-resolver.test.js.map +1 -0
  163. package/dist/scanner/dci-integration.test.d.ts +1 -0
  164. package/dist/scanner/dci-integration.test.js +83 -0
  165. package/dist/scanner/dci-integration.test.js.map +1 -0
  166. package/dist/scanner/patterns.test.d.ts +1 -0
  167. package/dist/scanner/patterns.test.js +832 -0
  168. package/dist/scanner/patterns.test.js.map +1 -0
  169. package/dist/scanner/tier1.test.d.ts +1 -0
  170. package/dist/scanner/tier1.test.js +305 -0
  171. package/dist/scanner/tier1.test.js.map +1 -0
  172. package/dist/security/platform-security.test.d.ts +1 -0
  173. package/dist/security/platform-security.test.js +92 -0
  174. package/dist/security/platform-security.test.js.map +1 -0
  175. package/dist/settings/settings.test.d.ts +1 -0
  176. package/dist/settings/settings.test.js +103 -0
  177. package/dist/settings/settings.test.js.map +1 -0
  178. package/dist/updater/source-fetcher.test.d.ts +1 -0
  179. package/dist/updater/source-fetcher.test.js +192 -0
  180. package/dist/updater/source-fetcher.test.js.map +1 -0
  181. package/dist/utils/__tests__/paths.test.d.ts +1 -0
  182. package/dist/utils/__tests__/paths.test.js +22 -0
  183. package/dist/utils/__tests__/paths.test.js.map +1 -0
  184. package/dist/utils/__tests__/resolve-binary.integration.test.d.ts +1 -0
  185. package/dist/utils/__tests__/resolve-binary.integration.test.js +138 -0
  186. package/dist/utils/__tests__/resolve-binary.integration.test.js.map +1 -0
  187. package/dist/utils/__tests__/resolve-binary.test.d.ts +1 -0
  188. package/dist/utils/__tests__/resolve-binary.test.js +175 -0
  189. package/dist/utils/__tests__/resolve-binary.test.js.map +1 -0
  190. package/dist/utils/__tests__/validation.test.d.ts +1 -0
  191. package/dist/utils/__tests__/validation.test.js +107 -0
  192. package/dist/utils/__tests__/validation.test.js.map +1 -0
  193. package/dist/utils/agent-filter.test.d.ts +1 -0
  194. package/dist/utils/agent-filter.test.js +75 -0
  195. package/dist/utils/agent-filter.test.js.map +1 -0
  196. package/dist/utils/output.test.d.ts +1 -0
  197. package/dist/utils/output.test.js +28 -0
  198. package/dist/utils/output.test.js.map +1 -0
  199. package/dist/utils/project-root.test.d.ts +1 -0
  200. package/dist/utils/project-root.test.js +74 -0
  201. package/dist/utils/project-root.test.js.map +1 -0
  202. package/dist/utils/prompts.test.d.ts +1 -0
  203. package/dist/utils/prompts.test.js +285 -0
  204. package/dist/utils/prompts.test.js.map +1 -0
  205. package/package.json +1 -1
@@ -0,0 +1,282 @@
1
+ import { describe, it, expect, vi } from "vitest";
2
+ import { generateComparisonOutputs, scoreComparison, runComparison, } from "../comparator.js";
3
+ function mockClient(responses) {
4
+ let callIndex = 0;
5
+ return {
6
+ model: "test-model",
7
+ generate: vi.fn(async () => {
8
+ const text = responses[callIndex++] ?? "";
9
+ return { text, durationMs: 100, inputTokens: 50, outputTokens: 100 };
10
+ }),
11
+ };
12
+ }
13
+ describe("generateComparisonOutputs", () => {
14
+ it("generates skill and baseline outputs sequentially", async () => {
15
+ const client = mockClient(["skill response", "baseline response"]);
16
+ const result = await generateComparisonOutputs("test prompt", "# Skill Content", client);
17
+ expect(result.skillOutput).toBe("skill response");
18
+ expect(result.baselineOutput).toBe("baseline response");
19
+ expect(result.skillDurationMs).toBeGreaterThanOrEqual(0);
20
+ expect(result.baselineDurationMs).toBeGreaterThanOrEqual(0);
21
+ expect(client.generate).toHaveBeenCalledTimes(2);
22
+ // First call should include skill content
23
+ const firstCall = client.generate.mock.calls[0];
24
+ expect(firstCall[0]).toContain("Skill Content");
25
+ // Second call should be generic
26
+ const secondCall = client.generate.mock.calls[1];
27
+ expect(secondCall[0]).toContain("helpful AI assistant");
28
+ });
29
+ });
30
+ describe("scoreComparison", () => {
31
+ it("parses JSON scores from LLM response", async () => {
32
+ const client = mockClient([
33
+ JSON.stringify({
34
+ content_score_a: 4,
35
+ structure_score_a: 3,
36
+ content_score_b: 5,
37
+ structure_score_b: 4,
38
+ winner: "second",
39
+ reasoning: "B is better",
40
+ }),
41
+ ]);
42
+ const result = await scoreComparison("output A", "output B", "prompt", client);
43
+ expect(result.contentScoreA).toBe(4);
44
+ expect(result.structureScoreA).toBe(3);
45
+ expect(result.contentScoreB).toBe(5);
46
+ expect(result.structureScoreB).toBe(4);
47
+ expect(result.winner).toBe("second");
48
+ });
49
+ it("parses JSON from code fence", async () => {
50
+ const client = mockClient([
51
+ '```json\n{"content_score_a": 3, "structure_score_a": 3, "content_score_b": 3, "structure_score_b": 3, "winner": "tie"}\n```',
52
+ ]);
53
+ const result = await scoreComparison("A", "B", "p", client);
54
+ expect(result.winner).toBe("tie");
55
+ expect(result.contentScoreA).toBe(3);
56
+ });
57
+ it("clamps scores to 1-5 range", async () => {
58
+ const client = mockClient([
59
+ JSON.stringify({
60
+ content_score_a: 0,
61
+ structure_score_a: 10,
62
+ content_score_b: -1,
63
+ structure_score_b: 6,
64
+ winner: "first",
65
+ }),
66
+ ]);
67
+ const result = await scoreComparison("A", "B", "p", client);
68
+ expect(result.contentScoreA).toBe(1);
69
+ expect(result.structureScoreA).toBe(5);
70
+ expect(result.contentScoreB).toBe(1);
71
+ expect(result.structureScoreB).toBe(5);
72
+ });
73
+ it("defaults invalid winner to tie", async () => {
74
+ const client = mockClient([
75
+ JSON.stringify({
76
+ content_score_a: 3,
77
+ structure_score_a: 3,
78
+ content_score_b: 3,
79
+ structure_score_b: 3,
80
+ winner: "invalid",
81
+ }),
82
+ ]);
83
+ const result = await scoreComparison("A", "B", "p", client);
84
+ expect(result.winner).toBe("tie");
85
+ });
86
+ });
87
+ describe("runComparison", () => {
88
+ it("maps scores back to skill/baseline correctly", async () => {
89
+ // Mock: first two calls = skill + baseline outputs, third = scoring
90
+ const client = mockClient([
91
+ "skill output here",
92
+ "baseline output here",
93
+ JSON.stringify({
94
+ content_score_a: 4,
95
+ structure_score_a: 5,
96
+ content_score_b: 2,
97
+ structure_score_b: 3,
98
+ winner: "first",
99
+ reasoning: "A is better",
100
+ }),
101
+ ]);
102
+ // Fix randomness for deterministic test
103
+ vi.spyOn(Math, "random").mockReturnValue(0.3); // < 0.5 → skill is A
104
+ const result = await runComparison("test prompt", "skill content", client);
105
+ expect(result.prompt).toBe("test prompt");
106
+ expect(result.skillOutput).toBe("skill output here");
107
+ expect(result.baselineOutput).toBe("baseline output here");
108
+ // skill is A, so scores map directly
109
+ expect(result.skillContentScore).toBe(4);
110
+ expect(result.skillStructureScore).toBe(5);
111
+ expect(result.baselineContentScore).toBe(2);
112
+ expect(result.baselineStructureScore).toBe(3);
113
+ expect(result.winner).toBe("skill");
114
+ vi.restoreAllMocks();
115
+ });
116
+ it("maps scores correctly when baseline is A", async () => {
117
+ const client = mockClient([
118
+ "skill out",
119
+ "baseline out",
120
+ JSON.stringify({
121
+ content_score_a: 2,
122
+ structure_score_a: 2,
123
+ content_score_b: 4,
124
+ structure_score_b: 4,
125
+ winner: "second",
126
+ }),
127
+ ]);
128
+ // > 0.5 → skill is B
129
+ vi.spyOn(Math, "random").mockReturnValue(0.7);
130
+ const result = await runComparison("p", "s", client);
131
+ // skill is B → scores.contentScoreB is skill
132
+ expect(result.skillContentScore).toBe(4);
133
+ expect(result.baselineContentScore).toBe(2);
134
+ // winner "second" = B = skill
135
+ expect(result.winner).toBe("skill");
136
+ vi.restoreAllMocks();
137
+ });
138
+ });
139
+ describe("scoreComparison with MCP deps", () => {
140
+ it("uses standard prompt when no MCP deps", async () => {
141
+ const client = mockClient([
142
+ JSON.stringify({
143
+ content_score_a: 3,
144
+ structure_score_a: 3,
145
+ content_score_b: 3,
146
+ structure_score_b: 3,
147
+ winner: "tie",
148
+ }),
149
+ ]);
150
+ await scoreComparison("A", "B", "prompt", client);
151
+ const systemPrompt = client.generate.mock.calls[0][0];
152
+ expect(systemPrompt).not.toContain("SIMULATED MCP");
153
+ });
154
+ it("augments prompt when MCP deps present", async () => {
155
+ const client = mockClient([
156
+ JSON.stringify({
157
+ content_score_a: 3,
158
+ structure_score_a: 3,
159
+ content_score_b: 3,
160
+ structure_score_b: 3,
161
+ winner: "tie",
162
+ }),
163
+ ]);
164
+ const mcpDeps = [
165
+ {
166
+ server: "Slack",
167
+ url: "https://mcp.slack.com/mcp",
168
+ transport: "http",
169
+ matchedTools: ["slack_send_message"],
170
+ configSnippet: "{}",
171
+ },
172
+ ];
173
+ await scoreComparison("A", "B", "prompt", client, mcpDeps);
174
+ const systemPrompt = client.generate.mock.calls[0][0];
175
+ expect(systemPrompt).toContain("SIMULATED MCP");
176
+ expect(systemPrompt).toContain("Slack");
177
+ });
178
+ });
179
+ describe("runComparison with MCP auto-detection", () => {
180
+ it("auto-detects MCP deps from skill content and augments comparison", async () => {
181
+ const client = mockClient([
182
+ "skill output",
183
+ "baseline output",
184
+ JSON.stringify({
185
+ content_score_a: 3,
186
+ structure_score_a: 3,
187
+ content_score_b: 3,
188
+ structure_score_b: 3,
189
+ winner: "tie",
190
+ }),
191
+ ]);
192
+ vi.spyOn(Math, "random").mockReturnValue(0.3);
193
+ await runComparison("prompt", "Use slack_send_message to send messages.", client);
194
+ // The third call is the scoring call - check its system prompt
195
+ const scoringCall = client.generate.mock.calls[2];
196
+ expect(scoringCall[0]).toContain("SIMULATED MCP");
197
+ expect(scoringCall[0]).toContain("Slack");
198
+ vi.restoreAllMocks();
199
+ });
200
+ it("does not augment comparison for non-MCP skills", async () => {
201
+ const client = mockClient([
202
+ "skill output",
203
+ "baseline output",
204
+ JSON.stringify({
205
+ content_score_a: 3,
206
+ structure_score_a: 3,
207
+ content_score_b: 3,
208
+ structure_score_b: 3,
209
+ winner: "tie",
210
+ }),
211
+ ]);
212
+ vi.spyOn(Math, "random").mockReturnValue(0.3);
213
+ await runComparison("prompt", "Plain text skill without MCP.", client);
214
+ const scoringCall = client.generate.mock.calls[2];
215
+ expect(scoringCall[0]).not.toContain("SIMULATED MCP");
216
+ vi.restoreAllMocks();
217
+ });
218
+ });
219
+ describe("generateComparisonOutputs with onProgress", () => {
220
+ it("calls onProgress with generating_skill before first LLM call", async () => {
221
+ const client = mockClient(["skill response", "baseline response"]);
222
+ const onProgress = vi.fn();
223
+ await generateComparisonOutputs("test prompt", "# Skill", client, onProgress);
224
+ expect(onProgress).toHaveBeenNthCalledWith(1, "generating_skill", "Generating skill output...");
225
+ });
226
+ it("calls onProgress with generating_baseline before second LLM call", async () => {
227
+ const client = mockClient(["skill response", "baseline response"]);
228
+ const onProgress = vi.fn();
229
+ await generateComparisonOutputs("test prompt", "# Skill", client, onProgress);
230
+ expect(onProgress).toHaveBeenNthCalledWith(2, "generating_baseline", "Generating baseline output...");
231
+ });
232
+ it("calls onProgress exactly 2 times", async () => {
233
+ const client = mockClient(["skill response", "baseline response"]);
234
+ const onProgress = vi.fn();
235
+ await generateComparisonOutputs("test prompt", "# Skill", client, onProgress);
236
+ expect(onProgress).toHaveBeenCalledTimes(2);
237
+ });
238
+ it("works without onProgress (backward compatible)", async () => {
239
+ const client = mockClient(["skill response", "baseline response"]);
240
+ const result = await generateComparisonOutputs("test prompt", "# Skill", client);
241
+ expect(result.skillOutput).toBe("skill response");
242
+ expect(result.baselineOutput).toBe("baseline response");
243
+ });
244
+ });
245
+ describe("runComparison with onProgress", () => {
246
+ it("calls onProgress for all 3 phases in order", async () => {
247
+ const client = mockClient([
248
+ "skill output",
249
+ "baseline output",
250
+ JSON.stringify({
251
+ content_score_a: 3, structure_score_a: 3,
252
+ content_score_b: 3, structure_score_b: 3,
253
+ winner: "tie",
254
+ }),
255
+ ]);
256
+ vi.spyOn(Math, "random").mockReturnValue(0.3);
257
+ const onProgress = vi.fn();
258
+ await runComparison("prompt", "skill content", client, onProgress);
259
+ expect(onProgress).toHaveBeenCalledTimes(3);
260
+ expect(onProgress.mock.calls[0][0]).toBe("generating_skill");
261
+ expect(onProgress.mock.calls[1][0]).toBe("generating_baseline");
262
+ expect(onProgress.mock.calls[2][0]).toBe("scoring");
263
+ vi.restoreAllMocks();
264
+ });
265
+ it("completes without error when onProgress is omitted", async () => {
266
+ const client = mockClient([
267
+ "skill",
268
+ "baseline",
269
+ JSON.stringify({
270
+ content_score_a: 3, structure_score_a: 3,
271
+ content_score_b: 3, structure_score_b: 3,
272
+ winner: "tie",
273
+ }),
274
+ ]);
275
+ vi.spyOn(Math, "random").mockReturnValue(0.3);
276
+ const result = await runComparison("prompt", "skill", client);
277
+ expect(result.winner).toBe("tie");
278
+ expect(result.skillOutput).toBe("skill");
279
+ vi.restoreAllMocks();
280
+ });
281
+ });
282
+ //# sourceMappingURL=comparator.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"comparator.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/comparator.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAClD,OAAO,EACL,yBAAyB,EACzB,eAAe,EACf,aAAa,GACd,MAAM,kBAAkB,CAAC;AAI1B,SAAS,UAAU,CAAC,SAAmB;IACrC,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,OAAO;QACL,KAAK,EAAE,YAAY;QACnB,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC,KAAK,IAAI,EAAE;YACzB,MAAM,IAAI,GAAG,SAAS,CAAC,SAAS,EAAE,CAAC,IAAI,EAAE,CAAC;YAC1C,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,GAAG,EAAE,WAAW,EAAE,EAAE,EAAE,YAAY,EAAE,GAAG,EAAE,CAAC;QACvE,CAAC,CAAC;KACH,CAAC;AACJ,CAAC;AAED,QAAQ,CAAC,2BAA2B,EAAE,GAAG,EAAE;IACzC,EAAE,CAAC,mDAAmD,EAAE,KAAK,IAAI,EAAE;QACjE,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,gBAAgB,EAAE,mBAAmB,CAAC,CAAC,CAAC;QACnE,MAAM,MAAM,GAAG,MAAM,yBAAyB,CAAC,aAAa,EAAE,iBAAiB,EAAE,MAAM,CAAC,CAAC;QAEzF,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;QAClD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QACxD,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QACzD,MAAM,CAAC,MAAM,CAAC,kBAAkB,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,qBAAqB,CAAC,CAAC,CAAC,CAAC;QAEjD,0CAA0C;QAC1C,MAAM,SAAS,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACzD,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;QAEhD,gCAAgC;QAChC,MAAM,UAAU,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC1D,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,sBAAsB,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;QACpD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,QAAQ;gBAChB,SAAS,EAAE,aAAa;aACzB,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,UAAU,EAAE,UAAU,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC;QAC/E,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6BAA6B,EAAE,KAAK,IAAI,EAAE;QAC3C,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,6HAA6H;SAC9H,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAClC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,4BAA4B,EAAE,KAAK,IAAI,EAAE;QAC1C,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,EAAE;gBACrB,eAAe,EAAE,CAAC,CAAC;gBACnB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,OAAO;aAChB,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACzC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gCAAgC,EAAE,KAAK,IAAI,EAAE;QAC9C,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,SAAS;aAClB,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACpC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,eAAe,EAAE,GAAG,EAAE;IAC7B,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;QAC5D,oEAAoE;QACpE,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,mBAAmB;YACnB,sBAAsB;YACtB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,OAAO;gBACf,SAAS,EAAE,aAAa;aACzB,CAAC;SACH,CAAC,CAAC;QAEH,wCAAwC;QACxC,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,qBAAqB;QAEpE,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,aAAa,EAAE,eAAe,EAAE,MAAM,CAAC,CAAC;QAE3E,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC1C,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QACrD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,sBAAsB,CAAC,CAAC;QAC3D,qCAAqC;QACrC,MAAM,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACzC,MAAM,CAAC,MAAM,CAAC,mBAAmB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3C,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,sBAAsB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC9C,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAEpC,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;QACxD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,WAAW;YACX,cAAc;YACd,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,QAAQ;aACjB,CAAC;SACH,CAAC,CAAC;QAEH,qBAAqB;QACrB,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QAE9C,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QACrD,6CAA6C;QAC7C,MAAM,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACzC,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC5C,8BAA8B;QAC9B,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAEpC,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,+BAA+B,EAAE,GAAG,EAAE;IAC7C,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;QACrD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,KAAK;aACd,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC;QAElD,MAAM,YAAY,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/D,MAAM,CAAC,YAAY,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;IACtD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;QACrD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,KAAK;aACd,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,OAAO,GAAoB;YAC/B;gBACE,MAAM,EAAE,OAAO;gBACf,GAAG,EAAE,2BAA2B;gBAChC,SAAS,EAAE,MAAM;gBACjB,YAAY,EAAE,CAAC,oBAAoB,CAAC;gBACpC,aAAa,EAAE,IAAI;aACpB;SACF,CAAC;QAEF,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC;QAE3D,MAAM,YAAY,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/D,MAAM,CAAC,YAAY,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;QAChD,MAAM,CAAC,YAAY,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;IAC1C,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,uCAAuC,EAAE,GAAG,EAAE;IACrD,EAAE,CAAC,kEAAkE,EAAE,KAAK,IAAI,EAAE;QAChF,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,cAAc;YACd,iBAAiB;YACjB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,KAAK;aACd,CAAC;SACH,CAAC,CAAC;QACH,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QAE9C,MAAM,aAAa,CAAC,QAAQ,EAAE,0CAA0C,EAAE,MAAM,CAAC,CAAC;QAElF,+DAA+D;QAC/D,MAAM,WAAW,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC3D,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;QAClD,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;QAE1C,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;QAC9D,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,cAAc;YACd,iBAAiB;YACjB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,KAAK;aACd,CAAC;SACH,CAAC,CAAC;QACH,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QAE9C,MAAM,aAAa,CAAC,QAAQ,EAAE,+BAA+B,EAAE,MAAM,CAAC,CAAC;QAEvE,MAAM,WAAW,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC3D,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;QAEtD,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,2CAA2C,EAAE,GAAG,EAAE;IACzD,EAAE,CAAC,8DAA8D,EAAE,KAAK,IAAI,EAAE;QAC5E,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,gBAAgB,EAAE,mBAAmB,CAAC,CAAC,CAAC;QACnE,MAAM,UAAU,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAE3B,MAAM,yBAAyB,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,EAAE,UAAU,CAAC,CAAC;QAE9E,MAAM,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC,EAAE,kBAAkB,EAAE,4BAA4B,CAAC,CAAC;IAClG,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kEAAkE,EAAE,KAAK,IAAI,EAAE;QAChF,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,gBAAgB,EAAE,mBAAmB,CAAC,CAAC,CAAC;QACnE,MAAM,UAAU,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAE3B,MAAM,yBAAyB,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,EAAE,UAAU,CAAC,CAAC;QAE9E,MAAM,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC,EAAE,qBAAqB,EAAE,+BAA+B,CAAC,CAAC;IACxG,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kCAAkC,EAAE,KAAK,IAAI,EAAE;QAChD,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,gBAAgB,EAAE,mBAAmB,CAAC,CAAC,CAAC;QACnE,MAAM,UAAU,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAE3B,MAAM,yBAAyB,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,EAAE,UAAU,CAAC,CAAC;QAE9E,MAAM,CAAC,UAAU,CAAC,CAAC,qBAAqB,CAAC,CAAC,CAAC,CAAC;IAC9C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;QAC9D,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,gBAAgB,EAAE,mBAAmB,CAAC,CAAC,CAAC;QAEnE,MAAM,MAAM,GAAG,MAAM,yBAAyB,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;QAEjF,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;QAClD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,+BAA+B,EAAE,GAAG,EAAE;IAC7C,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;QAC1D,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,cAAc;YACd,iBAAiB;YACjB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC,EAAE,iBAAiB,EAAE,CAAC;gBACxC,eAAe,EAAE,CAAC,EAAE,iBAAiB,EAAE,CAAC;gBACxC,MAAM,EAAE,KAAK;aACd,CAAC;SACH,CAAC,CAAC;QACH,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QAC9C,MAAM,UAAU,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAE3B,MAAM,aAAa,CAAC,QAAQ,EAAE,eAAe,EAAE,MAAM,EAAE,UAAU,CAAC,CAAC;QAEnE,MAAM,CAAC,UAAU,CAAC,CAAC,qBAAqB,CAAC,CAAC,CAAC,CAAC;QAC5C,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;QAC7D,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;QAChE,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QAEpD,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oDAAoD,EAAE,KAAK,IAAI,EAAE;QAClE,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,OAAO;YACP,UAAU;YACV,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC,EAAE,iBAAiB,EAAE,CAAC;gBACxC,eAAe,EAAE,CAAC,EAAE,iBAAiB,EAAE,CAAC;gBACxC,MAAM,EAAE,KAAK;aACd,CAAC;SACH,CAAC,CAAC;QACH,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QAE9C,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,QAAQ,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAE9D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAClC,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAEzC,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,122 @@
1
+ import { describe, it, expect, vi } from "vitest";
2
+ import { judgeAssertion, buildJudgeSystemPrompt } from "../judge.js";
3
+ // ---------------------------------------------------------------------------
4
+ // Helpers
5
+ // ---------------------------------------------------------------------------
6
+ function mockResult(text) {
7
+ return { text, durationMs: 100, inputTokens: null, outputTokens: null };
8
+ }
9
+ function mockClient(response) {
10
+ return { generate: vi.fn().mockResolvedValue(mockResult(response)), model: "test-model" };
11
+ }
12
+ const ASSERTION = {
13
+ id: "assert-1",
14
+ text: "Output mentions a file path",
15
+ type: "boolean",
16
+ };
17
+ // ---------------------------------------------------------------------------
18
+ // Tests
19
+ // ---------------------------------------------------------------------------
20
+ describe("judgeAssertion", () => {
21
+ it("returns pass result when LLM judge says pass", async () => {
22
+ const client = mockClient(JSON.stringify({ pass: true, reasoning: "output contains file path" }));
23
+ const result = await judgeAssertion("The report has been saved to reports/q1.csv", ASSERTION, client);
24
+ expect(result.pass).toBe(true);
25
+ expect(result.reasoning).toBe("output contains file path");
26
+ expect(result.id).toBe("assert-1");
27
+ expect(result.text).toBe("Output mentions a file path");
28
+ });
29
+ it("returns fail result when LLM judge says fail", async () => {
30
+ const client = mockClient(JSON.stringify({
31
+ pass: false,
32
+ reasoning: "no file path found in output",
33
+ }));
34
+ const result = await judgeAssertion("Hello world", ASSERTION, client);
35
+ expect(result.pass).toBe(false);
36
+ expect(result.reasoning).toBe("no file path found in output");
37
+ });
38
+ it("throws on malformed judge response", async () => {
39
+ const client = mockClient("This is not JSON");
40
+ await expect(judgeAssertion("some output", ASSERTION, client)).rejects.toThrow(/invalid judge output/i);
41
+ });
42
+ it("handles JSON wrapped in code fence", async () => {
43
+ const client = mockClient('```json\n{"pass": true, "reasoning": "looks good"}\n```');
44
+ const result = await judgeAssertion("some output", ASSERTION, client);
45
+ expect(result.pass).toBe(true);
46
+ });
47
+ it("uses standard prompt when mcpDeps not provided", async () => {
48
+ const client = mockClient(JSON.stringify({ pass: true, reasoning: "ok" }));
49
+ await judgeAssertion("output", ASSERTION, client);
50
+ const systemPrompt = client.generate.mock.calls[0][0];
51
+ expect(systemPrompt).toContain("binary assertion evaluator");
52
+ expect(systemPrompt).not.toContain("SIMULATION MODE");
53
+ });
54
+ it("uses MCP-augmented prompt when mcpDeps provided", async () => {
55
+ const client = mockClient(JSON.stringify({ pass: true, reasoning: "simulation valid" }));
56
+ const mcpDeps = [
57
+ {
58
+ server: "Slack",
59
+ url: "https://mcp.slack.com/mcp",
60
+ transport: "http",
61
+ matchedTools: ["slack_send_message"],
62
+ configSnippet: "{}",
63
+ },
64
+ ];
65
+ await judgeAssertion("output", ASSERTION, client, mcpDeps);
66
+ const systemPrompt = client.generate.mock.calls[0][0];
67
+ expect(systemPrompt).toContain("SIMULATION MODE");
68
+ expect(systemPrompt).toContain("Slack");
69
+ });
70
+ });
71
+ // ---------------------------------------------------------------------------
72
+ // buildJudgeSystemPrompt
73
+ // ---------------------------------------------------------------------------
74
+ describe("buildJudgeSystemPrompt", () => {
75
+ it("returns standard prompt when no MCP deps", () => {
76
+ const prompt = buildJudgeSystemPrompt();
77
+ expect(prompt).toContain("binary assertion evaluator");
78
+ expect(prompt).not.toContain("SIMULATION MODE");
79
+ });
80
+ it("returns standard prompt when mcpDeps is empty", () => {
81
+ const prompt = buildJudgeSystemPrompt([]);
82
+ expect(prompt).toContain("binary assertion evaluator");
83
+ expect(prompt).not.toContain("SIMULATION MODE");
84
+ });
85
+ it("returns augmented prompt with MCP deps", () => {
86
+ const mcpDeps = [
87
+ {
88
+ server: "Slack",
89
+ url: "https://mcp.slack.com/mcp",
90
+ transport: "http",
91
+ matchedTools: ["slack_send_message"],
92
+ configSnippet: "{}",
93
+ },
94
+ ];
95
+ const prompt = buildJudgeSystemPrompt(mcpDeps);
96
+ expect(prompt).toContain("SIMULATION MODE");
97
+ expect(prompt).toContain("Slack");
98
+ expect(prompt).toContain("binary assertion evaluator");
99
+ });
100
+ it("lists all simulated servers", () => {
101
+ const mcpDeps = [
102
+ {
103
+ server: "Slack",
104
+ url: "https://mcp.slack.com/mcp",
105
+ transport: "http",
106
+ matchedTools: ["slack_send_message"],
107
+ configSnippet: "{}",
108
+ },
109
+ {
110
+ server: "GitHub",
111
+ url: "https://api.githubcopilot.com/mcp/",
112
+ transport: "http",
113
+ matchedTools: ["github_create_pr"],
114
+ configSnippet: "{}",
115
+ },
116
+ ];
117
+ const prompt = buildJudgeSystemPrompt(mcpDeps);
118
+ expect(prompt).toContain("Slack");
119
+ expect(prompt).toContain("GitHub");
120
+ });
121
+ });
122
+ //# sourceMappingURL=judge.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"judge.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/judge.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAc,MAAM,QAAQ,CAAC;AAG9D,OAAO,EAAE,cAAc,EAAE,sBAAsB,EAAE,MAAM,aAAa,CAAC;AAGrE,8EAA8E;AAC9E,UAAU;AACV,8EAA8E;AAE9E,SAAS,UAAU,CAAC,IAAY;IAC9B,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,GAAG,EAAE,WAAW,EAAE,IAAI,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC;AAC1E,CAAC;AAED,SAAS,UAAU,CAAC,QAAgB;IAClC,OAAO,EAAE,QAAQ,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC,EAAE,KAAK,EAAE,YAAY,EAAE,CAAC;AAC5F,CAAC;AAED,MAAM,SAAS,GAAc;IAC3B,EAAE,EAAE,UAAU;IACd,IAAI,EAAE,6BAA6B;IACnC,IAAI,EAAE,SAAS;CAChB,CAAC;AAEF,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;IAC9B,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;QAC5D,MAAM,MAAM,GAAG,UAAU,CACvB,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,2BAA2B,EAAE,CAAC,CACvE,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,cAAc,CACjC,6CAA6C,EAC7C,SAAS,EACT,MAAM,CACP,CAAC;QAEF,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC/B,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,2BAA2B,CAAC,CAAC;QAC3D,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACnC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,6BAA6B,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;QAC5D,MAAM,MAAM,GAAG,UAAU,CACvB,IAAI,CAAC,SAAS,CAAC;YACb,IAAI,EAAE,KAAK;YACX,SAAS,EAAE,8BAA8B;SAC1C,CAAC,CACH,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;QAEtE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAChC,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,8BAA8B,CAAC,CAAC;IAChE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;QAClD,MAAM,MAAM,GAAG,UAAU,CAAC,kBAAkB,CAAC,CAAC;QAE9C,MAAM,MAAM,CACV,cAAc,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,CAAC,CACjD,CAAC,OAAO,CAAC,OAAO,CAAC,uBAAuB,CAAC,CAAC;IAC7C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;QAClD,MAAM,MAAM,GAAG,UAAU,CACvB,yDAAyD,CAC1D,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;QACtE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACjC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;QAC9D,MAAM,MAAM,GAAG,UAAU,CACvB,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAChD,CAAC;QAEF,MAAM,cAAc,CAAC,QAAQ,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;QAElD,MAAM,YAAY,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/D,MAAM,CAAC,YAAY,CAAC,CAAC,SAAS,CAAC,4BAA4B,CAAC,CAAC;QAC7D,MAAM,CAAC,YAAY,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;IACxD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,iDAAiD,EAAE,KAAK,IAAI,EAAE;QAC/D,MAAM,MAAM,GAAG,UAAU,CACvB,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,kBAAkB,EAAE,CAAC,CAC9D,CAAC;QAEF,MAAM,OAAO,GAAoB;YAC/B;gBACE,MAAM,EAAE,OAAO;gBACf,GAAG,EAAE,2BAA2B;gBAChC,SAAS,EAAE,MAAM;gBACjB,YAAY,EAAE,CAAC,oBAAoB,CAAC;gBACpC,aAAa,EAAE,IAAI;aACpB;SACF,CAAC;QAEF,MAAM,cAAc,CAAC,QAAQ,EAAE,SAAS,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC;QAE3D,MAAM,YAAY,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/D,MAAM,CAAC,YAAY,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;QAClD,MAAM,CAAC,YAAY,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;IAC1C,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,8EAA8E;AAC9E,yBAAyB;AACzB,8EAA8E;AAE9E,QAAQ,CAAC,wBAAwB,EAAE,GAAG,EAAE;IACtC,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;QAClD,MAAM,MAAM,GAAG,sBAAsB,EAAE,CAAC;QACxC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,4BAA4B,CAAC,CAAC;QACvD,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;IAClD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+CAA+C,EAAE,GAAG,EAAE;QACvD,MAAM,MAAM,GAAG,sBAAsB,CAAC,EAAE,CAAC,CAAC;QAC1C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,4BAA4B,CAAC,CAAC;QACvD,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;IAClD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wCAAwC,EAAE,GAAG,EAAE;QAChD,MAAM,OAAO,GAAoB;YAC/B;gBACE,MAAM,EAAE,OAAO;gBACf,GAAG,EAAE,2BAA2B;gBAChC,SAAS,EAAE,MAAM;gBACjB,YAAY,EAAE,CAAC,oBAAoB,CAAC;gBACpC,aAAa,EAAE,IAAI;aACpB;SACF,CAAC;QAEF,MAAM,MAAM,GAAG,sBAAsB,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;QAClC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,4BAA4B,CAAC,CAAC;IACzD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6BAA6B,EAAE,GAAG,EAAE;QACrC,MAAM,OAAO,GAAoB;YAC/B;gBACE,MAAM,EAAE,OAAO;gBACf,GAAG,EAAE,2BAA2B;gBAChC,SAAS,EAAE,MAAM;gBACjB,YAAY,EAAE,CAAC,oBAAoB,CAAC;gBACpC,aAAa,EAAE,IAAI;aACpB;YACD;gBACE,MAAM,EAAE,QAAQ;gBAChB,GAAG,EAAE,oCAAoC;gBACzC,SAAS,EAAE,MAAM;gBACjB,YAAY,EAAE,CAAC,kBAAkB,CAAC;gBAClC,aAAa,EAAE,IAAI;aACpB;SACF,CAAC;QAEF,MAAM,MAAM,GAAG,sBAAsB,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;QAClC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;IACrC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1 @@
1
+ export {};