vskill 0.5.127 → 0.5.129

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (232) hide show
  1. package/agents.json +1 -1
  2. package/dist/bin.js +0 -0
  3. package/dist/eval-server/api-routes.js +32 -0
  4. package/dist/eval-server/api-routes.js.map +1 -1
  5. package/dist/eval-server/eval-server.js +2 -0
  6. package/dist/eval-server/eval-server.js.map +1 -1
  7. package/dist/eval-server/git-routes.d.ts +5 -0
  8. package/dist/eval-server/git-routes.js +172 -0
  9. package/dist/eval-server/git-routes.js.map +1 -0
  10. package/dist/eval-server/router.d.ts +2 -0
  11. package/dist/eval-server/router.js +3 -1
  12. package/dist/eval-server/router.js.map +1 -1
  13. package/dist/eval-server/skill-name-resolver.d.ts +13 -0
  14. package/dist/eval-server/skill-name-resolver.js +42 -0
  15. package/dist/eval-server/skill-name-resolver.js.map +1 -1
  16. package/dist/eval-ui/assets/{CommandPalette-LtejhB76.js → CommandPalette-BIjKkHCC.js} +1 -1
  17. package/dist/eval-ui/assets/{CreateSkillPage-3RuqLQnm.js → CreateSkillPage-CmV0WZOn.js} +1 -1
  18. package/dist/eval-ui/assets/{FindSkillsPalette-Dbl2_IC4.js → FindSkillsPalette-BtYgEsVD.js} +2 -2
  19. package/dist/eval-ui/assets/{SearchPaletteCore-CLZmxDat.js → SearchPaletteCore-CAEO357X.js} +1 -1
  20. package/dist/eval-ui/assets/{SkillDetailPanel-RQc5u4nm.js → SkillDetailPanel-AH9wV3VU.js} +1 -1
  21. package/dist/eval-ui/assets/{UpdateDropdown-O8YGQyMH.js → UpdateDropdown-BwMxB2Ub.js} +1 -1
  22. package/dist/eval-ui/assets/index-ZByf-wxl.js +102 -0
  23. package/dist/eval-ui/assets/skill-url-C4ekwoGs.js +1 -0
  24. package/dist/eval-ui/index.html +1 -1
  25. package/dist/index.js +0 -0
  26. package/package.json +1 -1
  27. package/dist/agents/agents-registry.test.d.ts +0 -1
  28. package/dist/agents/agents-registry.test.js +0 -248
  29. package/dist/agents/agents-registry.test.js.map +0 -1
  30. package/dist/api/client.test.d.ts +0 -1
  31. package/dist/api/client.test.js +0 -428
  32. package/dist/api/client.test.js.map +0 -1
  33. package/dist/audit/audit-integration.test.d.ts +0 -1
  34. package/dist/audit/audit-integration.test.js +0 -92
  35. package/dist/audit/audit-integration.test.js.map +0 -1
  36. package/dist/audit/audit-llm.test.d.ts +0 -1
  37. package/dist/audit/audit-llm.test.js +0 -110
  38. package/dist/audit/audit-llm.test.js.map +0 -1
  39. package/dist/audit/audit-patterns.test.d.ts +0 -1
  40. package/dist/audit/audit-patterns.test.js +0 -91
  41. package/dist/audit/audit-patterns.test.js.map +0 -1
  42. package/dist/audit/audit-scanner.test.d.ts +0 -1
  43. package/dist/audit/audit-scanner.test.js +0 -112
  44. package/dist/audit/audit-scanner.test.js.map +0 -1
  45. package/dist/audit/audit-types.test.d.ts +0 -1
  46. package/dist/audit/audit-types.test.js +0 -140
  47. package/dist/audit/audit-types.test.js.map +0 -1
  48. package/dist/audit/config.test.d.ts +0 -1
  49. package/dist/audit/config.test.js +0 -44
  50. package/dist/audit/config.test.js.map +0 -1
  51. package/dist/audit/file-discovery.test.d.ts +0 -1
  52. package/dist/audit/file-discovery.test.js +0 -120
  53. package/dist/audit/file-discovery.test.js.map +0 -1
  54. package/dist/audit/fix-suggestions.test.d.ts +0 -1
  55. package/dist/audit/fix-suggestions.test.js +0 -35
  56. package/dist/audit/fix-suggestions.test.js.map +0 -1
  57. package/dist/audit/formatters/json-formatter.test.d.ts +0 -1
  58. package/dist/audit/formatters/json-formatter.test.js +0 -49
  59. package/dist/audit/formatters/json-formatter.test.js.map +0 -1
  60. package/dist/audit/formatters/report-formatter.test.d.ts +0 -1
  61. package/dist/audit/formatters/report-formatter.test.js +0 -51
  62. package/dist/audit/formatters/report-formatter.test.js.map +0 -1
  63. package/dist/audit/formatters/sarif-formatter.test.d.ts +0 -1
  64. package/dist/audit/formatters/sarif-formatter.test.js +0 -71
  65. package/dist/audit/formatters/sarif-formatter.test.js.map +0 -1
  66. package/dist/audit/formatters/terminal-formatter.test.d.ts +0 -1
  67. package/dist/audit/formatters/terminal-formatter.test.js +0 -51
  68. package/dist/audit/formatters/terminal-formatter.test.js.map +0 -1
  69. package/dist/blocklist/blocklist-e2e.test.d.ts +0 -1
  70. package/dist/blocklist/blocklist-e2e.test.js +0 -346
  71. package/dist/blocklist/blocklist-e2e.test.js.map +0 -1
  72. package/dist/blocklist/blocklist.test.d.ts +0 -1
  73. package/dist/blocklist/blocklist.test.js +0 -259
  74. package/dist/blocklist/blocklist.test.js.map +0 -1
  75. package/dist/commands/__tests__/eval-router.test.d.ts +0 -1
  76. package/dist/commands/__tests__/eval-router.test.js +0 -60
  77. package/dist/commands/__tests__/eval-router.test.js.map +0 -1
  78. package/dist/commands/__tests__/eval-serve.test.d.ts +0 -1
  79. package/dist/commands/__tests__/eval-serve.test.js +0 -23
  80. package/dist/commands/__tests__/eval-serve.test.js.map +0 -1
  81. package/dist/commands/add-blocklist-e2e.test.d.ts +0 -1
  82. package/dist/commands/add-blocklist-e2e.test.js +0 -397
  83. package/dist/commands/add-blocklist-e2e.test.js.map +0 -1
  84. package/dist/commands/add-wizard.test.d.ts +0 -1
  85. package/dist/commands/add-wizard.test.js +0 -392
  86. package/dist/commands/add-wizard.test.js.map +0 -1
  87. package/dist/commands/add.test.d.ts +0 -1
  88. package/dist/commands/add.test.js +0 -2365
  89. package/dist/commands/add.test.js.map +0 -1
  90. package/dist/commands/audit.test.d.ts +0 -1
  91. package/dist/commands/audit.test.js +0 -79
  92. package/dist/commands/audit.test.js.map +0 -1
  93. package/dist/commands/blocklist.test.d.ts +0 -1
  94. package/dist/commands/blocklist.test.js +0 -158
  95. package/dist/commands/blocklist.test.js.map +0 -1
  96. package/dist/commands/eval/__tests__/coverage.test.d.ts +0 -1
  97. package/dist/commands/eval/__tests__/coverage.test.js +0 -122
  98. package/dist/commands/eval/__tests__/coverage.test.js.map +0 -1
  99. package/dist/commands/eval/__tests__/generate-all.test.d.ts +0 -1
  100. package/dist/commands/eval/__tests__/generate-all.test.js +0 -133
  101. package/dist/commands/eval/__tests__/generate-all.test.js.map +0 -1
  102. package/dist/commands/eval/__tests__/init.test.d.ts +0 -1
  103. package/dist/commands/eval/__tests__/init.test.js +0 -116
  104. package/dist/commands/eval/__tests__/init.test.js.map +0 -1
  105. package/dist/commands/eval/__tests__/run.test.d.ts +0 -1
  106. package/dist/commands/eval/__tests__/run.test.js +0 -186
  107. package/dist/commands/eval/__tests__/run.test.js.map +0 -1
  108. package/dist/commands/find.test.d.ts +0 -1
  109. package/dist/commands/find.test.js +0 -481
  110. package/dist/commands/find.test.js.map +0 -1
  111. package/dist/commands/marketplace.test.d.ts +0 -1
  112. package/dist/commands/marketplace.test.js +0 -129
  113. package/dist/commands/marketplace.test.js.map +0 -1
  114. package/dist/commands/remove.test.d.ts +0 -1
  115. package/dist/commands/remove.test.js +0 -164
  116. package/dist/commands/remove.test.js.map +0 -1
  117. package/dist/commands/should-skip.test.d.ts +0 -1
  118. package/dist/commands/should-skip.test.js +0 -56
  119. package/dist/commands/should-skip.test.js.map +0 -1
  120. package/dist/commands/submit.test.d.ts +0 -1
  121. package/dist/commands/submit.test.js +0 -83
  122. package/dist/commands/submit.test.js.map +0 -1
  123. package/dist/commands/update.test.d.ts +0 -1
  124. package/dist/commands/update.test.js +0 -250
  125. package/dist/commands/update.test.js.map +0 -1
  126. package/dist/discovery/github-tree.test.d.ts +0 -1
  127. package/dist/discovery/github-tree.test.js +0 -372
  128. package/dist/discovery/github-tree.test.js.map +0 -1
  129. package/dist/eval/__tests__/activation-tester.test.d.ts +0 -1
  130. package/dist/eval/__tests__/activation-tester.test.js +0 -203
  131. package/dist/eval/__tests__/activation-tester.test.js.map +0 -1
  132. package/dist/eval/__tests__/benchmark-history.test.d.ts +0 -1
  133. package/dist/eval/__tests__/benchmark-history.test.js +0 -422
  134. package/dist/eval/__tests__/benchmark-history.test.js.map +0 -1
  135. package/dist/eval/__tests__/benchmark.test.d.ts +0 -1
  136. package/dist/eval/__tests__/benchmark.test.js +0 -94
  137. package/dist/eval/__tests__/benchmark.test.js.map +0 -1
  138. package/dist/eval/__tests__/comparator.test.d.ts +0 -1
  139. package/dist/eval/__tests__/comparator.test.js +0 -282
  140. package/dist/eval/__tests__/comparator.test.js.map +0 -1
  141. package/dist/eval/__tests__/judge.test.d.ts +0 -1
  142. package/dist/eval/__tests__/judge.test.js +0 -122
  143. package/dist/eval/__tests__/judge.test.js.map +0 -1
  144. package/dist/eval/__tests__/llm.test.d.ts +0 -1
  145. package/dist/eval/__tests__/llm.test.js +0 -543
  146. package/dist/eval/__tests__/llm.test.js.map +0 -1
  147. package/dist/eval/__tests__/mcp-detector.test.d.ts +0 -1
  148. package/dist/eval/__tests__/mcp-detector.test.js +0 -180
  149. package/dist/eval/__tests__/mcp-detector.test.js.map +0 -1
  150. package/dist/eval/__tests__/prompt-builder.test.d.ts +0 -1
  151. package/dist/eval/__tests__/prompt-builder.test.js +0 -142
  152. package/dist/eval/__tests__/prompt-builder.test.js.map +0 -1
  153. package/dist/eval/__tests__/schema.test.d.ts +0 -1
  154. package/dist/eval/__tests__/schema.test.js +0 -247
  155. package/dist/eval/__tests__/schema.test.js.map +0 -1
  156. package/dist/eval/__tests__/skill-scanner.test.d.ts +0 -1
  157. package/dist/eval/__tests__/skill-scanner.test.js +0 -228
  158. package/dist/eval/__tests__/skill-scanner.test.js.map +0 -1
  159. package/dist/eval/__tests__/verdict.test.d.ts +0 -1
  160. package/dist/eval/__tests__/verdict.test.js +0 -47
  161. package/dist/eval/__tests__/verdict.test.js.map +0 -1
  162. package/dist/eval-server/__tests__/benchmark-runner.test.d.ts +0 -1
  163. package/dist/eval-server/__tests__/benchmark-runner.test.js +0 -301
  164. package/dist/eval-server/__tests__/benchmark-runner.test.js.map +0 -1
  165. package/dist/eval-server/__tests__/comparison-sse-events.test.d.ts +0 -1
  166. package/dist/eval-server/__tests__/comparison-sse-events.test.js +0 -278
  167. package/dist/eval-server/__tests__/comparison-sse-events.test.js.map +0 -1
  168. package/dist/eval-server/__tests__/sse-helpers.test.d.ts +0 -1
  169. package/dist/eval-server/__tests__/sse-helpers.test.js +0 -128
  170. package/dist/eval-server/__tests__/sse-helpers.test.js.map +0 -1
  171. package/dist/eval-ui/assets/index-DlZduKAT.js +0 -102
  172. package/dist/eval-ui/assets/skill-url-BpZjDR8A.js +0 -1
  173. package/dist/installer/canonical.test.d.ts +0 -1
  174. package/dist/installer/canonical.test.js +0 -264
  175. package/dist/installer/canonical.test.js.map +0 -1
  176. package/dist/lockfile/lockfile.test.d.ts +0 -1
  177. package/dist/lockfile/lockfile.test.js +0 -204
  178. package/dist/lockfile/lockfile.test.js.map +0 -1
  179. package/dist/lockfile/project-root.test.d.ts +0 -1
  180. package/dist/lockfile/project-root.test.js +0 -49
  181. package/dist/lockfile/project-root.test.js.map +0 -1
  182. package/dist/marketplace/marketplace.test.d.ts +0 -1
  183. package/dist/marketplace/marketplace.test.js +0 -312
  184. package/dist/marketplace/marketplace.test.js.map +0 -1
  185. package/dist/resolvers/source-resolver.test.d.ts +0 -1
  186. package/dist/resolvers/source-resolver.test.js +0 -104
  187. package/dist/resolvers/source-resolver.test.js.map +0 -1
  188. package/dist/resolvers/url-resolver.test.d.ts +0 -1
  189. package/dist/resolvers/url-resolver.test.js +0 -49
  190. package/dist/resolvers/url-resolver.test.js.map +0 -1
  191. package/dist/scanner/dci-integration.test.d.ts +0 -1
  192. package/dist/scanner/dci-integration.test.js +0 -83
  193. package/dist/scanner/dci-integration.test.js.map +0 -1
  194. package/dist/scanner/patterns.test.d.ts +0 -1
  195. package/dist/scanner/patterns.test.js +0 -832
  196. package/dist/scanner/patterns.test.js.map +0 -1
  197. package/dist/scanner/tier1.test.d.ts +0 -1
  198. package/dist/scanner/tier1.test.js +0 -305
  199. package/dist/scanner/tier1.test.js.map +0 -1
  200. package/dist/security/platform-security.test.d.ts +0 -1
  201. package/dist/security/platform-security.test.js +0 -92
  202. package/dist/security/platform-security.test.js.map +0 -1
  203. package/dist/settings/settings.test.d.ts +0 -1
  204. package/dist/settings/settings.test.js +0 -103
  205. package/dist/settings/settings.test.js.map +0 -1
  206. package/dist/updater/source-fetcher.test.d.ts +0 -1
  207. package/dist/updater/source-fetcher.test.js +0 -192
  208. package/dist/updater/source-fetcher.test.js.map +0 -1
  209. package/dist/utils/__tests__/paths.test.d.ts +0 -1
  210. package/dist/utils/__tests__/paths.test.js +0 -22
  211. package/dist/utils/__tests__/paths.test.js.map +0 -1
  212. package/dist/utils/__tests__/resolve-binary.integration.test.d.ts +0 -1
  213. package/dist/utils/__tests__/resolve-binary.integration.test.js +0 -138
  214. package/dist/utils/__tests__/resolve-binary.integration.test.js.map +0 -1
  215. package/dist/utils/__tests__/resolve-binary.test.d.ts +0 -1
  216. package/dist/utils/__tests__/resolve-binary.test.js +0 -175
  217. package/dist/utils/__tests__/resolve-binary.test.js.map +0 -1
  218. package/dist/utils/__tests__/validation.test.d.ts +0 -1
  219. package/dist/utils/__tests__/validation.test.js +0 -107
  220. package/dist/utils/__tests__/validation.test.js.map +0 -1
  221. package/dist/utils/agent-filter.test.d.ts +0 -1
  222. package/dist/utils/agent-filter.test.js +0 -75
  223. package/dist/utils/agent-filter.test.js.map +0 -1
  224. package/dist/utils/output.test.d.ts +0 -1
  225. package/dist/utils/output.test.js +0 -28
  226. package/dist/utils/output.test.js.map +0 -1
  227. package/dist/utils/project-root.test.d.ts +0 -1
  228. package/dist/utils/project-root.test.js +0 -74
  229. package/dist/utils/project-root.test.js.map +0 -1
  230. package/dist/utils/prompts.test.d.ts +0 -1
  231. package/dist/utils/prompts.test.js +0 -285
  232. package/dist/utils/prompts.test.js.map +0 -1
@@ -1,282 +0,0 @@
1
- import { describe, it, expect, vi } from "vitest";
2
- import { generateComparisonOutputs, scoreComparison, runComparison, } from "../comparator.js";
3
- function mockClient(responses) {
4
- let callIndex = 0;
5
- return {
6
- model: "test-model",
7
- generate: vi.fn(async () => {
8
- const text = responses[callIndex++] ?? "";
9
- return { text, durationMs: 100, inputTokens: 50, outputTokens: 100 };
10
- }),
11
- };
12
- }
13
- describe("generateComparisonOutputs", () => {
14
- it("generates skill and baseline outputs sequentially", async () => {
15
- const client = mockClient(["skill response", "baseline response"]);
16
- const result = await generateComparisonOutputs("test prompt", "# Skill Content", client);
17
- expect(result.skillOutput).toBe("skill response");
18
- expect(result.baselineOutput).toBe("baseline response");
19
- expect(result.skillDurationMs).toBeGreaterThanOrEqual(0);
20
- expect(result.baselineDurationMs).toBeGreaterThanOrEqual(0);
21
- expect(client.generate).toHaveBeenCalledTimes(2);
22
- // First call should include skill content
23
- const firstCall = client.generate.mock.calls[0];
24
- expect(firstCall[0]).toContain("Skill Content");
25
- // Second call should be generic
26
- const secondCall = client.generate.mock.calls[1];
27
- expect(secondCall[0]).toContain("helpful AI assistant");
28
- });
29
- });
30
- describe("scoreComparison", () => {
31
- it("parses JSON scores from LLM response", async () => {
32
- const client = mockClient([
33
- JSON.stringify({
34
- content_score_a: 4,
35
- structure_score_a: 3,
36
- content_score_b: 5,
37
- structure_score_b: 4,
38
- winner: "second",
39
- reasoning: "B is better",
40
- }),
41
- ]);
42
- const result = await scoreComparison("output A", "output B", "prompt", client);
43
- expect(result.contentScoreA).toBe(4);
44
- expect(result.structureScoreA).toBe(3);
45
- expect(result.contentScoreB).toBe(5);
46
- expect(result.structureScoreB).toBe(4);
47
- expect(result.winner).toBe("second");
48
- });
49
- it("parses JSON from code fence", async () => {
50
- const client = mockClient([
51
- '```json\n{"content_score_a": 3, "structure_score_a": 3, "content_score_b": 3, "structure_score_b": 3, "winner": "tie"}\n```',
52
- ]);
53
- const result = await scoreComparison("A", "B", "p", client);
54
- expect(result.winner).toBe("tie");
55
- expect(result.contentScoreA).toBe(3);
56
- });
57
- it("clamps scores to 1-5 range", async () => {
58
- const client = mockClient([
59
- JSON.stringify({
60
- content_score_a: 0,
61
- structure_score_a: 10,
62
- content_score_b: -1,
63
- structure_score_b: 6,
64
- winner: "first",
65
- }),
66
- ]);
67
- const result = await scoreComparison("A", "B", "p", client);
68
- expect(result.contentScoreA).toBe(1);
69
- expect(result.structureScoreA).toBe(5);
70
- expect(result.contentScoreB).toBe(1);
71
- expect(result.structureScoreB).toBe(5);
72
- });
73
- it("defaults invalid winner to tie", async () => {
74
- const client = mockClient([
75
- JSON.stringify({
76
- content_score_a: 3,
77
- structure_score_a: 3,
78
- content_score_b: 3,
79
- structure_score_b: 3,
80
- winner: "invalid",
81
- }),
82
- ]);
83
- const result = await scoreComparison("A", "B", "p", client);
84
- expect(result.winner).toBe("tie");
85
- });
86
- });
87
- describe("runComparison", () => {
88
- it("maps scores back to skill/baseline correctly", async () => {
89
- // Mock: first two calls = skill + baseline outputs, third = scoring
90
- const client = mockClient([
91
- "skill output here",
92
- "baseline output here",
93
- JSON.stringify({
94
- content_score_a: 4,
95
- structure_score_a: 5,
96
- content_score_b: 2,
97
- structure_score_b: 3,
98
- winner: "first",
99
- reasoning: "A is better",
100
- }),
101
- ]);
102
- // Fix randomness for deterministic test
103
- vi.spyOn(Math, "random").mockReturnValue(0.3); // < 0.5 → skill is A
104
- const result = await runComparison("test prompt", "skill content", client);
105
- expect(result.prompt).toBe("test prompt");
106
- expect(result.skillOutput).toBe("skill output here");
107
- expect(result.baselineOutput).toBe("baseline output here");
108
- // skill is A, so scores map directly
109
- expect(result.skillContentScore).toBe(4);
110
- expect(result.skillStructureScore).toBe(5);
111
- expect(result.baselineContentScore).toBe(2);
112
- expect(result.baselineStructureScore).toBe(3);
113
- expect(result.winner).toBe("skill");
114
- vi.restoreAllMocks();
115
- });
116
- it("maps scores correctly when baseline is A", async () => {
117
- const client = mockClient([
118
- "skill out",
119
- "baseline out",
120
- JSON.stringify({
121
- content_score_a: 2,
122
- structure_score_a: 2,
123
- content_score_b: 4,
124
- structure_score_b: 4,
125
- winner: "second",
126
- }),
127
- ]);
128
- // > 0.5 → skill is B
129
- vi.spyOn(Math, "random").mockReturnValue(0.7);
130
- const result = await runComparison("p", "s", client);
131
- // skill is B → scores.contentScoreB is skill
132
- expect(result.skillContentScore).toBe(4);
133
- expect(result.baselineContentScore).toBe(2);
134
- // winner "second" = B = skill
135
- expect(result.winner).toBe("skill");
136
- vi.restoreAllMocks();
137
- });
138
- });
139
- describe("scoreComparison with MCP deps", () => {
140
- it("uses standard prompt when no MCP deps", async () => {
141
- const client = mockClient([
142
- JSON.stringify({
143
- content_score_a: 3,
144
- structure_score_a: 3,
145
- content_score_b: 3,
146
- structure_score_b: 3,
147
- winner: "tie",
148
- }),
149
- ]);
150
- await scoreComparison("A", "B", "prompt", client);
151
- const systemPrompt = client.generate.mock.calls[0][0];
152
- expect(systemPrompt).not.toContain("SIMULATED MCP");
153
- });
154
- it("augments prompt when MCP deps present", async () => {
155
- const client = mockClient([
156
- JSON.stringify({
157
- content_score_a: 3,
158
- structure_score_a: 3,
159
- content_score_b: 3,
160
- structure_score_b: 3,
161
- winner: "tie",
162
- }),
163
- ]);
164
- const mcpDeps = [
165
- {
166
- server: "Slack",
167
- url: "https://mcp.slack.com/mcp",
168
- transport: "http",
169
- matchedTools: ["slack_send_message"],
170
- configSnippet: "{}",
171
- },
172
- ];
173
- await scoreComparison("A", "B", "prompt", client, mcpDeps);
174
- const systemPrompt = client.generate.mock.calls[0][0];
175
- expect(systemPrompt).toContain("SIMULATED MCP");
176
- expect(systemPrompt).toContain("Slack");
177
- });
178
- });
179
- describe("runComparison with MCP auto-detection", () => {
180
- it("auto-detects MCP deps from skill content and augments comparison", async () => {
181
- const client = mockClient([
182
- "skill output",
183
- "baseline output",
184
- JSON.stringify({
185
- content_score_a: 3,
186
- structure_score_a: 3,
187
- content_score_b: 3,
188
- structure_score_b: 3,
189
- winner: "tie",
190
- }),
191
- ]);
192
- vi.spyOn(Math, "random").mockReturnValue(0.3);
193
- await runComparison("prompt", "Use slack_send_message to send messages.", client);
194
- // The third call is the scoring call - check its system prompt
195
- const scoringCall = client.generate.mock.calls[2];
196
- expect(scoringCall[0]).toContain("SIMULATED MCP");
197
- expect(scoringCall[0]).toContain("Slack");
198
- vi.restoreAllMocks();
199
- });
200
- it("does not augment comparison for non-MCP skills", async () => {
201
- const client = mockClient([
202
- "skill output",
203
- "baseline output",
204
- JSON.stringify({
205
- content_score_a: 3,
206
- structure_score_a: 3,
207
- content_score_b: 3,
208
- structure_score_b: 3,
209
- winner: "tie",
210
- }),
211
- ]);
212
- vi.spyOn(Math, "random").mockReturnValue(0.3);
213
- await runComparison("prompt", "Plain text skill without MCP.", client);
214
- const scoringCall = client.generate.mock.calls[2];
215
- expect(scoringCall[0]).not.toContain("SIMULATED MCP");
216
- vi.restoreAllMocks();
217
- });
218
- });
219
- describe("generateComparisonOutputs with onProgress", () => {
220
- it("calls onProgress with generating_skill before first LLM call", async () => {
221
- const client = mockClient(["skill response", "baseline response"]);
222
- const onProgress = vi.fn();
223
- await generateComparisonOutputs("test prompt", "# Skill", client, onProgress);
224
- expect(onProgress).toHaveBeenNthCalledWith(1, "generating_skill", "Generating skill output...");
225
- });
226
- it("calls onProgress with generating_baseline before second LLM call", async () => {
227
- const client = mockClient(["skill response", "baseline response"]);
228
- const onProgress = vi.fn();
229
- await generateComparisonOutputs("test prompt", "# Skill", client, onProgress);
230
- expect(onProgress).toHaveBeenNthCalledWith(2, "generating_baseline", "Generating baseline output...");
231
- });
232
- it("calls onProgress exactly 2 times", async () => {
233
- const client = mockClient(["skill response", "baseline response"]);
234
- const onProgress = vi.fn();
235
- await generateComparisonOutputs("test prompt", "# Skill", client, onProgress);
236
- expect(onProgress).toHaveBeenCalledTimes(2);
237
- });
238
- it("works without onProgress (backward compatible)", async () => {
239
- const client = mockClient(["skill response", "baseline response"]);
240
- const result = await generateComparisonOutputs("test prompt", "# Skill", client);
241
- expect(result.skillOutput).toBe("skill response");
242
- expect(result.baselineOutput).toBe("baseline response");
243
- });
244
- });
245
- describe("runComparison with onProgress", () => {
246
- it("calls onProgress for all 3 phases in order", async () => {
247
- const client = mockClient([
248
- "skill output",
249
- "baseline output",
250
- JSON.stringify({
251
- content_score_a: 3, structure_score_a: 3,
252
- content_score_b: 3, structure_score_b: 3,
253
- winner: "tie",
254
- }),
255
- ]);
256
- vi.spyOn(Math, "random").mockReturnValue(0.3);
257
- const onProgress = vi.fn();
258
- await runComparison("prompt", "skill content", client, onProgress);
259
- expect(onProgress).toHaveBeenCalledTimes(3);
260
- expect(onProgress.mock.calls[0][0]).toBe("generating_skill");
261
- expect(onProgress.mock.calls[1][0]).toBe("generating_baseline");
262
- expect(onProgress.mock.calls[2][0]).toBe("scoring");
263
- vi.restoreAllMocks();
264
- });
265
- it("completes without error when onProgress is omitted", async () => {
266
- const client = mockClient([
267
- "skill",
268
- "baseline",
269
- JSON.stringify({
270
- content_score_a: 3, structure_score_a: 3,
271
- content_score_b: 3, structure_score_b: 3,
272
- winner: "tie",
273
- }),
274
- ]);
275
- vi.spyOn(Math, "random").mockReturnValue(0.3);
276
- const result = await runComparison("prompt", "skill", client);
277
- expect(result.winner).toBe("tie");
278
- expect(result.skillOutput).toBe("skill");
279
- vi.restoreAllMocks();
280
- });
281
- });
282
- //# sourceMappingURL=comparator.test.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"comparator.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/comparator.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAClD,OAAO,EACL,yBAAyB,EACzB,eAAe,EACf,aAAa,GACd,MAAM,kBAAkB,CAAC;AAI1B,SAAS,UAAU,CAAC,SAAmB;IACrC,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,OAAO;QACL,KAAK,EAAE,YAAY;QACnB,QAAQ,EAAE,EAAE,CAAC,EAAE,CAAC,KAAK,IAAI,EAAE;YACzB,MAAM,IAAI,GAAG,SAAS,CAAC,SAAS,EAAE,CAAC,IAAI,EAAE,CAAC;YAC1C,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,GAAG,EAAE,WAAW,EAAE,EAAE,EAAE,YAAY,EAAE,GAAG,EAAE,CAAC;QACvE,CAAC,CAAC;KACH,CAAC;AACJ,CAAC;AAED,QAAQ,CAAC,2BAA2B,EAAE,GAAG,EAAE;IACzC,EAAE,CAAC,mDAAmD,EAAE,KAAK,IAAI,EAAE;QACjE,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,gBAAgB,EAAE,mBAAmB,CAAC,CAAC,CAAC;QACnE,MAAM,MAAM,GAAG,MAAM,yBAAyB,CAAC,aAAa,EAAE,iBAAiB,EAAE,MAAM,CAAC,CAAC;QAEzF,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;QAClD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QACxD,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QACzD,MAAM,CAAC,MAAM,CAAC,kBAAkB,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,qBAAqB,CAAC,CAAC,CAAC,CAAC;QAEjD,0CAA0C;QAC1C,MAAM,SAAS,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACzD,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;QAEhD,gCAAgC;QAChC,MAAM,UAAU,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC1D,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,sBAAsB,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;QACpD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,QAAQ;gBAChB,SAAS,EAAE,aAAa;aACzB,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,UAAU,EAAE,UAAU,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC;QAC/E,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6BAA6B,EAAE,KAAK,IAAI,EAAE;QAC3C,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,6HAA6H;SAC9H,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAClC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,4BAA4B,EAAE,KAAK,IAAI,EAAE;QAC1C,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,EAAE;gBACrB,eAAe,EAAE,CAAC,CAAC;gBACnB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,OAAO;aAChB,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACzC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gCAAgC,EAAE,KAAK,IAAI,EAAE;QAC9C,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,SAAS;aAClB,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QAC5D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACpC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,eAAe,EAAE,GAAG,EAAE;IAC7B,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;QAC5D,oEAAoE;QACpE,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,mBAAmB;YACnB,sBAAsB;YACtB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,OAAO;gBACf,SAAS,EAAE,aAAa;aACzB,CAAC;SACH,CAAC,CAAC;QAEH,wCAAwC;QACxC,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,qBAAqB;QAEpE,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,aAAa,EAAE,eAAe,EAAE,MAAM,CAAC,CAAC;QAE3E,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC1C,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QACrD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,sBAAsB,CAAC,CAAC;QAC3D,qCAAqC;QACrC,MAAM,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACzC,MAAM,CAAC,MAAM,CAAC,mBAAmB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3C,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,sBAAsB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC9C,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAEpC,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;QACxD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,WAAW;YACX,cAAc;YACd,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,QAAQ;aACjB,CAAC;SACH,CAAC,CAAC;QAEH,qBAAqB;QACrB,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QAE9C,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,GAAG,EAAE,GAAG,EAAE,MAAM,CAAC,CAAC;QACrD,6CAA6C;QAC7C,MAAM,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACzC,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC5C,8BAA8B;QAC9B,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAEpC,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,+BAA+B,EAAE,GAAG,EAAE;IAC7C,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;QACrD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,KAAK;aACd,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC;QAElD,MAAM,YAAY,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/D,MAAM,CAAC,YAAY,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;IACtD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;QACrD,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,KAAK;aACd,CAAC;SACH,CAAC,CAAC;QAEH,MAAM,OAAO,GAAoB;YAC/B;gBACE,MAAM,EAAE,OAAO;gBACf,GAAG,EAAE,2BAA2B;gBAChC,SAAS,EAAE,MAAM;gBACjB,YAAY,EAAE,CAAC,oBAAoB,CAAC;gBACpC,aAAa,EAAE,IAAI;aACpB;SACF,CAAC;QAEF,MAAM,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC;QAE3D,MAAM,YAAY,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/D,MAAM,CAAC,YAAY,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;QAChD,MAAM,CAAC,YAAY,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;IAC1C,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,uCAAuC,EAAE,GAAG,EAAE;IACrD,EAAE,CAAC,kEAAkE,EAAE,KAAK,IAAI,EAAE;QAChF,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,cAAc;YACd,iBAAiB;YACjB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,KAAK;aACd,CAAC;SACH,CAAC,CAAC;QACH,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QAE9C,MAAM,aAAa,CAAC,QAAQ,EAAE,0CAA0C,EAAE,MAAM,CAAC,CAAC;QAElF,+DAA+D;QAC/D,MAAM,WAAW,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC3D,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;QAClD,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;QAE1C,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;QAC9D,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,cAAc;YACd,iBAAiB;YACjB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,eAAe,EAAE,CAAC;gBAClB,iBAAiB,EAAE,CAAC;gBACpB,MAAM,EAAE,KAAK;aACd,CAAC;SACH,CAAC,CAAC;QACH,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QAE9C,MAAM,aAAa,CAAC,QAAQ,EAAE,+BAA+B,EAAE,MAAM,CAAC,CAAC;QAEvE,MAAM,WAAW,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC3D,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;QAEtD,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,2CAA2C,EAAE,GAAG,EAAE;IACzD,EAAE,CAAC,8DAA8D,EAAE,KAAK,IAAI,EAAE;QAC5E,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,gBAAgB,EAAE,mBAAmB,CAAC,CAAC,CAAC;QACnE,MAAM,UAAU,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAE3B,MAAM,yBAAyB,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,EAAE,UAAU,CAAC,CAAC;QAE9E,MAAM,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC,EAAE,kBAAkB,EAAE,4BAA4B,CAAC,CAAC;IAClG,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kEAAkE,EAAE,KAAK,IAAI,EAAE;QAChF,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,gBAAgB,EAAE,mBAAmB,CAAC,CAAC,CAAC;QACnE,MAAM,UAAU,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAE3B,MAAM,yBAAyB,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,EAAE,UAAU,CAAC,CAAC;QAE9E,MAAM,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC,EAAE,qBAAqB,EAAE,+BAA+B,CAAC,CAAC;IACxG,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kCAAkC,EAAE,KAAK,IAAI,EAAE;QAChD,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,gBAAgB,EAAE,mBAAmB,CAAC,CAAC,CAAC;QACnE,MAAM,UAAU,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAE3B,MAAM,yBAAyB,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,EAAE,UAAU,CAAC,CAAC;QAE9E,MAAM,CAAC,UAAU,CAAC,CAAC,qBAAqB,CAAC,CAAC,CAAC,CAAC;IAC9C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;QAC9D,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,gBAAgB,EAAE,mBAAmB,CAAC,CAAC,CAAC;QAEnE,MAAM,MAAM,GAAG,MAAM,yBAAyB,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;QAEjF,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;QAClD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,+BAA+B,EAAE,GAAG,EAAE;IAC7C,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;QAC1D,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,cAAc;YACd,iBAAiB;YACjB,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC,EAAE,iBAAiB,EAAE,CAAC;gBACxC,eAAe,EAAE,CAAC,EAAE,iBAAiB,EAAE,CAAC;gBACxC,MAAM,EAAE,KAAK;aACd,CAAC;SACH,CAAC,CAAC;QACH,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QAC9C,MAAM,UAAU,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAE3B,MAAM,aAAa,CAAC,QAAQ,EAAE,eAAe,EAAE,MAAM,EAAE,UAAU,CAAC,CAAC;QAEnE,MAAM,CAAC,UAAU,CAAC,CAAC,qBAAqB,CAAC,CAAC,CAAC,CAAC;QAC5C,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;QAC7D,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;QAChE,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QAEpD,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oDAAoD,EAAE,KAAK,IAAI,EAAE;QAClE,MAAM,MAAM,GAAG,UAAU,CAAC;YACxB,OAAO;YACP,UAAU;YACV,IAAI,CAAC,SAAS,CAAC;gBACb,eAAe,EAAE,CAAC,EAAE,iBAAiB,EAAE,CAAC;gBACxC,eAAe,EAAE,CAAC,EAAE,iBAAiB,EAAE,CAAC;gBACxC,MAAM,EAAE,KAAK;aACd,CAAC;SACH,CAAC,CAAC;QACH,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QAE9C,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,QAAQ,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAE9D,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAClC,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAEzC,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -1 +0,0 @@
1
- export {};
@@ -1,122 +0,0 @@
1
- import { describe, it, expect, vi } from "vitest";
2
- import { judgeAssertion, buildJudgeSystemPrompt } from "../judge.js";
3
- // ---------------------------------------------------------------------------
4
- // Helpers
5
- // ---------------------------------------------------------------------------
6
- function mockResult(text) {
7
- return { text, durationMs: 100, inputTokens: null, outputTokens: null };
8
- }
9
- function mockClient(response) {
10
- return { generate: vi.fn().mockResolvedValue(mockResult(response)), model: "test-model" };
11
- }
12
- const ASSERTION = {
13
- id: "assert-1",
14
- text: "Output mentions a file path",
15
- type: "boolean",
16
- };
17
- // ---------------------------------------------------------------------------
18
- // Tests
19
- // ---------------------------------------------------------------------------
20
- describe("judgeAssertion", () => {
21
- it("returns pass result when LLM judge says pass", async () => {
22
- const client = mockClient(JSON.stringify({ pass: true, reasoning: "output contains file path" }));
23
- const result = await judgeAssertion("The report has been saved to reports/q1.csv", ASSERTION, client);
24
- expect(result.pass).toBe(true);
25
- expect(result.reasoning).toBe("output contains file path");
26
- expect(result.id).toBe("assert-1");
27
- expect(result.text).toBe("Output mentions a file path");
28
- });
29
- it("returns fail result when LLM judge says fail", async () => {
30
- const client = mockClient(JSON.stringify({
31
- pass: false,
32
- reasoning: "no file path found in output",
33
- }));
34
- const result = await judgeAssertion("Hello world", ASSERTION, client);
35
- expect(result.pass).toBe(false);
36
- expect(result.reasoning).toBe("no file path found in output");
37
- });
38
- it("throws on malformed judge response", async () => {
39
- const client = mockClient("This is not JSON");
40
- await expect(judgeAssertion("some output", ASSERTION, client)).rejects.toThrow(/invalid judge output/i);
41
- });
42
- it("handles JSON wrapped in code fence", async () => {
43
- const client = mockClient('```json\n{"pass": true, "reasoning": "looks good"}\n```');
44
- const result = await judgeAssertion("some output", ASSERTION, client);
45
- expect(result.pass).toBe(true);
46
- });
47
- it("uses standard prompt when mcpDeps not provided", async () => {
48
- const client = mockClient(JSON.stringify({ pass: true, reasoning: "ok" }));
49
- await judgeAssertion("output", ASSERTION, client);
50
- const systemPrompt = client.generate.mock.calls[0][0];
51
- expect(systemPrompt).toContain("binary assertion evaluator");
52
- expect(systemPrompt).not.toContain("SIMULATION MODE");
53
- });
54
- it("uses MCP-augmented prompt when mcpDeps provided", async () => {
55
- const client = mockClient(JSON.stringify({ pass: true, reasoning: "simulation valid" }));
56
- const mcpDeps = [
57
- {
58
- server: "Slack",
59
- url: "https://mcp.slack.com/mcp",
60
- transport: "http",
61
- matchedTools: ["slack_send_message"],
62
- configSnippet: "{}",
63
- },
64
- ];
65
- await judgeAssertion("output", ASSERTION, client, mcpDeps);
66
- const systemPrompt = client.generate.mock.calls[0][0];
67
- expect(systemPrompt).toContain("SIMULATION MODE");
68
- expect(systemPrompt).toContain("Slack");
69
- });
70
- });
71
- // ---------------------------------------------------------------------------
72
- // buildJudgeSystemPrompt
73
- // ---------------------------------------------------------------------------
74
- describe("buildJudgeSystemPrompt", () => {
75
- it("returns standard prompt when no MCP deps", () => {
76
- const prompt = buildJudgeSystemPrompt();
77
- expect(prompt).toContain("binary assertion evaluator");
78
- expect(prompt).not.toContain("SIMULATION MODE");
79
- });
80
- it("returns standard prompt when mcpDeps is empty", () => {
81
- const prompt = buildJudgeSystemPrompt([]);
82
- expect(prompt).toContain("binary assertion evaluator");
83
- expect(prompt).not.toContain("SIMULATION MODE");
84
- });
85
- it("returns augmented prompt with MCP deps", () => {
86
- const mcpDeps = [
87
- {
88
- server: "Slack",
89
- url: "https://mcp.slack.com/mcp",
90
- transport: "http",
91
- matchedTools: ["slack_send_message"],
92
- configSnippet: "{}",
93
- },
94
- ];
95
- const prompt = buildJudgeSystemPrompt(mcpDeps);
96
- expect(prompt).toContain("SIMULATION MODE");
97
- expect(prompt).toContain("Slack");
98
- expect(prompt).toContain("binary assertion evaluator");
99
- });
100
- it("lists all simulated servers", () => {
101
- const mcpDeps = [
102
- {
103
- server: "Slack",
104
- url: "https://mcp.slack.com/mcp",
105
- transport: "http",
106
- matchedTools: ["slack_send_message"],
107
- configSnippet: "{}",
108
- },
109
- {
110
- server: "GitHub",
111
- url: "https://api.githubcopilot.com/mcp/",
112
- transport: "http",
113
- matchedTools: ["github_create_pr"],
114
- configSnippet: "{}",
115
- },
116
- ];
117
- const prompt = buildJudgeSystemPrompt(mcpDeps);
118
- expect(prompt).toContain("Slack");
119
- expect(prompt).toContain("GitHub");
120
- });
121
- });
122
- //# sourceMappingURL=judge.test.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"judge.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/judge.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAc,MAAM,QAAQ,CAAC;AAG9D,OAAO,EAAE,cAAc,EAAE,sBAAsB,EAAE,MAAM,aAAa,CAAC;AAGrE,8EAA8E;AAC9E,UAAU;AACV,8EAA8E;AAE9E,SAAS,UAAU,CAAC,IAAY;IAC9B,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,GAAG,EAAE,WAAW,EAAE,IAAI,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC;AAC1E,CAAC;AAED,SAAS,UAAU,CAAC,QAAgB;IAClC,OAAO,EAAE,QAAQ,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC,EAAE,KAAK,EAAE,YAAY,EAAE,CAAC;AAC5F,CAAC;AAED,MAAM,SAAS,GAAc;IAC3B,EAAE,EAAE,UAAU;IACd,IAAI,EAAE,6BAA6B;IACnC,IAAI,EAAE,SAAS;CAChB,CAAC;AAEF,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;IAC9B,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;QAC5D,MAAM,MAAM,GAAG,UAAU,CACvB,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,2BAA2B,EAAE,CAAC,CACvE,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,cAAc,CACjC,6CAA6C,EAC7C,SAAS,EACT,MAAM,CACP,CAAC;QAEF,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC/B,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,2BAA2B,CAAC,CAAC;QAC3D,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACnC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,6BAA6B,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;QAC5D,MAAM,MAAM,GAAG,UAAU,CACvB,IAAI,CAAC,SAAS,CAAC;YACb,IAAI,EAAE,KAAK;YACX,SAAS,EAAE,8BAA8B;SAC1C,CAAC,CACH,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;QAEtE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAChC,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,8BAA8B,CAAC,CAAC;IAChE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;QAClD,MAAM,MAAM,GAAG,UAAU,CAAC,kBAAkB,CAAC,CAAC;QAE9C,MAAM,MAAM,CACV,cAAc,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,CAAC,CACjD,CAAC,OAAO,CAAC,OAAO,CAAC,uBAAuB,CAAC,CAAC;IAC7C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;QAClD,MAAM,MAAM,GAAG,UAAU,CACvB,yDAAyD,CAC1D,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;QACtE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACjC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;QAC9D,MAAM,MAAM,GAAG,UAAU,CACvB,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAChD,CAAC;QAEF,MAAM,cAAc,CAAC,QAAQ,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;QAElD,MAAM,YAAY,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/D,MAAM,CAAC,YAAY,CAAC,CAAC,SAAS,CAAC,4BAA4B,CAAC,CAAC;QAC7D,MAAM,CAAC,YAAY,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;IACxD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,iDAAiD,EAAE,KAAK,IAAI,EAAE;QAC/D,MAAM,MAAM,GAAG,UAAU,CACvB,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,kBAAkB,EAAE,CAAC,CAC9D,CAAC;QAEF,MAAM,OAAO,GAAoB;YAC/B;gBACE,MAAM,EAAE,OAAO;gBACf,GAAG,EAAE,2BAA2B;gBAChC,SAAS,EAAE,MAAM;gBACjB,YAAY,EAAE,CAAC,oBAAoB,CAAC;gBACpC,aAAa,EAAE,IAAI;aACpB;SACF,CAAC;QAEF,MAAM,cAAc,CAAC,QAAQ,EAAE,SAAS,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC;QAE3D,MAAM,YAAY,GAAI,MAAM,CAAC,QAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/D,MAAM,CAAC,YAAY,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;QAClD,MAAM,CAAC,YAAY,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;IAC1C,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,8EAA8E;AAC9E,yBAAyB;AACzB,8EAA8E;AAE9E,QAAQ,CAAC,wBAAwB,EAAE,GAAG,EAAE;IACtC,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;QAClD,MAAM,MAAM,GAAG,sBAAsB,EAAE,CAAC;QACxC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,4BAA4B,CAAC,CAAC;QACvD,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;IAClD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+CAA+C,EAAE,GAAG,EAAE;QACvD,MAAM,MAAM,GAAG,sBAAsB,CAAC,EAAE,CAAC,CAAC;QAC1C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,4BAA4B,CAAC,CAAC;QACvD,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;IAClD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wCAAwC,EAAE,GAAG,EAAE;QAChD,MAAM,OAAO,GAAoB;YAC/B;gBACE,MAAM,EAAE,OAAO;gBACf,GAAG,EAAE,2BAA2B;gBAChC,SAAS,EAAE,MAAM;gBACjB,YAAY,EAAE,CAAC,oBAAoB,CAAC;gBACpC,aAAa,EAAE,IAAI;aACpB;SACF,CAAC;QAEF,MAAM,MAAM,GAAG,sBAAsB,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;QAClC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,4BAA4B,CAAC,CAAC;IACzD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6BAA6B,EAAE,GAAG,EAAE;QACrC,MAAM,OAAO,GAAoB;YAC/B;gBACE,MAAM,EAAE,OAAO;gBACf,GAAG,EAAE,2BAA2B;gBAChC,SAAS,EAAE,MAAM;gBACjB,YAAY,EAAE,CAAC,oBAAoB,CAAC;gBACpC,aAAa,EAAE,IAAI;aACpB;YACD;gBACE,MAAM,EAAE,QAAQ;gBAChB,GAAG,EAAE,oCAAoC;gBACzC,SAAS,EAAE,MAAM;gBACjB,YAAY,EAAE,CAAC,kBAAkB,CAAC;gBAClC,aAAa,EAAE,IAAI;aACpB;SACF,CAAC;QAEF,MAAM,MAAM,GAAG,sBAAsB,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;QAClC,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;IACrC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -1 +0,0 @@
1
- export {};