dialectic 0.1.0

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. package/.cursor/commands/setup-test.mdc +175 -0
  2. package/.cursor/rules/basic-code-cleanup.mdc +1110 -0
  3. package/.cursor/rules/riper5.mdc +96 -0
  4. package/.env.example +6 -0
  5. package/AGENTS.md +1052 -0
  6. package/LICENSE +21 -0
  7. package/README.md +93 -0
  8. package/WARP.md +113 -0
  9. package/dialectic-1.0.0.tgz +0 -0
  10. package/dialectic.js +10 -0
  11. package/docs/commands.md +375 -0
  12. package/docs/configuration.md +882 -0
  13. package/docs/context_summarization.md +1023 -0
  14. package/docs/debate_flow.md +1127 -0
  15. package/docs/eval_flow.md +795 -0
  16. package/docs/evaluator.md +141 -0
  17. package/examples/debate-config-openrouter.json +48 -0
  18. package/examples/debate_config1.json +48 -0
  19. package/examples/eval/eval1/eval_config1.json +13 -0
  20. package/examples/eval/eval1/result1.json +62 -0
  21. package/examples/eval/eval1/result2.json +97 -0
  22. package/examples/eval_summary_format.md +11 -0
  23. package/examples/example3/debate-config.json +64 -0
  24. package/examples/example3/eval_config2.json +25 -0
  25. package/examples/example3/problem.md +17 -0
  26. package/examples/example3/rounds_test/eval_run.sh +16 -0
  27. package/examples/example3/rounds_test/run_test.sh +16 -0
  28. package/examples/kata1/architect-only-solution_2-rounds.json +121 -0
  29. package/examples/kata1/architect-perf-solution_2-rounds.json +234 -0
  30. package/examples/kata1/debate-config-kata1.json +54 -0
  31. package/examples/kata1/eval_architect-only_2-rounds.json +97 -0
  32. package/examples/kata1/eval_architect-perf_2-rounds.json +97 -0
  33. package/examples/kata1/kata1-report.md +12224 -0
  34. package/examples/kata1/kata1-report_temps-01_01_01_07.md +2451 -0
  35. package/examples/kata1/kata1.md +5 -0
  36. package/examples/kata1/meta.txt +1 -0
  37. package/examples/kata2/debate-config.json +54 -0
  38. package/examples/kata2/eval_config1.json +21 -0
  39. package/examples/kata2/eval_config2.json +25 -0
  40. package/examples/kata2/kata2.md +5 -0
  41. package/examples/kata2/only_architect/debate-config.json +45 -0
  42. package/examples/kata2/only_architect/eval_run.sh +11 -0
  43. package/examples/kata2/only_architect/run_test.sh +5 -0
  44. package/examples/kata2/rounds_test/eval_run.sh +11 -0
  45. package/examples/kata2/rounds_test/run_test.sh +5 -0
  46. package/examples/kata2/summary_length_test/eval_run.sh +11 -0
  47. package/examples/kata2/summary_length_test/eval_run_w_clarify.sh +7 -0
  48. package/examples/kata2/summary_length_test/run_test.sh +5 -0
  49. package/examples/task-queue/debate-config.json +76 -0
  50. package/examples/task-queue/debate_report.md +566 -0
  51. package/examples/task-queue/task-queue-system.md +25 -0
  52. package/jest.config.ts +13 -0
  53. package/multi_agent_debate_spec.md +2980 -0
  54. package/package.json +38 -0
  55. package/sanity-check-problem.txt +9 -0
  56. package/src/agents/prompts/architect-prompts.ts +203 -0
  57. package/src/agents/prompts/generalist-prompts.ts +157 -0
  58. package/src/agents/prompts/index.ts +41 -0
  59. package/src/agents/prompts/judge-prompts.ts +19 -0
  60. package/src/agents/prompts/kiss-prompts.ts +230 -0
  61. package/src/agents/prompts/performance-prompts.ts +142 -0
  62. package/src/agents/prompts/prompt-types.ts +68 -0
  63. package/src/agents/prompts/security-prompts.ts +149 -0
  64. package/src/agents/prompts/shared.ts +144 -0
  65. package/src/agents/prompts/testing-prompts.ts +149 -0
  66. package/src/agents/role-based-agent.ts +386 -0
  67. package/src/cli/commands/debate.ts +761 -0
  68. package/src/cli/commands/eval.ts +475 -0
  69. package/src/cli/commands/report.ts +265 -0
  70. package/src/cli/index.ts +79 -0
  71. package/src/core/agent.ts +198 -0
  72. package/src/core/clarifications.ts +34 -0
  73. package/src/core/judge.ts +257 -0
  74. package/src/core/orchestrator.ts +432 -0
  75. package/src/core/state-manager.ts +322 -0
  76. package/src/eval/evaluator-agent.ts +130 -0
  77. package/src/eval/prompts/system.md +41 -0
  78. package/src/eval/prompts/user.md +64 -0
  79. package/src/providers/llm-provider.ts +25 -0
  80. package/src/providers/openai-provider.ts +84 -0
  81. package/src/providers/openrouter-provider.ts +122 -0
  82. package/src/providers/provider-factory.ts +64 -0
  83. package/src/types/agent.types.ts +141 -0
  84. package/src/types/config.types.ts +47 -0
  85. package/src/types/debate.types.ts +237 -0
  86. package/src/types/eval.types.ts +85 -0
  87. package/src/utils/common.ts +104 -0
  88. package/src/utils/context-formatter.ts +102 -0
  89. package/src/utils/context-summarizer.ts +143 -0
  90. package/src/utils/env-loader.ts +46 -0
  91. package/src/utils/exit-codes.ts +5 -0
  92. package/src/utils/id.ts +11 -0
  93. package/src/utils/logger.ts +48 -0
  94. package/src/utils/paths.ts +10 -0
  95. package/src/utils/progress-ui.ts +313 -0
  96. package/src/utils/prompt-loader.ts +79 -0
  97. package/src/utils/report-generator.ts +301 -0
  98. package/tests/clarifications.spec.ts +128 -0
  99. package/tests/cli.debate.spec.ts +144 -0
  100. package/tests/config-loading.spec.ts +206 -0
  101. package/tests/context-summarizer.spec.ts +131 -0
  102. package/tests/debate-config-custom.json +38 -0
  103. package/tests/env-loader.spec.ts +149 -0
  104. package/tests/eval.command.spec.ts +1191 -0
  105. package/tests/logger.spec.ts +19 -0
  106. package/tests/openai-provider.spec.ts +26 -0
  107. package/tests/openrouter-provider.spec.ts +279 -0
  108. package/tests/orchestrator-summary.spec.ts +386 -0
  109. package/tests/orchestrator.spec.ts +207 -0
  110. package/tests/prompt-loader.spec.ts +52 -0
  111. package/tests/prompts/architect.md +16 -0
  112. package/tests/provider-factory.spec.ts +150 -0
  113. package/tests/report.command.spec.ts +546 -0
  114. package/tests/role-based-agent-summary.spec.ts +476 -0
  115. package/tests/security-agent.spec.ts +221 -0
  116. package/tests/shared-prompts.spec.ts +318 -0
  117. package/tests/state-manager.spec.ts +251 -0
  118. package/tests/summary-prompts.spec.ts +153 -0
  119. package/tsconfig.json +49 -0
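The only per-file diff reproduced below is tests/eval.command.spec.ts (entry 104, +1191 lines), which exercises the `eval` CLI command end to end. Judging from the arguments those tests pass to `runCli`, a typical run looks roughly like the sketch below; the import path and the file paths are illustrative assumptions taken from the test imports and example fixtures, not documented usage.

```typescript
// Sketch of a programmatic `eval` run, mirroring the flag combinations the
// tests below exercise (--config and --debate required; --output, --env-file
// and --verbose optional). Paths are placeholders, not shipped defaults.
import { runCli } from './src/cli/index'; // relative to the package root, as in the tests

async function main(): Promise<void> {
  await runCli([
    'eval',
    '--config', 'examples/eval/eval1/eval_config1.json',
    '--debate', 'examples/eval/eval1/result1.json',
    '--output', 'eval-results.json',
    '--env-file', '.env',
    '--verbose',
  ]);
}

main().catch((err) => {
  // The tests show the CLI rejecting with a `code` property (EXIT_INVALID_ARGS,
  // EXIT_CONFIG_ERROR) on bad input; surface that here instead of swallowing it.
  console.error(err);
  process.exitCode = typeof err?.code === 'number' ? err.code : 1;
});
```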
package/tests/eval.command.spec.ts
@@ -0,0 +1,1191 @@
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+ import os from 'os';
4
+ import { runCli } from '../src/cli/index';
5
+ import { EXIT_INVALID_ARGS, EXIT_CONFIG_ERROR } from '../src/utils/exit-codes';
6
+ import { EvaluatorAgent } from '../src/eval/evaluator-agent';
7
+
8
+ // Mock env-loader
9
+ jest.mock('../src/utils/env-loader', () => ({
10
+ loadEnvironmentFile: jest.fn()
11
+ }));
12
+
13
+ // Mock provider-factory
14
+ jest.mock('../src/providers/provider-factory', () => ({
15
+ createProvider: jest.fn()
16
+ }));
17
+
18
+ import { loadEnvironmentFile } from '../src/utils/env-loader';
19
+ import { createProvider } from '../src/providers/provider-factory';
20
+
21
+ const mockedLoadEnvironmentFile = loadEnvironmentFile as jest.MockedFunction<typeof loadEnvironmentFile>;
22
+ const mockedCreateProvider = createProvider as jest.MockedFunction<typeof createProvider>;
23
+
24
+ describe('CLI eval command', () => {
25
+ const originalEnv = process.env;
26
+ let stderrSpy: jest.SpyInstance;
27
+ let stdoutSpy: jest.SpyInstance;
28
+ let exitSpy: jest.SpyInstance;
29
+ let tmpDir: string;
30
+
31
+ beforeEach(() => {
32
+ process.env = { ...originalEnv, OPENAI_API_KEY: 'test-key' };
33
+ stderrSpy = jest.spyOn(process.stderr, 'write').mockImplementation(() => true);
34
+ stdoutSpy = jest.spyOn(process.stdout, 'write').mockImplementation(() => true);
35
+ exitSpy = jest.spyOn(process, 'exit').mockImplementation(((code?: number) => {
36
+ throw new Error(`process.exit: ${code}`);
37
+ }) as any);
38
+ mockedLoadEnvironmentFile.mockClear();
39
+ mockedCreateProvider.mockClear();
40
+
41
+ tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'eval-test-'));
42
+ });
43
+
44
+ afterEach(() => {
45
+ process.env = originalEnv;
46
+ stderrSpy.mockRestore();
47
+ stdoutSpy.mockRestore();
48
+ exitSpy.mockRestore();
49
+
50
+ try {
51
+ fs.rmSync(tmpDir, { recursive: true, force: true });
52
+ } catch {}
53
+ });
54
+
55
+ describe('Required flags validation', () => {
56
+ it('should reject when --config flag is missing', async () => {
57
+ await expect(runCli(['eval', '--debate', 'some-debate.json']))
58
+ .rejects.toThrow();
59
+ });
60
+
61
+ it('should reject when --debate flag is missing', async () => {
62
+ await expect(runCli(['eval', '--config', 'some-config.json']))
63
+ .rejects.toThrow();
64
+ });
65
+
66
+ it('should reject when both required flags are missing', async () => {
67
+ await expect(runCli(['eval']))
68
+ .rejects.toThrow();
69
+ });
70
+ });
71
+
72
+ describe('File existence validation', () => {
73
+ it('should exit with invalid args when config file does not exist', async () => {
74
+ const debatePath = path.join(tmpDir, 'debate.json');
75
+ fs.writeFileSync(debatePath, JSON.stringify({
76
+ problem: 'Test problem',
77
+ finalSolution: { description: 'Test solution' }
78
+ }));
79
+
80
+ await expect(runCli(['eval', '--config', 'nonexistent.json', '--debate', debatePath]))
81
+ .rejects.toHaveProperty('code', EXIT_INVALID_ARGS);
82
+ });
83
+
84
+ it('should exit with invalid args when debate file does not exist', async () => {
85
+ const configPath = path.join(tmpDir, 'config.json');
86
+ fs.writeFileSync(configPath, JSON.stringify({
87
+ agents: [{ id: 'e1', name: 'Evaluator', model: 'gpt-4', provider: 'openai' }]
88
+ }));
89
+
90
+ await expect(runCli(['eval', '--config', configPath, '--debate', 'nonexistent.json']))
91
+ .rejects.toHaveProperty('code', EXIT_INVALID_ARGS);
92
+ });
93
+ });
94
+
95
+ describe('Config validation', () => {
96
+ it('should reject config without agents array', async () => {
97
+ const configPath = path.join(tmpDir, 'config.json');
98
+ const debatePath = path.join(tmpDir, 'debate.json');
99
+
100
+ fs.writeFileSync(configPath, JSON.stringify({ foo: 'bar' }));
101
+ fs.writeFileSync(debatePath, JSON.stringify({
102
+ problem: 'Test problem',
103
+ finalSolution: { description: 'Test solution' }
104
+ }));
105
+
106
+ await expect(runCli(['eval', '--config', configPath, '--debate', debatePath]))
107
+ .rejects.toHaveProperty('code', EXIT_INVALID_ARGS);
108
+ expect(stderrSpy).toHaveBeenCalledWith(
109
+ expect.stringContaining('agents array required')
110
+ );
111
+ });
112
+
113
+ it('should reject config with empty agents array', async () => {
114
+ const configPath = path.join(tmpDir, 'config.json');
115
+ const debatePath = path.join(tmpDir, 'debate.json');
116
+
117
+ fs.writeFileSync(configPath, JSON.stringify({ agents: [] }));
118
+ fs.writeFileSync(debatePath, JSON.stringify({
119
+ problem: 'Test problem',
120
+ finalSolution: { description: 'Test solution' }
121
+ }));
122
+
123
+ await expect(runCli(['eval', '--config', configPath, '--debate', debatePath]))
124
+ .rejects.toHaveProperty('code', EXIT_INVALID_ARGS);
125
+ });
126
+
127
+ it('should reject config with malformed JSON', async () => {
128
+ const configPath = path.join(tmpDir, 'config.json');
129
+ const debatePath = path.join(tmpDir, 'debate.json');
130
+
131
+ fs.writeFileSync(configPath, '{ agents: [invalid json}');
132
+ fs.writeFileSync(debatePath, JSON.stringify({
133
+ problem: 'Test problem',
134
+ finalSolution: { description: 'Test solution' }
135
+ }));
136
+
137
+ await expect(runCli(['eval', '--config', configPath, '--debate', debatePath]))
138
+ .rejects.toHaveProperty('code', EXIT_INVALID_ARGS);
139
+ });
140
+
141
+ it('should filter out disabled evaluators', async () => {
142
+ const configPath = path.join(tmpDir, 'config.json');
143
+ const debatePath = path.join(tmpDir, 'debate.json');
144
+
145
+ fs.writeFileSync(configPath, JSON.stringify({
146
+ agents: [
147
+ { id: 'e1', name: 'E1', model: 'gpt-4', provider: 'openai', enabled: false },
148
+ { id: 'e2', name: 'E2', model: 'gpt-4', provider: 'openai', enabled: false }
149
+ ]
150
+ }));
151
+ fs.writeFileSync(debatePath, JSON.stringify({
152
+ problem: 'Test problem',
153
+ finalSolution: { description: 'Test solution' }
154
+ }));
155
+
156
+ await expect(runCli(['eval', '--config', configPath, '--debate', debatePath]))
157
+ .rejects.toHaveProperty('code', EXIT_INVALID_ARGS);
158
+ expect(stderrSpy).toHaveBeenCalledWith(
159
+ expect.stringContaining('No enabled evaluator agents')
160
+ );
161
+ });
162
+ });
163
+
164
+ describe('Debate input validation', () => {
165
+ let configPath: string;
166
+
167
+ beforeEach(() => {
168
+ configPath = path.join(tmpDir, 'config.json');
169
+ fs.writeFileSync(configPath, JSON.stringify({
170
+ agents: [{ id: 'e1', name: 'Evaluator', model: 'gpt-4', provider: 'openai' }]
171
+ }));
172
+ });
173
+
174
+ it('should reject debate JSON without problem field', async () => {
175
+ const debatePath = path.join(tmpDir, 'debate.json');
176
+ fs.writeFileSync(debatePath, JSON.stringify({
177
+ finalSolution: { description: 'Test solution' }
178
+ }));
179
+
180
+ await expect(runCli(['eval', '--config', configPath, '--debate', debatePath]))
181
+ .rejects.toHaveProperty('code', EXIT_INVALID_ARGS);
182
+ expect(stderrSpy).toHaveBeenCalledWith(
183
+ expect.stringContaining('missing non-empty problem')
184
+ );
185
+ });
186
+
187
+ it('should reject debate JSON with empty problem', async () => {
188
+ const debatePath = path.join(tmpDir, 'debate.json');
189
+ fs.writeFileSync(debatePath, JSON.stringify({
190
+ problem: ' ',
191
+ finalSolution: { description: 'Test solution' }
192
+ }));
193
+
194
+ await expect(runCli(['eval', '--config', configPath, '--debate', debatePath]))
195
+ .rejects.toHaveProperty('code', EXIT_INVALID_ARGS);
196
+ });
197
+
198
+ it('should reject debate JSON without finalSolution', async () => {
199
+ const debatePath = path.join(tmpDir, 'debate.json');
200
+ fs.writeFileSync(debatePath, JSON.stringify({
201
+ problem: 'Test problem'
202
+ }));
203
+
204
+ await expect(runCli(['eval', '--config', configPath, '--debate', debatePath]))
205
+ .rejects.toHaveProperty('code', EXIT_INVALID_ARGS);
206
+ expect(stderrSpy).toHaveBeenCalledWith(
207
+ expect.stringContaining('missing non-empty finalSolution.description')
208
+ );
209
+ });
210
+
211
+ it('should reject debate JSON with empty finalSolution.description', async () => {
212
+ const debatePath = path.join(tmpDir, 'debate.json');
213
+ fs.writeFileSync(debatePath, JSON.stringify({
214
+ problem: 'Test problem',
215
+ finalSolution: { description: '' }
216
+ }));
217
+
218
+ await expect(runCli(['eval', '--config', configPath, '--debate', debatePath]))
219
+ .rejects.toHaveProperty('code', EXIT_INVALID_ARGS);
220
+ });
221
+
222
+ it('should reject debate JSON with malformed JSON', async () => {
223
+ const debatePath = path.join(tmpDir, 'debate.json');
224
+ fs.writeFileSync(debatePath, '{ problem: invalid }');
225
+
226
+ await expect(runCli(['eval', '--config', configPath, '--debate', debatePath]))
227
+ .rejects.toHaveProperty('code', EXIT_INVALID_ARGS);
228
+ });
229
+ });
230
+
231
+ describe('Environment file loading', () => {
232
+ let configPath: string;
233
+ let debatePath: string;
234
+
235
+ beforeEach(() => {
236
+ configPath = path.join(tmpDir, 'config.json');
237
+ debatePath = path.join(tmpDir, 'debate.json');
238
+
239
+ fs.writeFileSync(configPath, JSON.stringify({
240
+ agents: [{ id: 'e1', name: 'E1', model: 'gpt-4', provider: 'openai' }]
241
+ }));
242
+ fs.writeFileSync(debatePath, JSON.stringify({
243
+ problem: 'Test problem',
244
+ finalSolution: { description: 'Test solution' }
245
+ }));
246
+
247
+ // Mock provider and evaluator
248
+ const mockProvider = { complete: jest.fn() };
249
+ mockedCreateProvider.mockReturnValue(mockProvider as any);
250
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
251
+ id: 'e1',
252
+ rawText: '{"evaluation":{"functional_completeness":{"score":8}},"overall_summary":{"overall_score":8}}',
253
+ latencyMs: 100
254
+ });
255
+ });
256
+
257
+ afterEach(() => {
258
+ jest.restoreAllMocks();
259
+ });
260
+
261
+ it('should call loadEnvironmentFile with default parameters', async () => {
262
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
263
+ expect(mockedLoadEnvironmentFile).toHaveBeenCalledWith(undefined, undefined);
264
+ });
265
+
266
+ it('should call loadEnvironmentFile with custom env file', async () => {
267
+ await runCli(['eval', '--config', configPath, '--debate', debatePath, '--env-file', 'custom.env']);
268
+ expect(mockedLoadEnvironmentFile).toHaveBeenCalledWith('custom.env', undefined);
269
+ });
270
+
271
+ it('should call loadEnvironmentFile with verbose flag', async () => {
272
+ await runCli(['eval', '--config', configPath, '--debate', debatePath, '--verbose']);
273
+ expect(mockedLoadEnvironmentFile).toHaveBeenCalledWith(undefined, true);
274
+ });
275
+
276
+ it('should call loadEnvironmentFile with both custom env file and verbose', async () => {
277
+ await runCli(['eval', '--config', configPath, '--debate', debatePath, '--env-file', 'prod.env', '--verbose']);
278
+ expect(mockedLoadEnvironmentFile).toHaveBeenCalledWith('prod.env', true);
279
+ });
280
+ });
281
+
282
+ describe('Provider factory integration', () => {
283
+ let configPath: string;
284
+ let debatePath: string;
285
+
286
+ beforeEach(() => {
287
+ configPath = path.join(tmpDir, 'config.json');
288
+ debatePath = path.join(tmpDir, 'debate.json');
289
+
290
+ fs.writeFileSync(debatePath, JSON.stringify({
291
+ problem: 'Test problem',
292
+ finalSolution: { description: 'Test solution' }
293
+ }));
294
+ });
295
+
296
+ it('should call createProvider for each enabled agent', async () => {
297
+ fs.writeFileSync(configPath, JSON.stringify({
298
+ agents: [
299
+ { id: 'e1', name: 'E1', model: 'gpt-4', provider: 'openai' },
300
+ { id: 'e2', name: 'E2', model: 'gpt-3.5-turbo', provider: 'openrouter' }
301
+ ]
302
+ }));
303
+
304
+ const mockProvider = { complete: jest.fn() };
305
+ mockedCreateProvider.mockReturnValue(mockProvider as any);
306
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
307
+ id: 'e1',
308
+ rawText: '{"evaluation":{"functional_completeness":{"score":8}},"overall_summary":{"overall_score":8}}',
309
+ latencyMs: 100
310
+ });
311
+
312
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
313
+
314
+ expect(mockedCreateProvider).toHaveBeenCalledTimes(2);
315
+ expect(mockedCreateProvider).toHaveBeenCalledWith('openai');
316
+ expect(mockedCreateProvider).toHaveBeenCalledWith('openrouter');
317
+ });
318
+
319
+ it('should propagate provider factory errors (missing API keys)', async () => {
320
+ fs.writeFileSync(configPath, JSON.stringify({
321
+ agents: [{ id: 'e1', name: 'E1', model: 'gpt-4', provider: 'openai' }]
322
+ }));
323
+
324
+ mockedCreateProvider.mockImplementation(() => {
325
+ const err: any = new Error('Missing API key for openai');
326
+ err.code = EXIT_CONFIG_ERROR;
327
+ throw err;
328
+ });
329
+
330
+ await expect(runCli(['eval', '--config', configPath, '--debate', debatePath]))
331
+ .rejects.toHaveProperty('code', EXIT_CONFIG_ERROR);
332
+ });
333
+ });
334
+
335
+ describe('Evaluator execution and result parsing', () => {
336
+ let configPath: string;
337
+ let debatePath: string;
338
+ let mockProvider: any;
339
+
340
+ beforeEach(() => {
341
+ configPath = path.join(tmpDir, 'config.json');
342
+ debatePath = path.join(tmpDir, 'debate.json');
343
+
344
+ fs.writeFileSync(configPath, JSON.stringify({
345
+ agents: [{ id: 'e1', name: 'E1', model: 'gpt-4', provider: 'openai' }]
346
+ }));
347
+ fs.writeFileSync(debatePath, JSON.stringify({
348
+ problem: 'Design a rate limiter',
349
+ finalSolution: { description: 'Use token bucket algorithm' }
350
+ }));
351
+
352
+ mockProvider = { complete: jest.fn() };
353
+ mockedCreateProvider.mockReturnValue(mockProvider);
354
+ });
355
+
356
+ it('should successfully parse valid JSON response', async () => {
357
+ const validResponse = {
358
+ evaluation: {
359
+ functional_completeness: { score: 8, reasoning: 'Good coverage' },
360
+ non_functional: {
361
+ performance_scalability: { score: 7 },
362
+ security: { score: 9 },
363
+ maintainability_evolvability: { score: 8 },
364
+ regulatory_compliance: { score: 6 },
365
+ testability: { score: 7 }
366
+ }
367
+ },
368
+ overall_summary: {
369
+ strengths: 'Well designed',
370
+ weaknesses: 'Could improve X',
371
+ overall_score: 8
372
+ }
373
+ };
374
+
375
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
376
+ id: 'e1',
377
+ rawText: JSON.stringify(validResponse),
378
+ latencyMs: 100
379
+ });
380
+
381
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
382
+
383
+ const output = stdoutSpy.mock.calls.join('');
384
+ expect(output).toContain('8.00'); // functional completeness
385
+ expect(output).toContain('7.00'); // performance
386
+ expect(output).toContain('9.00'); // security
387
+ });
388
+
389
+ it('should extract JSON from text with surrounding content', async () => {
390
+ const responseWithExtra = 'Here is the evaluation:\n' +
391
+ '{"evaluation":{"functional_completeness":{"score":8}},"overall_summary":{"overall_score":8}}\n' +
392
+ 'Additional text here';
393
+
394
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
395
+ id: 'e1',
396
+ rawText: responseWithExtra,
397
+ latencyMs: 100
398
+ });
399
+
400
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
401
+
402
+ const output = stdoutSpy.mock.calls.join('');
403
+ expect(output).toContain('8.00');
404
+ });
405
+
406
+ it('should skip agent with invalid JSON and warn', async () => {
407
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
408
+ id: 'e1',
409
+ rawText: 'This is not valid JSON at all',
410
+ latencyMs: 100
411
+ });
412
+
413
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
414
+
415
+ expect(stderrSpy).toHaveBeenCalledWith(
416
+ expect.stringContaining('[e1] Invalid JSON output; skipping agent')
417
+ );
418
+
419
+ const output = stdoutSpy.mock.calls.join('');
420
+ expect(output).toContain('N/A'); // All scores should be N/A
421
+ });
422
+
423
+ it('should skip agent that throws error during evaluation', async () => {
424
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockRejectedValue(
425
+ new Error('Network timeout')
426
+ );
427
+
428
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
429
+
430
+ expect(stderrSpy).toHaveBeenCalledWith(
431
+ expect.stringContaining('[e1] Skipped due to error')
432
+ );
433
+ });
434
+ });
435
+
436
+ describe('Score validation and clamping', () => {
437
+ let configPath: string;
438
+ let debatePath: string;
439
+
440
+ beforeEach(() => {
441
+ configPath = path.join(tmpDir, 'config.json');
442
+ debatePath = path.join(tmpDir, 'debate.json');
443
+
444
+ fs.writeFileSync(configPath, JSON.stringify({
445
+ agents: [{ id: 'e1', name: 'E1', model: 'gpt-4', provider: 'openai' }]
446
+ }));
447
+ fs.writeFileSync(debatePath, JSON.stringify({
448
+ problem: 'Test',
449
+ finalSolution: { description: 'Solution' }
450
+ }));
451
+
452
+ mockedCreateProvider.mockReturnValue({ complete: jest.fn() } as any);
453
+ });
454
+
455
+ it('should clamp score below 1 to 1 and warn', async () => {
456
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
457
+ id: 'e1',
458
+ rawText: JSON.stringify({
459
+ evaluation: { functional_completeness: { score: -5 } },
460
+ overall_summary: { overall_score: 0.5 }
461
+ }),
462
+ latencyMs: 100
463
+ });
464
+
465
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
466
+
467
+ expect(stderrSpy).toHaveBeenCalledWith(
468
+ expect.stringContaining('clamped to [1,10] from -5')
469
+ );
470
+ expect(stderrSpy).toHaveBeenCalledWith(
471
+ expect.stringContaining('clamped to [1,10] from 0.5')
472
+ );
473
+
474
+ const output = stdoutSpy.mock.calls.join('');
475
+ expect(output).toContain('1.00'); // Clamped scores
476
+ });
477
+
478
+ it('should clamp score above 10 to 10 and warn', async () => {
479
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
480
+ id: 'e1',
481
+ rawText: JSON.stringify({
482
+ evaluation: { functional_completeness: { score: 15 } },
483
+ overall_summary: { overall_score: 100 }
484
+ }),
485
+ latencyMs: 100
486
+ });
487
+
488
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
489
+
490
+ expect(stderrSpy).toHaveBeenCalledWith(
491
+ expect.stringContaining('clamped to [1,10] from 15')
492
+ );
493
+ expect(stderrSpy).toHaveBeenCalledWith(
494
+ expect.stringContaining('clamped to [1,10] from 100')
495
+ );
496
+
497
+ const output = stdoutSpy.mock.calls.join('');
498
+ expect(output).toContain('10.00');
499
+ });
500
+
501
+ it('should ignore non-numeric scores and warn', async () => {
502
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
503
+ id: 'e1',
504
+ rawText: JSON.stringify({
505
+ evaluation: { functional_completeness: { score: 'eight' } },
506
+ overall_summary: { overall_score: null }
507
+ }),
508
+ latencyMs: 100
509
+ });
510
+
511
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
512
+
513
+ expect(stderrSpy).toHaveBeenCalledWith(
514
+ expect.stringContaining('[e1] Invalid or missing numeric score')
515
+ );
516
+
517
+ const output = stdoutSpy.mock.calls.join('');
518
+ expect(output).toContain('N/A');
519
+ });
520
+
521
+ it('should ignore missing score fields silently when field is absent', async () => {
522
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
523
+ id: 'e1',
524
+ rawText: JSON.stringify({
525
+ evaluation: { functional_completeness: { score: 8 } },
526
+ overall_summary: { overall_score: 8 }
527
+ // Missing all non_functional scores
528
+ }),
529
+ latencyMs: 100
530
+ });
531
+
532
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
533
+
534
+ const output = stdoutSpy.mock.calls.join('');
535
+ expect(output).toContain('8.00'); // fc and overall
536
+ expect(output).toContain('N/A'); // missing scores
537
+ });
538
+ });
539
+
540
+ describe('Score averaging across multiple agents', () => {
541
+ let configPath: string;
542
+ let debatePath: string;
543
+
544
+ beforeEach(() => {
545
+ configPath = path.join(tmpDir, 'config.json');
546
+ debatePath = path.join(tmpDir, 'debate.json');
547
+
548
+ fs.writeFileSync(configPath, JSON.stringify({
549
+ agents: [
550
+ { id: 'e1', name: 'E1', model: 'gpt-4', provider: 'openai' },
551
+ { id: 'e2', name: 'E2', model: 'gpt-4', provider: 'openai' },
552
+ { id: 'e3', name: 'E3', model: 'gpt-4', provider: 'openai' }
553
+ ]
554
+ }));
555
+ fs.writeFileSync(debatePath, JSON.stringify({
556
+ problem: 'Test',
557
+ finalSolution: { description: 'Solution' }
558
+ }));
559
+
560
+ mockedCreateProvider.mockReturnValue({ complete: jest.fn() } as any);
561
+ });
562
+
563
+ it('should average scores from multiple agents', async () => {
564
+ const evalSpy = jest.spyOn(EvaluatorAgent.prototype, 'evaluate');
565
+
566
+ evalSpy.mockResolvedValueOnce({
567
+ id: 'e1',
568
+ rawText: JSON.stringify({
569
+ evaluation: { functional_completeness: { score: 8 } },
570
+ overall_summary: { overall_score: 8 }
571
+ }),
572
+ latencyMs: 100
573
+ });
574
+
575
+ evalSpy.mockResolvedValueOnce({
576
+ id: 'e2',
577
+ rawText: JSON.stringify({
578
+ evaluation: { functional_completeness: { score: 6 } },
579
+ overall_summary: { overall_score: 6 }
580
+ }),
581
+ latencyMs: 100
582
+ });
583
+
584
+ evalSpy.mockResolvedValueOnce({
585
+ id: 'e3',
586
+ rawText: JSON.stringify({
587
+ evaluation: { functional_completeness: { score: 7 } },
588
+ overall_summary: { overall_score: 7 }
589
+ }),
590
+ latencyMs: 100
591
+ });
592
+
593
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
594
+
595
+ const output = stdoutSpy.mock.calls.join('');
596
+ // Average of 8, 6, 7 = 7.00
597
+ expect(output).toContain('7.00');
598
+ });
599
+
600
+ it('should average only present values when some agents fail', async () => {
601
+ const evalSpy = jest.spyOn(EvaluatorAgent.prototype, 'evaluate');
602
+
603
+ evalSpy.mockResolvedValueOnce({
604
+ id: 'e1',
605
+ rawText: JSON.stringify({
606
+ evaluation: { functional_completeness: { score: 8 } },
607
+ overall_summary: { overall_score: 8 }
608
+ }),
609
+ latencyMs: 100
610
+ });
611
+
612
+ evalSpy.mockRejectedValueOnce(new Error('Timeout'));
613
+
614
+ evalSpy.mockResolvedValueOnce({
615
+ id: 'e3',
616
+ rawText: JSON.stringify({
617
+ evaluation: { functional_completeness: { score: 6 } },
618
+ overall_summary: { overall_score: 6 }
619
+ }),
620
+ latencyMs: 100
621
+ });
622
+
623
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
624
+
625
+ const output = stdoutSpy.mock.calls.join('');
626
+ // Average of 8, 6 = 7.00 (e2 skipped)
627
+ expect(output).toContain('7.00');
628
+ });
629
+
630
+ it('should round to 2 decimal places', async () => {
631
+ const evalSpy = jest.spyOn(EvaluatorAgent.prototype, 'evaluate');
632
+
633
+ evalSpy.mockResolvedValueOnce({
634
+ id: 'e1',
635
+ rawText: JSON.stringify({
636
+ evaluation: { functional_completeness: { score: 8 } },
637
+ overall_summary: { overall_score: 8 }
638
+ }),
639
+ latencyMs: 100
640
+ });
641
+
642
+ evalSpy.mockResolvedValueOnce({
643
+ id: 'e2',
644
+ rawText: JSON.stringify({
645
+ evaluation: { functional_completeness: { score: 7 } },
646
+ overall_summary: { overall_score: 7 }
647
+ }),
648
+ latencyMs: 100
649
+ });
650
+
651
+ evalSpy.mockResolvedValueOnce({
652
+ id: 'e3',
653
+ rawText: JSON.stringify({
654
+ evaluation: { functional_completeness: { score: 8 } },
655
+ overall_summary: { overall_score: 8 }
656
+ }),
657
+ latencyMs: 100
658
+ });
659
+
660
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
661
+
662
+ const output = stdoutSpy.mock.calls.join('');
663
+ // Average of 8, 7, 8 = 7.666... => 7.67
664
+ expect(output).toContain('7.67');
665
+ });
666
+ });
667
+
668
+ describe('Clarifications handling', () => {
669
+ let configPath: string;
670
+ let debatePath: string;
671
+
672
+ beforeEach(() => {
673
+ configPath = path.join(tmpDir, 'config.json');
674
+ debatePath = path.join(tmpDir, 'debate.json');
675
+
676
+ fs.writeFileSync(configPath, JSON.stringify({
677
+ agents: [{ id: 'e1', name: 'E1', model: 'gpt-4', provider: 'openai' }]
678
+ }));
679
+
680
+ mockedCreateProvider.mockReturnValue({ complete: jest.fn() } as any);
681
+ });
682
+
683
+ it('should format clarifications with fenced code blocks', async () => {
684
+ fs.writeFileSync(debatePath, JSON.stringify({
685
+ problem: 'Test',
686
+ finalSolution: { description: 'Solution' },
687
+ clarifications: [
688
+ {
689
+ agentName: 'Architect',
690
+ role: 'architect',
691
+ items: [
692
+ { id: 'q1', question: 'What is the scale?', answer: '1M users' }
693
+ ]
694
+ }
695
+ ]
696
+ }));
697
+
698
+ const evaluateSpy = jest.spyOn(EvaluatorAgent.prototype, 'evaluate');
699
+ evaluateSpy.mockResolvedValue({
700
+ id: 'e1',
701
+ rawText: JSON.stringify({
702
+ evaluation: { functional_completeness: { score: 8 } },
703
+ overall_summary: { overall_score: 8 }
704
+ }),
705
+ latencyMs: 100
706
+ });
707
+
708
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
709
+
710
+ // Check that evaluate was called with properly formatted clarifications
711
+ expect(evaluateSpy).toHaveBeenCalled();
712
+ const call = evaluateSpy.mock.calls[0]?.[0];
713
+ expect(call).toBeDefined();
714
+ expect(call?.clarificationsMarkdown).toContain('### Architect (architect)');
715
+ expect(call?.clarificationsMarkdown).toContain('```text');
716
+ expect(call?.clarificationsMarkdown).toContain('What is the scale?');
717
+ expect(call?.clarificationsMarkdown).toContain('1M users');
718
+ });
719
+
720
+ it('should handle debates without clarifications', async () => {
721
+ fs.writeFileSync(debatePath, JSON.stringify({
722
+ problem: 'Test',
723
+ finalSolution: { description: 'Solution' }
724
+ // No clarifications field
725
+ }));
726
+
727
+ const evaluateSpy = jest.spyOn(EvaluatorAgent.prototype, 'evaluate');
728
+ evaluateSpy.mockResolvedValue({
729
+ id: 'e1',
730
+ rawText: JSON.stringify({
731
+ evaluation: { functional_completeness: { score: 8 } },
732
+ overall_summary: { overall_score: 8 }
733
+ }),
734
+ latencyMs: 100
735
+ });
736
+
737
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
738
+
739
+ expect(evaluateSpy).toHaveBeenCalled();
740
+ const call = evaluateSpy.mock.calls[0]?.[0];
741
+ expect(call).toBeDefined();
742
+ // Should have empty fenced code blocks
743
+ expect(call?.clarificationsMarkdown).toMatch(/```.*```/);
744
+ });
745
+
746
+ it('should preserve NA answers in clarifications', async () => {
747
+ fs.writeFileSync(debatePath, JSON.stringify({
748
+ problem: 'Test',
749
+ finalSolution: { description: 'Solution' },
750
+ clarifications: [
751
+ {
752
+ agentName: 'Security',
753
+ role: 'security',
754
+ items: [
755
+ { id: 'q1', question: 'Security requirements?', answer: 'NA' }
756
+ ]
757
+ }
758
+ ]
759
+ }));
760
+
761
+ const evaluateSpy = jest.spyOn(EvaluatorAgent.prototype, 'evaluate');
762
+ evaluateSpy.mockResolvedValue({
763
+ id: 'e1',
764
+ rawText: JSON.stringify({
765
+ evaluation: { functional_completeness: { score: 8 } },
766
+ overall_summary: { overall_score: 8 }
767
+ }),
768
+ latencyMs: 100
769
+ });
770
+
771
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
772
+
773
+ const call = evaluateSpy.mock.calls[0]?.[0];
774
+ expect(call).toBeDefined();
775
+ expect(call?.clarificationsMarkdown).toContain('NA');
776
+ });
777
+ });
778
+
779
+ describe('Markdown output format', () => {
780
+ let configPath: string;
781
+ let debatePath: string;
782
+
783
+ beforeEach(() => {
784
+ configPath = path.join(tmpDir, 'config.json');
785
+ debatePath = path.join(tmpDir, 'debate.json');
786
+
787
+ fs.writeFileSync(configPath, JSON.stringify({
788
+ agents: [{ id: 'e1', name: 'E1', model: 'gpt-4', provider: 'openai' }]
789
+ }));
790
+ fs.writeFileSync(debatePath, JSON.stringify({
791
+ problem: 'Test',
792
+ finalSolution: { description: 'Solution' }
793
+ }));
794
+
795
+ mockedCreateProvider.mockReturnValue({ complete: jest.fn() } as any);
796
+ });
797
+
798
+ it('should output markdown table to stdout by default', async () => {
799
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
800
+ id: 'e1',
801
+ rawText: JSON.stringify({
802
+ evaluation: {
803
+ functional_completeness: { score: 8 },
804
+ non_functional: {
805
+ performance_scalability: { score: 7 },
806
+ security: { score: 9 },
807
+ maintainability_evolvability: { score: 8 },
808
+ regulatory_compliance: { score: 6 },
809
+ testability: { score: 7 }
810
+ }
811
+ },
812
+ overall_summary: { overall_score: 8 }
813
+ }),
814
+ latencyMs: 100
815
+ });
816
+
817
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
818
+
819
+ const output = stdoutSpy.mock.calls.join('');
820
+
821
+ // Check table structure
822
+ expect(output).toContain('| Functional Completeness');
823
+ expect(output).toContain('| Performance & Scalability');
824
+ expect(output).toContain('| Security');
825
+ expect(output).toContain('| Maintainability & Evolvability');
826
+ expect(output).toContain('| Regulatory Compliance');
827
+ expect(output).toContain('| Testability');
828
+ expect(output).toContain('| Overall Score');
829
+
830
+ // Check values with 2 decimal places
831
+ expect(output).toContain('8.00');
832
+ expect(output).toContain('7.00');
833
+ expect(output).toContain('9.00');
834
+ expect(output).toContain('6.00');
835
+ });
836
+
837
+ it('should write markdown table to file when --output specified (non-json)', async () => {
838
+ const outputPath = path.join(tmpDir, 'results.md');
839
+
840
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
841
+ id: 'e1',
842
+ rawText: JSON.stringify({
843
+ evaluation: { functional_completeness: { score: 8 } },
844
+ overall_summary: { overall_score: 8 }
845
+ }),
846
+ latencyMs: 100
847
+ });
848
+
849
+ await runCli(['eval', '--config', configPath, '--debate', debatePath, '--output', outputPath]);
850
+
851
+ expect(fs.existsSync(outputPath)).toBe(true);
852
+ const content = fs.readFileSync(outputPath, 'utf-8');
853
+ expect(content).toContain('| Functional Completeness');
854
+ expect(content).toContain('8.00');
855
+ });
856
+
857
+ it('should show N/A for missing scores in markdown', async () => {
858
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
859
+ id: 'e1',
860
+ rawText: JSON.stringify({
861
+ evaluation: { functional_completeness: { score: 8 } },
862
+ overall_summary: { overall_score: 8 }
863
+ // Missing all non_functional scores
864
+ }),
865
+ latencyMs: 100
866
+ });
867
+
868
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
869
+
870
+ const output = stdoutSpy.mock.calls.join('');
871
+ expect(output).toContain('8.00'); // fc
872
+ expect(output).toContain('N/A'); // missing scores
873
+ });
874
+ });
875
+
876
+ describe('JSON output format', () => {
877
+ let configPath: string;
878
+ let debatePath: string;
879
+
880
+ beforeEach(() => {
881
+ configPath = path.join(tmpDir, 'config.json');
882
+ debatePath = path.join(tmpDir, 'debate.json');
883
+
884
+ fs.writeFileSync(configPath, JSON.stringify({
885
+ agents: [
886
+ { id: 'e1', name: 'E1', model: 'gpt-4', provider: 'openai' },
887
+ { id: 'e2', name: 'E2', model: 'gpt-4', provider: 'openai' }
888
+ ]
889
+ }));
890
+ fs.writeFileSync(debatePath, JSON.stringify({
891
+ problem: 'Test',
892
+ finalSolution: { description: 'Solution' }
893
+ }));
894
+
895
+ mockedCreateProvider.mockReturnValue({ complete: jest.fn() } as any);
896
+ });
897
+
898
+ it('should write JSON output when --output ends with .json', async () => {
899
+ const outputPath = path.join(tmpDir, 'results.json');
900
+
901
+ const evalSpy = jest.spyOn(EvaluatorAgent.prototype, 'evaluate');
902
+ evalSpy.mockResolvedValueOnce({
903
+ id: 'e1',
904
+ rawText: JSON.stringify({
905
+ evaluation: { functional_completeness: { score: 8, reasoning: 'Good' } },
906
+ overall_summary: { overall_score: 8, strengths: 'Strong', weaknesses: 'Minor' }
907
+ }),
908
+ latencyMs: 100
909
+ });
910
+ evalSpy.mockResolvedValueOnce({
911
+ id: 'e2',
912
+ rawText: JSON.stringify({
913
+ evaluation: { functional_completeness: { score: 6 } },
914
+ overall_summary: { overall_score: 6 }
915
+ }),
916
+ latencyMs: 100
917
+ });
918
+
919
+ await runCli(['eval', '--config', configPath, '--debate', debatePath, '--output', outputPath]);
920
+
921
+ expect(fs.existsSync(outputPath)).toBe(true);
922
+ const content = JSON.parse(fs.readFileSync(outputPath, 'utf-8'));
923
+
924
+ // Check structure
925
+ expect(content).toHaveProperty('evaluation');
926
+ expect(content).toHaveProperty('overall_score');
927
+ expect(content).toHaveProperty('agents');
928
+
929
+ // Check aggregated averages (8 + 6) / 2 = 7
930
+ expect(content.evaluation.functional_completeness.average_score).toBe(7);
931
+ expect(content.overall_score).toBe(7);
932
+
933
+ // Check per-agent results
934
+ expect(content.agents).toHaveProperty('e1');
935
+ expect(content.agents).toHaveProperty('e2');
936
+ expect(content.agents.e1.evaluation.functional_completeness.score).toBe(8);
937
+ expect(content.agents.e1.evaluation.functional_completeness.reasoning).toBe('Good');
938
+ });
939
+
940
+ it('should use null for N/A values in JSON output', async () => {
941
+ const outputPath = path.join(tmpDir, 'results.json');
942
+
943
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
944
+ id: 'e1',
945
+ rawText: JSON.stringify({
946
+ evaluation: { functional_completeness: { score: 8 } },
947
+ overall_summary: { overall_score: 8 }
948
+ // Missing non_functional scores
949
+ }),
950
+ latencyMs: 100
951
+ });
952
+
953
+ await runCli(['eval', '--config', configPath, '--debate', debatePath, '--output', outputPath]);
954
+
955
+ const content = JSON.parse(fs.readFileSync(outputPath, 'utf-8'));
956
+ expect(content.evaluation.non_functional.performance_scalability.average_score).toBeNull();
957
+ expect(content.evaluation.non_functional.security.average_score).toBeNull();
958
+ });
959
+
960
+ it('should write JSON even when output path case is .JSON', async () => {
961
+ const outputPath = path.join(tmpDir, 'results.JSON');
962
+
963
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
964
+ id: 'e1',
965
+ rawText: JSON.stringify({
966
+ evaluation: { functional_completeness: { score: 8 } },
967
+ overall_summary: { overall_score: 8 }
968
+ }),
969
+ latencyMs: 100
970
+ });
971
+
972
+ await runCli(['eval', '--config', configPath, '--debate', debatePath, '--output', outputPath]);
973
+
974
+ expect(fs.existsSync(outputPath)).toBe(true);
975
+ const content = JSON.parse(fs.readFileSync(outputPath, 'utf-8'));
976
+ expect(content).toHaveProperty('evaluation');
977
+ });
978
+ });
979
+
980
+ describe('Verbose mode', () => {
981
+ let configPath: string;
982
+ let debatePath: string;
983
+
984
+ beforeEach(() => {
985
+ configPath = path.join(tmpDir, 'config.json');
986
+ debatePath = path.join(tmpDir, 'debate.json');
987
+
988
+ fs.writeFileSync(configPath, JSON.stringify({
989
+ agents: [{ id: 'e1', name: 'E1', model: 'gpt-4', provider: 'openai' }]
990
+ }));
991
+ fs.writeFileSync(debatePath, JSON.stringify({
992
+ problem: 'Test',
993
+ finalSolution: { description: 'Solution' }
994
+ }));
995
+
996
+ mockedCreateProvider.mockReturnValue({ complete: jest.fn() } as any);
997
+ });
998
+
999
+ it('should log provider and model info in verbose mode', async () => {
1000
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
1001
+ id: 'e1',
1002
+ rawText: JSON.stringify({
1003
+ evaluation: { functional_completeness: { score: 8 } },
1004
+ overall_summary: { overall_score: 8 }
1005
+ }),
1006
+ latencyMs: 100
1007
+ });
1008
+
1009
+ await runCli(['eval', '--config', configPath, '--debate', debatePath, '--verbose']);
1010
+
1011
+ expect(stderrSpy).toHaveBeenCalledWith(
1012
+ expect.stringContaining('[e1] provider=openai model=gpt-4')
1013
+ );
1014
+ });
1015
+
1016
+ it('should log prompt sources in verbose mode (built-in)', async () => {
1017
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
1018
+ id: 'e1',
1019
+ rawText: JSON.stringify({
1020
+ evaluation: { functional_completeness: { score: 8 } },
1021
+ overall_summary: { overall_score: 8 }
1022
+ }),
1023
+ latencyMs: 100
1024
+ });
1025
+
1026
+ await runCli(['eval', '--config', configPath, '--debate', debatePath, '--verbose']);
1027
+
1028
+ expect(stderrSpy).toHaveBeenCalledWith(
1029
+ expect.stringMatching(/systemPrompt=.*built-in default/)
1030
+ );
1031
+ expect(stderrSpy).toHaveBeenCalledWith(
1032
+ expect.stringMatching(/userPrompt=.*built-in default/)
1033
+ );
1034
+ });
1035
+
1036
+ it('should not log verbose info when verbose flag is absent', async () => {
1037
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
1038
+ id: 'e1',
1039
+ rawText: JSON.stringify({
1040
+ evaluation: { functional_completeness: { score: 8 } },
1041
+ overall_summary: { overall_score: 8 }
1042
+ }),
1043
+ latencyMs: 100
1044
+ });
1045
+
1046
+ stderrSpy.mockClear();
1047
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
1048
+
1049
+ // Should not have verbose provider/model logs
1050
+ const stderrCalls = stderrSpy.mock.calls.map(c => c[0]).join('');
1051
+ expect(stderrCalls).not.toContain('provider=openai model=gpt-4');
1052
+ });
1053
+ });
1054
+
1055
+ describe('Prompt resolution', () => {
1056
+ let configPath: string;
1057
+ let debatePath: string;
1058
+ let promptsDir: string;
1059
+
1060
+ beforeEach(() => {
1061
+ configPath = path.join(tmpDir, 'config.json');
1062
+ debatePath = path.join(tmpDir, 'debate.json');
1063
+ promptsDir = path.join(tmpDir, 'prompts');
1064
+
1065
+ fs.mkdirSync(promptsDir);
1066
+ fs.writeFileSync(debatePath, JSON.stringify({
1067
+ problem: 'Test',
1068
+ finalSolution: { description: 'Solution' }
1069
+ }));
1070
+
1071
+ mockedCreateProvider.mockReturnValue({ complete: jest.fn() } as any);
1072
+ });
1073
+
1074
+ it('should use custom system prompt from file when specified', async () => {
1075
+ const customSystemPrompt = 'Custom evaluator system prompt';
1076
+ fs.writeFileSync(path.join(promptsDir, 'eval-system.md'), customSystemPrompt);
1077
+
1078
+ fs.writeFileSync(configPath, JSON.stringify({
1079
+ agents: [{
1080
+ id: 'e1',
1081
+ name: 'E1',
1082
+ model: 'gpt-4',
1083
+ provider: 'openai',
1084
+ systemPromptPath: './prompts/eval-system.md'
1085
+ }]
1086
+ }));
1087
+
1088
+ const evaluateSpy = jest.spyOn(EvaluatorAgent.prototype, 'evaluate');
1089
+ evaluateSpy.mockResolvedValue({
1090
+ id: 'e1',
1091
+ rawText: JSON.stringify({
1092
+ evaluation: { functional_completeness: { score: 8 } },
1093
+ overall_summary: { overall_score: 8 }
1094
+ }),
1095
+ latencyMs: 100
1096
+ });
1097
+
1098
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
1099
+
1100
+ // Verify custom prompt was used
1101
+ const agent = evaluateSpy.mock.instances[0] as any;
1102
+ expect(agent.resolvedSystemPrompt).toContain('Custom evaluator');
1103
+ });
1104
+
1105
+ it('should use custom user prompt from file when specified', async () => {
1106
+ const customUserPrompt = 'Evaluate: {problem} {clarifications} {final_solution}';
1107
+ fs.writeFileSync(path.join(promptsDir, 'eval-user.md'), customUserPrompt);
1108
+
1109
+ fs.writeFileSync(configPath, JSON.stringify({
1110
+ agents: [{
1111
+ id: 'e1',
1112
+ name: 'E1',
1113
+ model: 'gpt-4',
1114
+ provider: 'openai',
1115
+ userPromptPath: './prompts/eval-user.md'
1116
+ }]
1117
+ }));
1118
+
1119
+ const evaluateSpy = jest.spyOn(EvaluatorAgent.prototype, 'evaluate');
1120
+ evaluateSpy.mockResolvedValue({
1121
+ id: 'e1',
1122
+ rawText: JSON.stringify({
1123
+ evaluation: { functional_completeness: { score: 8 } },
1124
+ overall_summary: { overall_score: 8 }
1125
+ }),
1126
+ latencyMs: 100
1127
+ });
1128
+
1129
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
1130
+
1131
+ const agent = evaluateSpy.mock.instances[0] as any;
1132
+ expect(agent.resolvedUserPromptTemplate).toContain('Evaluate:');
1133
+ });
1134
+ });
1135
+
1136
+ describe('All score categories', () => {
1137
+ let configPath: string;
1138
+ let debatePath: string;
1139
+
1140
+ beforeEach(() => {
1141
+ configPath = path.join(tmpDir, 'config.json');
1142
+ debatePath = path.join(tmpDir, 'debate.json');
1143
+
1144
+ fs.writeFileSync(configPath, JSON.stringify({
1145
+ agents: [{ id: 'e1', name: 'E1', model: 'gpt-4', provider: 'openai' }]
1146
+ }));
1147
+ fs.writeFileSync(debatePath, JSON.stringify({
1148
+ problem: 'Test',
1149
+ finalSolution: { description: 'Solution' }
1150
+ }));
1151
+
1152
+ mockedCreateProvider.mockReturnValue({ complete: jest.fn() } as any);
1153
+ });
1154
+
1155
+ it('should handle all score categories correctly', async () => {
1156
+ jest.spyOn(EvaluatorAgent.prototype, 'evaluate').mockResolvedValue({
1157
+ id: 'e1',
1158
+ rawText: JSON.stringify({
1159
+ evaluation: {
1160
+ functional_completeness: { score: 9, reasoning: 'Excellent' },
1161
+ non_functional: {
1162
+ performance_scalability: { score: 8, reasoning: 'Fast' },
1163
+ security: { score: 7, reasoning: 'Secure' },
1164
+ maintainability_evolvability: { score: 6, reasoning: 'Maintainable' },
1165
+ regulatory_compliance: { score: 5, reasoning: 'Compliant' },
1166
+ testability: { score: 4, reasoning: 'Testable' }
1167
+ }
1168
+ },
1169
+ overall_summary: {
1170
+ strengths: 'Good design',
1171
+ weaknesses: 'Some issues',
1172
+ overall_score: 7
1173
+ }
1174
+ }),
1175
+ latencyMs: 100
1176
+ });
1177
+
1178
+ await runCli(['eval', '--config', configPath, '--debate', debatePath]);
1179
+
1180
+ const output = stdoutSpy.mock.calls.join('');
1181
+
1182
+ expect(output).toContain('9.00'); // functional_completeness
1183
+ expect(output).toContain('8.00'); // performance_scalability
1184
+ expect(output).toContain('7.00'); // security and overall
1185
+ expect(output).toContain('6.00'); // maintainability
1186
+ expect(output).toContain('5.00'); // regulatory
1187
+ expect(output).toContain('4.00'); // testability
1188
+ });
1189
+ });
1190
+ });
1191
+
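
For reference, the fixtures written by these tests suggest the minimal shapes of the two inputs the `eval` command consumes. The sketch below shows only the fields the tests exercise; any other fields, defaults, or semantics are assumptions.

```typescript
// Minimal input shapes inferred from the fixtures the tests above write to
// temporary files and pass via --config and --debate. Field names not shown
// in those fixtures are assumptions.

// --config: evaluator agents (agents without `enabled: false` are treated as enabled)
const evalConfig = {
  agents: [
    {
      id: 'e1',
      name: 'Evaluator',
      model: 'gpt-4',
      provider: 'openai', // 'openrouter' is also exercised
      // systemPromptPath / userPromptPath may point at custom prompt files,
      // as in the "Prompt resolution" tests
    },
  ],
};

// --debate: a debate result with a non-empty problem and finalSolution.description;
// clarifications are optional and are rendered into fenced markdown for the evaluator
const debateResult = {
  problem: 'Design a rate limiter',
  finalSolution: { description: 'Use token bucket algorithm' },
  clarifications: [
    {
      agentName: 'Architect',
      role: 'architect',
      items: [{ id: 'q1', question: 'What is the scale?', answer: '1M users' }],
    },
  ],
};

// Writing these objects to config.json / debate.json and passing their paths to
// `eval` reproduces the happy-path setup used throughout the suite.
console.log(JSON.stringify({ evalConfig, debateResult }, null, 2));
```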