dialectic 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/.cursor/commands/setup-test.mdc +175 -0
  2. package/.cursor/rules/basic-code-cleanup.mdc +1110 -0
  3. package/.cursor/rules/riper5.mdc +96 -0
  4. package/.env.example +6 -0
  5. package/AGENTS.md +1052 -0
  6. package/LICENSE +21 -0
  7. package/README.md +93 -0
  8. package/WARP.md +113 -0
  9. package/dialectic-1.0.0.tgz +0 -0
  10. package/dialectic.js +10 -0
  11. package/docs/commands.md +375 -0
  12. package/docs/configuration.md +882 -0
  13. package/docs/context_summarization.md +1023 -0
  14. package/docs/debate_flow.md +1127 -0
  15. package/docs/eval_flow.md +795 -0
  16. package/docs/evaluator.md +141 -0
  17. package/examples/debate-config-openrouter.json +48 -0
  18. package/examples/debate_config1.json +48 -0
  19. package/examples/eval/eval1/eval_config1.json +13 -0
  20. package/examples/eval/eval1/result1.json +62 -0
  21. package/examples/eval/eval1/result2.json +97 -0
  22. package/examples/eval_summary_format.md +11 -0
  23. package/examples/example3/debate-config.json +64 -0
  24. package/examples/example3/eval_config2.json +25 -0
  25. package/examples/example3/problem.md +17 -0
  26. package/examples/example3/rounds_test/eval_run.sh +16 -0
  27. package/examples/example3/rounds_test/run_test.sh +16 -0
  28. package/examples/kata1/architect-only-solution_2-rounds.json +121 -0
  29. package/examples/kata1/architect-perf-solution_2-rounds.json +234 -0
  30. package/examples/kata1/debate-config-kata1.json +54 -0
  31. package/examples/kata1/eval_architect-only_2-rounds.json +97 -0
  32. package/examples/kata1/eval_architect-perf_2-rounds.json +97 -0
  33. package/examples/kata1/kata1-report.md +12224 -0
  34. package/examples/kata1/kata1-report_temps-01_01_01_07.md +2451 -0
  35. package/examples/kata1/kata1.md +5 -0
  36. package/examples/kata1/meta.txt +1 -0
  37. package/examples/kata2/debate-config.json +54 -0
  38. package/examples/kata2/eval_config1.json +21 -0
  39. package/examples/kata2/eval_config2.json +25 -0
  40. package/examples/kata2/kata2.md +5 -0
  41. package/examples/kata2/only_architect/debate-config.json +45 -0
  42. package/examples/kata2/only_architect/eval_run.sh +11 -0
  43. package/examples/kata2/only_architect/run_test.sh +5 -0
  44. package/examples/kata2/rounds_test/eval_run.sh +11 -0
  45. package/examples/kata2/rounds_test/run_test.sh +5 -0
  46. package/examples/kata2/summary_length_test/eval_run.sh +11 -0
  47. package/examples/kata2/summary_length_test/eval_run_w_clarify.sh +7 -0
  48. package/examples/kata2/summary_length_test/run_test.sh +5 -0
  49. package/examples/task-queue/debate-config.json +76 -0
  50. package/examples/task-queue/debate_report.md +566 -0
  51. package/examples/task-queue/task-queue-system.md +25 -0
  52. package/jest.config.ts +13 -0
  53. package/multi_agent_debate_spec.md +2980 -0
  54. package/package.json +38 -0
  55. package/sanity-check-problem.txt +9 -0
  56. package/src/agents/prompts/architect-prompts.ts +203 -0
  57. package/src/agents/prompts/generalist-prompts.ts +157 -0
  58. package/src/agents/prompts/index.ts +41 -0
  59. package/src/agents/prompts/judge-prompts.ts +19 -0
  60. package/src/agents/prompts/kiss-prompts.ts +230 -0
  61. package/src/agents/prompts/performance-prompts.ts +142 -0
  62. package/src/agents/prompts/prompt-types.ts +68 -0
  63. package/src/agents/prompts/security-prompts.ts +149 -0
  64. package/src/agents/prompts/shared.ts +144 -0
  65. package/src/agents/prompts/testing-prompts.ts +149 -0
  66. package/src/agents/role-based-agent.ts +386 -0
  67. package/src/cli/commands/debate.ts +761 -0
  68. package/src/cli/commands/eval.ts +475 -0
  69. package/src/cli/commands/report.ts +265 -0
  70. package/src/cli/index.ts +79 -0
  71. package/src/core/agent.ts +198 -0
  72. package/src/core/clarifications.ts +34 -0
  73. package/src/core/judge.ts +257 -0
  74. package/src/core/orchestrator.ts +432 -0
  75. package/src/core/state-manager.ts +322 -0
  76. package/src/eval/evaluator-agent.ts +130 -0
  77. package/src/eval/prompts/system.md +41 -0
  78. package/src/eval/prompts/user.md +64 -0
  79. package/src/providers/llm-provider.ts +25 -0
  80. package/src/providers/openai-provider.ts +84 -0
  81. package/src/providers/openrouter-provider.ts +122 -0
  82. package/src/providers/provider-factory.ts +64 -0
  83. package/src/types/agent.types.ts +141 -0
  84. package/src/types/config.types.ts +47 -0
  85. package/src/types/debate.types.ts +237 -0
  86. package/src/types/eval.types.ts +85 -0
  87. package/src/utils/common.ts +104 -0
  88. package/src/utils/context-formatter.ts +102 -0
  89. package/src/utils/context-summarizer.ts +143 -0
  90. package/src/utils/env-loader.ts +46 -0
  91. package/src/utils/exit-codes.ts +5 -0
  92. package/src/utils/id.ts +11 -0
  93. package/src/utils/logger.ts +48 -0
  94. package/src/utils/paths.ts +10 -0
  95. package/src/utils/progress-ui.ts +313 -0
  96. package/src/utils/prompt-loader.ts +79 -0
  97. package/src/utils/report-generator.ts +301 -0
  98. package/tests/clarifications.spec.ts +128 -0
  99. package/tests/cli.debate.spec.ts +144 -0
  100. package/tests/config-loading.spec.ts +206 -0
  101. package/tests/context-summarizer.spec.ts +131 -0
  102. package/tests/debate-config-custom.json +38 -0
  103. package/tests/env-loader.spec.ts +149 -0
  104. package/tests/eval.command.spec.ts +1191 -0
  105. package/tests/logger.spec.ts +19 -0
  106. package/tests/openai-provider.spec.ts +26 -0
  107. package/tests/openrouter-provider.spec.ts +279 -0
  108. package/tests/orchestrator-summary.spec.ts +386 -0
  109. package/tests/orchestrator.spec.ts +207 -0
  110. package/tests/prompt-loader.spec.ts +52 -0
  111. package/tests/prompts/architect.md +16 -0
  112. package/tests/provider-factory.spec.ts +150 -0
  113. package/tests/report.command.spec.ts +546 -0
  114. package/tests/role-based-agent-summary.spec.ts +476 -0
  115. package/tests/security-agent.spec.ts +221 -0
  116. package/tests/shared-prompts.spec.ts +318 -0
  117. package/tests/state-manager.spec.ts +251 -0
  118. package/tests/summary-prompts.spec.ts +153 -0
  119. package/tsconfig.json +49 -0
@@ -0,0 +1,322 @@
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+ import { Contribution, DebateRound, DebateState, Solution, DebateSummary, DEBATE_STATUS, AgentClarifications } from '../types/debate.types';
4
+
5
+ // File-level constants to eliminate magic strings and improve clarity
6
+ const DEFAULT_DEBATES_DIR = 'debates';
7
+ const FILE_EXTENSION_JSON = '.json';
8
+ const FILE_ENCODING_UTF8 = 'utf-8';
9
+ const ID_PREFIX = 'deb-';
10
+ const JSON_SPACE = 2;
11
+
12
+ /**
13
+ * StateManager persists and retrieves debate state to the filesystem
14
+ * while keeping an in-memory cache for fast access.
15
+ */
16
+ export class StateManager {
17
+ private debates: Map<string, DebateState> = new Map();
18
+ private baseDir: string;
19
+
20
+ /**
21
+ * @param baseDir - Base directory where debate JSON files are stored (defaults to ./debates).
22
+ */
23
+ constructor(baseDir: string = path.resolve(process.cwd(), DEFAULT_DEBATES_DIR)) {
24
+ this.baseDir = baseDir;
25
+ this.ensureDirectoryExists();
26
+ }
27
+
28
+ /**
29
+ * Creates a new debate entry, initializes state, and persists it.
30
+ * @param problem - Problem statement for the debate.
31
+ * @param context - Optional additional context.
32
+ * @returns The created DebateState.
33
+ */
34
+ async createDebate(problem: string, context?: string): Promise<DebateState> {
35
+ const now = new Date();
36
+ const state: DebateState = {
37
+ id: this.generateId(now),
38
+ problem,
39
+ // Conditional spread: only include context property if defined (avoids explicit undefined with exactOptionalPropertyTypes)
40
+ ...(context !== undefined && { context }),
41
+ status: DEBATE_STATUS.RUNNING,
42
+ currentRound: 0,
43
+ rounds: [],
44
+ createdAt: now,
45
+ updatedAt: now,
46
+ };
47
+
48
+ this.debates.set(state.id, state);
49
+ await this.save(state);
50
+ return state;
51
+ }
52
+
53
+ /**
54
+ * Adds a contribution to the current round of the specified debate and persists the updated state.
55
+ *
56
+ * @param debateId - The unique identifier of the debate to which the contribution should be added.
57
+ * @param contribution - The Contribution object to append to the current round.
58
+ * @throws {Error} If the debate with the given ID does not exist.
59
+ * @throws {Error} If there is no active round for the debate (i.e., beginRound has not been called).
60
+ *
61
+ * This method locates the debate in the in-memory cache, verifies that a round is active,
62
+ * appends the contribution to the current round's contributions array, updates the debate's
63
+ * updatedAt timestamp, and persists the state to disk.
64
+ */
65
+ async addContribution(debateId: string, contribution: Contribution): Promise<void> {
66
+ // Direct in-memory access: only active debates can be modified (don't load completed debates from disk)
67
+ const state = this.debates.get(debateId);
68
+ if (!state) throw new Error(`Debate ${debateId} not found`);
69
+
70
+ const round: DebateRound | undefined = state.rounds[state.currentRound - 1];
71
+ if (!round) throw new Error(`No active round for debate ${debateId}. Call beginRound() before adding contributions.`);
72
+
73
+ round.contributions.push(contribution);
74
+ state.updatedAt = new Date();
75
+ await this.save(state);
76
+ }
77
+
78
+ /**
79
+ * Adds a summary to the current round of the specified debate and persists the updated state.
80
+ *
81
+ * Summaries are generated by agents when context becomes too large and needs to be condensed.
82
+ * Each agent's summary is stored by their agentId for easy lookup.
83
+ *
84
+ * @param debateId - The unique identifier of the debate to which the summary should be added.
85
+ * @param summary - The DebateSummary object to add to the current round (keyed by agentId).
86
+ * @throws {Error} If the debate with the given ID does not exist.
87
+ * @throws {Error} If there is no active round for the debate (i.e., beginRound has not been called).
88
+ *
89
+ * This method locates the debate in the in-memory cache, verifies that a round is active,
90
+ * initializes the summaries Record if not present, stores the summary by agentId,
91
+ * updates the debate's updatedAt timestamp, and persists the state to disk.
92
+ */
93
+ async addSummary(debateId: string, summary: DebateSummary): Promise<void> {
94
+ const state = this.debates.get(debateId);
95
+ if (!state) throw new Error(`Debate ${debateId} not found`);
96
+
97
+ const round: DebateRound | undefined = state.rounds[state.currentRound - 1];
98
+ if (!round) throw new Error(`No active round for debate ${debateId}. Call beginRound() before adding summaries.`);
99
+
100
+ // Initialize summaries Record if not present
101
+ if (!round.summaries) {
102
+ round.summaries = {};
103
+ }
104
+
105
+ // Store summary by agentId
106
+ round.summaries[summary.agentId] = summary;
107
+ state.updatedAt = new Date();
108
+ await this.save(state);
109
+ }
110
+
111
+ /**
112
+ * Adds a judge summary to the debate state.
113
+ * @param debateId - The unique identifier of the debate.
114
+ * @param summary - The judge summary to store.
115
+ * @throws {Error} If the debate with the given ID does not exist in memory.
116
+ *
117
+ * This method stores the judge summary in the debate state, updates the updatedAt timestamp,
118
+ * and persists the state to disk.
119
+ */
120
+ async addJudgeSummary(debateId: string, summary: DebateSummary): Promise<void> {
121
+ const state = this.debates.get(debateId);
122
+ if (!state) throw new Error(`Debate ${debateId} not found`);
123
+
124
+ state.judgeSummary = summary;
125
+ state.updatedAt = new Date();
126
+ await this.save(state);
127
+ }
128
+
129
+ /**
130
+ * Persists clarifications (Q&A) collected before round 1.
131
+ * @param debateId - The unique identifier of the debate.
132
+ * @param clarifications - Grouped clarifications by agent.
133
+ */
134
+ async setClarifications(debateId: string, clarifications: AgentClarifications[]): Promise<void> {
135
+ const state = this.debates.get(debateId);
136
+ if (!state) throw new Error(`Debate ${debateId} not found`);
137
+ state.clarifications = clarifications;
138
+ state.updatedAt = new Date();
139
+ await this.save(state);
140
+ }
141
+
142
+ /**
143
+ * Marks the specified debate as completed, attaches the final solution, updates the status and timestamp,
144
+ * and persists the updated debate state to disk.
145
+ *
146
+ * @param debateId - The unique identifier of the debate to complete.
147
+ * @param solution - The final Solution object synthesized by the judge.
148
+ * @throws {Error} If the debate with the given ID does not exist in memory.
149
+ *
150
+ * This method sets the debate's status to COMPLETED, assigns the provided solution to the
151
+ * finalSolution property, updates the updatedAt timestamp, and saves the state.
152
+ */
153
+ async completeDebate(debateId: string, solution: Solution): Promise<void> {
154
+ const state = this.debates.get(debateId);
155
+ if (!state) throw new Error(`Debate ${debateId} not found`);
156
+
157
+ state.status = DEBATE_STATUS.COMPLETED;
158
+ state.finalSolution = solution;
159
+ state.updatedAt = new Date();
160
+ await this.save(state);
161
+ }
162
+
163
+ /**
164
+ * Marks a debate as failed and persists the updated status.
165
+ *
166
+ * @param debateId - The unique identifier of the debate to fail.
167
+ * @param _error - The error that caused the debate to fail (unused).
168
+ * @throws {Error} If the debate with the given ID does not exist in memory.
169
+ *
170
+ * This method sets the debate's status to FAILED, updates the updatedAt timestamp, and saves the state.
171
+ * If the debate does not exist, this method does nothing.
172
+ */
173
+ async failDebate(debateId: string, _error: Error): Promise<void> {
174
+ const state = this.debates.get(debateId);
175
+ if (!state) return;
176
+ state.status = DEBATE_STATUS.FAILED;
177
+ state.updatedAt = new Date();
178
+ await this.save(state);
179
+ }
180
+
181
+ /**
182
+ * Retrieves a debate by id from in-memory cache, falling back to disk if needed.
183
+ *
184
+ * @param debateId - The unique identifier of the debate to retrieve.
185
+ * @returns The DebateState object if found, or null if not found.
186
+ *
187
+ * This method first checks the in-memory cache, then attempts to load from disk if not found.
188
+ * It revives date fields for createdAt/updatedAt.
189
+ */
190
+ async getDebate(debateId: string): Promise<DebateState | null> {
191
+ const inMem = this.debates.get(debateId);
192
+ if (inMem) return inMem;
193
+
194
+ const filePath = this.getFilePath(debateId);
195
+ if (!fs.existsSync(filePath)) return null;
196
+ const raw = await fs.promises.readFile(filePath, FILE_ENCODING_UTF8);
197
+ const parsed = JSON.parse(raw);
198
+ // Revive top-level dates; round timestamps remain as serialized strings unless separately revived.
199
+ parsed.createdAt = new Date(parsed.createdAt);
200
+ parsed.updatedAt = new Date(parsed.updatedAt);
201
+ return parsed as DebateState;
202
+ }
203
+
204
+ /**
205
+ * Lists all debates stored on disk, sorted by most recent creation time.
206
+ *
207
+ * This method scans the base directory for all debate JSON files, loads each debate state,
208
+ * and returns an array of DebateState objects sorted in descending order by their createdAt timestamp.
209
+ *
210
+ * Notes:
211
+ * - Only files with the expected JSON extension are considered.
212
+ * - If a debate file cannot be loaded or parsed, it is skipped.
213
+ * - Debates are loaded using getDebate, which revives date fields.
214
+ *
215
+ * @returns {Promise<DebateState[]>} - An array of DebateState objects, most recent first.
216
+ */
217
+ async listDebates(): Promise<DebateState[]> {
218
+ const files = fs.existsSync(this.baseDir) ? await fs.promises.readdir(this.baseDir) : [];
219
+ const debates: DebateState[] = [];
220
+ for (const file of files) {
221
+ if (!file.endsWith(FILE_EXTENSION_JSON)) continue;
222
+ const id = file.replace(new RegExp(`${FILE_EXTENSION_JSON.replace('.', '\\.')}$`), '');
223
+ const d = await this.getDebate(id);
224
+ if (d) debates.push(d);
225
+ }
226
+ return debates.sort((a, b) => b.createdAt.getTime() - a.createdAt.getTime());
227
+ }
228
+
229
+ /**
230
+ * Starts a new round for the specified debate, increments currentRound, and persists state.
231
+ *
232
+ * This method locates the debate in the in-memory cache, creates a new round object,
233
+ * appends it to the state's rounds array, updates the currentRound counter, updates the
234
+ * updatedAt timestamp, and persists the state to disk.
235
+ *
236
+ * @param debateId - The unique identifier of the debate to begin a round for.
237
+ * @returns The newly created DebateRound object.
238
+ */
239
+ async beginRound(debateId: string): Promise<DebateRound> {
240
+ const state = this.debates.get(debateId);
241
+ if (!state) throw new Error(`Debate ${debateId} not found`);
242
+
243
+ const round: DebateRound = {
244
+ roundNumber: state.rounds.length + 1,
245
+ contributions: [],
246
+ timestamp: new Date(),
247
+ };
248
+
249
+ state.rounds.push(round);
250
+ state.currentRound = round.roundNumber;
251
+ state.updatedAt = new Date();
252
+ await this.save(state);
253
+ return round;
254
+ }
255
+
256
+ /**
257
+ * Persists the debate state to disk in JSON format.
258
+ *
259
+ * @param state - The DebateState object to save.
260
+ */
261
+ private async save(state: DebateState): Promise<void> {
262
+ const filePath = this.getFilePath(state.id);
263
+ const serialized = JSON.stringify(state, null, JSON_SPACE);
264
+ await fs.promises.writeFile(filePath, serialized, FILE_ENCODING_UTF8);
265
+ }
266
+
267
+ /**
268
+ * Persists the prompt source provenance for a debate (agents and judge) and saves the state.
269
+ * Intended to be called once per debate initialization.
270
+ */
271
+ async setPromptSources(debateId: string, sources: DebateState['promptSources']): Promise<void> {
272
+ const state = this.debates.get(debateId);
273
+ if (!state) throw new Error(`Debate ${debateId} not found`);
274
+ if (sources) {
275
+ state.promptSources = sources;
276
+ } else {
277
+ delete state.promptSources;
278
+ }
279
+ state.updatedAt = new Date();
280
+ await this.save(state);
281
+ }
282
+
283
+ /**
284
+ * Ensures the base directory exists on disk.
285
+ */
286
+ private ensureDirectoryExists() {
287
+ if (!fs.existsSync(this.baseDir)) {
288
+ // Use recursive mkdir to create nested directories if needed
289
+ fs.mkdirSync(this.baseDir, { recursive: true });
290
+ }
291
+ }
292
+
293
+ /**
294
+ * Computes the absolute file path for the JSON file corresponding to a given debate id.
295
+ *
296
+ * The file is stored in the base directory with the debate id as the filename and a .json extension.
297
+ *
298
+ * @param debateId - The unique identifier of the debate.
299
+ * @returns The absolute file path where the debate state is persisted.
300
+ */
301
+ private getFilePath(debateId: string): string {
302
+ return path.join(this.baseDir, `${debateId}${FILE_EXTENSION_JSON}`);
303
+ }
304
+
305
+ /**
306
+ * Generates a unique debate id using a timestamp and a short random suffix.
307
+ *
308
+ * @param now - The current date and time.
309
+ * @returns The unique debate id.
310
+ */
311
+ private generateId(now: Date): string {
312
+ const pad = (n: number) => n.toString().padStart(2, '0');
313
+ const yyyy = now.getFullYear();
314
+ const MM = pad(now.getMonth() + 1);
315
+ const dd = pad(now.getDate());
316
+ const hh = pad(now.getHours());
317
+ const mm = pad(now.getMinutes());
318
+ const ss = pad(now.getSeconds());
319
+ const rand = Math.random().toString(36).slice(2, 6);
320
+ return `${ID_PREFIX}${yyyy}${MM}${dd}-${hh}${mm}${ss}-${rand}`;
321
+ }
322
+ }
@@ -0,0 +1,130 @@
1
+ import { LLMProvider, CompletionResponse } from '../providers/llm-provider';
2
+ import { createProvider } from '../providers/provider-factory';
3
+ import { writeStderr } from '../cli/index';
4
+ import { EvaluatorConfig, EvaluatorInputs } from '../types/eval.types';
5
+
6
+ export interface EvaluatorResult {
7
+ id: string;
8
+ rawText: string;
9
+ latencyMs: number;
10
+ usage?: { inputTokens?: number; outputTokens?: number; totalTokens?: number };
11
+ }
12
+
13
+ /**
14
+ * EvaluatorAgent
15
+ *
16
+ * This class represents an evaluator agent that uses a language model (LLM) provider to evaluate a software solution or debate.
17
+ * It encapsulates the agent's configuration, associated model, prompt templates, and low-level logic to perform a deterministic evaluation.
18
+ *
19
+ * Usage:
20
+ * - Instantiate via static fromConfig() or directly via the constructor.
21
+ * - Call evaluate(inputs) to perform evaluation and receive response/result details.
22
+ */
23
+ export class EvaluatorAgent {
24
+ /** Unique identifier for the evaluator agent */
25
+ readonly id: string;
26
+ /** Human-readable agent name */
27
+ readonly name: string;
28
+ /** Model name used for the LLM invocation */
29
+ readonly model: string;
30
+ /** LLMProvider instance (e.g., OpenAI, Azure, etc.) */
31
+ readonly provider: LLMProvider;
32
+ /** Resolved system prompt string used for the LLM */
33
+ readonly resolvedSystemPrompt: string;
34
+ /** Resolved user prompt template (with placeholders) for evaluation */
35
+ readonly resolvedUserPromptTemplate: string;
36
+
37
+ /**
38
+ * Fixed temperature value for model calls to ensure deterministic evaluation.
39
+ * This is set to a low value (0.1) to reduce randomness/variance in LLM output.
40
+ */
41
+ private static readonly FIXED_TEMPERATURE = 0.1;
42
+
43
+ /**
44
+ * Constructs an EvaluatorAgent instance.
45
+ *
46
+ * @param config - EvaluatorConfig object defining agent identity and settings.
47
+ * @param provider - Instantiated LLMProvider for this agent.
48
+ * @param resolvedSystemPrompt - The system prompt string for agent context.
49
+ * @param resolvedUserPromptTemplate - The user prompt template with placeholders.
50
+ */
51
+ constructor(
52
+ config: EvaluatorConfig,
53
+ provider: LLMProvider,
54
+ resolvedSystemPrompt: string,
55
+ resolvedUserPromptTemplate: string
56
+ ) {
57
+ this.id = config.id;
58
+ this.name = config.name;
59
+ this.model = config.model;
60
+ this.provider = provider;
61
+ this.resolvedSystemPrompt = resolvedSystemPrompt;
62
+ this.resolvedUserPromptTemplate = resolvedUserPromptTemplate;
63
+ }
64
+
65
+ /**
66
+ * Creates an EvaluatorAgent instance from an EvaluatorConfig, resolving the provider.
67
+ *
68
+ * @param cfg - The EvaluatorConfig object.
69
+ * @param resolvedSystemPrompt - The preloaded system prompt.
70
+ * @param resolvedUserPromptTemplate - The preloaded user prompt template.
71
+ * @returns A new EvaluatorAgent instance.
72
+ */
73
+ static fromConfig(
74
+ cfg: EvaluatorConfig,
75
+ resolvedSystemPrompt: string,
76
+ resolvedUserPromptTemplate: string
77
+ ): EvaluatorAgent {
78
+ const provider = createProvider(cfg.provider);
79
+ return new EvaluatorAgent(cfg, provider, resolvedSystemPrompt, resolvedUserPromptTemplate);
80
+ }
81
+
82
+ /**
83
+ * Renders the user prompt by replacing placeholders with actual inputs.
84
+ *
85
+ * @param inputs - The evaluation inputs (problem, clarifications, and solution).
86
+ * @returns The rendered user prompt string.
87
+ */
88
+ private renderUserPrompt(inputs: EvaluatorInputs): string {
89
+ return this.resolvedUserPromptTemplate
90
+ .replace('{problem}', inputs.problem)
91
+ .replace('{clarifications}', inputs.clarificationsMarkdown)
92
+ .replace('{final_solution}', inputs.finalSolution);
93
+ }
94
+
95
+ /**
96
+ * Performs the evaluation using the underlying LLMProvider.
97
+ *
98
+ * @param inputs - The inputs required for evaluation (problem, clarifications, finalSolution).
99
+ * @returns The evaluation result, including raw text, latency, and optional usage data.
100
+ * @throws An error if LLM call fails.
101
+ */
102
+ async evaluate(inputs: EvaluatorInputs): Promise<EvaluatorResult> {
103
+ const userPrompt = this.renderUserPrompt(inputs);
104
+ const systemPrompt = this.resolvedSystemPrompt;
105
+ const started = Date.now();
106
+
107
+ const llmCall = this.provider.complete({
108
+ model: this.model,
109
+ temperature: EvaluatorAgent.FIXED_TEMPERATURE,
110
+ systemPrompt,
111
+ userPrompt,
112
+ });
113
+
114
+ try {
115
+ const res: CompletionResponse = await llmCall;
116
+ const latencyMs = Date.now() - started;
117
+ return {
118
+ id: this.id,
119
+ rawText: res.text,
120
+ latencyMs,
121
+ ...(res.usage !== undefined && { usage: res.usage }),
122
+ };
123
+ } catch (err: any) {
124
+ writeStderr(`[${this.id}] Evaluation failed: ${err?.message ?? 'unknown error'}\n`);
125
+ throw err;
126
+ }
127
+ }
128
+ }
129
+
130
+
@@ -0,0 +1,41 @@
1
+ You are an expert software design evaluator and reviewer.
2
+ Your role is to critically assess the quality, soundness, and completeness of a proposed software design solution. Act as an impartial but rigorously critical reviewer—similar to a professional peer reviewer in software engineering.
3
+
4
+ You need to assess the solution on the following qualities and provide a score for each quality in the range of 1 to 10:
5
+ - Functional completeness (functional_completeness): how well does the suggested solution address the problem statement. Does it cover all cases? Does it take into account edge cases?
6
+ - A score of 1 indicates nothing is addressed, there is no indication of the problem being solved.
7
+ - A score of 5 indicates most problems and points raised in the problem description are addressed in some way.
8
+ - A score of 10 indicates all problems are addressed, and other implied/assumed issues or inconsistencies are also addressed. The solution is bullet proof from a functional point of view.
9
+ - Performance and Scalability (performance_scalability): how well does the proposed architecture/solution address declared or assumed runtime load and scale.
10
+ - A score of 1 indicates no performance consideration at all. The proposed solution seems to be negligent in how it addresses scale and runtime performance (latency and resource consumption).
11
+ - A score of 5 indicates some consideration is given to performance and scaling options. Decisions were made to accommodate some volume of users and/or processing.
12
+ - A score of 10 indicates the design fully covers all known and implied (even if unmentioned) performance considerations. It explicitly addresses latency, it takes into account future scaling and provides a solution that minimizes latency and resource use.
13
+ - Security (security): how well does the proposed architecture/solution address declared or assumed security and privacy issues.
14
+ - A score of 1 indicates no security consideration were taken into account. The design seems to neglect security issues and may in fact contain security issues.
15
+ - A score of 5 indicates the solution seems to have taken into account security issues (e.g. privacy, access control, authentication).
16
+ - A score of 10 indicates the solution covers all possible security issues and explicitly addresses them in the proposed solution. The offered solution explicitly mentions security aspects as part of the decision reasoning.
17
+ - Maintainability (maintainability_evolvability): how well does the proposed solution/architecture address potential changes and maintenance issues. Does it take into account troubleshooting? does it decompose responsibilities? does it address potential "blast radius" of future changes and tries to reduce impact of changes?
18
+ - A score of 1 indicates a solution that is highly coupled. No indication of thought about future evolution or decomposition.
19
+ - A score of 5 indicates the solution proposed involves some degree of decomposition to components, with clear boundaries and responsibilities, focusing changes in specific places.
20
+ - A score of 10 indicates a solution that takes evolution as an explicit driver of design choices, defining and declaring clear component responsibilities and clear scalable interfaces between components.
21
+ - Regulatory Compliance (regulatory_compliance): does the solution take into account any possible regulatory impact on the solution (data privacy laws, data protection laws, GDPR, etc)? Does the solution address and indicate this has gone into the reasoning process?
22
+ - Testability (testability): does the solution proposed lend itself to being tested in a scalable manner? can changes be tested easily and independently for different components? Does the solution address also 3rd party integrations?
23
+ - A score of 1 indicates no indication of testing taken. Solution is convoluted and requires a difficult setup in order to test functionality properly.
24
+ - A score of 5 indicates the solution takes testing into account, allowing for isolated testing of changes of specific parts of the system.
25
+ - A score of 10 indicates the solution takes testing as a primary consideration, and explicitly addresses and proposes how to tackle different kinds of tests, with relatively low overhead, and easy to run continuously.
26
+
27
+ Guidelines:
28
+ ⦁ Identify strengths, weaknesses, and trade-offs in the proposed solution.
29
+ ⦁ Focus on defensible reasoning, not just positive feedback.
30
+ ⦁ For each score you assign, provide concrete justification based on the proposal.
31
+ ⦁ Be skeptical: reward completeness and clarity, penalise ambiguity, missing assumptions, or unjustified claims.
32
+ ⦁ Before scoring, internally compare the proposed design against typical professional standards (for example: completeness of requirements coverage, handling of non-functional concerns, clarity of assumptions and constraints).
33
+ ⦁ Use the following scale as a guideline:
34
+ ⦁ 10 = exceptional, exemplary in every respect
35
+ ⦁ 8-9 = strong and well reasoned, with minor weaknesses
36
+ ⦁ 5-7 = acceptable but with clear gaps or risks
37
+ ⦁ 1-4 = poor, flawed, or incomplete
38
+ ⦁ Output ONLY a single valid JSON object conforming exactly to the requested schema.
39
+ ⦁ Do not include any text outside the JSON object.
40
+ ⦁ Scores must be integers in the range 1 to 10.
41
+ ⦁ If you cannot reasonably infer a score for a field, omit that field.
@@ -0,0 +1,64 @@
1
+ Evaluate the following debate outcome in terms of the software design’s quality and soundness.
2
+
3
+ Problem:
4
+ """
5
+ {problem}
6
+ """
7
+
8
+ Clarifications (if any):
9
+ {clarifications}
10
+
11
+ Final Solution Description:
12
+ """
13
+ {final_solution}
14
+ """
15
+
16
+ In your evaluation, focus on:
17
+ - How well the solution satisfies the problem requirements and constraints.
18
+ - How clearly and convincingly it discusses or justifies trade-offs.
19
+ - Whether it acknowledges potential risks, limitations, or missing elements.
20
+ - The realism and implementability of the design decisions.
21
+
22
+ Return ONLY a single JSON object matching this schema:
23
+ {
24
+ "evaluation": {
25
+ "functional_completeness": {
26
+ "score": <integer 1..10>,
27
+ "reasoning": "<string>"
28
+ },
29
+ "non_functional": {
30
+ "performance_scalability": {
31
+ "score": <integer 1..10>,
32
+ "reasoning": "<string>"
33
+ },
34
+ "security": {
35
+ "score": <integer 1..10>,
36
+ "reasoning": "<string>"
37
+ },
38
+ "maintainability_evolvability": {
39
+ "score": <integer 1..10>,
40
+ "reasoning": "<string>"
41
+ },
42
+ "regulatory_compliance": {
43
+ "score": <integer 1..10>,
44
+ "reasoning": "<string>"
45
+ },
46
+ "testability": {
47
+ "score": <integer 1..10>,
48
+ "reasoning": "<string>"
49
+ }
50
+ }
51
+ },
52
+ "overall_summary": {
53
+ "strengths": "<brief strengths>",
54
+ "weaknesses": "<brief weaknesses>",
55
+ "overall_score": <integer 1..10>
56
+ }
57
+ }
58
+
59
+ Rules:
60
+ - Be concise but specific in reasoning; avoid generic praise such as “looks good”.
61
+ - Highlight gaps, contradictions, implicit assumptions, or missing aspects wherever they appear.
62
+ - Scores must be integers between 1 and 10.
63
+ - If you cannot provide a score for a field, omit that field instead of guessing.
64
+ - Output only the JSON object. Do not wrap it in code fences.
@@ -0,0 +1,25 @@
1
+ export interface CompletionRequest {
2
+ model: string;
3
+ systemPrompt: string;
4
+ userPrompt: string;
5
+ temperature: number;
6
+ maxTokens?: number;
7
+ stopSequences?: string[];
8
+ }
9
+
10
+ export interface CompletionUsage {
11
+ inputTokens?: number;
12
+ outputTokens?: number;
13
+ totalTokens?: number;
14
+ }
15
+
16
+ export interface CompletionResponse {
17
+ text: string;
18
+ usage?: CompletionUsage;
19
+ }
20
+
21
+ export interface LLMProvider {
22
+ complete(request: CompletionRequest): Promise<CompletionResponse>;
23
+ stream?(request: CompletionRequest): AsyncIterator<string>;
24
+ generateEmbedding?(text: string): Promise<number[]>;
25
+ }
@@ -0,0 +1,84 @@
1
+ import OpenAI from 'openai';
2
+ import { CompletionRequest, CompletionResponse, LLMProvider } from './llm-provider';
3
+
4
+ export class OpenAIProvider implements LLMProvider {
5
+ private client: OpenAI;
6
+
7
+ constructor(apiKey: string) {
8
+ this.client = new OpenAI({ apiKey });
9
+ }
10
+
11
+ async complete(request: CompletionRequest): Promise<CompletionResponse> {
12
+ // Try Responses API first
13
+ try {
14
+ // Build Responses API payload conditionally
15
+ const respPayload: any = {
16
+ model: request.model,
17
+ temperature: request.temperature,
18
+ input: [
19
+ { role: 'system', content: request.systemPrompt },
20
+ { role: 'user', content: request.userPrompt },
21
+ ],
22
+ };
23
+ if (request.maxTokens != null) respPayload.max_output_tokens = request.maxTokens;
24
+ if (request.stopSequences) respPayload.stop = request.stopSequences;
25
+
26
+ const resp = await (this.client as any).responses?.create?.(respPayload);
27
+
28
+ if (resp && resp.output_text) {
29
+ const usage = resp?.usage ?? resp?.output?.usage;
30
+ const out: CompletionResponse = { text: resp.output_text as string };
31
+ if (usage) {
32
+ out.usage = {
33
+ inputTokens: usage.input_tokens ?? usage.inputTokens,
34
+ outputTokens: usage.output_tokens ?? usage.outputTokens,
35
+ totalTokens: usage.total_tokens ?? usage.totalTokens,
36
+ };
37
+ }
38
+ return out;
39
+ }
40
+ // Some SDK shapes use output[0]?.content[0]?.text
41
+ const outText: string | undefined = resp?.output?.[0]?.content?.[0]?.text;
42
+ if (outText) {
43
+ const usage = resp?.usage ?? resp?.output?.usage;
44
+ const out: CompletionResponse = { text: outText };
45
+ if (usage) {
46
+ out.usage = {
47
+ inputTokens: usage.input_tokens ?? usage.inputTokens,
48
+ outputTokens: usage.output_tokens ?? usage.outputTokens,
49
+ totalTokens: usage.total_tokens ?? usage.totalTokens,
50
+ };
51
+ }
52
+ return out;
53
+ }
54
+
55
+ // Fallback if Responses API returned unexpected shape
56
+ throw new Error('Unexpected Responses API response shape');
57
+ } catch (_err) {
58
+ // Fallback to Chat Completions API
59
+ const chatPayload: any = {
60
+ model: request.model,
61
+ messages: [
62
+ { role: 'system', content: request.systemPrompt },
63
+ { role: 'user', content: request.userPrompt },
64
+ ],
65
+ temperature: request.temperature,
66
+ };
67
+ if (request.maxTokens != null) chatPayload.max_tokens = request.maxTokens;
68
+ if (request.stopSequences) chatPayload.stop = request.stopSequences;
69
+
70
+ const chat = await this.client.chat.completions.create(chatPayload);
71
+ const txt = chat.choices[0]?.message?.content ?? '';
72
+ const usage = (chat as any).usage;
73
+ const out: CompletionResponse = { text: txt };
74
+ if (usage) {
75
+ out.usage = {
76
+ inputTokens: usage.prompt_tokens ?? usage.input_tokens,
77
+ outputTokens: usage.completion_tokens ?? usage.output_tokens,
78
+ totalTokens: usage.total_tokens,
79
+ };
80
+ }
81
+ return out;
82
+ }
83
+ }
84
+ }