dialectic 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/.cursor/commands/setup-test.mdc +175 -0
  2. package/.cursor/rules/basic-code-cleanup.mdc +1110 -0
  3. package/.cursor/rules/riper5.mdc +96 -0
  4. package/.env.example +6 -0
  5. package/AGENTS.md +1052 -0
  6. package/LICENSE +21 -0
  7. package/README.md +93 -0
  8. package/WARP.md +113 -0
  9. package/dialectic-1.0.0.tgz +0 -0
  10. package/dialectic.js +10 -0
  11. package/docs/commands.md +375 -0
  12. package/docs/configuration.md +882 -0
  13. package/docs/context_summarization.md +1023 -0
  14. package/docs/debate_flow.md +1127 -0
  15. package/docs/eval_flow.md +795 -0
  16. package/docs/evaluator.md +141 -0
  17. package/examples/debate-config-openrouter.json +48 -0
  18. package/examples/debate_config1.json +48 -0
  19. package/examples/eval/eval1/eval_config1.json +13 -0
  20. package/examples/eval/eval1/result1.json +62 -0
  21. package/examples/eval/eval1/result2.json +97 -0
  22. package/examples/eval_summary_format.md +11 -0
  23. package/examples/example3/debate-config.json +64 -0
  24. package/examples/example3/eval_config2.json +25 -0
  25. package/examples/example3/problem.md +17 -0
  26. package/examples/example3/rounds_test/eval_run.sh +16 -0
  27. package/examples/example3/rounds_test/run_test.sh +16 -0
  28. package/examples/kata1/architect-only-solution_2-rounds.json +121 -0
  29. package/examples/kata1/architect-perf-solution_2-rounds.json +234 -0
  30. package/examples/kata1/debate-config-kata1.json +54 -0
  31. package/examples/kata1/eval_architect-only_2-rounds.json +97 -0
  32. package/examples/kata1/eval_architect-perf_2-rounds.json +97 -0
  33. package/examples/kata1/kata1-report.md +12224 -0
  34. package/examples/kata1/kata1-report_temps-01_01_01_07.md +2451 -0
  35. package/examples/kata1/kata1.md +5 -0
  36. package/examples/kata1/meta.txt +1 -0
  37. package/examples/kata2/debate-config.json +54 -0
  38. package/examples/kata2/eval_config1.json +21 -0
  39. package/examples/kata2/eval_config2.json +25 -0
  40. package/examples/kata2/kata2.md +5 -0
  41. package/examples/kata2/only_architect/debate-config.json +45 -0
  42. package/examples/kata2/only_architect/eval_run.sh +11 -0
  43. package/examples/kata2/only_architect/run_test.sh +5 -0
  44. package/examples/kata2/rounds_test/eval_run.sh +11 -0
  45. package/examples/kata2/rounds_test/run_test.sh +5 -0
  46. package/examples/kata2/summary_length_test/eval_run.sh +11 -0
  47. package/examples/kata2/summary_length_test/eval_run_w_clarify.sh +7 -0
  48. package/examples/kata2/summary_length_test/run_test.sh +5 -0
  49. package/examples/task-queue/debate-config.json +76 -0
  50. package/examples/task-queue/debate_report.md +566 -0
  51. package/examples/task-queue/task-queue-system.md +25 -0
  52. package/jest.config.ts +13 -0
  53. package/multi_agent_debate_spec.md +2980 -0
  54. package/package.json +38 -0
  55. package/sanity-check-problem.txt +9 -0
  56. package/src/agents/prompts/architect-prompts.ts +203 -0
  57. package/src/agents/prompts/generalist-prompts.ts +157 -0
  58. package/src/agents/prompts/index.ts +41 -0
  59. package/src/agents/prompts/judge-prompts.ts +19 -0
  60. package/src/agents/prompts/kiss-prompts.ts +230 -0
  61. package/src/agents/prompts/performance-prompts.ts +142 -0
  62. package/src/agents/prompts/prompt-types.ts +68 -0
  63. package/src/agents/prompts/security-prompts.ts +149 -0
  64. package/src/agents/prompts/shared.ts +144 -0
  65. package/src/agents/prompts/testing-prompts.ts +149 -0
  66. package/src/agents/role-based-agent.ts +386 -0
  67. package/src/cli/commands/debate.ts +761 -0
  68. package/src/cli/commands/eval.ts +475 -0
  69. package/src/cli/commands/report.ts +265 -0
  70. package/src/cli/index.ts +79 -0
  71. package/src/core/agent.ts +198 -0
  72. package/src/core/clarifications.ts +34 -0
  73. package/src/core/judge.ts +257 -0
  74. package/src/core/orchestrator.ts +432 -0
  75. package/src/core/state-manager.ts +322 -0
  76. package/src/eval/evaluator-agent.ts +130 -0
  77. package/src/eval/prompts/system.md +41 -0
  78. package/src/eval/prompts/user.md +64 -0
  79. package/src/providers/llm-provider.ts +25 -0
  80. package/src/providers/openai-provider.ts +84 -0
  81. package/src/providers/openrouter-provider.ts +122 -0
  82. package/src/providers/provider-factory.ts +64 -0
  83. package/src/types/agent.types.ts +141 -0
  84. package/src/types/config.types.ts +47 -0
  85. package/src/types/debate.types.ts +237 -0
  86. package/src/types/eval.types.ts +85 -0
  87. package/src/utils/common.ts +104 -0
  88. package/src/utils/context-formatter.ts +102 -0
  89. package/src/utils/context-summarizer.ts +143 -0
  90. package/src/utils/env-loader.ts +46 -0
  91. package/src/utils/exit-codes.ts +5 -0
  92. package/src/utils/id.ts +11 -0
  93. package/src/utils/logger.ts +48 -0
  94. package/src/utils/paths.ts +10 -0
  95. package/src/utils/progress-ui.ts +313 -0
  96. package/src/utils/prompt-loader.ts +79 -0
  97. package/src/utils/report-generator.ts +301 -0
  98. package/tests/clarifications.spec.ts +128 -0
  99. package/tests/cli.debate.spec.ts +144 -0
  100. package/tests/config-loading.spec.ts +206 -0
  101. package/tests/context-summarizer.spec.ts +131 -0
  102. package/tests/debate-config-custom.json +38 -0
  103. package/tests/env-loader.spec.ts +149 -0
  104. package/tests/eval.command.spec.ts +1191 -0
  105. package/tests/logger.spec.ts +19 -0
  106. package/tests/openai-provider.spec.ts +26 -0
  107. package/tests/openrouter-provider.spec.ts +279 -0
  108. package/tests/orchestrator-summary.spec.ts +386 -0
  109. package/tests/orchestrator.spec.ts +207 -0
  110. package/tests/prompt-loader.spec.ts +52 -0
  111. package/tests/prompts/architect.md +16 -0
  112. package/tests/provider-factory.spec.ts +150 -0
  113. package/tests/report.command.spec.ts +546 -0
  114. package/tests/role-based-agent-summary.spec.ts +476 -0
  115. package/tests/security-agent.spec.ts +221 -0
  116. package/tests/shared-prompts.spec.ts +318 -0
  117. package/tests/state-manager.spec.ts +251 -0
  118. package/tests/summary-prompts.spec.ts +153 -0
  119. package/tsconfig.json +49 -0
@@ -0,0 +1,475 @@
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+ import { Command } from 'commander';
4
+ import { writeStderr } from '../index';
5
+ import { loadEnvironmentFile } from '../../utils/env-loader';
6
+ import { EXIT_INVALID_ARGS, EXIT_GENERAL_ERROR } from '../../utils/exit-codes';
7
+ import { EvaluatorConfig, ParsedEvaluation, AggregatedJsonOutput, AggregatedAverages, clampScoreToRange, isEnabledEvaluator } from '../../types/eval.types';
8
+ import { DebateState } from '../../types/debate.types';
9
+ import { EvaluatorAgent } from '../../eval/evaluator-agent';
10
+ import { resolvePrompt, readBuiltInPrompt } from '../../utils/prompt-loader';
11
+ import { PROMPT_SOURCES, LLM_PROVIDERS } from '../../types/agent.types';
12
+ import { numOrUndefined, averageOrNull, createValidationError, readJsonFile } from '../../utils/common';
13
+
14
// Encoding used for every file read/write in this command.
const FILE_ENCODING_UTF8 = 'utf-8';
// Number of spaces used when pretty-printing JSON output files.
const JSON_INDENT_SPACES = 2;

/**
 * Result of loading an evaluator configuration file.
 */
type LoadedEvaluatorConfig = {
  /** Array of evaluator agent configurations */
  agents: EvaluatorConfig[];
  /** Absolute directory path containing the configuration file */
  configDir: string;
};
26
+
27
+
28
+
29
+ /**
30
+ * Builds a Markdown-formatted string representing all clarifications exchanged during a debate.
31
+ *
32
+ * The output contains a sequence of agent clarification sections. Each section begins with an
33
+ * H3 header including the agent's name and role, followed by all questions and answers exchanged
34
+ * by that agent, formatted in code blocks for clarity.
35
+ *
36
+ * If there are no clarifications, returns a minimal Markdown code block separator.
37
+ *
38
+ * Example output (for one clarification):
39
+ *
40
+ * ### Alice (Judge)
41
+ * Question (q1):
42
+ *
43
+ * ```text
44
+ * What are the system's scalability requirements?
45
+ * ```
46
+ *
47
+ * Answer:
48
+ *
49
+ * ```text
50
+ * The system must support 10k concurrent users.
51
+ * ```
52
+ *
53
+ * @param {DebateState} state - The debate state object containing clarifications.
54
+ * @returns {string} Markdown string summarizing all clarifications for insertion into prompts.
55
+ */
56
+ function buildClarificationsMarkdown(state: DebateState): string {
57
+ if (!state.clarifications || state.clarifications.length === 0) return '``````\n``````';
58
+ let out = '';
59
+ for (const group of state.clarifications) {
60
+ out += `### ${group.agentName} (${group.role})\n`;
61
+ for (const item of group.items) {
62
+ out += `Question (${item.id}):\n\n\`\`\`text\n${item.question}\n\`\`\`\n\n`;
63
+ out += `Answer:\n\n\`\`\`text\n${item.answer}\n\`\`\`\n\n`;
64
+ }
65
+ }
66
+ return out.trim();
67
+ }
68
+
69
+ /**
70
+ * Extracts and parses the first JSON object found in a string.
71
+ *
72
+ * This function searches the input text for the first occurrence of a substring
73
+ * that resembles a JSON object (i.e., text between the first '{' and the matching '}').
74
+ * It then attempts to parse this substring as JSON.
75
+ *
76
+ * If no curly-brace-enclosed object is found, it will attempt to parse the entire string.
77
+ * If parsing fails at any point, the function returns null.
78
+ *
79
+ * @param {string} text - The input string to search for a JSON object.
80
+ * @returns {Record<string, any> | null} The parsed object if successful, or null if parsing fails.
81
+ */
82
+ function parseFirstJsonObject(text: string): Record<string, any> | null {
83
+ const match = text.match(/\{[\s\S]*\}/);
84
+ const json = match ? match[0] : text;
85
+ try {
86
+ return JSON.parse(json);
87
+ } catch {
88
+ return null;
89
+ }
90
+ }
91
+
92
+ /**
93
+ * Attempts to validate, clamp, and append a numeric score to an array, issuing warnings if invalid.
94
+ *
95
+ * This function is used to aggregate scores that may be missing or out of the valid range for evaluation metrics.
96
+ * - If the input value `v` is not a finite number, it issues a warning (with the given agent and label) and skips it.
97
+ * - If the number is outside the allowed range (1 to 10), it is clamped to that range and a separate warning is issued.
98
+ * - If the final value is valid, it is appended to the destination array `arr`.
99
+ *
100
+ * @param {number[]} arr - The array to which the (possibly clamped) numeric value will be appended.
101
+ * @param {unknown} v - The value to validate and potentially append.
102
+ * @param {string} warnLabel - The label used in warnings, describing the metric or field involved.
103
+ * @param {string} agentId - The agent identifier used in warning messages.
104
+ */
105
+ function pushIfValid(arr: number[], v: unknown, warnLabel: string, agentId: string) {
106
+ const n = numOrUndefined(v);
107
+ if (n === undefined) {
108
+ writeStderr(`[${agentId}] Invalid or missing numeric score for ${warnLabel}; ignoring\n`);
109
+ return;
110
+ }
111
+ const clamped = clampScoreToRange(n);
112
+ if (clamped !== n) {
113
+ writeStderr(`[${agentId}] Score for ${warnLabel} clamped to [1,10] from ${n}\n`);
114
+ }
115
+ if (clamped !== undefined) arr.push(clamped);
116
+ }
117
+
118
+ /**
119
+ * Renders a markdown table displaying aggregated evaluation scores.
120
+ *
121
+ * This function takes an object containing aggregate scores for various evaluation metrics
122
+ * (such as functional completeness, performance, security, etc.) and formats them into
123
+ * a markdown table suitable for display or reporting.
124
+ *
125
+ * Each value is formatted to two decimal places if available, or "N/A" if null or undefined.
126
+ * The table columns are:
127
+ * - Functional Completeness
128
+ * - Performance & Scalability
129
+ * - Security
130
+ * - Maintainability & Evolvability
131
+ * - Regulatory Compliance
132
+ * - Testability
133
+ * - Overall Score
134
+ *
135
+ * @param {AggregatedAverages} agg - An object containing aggregated (averaged) scores for each metric.
136
+ * @returns {string} The markdown table as a string.
137
+ */
138
+ function renderMarkdownTable(agg: AggregatedAverages): string {
139
+ const f = (v: number | null) => v == null ? 'N/A' : v.toFixed(2);
140
+ let table = '';
141
+ table += `| Functional Completeness | Performance & Scalability | Security | Maintainability & Evolvability | Regulatory Compliance | Testability | Overall Score |\n`;
142
+ table += `|------------------------|---------------------------|----------|-------------------------------|------------------------|------------|---------------|\n`;
143
+ table += `| ${f(agg.functional_completeness)} | ${f(agg.performance_scalability)} | ${f(agg.security)} | ${f(agg.maintainability_evolvability)} | ${f(agg.regulatory_compliance)} | ${f(agg.testability)} | ${f(agg.overall_score)} |\n`;
144
+ return table;
145
+ }
146
+
147
+ /**
148
+ * Writes evaluation results to a file or stdout in JSON or Markdown format.
149
+ *
150
+ * This function handles the output of evaluation results based on the output path:
151
+ * - If outputPath ends with '.json', writes a detailed JSON file containing aggregated averages and per-agent results.
152
+ * - Otherwise, writes a Markdown table with aggregated scores to the file or stdout (if no path provided).
153
+ *
154
+ * @param {AggregatedAverages} aggregatedAverages - The aggregated average scores across all metrics.
155
+ * @param {Record<string, ParsedEvaluation>} perAgentResults - Per-agent parsed evaluation results, keyed by agent ID.
156
+ * @param {string | undefined} outputPath - Optional output file path. If undefined, writes Markdown to stdout.
157
+ * @returns {Promise<void>} A promise that resolves when the output has been written.
158
+ */
159
+ async function writeEvaluationResults(
160
+ aggregatedAverages: AggregatedAverages,
161
+ perAgentResults: Record<string, ParsedEvaluation>,
162
+ outputPath: string | undefined
163
+ ): Promise<void> {
164
+ const resolvedPath = outputPath ? path.resolve(process.cwd(), outputPath) : undefined;
165
+
166
+ if (resolvedPath && resolvedPath.toLowerCase().endsWith('.json')) {
167
+ const jsonOut: AggregatedJsonOutput = {
168
+ evaluation: {
169
+ functional_completeness: { average_score: aggregatedAverages.functional_completeness },
170
+ non_functional: {
171
+ performance_scalability: { average_score: aggregatedAverages.performance_scalability },
172
+ security: { average_score: aggregatedAverages.security },
173
+ maintainability_evolvability: { average_score: aggregatedAverages.maintainability_evolvability },
174
+ regulatory_compliance: { average_score: aggregatedAverages.regulatory_compliance },
175
+ testability: { average_score: aggregatedAverages.testability },
176
+ },
177
+ },
178
+ overall_score: aggregatedAverages.overall_score,
179
+ agents: perAgentResults,
180
+ };
181
+ await fs.promises.writeFile(resolvedPath, JSON.stringify(jsonOut, null, JSON_INDENT_SPACES), FILE_ENCODING_UTF8);
182
+ } else {
183
+ const md = renderMarkdownTable(aggregatedAverages);
184
+ if (resolvedPath) {
185
+ await fs.promises.writeFile(resolvedPath, md, FILE_ENCODING_UTF8);
186
+ } else {
187
+ process.stdout.write(md + '\n');
188
+ }
189
+ }
190
+ }
191
+
192
+ /**
193
+ * Loads and validates an evaluator configuration JSON file.
194
+ *
195
+ * This function reads the evaluator configuration file specified by the given path, validates
196
+ * that it contains a non-empty array of evaluator agent definitions, and constructs an array
197
+ * of EvaluatorConfig objects. It also determines the directory of the configuration file, which
198
+ * can be useful for resolving relative file paths inside the configuration.
199
+ *
200
+ * The expected structure of the configuration file is:
201
+ * {
202
+ * "agents": [
203
+ * {
204
+ * "id": string | number,
205
+ * "name": string,
206
+ * "model": string,
207
+ * "provider": string,
208
+ * "systemPromptPath"?: string,
209
+ * "userPromptPath"?: string,
210
+ * "timeout"?: number,
211
+ * "enabled"?: boolean
212
+ * },
213
+ * ...
214
+ * ]
215
+ * }
216
+ *
217
+ * @param {string} configPath - The path to the evaluator configuration JSON file, relative or absolute.
218
+ * @returns {LoadedEvaluatorConfig} An object containing the agent configurations and config directory path.
219
+ * @throws {Error} Throws a validation error with a specific exit code if:
220
+ * - The config file cannot be read or parsed as valid JSON.
221
+ * - The agents array is missing, not an array, or has zero entries.
222
+ */
223
+ function loadEvaluatorConfig(configPath: string): LoadedEvaluatorConfig {
224
+ const abs = path.resolve(process.cwd(), configPath);
225
+ const cfg = readJsonFile<any>(configPath, 'Evaluator config file');
226
+ if (!cfg || !Array.isArray(cfg.agents) || cfg.agents.length === 0) {
227
+ throw createValidationError('Invalid evaluator config: agents array required (length >= 1)', EXIT_INVALID_ARGS);
228
+ }
229
+ const configDir = path.dirname(abs);
230
+ const agents: EvaluatorConfig[] = cfg.agents.map((a: unknown) => {
231
+ // Type guard and validation for raw agent config
232
+ if (!a || typeof a !== 'object') {
233
+ throw createValidationError('Invalid evaluator config: agent must be an object', EXIT_INVALID_ARGS);
234
+ }
235
+ const agent = a as Record<string, unknown>;
236
+ // Validate provider is a valid LLM provider
237
+ const provider = typeof agent.provider === 'string'
238
+ ? (agent.provider as typeof LLM_PROVIDERS.OPENAI | typeof LLM_PROVIDERS.OPENROUTER)
239
+ : LLM_PROVIDERS.OPENAI; // Default to openai if invalid
240
+
241
+ return {
242
+ id: String(agent.id ?? ''),
243
+ name: String(agent.name ?? ''),
244
+ model: String(agent.model ?? ''),
245
+ provider,
246
+ systemPromptPath: typeof agent.systemPromptPath === 'string' ? agent.systemPromptPath : undefined,
247
+ userPromptPath: typeof agent.userPromptPath === 'string' ? agent.userPromptPath : undefined,
248
+ timeout: typeof agent.timeout === 'number' ? agent.timeout : undefined,
249
+ enabled: typeof agent.enabled === 'boolean' ? agent.enabled : undefined,
250
+ } as EvaluatorConfig;
251
+ });
252
+ return { agents, configDir };
253
+ }
254
+
255
+ /**
256
+ * Loads evaluator configuration, filters for enabled agents, and validates that at least one enabled agent exists.
257
+ *
258
+ * This is a convenience helper that combines loading the configuration, filtering for enabled evaluators,
259
+ * and validating that the result is non-empty.
260
+ *
261
+ * @param {string} configPath - The path to the evaluator configuration JSON file.
262
+ * @returns {{ enabledAgents: EvaluatorConfig[], configDir: string }} Object containing enabled agents and config directory.
263
+ * @throws {Error} Throws a validation error if no enabled evaluator agents are found in the config.
264
+ */
265
+ function loadAndValidateEnabledAgents(configPath: string): { enabledAgents: EvaluatorConfig[], configDir: string } {
266
+ const { agents: rawAgents, configDir } = loadEvaluatorConfig(configPath);
267
+ const enabledAgents = rawAgents.filter(isEnabledEvaluator);
268
+ if (enabledAgents.length === 0) {
269
+ throw createValidationError('No enabled evaluator agents found in config', EXIT_INVALID_ARGS);
270
+ }
271
+ return { enabledAgents, configDir };
272
+ }
273
+
274
+ /**
275
+ * Loads and validates a debate state file, extracting required fields for evaluation.
276
+ *
277
+ * This function reads the debate JSON file, validates that required fields (problem and final solution)
278
+ * are present and non-empty, and builds a Markdown representation of the clarifications.
279
+ *
280
+ * @param {string} debatePath - The path to the debate state JSON file.
281
+ * @returns {{ problem: string, finalSolution: string, clarificationsMarkdown: string }} Object containing validated debate data.
282
+ * @throws {Error} Throws a validation error if:
283
+ * - The debate file cannot be read or parsed.
284
+ * - The problem field is missing or empty.
285
+ * - The finalSolution.description field is missing or empty.
286
+ */
287
+ function loadAndValidateDebateState(debatePath: string): { problem: string, finalSolution: string, clarificationsMarkdown: string } {
288
+ const debate: DebateState = readJsonFile<DebateState>(debatePath, 'Debate file');
289
+ const problem = (debate.problem || '').trim();
290
+ const finalSolution = (debate.finalSolution && debate.finalSolution.description || '').trim();
291
+ if (!problem) throw createValidationError('Invalid debate JSON: missing non-empty problem', EXIT_INVALID_ARGS);
292
+ if (!finalSolution) throw createValidationError('Invalid debate JSON: missing non-empty finalSolution.description', EXIT_INVALID_ARGS);
293
+
294
+ const clarificationsMarkdown = buildClarificationsMarkdown(debate);
295
+
296
+ return { problem, finalSolution, clarificationsMarkdown };
297
+ }
298
+
299
+ /**
300
+ * Validates and parses an evaluator agent's result from a Promise.allSettled outcome.
301
+ *
302
+ * This function checks if the promise was fulfilled, extracts the raw text output,
303
+ * attempts to parse it as JSON, and logs warnings for any failures. If the result
304
+ * is invalid or cannot be parsed, it returns null.
305
+ *
306
+ * @param {PromiseSettledResult<any>} result - The settled promise result from an evaluator agent.
307
+ * @param {string} agentId - The agent identifier used in warning messages.
308
+ * @returns {ParsedEvaluation | null} The parsed evaluation object, or null if validation/parsing failed.
309
+ */
310
+ function validateAndParseEvaluatorResult(result: PromiseSettledResult<any>, agentId: string): ParsedEvaluation | null {
311
+ if (result.status !== 'fulfilled') {
312
+ writeStderr(`[${agentId}] Skipped due to error\n`);
313
+ return null;
314
+ }
315
+ const rawText = result.value.rawText || '';
316
+ const parsed = parseFirstJsonObject(rawText);
317
+ if (parsed === null) {
318
+ writeStderr(`[${agentId}] Invalid JSON output; skipping agent\n`);
319
+ return null;
320
+ }
321
+ return parsed as ParsedEvaluation;
322
+ }
323
+
324
+ /**
325
+ * Builds an array of EvaluatorAgent instances from enabled evaluator configurations.
326
+ *
327
+ * This function loads default prompts for evaluators, resolves custom prompts (if specified in the
328
+ * configuration), and constructs EvaluatorAgent instances. It optionally logs verbose information
329
+ * about each agent's provider, model, and prompt sources.
330
+ *
331
+ * @param {EvaluatorConfig[]} enabledAgents - Array of enabled evaluator configurations.
332
+ * @param {string} configDir - Absolute path to the configuration directory (for resolving relative prompt paths).
333
+ * @param {boolean} verbose - If true, logs detailed information about each agent to stderr.
334
+ * @returns {EvaluatorAgent[]} Array of instantiated EvaluatorAgent instances ready for evaluation.
335
+ */
336
+ function buildEvaluatorAgents(enabledAgents: EvaluatorConfig[], configDir: string, verbose: boolean): EvaluatorAgent[] {
337
+ const sysDefault = readBuiltInPrompt(
338
+ 'eval/prompts/system.md',
339
+ 'You are an expert software design evaluator. Output ONLY a single JSON object as specified.'
340
+ );
341
+ const userDefault = readBuiltInPrompt(
342
+ 'eval/prompts/user.md',
343
+ '{ "evaluation": {}, "overall_summary": { "overall_score": 5 } }'
344
+ );
345
+
346
+ return enabledAgents.map((evaluatorConfig) => {
347
+ const sysRes = resolvePrompt({
348
+ label: evaluatorConfig.name,
349
+ configDir,
350
+ ...(evaluatorConfig.systemPromptPath !== undefined && { promptPath: evaluatorConfig.systemPromptPath }),
351
+ defaultText: sysDefault
352
+ });
353
+ const userRes = resolvePrompt({
354
+ label: `${evaluatorConfig.name} (user)`,
355
+ configDir,
356
+ ...(evaluatorConfig.userPromptPath !== undefined && { promptPath: evaluatorConfig.userPromptPath }),
357
+ defaultText: userDefault
358
+ });
359
+
360
+ if (verbose) {
361
+ const sysSrc = sysRes.source === PROMPT_SOURCES.FILE ? sysRes.absPath : 'built-in default';
362
+ const usrSrc = userRes.source === PROMPT_SOURCES.FILE ? userRes.absPath : 'built-in default';
363
+ writeStderr(`[${evaluatorConfig.id}] provider=${evaluatorConfig.provider} model=${evaluatorConfig.model} systemPrompt=${sysSrc} userPrompt=${usrSrc}\n`);
364
+ }
365
+
366
+ return EvaluatorAgent.fromConfig(evaluatorConfig, sysRes.text, userRes.text);
367
+ });
368
+ }
369
+
370
/**
 * Registers the 'eval' CLI command, which evaluates a completed debate using multiple evaluator agents.
 *
 * This command aggregates scores and outputs either a JSON or Markdown summary of the evaluation.
 *
 * @param program - Commander.js program object to which the command is added.
 *
 * Command-line Options:
 *   -c, --config <path>   Path to evaluator configuration JSON file.
 *   -d, --debate <path>   Path to debate JSON file (DebateState format).
 *   --env-file <path>     Optional: Path to .env file for environment variables.
 *   -v, --verbose         Optional: Enable verbose diagnostic output.
 *   -o, --output <path>   Optional: Output file destination; if ends with ".json" outputs as JSON, otherwise as Markdown.
 *
 * Behavior:
 * - Loads and validates evaluator configuration and debate state files.
 * - Constructs and runs all enabled evaluator agents in parallel on the given debate data.
 * - Parses, validates, and aggregates the numeric outputs from each agent.
 * - Outputs an aggregated summary (JSON or Markdown table) to file or stdout, and per-agent results to a JSON map if JSON output is selected.
 * - Handles errors gracefully, providing error messages and exit codes.
 *
 * Output:
 * - If --output ends with .json: writes full machine-readable output (includes per-agent data and averages).
 * - Otherwise: outputs a Markdown table with score averages to stdout or file.
 *
 * Errors:
 * - Exits with explicit error codes and user-friendly messages on invalid arguments, missing files, or evaluation failures.
 */
export function evalCommand(program: Command) {
  program
    .command('eval')
    .requiredOption('-c, --config <path>', 'Path to evaluator configuration JSON')
    .requiredOption('-d, --debate <path>', 'Path to debate JSON file (DebateState)')
    .option('--env-file <path>', 'Path to .env file')
    .option('-v, --verbose', 'Verbose diagnostics')
    .option('-o, --output <path>', 'Output destination (json => aggregated JSON; otherwise Markdown)')
    .description('Evaluate a completed debate using evaluator agents')
    .action(async (options: any) =>
    {
      try {
        // Load the optional .env file first so provider credentials are in
        // place before any evaluator agents are constructed.
        loadEnvironmentFile(options.envFile, options.verbose);

        const { enabledAgents, configDir } = loadAndValidateEnabledAgents(options.config);
        const evaluators = buildEvaluatorAgents(enabledAgents, configDir, options.verbose);
        const { problem, finalSolution, clarificationsMarkdown } = loadAndValidateDebateState(options.debate);

        const inputs = { problem, clarificationsMarkdown, finalSolution };

        // Run all in parallel; allSettled is used so one failing agent does
        // not abort the remaining evaluations.
        const results = await Promise.allSettled(evaluators.map((e) => e.evaluate(inputs)));

        // Per-metric accumulators. pushIfValid skips invalid/missing scores
        // (with a warning), so the arrays may end up with different lengths.
        const perAgentParsed: Record<string, ParsedEvaluation> = {};
        const arrFc: number[] = [];
        const arrPerf: number[] = [];
        const arrSec: number[] = [];
        const arrMaint: number[] = [];
        const arrReg: number[] = [];
        const arrTest: number[] = [];
        const arrOverall: number[] = [];

        results.forEach((res, idx) => {
          const agent = evaluators[idx];
          if (!agent) return;
          const agentId = agent.id;

          // Rejected promises and unparseable outputs are logged and skipped.
          const parsed = validateAndParseEvaluatorResult(res, agentId);
          if (parsed === null) return;

          perAgentParsed[agentId] = parsed;

          // Defensive defaults: evaluator output sections may be missing.
          const evalObj = parsed.evaluation || {};
          const func = evalObj.functional_completeness || {};
          const nonf = evalObj.non_functional || {};
          const overallSummary = parsed.overall_summary || {};

          pushIfValid(arrFc, func.score, 'functional_completeness.score', agentId);
          pushIfValid(arrPerf, nonf.performance_scalability?.score, 'non_functional.performance_scalability.score', agentId);
          pushIfValid(arrSec, nonf.security?.score, 'non_functional.security.score', agentId);
          pushIfValid(arrMaint, nonf.maintainability_evolvability?.score, 'non_functional.maintainability_evolvability.score', agentId);
          pushIfValid(arrReg, nonf.regulatory_compliance?.score, 'non_functional.regulatory_compliance.score', agentId);
          pushIfValid(arrTest, nonf.testability?.score, 'non_functional.testability.score', agentId);
          pushIfValid(arrOverall, overallSummary.overall_score, 'overall_summary.overall_score', agentId);
        });

        // Average each metric across agents; null when no agent produced a
        // valid score for that metric.
        const agg: AggregatedAverages = {
          functional_completeness: averageOrNull(arrFc),
          performance_scalability: averageOrNull(arrPerf),
          security: averageOrNull(arrSec),
          maintainability_evolvability: averageOrNull(arrMaint),
          regulatory_compliance: averageOrNull(arrReg),
          testability: averageOrNull(arrTest),
          overall_score: averageOrNull(arrOverall),
        };

        await writeEvaluationResults(agg, perAgentParsed, options.output);
      } catch (err: any) {
        const code = typeof err?.code === 'number' ? err.code : EXIT_GENERAL_ERROR;
        writeStderr((err?.message || 'Unknown error') + '\n');
        // Rethrow for runCli catch to set process exit when direct run
        throw Object.assign(new Error(err?.message || 'Unknown error'), { code });
      }
    });
}
474
+
475
+