dialectic 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. package/.cursor/commands/setup-test.mdc +175 -0
  2. package/.cursor/rules/basic-code-cleanup.mdc +1110 -0
  3. package/.cursor/rules/riper5.mdc +96 -0
  4. package/.env.example +6 -0
  5. package/AGENTS.md +1052 -0
  6. package/LICENSE +21 -0
  7. package/README.md +93 -0
  8. package/WARP.md +113 -0
  9. package/dialectic-1.0.0.tgz +0 -0
  10. package/dialectic.js +10 -0
  11. package/docs/commands.md +375 -0
  12. package/docs/configuration.md +882 -0
  13. package/docs/context_summarization.md +1023 -0
  14. package/docs/debate_flow.md +1127 -0
  15. package/docs/eval_flow.md +795 -0
  16. package/docs/evaluator.md +141 -0
  17. package/examples/debate-config-openrouter.json +48 -0
  18. package/examples/debate_config1.json +48 -0
  19. package/examples/eval/eval1/eval_config1.json +13 -0
  20. package/examples/eval/eval1/result1.json +62 -0
  21. package/examples/eval/eval1/result2.json +97 -0
  22. package/examples/eval_summary_format.md +11 -0
  23. package/examples/example3/debate-config.json +64 -0
  24. package/examples/example3/eval_config2.json +25 -0
  25. package/examples/example3/problem.md +17 -0
  26. package/examples/example3/rounds_test/eval_run.sh +16 -0
  27. package/examples/example3/rounds_test/run_test.sh +16 -0
  28. package/examples/kata1/architect-only-solution_2-rounds.json +121 -0
  29. package/examples/kata1/architect-perf-solution_2-rounds.json +234 -0
  30. package/examples/kata1/debate-config-kata1.json +54 -0
  31. package/examples/kata1/eval_architect-only_2-rounds.json +97 -0
  32. package/examples/kata1/eval_architect-perf_2-rounds.json +97 -0
  33. package/examples/kata1/kata1-report.md +12224 -0
  34. package/examples/kata1/kata1-report_temps-01_01_01_07.md +2451 -0
  35. package/examples/kata1/kata1.md +5 -0
  36. package/examples/kata1/meta.txt +1 -0
  37. package/examples/kata2/debate-config.json +54 -0
  38. package/examples/kata2/eval_config1.json +21 -0
  39. package/examples/kata2/eval_config2.json +25 -0
  40. package/examples/kata2/kata2.md +5 -0
  41. package/examples/kata2/only_architect/debate-config.json +45 -0
  42. package/examples/kata2/only_architect/eval_run.sh +11 -0
  43. package/examples/kata2/only_architect/run_test.sh +5 -0
  44. package/examples/kata2/rounds_test/eval_run.sh +11 -0
  45. package/examples/kata2/rounds_test/run_test.sh +5 -0
  46. package/examples/kata2/summary_length_test/eval_run.sh +11 -0
  47. package/examples/kata2/summary_length_test/eval_run_w_clarify.sh +7 -0
  48. package/examples/kata2/summary_length_test/run_test.sh +5 -0
  49. package/examples/task-queue/debate-config.json +76 -0
  50. package/examples/task-queue/debate_report.md +566 -0
  51. package/examples/task-queue/task-queue-system.md +25 -0
  52. package/jest.config.ts +13 -0
  53. package/multi_agent_debate_spec.md +2980 -0
  54. package/package.json +38 -0
  55. package/sanity-check-problem.txt +9 -0
  56. package/src/agents/prompts/architect-prompts.ts +203 -0
  57. package/src/agents/prompts/generalist-prompts.ts +157 -0
  58. package/src/agents/prompts/index.ts +41 -0
  59. package/src/agents/prompts/judge-prompts.ts +19 -0
  60. package/src/agents/prompts/kiss-prompts.ts +230 -0
  61. package/src/agents/prompts/performance-prompts.ts +142 -0
  62. package/src/agents/prompts/prompt-types.ts +68 -0
  63. package/src/agents/prompts/security-prompts.ts +149 -0
  64. package/src/agents/prompts/shared.ts +144 -0
  65. package/src/agents/prompts/testing-prompts.ts +149 -0
  66. package/src/agents/role-based-agent.ts +386 -0
  67. package/src/cli/commands/debate.ts +761 -0
  68. package/src/cli/commands/eval.ts +475 -0
  69. package/src/cli/commands/report.ts +265 -0
  70. package/src/cli/index.ts +79 -0
  71. package/src/core/agent.ts +198 -0
  72. package/src/core/clarifications.ts +34 -0
  73. package/src/core/judge.ts +257 -0
  74. package/src/core/orchestrator.ts +432 -0
  75. package/src/core/state-manager.ts +322 -0
  76. package/src/eval/evaluator-agent.ts +130 -0
  77. package/src/eval/prompts/system.md +41 -0
  78. package/src/eval/prompts/user.md +64 -0
  79. package/src/providers/llm-provider.ts +25 -0
  80. package/src/providers/openai-provider.ts +84 -0
  81. package/src/providers/openrouter-provider.ts +122 -0
  82. package/src/providers/provider-factory.ts +64 -0
  83. package/src/types/agent.types.ts +141 -0
  84. package/src/types/config.types.ts +47 -0
  85. package/src/types/debate.types.ts +237 -0
  86. package/src/types/eval.types.ts +85 -0
  87. package/src/utils/common.ts +104 -0
  88. package/src/utils/context-formatter.ts +102 -0
  89. package/src/utils/context-summarizer.ts +143 -0
  90. package/src/utils/env-loader.ts +46 -0
  91. package/src/utils/exit-codes.ts +5 -0
  92. package/src/utils/id.ts +11 -0
  93. package/src/utils/logger.ts +48 -0
  94. package/src/utils/paths.ts +10 -0
  95. package/src/utils/progress-ui.ts +313 -0
  96. package/src/utils/prompt-loader.ts +79 -0
  97. package/src/utils/report-generator.ts +301 -0
  98. package/tests/clarifications.spec.ts +128 -0
  99. package/tests/cli.debate.spec.ts +144 -0
  100. package/tests/config-loading.spec.ts +206 -0
  101. package/tests/context-summarizer.spec.ts +131 -0
  102. package/tests/debate-config-custom.json +38 -0
  103. package/tests/env-loader.spec.ts +149 -0
  104. package/tests/eval.command.spec.ts +1191 -0
  105. package/tests/logger.spec.ts +19 -0
  106. package/tests/openai-provider.spec.ts +26 -0
  107. package/tests/openrouter-provider.spec.ts +279 -0
  108. package/tests/orchestrator-summary.spec.ts +386 -0
  109. package/tests/orchestrator.spec.ts +207 -0
  110. package/tests/prompt-loader.spec.ts +52 -0
  111. package/tests/prompts/architect.md +16 -0
  112. package/tests/provider-factory.spec.ts +150 -0
  113. package/tests/report.command.spec.ts +546 -0
  114. package/tests/role-based-agent-summary.spec.ts +476 -0
  115. package/tests/security-agent.spec.ts +221 -0
  116. package/tests/shared-prompts.spec.ts +318 -0
  117. package/tests/state-manager.spec.ts +251 -0
  118. package/tests/summary-prompts.spec.ts +153 -0
  119. package/tsconfig.json +49 -0
package/docs/eval_flow.md @@ -0,0 +1,795 @@
# Evaluation Flow Documentation

This document provides a detailed explanation of the execution flow for the eval command, from CLI invocation through to final output.

## Overview

The evaluation system uses multiple evaluator agents to assess completed debates. The flow involves loading evaluator configuration, instantiating evaluator agents, loading and validating debate state, running all evaluators in parallel, parsing and aggregating their results, and outputting the aggregated evaluation in JSON or Markdown format.

## Sequence Diagram

The following diagram illustrates the complete flow of an evaluation execution:

```mermaid
sequenceDiagram
  participant CLI as CLI Entry Point
  participant Cmd as evalCommand
  participant Env as loadEnvironmentFile
  participant Config as loadEvaluatorConfig
  participant Builder as buildEvaluatorAgents
  participant Prompt as resolvePrompt
  participant Provider as ProviderFactory
  participant EA as EvaluatorAgent
  participant Debate as loadAndValidateDebateState
  participant FS as File System
  participant Parser as parseFirstJsonObject
  participant Agg as Aggregate Scores
  participant Output as writeEvaluationResults

  CLI->>Cmd: runCli(argv)
  Cmd->>Cmd: parse arguments
  Cmd->>Env: loadEnvironmentFile(envFilePath, verbose)
  Env->>FS: read .env file (if exists)
  Env-->>Cmd: environment variables loaded

  Cmd->>Config: loadAndValidateEnabledAgents(configPath)
  Config->>FS: read evaluator config JSON
  Config->>Config: validate agents array exists and non-empty
  Config->>Config: filter enabled agents (enabled !== false)
  Config->>Config: validate at least one enabled agent
  Config-->>Cmd: { enabledAgents, configDir }

  Cmd->>Builder: buildEvaluatorAgents(enabledAgents, configDir, verbose)
  loop For each enabled agent
    Builder->>Prompt: resolvePrompt(systemPromptPath, configDir)
    alt systemPromptPath provided
      Prompt->>FS: read system prompt file (UTF-8)
      Prompt-->>Builder: { text, source: 'file', absPath }
    else no systemPromptPath
      Prompt->>Prompt: readBuiltInPrompt('eval/prompts/system.md')
      Prompt-->>Builder: { text, source: 'built-in' }
    end

    Builder->>Prompt: resolvePrompt(userPromptPath, configDir)
    alt userPromptPath provided
      Prompt->>FS: read user prompt file (UTF-8)
      Prompt-->>Builder: { text, source: 'file', absPath }
    else no userPromptPath
      Prompt->>Prompt: readBuiltInPrompt('eval/prompts/user.md')
      Prompt-->>Builder: { text, source: 'built-in' }
    end

    Builder->>Provider: createProvider(agent.provider)
    Provider-->>Builder: LLMProvider instance

    Builder->>EA: EvaluatorAgent.fromConfig(config, systemPrompt, userPrompt)
    EA->>EA: new EvaluatorAgent(config, provider, systemPrompt, userPrompt)
    EA-->>Builder: EvaluatorAgent instance
  end
  Builder-->>Cmd: evaluators[]

  Cmd->>Debate: loadAndValidateDebateState(debatePath)
  Debate->>FS: read debate JSON file
  Debate->>Debate: parse DebateState JSON
  Debate->>Debate: validate problem field exists and non-empty
  Debate->>Debate: validate finalSolution.description exists and non-empty
  Debate->>Debate: buildClarificationsMarkdown(state)
  Debate-->>Cmd: { problem, finalSolution, clarificationsMarkdown }

  Cmd->>Cmd: build inputs object { problem, clarificationsMarkdown, finalSolution }

  Cmd->>Cmd: Promise.allSettled(evaluators.map(e => e.evaluate(inputs)))
  par Parallel evaluation for all agents
    Cmd->>EA: evaluator.evaluate(inputs)
    EA->>EA: renderUserPrompt(inputs)
    EA->>EA: replace placeholders {problem}, {clarifications}, {final_solution}
    EA->>EA: provider.complete({ model, temperature: 0.1, systemPrompt, userPrompt })
    EA->>Provider: complete(request)
    Provider->>Provider: try Responses API
    alt Responses API available
      Provider->>Provider: call responses.create()
    else Fallback
      Provider->>Provider: call chat.completions.create()
    end
    Provider-->>EA: { text, usage, latency }
    EA->>EA: measure latency
    EA-->>Cmd: { id, rawText, latencyMs, usage }
  end
  Cmd-->>Cmd: PromiseSettledResult[] results

  Cmd->>Cmd: process results and aggregate scores
  loop For each result
    Cmd->>Parser: validateAndParseEvaluatorResult(result, agentId)
    alt result.status === 'fulfilled'
      Parser->>Parser: parseFirstJsonObject(rawText)
      Parser->>Parser: extract JSON object from text
      Parser-->>Cmd: ParsedEvaluation | null
    else result.status === 'rejected'
      Parser-->>Cmd: null (writes warning to stderr)
    end

    alt ParsedEvaluation valid
      Cmd->>Agg: pushIfValid(arrFc, func.score, ...)
      Cmd->>Agg: pushIfValid(arrPerf, nonf.performance_scalability.score, ...)
      Cmd->>Agg: pushIfValid(arrSec, nonf.security.score, ...)
      Cmd->>Agg: pushIfValid(arrMaint, nonf.maintainability_evolvability.score, ...)
      Cmd->>Agg: pushIfValid(arrReg, nonf.regulatory_compliance.score, ...)
      Cmd->>Agg: pushIfValid(arrTest, nonf.testability.score, ...)
      Cmd->>Agg: pushIfValid(arrOverall, overall_summary.overall_score, ...)
    end
  end

  Cmd->>Agg: calculate averages (averageOrNull) for each metric
  Agg-->>Cmd: AggregatedAverages

  Cmd->>Output: writeEvaluationResults(aggregatedAverages, perAgentResults, outputPath)
  alt outputPath ends with .json
    Output->>Output: build AggregatedJsonOutput object
    Output->>FS: write JSON file (UTF-8, 2-space indent)
  else outputPath exists but not .json
    Output->>Output: renderMarkdownTable(aggregatedAverages)
    Output->>FS: write Markdown file (UTF-8)
  else no outputPath
    Output->>Output: renderMarkdownTable(aggregatedAverages)
    Output->>CLI: write to stdout
  end

  Cmd-->>CLI: exit code 0
```

## Detailed Flow Description

### 1. CLI Entry Point

**Function**: `runCli(argv: string[])`
**Location**: `src/cli/index.ts`

The entry point for the evaluation system. This function:
- Creates a Commander program instance
- Sets program metadata (name, description, version)
- Registers the eval command via `evalCommand(program)`
- Parses command line arguments
- Handles top-level errors and maps them to exit codes

**Parameters**:
- `argv`: Array of command-line arguments (excluding node and script name)

**Returns**: Promise that resolves on success or rejects with an error containing an exit code

### 2. Command Registration

**Function**: `evalCommand(program: Command)`
**Location**: `src/cli/commands/eval.ts`

Registers the eval command and its action handler with Commander. Defines:
- Command name: `eval`
- Options: `-c, --config <path>`, `-d, --debate <path>`, `--env-file <path>`, `-v, --verbose`, `-o, --output <path>`
- Action handler that executes when the command is invoked

**Parameters**:
- `program`: Commander instance to register the command with

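A minimal sketch of this registration, assuming the Commander chaining style; the option descriptions and the action body are illustrative, and the real handler lives in `src/cli/commands/eval.ts`:

```typescript
import { Command } from 'commander';

// Sketch only: option descriptions are assumed, and whether an option is
// required is not specified here.
export function evalCommand(program: Command): void {
  program
    .command('eval')
    .description('Evaluate a completed debate with one or more evaluator agents')
    .option('-c, --config <path>', 'evaluator configuration JSON')
    .option('-d, --debate <path>', 'debate state JSON to evaluate')
    .option('--env-file <path>', 'path to a .env file')
    .option('-v, --verbose', 'verbose logging to stderr')
    .option('-o, --output <path>', 'output file (.json or Markdown)')
    .action(async (opts) => {
      // load env, config, and debate state; run evaluators; write results
    });
}
```
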
### 3. Environment File Loading

**Function**: `loadEnvironmentFile(envFilePath?: string, verbose?: boolean)`
**Location**: `src/utils/env-loader.ts`

Loads environment variables from a .env file using the dotenv library.

**Parameters**:
- `envFilePath`: Optional path to .env file (default: `.env` in current working directory)
- `verbose`: Whether to output verbose logging

**Behavior**:
- If the default `.env` file doesn't exist: continues silently (non-breaking); warns only in verbose mode
- If a custom env file path is specified but doesn't exist: throws an error
- Uses dotenv to parse and load environment variables
- Loads API keys required for LLM providers (OPENAI_API_KEY, OPENROUTER_API_KEY)

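A minimal sketch of this behavior using dotenv; the exact messages and signature are assumptions:

```typescript
import * as fs from 'node:fs';
import * as path from 'node:path';
import { config } from 'dotenv';

// Sketch of the behavior described above (names and messages are illustrative).
export function loadEnvironmentFile(envFilePath?: string, verbose = false): void {
  const resolved = path.resolve(process.cwd(), envFilePath ?? '.env');
  if (!fs.existsSync(resolved)) {
    if (envFilePath) {
      throw new Error(`Env file not found: ${resolved}`); // custom path must exist
    }
    if (verbose) console.error(`No .env file at ${resolved}; continuing`);
    return; // default .env is optional
  }
  config({ path: resolved }); // populates process.env (e.g. OPENAI_API_KEY)
}
```
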
### 4. Evaluator Configuration Loading

**Function**: `loadAndValidateEnabledAgents(configPath: string)`
**Location**: `src/cli/commands/eval.ts`

Loads the evaluator configuration file, validates its structure, and filters for enabled agents.

**Parameters**:
- `configPath`: Path to evaluator configuration JSON file

**Returns**: Object containing:
- `enabledAgents`: Array of enabled evaluator configurations
- `configDir`: Absolute directory path containing the configuration file

**Execution Flow**:

#### 4.1 Load Configuration File
1. Calls `loadEvaluatorConfig(configPath)`:
   - Resolves path relative to current working directory
   - Calls `readJsonFile()` to read and parse JSON
   - Validates file exists and is a regular file
   - Validates JSON is parseable
   - Validates `agents` array exists and has at least one entry
   - Maps raw agent objects to `EvaluatorConfig` format
   - Returns `{ agents, configDir }`

#### 4.2 Filter Enabled Agents
1. Filters agents using `isEnabledEvaluator()`:
   - Agent is enabled if `enabled !== false` (defaults to true if undefined)
   - Removes agents where `enabled: false`

#### 4.3 Validate Enabled Agents Exist
1. Validates at least one enabled agent remains:
   - If no enabled agents found, throws validation error with EXIT_INVALID_ARGS

**Configuration Schema**:
```json
{
  "agents": [
    {
      "id": "eval-1",
      "name": "Evaluator 1",
      "model": "gpt-4",
      "provider": "openai",
      "systemPromptPath": "./prompts/system.md",
      "userPromptPath": "./prompts/user.md",
      "timeout": 30000,
      "enabled": true
    }
  ]
}
```

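The steps in 4.1 through 4.3 can be sketched as follows; `readJsonFile`, the exact error type, and the way the exit code is attached are assumptions for illustration:

```typescript
import * as fs from 'node:fs';
import * as path from 'node:path';

interface EvaluatorConfig {
  id: string; name: string; model: string; provider: 'openai' | 'openrouter';
  systemPromptPath?: string; userPromptPath?: string; timeout?: number; enabled?: boolean;
}

// Minimal stand-in for the shared JSON reader.
function readJsonFile<T>(absPath: string): T {
  return JSON.parse(fs.readFileSync(absPath, 'utf-8')) as T;
}

const isEnabledEvaluator = (a: EvaluatorConfig): boolean => a.enabled !== false;

function loadAndValidateEnabledAgents(configPath: string): { enabledAgents: EvaluatorConfig[]; configDir: string } {
  const resolved = path.resolve(process.cwd(), configPath);
  const raw = readJsonFile<{ agents?: EvaluatorConfig[] }>(resolved);
  if (!Array.isArray(raw.agents) || raw.agents.length === 0) {
    throw Object.assign(new Error('Config must define a non-empty "agents" array'), { exitCode: 2 });
  }
  const enabledAgents = raw.agents.filter(isEnabledEvaluator);
  if (enabledAgents.length === 0) {
    throw Object.assign(new Error('No enabled evaluator agents in config'), { exitCode: 2 });
  }
  return { enabledAgents, configDir: path.dirname(resolved) };
}
```
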
### 5. Evaluator Agent Instantiation

**Function**: `buildEvaluatorAgents(enabledAgents: EvaluatorConfig[], configDir: string, verbose: boolean)`
**Location**: `src/cli/commands/eval.ts`

Creates EvaluatorAgent instances from enabled evaluator configurations.

**Parameters**:
- `enabledAgents`: Array of enabled evaluator configurations
- `configDir`: Absolute directory path for resolving relative prompt paths
- `verbose`: Whether to log detailed information about each agent

**Returns**: Array of `EvaluatorAgent` instances

**Execution Flow**:

#### 5.1 Load Default Prompts
1. Loads built-in system prompt:
   - Calls `readBuiltInPrompt('eval/prompts/system.md', fallbackText)`
   - Attempts to read from `dist/eval/prompts/system.md` (runtime)
   - Falls back to `src/eval/prompts/system.md` (tests)
   - Uses fallback text if file unavailable

2. Loads built-in user prompt:
   - Calls `readBuiltInPrompt('eval/prompts/user.md', fallbackText)`
   - Same resolution strategy as system prompt

#### 5.2 For Each Enabled Agent
1. **Resolve System Prompt**:
   - Calls `resolvePrompt()` with agent's `systemPromptPath`:
     - If `systemPromptPath` provided: resolves relative to `configDir`, reads file (UTF-8)
     - If file missing/unreadable/empty: warns and uses built-in default
     - Returns `{ text, source, absPath? }`

2. **Resolve User Prompt**:
   - Calls `resolvePrompt()` with agent's `userPromptPath`:
     - Same resolution strategy as system prompt
     - Returns `{ text, source, absPath? }`

3. **Create Provider**:
   - Calls `createProvider(agent.provider)`:
     - Creates `OpenAIProvider` for "openai"
     - Creates `OpenRouterProvider` for "openrouter"
     - Returns `LLMProvider` instance

4. **Instantiate EvaluatorAgent**:
   - Calls `EvaluatorAgent.fromConfig(config, systemPrompt, userPrompt)`:
     - Creates provider instance
     - Constructs `EvaluatorAgent` with:
       - Configuration (id, name, model)
       - Provider instance
       - Resolved system prompt
       - Resolved user prompt template

5. **Verbose Logging** (if enabled):
   - Logs to stderr: agent ID, provider, model, system prompt source, user prompt source

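A sketch of the prompt-resolution fallback described in 5.2; the signature and warning text are assumptions, but the fallback order follows the description above:

```typescript
import * as fs from 'node:fs';
import * as path from 'node:path';

interface ResolvedPrompt { text: string; source: 'file' | 'built-in'; absPath?: string; }

// Sketch: custom path wins when it yields non-empty text, otherwise built-in default.
function resolvePrompt(promptPath: string | undefined, configDir: string, builtIn: string): ResolvedPrompt {
  if (!promptPath) return { text: builtIn, source: 'built-in' };
  const absPath = path.resolve(configDir, promptPath);
  try {
    const text = fs.readFileSync(absPath, 'utf-8');
    if (text.trim().length === 0) throw new Error('empty prompt file');
    return { text, source: 'file', absPath };
  } catch {
    console.error(`Warning: could not use prompt at ${absPath}; falling back to built-in`);
    return { text: builtIn, source: 'built-in' };
  }
}
```
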
### 6. Debate State Loading and Validation

**Function**: `loadAndValidateDebateState(debatePath: string)`
**Location**: `src/cli/commands/eval.ts`

Loads and validates a debate state file, extracting required fields for evaluation.

**Parameters**:
- `debatePath`: Path to debate state JSON file

**Returns**: Object containing:
- `problem`: Problem statement string
- `finalSolution`: Final solution description string
- `clarificationsMarkdown`: Markdown-formatted clarifications string

**Execution Flow**:

#### 6.1 Load Debate File
1. Calls `readJsonFile<DebateState>(debatePath)`:
   - Resolves path relative to current working directory
   - Validates file exists and is a regular file
   - Reads file content as UTF-8
   - Parses JSON to `DebateState` object

#### 6.2 Validate Required Fields
1. **Problem Validation**:
   - Extracts `debate.problem` and trims whitespace
   - If empty or missing, throws validation error with EXIT_INVALID_ARGS

2. **Final Solution Validation**:
   - Extracts `debate.finalSolution.description` and trims whitespace
   - If empty or missing, throws validation error with EXIT_INVALID_ARGS

#### 6.3 Build Clarifications Markdown
1. Calls `buildClarificationsMarkdown(debate)`:
   - If no clarifications: returns minimal code block separator
   - Otherwise: builds Markdown-formatted string:
     - For each agent's clarification group:
       - Adds H3 header: `### {agentName} ({role})`
       - For each question-answer pair:
         - Adds question: ``Question (q1):\n\n```text\n{question}\n```\n\n``
         - Adds answer: ``Answer:\n\n```text\n{answer}\n```\n\n``
   - Returns trimmed Markdown string

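A sketch of the Markdown assembly in 6.3. The group and question/answer field names, and the exact contents of the empty case, are assumptions inferred from the description; the real types live in the debate type definitions:

```typescript
interface ClarificationGroup {
  agentName: string;
  role: string;
  items: { id: string; question: string; answer: string }[];
}

function buildClarificationsMarkdown(groups: ClarificationGroup[] | undefined): string {
  // Assumed empty case: a minimal code-block separator.
  if (!groups || groups.length === 0) return '```text\n\n```';
  let md = '';
  for (const g of groups) {
    md += `### ${g.agentName} (${g.role})\n\n`;
    for (const qa of g.items) {
      md += `Question (${qa.id}):\n\n\`\`\`text\n${qa.question}\n\`\`\`\n\n`;
      md += `Answer:\n\n\`\`\`text\n${qa.answer}\n\`\`\`\n\n`;
    }
  }
  return md.trim();
}
```
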
### 7. Evaluation Execution

**Method**: `evaluator.evaluate(inputs: EvaluatorInputs)`
**Location**: `src/eval/evaluator-agent.ts`

Performs the evaluation using the underlying LLM provider.

**Parameters**:
- `inputs`: Object containing:
  - `problem`: Problem statement string
  - `clarificationsMarkdown`: Markdown-formatted clarifications
  - `finalSolution`: Final solution description string

**Returns**: `EvaluatorResult` containing:
- `id`: Agent identifier
- `rawText`: Raw LLM response text
- `latencyMs`: Latency in milliseconds
- `usage`: Optional token usage statistics

**Execution Flow**:

#### 7.1 Render User Prompt
1. Calls `renderUserPrompt(inputs)`:
   - Replaces `{problem}` placeholder with `inputs.problem`
   - Replaces `{clarifications}` placeholder with `inputs.clarificationsMarkdown`
   - Replaces `{final_solution}` placeholder with `inputs.finalSolution`
   - Returns rendered user prompt string

#### 7.2 Invoke LLM Provider
1. Records start time for latency measurement
2. Calls `provider.complete()` with:
   - `model`: Agent's configured model
   - `temperature`: Fixed at 0.1 (for more deterministic evaluation)
   - `systemPrompt`: Resolved system prompt
   - `userPrompt`: Rendered user prompt

3. Provider execution:
   - **Primary**: Attempts Responses API (`client.responses.create()`)
   - **Fallback**: Uses Chat Completions API (`client.chat.completions.create()`)
   - Returns `CompletionResponse` with text and usage statistics

#### 7.3 Measure Latency and Build Result
1. Calculates latency: `Date.now() - started`
2. Builds `EvaluatorResult`:
   - `id`: Agent identifier
   - `rawText`: Response text from provider
   - `latencyMs`: Calculated latency
   - `usage`: Token usage (if available)

#### 7.4 Error Handling
- If provider call fails: logs error to stderr with agent ID and rethrows
- Error propagates to Promise.allSettled for graceful handling

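A pared-down sketch of `evaluate()` as described in 7.1 through 7.3; the `LLMProvider` shape here is a stand-in for the real interface in `src/providers/llm-provider.ts`, and constructor parameters are simplified:

```typescript
interface EvaluatorInputs { problem: string; clarificationsMarkdown: string; finalSolution: string; }
interface EvaluatorResult { id: string; rawText: string; latencyMs: number; usage?: unknown; }
interface LLMProvider {
  complete(req: { model: string; temperature: number; systemPrompt: string; userPrompt: string }): Promise<{ text: string; usage?: unknown }>;
}

class EvaluatorAgent {
  constructor(private id: string, private model: string, private provider: LLMProvider,
              private systemPrompt: string, private userPromptTemplate: string) {}

  // 7.1: substitute the three placeholders into the user prompt template.
  private renderUserPrompt(inputs: EvaluatorInputs): string {
    return this.userPromptTemplate
      .replace('{problem}', inputs.problem)
      .replace('{clarifications}', inputs.clarificationsMarkdown)
      .replace('{final_solution}', inputs.finalSolution);
  }

  // 7.2–7.3: call the provider at a fixed low temperature and measure latency.
  async evaluate(inputs: EvaluatorInputs): Promise<EvaluatorResult> {
    const started = Date.now();
    const userPrompt = this.renderUserPrompt(inputs);
    const res = await this.provider.complete({
      model: this.model, temperature: 0.1,
      systemPrompt: this.systemPrompt, userPrompt,
    });
    return { id: this.id, rawText: res.text, latencyMs: Date.now() - started, usage: res.usage };
  }
}
```
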
### 8. Parallel Evaluation Execution

**Location**: `src/cli/commands/eval.ts` (action handler)

Runs all evaluator agents in parallel and collects results.

**Execution Flow**:

#### 8.1 Build Inputs Object
1. Creates `inputs` object:

   ```typescript
   {
     problem,
     clarificationsMarkdown,
     finalSolution
   }
   ```

#### 8.2 Execute All Evaluators
1. Calls `Promise.allSettled(evaluators.map(e => e.evaluate(inputs)))`:
   - Maps each evaluator to its `evaluate()` promise
   - Uses `allSettled` to handle failures gracefully (doesn't fail fast)
   - All evaluations run concurrently
   - Returns array of `PromiseSettledResult` objects

#### 8.3 Process Results
1. Iterates through `PromiseSettledResult[]`:
   - For each result:
     - Calls `validateAndParseEvaluatorResult(result, agentId)`
     - If result is rejected: logs warning, returns null
     - If result is fulfilled: parses JSON from raw text, returns `ParsedEvaluation | null`

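A minimal self-contained sketch of this fan-out/fan-in pattern; the `Evaluator` type and function name are illustrative:

```typescript
type Evaluator = { id: string; evaluate(inputs: unknown): Promise<{ id: string; rawText: string }> };

async function runAllEvaluators(evaluators: Evaluator[], inputs: unknown): Promise<void> {
  // Fan out: every evaluator starts immediately; allSettled itself never rejects.
  const settled = await Promise.allSettled(evaluators.map((e) => e.evaluate(inputs)));

  // Fan in: rejected promises are logged and skipped, fulfilled ones go on to parsing.
  settled.forEach((result, i) => {
    if (result.status === 'rejected') {
      console.error(`[${evaluators[i].id}] Skipped due to error`);
      return;
    }
    // result.value.rawText feeds JSON parsing and score aggregation (sections 9–10).
  });
}
```
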
### 9. Result Parsing and Validation

**Function**: `validateAndParseEvaluatorResult(result: PromiseSettledResult<any>, agentId: string)`
**Location**: `src/cli/commands/eval.ts`

Validates and parses an evaluator agent's result from a Promise.allSettled outcome.

**Parameters**:
- `result`: The settled promise result from an evaluator agent
- `agentId`: Agent identifier for warning messages

**Returns**: `ParsedEvaluation | null`

**Execution Flow**:

#### 9.1 Check Promise Status
1. If `result.status !== 'fulfilled'`:
   - Writes warning to stderr: `[agentId] Skipped due to error`
   - Returns null

#### 9.2 Extract Raw Text
1. Extracts `result.value.rawText` (empty string if missing)

#### 9.3 Parse JSON
1. Calls `parseFirstJsonObject(rawText)`:
   - Searches for first JSON object in text (between first `{` and matching `}`)
   - If no match found, attempts to parse entire string
   - If parsing fails, returns null

2. If parsing returns null:
   - Writes warning to stderr: `[agentId] Invalid JSON output; skipping agent`
   - Returns null

3. Returns parsed object as `ParsedEvaluation`

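A sketch of the lenient extraction in 9.3. The real implementation may match braces more carefully; this version simply slices from the first `{` to the last `}` and falls back to parsing the whole string:

```typescript
function parseFirstJsonObject(rawText: string): unknown | null {
  const start = rawText.indexOf('{');
  const end = rawText.lastIndexOf('}');
  const candidate = start !== -1 && end > start ? rawText.slice(start, end + 1) : rawText;
  try {
    return JSON.parse(candidate);
  } catch {
    return null; // caller logs "[agentId] Invalid JSON output; skipping agent"
  }
}
```
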
### 10. Score Aggregation

**Location**: `src/cli/commands/eval.ts` (action handler)

Aggregates scores from all evaluators and calculates averages.

**Execution Flow**:

#### 10.1 Initialize Score Arrays
1. Creates arrays for each metric:
   - `arrFc`: Functional completeness scores
   - `arrPerf`: Performance & scalability scores
   - `arrSec`: Security scores
   - `arrMaint`: Maintainability & evolvability scores
   - `arrReg`: Regulatory compliance scores
   - `arrTest`: Testability scores
   - `arrOverall`: Overall scores

#### 10.2 Extract and Validate Scores
For each parsed evaluation result:

1. **Extract Scores**:
   - `functional_completeness.score` from `parsed.evaluation.functional_completeness`
   - `performance_scalability.score` from `parsed.evaluation.non_functional.performance_scalability`
   - `security.score` from `parsed.evaluation.non_functional.security`
   - `maintainability_evolvability.score` from `parsed.evaluation.non_functional.maintainability_evolvability`
   - `regulatory_compliance.score` from `parsed.evaluation.non_functional.regulatory_compliance`
   - `testability.score` from `parsed.evaluation.non_functional.testability`
   - `overall_score` from `parsed.overall_summary.overall_score`

2. **Validate and Push Scores**:
   - Calls `pushIfValid(array, value, label, agentId)` for each score:
     - Validates value is a finite number using `numOrUndefined()`
     - If invalid: writes warning to stderr, skips
     - Clamps value to range [1, 10] using `clampScoreToRange()`
     - If clamped: writes warning to stderr with original value
     - Appends clamped value to array

#### 10.3 Calculate Averages
1. Calculates average for each metric using `averageOrNull()`:
   - If array is empty: returns `null` (N/A)
   - Otherwise: calculates average and returns as number
   - Creates `AggregatedAverages` object:

   ```typescript
   {
     functional_completeness: number | null,
     performance_scalability: number | null,
     security: number | null,
     maintainability_evolvability: number | null,
     regulatory_compliance: number | null,
     testability: number | null,
     overall_score: number | null
   }
   ```

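A sketch of the helpers named in 10.2 and 10.3; exact signatures and warning wording are assumptions, but the validate-clamp-push and average-or-null behavior follows the description:

```typescript
const numOrUndefined = (v: unknown): number | undefined =>
  typeof v === 'number' && Number.isFinite(v) ? v : undefined;

const clampScoreToRange = (v: number, min = 1, max = 10): number =>
  Math.min(max, Math.max(min, v));

function pushIfValid(arr: number[], value: unknown, label: string, agentId: string): void {
  const n = numOrUndefined(value);
  if (n === undefined) {
    console.error(`[${agentId}] Missing or invalid ${label} score; skipping`);
    return;
  }
  const clamped = clampScoreToRange(n);
  if (clamped !== n) console.error(`[${agentId}] ${label} score ${n} clamped to ${clamped}`);
  arr.push(clamped);
}

const averageOrNull = (arr: number[]): number | null =>
  arr.length === 0 ? null : arr.reduce((sum, v) => sum + v, 0) / arr.length;
```
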
### 11. Result Output

**Function**: `writeEvaluationResults(aggregatedAverages, perAgentResults, outputPath)`
**Location**: `src/cli/commands/eval.ts`

Writes evaluation results to a file or stdout in JSON or Markdown format.

**Parameters**:
- `aggregatedAverages`: Aggregated average scores for all metrics
- `perAgentResults`: Per-agent parsed evaluation results, keyed by agent ID
- `outputPath`: Optional output file path

**Execution Flow**:

#### 11.1 Resolve Output Path
1. If `outputPath` provided: resolves relative to current working directory
2. Otherwise: `undefined` (write to stdout)

#### 11.2 JSON Output Format
If resolved path ends with `.json`:

1. Builds `AggregatedJsonOutput` object:

   ```typescript
   {
     evaluation: {
       functional_completeness: { average_score: number | null },
       non_functional: {
         performance_scalability: { average_score: number | null },
         security: { average_score: number | null },
         maintainability_evolvability: { average_score: number | null },
         regulatory_compliance: { average_score: number | null },
         testability: { average_score: number | null }
       }
     },
     overall_score: number | null,
     agents: Record<string, ParsedEvaluation>
   }
   ```

2. Writes JSON file:
   - Formats with 2-space indent
   - Encodes as UTF-8
   - Writes to resolved path

#### 11.3 Markdown Output Format
Otherwise (path doesn't end with `.json` or no path):

1. Calls `renderMarkdownTable(aggregatedAverages)`:
   - Formats each score to 2 decimal places (or "N/A" if null)
   - Builds Markdown table:

   ```markdown
   | Functional Completeness | Performance & Scalability | Security | Maintainability & Evolvability | Regulatory Compliance | Testability | Overall Score |
   |------------------------|---------------------------|----------|-------------------------------|------------------------|------------|---------------|
   | 8.50 | 7.25 | 9.00 | 8.00 | N/A | 7.75 | 8.10 |
   ```

2. Writes output:
   - If `outputPath` provided: writes Markdown file (UTF-8)
   - Otherwise: writes to stdout

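A sketch of `renderMarkdownTable`, assuming the `AggregatedAverages` shape from section 10 and the column order of the example above:

```typescript
type AggregatedAverages = {
  functional_completeness: number | null; performance_scalability: number | null;
  security: number | null; maintainability_evolvability: number | null;
  regulatory_compliance: number | null; testability: number | null; overall_score: number | null;
};

function renderMarkdownTable(a: AggregatedAverages): string {
  const fmt = (v: number | null): string => (v === null ? 'N/A' : v.toFixed(2));
  const header =
    '| Functional Completeness | Performance & Scalability | Security | Maintainability & Evolvability | Regulatory Compliance | Testability | Overall Score |';
  const separator = '|---|---|---|---|---|---|---|';
  const row = `| ${[
    a.functional_completeness, a.performance_scalability, a.security,
    a.maintainability_evolvability, a.regulatory_compliance, a.testability, a.overall_score,
  ].map(fmt).join(' | ')} |`;
  return [header, separator, row].join('\n');
}
```
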
### 12. Error Handling

The system uses structured error handling with exit codes:

**Exit Codes**:
- `0`: Success
- `1`: General error (EXIT_GENERAL_ERROR)
- `2`: Invalid arguments (EXIT_INVALID_ARGS)
- `4`: Configuration error (EXIT_CONFIG_ERROR)

**Error Flow**:
1. Errors thrown in action handler are caught
2. Exit code extracted from error object if present, otherwise defaults to EXIT_GENERAL_ERROR
3. Error message written to stderr
4. Error re-thrown to top-level CLI handler
5. Top-level handler extracts code and calls `process.exit(code)`

**Validation Errors**:
- Missing or invalid config file: EXIT_INVALID_ARGS
- No enabled evaluator agents: EXIT_INVALID_ARGS
- Missing or invalid debate file: EXIT_INVALID_ARGS
- Missing problem or final solution: EXIT_INVALID_ARGS
- Missing API keys: EXIT_CONFIG_ERROR (handled by provider factory)

**Evaluation Errors**:
- Individual evaluator failures: logged to stderr, agent skipped, evaluation continues
- Invalid JSON output: logged to stderr, agent skipped, evaluation continues
- Missing scores: logged to stderr, score skipped, aggregation continues

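The constants below mirror the exit codes listed above; how the code is attached to the error object and read back at the CLI boundary is a sketch, not the package's actual implementation:

```typescript
const EXIT_GENERAL_ERROR = 1;
const EXIT_INVALID_ARGS = 2;
const EXIT_CONFIG_ERROR = 4;

// Top-level handler: pull an exit code off the thrown error, defaulting to the general error.
function exitCodeFrom(err: unknown): number {
  const code = (err as { exitCode?: unknown })?.exitCode;
  return typeof code === 'number' ? code : EXIT_GENERAL_ERROR;
}

// Hypothetical usage at the CLI boundary:
// runCli(process.argv.slice(2)).catch((err) => {
//   console.error(err instanceof Error ? err.message : String(err));
//   process.exit(exitCodeFrom(err));
// });
```
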
## Key Data Structures

### EvaluatorConfig
Represents an evaluator agent configuration:
- `id`: Unique identifier (string)
- `name`: Human-readable name (string)
- `model`: LLM model name (string)
- `provider`: Provider type ("openai" or "openrouter")
- `systemPromptPath`: Optional path to system prompt file
- `userPromptPath`: Optional path to user prompt file
- `timeout`: Optional timeout in milliseconds (currently ignored)
- `enabled`: Optional boolean (defaults to true if undefined)

### EvaluatorInputs
Inputs passed to each evaluator agent:
- `problem`: Problem statement string
- `clarificationsMarkdown`: Markdown-formatted clarifications string
- `finalSolution`: Final solution description string

### EvaluatorResult
Result from a single evaluator agent:
- `id`: Agent identifier
- `rawText`: Raw LLM response text
- `latencyMs`: Latency in milliseconds
- `usage`: Optional token usage statistics (inputTokens, outputTokens, totalTokens)

### ParsedEvaluation
Parsed evaluation output from an evaluator agent:
- `evaluation`: Optional evaluation object containing:
  - `functional_completeness`: Optional object with `score` (number) and `reasoning` (string)
  - `non_functional`: Optional object containing:
    - `performance_scalability`: Optional object with `score` and `reasoning`
    - `security`: Optional object with `score` and `reasoning`
    - `maintainability_evolvability`: Optional object with `score` and `reasoning`
    - `regulatory_compliance`: Optional object with `score` and `reasoning`
    - `testability`: Optional object with `score` and `reasoning`
- `overall_summary`: Optional object with:
  - `strengths`: Optional string
  - `weaknesses`: Optional string
  - `overall_score`: Optional number

### AggregatedAverages
Aggregated average scores across all evaluators:
- `functional_completeness`: number | null
- `performance_scalability`: number | null
- `security`: number | null
- `maintainability_evolvability`: number | null
- `regulatory_compliance`: number | null
- `testability`: number | null
- `overall_score`: number | null

### AggregatedJsonOutput
Complete JSON output format:
- `evaluation`: Object with functional and non-functional metrics
- `overall_score`: Overall average score
- `agents`: Record mapping agent IDs to their `ParsedEvaluation` objects

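A TypeScript rendering of the `ParsedEvaluation` shape described above; field optionality follows the prose, and the declarations may not match the package's own types in `src/types/eval.types.ts` exactly:

```typescript
interface ScoredCriterion { score?: number; reasoning?: string; }

interface ParsedEvaluation {
  evaluation?: {
    functional_completeness?: ScoredCriterion;
    non_functional?: {
      performance_scalability?: ScoredCriterion;
      security?: ScoredCriterion;
      maintainability_evolvability?: ScoredCriterion;
      regulatory_compliance?: ScoredCriterion;
      testability?: ScoredCriterion;
    };
  };
  overall_summary?: {
    strengths?: string;
    weaknesses?: string;
    overall_score?: number;
  };
}
```
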
## File System Interactions

### Configuration File Loading

**File Format**: JSON file with evaluator agent configurations

**Location**: Specified via `--config <path>` option

**Validation**:
- File must exist and be a regular file
- Must contain valid JSON
- Must have `agents` array with at least one entry
- At least one agent must be enabled (`enabled !== false`)

**Error Handling**:
- Missing file: EXIT_INVALID_ARGS
- Invalid JSON: EXIT_INVALID_ARGS
- Missing or empty agents array: EXIT_INVALID_ARGS
- No enabled agents: EXIT_INVALID_ARGS

### Debate State File Loading

**File Format**: JSON file with `DebateState` structure

**Location**: Specified via `--debate <path>` option

**Required Fields**:
- `problem`: Non-empty string
- `finalSolution.description`: Non-empty string

**Optional Fields**:
- `clarifications`: Array of `AgentClarifications` objects

**Validation**:
- File must exist and be a regular file
- Must contain valid JSON
- Problem field must exist and be non-empty after trimming
- Final solution description must exist and be non-empty after trimming

**Error Handling**:
- Missing file: EXIT_INVALID_ARGS
- Invalid JSON: EXIT_INVALID_ARGS
- Missing or empty problem: EXIT_INVALID_ARGS
- Missing or empty final solution: EXIT_INVALID_ARGS

### Prompt File Loading

**System Prompt**:
- Default: `src/eval/prompts/system.md` (or `dist/eval/prompts/system.md` at runtime)
- Custom: Path specified in `EvaluatorConfig.systemPromptPath`, resolved relative to config directory

**User Prompt**:
- Default: `src/eval/prompts/user.md` (or `dist/eval/prompts/user.md` at runtime)
- Custom: Path specified in `EvaluatorConfig.userPromptPath`, resolved relative to config directory

**Error Handling**:
- Missing custom prompt file: Warning to stderr, falls back to built-in default
- Empty custom prompt file: Warning to stderr, falls back to built-in default
- Unreadable custom prompt file: Warning to stderr, falls back to built-in default

### Output File Writing

**JSON Output**:
- Format: Pretty-printed JSON with 2-space indent
- Encoding: UTF-8
- Content: Complete `AggregatedJsonOutput` object

**Markdown Output**:
- Format: Markdown table with aggregated scores
- Encoding: UTF-8
- Content: Single table row with formatted scores

**Output Location**:
- If `--output` specified: writes to file at resolved path
- Otherwise: writes Markdown table to stdout

## Concurrency Model

### Parallel Operations

The following operations run concurrently:
- **Evaluator agent execution**: All evaluators run in parallel via `Promise.allSettled()`
  - Each evaluator's LLM call runs independently
  - Results are collected as they complete

### Sequential Operations

The following operations run sequentially:
- Configuration loading → Agent building → Debate loading → Evaluation → Aggregation → Output
- File system reads (config, debate, prompts)
- Result parsing and score extraction (though this could be parallelized)

### Error Isolation

- Individual evaluator failures are isolated using `Promise.allSettled()`
- Failed evaluators are logged and skipped
- Aggregation continues with successful evaluators only
- Missing or invalid scores are logged and skipped per-agent

## Performance Considerations

### Latency

- **Total latency**: Dominated by the slowest evaluator (due to parallel execution)
- **Per-evaluator latency**: LLM API call time + network latency
- **Parsing latency**: Negligible (JSON parsing is fast)
- **Aggregation latency**: Negligible (simple arithmetic)

### Token Usage

- **Per-evaluator tokens**: Depends on prompt size and model response
- **Total tokens**: Sum of all evaluator token usage
- **Prompt size**: Includes problem statement, clarifications (if any), and final solution

### Scalability

- **Parallel execution**: All evaluators run concurrently, so adding more evaluators doesn't linearly increase total time
- **Bottleneck**: Slowest evaluator determines total execution time
- **Cost**: Grows linearly with number of evaluators (each makes one LLM call)

## Extension Points

The architecture supports extension through:

1. **New Providers**: Implement the `LLMProvider` interface for other LLM services (see the sketch after this list)
2. **Custom Prompts**: Specify custom system and user prompts via the configuration file
3. **Additional Metrics**: Extend the `ParsedEvaluation` interface and aggregation logic
4. **Alternative Output Formats**: Extend `writeEvaluationResults()` to support additional formats
5. **Custom Score Ranges**: Modify `clampScoreToRange()` to support different score ranges
6. **Enhanced Validation**: Add custom validation logic in `validateAndParseEvaluatorResult()`
7. **Progress Tracking**: Add progress hooks similar to the debate orchestrator for real-time feedback

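A sketch of extension point 1: a hypothetical provider implementing the same `complete()` contract used earlier in this document. The interface shape, endpoint, and response format are assumptions; the real interface lives in `src/providers/llm-provider.ts` and may differ in detail:

```typescript
interface CompletionRequest { model: string; temperature: number; systemPrompt: string; userPrompt: string; }
interface CompletionResponse { text: string; usage?: { inputTokens?: number; outputTokens?: number; totalTokens?: number }; }
interface LLMProvider { complete(req: CompletionRequest): Promise<CompletionResponse>; }

// Hypothetical provider backed by some other HTTP chat API (URL is a placeholder).
class MyCustomProvider implements LLMProvider {
  constructor(private apiKey: string, private baseUrl = 'https://example.invalid/v1') {}

  async complete(req: CompletionRequest): Promise<CompletionResponse> {
    const res = await fetch(`${this.baseUrl}/chat`, {
      method: 'POST',
      headers: { Authorization: `Bearer ${this.apiKey}`, 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model: req.model,
        temperature: req.temperature,
        messages: [
          { role: 'system', content: req.systemPrompt },
          { role: 'user', content: req.userPrompt },
        ],
      }),
    });
    const data = (await res.json()) as { text?: string };
    return { text: data.text ?? '' };
  }
}
```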