dialectic 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.cursor/commands/setup-test.mdc +175 -0
- package/.cursor/rules/basic-code-cleanup.mdc +1110 -0
- package/.cursor/rules/riper5.mdc +96 -0
- package/.env.example +6 -0
- package/AGENTS.md +1052 -0
- package/LICENSE +21 -0
- package/README.md +93 -0
- package/WARP.md +113 -0
- package/dialectic-1.0.0.tgz +0 -0
- package/dialectic.js +10 -0
- package/docs/commands.md +375 -0
- package/docs/configuration.md +882 -0
- package/docs/context_summarization.md +1023 -0
- package/docs/debate_flow.md +1127 -0
- package/docs/eval_flow.md +795 -0
- package/docs/evaluator.md +141 -0
- package/examples/debate-config-openrouter.json +48 -0
- package/examples/debate_config1.json +48 -0
- package/examples/eval/eval1/eval_config1.json +13 -0
- package/examples/eval/eval1/result1.json +62 -0
- package/examples/eval/eval1/result2.json +97 -0
- package/examples/eval_summary_format.md +11 -0
- package/examples/example3/debate-config.json +64 -0
- package/examples/example3/eval_config2.json +25 -0
- package/examples/example3/problem.md +17 -0
- package/examples/example3/rounds_test/eval_run.sh +16 -0
- package/examples/example3/rounds_test/run_test.sh +16 -0
- package/examples/kata1/architect-only-solution_2-rounds.json +121 -0
- package/examples/kata1/architect-perf-solution_2-rounds.json +234 -0
- package/examples/kata1/debate-config-kata1.json +54 -0
- package/examples/kata1/eval_architect-only_2-rounds.json +97 -0
- package/examples/kata1/eval_architect-perf_2-rounds.json +97 -0
- package/examples/kata1/kata1-report.md +12224 -0
- package/examples/kata1/kata1-report_temps-01_01_01_07.md +2451 -0
- package/examples/kata1/kata1.md +5 -0
- package/examples/kata1/meta.txt +1 -0
- package/examples/kata2/debate-config.json +54 -0
- package/examples/kata2/eval_config1.json +21 -0
- package/examples/kata2/eval_config2.json +25 -0
- package/examples/kata2/kata2.md +5 -0
- package/examples/kata2/only_architect/debate-config.json +45 -0
- package/examples/kata2/only_architect/eval_run.sh +11 -0
- package/examples/kata2/only_architect/run_test.sh +5 -0
- package/examples/kata2/rounds_test/eval_run.sh +11 -0
- package/examples/kata2/rounds_test/run_test.sh +5 -0
- package/examples/kata2/summary_length_test/eval_run.sh +11 -0
- package/examples/kata2/summary_length_test/eval_run_w_clarify.sh +7 -0
- package/examples/kata2/summary_length_test/run_test.sh +5 -0
- package/examples/task-queue/debate-config.json +76 -0
- package/examples/task-queue/debate_report.md +566 -0
- package/examples/task-queue/task-queue-system.md +25 -0
- package/jest.config.ts +13 -0
- package/multi_agent_debate_spec.md +2980 -0
- package/package.json +38 -0
- package/sanity-check-problem.txt +9 -0
- package/src/agents/prompts/architect-prompts.ts +203 -0
- package/src/agents/prompts/generalist-prompts.ts +157 -0
- package/src/agents/prompts/index.ts +41 -0
- package/src/agents/prompts/judge-prompts.ts +19 -0
- package/src/agents/prompts/kiss-prompts.ts +230 -0
- package/src/agents/prompts/performance-prompts.ts +142 -0
- package/src/agents/prompts/prompt-types.ts +68 -0
- package/src/agents/prompts/security-prompts.ts +149 -0
- package/src/agents/prompts/shared.ts +144 -0
- package/src/agents/prompts/testing-prompts.ts +149 -0
- package/src/agents/role-based-agent.ts +386 -0
- package/src/cli/commands/debate.ts +761 -0
- package/src/cli/commands/eval.ts +475 -0
- package/src/cli/commands/report.ts +265 -0
- package/src/cli/index.ts +79 -0
- package/src/core/agent.ts +198 -0
- package/src/core/clarifications.ts +34 -0
- package/src/core/judge.ts +257 -0
- package/src/core/orchestrator.ts +432 -0
- package/src/core/state-manager.ts +322 -0
- package/src/eval/evaluator-agent.ts +130 -0
- package/src/eval/prompts/system.md +41 -0
- package/src/eval/prompts/user.md +64 -0
- package/src/providers/llm-provider.ts +25 -0
- package/src/providers/openai-provider.ts +84 -0
- package/src/providers/openrouter-provider.ts +122 -0
- package/src/providers/provider-factory.ts +64 -0
- package/src/types/agent.types.ts +141 -0
- package/src/types/config.types.ts +47 -0
- package/src/types/debate.types.ts +237 -0
- package/src/types/eval.types.ts +85 -0
- package/src/utils/common.ts +104 -0
- package/src/utils/context-formatter.ts +102 -0
- package/src/utils/context-summarizer.ts +143 -0
- package/src/utils/env-loader.ts +46 -0
- package/src/utils/exit-codes.ts +5 -0
- package/src/utils/id.ts +11 -0
- package/src/utils/logger.ts +48 -0
- package/src/utils/paths.ts +10 -0
- package/src/utils/progress-ui.ts +313 -0
- package/src/utils/prompt-loader.ts +79 -0
- package/src/utils/report-generator.ts +301 -0
- package/tests/clarifications.spec.ts +128 -0
- package/tests/cli.debate.spec.ts +144 -0
- package/tests/config-loading.spec.ts +206 -0
- package/tests/context-summarizer.spec.ts +131 -0
- package/tests/debate-config-custom.json +38 -0
- package/tests/env-loader.spec.ts +149 -0
- package/tests/eval.command.spec.ts +1191 -0
- package/tests/logger.spec.ts +19 -0
- package/tests/openai-provider.spec.ts +26 -0
- package/tests/openrouter-provider.spec.ts +279 -0
- package/tests/orchestrator-summary.spec.ts +386 -0
- package/tests/orchestrator.spec.ts +207 -0
- package/tests/prompt-loader.spec.ts +52 -0
- package/tests/prompts/architect.md +16 -0
- package/tests/provider-factory.spec.ts +150 -0
- package/tests/report.command.spec.ts +546 -0
- package/tests/role-based-agent-summary.spec.ts +476 -0
- package/tests/security-agent.spec.ts +221 -0
- package/tests/shared-prompts.spec.ts +318 -0
- package/tests/state-manager.spec.ts +251 -0
- package/tests/summary-prompts.spec.ts +153 -0
- package/tsconfig.json +49 -0
@@ -0,0 +1,795 @@

# Evaluation Flow Documentation

This document provides a detailed explanation of the execution flow for the eval command, from CLI invocation through to final output.

## Overview

The evaluation system uses multiple evaluator agents to assess completed debates. The flow involves loading evaluator configuration, instantiating evaluator agents, loading and validating debate state, running all evaluators in parallel, parsing and aggregating their results, and outputting the aggregated evaluation in JSON or Markdown format.

## Sequence Diagram

The following diagram illustrates the complete flow of an evaluation execution:

```mermaid
sequenceDiagram
    participant CLI as CLI Entry Point
    participant Cmd as evalCommand
    participant Env as loadEnvironmentFile
    participant Config as loadEvaluatorConfig
    participant Builder as buildEvaluatorAgents
    participant Prompt as resolvePrompt
    participant Provider as ProviderFactory
    participant EA as EvaluatorAgent
    participant Debate as loadAndValidateDebateState
    participant FS as File System
    participant Parser as parseFirstJsonObject
    participant Agg as Aggregate Scores
    participant Output as writeEvaluationResults

    CLI->>Cmd: runCli(argv)
    Cmd->>Cmd: parse arguments
    Cmd->>Env: loadEnvironmentFile(envFilePath, verbose)
    Env->>FS: read .env file (if exists)
    Env-->>Cmd: environment variables loaded

    Cmd->>Config: loadAndValidateEnabledAgents(configPath)
    Config->>FS: read evaluator config JSON
    Config->>Config: validate agents array exists and non-empty
    Config->>Config: filter enabled agents (enabled !== false)
    Config->>Config: validate at least one enabled agent
    Config-->>Cmd: { enabledAgents, configDir }

    Cmd->>Builder: buildEvaluatorAgents(enabledAgents, configDir, verbose)
    loop For each enabled agent
        Builder->>Prompt: resolvePrompt(systemPromptPath, configDir)
        alt systemPromptPath provided
            Prompt->>FS: read system prompt file (UTF-8)
            Prompt-->>Builder: { text, source: 'file', absPath }
        else no systemPromptPath
            Prompt->>Prompt: readBuiltInPrompt('eval/prompts/system.md')
            Prompt-->>Builder: { text, source: 'built-in' }
        end

        Builder->>Prompt: resolvePrompt(userPromptPath, configDir)
        alt userPromptPath provided
            Prompt->>FS: read user prompt file (UTF-8)
            Prompt-->>Builder: { text, source: 'file', absPath }
        else no userPromptPath
            Prompt->>Prompt: readBuiltInPrompt('eval/prompts/user.md')
            Prompt-->>Builder: { text, source: 'built-in' }
        end

        Builder->>Provider: createProvider(agent.provider)
        Provider-->>Builder: LLMProvider instance

        Builder->>EA: EvaluatorAgent.fromConfig(config, systemPrompt, userPrompt)
        EA->>EA: new EvaluatorAgent(config, provider, systemPrompt, userPrompt)
        EA-->>Builder: EvaluatorAgent instance
    end
    Builder-->>Cmd: evaluators[]

    Cmd->>Debate: loadAndValidateDebateState(debatePath)
    Debate->>FS: read debate JSON file
    Debate->>Debate: parse DebateState JSON
    Debate->>Debate: validate problem field exists and non-empty
    Debate->>Debate: validate finalSolution.description exists and non-empty
    Debate->>Debate: buildClarificationsMarkdown(state)
    Debate-->>Cmd: { problem, finalSolution, clarificationsMarkdown }

    Cmd->>Cmd: build inputs object { problem, clarificationsMarkdown, finalSolution }

    Cmd->>Cmd: Promise.allSettled(evaluators.map(e => e.evaluate(inputs)))
    par Parallel evaluation for all agents
        Cmd->>EA: evaluator.evaluate(inputs)
        EA->>EA: renderUserPrompt(inputs)
        EA->>EA: replace placeholders {problem}, {clarifications}, {final_solution}
        EA->>EA: provider.complete({ model, temperature: 0.1, systemPrompt, userPrompt })
        EA->>Provider: complete(request)
        Provider->>Provider: try Responses API
        alt Responses API available
            Provider->>Provider: call responses.create()
        else Fallback
            Provider->>Provider: call chat.completions.create()
        end
        Provider-->>EA: { text, usage, latency }
        EA->>EA: measure latency
        EA-->>Cmd: { id, rawText, latencyMs, usage }
    end
    Cmd-->>Cmd: PromiseSettledResult[] results

    Cmd->>Cmd: process results and aggregate scores
    loop For each result
        Cmd->>Parser: validateAndParseEvaluatorResult(result, agentId)
        alt result.status === 'fulfilled'
            Parser->>Parser: parseFirstJsonObject(rawText)
            Parser->>Parser: extract JSON object from text
            Parser-->>Cmd: ParsedEvaluation | null
        else result.status === 'rejected'
            Parser-->>Cmd: null (writes warning to stderr)
        end

        alt ParsedEvaluation valid
            Cmd->>Agg: pushIfValid(arrFc, func.score, ...)
            Cmd->>Agg: pushIfValid(arrPerf, nonf.performance_scalability.score, ...)
            Cmd->>Agg: pushIfValid(arrSec, nonf.security.score, ...)
            Cmd->>Agg: pushIfValid(arrMaint, nonf.maintainability_evolvability.score, ...)
            Cmd->>Agg: pushIfValid(arrReg, nonf.regulatory_compliance.score, ...)
            Cmd->>Agg: pushIfValid(arrTest, nonf.testability.score, ...)
            Cmd->>Agg: pushIfValid(arrOverall, overall_summary.overall_score, ...)
        end
    end

    Cmd->>Agg: calculate averages (averageOrNull) for each metric
    Agg-->>Cmd: AggregatedAverages

    Cmd->>Output: writeEvaluationResults(aggregatedAverages, perAgentResults, outputPath)
    alt outputPath ends with .json
        Output->>Output: build AggregatedJsonOutput object
        Output->>FS: write JSON file (UTF-8, 2-space indent)
    else outputPath exists but not .json
        Output->>Output: renderMarkdownTable(aggregatedAverages)
        Output->>FS: write Markdown file (UTF-8)
    else no outputPath
        Output->>Output: renderMarkdownTable(aggregatedAverages)
        Output->>CLI: write to stdout
    end

    Cmd-->>CLI: exit code 0
```

## Detailed Flow Description

### 1. CLI Entry Point

**Function**: `runCli(argv: string[])`
**Location**: `src/cli/index.ts`

The entry point for the evaluation system. This function:
- Creates a Commander program instance
- Sets program metadata (name, description, version)
- Registers the eval command via `evalCommand(program)`
- Parses command line arguments
- Handles top-level errors and maps them to exit codes

**Parameters**:
- `argv`: Array of command-line arguments (excluding node and script name)

**Returns**: Promise that resolves on success or rejects with an error containing an exit code

### 2. Command Registration

**Function**: `evalCommand(program: Command)`
**Location**: `src/cli/commands/eval.ts`

Registers the eval command and its action handler with Commander. Defines:
- Command name: `eval`
- Options: `-c, --config <path>`, `-d, --debate <path>`, `--env-file <path>`, `-v, --verbose`, `-o, --output <path>`
- Action handler that executes when the command is invoked

**Parameters**:
- `program`: Commander instance to register the command with
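A minimal sketch of this registration with Commander is shown below. It is illustrative rather than the packaged source: the option descriptions and the `runEvalAction` helper are assumptions; only the command name and flags come from the flow described above.

```typescript
import { Command } from 'commander';

// Hypothetical handler name; the real action body lives in src/cli/commands/eval.ts.
async function runEvalAction(opts: {
  config: string;
  debate: string;
  envFile?: string;
  verbose?: boolean;
  output?: string;
}): Promise<void> {
  // load env -> load config -> build agents -> load debate -> evaluate -> aggregate -> output
}

export function evalCommand(program: Command): void {
  program
    .command('eval')
    .description('Evaluate a completed debate with one or more evaluator agents')
    .option('-c, --config <path>', 'path to evaluator configuration JSON')
    .option('-d, --debate <path>', 'path to debate state JSON')
    .option('--env-file <path>', 'path to .env file')
    .option('-v, --verbose', 'verbose logging to stderr')
    .option('-o, --output <path>', 'output file (.json for JSON, anything else for Markdown)')
    .action(runEvalAction);
}
```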
### 3. Environment File Loading

**Function**: `loadEnvironmentFile(envFilePath?: string, verbose?: boolean)`
**Location**: `src/utils/env-loader.ts`

Loads environment variables from a .env file using the dotenv library.

**Parameters**:
- `envFilePath`: Optional path to .env file (default: `.env` in current working directory)
- `verbose`: Whether to output verbose logging

**Behavior**:
- If default `.env` file doesn't exist: continues silently (non-breaking), optionally warns in verbose mode
- If custom env file path specified and doesn't exist: throws error
- Uses dotenv to parse and load environment variables
- Loads API keys required for LLM providers (OPENAI_API_KEY, OPENROUTER_API_KEY)
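A minimal sketch of this behavior, assuming the dotenv package is used as described above; the warning text and error message are illustrative:

```typescript
import * as fs from 'node:fs';
import * as path from 'node:path';
import * as dotenv from 'dotenv';

export function loadEnvironmentFile(envFilePath?: string, verbose = false): void {
  const resolved = path.resolve(process.cwd(), envFilePath ?? '.env');
  if (!fs.existsSync(resolved)) {
    if (envFilePath) {
      // A custom path was requested but is missing: fail loudly.
      throw new Error(`Env file not found: ${resolved}`);
    }
    // The default .env is optional: keep going, optionally note it in verbose mode.
    if (verbose) console.error(`No .env file at ${resolved}; relying on process environment`);
    return;
  }
  // Populates process.env (e.g. OPENAI_API_KEY, OPENROUTER_API_KEY).
  dotenv.config({ path: resolved });
  if (verbose) console.error(`Loaded environment variables from ${resolved}`);
}
```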
### 4. Evaluator Configuration Loading

**Function**: `loadAndValidateEnabledAgents(configPath: string)`
**Location**: `src/cli/commands/eval.ts`

Loads evaluator configuration file, validates structure, and filters for enabled agents.

**Parameters**:
- `configPath`: Path to evaluator configuration JSON file

**Returns**: Object containing:
- `enabledAgents`: Array of enabled evaluator configurations
- `configDir`: Absolute directory path containing the configuration file

**Execution Flow**:

#### 4.1 Load Configuration File
1. Calls `loadEvaluatorConfig(configPath)`:
   - Resolves path relative to current working directory
   - Calls `readJsonFile()` to read and parse JSON
   - Validates file exists and is a regular file
   - Validates JSON is parseable
   - Validates `agents` array exists and has at least one entry
   - Maps raw agent objects to `EvaluatorConfig` format
   - Returns `{ agents, configDir }`

#### 4.2 Filter Enabled Agents
1. Filters agents using `isEnabledEvaluator()`:
   - Agent is enabled if `enabled !== false` (defaults to true if undefined)
   - Removes agents where `enabled: false`

#### 4.3 Validate Enabled Agents Exist
1. Validates at least one enabled agent remains:
   - If no enabled agents found, throws validation error with EXIT_INVALID_ARGS

**Configuration Schema**:
```json
{
  "agents": [
    {
      "id": "eval-1",
      "name": "Evaluator 1",
      "model": "gpt-4",
      "provider": "openai",
      "systemPromptPath": "./prompts/system.md",
      "userPromptPath": "./prompts/user.md",
      "timeout": 30000,
      "enabled": true
    }
  ]
}
```
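The load-filter-validate sequence in 4.1-4.3 can be condensed into a short helper. This is a sketch: the error class, messages, and the inlined JSON read are assumptions; only the function names, the `enabled !== false` rule, and the exit code come from the flow above.

```typescript
import * as fs from 'node:fs';
import * as path from 'node:path';

// Illustrative exit-code wrapper; the real constants live in src/utils/exit-codes.ts.
const EXIT_INVALID_ARGS = 2;
class CliError extends Error {
  constructor(message: string, public exitCode: number) { super(message); }
}

interface EvaluatorConfig {
  id: string;
  name: string;
  model: string;
  provider: 'openai' | 'openrouter';
  systemPromptPath?: string;
  userPromptPath?: string;
  timeout?: number;
  enabled?: boolean;
}

const isEnabledEvaluator = (a: EvaluatorConfig): boolean => a.enabled !== false;

export function loadAndValidateEnabledAgents(
  configPath: string,
): { enabledAgents: EvaluatorConfig[]; configDir: string } {
  const resolved = path.resolve(process.cwd(), configPath);
  const raw = JSON.parse(fs.readFileSync(resolved, 'utf-8')) as { agents?: EvaluatorConfig[] };
  if (!raw.agents || raw.agents.length === 0) {
    throw new CliError('Evaluator config must contain a non-empty "agents" array', EXIT_INVALID_ARGS);
  }
  const enabledAgents = raw.agents.filter(isEnabledEvaluator);
  if (enabledAgents.length === 0) {
    throw new CliError('No enabled evaluator agents in config', EXIT_INVALID_ARGS);
  }
  return { enabledAgents, configDir: path.dirname(resolved) };
}
```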
### 5. Evaluator Agent Instantiation

**Function**: `buildEvaluatorAgents(enabledAgents: EvaluatorConfig[], configDir: string, verbose: boolean)`
**Location**: `src/cli/commands/eval.ts`

Creates EvaluatorAgent instances from enabled evaluator configurations.

**Parameters**:
- `enabledAgents`: Array of enabled evaluator configurations
- `configDir`: Absolute directory path for resolving relative prompt paths
- `verbose`: Whether to log detailed information about each agent

**Returns**: Array of `EvaluatorAgent` instances

**Execution Flow**:

#### 5.1 Load Default Prompts
1. Loads built-in system prompt:
   - Calls `readBuiltInPrompt('eval/prompts/system.md', fallbackText)`
   - Attempts to read from `dist/eval/prompts/system.md` (runtime)
   - Falls back to `src/eval/prompts/system.md` (tests)
   - Uses fallback text if file unavailable

2. Loads built-in user prompt:
   - Calls `readBuiltInPrompt('eval/prompts/user.md', fallbackText)`
   - Same resolution strategy as system prompt

#### 5.2 For Each Enabled Agent
1. **Resolve System Prompt**:
   - Calls `resolvePrompt()` with agent's `systemPromptPath`:
     - If `systemPromptPath` provided: resolves relative to `configDir`, reads file (UTF-8)
     - If file missing/unreadable/empty: warns and uses built-in default
     - Returns `{ text, source, absPath? }`

2. **Resolve User Prompt**:
   - Calls `resolvePrompt()` with agent's `userPromptPath`:
     - Same resolution strategy as system prompt
     - Returns `{ text, source, absPath? }`

3. **Create Provider**:
   - Calls `createProvider(agent.provider)`:
     - Creates `OpenAIProvider` for "openai"
     - Creates `OpenRouterProvider` for "openrouter"
     - Returns `LLMProvider` instance

4. **Instantiate EvaluatorAgent**:
   - Calls `EvaluatorAgent.fromConfig(config, systemPrompt, userPrompt)`:
     - Creates provider instance
     - Constructs `EvaluatorAgent` with:
       - Configuration (id, name, model)
       - Provider instance
       - Resolved system prompt
       - Resolved user prompt template

5. **Verbose Logging** (if enabled):
   - Logs to stderr: agent ID, provider, model, system prompt source, user prompt source
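The prompt-resolution fallback in 5.2 can be sketched as follows. This is illustrative, not the shipped code: the warning text and the `builtInText` parameter are assumptions based on the behavior described above.

```typescript
import * as fs from 'node:fs';
import * as path from 'node:path';

type ResolvedPrompt = { text: string; source: 'file' | 'built-in'; absPath?: string };

export function resolvePrompt(
  promptPath: string | undefined,
  configDir: string,
  builtInText: string,
): ResolvedPrompt {
  if (!promptPath) {
    return { text: builtInText, source: 'built-in' };
  }
  const absPath = path.resolve(configDir, promptPath);
  try {
    const text = fs.readFileSync(absPath, 'utf-8');
    if (text.trim().length === 0) throw new Error('empty prompt file');
    return { text, source: 'file', absPath };
  } catch {
    // Missing, unreadable, or empty custom prompt: warn and fall back to the built-in default.
    console.error(`Warning: could not use prompt at ${absPath}; falling back to built-in prompt`);
    return { text: builtInText, source: 'built-in' };
  }
}
```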
### 6. Debate State Loading and Validation

**Function**: `loadAndValidateDebateState(debatePath: string)`
**Location**: `src/cli/commands/eval.ts`

Loads and validates a debate state file, extracting required fields for evaluation.

**Parameters**:
- `debatePath`: Path to debate state JSON file

**Returns**: Object containing:
- `problem`: Problem statement string
- `finalSolution`: Final solution description string
- `clarificationsMarkdown`: Markdown-formatted clarifications string

**Execution Flow**:

#### 6.1 Load Debate File
1. Calls `readJsonFile<DebateState>(debatePath)`:
   - Resolves path relative to current working directory
   - Validates file exists and is a regular file
   - Reads file content as UTF-8
   - Parses JSON to `DebateState` object

#### 6.2 Validate Required Fields
1. **Problem Validation**:
   - Extracts `debate.problem` and trims whitespace
   - If empty or missing, throws validation error with EXIT_INVALID_ARGS

2. **Final Solution Validation**:
   - Extracts `debate.finalSolution.description` and trims whitespace
   - If empty or missing, throws validation error with EXIT_INVALID_ARGS

#### 6.3 Build Clarifications Markdown
1. Calls `buildClarificationsMarkdown(debate)`:
   - If no clarifications: returns minimal code block separator
   - Otherwise: builds Markdown-formatted string:
     - For each agent's clarification group:
       - Adds H3 header: `### {agentName} ({role})`
       - For each question-answer pair:
         - Adds question: `Question (q1):\n\n```text\n{question}\n```\n\n`
         - Adds answer: `Answer:\n\n```text\n{answer}\n```\n\n`
   - Returns trimmed Markdown string
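A compact sketch of the clarifications formatting described in 6.3. The `AgentClarifications` field names (`agentName`, `role`, `items`, `question`, `answer`) and the exact empty-case separator are assumptions for illustration; only the output shape (H3 header per agent, fenced `text` blocks per question and answer) comes from the description above.

```typescript
// Shapes assumed for illustration; the real types live in src/types/debate.types.ts.
interface ClarificationItem { id: string; question: string; answer: string }
interface AgentClarifications { agentName: string; role: string; items: ClarificationItem[] }

const FENCE = '`'.repeat(3); // avoids embedding literal code fences in this example

export function buildClarificationsMarkdown(groups: AgentClarifications[] | undefined): string {
  if (!groups || groups.length === 0) {
    return `${FENCE}text\n${FENCE}`; // minimal code-block separator when there are no clarifications
  }
  let md = '';
  for (const group of groups) {
    md += `### ${group.agentName} (${group.role})\n\n`;
    for (const item of group.items) {
      md += `Question (${item.id}):\n\n${FENCE}text\n${item.question}\n${FENCE}\n\n`;
      md += `Answer:\n\n${FENCE}text\n${item.answer}\n${FENCE}\n\n`;
    }
  }
  return md.trim();
}
```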
### 7. Evaluation Execution

**Method**: `evaluator.evaluate(inputs: EvaluatorInputs)`
**Location**: `src/eval/evaluator-agent.ts`

Performs the evaluation using the underlying LLM provider.

**Parameters**:
- `inputs`: Object containing:
  - `problem`: Problem statement string
  - `clarificationsMarkdown`: Markdown-formatted clarifications
  - `finalSolution`: Final solution description string

**Returns**: `EvaluatorResult` containing:
- `id`: Agent identifier
- `rawText`: Raw LLM response text
- `latencyMs`: Latency in milliseconds
- `usage`: Optional token usage statistics

**Execution Flow**:

#### 7.1 Render User Prompt
1. Calls `renderUserPrompt(inputs)`:
   - Replaces `{problem}` placeholder with `inputs.problem`
   - Replaces `{clarifications}` placeholder with `inputs.clarificationsMarkdown`
   - Replaces `{final_solution}` placeholder with `inputs.finalSolution`
   - Returns rendered user prompt string
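The placeholder substitution in 7.1 amounts to a few string replacements; a minimal sketch (using split/join so every occurrence is replaced, an implementation detail assumed here):

```typescript
interface EvaluatorInputs {
  problem: string;
  clarificationsMarkdown: string;
  finalSolution: string;
}

function renderUserPrompt(template: string, inputs: EvaluatorInputs): string {
  return template
    .split('{problem}').join(inputs.problem)
    .split('{clarifications}').join(inputs.clarificationsMarkdown)
    .split('{final_solution}').join(inputs.finalSolution);
}

// Example: a user prompt template such as
//   "Problem:\n{problem}\n\nClarifications:\n{clarifications}\n\nFinal solution:\n{final_solution}"
// is rendered once per evaluator before the provider call.
```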
#### 7.2 Invoke LLM Provider
1. Records start time for latency measurement
2. Calls `provider.complete()` with:
   - `model`: Agent's configured model
   - `temperature`: Fixed at 0.1 (for near-deterministic evaluation)
   - `systemPrompt`: Resolved system prompt
   - `userPrompt`: Rendered user prompt

3. Provider execution:
   - **Primary**: Attempts Responses API (`client.responses.create()`)
   - **Fallback**: Uses Chat Completions API (`client.chat.completions.create()`)
   - Returns `CompletionResponse` with text and usage statistics

#### 7.3 Measure Latency and Build Result
1. Calculates latency: `Date.now() - started`
2. Builds `EvaluatorResult`:
   - `id`: Agent identifier
   - `rawText`: Response text from provider
   - `latencyMs`: Calculated latency
   - `usage`: Token usage (if available)

#### 7.4 Error Handling
- If provider call fails: logs error to stderr with agent ID and rethrows
- Error propagates to Promise.allSettled for graceful handling
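The Responses-API-first behavior in 7.2 can be sketched as a simple try/fallback around the OpenAI SDK. Treat this as an outline of the pattern rather than the provider's actual code: error classification and usage mapping are simplified, and the `CompletionRequest`/`CompletionResponse` shapes shown here are assumptions.

```typescript
import OpenAI from 'openai';

interface CompletionRequest { model: string; temperature: number; systemPrompt: string; userPrompt: string }
interface CompletionResponse { text: string; usage?: { totalTokens?: number } }

const client = new OpenAI(); // reads OPENAI_API_KEY from the environment

async function complete(req: CompletionRequest): Promise<CompletionResponse> {
  try {
    // Primary path: Responses API.
    const res = await client.responses.create({
      model: req.model,
      temperature: req.temperature,
      instructions: req.systemPrompt,
      input: req.userPrompt,
    });
    return { text: res.output_text ?? '' };
  } catch {
    // Fallback path: Chat Completions API.
    const res = await client.chat.completions.create({
      model: req.model,
      temperature: req.temperature,
      messages: [
        { role: 'system', content: req.systemPrompt },
        { role: 'user', content: req.userPrompt },
      ],
    });
    return {
      text: res.choices[0]?.message?.content ?? '',
      usage: { totalTokens: res.usage?.total_tokens },
    };
  }
}
```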
### 8. Parallel Evaluation Execution

**Location**: `src/cli/commands/eval.ts` (action handler)

Runs all evaluator agents in parallel and collects results.

**Execution Flow**:

#### 8.1 Build Inputs Object
1. Creates `inputs` object:
```typescript
{
  problem,
  clarificationsMarkdown,
  finalSolution
}
```

#### 8.2 Execute All Evaluators
1. Calls `Promise.allSettled(evaluators.map(e => e.evaluate(inputs)))`:
   - Maps each evaluator to its `evaluate()` promise
   - Uses `allSettled` to handle failures gracefully (doesn't fail fast)
   - All evaluations run concurrently
   - Returns array of `PromiseSettledResult` objects
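The fan-out/fan-in step in 8.2 looks roughly like this; the variable names and the `id` accessor on the agent are illustrative:

```typescript
// Each evaluator call is independent, so the slowest agent determines total wall-clock time.
const settled = await Promise.allSettled(evaluators.map((e) => e.evaluate(inputs)));

settled.forEach((result, i) => {
  const agentId = evaluators[i].id;
  if (result.status === 'rejected') {
    console.error(`[${agentId}] Skipped due to error`);
    return;
  }
  // result.value is an EvaluatorResult; its rawText is parsed and aggregated below.
});
```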
#### 8.3 Process Results
1. Iterates through `PromiseSettledResult[]`:
   - For each result:
     - Calls `validateAndParseEvaluatorResult(result, agentId)`
     - If result is rejected: logs warning, returns null
     - If result is fulfilled: parses JSON from raw text, returns `ParsedEvaluation | null`

### 9. Result Parsing and Validation

**Function**: `validateAndParseEvaluatorResult(result: PromiseSettledResult<any>, agentId: string)`
**Location**: `src/cli/commands/eval.ts`

Validates and parses an evaluator agent's result from a Promise.allSettled outcome.

**Parameters**:
- `result`: The settled promise result from an evaluator agent
- `agentId`: Agent identifier for warning messages

**Returns**: `ParsedEvaluation | null`

**Execution Flow**:

#### 9.1 Check Promise Status
1. If `result.status !== 'fulfilled'`:
   - Writes warning to stderr: `[agentId] Skipped due to error`
   - Returns null

#### 9.2 Extract Raw Text
1. Extracts `result.value.rawText` (empty string if missing)

#### 9.3 Parse JSON
1. Calls `parseFirstJsonObject(rawText)`:
   - Searches for first JSON object in text (between first `{` and matching `}`)
   - If no match found, attempts to parse entire string
   - If parsing fails, returns null

2. If parsing returns null:
   - Writes warning to stderr: `[agentId] Invalid JSON output; skipping agent`
   - Returns null

3. Returns parsed object as `ParsedEvaluation`
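A sketch of the extraction strategy in 9.3: find the first `{`, scan to the brace that closes it, and fall back to parsing the whole string. The real helper may handle braces inside string literals more carefully; this version only illustrates the lenient-parse idea.

```typescript
function parseFirstJsonObject(rawText: string): unknown | null {
  const start = rawText.indexOf('{');
  if (start >= 0) {
    // Scan forward to the brace that closes the first object.
    let depth = 0;
    for (let i = start; i < rawText.length; i++) {
      if (rawText[i] === '{') depth++;
      else if (rawText[i] === '}') {
        depth--;
        if (depth === 0) {
          try {
            return JSON.parse(rawText.slice(start, i + 1));
          } catch {
            break; // fall through to the whole-string attempt
          }
        }
      }
    }
  }
  try {
    return JSON.parse(rawText); // last resort: maybe the whole response is JSON
  } catch {
    return null;
  }
}
```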
### 10. Score Aggregation

**Location**: `src/cli/commands/eval.ts` (action handler)

Aggregates scores from all evaluators and calculates averages.

**Execution Flow**:

#### 10.1 Initialize Score Arrays
1. Creates arrays for each metric:
   - `arrFc`: Functional completeness scores
   - `arrPerf`: Performance & scalability scores
   - `arrSec`: Security scores
   - `arrMaint`: Maintainability & evolvability scores
   - `arrReg`: Regulatory compliance scores
   - `arrTest`: Testability scores
   - `arrOverall`: Overall scores

#### 10.2 Extract and Validate Scores
For each parsed evaluation result:

1. **Extract Scores**:
   - `functional_completeness.score` from `parsed.evaluation.functional_completeness`
   - `performance_scalability.score` from `parsed.evaluation.non_functional.performance_scalability`
   - `security.score` from `parsed.evaluation.non_functional.security`
   - `maintainability_evolvability.score` from `parsed.evaluation.non_functional.maintainability_evolvability`
   - `regulatory_compliance.score` from `parsed.evaluation.non_functional.regulatory_compliance`
   - `testability.score` from `parsed.evaluation.non_functional.testability`
   - `overall_score` from `parsed.overall_summary.overall_score`

2. **Validate and Push Scores**:
   - Calls `pushIfValid(array, value, label, agentId)` for each score:
     - Validates value is a finite number using `numOrUndefined()`
     - If invalid: writes warning to stderr, skips
     - Clamps value to range [1, 10] using `clampScoreToRange()`
     - If clamped: writes warning to stderr with original value
     - Appends clamped value to array

#### 10.3 Calculate Averages
1. Calculates average for each metric using `averageOrNull()`:
   - If array is empty: returns `null` (N/A)
   - Otherwise: calculates average and returns as number
   - Creates `AggregatedAverages` object:
```typescript
{
  functional_completeness: number | null,
  performance_scalability: number | null,
  security: number | null,
  maintainability_evolvability: number | null,
  regulatory_compliance: number | null,
  testability: number | null,
  overall_score: number | null
}
```
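The validation, clamping, and averaging helpers described in 10.2-10.3 reduce to a few small functions; a sketch (warning wording is illustrative):

```typescript
function numOrUndefined(value: unknown): number | undefined {
  return typeof value === 'number' && Number.isFinite(value) ? value : undefined;
}

function clampScoreToRange(value: number, min = 1, max = 10): number {
  return Math.min(max, Math.max(min, value));
}

function pushIfValid(target: number[], value: unknown, label: string, agentId: string): void {
  const n = numOrUndefined(value);
  if (n === undefined) {
    console.error(`[${agentId}] Missing or invalid ${label}; skipping`);
    return;
  }
  const clamped = clampScoreToRange(n);
  if (clamped !== n) console.error(`[${agentId}] ${label} ${n} clamped to ${clamped}`);
  target.push(clamped);
}

function averageOrNull(values: number[]): number | null {
  if (values.length === 0) return null; // rendered as "N/A"
  return values.reduce((sum, v) => sum + v, 0) / values.length;
}
```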
### 11. Result Output

**Function**: `writeEvaluationResults(aggregatedAverages, perAgentResults, outputPath)`
**Location**: `src/cli/commands/eval.ts`

Writes evaluation results to a file or stdout in JSON or Markdown format.

**Parameters**:
- `aggregatedAverages`: Aggregated average scores for all metrics
- `perAgentResults`: Per-agent parsed evaluation results, keyed by agent ID
- `outputPath`: Optional output file path

**Execution Flow**:

#### 11.1 Resolve Output Path
1. If `outputPath` provided: resolves relative to current working directory
2. Otherwise: `undefined` (write to stdout)

#### 11.2 JSON Output Format
If resolved path ends with `.json`:

1. Builds `AggregatedJsonOutput` object:
```typescript
{
  evaluation: {
    functional_completeness: { average_score: number | null },
    non_functional: {
      performance_scalability: { average_score: number | null },
      security: { average_score: number | null },
      maintainability_evolvability: { average_score: number | null },
      regulatory_compliance: { average_score: number | null },
      testability: { average_score: number | null }
    }
  },
  overall_score: number | null,
  agents: Record<string, ParsedEvaluation>
}
```

2. Writes JSON file:
   - Formats with 2-space indent
   - Encodes as UTF-8
   - Writes to resolved path

#### 11.3 Markdown Output Format
Otherwise (path doesn't end with `.json` or no path):

1. Calls `renderMarkdownTable(aggregatedAverages)`:
   - Formats each score to 2 decimal places (or "N/A" if null)
   - Builds Markdown table:
```markdown
| Functional Completeness | Performance & Scalability | Security | Maintainability & Evolvability | Regulatory Compliance | Testability | Overall Score |
|-------------------------|---------------------------|----------|--------------------------------|-----------------------|-------------|---------------|
| 8.50                    | 7.25                      | 9.00     | 8.00                           | N/A                   | 7.75        | 8.10          |
```

2. Writes output:
   - If `outputPath` provided: writes Markdown file (UTF-8)
   - Otherwise: writes to stdout
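The table rendering in 11.3 is a straightforward formatting helper; a sketch, with header labels taken from the example above and the helper name from the flow description:

```typescript
interface AggregatedAverages {
  functional_completeness: number | null;
  performance_scalability: number | null;
  security: number | null;
  maintainability_evolvability: number | null;
  regulatory_compliance: number | null;
  testability: number | null;
  overall_score: number | null;
}

function renderMarkdownTable(avg: AggregatedAverages): string {
  const fmt = (v: number | null) => (v === null ? 'N/A' : v.toFixed(2));
  const header =
    '| Functional Completeness | Performance & Scalability | Security | Maintainability & Evolvability | Regulatory Compliance | Testability | Overall Score |';
  const divider = '|---|---|---|---|---|---|---|';
  const cells = [
    avg.functional_completeness,
    avg.performance_scalability,
    avg.security,
    avg.maintainability_evolvability,
    avg.regulatory_compliance,
    avg.testability,
    avg.overall_score,
  ].map(fmt);
  const row = `| ${cells.join(' | ')} |`;
  return [header, divider, row].join('\n');
}
```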
### 12. Error Handling

The system uses structured error handling with exit codes:

**Exit Codes**:
- `0`: Success
- `1`: General error (EXIT_GENERAL_ERROR)
- `2`: Invalid arguments (EXIT_INVALID_ARGS)
- `4`: Configuration error (EXIT_CONFIG_ERROR)

**Error Flow**:
1. Errors thrown in action handler are caught
2. Exit code extracted from error object if present, otherwise defaults to EXIT_GENERAL_ERROR
3. Error message written to stderr
4. Error re-thrown to top-level CLI handler
5. Top-level handler extracts code and calls `process.exit(code)`

**Validation Errors**:
- Missing or invalid config file: EXIT_INVALID_ARGS
- No enabled evaluator agents: EXIT_INVALID_ARGS
- Missing or invalid debate file: EXIT_INVALID_ARGS
- Missing problem or final solution: EXIT_INVALID_ARGS
- Missing API keys: EXIT_CONFIG_ERROR (handled by provider factory)

**Evaluation Errors**:
- Individual evaluator failures: logged to stderr, agent skipped, evaluation continues
- Invalid JSON output: logged to stderr, agent skipped, evaluation continues
- Missing scores: logged to stderr, score skipped, aggregation continues
## Key Data Structures

### EvaluatorConfig
Represents an evaluator agent configuration:
- `id`: Unique identifier (string)
- `name`: Human-readable name (string)
- `model`: LLM model name (string)
- `provider`: Provider type ("openai" or "openrouter")
- `systemPromptPath`: Optional path to system prompt file
- `userPromptPath`: Optional path to user prompt file
- `timeout`: Optional timeout in milliseconds (currently ignored)
- `enabled`: Optional boolean (defaults to true if undefined)

### EvaluatorInputs
Inputs passed to each evaluator agent:
- `problem`: Problem statement string
- `clarificationsMarkdown`: Markdown-formatted clarifications string
- `finalSolution`: Final solution description string

### EvaluatorResult
Result from a single evaluator agent:
- `id`: Agent identifier
- `rawText`: Raw LLM response text
- `latencyMs`: Latency in milliseconds
- `usage`: Optional token usage statistics (inputTokens, outputTokens, totalTokens)

### ParsedEvaluation
Parsed evaluation output from an evaluator agent:
- `evaluation`: Optional evaluation object containing:
  - `functional_completeness`: Optional object with `score` (number) and `reasoning` (string)
  - `non_functional`: Optional object containing:
    - `performance_scalability`: Optional object with `score` and `reasoning`
    - `security`: Optional object with `score` and `reasoning`
    - `maintainability_evolvability`: Optional object with `score` and `reasoning`
    - `regulatory_compliance`: Optional object with `score` and `reasoning`
    - `testability`: Optional object with `score` and `reasoning`
- `overall_summary`: Optional object with:
  - `strengths`: Optional string
  - `weaknesses`: Optional string
  - `overall_score`: Optional number
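As a TypeScript shape, the ParsedEvaluation structure above corresponds to roughly the following interface. This is a sketch assembled from the list above; the real definition in the eval command or types may differ in detail.

```typescript
interface ScoredCriterion {
  score?: number;      // expected range 1-10; clamped during aggregation
  reasoning?: string;
}

interface ParsedEvaluation {
  evaluation?: {
    functional_completeness?: ScoredCriterion;
    non_functional?: {
      performance_scalability?: ScoredCriterion;
      security?: ScoredCriterion;
      maintainability_evolvability?: ScoredCriterion;
      regulatory_compliance?: ScoredCriterion;
      testability?: ScoredCriterion;
    };
  };
  overall_summary?: {
    strengths?: string;
    weaknesses?: string;
    overall_score?: number;
  };
}
```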
### AggregatedAverages
Aggregated average scores across all evaluators:
- `functional_completeness`: number | null
- `performance_scalability`: number | null
- `security`: number | null
- `maintainability_evolvability`: number | null
- `regulatory_compliance`: number | null
- `testability`: number | null
- `overall_score`: number | null

### AggregatedJsonOutput
Complete JSON output format:
- `evaluation`: Object with functional and non-functional metrics
- `overall_score`: Overall average score
- `agents`: Record mapping agent IDs to their `ParsedEvaluation` objects

## File System Interactions

### Configuration File Loading

**File Format**: JSON file with evaluator agent configurations

**Location**: Specified via `--config <path>` option

**Validation**:
- File must exist and be a regular file
- Must contain valid JSON
- Must have `agents` array with at least one entry
- At least one agent must be enabled (`enabled !== false`)

**Error Handling**:
- Missing file: EXIT_INVALID_ARGS
- Invalid JSON: EXIT_INVALID_ARGS
- Missing or empty agents array: EXIT_INVALID_ARGS
- No enabled agents: EXIT_INVALID_ARGS

### Debate State File Loading

**File Format**: JSON file with `DebateState` structure

**Location**: Specified via `--debate <path>` option

**Required Fields**:
- `problem`: Non-empty string
- `finalSolution.description`: Non-empty string

**Optional Fields**:
- `clarifications`: Array of `AgentClarifications` objects

**Validation**:
- File must exist and be a regular file
- Must contain valid JSON
- Problem field must exist and be non-empty after trimming
- Final solution description must exist and be non-empty after trimming

**Error Handling**:
- Missing file: EXIT_INVALID_ARGS
- Invalid JSON: EXIT_INVALID_ARGS
- Missing or empty problem: EXIT_INVALID_ARGS
- Missing or empty final solution: EXIT_INVALID_ARGS

### Prompt File Loading

**System Prompt**:
- Default: `src/eval/prompts/system.md` (or `dist/eval/prompts/system.md` at runtime)
- Custom: Path specified in `EvaluatorConfig.systemPromptPath`, resolved relative to config directory

**User Prompt**:
- Default: `src/eval/prompts/user.md` (or `dist/eval/prompts/user.md` at runtime)
- Custom: Path specified in `EvaluatorConfig.userPromptPath`, resolved relative to config directory

**Error Handling**:
- Missing custom prompt file: Warning to stderr, falls back to built-in default
- Empty custom prompt file: Warning to stderr, falls back to built-in default
- Unreadable custom prompt file: Warning to stderr, falls back to built-in default

### Output File Writing

**JSON Output**:
- Format: Pretty-printed JSON with 2-space indent
- Encoding: UTF-8
- Content: Complete `AggregatedJsonOutput` object

**Markdown Output**:
- Format: Markdown table with aggregated scores
- Encoding: UTF-8
- Content: Single table row with formatted scores

**Output Location**:
- If `--output` specified: writes to file at resolved path
- Otherwise: writes Markdown table to stdout
## Concurrency Model

### Parallel Operations

The following operations run concurrently:
- **Evaluator agent execution**: All evaluators run in parallel via `Promise.allSettled()`
- Each evaluator's LLM call runs independently
- Results are collected as they complete

### Sequential Operations

The following operations run sequentially:
- Configuration loading → Agent building → Debate loading → Evaluation → Aggregation → Output
- File system reads (config, debate, prompts)
- Result parsing and score extraction (though this could be parallelized)

### Error Isolation

- Individual evaluator failures are isolated using `Promise.allSettled()`
- Failed evaluators are logged and skipped
- Aggregation continues with successful evaluators only
- Missing or invalid scores are logged and skipped per-agent

## Performance Considerations

### Latency

- **Total latency**: Dominated by slowest evaluator (due to parallel execution)
- **Per-evaluator latency**: LLM API call time + network latency
- **Parsing latency**: Negligible (JSON parsing is fast)
- **Aggregation latency**: Negligible (simple arithmetic)

### Token Usage

- **Per-evaluator tokens**: Depends on prompt size and model response
- **Total tokens**: Sum of all evaluator token usage
- **Prompt size**: Includes problem statement, clarifications (if any), and final solution

### Scalability

- **Parallel execution**: All evaluators run concurrently, so adding more evaluators doesn't linearly increase total time
- **Bottleneck**: Slowest evaluator determines total execution time
- **Cost**: Grows linearly with number of evaluators (each makes one LLM call)

## Extension Points

The architecture supports extension through:

1. **New Providers**: Implement `LLMProvider` interface for other LLM services
2. **Custom Prompts**: Specify custom system and user prompts via configuration file
3. **Additional Metrics**: Extend `ParsedEvaluation` interface and aggregation logic
4. **Alternative Output Formats**: Extend `writeEvaluationResults()` to support additional formats
5. **Custom Score Ranges**: Modify `clampScoreToRange()` to support different score ranges
6. **Enhanced Validation**: Add custom validation logic in `validateAndParseEvaluatorResult()`
7. **Progress Tracking**: Add progress hooks similar to debate orchestrator for real-time feedback