dialectic 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/.cursor/commands/setup-test.mdc +175 -0
  2. package/.cursor/rules/basic-code-cleanup.mdc +1110 -0
  3. package/.cursor/rules/riper5.mdc +96 -0
  4. package/.env.example +6 -0
  5. package/AGENTS.md +1052 -0
  6. package/LICENSE +21 -0
  7. package/README.md +93 -0
  8. package/WARP.md +113 -0
  9. package/dialectic-1.0.0.tgz +0 -0
  10. package/dialectic.js +10 -0
  11. package/docs/commands.md +375 -0
  12. package/docs/configuration.md +882 -0
  13. package/docs/context_summarization.md +1023 -0
  14. package/docs/debate_flow.md +1127 -0
  15. package/docs/eval_flow.md +795 -0
  16. package/docs/evaluator.md +141 -0
  17. package/examples/debate-config-openrouter.json +48 -0
  18. package/examples/debate_config1.json +48 -0
  19. package/examples/eval/eval1/eval_config1.json +13 -0
  20. package/examples/eval/eval1/result1.json +62 -0
  21. package/examples/eval/eval1/result2.json +97 -0
  22. package/examples/eval_summary_format.md +11 -0
  23. package/examples/example3/debate-config.json +64 -0
  24. package/examples/example3/eval_config2.json +25 -0
  25. package/examples/example3/problem.md +17 -0
  26. package/examples/example3/rounds_test/eval_run.sh +16 -0
  27. package/examples/example3/rounds_test/run_test.sh +16 -0
  28. package/examples/kata1/architect-only-solution_2-rounds.json +121 -0
  29. package/examples/kata1/architect-perf-solution_2-rounds.json +234 -0
  30. package/examples/kata1/debate-config-kata1.json +54 -0
  31. package/examples/kata1/eval_architect-only_2-rounds.json +97 -0
  32. package/examples/kata1/eval_architect-perf_2-rounds.json +97 -0
  33. package/examples/kata1/kata1-report.md +12224 -0
  34. package/examples/kata1/kata1-report_temps-01_01_01_07.md +2451 -0
  35. package/examples/kata1/kata1.md +5 -0
  36. package/examples/kata1/meta.txt +1 -0
  37. package/examples/kata2/debate-config.json +54 -0
  38. package/examples/kata2/eval_config1.json +21 -0
  39. package/examples/kata2/eval_config2.json +25 -0
  40. package/examples/kata2/kata2.md +5 -0
  41. package/examples/kata2/only_architect/debate-config.json +45 -0
  42. package/examples/kata2/only_architect/eval_run.sh +11 -0
  43. package/examples/kata2/only_architect/run_test.sh +5 -0
  44. package/examples/kata2/rounds_test/eval_run.sh +11 -0
  45. package/examples/kata2/rounds_test/run_test.sh +5 -0
  46. package/examples/kata2/summary_length_test/eval_run.sh +11 -0
  47. package/examples/kata2/summary_length_test/eval_run_w_clarify.sh +7 -0
  48. package/examples/kata2/summary_length_test/run_test.sh +5 -0
  49. package/examples/task-queue/debate-config.json +76 -0
  50. package/examples/task-queue/debate_report.md +566 -0
  51. package/examples/task-queue/task-queue-system.md +25 -0
  52. package/jest.config.ts +13 -0
  53. package/multi_agent_debate_spec.md +2980 -0
  54. package/package.json +38 -0
  55. package/sanity-check-problem.txt +9 -0
  56. package/src/agents/prompts/architect-prompts.ts +203 -0
  57. package/src/agents/prompts/generalist-prompts.ts +157 -0
  58. package/src/agents/prompts/index.ts +41 -0
  59. package/src/agents/prompts/judge-prompts.ts +19 -0
  60. package/src/agents/prompts/kiss-prompts.ts +230 -0
  61. package/src/agents/prompts/performance-prompts.ts +142 -0
  62. package/src/agents/prompts/prompt-types.ts +68 -0
  63. package/src/agents/prompts/security-prompts.ts +149 -0
  64. package/src/agents/prompts/shared.ts +144 -0
  65. package/src/agents/prompts/testing-prompts.ts +149 -0
  66. package/src/agents/role-based-agent.ts +386 -0
  67. package/src/cli/commands/debate.ts +761 -0
  68. package/src/cli/commands/eval.ts +475 -0
  69. package/src/cli/commands/report.ts +265 -0
  70. package/src/cli/index.ts +79 -0
  71. package/src/core/agent.ts +198 -0
  72. package/src/core/clarifications.ts +34 -0
  73. package/src/core/judge.ts +257 -0
  74. package/src/core/orchestrator.ts +432 -0
  75. package/src/core/state-manager.ts +322 -0
  76. package/src/eval/evaluator-agent.ts +130 -0
  77. package/src/eval/prompts/system.md +41 -0
  78. package/src/eval/prompts/user.md +64 -0
  79. package/src/providers/llm-provider.ts +25 -0
  80. package/src/providers/openai-provider.ts +84 -0
  81. package/src/providers/openrouter-provider.ts +122 -0
  82. package/src/providers/provider-factory.ts +64 -0
  83. package/src/types/agent.types.ts +141 -0
  84. package/src/types/config.types.ts +47 -0
  85. package/src/types/debate.types.ts +237 -0
  86. package/src/types/eval.types.ts +85 -0
  87. package/src/utils/common.ts +104 -0
  88. package/src/utils/context-formatter.ts +102 -0
  89. package/src/utils/context-summarizer.ts +143 -0
  90. package/src/utils/env-loader.ts +46 -0
  91. package/src/utils/exit-codes.ts +5 -0
  92. package/src/utils/id.ts +11 -0
  93. package/src/utils/logger.ts +48 -0
  94. package/src/utils/paths.ts +10 -0
  95. package/src/utils/progress-ui.ts +313 -0
  96. package/src/utils/prompt-loader.ts +79 -0
  97. package/src/utils/report-generator.ts +301 -0
  98. package/tests/clarifications.spec.ts +128 -0
  99. package/tests/cli.debate.spec.ts +144 -0
  100. package/tests/config-loading.spec.ts +206 -0
  101. package/tests/context-summarizer.spec.ts +131 -0
  102. package/tests/debate-config-custom.json +38 -0
  103. package/tests/env-loader.spec.ts +149 -0
  104. package/tests/eval.command.spec.ts +1191 -0
  105. package/tests/logger.spec.ts +19 -0
  106. package/tests/openai-provider.spec.ts +26 -0
  107. package/tests/openrouter-provider.spec.ts +279 -0
  108. package/tests/orchestrator-summary.spec.ts +386 -0
  109. package/tests/orchestrator.spec.ts +207 -0
  110. package/tests/prompt-loader.spec.ts +52 -0
  111. package/tests/prompts/architect.md +16 -0
  112. package/tests/provider-factory.spec.ts +150 -0
  113. package/tests/report.command.spec.ts +546 -0
  114. package/tests/role-based-agent-summary.spec.ts +476 -0
  115. package/tests/security-agent.spec.ts +221 -0
  116. package/tests/shared-prompts.spec.ts +318 -0
  117. package/tests/state-manager.spec.ts +251 -0
  118. package/tests/summary-prompts.spec.ts +153 -0
  119. package/tsconfig.json +49 -0
@@ -0,0 +1,1023 @@
1
+ # Context Summarization - Design Documentation
2
+
3
+ ## Overview
4
+
5
+ The context summarization feature manages debate history length by allowing agents to condense their perspective-based history when it grows large. This keeps prompts within LLM context window limits while preserving critical insights for multi-round debates. The judge agent also supports summarization during synthesis to manage the final round's content size.
6
+
7
+ ### Purpose
8
+
9
+ As debates progress through multiple rounds, the conversation history grows. Without summarization:
10
+ - Agents would eventually hit LLM context window limits
11
+ - Token costs would increase unnecessarily
12
+ - Irrelevant historical details would clutter the context
13
+
14
+ Context summarization solves this by letting each agent create concise summaries of their relevant history, reducing context size while preserving key information.
15
+
16
+ ---
17
+
18
+ ## Core Behavior
19
+
20
+ ### What Gets Summarized
21
+
22
+ Each agent summarizes **only their perspective** of the debate:
23
+ - **Their proposals**: All proposals made by this agent across rounds
24
+ - **Critiques they received**: Only critiques targeting this agent's proposals (not critiques of other agents)
25
+ - **Their refinements**: All refinements made by this agent in response to feedback
26
+
27
+ This perspective-based filtering ensures agents summarize only information relevant to their future contributions.
28
+
29
+ ### When Summarization Occurs
30
+
31
+ Summarization is evaluated **at the beginning of each round**, before the proposal phase, and runs only for agents whose history has reached the threshold:
32
+
33
+ 1. **Evaluation**: Each agent calculates the total character count of their perspective-based history
34
+ 2. **Decision**: If the count reaches or exceeds the configured threshold (default: 5000 characters), summarization is triggered
35
+ 3. **Generation**: The agent calls an LLM to create a concise summary (max length: 2500 characters by default) using the agent's configured model, temperature, and provider (with fallbacks)
36
+ 4. **Storage**: The summary and metadata are persisted in the debate state
37
+ 5. **Usage**: The agent uses the summary instead of full history for subsequent debate phases in this round
38
+
39
+ **Judge Summarization**: The judge also performs summarization during the synthesis phase if the final round's proposals and refinements exceed the threshold. This provides a focused view of the most recent solution attempts for synthesis.
40
+
41
+ ### How It Works
42
+
43
+ ```
44
+ Round N starts
45
+
46
+ For each agent:
47
+ └─> Calculate character count of agent's perspective
48
+ ├─> Below threshold? → Use full history (no summarization)
49
+ └─> Above threshold? → Generate summary
50
+ ├─> Filter history to agent's perspective
51
+ ├─> Call LLM with summarization prompt (using configured model/temperature/provider with fallbacks)
52
+ ├─> Store summary with metadata
53
+ └─> Use summary for this round's debate phases
54
+
55
+ Synthesis phase starts
56
+
57
+ Judge:
58
+ └─> Calculate character count of final round's proposals and refinements
59
+ ├─> Below threshold? → Use full history for synthesis
60
+ └─> Above threshold? → Generate summary (using judge-configured model/temperature/provider with fallbacks)
61
+ ├─> Filter to final round's proposals and refinements
62
+ ├─> Call LLM with judge-specific summarization prompt
63
+ ├─> Store summary in DebateState.judgeSummary
64
+ └─> Use summary for synthesis prompt
65
+ ```
66
+
67
+ ---
68
+
69
+ ## Architecture & Design Decisions
70
+
71
+ ### Design Choice 1: Hybrid Architecture
72
+
73
+ **Decision**: Combine centralized summarization strategies with agent-owned decision-making.
74
+
75
+ **Rationale**:
76
+ - **Agent Autonomy**: Each agent decides independently when to summarize based on their own history and thresholds
77
+ - **Centralized Strategy**: Summarization logic is implemented via a pluggable `ContextSummarizer` interface, allowing different strategies to be swapped in
78
+ - **Best of Both**: Agents control "when" and "what" to summarize; strategies control "how" to summarize
79
+
80
+ **Implementation**:
81
+ - `ContextSummarizer` interface defines the contract for summarization strategies
82
+ - `LengthBasedSummarizer` is the current implementation (triggers on character count threshold)
83
+ - Future strategies (semantic, hierarchical, RAG-based) can be added without changing agent code
84
+
85
+ ### Design Choice 2: Per-Round Fresh Summaries
86
+
87
+ **Decision**: Recalculate summaries from full history each round, rather than maintaining incremental summaries.
88
+
89
+ **Rationale**:
90
+ - **Simplicity**: No need to manage running summary state or handle incremental updates
91
+ - **Accuracy**: Each summary is fresh and comprehensive, avoiding accumulation of errors or omissions
92
+ - **Flexibility**: Changing summary parameters or prompts affects all future summaries immediately
93
+ - **Debugging**: Easier to understand and debug since each summary is independent
94
+
95
+ **Trade-off**: Slight redundancy in LLM calls (re-summarizing some content), but the simplicity and accuracy gains outweigh the cost.
96
+
97
+ ### Design Choice 3: Perspective-Based Filtering
98
+
99
+ **Decision**: Each agent filters history to their own perspective before summarizing.
100
+
101
+ **Rationale**:
102
+ - **Relevance**: Agents only need information relevant to their future contributions
103
+ - **Efficiency**: Smaller input to summarization LLM = faster and cheaper
104
+ - **Privacy**: Agents don't need to see critiques of other agents
105
+ - **Scalability**: Character count grows linearly with agent's own activity, not with total debate size
106
+
107
+ **Example**: In a debate with 3 agents and 5 rounds, the architect agent would summarize:
108
+ - Their 5 proposals
109
+ - Critiques they received (not critiques of performance or security agents)
110
+ - Their 5 refinements
111
+
112
+ ### Design Choice 4: Two-Level Configuration
113
+
114
+ **Decision**: Support both system-wide and per-agent configuration, with agent settings overriding system settings.
115
+
116
+ **Rationale**:
117
+ - **Convenience**: System-wide defaults apply to all agents (DRY principle)
118
+ - **Flexibility**: Individual agents can override with custom settings
119
+ - **Use Cases**:
120
+ - Architect might need higher threshold (longer context for design decisions)
121
+ - Security might need lower threshold (focus on recent threats)
122
+ - Some agents can disable summarization entirely
123
+
124
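+ **Implementation** (the `//` annotation below is explanatory only; standard JSON does not permit comments, so omit it from an actual config file):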
125
+ ```json
126
+ {
127
+ "debate": {
128
+ "summarization": {
129
+ "enabled": true,
130
+ "threshold": 5000,
131
+ "maxLength": 2500,
132
+ "method": "length-based"
133
+ }
134
+ },
135
+ "agents": [
136
+ {
137
+ "id": "agent-architect",
138
+ "summarization": {
139
+ "threshold": 8000 // Override: architect gets more context
140
+ }
141
+ }
142
+ ]
143
+ }
144
+ ```
145
+
146
+ ### Design Choice 5: Storage in Debate Rounds and State
147
+
148
+ **Decision**: Store agent summaries as a `Record<string, DebateSummary>` within each `DebateRound`, keyed by agent ID. Store judge summaries separately in `DebateState.judgeSummary`.
149
+
150
+ **Rationale**:
151
+ - **Auditability**: Full history of what was summarized, when, and by whom
152
+ - **Debugging**: Can trace exactly what context each agent saw in each round
153
+ - **Reproducibility**: Complete debate state includes all summarization decisions
154
+ - **Metadata**: Capture timing, token usage, and compression ratios for analysis
155
+ - **Agent Isolation**: Each agent's summary is easily accessible by their ID without mixing data
156
+ - **Efficient Lookup**: O(1) access to an agent's summary in any round
157
+ - **Judge Separation**: Judge summaries are stored separately since they're used for synthesis, not per-round context
158
+
159
+ **Structure**:
160
+ ```typescript
161
+ interface DebateRound {
162
+ roundNumber: number;
163
+ contributions: Contribution[];
164
+ summaries?: Record<string, DebateSummary>; // Keyed by agentId
165
+ timestamp: Date;
166
+ }
167
+
168
+ interface DebateState {
169
+ id: string;
170
+ problem: string;
171
+ context?: string;
172
+ status: DebateStatus;
173
+ currentRound: number;
174
+ rounds: DebateRound[];
175
+ finalSolution?: Solution;
176
+ judgeSummary?: DebateSummary; // Judge's synthesis summary
177
+ createdAt: Date;
178
+ updatedAt: Date;
179
+ promptSources?: {
180
+ agents: AgentPromptMetadata[];
181
+ judge: JudgePromptMetadata;
182
+ };
183
+ }
184
+
185
+ interface DebateSummary {
186
+ agentId: string;
187
+ agentRole: AgentRole;
188
+ summary: string; // The actual summary text sent to LLM
189
+ metadata: {
190
+ beforeChars: number;
191
+ afterChars: number;
192
+ method: string;
193
+ timestamp: Date;
194
+ latencyMs?: number;
195
+ tokensUsed?: number;
196
+ model?: string;
197
+ temperature?: number;
198
+ provider?: 'openai' | 'openrouter';
199
+ };
200
+ }
201
+ ```
202
+
203
+ **Usage Example**:
204
+ ```typescript
205
+ // Storing an agent summary
206
+ round.summaries = {};
207
+ round.summaries[agent.id] = summary;
208
+
209
+ // Storing a judge summary
210
+ state.judgeSummary = summary;
211
+
212
+ // Retrieving an agent summary
213
+ const agentSummary = round.summaries?.[agentId];
214
+
215
+ // Retrieving a judge summary
216
+ const judgeSummary = state.judgeSummary;
217
+ ```
218
+
219
+ ---
220
+
221
+ ## System Components
222
+
223
+ ### Component Interaction
224
+
225
+ ```
226
+ ┌─────────────────────────────────────────────────────────┐
227
+ │ DebateOrchestrator │
228
+ │ - Coordinates debate flow │
229
+ │ - Calls summarizationPhase() before proposal phase │
230
+ │ - Calls judge.prepareContext() before synthesis │
231
+ │ - Passes prepared contexts to debate phases │
232
+ └────────────┬────────────────────────────┬───────────────┘
233
+ │ │
234
+ ▼ ▼
235
+ ┌────────────────┐ ┌─────────────────┐
236
+ │ RoleBasedAgent│ │ StateManager │
237
+ │ - shouldSummarize() │ - addSummary() │
238
+ │ - prepareContext() │ - addJudgeSummary() │
239
+ └────────┬───────┘ └─────────────────┘
240
+
241
+
242
+ ┌─────────────────────────┐
243
+ │ ContextSummarizer │
244
+ │ (Strategy Interface) │
245
+ └────────┬────────────────┘
246
+
247
+
248
+ ┌─────────────────────────┐
249
+ │ LengthBasedSummarizer │
250
+ │ - summarize() │
251
+ │ - Calls LLM │
252
+ └─────────────────────────┘
253
+
254
+
255
+ ┌─────────────────────────┐
256
+ │ JudgeAgent │
257
+ │ - shouldSummarize() │
258
+ │ - prepareContext() │
259
+ │ - getFinalRoundRelevantContent() │
260
+ └─────────────────────────┘
261
+ ```
262
+
263
+ ### 1. Orchestrator (`src/core/orchestrator.ts`)
264
+
265
+ **Responsibilities**:
266
+ - Coordinate debate flow across rounds
267
+ - Execute summarization phase before each proposal phase
268
+ - Call judge context preparation before synthesis phase
269
+ - Pass prepared contexts (with or without summaries) to agents
270
+
271
+ **Key Methods**:
272
+
273
+ `summarizationPhase(state: DebateState, roundNumber: number)`
274
+ ```typescript
275
+ // Simplified implementation
276
+ async summarizationPhase(state, roundNumber) {
277
+ const baseContext = this.buildContext(state);
278
+ const preparedContexts = new Map();
279
+
280
+ for (const agent of this.agents) {
281
+ // Let agent decide and prepare context
282
+ const result = await agent.prepareContext(baseContext, roundNumber);
283
+
284
+ if (result.summary) {
285
+ // Store summary in debate state (keyed by agentId)
286
+ await this.stateManager.addSummary(state.id, result.summary);
287
+ // Notify progress UI
288
+ this.hooks?.onSummarizationComplete?.(/*...*/);
289
+ }
290
+
291
+ // Note: prepareContext returns the same context (no modification)
292
+ // Summaries are looked up later when formatting prompts
293
+ preparedContexts.set(agent.config.id, result.context);
294
+ }
295
+
296
+ return preparedContexts;
297
+ }
298
+ ```
299
+
300
+ `synthesisPhase(state: DebateState)`
301
+ ```typescript
302
+ // Simplified implementation
303
+ async synthesisPhase(state) {
304
+ // Prepare judge context with potential summarization
305
+ const result = await this.judge.prepareContext(state.rounds);
306
+
307
+ // Store judge summary if one was created
308
+ if (result.summary) {
309
+ await this.stateManager.addJudgeSummary(state.id, result.summary);
310
+ }
311
+
312
+ const ctx = this.buildContext(state);
313
+ const solution = await this.judge.synthesize(state.problem, state.rounds, ctx);
314
+ return solution;
315
+ }
316
+ ```
317
+
318
+ **Design Rationale**: The orchestrator doesn't make summarization decisions—it only coordinates the process. This preserves agent autonomy while ensuring a consistent execution order.
319
+
320
+ ### 2. RoleBasedAgent (`src/agents/role-based-agent.ts`)
321
+
322
+ **Responsibilities**:
323
+ - Decide when to summarize based on threshold
324
+ - Filter history to agent's perspective
325
+ - Generate summaries via `ContextSummarizer`
326
+ - Manage summarization configuration
327
+
328
+ **Key Methods**:
329
+
330
+ **`shouldSummarize(context: DebateContext): boolean`**
331
+ ```typescript
332
+ // Simplified implementation
333
+ shouldSummarize(context) {
334
+ if (!this.summaryConfig.enabled) return false;
335
+ if (!context.history) return false;
336
+
337
+ // Calculate character count of agent's perspective
338
+ let totalChars = 0;
339
+ for (const round of context.history) {
340
+ for (const contribution of round.contributions) {
341
+ if (this.isRelevantToMe(contribution)) {
342
+ totalChars += contribution.content.length;
343
+ }
344
+ }
345
+ }
346
+
347
+ return totalChars >= this.summaryConfig.threshold;
348
+ }
349
+ ```
350
+
351
+ **`prepareContext(context: DebateContext, roundNumber: number)`**
352
+ ```typescript
353
+ // Simplified implementation
354
+ async prepareContext(context, roundNumber) {
355
+ // Should we summarize?
356
+ if (!this.shouldSummarize(context)) {
357
+ return { context }; // Return original context unchanged
358
+ }
359
+
360
+ // Filter history to my perspective
361
+ const myHistory = this.filterToMyPerspective(context.history);
362
+
363
+ // Generate summary
364
+ const result = await this.summarizer.summarize(
365
+ myHistory,
366
+ this.config.role,
367
+ this.summaryConfig,
368
+ this.resolvedSystemPrompt,
369
+ this.resolvedSummaryPrompt
370
+ );
371
+
372
+ // Build summary object for storage
373
+ const summary = {
374
+ agentId: this.config.id,
375
+ agentRole: this.config.role,
376
+ summary: result.summary, // The actual text
377
+ metadata: result.metadata
378
+ };
379
+
380
+ // Return original context + summary for persistence
381
+ // Summary will be looked up from rounds when formatting prompts
382
+ return { context, summary };
383
+ }
384
+ ```
385
+
386
+ **Design Rationale**: The agent encapsulates all summarization logic—the orchestrator just calls `prepareContext()` and handles the result. The context is not modified; summaries are stored separately and retrieved when needed for prompt formatting. This prevents data mixing between agents.
387
+
388
+ ### 3. JudgeAgent (`src/core/judge.ts`)
389
+
390
+ **Responsibilities**:
391
+ - Decide when to summarize final round content for synthesis
392
+ - Filter final round to proposals and refinements only
393
+ - Generate summaries using judge-specific prompts
394
+ - Use summaries in synthesis prompt building
395
+
396
+ **Key Methods**:
397
+
398
+ **`shouldSummarize(rounds: DebateRound[]): boolean`**
399
+ ```typescript
400
+ // Simplified implementation
401
+ shouldSummarize(rounds) {
402
+ if (!this.summaryConfig.enabled) return false;
403
+ if (!rounds || rounds.length === 0) return false;
404
+
405
+ const finalRound = rounds[rounds.length - 1];
406
+ const relevantContent = this.getFinalRoundRelevantContent(rounds);
407
+
408
+ return relevantContent.length >= this.summaryConfig.threshold;
409
+ }
410
+ ```
411
+
412
+ **`getFinalRoundRelevantContent(rounds: DebateRound[]): string`**
413
+ ```typescript
414
+ // Simplified implementation
415
+ getFinalRoundRelevantContent(rounds) {
416
+ const finalRound = rounds[rounds.length - 1];
417
+ if (!finalRound) return '';
418
+
419
+ const relevantContributions = finalRound.contributions.filter(c =>
420
+ c.type === 'proposal' || c.type === 'refinement'
421
+ );
422
+
423
+ return relevantContributions.map(c => c.content).join('\n\n');
424
+ }
425
+ ```
426
+
427
+ **`prepareContext(rounds: DebateRound[]): Promise<ContextPreparationResult>`**
428
+ ```typescript
429
+ // Simplified implementation
430
+ async prepareContext(rounds) {
431
+ if (!this.shouldSummarize(rounds)) {
432
+ return { context: { problem: '', history: rounds } };
433
+ }
434
+
435
+ try {
436
+ const relevantContent = this.getFinalRoundRelevantContent(rounds);
437
+ const summaryPrompt = this.buildSummaryPrompt(relevantContent);
438
+
439
+ const result = await this.summarizer.summarize(
440
+ relevantContent,
441
+ 'generalist', // Judge role
442
+ this.summaryConfig,
443
+ this.systemPrompt,
444
+ summaryPrompt
445
+ );
446
+
447
+ const summary = {
448
+ agentId: this.config.id,
449
+ agentRole: 'generalist',
450
+ summary: result.summary,
451
+ metadata: result.metadata
452
+ };
453
+
454
+ return { context: { problem: '', history: rounds }, summary };
455
+ } catch (error) {
456
+ console.error('Judge summarization failed:', error);
457
+ return { context: { problem: '', history: rounds } };
458
+ }
459
+ }
460
+ ```
461
+
462
+ **Design Rationale**: The judge uses the same summarization infrastructure as agents but focuses specifically on final round content. This provides a focused view of the most recent solution attempts for synthesis while maintaining consistency with the overall summarization architecture.
463
+
464
+ ### 4. ContextSummarizer (`src/utils/context-summarizer.ts`)
465
+
466
+ **Responsibilities**:
467
+ - Define interface for summarization strategies
468
+ - Implement length-based summarization
469
+ - Call LLM with appropriate prompts
470
+ - Measure and return metadata
471
+
472
+ **Interface**:
473
+ ```typescript
474
+ interface ContextSummarizer {
475
+ summarize(
476
+ content: string,
477
+ role: AgentRole,
478
+ config: SummarizationConfig,
479
+ systemPrompt: string,
480
+ summaryPrompt: string
481
+ ): Promise<SummarizationResult>;
482
+ }
483
+ ```
484
+
485
+ **Current Implementation**: `LengthBasedSummarizer`
486
+ - Uses LLM to generate summaries when content exceeds threshold
487
+ - Truncates to `maxLength` if needed
488
+ - Captures timing and token usage metadata
489
+
490
+ **Design Rationale**: The interface separates "what to summarize" (agent's responsibility) from "how to summarize" (strategy's responsibility). This makes it easy to add new strategies.
491
+
492
+ ### 5. StateManager (`src/core/state-manager.ts`)
493
+
494
+ **Responsibilities**:
495
+ - Persist summaries to disk as part of debate state
496
+ - Maintain summary history per round (keyed by agent ID)
497
+ - Ensure atomic saves
498
+
499
+ **Key Methods**:
500
+
501
+ `addSummary(debateId: string, summary: DebateSummary)`
502
+ ```typescript
503
+ async addSummary(debateId, summary) {
504
+ const state = this.debates.get(debateId);
505
+ const round = state.rounds[state.currentRound - 1];
506
+
507
+ // Initialize summaries Record if needed
508
+ if (!round.summaries) {
509
+ round.summaries = {};
510
+ }
511
+
512
+ // Store summary by agentId
513
+ round.summaries[summary.agentId] = summary;
514
+ await this.save(state);
515
+ }
516
+ ```
517
+
518
+ `addJudgeSummary(debateId: string, summary: DebateSummary)`
519
+ ```typescript
520
+ async addJudgeSummary(debateId, summary) {
521
+ const state = this.debates.get(debateId);
522
+ if (!state) throw new Error(`Debate ${debateId} not found`);
523
+
524
+ state.judgeSummary = summary;
525
+ state.updatedAt = new Date();
526
+ await this.save(state);
527
+ }
528
+ ```
529
+
530
+ **Design Rationale**: Storing summaries in rounds (not agents) provides a complete audit trail and enables debugging/analysis of summarization behavior over time. Keying by agent ID provides efficient, isolated access to each agent's summary.
531
+
532
+ ### 6. Context Formatter (`src/utils/context-formatter.ts`)
533
+
534
+ **Responsibilities**:
535
+ - Format debate context for inclusion in LLM prompts
536
+ - Retrieve agent-specific summaries from debate history
537
+ - Fall back to full history if no summary exists
538
+
539
+ **Key Function**: `formatContextSection(context: DebateContext, agentId: string)`
540
+ ```typescript
541
+ // Simplified implementation
542
+ function formatContextSection(context, agentId) {
543
+ if (!context?.history || context.history.length === 0) {
544
+ return ''; // No history
545
+ }
546
+
547
+ // Search backwards through rounds to find this agent's most recent summary
548
+ for (let i = context.history.length - 1; i >= 0; i--) {
549
+ const round = context.history[i];
550
+ const agentSummary = round.summaries?.[agentId];
551
+
552
+ if (agentSummary) {
553
+ // Found summary - format it for prompt
554
+ return `=== Previous Debate Context ===\n\n` +
555
+ `[SUMMARY from Round ${round.roundNumber}]\n` +
556
+ `${agentSummary.summary}\n\n` +
557
+ `===================================\n\n`;
558
+ }
559
+ }
560
+
561
+ // No summary found - fall back to full history
562
+ return `=== Previous Debate Rounds ===\n\n` +
563
+ `${formatHistory(context.history)}\n\n` +
564
+ `===================================\n\n`;
565
+ }
566
+ ```
567
+
568
+ **Usage in Prompts**:
569
+ ```typescript
570
+ // In role prompt implementations
571
+ proposePrompt: (problem: string, context?: DebateContext, agentId?: string) => {
572
+ const basePrompt = `Problem to solve:\n${problem}\n\n...`;
573
+ return prependContext(basePrompt, context, agentId);
574
+ }
575
+ ```
576
+
577
+ **Design Rationale**:
578
+ - **Backward Search**: Looking from most recent round backwards ensures we get the freshest summary
579
+ - **Isolation**: Each agent only sees their own summary, preventing data mixing
580
+ - **Graceful Fallback**: If no summary exists, full history is used automatically
581
+ - **Transparent**: Role prompt implementations don't need to know about summary logic
582
+
583
+ ---
584
+
585
+ ## Configuration & Customization
586
+
587
+ ### Configuration Levels
588
+
589
+ 1. **System-Wide** (`debate.summarization` in config file):
590
+ - Default settings for all agents
591
+ - Applied unless overridden
592
+
593
+ 2. **Per-Agent** (`AgentConfig.summarization`):
594
+ - Agent-specific overrides
595
+ - Merged with system-wide settings
596
+
597
+ ### Configuration Fields
598
+
599
+ ```typescript
600
+ interface SummarizationConfig {
601
+ enabled: boolean; // Enable/disable summarization
602
+ threshold: number; // Character count threshold
603
+ maxLength: number; // Max summary length
604
+ method: string; // Summarization method
605
+ promptPath?: string; // Optional custom prompt file
606
+ }
607
+ ```
608
+
609
+ **Defaults**:
610
+ - `enabled: true`
611
+ - `threshold: 5000` characters
612
+ - `maxLength: 2500` characters
613
+ - `method: 'length-based'`
614
+
615
+ ### Custom Summary Prompts
616
+
617
+ **Per-Agent Prompt** (`AgentConfig.summaryPromptPath`):
618
+ ```json
619
+ {
620
+ "agents": [
621
+ {
622
+ "id": "agent-architect",
623
+ "summaryPromptPath": "./prompts/architect-summary.md"
624
+ }
625
+ ]
626
+ }
627
+ ```
628
+
629
+ **System-Wide Prompt** (`debate.summarization.promptPath`):
630
+ ```json
631
+ {
632
+ "debate": {
633
+ "summarization": {
634
+ "promptPath": "./prompts/generic-summary.md"
635
+ }
636
+ }
637
+ }
638
+ ```
639
+
640
+ **Fallback Behavior**:
641
+ 1. Use `summaryPromptPath` if specified (per-agent)
642
+ 2. Fall back to `summarization.promptPath` if specified (system-wide)
643
+ 3. Fall back to role-specific built-in prompt
644
+ 4. If file read fails, warn user and use built-in prompt
645
+
646
+ **Prompt Resolution**: Follows the same pattern as `systemPromptPath` (see `src/utils/prompt-loader.ts`)
647
+
648
+ ---
649
+
650
+ ## Extension Points & Future Strategies
651
+
652
+ ### Current Strategy: Length-Based
653
+
654
+ - **Trigger**: Character count exceeds threshold
655
+ - **Method**: LLM-based summarization with role-specific prompts
656
+ - **Output**: Concise summary within max length
657
+
658
+ ### Future Strategy 1: Semantic Summarization
659
+
660
+ **Concept**: Summarize based on semantic similarity rather than raw length.
661
+
662
+ **Design**:
663
+ ```typescript
664
+ class SemanticSummarizer implements ContextSummarizer {
665
+ async summarize(content, role, config, systemPrompt, summaryPrompt) {
666
+ // 1. Embed contributions using embedding model
667
+ // 2. Cluster by semantic similarity
668
+ // 3. Identify representative contributions per cluster
669
+ // 4. Summarize cluster representatives
670
+ // 5. Return hierarchical summary
671
+ }
672
+ }
673
+ ```
674
+
675
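+ **Configuration** (the `//` annotation below is explanatory only; standard JSON does not permit comments, so omit it from an actual config file):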
676
+ ```json
677
+ {
678
+ "summarization": {
679
+ "method": "semantic",
680
+ "semanticThreshold": 0.8, // Similarity threshold
681
+ "embeddingModel": "text-embedding-ada-002"
682
+ }
683
+ }
684
+ ```
685
+
686
+ ### Future Strategy 2: Hierarchical Summarization
687
+
688
+ **Concept**: Multi-level summaries (contribution → round → phase → overall).
689
+
690
+ **Design**:
691
+ ```typescript
692
+ class HierarchicalSummarizer implements ContextSummarizer {
693
+ async summarize(content, role, config, systemPrompt, summaryPrompt) {
694
+ // 1. Summarize individual contributions
695
+ // 2. Summarize round-level (combine contribution summaries)
696
+ // 3. Summarize phase-level (proposals, critiques, refinements)
697
+ // 4. Create overall summary from phase summaries
698
+ }
699
+ }
700
+ ```
701
+
702
+ **Benefits**:
703
+ - Granular summaries at different levels
704
+ - Can query specific round/phase summaries
705
+ - Better preservation of chronological structure
706
+
707
+ ### Future Strategy 3: RAG-Based Summarization
708
+
709
+ **Concept**: Store full history in vector database, retrieve relevant context on-demand.
710
+
711
+ **Design**:
712
+ ```typescript
713
+ class RAGSummarizer implements ContextSummarizer {
714
+ async summarize(content, role, config, systemPrompt, summaryPrompt) {
715
+ // 1. Store contributions in vector DB
716
+ // 2. When needed, query DB for relevant contributions
717
+ // 3. Return retrieved context as "summary"
718
+ }
719
+ }
720
+ ```
721
+
722
+ **Benefits**:
723
+ - No information loss (full history preserved)
724
+ - Dynamic retrieval based on current context
725
+ - Can adjust retrieval criteria per query
726
+
727
+ ### Extensibility Pattern
728
+
729
+ **Adding a New Strategy**:
730
+
731
+ 1. **Implement Interface**:
732
+ ```typescript
733
+ class MyCustomSummarizer implements ContextSummarizer {
734
+ constructor(private provider: LLMProvider, private customConfig: any) {}
735
+
736
+ async summarize(
737
+ content: string,
738
+ role: AgentRole,
739
+ config: SummarizationConfig,
740
+ systemPrompt: string,
741
+ summaryPrompt: string
742
+ ): Promise<SummarizationResult> {
743
+ // Your custom logic
744
+ }
745
+ }
746
+ ```
747
+
748
+ 2. **Update Agent Factory** (`RoleBasedAgent.create()`):
749
+ ```typescript
750
+ if (summaryConfig.enabled) {
751
+ if (summaryConfig.method === 'length-based') {
752
+ this.summarizer = new LengthBasedSummarizer(provider);
753
+ } else if (summaryConfig.method === 'semantic') {
754
+ this.summarizer = new SemanticSummarizer(provider, semanticConfig);
755
+ } else if (summaryConfig.method === 'my-custom') {
756
+ this.summarizer = new MyCustomSummarizer(provider, customConfig);
757
+ }
758
+ }
759
+ ```
760
+
761
+ 3. **Add Configuration**:
762
+ ```typescript
763
+ // In src/types/debate.types.ts
764
+ export const SUMMARIZATION_METHODS = {
765
+ LENGTH_BASED: 'length-based',
766
+ SEMANTIC: 'semantic',
767
+ MY_CUSTOM: 'my-custom',
768
+ } as const;
769
+ ```
770
+
771
+ **No Changes Needed To**:
772
+ - Orchestrator (it just calls `agent.prepareContext()`)
773
+ - StateManager (it just stores results)
774
+ - CLI (it just passes configuration)
775
+
776
+ ---
777
+
778
+ ## Implementation Details
779
+
780
+ ### Data Flow
781
+
782
+ ```
783
+ 1. Orchestrator.summarizationPhase()
784
+ └─> For each agent:
785
+ └─> Agent.prepareContext(baseContext, roundNumber)
786
+ ├─> Agent.shouldSummarize(baseContext)
787
+ │ └─> Calculate character count
788
+ │ └─> Compare to threshold
789
+ │ └─> Return true/false
790
+
791
+ ├─> If false: return { context: baseContext }
792
+
793
+ └─> If true:
794
+ ├─> Filter history to perspective
795
+ ├─> Join into text
796
+ ├─> ContextSummarizer.summarize(text, ...)
797
+ │ └─> Call LLM
798
+ │ └─> Capture metadata
799
+ │ └─> Return summary + metadata
800
+ ├─> Build DebateSummary object (agentId + summary text + metadata)
801
+ ├─> Return { context: baseContext, summary: debateSummary }
802
+ │ (Note: context is NOT modified)
803
+
804
+ 2. Orchestrator receives result
805
+ ├─> If summary exists: StateManager.addSummary(summary)
806
+ │ └─> StateManager: round.summaries[agentId] = summary
807
+ ├─> Store prepared context (unmodified)
808
+
809
+ 3. Orchestrator.proposalPhase()
810
+ └─> For each agent:
811
+ └─> Agent.propose(problem, context)
812
+ └─> rolePrompts.proposePrompt(problem, context, agentId)
813
+ └─> prependContext(basePrompt, context, agentId)
814
+ └─> formatContextSection(context, agentId)
815
+ └─> Search backwards in context.history
816
+ └─> Find round.summaries[agentId]
817
+ ├─> Found? Prepend summary to prompt
818
+ └─> Not found? Prepend full history (or nothing)
819
+ └─> Send formatted prompt to LLM
820
+
821
+ 4. Orchestrator.critiquePhase() / refinementPhase()
822
+ └─> Same pattern: context + agentId → backward search → agent's summary
823
+
824
+ 5. Orchestrator.synthesisPhase()
825
+ └─> Judge.prepareContext(rounds)
826
+ ├─> Judge.shouldSummarize(rounds)
827
+ │ └─> Calculate character count of final round proposals/refinements
828
+ │ └─> Compare to threshold
829
+ │ └─> Return true/false
830
+
831
+ ├─> If false: return { context: { problem: '', history: rounds } }
832
+
833
+ └─> If true:
834
+ ├─> Judge.getFinalRoundRelevantContent(rounds)
835
+ │ └─> Filter final round to proposals and refinements only
836
+ ├─> ContextSummarizer.summarize(relevantContent, ...)
837
+ │ └─> Call LLM with judge-specific summary prompt
838
+ │ └─> Capture metadata
839
+ │ └─> Return summary + metadata
840
+ ├─> Build DebateSummary object (judgeId + summary text + metadata)
841
+ ├─> Return { context: { problem: '', history: rounds }, summary: debateSummary }
842
+
843
+ ├─> If judge summary exists: StateManager.addJudgeSummary(summary)
844
+ │ └─> StateManager: state.judgeSummary = summary
845
+
846
+ └─> Judge.synthesize(problem, rounds, context)
847
+ └─> Judge.buildSynthesisPrompt(problem, rounds)
848
+ └─> If summarization was used: include only final round's key contributions
849
+ └─> If no summarization: include all rounds with full history
850
+ └─> Send formatted prompt to LLM
851
+ ```
852
+
853
+ ### Character Count Calculation
854
+
855
+ **For Agents**: The agent counts characters from:
856
+ - **Own proposals**: `contribution.type === 'proposal' && contribution.agentId === this.config.id`
857
+ - **Received critiques**: `contribution.type === 'critique' && contribution.targetAgentId === this.config.id`
858
+ - **Own refinements**: `contribution.type === 'refinement' && contribution.agentId === this.config.id`
859
+
860
+ **For Judge**: The judge counts characters from:
861
+ - **Final round proposals**: `contribution.type === 'proposal'` from the last round
862
+ - **Final round refinements**: `contribution.type === 'refinement'` from the last round
863
+ - **Excludes critiques**: Judge does not include critiques in its summarization
864
+
865
+ **Why character count?**
866
+ - Simple and deterministic
867
+ - Works across all LLM providers
868
+ - Easy to configure and understand
869
+ - Good proxy for context size
870
+
871
+ **Alternative considered**: Token count
872
+ - **Rejected because**: Token counting is provider-specific and adds tokenizer overhead
873
+ - **Future option**: Could add token-based thresholds as an alternative method
874
+
875
+ ### Error Handling
876
+
877
+ **Summarization Failure**:
878
+ ```typescript
879
+ try {
880
+ const result = await this.summarizer.summarize(/*...*/);
881
+ const summary = { agentId, agentRole, summary: result.summary, metadata };
882
+ return { context, summary }; // context unchanged
883
+ } catch (error) {
884
+ // Log warning to stderr
885
+ process.stderr.write(`Warning: Summarization failed. Falling back to full history.\n`);
886
+ // Return original context (graceful degradation)
887
+ return { context };
888
+ }
889
+ ```
890
+
891
+ **Missing Summarizer**:
892
+ ```typescript
893
+ if (!this.summarizer) {
894
+ process.stderr.write(`Warning: Summarization enabled but no summarizer available.\n`);
895
+ return { context };
896
+ }
897
+ ```
898
+
899
+ **Design Rationale**: Summarization is an optimization, not a requirement. Failures should never break the debate—always fall back to full history.
900
+
901
+ ---
902
+
903
+ ## Trade-offs & Design Rationale
904
+
905
+ ### Trade-off 1: Fresh Summaries vs. Incremental Updates
906
+
907
+ **Choice**: Fresh summaries each round
908
+
909
+ **Pros**:
910
+ - Simpler implementation (no state management)
911
+ - No accumulation of errors
912
+ - Easy to change summary parameters
913
+ - Easier debugging
914
+
915
+ **Cons**:
916
+ - Redundant LLM calls (re-summarizing some content)
917
+ - Slightly higher token cost
918
+
919
+ **Rationale**: Simplicity and accuracy are more valuable than marginal token savings. Most debates are short (3-5 rounds), so redundancy is minimal.
920
+
921
+ ### Trade-off 2: Perspective-Based vs. Full History
922
+
923
+ **Choice**: Each agent summarizes only its own perspective
924
+
925
+ **Pros**:
926
+ - More relevant summaries
927
+ - Smaller input to LLM (faster, cheaper)
928
+ - Better privacy/separation
929
+ - Scales better (O(agent activity) not O(total activity))
930
+
931
+ **Cons**:
932
+ - Agents might miss broader context
933
+ - More complex filtering logic
934
+
935
+ **Rationale**: Agents work best when focused on their role-specific context. Full history would dilute relevant information with irrelevant critiques of other agents.
936
+
937
+ ### Trade-off 3: Round Storage vs. Agent Storage
938
+
939
+ **Choice**: Store summaries in `DebateRound.summaries` (keyed by agent ID)
940
+
941
+ **Pros**:
942
+ - Complete audit trail
943
+ - Easy to debug (see exactly what each agent saw, and when)
944
+ - Reproducible (can replay debate with same contexts)
945
+ - Enables analysis (compression ratios, effectiveness)
946
+
947
+ **Cons**:
948
+ - Larger state files
949
+ - More data to persist each round
950
+
951
+ **Rationale**: Debugging and auditability are critical for a debate system. The extra storage cost is minimal compared to the value of complete provenance.
952
+
953
+ ### Trade-off 4: Configuration Flexibility vs. Simplicity
954
+
955
+ **Choice**: Two-level configuration (system + agent)
956
+
957
+ **Pros**:
958
+ - Flexible (customize per agent or use defaults)
959
+ - DRY (define once, apply everywhere)
960
+ - Gradual adoption (enable for some agents first)
961
+
962
+ **Cons**:
963
+ - More complex configuration
964
+ - Merging logic required
965
+
966
+ **Rationale**: Real-world usage demands flexibility. Different agents have different context needs—for example, an architect agent may need more context than a security agent. Two-level config supports both simple and advanced use cases.
967
+
968
+ ---
969
+
970
+ ## References
971
+
972
+ ### Code
973
+
974
+ - **Orchestrator**: `src/core/orchestrator.ts` (lines 179-217: `summarizationPhase()`, lines 219-235: `synthesisPhase()`)
975
+ - **Agent**: `src/agents/role-based-agent.ts` (lines 210-338: `shouldSummarize()`, `prepareContext()`)
976
+ - **Judge**: `src/core/judge.ts` (lines 45-120: `shouldSummarize()`, `prepareContext()`, `getFinalRoundRelevantContent()`)
977
+ - **Summarizer**: `src/utils/context-summarizer.ts` (lines 49-99: `LengthBasedSummarizer`)
978
+ - **State Manager**: `src/core/state-manager.ts` (lines 261-275: `addSummary()`, lines 277-285: `addJudgeSummary()`)
979
+ - **Judge Prompts**: `src/agents/prompts/judge-prompts.ts` (judge-specific summary prompts)
980
+ - **CLI Integration**: `src/cli/commands/debate.ts` (lines 152-211: agent factory with summarization, judge creation with summarization)
981
+
982
+ ### Documentation
983
+
984
+ - **User Guide**: `README.md` (lines 207-213: Context Summarization section)
985
+ - **Configuration**: `docs/configuration.md` (lines 301-520: Context Summarization Configuration)
986
+ - **Flow Diagram**: `docs/debate_flow.md` (lines 130-200: Summarization phase sequence)
987
+
988
+ ### Tests
989
+
990
+ - **Summarizer Tests**: `tests/context-summarizer.spec.ts`
991
+ - **Agent Tests**: `tests/role-based-agent-summary.spec.ts`
992
+ - **Orchestrator Tests**: `tests/orchestrator-summary.spec.ts`
993
+ - **State Tests**: `tests/state-manager.spec.ts` (summarization section)
994
+ - **Config Tests**: `tests/config-loading.spec.ts` (summarization section)
995
+ - **Prompt Tests**: `tests/summary-prompts.spec.ts`
996
+ - **Judge Tests**: `tests/orchestrator.spec.ts` (updated with judge summarization)
997
+
998
+ ---
999
+
1000
+ ## Summary
1001
+
1002
+ The context summarization feature provides **automatic, configurable, and extensible** management of debate history:
1003
+
1004
+ - **Automatic**: Agents and judge decide when to summarize based on thresholds
1005
+ - **Configurable**: Two-level configuration (system + agent) with custom prompts
1006
+ - **Extensible**: Pluggable strategies via `ContextSummarizer` interface
1007
+
1008
+ **Key Design Principles**:
1009
+ 1. **Agent Autonomy**: Agents control their own summarization decisions
1010
+ 2. **Judge Integration**: Judge uses same summarization infrastructure for synthesis
1011
+ 3. **Graceful Degradation**: Failures fall back to full history (never break debate)
1012
+ 4. **Complete Provenance**: All summaries persisted with metadata for debugging
1013
+ 5. **Simple First**: Start with length-based, design for future strategies
1014
+ 6. **Separation of Concerns**: "When/what" (agent/judge) vs "how" (strategy) vs "coordination" (orchestrator)
1015
+
1016
+ **Judge-Specific Features**:
1017
+ - Summarizes only final round's proposals and refinements (excludes critiques)
1018
+ - Uses judge-specific summary prompts for synthesis-focused summarization
1019
+ - Stores summaries separately in `DebateState.judgeSummary` for synthesis context
1020
+ - Integrates seamlessly with existing summarization infrastructure
1021
+
1022
+ The architecture balances simplicity for current needs with extensibility for future enhancements, ensuring the feature can evolve as new summarization techniques emerge.
1023
+