opencode-multiagent 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +209 -0
- package/agents/advisor.md +57 -0
- package/agents/auditor.md +45 -0
- package/agents/critic.md +127 -0
- package/agents/deep-worker.md +65 -0
- package/agents/devil.md +36 -0
- package/agents/executor.md +141 -0
- package/agents/heavy-worker.md +68 -0
- package/agents/lead.md +155 -0
- package/agents/librarian.md +62 -0
- package/agents/planner.md +121 -0
- package/agents/qa.md +50 -0
- package/agents/quick.md +65 -0
- package/agents/reviewer.md +55 -0
- package/agents/scout.md +58 -0
- package/agents/scribe.md +78 -0
- package/agents/strategist.md +63 -0
- package/agents/ui-heavy-worker.md +62 -0
- package/agents/ui-worker.md +69 -0
- package/agents/validator.md +47 -0
- package/agents/worker.md +68 -0
- package/commands/execute.md +14 -0
- package/commands/init-deep.md +18 -0
- package/commands/init.md +18 -0
- package/commands/inspect.md +13 -0
- package/commands/plan.md +15 -0
- package/commands/quality.md +14 -0
- package/commands/review.md +14 -0
- package/commands/status.md +15 -0
- package/defaults/agent-settings.json +102 -0
- package/defaults/agent-settings.schema.json +25 -0
- package/defaults/flags.json +35 -0
- package/defaults/flags.schema.json +119 -0
- package/defaults/mcp-defaults.json +47 -0
- package/defaults/mcp-defaults.schema.json +38 -0
- package/defaults/profiles.json +53 -0
- package/defaults/profiles.schema.json +60 -0
- package/defaults/team-profiles.json +83 -0
- package/examples/opencode.json +4 -0
- package/examples/opencode.with-overrides.json +23 -0
- package/package.json +62 -0
- package/skills/advanced-evaluation/SKILL.md +454 -0
- package/skills/advanced-evaluation/manifest.json +20 -0
- package/skills/cek-context-engineering/SKILL.md +1261 -0
- package/skills/cek-context-engineering/manifest.json +17 -0
- package/skills/cek-prompt-engineering/SKILL.md +559 -0
- package/skills/cek-prompt-engineering/manifest.json +17 -0
- package/skills/cek-test-prompt/SKILL.md +714 -0
- package/skills/cek-test-prompt/manifest.json +17 -0
- package/skills/cek-thought-based-reasoning/SKILL.md +658 -0
- package/skills/cek-thought-based-reasoning/manifest.json +17 -0
- package/skills/context-degradation/SKILL.md +231 -0
- package/skills/context-degradation/manifest.json +17 -0
- package/skills/debate/SKILL.md +316 -0
- package/skills/debate/manifest.json +19 -0
- package/skills/design-first/SKILL.md +5 -0
- package/skills/design-first/manifest.json +20 -0
- package/skills/dispatching-parallel-agents/SKILL.md +180 -0
- package/skills/dispatching-parallel-agents/manifest.json +18 -0
- package/skills/drift-analysis/SKILL.md +324 -0
- package/skills/drift-analysis/manifest.json +19 -0
- package/skills/evaluation/SKILL.md +5 -0
- package/skills/evaluation/manifest.json +19 -0
- package/skills/executing-plans/SKILL.md +70 -0
- package/skills/executing-plans/manifest.json +17 -0
- package/skills/handoff-protocols/SKILL.md +5 -0
- package/skills/handoff-protocols/manifest.json +19 -0
- package/skills/parallel-investigation/SKILL.md +206 -0
- package/skills/parallel-investigation/manifest.json +18 -0
- package/skills/reflexion-critique/SKILL.md +477 -0
- package/skills/reflexion-critique/manifest.json +17 -0
- package/skills/reflexion-reflect/SKILL.md +650 -0
- package/skills/reflexion-reflect/manifest.json +17 -0
- package/skills/root-cause-analysis/SKILL.md +5 -0
- package/skills/root-cause-analysis/manifest.json +20 -0
- package/skills/sadd-judge-with-debate/SKILL.md +426 -0
- package/skills/sadd-judge-with-debate/manifest.json +17 -0
- package/skills/structured-code-review/SKILL.md +5 -0
- package/skills/structured-code-review/manifest.json +18 -0
- package/skills/task-decomposition/SKILL.md +5 -0
- package/skills/task-decomposition/manifest.json +20 -0
- package/skills/verification-before-completion/SKILL.md +5 -0
- package/skills/verification-before-completion/manifest.json +22 -0
- package/skills/verification-gates/SKILL.md +281 -0
- package/skills/verification-gates/manifest.json +19 -0
- package/src/control-plane.ts +21 -0
- package/src/index.ts +8 -0
- package/src/opencode-multiagent/compiler.ts +168 -0
- package/src/opencode-multiagent/constants.ts +178 -0
- package/src/opencode-multiagent/file-lock.ts +90 -0
- package/src/opencode-multiagent/hooks.ts +599 -0
- package/src/opencode-multiagent/log.ts +12 -0
- package/src/opencode-multiagent/mailbox.ts +287 -0
- package/src/opencode-multiagent/markdown.ts +99 -0
- package/src/opencode-multiagent/mcp.ts +35 -0
- package/src/opencode-multiagent/policy.ts +67 -0
- package/src/opencode-multiagent/quality.ts +140 -0
- package/src/opencode-multiagent/runtime.ts +55 -0
- package/src/opencode-multiagent/skills.ts +144 -0
- package/src/opencode-multiagent/supervision.ts +156 -0
- package/src/opencode-multiagent/task-manager.ts +148 -0
- package/src/opencode-multiagent/team-manager.ts +219 -0
- package/src/opencode-multiagent/team-tools.ts +359 -0
- package/src/opencode-multiagent/telemetry.ts +124 -0
- package/src/opencode-multiagent/utils.ts +54 -0
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: context-degradation
|
|
3
|
+
description: This skill should be used when the user asks to "diagnose context problems", "fix lost-in-middle issues", "debug agent failures", "understand context poisoning", or mentions context degradation, attention patterns, context clash, context confusion, or agent performance degradation. Provides patterns for recognizing and mitigating context failures.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Context Degradation Patterns
|
|
7
|
+
|
|
8
|
+
Language models exhibit predictable degradation patterns as context length increases. Understanding these patterns is essential for diagnosing failures and designing resilient systems. Context degradation is not a binary state but a continuum of performance degradation that manifests in several distinct ways.
|
|
9
|
+
|
|
10
|
+
## When to Activate
|
|
11
|
+
|
|
12
|
+
Activate this skill when:
|
|
13
|
+
- Agent performance degrades unexpectedly during long conversations
|
|
14
|
+
- Debugging cases where agents produce incorrect or irrelevant outputs
|
|
15
|
+
- Designing systems that must handle large contexts reliably
|
|
16
|
+
- Evaluating context engineering choices for production systems
|
|
17
|
+
- Investigating "lost in middle" phenomena in agent outputs
|
|
18
|
+
- Analyzing context-related failures in agent behavior
|
|
19
|
+
|
|
20
|
+
## Core Concepts
|
|
21
|
+
|
|
22
|
+
Context degradation manifests through several distinct patterns. The lost-in-middle phenomenon causes information in the center of context to receive less attention. Context poisoning occurs when errors compound through repeated reference. Context distraction happens when irrelevant information overwhelms relevant content. Context confusion arises when the model cannot determine which context applies. Context clash develops when accumulated information directly conflicts.
|
|
23
|
+
|
|
24
|
+
These patterns are predictable and can be mitigated through architectural patterns like compaction, masking, partitioning, and isolation.
|
|
25
|
+
|
|
26
|
+
## Detailed Topics
|
|
27
|
+
|
|
28
|
+
### The Lost-in-Middle Phenomenon
|
|
29
|
+
|
|
30
|
+
The most well-documented degradation pattern is the "lost-in-middle" effect, where models demonstrate U-shaped attention curves. Information at the beginning and end of context receives reliable attention, while information buried in the middle suffers from dramatically reduced recall accuracy.
|
|
31
|
+
|
|
32
|
+
**Empirical Evidence**
|
|
33
|
+
Research demonstrates that relevant information placed in the middle of context experiences 10-40% lower recall accuracy compared to the same information at the beginning or end. This is not a failure of the model but a consequence of attention mechanics and training data distributions.
|
|
34
|
+
|
|
35
|
+
Models allocate massive attention to the first token (often the BOS token) to stabilize internal states. This creates an "attention sink" that soaks up attention budget. As context grows, the limited budget is stretched thinner, and middle tokens fail to garner sufficient attention weight for reliable retrieval.
|
|
36
|
+
|
|
37
|
+
**Practical Implications**
|
|
38
|
+
Design context placement with attention patterns in mind. Place critical information at the beginning or end of context. Consider whether information will be queried directly or needs to support reasoning—if the latter, placement matters less but overall signal quality matters more.
|
|
39
|
+
|
|
40
|
+
For long documents or conversations, use summary structures that surface key information at attention-favored positions. Use explicit section headers and transitions to help models navigate structure.
|
|
41
|
+
|
|
42
|
+
### Context Poisoning
|
|
43
|
+
|
|
44
|
+
Context poisoning occurs when hallucinations, errors, or incorrect information enters context and compounds through repeated reference. Once poisoned, context creates feedback loops that reinforce incorrect beliefs.
|
|
45
|
+
|
|
46
|
+
**How Poisoning Occurs**
|
|
47
|
+
Poisoning typically enters through three pathways. First, tool outputs may contain errors or unexpected formats that models accept as ground truth. Second, retrieved documents may contain incorrect or outdated information that models incorporate into reasoning. Third, model-generated summaries or intermediate outputs may introduce hallucinations that persist in context.
|
|
48
|
+
|
|
49
|
+
The compounding effect is severe. If an agent's goals section becomes poisoned, it develops strategies that take substantial effort to undo. Each subsequent decision references the poisoned content, reinforcing incorrect assumptions.
|
|
50
|
+
|
|
51
|
+
**Detection and Recovery**
|
|
52
|
+
Watch for symptoms including degraded output quality on tasks that previously succeeded, tool misalignment where agents call wrong tools or parameters, and hallucinations that persist despite correction attempts. When these symptoms appear, consider context poisoning.
|
|
53
|
+
|
|
54
|
+
Recovery requires removing or replacing poisoned content. This may involve truncating context to before the poisoning point, explicitly noting the poisoning in context and asking for re-evaluation, or restarting with clean context and preserving only verified information.
|
|
55
|
+
|
|
56
|
+
### Context Distraction
|
|
57
|
+
|
|
58
|
+
Context distraction emerges when context grows so long that models over-focus on provided information at the expense of their training knowledge. The model attends to everything in context regardless of relevance, and this creates pressure to use provided information even when internal knowledge is more accurate.
|
|
59
|
+
|
|
60
|
+
**The Distractor Effect**
|
|
61
|
+
Research shows that even a single irrelevant document in context reduces performance on tasks involving relevant documents. Multiple distractors compound degradation. The effect is not about noise in absolute terms but about attention allocation—irrelevant information competes with relevant information for limited attention budget.
|
|
62
|
+
|
|
63
|
+
Models do not have a mechanism to "skip" irrelevant context. They must attend to everything provided, and this obligation creates distraction even when the irrelevant information is clearly not useful.
|
|
64
|
+
|
|
65
|
+
**Mitigation Strategies**
|
|
66
|
+
Mitigate distraction through careful curation of what enters context. Apply relevance filtering before loading retrieved documents. Use namespacing and organization to make irrelevant sections easy to ignore structurally. Consider whether information truly needs to be in context or can be accessed through tool calls instead.
|
|
67
|
+
|
|
68
|
+
### Context Confusion
|
|
69
|
+
|
|
70
|
+
Context confusion arises when irrelevant information influences responses in ways that degrade quality. This is related to distraction but distinct—confusion concerns the influence of context on model behavior rather than attention allocation.
|
|
71
|
+
|
|
72
|
+
If you put something in context, the model has to pay attention to it. The model may incorporate irrelevant information, use inappropriate tool definitions, or apply constraints that came from different contexts. Confusion is especially problematic when context contains multiple task types or when switching between tasks within a single session.
|
|
73
|
+
|
|
74
|
+
**Signs of Confusion**
|
|
75
|
+
Watch for responses that address the wrong aspect of a query, tool calls that seem appropriate for a different task, or outputs that mix requirements from multiple sources. These indicate confusion about what context applies to the current situation.
|
|
76
|
+
|
|
77
|
+
**Architectural Solutions**
|
|
78
|
+
Architectural solutions include explicit task segmentation where different tasks get different context windows, clear transitions between task contexts, and state management that isolates context for different objectives.
|
|
79
|
+
|
|
80
|
+
### Context Clash
|
|
81
|
+
|
|
82
|
+
Context clash develops when accumulated information directly conflicts, creating contradictory guidance that derails reasoning. This differs from poisoning where one piece of information is incorrect—in clash, multiple correct pieces of information contradict each other.
|
|
83
|
+
|
|
84
|
+
**Sources of Clash**
|
|
85
|
+
Clash commonly arises from multi-source retrieval where different sources have contradictory information, version conflicts where outdated and current information both appear in context, and perspective conflicts where different viewpoints are valid but incompatible.
|
|
86
|
+
|
|
87
|
+
**Resolution Approaches**
|
|
88
|
+
Resolution approaches include explicit conflict marking that identifies contradictions and requests clarification, priority rules that establish which source takes precedence, and version filtering that excludes outdated information from context.
|
|
89
|
+
|
|
90
|
+
### Empirical Benchmarks and Thresholds
|
|
91
|
+
|
|
92
|
+
Research provides concrete data on degradation patterns that inform design decisions.
|
|
93
|
+
|
|
94
|
+
**RULER Benchmark Findings**
|
|
95
|
+
The RULER benchmark delivers sobering findings: only 50% of models claiming 32K+ context maintain satisfactory performance at 32K tokens. GPT-5.2 shows the least degradation among current models, while many still drop 30+ points at extended contexts. Near-perfect scores on simple needle-in-haystack tests do not translate to real long-context understanding.
|
|
96
|
+
|
|
97
|
+
**Model-Specific Degradation Thresholds**
|
|
98
|
+
| Model | Degradation Onset | Severe Degradation | Notes |
|
|
99
|
+
|-------|-------------------|-------------------|-------|
|
|
100
|
+
| GPT-5.2 | ~64K tokens | ~200K tokens | Best overall degradation resistance with thinking mode |
|
|
101
|
+
| Claude Opus 4.5 | ~100K tokens | ~180K tokens | 200K context window, strong attention management |
|
|
102
|
+
| Claude Sonnet 4.5 | ~80K tokens | ~150K tokens | Optimized for agents and coding tasks |
|
|
103
|
+
| Gemini 3 Pro | ~500K tokens | ~800K tokens | 1M context window, native multimodality |
|
|
104
|
+
| Gemini 3 Flash | ~300K tokens | ~600K tokens | 3x speed of Gemini 2.5, 81.2% MMMU-Pro |
|
|
105
|
+
|
|
106
|
+
**Model-Specific Behavior Patterns**
|
|
107
|
+
Different models exhibit distinct failure modes under context pressure:
|
|
108
|
+
|
|
109
|
+
- **Claude 4.5 series**: Lowest hallucination rates with calibrated uncertainty. Claude Opus 4.5 achieves 80.9% on SWE-bench Verified. Tends to refuse or ask clarification rather than fabricate.
|
|
110
|
+
- **GPT-5.2**: Two modes available - instant (fast) and thinking (reasoning). Thinking mode reduces hallucination through step-by-step verification but increases latency.
|
|
111
|
+
- **Gemini 3 Pro/Flash**: Native multimodality with 1M context window. Gemini 3 Flash offers 3x speed improvement over previous generation. Strong at multi-modal reasoning across text, code, images, audio, and video.
|
|
112
|
+
|
|
113
|
+
These patterns inform model selection for different use cases. High-stakes tasks benefit from Claude 4.5's conservative approach or GPT-5.2's thinking mode; speed-critical tasks may use instant modes.
|
|
114
|
+
|
|
115
|
+
### Counterintuitive Findings
|
|
116
|
+
|
|
117
|
+
Research reveals several counterintuitive patterns that challenge assumptions about context management.
|
|
118
|
+
|
|
119
|
+
**Shuffled Haystacks Outperform Coherent Ones**
|
|
120
|
+
Studies found that shuffled (incoherent) haystacks produce better performance than logically coherent ones. This suggests that coherent context may create false associations that confuse retrieval, while incoherent context forces models to rely on exact matching.
|
|
121
|
+
|
|
122
|
+
**Single Distractors Have Outsized Impact**
|
|
123
|
+
Even a single irrelevant document reduces performance significantly. The effect is not proportional to the amount of noise but follows a step function where the presence of any distractor triggers degradation.
|
|
124
|
+
|
|
125
|
+
**Needle-Question Similarity Correlation**
|
|
126
|
+
Lower similarity between needle and question pairs shows faster degradation with context length. Tasks requiring inference across dissimilar content are particularly vulnerable.
|
|
127
|
+
|
|
128
|
+
### When Larger Contexts Hurt
|
|
129
|
+
|
|
130
|
+
Larger context windows do not uniformly improve performance. In many cases, larger contexts create new problems that outweigh benefits.
|
|
131
|
+
|
|
132
|
+
**Performance Degradation Curves**
|
|
133
|
+
Models exhibit non-linear degradation with context length. Performance remains stable up to a threshold, then degrades rapidly. The threshold varies by model and task complexity. For some smaller or older models, meaningful degradation can begin around 8,000-16,000 tokens even when context windows support much larger sizes; frontier models sustain far longer contexts before onset (see the threshold table above).
|
|
134
|
+
|
|
135
|
+
**Cost Implications**
|
|
136
|
+
Processing cost grows disproportionately with context length. The cost to process a 400K token context is not double the cost of 200K—attention computation scales superlinearly (quadratically in sequence length), so both time and computing resources grow much faster than the token count. For many applications, this makes large-context processing economically impractical.
|
|
137
|
+
|
|
138
|
+
**Cognitive Load Metaphor**
|
|
139
|
+
Even with an infinite context, asking a single model to maintain consistent quality across dozens of independent tasks creates a cognitive bottleneck. The model must constantly switch context between items, maintain a comparative framework, and ensure stylistic consistency. This is not a problem that more context solves.
|
|
140
|
+
|
|
141
|
+
## Practical Guidance
|
|
142
|
+
|
|
143
|
+
### The Four-Bucket Approach
|
|
144
|
+
|
|
145
|
+
Four strategies address different aspects of context degradation:
|
|
146
|
+
|
|
147
|
+
**Write**: Save context outside the window using scratchpads, file systems, or external storage. This keeps active context lean while preserving information access.
|
|
148
|
+
|
|
149
|
+
**Select**: Pull relevant context into the window through retrieval, filtering, and prioritization. This addresses distraction by excluding irrelevant information.
|
|
150
|
+
|
|
151
|
+
**Compress**: Reduce tokens while preserving information through summarization, abstraction, and observation masking. This extends effective context capacity.
|
|
152
|
+
|
|
153
|
+
**Isolate**: Split context across sub-agents or sessions to prevent any single context from growing large enough to degrade. This is the most aggressive strategy but often the most effective.
|
|
154
|
+
|
|
155
|
+
### Architectural Patterns
|
|
156
|
+
|
|
157
|
+
Implement these strategies through specific architectural patterns. Use just-in-time context loading to retrieve information only when needed. Use observation masking to replace verbose tool outputs with compact references. Use sub-agent architectures to isolate context for different tasks. Use compaction to summarize growing context before it exceeds limits.
|
|
158
|
+
|
|
159
|
+
## Examples
|
|
160
|
+
|
|
161
|
+
**Example 1: Detecting Degradation**
|
|
162
|
+
```yaml
|
|
163
|
+
# Context grows during long conversation
|
|
164
|
+
turn_1: 1000 tokens
|
|
165
|
+
turn_5: 8000 tokens
|
|
166
|
+
turn_10: 25000 tokens
|
|
167
|
+
turn_20: 60000 tokens (degradation begins)
|
|
168
|
+
turn_30: 90000 tokens (significant degradation)
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
**Example 2: Mitigating Lost-in-Middle**
|
|
172
|
+
```markdown
|
|
173
|
+
# Organize context with critical info at edges
|
|
174
|
+
|
|
175
|
+
[CURRENT TASK] # At start
|
|
176
|
+
- Goal: Generate quarterly report
|
|
177
|
+
- Deadline: End of week
|
|
178
|
+
|
|
179
|
+
[DETAILED CONTEXT] # Middle (less attention)
|
|
180
|
+
- 50 pages of data
|
|
181
|
+
- Multiple analysis sections
|
|
182
|
+
- Supporting evidence
|
|
183
|
+
|
|
184
|
+
[KEY FINDINGS] # At end
|
|
185
|
+
- Revenue up 15%
|
|
186
|
+
- Costs down 8%
|
|
187
|
+
- Growth in Region A
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## Guidelines
|
|
191
|
+
|
|
192
|
+
1. Monitor context length and performance correlation during development
|
|
193
|
+
2. Place critical information at beginning or end of context
|
|
194
|
+
3. Implement compaction triggers before degradation becomes severe
|
|
195
|
+
4. Validate retrieved documents for accuracy before adding to context
|
|
196
|
+
5. Use versioning to prevent outdated information from causing clash
|
|
197
|
+
6. Segment tasks to prevent context confusion across different objectives
|
|
198
|
+
7. Design for graceful degradation rather than assuming perfect conditions
|
|
199
|
+
8. Test with progressively larger contexts to find degradation thresholds
|
|
200
|
+
|
|
201
|
+
## Integration
|
|
202
|
+
|
|
203
|
+
This skill builds on context-fundamentals and should be studied after understanding basic context concepts. It connects to:
|
|
204
|
+
|
|
205
|
+
- context-optimization - Techniques for mitigating degradation
|
|
206
|
+
- multi-agent-patterns - Using isolation to prevent degradation
|
|
207
|
+
- evaluation - Measuring and detecting degradation in production
|
|
208
|
+
|
|
209
|
+
## References
|
|
210
|
+
|
|
211
|
+
Internal reference:
|
|
212
|
+
- [Degradation Patterns Reference](./references/patterns.md) - Detailed technical reference
|
|
213
|
+
|
|
214
|
+
Related skills in this collection:
|
|
215
|
+
- context-fundamentals - Context basics
|
|
216
|
+
- context-optimization - Mitigation techniques
|
|
217
|
+
- evaluation - Detection and measurement
|
|
218
|
+
|
|
219
|
+
External resources:
|
|
220
|
+
- Research on attention mechanisms and context window limitations
|
|
221
|
+
- Studies on the "lost-in-middle" phenomenon
|
|
222
|
+
- Production engineering guides from AI labs
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## Skill Metadata
|
|
227
|
+
|
|
228
|
+
**Created**: 2025-12-20
|
|
229
|
+
**Last Updated**: 2025-12-20
|
|
230
|
+
**Author**: Agent Skills for Context Engineering Contributors
|
|
231
|
+
**Version**: 1.0.0
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "context-degradation",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Diagnose context confusion, attention loss, and degraded agent behavior",
|
|
5
|
+
"triggers": [
|
|
6
|
+
"context degradation",
|
|
7
|
+
"lost in middle",
|
|
8
|
+
"context poisoning",
|
|
9
|
+
"context confusion",
|
|
10
|
+
"attention pattern"
|
|
11
|
+
],
|
|
12
|
+
"applicable_agents": [
|
|
13
|
+
"strategist"
|
|
14
|
+
],
|
|
15
|
+
"max_context_tokens": 2200,
|
|
16
|
+
"entry_file": "SKILL.md"
|
|
17
|
+
}
|
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: debate
|
|
3
|
+
description: "Structured AI debate templates and synthesis. Use when orchestrating multi-round debates between AI tools, 'debate topic', 'argue about', 'stress test idea', 'devil's advocate'."
|
|
4
|
+
version: 5.1.0
|
|
5
|
+
argument-hint: "[topic] [--proposer=tool] [--challenger=tool] [--rounds=N] [--effort=level]"
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# debate
|
|
9
|
+
|
|
10
|
+
Prompt templates, context assembly rules, and synthesis format for structured multi-round debates between AI tools.
|
|
11
|
+
|
|
12
|
+
## Arguments
|
|
13
|
+
|
|
14
|
+
Parse from `$ARGUMENTS`:
|
|
15
|
+
- **topic**: The debate question/topic (required)
|
|
16
|
+
- **--proposer**: Tool for the proposer role (claude, gemini, codex, opencode, copilot)
|
|
17
|
+
- **--challenger**: Tool for the challenger role (must differ from proposer)
|
|
18
|
+
- **--rounds**: Number of back-and-forth rounds (1-5, default: 2)
|
|
19
|
+
- **--effort**: Thinking effort applied to all tool invocations (low, medium, high, max)
|
|
20
|
+
- **--model-proposer**: Specific model for proposer (optional)
|
|
21
|
+
- **--model-challenger**: Specific model for challenger (optional)
|
|
22
|
+
|
|
23
|
+
## Universal Rules
|
|
24
|
+
|
|
25
|
+
ALL participants (proposer AND challenger) MUST support claims with specific evidence (file path, code pattern, benchmark, or documented behavior). Unsupported claims from either side will be flagged by the other participant and noted in the verdict. This applies to every round.
|
|
26
|
+
|
|
27
|
+
## Prompt Templates
|
|
28
|
+
|
|
29
|
+
### Round 1: Proposer Opening
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
You are participating in a structured debate as the PROPOSER.
|
|
33
|
+
|
|
34
|
+
Topic: {topic}
|
|
35
|
+
|
|
36
|
+
Your job: Analyze this topic thoroughly and present your position. Take a clear stance. Do not hedge excessively.
|
|
37
|
+
|
|
38
|
+
You MUST support each claim with specific evidence (file path, code pattern, benchmark, or documented behavior). Unsupported claims will be challenged. "I think" or "generally speaking" without evidence is not acceptable.
|
|
39
|
+
|
|
40
|
+
Provide your analysis:
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### Round 1: Challenger Response
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
You are participating in a structured debate as the CHALLENGER.
|
|
47
|
+
|
|
48
|
+
Topic: {topic}
|
|
49
|
+
|
|
50
|
+
The PROPOSER ({proposer_tool}) argued:
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
{proposer_round1_response}
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
Your job: Find weaknesses, blind spots, and flaws in the proposer's argument. You MUST identify at least one genuine flaw or overlooked consideration before agreeing on anything. Propose concrete alternatives where you disagree.
|
|
57
|
+
|
|
58
|
+
Rules:
|
|
59
|
+
- Do NOT say "great point" or validate the proposer's reasoning before critiquing it
|
|
60
|
+
- Lead with what's WRONG or MISSING, then acknowledge what's right
|
|
61
|
+
- If you genuinely agree on a point, explain what RISK remains despite the agreement
|
|
62
|
+
- Propose at least one concrete alternative approach
|
|
63
|
+
- You MUST address at least these categories: correctness, security implications, and developer experience
|
|
64
|
+
- Do NOT agree with ANY claim unless you can cite specific evidence (file path, code pattern, or documented behavior) that supports the agreement. Unsupported agreement is not allowed.
|
|
65
|
+
- If the proposer makes a claim without evidence, call it out: "This claim is unsupported."
|
|
66
|
+
|
|
67
|
+
Provide your challenge:
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Round 2+: Proposer Defense
|
|
71
|
+
|
|
72
|
+
```
|
|
73
|
+
You are the PROPOSER in round {round} of a structured debate.
|
|
74
|
+
|
|
75
|
+
Topic: {topic}
|
|
76
|
+
|
|
77
|
+
{context_summary}
|
|
78
|
+
|
|
79
|
+
The CHALLENGER ({challenger_tool}) raised these points in round {previous_round}:
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
{challenger_previous_response}
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
Your job: Address each challenge directly. For each point:
|
|
86
|
+
- If they're right, concede explicitly and explain how your position evolves
|
|
87
|
+
- If they're wrong, explain why with specific evidence (file path, code pattern, benchmark, or documented behavior)
|
|
88
|
+
- If it's a tradeoff, acknowledge the tradeoff and explain why you still favor your approach with evidence
|
|
89
|
+
|
|
90
|
+
Every claim you make -- whether concession, rebuttal, or new argument -- MUST cite specific evidence. The challenger will reject unsupported claims.
|
|
91
|
+
|
|
92
|
+
Do NOT simply restate your original position. Your response must show you engaged with the specific challenges raised.
|
|
93
|
+
|
|
94
|
+
Provide your defense:
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Round 2+: Challenger Follow-up
|
|
98
|
+
|
|
99
|
+
```
|
|
100
|
+
You are the CHALLENGER in round {round} of a structured debate.
|
|
101
|
+
|
|
102
|
+
Topic: {topic}
|
|
103
|
+
|
|
104
|
+
{context_summary}
|
|
105
|
+
|
|
106
|
+
The PROPOSER ({proposer_tool}) responded to your challenges:
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
{proposer_previous_response}
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
IMPORTANT: Do NOT let the proposer reframe your challenges as agreements. If they say "we actually agree" but haven't addressed the substance, reject it. Default to suspicion, not acceptance.
|
|
113
|
+
|
|
114
|
+
Your job: Evaluate the proposer's defense. For each point they addressed:
|
|
115
|
+
- Did they dodge, superficially address, or respond without evidence? Call it out: "This defense is unsupported" or "This dodges the original concern"
|
|
116
|
+
- Did they concede any point? Hold them to it -- they cannot walk it back later without new evidence
|
|
117
|
+
- Are there NEW weaknesses in their revised position?
|
|
118
|
+
- Did they adequately address your concern with specific evidence? Only then acknowledge it, and cite what convinced you
|
|
119
|
+
|
|
120
|
+
You MUST either identify at least one new weakness or unresolved concern, OR explicitly certify a previous concern as genuinely resolved with specific evidence for why you're now satisfied. "I'm convinced because [evidence]" is acceptable. "I agree now" without evidence is not.
|
|
121
|
+
If you see new problems, raise them.
|
|
122
|
+
|
|
123
|
+
Provide your follow-up:
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Context Assembly
|
|
127
|
+
|
|
128
|
+
### Rounds 1-2: Full context
|
|
129
|
+
|
|
130
|
+
Include the full text of all prior exchanges in the prompt. Context is small enough (typically under 5000 tokens total).
|
|
131
|
+
|
|
132
|
+
Format for context block:
|
|
133
|
+
```
|
|
134
|
+
Previous exchanges:
|
|
135
|
+
|
|
136
|
+
Round 1 - Proposer ({proposer_tool}):
|
|
137
|
+
{full response}
|
|
138
|
+
|
|
139
|
+
Round 1 - Challenger ({challenger_tool}):
|
|
140
|
+
{full response}
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Round 3+: Summarized context
|
|
144
|
+
|
|
145
|
+
For rounds 3 and beyond, replace full exchange text from rounds 1 through N-2 with a summary. Only include the most recent round's responses in full.
|
|
146
|
+
|
|
147
|
+
Format:
|
|
148
|
+
```
|
|
149
|
+
Summary of rounds 1-{N-2}:
|
|
150
|
+
{summary of key positions, agreements, and open disagreements}
|
|
151
|
+
|
|
152
|
+
Round {N-1} - Proposer ({proposer_tool}):
|
|
153
|
+
{full response}
|
|
154
|
+
|
|
155
|
+
Round {N-1} - Challenger ({challenger_tool}):
|
|
156
|
+
{full response}
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
The orchestrator agent (opus) generates the summary. Target: 500-800 tokens. MUST preserve:
|
|
160
|
+
- Each side's core position
|
|
161
|
+
- All concessions (verbatim quotes, not paraphrased)
|
|
162
|
+
- All evidence citations that support agreements
|
|
163
|
+
- Points of disagreement (unresolved)
|
|
164
|
+
- Any contradictions between rounds (e.g., proposer concedes in round 1 but walks it back in round 2 -- note both explicitly)
|
|
165
|
+
|
|
166
|
+
## Synthesis Format
|
|
167
|
+
|
|
168
|
+
After all rounds complete, the orchestrator produces this structured output:
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
## Debate Summary
|
|
172
|
+
|
|
173
|
+
**Topic**: {topic}
|
|
174
|
+
**Proposer**: {proposer_tool} ({proposer_model})
|
|
175
|
+
**Challenger**: {challenger_tool} ({challenger_model})
|
|
176
|
+
**Rounds**: {rounds_completed}
|
|
177
|
+
**Rigor**: Structured perspective comparison (prompt-enforced adversarial rules, no deterministic verification)
|
|
178
|
+
|
|
179
|
+
### Verdict
|
|
180
|
+
|
|
181
|
+
{winner_tool} had the stronger argument because: {specific reasoning citing debate evidence}
|
|
182
|
+
|
|
183
|
+
### Debate Quality
|
|
184
|
+
|
|
185
|
+
Rate the debate on these dimensions:
|
|
186
|
+
- **Genuine disagreement**: Did the challenger maintain independent positions, or converge toward the proposer? (high/medium/low)
|
|
187
|
+
- **Evidence quality**: Did both sides cite specific examples, or argue from generalities? (high/medium/low)
|
|
188
|
+
- **Challenge depth**: Were the challenges substantive, or surface-level? (high/medium/low)
|
|
189
|
+
|
|
190
|
+
### Key Agreements
|
|
191
|
+
- {agreed point 1} (evidence: {what supports this agreement})
|
|
192
|
+
- {agreed point 2} (evidence: {what supports this agreement})
|
|
193
|
+
|
|
194
|
+
### Key Disagreements
|
|
195
|
+
- {point}: {proposer_tool} argues {X}, {challenger_tool} argues {Y}
|
|
196
|
+
|
|
197
|
+
### Unresolved Questions
|
|
198
|
+
- {question that neither side adequately addressed}
|
|
199
|
+
|
|
200
|
+
### Recommendation
|
|
201
|
+
{Orchestrator's recommendation - must pick a direction, not "both have merit"}
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
**Synthesis rules:**
|
|
205
|
+
- The verdict MUST pick a side. "Both approaches have merit" is NOT acceptable.
|
|
206
|
+
- Cite specific arguments from the debate as evidence for the verdict.
|
|
207
|
+
- The recommendation must be actionable - what should the user DO based on this debate.
|
|
208
|
+
- Unresolved questions highlight where the debate fell short, not where both sides are "equally valid."
|
|
209
|
+
|
|
210
|
+
## State File Schema
|
|
211
|
+
|
|
212
|
+
Save to `{AI_STATE_DIR}/debate/last-debate.json`:
|
|
213
|
+
|
|
214
|
+
```json
|
|
215
|
+
{
|
|
216
|
+
"id": "debate-{ISO timestamp}-{4 char random hex}",
|
|
217
|
+
"topic": "original topic text",
|
|
218
|
+
"proposer": {"tool": "claude", "model": "opus"},
|
|
219
|
+
"challenger": {"tool": "gemini", "model": "gemini-3.1-pro-preview"},
|
|
220
|
+
"effort": "high",
|
|
221
|
+
"rounds_completed": 2,
|
|
222
|
+
"max_rounds": 2,
|
|
223
|
+
"status": "completed",
|
|
224
|
+
"exchanges": [
|
|
225
|
+
{"round": 1, "role": "proposer", "tool": "claude", "response": "...", "duration_ms": 8500},
|
|
226
|
+
{"round": 1, "role": "challenger", "tool": "gemini", "response": "...", "duration_ms": 12000},
|
|
227
|
+
{"round": 2, "role": "proposer", "tool": "claude", "response": "...", "duration_ms": 9200},
|
|
228
|
+
{"round": 2, "role": "challenger", "tool": "gemini", "response": "...", "duration_ms": 11000}
|
|
229
|
+
],
|
|
230
|
+
"verdict": {
|
|
231
|
+
"winner": "claude",
|
|
232
|
+
"reasoning": "...",
|
|
233
|
+
"agreements": ["..."],
|
|
234
|
+
"disagreements": ["..."],
|
|
235
|
+
"recommendation": "..."
|
|
236
|
+
},
|
|
237
|
+
"timestamp": "{ISO 8601 timestamp}"
|
|
238
|
+
}
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
Platform state directory:
|
|
242
|
+
- Claude Code: `.claude/`
|
|
243
|
+
- OpenCode: `.opencode/`
|
|
244
|
+
- Codex CLI: `.codex/`
|
|
245
|
+
|
|
246
|
+
## Error Handling
|
|
247
|
+
|
|
248
|
+
| Error | Action |
|
|
249
|
+
|-------|--------|
|
|
250
|
+
| Proposer fails round 1 | Abort debate. Cannot proceed without opening position. |
|
|
251
|
+
| Challenger fails round 1 | Show proposer's position with note: "[WARN] Challenger failed. Showing proposer's uncontested position." |
|
|
252
|
+
| Any tool fails mid-debate | Synthesize from completed rounds. Note incomplete round in output. |
|
|
253
|
+
| Tool invocation timeout (>240s) | Round 1 proposer: abort. Round 1 challenger: proceed with uncontested. Round 2+: synthesize from completed rounds with timeout note. |
|
|
254
|
+
| Consult result envelope indicates failure (status/exit/error/empty output) | Treat as tool failure for that role/round and apply the same role+round policy above. |
|
|
255
|
+
| Structured parse fails after successful envelope | Treat as tool failure for that role/round, include only sanitized parse metadata (`PARSE_ERROR:<type>:<code>`, redact secrets, strip control chars, max 200 chars), then apply the same role+round policy above. |
|
|
256
|
+
| All rounds timeout | "[ERROR] Debate failed: all tool invocations timed out." |
|
|
257
|
+
| No successful exchanges recorded (non-timeout) | "[ERROR] Debate failed: no successful exchanges were recorded." |
|
|
258
|
+
|
|
259
|
+
## External Tool Quick Reference
|
|
260
|
+
|
|
261
|
+
> Canonical source: `plugins/consult/skills/consult/SKILL.md`. Build and execute CLI commands directly using these templates. Do NOT invoke via `Skill: consult` - in Claude Code that loads the interactive command wrapper and causes a recursive loop. Write the question to `{AI_STATE_DIR}/consult/question.tmp` first, then execute the command via Bash.
|
|
262
|
+
|
|
263
|
+
### Safe Command Patterns
|
|
264
|
+
|
|
265
|
+
| Provider | Safe Command Pattern |
|
|
266
|
+
|----------|---------------------|
|
|
267
|
+
| Claude | `claude -p - --output-format json --model "MODEL" --max-turns TURNS --allowedTools "Read,Glob,Grep" < "{AI_STATE_DIR}/consult/question.tmp"` |
|
|
268
|
+
| Gemini | `gemini -p - --output-format json -m "MODEL" < "{AI_STATE_DIR}/consult/question.tmp"` |
|
|
269
|
+
| Codex | `codex exec "$(cat "{AI_STATE_DIR}/consult/question.tmp")" --json -m "MODEL" -c model_reasoning_effort="LEVEL"` |
|
|
270
|
+
| OpenCode | `opencode run - --format json --model "MODEL" --variant "VARIANT" < "{AI_STATE_DIR}/consult/question.tmp"` |
|
|
271
|
+
| Copilot | `copilot -p - < "{AI_STATE_DIR}/consult/question.tmp"` |
|
|
272
|
+
|
|
273
|
+
### Effort-to-Model Mapping
|
|
274
|
+
|
|
275
|
+
| Effort | Claude | Gemini | Codex | OpenCode | Copilot |
|
|
276
|
+
|--------|--------|--------|-------|----------|---------|
|
|
277
|
+
| low | claude-haiku-4-5 (1 turn) | gemini-3-flash-preview | gpt-5.3-codex (low) | default (low) | no control |
|
|
278
|
+
| medium | claude-sonnet-4-6 (3 turns) | gemini-3-flash-preview | gpt-5.3-codex (medium) | default (medium) | no control |
|
|
279
|
+
| high | claude-opus-4-6 (5 turns) | gemini-3.1-pro-preview | gpt-5.3-codex (high) | default (high) | no control |
|
|
280
|
+
| max | claude-opus-4-6 (10 turns) | gemini-3.1-pro-preview | gpt-5.3-codex (high) | default + --thinking | no control |
|
|
281
|
+
|
|
282
|
+
### Output Parsing
|
|
283
|
+
|
|
284
|
+
| Provider | Parse Expression |
|
|
285
|
+
|----------|-----------------|
|
|
286
|
+
| Claude | `JSON.parse(stdout).result` |
|
|
287
|
+
| Gemini | `JSON.parse(stdout).response` |
|
|
288
|
+
| Codex | `JSON.parse(stdout).message` or raw text |
|
|
289
|
+
| OpenCode | Newline-delimited JSON. Concatenate `part.text` from events where `type === "text"`. Session ID from `event.sessionID`. |
|
|
290
|
+
| Copilot | Raw stdout text |
|
|
291
|
+
|
|
292
|
+
Parse discipline:
|
|
293
|
+
1. Evaluate execution status first (timeout/non-zero/error/empty output) before any parsing.
|
|
294
|
+
2. Parse only when execution status is successful.
|
|
295
|
+
3. If parse fails, surface only sanitized parse metadata (never raw stdout/stderr snippets) and apply role/round failure policy instead of hanging or continuing silently.
|
|
296
|
+
|
|
297
|
+
### ACP Transport Commands
|
|
298
|
+
|
|
299
|
+
> ACP is an alternative transport available when providers support it. Build and execute CLI commands directly - do NOT use `Skill: consult` (recursive loop in Claude Code).
|
|
300
|
+
|
|
301
|
+
| Provider | ACP Command Pattern |
|
|
302
|
+
|----------|-------------------|
|
|
303
|
+
| Claude | `node acp/run.js --provider="claude" --question-file="{AI_STATE_DIR}/consult/question.tmp" --timeout=240000 --model="MODEL"` |
|
|
304
|
+
| Gemini | `node acp/run.js --provider="gemini" --question-file="{AI_STATE_DIR}/consult/question.tmp" --timeout=240000 --model="MODEL"` |
|
|
305
|
+
| Codex | `node acp/run.js --provider="codex" --question-file="{AI_STATE_DIR}/consult/question.tmp" --timeout=240000 --model="MODEL"` |
|
|
306
|
+
| OpenCode | `node acp/run.js --provider="opencode" --question-file="{AI_STATE_DIR}/consult/question.tmp" --timeout=240000 --model="MODEL"` |
|
|
307
|
+
| Copilot | `node acp/run.js --provider="copilot" --question-file="{AI_STATE_DIR}/consult/question.tmp" --timeout=240000` |
|
|
308
|
+
| Kiro | `node acp/run.js --provider="kiro" --question-file="{AI_STATE_DIR}/consult/question.tmp" --timeout=240000` |
|
|
309
|
+
|
|
310
|
+
Note the 240000ms timeout (240s) for debate rounds vs 120000ms (120s) for consult.
|
|
311
|
+
|
|
312
|
+
**Kiro**: ACP-only provider. No CLI mode. Available when `kiro-cli` is on PATH.
|
|
313
|
+
|
|
314
|
+
### ACP Output Parsing
|
|
315
|
+
|
|
316
|
+
ACP transport output is parsed identically to CLI transport - the ACP runner (`acp/run.js`) normalizes responses into the same JSON envelope format. The `transport` field in the envelope indicates `"acp"` or `"cli"`.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "debate",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Structured debate templates for stress-testing positions and solutions",
|
|
5
|
+
"triggers": [
|
|
6
|
+
"debate",
|
|
7
|
+
"argue",
|
|
8
|
+
"stress test",
|
|
9
|
+
"devils advocate",
|
|
10
|
+
"counter argument"
|
|
11
|
+
],
|
|
12
|
+
"applicable_agents": [
|
|
13
|
+
"critic",
|
|
14
|
+
"strategist",
|
|
15
|
+
"librarian"
|
|
16
|
+
],
|
|
17
|
+
"max_context_tokens": 2200,
|
|
18
|
+
"entry_file": "SKILL.md"
|
|
19
|
+
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "design-first",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Design the approach before implementation starts",
|
|
5
|
+
"triggers": [
|
|
6
|
+
"design",
|
|
7
|
+
"architecture",
|
|
8
|
+
"plan",
|
|
9
|
+
"schema",
|
|
10
|
+
"api",
|
|
11
|
+
"interface"
|
|
12
|
+
],
|
|
13
|
+
"applicable_agents": [
|
|
14
|
+
"planner",
|
|
15
|
+
"worker",
|
|
16
|
+
"heavy-worker"
|
|
17
|
+
],
|
|
18
|
+
"max_context_tokens": 1500,
|
|
19
|
+
"entry_file": "SKILL.md"
|
|
20
|
+
}
|