opencode-multiagent 0.2.0 → 0.3.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. package/AGENTS.md +62 -0
  2. package/CHANGELOG.md +18 -0
  3. package/CONTRIBUTING.md +36 -0
  4. package/README.md +41 -165
  5. package/README.tr.md +84 -0
  6. package/RELEASE.md +68 -0
  7. package/agents/advisor.md +9 -6
  8. package/agents/auditor.md +8 -6
  9. package/agents/critic.md +19 -10
  10. package/agents/deep-worker.md +11 -7
  11. package/agents/devil.md +3 -1
  12. package/agents/executor.md +20 -19
  13. package/agents/heavy-worker.md +11 -7
  14. package/agents/lead.md +22 -30
  15. package/agents/librarian.md +6 -2
  16. package/agents/planner.md +18 -10
  17. package/agents/qa.md +9 -6
  18. package/agents/quick.md +12 -7
  19. package/agents/reviewer.md +9 -6
  20. package/agents/scout.md +9 -5
  21. package/agents/scribe.md +33 -28
  22. package/agents/strategist.md +10 -7
  23. package/agents/ui-heavy-worker.md +11 -7
  24. package/agents/ui-worker.md +12 -7
  25. package/agents/validator.md +8 -5
  26. package/agents/worker.md +12 -7
  27. package/commands/execute.md +1 -0
  28. package/commands/init-deep.md +1 -0
  29. package/commands/init.md +1 -0
  30. package/commands/inspect.md +1 -0
  31. package/commands/plan.md +1 -0
  32. package/commands/quality.md +1 -0
  33. package/commands/review.md +1 -0
  34. package/commands/status.md +1 -0
  35. package/defaults/opencode-multiagent.json +223 -0
  36. package/defaults/opencode-multiagent.schema.json +249 -0
  37. package/dist/control-plane.d.ts +4 -0
  38. package/dist/control-plane.d.ts.map +1 -0
  39. package/dist/index.d.ts +5 -0
  40. package/dist/index.d.ts.map +1 -0
  41. package/dist/index.js +1583 -0
  42. package/dist/opencode-multiagent/compiler.d.ts +19 -0
  43. package/dist/opencode-multiagent/compiler.d.ts.map +1 -0
  44. package/dist/opencode-multiagent/constants.d.ts +116 -0
  45. package/dist/opencode-multiagent/constants.d.ts.map +1 -0
  46. package/dist/opencode-multiagent/defaults.d.ts +10 -0
  47. package/dist/opencode-multiagent/defaults.d.ts.map +1 -0
  48. package/dist/opencode-multiagent/file-lock.d.ts +15 -0
  49. package/dist/opencode-multiagent/file-lock.d.ts.map +1 -0
  50. package/dist/opencode-multiagent/hooks.d.ts +62 -0
  51. package/dist/opencode-multiagent/hooks.d.ts.map +1 -0
  52. package/dist/opencode-multiagent/log.d.ts +2 -0
  53. package/dist/opencode-multiagent/log.d.ts.map +1 -0
  54. package/dist/opencode-multiagent/markdown.d.ts +8 -0
  55. package/dist/opencode-multiagent/markdown.d.ts.map +1 -0
  56. package/dist/opencode-multiagent/mcp.d.ts +3 -0
  57. package/dist/opencode-multiagent/mcp.d.ts.map +1 -0
  58. package/dist/opencode-multiagent/policy.d.ts +5 -0
  59. package/dist/opencode-multiagent/policy.d.ts.map +1 -0
  60. package/dist/opencode-multiagent/quality.d.ts +14 -0
  61. package/dist/opencode-multiagent/quality.d.ts.map +1 -0
  62. package/dist/opencode-multiagent/runtime.d.ts +7 -0
  63. package/dist/opencode-multiagent/runtime.d.ts.map +1 -0
  64. package/dist/opencode-multiagent/session-tracker.d.ts +32 -0
  65. package/dist/opencode-multiagent/session-tracker.d.ts.map +1 -0
  66. package/dist/opencode-multiagent/skills.d.ts +17 -0
  67. package/dist/opencode-multiagent/skills.d.ts.map +1 -0
  68. package/dist/opencode-multiagent/supervision.d.ts +12 -0
  69. package/dist/opencode-multiagent/supervision.d.ts.map +1 -0
  70. package/dist/opencode-multiagent/task-manager.d.ts +48 -0
  71. package/dist/opencode-multiagent/task-manager.d.ts.map +1 -0
  72. package/dist/opencode-multiagent/telemetry.d.ts +26 -0
  73. package/dist/opencode-multiagent/telemetry.d.ts.map +1 -0
  74. package/dist/opencode-multiagent/tools.d.ts +56 -0
  75. package/dist/opencode-multiagent/tools.d.ts.map +1 -0
  76. package/dist/opencode-multiagent/types.d.ts +36 -0
  77. package/dist/opencode-multiagent/types.d.ts.map +1 -0
  78. package/dist/opencode-multiagent/utils.d.ts +9 -0
  79. package/dist/opencode-multiagent/utils.d.ts.map +1 -0
  80. package/docs/agents.md +260 -0
  81. package/docs/agents.tr.md +260 -0
  82. package/docs/configuration.md +255 -0
  83. package/docs/configuration.tr.md +255 -0
  84. package/docs/usage-guide.md +226 -0
  85. package/docs/usage-guide.tr.md +227 -0
  86. package/examples/opencode.with-overrides.json +1 -5
  87. package/package.json +23 -13
  88. package/skills/advanced-evaluation/SKILL.md +37 -21
  89. package/skills/advanced-evaluation/manifest.json +2 -13
  90. package/skills/cek-context-engineering/SKILL.md +159 -87
  91. package/skills/cek-context-engineering/manifest.json +1 -3
  92. package/skills/cek-prompt-engineering/SKILL.md +13 -10
  93. package/skills/cek-prompt-engineering/manifest.json +1 -3
  94. package/skills/cek-test-prompt/SKILL.md +38 -28
  95. package/skills/cek-test-prompt/manifest.json +1 -3
  96. package/skills/cek-thought-based-reasoning/SKILL.md +75 -21
  97. package/skills/cek-thought-based-reasoning/manifest.json +1 -3
  98. package/skills/context-degradation/SKILL.md +14 -13
  99. package/skills/context-degradation/manifest.json +1 -3
  100. package/skills/debate/SKILL.md +23 -78
  101. package/skills/debate/manifest.json +2 -12
  102. package/skills/design-first/manifest.json +2 -13
  103. package/skills/dispatching-parallel-agents/SKILL.md +14 -3
  104. package/skills/dispatching-parallel-agents/manifest.json +1 -4
  105. package/skills/drift-analysis/SKILL.md +50 -29
  106. package/skills/drift-analysis/manifest.json +2 -12
  107. package/skills/evaluation/manifest.json +2 -12
  108. package/skills/executing-plans/SKILL.md +15 -8
  109. package/skills/executing-plans/manifest.json +1 -3
  110. package/skills/handoff-protocols/manifest.json +2 -12
  111. package/skills/parallel-investigation/SKILL.md +25 -12
  112. package/skills/parallel-investigation/manifest.json +1 -4
  113. package/skills/reflexion-critique/SKILL.md +21 -10
  114. package/skills/reflexion-critique/manifest.json +1 -3
  115. package/skills/reflexion-reflect/SKILL.md +36 -34
  116. package/skills/reflexion-reflect/manifest.json +2 -10
  117. package/skills/root-cause-analysis/manifest.json +2 -13
  118. package/skills/sadd-judge-with-debate/SKILL.md +50 -26
  119. package/skills/sadd-judge-with-debate/manifest.json +1 -3
  120. package/skills/structured-code-review/manifest.json +2 -11
  121. package/skills/task-decomposition/manifest.json +2 -13
  122. package/skills/verification-before-completion/manifest.json +2 -15
  123. package/skills/verification-gates/SKILL.md +27 -19
  124. package/skills/verification-gates/manifest.json +2 -12
  125. package/defaults/agent-settings.json +0 -102
  126. package/defaults/agent-settings.schema.json +0 -25
  127. package/defaults/flags.json +0 -35
  128. package/defaults/flags.schema.json +0 -119
  129. package/defaults/mcp-defaults.json +0 -47
  130. package/defaults/mcp-defaults.schema.json +0 -38
  131. package/defaults/profiles.json +0 -53
  132. package/defaults/profiles.schema.json +0 -60
  133. package/defaults/team-profiles.json +0 -83
  134. package/src/control-plane.ts +0 -21
  135. package/src/index.ts +0 -8
  136. package/src/opencode-multiagent/compiler.ts +0 -168
  137. package/src/opencode-multiagent/constants.ts +0 -178
  138. package/src/opencode-multiagent/file-lock.ts +0 -90
  139. package/src/opencode-multiagent/hooks.ts +0 -599
  140. package/src/opencode-multiagent/log.ts +0 -12
  141. package/src/opencode-multiagent/mailbox.ts +0 -287
  142. package/src/opencode-multiagent/markdown.ts +0 -99
  143. package/src/opencode-multiagent/mcp.ts +0 -35
  144. package/src/opencode-multiagent/policy.ts +0 -67
  145. package/src/opencode-multiagent/quality.ts +0 -140
  146. package/src/opencode-multiagent/runtime.ts +0 -55
  147. package/src/opencode-multiagent/skills.ts +0 -144
  148. package/src/opencode-multiagent/supervision.ts +0 -156
  149. package/src/opencode-multiagent/task-manager.ts +0 -148
  150. package/src/opencode-multiagent/team-manager.ts +0 -219
  151. package/src/opencode-multiagent/team-tools.ts +0 -359
  152. package/src/opencode-multiagent/telemetry.ts +0 -124
  153. package/src/opencode-multiagent/utils.ts +0 -54
@@ -9,19 +9,18 @@ description: Use when tackling complex reasoning tasks requiring step-by-step lo
9
9
 
10
10
  Chain-of-Thought (CoT) prompting and its variants encourage LLMs to generate intermediate reasoning steps before arriving at a final answer, significantly improving performance on complex reasoning tasks. These techniques transform how models approach problems by making implicit reasoning explicit.
11
11
 
12
-
13
12
  ## Quick Reference
14
13
 
15
- | Technique | When to Use | Complexity | Accuracy Gain |
16
- |-----------|-------------|------------|---------------|
17
- | Zero-shot CoT | Quick reasoning, no examples available | Low | +20-60% |
18
- | Few-shot CoT | Have good examples, consistent format needed | Medium | +30-70% |
19
- | Self-Consistency | High-stakes decisions, need confidence | Medium | +10-20% over CoT |
20
- | Tree of Thoughts | Complex problems requiring exploration | High | +50-70% on hard tasks |
21
- | Least-to-Most | Multi-step problems with subproblems | Medium | +30-80% |
22
- | ReAct | Tasks requiring external information | Medium | +15-35% |
23
- | PAL | Mathematical/computational problems | Medium | +10-15% |
24
- | Reflexion | Iterative improvement, learning from errors | High | +10-20% |
14
+ | Technique | When to Use | Complexity | Accuracy Gain |
15
+ | ---------------- | -------------------------------------------- | ---------- | --------------------- |
16
+ | Zero-shot CoT | Quick reasoning, no examples available | Low | +20-60% |
17
+ | Few-shot CoT | Have good examples, consistent format needed | Medium | +30-70% |
18
+ | Self-Consistency | High-stakes decisions, need confidence | Medium | +10-20% over CoT |
19
+ | Tree of Thoughts | Complex problems requiring exploration | High | +50-70% on hard tasks |
20
+ | Least-to-Most | Multi-step problems with subproblems | Medium | +30-80% |
21
+ | ReAct | Tasks requiring external information | Medium | +15-35% |
22
+ | PAL | Mathematical/computational problems | Medium | +10-15% |
23
+ | Reflexion | Iterative improvement, learning from errors | High | +10-20% |
25
24
 
26
25
  ---
27
26
 
@@ -33,12 +32,14 @@ Chain-of-Thought (CoT) prompting and its variants encourage LLMs to generate int
33
32
  **Citations**: 14,255+
34
33
 
35
34
  #### When to Use
35
+
36
36
  - Multi-step arithmetic or math word problems
37
37
  - Commonsense reasoning requiring logical deduction
38
38
  - Symbolic reasoning tasks
39
39
  - When you have good exemplars showing reasoning
40
40
 
41
41
  #### How It Works
42
+
42
43
  Provide few-shot examples that include intermediate reasoning steps, not just question-answer pairs. The model learns to generate similar step-by-step reasoning.
43
44
 
44
45
  #### Prompt Template
@@ -55,11 +56,13 @@ A:
55
56
  ```
56
57
 
57
58
  #### Strengths
59
+
58
60
  - Significant accuracy improvements on reasoning tasks
59
61
  - Interpretable intermediate steps
60
62
  - Works well with large models (>100B parameters)
61
63
 
62
64
  #### Limitations
65
+
63
66
  - Requires crafting good exemplars
64
67
  - Less effective on smaller models
65
68
  - Can still make calculation errors
@@ -72,12 +75,14 @@ A:
72
75
  **Citations**: 5,985+
73
76
 
74
77
  #### When to Use
78
+
75
79
  - No exemplars available
76
80
  - Quick reasoning needed
77
81
  - General-purpose reasoning across task types
78
82
  - Prototyping before creating few-shot examples
79
83
 
80
84
  #### How It Works
85
+
81
86
  Simply append "Let's think step by step" (or similar phrase) to the prompt. This triggers the model to generate reasoning steps without any examples.
82
87
 
83
88
  #### Prompt Template
@@ -89,6 +94,7 @@ Let's think step by step.
89
94
  ```
90
95
 
91
96
  **Alternative trigger phrases**:
97
+
92
98
  - "Let's work this out step by step to be sure we have the right answer."
93
99
  - "Let's break this down."
94
100
  - "Let's approach this systematically."
@@ -97,23 +103,27 @@ Let's think step by step.
97
103
  #### Two-Stage Approach (More Robust)
98
104
 
99
105
  **Stage 1 - Reasoning Extraction**:
106
+
100
107
  ```
101
108
  Q: [QUESTION]
102
109
  A: Let's think step by step.
103
110
  ```
104
111
 
105
112
  **Stage 2 - Answer Extraction**:
113
+
106
114
  ```
107
115
  [REASONING FROM STAGE 1]
108
116
  Therefore, the answer is
109
117
  ```
110
118
 
111
119
  #### Strengths
120
+
112
121
  - No exemplar crafting required
113
122
  - Generalizes across task types
114
123
  - Simple to implement
115
124
 
116
125
  #### Limitations
126
+
117
127
  - Less effective than few-shot CoT
118
128
  - Can produce verbose or irrelevant reasoning
119
129
  - Sensitive to exact phrasing
@@ -126,12 +136,14 @@ Therefore, the answer is
126
136
  **Citations**: 5,379+
127
137
 
128
138
  #### When to Use
139
+
129
140
  - High-stakes decisions requiring confidence
130
141
  - Problems with multiple valid reasoning paths
131
142
  - When you need to reduce variance in outputs
132
143
  - Verification of reasoning correctness
133
144
 
134
145
  #### How It Works
146
+
135
147
  Sample multiple diverse reasoning paths, then select the most consistent answer via majority voting. The intuition: correct answers can be reached through multiple reasoning paths.
136
148
 
137
149
  #### Prompt Template
@@ -161,11 +173,13 @@ def self_consistency(prompt, n_samples=5, temperature=0.7):
161
173
  ```
162
174
 
163
175
  #### Strengths
176
+
164
177
  - Significant accuracy boost over single-path CoT
165
178
  - Provides confidence measure (agreement level)
166
179
  - Task-agnostic improvement
167
180
 
168
181
  #### Limitations
182
+
169
183
  - Higher computational cost (N times more generations)
170
184
  - Requires extractable discrete answers
171
185
  - Diminishing returns beyond ~10-20 samples
@@ -178,17 +192,20 @@ def self_consistency(prompt, n_samples=5, temperature=0.7):
178
192
  **Citations**: 3,026+
179
193
 
180
194
  #### When to Use
195
+
181
196
  - Complex problems requiring exploration/backtracking
182
197
  - Tasks where initial decisions are pivotal
183
198
  - Creative problem-solving (writing, puzzles)
184
199
  - When CoT alone achieves <50% accuracy
185
200
 
186
201
  #### How It Works
202
+
187
203
  Generalize CoT to a tree structure where each node is a "thought" (coherent language unit). Uses search algorithms (BFS/DFS) with self-evaluation to explore and select promising reasoning paths.
188
204
 
189
205
  #### Prompt Template
190
206
 
191
207
  **Thought Generation**:
208
+
192
209
  ```
193
210
  Given the current state:
194
211
  [STATE]
@@ -197,6 +214,7 @@ Generate 3-5 possible next steps to solve this problem.
197
214
  ```
198
215
 
199
216
  **State Evaluation**:
217
+
200
218
  ```
201
219
  Evaluate if the following partial solution is:
202
220
  - "sure" (definitely leads to solution)
@@ -208,6 +226,7 @@ Partial solution:
208
226
  ```
209
227
 
210
228
  **BFS/DFS Search**:
229
+
211
230
  ```python
212
231
  def tree_of_thoughts(problem, max_depth=3, beam_width=3):
213
232
  queue = [(problem, [])] # (state, thought_path)
@@ -249,11 +268,13 @@ Evaluation: "impossible" - no way to get 24 from these
249
268
  ```
250
269
 
251
270
  #### Strengths
271
+
252
272
  - Dramatically improves performance on hard tasks (4% → 74% on Game of 24)
253
273
  - Enables backtracking and exploration
254
274
  - Self-evaluation catches errors early
255
275
 
256
276
  #### Limitations
277
+
257
278
  - Significantly higher computational cost
258
279
  - Requires task-specific thought decomposition
259
280
  - Complex to implement
@@ -266,19 +287,23 @@ Evaluation: "impossible" - no way to get 24 from these
266
287
  **Citations**: 1,466+
267
288
 
268
289
  #### When to Use
290
+
269
291
  - Problems harder than your exemplars
270
292
  - Compositional generalization tasks
271
293
  - Multi-step problems with clear subproblems
272
294
  - Symbol manipulation and SCAN-like tasks
273
295
 
274
296
  #### How It Works
297
+
275
298
  Two-stage process:
299
+
276
300
  1. **Decomposition**: Break complex problem into simpler subproblems
277
301
  2. **Sequential Solving**: Solve subproblems in order, using previous answers
278
302
 
279
303
  #### Prompt Template
280
304
 
281
305
  **Stage 1: Decomposition**
306
+
282
307
  ```
283
308
  Q: Four years ago, Kody was only half as old as Mohamed. If Mohamed is currently twice as old as 30 years old, how old is Kody?
284
309
 
@@ -289,6 +314,7 @@ Then we need to solve:
289
314
  ```
290
315
 
291
316
  **Stage 2: Sequential Solving**
317
+
292
318
  ```
293
319
  Q: If Mohamed is currently twice as old as 30 years old, how old is Mohamed?
294
320
  A: Mohamed is twice as old as 30, so Mohamed is 30 * 2 = 60 years old.
@@ -300,11 +326,13 @@ The answer is 32.
300
326
  ```
301
327
 
302
328
  #### Strengths
329
+
303
330
  - Excellent at generalizing to harder problems
304
331
  - Works well on compositional tasks
305
332
  - Explicit problem decomposition aids interpretability
306
333
 
307
334
  #### Limitations
335
+
308
336
  - Requires two-stage prompting
309
337
  - Decomposition step can fail on novel structures
310
338
  - More complex setup than single-stage CoT
@@ -317,12 +345,14 @@ The answer is 32.
317
345
  **Citations**: 5,012+
318
346
 
319
347
  #### When to Use
348
+
320
349
  - Tasks requiring external information (search, APIs)
321
350
  - Interactive decision-making environments
322
351
  - Multi-hop question answering
323
352
  - When pure reasoning leads to hallucination
324
353
 
325
354
  #### How It Works
355
+
326
356
  Interleave reasoning traces ("Thought") with actions ("Action") and observations ("Observation"). Reasoning helps plan actions; actions provide new information for reasoning.
327
357
 
328
358
  #### Prompt Template
@@ -351,16 +381,19 @@ Action 5: Finish[1,800 to 7,000 ft]
351
381
  ```
352
382
 
353
383
  #### Action Types
384
+
354
385
  - `Search[query]` - Search for information
355
386
  - `Lookup[keyword]` - Look up keyword in current context
356
387
  - `Finish[answer]` - Return final answer
357
388
 
358
389
  #### Strengths
390
+
359
391
  - Reduces hallucination by grounding in external knowledge
360
392
  - Interpretable action traces
361
393
  - Handles exceptions through adaptive reasoning
362
394
 
363
395
  #### Limitations
396
+
364
397
  - Requires integration with external tools
365
398
  - More complex orchestration
366
399
  - Action space must be defined
@@ -373,12 +406,14 @@ Action 5: Finish[1,800 to 7,000 ft]
373
406
  **Citations**: 608+
374
407
 
375
408
  #### When to Use
409
+
376
410
  - Mathematical/arithmetic reasoning
377
411
  - Problems requiring precise computation
378
412
  - Symbolic manipulation
379
413
  - When CoT makes calculation errors
380
414
 
381
415
  #### How It Works
416
+
382
417
  Generate code (typically Python) instead of natural language reasoning. Execute the code to get the answer. The LLM handles decomposition; the interpreter handles computation.
383
418
 
384
419
  #### Prompt Template
@@ -410,11 +445,13 @@ def solution():
410
445
  ```
411
446
 
412
447
  #### Strengths
448
+
413
449
  - Eliminates arithmetic errors
414
450
  - Clear variable naming aids interpretability
415
451
  - Leverages code execution for verification
416
452
 
417
453
  #### Limitations
454
+
418
455
  - Requires code interpreter
419
456
  - Not suitable for non-computational reasoning
420
457
  - Model must generate syntactically correct code
@@ -427,12 +464,14 @@ def solution():
427
464
  **Citations**: 838+
428
465
 
429
466
  #### When to Use
467
+
430
468
  - No manually crafted exemplars available
431
469
  - Want to automate few-shot CoT setup
432
470
  - Scaling CoT to many tasks
433
471
  - When zero-shot CoT isn't sufficient
434
472
 
435
473
  #### How It Works
474
+
436
475
  1. Cluster questions by diversity
437
476
  2. Use Zero-shot CoT to generate reasoning chains for representative questions
438
477
  3. Use these auto-generated chains as few-shot exemplars
@@ -440,6 +479,7 @@ def solution():
440
479
  #### Prompt Template
441
480
 
442
481
  **Step 1: Generate diverse demonstrations**
482
+
443
483
  ```python
444
484
  # Cluster questions
445
485
  clusters = cluster_questions(all_questions, k=8)
@@ -453,6 +493,7 @@ for cluster in clusters:
453
493
  ```
454
494
 
455
495
  **Step 2: Use as few-shot exemplars**
496
+
456
497
  ```
457
498
  Q: [Demo question 1]
458
499
  A: Let's think step by step. [Generated reasoning 1]
@@ -467,11 +508,13 @@ A: Let's think step by step.
467
508
  ```
468
509
 
469
510
  #### Strengths
511
+
470
512
  - No manual exemplar creation
471
513
  - Diversity sampling improves robustness
472
514
  - Matches manual CoT performance
473
515
 
474
516
  #### Limitations
517
+
475
518
  - Quality depends on zero-shot CoT quality
476
519
  - Clustering requires similarity metric
477
520
  - Some generated chains contain errors
@@ -484,17 +527,20 @@ A: Let's think step by step.
484
527
  **Citations**: 2,179+
485
528
 
486
529
  #### When to Use
530
+
487
531
  - Iterative improvement over multiple attempts
488
532
  - Learning from errors without fine-tuning
489
533
  - Complex coding or decision-making tasks
490
534
  - When single-pass reasoning is insufficient
491
535
 
492
536
  #### How It Works
537
+
493
538
  After task failure, the agent generates a verbal "reflection" analyzing what went wrong. This reflection is stored in memory and used in subsequent attempts to avoid repeating mistakes.
494
539
 
495
540
  #### Prompt Template
496
541
 
497
542
  **Initial Attempt**:
543
+
498
544
  ```
499
545
  Task: [TASK DESCRIPTION]
500
546
 
@@ -505,6 +551,7 @@ Result: [FAILURE/PARTIAL SUCCESS]
505
551
  ```
506
552
 
507
553
  **Reflection**:
554
+
508
555
  ```
509
556
  The previous attempt failed because:
510
557
  1. [SPECIFIC ERROR ANALYSIS]
@@ -515,6 +562,7 @@ Reflection: In the next attempt, I should...
515
562
  ```
516
563
 
517
564
  **Subsequent Attempt (with memory)**:
565
+
518
566
  ```
519
567
  Task: [TASK DESCRIPTION]
520
568
 
@@ -546,11 +594,13 @@ Test Result: Passed all tests
546
594
  ```
547
595
 
548
596
  #### Strengths
597
+
549
598
  - Learns from errors without weight updates
550
599
  - Achieves 91% on HumanEval (surpassing GPT-4's 80%)
551
600
  - Builds episodic memory of insights
552
601
 
553
602
  #### Limitations
603
+
554
604
  - Requires multiple attempts
555
605
  - Memory management for long sessions
556
606
  - Quality of reflection affects improvement
@@ -600,21 +650,26 @@ Least-to-Most Need exploration?
600
650
  ## Best Practices
601
651
 
602
652
  ### 1. Start Simple
653
+
603
654
  Begin with Zero-shot CoT ("Let's think step by step"), then progress to more complex techniques if needed.
604
655
 
605
656
  ### 2. Match Technique to Task
657
+
606
658
  - **Math/Logic**: CoT, PAL, Self-Consistency
607
659
  - **Multi-hop QA**: ReAct, Least-to-Most
608
660
  - **Creative/Puzzles**: Tree of Thoughts
609
661
  - **Iterative Tasks**: Reflexion
610
662
 
611
663
  ### 3. Combine Techniques
664
+
612
665
  Techniques are often complementary:
666
+
613
667
  - ReAct + Self-Consistency for robust factual answers
614
668
  - ToT + PAL for complex computational exploration
615
669
  - Least-to-Most + Reflexion for hard multi-step problems
616
670
 
617
671
  ### 4. Prompt Engineering Tips
672
+
618
673
  - Use clear step markers ("Step 1:", "First,", etc.)
619
674
  - Include diverse exemplars covering edge cases
620
675
  - Format consistently across examples
@@ -624,16 +679,15 @@ Techniques are often complementary:
624
679
 
625
680
  ## Common Mistakes
626
681
 
627
- | Mistake | Why It's Wrong | Fix |
628
- |---------|---------------|-----|
629
- | Using CoT for simple lookups | Adds unnecessary tokens and latency | Reserve for multi-step reasoning |
630
- | Too few samples in Self-Consistency | Majority voting needs adequate samples | Use 5-10 samples minimum |
631
- | Generic "think step by step" without checking output | Model may produce irrelevant reasoning | Validate reasoning quality, not just presence |
632
- | Mixing techniques without understanding trade-offs | Computational cost without benefit | Understand when each technique adds value |
633
- | Using PAL without code interpreter | Code generation is useless without execution | Ensure execution environment available |
634
- | Not testing exemplar quality in few-shot CoT | Poor exemplars lead to poor reasoning | Validate exemplars solve problems correctly |
635
- | Applying Tree of Thoughts to linear problems | Massive overhead for no benefit | Use ToT only when exploration needed |
636
-
682
+ | Mistake | Why It's Wrong | Fix |
683
+ | ---------------------------------------------------- | -------------------------------------------- | --------------------------------------------- |
684
+ | Using CoT for simple lookups | Adds unnecessary tokens and latency | Reserve for multi-step reasoning |
685
+ | Too few samples in Self-Consistency | Majority voting needs adequate samples | Use 5-10 samples minimum |
686
+ | Generic "think step by step" without checking output | Model may produce irrelevant reasoning | Validate reasoning quality, not just presence |
687
+ | Mixing techniques without understanding trade-offs | Computational cost without benefit | Understand when each technique adds value |
688
+ | Using PAL without code interpreter | Code generation is useless without execution | Ensure execution environment available |
689
+ | Not testing exemplar quality in few-shot CoT | Poor exemplars lead to poor reasoning | Validate exemplars solve problems correctly |
690
+ | Applying Tree of Thoughts to linear problems | Massive overhead for no benefit | Use ToT only when exploration needed |
637
691
 
638
692
  ---
639
693
 
@@ -9,9 +9,7 @@
9
9
  "tree of thoughts",
10
10
  "reasoning"
11
11
  ],
12
- "applicable_agents": [
13
- "critic"
14
- ],
12
+ "applicable_agents": ["critic"],
15
13
  "max_context_tokens": 2600,
16
14
  "entry_file": "SKILL.md"
17
15
  }
@@ -10,6 +10,7 @@ Language models exhibit predictable degradation patterns as context length incre
10
10
  ## When to Activate
11
11
 
12
12
  Activate this skill when:
13
+
13
14
  - Agent performance degrades unexpectedly during long conversations
14
15
  - Debugging cases where agents produce incorrect or irrelevant outputs
15
16
  - Designing systems that must handle large contexts reliably
@@ -159,6 +160,7 @@ Implement these strategies through specific architectural patterns. Use just-in-
159
160
  ## Examples
160
161
 
161
162
  **Example 1: Detecting Degradation**
163
+
162
164
  ```yaml
163
165
  # Context grows during long conversation
164
166
  turn_1: 1000 tokens
@@ -169,19 +171,23 @@ turn_30: 90000 tokens (significant degradation)
169
171
  ```
170
172
 
171
173
  **Example 2: Mitigating Lost-in-Middle**
174
+
172
175
  ```markdown
173
176
  # Organize context with critical info at edges
174
177
 
175
- [CURRENT TASK] # At start
178
+ [CURRENT TASK] # At start
179
+
176
180
  - Goal: Generate quarterly report
177
181
  - Deadline: End of week
178
182
 
179
- [DETAILED CONTEXT] # Middle (less attention)
183
+ [DETAILED CONTEXT] # Middle (less attention)
184
+
180
185
  - 50 pages of data
181
186
  - Multiple analysis sections
182
187
  - Supporting evidence
183
188
 
184
- [KEY FINDINGS] # At end
189
+ [KEY FINDINGS] # At end
190
+
185
191
  - Revenue up 15%
186
192
  - Costs down 8%
187
193
  - Growth in Region A
@@ -200,23 +206,18 @@ turn_30: 90000 tokens (significant degradation)
200
206
 
201
207
  ## Integration
202
208
 
203
- This skill builds on context-fundamentals and should be studied after understanding basic context concepts. It connects to:
209
+ This skill builds on understanding basic context mechanics (attention patterns, token budgets, placement strategies) and connects to:
204
210
 
205
- - context-optimization - Techniques for mitigating degradation
206
- - multi-agent-patterns - Using isolation to prevent degradation
207
- - evaluation - Measuring and detecting degradation in production
211
+ - **evaluation** - Measuring and detecting degradation in production
208
212
 
209
213
  ## References
210
214
 
211
- Internal reference:
212
- - [Degradation Patterns Reference](./references/patterns.md) - Detailed technical reference
213
-
214
215
  Related skills in this collection:
215
- - context-fundamentals - Context basics
216
- - context-optimization - Mitigation techniques
217
- - evaluation - Detection and measurement
216
+
217
+ - **evaluation** - Detection and measurement
218
218
 
219
219
  External resources:
220
+
220
221
  - Research on attention mechanisms and context window limitations
221
222
  - Studies on the "lost-in-middle" phenomenon
222
223
  - Production engineering guides from AI labs
@@ -9,9 +9,7 @@
9
9
  "context confusion",
10
10
  "attention pattern"
11
11
  ],
12
- "applicable_agents": [
13
- "strategist"
14
- ],
12
+ "applicable_agents": ["strategist"],
15
13
  "max_context_tokens": 2200,
16
14
  "entry_file": "SKILL.md"
17
15
  }