opencode-multiagent 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +209 -0
  3. package/agents/advisor.md +57 -0
  4. package/agents/auditor.md +45 -0
  5. package/agents/critic.md +127 -0
  6. package/agents/deep-worker.md +65 -0
  7. package/agents/devil.md +36 -0
  8. package/agents/executor.md +141 -0
  9. package/agents/heavy-worker.md +68 -0
  10. package/agents/lead.md +155 -0
  11. package/agents/librarian.md +62 -0
  12. package/agents/planner.md +121 -0
  13. package/agents/qa.md +50 -0
  14. package/agents/quick.md +65 -0
  15. package/agents/reviewer.md +55 -0
  16. package/agents/scout.md +58 -0
  17. package/agents/scribe.md +78 -0
  18. package/agents/strategist.md +63 -0
  19. package/agents/ui-heavy-worker.md +62 -0
  20. package/agents/ui-worker.md +69 -0
  21. package/agents/validator.md +47 -0
  22. package/agents/worker.md +68 -0
  23. package/commands/execute.md +14 -0
  24. package/commands/init-deep.md +18 -0
  25. package/commands/init.md +18 -0
  26. package/commands/inspect.md +13 -0
  27. package/commands/plan.md +15 -0
  28. package/commands/quality.md +14 -0
  29. package/commands/review.md +14 -0
  30. package/commands/status.md +15 -0
  31. package/defaults/agent-settings.json +102 -0
  32. package/defaults/agent-settings.schema.json +25 -0
  33. package/defaults/flags.json +35 -0
  34. package/defaults/flags.schema.json +119 -0
  35. package/defaults/mcp-defaults.json +47 -0
  36. package/defaults/mcp-defaults.schema.json +38 -0
  37. package/defaults/profiles.json +53 -0
  38. package/defaults/profiles.schema.json +60 -0
  39. package/defaults/team-profiles.json +83 -0
  40. package/examples/opencode.json +4 -0
  41. package/examples/opencode.with-overrides.json +23 -0
  42. package/package.json +62 -0
  43. package/skills/advanced-evaluation/SKILL.md +454 -0
  44. package/skills/advanced-evaluation/manifest.json +20 -0
  45. package/skills/cek-context-engineering/SKILL.md +1261 -0
  46. package/skills/cek-context-engineering/manifest.json +17 -0
  47. package/skills/cek-prompt-engineering/SKILL.md +559 -0
  48. package/skills/cek-prompt-engineering/manifest.json +17 -0
  49. package/skills/cek-test-prompt/SKILL.md +714 -0
  50. package/skills/cek-test-prompt/manifest.json +17 -0
  51. package/skills/cek-thought-based-reasoning/SKILL.md +658 -0
  52. package/skills/cek-thought-based-reasoning/manifest.json +17 -0
  53. package/skills/context-degradation/SKILL.md +231 -0
  54. package/skills/context-degradation/manifest.json +17 -0
  55. package/skills/debate/SKILL.md +316 -0
  56. package/skills/debate/manifest.json +19 -0
  57. package/skills/design-first/SKILL.md +5 -0
  58. package/skills/design-first/manifest.json +20 -0
  59. package/skills/dispatching-parallel-agents/SKILL.md +180 -0
  60. package/skills/dispatching-parallel-agents/manifest.json +18 -0
  61. package/skills/drift-analysis/SKILL.md +324 -0
  62. package/skills/drift-analysis/manifest.json +19 -0
  63. package/skills/evaluation/SKILL.md +5 -0
  64. package/skills/evaluation/manifest.json +19 -0
  65. package/skills/executing-plans/SKILL.md +70 -0
  66. package/skills/executing-plans/manifest.json +17 -0
  67. package/skills/handoff-protocols/SKILL.md +5 -0
  68. package/skills/handoff-protocols/manifest.json +19 -0
  69. package/skills/parallel-investigation/SKILL.md +206 -0
  70. package/skills/parallel-investigation/manifest.json +18 -0
  71. package/skills/reflexion-critique/SKILL.md +477 -0
  72. package/skills/reflexion-critique/manifest.json +17 -0
  73. package/skills/reflexion-reflect/SKILL.md +650 -0
  74. package/skills/reflexion-reflect/manifest.json +17 -0
  75. package/skills/root-cause-analysis/SKILL.md +5 -0
  76. package/skills/root-cause-analysis/manifest.json +20 -0
  77. package/skills/sadd-judge-with-debate/SKILL.md +426 -0
  78. package/skills/sadd-judge-with-debate/manifest.json +17 -0
  79. package/skills/structured-code-review/SKILL.md +5 -0
  80. package/skills/structured-code-review/manifest.json +18 -0
  81. package/skills/task-decomposition/SKILL.md +5 -0
  82. package/skills/task-decomposition/manifest.json +20 -0
  83. package/skills/verification-before-completion/SKILL.md +5 -0
  84. package/skills/verification-before-completion/manifest.json +22 -0
  85. package/skills/verification-gates/SKILL.md +281 -0
  86. package/skills/verification-gates/manifest.json +19 -0
  87. package/src/control-plane.ts +21 -0
  88. package/src/index.ts +8 -0
  89. package/src/opencode-multiagent/compiler.ts +168 -0
  90. package/src/opencode-multiagent/constants.ts +178 -0
  91. package/src/opencode-multiagent/file-lock.ts +90 -0
  92. package/src/opencode-multiagent/hooks.ts +599 -0
  93. package/src/opencode-multiagent/log.ts +12 -0
  94. package/src/opencode-multiagent/mailbox.ts +287 -0
  95. package/src/opencode-multiagent/markdown.ts +99 -0
  96. package/src/opencode-multiagent/mcp.ts +35 -0
  97. package/src/opencode-multiagent/policy.ts +67 -0
  98. package/src/opencode-multiagent/quality.ts +140 -0
  99. package/src/opencode-multiagent/runtime.ts +55 -0
  100. package/src/opencode-multiagent/skills.ts +144 -0
  101. package/src/opencode-multiagent/supervision.ts +156 -0
  102. package/src/opencode-multiagent/task-manager.ts +148 -0
  103. package/src/opencode-multiagent/team-manager.ts +219 -0
  104. package/src/opencode-multiagent/team-tools.ts +359 -0
  105. package/src/opencode-multiagent/telemetry.ts +124 -0
  106. package/src/opencode-multiagent/utils.ts +54 -0
@@ -0,0 +1,17 @@
1
+ {
2
+ "name": "cek-test-prompt",
3
+ "version": "1.0.0",
4
+ "description": "Test-prompt workflow for red-green-refactor style prompt validation",
5
+ "triggers": [
6
+ "test prompt",
7
+ "prompt test",
8
+ "verify prompt",
9
+ "prompt behavior",
10
+ "red green prompt"
11
+ ],
12
+ "applicable_agents": [
13
+ "critic"
14
+ ],
15
+ "max_context_tokens": 2400,
16
+ "entry_file": "SKILL.md"
17
+ }
@@ -0,0 +1,658 @@
1
+ ---
2
+ name: cek-thought-based-reasoning
3
+ description: Use when tackling complex reasoning tasks requiring step-by-step logic, multi-step arithmetic, commonsense reasoning, symbolic manipulation, or problems where simple prompting fails - provides comprehensive guide to Chain-of-Thought and related prompting techniques (Zero-shot CoT, Self-Consistency, Tree of Thoughts, Least-to-Most, ReAct, PAL, Reflexion) with templates, decision matrices, and research-backed patterns
4
+ ---
5
+
6
+ # Thought-Based Reasoning Techniques for LLMs
7
+
8
+ ## Overview
9
+
10
+ Chain-of-Thought (CoT) prompting and its variants encourage LLMs to generate intermediate reasoning steps before arriving at a final answer, significantly improving performance on complex reasoning tasks. These techniques transform how models approach problems by making implicit reasoning explicit.
11
+
12
+
13
+ ## Quick Reference
14
+
15
+ | Technique | When to Use | Complexity | Accuracy Gain |
16
+ |-----------|-------------|------------|---------------|
17
+ | Zero-shot CoT | Quick reasoning, no examples available | Low | +20-60% |
18
+ | Few-shot CoT | Have good examples, consistent format needed | Medium | +30-70% |
19
+ | Self-Consistency | High-stakes decisions, need confidence | Medium | +10-20% over CoT |
20
+ | Tree of Thoughts | Complex problems requiring exploration | High | +50-70% on hard tasks |
21
+ | Least-to-Most | Multi-step problems with subproblems | Medium | +30-80% |
22
+ | ReAct | Tasks requiring external information | Medium | +15-35% |
23
+ | PAL | Mathematical/computational problems | Medium | +10-15% |
24
+ | Reflexion | Iterative improvement, learning from errors | High | +10-20% |
25
+
26
+ ---
27
+
28
+ ## Core Techniques
29
+
30
+ ### 1. Chain-of-Thought (CoT) Prompting
31
+
32
+ **Paper**: "Chain of Thought Prompting Elicits Reasoning in Large Language Models" (Wei et al., 2022)
33
+ **Citations**: 14,255+
34
+
35
+ #### When to Use
36
+ - Multi-step arithmetic or math word problems
37
+ - Commonsense reasoning requiring logical deduction
38
+ - Symbolic reasoning tasks
39
+ - When you have good exemplars showing reasoning
40
+
41
+ #### How It Works
42
+ Provide few-shot examples that include intermediate reasoning steps, not just question-answer pairs. The model learns to generate similar step-by-step reasoning.
43
+
44
+ #### Prompt Template
45
+
46
+ ```
47
+ Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
48
+ A: Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. 5 + 6 = 11. The answer is 11.
49
+
50
+ Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?
51
+ A: The cafeteria had 23 apples originally. They used 20 to make lunch. So they had 23 - 20 = 3. They bought 6 more apples, so they have 3 + 6 = 9. The answer is 9.
52
+
53
+ Q: [YOUR QUESTION HERE]
54
+ A:
55
+ ```
56
+
57
+ #### Strengths
58
+ - Significant accuracy improvements on reasoning tasks
59
+ - Interpretable intermediate steps
60
+ - Works well with large models (>100B parameters)
61
+
62
+ #### Limitations
63
+ - Requires crafting good exemplars
64
+ - Less effective on smaller models
65
+ - Can still make calculation errors
66
+
67
+ ---
68
+
69
+ ### 2. Zero-shot Chain-of-Thought
70
+
71
+ **Paper**: "Large Language Models are Zero-Shot Reasoners" (Kojima et al., 2022)
72
+ **Citations**: 5,985+
73
+
74
+ #### When to Use
75
+ - No exemplars available
76
+ - Quick reasoning needed
77
+ - General-purpose reasoning across task types
78
+ - Prototyping before creating few-shot examples
79
+
80
+ #### How It Works
81
+ Simply append "Let's think step by step" (or similar phrase) to the prompt. This triggers the model to generate reasoning steps without any examples.
82
+
83
+ #### Prompt Template
84
+
85
+ ```
86
+ Q: A juggler can juggle 16 balls. Half of the balls are golf balls, and half of the golf balls are blue. How many blue golf balls are there?
87
+
88
+ Let's think step by step.
89
+ ```
90
+
91
+ **Alternative trigger phrases**:
92
+ - "Let's work this out step by step to be sure we have the right answer."
93
+ - "Let's break this down."
94
+ - "Let's approach this systematically."
95
+ - "First, let me understand the problem..."
96
+
97
+ #### Two-Stage Approach (More Robust)
98
+
99
+ **Stage 1 - Reasoning Extraction**:
100
+ ```
101
+ Q: [QUESTION]
102
+ A: Let's think step by step.
103
+ ```
104
+
105
+ **Stage 2 - Answer Extraction**:
106
+ ```
107
+ [REASONING FROM STAGE 1]
108
+ Therefore, the answer is
109
+ ```
110
+
111
+ #### Strengths
112
+ - No exemplar crafting required
113
+ - Generalizes across task types
114
+ - Simple to implement
115
+
116
+ #### Limitations
117
+ - Less effective than few-shot CoT
118
+ - Can produce verbose or irrelevant reasoning
119
+ - Sensitive to exact phrasing
120
+
121
+ ---
122
+
123
+ ### 3. Self-Consistency
124
+
125
+ **Paper**: "Self-Consistency Improves Chain of Thought Reasoning in Language Models" (Wang et al., 2022)
126
+ **Citations**: 5,379+
127
+
128
+ #### When to Use
129
+ - High-stakes decisions requiring confidence
130
+ - Problems with multiple valid reasoning paths
131
+ - When you need to reduce variance in outputs
132
+ - Verification of reasoning correctness
133
+
134
+ #### How It Works
135
+ Sample multiple diverse reasoning paths, then select the most consistent answer via majority voting. The intuition: correct answers can be reached through multiple reasoning paths.
136
+
137
+ #### Prompt Template
138
+
139
+ ```
140
+ [Use any CoT prompt - zero-shot or few-shot]
141
+
142
+ [Generate N samples with temperature > 0]
143
+
144
+ [Extract final answers from each sample]
145
+
146
+ [Return the most frequent answer (majority vote)]
147
+ ```
148
+
149
+ #### Implementation Example
150
+
151
+ ```python
152
+ def self_consistency(prompt, n_samples=5, temperature=0.7):
153
+ answers = []
154
+ for _ in range(n_samples):
155
+ response = llm.generate(prompt, temperature=temperature)
156
+ answer = extract_answer(response)
157
+ answers.append(answer)
158
+
159
+ # Majority vote
160
+ return Counter(answers).most_common(1)[0][0]
161
+ ```
162
+
163
+ #### Strengths
164
+ - Significant accuracy boost over single-path CoT
165
+ - Provides confidence measure (agreement level)
166
+ - Task-agnostic improvement
167
+
168
+ #### Limitations
169
+ - Higher computational cost (N times more generations)
170
+ - Requires extractable discrete answers
171
+ - Diminishing returns beyond ~10-20 samples
172
+
173
+ ---
174
+
175
+ ### 4. Tree of Thoughts (ToT)
176
+
177
+ **Paper**: "Tree of Thoughts: Deliberate Problem Solving with Large Language Models" (Yao et al., 2023)
178
+ **Citations**: 3,026+
179
+
180
+ #### When to Use
181
+ - Complex problems requiring exploration/backtracking
182
+ - Tasks where initial decisions are pivotal
183
+ - Creative problem-solving (writing, puzzles)
184
+ - When CoT alone achieves <50% accuracy
185
+
186
+ #### How It Works
187
+ Generalizes CoT to a tree structure where each node is a "thought" (a coherent language unit). It uses search algorithms (BFS/DFS) with self-evaluation to explore and select promising reasoning paths.
188
+
189
+ #### Prompt Template
190
+
191
+ **Thought Generation**:
192
+ ```
193
+ Given the current state:
194
+ [STATE]
195
+
196
+ Generate 3-5 possible next steps to solve this problem.
197
+ ```
198
+
199
+ **State Evaluation**:
200
+ ```
201
+ Evaluate if the following partial solution is:
202
+ - "sure" (definitely leads to solution)
203
+ - "maybe" (could potentially work)
204
+ - "impossible" (cannot lead to solution)
205
+
206
+ Partial solution:
207
+ [THOUGHTS SO FAR]
208
+ ```
209
+
210
+ **BFS/DFS Search**:
211
+ ```python
212
+ def tree_of_thoughts(problem, max_depth=3, beam_width=3):
213
+ queue = [(problem, [])] # (state, thought_path)
214
+
215
+ while queue:
216
+ state, path = queue.pop(0)
217
+
218
+ if is_solved(state):
219
+ return path
220
+
221
+ # Generate candidate thoughts
222
+ thoughts = generate_thoughts(state, k=5)
223
+
224
+ # Evaluate and keep top-k
225
+ evaluated = [(t, evaluate(state, t)) for t in thoughts]
226
+ top_k = sorted(evaluated, key=lambda x: ["sure", "maybe", "impossible"].index(x[1]))[:beam_width]
227
+
228
+ for thought, score in top_k:
229
+ if score != "impossible" and len(path) < max_depth:
230
+ new_state = apply_thought(state, thought)
231
+ queue.append((new_state, path + [thought]))
232
+
233
+ return None
234
+ ```
235
+
236
+ #### Example: Game of 24
237
+
238
+ ```
239
+ Problem: Use 4, 9, 10, 13 to get 24 (use +, -, *, / and each number once)
240
+
241
+ Thought 1: 13 - 9 = 4 (Now have: 4, 4, 10)
242
+ Evaluation: "maybe" - have two 4s and 10, could work
243
+
244
+ Thought 2: 10 - 4 = 6 (Now have: 6, 9, 13)
245
+ Evaluation: "sure" - 13 - 9 = 4, then 6 * 4 = 24
246
+
247
+ Thought 3: 4 + 9 = 13 (Now have: 10, 13, 13)
248
+ Evaluation: "impossible" - no way to get 24 from these
249
+ ```
250
+
251
+ #### Strengths
252
+ - Dramatically improves performance on hard tasks (4% → 74% on Game of 24)
253
+ - Enables backtracking and exploration
254
+ - Self-evaluation catches errors early
255
+
256
+ #### Limitations
257
+ - Significantly higher computational cost
258
+ - Requires task-specific thought decomposition
259
+ - Complex to implement
260
+
261
+ ---
262
+
263
+ ### 5. Least-to-Most Prompting
264
+
265
+ **Paper**: "Least-to-Most Prompting Enables Complex Reasoning in Large Language Models" (Zhou et al., 2022)
266
+ **Citations**: 1,466+
267
+
268
+ #### When to Use
269
+ - Problems harder than your exemplars
270
+ - Compositional generalization tasks
271
+ - Multi-step problems with clear subproblems
272
+ - Symbol manipulation and SCAN-like tasks
273
+
274
+ #### How It Works
275
+ Two-stage process:
276
+ 1. **Decomposition**: Break complex problem into simpler subproblems
277
+ 2. **Sequential Solving**: Solve subproblems in order, using previous answers
278
+
279
+ #### Prompt Template
280
+
281
+ **Stage 1: Decomposition**
282
+ ```
283
+ Q: Four years ago, Kody was only half as old as Mohamed. If Mohamed is currently twice as old as 30 years old, how old is Kody?
284
+
285
+ To solve "Four years ago, Kody was only half as old as Mohamed. If Mohamed is currently twice as old as 30 years old, how old is Kody?", we need to first solve:
286
+ - "If Mohamed is currently twice as old as 30 years old, how old is Mohamed?"
287
+ Then we need to solve:
288
+ - "Four years ago, Kody was only half as old as Mohamed. How old is Kody?"
289
+ ```
290
+
291
+ **Stage 2: Sequential Solving**
292
+ ```
293
+ Q: If Mohamed is currently twice as old as 30 years old, how old is Mohamed?
294
+ A: Mohamed is twice as old as 30, so Mohamed is 30 * 2 = 60 years old.
295
+
296
+ Q: Four years ago, Kody was only half as old as Mohamed. Mohamed is 60 years old. How old is Kody?
297
+ A: Four years ago, Mohamed was 60 - 4 = 56 years old. Kody was half as old, so Kody was 56 / 2 = 28 years old four years ago. Therefore, Kody is currently 28 + 4 = 32 years old.
298
+
299
+ The answer is 32.
300
+ ```
301
+
302
+ #### Strengths
303
+ - Excellent at generalizing to harder problems
304
+ - Works well on compositional tasks
305
+ - Explicit problem decomposition aids interpretability
306
+
307
+ #### Limitations
308
+ - Requires two-stage prompting
309
+ - Decomposition step can fail on novel structures
310
+ - More complex setup than single-stage CoT
311
+
312
+ ---
313
+
314
+ ### 6. ReAct (Reasoning + Acting)
315
+
316
+ **Paper**: "ReAct: Synergizing Reasoning and Acting in Language Models" (Yao et al., 2022)
317
+ **Citations**: 5,012+
318
+
319
+ #### When to Use
320
+ - Tasks requiring external information (search, APIs)
321
+ - Interactive decision-making environments
322
+ - Multi-hop question answering
323
+ - When pure reasoning leads to hallucination
324
+
325
+ #### How It Works
326
+ Interleave reasoning traces ("Thought") with actions ("Action") and observations ("Observation"). Reasoning helps plan actions; actions provide new information for reasoning.
327
+
328
+ #### Prompt Template
329
+
330
+ ```
331
+ Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
332
+
333
+ Thought 1: I need to search Colorado orogeny, find the area that the eastern sector extends into, then find the elevation range of that area.
334
+ Action 1: Search[Colorado orogeny]
335
+ Observation 1: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.
336
+
337
+ Thought 2: It does not mention the eastern sector. So I need to look up eastern sector.
338
+ Action 2: Lookup[eastern sector]
339
+ Observation 2: The eastern sector extends into the High Plains.
340
+
341
+ Thought 3: The eastern sector extends into the High Plains. I need to search High Plains and find its elevation range.
342
+ Action 3: Search[High Plains]
343
+ Observation 3: High Plains refers to one of two distinct land regions: High Plains (United States)...
344
+
345
+ Thought 4: I need to search High Plains (United States).
346
+ Action 4: Search[High Plains (United States)]
347
+ Observation 4: The High Plains are a subregion of the Great Plains... elevation of around 1,800 to 7,000 ft (550 to 2,130 m).
348
+
349
+ Thought 5: The elevation range is 1,800 to 7,000 ft.
350
+ Action 5: Finish[1,800 to 7,000 ft]
351
+ ```
352
+
353
+ #### Action Types
354
+ - `Search[query]` - Search for information
355
+ - `Lookup[keyword]` - Look up keyword in current context
356
+ - `Finish[answer]` - Return final answer
357
+
358
+ #### Strengths
359
+ - Reduces hallucination by grounding in external knowledge
360
+ - Interpretable action traces
361
+ - Handles exceptions through adaptive reasoning
362
+
363
+ #### Limitations
364
+ - Requires integration with external tools
365
+ - More complex orchestration
366
+ - Action space must be defined
367
+
368
+ ---
369
+
370
+ ### 7. PAL (Program-Aided Language Models)
371
+
372
+ **Paper**: "PAL: Program-aided Language Models" (Gao et al., 2022)
373
+ **Citations**: 608+
374
+
375
+ #### When to Use
376
+ - Mathematical/arithmetic reasoning
377
+ - Problems requiring precise computation
378
+ - Symbolic manipulation
379
+ - When CoT makes calculation errors
380
+
381
+ #### How It Works
382
+ Generate code (typically Python) instead of natural language reasoning. Execute the code to get the answer. The LLM handles decomposition; the interpreter handles computation.
383
+
384
+ #### Prompt Template
385
+
386
+ ```
387
+ Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
388
+
389
+ # solution in Python:
390
+ def solution():
391
+ """Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?"""
392
+ tennis_balls_initial = 5
393
+ bought_cans = 2
394
+ tennis_balls_per_can = 3
395
+ tennis_balls_bought = bought_cans * tennis_balls_per_can
396
+ tennis_balls_total = tennis_balls_initial + tennis_balls_bought
397
+ return tennis_balls_total
398
+
399
+ Q: The bakers at the Beverly Hills Bakery baked 200 loaves of bread on Monday morning. They sold 93 loaves in the morning and 39 loaves in the afternoon. A grocery store returned 6 unsold loaves. How many loaves of bread did they have left?
400
+
401
+ # solution in Python:
402
+ def solution():
403
+ """The bakers baked 200 loaves. They sold 93 in morning, 39 in afternoon. A store returned 6. How many left?"""
404
+ loaves_baked = 200
405
+ loaves_sold_morning = 93
406
+ loaves_sold_afternoon = 39
407
+ loaves_returned = 6
408
+ loaves_left = loaves_baked - loaves_sold_morning - loaves_sold_afternoon + loaves_returned
409
+ return loaves_left
410
+ ```
411
+
412
+ #### Strengths
413
+ - Eliminates arithmetic errors
414
+ - Clear variable naming aids interpretability
415
+ - Leverages code execution for verification
416
+
417
+ #### Limitations
418
+ - Requires code interpreter
419
+ - Not suitable for non-computational reasoning
420
+ - Model must generate syntactically correct code
421
+
422
+ ---
423
+
424
+ ### 8. Auto-CoT
425
+
426
+ **Paper**: "Automatic Chain of Thought Prompting in Large Language Models" (Zhang et al., 2022)
427
+ **Citations**: 838+
428
+
429
+ #### When to Use
430
+ - No manually crafted exemplars available
431
+ - Want to automate few-shot CoT setup
432
+ - Scaling CoT to many tasks
433
+ - When zero-shot CoT isn't sufficient
434
+
435
+ #### How It Works
436
+ 1. Cluster questions by semantic similarity so that the selected demonstrations are diverse
437
+ 2. Use Zero-shot CoT to generate reasoning chains for representative questions
438
+ 3. Use these auto-generated chains as few-shot exemplars
439
+
440
+ #### Prompt Template
441
+
442
+ **Step 1: Generate diverse demonstrations**
443
+ ```python
444
+ # Cluster questions
445
+ clusters = cluster_questions(all_questions, k=8)
446
+
447
+ # For each cluster, pick representative and generate CoT
448
+ demonstrations = []
449
+ for cluster in clusters:
450
+ question = select_representative(cluster)
451
+ reasoning = zero_shot_cot(question) # "Let's think step by step"
452
+ demonstrations.append((question, reasoning))
453
+ ```
454
+
455
+ **Step 2: Use as few-shot exemplars**
456
+ ```
457
+ Q: [Demo question 1]
458
+ A: Let's think step by step. [Generated reasoning 1]
459
+
460
+ Q: [Demo question 2]
461
+ A: Let's think step by step. [Generated reasoning 2]
462
+
463
+ ...
464
+
465
+ Q: [New question]
466
+ A: Let's think step by step.
467
+ ```
468
+
469
+ #### Strengths
470
+ - No manual exemplar creation
471
+ - Diversity sampling improves robustness
472
+ - Matches manual CoT performance
473
+
474
+ #### Limitations
475
+ - Quality depends on zero-shot CoT quality
476
+ - Clustering requires similarity metric
477
+ - Some generated chains contain errors
478
+
479
+ ---
480
+
481
+ ### 9. Reflexion
482
+
483
+ **Paper**: "Reflexion: Language Agents with Verbal Reinforcement Learning" (Shinn et al., 2023)
484
+ **Citations**: 2,179+
485
+
486
+ #### When to Use
487
+ - Iterative improvement over multiple attempts
488
+ - Learning from errors without fine-tuning
489
+ - Complex coding or decision-making tasks
490
+ - When single-pass reasoning is insufficient
491
+
492
+ #### How It Works
493
+ After task failure, the agent generates a verbal "reflection" analyzing what went wrong. This reflection is stored in memory and used in subsequent attempts to avoid repeating mistakes.
494
+
495
+ #### Prompt Template
496
+
497
+ **Initial Attempt**:
498
+ ```
499
+ Task: [TASK DESCRIPTION]
500
+
501
+ Thought: [REASONING]
502
+ Action: [ACTION]
503
+ ...
504
+ Result: [FAILURE/PARTIAL SUCCESS]
505
+ ```
506
+
507
+ **Reflection**:
508
+ ```
509
+ The previous attempt failed because:
510
+ 1. [SPECIFIC ERROR ANALYSIS]
511
+ 2. [WHAT SHOULD HAVE BEEN DONE]
512
+ 3. [KEY INSIGHT FOR NEXT ATTEMPT]
513
+
514
+ Reflection: In the next attempt, I should...
515
+ ```
516
+
517
+ **Subsequent Attempt (with memory)**:
518
+ ```
519
+ Task: [TASK DESCRIPTION]
520
+
521
+ Previous reflections:
522
+ - [REFLECTION 1]
523
+ - [REFLECTION 2]
524
+
525
+ Using these insights, I will now attempt the task again.
526
+
527
+ Thought: [IMPROVED REASONING]
528
+ Action: [BETTER ACTION]
529
+ ```
530
+
531
+ #### Example: Code Generation
532
+
533
+ ```
534
+ Task: Write a function to find the longest palindromic substring.
535
+
536
+ Attempt 1: [CODE WITH BUG]
537
+ Test Result: Failed on "babad" - expected "bab" or "aba", got "b"
538
+
539
+ Reflection: My solution only checked single characters. I need to:
540
+ 1. Consider substrings of all lengths
541
+ 2. Use expand-around-center technique for efficiency
542
+ 3. Track both start position and maximum length
543
+
544
+ Attempt 2: [IMPROVED CODE USING REFLECTION]
545
+ Test Result: Passed all tests
546
+ ```
547
+
548
+ #### Strengths
549
+ - Learns from errors without weight updates
550
+ - Achieves 91% on HumanEval (surpassing GPT-4's 80%)
551
+ - Builds episodic memory of insights
552
+
553
+ #### Limitations
554
+ - Requires multiple attempts
555
+ - Memory management for long sessions
556
+ - Quality of reflection affects improvement
557
+
558
+ ---
559
+
560
+ ## Decision Matrix: Which Technique to Use
561
+
562
+ ```
563
+ Need Examples?
564
+ / \
565
+ No Yes
566
+ | |
567
+ Zero-shot CoT Few-shot CoT
568
+ | |
569
+ Need higher accuracy? Need computation?
570
+ / \ |
571
+ Yes No PAL
572
+ | |
573
+ Self-Consistency Done with CoT
574
+ |
575
+ Still not enough?
576
+ / \
577
+ Yes No
578
+ | |
579
+ Problem decomposable? Done
580
+ / \
581
+ Yes No
582
+ | |
583
+ Least-to-Most Need exploration?
584
+ / \
585
+ Yes No
586
+ | |
587
+ Tree of Thoughts Need external info?
588
+ / \
589
+ Yes No
590
+ | |
591
+ ReAct Need iteration?
592
+ / \
593
+ Yes No
594
+ | |
595
+ Reflexion Use CoT
596
+ ```
597
+
598
+ ---
599
+
600
+ ## Best Practices
601
+
602
+ ### 1. Start Simple
603
+ Begin with Zero-shot CoT ("Let's think step by step"), then progress to more complex techniques if needed.
604
+
605
+ ### 2. Match Technique to Task
606
+ - **Math/Logic**: CoT, PAL, Self-Consistency
607
+ - **Multi-hop QA**: ReAct, Least-to-Most
608
+ - **Creative/Puzzles**: Tree of Thoughts
609
+ - **Iterative Tasks**: Reflexion
610
+
611
+ ### 3. Combine Techniques
612
+ Techniques are often complementary:
613
+ - ReAct + Self-Consistency for robust factual answers
614
+ - ToT + PAL for complex computational exploration
615
+ - Least-to-Most + Reflexion for hard multi-step problems
616
+
617
+ ### 4. Prompt Engineering Tips
618
+ - Use clear step markers ("Step 1:", "First,", etc.)
619
+ - Include diverse exemplars covering edge cases
620
+ - Format consistently across examples
621
+ - Add verification steps ("Let me verify...")
622
+
623
+ ---
624
+
625
+ ## Common Mistakes
626
+
627
+ | Mistake | Why It's Wrong | Fix |
628
+ |---------|---------------|-----|
629
+ | Using CoT for simple lookups | Adds unnecessary tokens and latency | Reserve for multi-step reasoning |
630
+ | Too few samples in Self-Consistency | Majority voting needs adequate samples | Use 5-10 samples minimum |
631
+ | Generic "think step by step" without checking output | Model may produce irrelevant reasoning | Validate reasoning quality, not just presence |
632
+ | Mixing techniques without understanding trade-offs | Computational cost without benefit | Understand when each technique adds value |
633
+ | Using PAL without code interpreter | Code generation is useless without execution | Ensure execution environment available |
634
+ | Not testing exemplar quality in few-shot CoT | Poor exemplars lead to poor reasoning | Validate exemplars solve problems correctly |
635
+ | Applying Tree of Thoughts to linear problems | Massive overhead for no benefit | Use ToT only when exploration needed |
636
+
637
+
638
+ ---
639
+
640
+ ## References
641
+
642
+ 1. Wei, J. et al. (2022). "Chain of Thought Prompting Elicits Reasoning in Large Language Models." [arXiv:2201.11903](https://arxiv.org/abs/2201.11903)
643
+
644
+ 2. Kojima, T. et al. (2022). "Large Language Models are Zero-Shot Reasoners." [arXiv:2205.11916](https://arxiv.org/abs/2205.11916)
645
+
646
+ 3. Wang, X. et al. (2022). "Self-Consistency Improves Chain of Thought Reasoning in Language Models." [arXiv:2203.11171](https://arxiv.org/abs/2203.11171)
647
+
648
+ 4. Yao, S. et al. (2023). "Tree of Thoughts: Deliberate Problem Solving with Large Language Models." [arXiv:2305.10601](https://arxiv.org/abs/2305.10601)
649
+
650
+ 5. Zhou, D. et al. (2022). "Least-to-Most Prompting Enables Complex Reasoning in Large Language Models." [arXiv:2205.10625](https://arxiv.org/abs/2205.10625)
651
+
652
+ 6. Yao, S. et al. (2022). "ReAct: Synergizing Reasoning and Acting in Language Models." [arXiv:2210.03629](https://arxiv.org/abs/2210.03629)
653
+
654
+ 7. Gao, L. et al. (2022). "PAL: Program-aided Language Models." [arXiv:2211.10435](https://arxiv.org/abs/2211.10435)
655
+
656
+ 8. Zhang, Z. et al. (2022). "Automatic Chain of Thought Prompting in Large Language Models." [arXiv:2210.03493](https://arxiv.org/abs/2210.03493)
657
+
658
+ 9. Shinn, N. et al. (2023). "Reflexion: Language Agents with Verbal Reinforcement Learning." [arXiv:2303.11366](https://arxiv.org/abs/2303.11366)
@@ -0,0 +1,17 @@
1
+ {
2
+ "name": "cek-thought-based-reasoning",
3
+ "version": "1.0.0",
4
+ "description": "Thought-based reasoning patterns for hard multi-step reasoning tasks",
5
+ "triggers": [
6
+ "reason carefully",
7
+ "chain of thought",
8
+ "step by step",
9
+ "tree of thoughts",
10
+ "reasoning"
11
+ ],
12
+ "applicable_agents": [
13
+ "critic"
14
+ ],
15
+ "max_context_tokens": 2600,
16
+ "entry_file": "SKILL.md"
17
+ }