opencode-multiagent 0.2.1 → 0.3.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. package/AGENTS.md +62 -0
  2. package/CHANGELOG.md +18 -0
  3. package/CONTRIBUTING.md +36 -0
  4. package/README.md +41 -165
  5. package/README.tr.md +84 -0
  6. package/RELEASE.md +68 -0
  7. package/agents/advisor.md +9 -6
  8. package/agents/auditor.md +8 -6
  9. package/agents/critic.md +19 -10
  10. package/agents/deep-worker.md +11 -7
  11. package/agents/devil.md +3 -1
  12. package/agents/executor.md +20 -19
  13. package/agents/heavy-worker.md +11 -7
  14. package/agents/lead.md +22 -30
  15. package/agents/librarian.md +6 -2
  16. package/agents/planner.md +18 -10
  17. package/agents/qa.md +9 -6
  18. package/agents/quick.md +12 -7
  19. package/agents/reviewer.md +9 -6
  20. package/agents/scout.md +9 -5
  21. package/agents/scribe.md +33 -28
  22. package/agents/strategist.md +10 -7
  23. package/agents/ui-heavy-worker.md +11 -7
  24. package/agents/ui-worker.md +12 -7
  25. package/agents/validator.md +8 -5
  26. package/agents/worker.md +12 -7
  27. package/commands/execute.md +1 -0
  28. package/commands/init-deep.md +1 -0
  29. package/commands/init.md +1 -0
  30. package/commands/inspect.md +1 -0
  31. package/commands/plan.md +1 -0
  32. package/commands/quality.md +1 -0
  33. package/commands/review.md +1 -0
  34. package/commands/status.md +1 -0
  35. package/defaults/opencode-multiagent.json +223 -0
  36. package/defaults/opencode-multiagent.schema.json +249 -0
  37. package/dist/control-plane.d.ts +4 -0
  38. package/dist/control-plane.d.ts.map +1 -0
  39. package/dist/index.d.ts +5 -0
  40. package/dist/index.d.ts.map +1 -0
  41. package/dist/index.js +1583 -0
  42. package/dist/opencode-multiagent/compiler.d.ts +19 -0
  43. package/dist/opencode-multiagent/compiler.d.ts.map +1 -0
  44. package/dist/opencode-multiagent/constants.d.ts +116 -0
  45. package/dist/opencode-multiagent/constants.d.ts.map +1 -0
  46. package/dist/opencode-multiagent/defaults.d.ts +10 -0
  47. package/dist/opencode-multiagent/defaults.d.ts.map +1 -0
  48. package/dist/opencode-multiagent/file-lock.d.ts +15 -0
  49. package/dist/opencode-multiagent/file-lock.d.ts.map +1 -0
  50. package/dist/opencode-multiagent/hooks.d.ts +62 -0
  51. package/dist/opencode-multiagent/hooks.d.ts.map +1 -0
  52. package/dist/opencode-multiagent/log.d.ts +2 -0
  53. package/dist/opencode-multiagent/log.d.ts.map +1 -0
  54. package/dist/opencode-multiagent/markdown.d.ts +8 -0
  55. package/dist/opencode-multiagent/markdown.d.ts.map +1 -0
  56. package/dist/opencode-multiagent/mcp.d.ts +3 -0
  57. package/dist/opencode-multiagent/mcp.d.ts.map +1 -0
  58. package/dist/opencode-multiagent/policy.d.ts +5 -0
  59. package/dist/opencode-multiagent/policy.d.ts.map +1 -0
  60. package/dist/opencode-multiagent/quality.d.ts +14 -0
  61. package/dist/opencode-multiagent/quality.d.ts.map +1 -0
  62. package/dist/opencode-multiagent/runtime.d.ts +7 -0
  63. package/dist/opencode-multiagent/runtime.d.ts.map +1 -0
  64. package/dist/opencode-multiagent/session-tracker.d.ts +32 -0
  65. package/dist/opencode-multiagent/session-tracker.d.ts.map +1 -0
  66. package/dist/opencode-multiagent/skills.d.ts +17 -0
  67. package/dist/opencode-multiagent/skills.d.ts.map +1 -0
  68. package/dist/opencode-multiagent/supervision.d.ts +12 -0
  69. package/dist/opencode-multiagent/supervision.d.ts.map +1 -0
  70. package/dist/opencode-multiagent/task-manager.d.ts +48 -0
  71. package/dist/opencode-multiagent/task-manager.d.ts.map +1 -0
  72. package/dist/opencode-multiagent/telemetry.d.ts +26 -0
  73. package/dist/opencode-multiagent/telemetry.d.ts.map +1 -0
  74. package/dist/opencode-multiagent/tools.d.ts +56 -0
  75. package/dist/opencode-multiagent/tools.d.ts.map +1 -0
  76. package/dist/opencode-multiagent/types.d.ts +36 -0
  77. package/dist/opencode-multiagent/types.d.ts.map +1 -0
  78. package/dist/opencode-multiagent/utils.d.ts +9 -0
  79. package/dist/opencode-multiagent/utils.d.ts.map +1 -0
  80. package/docs/agents.md +260 -0
  81. package/docs/agents.tr.md +260 -0
  82. package/docs/configuration.md +255 -0
  83. package/docs/configuration.tr.md +255 -0
  84. package/docs/usage-guide.md +226 -0
  85. package/docs/usage-guide.tr.md +227 -0
  86. package/examples/opencode.with-overrides.json +1 -5
  87. package/package.json +23 -13
  88. package/skills/advanced-evaluation/SKILL.md +37 -21
  89. package/skills/advanced-evaluation/manifest.json +2 -13
  90. package/skills/cek-context-engineering/SKILL.md +159 -87
  91. package/skills/cek-context-engineering/manifest.json +1 -3
  92. package/skills/cek-prompt-engineering/SKILL.md +13 -10
  93. package/skills/cek-prompt-engineering/manifest.json +1 -3
  94. package/skills/cek-test-prompt/SKILL.md +38 -28
  95. package/skills/cek-test-prompt/manifest.json +1 -3
  96. package/skills/cek-thought-based-reasoning/SKILL.md +75 -21
  97. package/skills/cek-thought-based-reasoning/manifest.json +1 -3
  98. package/skills/context-degradation/SKILL.md +14 -13
  99. package/skills/context-degradation/manifest.json +1 -3
  100. package/skills/debate/SKILL.md +23 -78
  101. package/skills/debate/manifest.json +2 -12
  102. package/skills/design-first/manifest.json +2 -13
  103. package/skills/dispatching-parallel-agents/SKILL.md +14 -3
  104. package/skills/dispatching-parallel-agents/manifest.json +1 -4
  105. package/skills/drift-analysis/SKILL.md +50 -29
  106. package/skills/drift-analysis/manifest.json +2 -12
  107. package/skills/evaluation/manifest.json +2 -12
  108. package/skills/executing-plans/SKILL.md +15 -8
  109. package/skills/executing-plans/manifest.json +1 -3
  110. package/skills/handoff-protocols/manifest.json +2 -12
  111. package/skills/parallel-investigation/SKILL.md +25 -12
  112. package/skills/parallel-investigation/manifest.json +1 -4
  113. package/skills/reflexion-critique/SKILL.md +21 -10
  114. package/skills/reflexion-critique/manifest.json +1 -3
  115. package/skills/reflexion-reflect/SKILL.md +36 -34
  116. package/skills/reflexion-reflect/manifest.json +2 -10
  117. package/skills/root-cause-analysis/manifest.json +2 -13
  118. package/skills/sadd-judge-with-debate/SKILL.md +50 -26
  119. package/skills/sadd-judge-with-debate/manifest.json +1 -3
  120. package/skills/structured-code-review/manifest.json +2 -11
  121. package/skills/task-decomposition/manifest.json +2 -13
  122. package/skills/verification-before-completion/manifest.json +2 -15
  123. package/skills/verification-gates/SKILL.md +27 -19
  124. package/skills/verification-gates/manifest.json +2 -12
  125. package/defaults/agent-settings.json +0 -102
  126. package/defaults/agent-settings.schema.json +0 -25
  127. package/defaults/flags.json +0 -35
  128. package/defaults/flags.schema.json +0 -119
  129. package/defaults/mcp-defaults.json +0 -47
  130. package/defaults/mcp-defaults.schema.json +0 -38
  131. package/defaults/profiles.json +0 -53
  132. package/defaults/profiles.schema.json +0 -60
  133. package/defaults/team-profiles.json +0 -83
  134. package/src/control-plane.ts +0 -21
  135. package/src/index.ts +0 -8
  136. package/src/opencode-multiagent/compiler.ts +0 -168
  137. package/src/opencode-multiagent/constants.ts +0 -178
  138. package/src/opencode-multiagent/file-lock.ts +0 -90
  139. package/src/opencode-multiagent/hooks.ts +0 -599
  140. package/src/opencode-multiagent/log.ts +0 -12
  141. package/src/opencode-multiagent/mailbox.ts +0 -287
  142. package/src/opencode-multiagent/markdown.ts +0 -99
  143. package/src/opencode-multiagent/mcp.ts +0 -35
  144. package/src/opencode-multiagent/policy.ts +0 -67
  145. package/src/opencode-multiagent/quality.ts +0 -140
  146. package/src/opencode-multiagent/runtime.ts +0 -55
  147. package/src/opencode-multiagent/skills.ts +0 -144
  148. package/src/opencode-multiagent/supervision.ts +0 -156
  149. package/src/opencode-multiagent/task-manager.ts +0 -148
  150. package/src/opencode-multiagent/team-manager.ts +0 -219
  151. package/src/opencode-multiagent/team-tools.ts +0 -359
  152. package/src/opencode-multiagent/telemetry.ts +0 -124
  153. package/src/opencode-multiagent/utils.ts +0 -54
@@ -28,11 +28,13 @@ Activate this skill when:
28
28
  Evaluation approaches fall into two primary categories with distinct reliability profiles:
29
29
 
30
30
  **Direct Scoring**: A single LLM rates one response on a defined scale.
31
+
31
32
  - Best for: Objective criteria (factual accuracy, instruction following, toxicity)
32
33
  - Reliability: Moderate to high for well-defined criteria
33
34
  - Failure mode: Score calibration drift, inconsistent scale interpretation
34
35
 
35
36
  **Pairwise Comparison**: An LLM compares two responses and selects the better one.
37
+
36
38
  - Best for: Subjective preferences (tone, style, persuasiveness)
37
39
  - Reliability: Higher than direct scoring for preferences
38
40
  - Failure mode: Position bias, length bias
@@ -57,12 +59,12 @@ LLM judges exhibit systematic biases that must be actively mitigated:
57
59
 
58
60
  Choose metrics based on the evaluation task structure:
59
61
 
60
- | Task Type | Primary Metrics | Secondary Metrics |
61
- |-----------|-----------------|-------------------|
62
- | Binary classification (pass/fail) | Recall, Precision, F1 | Cohen's κ |
63
- | Ordinal scale (1-5 rating) | Spearman's ρ, Kendall's τ | Cohen's κ (weighted) |
64
- | Pairwise preference | Agreement rate, Position consistency | Confidence calibration |
65
- | Multi-label | Macro-F1, Micro-F1 | Per-label precision/recall |
62
+ | Task Type | Primary Metrics | Secondary Metrics |
63
+ | --------------------------------- | ------------------------------------ | -------------------------- |
64
+ | Binary classification (pass/fail) | Recall, Precision, F1 | Cohen's κ |
65
+ | Ordinal scale (1-5 rating) | Spearman's ρ, Kendall's τ | Cohen's κ (weighted) |
66
+ | Pairwise preference | Agreement rate, Position consistency | Confidence calibration |
67
+ | Multi-label | Macro-F1, Micro-F1 | Per-label precision/recall |
66
68
 
67
69
  The critical insight: High absolute agreement matters less than systematic disagreement patterns. A judge that consistently disagrees with humans on specific criteria is more problematic than one with random noise.
68
70
 
@@ -73,6 +75,7 @@ The critical insight: High absolute agreement matters less than systematic disag
73
75
  Direct scoring requires three components: clear criteria, a calibrated scale, and structured output format.
74
76
 
75
77
  **Criteria Definition Pattern**:
78
+
76
79
  ```
77
80
  Criterion: [Name]
78
81
  Description: [What this criterion measures]
@@ -80,11 +83,13 @@ Weight: [Relative importance, 0-1]
80
83
  ```
81
84
 
82
85
  **Scale Calibration**:
86
+
83
87
  - 1-3 scales: Binary with neutral option, lowest cognitive load
84
88
  - 1-5 scales: Standard Likert, good balance of granularity and reliability
85
89
  - 1-10 scales: High granularity but harder to calibrate, use only with detailed rubrics
86
90
 
87
91
  **Prompt Structure for Direct Scoring**:
92
+
88
93
  ```
89
94
  You are an expert evaluator assessing response quality.
90
95
 
@@ -118,12 +123,14 @@ Respond with structured JSON containing scores, justifications, and summary.
118
123
  Pairwise comparison is inherently more reliable for preference-based evaluation but requires bias mitigation.
119
124
 
120
125
  **Position Bias Mitigation Protocol**:
126
+
121
127
  1. First pass: Response A in first position, Response B in second
122
128
  2. Second pass: Response B in first position, Response A in second
123
129
  3. Consistency check: If passes disagree, return TIE with reduced confidence
124
130
  4. Final verdict: Consistent winner with averaged confidence
125
131
 
126
132
  **Prompt Structure for Pairwise Comparison**:
133
+
127
134
  ```
128
135
  You are an expert evaluator comparing two AI responses.
129
136
 
@@ -155,6 +162,7 @@ JSON with per-criterion comparison, overall winner, confidence (0-1), and reason
155
162
  ```
156
163
 
157
164
  **Confidence Calibration**: Confidence scores should reflect position consistency:
165
+
158
166
  - Both passes agree: confidence = average of individual confidences
159
167
  - Passes disagree: confidence = 0.5, verdict = TIE
160
168
 
@@ -163,6 +171,7 @@ JSON with per-criterion comparison, overall winner, confidence (0-1), and reason
163
171
  Well-defined rubrics reduce evaluation variance by 40-60% compared to open-ended scoring.
164
172
 
165
173
  **Rubric Components**:
174
+
166
175
  1. **Level descriptions**: Clear boundaries for each score level
167
176
  2. **Characteristics**: Observable features that define each level
168
177
  3. **Examples**: Representative text for each level (optional but valuable)
@@ -170,6 +179,7 @@ Well-defined rubrics reduce evaluation variance by 40-60% compared to open-ended
170
179
  5. **Scoring guidelines**: General principles for consistent application
171
180
 
172
181
  **Strictness Calibration**:
182
+
173
183
  - **Lenient**: Lower bar for passing scores, appropriate for encouraging iteration
174
184
  - **Balanced**: Fair, typical expectations for production use
175
185
  - **Strict**: High standards, appropriate for safety-critical or high-stakes evaluation
@@ -218,22 +228,27 @@ Production evaluation systems require multiple layers:
218
228
  ### Common Anti-Patterns
219
229
 
220
230
  **Anti-pattern: Scoring without justification**
231
+
221
232
  - Problem: Scores lack grounding, difficult to debug or improve
222
233
  - Solution: Always require evidence-based justification before score
223
234
 
224
235
  **Anti-pattern: Single-pass pairwise comparison**
236
+
225
237
  - Problem: Position bias corrupts results
226
238
  - Solution: Always swap positions and check consistency
227
239
 
228
240
  **Anti-pattern: Overloaded criteria**
241
+
229
242
  - Problem: Criteria measuring multiple things are unreliable
230
243
  - Solution: One criterion = one measurable aspect
231
244
 
232
245
  **Anti-pattern: Missing edge case guidance**
246
+
233
247
  - Problem: Evaluators handle ambiguous cases inconsistently
234
248
  - Solution: Include edge cases in rubrics with explicit guidance
235
249
 
236
250
  **Anti-pattern: Ignoring confidence calibration**
251
+
237
252
  - Problem: High-confidence wrong judgments are worse than low-confidence ones
238
253
  - Solution: Calibrate confidence to position consistency and evidence strength
239
254
 
@@ -275,15 +290,17 @@ For high-volume evaluation:
275
290
  ### Example 1: Direct Scoring for Accuracy
276
291
 
277
292
  **Input**:
293
+
278
294
  ```
279
295
  Prompt: "What causes seasons on Earth?"
280
- Response: "Seasons are caused by Earth's tilted axis. As Earth orbits the Sun,
296
+ Response: "Seasons are caused by Earth's tilted axis. As Earth orbits the Sun,
281
297
  different hemispheres receive more direct sunlight at different times of year."
282
298
  Criterion: Factual Accuracy (weight: 1.0)
283
299
  Scale: 1-5
284
300
  ```
285
301
 
286
302
  **Output**:
303
+
287
304
  ```json
288
305
  {
289
306
  "criterion": "Factual Accuracy",
@@ -293,8 +310,8 @@ Scale: 1-5
293
310
  "Correctly explains differential sunlight by hemisphere",
294
311
  "No factual errors present"
295
312
  ],
296
- "justification": "Response accurately explains the cause of seasons with correct
297
- scientific reasoning. Both the axial tilt and its effect on sunlight distribution
313
+ "justification": "Response accurately explains the cause of seasons with correct
314
+ scientific reasoning. Both the axial tilt and its effect on sunlight distribution
298
315
  are correctly described.",
299
316
  "improvement": "Could add the specific tilt angle (23.5°) for completeness."
300
317
  }
@@ -303,6 +320,7 @@ are correctly described.",
303
320
  ### Example 2: Pairwise Comparison with Position Swap
304
321
 
305
322
  **Input**:
323
+
306
324
  ```
307
325
  Prompt: "Explain machine learning to a beginner"
308
326
  Response A: [Technical explanation with jargon]
@@ -311,22 +329,27 @@ Criteria: ["clarity", "accessibility"]
311
329
  ```
312
330
 
313
331
  **First Pass (A first)**:
332
+
314
333
  ```json
315
334
  { "winner": "B", "confidence": 0.8 }
316
335
  ```
317
336
 
318
337
  **Second Pass (B first)**:
338
+
319
339
  ```json
320
340
  { "winner": "A", "confidence": 0.6 }
321
341
  ```
342
+
322
343
  (Note: The raw winner is "A" because Response B was shown in the first position in this pass, so "A" here maps back to Response B)
323
344
 
324
345
  **Mapped Second Pass**:
346
+
325
347
  ```json
326
348
  { "winner": "B", "confidence": 0.6 }
327
349
  ```
328
350
 
329
351
  **Final Result**:
352
+
330
353
  ```json
331
354
  {
332
355
  "winner": "B",
@@ -342,6 +365,7 @@ Criteria: ["clarity", "accessibility"]
342
365
  ### Example 3: Rubric Generation
343
366
 
344
367
  **Input**:
368
+
345
369
  ```
346
370
  criterionName: "Code Readability"
347
371
  criterionDescription: "How easy the code is to understand and maintain"
@@ -351,6 +375,7 @@ strictness: "balanced"
351
375
  ```
352
376
 
353
377
  **Output** (abbreviated):
378
+
354
379
  ```json
355
380
  {
356
381
  "levels": [
@@ -420,28 +445,20 @@ strictness: "balanced"
420
445
 
421
446
  This skill integrates with:
422
447
 
423
- - **context-fundamentals** - Evaluation prompts require effective context structure
424
- - **tool-design** - Evaluation tools need proper schemas and error handling
425
- - **context-optimization** - Evaluation prompts can be optimized for token efficiency
426
448
  - **evaluation** (foundational) - This skill extends the foundational evaluation concepts
427
449
 
428
450
  ## References
429
451
 
430
- Internal reference:
431
- - [LLM-as-Judge Implementation Patterns](./references/implementation-patterns.md)
432
- - [Bias Mitigation Techniques](./references/bias-mitigation.md)
433
- - [Metric Selection Guide](./references/metrics-guide.md)
434
-
435
452
  External research:
453
+
436
454
  - [Eugene Yan: Evaluating the Effectiveness of LLM-Evaluators](https://eugeneyan.com/writing/llm-evaluators/)
437
455
  - [Judging LLM-as-a-Judge (Zheng et al., 2023)](https://arxiv.org/abs/2306.05685)
438
456
  - [G-Eval: NLG Evaluation using GPT-4 (Liu et al., 2023)](https://arxiv.org/abs/2303.16634)
439
457
  - [Large Language Models are not Fair Evaluators (Wang et al., 2023)](https://arxiv.org/abs/2305.17926)
440
458
 
441
459
  Related skills in this collection:
442
- - evaluation - Foundational evaluation concepts
443
- - context-fundamentals - Context structure for evaluation prompts
444
- - tool-design - Building evaluation tools
460
+
461
+ - **evaluation** - Foundational evaluation concepts
445
462
 
446
463
  ---
447
464
 
@@ -451,4 +468,3 @@ Related skills in this collection:
451
468
  **Last Updated**: 2024-12-24
452
469
  **Author**: Muratcan Koylan
453
470
  **Version**: 1.0.0
454
-
@@ -2,19 +2,8 @@
2
2
  "name": "advanced-evaluation",
3
3
  "version": "1.0.0",
4
4
  "description": "Advanced evaluation workflows for comparative and bias-aware judgment tasks",
5
- "triggers": [
6
- "advanced evaluation",
7
- "compare outputs",
8
- "pairwise",
9
- "position bias",
10
- "judge"
11
- ],
12
- "applicable_agents": [
13
- "critic",
14
- "strategist",
15
- "librarian",
16
- "reviewer"
17
- ],
5
+ "triggers": ["advanced evaluation", "compare outputs", "pairwise", "position bias", "judge"],
6
+ "applicable_agents": ["librarian"],
18
7
  "max_context_tokens": 2200,
19
8
  "entry_file": "SKILL.md"
20
9
  }