opencode-multiagent 0.2.1 → 0.3.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +62 -0
- package/CHANGELOG.md +18 -0
- package/CONTRIBUTING.md +36 -0
- package/README.md +41 -165
- package/README.tr.md +84 -0
- package/RELEASE.md +68 -0
- package/agents/advisor.md +9 -6
- package/agents/auditor.md +8 -6
- package/agents/critic.md +19 -10
- package/agents/deep-worker.md +11 -7
- package/agents/devil.md +3 -1
- package/agents/executor.md +20 -19
- package/agents/heavy-worker.md +11 -7
- package/agents/lead.md +22 -30
- package/agents/librarian.md +6 -2
- package/agents/planner.md +18 -10
- package/agents/qa.md +9 -6
- package/agents/quick.md +12 -7
- package/agents/reviewer.md +9 -6
- package/agents/scout.md +9 -5
- package/agents/scribe.md +33 -28
- package/agents/strategist.md +10 -7
- package/agents/ui-heavy-worker.md +11 -7
- package/agents/ui-worker.md +12 -7
- package/agents/validator.md +8 -5
- package/agents/worker.md +12 -7
- package/commands/execute.md +1 -0
- package/commands/init-deep.md +1 -0
- package/commands/init.md +1 -0
- package/commands/inspect.md +1 -0
- package/commands/plan.md +1 -0
- package/commands/quality.md +1 -0
- package/commands/review.md +1 -0
- package/commands/status.md +1 -0
- package/defaults/opencode-multiagent.json +223 -0
- package/defaults/opencode-multiagent.schema.json +249 -0
- package/dist/control-plane.d.ts +4 -0
- package/dist/control-plane.d.ts.map +1 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +1583 -0
- package/dist/opencode-multiagent/compiler.d.ts +19 -0
- package/dist/opencode-multiagent/compiler.d.ts.map +1 -0
- package/dist/opencode-multiagent/constants.d.ts +116 -0
- package/dist/opencode-multiagent/constants.d.ts.map +1 -0
- package/dist/opencode-multiagent/defaults.d.ts +10 -0
- package/dist/opencode-multiagent/defaults.d.ts.map +1 -0
- package/dist/opencode-multiagent/file-lock.d.ts +15 -0
- package/dist/opencode-multiagent/file-lock.d.ts.map +1 -0
- package/dist/opencode-multiagent/hooks.d.ts +62 -0
- package/dist/opencode-multiagent/hooks.d.ts.map +1 -0
- package/dist/opencode-multiagent/log.d.ts +2 -0
- package/dist/opencode-multiagent/log.d.ts.map +1 -0
- package/dist/opencode-multiagent/markdown.d.ts +8 -0
- package/dist/opencode-multiagent/markdown.d.ts.map +1 -0
- package/dist/opencode-multiagent/mcp.d.ts +3 -0
- package/dist/opencode-multiagent/mcp.d.ts.map +1 -0
- package/dist/opencode-multiagent/policy.d.ts +5 -0
- package/dist/opencode-multiagent/policy.d.ts.map +1 -0
- package/dist/opencode-multiagent/quality.d.ts +14 -0
- package/dist/opencode-multiagent/quality.d.ts.map +1 -0
- package/dist/opencode-multiagent/runtime.d.ts +7 -0
- package/dist/opencode-multiagent/runtime.d.ts.map +1 -0
- package/dist/opencode-multiagent/session-tracker.d.ts +32 -0
- package/dist/opencode-multiagent/session-tracker.d.ts.map +1 -0
- package/dist/opencode-multiagent/skills.d.ts +17 -0
- package/dist/opencode-multiagent/skills.d.ts.map +1 -0
- package/dist/opencode-multiagent/supervision.d.ts +12 -0
- package/dist/opencode-multiagent/supervision.d.ts.map +1 -0
- package/dist/opencode-multiagent/task-manager.d.ts +48 -0
- package/dist/opencode-multiagent/task-manager.d.ts.map +1 -0
- package/dist/opencode-multiagent/telemetry.d.ts +26 -0
- package/dist/opencode-multiagent/telemetry.d.ts.map +1 -0
- package/dist/opencode-multiagent/tools.d.ts +56 -0
- package/dist/opencode-multiagent/tools.d.ts.map +1 -0
- package/dist/opencode-multiagent/types.d.ts +36 -0
- package/dist/opencode-multiagent/types.d.ts.map +1 -0
- package/dist/opencode-multiagent/utils.d.ts +9 -0
- package/dist/opencode-multiagent/utils.d.ts.map +1 -0
- package/docs/agents.md +260 -0
- package/docs/agents.tr.md +260 -0
- package/docs/configuration.md +255 -0
- package/docs/configuration.tr.md +255 -0
- package/docs/usage-guide.md +226 -0
- package/docs/usage-guide.tr.md +227 -0
- package/examples/opencode.with-overrides.json +1 -5
- package/package.json +23 -13
- package/skills/advanced-evaluation/SKILL.md +37 -21
- package/skills/advanced-evaluation/manifest.json +2 -13
- package/skills/cek-context-engineering/SKILL.md +159 -87
- package/skills/cek-context-engineering/manifest.json +1 -3
- package/skills/cek-prompt-engineering/SKILL.md +13 -10
- package/skills/cek-prompt-engineering/manifest.json +1 -3
- package/skills/cek-test-prompt/SKILL.md +38 -28
- package/skills/cek-test-prompt/manifest.json +1 -3
- package/skills/cek-thought-based-reasoning/SKILL.md +75 -21
- package/skills/cek-thought-based-reasoning/manifest.json +1 -3
- package/skills/context-degradation/SKILL.md +14 -13
- package/skills/context-degradation/manifest.json +1 -3
- package/skills/debate/SKILL.md +23 -78
- package/skills/debate/manifest.json +2 -12
- package/skills/design-first/manifest.json +2 -13
- package/skills/dispatching-parallel-agents/SKILL.md +14 -3
- package/skills/dispatching-parallel-agents/manifest.json +1 -4
- package/skills/drift-analysis/SKILL.md +50 -29
- package/skills/drift-analysis/manifest.json +2 -12
- package/skills/evaluation/manifest.json +2 -12
- package/skills/executing-plans/SKILL.md +15 -8
- package/skills/executing-plans/manifest.json +1 -3
- package/skills/handoff-protocols/manifest.json +2 -12
- package/skills/parallel-investigation/SKILL.md +25 -12
- package/skills/parallel-investigation/manifest.json +1 -4
- package/skills/reflexion-critique/SKILL.md +21 -10
- package/skills/reflexion-critique/manifest.json +1 -3
- package/skills/reflexion-reflect/SKILL.md +36 -34
- package/skills/reflexion-reflect/manifest.json +2 -10
- package/skills/root-cause-analysis/manifest.json +2 -13
- package/skills/sadd-judge-with-debate/SKILL.md +50 -26
- package/skills/sadd-judge-with-debate/manifest.json +1 -3
- package/skills/structured-code-review/manifest.json +2 -11
- package/skills/task-decomposition/manifest.json +2 -13
- package/skills/verification-before-completion/manifest.json +2 -15
- package/skills/verification-gates/SKILL.md +27 -19
- package/skills/verification-gates/manifest.json +2 -12
- package/defaults/agent-settings.json +0 -102
- package/defaults/agent-settings.schema.json +0 -25
- package/defaults/flags.json +0 -35
- package/defaults/flags.schema.json +0 -119
- package/defaults/mcp-defaults.json +0 -47
- package/defaults/mcp-defaults.schema.json +0 -38
- package/defaults/profiles.json +0 -53
- package/defaults/profiles.schema.json +0 -60
- package/defaults/team-profiles.json +0 -83
- package/src/control-plane.ts +0 -21
- package/src/index.ts +0 -8
- package/src/opencode-multiagent/compiler.ts +0 -168
- package/src/opencode-multiagent/constants.ts +0 -178
- package/src/opencode-multiagent/file-lock.ts +0 -90
- package/src/opencode-multiagent/hooks.ts +0 -599
- package/src/opencode-multiagent/log.ts +0 -12
- package/src/opencode-multiagent/mailbox.ts +0 -287
- package/src/opencode-multiagent/markdown.ts +0 -99
- package/src/opencode-multiagent/mcp.ts +0 -35
- package/src/opencode-multiagent/policy.ts +0 -67
- package/src/opencode-multiagent/quality.ts +0 -140
- package/src/opencode-multiagent/runtime.ts +0 -55
- package/src/opencode-multiagent/skills.ts +0 -144
- package/src/opencode-multiagent/supervision.ts +0 -156
- package/src/opencode-multiagent/task-manager.ts +0 -148
- package/src/opencode-multiagent/team-manager.ts +0 -219
- package/src/opencode-multiagent/team-tools.ts +0 -359
- package/src/opencode-multiagent/telemetry.ts +0 -124
- package/src/opencode-multiagent/utils.ts +0 -54
|
@@ -28,11 +28,13 @@ Activate this skill when:
|
|
|
28
28
|
Evaluation approaches fall into two primary categories with distinct reliability profiles:
|
|
29
29
|
|
|
30
30
|
**Direct Scoring**: A single LLM rates one response on a defined scale.
|
|
31
|
+
|
|
31
32
|
- Best for: Objective criteria (factual accuracy, instruction following, toxicity)
|
|
32
33
|
- Reliability: Moderate to high for well-defined criteria
|
|
33
34
|
- Failure mode: Score calibration drift, inconsistent scale interpretation
|
|
34
35
|
|
|
35
36
|
**Pairwise Comparison**: An LLM compares two responses and selects the better one.
|
|
37
|
+
|
|
36
38
|
- Best for: Subjective preferences (tone, style, persuasiveness)
|
|
37
39
|
- Reliability: Higher than direct scoring for preferences
|
|
38
40
|
- Failure mode: Position bias, length bias
|
|
@@ -57,12 +59,12 @@ LLM judges exhibit systematic biases that must be actively mitigated:
|
|
|
57
59
|
|
|
58
60
|
Choose metrics based on the evaluation task structure:
|
|
59
61
|
|
|
60
|
-
| Task Type | Primary Metrics | Secondary Metrics |
|
|
61
|
-
|
|
62
|
-
| Binary classification (pass/fail) | Recall, Precision, F1 | Cohen's κ |
|
|
63
|
-
| Ordinal scale (1-5 rating) | Spearman's ρ, Kendall's τ | Cohen's κ (weighted) |
|
|
64
|
-
| Pairwise preference | Agreement rate, Position consistency | Confidence calibration |
|
|
65
|
-
| Multi-label | Macro-F1, Micro-F1 | Per-label precision/recall |
|
|
62
|
+
| Task Type | Primary Metrics | Secondary Metrics |
|
|
63
|
+
| --------------------------------- | ------------------------------------ | -------------------------- |
|
|
64
|
+
| Binary classification (pass/fail) | Recall, Precision, F1 | Cohen's κ |
|
|
65
|
+
| Ordinal scale (1-5 rating) | Spearman's ρ, Kendall's τ | Cohen's κ (weighted) |
|
|
66
|
+
| Pairwise preference | Agreement rate, Position consistency | Confidence calibration |
|
|
67
|
+
| Multi-label | Macro-F1, Micro-F1 | Per-label precision/recall |
|
|
66
68
|
|
|
67
69
|
The critical insight: High absolute agreement matters less than systematic disagreement patterns. A judge that consistently disagrees with humans on specific criteria is more problematic than one with random noise.
|
|
68
70
|
|
|
@@ -73,6 +75,7 @@ The critical insight: High absolute agreement matters less than systematic disag
|
|
|
73
75
|
Direct scoring requires three components: clear criteria, a calibrated scale, and structured output format.
|
|
74
76
|
|
|
75
77
|
**Criteria Definition Pattern**:
|
|
78
|
+
|
|
76
79
|
```
|
|
77
80
|
Criterion: [Name]
|
|
78
81
|
Description: [What this criterion measures]
|
|
@@ -80,11 +83,13 @@ Weight: [Relative importance, 0-1]
|
|
|
80
83
|
```
|
|
81
84
|
|
|
82
85
|
**Scale Calibration**:
|
|
86
|
+
|
|
83
87
|
- 1-3 scales: Binary with neutral option, lowest cognitive load
|
|
84
88
|
- 1-5 scales: Standard Likert, good balance of granularity and reliability
|
|
85
89
|
- 1-10 scales: High granularity but harder to calibrate, use only with detailed rubrics
|
|
86
90
|
|
|
87
91
|
**Prompt Structure for Direct Scoring**:
|
|
92
|
+
|
|
88
93
|
```
|
|
89
94
|
You are an expert evaluator assessing response quality.
|
|
90
95
|
|
|
@@ -118,12 +123,14 @@ Respond with structured JSON containing scores, justifications, and summary.
|
|
|
118
123
|
Pairwise comparison is inherently more reliable for preference-based evaluation but requires bias mitigation.
|
|
119
124
|
|
|
120
125
|
**Position Bias Mitigation Protocol**:
|
|
126
|
+
|
|
121
127
|
1. First pass: Response A in first position, Response B in second
|
|
122
128
|
2. Second pass: Response B in first position, Response A in second
|
|
123
129
|
3. Consistency check: If passes disagree, return TIE with reduced confidence
|
|
124
130
|
4. Final verdict: Consistent winner with averaged confidence
|
|
125
131
|
|
|
126
132
|
**Prompt Structure for Pairwise Comparison**:
|
|
133
|
+
|
|
127
134
|
```
|
|
128
135
|
You are an expert evaluator comparing two AI responses.
|
|
129
136
|
|
|
@@ -155,6 +162,7 @@ JSON with per-criterion comparison, overall winner, confidence (0-1), and reason
|
|
|
155
162
|
```
|
|
156
163
|
|
|
157
164
|
**Confidence Calibration**: Confidence scores should reflect position consistency:
|
|
165
|
+
|
|
158
166
|
- Both passes agree: confidence = average of individual confidences
|
|
159
167
|
- Passes disagree: confidence = 0.5, verdict = TIE
|
|
160
168
|
|
|
@@ -163,6 +171,7 @@ JSON with per-criterion comparison, overall winner, confidence (0-1), and reason
|
|
|
163
171
|
Well-defined rubrics reduce evaluation variance by 40-60% compared to open-ended scoring.
|
|
164
172
|
|
|
165
173
|
**Rubric Components**:
|
|
174
|
+
|
|
166
175
|
1. **Level descriptions**: Clear boundaries for each score level
|
|
167
176
|
2. **Characteristics**: Observable features that define each level
|
|
168
177
|
3. **Examples**: Representative text for each level (optional but valuable)
|
|
@@ -170,6 +179,7 @@ Well-defined rubrics reduce evaluation variance by 40-60% compared to open-ended
|
|
|
170
179
|
5. **Scoring guidelines**: General principles for consistent application
|
|
171
180
|
|
|
172
181
|
**Strictness Calibration**:
|
|
182
|
+
|
|
173
183
|
- **Lenient**: Lower bar for passing scores, appropriate for encouraging iteration
|
|
174
184
|
- **Balanced**: Fair, typical expectations for production use
|
|
175
185
|
- **Strict**: High standards, appropriate for safety-critical or high-stakes evaluation
|
|
@@ -218,22 +228,27 @@ Production evaluation systems require multiple layers:
|
|
|
218
228
|
### Common Anti-Patterns
|
|
219
229
|
|
|
220
230
|
**Anti-pattern: Scoring without justification**
|
|
231
|
+
|
|
221
232
|
- Problem: Scores lack grounding, difficult to debug or improve
|
|
222
233
|
- Solution: Always require evidence-based justification before score
|
|
223
234
|
|
|
224
235
|
**Anti-pattern: Single-pass pairwise comparison**
|
|
236
|
+
|
|
225
237
|
- Problem: Position bias corrupts results
|
|
226
238
|
- Solution: Always swap positions and check consistency
|
|
227
239
|
|
|
228
240
|
**Anti-pattern: Overloaded criteria**
|
|
241
|
+
|
|
229
242
|
- Problem: Criteria measuring multiple things are unreliable
|
|
230
243
|
- Solution: One criterion = one measurable aspect
|
|
231
244
|
|
|
232
245
|
**Anti-pattern: Missing edge case guidance**
|
|
246
|
+
|
|
233
247
|
- Problem: Evaluators handle ambiguous cases inconsistently
|
|
234
248
|
- Solution: Include edge cases in rubrics with explicit guidance
|
|
235
249
|
|
|
236
250
|
**Anti-pattern: Ignoring confidence calibration**
|
|
251
|
+
|
|
237
252
|
- Problem: High-confidence wrong judgments are worse than low-confidence ones
|
|
238
253
|
- Solution: Calibrate confidence to position consistency and evidence strength
|
|
239
254
|
|
|
@@ -275,15 +290,17 @@ For high-volume evaluation:
|
|
|
275
290
|
### Example 1: Direct Scoring for Accuracy
|
|
276
291
|
|
|
277
292
|
**Input**:
|
|
293
|
+
|
|
278
294
|
```
|
|
279
295
|
Prompt: "What causes seasons on Earth?"
|
|
280
|
-
Response: "Seasons are caused by Earth's tilted axis. As Earth orbits the Sun,
|
|
296
|
+
Response: "Seasons are caused by Earth's tilted axis. As Earth orbits the Sun,
|
|
281
297
|
different hemispheres receive more direct sunlight at different times of year."
|
|
282
298
|
Criterion: Factual Accuracy (weight: 1.0)
|
|
283
299
|
Scale: 1-5
|
|
284
300
|
```
|
|
285
301
|
|
|
286
302
|
**Output**:
|
|
303
|
+
|
|
287
304
|
```json
|
|
288
305
|
{
|
|
289
306
|
"criterion": "Factual Accuracy",
|
|
@@ -293,8 +310,8 @@ Scale: 1-5
|
|
|
293
310
|
"Correctly explains differential sunlight by hemisphere",
|
|
294
311
|
"No factual errors present"
|
|
295
312
|
],
|
|
296
|
-
"justification": "Response accurately explains the cause of seasons with correct
|
|
297
|
-
scientific reasoning. Both the axial tilt and its effect on sunlight distribution
|
|
313
|
+
"justification": "Response accurately explains the cause of seasons with correct
|
|
314
|
+
scientific reasoning. Both the axial tilt and its effect on sunlight distribution
|
|
298
315
|
are correctly described.",
|
|
299
316
|
"improvement": "Could add the specific tilt angle (23.5°) for completeness."
|
|
300
317
|
}
|
|
@@ -303,6 +320,7 @@ are correctly described.",
|
|
|
303
320
|
### Example 2: Pairwise Comparison with Position Swap
|
|
304
321
|
|
|
305
322
|
**Input**:
|
|
323
|
+
|
|
306
324
|
```
|
|
307
325
|
Prompt: "Explain machine learning to a beginner"
|
|
308
326
|
Response A: [Technical explanation with jargon]
|
|
@@ -311,22 +329,27 @@ Criteria: ["clarity", "accessibility"]
|
|
|
311
329
|
```
|
|
312
330
|
|
|
313
331
|
**First Pass (A first)**:
|
|
332
|
+
|
|
314
333
|
```json
|
|
315
334
|
{ "winner": "B", "confidence": 0.8 }
|
|
316
335
|
```
|
|
317
336
|
|
|
318
337
|
**Second Pass (B first)**:
|
|
338
|
+
|
|
319
339
|
```json
|
|
320
340
|
{ "winner": "A", "confidence": 0.6 }
|
|
321
341
|
```
|
|
342
|
+
|
|
322
343
|
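(Note: The judge reports "A" because positions are swapped in this pass — Response B was shown in the first position, so "A" here refers to Response B)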
|
|
323
344
|
|
|
324
345
|
**Mapped Second Pass**:
|
|
346
|
+
|
|
325
347
|
```json
|
|
326
348
|
{ "winner": "B", "confidence": 0.6 }
|
|
327
349
|
```
|
|
328
350
|
|
|
329
351
|
**Final Result**:
|
|
352
|
+
|
|
330
353
|
```json
|
|
331
354
|
{
|
|
332
355
|
"winner": "B",
|
|
@@ -342,6 +365,7 @@ Criteria: ["clarity", "accessibility"]
|
|
|
342
365
|
### Example 3: Rubric Generation
|
|
343
366
|
|
|
344
367
|
**Input**:
|
|
368
|
+
|
|
345
369
|
```
|
|
346
370
|
criterionName: "Code Readability"
|
|
347
371
|
criterionDescription: "How easy the code is to understand and maintain"
|
|
@@ -351,6 +375,7 @@ strictness: "balanced"
|
|
|
351
375
|
```
|
|
352
376
|
|
|
353
377
|
**Output** (abbreviated):
|
|
378
|
+
|
|
354
379
|
```json
|
|
355
380
|
{
|
|
356
381
|
"levels": [
|
|
@@ -420,28 +445,20 @@ strictness: "balanced"
|
|
|
420
445
|
|
|
421
446
|
This skill integrates with:
|
|
422
447
|
|
|
423
|
-
- **context-fundamentals** - Evaluation prompts require effective context structure
|
|
424
|
-
- **tool-design** - Evaluation tools need proper schemas and error handling
|
|
425
|
-
- **context-optimization** - Evaluation prompts can be optimized for token efficiency
|
|
426
448
|
- **evaluation** (foundational) - This skill extends the foundational evaluation concepts
|
|
427
449
|
|
|
428
450
|
## References
|
|
429
451
|
|
|
430
|
-
Internal reference:
|
|
431
|
-
- [LLM-as-Judge Implementation Patterns](./references/implementation-patterns.md)
|
|
432
|
-
- [Bias Mitigation Techniques](./references/bias-mitigation.md)
|
|
433
|
-
- [Metric Selection Guide](./references/metrics-guide.md)
|
|
434
|
-
|
|
435
452
|
External research:
|
|
453
|
+
|
|
436
454
|
- [Eugene Yan: Evaluating the Effectiveness of LLM-Evaluators](https://eugeneyan.com/writing/llm-evaluators/)
|
|
437
455
|
- [Judging LLM-as-a-Judge (Zheng et al., 2023)](https://arxiv.org/abs/2306.05685)
|
|
438
456
|
- [G-Eval: NLG Evaluation using GPT-4 (Liu et al., 2023)](https://arxiv.org/abs/2303.16634)
|
|
439
457
|
- [Large Language Models are not Fair Evaluators (Wang et al., 2023)](https://arxiv.org/abs/2305.17926)
|
|
440
458
|
|
|
441
459
|
Related skills in this collection:
|
|
442
|
-
|
|
443
|
-
-
|
|
444
|
-
- tool-design - Building evaluation tools
|
|
460
|
+
|
|
461
|
+
- **evaluation** - Foundational evaluation concepts
|
|
445
462
|
|
|
446
463
|
---
|
|
447
464
|
|
|
@@ -451,4 +468,3 @@ Related skills in this collection:
|
|
|
451
468
|
**Last Updated**: 2024-12-24
|
|
452
469
|
**Author**: Muratcan Koylan
|
|
453
470
|
**Version**: 1.0.0
|
|
454
|
-
|
|
@@ -2,19 +2,8 @@
|
|
|
2
2
|
"name": "advanced-evaluation",
|
|
3
3
|
"version": "1.0.0",
|
|
4
4
|
"description": "Advanced evaluation workflows for comparative and bias-aware judgment tasks",
|
|
5
|
-
"triggers": [
|
|
6
|
-
|
|
7
|
-
"compare outputs",
|
|
8
|
-
"pairwise",
|
|
9
|
-
"position bias",
|
|
10
|
-
"judge"
|
|
11
|
-
],
|
|
12
|
-
"applicable_agents": [
|
|
13
|
-
"critic",
|
|
14
|
-
"strategist",
|
|
15
|
-
"librarian",
|
|
16
|
-
"reviewer"
|
|
17
|
-
],
|
|
5
|
+
"triggers": ["advanced evaluation", "compare outputs", "pairwise", "position bias", "judge"],
|
|
6
|
+
"applicable_agents": ["librarian"],
|
|
18
7
|
"max_context_tokens": 2200,
|
|
19
8
|
"entry_file": "SKILL.md"
|
|
20
9
|
}
|