opencode-multiagent 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +209 -0
  3. package/agents/advisor.md +57 -0
  4. package/agents/auditor.md +45 -0
  5. package/agents/critic.md +127 -0
  6. package/agents/deep-worker.md +65 -0
  7. package/agents/devil.md +36 -0
  8. package/agents/executor.md +141 -0
  9. package/agents/heavy-worker.md +68 -0
  10. package/agents/lead.md +155 -0
  11. package/agents/librarian.md +62 -0
  12. package/agents/planner.md +121 -0
  13. package/agents/qa.md +50 -0
  14. package/agents/quick.md +65 -0
  15. package/agents/reviewer.md +55 -0
  16. package/agents/scout.md +58 -0
  17. package/agents/scribe.md +78 -0
  18. package/agents/strategist.md +63 -0
  19. package/agents/ui-heavy-worker.md +62 -0
  20. package/agents/ui-worker.md +69 -0
  21. package/agents/validator.md +47 -0
  22. package/agents/worker.md +68 -0
  23. package/commands/execute.md +14 -0
  24. package/commands/init-deep.md +18 -0
  25. package/commands/init.md +18 -0
  26. package/commands/inspect.md +13 -0
  27. package/commands/plan.md +15 -0
  28. package/commands/quality.md +14 -0
  29. package/commands/review.md +14 -0
  30. package/commands/status.md +15 -0
  31. package/defaults/agent-settings.json +102 -0
  32. package/defaults/agent-settings.schema.json +25 -0
  33. package/defaults/flags.json +35 -0
  34. package/defaults/flags.schema.json +119 -0
  35. package/defaults/mcp-defaults.json +47 -0
  36. package/defaults/mcp-defaults.schema.json +38 -0
  37. package/defaults/profiles.json +53 -0
  38. package/defaults/profiles.schema.json +60 -0
  39. package/defaults/team-profiles.json +83 -0
  40. package/examples/opencode.json +4 -0
  41. package/examples/opencode.with-overrides.json +23 -0
  42. package/package.json +62 -0
  43. package/skills/advanced-evaluation/SKILL.md +454 -0
  44. package/skills/advanced-evaluation/manifest.json +20 -0
  45. package/skills/cek-context-engineering/SKILL.md +1261 -0
  46. package/skills/cek-context-engineering/manifest.json +17 -0
  47. package/skills/cek-prompt-engineering/SKILL.md +559 -0
  48. package/skills/cek-prompt-engineering/manifest.json +17 -0
  49. package/skills/cek-test-prompt/SKILL.md +714 -0
  50. package/skills/cek-test-prompt/manifest.json +17 -0
  51. package/skills/cek-thought-based-reasoning/SKILL.md +658 -0
  52. package/skills/cek-thought-based-reasoning/manifest.json +17 -0
  53. package/skills/context-degradation/SKILL.md +231 -0
  54. package/skills/context-degradation/manifest.json +17 -0
  55. package/skills/debate/SKILL.md +316 -0
  56. package/skills/debate/manifest.json +19 -0
  57. package/skills/design-first/SKILL.md +5 -0
  58. package/skills/design-first/manifest.json +20 -0
  59. package/skills/dispatching-parallel-agents/SKILL.md +180 -0
  60. package/skills/dispatching-parallel-agents/manifest.json +18 -0
  61. package/skills/drift-analysis/SKILL.md +324 -0
  62. package/skills/drift-analysis/manifest.json +19 -0
  63. package/skills/evaluation/SKILL.md +5 -0
  64. package/skills/evaluation/manifest.json +19 -0
  65. package/skills/executing-plans/SKILL.md +70 -0
  66. package/skills/executing-plans/manifest.json +17 -0
  67. package/skills/handoff-protocols/SKILL.md +5 -0
  68. package/skills/handoff-protocols/manifest.json +19 -0
  69. package/skills/parallel-investigation/SKILL.md +206 -0
  70. package/skills/parallel-investigation/manifest.json +18 -0
  71. package/skills/reflexion-critique/SKILL.md +477 -0
  72. package/skills/reflexion-critique/manifest.json +17 -0
  73. package/skills/reflexion-reflect/SKILL.md +650 -0
  74. package/skills/reflexion-reflect/manifest.json +17 -0
  75. package/skills/root-cause-analysis/SKILL.md +5 -0
  76. package/skills/root-cause-analysis/manifest.json +20 -0
  77. package/skills/sadd-judge-with-debate/SKILL.md +426 -0
  78. package/skills/sadd-judge-with-debate/manifest.json +17 -0
  79. package/skills/structured-code-review/SKILL.md +5 -0
  80. package/skills/structured-code-review/manifest.json +18 -0
  81. package/skills/task-decomposition/SKILL.md +5 -0
  82. package/skills/task-decomposition/manifest.json +20 -0
  83. package/skills/verification-before-completion/SKILL.md +5 -0
  84. package/skills/verification-before-completion/manifest.json +22 -0
  85. package/skills/verification-gates/SKILL.md +281 -0
  86. package/skills/verification-gates/manifest.json +19 -0
  87. package/src/control-plane.ts +21 -0
  88. package/src/index.ts +8 -0
  89. package/src/opencode-multiagent/compiler.ts +168 -0
  90. package/src/opencode-multiagent/constants.ts +178 -0
  91. package/src/opencode-multiagent/file-lock.ts +90 -0
  92. package/src/opencode-multiagent/hooks.ts +599 -0
  93. package/src/opencode-multiagent/log.ts +12 -0
  94. package/src/opencode-multiagent/mailbox.ts +287 -0
  95. package/src/opencode-multiagent/markdown.ts +99 -0
  96. package/src/opencode-multiagent/mcp.ts +35 -0
  97. package/src/opencode-multiagent/policy.ts +67 -0
  98. package/src/opencode-multiagent/quality.ts +140 -0
  99. package/src/opencode-multiagent/runtime.ts +55 -0
  100. package/src/opencode-multiagent/skills.ts +144 -0
  101. package/src/opencode-multiagent/supervision.ts +156 -0
  102. package/src/opencode-multiagent/task-manager.ts +148 -0
  103. package/src/opencode-multiagent/team-manager.ts +219 -0
  104. package/src/opencode-multiagent/team-tools.ts +359 -0
  105. package/src/opencode-multiagent/telemetry.ts +124 -0
  106. package/src/opencode-multiagent/utils.ts +54 -0
@@ -0,0 +1,454 @@
1
+ ---
2
+ name: advanced-evaluation
3
+ description: This skill should be used when the user asks to "implement LLM-as-judge", "compare model outputs", "create evaluation rubrics", "mitigate evaluation bias", or mentions direct scoring, pairwise comparison, position bias, evaluation pipelines, or automated quality assessment.
4
+ ---
5
+
6
+ # Advanced Evaluation
7
+
8
+ This skill covers production-grade techniques for evaluating LLM outputs using LLMs as judges. It synthesizes research from academic papers, industry practices, and practical implementation experience into actionable patterns for building reliable evaluation systems.
9
+
10
+ **Key insight**: LLM-as-a-Judge is not a single technique but a family of approaches, each suited to different evaluation contexts. Choosing the right approach and mitigating known biases is the core competency this skill develops.
11
+
12
+ ## When to Activate
13
+
14
+ Activate this skill when:
15
+
16
+ - Building automated evaluation pipelines for LLM outputs
17
+ - Comparing multiple model responses to select the best one
18
+ - Establishing consistent quality standards across evaluation teams
19
+ - Debugging evaluation systems that show inconsistent results
20
+ - Designing A/B tests for prompt or model changes
21
+ - Creating rubrics for human or automated evaluation
22
+ - Analyzing correlation between automated and human judgments
23
+
24
+ ## Core Concepts
25
+
26
+ ### The Evaluation Taxonomy
27
+
28
+ Evaluation approaches fall into two primary categories with distinct reliability profiles:
29
+
30
+ **Direct Scoring**: A single LLM rates one response on a defined scale.
31
+ - Best for: Objective criteria (factual accuracy, instruction following, toxicity)
32
+ - Reliability: Moderate to high for well-defined criteria
33
+ - Failure mode: Score calibration drift, inconsistent scale interpretation
34
+
35
+ **Pairwise Comparison**: An LLM compares two responses and selects the better one.
36
+ - Best for: Subjective preferences (tone, style, persuasiveness)
37
+ - Reliability: Higher than direct scoring for preferences
38
+ - Failure mode: Position bias, length bias
39
+
40
+ Research from the MT-Bench paper (Zheng et al., 2023) establishes that pairwise comparison achieves higher agreement with human judges than direct scoring for preference-based evaluation, while direct scoring remains appropriate for objective criteria with clear ground truth.
41
+
42
+ ### The Bias Landscape
43
+
44
+ LLM judges exhibit systematic biases that must be actively mitigated:
45
+
46
+ **Position Bias**: First-position responses receive preferential treatment in pairwise comparison. Mitigation: Evaluate twice with swapped positions, use majority vote or consistency check.
47
+
48
+ **Length Bias**: Longer responses are rated higher regardless of quality. Mitigation: Explicit prompting to ignore length, length-normalized scoring.
49
+
50
+ **Self-Enhancement Bias**: Models rate their own outputs higher. Mitigation: Use different models for generation and evaluation, or explicitly acknowledge the limitation in reported results.
51
+
52
+ **Verbosity Bias**: Detailed explanations receive higher scores even when unnecessary. Mitigation: Criteria-specific rubrics that penalize irrelevant detail.
53
+
54
+ **Authority Bias**: Confident, authoritative tone rated higher regardless of accuracy. Mitigation: Require evidence citation, fact-checking layer.
55
+
56
+ ### Metric Selection Framework
57
+
58
+ Choose metrics based on the evaluation task structure:
59
+
60
+ | Task Type | Primary Metrics | Secondary Metrics |
61
+ |-----------|-----------------|-------------------|
62
+ | Binary classification (pass/fail) | Recall, Precision, F1 | Cohen's κ |
63
+ | Ordinal scale (1-5 rating) | Spearman's ρ, Kendall's τ | Cohen's κ (weighted) |
64
+ | Pairwise preference | Agreement rate, Position consistency | Confidence calibration |
65
+ | Multi-label | Macro-F1, Micro-F1 | Per-label precision/recall |
66
+
67
+ The critical insight: High absolute agreement matters less than systematic disagreement patterns. A judge that consistently disagrees with humans on specific criteria is more problematic than one with random noise.
68
+
69
+ ## Evaluation Approaches
70
+
71
+ ### Direct Scoring Implementation
72
+
73
+ Direct scoring requires three components: clear criteria, a calibrated scale, and structured output format.
74
+
75
+ **Criteria Definition Pattern**:
76
+ ```
77
+ Criterion: [Name]
78
+ Description: [What this criterion measures]
79
+ Weight: [Relative importance, 0-1]
80
+ ```
81
+
82
+ **Scale Calibration**:
83
+ - 1-3 scales: Binary with neutral option, lowest cognitive load
84
+ - 1-5 scales: Standard Likert, good balance of granularity and reliability
85
+ - 1-10 scales: High granularity but harder to calibrate, use only with detailed rubrics
86
+
87
+ **Prompt Structure for Direct Scoring**:
88
+ ```
89
+ You are an expert evaluator assessing response quality.
90
+
91
+ ## Task
92
+ Evaluate the following response against each criterion.
93
+
94
+ ## Original Prompt
95
+ {prompt}
96
+
97
+ ## Response to Evaluate
98
+ {response}
99
+
100
+ ## Criteria
101
+ {for each criterion: name, description, weight}
102
+
103
+ ## Instructions
104
+ For each criterion:
105
+ 1. Find specific evidence in the response
106
+ 2. Score according to the rubric (1-{max} scale)
107
+ 3. Justify your score with evidence
108
+ 4. Suggest one specific improvement
109
+
110
+ ## Output Format
111
+ Respond with structured JSON containing scores, justifications, and summary.
112
+ ```
113
+
114
+ **Chain-of-Thought Requirement**: All scoring prompts must require justification before the score. Research shows this improves reliability by 15-25% compared to score-first approaches.
115
+
116
+ ### Pairwise Comparison Implementation
117
+
118
+ Pairwise comparison is inherently more reliable for preference-based evaluation but requires bias mitigation.
119
+
120
+ **Position Bias Mitigation Protocol**:
121
+ 1. First pass: Response A in first position, Response B in second
122
+ 2. Second pass: Response B in first position, Response A in second
123
+ 3. Consistency check: If passes disagree, return TIE with reduced confidence
124
+ 4. Final verdict: Consistent winner with averaged confidence
125
+
126
+ **Prompt Structure for Pairwise Comparison**:
127
+ ```
128
+ You are an expert evaluator comparing two AI responses.
129
+
130
+ ## Critical Instructions
131
+ - Do NOT prefer responses because they are longer
132
+ - Do NOT prefer responses based on position (first vs second)
133
+ - Focus ONLY on quality according to the specified criteria
134
+ - Ties are acceptable when responses are genuinely equivalent
135
+
136
+ ## Original Prompt
137
+ {prompt}
138
+
139
+ ## Response A
140
+ {response_a}
141
+
142
+ ## Response B
143
+ {response_b}
144
+
145
+ ## Comparison Criteria
146
+ {criteria list}
147
+
148
+ ## Instructions
149
+ 1. Analyze each response independently first
150
+ 2. Compare them on each criterion
151
+ 3. Determine overall winner with confidence level
152
+
153
+ ## Output Format
154
+ JSON with per-criterion comparison, overall winner, confidence (0-1), and reasoning.
155
+ ```
156
+
157
+ **Confidence Calibration**: Confidence scores should reflect position consistency:
158
+ - Both passes agree: confidence = average of individual confidences
159
+ - Passes disagree: confidence = 0.5, verdict = TIE
160
+
161
+ ### Rubric Generation
162
+
163
+ Well-defined rubrics reduce evaluation variance by 40-60% compared to open-ended scoring.
164
+
165
+ **Rubric Components**:
166
+ 1. **Level descriptions**: Clear boundaries for each score level
167
+ 2. **Characteristics**: Observable features that define each level
168
+ 3. **Examples**: Representative text for each level (optional but valuable)
169
+ 4. **Edge cases**: Guidance for ambiguous situations
170
+ 5. **Scoring guidelines**: General principles for consistent application
171
+
172
+ **Strictness Calibration**:
173
+ - **Lenient**: Lower bar for passing scores, appropriate for encouraging iteration
174
+ - **Balanced**: Fair, typical expectations for production use
175
+ - **Strict**: High standards, appropriate for safety-critical or high-stakes evaluation
176
+
177
+ **Domain Adaptation**: Rubrics should use domain-specific terminology. A "code readability" rubric mentions variables, functions, and comments. A "medical accuracy" rubric references clinical terminology and evidence standards.
178
+
179
+ ## Practical Guidance
180
+
181
+ ### Evaluation Pipeline Design
182
+
183
+ Production evaluation systems require multiple layers:
184
+
185
+ ```
186
+ ┌─────────────────────────────────────────────────┐
187
+ │ Evaluation Pipeline │
188
+ ├─────────────────────────────────────────────────┤
189
+ │ │
190
+ │ Input: Response + Prompt + Context │
191
+ │ │ │
192
+ │ ▼ │
193
+ │ ┌─────────────────────┐ │
194
+ │ │ Criteria Loader │ ◄── Rubrics, weights │
195
+ │ └──────────┬──────────┘ │
196
+ │ │ │
197
+ │ ▼ │
198
+ │ ┌─────────────────────┐ │
199
+ │ │ Primary Scorer │ ◄── Direct or Pairwise │
200
+ │ └──────────┬──────────┘ │
201
+ │ │ │
202
+ │ ▼ │
203
+ │ ┌─────────────────────┐ │
204
+ │ │ Bias Mitigation │ ◄── Position swap, etc. │
205
+ │ └──────────┬──────────┘ │
206
+ │ │ │
207
+ │ ▼ │
208
+ │ ┌─────────────────────┐ │
209
+ │ │ Confidence Scoring │ ◄── Calibration │
210
+ │ └──────────┬──────────┘ │
211
+ │ │ │
212
+ │ ▼ │
213
+ │ Output: Scores + Justifications + Confidence │
214
+ │ │
215
+ └─────────────────────────────────────────────────┘
216
+ ```
217
+
218
+ ### Common Anti-Patterns
219
+
220
+ **Anti-pattern: Scoring without justification**
221
+ - Problem: Scores lack grounding, difficult to debug or improve
222
+ - Solution: Always require evidence-based justification before score
223
+
224
+ **Anti-pattern: Single-pass pairwise comparison**
225
+ - Problem: Position bias corrupts results
226
+ - Solution: Always swap positions and check consistency
227
+
228
+ **Anti-pattern: Overloaded criteria**
229
+ - Problem: Criteria measuring multiple things are unreliable
230
+ - Solution: One criterion = one measurable aspect
231
+
232
+ **Anti-pattern: Missing edge case guidance**
233
+ - Problem: Evaluators handle ambiguous cases inconsistently
234
+ - Solution: Include edge cases in rubrics with explicit guidance
235
+
236
+ **Anti-pattern: Ignoring confidence calibration**
237
+ - Problem: High-confidence wrong judgments are worse than low-confidence
238
+ - Solution: Calibrate confidence to position consistency and evidence strength
239
+
240
+ ### Decision Framework: Direct vs. Pairwise
241
+
242
+ Use this decision tree:
243
+
244
+ ```
245
+ Is there an objective ground truth?
246
+ ├── Yes → Direct Scoring
247
+ │ └── Examples: factual accuracy, instruction following, format compliance
248
+
249
+ └── No → Is it a preference or quality judgment?
250
+ ├── Yes → Pairwise Comparison
251
+ │ └── Examples: tone, style, persuasiveness, creativity
252
+
253
+ └── No → Consider reference-based evaluation
254
+ └── Examples: summarization (compare to source), translation (compare to reference)
255
+ ```
256
+
257
+ ### Scaling Evaluation
258
+
259
+ For high-volume evaluation:
260
+
261
+ 1. **Panel of LLMs (PoLL)**: Use multiple models as judges, aggregate votes
262
+ - Reduces individual model bias
263
+ - More expensive but more reliable for high-stakes decisions
264
+
265
+ 2. **Hierarchical evaluation**: Fast cheap model for screening, expensive model for edge cases
266
+ - Cost-effective for large volumes
267
+ - Requires calibration of screening threshold
268
+
269
+ 3. **Human-in-the-loop**: Automated evaluation for clear cases, human review for low-confidence
270
+ - Best reliability for critical applications
271
+ - Design feedback loop to improve automated evaluation
272
+
273
+ ## Examples
274
+
275
+ ### Example 1: Direct Scoring for Accuracy
276
+
277
+ **Input**:
278
+ ```
279
+ Prompt: "What causes seasons on Earth?"
280
+ Response: "Seasons are caused by Earth's tilted axis. As Earth orbits the Sun,
281
+ different hemispheres receive more direct sunlight at different times of year."
282
+ Criterion: Factual Accuracy (weight: 1.0)
283
+ Scale: 1-5
284
+ ```
285
+
286
+ **Output**:
287
+ ```json
288
+ {
289
+ "criterion": "Factual Accuracy",
290
+ "score": 5,
291
+ "evidence": [
292
+ "Correctly identifies axial tilt as primary cause",
293
+ "Correctly explains differential sunlight by hemisphere",
294
+ "No factual errors present"
295
+ ],
296
+ "justification": "Response accurately explains the cause of seasons with correct
297
+ scientific reasoning. Both the axial tilt and its effect on sunlight distribution
298
+ are correctly described.",
299
+ "improvement": "Could add the specific tilt angle (23.5°) for completeness."
300
+ }
301
+ ```
302
+
303
+ ### Example 2: Pairwise Comparison with Position Swap
304
+
305
+ **Input**:
306
+ ```
307
+ Prompt: "Explain machine learning to a beginner"
308
+ Response A: [Technical explanation with jargon]
309
+ Response B: [Simple analogy-based explanation]
310
+ Criteria: ["clarity", "accessibility"]
311
+ ```
312
+
313
+ **First Pass (A first)**:
314
+ ```json
315
+ { "winner": "B", "confidence": 0.8 }
316
+ ```
317
+
318
+ **Second Pass (B first)**:
319
+ ```json
320
+ { "winner": "A", "confidence": 0.6 }
321
+ ```
322
(Note: the raw verdict "A" refers to the first position, which held Response B in this pass)
323
+
324
+ **Mapped Second Pass**:
325
+ ```json
326
+ { "winner": "B", "confidence": 0.6 }
327
+ ```
328
+
329
+ **Final Result**:
330
+ ```json
331
+ {
332
+ "winner": "B",
333
+ "confidence": 0.7,
334
+ "positionConsistency": {
335
+ "consistent": true,
336
+ "firstPassWinner": "B",
337
+ "secondPassWinner": "B"
338
+ }
339
+ }
340
+ ```
341
+
342
+ ### Example 3: Rubric Generation
343
+
344
+ **Input**:
345
+ ```
346
+ criterionName: "Code Readability"
347
+ criterionDescription: "How easy the code is to understand and maintain"
348
+ domain: "software engineering"
349
+ scale: "1-5"
350
+ strictness: "balanced"
351
+ ```
352
+
353
+ **Output** (abbreviated):
354
+ ```json
355
+ {
356
+ "levels": [
357
+ {
358
+ "score": 1,
359
+ "label": "Poor",
360
+ "description": "Code is difficult to understand without significant effort",
361
+ "characteristics": [
362
+ "No meaningful variable or function names",
363
+ "No comments or documentation",
364
+ "Deeply nested or convoluted logic"
365
+ ]
366
+ },
367
+ {
368
+ "score": 3,
369
+ "label": "Adequate",
370
+ "description": "Code is understandable with some effort",
371
+ "characteristics": [
372
+ "Most variables have meaningful names",
373
+ "Basic comments present for complex sections",
374
+ "Logic is followable but could be cleaner"
375
+ ]
376
+ },
377
+ {
378
+ "score": 5,
379
+ "label": "Excellent",
380
+ "description": "Code is immediately clear and maintainable",
381
+ "characteristics": [
382
+ "All names are descriptive and consistent",
383
+ "Comprehensive documentation",
384
+ "Clean, modular structure"
385
+ ]
386
+ }
387
+ ],
388
+ "edgeCases": [
389
+ {
390
+ "situation": "Code is well-structured but uses domain-specific abbreviations",
391
+ "guidance": "Score based on readability for domain experts, not general audience"
392
+ }
393
+ ]
394
+ }
395
+ ```
396
+
397
+ ## Guidelines
398
+
399
+ 1. **Always require justification before scores** - Chain-of-thought prompting improves reliability by 15-25%
400
+
401
+ 2. **Always swap positions in pairwise comparison** - Single-pass comparison is corrupted by position bias
402
+
403
+ 3. **Match scale granularity to rubric specificity** - Don't use 1-10 without detailed level descriptions
404
+
405
+ 4. **Separate objective and subjective criteria** - Use direct scoring for objective, pairwise for subjective
406
+
407
+ 5. **Include confidence scores** - Calibrate to position consistency and evidence strength
408
+
409
+ 6. **Define edge cases explicitly** - Ambiguous situations cause the most evaluation variance
410
+
411
+ 7. **Use domain-specific rubrics** - Generic rubrics produce generic (less useful) evaluations
412
+
413
+ 8. **Validate against human judgments** - Automated evaluation is only valuable if it correlates with human assessment
414
+
415
+ 9. **Monitor for systematic bias** - Track disagreement patterns by criterion, response type, model
416
+
417
+ 10. **Design for iteration** - Evaluation systems improve with feedback loops
418
+
419
+ ## Integration
420
+
421
+ This skill integrates with:
422
+
423
+ - **context-fundamentals** - Evaluation prompts require effective context structure
424
+ - **tool-design** - Evaluation tools need proper schemas and error handling
425
+ - **context-optimization** - Evaluation prompts can be optimized for token efficiency
426
+ - **evaluation** (foundational) - This skill extends the foundational evaluation concepts
427
+
428
+ ## References
429
+
430
+ Internal references:
431
+ - [LLM-as-Judge Implementation Patterns](./references/implementation-patterns.md)
432
+ - [Bias Mitigation Techniques](./references/bias-mitigation.md)
433
+ - [Metric Selection Guide](./references/metrics-guide.md)
434
+
435
+ External research:
436
+ - [Eugene Yan: Evaluating the Effectiveness of LLM-Evaluators](https://eugeneyan.com/writing/llm-evaluators/)
437
+ - [Judging LLM-as-a-Judge (Zheng et al., 2023)](https://arxiv.org/abs/2306.05685)
438
+ - [G-Eval: NLG Evaluation using GPT-4 (Liu et al., 2023)](https://arxiv.org/abs/2303.16634)
439
+ - [Large Language Models are not Fair Evaluators (Wang et al., 2023)](https://arxiv.org/abs/2305.17926)
440
+
441
+ Related skills in this collection:
442
+ - evaluation - Foundational evaluation concepts
443
+ - context-fundamentals - Context structure for evaluation prompts
444
+ - tool-design - Building evaluation tools
445
+
446
+ ---
447
+
448
+ ## Skill Metadata
449
+
450
+ **Created**: 2024-12-24
451
+ **Last Updated**: 2024-12-24
452
+ **Author**: Muratcan Koylan
453
+ **Version**: 1.0.0
454
+
@@ -0,0 +1,20 @@
1
+ {
2
+ "name": "advanced-evaluation",
3
+ "version": "1.0.0",
4
+ "description": "Advanced evaluation workflows for comparative and bias-aware judgment tasks",
5
+ "triggers": [
6
+ "advanced evaluation",
7
+ "compare outputs",
8
+ "pairwise",
9
+ "position bias",
10
+ "judge"
11
+ ],
12
+ "applicable_agents": [
13
+ "critic",
14
+ "strategist",
15
+ "librarian",
16
+ "reviewer"
17
+ ],
18
+ "max_context_tokens": 2200,
19
+ "entry_file": "SKILL.md"
20
+ }