@synsci/cli-darwin-x64 1.1.73 → 1.1.74

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,385 @@
1
+ ---
2
+ name: llm-as-judge-evaluation
3
+ description: Evaluate LLM outputs using frontier models as judges. Use for pairwise model comparison, quality scoring with custom rubrics, and automated evaluation pipelines. Covers position bias mitigation, statistical significance, and generating preference data for DPO/RLHF.
4
+ version: 1.0.0
5
+ author: Synthetic Sciences
6
+ license: MIT
7
+ tags: [Evaluation, LLM-as-Judge, Pairwise Comparison, Quality Assessment, Rubric Design, Model Comparison, Automated Evaluation]
8
+ dependencies: [openai, anthropic, datasets, numpy]
9
+ ---
10
+
11
+ # LLM-as-Judge Evaluation
12
+
13
+ ## When to Use This Skill
14
+
15
+ Use LLM-as-Judge evaluation when you need to:
16
+ - **Compare a fine-tuned model vs frontier** — Does the student beat the teacher on your task?
17
+ - **Quality gates before deployment** — Automated go/no-go on model releases
18
+ - **Continuous evaluation** — Monitor production model quality over time
19
+ - **Generate preference data** — Create (chosen, rejected) pairs for DPO/RLHF training
20
+ - **Evaluate without ground truth** — When exact answers don't exist (creative, open-ended tasks)
21
+
22
+ ### When NOT to Use
23
+ - Tasks with verifiable answers (math, code execution) — use exact match or unit tests
24
+ - Extremely simple classification — use accuracy/F1 directly
25
+ - Safety evaluation — use dedicated safety benchmarks, not general judges
26
+
27
+ ## Pairwise Comparison
28
+
29
+ The most reliable LLM-as-judge method. Show a judge two outputs (A and B) and ask which is better.
30
+
31
+ ### Basic Implementation
32
+
33
+ ```python
34
+ import openai
35
+ import json
36
+ import random
37
+
38
+ client = openai.OpenAI()
39
+
40
+ PAIRWISE_PROMPT = """You are an expert evaluator. Compare two responses to the same prompt.
41
+
42
+ ## Task Context
43
+ {task_description}
44
+
45
+ ## User Input
46
+ {user_input}
47
+
48
+ ## Response A
49
+ {response_a}
50
+
51
+ ## Response B
52
+ {response_b}
53
+
54
+ ## Evaluation Criteria
55
+ {criteria}
56
+
57
+ Which response is better? Consider all criteria above.
58
+ Return JSON: {{"winner": "A" or "B" or "tie", "reasoning": "brief explanation"}}"""
59
+
60
+
61
+ def pairwise_compare(user_input, response_a, response_b, task_description, criteria,
62
+ model="gpt-4o", swap_positions=True):
63
+ """Compare two responses with position bias mitigation."""
64
+ results = []
65
+
66
+ # First comparison: A=position1, B=position2
67
+ prompt = PAIRWISE_PROMPT.format(
68
+ task_description=task_description,
69
+ user_input=user_input,
70
+ response_a=response_a,
71
+ response_b=response_b,
72
+ criteria=criteria,
73
+ )
74
+ resp = client.chat.completions.create(
75
+ model=model,
76
+ messages=[{"role": "user", "content": prompt}],
77
+ response_format={"type": "json_object"},
78
+ temperature=0,
79
+ )
80
+ result1 = json.loads(resp.choices[0].message.content)
81
+ results.append(result1["winner"])
82
+
83
+ if swap_positions:
84
+ # Second comparison: swap positions to detect position bias
85
+ prompt_swapped = PAIRWISE_PROMPT.format(
86
+ task_description=task_description,
87
+ user_input=user_input,
88
+ response_a=response_b, # Swapped
89
+ response_b=response_a, # Swapped
90
+ criteria=criteria,
91
+ )
92
+ resp2 = client.chat.completions.create(
93
+ model=model,
94
+ messages=[{"role": "user", "content": prompt_swapped}],
95
+ response_format={"type": "json_object"},
96
+ temperature=0,
97
+ )
98
+ result2 = json.loads(resp2.choices[0].message.content)
99
+ # Reverse the swapped result
100
+ swapped_winner = {"A": "B", "B": "A", "tie": "tie"}[result2["winner"]]
101
+ results.append(swapped_winner)
102
+
103
+ # Aggregate: both must agree, otherwise tie
104
+ if len(set(results)) == 1:
105
+ return results[0]
106
+ return "tie"
107
+ ```
108
+
109
+ ### Running a Full Evaluation
110
+
111
+ ```python
112
+ def evaluate_model_pair(eval_set, model_a_fn, model_b_fn, task_description, criteria,
113
+ judge_model="gpt-4o"):
114
+ """Run pairwise evaluation across an entire eval set.
115
+
116
+ Args:
117
+ eval_set: List of {"input": str, "reference": str (optional)}
118
+ model_a_fn: Function(input) -> str (e.g., frontier model)
119
+ model_b_fn: Function(input) -> str (e.g., fine-tuned model)
120
+ task_description: What the models are supposed to do
121
+ criteria: Evaluation criteria string
122
+ judge_model: Which model to use as judge
123
+ """
124
+ results = {"A": 0, "B": 0, "tie": 0}
125
+ details = []
126
+
127
+ for i, example in enumerate(eval_set):
128
+ # Generate responses
129
+ response_a = model_a_fn(example["input"])
130
+ response_b = model_b_fn(example["input"])
131
+
132
+ # Random assignment to positions (reduces systematic bias)
133
+ if random.random() < 0.5:
134
+ winner = pairwise_compare(
135
+ example["input"], response_a, response_b,
136
+ task_description, criteria, judge_model
137
+ )
138
+ else:
139
+ raw = pairwise_compare(
140
+ example["input"], response_b, response_a,
141
+ task_description, criteria, judge_model
142
+ )
143
+ winner = {"A": "B", "B": "A", "tie": "tie"}[raw]
144
+
145
+ results[winner] += 1
146
+ details.append({
147
+ "input": example["input"],
148
+ "response_a": response_a,
149
+ "response_b": response_b,
150
+ "winner": winner,
151
+ })
152
+
153
+ if (i + 1) % 20 == 0:
154
+ print(f"Progress: {i+1}/{len(eval_set)} — A:{results['A']} B:{results['B']} Tie:{results['tie']}")
155
+
156
+ total = sum(results.values())
157
+ report = {
158
+ "total_comparisons": total,
159
+ "model_a_wins": results["A"],
160
+ "model_b_wins": results["B"],
161
+ "ties": results["tie"],
162
+ "model_a_win_rate": results["A"] / total,
163
+ "model_b_win_rate": results["B"] / total,
164
+ "tie_rate": results["tie"] / total,
165
+ }
166
+ return report, details
167
+ ```
168
+
169
+ ## Likert Scoring (1-5 Scale)
170
+
171
+ For absolute quality assessment rather than comparison:
172
+
173
+ ```python
174
+ LIKERT_PROMPT = """You are an expert evaluator. Rate this response on a 1-5 scale.
175
+
176
+ ## Task Context
177
+ {task_description}
178
+
179
+ ## User Input
180
+ {user_input}
181
+
182
+ ## Response
183
+ {response}
184
+
185
+ ## Scoring Rubric
186
+ {rubric}
187
+
188
+ Rate the response on each dimension. Then provide an overall score.
189
+ Return JSON: {{"scores": {{"dimension_name": score, ...}}, "overall": score, "reasoning": "..."}}"""
190
+
191
+
192
+ def likert_score(user_input, response, task_description, rubric, model="gpt-4o"):
193
+ """Score a single response on a 1-5 Likert scale."""
194
+ prompt = LIKERT_PROMPT.format(
195
+ task_description=task_description,
196
+ user_input=user_input,
197
+ response=response,
198
+ rubric=rubric,
199
+ )
200
+ resp = client.chat.completions.create(
201
+ model=model,
202
+ messages=[{"role": "user", "content": prompt}],
203
+ response_format={"type": "json_object"},
204
+ temperature=0,
205
+ )
206
+ return json.loads(resp.choices[0].message.content)
207
+ ```
208
+
209
+ ## Custom Rubric Design
210
+
211
+ ### Template
212
+
213
+ ```python
214
+ RUBRIC_TEMPLATE = """
215
+ Score 1 (Poor): {poor_description}
216
+ Score 2 (Below Average): {below_avg_description}
217
+ Score 3 (Average): {avg_description}
218
+ Score 4 (Good): {good_description}
219
+ Score 5 (Excellent): {excellent_description}
220
+ """
221
+
222
+ # Example: Code generation rubric
223
+ CODE_RUBRIC = """
224
+ Dimensions:
225
+ 1. Correctness (weight: 0.4)
226
+ 1: Code has critical bugs, won't run
227
+ 2: Runs but produces wrong output in common cases
228
+ 3: Correct for common cases, fails on edge cases
229
+ 4: Correct for all cases, minor style issues
230
+ 5: Correct, clean, handles all edge cases
231
+
232
+ 2. Efficiency (weight: 0.2)
233
+ 1: Exponential or worse complexity
234
+ 2: Unnecessarily slow, obvious optimization missed
235
+ 3: Acceptable performance for typical inputs
236
+ 4: Well-optimized, good algorithmic choices
237
+ 5: Optimal or near-optimal solution
238
+
239
+ 3. Readability (weight: 0.2)
240
+ 1: Incomprehensible, no structure
241
+ 2: Hard to follow, poor naming
242
+ 3: Readable with effort, some unclear parts
243
+ 4: Clean code, good naming and structure
244
+ 5: Exemplary clarity, well-documented
245
+
246
+ 4. Completeness (weight: 0.2)
247
+ 1: Missing major requirements
248
+ 2: Partial implementation
249
+ 3: Implements core requirements
250
+ 4: Complete with good error handling
251
+ 5: Complete with tests, docs, error handling
252
+ """
253
+ ```
254
+
255
+ ## Position Bias Mitigation
256
+
257
+ LLM judges tend to prefer whichever response appears first. Always mitigate this:
258
+
259
+ ```python
260
+ def mitigated_pairwise(user_input, response_a, response_b, **kwargs):
261
+ """Run comparison twice with swapped positions."""
262
+ # Round 1: A first, B second
263
+ r1 = pairwise_compare(user_input, response_a, response_b, swap_positions=False, **kwargs)
264
+
265
+ # Round 2: B first, A second
266
+ r2_raw = pairwise_compare(user_input, response_b, response_a, swap_positions=False, **kwargs)
267
+ r2 = {"A": "B", "B": "A", "tie": "tie"}[r2_raw]
268
+
269
+ # Agreement check
270
+ if r1 == r2:
271
+ return r1 # Both rounds agree
272
+ return "tie" # Disagreement = inconclusive
273
+ ```
274
+
275
+ ## Statistical Significance
276
+
277
+ ### Bootstrap Confidence Intervals
278
+
279
+ ```python
280
+ import numpy as np
281
+
282
+ def bootstrap_win_rate(wins, total, n_bootstrap=10000, ci=0.95):
283
+ """Calculate bootstrap confidence interval for win rate."""
284
+ win_rate = wins / total
285
+ samples = np.random.binomial(total, win_rate, n_bootstrap) / total
286
+
287
+ alpha = (1 - ci) / 2
288
+ lower = np.percentile(samples, alpha * 100)
289
+ upper = np.percentile(samples, (1 - alpha) * 100)
290
+
291
+ return {
292
+ "win_rate": win_rate,
293
+ "ci_lower": lower,
294
+ "ci_upper": upper,
295
+ "significant": lower > 0.5 or upper < 0.5, # Significantly different from 50%
296
+ }
297
+ ```
298
+
299
+ ### Minimum Sample Size
300
+
301
+ | Desired precision | Minimum samples | Notes |
302
+ |-------------------|-----------------|-------|
303
+ | Directional (which is better) | 50-100 | Rough signal |
304
+ | Reliable estimate (±5%) | 200-400 | Standard evaluation |
305
+ | High confidence (±2%) | 500-1000 | Production decisions |
306
+ | Publication quality | 1000+ | Statistical rigor |
307
+
308
+ **Rule of thumb**: Use at least 100 examples for deployment decisions, 200+ for reliable win rates.
309
+
310
+ ## Generating Preference Data for DPO
311
+
312
+ Convert judge outputs to (chosen, rejected) pairs:
313
+
314
+ ```python
315
+ def generate_dpo_pairs(eval_set, model_a_fn, model_b_fn, task_description, criteria,
316
+ judge_model="gpt-4o"):
317
+ """Generate DPO training pairs from pairwise evaluation."""
318
+ pairs = []
319
+
320
+ for example in eval_set:
321
+ response_a = model_a_fn(example["input"])
322
+ response_b = model_b_fn(example["input"])
323
+
324
+ winner = pairwise_compare(
325
+ example["input"], response_a, response_b,
326
+ task_description, criteria, judge_model
327
+ )
328
+
329
+ if winner == "tie":
330
+ continue # Skip ties for DPO
331
+
332
+ chosen = response_a if winner == "A" else response_b
333
+ rejected = response_b if winner == "A" else response_a
334
+
335
+ pairs.append({
336
+ "prompt": example["input"],
337
+ "chosen": chosen,
338
+ "rejected": rejected,
339
+ })
340
+
341
+ print(f"Generated {len(pairs)} DPO pairs from {len(eval_set)} examples "
342
+ f"({len(eval_set) - len(pairs)} ties skipped)")
343
+ return pairs
344
+ ```
345
+
346
+ ## Multi-Judge Ensemble
347
+
348
+ Use multiple judge models for higher reliability:
349
+
350
+ ```python
351
+ def multi_judge_compare(user_input, response_a, response_b, task_description, criteria,
352
+ judges=None):
353
+ """Use multiple judge models and take majority vote."""
354
+ judges = judges or ["gpt-4o", "claude-sonnet-4-5-20250929"]
355
+ votes = []
356
+
357
+ for judge in judges:
358
+ winner = pairwise_compare(
359
+ user_input, response_a, response_b,
360
+ task_description, criteria, model=judge
361
+ )
362
+ votes.append(winner)
363
+
364
+ # Majority vote
365
+ from collections import Counter
366
+ vote_counts = Counter(votes)
367
+ majority = vote_counts.most_common(1)[0]
368
+
369
+ return {
370
+ "winner": majority[0],
371
+ "confidence": majority[1] / len(votes),
372
+ "votes": dict(vote_counts),
373
+ "judge_details": list(zip(judges, votes)),
374
+ }
375
+ ```
376
+
377
+ ## Quick Start Checklist
378
+
379
+ 1. **Define criteria**: Write a rubric specific to your task
380
+ 2. **Prepare eval set**: 100+ held-out examples with production inputs
381
+ 3. **Generate responses**: Run both models on the eval set
382
+ 4. **Run pairwise comparison**: With position bias mitigation
383
+ 5. **Check significance**: Bootstrap CI on win rate
384
+ 6. **Decision gate**: Student wins > 50% -> proceed to deploy
385
+ 7. **Save preference data**: Use decisive (non-tie) wins for DPO training — ties are skipped
@@ -0,0 +1,95 @@
1
+ # Pairwise Comparison Reference
2
+
3
+ ## Overview
4
+
5
+ Pairwise comparison is the most reliable LLM-as-judge method. Instead of asking
6
+ "how good is this response?", you ask "which of these two responses is better?" —
7
+ a much easier judgment task that produces more consistent results.
8
+
9
+ ## Why Pairwise > Likert
10
+
11
+ | Aspect | Pairwise | Likert (1-5) |
12
+ |--------|----------|--------------|
13
+ | Inter-annotator agreement | High | Low-moderate |
14
+ | Calibration needed | No | Yes (what does "4" mean?) |
15
+ | Position bias | Mitigatable (swap) | N/A (single response) |
16
+ | Sensitivity | High (detects small differences) | Low (coarse scale) |
17
+ | Cost per comparison | 2x (need swap) | 1x |
18
+ | Best for | A/B testing, model selection | Monitoring, thresholds |
19
+
20
+ ## Advanced: Chain-of-Thought Judging
21
+
22
+ Better results when the judge explains its reasoning before deciding:
23
+
24
+ ```python
25
+ COT_PAIRWISE_PROMPT = """You are an expert evaluator comparing two responses.
26
+
27
+ ## Task: {task_description}
28
+
29
+ ## Input: {user_input}
30
+
31
+ ## Response A
32
+ {response_a}
33
+
34
+ ## Response B
35
+ {response_b}
36
+
37
+ ## Evaluation Criteria
38
+ {criteria}
39
+
40
+ Think step by step:
41
+ 1. Analyze Response A's strengths and weaknesses
42
+ 2. Analyze Response B's strengths and weaknesses
43
+ 3. Compare on each criterion
44
+ 4. Make your final judgment
45
+
46
+ Return JSON:
47
+ {{
48
+ "analysis_a": "strengths and weaknesses of A",
49
+ "analysis_b": "strengths and weaknesses of B",
50
+ "comparison": "criterion-by-criterion comparison",
51
+ "winner": "A" or "B" or "tie",
52
+ "confidence": "high" or "medium" or "low"
53
+ }}"""
54
+ ```
55
+
56
+ ## Handling Ties
57
+
58
+ Tie rates inform evaluation quality:
59
+
60
+ | Tie Rate | Interpretation | Action |
61
+ |----------|---------------|--------|
62
+ | < 10% | Clear quality difference | Good signal |
63
+ | 10-30% | Models are close | Normal, increase sample size |
64
+ | 30-50% | Very similar quality | May need finer-grained criteria |
65
+ | > 50% | Criteria too vague | Rewrite rubric with specific anchors |
66
+
67
+ ## Reference-Based Comparison
68
+
69
+ When you have a ground-truth reference, include it for more accurate judging:
70
+
71
+ ```python
72
+ REFERENCE_PAIRWISE_PROMPT = """Compare two responses against a known correct reference.
73
+
74
+ ## Input: {user_input}
75
+
76
+ ## Reference (ground truth)
77
+ {reference}
78
+
79
+ ## Response A
80
+ {response_a}
81
+
82
+ ## Response B
83
+ {response_b}
84
+
85
+ Which response is more faithful to the reference while remaining helpful?
86
+ Return JSON: {{"winner": "A" or "B" or "tie", "reasoning": "..."}}"""
87
+ ```
88
+
89
+ ## Common Pitfalls
90
+
91
+ 1. **Length bias**: Judges prefer longer responses. Add "conciseness" to criteria.
92
+ 2. **Format bias**: Judges prefer markdown/structured responses. Normalize formatting.
93
+ 3. **Sycophancy**: Judges prefer responses that agree with the user. Use neutral criteria.
94
+ 4. **Self-preference**: GPT-4 may prefer GPT-4 style. Use Claude as judge for GPT outputs and vice versa.
95
+ 5. **Instruction following vs quality**: Separate these in your rubric.
@@ -0,0 +1,169 @@
1
+ # Scoring Rubrics Reference
2
+
3
+ ## Overview
4
+
5
+ A good rubric is the difference between noisy and reliable LLM-as-judge evaluation.
6
+ This reference provides rubric templates for common evaluation scenarios.
7
+
8
+ ## Rubric Design Principles
9
+
10
+ 1. **Specific anchors**: Each score level must describe observable behavior, not vague quality
11
+ 2. **Independent dimensions**: Criteria should not overlap (avoid "quality" and "helpfulness")
12
+ 3. **Weighted dimensions**: Not all criteria matter equally — assign weights
13
+ 4. **Calibration examples**: Include 2-3 example responses with their expected scores
14
+ 5. **Task-aligned**: The rubric should match what your users actually care about
15
+
16
+ ## Template: General Quality
17
+
18
+ ```
19
+ Dimensions (all weighted equally unless specified):
20
+
21
+ 1. Accuracy
22
+ 1: Contains factual errors or hallucinations
23
+ 2: Mostly correct but with notable inaccuracies
24
+ 3: Factually correct on main points, minor issues
25
+ 4: Accurate and well-supported claims
26
+ 5: Perfectly accurate with appropriate caveats
27
+
28
+ 2. Relevance
29
+ 1: Does not address the user's question
30
+ 2: Partially relevant, misses key aspects
31
+ 3: Addresses the main question adequately
32
+ 4: Comprehensive coverage of the topic
33
+ 5: Precisely addresses every aspect of the question
34
+
35
+ 3. Clarity
36
+ 1: Confusing, poorly organized
37
+ 2: Understandable but hard to follow
38
+ 3: Clear and logically organized
39
+ 4: Well-structured with good flow
40
+ 5: Exceptionally clear, easy to scan and understand
41
+
42
+ 4. Conciseness
43
+ 1: Extremely verbose, buries the answer
44
+ 2: Contains significant unnecessary content
45
+ 3: Appropriate length for the question
46
+ 4: Efficiently communicated
47
+ 5: Optimal length — every word serves a purpose
48
+ ```
49
+
50
+ ## Template: Code Generation
51
+
52
+ ```
53
+ Dimensions:
54
+
55
+ 1. Correctness (weight: 0.40)
56
+ 1: Won't compile/run, fundamental logic errors
57
+ 2: Runs but fails on basic test cases
58
+ 3: Handles common cases correctly
59
+ 4: Handles edge cases, good error handling
60
+ 5: Correct, robust, handles all specified requirements
61
+
62
+ 2. Code Quality (weight: 0.25)
63
+ 1: Unreadable, no structure
64
+ 2: Poor naming, minimal structure
65
+ 3: Acceptable style, reasonable naming
66
+ 4: Clean, well-organized, follows conventions
67
+ 5: Exemplary code that teaches best practices
68
+
69
+ 3. Efficiency (weight: 0.15)
70
+ 1: Exponential complexity or worse
71
+ 2: Unnecessarily slow (wrong algorithm choice)
72
+ 3: Acceptable for typical input sizes
73
+ 4: Well-optimized, appropriate algorithms
74
+ 5: Optimal or near-optimal solution
75
+
76
+ 4. Completeness (weight: 0.20)
77
+ 1: Missing major requirements
78
+ 2: Partial implementation, key gaps
79
+ 3: Core requirements met
80
+ 4: Complete with error handling
81
+ 5: Complete with tests, docs, and error handling
82
+ ```
83
+
84
+ ## Template: Customer Support
85
+
86
+ ```
87
+ Dimensions:
88
+
89
+ 1. Problem Resolution (weight: 0.40)
90
+ 1: Does not address the customer's issue
91
+ 2: Acknowledges issue but provides wrong solution
92
+ 3: Provides a valid solution that may not be optimal
93
+ 4: Provides the best available solution
94
+ 5: Resolves issue and proactively prevents related problems
95
+
96
+ 2. Tone & Empathy (weight: 0.25)
97
+ 1: Rude, dismissive, or robotic
98
+ 2: Professional but cold
99
+ 3: Friendly and professional
100
+ 4: Warm, empathetic, personalized
101
+ 5: Exceptional rapport while maintaining professionalism
102
+
103
+ 3. Accuracy (weight: 0.20)
104
+ 1: Contains incorrect information about products/policies
105
+ 2: Mostly correct with some errors
106
+ 3: Factually accurate
107
+ 4: Accurate with helpful additional context
108
+ 5: Perfectly accurate with relevant links/resources
109
+
110
+ 4. Efficiency (weight: 0.15)
111
+ 1: Requires multiple follow-ups for basic resolution
112
+ 2: Could be more direct
113
+ 3: Reasonable number of steps to resolution
114
+ 4: Efficient resolution path
115
+ 5: Resolves in minimum possible interactions
116
+ ```
117
+
118
+ ## Template: Summarization
119
+
120
+ ```
121
+ Dimensions:
122
+
123
+ 1. Faithfulness (weight: 0.35)
124
+ 1: Contains hallucinated information not in source
125
+ 2: Mostly faithful but adds unsupported claims
126
+ 3: Faithful to source material
127
+ 4: Accurately represents source with proper nuance
128
+ 5: Perfectly faithful, captures nuance and caveats
129
+
130
+ 2. Coverage (weight: 0.30)
131
+ 1: Misses most key points
132
+ 2: Captures some key points, misses important ones
133
+ 3: Covers main points adequately
134
+ 4: Comprehensive coverage of key information
135
+ 5: Captures all important points and relationships
136
+
137
+ 3. Coherence (weight: 0.20)
138
+ 1: Disjointed, hard to follow
139
+ 2: Some logical flow issues
140
+ 3: Reads smoothly
141
+ 4: Well-organized with clear structure
142
+ 5: Exemplary narrative flow
143
+
144
+ 4. Conciseness (weight: 0.15)
145
+ 1: As long as original (no compression)
146
+ 2: Minimal compression, includes unnecessary details
147
+ 3: Reasonable length reduction
148
+ 4: Well-compressed, only essential information
149
+ 5: Maximum information density, every word counts
150
+ ```
151
+
152
+ ## Composite Scoring
153
+
154
+ ```python
155
+ def weighted_score(scores, weights):
156
+ """Calculate weighted composite score from dimension scores.
157
+
158
+ Args:
159
+ scores: dict of {"dimension": score} (1-5)
160
+ weights: dict of {"dimension": weight} (sums to 1.0)
161
+ """
162
+ total = sum(scores[dim] * weights[dim] for dim in scores)
163
+ return round(total, 2)
164
+
165
+ # Example
166
+ scores = {"correctness": 4, "quality": 3, "efficiency": 5, "completeness": 4}
167
+ weights = {"correctness": 0.4, "quality": 0.25, "efficiency": 0.15, "completeness": 0.2}
168
+ composite = weighted_score(scores, weights) # 3.9
169
+ ```