llmasajudge 0.1.15__tar.gz → 0.1.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llmasajudge-0.1.15 → llmasajudge-0.1.18}/PKG-INFO +1 -1
- {llmasajudge-0.1.15 → llmasajudge-0.1.18}/llmasajudge/__init__.py +41 -1
- llmasajudge-0.1.18/llmasajudge/ranker.py +772 -0
- {llmasajudge-0.1.15 → llmasajudge-0.1.18}/llmasajudge.egg-info/PKG-INFO +1 -1
- {llmasajudge-0.1.15 → llmasajudge-0.1.18}/llmasajudge.egg-info/SOURCES.txt +1 -0
- {llmasajudge-0.1.15 → llmasajudge-0.1.18}/pyproject.toml +1 -1
- {llmasajudge-0.1.15 → llmasajudge-0.1.18}/README.md +0 -0
- {llmasajudge-0.1.15 → llmasajudge-0.1.18}/llmasajudge.egg-info/dependency_links.txt +0 -0
- {llmasajudge-0.1.15 → llmasajudge-0.1.18}/llmasajudge.egg-info/requires.txt +0 -0
- {llmasajudge-0.1.15 → llmasajudge-0.1.18}/llmasajudge.egg-info/top_level.txt +0 -0
- {llmasajudge-0.1.15 → llmasajudge-0.1.18}/setup.cfg +0 -0
{llmasajudge-0.1.15 → llmasajudge-0.1.18}/llmasajudge/__init__.py
@@ -1664,4 +1664,44 @@ Output only valid JSON. No explanation. No extra text.""",
             "result": final,
             "mode": self.mode,
             "votes": votes,
-        }
+        }
+
+    def rank(
+        self,
+        input: str,
+        model_outputs: List[str],
+        ground_truth: Optional[str] = None,
+        ranking_mode: str = "single_shot",
+        output_parser: Optional[Callable] = None,
+        custom_template: Optional[str] = None,
+        use_fully_custom_prompt: bool = False,
+        max_tokens: int = 10000,
+    ) -> Dict[str, Any]:
+        """
+        Rank multiple model outputs.
+
+        Args:
+            input: Original prompt or task description
+            model_outputs: List of model outputs to rank
+            ground_truth: Optional reference answer
+            ranking_mode: "single_shot" or "round_robin"
+            output_parser: Function to parse judge output
+            custom_template: Prompt template with placeholders
+            use_fully_custom_prompt: If True, template used as-is
+            max_tokens: Maximum tokens for judge response
+
+        Returns:
+            Dict with ranking results (see ranker.rank() for details)
+        """
+        from llmasajudge.ranker import rank as _rank
+        return _rank(
+            judge=self,
+            input=input,
+            model_outputs=model_outputs,
+            ground_truth=ground_truth,
+            ranking_mode=ranking_mode,
+            output_parser=output_parser,
+            custom_template=custom_template,
+            use_fully_custom_prompt=use_fully_custom_prompt,
+            max_tokens=max_tokens,
+        )
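
The new rank() method on LLMAsAJudge is a thin wrapper that forwards to llmasajudge.ranker.rank, added in the new file below. A minimal usage sketch of the single_shot path, relying on the default template and the stock letter_ordering parser; the model name and answers are illustrative, and credentials for the chosen provider are assumed to be configured:

    from llmasajudge import LLMAsAJudge

    judge = LLMAsAJudge(models=["openai/gpt-4o-mini"])  # illustrative judge model

    result = judge.rank(
        input="Explain recursion simply",
        model_outputs=["Answer 1", "Answer 2", "Answer 3"],
        ranking_mode="single_shot",   # judge sees all candidates at once
    )
    # result["ranking"] is a list of candidate indices, best first, e.g. [0, 2, 1];
    # result["labels"] holds the matching letter labels, e.g. ["A", "C", "B"].
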
llmasajudge-0.1.18/llmasajudge/ranker.py
@@ -0,0 +1,772 @@
+"""
+LLMAsAJudge Ranking Extensions
+
+Provides relative ranking functionality for evaluating multiple model outputs.
+
+Supports two ranking modes:
+1. single_shot: Judge sees all model_outputs at once and returns ranking/scores
+2. round_robin: Judge compares model_outputs pairwise, results are aggregated
+
+Usage:
+    from llmasajudge import LLMAsAJudge
+    from llmasajudge.ranker import rank
+
+    judge = LLMAsAJudge(models=["openai/gpt-4o-mini"])
+
+    result = rank(
+        judge=judge,
+        input="Explain recursion simply",
+        model_outputs=["Answer 1", "Answer 2", "Answer 3"],
+        ranking_mode="single_shot",
+        output_parser=ranking_parser,
+        custom_template=template
+    )
+"""
+
+import re
+import json
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from itertools import combinations
+
+
+__all__ = ["rank", "RankingParsers"]
+
+
+# Default templates for ranking
+DEFAULT_SINGLE_SHOT_TEMPLATE = """\
+Rank the following candidate responses from BEST to WORST.
+{ground_truth_section}
+Task/Input:
+{input_block}
+
+Candidates:
+{model_outputs}
+
+Provide your ranking using the format: A > B > C > D (etc)
+Return ONLY the ranking, no explanation."""
+
+DEFAULT_SINGLE_SHOT_WITH_GT_TEMPLATE = """\
+Rank the following candidate responses from BEST to WORST based on how well they match the ground truth answer.
+{ground_truth_section}
+Task/Input:
+{input_block}
+
+Ground Truth Answer:
+{ground_truth}
+
+Candidates:
+{model_outputs}
+
+Provide your ranking using the format: A > B > C > D (etc)
+Return ONLY the ranking, no explanation."""
+
+DEFAULT_ROUND_ROBIN_TEMPLATE = """\
+Compare the following two responses and determine which is better.
+
+Task/Input:
+{input_block}
+
+Option A:
+{option_a}
+
+Option B:
+{option_b}
+
+Which response is better? Return exactly one of: A, B, or tie"""
+
+DEFAULT_ROUND_ROBIN_WITH_GT_TEMPLATE = """\
+Compare the following two responses based on how well they match the ground truth answer.
+
+Task/Input:
+{input_block}
+
+Ground Truth Answer:
+{ground_truth}
+
+Option A:
+{option_a}
+
+Option B:
+{option_b}
+
+Which response better matches the ground truth? Return exactly one of: A, B, or tie"""
+
+
+class RankingParsers:
+    """Stock output parsers for ranking tasks."""
+
+    @staticmethod
+    def letter_ordering(response: str) -> List[str]:
+        """
+        Parse ordering like "A > C > B" into ["A", "C", "B"].
+        Handles various separators: >, ->, =>
+        """
+        if not response:
+            return []
+
+        # Try different separators
+        for sep in [">", "->", "=>"]:
+            if sep in response:
+                parts = [x.strip().upper() for x in response.split(sep)]
+                # Filter to single letters only
+                return [p for p in parts if len(p) == 1 and p.isalpha()]
+
+        # Fallback: extract all single letters in order
+        letters = re.findall(r'\b([A-Z])\b', response.upper())
+        return letters
+
+    @staticmethod
+    def json_scores(response: str) -> Optional[Dict[str, float]]:
+        """
+        Parse JSON like {"A": 9.2, "B": 7.1, "C": 8.5}.
+        Returns dict mapping candidate labels to scores.
+        """
+        if not response:
+            return None
+
+        try:
+            s = response.strip()
+
+            # Handle markdown code blocks
+            if "```json" in s.lower():
+                start = s.lower().find("```json") + 7
+                end = s.find("```", start)
+                if end > start:
+                    s = s[start:end].strip()
+            elif "```" in s:
+                start = s.find("```") + 3
+                end = s.find("```", start)
+                if end > start:
+                    s = s[start:end].strip()
+
+            # Extract JSON object
+            if '{' in s and '}' in s:
+                start_brace = s.find('{')
+                end_brace = s.rfind('}')
+                if start_brace < end_brace:
+                    s = s[start_brace:end_brace + 1]
+
+            data = json.loads(s)
+            if not isinstance(data, dict):
+                return None
+
+            # Convert all values to float
+            result = {}
+            for key, val in data.items():
+                if isinstance(val, (int, float)):
+                    result[str(key).upper()] = float(val)
+                elif isinstance(val, str):
+                    try:
+                        result[str(key).upper()] = float(val)
+                    except ValueError:
+                        pass
+
+            return result if result else None
+
+        except (json.JSONDecodeError, ValueError):
+            return None
+
+    @staticmethod
+    def pairwise_winner(response: str) -> Optional[str]:
+        """
+        Parse pairwise comparison: "A", "B", or "tie".
+        Returns "A", "B", "tie", or None if unparseable.
+        """
+        if not response:
+            return None
+
+        text = response.strip().upper()
+
+        # Exact matches
+        if text == "A":
+            return "A"
+        if text == "B":
+            return "B"
+        if text == "TIE" or text == "TIED":
+            return "tie"
+
+        # Check for tie first (more specific)
+        if "TIE" in text or "TIED" in text or "DRAW" in text or "EQUAL" in text:
+            return "tie"
+
+        # Look for explicit answer patterns like "Answer: A", "Winner: B", "A is better", etc.
+        # Match word boundaries to avoid false positives
+        # Pattern to find answer declarations
+        answer_patterns = [
+            r'\bANSWER\s*:?\s*([AB])\b',
+            r'\bWINNER\s*:?\s*([AB])\b',
+            r'\bCHOOSE\s*:?\s*([AB])\b',
+            r'\bSELECT\s*:?\s*([AB])\b',
+            r'\bRESPONSE\s*:?\s*([AB])\b',
+            r'\bOPTION\s*:?\s*([AB])\b',
+            r'^\s*([AB])\s*$',  # Just "A" or "B" alone
+            r'\b([AB])\s+IS\s+BETTER\b',
+            r'\bBETTER\s*:?\s*([AB])\b',
+        ]
+
+        for pattern in answer_patterns:
+            match = re.search(pattern, text)
+            if match:
+                return match.group(1)
+
+        # Fallback: simple presence check (only if one appears more prominently)
+        # Count standalone occurrences
+        a_count = len(re.findall(r'\bA\b', text))
+        b_count = len(re.findall(r'\bB\b', text))
+
+        # If one clearly dominates, use it
+        if a_count > b_count and b_count == 0:
+            return "A"
+        if b_count > a_count and a_count == 0:
+            return "B"
+
+        # Last resort: check if only one appears at all
+        if "A" in text and "B" not in text:
+            return "A"
+        if "B" in text and "A" not in text:
+            return "B"
+
+        return None
+
+
+def _format_model_outputs(model_outputs: List[str]) -> str:
+    """
+    Format model_outputs as labeled blocks:
+    A)
+    <output 0>
+
+    B)
+    <output 1>
+    """
+    labels = [chr(65 + i) for i in range(len(model_outputs))]  # A, B, C, ...
+    blocks = []
+    for label, output in zip(labels, model_outputs):
+        blocks.append(f"{label})\n{output}")
+    return "\n\n".join(blocks)
+
+
+def _labels_to_indices(labels: List[str], num_outputs: int) -> List[int]:
+    """
+    Convert letter labels ["A", "C", "B"] to indices [0, 2, 1].
+    """
+    indices = []
+    for label in labels:
+        if len(label) != 1 or not label.isalpha():
+            continue
+        idx = ord(label.upper()) - 65  # A=0, B=1, etc.
+        if 0 <= idx < num_outputs:
+            indices.append(idx)
+    return indices
+
+
+def _scores_to_ranking(scores: Dict[str, float], num_outputs: int) -> List[int]:
+    """
+    Convert score dict {"A": 9, "B": 7, "C": 8} to ranking [0, 2, 1] (descending).
+    """
+    # Normalize keys to uppercase letters
+    normalized = {}
+    for k, v in scores.items():
+        label = str(k).upper()
+        if len(label) == 1 and label.isalpha():
+            idx = ord(label) - 65
+            if 0 <= idx < num_outputs:
+                normalized[idx] = float(v)
+
+    # Sort by score descending
+    sorted_indices = sorted(normalized.keys(), key=lambda i: normalized[i], reverse=True)
+    return sorted_indices
+
+
+def _single_shot_rank(
+    judge,
+    input_text: str,
+    model_outputs: List[str],
+    ground_truth: Optional[str],
+    output_parser: Callable,
+    custom_template: Optional[str],
+    use_fully_custom_prompt: bool,
+    max_tokens: int,
+) -> Dict[str, Any]:
+    """
+    Execute single-shot ranking where judge sees all model_outputs at once.
+
+    Returns:
+        {
+            "ranking": [0, 2, 1],  # Indices in rank order
+            "labels": ["A", "C", "B"],  # Letter labels in rank order
+            "scores": {"A": 9.2, "B": 7.1, "C": 8.5} or None,
+            "raw_votes": [...],  # Individual judge outputs
+        }
+    """
+    num_outputs = len(model_outputs)
+    formatted_outputs = _format_model_outputs(model_outputs)
+
+    # Build prompt
+    if use_fully_custom_prompt:
+        if custom_template is None:
+            raise ValueError("use_fully_custom_prompt=True requires custom_template")
+        prompt = custom_template
+    elif custom_template:
+        # Replace placeholders in custom template
+        prompt = custom_template
+        prompt = prompt.replace("{input_block}", input_text or "")
+        prompt = prompt.replace("{model_outputs}", formatted_outputs)
+        if ground_truth:
+            prompt = prompt.replace("{ground_truth}", ground_truth)
+        # Handle optional ground_truth_section placeholder
+        prompt = prompt.replace("{ground_truth_section}", "")
+    else:
+        # Use default template
+        if ground_truth:
+            template = DEFAULT_SINGLE_SHOT_WITH_GT_TEMPLATE
+            prompt = template.replace("{input_block}", input_text or "")
+            prompt = prompt.replace("{model_outputs}", formatted_outputs)
+            prompt = prompt.replace("{ground_truth}", ground_truth)
+            prompt = prompt.replace("{ground_truth_section}", "")
+        else:
+            template = DEFAULT_SINGLE_SHOT_TEMPLATE
+            prompt = template.replace("{input_block}", input_text or "")
+            prompt = prompt.replace("{model_outputs}", formatted_outputs)
+            prompt = prompt.replace("{ground_truth_section}", "")
+
+    # Use judge's internal voting mechanism
+    # We'll call judge with the constructed prompt
+    if use_fully_custom_prompt:
+        judge_result = judge.judge(prompt=prompt, max_tokens=max_tokens)
+    else:
+        # Pass empty values for standard params since we built prompt manually
+        # This is a bit hacky but works with current judge implementation
+        old_template = judge.template
+        judge.template = "{input_block}"
+        judge_result = judge.judge(input=prompt, model_output="", ground_truth="", max_tokens=max_tokens)
+        judge.template = old_template
+
+    # Parse each vote
+    raw_votes = judge_result.get("votes", [])
+    parsed_votes = []
+
+    for vote in raw_votes:
+        model = vote.get("model")
+        # Get raw response - need to call parser on it
+        # Since we used judge.judge(), the result is already in vote["result"]
+        # But we need the raw string to parse. Let's re-call the models manually.
+        pass
+
+    # Actually, let's refactor: we need direct model access for ranking
+    # The judge.judge() flow doesn't give us raw strings back
+    # Let's call models directly
+
+    votes = []
+    for model_name in judge.models:
+        try:
+            api_base, headers, temperature = judge._resolve_per_model(model_name)
+            raw_response = judge._attempt_completion(
+                model=model_name,
+                api_base=api_base,
+                headers=headers,
+                prompt=prompt,
+                temperature=temperature,
+                max_tokens=max_tokens,
+            )
+
+            parsed = output_parser(raw_response)
+            votes.append({
+                "model": model_name,
+                "raw_response": raw_response,
+                "parsed": parsed,
+            })
+
+            if judge.verbose:
+                print(f"Model {model_name} ranking: {parsed}", flush=True)
+
+        except Exception as e:
+            if judge.verbose:
+                print(f"Model {model_name} failed: {e}", flush=True)
+            votes.append({
+                "model": model_name,
+                "error": str(e),
+                "parsed": None,
+            })
+
+    # Handle custom generation functions
+    for idx, custom_fn in enumerate(judge.custom_generation_fns):
+        try:
+            raw_response = custom_fn(prompt)
+            parsed = output_parser(raw_response)
+            votes.append({
+                "model": f"custom_fn_{idx}",
+                "raw_response": raw_response,
+                "parsed": parsed,
+            })
+        except Exception as e:
+            if judge.verbose:
+                print(f"Custom function {idx} failed: {e}", flush=True)
+            votes.append({
+                "model": f"custom_fn_{idx}",
+                "error": str(e),
+                "parsed": None,
+            })
+
+    # Aggregate votes based on mode
+    mode = judge.mode
+    valid_votes = [v for v in votes if v.get("parsed") is not None]
+
+    if not valid_votes:
+        raise ValueError("No valid ranking votes received")
+
+    # Determine output type (ordering vs scores)
+    first_parsed = valid_votes[0]["parsed"]
+
+    if isinstance(first_parsed, list):
+        # Ordering format: ["A", "C", "B"]
+        if mode == "single":
+            final_labels = valid_votes[0]["parsed"]
+        elif mode == "majority":
+            # Use first valid vote for ordering (majority doesn't make sense for orderings)
+            # Could implement Borda count or similar, but for simplicity use first
+            final_labels = valid_votes[0]["parsed"]
+        else:
+            final_labels = valid_votes[0]["parsed"]
+
+        final_ranking = _labels_to_indices(final_labels, num_outputs)
+        final_scores = None
+
+    elif isinstance(first_parsed, dict):
+        # Score format: {"A": 9.2, "B": 7.1, "C": 8.5}
+        if mode == "single":
+            final_scores = valid_votes[0]["parsed"]
+        elif mode in ("majority", "average"):
+            # Average scores across judges
+            all_scores = {}
+            for vote in valid_votes:
+                scores = vote["parsed"]
+                if isinstance(scores, dict):
+                    for label, score in scores.items():
+                        label = str(label).upper()
+                        if label not in all_scores:
+                            all_scores[label] = []
+                        all_scores[label].append(float(score))
+
+            final_scores = {k: sum(v) / len(v) for k, v in all_scores.items()}
+        else:
+            final_scores = valid_votes[0]["parsed"]
+
+        final_ranking = _scores_to_ranking(final_scores, num_outputs)
+        final_labels = [chr(65 + i) for i in final_ranking]
+
+    else:
+        raise ValueError(f"Unknown parsed format: {type(first_parsed)}")
+
+    return {
+        "ranking": final_ranking,
+        "labels": final_labels if isinstance(first_parsed, list) else [chr(65 + i) for i in final_ranking],
+        "scores": final_scores,
+        "raw_votes": votes,
+        "mode": mode,
+    }
+
+
+def _round_robin_rank(
+    judge,
+    input_text: str,
+    model_outputs: List[str],
+    ground_truth: Optional[str],
+    output_parser: Callable,
+    custom_template: Optional[str],
+    use_fully_custom_prompt: bool,
+    max_tokens: int,
+) -> Dict[str, Any]:
+    """
+    Execute round-robin ranking where judge compares all pairs.
+
+    For N model_outputs, performs N(N-1)/2 pairwise comparisons.
+    Aggregates results into final ranking based on win counts.
+
+    Returns:
+        {
+            "ranking": [2, 0, 1],  # Indices sorted by wins (descending)
+            "wins": {0: 1, 1: 0, 2: 2},  # Win count per output
+            "pairwise_results": {(0,1): 0, (0,2): 2, (1,2): 2},  # Winner per pair
+            "raw_votes": {...},  # All pairwise judge votes
+        }
+    """
+    n = len(model_outputs)
+
+    # Initialize tracking
+    wins = {i: 0 for i in range(n)}
+    pairwise_results = {}
+    all_votes = {}
+
+    # Generate all unique pairs
+    pairs = list(combinations(range(n), 2))
+
+    for i, j in pairs:
+        # Build pairwise prompt
+        if use_fully_custom_prompt:
+            if custom_template is None:
+                raise ValueError("use_fully_custom_prompt=True requires custom_template")
+            prompt = custom_template
+        elif custom_template:
+            # Replace placeholders in custom template
+            prompt = custom_template
+            prompt = prompt.replace("{input_block}", input_text or "")
+            prompt = prompt.replace("{option_a}", model_outputs[i])
+            prompt = prompt.replace("{option_b}", model_outputs[j])
+            if ground_truth:
+                prompt = prompt.replace("{ground_truth}", ground_truth)
+        else:
+            # Use default template
+            if ground_truth:
+                template = DEFAULT_ROUND_ROBIN_WITH_GT_TEMPLATE
+                prompt = template.replace("{input_block}", input_text or "")
+                prompt = prompt.replace("{option_a}", model_outputs[i])
+                prompt = prompt.replace("{option_b}", model_outputs[j])
+                prompt = prompt.replace("{ground_truth}", ground_truth)
+            else:
+                template = DEFAULT_ROUND_ROBIN_TEMPLATE
+                prompt = template.replace("{input_block}", input_text or "")
+                prompt = prompt.replace("{option_a}", model_outputs[i])
+                prompt = prompt.replace("{option_b}", model_outputs[j])
+
+        # Collect votes from all judges
+        votes = []
+
+        for model_name in judge.models:
+            try:
+                api_base, headers, temperature = judge._resolve_per_model(model_name)
+                raw_response = judge._attempt_completion(
+                    model=model_name,
+                    api_base=api_base,
+                    headers=headers,
+                    prompt=prompt,
+                    temperature=temperature,
+                    max_tokens=max_tokens,
+                )
+
+                parsed = output_parser(raw_response)
+                votes.append({
+                    "model": model_name,
+                    "raw_response": raw_response,
+                    "parsed": parsed,
+                })
+
+                if judge.verbose:
+                    print(f"Pair ({i},{j}): Model {model_name} raw response: {repr(raw_response)}", flush=True)
+                    print(f"Pair ({i},{j}): Model {model_name} voted: {parsed}", flush=True)
+
+            except Exception as e:
+                if judge.verbose:
+                    print(f"Pair ({i},{j}): Model {model_name} failed: {e}", flush=True)
+                votes.append({
+                    "model": model_name,
+                    "error": str(e),
+                    "parsed": None,
+                })
+
+        # Handle custom generation functions
+        for idx, custom_fn in enumerate(judge.custom_generation_fns):
+            try:
+                raw_response = custom_fn(prompt)
+                parsed = output_parser(raw_response)
+                votes.append({
+                    "model": f"custom_fn_{idx}",
+                    "raw_response": raw_response,
+                    "parsed": parsed,
+                })
+            except Exception as e:
+                if judge.verbose:
+                    print(f"Pair ({i},{j}): Custom function {idx} failed: {e}", flush=True)
+                votes.append({
+                    "model": f"custom_fn_{idx}",
+                    "error": str(e),
+                    "parsed": None,
+                })
+
+        # Aggregate votes for this pair
+        valid_votes = [v for v in votes if v.get("parsed") is not None]
+
+        if not valid_votes:
+            # No valid votes, mark as tie
+            pairwise_results[(i, j)] = "tie"
+            all_votes[(i, j)] = votes
+            continue
+
+        mode = judge.mode
+
+        if mode == "single":
+            winner = valid_votes[0]["parsed"]
+        elif mode in ("majority", "all"):
+            # Count votes for A, B, tie
+            vote_counts = {"A": 0, "B": 0, "tie": 0}
+            for vote in valid_votes:
+                result = vote["parsed"]
+                if result in vote_counts:
+                    vote_counts[result] += 1
+
+            # Determine winner
+            if mode == "all":
+                # All judges must agree
+                if vote_counts["A"] == len(valid_votes):
+                    winner = "A"
+                elif vote_counts["B"] == len(valid_votes):
+                    winner = "B"
+                else:
+                    winner = "tie"
+            else:  # majority
+                max_votes = max(vote_counts.values())
+                # Check for tie in voting
+                max_keys = [k for k, v in vote_counts.items() if v == max_votes]
+                if len(max_keys) > 1:
+                    winner = "tie"
+                else:
+                    winner = max_keys[0]
+        else:
+            winner = valid_votes[0]["parsed"]
+
+        # Record result
+        if winner == "A":
+            pairwise_results[(i, j)] = i
+            wins[i] += 1
+        elif winner == "B":
+            pairwise_results[(i, j)] = j
+            wins[j] += 1
+        else:  # tie
+            pairwise_results[(i, j)] = "tie"
+
+        all_votes[(i, j)] = votes
+
+    # Build final ranking from win counts
+    ranking = sorted(range(n), key=lambda idx: wins[idx], reverse=True)
+
+    return {
+        "ranking": ranking,
+        "wins": wins,
+        "pairwise_results": pairwise_results,
+        "raw_votes": all_votes,
+        "mode": judge.mode,
+    }
+
+
+def rank(
+    judge,
+    input: str,
+    model_outputs: List[str],
+    ground_truth: Optional[str] = None,
+    ranking_mode: str = "single_shot",
+    output_parser: Optional[Union[str, Callable]] = None,
+    custom_template: Optional[str] = None,
+    use_fully_custom_prompt: bool = False,
+    max_tokens: int = 10000,
+) -> Dict[str, Any]:
+    """
+    Rank multiple model outputs using an LLM judge.
+
+    Args:
+        judge: LLMAsAJudge instance configured with models
+        input: Original prompt or task description
+        model_outputs: List of model outputs to rank
+        ground_truth: Optional reference answer. If provided, ranking will be based on
+            how well model_outputs match the ground truth. If None, model_outputs are ranked
+            purely by quality/preference.
+        ranking_mode: "single_shot" or "round_robin"
+        output_parser: Parser for judge output. Can be:
+            - String: "letter_ordering", "json_scores", "pairwise_winner"
+            - Callable: Custom parser function
+            - None: Auto-selects based on ranking_mode (defaults to "letter_ordering"
+              for single_shot, "pairwise_winner" for round_robin)
+            For single_shot: should return List[str] (ordering) or Dict[str, float] (scores)
+            For round_robin: should return "A", "B", or "tie"
+        custom_template: Optional prompt template with placeholders. If None, uses sensible
+            defaults that adapt based on whether ground_truth is provided.
+            Available placeholders:
+            - single_shot: {input_block}, {model_outputs}, {ground_truth}
+            - round_robin: {input_block}, {option_a}, {option_b}, {ground_truth}
+        use_fully_custom_prompt: If True, custom_template is used as-is without substitution
+        max_tokens: Maximum tokens for judge response
+
+    Returns:
+        Dict with ranking results and metadata. Format depends on ranking_mode:
+
+        single_shot:
+            {
+                "ranking": [0, 2, 1],  # Indices in rank order
+                "labels": ["A", "C", "B"],  # Letter labels in rank order
+                "scores": {...} or None,  # Scores if parser returns dict
+                "raw_votes": [...],  # Individual judge outputs
+                "mode": str,  # Aggregation mode used
+            }
+
+        round_robin:
+            {
+                "ranking": [2, 0, 1],  # Indices sorted by wins
+                "wins": {0: 1, 1: 0, 2: 2},  # Win count per output
+                "pairwise_results": {...},  # Winner per pair
+                "raw_votes": {...},  # All pairwise judge votes
+                "mode": str,  # Aggregation mode used
+            }
+
+    Example:
+        >>> from llmasajudge import LLMAsAJudge
+        >>>
+        >>> judge = LLMAsAJudge(models=["openai/gpt-4o-mini"])
+        >>>
+        >>> # Using string parser name
+        >>> result = judge.rank(
+        ...     input="Explain recursion",
+        ...     model_outputs=["Answer 1", "Answer 2", "Answer 3"],
+        ...     ranking_mode="single_shot",
+        ...     output_parser="letter_ordering",
+        ...     custom_template="Rank from best to worst:\\n{model_outputs}\\nReturn: A > B > C"
+        ... )
+        >>> print(result["ranking"])  # [0, 2, 1]
+    """
+    if not model_outputs:
+        raise ValueError("Must provide at least one model output")
+
+    if ranking_mode not in ("single_shot", "round_robin"):
+        raise ValueError("ranking_mode must be 'single_shot' or 'round_robin'")
+
+    # Resolve output_parser (string or callable)
+    if output_parser is None:
+        # Auto-select default parser based on mode
+        if ranking_mode == "single_shot":
+            output_parser = RankingParsers.letter_ordering
+        else:  # round_robin
+            output_parser = RankingParsers.pairwise_winner
+    elif isinstance(output_parser, str):
+        # Map string to parser function
+        parser_map = {
+            'letter_ordering': RankingParsers.letter_ordering,
+            'json_scores': RankingParsers.json_scores,
+            'pairwise_winner': RankingParsers.pairwise_winner,
+        }
+        if output_parser not in parser_map:
+            raise ValueError(
+                f"Unknown parser '{output_parser}'. "
+                f"Available: {list(parser_map.keys())}"
+            )
+        output_parser = parser_map[output_parser]
+    # else: assume it's a callable, use as-is
+
+    if ranking_mode == "single_shot":
+        return _single_shot_rank(
+            judge=judge,
+            input_text=input,
+            model_outputs=model_outputs,
+            ground_truth=ground_truth,
+            output_parser=output_parser,
+            custom_template=custom_template,
+            use_fully_custom_prompt=use_fully_custom_prompt,
+            max_tokens=max_tokens,
+        )
+    else:  # round_robin
+        return _round_robin_rank(
+            judge=judge,
+            input_text=input,
+            model_outputs=model_outputs,
+            ground_truth=ground_truth,
+            output_parser=output_parser,
+            custom_template=custom_template,
+            use_fully_custom_prompt=use_fully_custom_prompt,
+            max_tokens=max_tokens,
+        )
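
The three stock parsers added above are pure string functions, so their behaviour can be checked directly; this is useful when writing a custom_template whose requested output format has to match the chosen parser. Expected results, based on the code shown in this hunk:

    from llmasajudge.ranker import RankingParsers

    RankingParsers.letter_ordering("B > A > C")              # ["B", "A", "C"]
    RankingParsers.letter_ordering("Ranking: C, A, B")       # ["C", "A", "B"] (regex fallback)
    RankingParsers.json_scores('{"A": 9, "b": "7.5"}')       # {"A": 9.0, "B": 7.5}
    RankingParsers.pairwise_winner("Winner: B")              # "B"
    RankingParsers.pairwise_winner("They are equally good")  # "tie"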
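
For the round_robin path, rank() builds one pairwise prompt per unordered pair (N(N-1)/2 comparisons) and aggregates the per-pair winners into win counts according to judge.mode. A sketch, again with an illustrative judge model, answers, and reference string:

    from llmasajudge import LLMAsAJudge
    from llmasajudge.ranker import rank

    judge = LLMAsAJudge(models=["openai/gpt-4o-mini"])  # illustrative judge model

    result = rank(
        judge=judge,
        input="Explain recursion simply",
        model_outputs=["Answer 1", "Answer 2", "Answer 3"],
        ground_truth="Recursion is when a function calls itself.",  # optional reference
        ranking_mode="round_robin",   # 3 outputs -> 3 pairwise comparisons
    )
    # result["wins"] maps each output index to its win count, e.g. {0: 1, 1: 0, 2: 2};
    # result["pairwise_results"] records the winner (or "tie") per pair,
    # and result["ranking"] sorts the indices by wins, e.g. [2, 0, 1].
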
{llmasajudge-0.1.15 → llmasajudge-0.1.18}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "llmasajudge"
-version = "0.1.15"
+version = "0.1.18"
 description = "LLM Judge: simple right/wrong voting across models"
 authors = [{name="Brett Young", email="byyoung3@gmail.com"}]
 readme = "README.md"

5 files without changes