llmasajudge 0.1.14__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
llmasajudge/__init__.py CHANGED
@@ -875,6 +875,7 @@ class ReturnType(Enum):
875
875
  BOOLEAN = "boolean"
876
876
  SCALAR = "scalar"
877
877
  MAP = "map"
878
+ STRING = "string" # For arbitrary string returns (categories, choices, etc.)
878
879
 
879
880
 
880
881
  class AggregationMode(Enum):
@@ -888,6 +889,7 @@ class AggregationMode(Enum):
888
889
  MIN = "min"
889
890
  MAX = "max"
890
891
  MEDIAN = "median"
892
+ # String modes reuse MAJORITY and SINGLE from above
891
893
 
892
894
 
893
895
  # Valid aggregation modes per return type
@@ -895,6 +897,7 @@ VALID_MODES = {
895
897
  ReturnType.BOOLEAN: {AggregationMode.MAJORITY, AggregationMode.SINGLE, AggregationMode.ALL},
896
898
  ReturnType.SCALAR: {AggregationMode.AVERAGE, AggregationMode.MIN, AggregationMode.MAX, AggregationMode.MEDIAN, AggregationMode.SINGLE},
897
899
  ReturnType.MAP: {AggregationMode.AVERAGE, AggregationMode.MIN, AggregationMode.MAX, AggregationMode.MEDIAN, AggregationMode.SINGLE},
900
+ ReturnType.STRING: {AggregationMode.MAJORITY, AggregationMode.SINGLE},
898
901
  }
899
902
 
900
903
  # Default aggregation modes per return type
@@ -902,6 +905,7 @@ DEFAULT_MODES = {
902
905
  ReturnType.BOOLEAN: AggregationMode.MAJORITY,
903
906
  ReturnType.SCALAR: AggregationMode.AVERAGE,
904
907
  ReturnType.MAP: AggregationMode.AVERAGE,
908
+ ReturnType.STRING: AggregationMode.MAJORITY,
905
909
  }
906
910
 
907
911
  # String to enum mapping (for backward compat)
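
The four hunks above wire the new STRING return type into the aggregation tables: string verdicts can only be aggregated with MAJORITY or SINGLE, and default to MAJORITY. A minimal sketch of how those tables are consulted; the enum members are copied from the diff (trimmed to the ones relevant here), while resolve_mode() is a hypothetical helper for illustration, not package API:

    from enum import Enum

    class ReturnType(Enum):
        BOOLEAN = "boolean"
        SCALAR = "scalar"
        MAP = "map"
        STRING = "string"   # new in 0.1.18

    class AggregationMode(Enum):   # trimmed: the package also defines AVERAGE, ALL, MIN, MAX, MEDIAN
        MAJORITY = "majority"
        SINGLE = "single"

    VALID_MODES = {ReturnType.STRING: {AggregationMode.MAJORITY, AggregationMode.SINGLE}}
    DEFAULT_MODES = {ReturnType.STRING: AggregationMode.MAJORITY}

    def resolve_mode(return_type, mode=None):
        """Fall back to the default mode and reject invalid combinations."""
        mode = mode or DEFAULT_MODES[return_type]
        if mode not in VALID_MODES[return_type]:
            raise ValueError(f"{mode} is not valid for {return_type}")
        return mode

    assert resolve_mode(ReturnType.STRING) is AggregationMode.MAJORITY
    assert resolve_mode(ReturnType.STRING, AggregationMode.SINGLE) is AggregationMode.SINGLE
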
@@ -1115,6 +1119,8 @@ def _infer_return_type(value: Any) -> Optional[ReturnType]:
1115
1119
  return ReturnType.SCALAR
1116
1120
  if isinstance(value, dict) and all(isinstance(v, (int, float)) for v in value.values()):
1117
1121
  return ReturnType.MAP
1122
+ if isinstance(value, str):
1123
+ return ReturnType.STRING
1118
1124
  return None
1119
1125
 
1120
1126
 
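
With this hunk, a judge vote that parses to a plain Python str is inferred as the new STRING type; numbers and numeric dicts keep their existing precedence. A standalone sketch of that inference order (infer_return_type() here is an illustrative re-implementation, not the package's private _infer_return_type, and the boolean branch is assumed from the earlier, unshown lines of the helper):

    from typing import Any, Optional

    def infer_return_type(value: Any) -> Optional[str]:
        if isinstance(value, bool):      # assumed to be handled before the numeric check
            return "boolean"
        if isinstance(value, (int, float)):
            return "scalar"
        if isinstance(value, dict) and all(isinstance(v, (int, float)) for v in value.values()):
            return "map"
        if isinstance(value, str):       # new in 0.1.18: plain strings map to STRING
            return "string"
        return None

    assert infer_return_type("harmless") == "string"
    assert infer_return_type({"helpfulness": 0.8}) == "map"
    assert infer_return_type(0.8) == "scalar"
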
@@ -1374,15 +1380,21 @@ Output only valid JSON. No explanation. No extra text.""",
1374
1380
  last_err = None
1375
1381
  for i in range(attempts):
1376
1382
  try:
1377
- resp = completion(
1378
- model=model,
1379
- api_base=api_base,
1380
- messages=[{"role": "user", "content": prompt}],
1381
- temperature=temperature,
1382
- max_tokens=max_tokens,
1383
- extra_headers=headers,
1384
- caching=self.cache_enabled
1385
- )
1383
+ # GPT-5 models don't accept temperature argument
1384
+ completion_kwargs = {
1385
+ "model": model,
1386
+ "api_base": api_base,
1387
+ "messages": [{"role": "user", "content": prompt}],
1388
+ "max_tokens": max_tokens,
1389
+ "extra_headers": headers,
1390
+ "caching": self.cache_enabled
1391
+ }
1392
+
1393
+ # Only add temperature if NOT a gpt-5 model
1394
+ if "gpt-5" not in model.lower():
1395
+ completion_kwargs["temperature"] = temperature
1396
+
1397
+ resp = completion(**completion_kwargs)
1386
1398
  return (resp.choices[0].message.content or "").strip()
1387
1399
  except Exception as e:
1388
1400
  last_err = e
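
The retry loop now assembles the keyword arguments for the completion call dynamically so that temperature is only passed to models that accept it (the comment notes that GPT-5 models reject it). A small sketch of the same pattern in isolation; build_completion_kwargs() is a hypothetical helper and only a subset of the kwargs from the diff is shown:

    def build_completion_kwargs(model, messages, temperature, max_tokens):
        """Assemble completion kwargs, omitting temperature for models that reject it."""
        kwargs = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
        }
        # Mirrors the diff: skip temperature for the gpt-5 family.
        if "gpt-5" not in model.lower():
            kwargs["temperature"] = temperature
        return kwargs

    kwargs = build_completion_kwargs(
        "openai/gpt-5-mini", [{"role": "user", "content": "hi"}], 0.0, 256
    )
    assert "temperature" not in kwargs
    # resp = completion(**kwargs)   # litellm-style call, as in the diff
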
@@ -1453,20 +1465,20 @@ Output only valid JSON. No explanation. No extra text.""",
1453
1465
  valid = [v["result"] for v in votes if v["result"] is not None and isinstance(v["result"], dict)]
1454
1466
  if not valid:
1455
1467
  raise ValueError("No valid map votes to aggregate")
1456
-
1468
+
1457
1469
  keys = set()
1458
1470
  for v in valid:
1459
1471
  keys.update(v.keys())
1460
-
1472
+
1461
1473
  if self._mode == AggregationMode.SINGLE:
1462
1474
  return valid[0]
1463
-
1475
+
1464
1476
  result = {}
1465
1477
  for key in keys:
1466
1478
  values = [v[key] for v in valid if key in v]
1467
1479
  if not values:
1468
1480
  continue
1469
-
1481
+
1470
1482
  if self._mode == AggregationMode.AVERAGE:
1471
1483
  result[key] = sum(values) / len(values)
1472
1484
  elif self._mode == AggregationMode.MIN:
@@ -1478,9 +1490,38 @@ Output only valid JSON. No explanation. No extra text.""",
1478
1490
  n = len(s)
1479
1491
  mid = n // 2
1480
1492
  result[key] = (s[mid - 1] + s[mid]) / 2 if n % 2 == 0 else s[mid]
1481
-
1493
+
1482
1494
  return result
1483
1495
 
1496
+ def _aggregate_string(self, votes: List[Dict[str, Any]]) -> str:
1497
+ """
1498
+ Aggregate string votes with tie detection.
1499
+ Returns the majority string, or "tie" if there's no clear majority.
1500
+ """
1501
+ results = [v["result"] for v in votes if v["result"] is not None and isinstance(v["result"], str)]
1502
+ if not results:
1503
+ raise ValueError("No valid string votes to aggregate")
1504
+
1505
+ if self._mode == AggregationMode.SINGLE:
1506
+ return results[0]
1507
+
1508
+ # Count occurrences
1509
+ from collections import Counter
1510
+ counts = Counter(results)
1511
+
1512
+ # Get the most common
1513
+ most_common = counts.most_common()
1514
+
1515
+ if len(most_common) == 0:
1516
+ raise ValueError("No valid string votes to aggregate")
1517
+
1518
+ # Check for tie: if top two have same count
1519
+ if len(most_common) > 1 and most_common[0][1] == most_common[1][1]:
1520
+ return "tie"
1521
+
1522
+ # Return the majority
1523
+ return most_common[0][0]
1524
+
1484
1525
  def judge(
1485
1526
  self,
1486
1527
  input: Any = None,
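
_aggregate_string returns the most frequent string vote, or the literal string "tie" when the two most common answers occur equally often (SINGLE mode simply takes the first valid vote). A standalone illustration of that tie rule, not a call into the private method:

    from collections import Counter

    def majority_or_tie(votes):
        """Return the most common string, or "tie" when there is no clear winner."""
        counts = Counter(votes).most_common()
        if len(counts) > 1 and counts[0][1] == counts[1][1]:
            return "tie"
        return counts[0][0]

    assert majority_or_tie(["safe", "safe", "unsafe"]) == "safe"
    assert majority_or_tie(["safe", "unsafe"]) == "tie"
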
@@ -1577,13 +1618,16 @@ Output only valid JSON. No explanation. No extra text.""",
1577
1618
  final = self._aggregate_scalar(votes)
1578
1619
  elif return_type == ReturnType.MAP:
1579
1620
  final = self._aggregate_map(votes)
1621
+ elif return_type == ReturnType.STRING:
1622
+ final = self._aggregate_string(votes)
1580
1623
  else:
1581
1624
  raise ValueError(f"Unknown return type: {return_type}")
1582
1625
 
1583
1626
  # Build backward-compatible response
1584
- # Boolean: correct=bool, scores=None
1585
- # Scalar: correct=score, scores=score (both fields for convenience)
1586
- # Map: correct=None, scores=map
1627
+ # Boolean: correct=bool, scores=None, result=bool
1628
+ # Scalar: correct=score, scores=score, result=score (both fields for convenience)
1629
+ # Map: correct=None, scores=map, result=map
1630
+ # String: correct=string, scores=None, result=string
1587
1631
  if return_type == ReturnType.BOOLEAN:
1588
1632
  # Also put "correct" in each vote for backward compat
1589
1633
  for v in votes:
@@ -1591,6 +1635,7 @@ Output only valid JSON. No explanation. No extra text.""",
1591
1635
  return {
1592
1636
  "correct": final,
1593
1637
  "scores": None,
1638
+ "result": final,
1594
1639
  "mode": self.mode,
1595
1640
  "votes": votes,
1596
1641
  }
@@ -1599,6 +1644,16 @@ Output only valid JSON. No explanation. No extra text.""",
1599
1644
  return {
1600
1645
  "correct": final,
1601
1646
  "scores": final,
1647
+ "result": final,
1648
+ "mode": self.mode,
1649
+ "votes": votes,
1650
+ }
1651
+ elif return_type == ReturnType.STRING:
1652
+ # For string, put result in correct field
1653
+ return {
1654
+ "correct": final,
1655
+ "scores": None,
1656
+ "result": final,
1602
1657
  "mode": self.mode,
1603
1658
  "votes": votes,
1604
1659
  }
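
Every return type now also exposes the aggregated value under a uniform "result" key, alongside the legacy "correct"/"scores" fields. A sketch of the response shape for a STRING verdict; the category label, model names, and per-vote fields are illustrative values, not captured output:

    verdict = {
        "correct": "refusal",   # legacy field, reused for the string category
        "scores": None,
        "result": "refusal",    # new uniform field, present for every return type
        "mode": "majority",
        "votes": [
            {"model": "openai/gpt-4o-mini", "result": "refusal"},
            {"model": "anthropic/claude-3-haiku", "result": "refusal"},
        ],
    }
    assert verdict["result"] == verdict["correct"]
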
@@ -1606,6 +1661,47 @@ Output only valid JSON. No explanation. No extra text.""",
1606
1661
  return {
1607
1662
  "correct": None,
1608
1663
  "scores": final,
1664
+ "result": final,
1609
1665
  "mode": self.mode,
1610
1666
  "votes": votes,
1611
- }
1667
+ }
1668
+
1669
+ def rank(
1670
+ self,
1671
+ input: str,
1672
+ model_outputs: List[str],
1673
+ ground_truth: Optional[str] = None,
1674
+ ranking_mode: str = "single_shot",
1675
+ output_parser: Optional[Callable] = None,
1676
+ custom_template: Optional[str] = None,
1677
+ use_fully_custom_prompt: bool = False,
1678
+ max_tokens: int = 10000,
1679
+ ) -> Dict[str, Any]:
1680
+ """
1681
+ Rank multiple model outputs.
1682
+
1683
+ Args:
1684
+ input: Original prompt or task description
1685
+ model_outputs: List of model outputs to rank
1686
+ ground_truth: Optional reference answer
1687
+ ranking_mode: "single_shot" or "round_robin"
1688
+ output_parser: Function to parse judge output
1689
+ custom_template: Prompt template with placeholders
1690
+ use_fully_custom_prompt: If True, template used as-is
1691
+ max_tokens: Maximum tokens for judge response
1692
+
1693
+ Returns:
1694
+ Dict with ranking results (see ranker.rank() for details)
1695
+ """
1696
+ from llmasajudge.ranker import rank as _rank
1697
+ return _rank(
1698
+ judge=self,
1699
+ input=input,
1700
+ model_outputs=model_outputs,
1701
+ ground_truth=ground_truth,
1702
+ ranking_mode=ranking_mode,
1703
+ output_parser=output_parser,
1704
+ custom_template=custom_template,
1705
+ use_fully_custom_prompt=use_fully_custom_prompt,
1706
+ max_tokens=max_tokens,
1707
+ )
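
LLMAsAJudge.rank() is a thin wrapper around llmasajudge.ranker.rank(), whose module docstring (added below) demonstrates single_shot ranking. A hedged sketch of the round_robin variant, assuming litellm credentials for the placeholder model are configured; the inputs are made up:

    from llmasajudge import LLMAsAJudge

    judge = LLMAsAJudge(models=["openai/gpt-4o-mini"])  # placeholder judge configuration

    result = judge.rank(
        input="Summarize this bug report in one sentence.",
        model_outputs=["Summary from model 1", "Summary from model 2", "Summary from model 3"],
        ranking_mode="round_robin",   # pairwise comparisons; output_parser defaults to "pairwise_winner"
    )
    print(result["ranking"])   # e.g. [2, 0, 1] -- indices sorted by pairwise wins
    print(result["wins"])      # e.g. {0: 1, 1: 0, 2: 2}
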
llmasajudge/ranker.py ADDED
@@ -0,0 +1,772 @@
1
+ """
2
+ LLMAsAJudge Ranking Extensions
3
+
4
+ Provides relative ranking functionality for evaluating multiple model outputs.
5
+
6
+ Supports two ranking modes:
7
+ 1. single_shot: Judge sees all model_outputs at once and returns ranking/scores
8
+ 2. round_robin: Judge compares model_outputs pairwise, results are aggregated
9
+
10
+ Usage:
11
+ from llmasajudge import LLMAsAJudge
12
+ from llmasajudge.ranker import rank
13
+
14
+ judge = LLMAsAJudge(models=["openai/gpt-4o-mini"])
15
+
16
+ result = rank(
17
+ judge=judge,
18
+ input="Explain recursion simply",
19
+ model_outputs=["Answer 1", "Answer 2", "Answer 3"],
20
+ ranking_mode="single_shot",
21
+ output_parser=ranking_parser,
22
+ custom_template=template
23
+ )
24
+ """
25
+
26
+ import re
27
+ import json
28
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
29
+ from itertools import combinations
30
+
31
+
32
+ __all__ = ["rank", "RankingParsers"]
33
+
34
+
35
+ # Default templates for ranking
36
+ DEFAULT_SINGLE_SHOT_TEMPLATE = """\
37
+ Rank the following candidate responses from BEST to WORST.
38
+ {ground_truth_section}
39
+ Task/Input:
40
+ {input_block}
41
+
42
+ Candidates:
43
+ {model_outputs}
44
+
45
+ Provide your ranking using the format: A > B > C > D (etc)
46
+ Return ONLY the ranking, no explanation."""
47
+
48
+ DEFAULT_SINGLE_SHOT_WITH_GT_TEMPLATE = """\
49
+ Rank the following candidate responses from BEST to WORST based on how well they match the ground truth answer.
50
+ {ground_truth_section}
51
+ Task/Input:
52
+ {input_block}
53
+
54
+ Ground Truth Answer:
55
+ {ground_truth}
56
+
57
+ Candidates:
58
+ {model_outputs}
59
+
60
+ Provide your ranking using the format: A > B > C > D (etc)
61
+ Return ONLY the ranking, no explanation."""
62
+
63
+ DEFAULT_ROUND_ROBIN_TEMPLATE = """\
64
+ Compare the following two responses and determine which is better.
65
+
66
+ Task/Input:
67
+ {input_block}
68
+
69
+ Option A:
70
+ {option_a}
71
+
72
+ Option B:
73
+ {option_b}
74
+
75
+ Which response is better? Return exactly one of: A, B, or tie"""
76
+
77
+ DEFAULT_ROUND_ROBIN_WITH_GT_TEMPLATE = """\
78
+ Compare the following two responses based on how well they match the ground truth answer.
79
+
80
+ Task/Input:
81
+ {input_block}
82
+
83
+ Ground Truth Answer:
84
+ {ground_truth}
85
+
86
+ Option A:
87
+ {option_a}
88
+
89
+ Option B:
90
+ {option_b}
91
+
92
+ Which response better matches the ground truth? Return exactly one of: A, B, or tie"""
93
+
94
+
95
+ class RankingParsers:
96
+ """Stock output parsers for ranking tasks."""
97
+
98
+ @staticmethod
99
+ def letter_ordering(response: str) -> List[str]:
100
+ """
101
+ Parse ordering like "A > C > B" into ["A", "C", "B"].
102
+ Handles various separators: >, ->, =>
103
+ """
104
+ if not response:
105
+ return []
106
+
107
+ # Try different separators
108
+ for sep in [">", "->", "=>"]:
109
+ if sep in response:
110
+ parts = [x.strip().upper() for x in response.split(sep)]
111
+ # Filter to single letters only
112
+ return [p for p in parts if len(p) == 1 and p.isalpha()]
113
+
114
+ # Fallback: extract all single letters in order
115
+ letters = re.findall(r'\b([A-Z])\b', response.upper())
116
+ return letters
117
+
118
+ @staticmethod
119
+ def json_scores(response: str) -> Optional[Dict[str, float]]:
120
+ """
121
+ Parse JSON like {"A": 9.2, "B": 7.1, "C": 8.5}.
122
+ Returns dict mapping candidate labels to scores.
123
+ """
124
+ if not response:
125
+ return None
126
+
127
+ try:
128
+ s = response.strip()
129
+
130
+ # Handle markdown code blocks
131
+ if "```json" in s.lower():
132
+ start = s.lower().find("```json") + 7
133
+ end = s.find("```", start)
134
+ if end > start:
135
+ s = s[start:end].strip()
136
+ elif "```" in s:
137
+ start = s.find("```") + 3
138
+ end = s.find("```", start)
139
+ if end > start:
140
+ s = s[start:end].strip()
141
+
142
+ # Extract JSON object
143
+ if '{' in s and '}' in s:
144
+ start_brace = s.find('{')
145
+ end_brace = s.rfind('}')
146
+ if start_brace < end_brace:
147
+ s = s[start_brace:end_brace + 1]
148
+
149
+ data = json.loads(s)
150
+ if not isinstance(data, dict):
151
+ return None
152
+
153
+ # Convert all values to float
154
+ result = {}
155
+ for key, val in data.items():
156
+ if isinstance(val, (int, float)):
157
+ result[str(key).upper()] = float(val)
158
+ elif isinstance(val, str):
159
+ try:
160
+ result[str(key).upper()] = float(val)
161
+ except ValueError:
162
+ pass
163
+
164
+ return result if result else None
165
+
166
+ except (json.JSONDecodeError, ValueError):
167
+ return None
168
+
169
+ @staticmethod
170
+ def pairwise_winner(response: str) -> Optional[str]:
171
+ """
172
+ Parse pairwise comparison: "A", "B", or "tie".
173
+ Returns "A", "B", "tie", or None if unparseable.
174
+ """
175
+ if not response:
176
+ return None
177
+
178
+ text = response.strip().upper()
179
+
180
+ # Exact matches
181
+ if text == "A":
182
+ return "A"
183
+ if text == "B":
184
+ return "B"
185
+ if text == "TIE" or text == "TIED":
186
+ return "tie"
187
+
188
+ # Check for tie first (more specific)
189
+ if "TIE" in text or "TIED" in text or "DRAW" in text or "EQUAL" in text:
190
+ return "tie"
191
+
192
+ # Look for explicit answer patterns like "Answer: A", "Winner: B", "A is better", etc.
193
+ # Match word boundaries to avoid false positives
194
+ # Pattern to find answer declarations
195
+ answer_patterns = [
196
+ r'\bANSWER\s*:?\s*([AB])\b',
197
+ r'\bWINNER\s*:?\s*([AB])\b',
198
+ r'\bCHOOSE\s*:?\s*([AB])\b',
199
+ r'\bSELECT\s*:?\s*([AB])\b',
200
+ r'\bRESPONSE\s*:?\s*([AB])\b',
201
+ r'\bOPTION\s*:?\s*([AB])\b',
202
+ r'^\s*([AB])\s*$', # Just "A" or "B" alone
203
+ r'\b([AB])\s+IS\s+BETTER\b',
204
+ r'\bBETTER\s*:?\s*([AB])\b',
205
+ ]
206
+
207
+ for pattern in answer_patterns:
208
+ match = re.search(pattern, text)
209
+ if match:
210
+ return match.group(1)
211
+
212
+ # Fallback: simple presence check (only if one appears more prominently)
213
+ # Count standalone occurrences
214
+ a_count = len(re.findall(r'\bA\b', text))
215
+ b_count = len(re.findall(r'\bB\b', text))
216
+
217
+ # If one clearly dominates, use it
218
+ if a_count > b_count and b_count == 0:
219
+ return "A"
220
+ if b_count > a_count and a_count == 0:
221
+ return "B"
222
+
223
+ # Last resort: check if only one appears at all
224
+ if "A" in text and "B" not in text:
225
+ return "A"
226
+ if "B" in text and "A" not in text:
227
+ return "B"
228
+
229
+ return None
230
+
231
+
232
+ def _format_model_outputs(model_outputs: List[str]) -> str:
233
+ """
234
+ Format model_outputs as labeled blocks:
235
+ A)
236
+ <output 0>
237
+
238
+ B)
239
+ <output 1>
240
+ """
241
+ labels = [chr(65 + i) for i in range(len(model_outputs))] # A, B, C, ...
242
+ blocks = []
243
+ for label, output in zip(labels, model_outputs):
244
+ blocks.append(f"{label})\n{output}")
245
+ return "\n\n".join(blocks)
246
+
247
+
248
+ def _labels_to_indices(labels: List[str], num_outputs: int) -> List[int]:
249
+ """
250
+ Convert letter labels ["A", "C", "B"] to indices [0, 2, 1].
251
+ """
252
+ indices = []
253
+ for label in labels:
254
+ if len(label) != 1 or not label.isalpha():
255
+ continue
256
+ idx = ord(label.upper()) - 65 # A=0, B=1, etc.
257
+ if 0 <= idx < num_outputs:
258
+ indices.append(idx)
259
+ return indices
260
+
261
+
262
+ def _scores_to_ranking(scores: Dict[str, float], num_outputs: int) -> List[int]:
263
+ """
264
+ Convert score dict {"A": 9, "B": 7, "C": 8} to ranking [0, 2, 1] (descending).
265
+ """
266
+ # Normalize keys to uppercase letters
267
+ normalized = {}
268
+ for k, v in scores.items():
269
+ label = str(k).upper()
270
+ if len(label) == 1 and label.isalpha():
271
+ idx = ord(label) - 65
272
+ if 0 <= idx < num_outputs:
273
+ normalized[idx] = float(v)
274
+
275
+ # Sort by score descending
276
+ sorted_indices = sorted(normalized.keys(), key=lambda i: normalized[i], reverse=True)
277
+ return sorted_indices
278
+
279
+
280
+ def _single_shot_rank(
281
+ judge,
282
+ input_text: str,
283
+ model_outputs: List[str],
284
+ ground_truth: Optional[str],
285
+ output_parser: Callable,
286
+ custom_template: Optional[str],
287
+ use_fully_custom_prompt: bool,
288
+ max_tokens: int,
289
+ ) -> Dict[str, Any]:
290
+ """
291
+ Execute single-shot ranking where judge sees all model_outputs at once.
292
+
293
+ Returns:
294
+ {
295
+ "ranking": [0, 2, 1], # Indices in rank order
296
+ "labels": ["A", "C", "B"], # Letter labels in rank order
297
+ "scores": {"A": 9.2, "B": 7.1, "C": 8.5} or None,
298
+ "raw_votes": [...], # Individual judge outputs
299
+ }
300
+ """
301
+ num_outputs = len(model_outputs)
302
+ formatted_outputs = _format_model_outputs(model_outputs)
303
+
304
+ # Build prompt
305
+ if use_fully_custom_prompt:
306
+ if custom_template is None:
307
+ raise ValueError("use_fully_custom_prompt=True requires custom_template")
308
+ prompt = custom_template
309
+ elif custom_template:
310
+ # Replace placeholders in custom template
311
+ prompt = custom_template
312
+ prompt = prompt.replace("{input_block}", input_text or "")
313
+ prompt = prompt.replace("{model_outputs}", formatted_outputs)
314
+ if ground_truth:
315
+ prompt = prompt.replace("{ground_truth}", ground_truth)
316
+ # Handle optional ground_truth_section placeholder
317
+ prompt = prompt.replace("{ground_truth_section}", "")
318
+ else:
319
+ # Use default template
320
+ if ground_truth:
321
+ template = DEFAULT_SINGLE_SHOT_WITH_GT_TEMPLATE
322
+ prompt = template.replace("{input_block}", input_text or "")
323
+ prompt = prompt.replace("{model_outputs}", formatted_outputs)
324
+ prompt = prompt.replace("{ground_truth}", ground_truth)
325
+ prompt = prompt.replace("{ground_truth_section}", "")
326
+ else:
327
+ template = DEFAULT_SINGLE_SHOT_TEMPLATE
328
+ prompt = template.replace("{input_block}", input_text or "")
329
+ prompt = prompt.replace("{model_outputs}", formatted_outputs)
330
+ prompt = prompt.replace("{ground_truth_section}", "")
331
+
332
+ # Use judge's internal voting mechanism
333
+ # We'll call judge with the constructed prompt
334
+ if use_fully_custom_prompt:
335
+ judge_result = judge.judge(prompt=prompt, max_tokens=max_tokens)
336
+ else:
337
+ # Pass empty values for standard params since we built prompt manually
338
+ # This is a bit hacky but works with current judge implementation
339
+ old_template = judge.template
340
+ judge.template = "{input_block}"
341
+ judge_result = judge.judge(input=prompt, model_output="", ground_truth="", max_tokens=max_tokens)
342
+ judge.template = old_template
343
+
344
+ # Parse each vote
345
+ raw_votes = judge_result.get("votes", [])
346
+ parsed_votes = []
347
+
348
+ for vote in raw_votes:
349
+ model = vote.get("model")
350
+ # Get raw response - need to call parser on it
351
+ # Since we used judge.judge(), the result is already in vote["result"]
352
+ # But we need the raw string to parse. Let's re-call the models manually.
353
+ pass
354
+
355
+ # Actually, let's refactor: we need direct model access for ranking
356
+ # The judge.judge() flow doesn't give us raw strings back
357
+ # Let's call models directly
358
+
359
+ votes = []
360
+ for model_name in judge.models:
361
+ try:
362
+ api_base, headers, temperature = judge._resolve_per_model(model_name)
363
+ raw_response = judge._attempt_completion(
364
+ model=model_name,
365
+ api_base=api_base,
366
+ headers=headers,
367
+ prompt=prompt,
368
+ temperature=temperature,
369
+ max_tokens=max_tokens,
370
+ )
371
+
372
+ parsed = output_parser(raw_response)
373
+ votes.append({
374
+ "model": model_name,
375
+ "raw_response": raw_response,
376
+ "parsed": parsed,
377
+ })
378
+
379
+ if judge.verbose:
380
+ print(f"Model {model_name} ranking: {parsed}", flush=True)
381
+
382
+ except Exception as e:
383
+ if judge.verbose:
384
+ print(f"Model {model_name} failed: {e}", flush=True)
385
+ votes.append({
386
+ "model": model_name,
387
+ "error": str(e),
388
+ "parsed": None,
389
+ })
390
+
391
+ # Handle custom generation functions
392
+ for idx, custom_fn in enumerate(judge.custom_generation_fns):
393
+ try:
394
+ raw_response = custom_fn(prompt)
395
+ parsed = output_parser(raw_response)
396
+ votes.append({
397
+ "model": f"custom_fn_{idx}",
398
+ "raw_response": raw_response,
399
+ "parsed": parsed,
400
+ })
401
+ except Exception as e:
402
+ if judge.verbose:
403
+ print(f"Custom function {idx} failed: {e}", flush=True)
404
+ votes.append({
405
+ "model": f"custom_fn_{idx}",
406
+ "error": str(e),
407
+ "parsed": None,
408
+ })
409
+
410
+ # Aggregate votes based on mode
411
+ mode = judge.mode
412
+ valid_votes = [v for v in votes if v.get("parsed") is not None]
413
+
414
+ if not valid_votes:
415
+ raise ValueError("No valid ranking votes received")
416
+
417
+ # Determine output type (ordering vs scores)
418
+ first_parsed = valid_votes[0]["parsed"]
419
+
420
+ if isinstance(first_parsed, list):
421
+ # Ordering format: ["A", "C", "B"]
422
+ if mode == "single":
423
+ final_labels = valid_votes[0]["parsed"]
424
+ elif mode == "majority":
425
+ # Use first valid vote for ordering (majority doesn't make sense for orderings)
426
+ # Could implement Borda count or similar, but for simplicity use first
427
+ final_labels = valid_votes[0]["parsed"]
428
+ else:
429
+ final_labels = valid_votes[0]["parsed"]
430
+
431
+ final_ranking = _labels_to_indices(final_labels, num_outputs)
432
+ final_scores = None
433
+
434
+ elif isinstance(first_parsed, dict):
435
+ # Score format: {"A": 9.2, "B": 7.1, "C": 8.5}
436
+ if mode == "single":
437
+ final_scores = valid_votes[0]["parsed"]
438
+ elif mode in ("majority", "average"):
439
+ # Average scores across judges
440
+ all_scores = {}
441
+ for vote in valid_votes:
442
+ scores = vote["parsed"]
443
+ if isinstance(scores, dict):
444
+ for label, score in scores.items():
445
+ label = str(label).upper()
446
+ if label not in all_scores:
447
+ all_scores[label] = []
448
+ all_scores[label].append(float(score))
449
+
450
+ final_scores = {k: sum(v) / len(v) for k, v in all_scores.items()}
451
+ else:
452
+ final_scores = valid_votes[0]["parsed"]
453
+
454
+ final_ranking = _scores_to_ranking(final_scores, num_outputs)
455
+ final_labels = [chr(65 + i) for i in final_ranking]
456
+
457
+ else:
458
+ raise ValueError(f"Unknown parsed format: {type(first_parsed)}")
459
+
460
+ return {
461
+ "ranking": final_ranking,
462
+ "labels": final_labels if isinstance(first_parsed, list) else [chr(65 + i) for i in final_ranking],
463
+ "scores": final_scores,
464
+ "raw_votes": votes,
465
+ "mode": mode,
466
+ }
467
+
468
+
469
+ def _round_robin_rank(
470
+ judge,
471
+ input_text: str,
472
+ model_outputs: List[str],
473
+ ground_truth: Optional[str],
474
+ output_parser: Callable,
475
+ custom_template: Optional[str],
476
+ use_fully_custom_prompt: bool,
477
+ max_tokens: int,
478
+ ) -> Dict[str, Any]:
479
+ """
480
+ Execute round-robin ranking where judge compares all pairs.
481
+
482
+ For N model_outputs, performs N(N-1)/2 pairwise comparisons.
483
+ Aggregates results into final ranking based on win counts.
484
+
485
+ Returns:
486
+ {
487
+ "ranking": [2, 0, 1], # Indices sorted by wins (descending)
488
+ "wins": {0: 1, 1: 0, 2: 2}, # Win count per output
489
+ "pairwise_results": {(0,1): 0, (0,2): 2, (1,2): 2}, # Winner per pair
490
+ "raw_votes": {...}, # All pairwise judge votes
491
+ }
492
+ """
493
+ n = len(model_outputs)
494
+
495
+ # Initialize tracking
496
+ wins = {i: 0 for i in range(n)}
497
+ pairwise_results = {}
498
+ all_votes = {}
499
+
500
+ # Generate all unique pairs
501
+ pairs = list(combinations(range(n), 2))
502
+
503
+ for i, j in pairs:
504
+ # Build pairwise prompt
505
+ if use_fully_custom_prompt:
506
+ if custom_template is None:
507
+ raise ValueError("use_fully_custom_prompt=True requires custom_template")
508
+ prompt = custom_template
509
+ elif custom_template:
510
+ # Replace placeholders in custom template
511
+ prompt = custom_template
512
+ prompt = prompt.replace("{input_block}", input_text or "")
513
+ prompt = prompt.replace("{option_a}", model_outputs[i])
514
+ prompt = prompt.replace("{option_b}", model_outputs[j])
515
+ if ground_truth:
516
+ prompt = prompt.replace("{ground_truth}", ground_truth)
517
+ else:
518
+ # Use default template
519
+ if ground_truth:
520
+ template = DEFAULT_ROUND_ROBIN_WITH_GT_TEMPLATE
521
+ prompt = template.replace("{input_block}", input_text or "")
522
+ prompt = prompt.replace("{option_a}", model_outputs[i])
523
+ prompt = prompt.replace("{option_b}", model_outputs[j])
524
+ prompt = prompt.replace("{ground_truth}", ground_truth)
525
+ else:
526
+ template = DEFAULT_ROUND_ROBIN_TEMPLATE
527
+ prompt = template.replace("{input_block}", input_text or "")
528
+ prompt = prompt.replace("{option_a}", model_outputs[i])
529
+ prompt = prompt.replace("{option_b}", model_outputs[j])
530
+
531
+ # Collect votes from all judges
532
+ votes = []
533
+
534
+ for model_name in judge.models:
535
+ try:
536
+ api_base, headers, temperature = judge._resolve_per_model(model_name)
537
+ raw_response = judge._attempt_completion(
538
+ model=model_name,
539
+ api_base=api_base,
540
+ headers=headers,
541
+ prompt=prompt,
542
+ temperature=temperature,
543
+ max_tokens=max_tokens,
544
+ )
545
+
546
+ parsed = output_parser(raw_response)
547
+ votes.append({
548
+ "model": model_name,
549
+ "raw_response": raw_response,
550
+ "parsed": parsed,
551
+ })
552
+
553
+ if judge.verbose:
554
+ print(f"Pair ({i},{j}): Model {model_name} raw response: {repr(raw_response)}", flush=True)
555
+ print(f"Pair ({i},{j}): Model {model_name} voted: {parsed}", flush=True)
556
+
557
+ except Exception as e:
558
+ if judge.verbose:
559
+ print(f"Pair ({i},{j}): Model {model_name} failed: {e}", flush=True)
560
+ votes.append({
561
+ "model": model_name,
562
+ "error": str(e),
563
+ "parsed": None,
564
+ })
565
+
566
+ # Handle custom generation functions
567
+ for idx, custom_fn in enumerate(judge.custom_generation_fns):
568
+ try:
569
+ raw_response = custom_fn(prompt)
570
+ parsed = output_parser(raw_response)
571
+ votes.append({
572
+ "model": f"custom_fn_{idx}",
573
+ "raw_response": raw_response,
574
+ "parsed": parsed,
575
+ })
576
+ except Exception as e:
577
+ if judge.verbose:
578
+ print(f"Pair ({i},{j}): Custom function {idx} failed: {e}", flush=True)
579
+ votes.append({
580
+ "model": f"custom_fn_{idx}",
581
+ "error": str(e),
582
+ "parsed": None,
583
+ })
584
+
585
+ # Aggregate votes for this pair
586
+ valid_votes = [v for v in votes if v.get("parsed") is not None]
587
+
588
+ if not valid_votes:
589
+ # No valid votes, mark as tie
590
+ pairwise_results[(i, j)] = "tie"
591
+ all_votes[(i, j)] = votes
592
+ continue
593
+
594
+ mode = judge.mode
595
+
596
+ if mode == "single":
597
+ winner = valid_votes[0]["parsed"]
598
+ elif mode in ("majority", "all"):
599
+ # Count votes for A, B, tie
600
+ vote_counts = {"A": 0, "B": 0, "tie": 0}
601
+ for vote in valid_votes:
602
+ result = vote["parsed"]
603
+ if result in vote_counts:
604
+ vote_counts[result] += 1
605
+
606
+ # Determine winner
607
+ if mode == "all":
608
+ # All judges must agree
609
+ if vote_counts["A"] == len(valid_votes):
610
+ winner = "A"
611
+ elif vote_counts["B"] == len(valid_votes):
612
+ winner = "B"
613
+ else:
614
+ winner = "tie"
615
+ else: # majority
616
+ max_votes = max(vote_counts.values())
617
+ # Check for tie in voting
618
+ max_keys = [k for k, v in vote_counts.items() if v == max_votes]
619
+ if len(max_keys) > 1:
620
+ winner = "tie"
621
+ else:
622
+ winner = max_keys[0]
623
+ else:
624
+ winner = valid_votes[0]["parsed"]
625
+
626
+ # Record result
627
+ if winner == "A":
628
+ pairwise_results[(i, j)] = i
629
+ wins[i] += 1
630
+ elif winner == "B":
631
+ pairwise_results[(i, j)] = j
632
+ wins[j] += 1
633
+ else: # tie
634
+ pairwise_results[(i, j)] = "tie"
635
+
636
+ all_votes[(i, j)] = votes
637
+
638
+ # Build final ranking from win counts
639
+ ranking = sorted(range(n), key=lambda idx: wins[idx], reverse=True)
640
+
641
+ return {
642
+ "ranking": ranking,
643
+ "wins": wins,
644
+ "pairwise_results": pairwise_results,
645
+ "raw_votes": all_votes,
646
+ "mode": judge.mode,
647
+ }
648
+
649
+
650
+ def rank(
651
+ judge,
652
+ input: str,
653
+ model_outputs: List[str],
654
+ ground_truth: Optional[str] = None,
655
+ ranking_mode: str = "single_shot",
656
+ output_parser: Optional[Union[str, Callable]] = None,
657
+ custom_template: Optional[str] = None,
658
+ use_fully_custom_prompt: bool = False,
659
+ max_tokens: int = 10000,
660
+ ) -> Dict[str, Any]:
661
+ """
662
+ Rank multiple model outputs using an LLM judge.
663
+
664
+ Args:
665
+ judge: LLMAsAJudge instance configured with models
666
+ input: Original prompt or task description
667
+ model_outputs: List of model outputs to rank
668
+ ground_truth: Optional reference answer. If provided, ranking will be based on
669
+ how well model_outputs match the ground truth. If None, model_outputs are ranked
670
+ purely by quality/preference.
671
+ ranking_mode: "single_shot" or "round_robin"
672
+ output_parser: Parser for judge output. Can be:
673
+ - String: "letter_ordering", "json_scores", "pairwise_winner"
674
+ - Callable: Custom parser function
675
+ - None: Auto-selects based on ranking_mode (defaults to "letter_ordering"
676
+ for single_shot, "pairwise_winner" for round_robin)
677
+ For single_shot: should return List[str] (ordering) or Dict[str, float] (scores)
678
+ For round_robin: should return "A", "B", or "tie"
679
+ custom_template: Optional prompt template with placeholders. If None, uses sensible
680
+ defaults that adapt based on whether ground_truth is provided.
681
+ Available placeholders:
682
+ - single_shot: {input_block}, {model_outputs}, {ground_truth}
683
+ - round_robin: {input_block}, {option_a}, {option_b}, {ground_truth}
684
+ use_fully_custom_prompt: If True, custom_template is used as-is without substitution
685
+ max_tokens: Maximum tokens for judge response
686
+
687
+ Returns:
688
+ Dict with ranking results and metadata. Format depends on ranking_mode:
689
+
690
+ single_shot:
691
+ {
692
+ "ranking": [0, 2, 1], # Indices in rank order
693
+ "labels": ["A", "C", "B"], # Letter labels in rank order
694
+ "scores": {...} or None, # Scores if parser returns dict
695
+ "raw_votes": [...], # Individual judge outputs
696
+ "mode": str, # Aggregation mode used
697
+ }
698
+
699
+ round_robin:
700
+ {
701
+ "ranking": [2, 0, 1], # Indices sorted by wins
702
+ "wins": {0: 1, 1: 0, 2: 2}, # Win count per output
703
+ "pairwise_results": {...}, # Winner per pair
704
+ "raw_votes": {...}, # All pairwise judge votes
705
+ "mode": str, # Aggregation mode used
706
+ }
707
+
708
+ Example:
709
+ >>> from llmasajudge import LLMAsAJudge
710
+ >>>
711
+ >>> judge = LLMAsAJudge(models=["openai/gpt-4o-mini"])
712
+ >>>
713
+ >>> # Using string parser name
714
+ >>> result = judge.rank(
715
+ ... input="Explain recursion",
716
+ ... model_outputs=["Answer 1", "Answer 2", "Answer 3"],
717
+ ... ranking_mode="single_shot",
718
+ ... output_parser="letter_ordering",
719
+ ... custom_template="Rank from best to worst:\\n{model_outputs}\\nReturn: A > B > C"
720
+ ... )
721
+ >>> print(result["ranking"]) # [0, 2, 1]
722
+ """
723
+ if not model_outputs:
724
+ raise ValueError("Must provide at least one model output")
725
+
726
+ if ranking_mode not in ("single_shot", "round_robin"):
727
+ raise ValueError("ranking_mode must be 'single_shot' or 'round_robin'")
728
+
729
+ # Resolve output_parser (string or callable)
730
+ if output_parser is None:
731
+ # Auto-select default parser based on mode
732
+ if ranking_mode == "single_shot":
733
+ output_parser = RankingParsers.letter_ordering
734
+ else: # round_robin
735
+ output_parser = RankingParsers.pairwise_winner
736
+ elif isinstance(output_parser, str):
737
+ # Map string to parser function
738
+ parser_map = {
739
+ 'letter_ordering': RankingParsers.letter_ordering,
740
+ 'json_scores': RankingParsers.json_scores,
741
+ 'pairwise_winner': RankingParsers.pairwise_winner,
742
+ }
743
+ if output_parser not in parser_map:
744
+ raise ValueError(
745
+ f"Unknown parser '{output_parser}'. "
746
+ f"Available: {list(parser_map.keys())}"
747
+ )
748
+ output_parser = parser_map[output_parser]
749
+ # else: assume it's a callable, use as-is
750
+
751
+ if ranking_mode == "single_shot":
752
+ return _single_shot_rank(
753
+ judge=judge,
754
+ input_text=input,
755
+ model_outputs=model_outputs,
756
+ ground_truth=ground_truth,
757
+ output_parser=output_parser,
758
+ custom_template=custom_template,
759
+ use_fully_custom_prompt=use_fully_custom_prompt,
760
+ max_tokens=max_tokens,
761
+ )
762
+ else: # round_robin
763
+ return _round_robin_rank(
764
+ judge=judge,
765
+ input_text=input,
766
+ model_outputs=model_outputs,
767
+ ground_truth=ground_truth,
768
+ output_parser=output_parser,
769
+ custom_template=custom_template,
770
+ use_fully_custom_prompt=use_fully_custom_prompt,
771
+ max_tokens=max_tokens,
772
+ )
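
The stock parsers above are small enough to exercise directly. The assertions below are a quick sanity sketch of their behaviour as written in this diff, assuming llmasajudge 0.1.18 is installed so that llmasajudge.ranker is importable:

    from llmasajudge.ranker import RankingParsers

    # letter_ordering: "A > C > B" -> ["A", "C", "B"]
    assert RankingParsers.letter_ordering("A > C > B") == ["A", "C", "B"]

    # json_scores: tolerates markdown fences and upper-cases the labels
    assert RankingParsers.json_scores('```json\n{"a": 9, "b": 7.5}\n```') == {"A": 9.0, "B": 7.5}

    # pairwise_winner: exact letters, "Answer: X" style declarations, and tie words
    assert RankingParsers.pairwise_winner("Answer: B") == "B"
    assert RankingParsers.pairwise_winner("They are equal") == "tie"
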
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: llmasajudge
3
- Version: 0.1.14
3
+ Version: 0.1.18
4
4
  Summary: LLM Judge: simple right/wrong voting across models
5
5
  Author-email: Brett Young <byyoung3@gmail.com>
6
6
  Project-URL: Homepage, https://example.com
@@ -0,0 +1,6 @@
1
+ llmasajudge/__init__.py,sha256=TGVADN77vQtKy3JBGLe9F578jVCZ_Vz055P1CIk2vIQ,65215
2
+ llmasajudge/ranker.py,sha256=2Nr-J1DNPYVIja2Fl-ksuvOnJPEwYmfylDkdlYqCWtE,26829
3
+ llmasajudge-0.1.18.dist-info/METADATA,sha256=lV63AvuLpdzAjhVgN5PQr-2fiGn84QQizBMPLDYWsV0,515
4
+ llmasajudge-0.1.18.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
5
+ llmasajudge-0.1.18.dist-info/top_level.txt,sha256=rRaIpM1llpEqahR9flT3RjpZHi2o16iOgnGYJ8cO4_0,12
6
+ llmasajudge-0.1.18.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,5 +0,0 @@
1
- llmasajudge/__init__.py,sha256=OKaafNDE_1vOIPZshLrs37kGvSq5QXSHIWA9AVmeVTU,61627
2
- llmasajudge-0.1.14.dist-info/METADATA,sha256=xsjEyt76cmEvBd9Vn99ZevnhgRJ4HpBogHoysvZGCas,515
3
- llmasajudge-0.1.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
4
- llmasajudge-0.1.14.dist-info/top_level.txt,sha256=rRaIpM1llpEqahR9flT3RjpZHi2o16iOgnGYJ8cO4_0,12
5
- llmasajudge-0.1.14.dist-info/RECORD,,