llmasajudge 0.1.14__py3-none-any.whl → 0.1.18__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- llmasajudge/__init__.py +114 -18
- llmasajudge/ranker.py +772 -0
- {llmasajudge-0.1.14.dist-info → llmasajudge-0.1.18.dist-info}/METADATA +1 -1
- llmasajudge-0.1.18.dist-info/RECORD +6 -0
- {llmasajudge-0.1.14.dist-info → llmasajudge-0.1.18.dist-info}/WHEEL +1 -1
- llmasajudge-0.1.14.dist-info/RECORD +0 -5
- {llmasajudge-0.1.14.dist-info → llmasajudge-0.1.18.dist-info}/top_level.txt +0 -0
llmasajudge/__init__.py
CHANGED
@@ -875,6 +875,7 @@ class ReturnType(Enum):
     BOOLEAN = "boolean"
     SCALAR = "scalar"
     MAP = "map"
+    STRING = "string"  # For arbitrary string returns (categories, choices, etc.)


 class AggregationMode(Enum):
@@ -888,6 +889,7 @@ class AggregationMode(Enum):
     MIN = "min"
     MAX = "max"
     MEDIAN = "median"
+    # String modes - uses MAJORITY and SINGLE from above


 # Valid aggregation modes per return type
@@ -895,6 +897,7 @@ VALID_MODES = {
     ReturnType.BOOLEAN: {AggregationMode.MAJORITY, AggregationMode.SINGLE, AggregationMode.ALL},
     ReturnType.SCALAR: {AggregationMode.AVERAGE, AggregationMode.MIN, AggregationMode.MAX, AggregationMode.MEDIAN, AggregationMode.SINGLE},
     ReturnType.MAP: {AggregationMode.AVERAGE, AggregationMode.MIN, AggregationMode.MAX, AggregationMode.MEDIAN, AggregationMode.SINGLE},
+    ReturnType.STRING: {AggregationMode.MAJORITY, AggregationMode.SINGLE},
 }

 # Default aggregation modes per return type
@@ -902,6 +905,7 @@ DEFAULT_MODES = {
     ReturnType.BOOLEAN: AggregationMode.MAJORITY,
     ReturnType.SCALAR: AggregationMode.AVERAGE,
     ReturnType.MAP: AggregationMode.AVERAGE,
+    ReturnType.STRING: AggregationMode.MAJORITY,
 }

 # String to enum mapping (for backward compat)
@@ -1115,6 +1119,8 @@ def _infer_return_type(value: Any) -> Optional[ReturnType]:
         return ReturnType.SCALAR
     if isinstance(value, dict) and all(isinstance(v, (int, float)) for v in value.values()):
         return ReturnType.MAP
+    if isinstance(value, str):
+        return ReturnType.STRING
     return None


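Taken together, the five hunks above thread a new STRING return type through the enum, the valid- and default-aggregation-mode tables, and type inference. Below is a standalone sketch of that inference step; ReturnType and infer_return_type here are local stand-ins, not the package's own objects.

# Standalone sketch of the STRING inference added above (stand-in enum and function).
from enum import Enum
from typing import Any, Optional


class ReturnType(Enum):
    BOOLEAN = "boolean"
    SCALAR = "scalar"
    MAP = "map"
    STRING = "string"


def infer_return_type(value: Any) -> Optional[ReturnType]:
    # Bool/number checks are assumed from the unchanged part of _infer_return_type;
    # the str branch is the 0.1.18 addition shown in the hunk above.
    if isinstance(value, bool):
        return ReturnType.BOOLEAN
    if isinstance(value, (int, float)):
        return ReturnType.SCALAR
    if isinstance(value, dict) and all(isinstance(v, (int, float)) for v in value.values()):
        return ReturnType.MAP
    if isinstance(value, str):
        return ReturnType.STRING
    return None


print(infer_return_type("category_b"))          # ReturnType.STRING
print(infer_return_type({"helpfulness": 0.9}))  # ReturnType.MAP

Since DEFAULT_MODES maps STRING to MAJORITY, a string-returning rubric is aggregated by majority vote unless SINGLE is requested.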
@@ -1374,15 +1380,21 @@ Output only valid JSON. No explanation. No extra text.""",
         last_err = None
         for i in range(attempts):
             try:
-
-
-
-
-
-                    max_tokens
-                    extra_headers
-                    caching
-
+                # GPT-5 models don't accept temperature argument
+                completion_kwargs = {
+                    "model": model,
+                    "api_base": api_base,
+                    "messages": [{"role": "user", "content": prompt}],
+                    "max_tokens": max_tokens,
+                    "extra_headers": headers,
+                    "caching": self.cache_enabled
+                }
+
+                # Only add temperature if NOT a gpt-5 model
+                if "gpt-5" not in model.lower():
+                    completion_kwargs["temperature"] = temperature
+
+                resp = completion(**completion_kwargs)
                 return (resp.choices[0].message.content or "").strip()
             except Exception as e:
                 last_err = e
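The rewritten block above assembles the completion() arguments in a dict and only includes temperature when the model name does not contain "gpt-5". A minimal sketch of just that kwargs-building step (hypothetical helper name; no API call is made):

# Sketch of the conditional-kwargs pattern introduced above; nothing is sent to an API here.
from typing import Any, Dict


def build_completion_kwargs(model: str, prompt: str, temperature: float, max_tokens: int) -> Dict[str, Any]:
    kwargs: Dict[str, Any] = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
    }
    # Mirror of the new branch: GPT-5 models don't accept a temperature argument.
    if "gpt-5" not in model.lower():
        kwargs["temperature"] = temperature
    return kwargs


print("temperature" in build_completion_kwargs("openai/gpt-4o-mini", "hi", 0.0, 64))  # True
print("temperature" in build_completion_kwargs("openai/gpt-5", "hi", 0.0, 64))        # False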
@@ -1453,20 +1465,20 @@ Output only valid JSON. No explanation. No extra text.""",
         valid = [v["result"] for v in votes if v["result"] is not None and isinstance(v["result"], dict)]
         if not valid:
             raise ValueError("No valid map votes to aggregate")
-
+
         keys = set()
         for v in valid:
             keys.update(v.keys())
-
+
         if self._mode == AggregationMode.SINGLE:
             return valid[0]
-
+
         result = {}
         for key in keys:
             values = [v[key] for v in valid if key in v]
             if not values:
                 continue
-
+
             if self._mode == AggregationMode.AVERAGE:
                 result[key] = sum(values) / len(values)
             elif self._mode == AggregationMode.MIN:
@@ -1478,9 +1490,38 @@ Output only valid JSON. No explanation. No extra text.""",
                 n = len(s)
                 mid = n // 2
                 result[key] = (s[mid - 1] + s[mid]) / 2 if n % 2 == 0 else s[mid]
-
+
         return result

+    def _aggregate_string(self, votes: List[Dict[str, Any]]) -> str:
+        """
+        Aggregate string votes with tie detection.
+        Returns the majority string, or "tie" if there's no clear majority.
+        """
+        results = [v["result"] for v in votes if v["result"] is not None and isinstance(v["result"], str)]
+        if not results:
+            raise ValueError("No valid string votes to aggregate")
+
+        if self._mode == AggregationMode.SINGLE:
+            return results[0]
+
+        # Count occurrences
+        from collections import Counter
+        counts = Counter(results)
+
+        # Get the most common
+        most_common = counts.most_common()
+
+        if len(most_common) == 0:
+            raise ValueError("No valid string votes to aggregate")
+
+        # Check for tie: if top two have same count
+        if len(most_common) > 1 and most_common[0][1] == most_common[1][1]:
+            return "tie"
+
+        # Return the majority
+        return most_common[0][0]
+
     def judge(
         self,
         input: Any = None,
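The new _aggregate_string helper returns the plurality answer, and the literal "tie" when the two most frequent answers are equally common. The same logic in isolation (a standalone function, not the class method):

# Standalone sketch of the tie-detection logic in _aggregate_string above.
from collections import Counter
from typing import List


def aggregate_string(results: List[str]) -> str:
    if not results:
        raise ValueError("No valid string votes to aggregate")
    most_common = Counter(results).most_common()
    # Tie: the two most frequent answers have the same count.
    if len(most_common) > 1 and most_common[0][1] == most_common[1][1]:
        return "tie"
    return most_common[0][0]


print(aggregate_string(["yes", "yes", "no"]))  # "yes"
print(aggregate_string(["yes", "no"]))         # "tie"

One consequence of this design is that "tie" doubles as a sentinel, so a genuine majority vote for the string "tie" is indistinguishable from a deadlock.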
@@ -1577,13 +1618,16 @@ Output only valid JSON. No explanation. No extra text.""",
             final = self._aggregate_scalar(votes)
         elif return_type == ReturnType.MAP:
             final = self._aggregate_map(votes)
+        elif return_type == ReturnType.STRING:
+            final = self._aggregate_string(votes)
         else:
             raise ValueError(f"Unknown return type: {return_type}")

         # Build backward-compatible response
-        # Boolean: correct=bool, scores=None
-        # Scalar: correct=score, scores=score (both fields for convenience)
-        # Map: correct=None, scores=map
+        # Boolean: correct=bool, scores=None, result=bool
+        # Scalar: correct=score, scores=score, result=score (both fields for convenience)
+        # Map: correct=None, scores=map, result=map
+        # String: correct=string, scores=None, result=string
         if return_type == ReturnType.BOOLEAN:
             # Also put "correct" in each vote for backward compat
             for v in votes:
@@ -1591,6 +1635,7 @@ Output only valid JSON. No explanation. No extra text.""",
             return {
                 "correct": final,
                 "scores": None,
+                "result": final,
                 "mode": self.mode,
                 "votes": votes,
             }
@@ -1599,6 +1644,16 @@ Output only valid JSON. No explanation. No extra text.""",
             return {
                 "correct": final,
                 "scores": final,
+                "result": final,
+                "mode": self.mode,
+                "votes": votes,
+            }
+        elif return_type == ReturnType.STRING:
+            # For string, put result in correct field
+            return {
+                "correct": final,
+                "scores": None,
+                "result": final,
                 "mode": self.mode,
                 "votes": votes,
             }
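Across the response-building hunks above (and the map branch that follows), every return type now carries a result key next to the legacy correct/scores fields, so new callers can read one field while old callers keep working. A sketch of what that means for consumers; the dict literal is a hypothetical stand-in for a real judge.judge() return value:

# Hypothetical judge.judge() return value for a STRING-type rubric, per the hunks above.
response = {
    "correct": "category_b",   # legacy field, still populated
    "scores": None,            # legacy field, unused for string results
    "result": "category_b",    # uniform field added in 0.1.18
    "mode": "majority",
    "votes": [],
}

final = response["result"]     # same access path for boolean, scalar, map, and string judges
print(final)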
@@ -1606,6 +1661,47 @@ Output only valid JSON. No explanation. No extra text.""",
             return {
                 "correct": None,
                 "scores": final,
+                "result": final,
                 "mode": self.mode,
                 "votes": votes,
-            }
+            }
+
+    def rank(
+        self,
+        input: str,
+        model_outputs: List[str],
+        ground_truth: Optional[str] = None,
+        ranking_mode: str = "single_shot",
+        output_parser: Optional[Callable] = None,
+        custom_template: Optional[str] = None,
+        use_fully_custom_prompt: bool = False,
+        max_tokens: int = 10000,
+    ) -> Dict[str, Any]:
+        """
+        Rank multiple model outputs.
+
+        Args:
+            input: Original prompt or task description
+            model_outputs: List of model outputs to rank
+            ground_truth: Optional reference answer
+            ranking_mode: "single_shot" or "round_robin"
+            output_parser: Function to parse judge output
+            custom_template: Prompt template with placeholders
+            use_fully_custom_prompt: If True, template used as-is
+            max_tokens: Maximum tokens for judge response
+
+        Returns:
+            Dict with ranking results (see ranker.rank() for details)
+        """
+        from llmasajudge.ranker import rank as _rank
+        return _rank(
+            judge=self,
+            input=input,
+            model_outputs=model_outputs,
+            ground_truth=ground_truth,
+            ranking_mode=ranking_mode,
+            output_parser=output_parser,
+            custom_template=custom_template,
+            use_fully_custom_prompt=use_fully_custom_prompt,
+            max_tokens=max_tokens,
+        )
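The new rank() method is a thin wrapper that forwards to llmasajudge.ranker.rank, shown in full below. A usage sketch, assuming credentials for the listed model are configured; the model name and candidate outputs are illustrative:

# Usage sketch for the new judge.rank() wrapper added in 0.1.18.
from llmasajudge import LLMAsAJudge

judge = LLMAsAJudge(models=["openai/gpt-4o-mini"])

result = judge.rank(
    input="Explain recursion simply",
    model_outputs=["Answer 1", "Answer 2", "Answer 3"],
    ranking_mode="single_shot",          # judge sees all candidates at once
    output_parser="letter_ordering",     # parse "A > C > B" style output
)

print(result["ranking"])  # e.g. [0, 2, 1] — candidate indices from best to worst
print(result["labels"])   # e.g. ["A", "C", "B"]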
llmasajudge/ranker.py
ADDED
@@ -0,0 +1,772 @@
"""
LLMAsAJudge Ranking Extensions

Provides relative ranking functionality for evaluating multiple model outputs.

Supports two ranking modes:
1. single_shot: Judge sees all model_outputs at once and returns ranking/scores
2. round_robin: Judge compares model_outputs pairwise, results are aggregated

Usage:
    from llmasajudge import LLMAsAJudge
    from llmasajudge.ranker import rank

    judge = LLMAsAJudge(models=["openai/gpt-4o-mini"])

    result = rank(
        judge=judge,
        input="Explain recursion simply",
        model_outputs=["Answer 1", "Answer 2", "Answer 3"],
        ranking_mode="single_shot",
        output_parser=ranking_parser,
        custom_template=template
    )
"""

import re
import json
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from itertools import combinations


__all__ = ["rank", "RankingParsers"]


# Default templates for ranking
DEFAULT_SINGLE_SHOT_TEMPLATE = """\
Rank the following candidate responses from BEST to WORST.
{ground_truth_section}
Task/Input:
{input_block}

Candidates:
{model_outputs}

Provide your ranking using the format: A > B > C > D (etc)
Return ONLY the ranking, no explanation."""

DEFAULT_SINGLE_SHOT_WITH_GT_TEMPLATE = """\
Rank the following candidate responses from BEST to WORST based on how well they match the ground truth answer.
{ground_truth_section}
Task/Input:
{input_block}

Ground Truth Answer:
{ground_truth}

Candidates:
{model_outputs}

Provide your ranking using the format: A > B > C > D (etc)
Return ONLY the ranking, no explanation."""

DEFAULT_ROUND_ROBIN_TEMPLATE = """\
Compare the following two responses and determine which is better.

Task/Input:
{input_block}

Option A:
{option_a}

Option B:
{option_b}

Which response is better? Return exactly one of: A, B, or tie"""

DEFAULT_ROUND_ROBIN_WITH_GT_TEMPLATE = """\
Compare the following two responses based on how well they match the ground truth answer.

Task/Input:
{input_block}

Ground Truth Answer:
{ground_truth}

Option A:
{option_a}

Option B:
{option_b}

Which response better matches the ground truth? Return exactly one of: A, B, or tie"""


class RankingParsers:
    """Stock output parsers for ranking tasks."""

    @staticmethod
    def letter_ordering(response: str) -> List[str]:
        """
        Parse ordering like "A > C > B" into ["A", "C", "B"].
        Handles various separators: >, ->, =>
        """
        if not response:
            return []

        # Try different separators
        for sep in [">", "->", "=>"]:
            if sep in response:
                parts = [x.strip().upper() for x in response.split(sep)]
                # Filter to single letters only
                return [p for p in parts if len(p) == 1 and p.isalpha()]

        # Fallback: extract all single letters in order
        letters = re.findall(r'\b([A-Z])\b', response.upper())
        return letters

    @staticmethod
    def json_scores(response: str) -> Optional[Dict[str, float]]:
        """
        Parse JSON like {"A": 9.2, "B": 7.1, "C": 8.5}.
        Returns dict mapping candidate labels to scores.
        """
        if not response:
            return None

        try:
            s = response.strip()

            # Handle markdown code blocks
            if "```json" in s.lower():
                start = s.lower().find("```json") + 7
                end = s.find("```", start)
                if end > start:
                    s = s[start:end].strip()
            elif "```" in s:
                start = s.find("```") + 3
                end = s.find("```", start)
                if end > start:
                    s = s[start:end].strip()

            # Extract JSON object
            if '{' in s and '}' in s:
                start_brace = s.find('{')
                end_brace = s.rfind('}')
                if start_brace < end_brace:
                    s = s[start_brace:end_brace + 1]

            data = json.loads(s)
            if not isinstance(data, dict):
                return None

            # Convert all values to float
            result = {}
            for key, val in data.items():
                if isinstance(val, (int, float)):
                    result[str(key).upper()] = float(val)
                elif isinstance(val, str):
                    try:
                        result[str(key).upper()] = float(val)
                    except ValueError:
                        pass

            return result if result else None

        except (json.JSONDecodeError, ValueError):
            return None

    @staticmethod
    def pairwise_winner(response: str) -> Optional[str]:
        """
        Parse pairwise comparison: "A", "B", or "tie".
        Returns "A", "B", "tie", or None if unparseable.
        """
        if not response:
            return None

        text = response.strip().upper()

        # Exact matches
        if text == "A":
            return "A"
        if text == "B":
            return "B"
        if text == "TIE" or text == "TIED":
            return "tie"

        # Check for tie first (more specific)
        if "TIE" in text or "TIED" in text or "DRAW" in text or "EQUAL" in text:
            return "tie"

        # Look for explicit answer patterns like "Answer: A", "Winner: B", "A is better", etc.
        # Match word boundaries to avoid false positives
        # Pattern to find answer declarations
        answer_patterns = [
            r'\bANSWER\s*:?\s*([AB])\b',
            r'\bWINNER\s*:?\s*([AB])\b',
            r'\bCHOOSE\s*:?\s*([AB])\b',
            r'\bSELECT\s*:?\s*([AB])\b',
            r'\bRESPONSE\s*:?\s*([AB])\b',
            r'\bOPTION\s*:?\s*([AB])\b',
            r'^\s*([AB])\s*$',  # Just "A" or "B" alone
            r'\b([AB])\s+IS\s+BETTER\b',
            r'\bBETTER\s*:?\s*([AB])\b',
        ]

        for pattern in answer_patterns:
            match = re.search(pattern, text)
            if match:
                return match.group(1)

        # Fallback: simple presence check (only if one appears more prominently)
        # Count standalone occurrences
        a_count = len(re.findall(r'\bA\b', text))
        b_count = len(re.findall(r'\bB\b', text))

        # If one clearly dominates, use it
        if a_count > b_count and b_count == 0:
            return "A"
        if b_count > a_count and a_count == 0:
            return "B"

        # Last resort: check if only one appears at all
        if "A" in text and "B" not in text:
            return "A"
        if "B" in text and "A" not in text:
            return "B"

        return None


def _format_model_outputs(model_outputs: List[str]) -> str:
    """
    Format model_outputs as labeled blocks:
    A)
    <output 0>

    B)
    <output 1>
    """
    labels = [chr(65 + i) for i in range(len(model_outputs))]  # A, B, C, ...
    blocks = []
    for label, output in zip(labels, model_outputs):
        blocks.append(f"{label})\n{output}")
    return "\n\n".join(blocks)


def _labels_to_indices(labels: List[str], num_outputs: int) -> List[int]:
    """
    Convert letter labels ["A", "C", "B"] to indices [0, 2, 1].
    """
    indices = []
    for label in labels:
        if len(label) != 1 or not label.isalpha():
            continue
        idx = ord(label.upper()) - 65  # A=0, B=1, etc.
        if 0 <= idx < num_outputs:
            indices.append(idx)
    return indices


def _scores_to_ranking(scores: Dict[str, float], num_outputs: int) -> List[int]:
    """
    Convert score dict {"A": 9, "B": 7, "C": 8} to ranking [0, 2, 1] (descending).
    """
    # Normalize keys to uppercase letters
    normalized = {}
    for k, v in scores.items():
        label = str(k).upper()
        if len(label) == 1 and label.isalpha():
            idx = ord(label) - 65
            if 0 <= idx < num_outputs:
                normalized[idx] = float(v)

    # Sort by score descending
    sorted_indices = sorted(normalized.keys(), key=lambda i: normalized[i], reverse=True)
    return sorted_indices


def _single_shot_rank(
    judge,
    input_text: str,
    model_outputs: List[str],
    ground_truth: Optional[str],
    output_parser: Callable,
    custom_template: Optional[str],
    use_fully_custom_prompt: bool,
    max_tokens: int,
) -> Dict[str, Any]:
    """
    Execute single-shot ranking where judge sees all model_outputs at once.

    Returns:
        {
            "ranking": [0, 2, 1],  # Indices in rank order
            "labels": ["A", "C", "B"],  # Letter labels in rank order
            "scores": {"A": 9.2, "B": 7.1, "C": 8.5} or None,
            "raw_votes": [...],  # Individual judge outputs
        }
    """
    num_outputs = len(model_outputs)
    formatted_outputs = _format_model_outputs(model_outputs)

    # Build prompt
    if use_fully_custom_prompt:
        if custom_template is None:
            raise ValueError("use_fully_custom_prompt=True requires custom_template")
        prompt = custom_template
    elif custom_template:
        # Replace placeholders in custom template
        prompt = custom_template
        prompt = prompt.replace("{input_block}", input_text or "")
        prompt = prompt.replace("{model_outputs}", formatted_outputs)
        if ground_truth:
            prompt = prompt.replace("{ground_truth}", ground_truth)
        # Handle optional ground_truth_section placeholder
        prompt = prompt.replace("{ground_truth_section}", "")
    else:
        # Use default template
        if ground_truth:
            template = DEFAULT_SINGLE_SHOT_WITH_GT_TEMPLATE
            prompt = template.replace("{input_block}", input_text or "")
            prompt = prompt.replace("{model_outputs}", formatted_outputs)
            prompt = prompt.replace("{ground_truth}", ground_truth)
            prompt = prompt.replace("{ground_truth_section}", "")
        else:
            template = DEFAULT_SINGLE_SHOT_TEMPLATE
            prompt = template.replace("{input_block}", input_text or "")
            prompt = prompt.replace("{model_outputs}", formatted_outputs)
            prompt = prompt.replace("{ground_truth_section}", "")

    # Use judge's internal voting mechanism
    # We'll call judge with the constructed prompt
    if use_fully_custom_prompt:
        judge_result = judge.judge(prompt=prompt, max_tokens=max_tokens)
    else:
        # Pass empty values for standard params since we built prompt manually
        # This is a bit hacky but works with current judge implementation
        old_template = judge.template
        judge.template = "{input_block}"
        judge_result = judge.judge(input=prompt, model_output="", ground_truth="", max_tokens=max_tokens)
        judge.template = old_template

    # Parse each vote
    raw_votes = judge_result.get("votes", [])
    parsed_votes = []

    for vote in raw_votes:
        model = vote.get("model")
        # Get raw response - need to call parser on it
        # Since we used judge.judge(), the result is already in vote["result"]
        # But we need the raw string to parse. Let's re-call the models manually.
        pass

    # Actually, let's refactor: we need direct model access for ranking
    # The judge.judge() flow doesn't give us raw strings back
    # Let's call models directly

    votes = []
    for model_name in judge.models:
        try:
            api_base, headers, temperature = judge._resolve_per_model(model_name)
            raw_response = judge._attempt_completion(
                model=model_name,
                api_base=api_base,
                headers=headers,
                prompt=prompt,
                temperature=temperature,
                max_tokens=max_tokens,
            )

            parsed = output_parser(raw_response)
            votes.append({
                "model": model_name,
                "raw_response": raw_response,
                "parsed": parsed,
            })

            if judge.verbose:
                print(f"Model {model_name} ranking: {parsed}", flush=True)

        except Exception as e:
            if judge.verbose:
                print(f"Model {model_name} failed: {e}", flush=True)
            votes.append({
                "model": model_name,
                "error": str(e),
                "parsed": None,
            })

    # Handle custom generation functions
    for idx, custom_fn in enumerate(judge.custom_generation_fns):
        try:
            raw_response = custom_fn(prompt)
            parsed = output_parser(raw_response)
            votes.append({
                "model": f"custom_fn_{idx}",
                "raw_response": raw_response,
                "parsed": parsed,
            })
        except Exception as e:
            if judge.verbose:
                print(f"Custom function {idx} failed: {e}", flush=True)
            votes.append({
                "model": f"custom_fn_{idx}",
                "error": str(e),
                "parsed": None,
            })

    # Aggregate votes based on mode
    mode = judge.mode
    valid_votes = [v for v in votes if v.get("parsed") is not None]

    if not valid_votes:
        raise ValueError("No valid ranking votes received")

    # Determine output type (ordering vs scores)
    first_parsed = valid_votes[0]["parsed"]

    if isinstance(first_parsed, list):
        # Ordering format: ["A", "C", "B"]
        if mode == "single":
            final_labels = valid_votes[0]["parsed"]
        elif mode == "majority":
            # Use first valid vote for ordering (majority doesn't make sense for orderings)
            # Could implement Borda count or similar, but for simplicity use first
            final_labels = valid_votes[0]["parsed"]
        else:
            final_labels = valid_votes[0]["parsed"]

        final_ranking = _labels_to_indices(final_labels, num_outputs)
        final_scores = None

    elif isinstance(first_parsed, dict):
        # Score format: {"A": 9.2, "B": 7.1, "C": 8.5}
        if mode == "single":
            final_scores = valid_votes[0]["parsed"]
        elif mode in ("majority", "average"):
            # Average scores across judges
            all_scores = {}
            for vote in valid_votes:
                scores = vote["parsed"]
                if isinstance(scores, dict):
                    for label, score in scores.items():
                        label = str(label).upper()
                        if label not in all_scores:
                            all_scores[label] = []
                        all_scores[label].append(float(score))

            final_scores = {k: sum(v) / len(v) for k, v in all_scores.items()}
        else:
            final_scores = valid_votes[0]["parsed"]

        final_ranking = _scores_to_ranking(final_scores, num_outputs)
        final_labels = [chr(65 + i) for i in final_ranking]

    else:
        raise ValueError(f"Unknown parsed format: {type(first_parsed)}")

    return {
        "ranking": final_ranking,
        "labels": final_labels if isinstance(first_parsed, list) else [chr(65 + i) for i in final_ranking],
        "scores": final_scores,
        "raw_votes": votes,
        "mode": mode,
    }


def _round_robin_rank(
    judge,
    input_text: str,
    model_outputs: List[str],
    ground_truth: Optional[str],
    output_parser: Callable,
    custom_template: Optional[str],
    use_fully_custom_prompt: bool,
    max_tokens: int,
) -> Dict[str, Any]:
    """
    Execute round-robin ranking where judge compares all pairs.

    For N model_outputs, performs N(N-1)/2 pairwise comparisons.
    Aggregates results into final ranking based on win counts.

    Returns:
        {
            "ranking": [2, 0, 1],  # Indices sorted by wins (descending)
            "wins": {0: 1, 1: 0, 2: 2},  # Win count per output
            "pairwise_results": {(0,1): 0, (0,2): 2, (1,2): 2},  # Winner per pair
            "raw_votes": {...},  # All pairwise judge votes
        }
    """
    n = len(model_outputs)

    # Initialize tracking
    wins = {i: 0 for i in range(n)}
    pairwise_results = {}
    all_votes = {}

    # Generate all unique pairs
    pairs = list(combinations(range(n), 2))

    for i, j in pairs:
        # Build pairwise prompt
        if use_fully_custom_prompt:
            if custom_template is None:
                raise ValueError("use_fully_custom_prompt=True requires custom_template")
            prompt = custom_template
        elif custom_template:
            # Replace placeholders in custom template
            prompt = custom_template
            prompt = prompt.replace("{input_block}", input_text or "")
            prompt = prompt.replace("{option_a}", model_outputs[i])
            prompt = prompt.replace("{option_b}", model_outputs[j])
            if ground_truth:
                prompt = prompt.replace("{ground_truth}", ground_truth)
        else:
            # Use default template
            if ground_truth:
                template = DEFAULT_ROUND_ROBIN_WITH_GT_TEMPLATE
                prompt = template.replace("{input_block}", input_text or "")
                prompt = prompt.replace("{option_a}", model_outputs[i])
                prompt = prompt.replace("{option_b}", model_outputs[j])
                prompt = prompt.replace("{ground_truth}", ground_truth)
            else:
                template = DEFAULT_ROUND_ROBIN_TEMPLATE
                prompt = template.replace("{input_block}", input_text or "")
                prompt = prompt.replace("{option_a}", model_outputs[i])
                prompt = prompt.replace("{option_b}", model_outputs[j])

        # Collect votes from all judges
        votes = []

        for model_name in judge.models:
            try:
                api_base, headers, temperature = judge._resolve_per_model(model_name)
                raw_response = judge._attempt_completion(
                    model=model_name,
                    api_base=api_base,
                    headers=headers,
                    prompt=prompt,
                    temperature=temperature,
                    max_tokens=max_tokens,
                )

                parsed = output_parser(raw_response)
                votes.append({
                    "model": model_name,
                    "raw_response": raw_response,
                    "parsed": parsed,
                })

                if judge.verbose:
                    print(f"Pair ({i},{j}): Model {model_name} raw response: {repr(raw_response)}", flush=True)
                    print(f"Pair ({i},{j}): Model {model_name} voted: {parsed}", flush=True)

            except Exception as e:
                if judge.verbose:
                    print(f"Pair ({i},{j}): Model {model_name} failed: {e}", flush=True)
                votes.append({
                    "model": model_name,
                    "error": str(e),
                    "parsed": None,
                })

        # Handle custom generation functions
        for idx, custom_fn in enumerate(judge.custom_generation_fns):
            try:
                raw_response = custom_fn(prompt)
                parsed = output_parser(raw_response)
                votes.append({
                    "model": f"custom_fn_{idx}",
                    "raw_response": raw_response,
                    "parsed": parsed,
                })
            except Exception as e:
                if judge.verbose:
                    print(f"Pair ({i},{j}): Custom function {idx} failed: {e}", flush=True)
                votes.append({
                    "model": f"custom_fn_{idx}",
                    "error": str(e),
                    "parsed": None,
                })

        # Aggregate votes for this pair
        valid_votes = [v for v in votes if v.get("parsed") is not None]

        if not valid_votes:
            # No valid votes, mark as tie
            pairwise_results[(i, j)] = "tie"
            all_votes[(i, j)] = votes
            continue

        mode = judge.mode

        if mode == "single":
            winner = valid_votes[0]["parsed"]
        elif mode in ("majority", "all"):
            # Count votes for A, B, tie
            vote_counts = {"A": 0, "B": 0, "tie": 0}
            for vote in valid_votes:
                result = vote["parsed"]
                if result in vote_counts:
                    vote_counts[result] += 1

            # Determine winner
            if mode == "all":
                # All judges must agree
                if vote_counts["A"] == len(valid_votes):
                    winner = "A"
                elif vote_counts["B"] == len(valid_votes):
                    winner = "B"
                else:
                    winner = "tie"
            else:  # majority
                max_votes = max(vote_counts.values())
                # Check for tie in voting
                max_keys = [k for k, v in vote_counts.items() if v == max_votes]
                if len(max_keys) > 1:
                    winner = "tie"
                else:
                    winner = max_keys[0]
        else:
            winner = valid_votes[0]["parsed"]

        # Record result
        if winner == "A":
            pairwise_results[(i, j)] = i
            wins[i] += 1
        elif winner == "B":
            pairwise_results[(i, j)] = j
            wins[j] += 1
        else:  # tie
            pairwise_results[(i, j)] = "tie"

        all_votes[(i, j)] = votes

    # Build final ranking from win counts
    ranking = sorted(range(n), key=lambda idx: wins[idx], reverse=True)

    return {
        "ranking": ranking,
        "wins": wins,
        "pairwise_results": pairwise_results,
        "raw_votes": all_votes,
        "mode": judge.mode,
    }


def rank(
    judge,
    input: str,
    model_outputs: List[str],
    ground_truth: Optional[str] = None,
    ranking_mode: str = "single_shot",
    output_parser: Optional[Union[str, Callable]] = None,
    custom_template: Optional[str] = None,
    use_fully_custom_prompt: bool = False,
    max_tokens: int = 10000,
) -> Dict[str, Any]:
    """
    Rank multiple model outputs using an LLM judge.

    Args:
        judge: LLMAsAJudge instance configured with models
        input: Original prompt or task description
        model_outputs: List of model outputs to rank
        ground_truth: Optional reference answer. If provided, ranking will be based on
            how well model_outputs match the ground truth. If None, model_outputs are ranked
            purely by quality/preference.
        ranking_mode: "single_shot" or "round_robin"
        output_parser: Parser for judge output. Can be:
            - String: "letter_ordering", "json_scores", "pairwise_winner"
            - Callable: Custom parser function
            - None: Auto-selects based on ranking_mode (defaults to "letter_ordering"
              for single_shot, "pairwise_winner" for round_robin)
            For single_shot: should return List[str] (ordering) or Dict[str, float] (scores)
            For round_robin: should return "A", "B", or "tie"
        custom_template: Optional prompt template with placeholders. If None, uses sensible
            defaults that adapt based on whether ground_truth is provided.
            Available placeholders:
            - single_shot: {input_block}, {model_outputs}, {ground_truth}
            - round_robin: {input_block}, {option_a}, {option_b}, {ground_truth}
        use_fully_custom_prompt: If True, custom_template is used as-is without substitution
        max_tokens: Maximum tokens for judge response

    Returns:
        Dict with ranking results and metadata. Format depends on ranking_mode:

        single_shot:
        {
            "ranking": [0, 2, 1],  # Indices in rank order
            "labels": ["A", "C", "B"],  # Letter labels in rank order
            "scores": {...} or None,  # Scores if parser returns dict
            "raw_votes": [...],  # Individual judge outputs
            "mode": str,  # Aggregation mode used
        }

        round_robin:
        {
            "ranking": [2, 0, 1],  # Indices sorted by wins
            "wins": {0: 1, 1: 0, 2: 2},  # Win count per output
            "pairwise_results": {...},  # Winner per pair
            "raw_votes": {...},  # All pairwise judge votes
            "mode": str,  # Aggregation mode used
        }

    Example:
        >>> from llmasajudge import LLMAsAJudge
        >>>
        >>> judge = LLMAsAJudge(models=["openai/gpt-4o-mini"])
        >>>
        >>> # Using string parser name
        >>> result = judge.rank(
        ...     input="Explain recursion",
        ...     model_outputs=["Answer 1", "Answer 2", "Answer 3"],
        ...     ranking_mode="single_shot",
        ...     output_parser="letter_ordering",
        ...     custom_template="Rank from best to worst:\\n{model_outputs}\\nReturn: A > B > C"
        ... )
        >>> print(result["ranking"])  # [0, 2, 1]
    """
    if not model_outputs:
        raise ValueError("Must provide at least one model output")

    if ranking_mode not in ("single_shot", "round_robin"):
        raise ValueError("ranking_mode must be 'single_shot' or 'round_robin'")

    # Resolve output_parser (string or callable)
    if output_parser is None:
        # Auto-select default parser based on mode
        if ranking_mode == "single_shot":
            output_parser = RankingParsers.letter_ordering
        else:  # round_robin
            output_parser = RankingParsers.pairwise_winner
    elif isinstance(output_parser, str):
        # Map string to parser function
        parser_map = {
            'letter_ordering': RankingParsers.letter_ordering,
            'json_scores': RankingParsers.json_scores,
            'pairwise_winner': RankingParsers.pairwise_winner,
        }
        if output_parser not in parser_map:
            raise ValueError(
                f"Unknown parser '{output_parser}'. "
                f"Available: {list(parser_map.keys())}"
            )
        output_parser = parser_map[output_parser]
    # else: assume it's a callable, use as-is

    if ranking_mode == "single_shot":
        return _single_shot_rank(
            judge=judge,
            input_text=input,
            model_outputs=model_outputs,
            ground_truth=ground_truth,
            output_parser=output_parser,
            custom_template=custom_template,
            use_fully_custom_prompt=use_fully_custom_prompt,
            max_tokens=max_tokens,
        )
    else:  # round_robin
        return _round_robin_rank(
            judge=judge,
            input_text=input,
            model_outputs=model_outputs,
            ground_truth=ground_truth,
            output_parser=output_parser,
            custom_template=custom_template,
            use_fully_custom_prompt=use_fully_custom_prompt,
            max_tokens=max_tokens,
        )
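Because rank() resolves string parser names through RankingParsers, the stock parsers added above can also be exercised on their own with no API calls; a quick sketch (the inputs are made-up judge outputs):

# Sketch: exercising the stock parsers from llmasajudge.ranker directly.
from llmasajudge.ranker import RankingParsers

print(RankingParsers.letter_ordering("B > A > C"))         # ['B', 'A', 'C']
print(RankingParsers.json_scores('{"a": 9.2, "b": 7.1}'))  # {'A': 9.2, 'B': 7.1} (keys uppercased)
print(RankingParsers.pairwise_winner("Winner: B"))         # 'B'
print(RankingParsers.pairwise_winner("They are equal"))    # 'tie'

For ranking_mode="round_robin", judge.rank() runs N(N-1)/2 of these pairwise comparisons and sorts candidates by win count, as the _round_robin_rank docstring above describes.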
llmasajudge-0.1.18.dist-info/RECORD
ADDED
@@ -0,0 +1,6 @@
+llmasajudge/__init__.py,sha256=TGVADN77vQtKy3JBGLe9F578jVCZ_Vz055P1CIk2vIQ,65215
+llmasajudge/ranker.py,sha256=2Nr-J1DNPYVIja2Fl-ksuvOnJPEwYmfylDkdlYqCWtE,26829
+llmasajudge-0.1.18.dist-info/METADATA,sha256=lV63AvuLpdzAjhVgN5PQr-2fiGn84QQizBMPLDYWsV0,515
+llmasajudge-0.1.18.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+llmasajudge-0.1.18.dist-info/top_level.txt,sha256=rRaIpM1llpEqahR9flT3RjpZHi2o16iOgnGYJ8cO4_0,12
+llmasajudge-0.1.18.dist-info/RECORD,,

llmasajudge-0.1.14.dist-info/RECORD
REMOVED
@@ -1,5 +0,0 @@
-llmasajudge/__init__.py,sha256=OKaafNDE_1vOIPZshLrs37kGvSq5QXSHIWA9AVmeVTU,61627
-llmasajudge-0.1.14.dist-info/METADATA,sha256=xsjEyt76cmEvBd9Vn99ZevnhgRJ4HpBogHoysvZGCas,515
-llmasajudge-0.1.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-llmasajudge-0.1.14.dist-info/top_level.txt,sha256=rRaIpM1llpEqahR9flT3RjpZHi2o16iOgnGYJ8cO4_0,12
-llmasajudge-0.1.14.dist-info/RECORD,,
{llmasajudge-0.1.14.dist-info → llmasajudge-0.1.18.dist-info}/top_level.txt
File without changes