llmasajudge 0.1.14__tar.gz → 0.1.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llmasajudge-0.1.14 → llmasajudge-0.1.15}/PKG-INFO +1 -1
- {llmasajudge-0.1.14 → llmasajudge-0.1.15}/llmasajudge/__init__.py +73 -17
- {llmasajudge-0.1.14 → llmasajudge-0.1.15}/llmasajudge.egg-info/PKG-INFO +1 -1
- {llmasajudge-0.1.14 → llmasajudge-0.1.15}/pyproject.toml +1 -1
- {llmasajudge-0.1.14 → llmasajudge-0.1.15}/README.md +0 -0
- {llmasajudge-0.1.14 → llmasajudge-0.1.15}/llmasajudge.egg-info/SOURCES.txt +0 -0
- {llmasajudge-0.1.14 → llmasajudge-0.1.15}/llmasajudge.egg-info/dependency_links.txt +0 -0
- {llmasajudge-0.1.14 → llmasajudge-0.1.15}/llmasajudge.egg-info/requires.txt +0 -0
- {llmasajudge-0.1.14 → llmasajudge-0.1.15}/llmasajudge.egg-info/top_level.txt +0 -0
- {llmasajudge-0.1.14 → llmasajudge-0.1.15}/setup.cfg +0 -0
|
@@ -875,6 +875,7 @@ class ReturnType(Enum):
|
|
|
875
875
|
BOOLEAN = "boolean"
|
|
876
876
|
SCALAR = "scalar"
|
|
877
877
|
MAP = "map"
|
|
878
|
+
STRING = "string" # For arbitrary string returns (categories, choices, etc.)
|
|
878
879
|
|
|
879
880
|
|
|
880
881
|
class AggregationMode(Enum):
|
|
@@ -888,6 +889,7 @@ class AggregationMode(Enum):
|
|
|
888
889
|
MIN = "min"
|
|
889
890
|
MAX = "max"
|
|
890
891
|
MEDIAN = "median"
|
|
892
|
+
# String modes - uses MAJORITY and SINGLE from above
|
|
891
893
|
|
|
892
894
|
|
|
893
895
|
# Valid aggregation modes per return type
|
|
@@ -895,6 +897,7 @@ VALID_MODES = {
|
|
|
895
897
|
ReturnType.BOOLEAN: {AggregationMode.MAJORITY, AggregationMode.SINGLE, AggregationMode.ALL},
|
|
896
898
|
ReturnType.SCALAR: {AggregationMode.AVERAGE, AggregationMode.MIN, AggregationMode.MAX, AggregationMode.MEDIAN, AggregationMode.SINGLE},
|
|
897
899
|
ReturnType.MAP: {AggregationMode.AVERAGE, AggregationMode.MIN, AggregationMode.MAX, AggregationMode.MEDIAN, AggregationMode.SINGLE},
|
|
900
|
+
ReturnType.STRING: {AggregationMode.MAJORITY, AggregationMode.SINGLE},
|
|
898
901
|
}
|
|
899
902
|
|
|
900
903
|
# Default aggregation modes per return type
|
|
@@ -902,6 +905,7 @@ DEFAULT_MODES = {
|
|
|
902
905
|
ReturnType.BOOLEAN: AggregationMode.MAJORITY,
|
|
903
906
|
ReturnType.SCALAR: AggregationMode.AVERAGE,
|
|
904
907
|
ReturnType.MAP: AggregationMode.AVERAGE,
|
|
908
|
+
ReturnType.STRING: AggregationMode.MAJORITY,
|
|
905
909
|
}
|
|
906
910
|
|
|
907
911
|
# String to enum mapping (for backward compat)
|
|
@@ -1115,6 +1119,8 @@ def _infer_return_type(value: Any) -> Optional[ReturnType]:
|
|
|
1115
1119
|
return ReturnType.SCALAR
|
|
1116
1120
|
if isinstance(value, dict) and all(isinstance(v, (int, float)) for v in value.values()):
|
|
1117
1121
|
return ReturnType.MAP
|
|
1122
|
+
if isinstance(value, str):
|
|
1123
|
+
return ReturnType.STRING
|
|
1118
1124
|
return None
|
|
1119
1125
|
|
|
1120
1126
|
|
|
@@ -1374,15 +1380,21 @@ Output only valid JSON. No explanation. No extra text.""",
|
|
|
1374
1380
|
last_err = None
|
|
1375
1381
|
for i in range(attempts):
|
|
1376
1382
|
try:
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
max_tokens
|
|
1383
|
-
extra_headers
|
|
1384
|
-
caching
|
|
1385
|
-
|
|
1383
|
+
# GPT-5 models don't accept temperature argument
|
|
1384
|
+
completion_kwargs = {
|
|
1385
|
+
"model": model,
|
|
1386
|
+
"api_base": api_base,
|
|
1387
|
+
"messages": [{"role": "user", "content": prompt}],
|
|
1388
|
+
"max_tokens": max_tokens,
|
|
1389
|
+
"extra_headers": headers,
|
|
1390
|
+
"caching": self.cache_enabled
|
|
1391
|
+
}
|
|
1392
|
+
|
|
1393
|
+
# Only add temperature if NOT a gpt-5 model
|
|
1394
|
+
if "gpt-5" not in model.lower():
|
|
1395
|
+
completion_kwargs["temperature"] = temperature
|
|
1396
|
+
|
|
1397
|
+
resp = completion(**completion_kwargs)
|
|
1386
1398
|
return (resp.choices[0].message.content or "").strip()
|
|
1387
1399
|
except Exception as e:
|
|
1388
1400
|
last_err = e
|
|
@@ -1453,20 +1465,20 @@ Output only valid JSON. No explanation. No extra text.""",
|
|
|
1453
1465
|
valid = [v["result"] for v in votes if v["result"] is not None and isinstance(v["result"], dict)]
|
|
1454
1466
|
if not valid:
|
|
1455
1467
|
raise ValueError("No valid map votes to aggregate")
|
|
1456
|
-
|
|
1468
|
+
|
|
1457
1469
|
keys = set()
|
|
1458
1470
|
for v in valid:
|
|
1459
1471
|
keys.update(v.keys())
|
|
1460
|
-
|
|
1472
|
+
|
|
1461
1473
|
if self._mode == AggregationMode.SINGLE:
|
|
1462
1474
|
return valid[0]
|
|
1463
|
-
|
|
1475
|
+
|
|
1464
1476
|
result = {}
|
|
1465
1477
|
for key in keys:
|
|
1466
1478
|
values = [v[key] for v in valid if key in v]
|
|
1467
1479
|
if not values:
|
|
1468
1480
|
continue
|
|
1469
|
-
|
|
1481
|
+
|
|
1470
1482
|
if self._mode == AggregationMode.AVERAGE:
|
|
1471
1483
|
result[key] = sum(values) / len(values)
|
|
1472
1484
|
elif self._mode == AggregationMode.MIN:
|
|
@@ -1478,9 +1490,38 @@ Output only valid JSON. No explanation. No extra text.""",
|
|
|
1478
1490
|
n = len(s)
|
|
1479
1491
|
mid = n // 2
|
|
1480
1492
|
result[key] = (s[mid - 1] + s[mid]) / 2 if n % 2 == 0 else s[mid]
|
|
1481
|
-
|
|
1493
|
+
|
|
1482
1494
|
return result
|
|
1483
1495
|
|
|
1496
|
+
def _aggregate_string(self, votes: List[Dict[str, Any]]) -> str:
|
|
1497
|
+
"""
|
|
1498
|
+
Aggregate string votes with tie detection.
|
|
1499
|
+
Returns the majority string, or "tie" if there's no clear majority.
|
|
1500
|
+
"""
|
|
1501
|
+
results = [v["result"] for v in votes if v["result"] is not None and isinstance(v["result"], str)]
|
|
1502
|
+
if not results:
|
|
1503
|
+
raise ValueError("No valid string votes to aggregate")
|
|
1504
|
+
|
|
1505
|
+
if self._mode == AggregationMode.SINGLE:
|
|
1506
|
+
return results[0]
|
|
1507
|
+
|
|
1508
|
+
# Count occurrences
|
|
1509
|
+
from collections import Counter
|
|
1510
|
+
counts = Counter(results)
|
|
1511
|
+
|
|
1512
|
+
# Get the most common
|
|
1513
|
+
most_common = counts.most_common()
|
|
1514
|
+
|
|
1515
|
+
if len(most_common) == 0:
|
|
1516
|
+
raise ValueError("No valid string votes to aggregate")
|
|
1517
|
+
|
|
1518
|
+
# Check for tie: if top two have same count
|
|
1519
|
+
if len(most_common) > 1 and most_common[0][1] == most_common[1][1]:
|
|
1520
|
+
return "tie"
|
|
1521
|
+
|
|
1522
|
+
# Return the majority
|
|
1523
|
+
return most_common[0][0]
|
|
1524
|
+
|
|
1484
1525
|
def judge(
|
|
1485
1526
|
self,
|
|
1486
1527
|
input: Any = None,
|
|
@@ -1577,13 +1618,16 @@ Output only valid JSON. No explanation. No extra text.""",
|
|
|
1577
1618
|
final = self._aggregate_scalar(votes)
|
|
1578
1619
|
elif return_type == ReturnType.MAP:
|
|
1579
1620
|
final = self._aggregate_map(votes)
|
|
1621
|
+
elif return_type == ReturnType.STRING:
|
|
1622
|
+
final = self._aggregate_string(votes)
|
|
1580
1623
|
else:
|
|
1581
1624
|
raise ValueError(f"Unknown return type: {return_type}")
|
|
1582
1625
|
|
|
1583
1626
|
# Build backward-compatible response
|
|
1584
|
-
# Boolean: correct=bool, scores=None
|
|
1585
|
-
# Scalar: correct=score, scores=score (both fields for convenience)
|
|
1586
|
-
# Map: correct=None, scores=map
|
|
1627
|
+
# Boolean: correct=bool, scores=None, result=bool
|
|
1628
|
+
# Scalar: correct=score, scores=score, result=score (both fields for convenience)
|
|
1629
|
+
# Map: correct=None, scores=map, result=map
|
|
1630
|
+
# String: correct=string, scores=None, result=string
|
|
1587
1631
|
if return_type == ReturnType.BOOLEAN:
|
|
1588
1632
|
# Also put "correct" in each vote for backward compat
|
|
1589
1633
|
for v in votes:
|
|
@@ -1591,6 +1635,7 @@ Output only valid JSON. No explanation. No extra text.""",
|
|
|
1591
1635
|
return {
|
|
1592
1636
|
"correct": final,
|
|
1593
1637
|
"scores": None,
|
|
1638
|
+
"result": final,
|
|
1594
1639
|
"mode": self.mode,
|
|
1595
1640
|
"votes": votes,
|
|
1596
1641
|
}
|
|
@@ -1599,6 +1644,16 @@ Output only valid JSON. No explanation. No extra text.""",
|
|
|
1599
1644
|
return {
|
|
1600
1645
|
"correct": final,
|
|
1601
1646
|
"scores": final,
|
|
1647
|
+
"result": final,
|
|
1648
|
+
"mode": self.mode,
|
|
1649
|
+
"votes": votes,
|
|
1650
|
+
}
|
|
1651
|
+
elif return_type == ReturnType.STRING:
|
|
1652
|
+
# For string, put result in correct field
|
|
1653
|
+
return {
|
|
1654
|
+
"correct": final,
|
|
1655
|
+
"scores": None,
|
|
1656
|
+
"result": final,
|
|
1602
1657
|
"mode": self.mode,
|
|
1603
1658
|
"votes": votes,
|
|
1604
1659
|
}
|
|
@@ -1606,6 +1661,7 @@ Output only valid JSON. No explanation. No extra text.""",
|
|
|
1606
1661
|
return {
|
|
1607
1662
|
"correct": None,
|
|
1608
1663
|
"scores": final,
|
|
1664
|
+
"result": final,
|
|
1609
1665
|
"mode": self.mode,
|
|
1610
1666
|
"votes": votes,
|
|
1611
1667
|
}
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "llmasajudge"
|
|
7
|
-
version = "0.1.14"
|
|
7
|
+
version = "0.1.15"
|
|
8
8
|
description = "LLM Judge: simple right/wrong voting across models"
|
|
9
9
|
authors = [{name="Brett Young", email="byyoung3@gmail.com"}]
|
|
10
10
|
readme = "README.md"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|