PyPI - llmasajudge - Versions diffs - 0.1.14__tar.gz → 0.1.15__tar.gz - Mend

@@ -875,6 +875,7 @@ class ReturnType(Enum):
     BOOLEAN = "boolean"
     SCALAR = "scalar"
     MAP = "map"
+    STRING = "string"  # For arbitrary string returns (categories, choices, etc.)
 class AggregationMode(Enum):
@@ -888,6 +889,7 @@ class AggregationMode(Enum):
     MIN = "min"
     MAX = "max"
     MEDIAN = "median"
+    # String return type adds no new modes - it reuses MAJORITY and SINGLE from above
 # Valid aggregation modes per return type
@@ -895,6 +897,7 @@ VALID_MODES = {
     ReturnType.BOOLEAN: {AggregationMode.MAJORITY, AggregationMode.SINGLE, AggregationMode.ALL},
     ReturnType.SCALAR: {AggregationMode.AVERAGE, AggregationMode.MIN, AggregationMode.MAX, AggregationMode.MEDIAN, AggregationMode.SINGLE},
     ReturnType.MAP: {AggregationMode.AVERAGE, AggregationMode.MIN, AggregationMode.MAX, AggregationMode.MEDIAN, AggregationMode.SINGLE},
+    ReturnType.STRING: {AggregationMode.MAJORITY, AggregationMode.SINGLE},
 }
 # Default aggregation modes per return type
@@ -902,6 +905,7 @@ DEFAULT_MODES = {
     ReturnType.BOOLEAN: AggregationMode.MAJORITY,
     ReturnType.SCALAR: AggregationMode.AVERAGE,
     ReturnType.MAP: AggregationMode.AVERAGE,
+    ReturnType.STRING: AggregationMode.MAJORITY,
 }
 # String to enum mapping (for backward compat)
@@ -1115,6 +1119,8 @@ def _infer_return_type(value: Any) -> Optional[ReturnType]:
         return ReturnType.SCALAR
     if isinstance(value, dict) and all(isinstance(v, (int, float)) for v in value.values()):
         return ReturnType.MAP
+    if isinstance(value, str):
+        return ReturnType.STRING
     return None
@@ -1374,15 +1380,21 @@ Output only valid JSON. No explanation. No extra text.""",
         last_err = None
         for i in range(attempts):
             try:
-                resp = completion(
-                    model=model,
-                    api_base=api_base,
-                    messages=[{"role": "user", "content": prompt}],
-                    temperature=temperature,
-                    max_tokens=max_tokens,
-                    extra_headers=headers,
-                    caching=self.cache_enabled
-                )
+                # GPT-5 models don't accept temperature argument
+                completion_kwargs = {
+                    "model": model,
+                    "api_base": api_base,
+                    "messages": [{"role": "user", "content": prompt}],
+                    "max_tokens": max_tokens,
+                    "extra_headers": headers,
+                    "caching": self.cache_enabled
+                }
+                # Only add temperature if NOT a gpt-5 model
+                if "gpt-5" not in model.lower():
+                    completion_kwargs["temperature"] = temperature
+                resp = completion(**completion_kwargs)
                 return (resp.choices[0].message.content or "").strip()
             except Exception as e:
                 last_err = e
@@ -1453,20 +1465,20 @@ Output only valid JSON. No explanation. No extra text.""",
         valid = [v["result"] for v in votes if v["result"] is not None and isinstance(v["result"], dict)]
         if not valid:
             raise ValueError("No valid map votes to aggregate")
         keys = set()
         for v in valid:
             keys.update(v.keys())
         if self._mode == AggregationMode.SINGLE:
             return valid[0]
         result = {}
         for key in keys:
             values = [v[key] for v in valid if key in v]
             if not values:
                 continue
             if self._mode == AggregationMode.AVERAGE:
                 result[key] = sum(values) / len(values)
             elif self._mode == AggregationMode.MIN:
@@ -1478,9 +1490,38 @@ Output only valid JSON. No explanation. No extra text.""",
                 n = len(s)
                 mid = n // 2
                 result[key] = (s[mid - 1] + s[mid]) / 2 if n % 2 == 0 else s[mid]
         return result
+    def _aggregate_string(self, votes: List[Dict[str, Any]]) -> str:
+        """
+        Aggregate string votes by plurality, with tie detection.
+
+        Votes whose "result" is None or not a str are ignored.
+
+        Args:
+            votes: Vote dicts; each vote's "result" entry holds that vote's answer.
+
+        Returns:
+            In SINGLE mode, the first valid string vote. Otherwise the most
+            frequent string, or the literal "tie" when the two most frequent
+            strings have the same count. NOTE: a genuine vote of "tie" cannot
+            be told apart from this sentinel by the caller.
+
+        Raises:
+            ValueError: If no vote carries a valid string result.
+        """
+        # isinstance() already rejects None, so no separate None check is needed
+        results = [v["result"] for v in votes if isinstance(v["result"], str)]
+        if not results:
+            raise ValueError("No valid string votes to aggregate")
+        if self._mode == AggregationMode.SINGLE:
+            return results[0]
+        from collections import Counter
+        # Only the two highest counts matter for tie detection; `results` is
+        # non-empty here, so `top` always holds at least one entry.
+        top = Counter(results).most_common(2)
+        # Tie: the runner-up has as many votes as the leader
+        if len(top) > 1 and top[0][1] == top[1][1]:
+            return "tie"
+        return top[0][0]
     def judge(
         self,
         input: Any = None,
@@ -1577,13 +1618,16 @@ Output only valid JSON. No explanation. No extra text.""",
             final = self._aggregate_scalar(votes)
         elif return_type == ReturnType.MAP:
             final = self._aggregate_map(votes)
+        elif return_type == ReturnType.STRING:
+            final = self._aggregate_string(votes)
         else:
             raise ValueError(f"Unknown return type: {return_type}")
         # Build backward-compatible response
-        # Boolean: correct=bool, scores=None
-        # Scalar: correct=score, scores=score (both fields for convenience)
-        # Map: correct=None, scores=map
+        # Boolean: correct=bool, scores=None, result=bool
+        # Scalar: correct=score, scores=score, result=score (same value in all three fields for convenience)
+        # Map: correct=None, scores=map, result=map
+        # String: correct=string, scores=None, result=string
         if return_type == ReturnType.BOOLEAN:
             # Also put "correct" in each vote for backward compat
             for v in votes:
@@ -1591,6 +1635,7 @@ Output only valid JSON. No explanation. No extra text.""",
             return {
                 "correct": final,
                 "scores": None,
+                "result": final,
                 "mode": self.mode,
                 "votes": votes,
             }
@@ -1599,6 +1644,16 @@ Output only valid JSON. No explanation. No extra text.""",
             return {
                 "correct": final,
                 "scores": final,
+                "result": final,
+                "mode": self.mode,
+                "votes": votes,
+            }
+        elif return_type == ReturnType.STRING:
+            # For string, put result in correct field
+            return {
+                "correct": final,
+                "scores": None,
+                "result": final,
                 "mode": self.mode,
                 "votes": votes,
             }
@@ -1606,6 +1661,7 @@ Output only valid JSON. No explanation. No extra text.""",
             return {
                 "correct": None,
                 "scores": final,
+                "result": final,
                 "mode": self.mode,
                 "votes": votes,
             }

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmasajudge
-Version: 0.1.14
+Version: 0.1.15
 Summary: LLM Judge: simple right/wrong voting across models
 Author-email: Brett Young <byyoung3@gmail.com>
 Project-URL: Homepage, https://example.com

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmasajudge
-Version: 0.1.14
+Version: 0.1.15
 Summary: LLM Judge: simple right/wrong voting across models
 Author-email: Brett Young <byyoung3@gmail.com>
 Project-URL: Homepage, https://example.com

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "llmasajudge"
-version = "0.1.14"
+version = "0.1.15"
 description = "LLM Judge: simple right/wrong voting across models"
 authors = [{name="Brett Young", email="byyoung3@gmail.com"}]
 readme = "README.md"

llmasajudge 0.1.14__tar.gz → 0.1.15__tar.gz

llmasajudge 0.1.14__tar.gz → 0.1.15__tar.gz