llmasajudge 0.1.14__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
llmasajudge/__init__.py CHANGED
@@ -875,6 +875,7 @@ class ReturnType(Enum):
875
875
  BOOLEAN = "boolean"
876
876
  SCALAR = "scalar"
877
877
  MAP = "map"
878
+ STRING = "string" # For arbitrary string returns (categories, choices, etc.)
878
879
 
879
880
 
880
881
  class AggregationMode(Enum):
@@ -888,6 +889,7 @@ class AggregationMode(Enum):
888
889
  MIN = "min"
889
890
  MAX = "max"
890
891
  MEDIAN = "median"
892
+ # String modes reuse MAJORITY and SINGLE from above
891
893
 
892
894
 
893
895
  # Valid aggregation modes per return type
@@ -895,6 +897,7 @@ VALID_MODES = {
895
897
  ReturnType.BOOLEAN: {AggregationMode.MAJORITY, AggregationMode.SINGLE, AggregationMode.ALL},
896
898
  ReturnType.SCALAR: {AggregationMode.AVERAGE, AggregationMode.MIN, AggregationMode.MAX, AggregationMode.MEDIAN, AggregationMode.SINGLE},
897
899
  ReturnType.MAP: {AggregationMode.AVERAGE, AggregationMode.MIN, AggregationMode.MAX, AggregationMode.MEDIAN, AggregationMode.SINGLE},
900
+ ReturnType.STRING: {AggregationMode.MAJORITY, AggregationMode.SINGLE},
898
901
  }
899
902
 
900
903
  # Default aggregation modes per return type
@@ -902,6 +905,7 @@ DEFAULT_MODES = {
902
905
  ReturnType.BOOLEAN: AggregationMode.MAJORITY,
903
906
  ReturnType.SCALAR: AggregationMode.AVERAGE,
904
907
  ReturnType.MAP: AggregationMode.AVERAGE,
908
+ ReturnType.STRING: AggregationMode.MAJORITY,
905
909
  }
906
910
 
907
911
  # String to enum mapping (for backward compat)
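
The four hunks above wire the new STRING return type into the aggregation tables: string verdicts can only be aggregated with MAJORITY or SINGLE, and default to MAJORITY. A minimal sketch of how those tables are consulted; the enum members are copied from the diff (trimmed to the ones relevant here), while resolve_mode() is a hypothetical helper for illustration, not package API:

    from enum import Enum

    class ReturnType(Enum):
        BOOLEAN = "boolean"
        SCALAR = "scalar"
        MAP = "map"
        STRING = "string"   # new in 0.1.18

    class AggregationMode(Enum):   # trimmed: the package also defines AVERAGE, ALL, MIN, MAX, MEDIAN
        MAJORITY = "majority"
        SINGLE = "single"

    VALID_MODES = {ReturnType.STRING: {AggregationMode.MAJORITY, AggregationMode.SINGLE}}
    DEFAULT_MODES = {ReturnType.STRING: AggregationMode.MAJORITY}

    def resolve_mode(return_type, mode=None):
        """Fall back to the default mode and reject invalid combinations."""
        mode = mode or DEFAULT_MODES[return_type]
        if mode not in VALID_MODES[return_type]:
            raise ValueError(f"{mode} is not valid for {return_type}")
        return mode

    assert resolve_mode(ReturnType.STRING) is AggregationMode.MAJORITY
    assert resolve_mode(ReturnType.STRING, AggregationMode.SINGLE) is AggregationMode.SINGLE
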
@@ -1115,6 +1119,8 @@ def _infer_return_type(value: Any) -> Optional[ReturnType]:
1115
1119
  return ReturnType.SCALAR
1116
1120
  if isinstance(value, dict) and all(isinstance(v, (int, float)) for v in value.values()):
1117
1121
  return ReturnType.MAP
1122
+ if isinstance(value, str):
1123
+ return ReturnType.STRING
1118
1124
  return None
1119
1125
 
1120
1126
 
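
With this hunk, a judge vote that parses to a plain Python str is inferred as the new STRING type; numbers and numeric dicts keep their existing precedence. A standalone sketch of that inference order (infer_return_type() here is an illustrative re-implementation, not the package's private _infer_return_type, and the boolean branch is assumed from the earlier, unshown lines of the helper):

    from typing import Any, Optional

    def infer_return_type(value: Any) -> Optional[str]:
        if isinstance(value, bool):      # assumed to be handled before the numeric check
            return "boolean"
        if isinstance(value, (int, float)):
            return "scalar"
        if isinstance(value, dict) and all(isinstance(v, (int, float)) for v in value.values()):
            return "map"
        if isinstance(value, str):       # new in 0.1.18: plain strings map to STRING
            return "string"
        return None

    assert infer_return_type("harmless") == "string"
    assert infer_return_type({"helpfulness": 0.8}) == "map"
    assert infer_return_type(0.8) == "scalar"
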
@@ -1374,15 +1380,21 @@ Output only valid JSON. No explanation. No extra text.""",
1374
1380
  last_err = None
1375
1381
  for i in range(attempts):
1376
1382
  try:
1377
- resp = completion(
1378
- model=model,
1379
- api_base=api_base,
1380
- messages=[{"role": "user", "content": prompt}],
1381
- temperature=temperature,
1382
- max_tokens=max_tokens,
1383
- extra_headers=headers,
1384
- caching=self.cache_enabled
1385
- )
1383
+ # GPT-5 models don't accept temperature argument
1384
+ completion_kwargs = {
1385
+ "model": model,
1386
+ "api_base": api_base,
1387
+ "messages": [{"role": "user", "content": prompt}],
1388
+ "max_tokens": max_tokens,
1389
+ "extra_headers": headers,
1390
+ "caching": self.cache_enabled
1391
+ }
1392
+
1393
+ # Only add temperature if NOT a gpt-5 model
1394
+ if "gpt-5" not in model.lower():
1395
+ completion_kwargs["temperature"] = temperature
1396
+
1397
+ resp = completion(**completion_kwargs)
1386
1398
  return (resp.choices[0].message.content or "").strip()
1387
1399
  except Exception as e:
1388
1400
  last_err = e
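
The retry loop now assembles the keyword arguments for the completion call dynamically so that temperature is only passed to models that accept it (the comment notes that GPT-5 models reject it). A small sketch of the same pattern in isolation; build_completion_kwargs() is a hypothetical helper and only a subset of the kwargs from the diff is shown:

    def build_completion_kwargs(model, messages, temperature, max_tokens):
        """Assemble completion kwargs, omitting temperature for models that reject it."""
        kwargs = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
        }
        # Mirrors the diff: skip temperature for the gpt-5 family.
        if "gpt-5" not in model.lower():
            kwargs["temperature"] = temperature
        return kwargs

    kwargs = build_completion_kwargs(
        "openai/gpt-5-mini", [{"role": "user", "content": "hi"}], 0.0, 256
    )
    assert "temperature" not in kwargs
    # resp = completion(**kwargs)   # litellm-style call, as in the diff
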
@@ -1453,20 +1465,20 @@ Output only valid JSON. No explanation. No extra text.""",
1453
1465
  valid = [v["result"] for v in votes if v["result"] is not None and isinstance(v["result"], dict)]
1454
1466
  if not valid:
1455
1467
  raise ValueError("No valid map votes to aggregate")
1456
-
1468
+
1457
1469
  keys = set()
1458
1470
  for v in valid:
1459
1471
  keys.update(v.keys())
1460
-
1472
+
1461
1473
  if self._mode == AggregationMode.SINGLE:
1462
1474
  return valid[0]
1463
-
1475
+
1464
1476
  result = {}
1465
1477
  for key in keys:
1466
1478
  values = [v[key] for v in valid if key in v]
1467
1479
  if not values:
1468
1480
  continue
1469
-
1481
+
1470
1482
  if self._mode == AggregationMode.AVERAGE:
1471
1483
  result[key] = sum(values) / len(values)
1472
1484
  elif self._mode == AggregationMode.MIN:
@@ -1478,9 +1490,38 @@ Output only valid JSON. No explanation. No extra text.""",
1478
1490
  n = len(s)
1479
1491
  mid = n // 2
1480
1492
  result[key] = (s[mid - 1] + s[mid]) / 2 if n % 2 == 0 else s[mid]
1481
-
1493
+
1482
1494
  return result
1483
1495
 
1496
+ def _aggregate_string(self, votes: List[Dict[str, Any]]) -> str:
1497
+ """
1498
+ Aggregate string votes with tie detection.
1499
+ Returns the majority string, or "tie" if there's no clear majority.
1500
+ """
1501
+ results = [v["result"] for v in votes if v["result"] is not None and isinstance(v["result"], str)]
1502
+ if not results:
1503
+ raise ValueError("No valid string votes to aggregate")
1504
+
1505
+ if self._mode == AggregationMode.SINGLE:
1506
+ return results[0]
1507
+
1508
+ # Count occurrences
1509
+ from collections import Counter
1510
+ counts = Counter(results)
1511
+
1512
+ # Get the most common
1513
+ most_common = counts.most_common()
1514
+
1515
+ if len(most_common) == 0:
1516
+ raise ValueError("No valid string votes to aggregate")
1517
+
1518
+ # Check for tie: if top two have same count
1519
+ if len(most_common) > 1 and most_common[0][1] == most_common[1][1]:
1520
+ return "tie"
1521
+
1522
+ # Return the majority
1523
+ return most_common[0][0]
1524
+
1484
1525
  def judge(
1485
1526
  self,
1486
1527
  input: Any = None,
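
_aggregate_string returns the most frequent string vote, or the literal string "tie" when the two most common answers occur equally often (SINGLE mode simply takes the first valid vote). A standalone illustration of that tie rule, not a call into the private method:

    from collections import Counter

    def majority_or_tie(votes):
        """Return the most common string, or "tie" when there is no clear winner."""
        counts = Counter(votes).most_common()
        if len(counts) > 1 and counts[0][1] == counts[1][1]:
            return "tie"
        return counts[0][0]

    assert majority_or_tie(["safe", "safe", "unsafe"]) == "safe"
    assert majority_or_tie(["safe", "unsafe"]) == "tie"
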
@@ -1577,13 +1618,16 @@ Output only valid JSON. No explanation. No extra text.""",
1577
1618
  final = self._aggregate_scalar(votes)
1578
1619
  elif return_type == ReturnType.MAP:
1579
1620
  final = self._aggregate_map(votes)
1621
+ elif return_type == ReturnType.STRING:
1622
+ final = self._aggregate_string(votes)
1580
1623
  else:
1581
1624
  raise ValueError(f"Unknown return type: {return_type}")
1582
1625
 
1583
1626
  # Build backward-compatible response
1584
- # Boolean: correct=bool, scores=None
1585
- # Scalar: correct=score, scores=score (both fields for convenience)
1586
- # Map: correct=None, scores=map
1627
+ # Boolean: correct=bool, scores=None, result=bool
1628
+ # Scalar: correct=score, scores=score, result=score (both fields for convenience)
1629
+ # Map: correct=None, scores=map, result=map
1630
+ # String: correct=string, scores=None, result=string
1587
1631
  if return_type == ReturnType.BOOLEAN:
1588
1632
  # Also put "correct" in each vote for backward compat
1589
1633
  for v in votes:
@@ -1591,6 +1635,7 @@ Output only valid JSON. No explanation. No extra text.""",
1591
1635
  return {
1592
1636
  "correct": final,
1593
1637
  "scores": None,
1638
+ "result": final,
1594
1639
  "mode": self.mode,
1595
1640
  "votes": votes,
1596
1641
  }
@@ -1599,6 +1644,16 @@ Output only valid JSON. No explanation. No extra text.""",
1599
1644
  return {
1600
1645
  "correct": final,
1601
1646
  "scores": final,
1647
+ "result": final,
1648
+ "mode": self.mode,
1649
+ "votes": votes,
1650
+ }
1651
+ elif return_type == ReturnType.STRING:
1652
+ # For string, put result in correct field
1653
+ return {
1654
+ "correct": final,
1655
+ "scores": None,
1656
+ "result": final,
1602
1657
  "mode": self.mode,
1603
1658
  "votes": votes,
1604
1659
  }
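
Every return type now also exposes the aggregated value under a uniform "result" key, alongside the legacy "correct"/"scores" fields. A sketch of the response shape for a STRING verdict; the category label, model names, and per-vote fields are illustrative values, not captured output:

    verdict = {
        "correct": "refusal",   # legacy field, reused for the string category
        "scores": None,
        "result": "refusal",    # new uniform field, present for every return type
        "mode": "majority",
        "votes": [
            {"model": "openai/gpt-4o-mini", "result": "refusal"},
            {"model": "anthropic/claude-3-haiku", "result": "refusal"},
        ],
    }
    assert verdict["result"] == verdict["correct"]
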
@@ -1606,6 +1661,47 @@ Output only valid JSON. No explanation. No extra text.""",
1606
1661
  return {
1607
1662
  "correct": None,
1608
1663
  "scores": final,
1664
+ "result": final,
1609
1665
  "mode": self.mode,
1610
1666
  "votes": votes,
1611
- }
1667
+ }
1668
+
1669
+ def rank(
1670
+ self,
1671
+ input: str,
1672
+ model_outputs: List[str],
1673
+ ground_truth: Optional[str] = None,
1674
+ ranking_mode: str = "single_shot",
1675
+ output_parser: Optional[Callable] = None,
1676
+ custom_template: Optional[str] = None,
1677
+ use_fully_custom_prompt: bool = False,
1678
+ max_tokens: int = 10000,
1679
+ ) -> Dict[str, Any]:
1680
+ """
1681
+ Rank multiple model outputs.
1682
+
1683
+ Args:
1684
+ input: Original prompt or task description
1685
+ model_outputs: List of model outputs to rank
1686
+ ground_truth: Optional reference answer
1687
+ ranking_mode: "single_shot" or "round_robin"
1688
+ output_parser: Function to parse judge output
1689
+ custom_template: Prompt template with placeholders
1690
+ use_fully_custom_prompt: If True, template used as-is
1691
+ max_tokens: Maximum tokens for judge response
1692
+
1693
+ Returns:
1694
+ Dict with ranking results (see ranker.rank() for details)
1695
+ """
1696
+ from llmasajudge.ranker import rank as _rank
1697
+ return _rank(
1698
+ judge=self,
1699
+ input=input,
1700
+ model_outputs=model_outputs,
1701
+ ground_truth=ground_truth,
1702
+ ranking_mode=ranking_mode,
1703
+ output_parser=output_parser,
1704
+ custom_template=custom_template,
1705
+ use_fully_custom_prompt=use_fully_custom_prompt,
1706
+ max_tokens=max_tokens,
1707
+ )
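
LLMAsAJudge.rank() is a thin wrapper around llmasajudge.ranker.rank(), whose module docstring (added below) demonstrates single_shot ranking. A hedged sketch of the round_robin variant, assuming litellm credentials for the placeholder model are configured; the inputs are made up:

    from llmasajudge import LLMAsAJudge

    judge = LLMAsAJudge(models=["openai/gpt-4o-mini"])  # placeholder judge configuration

    result = judge.rank(
        input="Summarize this bug report in one sentence.",
        model_outputs=["Summary from model 1", "Summary from model 2", "Summary from model 3"],
        ranking_mode="round_robin",   # pairwise comparisons; output_parser defaults to "pairwise_winner"
    )
    print(result["ranking"])   # e.g. [2, 0, 1] -- indices sorted by pairwise wins
    print(result["wins"])      # e.g. {0: 1, 1: 0, 2: 2}
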
llmasajudge/ranker.py ADDED
@@ -0,0 +1,772 @@
1
+ """
2
+ LLMAsAJudge Ranking Extensions
3
+
4
+ Provides relative ranking functionality for evaluating multiple model outputs.
5
+
6
+ Supports two ranking modes:
7
+ 1. single_shot: Judge sees all model_outputs at once and returns ranking/scores
8
+ 2. round_robin: Judge compares model_outputs pairwise, results are aggregated
9
+
10
+ Usage:
11
+ from llmasajudge import LLMAsAJudge
12
+ from llmasajudge.ranker import rank
13
+
14
+ judge = LLMAsAJudge(models=["openai/gpt-4o-mini"])
15
+
16
+ result = rank(
17
+ judge=judge,
18
+ input="Explain recursion simply",
19
+ model_outputs=["Answer 1", "Answer 2", "Answer 3"],
20
+ ranking_mode="single_shot",
21
+ output_parser=ranking_parser,
22
+ custom_template=template
23
+ )
24
+ """
25
+
26
+ import re
27
+ import json
28
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
29
+ from itertools import combinations
30
+
31
+
32
+ __all__ = ["rank", "RankingParsers"]
33
+
34
+
35
+ # Default templates for ranking
36
+ DEFAULT_SINGLE_SHOT_TEMPLATE = """\
37
+ Rank the following candidate responses from BEST to WORST.
38
+ {ground_truth_section}
39
+ Task/Input:
40
+ {input_block}
41
+
42
+ Candidates:
43
+ {model_outputs}
44
+
45
+ Provide your ranking using the format: A > B > C > D (etc)
46
+ Return ONLY the ranking, no explanation."""
47
+
48
+ DEFAULT_SINGLE_SHOT_WITH_GT_TEMPLATE = """\
49
+ Rank the following candidate responses from BEST to WORST based on how well they match the ground truth answer.
50
+ {ground_truth_section}
51
+ Task/Input:
52
+ {input_block}
53
+
54
+ Ground Truth Answer:
55
+ {ground_truth}
56
+
57
+ Candidates:
58
+ {model_outputs}
59
+
60
+ Provide your ranking using the format: A > B > C > D (etc)
61
+ Return ONLY the ranking, no explanation."""
62
+
63
+ DEFAULT_ROUND_ROBIN_TEMPLATE = """\
64
+ Compare the following two responses and determine which is better.
65
+
66
+ Task/Input:
67
+ {input_block}
68
+
69
+ Option A:
70
+ {option_a}
71
+
72
+ Option B:
73
+ {option_b}
74
+
75
+ Which response is better? Return exactly one of: A, B, or tie"""
76
+
77
+ DEFAULT_ROUND_ROBIN_WITH_GT_TEMPLATE = """\
78
+ Compare the following two responses based on how well they match the ground truth answer.
79
+
80
+ Task/Input:
81
+ {input_block}
82
+
83
+ Ground Truth Answer:
84
+ {ground_truth}
85
+
86
+ Option A:
87
+ {option_a}
88
+
89
+ Option B:
90
+ {option_b}
91
+
92
+ Which response better matches the ground truth? Return exactly one of: A, B, or tie"""
93
+
94
+
95
+ class RankingParsers:
96
+ """Stock output parsers for ranking tasks."""
97
+
98
+ @staticmethod
99
+ def letter_ordering(response: str) -> List[str]:
100
+ """
101
+ Parse ordering like "A > C > B" into ["A", "C", "B"].
102
+ Handles various separators: >, ->, =>
103
+ """
104
+ if not response:
105
+ return []
106
+
107
+ # Try different separators
108
+ for sep in [">", "->", "=>"]:
109
+ if sep in response:
110
+ parts = [x.strip().upper() for x in response.split(sep)]
111
+ # Filter to single letters only
112
+ return [p for p in parts if len(p) == 1 and p.isalpha()]
113
+
114
+ # Fallback: extract all single letters in order
115
+ letters = re.findall(r'\b([A-Z])\b', response.upper())
116
+ return letters
117
+
118
+ @staticmethod
119
+ def json_scores(response: str) -> Optional[Dict[str, float]]:
120
+ """
121
+ Parse JSON like {"A": 9.2, "B": 7.1, "C": 8.5}.
122
+ Returns dict mapping candidate labels to scores.
123
+ """
124
+ if not response:
125
+ return None
126
+
127
+ try:
128
+ s = response.strip()
129
+
130
+ # Handle markdown code blocks
131
+ if "```json" in s.lower():
132
+ start = s.lower().find("```json") + 7
133
+ end = s.find("```", start)
134
+ if end > start:
135
+ s = s[start:end].strip()
136
+ elif "```" in s:
137
+ start = s.find("```") + 3
138
+ end = s.find("```", start)
139
+ if end > start:
140
+ s = s[start:end].strip()
141
+
142
+ # Extract JSON object
143
+ if '{' in s and '}' in s:
144
+ start_brace = s.find('{')
145
+ end_brace = s.rfind('}')
146
+ if start_brace < end_brace:
147
+ s = s[start_brace:end_brace + 1]
148
+
149
+ data = json.loads(s)
150
+ if not isinstance(data, dict):
151
+ return None
152
+
153
+ # Convert all values to float
154
+ result = {}
155
+ for key, val in data.items():
156
+ if isinstance(val, (int, float)):
157
+ result[str(key).upper()] = float(val)
158
+ elif isinstance(val, str):
159
+ try:
160
+ result[str(key).upper()] = float(val)
161
+ except ValueError:
162
+ pass
163
+
164
+ return result if result else None
165
+
166
+ except (json.JSONDecodeError, ValueError):
167
+ return None
168
+
169
+ @staticmethod
170
+ def pairwise_winner(response: str) -> Optional[str]:
171
+ """
172
+ Parse pairwise comparison: "A", "B", or "tie".
173
+ Returns "A", "B", "tie", or None if unparseable.
174
+ """
175
+ if not response:
176
+ return None
177
+
178
+ text = response.strip().upper()
179
+
180
+ # Exact matches
181
+ if text == "A":
182
+ return "A"
183
+ if text == "B":
184
+ return "B"
185
+ if text == "TIE" or text == "TIED":
186
+ return "tie"
187
+
188
+ # Check for tie first (more specific)
189
+ if "TIE" in text or "TIED" in text or "DRAW" in text or "EQUAL" in text:
190
+ return "tie"
191
+
192
+ # Look for explicit answer patterns like "Answer: A", "Winner: B", "A is better", etc.
193
+ # Match word boundaries to avoid false positives
194
+ # Pattern to find answer declarations
195
+ answer_patterns = [
196
+ r'\bANSWER\s*:?\s*([AB])\b',
197
+ r'\bWINNER\s*:?\s*([AB])\b',
198
+ r'\bCHOOSE\s*:?\s*([AB])\b',
199
+ r'\bSELECT\s*:?\s*([AB])\b',
200
+ r'\bRESPONSE\s*:?\s*([AB])\b',
201
+ r'\bOPTION\s*:?\s*([AB])\b',
202
+ r'^\s*([AB])\s*$', # Just "A" or "B" alone
203
+ r'\b([AB])\s+IS\s+BETTER\b',
204
+ r'\bBETTER\s*:?\s*([AB])\b',
205
+ ]
206
+
207
+ for pattern in answer_patterns:
208
+ match = re.search(pattern, text)
209
+ if match:
210
+ return match.group(1)
211
+
212
+ # Fallback: simple presence check (only if one appears more prominently)
213
+ # Count standalone occurrences
214
+ a_count = len(re.findall(r'\bA\b', text))
215
+ b_count = len(re.findall(r'\bB\b', text))
216
+
217
+ # If one clearly dominates, use it
218
+ if a_count > b_count and b_count == 0:
219
+ return "A"
220
+ if b_count > a_count and a_count == 0:
221
+ return "B"
222
+
223
+ # Last resort: check if only one appears at all
224
+ if "A" in text and "B" not in text:
225
+ return "A"
226
+ if "B" in text and "A" not in text:
227
+ return "B"
228
+
229
+ return None
230
+
231
+
232
+ def _format_model_outputs(model_outputs: List[str]) -> str:
233
+ """
234
+ Format model_outputs as labeled blocks:
235
+ A)
236
+ <output 0>
237
+
238
+ B)
239
+ <output 1>
240
+ """
241
+ labels = [chr(65 + i) for i in range(len(model_outputs))] # A, B, C, ...
242
+ blocks = []
243
+ for label, output in zip(labels, model_outputs):
244
+ blocks.append(f"{label})\n{output}")
245
+ return "\n\n".join(blocks)
246
+
247
+
248
+ def _labels_to_indices(labels: List[str], num_outputs: int) -> List[int]:
249
+ """
250
+ Convert letter labels ["A", "C", "B"] to indices [0, 2, 1].
251
+ """
252
+ indices = []
253
+ for label in labels:
254
+ if len(label) != 1 or not label.isalpha():
255
+ continue
256
+ idx = ord(label.upper()) - 65 # A=0, B=1, etc.
257
+ if 0 <= idx < num_outputs:
258
+ indices.append(idx)
259
+ return indices
260
+
261
+
262
+ def _scores_to_ranking(scores: Dict[str, float], num_outputs: int) -> List[int]:
263
+ """
264
+ Convert score dict {"A": 9, "B": 7, "C": 8} to ranking [0, 2, 1] (descending).
265
+ """
266
+ # Normalize keys to uppercase letters
267
+ normalized = {}
268
+ for k, v in scores.items():
269
+ label = str(k).upper()
270
+ if len(label) == 1 and label.isalpha():
271
+ idx = ord(label) - 65
272
+ if 0 <= idx < num_outputs:
273
+ normalized[idx] = float(v)
274
+
275
+ # Sort by score descending
276
+ sorted_indices = sorted(normalized.keys(), key=lambda i: normalized[i], reverse=True)
277
+ return sorted_indices
278
+
279
+
280
+ def _single_shot_rank(
281
+ judge,
282
+ input_text: str,
283
+ model_outputs: List[str],
284
+ ground_truth: Optional[str],
285
+ output_parser: Callable,
286
+ custom_template: Optional[str],
287
+ use_fully_custom_prompt: bool,
288
+ max_tokens: int,
289
+ ) -> Dict[str, Any]:
290
+ """
291
+ Execute single-shot ranking where judge sees all model_outputs at once.
292
+
293
+ Returns:
294
+ {
295
+ "ranking": [0, 2, 1], # Indices in rank order
296
+ "labels": ["A", "C", "B"], # Letter labels in rank order
297
+ "scores": {"A": 9.2, "B": 7.1, "C": 8.5} or None,
298
+ "raw_votes": [...], # Individual judge outputs
299
+ }
300
+ """
301
+ num_outputs = len(model_outputs)
302
+ formatted_outputs = _format_model_outputs(model_outputs)
303
+
304
+ # Build prompt
305
+ if use_fully_custom_prompt:
306
+ if custom_template is None:
307
+ raise ValueError("use_fully_custom_prompt=True requires custom_template")
308
+ prompt = custom_template
309
+ elif custom_template:
310
+ # Replace placeholders in custom template
311
+ prompt = custom_template
312
+ prompt = prompt.replace("{input_block}", input_text or "")
313
+ prompt = prompt.replace("{model_outputs}", formatted_outputs)
314
+ if ground_truth:
315
+ prompt = prompt.replace("{ground_truth}", ground_truth)
316
+ # Handle optional ground_truth_section placeholder
317
+ prompt = prompt.replace("{ground_truth_section}", "")
318
+ else:
319
+ # Use default template
320
+ if ground_truth:
321
+ template = DEFAULT_SINGLE_SHOT_WITH_GT_TEMPLATE
322
+ prompt = template.replace("{input_block}", input_text or "")
323
+ prompt = prompt.replace("{model_outputs}", formatted_outputs)
324
+ prompt = prompt.replace("{ground_truth}", ground_truth)
325
+ prompt = prompt.replace("{ground_truth_section}", "")
326
+ else:
327
+ template = DEFAULT_SINGLE_SHOT_TEMPLATE
328
+ prompt = template.replace("{input_block}", input_text or "")
329
+ prompt = prompt.replace("{model_outputs}", formatted_outputs)
330
+ prompt = prompt.replace("{ground_truth_section}", "")
331
+
332
+ # Use judge's internal voting mechanism
333
+ # We'll call judge with the constructed prompt
334
+ if use_fully_custom_prompt:
335
+ judge_result = judge.judge(prompt=prompt, max_tokens=max_tokens)
336
+ else:
337
+ # Pass empty values for standard params since we built prompt manually
338
+ # This is a bit hacky but works with current judge implementation
339
+ old_template = judge.template
340
+ judge.template = "{input_block}"
341
+ judge_result = judge.judge(input=prompt, model_output="", ground_truth="", max_tokens=max_tokens)
342
+ judge.template = old_template
343
+
344
+ # Parse each vote
345
+ raw_votes = judge_result.get("votes", [])
346
+ parsed_votes = []
347
+
348
+ for vote in raw_votes:
349
+ model = vote.get("model")
350
+ # Get raw response - need to call parser on it
351
+ # Since we used judge.judge(), the result is already in vote["result"]
352
+ # But we need the raw string to parse. Let's re-call the models manually.
353
+ pass
354
+
355
+ # Actually, let's refactor: we need direct model access for ranking
356
+ # The judge.judge() flow doesn't give us raw strings back
357
+ # Let's call models directly
358
+
359
+ votes = []
360
+ for model_name in judge.models:
361
+ try:
362
+ api_base, headers, temperature = judge._resolve_per_model(model_name)
363
+ raw_response = judge._attempt_completion(
364
+ model=model_name,
365
+ api_base=api_base,
366
+ headers=headers,
367
+ prompt=prompt,
368
+ temperature=temperature,
369
+ max_tokens=max_tokens,
370
+ )
371
+
372
+ parsed = output_parser(raw_response)
373
+ votes.append({
374
+ "model": model_name,
375
+ "raw_response": raw_response,
376
+ "parsed": parsed,
377
+ })
378
+
379
+ if judge.verbose:
380
+ print(f"Model {model_name} ranking: {parsed}", flush=True)
381
+
382
+ except Exception as e:
383
+ if judge.verbose:
384
+ print(f"Model {model_name} failed: {e}", flush=True)
385
+ votes.append({
386
+ "model": model_name,
387
+ "error": str(e),
388
+ "parsed": None,
389
+ })
390
+
391
+ # Handle custom generation functions
392
+ for idx, custom_fn in enumerate(judge.custom_generation_fns):
393
+ try:
394
+ raw_response = custom_fn(prompt)
395
+ parsed = output_parser(raw_response)
396
+ votes.append({
397
+ "model": f"custom_fn_{idx}",
398
+ "raw_response": raw_response,
399
+ "parsed": parsed,
400
+ })
401
+ except Exception as e:
402
+ if judge.verbose:
403
+ print(f"Custom function {idx} failed: {e}", flush=True)
404
+ votes.append({
405
+ "model": f"custom_fn_{idx}",
406
+ "error": str(e),
407
+ "parsed": None,
408
+ })
409
+
410
+ # Aggregate votes based on mode
411
+ mode = judge.mode
412
+ valid_votes = [v for v in votes if v.get("parsed") is not None]
413
+
414
+ if not valid_votes:
415
+ raise ValueError("No valid ranking votes received")
416
+
417
+ # Determine output type (ordering vs scores)
418
+ first_parsed = valid_votes[0]["parsed"]
419
+
420
+ if isinstance(first_parsed, list):
421
+ # Ordering format: ["A", "C", "B"]
422
+ if mode == "single":
423
+ final_labels = valid_votes[0]["parsed"]
424
+ elif mode == "majority":
425
+ # Use first valid vote for ordering (majority doesn't make sense for orderings)
426
+ # Could implement Borda count or similar, but for simplicity use first
427
+ final_labels = valid_votes[0]["parsed"]
428
+ else:
429
+ final_labels = valid_votes[0]["parsed"]
430
+
431
+ final_ranking = _labels_to_indices(final_labels, num_outputs)
432
+ final_scores = None
433
+
434
+ elif isinstance(first_parsed, dict):
435
+ # Score format: {"A": 9.2, "B": 7.1, "C": 8.5}
436
+ if mode == "single":
437
+ final_scores = valid_votes[0]["parsed"]
438
+ elif mode in ("majority", "average"):
439
+ # Average scores across judges
440
+ all_scores = {}
441
+ for vote in valid_votes:
442
+ scores = vote["parsed"]
443
+ if isinstance(scores, dict):
444
+ for label, score in scores.items():
445
+ label = str(label).upper()
446
+ if label not in all_scores:
447
+ all_scores[label] = []
448
+ all_scores[label].append(float(score))
449
+
450
+ final_scores = {k: sum(v) / len(v) for k, v in all_scores.items()}
451
+ else:
452
+ final_scores = valid_votes[0]["parsed"]
453
+
454
+ final_ranking = _scores_to_ranking(final_scores, num_outputs)
455
+ final_labels = [chr(65 + i) for i in final_ranking]
456
+
457
+ else:
458
+ raise ValueError(f"Unknown parsed format: {type(first_parsed)}")
459
+
460
+ return {
461
+ "ranking": final_ranking,
462
+ "labels": final_labels if isinstance(first_parsed, list) else [chr(65 + i) for i in final_ranking],
463
+ "scores": final_scores,
464
+ "raw_votes": votes,
465
+ "mode": mode,
466
+ }
467
+
468
+
469
+ def _round_robin_rank(
470
+ judge,
471
+ input_text: str,
472
+ model_outputs: List[str],
473
+ ground_truth: Optional[str],
474
+ output_parser: Callable,
475
+ custom_template: Optional[str],
476
+ use_fully_custom_prompt: bool,
477
+ max_tokens: int,
478
+ ) -> Dict[str, Any]:
479
+ """
480
+ Execute round-robin ranking where judge compares all pairs.
481
+
482
+ For N model_outputs, performs N(N-1)/2 pairwise comparisons.
483
+ Aggregates results into final ranking based on win counts.
484
+
485
+ Returns:
486
+ {
487
+ "ranking": [2, 0, 1], # Indices sorted by wins (descending)
488
+ "wins": {0: 1, 1: 0, 2: 2}, # Win count per output
489
+ "pairwise_results": {(0,1): 0, (0,2): 2, (1,2): 2}, # Winner per pair
490
+ "raw_votes": {...}, # All pairwise judge votes
491
+ }
492
+ """
493
+ n = len(model_outputs)
494
+
495
+ # Initialize tracking
496
+ wins = {i: 0 for i in range(n)}
497
+ pairwise_results = {}
498
+ all_votes = {}
499
+
500
+ # Generate all unique pairs
501
+ pairs = list(combinations(range(n), 2))
502
+
503
+ for i, j in pairs:
504
+ # Build pairwise prompt
505
+ if use_fully_custom_prompt:
506
+ if custom_template is None:
507
+ raise ValueError("use_fully_custom_prompt=True requires custom_template")
508
+ prompt = custom_template
509
+ elif custom_template:
510
+ # Replace placeholders in custom template
511
+ prompt = custom_template
512
+ prompt = prompt.replace("{input_block}", input_text or "")
513
+ prompt = prompt.replace("{option_a}", model_outputs[i])
514
+ prompt = prompt.replace("{option_b}", model_outputs[j])
515
+ if ground_truth:
516
+ prompt = prompt.replace("{ground_truth}", ground_truth)
517
+ else:
518
+ # Use default template
519
+ if ground_truth:
520
+ template = DEFAULT_ROUND_ROBIN_WITH_GT_TEMPLATE
521
+ prompt = template.replace("{input_block}", input_text or "")
522
+ prompt = prompt.replace("{option_a}", model_outputs[i])
523
+ prompt = prompt.replace("{option_b}", model_outputs[j])
524
+ prompt = prompt.replace("{ground_truth}", ground_truth)
525
+ else:
526
+ template = DEFAULT_ROUND_ROBIN_TEMPLATE
527
+ prompt = template.replace("{input_block}", input_text or "")
528
+ prompt = prompt.replace("{option_a}", model_outputs[i])
529
+ prompt = prompt.replace("{option_b}", model_outputs[j])
530
+
531
+ # Collect votes from all judges
532
+ votes = []
533
+
534
+ for model_name in judge.models:
535
+ try:
536
+ api_base, headers, temperature = judge._resolve_per_model(model_name)
537
+ raw_response = judge._attempt_completion(
538
+ model=model_name,
539
+ api_base=api_base,
540
+ headers=headers,
541
+ prompt=prompt,
542
+ temperature=temperature,
543
+ max_tokens=max_tokens,
544
+ )
545
+
546
+ parsed = output_parser(raw_response)
547
+ votes.append({
548
+ "model": model_name,
549
+ "raw_response": raw_response,
550
+ "parsed": parsed,
551
+ })
552
+
553
+ if judge.verbose:
554
+ print(f"Pair ({i},{j}): Model {model_name} raw response: {repr(raw_response)}", flush=True)
555
+ print(f"Pair ({i},{j}): Model {model_name} voted: {parsed}", flush=True)
556
+
557
+ except Exception as e:
558
+ if judge.verbose:
559
+ print(f"Pair ({i},{j}): Model {model_name} failed: {e}", flush=True)
560
+ votes.append({
561
+ "model": model_name,
562
+ "error": str(e),
563
+ "parsed": None,
564
+ })
565
+
566
+ # Handle custom generation functions
567
+ for idx, custom_fn in enumerate(judge.custom_generation_fns):
568
+ try:
569
+ raw_response = custom_fn(prompt)
570
+ parsed = output_parser(raw_response)
571
+ votes.append({
572
+ "model": f"custom_fn_{idx}",
573
+ "raw_response": raw_response,
574
+ "parsed": parsed,
575
+ })
576
+ except Exception as e:
577
+ if judge.verbose:
578
+ print(f"Pair ({i},{j}): Custom function {idx} failed: {e}", flush=True)
579
+ votes.append({
580
+ "model": f"custom_fn_{idx}",
581
+ "error": str(e),
582
+ "parsed": None,
583
+ })
584
+
585
+ # Aggregate votes for this pair
586
+ valid_votes = [v for v in votes if v.get("parsed") is not None]
587
+
588
+ if not valid_votes:
589
+ # No valid votes, mark as tie
590
+ pairwise_results[(i, j)] = "tie"
591
+ all_votes[(i, j)] = votes
592
+ continue
593
+
594
+ mode = judge.mode
595
+
596
+ if mode == "single":
597
+ winner = valid_votes[0]["parsed"]
598
+ elif mode in ("majority", "all"):
599
+ # Count votes for A, B, tie
600
+ vote_counts = {"A": 0, "B": 0, "tie": 0}
601
+ for vote in valid_votes:
602
+ result = vote["parsed"]
603
+ if result in vote_counts:
604
+ vote_counts[result] += 1
605
+
606
+ # Determine winner
607
+ if mode == "all":
608
+ # All judges must agree
609
+ if vote_counts["A"] == len(valid_votes):
610
+ winner = "A"
611
+ elif vote_counts["B"] == len(valid_votes):
612
+ winner = "B"
613
+ else:
614
+ winner = "tie"
615
+ else: # majority
616
+ max_votes = max(vote_counts.values())
617
+ # Check for tie in voting
618
+ max_keys = [k for k, v in vote_counts.items() if v == max_votes]
619
+ if len(max_keys) > 1:
620
+ winner = "tie"
621
+ else:
622
+ winner = max_keys[0]
623
+ else:
624
+ winner = valid_votes[0]["parsed"]
625
+
626
+ # Record result
627
+ if winner == "A":
628
+ pairwise_results[(i, j)] = i
629
+ wins[i] += 1
630
+ elif winner == "B":
631
+ pairwise_results[(i, j)] = j
632
+ wins[j] += 1
633
+ else: # tie
634
+ pairwise_results[(i, j)] = "tie"
635
+
636
+ all_votes[(i, j)] = votes
637
+
638
+ # Build final ranking from win counts
639
+ ranking = sorted(range(n), key=lambda idx: wins[idx], reverse=True)
640
+
641
+ return {
642
+ "ranking": ranking,
643
+ "wins": wins,
644
+ "pairwise_results": pairwise_results,
645
+ "raw_votes": all_votes,
646
+ "mode": judge.mode,
647
+ }
648
+
649
+
650
+ def rank(
651
+ judge,
652
+ input: str,
653
+ model_outputs: List[str],
654
+ ground_truth: Optional[str] = None,
655
+ ranking_mode: str = "single_shot",
656
+ output_parser: Optional[Union[str, Callable]] = None,
657
+ custom_template: Optional[str] = None,
658
+ use_fully_custom_prompt: bool = False,
659
+ max_tokens: int = 10000,
660
+ ) -> Dict[str, Any]:
661
+ """
662
+ Rank multiple model outputs using an LLM judge.
663
+
664
+ Args:
665
+ judge: LLMAsAJudge instance configured with models
666
+ input: Original prompt or task description
667
+ model_outputs: List of model outputs to rank
668
+ ground_truth: Optional reference answer. If provided, ranking will be based on
669
+ how well model_outputs match the ground truth. If None, model_outputs are ranked
670
+ purely by quality/preference.
671
+ ranking_mode: "single_shot" or "round_robin"
672
+ output_parser: Parser for judge output. Can be:
673
+ - String: "letter_ordering", "json_scores", "pairwise_winner"
674
+ - Callable: Custom parser function
675
+ - None: Auto-selects based on ranking_mode (defaults to "letter_ordering"
676
+ for single_shot, "pairwise_winner" for round_robin)
677
+ For single_shot: should return List[str] (ordering) or Dict[str, float] (scores)
678
+ For round_robin: should return "A", "B", or "tie"
679
+ custom_template: Optional prompt template with placeholders. If None, uses sensible
680
+ defaults that adapt based on whether ground_truth is provided.
681
+ Available placeholders:
682
+ - single_shot: {input_block}, {model_outputs}, {ground_truth}
683
+ - round_robin: {input_block}, {option_a}, {option_b}, {ground_truth}
684
+ use_fully_custom_prompt: If True, custom_template is used as-is without substitution
685
+ max_tokens: Maximum tokens for judge response
686
+
687
+ Returns:
688
+ Dict with ranking results and metadata. Format depends on ranking_mode:
689
+
690
+ single_shot:
691
+ {
692
+ "ranking": [0, 2, 1], # Indices in rank order
693
+ "labels": ["A", "C", "B"], # Letter labels in rank order
694
+ "scores": {...} or None, # Scores if parser returns dict
695
+ "raw_votes": [...], # Individual judge outputs
696
+ "mode": str, # Aggregation mode used
697
+ }
698
+
699
+ round_robin:
700
+ {
701
+ "ranking": [2, 0, 1], # Indices sorted by wins
702
+ "wins": {0: 1, 1: 0, 2: 2}, # Win count per output
703
+ "pairwise_results": {...}, # Winner per pair
704
+ "raw_votes": {...}, # All pairwise judge votes
705
+ "mode": str, # Aggregation mode used
706
+ }
707
+
708
+ Example:
709
+ >>> from llmasajudge import LLMAsAJudge
710
+ >>>
711
+ >>> judge = LLMAsAJudge(models=["openai/gpt-4o-mini"])
712
+ >>>
713
+ >>> # Using string parser name
714
+ >>> result = judge.rank(
715
+ ... input="Explain recursion",
716
+ ... model_outputs=["Answer 1", "Answer 2", "Answer 3"],
717
+ ... ranking_mode="single_shot",
718
+ ... output_parser="letter_ordering",
719
+ ... custom_template="Rank from best to worst:\\n{model_outputs}\\nReturn: A > B > C"
720
+ ... )
721
+ >>> print(result["ranking"]) # [0, 2, 1]
722
+ """
723
+ if not model_outputs:
724
+ raise ValueError("Must provide at least one model output")
725
+
726
+ if ranking_mode not in ("single_shot", "round_robin"):
727
+ raise ValueError("ranking_mode must be 'single_shot' or 'round_robin'")
728
+
729
+ # Resolve output_parser (string or callable)
730
+ if output_parser is None:
731
+ # Auto-select default parser based on mode
732
+ if ranking_mode == "single_shot":
733
+ output_parser = RankingParsers.letter_ordering
734
+ else: # round_robin
735
+ output_parser = RankingParsers.pairwise_winner
736
+ elif isinstance(output_parser, str):
737
+ # Map string to parser function
738
+ parser_map = {
739
+ 'letter_ordering': RankingParsers.letter_ordering,
740
+ 'json_scores': RankingParsers.json_scores,
741
+ 'pairwise_winner': RankingParsers.pairwise_winner,
742
+ }
743
+ if output_parser not in parser_map:
744
+ raise ValueError(
745
+ f"Unknown parser '{output_parser}'. "
746
+ f"Available: {list(parser_map.keys())}"
747
+ )
748
+ output_parser = parser_map[output_parser]
749
+ # else: assume it's a callable, use as-is
750
+
751
+ if ranking_mode == "single_shot":
752
+ return _single_shot_rank(
753
+ judge=judge,
754
+ input_text=input,
755
+ model_outputs=model_outputs,
756
+ ground_truth=ground_truth,
757
+ output_parser=output_parser,
758
+ custom_template=custom_template,
759
+ use_fully_custom_prompt=use_fully_custom_prompt,
760
+ max_tokens=max_tokens,
761
+ )
762
+ else: # round_robin
763
+ return _round_robin_rank(
764
+ judge=judge,
765
+ input_text=input,
766
+ model_outputs=model_outputs,
767
+ ground_truth=ground_truth,
768
+ output_parser=output_parser,
769
+ custom_template=custom_template,
770
+ use_fully_custom_prompt=use_fully_custom_prompt,
771
+ max_tokens=max_tokens,
772
+ )
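
The stock parsers above are small enough to exercise directly. The assertions below are a quick sanity sketch of their behaviour as written in this diff, assuming llmasajudge 0.1.18 is installed so that llmasajudge.ranker is importable:

    from llmasajudge.ranker import RankingParsers

    # letter_ordering: "A > C > B" -> ["A", "C", "B"]
    assert RankingParsers.letter_ordering("A > C > B") == ["A", "C", "B"]

    # json_scores: tolerates markdown fences and upper-cases the labels
    assert RankingParsers.json_scores('```json\n{"a": 9, "b": 7.5}\n```') == {"A": 9.0, "B": 7.5}

    # pairwise_winner: exact letters, "Answer: X" style declarations, and tie words
    assert RankingParsers.pairwise_winner("Answer: B") == "B"
    assert RankingParsers.pairwise_winner("They are equal") == "tie"
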
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: llmasajudge
3
- Version: 0.1.14
3
+ Version: 0.1.18
4
4
  Summary: LLM Judge: simple right/wrong voting across models
5
5
  Author-email: Brett Young <byyoung3@gmail.com>
6
6
  Project-URL: Homepage, https://example.com
@@ -0,0 +1,6 @@
1
+ llmasajudge/__init__.py,sha256=TGVADN77vQtKy3JBGLe9F578jVCZ_Vz055P1CIk2vIQ,65215
2
+ llmasajudge/ranker.py,sha256=2Nr-J1DNPYVIja2Fl-ksuvOnJPEwYmfylDkdlYqCWtE,26829
3
+ llmasajudge-0.1.18.dist-info/METADATA,sha256=lV63AvuLpdzAjhVgN5PQr-2fiGn84QQizBMPLDYWsV0,515
4
+ llmasajudge-0.1.18.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
5
+ llmasajudge-0.1.18.dist-info/top_level.txt,sha256=rRaIpM1llpEqahR9flT3RjpZHi2o16iOgnGYJ8cO4_0,12
6
+ llmasajudge-0.1.18.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,5 +0,0 @@
1
- llmasajudge/__init__.py,sha256=OKaafNDE_1vOIPZshLrs37kGvSq5QXSHIWA9AVmeVTU,61627
2
- llmasajudge-0.1.14.dist-info/METADATA,sha256=xsjEyt76cmEvBd9Vn99ZevnhgRJ4HpBogHoysvZGCas,515
3
- llmasajudge-0.1.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
4
- llmasajudge-0.1.14.dist-info/top_level.txt,sha256=rRaIpM1llpEqahR9flT3RjpZHi2o16iOgnGYJ8cO4_0,12
5
- llmasajudge-0.1.14.dist-info/RECORD,,