vllm_judge-0.1.0-py3-none-any.whl

vllm_judge/prompts.py ADDED
@@ -0,0 +1,175 @@
+ from typing import List, Dict, Union, Optional, Tuple, Any
+
+
+ class PromptBuilder:
+     """Builds prompts for evaluation requests."""
+
+     @staticmethod
+     def build_messages(
+         response: Union[str, Dict[str, str]],
+         criteria: str,
+         rubric: Optional[Union[str, Dict[Union[int, float], str]]] = None,
+         scale: Optional[Tuple[int, int]] = None,
+         examples: Optional[List[Dict[str, Any]]] = None,
+         system_prompt: Optional[str] = None,
+         context: Optional[str] = None,
+         **kwargs
+     ) -> List[Dict[str, str]]:
+         """
+         Build chat messages for evaluation.
+
+         Args:
+             response: Single response or dict with 'a' and 'b' for comparison
+             criteria: What to evaluate for
+             rubric: Evaluation guide
+             scale: Numeric scale (min, max)
+             examples: Few-shot examples
+             system_prompt: Custom system message
+             context: Additional context
+             **kwargs: Additional parameters
+
+         Returns:
+             List of chat messages
+         """
+         # Detect evaluation type
+         is_comparison = isinstance(response, dict) and "a" in response and "b" in response
+
+         # System message
+         if not system_prompt:
+             # TODO: Add more detailed system prompts
+             system_prompt = "You are an impartial judge and expert evaluator "
+             if is_comparison:
+                 system_prompt += "comparing responses objectively."
+             else:
+                 system_prompt += "providing objective assessments."
+
+         # Output format instructions
+         system_prompt += "\nYou must respond in JSON format:\n"
+         system_prompt += """{
+     "decision": <your judgment - string|number|boolean>,
+     "reasoning": "<concise explanation of your judgment>",
+     "score": <numeric score if requested, otherwise null>
+ }"""
+         system_prompt += "\nDo not include any text in your response except for the JSON object."
+
+         # Build user message
+         user_content = PromptBuilder._build_user_prompt(
+             response=response,
+             criteria=criteria,
+             rubric=rubric,
+             scale=scale,
+             examples=examples,
+             is_comparison=is_comparison,
+             context=context,
+             **kwargs
+         )
+
+         return [
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": user_content}
+         ]
+
+     @staticmethod
+     def _build_user_prompt(
+         response: Union[str, Dict[str, str]],
+         criteria: str,
+         rubric: Optional[Union[str, Dict[Union[int, float], str]]],
+         scale: Optional[Tuple[int, int]],
+         examples: Optional[List[Dict[str, Any]]],
+         is_comparison: bool,
+         context: Optional[str] = None,
+         **kwargs
+     ) -> str:
+         """Build the user message content."""
+         parts = []
+
+         # Task description
+         if is_comparison:
+             parts.append(f"Compare these two responses based on: {criteria}")
+             if context:
+                 parts.append(f"\nContext: {context}")
+             parts.append(f"\nResponse A:\n{response['a']}")
+             parts.append(f"\nResponse B:\n{response['b']}")
+         else:
+             parts.append(f"Evaluate the following response based on: {criteria}")
+             if context:
+                 parts.append(f"\nContext: {context}")
+             parts.append(f"\nResponse to evaluate:\n{response}")
+
+         # Add scale and rubric
+         if scale:
+             parts.append(f"\nProvide a score from {scale[0]} to {scale[1]}")
+
+             if isinstance(rubric, dict):
+                 parts.append("\nScoring guide:")
+                 # Sort by score in descending order
+                 sorted_items = sorted(rubric.items(), key=lambda x: float(x[0]), reverse=True)
+                 for score, description in sorted_items:
+                     parts.append(f"- {score}: {description}")
+             elif rubric:
+                 parts.append(f"\nEvaluation guide: {rubric}")
+         elif rubric:
+             parts.append(f"\nEvaluation guide: {rubric}")
+
+         # Add examples if provided
+         if examples:
+             parts.append("\nExample evaluations:")
+             for i, ex in enumerate(examples, 1):
+                 parts.append(f"\nExample {i}:")
+
+                 # Handle different example formats
+                 if "response" in ex:
+                     parts.append(f"Response: {ex['response']}")
+                 elif "text" in ex:
+                     parts.append(f"Text: {ex['text']}")
+
+                 if "decision" in ex:
+                     parts.append(f"Decision: {ex['decision']}")
+                 if "score" in ex:
+                     parts.append(f"Score: {ex['score']}")
+
+                 if "reasoning" in ex:
+                     parts.append(f"Reasoning: {ex['reasoning']}")
+
+         # Add any additional instructions
+         if kwargs.get("additional_instructions"):
+             parts.append(f"\nAdditional instructions: {kwargs['additional_instructions']}")
+
+         # Output format instructions
+         parts.append("\nYou must respond in JSON format:")
+         parts.append("""{
+     "decision": <your judgment - string|number|boolean>,
+     "reasoning": "<concise explanation of your judgment>",
+     "score": <numeric score if requested, otherwise null>
+ }""")
+
+         return "\n".join(parts)
+
+     @staticmethod
+     def format_messages_as_text(messages: List[Dict[str, str]]) -> str:
+         """
+         Format chat messages as plain text for completion API.
+
+         Args:
+             messages: List of chat messages
+
+         Returns:
+             Formatted text prompt
+         """
+         parts = []
+
+         for message in messages:
+             role = message["role"]
+             content = message["content"]
+
+             if role == "system":
+                 parts.append(f"System: {content}")
+             elif role == "user":
+                 parts.append(f"\nUser: {content}")
+             elif role == "assistant":
+                 parts.append(f"\nAssistant: {content}")
+
+         # Add a prompt for the assistant to respond
+         parts.append("\nAssistant:")
+
+         return "\n".join(parts)
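For orientation, here is a minimal usage sketch of `PromptBuilder` as added above; the criteria, rubric, and response texts are illustrative values, not part of the package:

```python
from vllm_judge.prompts import PromptBuilder

# Single-response evaluation: a numeric scale plus a dict rubric keyed by score.
messages = PromptBuilder.build_messages(
    response="The Earth orbits around the Sun.",
    criteria="scientific accuracy",
    rubric={1: "factually wrong", 5: "fully accurate"},
    scale=(1, 5),
)
# `messages` is [{"role": "system", ...}, {"role": "user", ...}]; the system
# message pins the judge to the JSON schema {"decision", "reasoning", "score"}.

# Passing a dict with "a" and "b" keys switches to pairwise comparison mode.
comparison = PromptBuilder.build_messages(
    response={"a": "First answer...", "b": "Second answer..."},
    criteria="clarity and completeness",
)

# For completion-style (non-chat) endpoints, flatten the messages to one prompt.
print(PromptBuilder.format_messages_as_text(messages))
```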
vllm_judge/templating.py ADDED
@@ -0,0 +1,206 @@
+ import string
+ from typing import Dict, Any, List, Union, Set, Optional
+ from vllm_judge.models import TemplateEngine
+ from vllm_judge.exceptions import InvalidInputError
+
+
+ class TemplateProcessor:
+     """Template processing for dynamic prompts.
+     Handles template variable substitution."""
+
+     @staticmethod
+     def apply_template(
+         template: Optional[Union[str, Dict]],
+         template_vars: Dict[str, Any],
+         engine: TemplateEngine = TemplateEngine.FORMAT,
+         strict: bool = True
+     ) -> Optional[Union[str, Dict]]:
+         """
+         Apply template variables to a template string or dict.
+
+         Args:
+             template: Template string, dict, or None
+             template_vars: Variables to substitute
+             engine: Template engine to use
+             strict: If True, raise error for missing variables
+
+         Returns:
+             Processed template
+
+         Raises:
+             InvalidInputError: If required variables are missing
+         """
+         if isinstance(template, dict):
+             # Process dict values recursively
+             return {
+                 k: TemplateProcessor.apply_template(v, template_vars, engine, strict)
+                 for k, v in template.items()
+             }
+
+         if not isinstance(template, str):
+             return template
+
+         if engine == TemplateEngine.FORMAT:
+             return TemplateProcessor._apply_format_template(
+                 template, template_vars, strict
+             )
+         elif engine == TemplateEngine.JINJA2:
+             return TemplateProcessor._apply_jinja2_template(
+                 template, template_vars, strict
+             )
+
+     @staticmethod
+     def _apply_format_template(
+         template: str,
+         template_vars: Dict[str, Any],
+         strict: bool
+     ) -> str:
+         """Apply str.format() style template."""
+         try:
+             # First check for missing variables if strict
+             if strict:
+                 missing = TemplateProcessor.get_required_vars_format(template) - set(template_vars.keys())
+                 if missing:
+                     raise InvalidInputError(
+                         f"Missing required template variables: {', '.join(sorted(missing))}"
+                     )
+
+             return template.format(**template_vars)
+         except KeyError as e:
+             if strict:
+                 raise InvalidInputError(f"Missing template variable: {e}")
+             else:
+                 # Partial formatting - leave missing variables as-is
+                 return template.format_map(SafeDict(template_vars))
+
+     @staticmethod
+     def _apply_jinja2_template(
+         template: str,
+         template_vars: Dict[str, Any],
+         strict: bool
+     ) -> str:
+         """Apply Jinja2 template."""
+         try:
+             from jinja2 import Template, Environment, StrictUndefined, UndefinedError
+         except ImportError:
+             raise ImportError(
+                 "Jinja2 is required for jinja2 template engine. "
+                 "Install with: pip install vllm-judge[jinja2]"
+             )
+
+         try:
+             if strict:
+                 # Use StrictUndefined to catch missing variables
+                 env = Environment(undefined=StrictUndefined)
+                 jinja_template = env.from_string(template)
+             else:
+                 # Default behavior - missing variables render as empty
+                 jinja_template = Template(template)
+
+             return jinja_template.render(**template_vars)
+         except UndefinedError as e:
+             raise InvalidInputError(f"Missing template variable in Jinja2 template: {e}")
+
+     @staticmethod
+     def get_required_vars(
+         template: Union[str, Dict, None],
+         engine: TemplateEngine = TemplateEngine.FORMAT
+     ) -> Set[str]:
+         """
+         Extract required variables from a template.
+
+         Args:
+             template: Template to analyze
+             engine: Template engine being used
+
+         Returns:
+             Set of required variable names
+         """
+         if isinstance(template, dict):
+             # Collect from all dict values
+             all_vars = set()
+             for v in template.values():
+                 all_vars.update(TemplateProcessor.get_required_vars(v, engine))
+             return all_vars
+
+         if not isinstance(template, str):
+             return set()
+
+         if engine == TemplateEngine.FORMAT:
+             return TemplateProcessor.get_required_vars_format(template)
+         elif engine == TemplateEngine.JINJA2:
+             return TemplateProcessor.get_required_vars_jinja2(template)
+
+     @staticmethod
+     def get_required_vars_format(template: str) -> Set[str]:
+         """Extract variables from format string."""
+         formatter = string.Formatter()
+         variables = set()
+
+         try:
+             for _, field_name, _, _ in formatter.parse(template):
+                 if field_name:
+                     # Handle nested fields like {user.name}
+                     base_var = field_name.split('.')[0].split('[')[0]
+                     variables.add(base_var)
+         except Exception:
+             pass  # If parsing fails, return empty set
+
+         return variables
+
+     @staticmethod
+     def get_required_vars_jinja2(template: str) -> Set[str]:
+         """Extract variables from Jinja2 template."""
+         try:
+             from jinja2 import Environment, meta
+         except ImportError:
+             return set()  # Can't analyze without Jinja2
+
+         try:
+             env = Environment()
+             ast = env.parse(template)
+             return meta.find_undeclared_variables(ast)
+         except Exception:
+             return set()
+
+     @staticmethod
+     def validate_template_vars(
+         provided_vars: Dict[str, Any],
+         required_vars: List[str],
+         template_defaults: Optional[Dict[str, Any]] = None
+     ) -> Dict[str, Any]:
+         """
+         Validate and merge template variables.
+
+         Args:
+             provided_vars: User-provided variables
+             required_vars: Required variable names
+             template_defaults: Default values
+
+         Returns:
+             Merged template variables
+
+         Raises:
+             InvalidInputError: If required variables are missing
+         """
+         # Start with defaults
+         final_vars = dict(template_defaults or {})
+
+         # Override with provided vars
+         final_vars.update(provided_vars)
+
+         # Check required vars
+         missing = set(required_vars) - set(final_vars.keys())
+         if missing:
+             raise InvalidInputError(
+                 f"Missing required template variables: {', '.join(sorted(missing))}"
+             )
+
+         return final_vars
+
+
+ class SafeDict(dict):
+     """Dictionary that returns {key} for missing keys in format strings."""
+
+     def __missing__(self, key):
+         return f"{{{key}}}"
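Similarly, a short sketch of how `TemplateProcessor` behaves in strict versus non-strict mode; the template string and variables are illustrative:

```python
from vllm_judge.exceptions import InvalidInputError
from vllm_judge.models import TemplateEngine
from vllm_judge.templating import TemplateProcessor

template = "Evaluate this {doc_type} for {audience}"

# Introspect which variables the template requires.
required = TemplateProcessor.get_required_vars(template, engine=TemplateEngine.FORMAT)
assert required == {"doc_type", "audience"}

# Strict mode: a missing variable raises InvalidInputError up front.
try:
    TemplateProcessor.apply_template(template, {"doc_type": "essay"}, strict=True)
except InvalidInputError as e:
    print(e)  # Missing required template variables: audience

# Non-strict mode: SafeDict leaves unresolved placeholders intact,
# so the string can be formatted again in a later pass.
partial = TemplateProcessor.apply_template(template, {"doc_type": "essay"}, strict=False)
assert partial == "Evaluate this essay for {audience}"
```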
vllm_judge-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,124 @@
+ Metadata-Version: 2.4
+ Name: vllm_judge
+ Version: 0.1.0
+ Summary: LLM-as-a-Judge evaluations for vLLM hosted models
+ Author: TrustyAI team
+ Author-email: Sai Chandra Pandraju <saichandrapandraju@gmail.com>
+ Project-URL: Homepage, https://github.com/saichandrapandraju/vllm_judge
+ Project-URL: Repository, https://github.com/saichandrapandraju/vllm_judge
+ Project-URL: Issues, https://github.com/saichandrapandraju/vllm_judge/issues
+ Keywords: llm,evaluation,vllm,judge,ai,machine-learning,nlp,llm-evaluation,llm-as-judge
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: httpx>=0.24.0
+ Requires-Dist: pydantic>=2.0.0
+ Requires-Dist: tenacity>=8.0.0
+ Requires-Dist: click>=8.0.0
+ Provides-Extra: api
+ Requires-Dist: fastapi>=0.100.0; extra == "api"
+ Requires-Dist: uvicorn[standard]>=0.22.0; extra == "api"
+ Requires-Dist: websockets>=11.0; extra == "api"
+ Provides-Extra: jinja2
+ Requires-Dist: jinja2>=3.0.0; extra == "jinja2"
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+ Requires-Dist: black>=23.0.0; extra == "dev"
+ Requires-Dist: isort>=5.12.0; extra == "dev"
+ Requires-Dist: flake8>=6.0.0; extra == "dev"
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
+ Provides-Extra: test
+ Requires-Dist: pytest>=7.0.0; extra == "test"
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
+ Requires-Dist: pytest-cov>=4.0.0; extra == "test"
+ Requires-Dist: pytest-mock>=3.10.0; extra == "test"
+ Provides-Extra: docs
+ Requires-Dist: mkdocs>=1.5.0; extra == "docs"
+ Requires-Dist: mkdocs-material>=9.0.0; extra == "docs"
+ Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "docs"
+
+ # vLLM Judge
+
+ A lightweight library for LLM-as-a-Judge evaluations using vLLM-hosted models.
+
+ ## Features
+
+ - 🚀 **Simple Interface**: Single `evaluate()` method that adapts to any use case
+ - 🎯 **Pre-built Metrics**: 20+ ready-to-use evaluation metrics
+ - 🔧 **Template Support**: Dynamic evaluations with template variables
+ - ⚡ **High Performance**: Optimized for vLLM with automatic batching
+ - 🌐 **API Mode**: Run as a REST API service
+ - 🔄 **Async Native**: Built for high-throughput evaluations
+
+ ## Installation
+
+ ```bash
+ # Basic installation
+ pip install vllm_judge
+
+ # With API support
+ pip install vllm_judge[api]
+
+ # With Jinja2 template support
+ pip install vllm_judge[jinja2]
+
+ # Everything
+ pip install vllm_judge[api,jinja2]
+ ```
+
+ ## Quick Start
+
+ ```python
+ from vllm_judge import Judge
+
+ # Initialize with vLLM URL
+ judge = await Judge.from_url("http://localhost:8000")
+
+ # Simple evaluation
+ result = await judge.evaluate(
+     response="The Earth orbits around the Sun.",
+     criteria="scientific accuracy"
+ )
+ print(f"Decision: {result.decision}")
+ print(f"Reasoning: {result.reasoning}")
+
+ # Using pre-built metrics
+ from vllm_judge import CODE_QUALITY
+
+ result = await judge.evaluate(
+     response="def add(a, b): return a + b",
+     metric=CODE_QUALITY
+ )
+
+ # With template variables
+ result = await judge.evaluate(
+     response="Essay content here...",
+     criteria="Evaluate this {doc_type} for {audience}",
+     template_vars={
+         "doc_type": "essay",
+         "audience": "high school students"
+     }
+ )
+ ```
+
+ ## API Server
+
+ Run Judge as a REST API:
+
+ ```bash
+ vllm-judge serve --base-url http://localhost:8000 --port 9090 --host localhost
+ ```
+
+ Then use the HTTP API:
+
+ ```python
+ from vllm_judge.api import JudgeClient
+
+ client = JudgeClient("http://localhost:9090")
+ result = await client.evaluate(
+     response="Python is great!",
+     criteria="technical accuracy"
+ )
+ ```
+
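The README's Quick Start ties back to `prompts.py`: `PromptBuilder` pins the judge model to a single JSON object, `{"decision": ..., "reasoning": ..., "score": ...}`, which is what `result.decision` and `result.reasoning` above are parsed from. A hedged sketch of reading a numeric score back, assuming `evaluate()` forwards the same `scale` tuple that `PromptBuilder.build_messages` accepts and that the result object exposes a `score` field:

```python
# Assumption: `scale` is forwarded to PromptBuilder.build_messages, and the
# parsed JSON reply populates a numeric `score` field on the result object.
result = await judge.evaluate(
    response="The Earth orbits around the Sun.",
    criteria="scientific accuracy",
    scale=(1, 10),
)
print(result.score)
```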
vllm_judge-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,19 @@
+ vllm_judge/__init__.py,sha256=Sx6sERXfksr1eubHxXj_uTiVrXmHXINoY9-nP20EiSg,2363
+ vllm_judge/batch.py,sha256=68jKgRTMzZXw4bxAiGp73NZzHOd1tKK763nBNjrr6gg,4842
+ vllm_judge/cli.py,sha256=KQtUt_L4u5TPrS8xoyiKYt_hQ_FiHtGcrkecGEtktI8,10685
+ vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
+ vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
+ vllm_judge/judge.py,sha256=y2qp18PVtobAyxqI246tEsju82W-OuGG4zXfajTEW-E,14101
+ vllm_judge/metrics.py,sha256=QeGzaERvfRKQTt4JfquL1rW72GSkWdJ2_Nw_Hf0zqjY,15685
+ vllm_judge/models.py,sha256=fbEUFPsY3xhv54WueWqEKvAgIcWTm-JO42N2-6k5LeM,7417
+ vllm_judge/prompts.py,sha256=jAsBdshCCdgGF3UUAM0Wbb6MN1AB2jgHh1NmtXLbyrc,6345
+ vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
+ vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
+ vllm_judge/api/client.py,sha256=mcpdH-9ko6aEh_JAybpPPVhHqlO3l5K-lTujTlkTw8c,11302
+ vllm_judge/api/models.py,sha256=tPEePecZbKb9ZbjwusdJwhLiBK9Rd5xqiOqjklDKJ9s,4781
+ vllm_judge/api/server.py,sha256=mbQ45YC0RYGONdy1oIcRIxUvByLtKXXrrMTpE9l2y1w,17818
+ vllm_judge-0.1.0.dist-info/METADATA,sha256=W0_-H1J-KEDOzAV8ZNgM6z8gkKxodsebmH3lBVR2jU4,3572
+ vllm_judge-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ vllm_judge-0.1.0.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
+ vllm_judge-0.1.0.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
+ vllm_judge-0.1.0.dist-info/RECORD,,
vllm_judge-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
vllm_judge-0.1.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ vllm-judge = vllm_judge.cli:main
vllm_judge-0.1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ vllm_judge