vllm-judge 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vllm_judge/__init__.py CHANGED
@@ -5,7 +5,7 @@ A lightweight library for evaluating text responses using self-hosted language m
 via vLLM's OpenAI-compatible API.
 """
 
-__version__ = "0.1.1"
+__version__ = "0.1.3"
 
 from vllm_judge.judge import Judge
 from vllm_judge.models import (
@@ -13,7 +13,8 @@ from vllm_judge.models import (
     EvaluationResult,
     Metric,
     BatchResult,
-    TemplateEngine
+    TemplateEngine,
+    ModelSpecificMetric
 )
 from vllm_judge.templating import TemplateProcessor
 from vllm_judge.metrics import (
@@ -27,6 +28,7 @@ from vllm_judge.metrics import (
     # Safety metrics
     SAFETY,
     TOXICITY,
+    LLAMA_GUARD_3_SAFETY,
 
     # Code metrics
     CODE_QUALITY,
@@ -81,6 +83,7 @@ __all__ = [
     "BatchResult",
     "TemplateEngine",
     "TemplateProcessor",
+    "ModelSpecificMetric",
 
     # Metrics
     "HELPFULNESS",
@@ -90,6 +93,7 @@ __all__ = [
     "RELEVANCE",
     "SAFETY",
     "TOXICITY",
+    "LLAMA_GUARD_3_SAFETY",
     "CODE_QUALITY",
     "CODE_SECURITY",
     "CREATIVITY",
vllm_judge/api/client.py CHANGED
@@ -65,7 +65,7 @@ class JudgeClient:
 
     async def evaluate(
         self,
-        response: Union[str, Dict[str, str]],
+        content: Union[str, Dict[str, str]],
         criteria: str = None,
         rubric: Union[str, Dict[Union[int, float], str]] = None,
         scale: Optional[Tuple[int, int]] = None,
@@ -87,7 +87,7 @@ class JudgeClient:
             EvaluationResult
         """
         request = EvaluateRequest(
-            response=response,
+            response=content,
             criteria=criteria,
             rubric=rubric,
             scale=list(scale) if scale else None,
vllm_judge/cli.py CHANGED
@@ -75,7 +75,7 @@ def evaluate(
         # Use API client
         async with JudgeClient(api_url) as client:
             result = await client.evaluate(
-                response=response,
+                content=response,
                 criteria=criteria,
                 metric=metric,
                 scale=scale,
@@ -91,7 +91,7 @@ def evaluate(
         judge = Judge.from_url(base_url, model=model)
         async with judge:
             result = await judge.evaluate(
-                response=response,
+                content=response,
                 criteria=criteria,
                 metric=metric,
                 scale=scale,
vllm_judge/judge.py CHANGED
@@ -2,7 +2,7 @@ import json
 import re
 from typing import Union, Dict, List, Optional, Tuple, Any, Callable
 
-from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine
+from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine, ModelSpecificMetric
 from vllm_judge.client import VLLMClient
 from vllm_judge.prompts import PromptBuilder
 from vllm_judge.batch import BatchProcessor
@@ -14,6 +14,9 @@ from vllm_judge.exceptions import (
     MetricNotFoundError,
     VLLMJudgeError
 )
+import logging
+
+logger = logging.getLogger(__name__)
 
 
 class Judge:
@@ -60,7 +63,7 @@ class Judge:
 
     async def evaluate(
         self,
-        response: Union[str, Dict[str, str]],
+        content: Union[str, Dict[str, str]],
         criteria: str = None,
         rubric: Union[str, Dict[Union[int, float], str]] = None,
         scale: Optional[Tuple[int, int]] = None,
@@ -76,7 +79,7 @@ class Judge:
         Universal evaluation method that adapts to use case.
 
         Args:
-            response: String for single evaluation, dict {"a": ..., "b": ...} for comparison
+            content: String for single evaluation, dict {"a": ..., "b": ...} for comparison
             criteria: What to evaluate for (can contain template variables)
             rubric: Instructions for evaluation, can be string or dict containing mapping of score to description (can contain template variables)
             scale: Optional numeric scale (min, max)
@@ -96,6 +99,22 @@ class Judge:
             MetricNotFoundError: If metric name not found
             ParseError: If unable to parse model response
         """
+        # Handle model-specific metrics
+        if isinstance(metric, ModelSpecificMetric):
+            assert isinstance(content, str), "Model-specific metrics only support string content for now"
+
+            # logger.info(f"Evaluating model-specific metric {metric.name}.")
+            logger.info(f"We assume you're using {metric.model_pattern} type model. If not, please do not use this metric and use a normal metric instead.")
+            # Skip ALL our formatting
+            messages = [{"role": "user", "content": content}]
+
+            # vLLM applies model's chat template automatically
+            llm_response = await self._call_model(messages)
+
+            # Use metric's parser
+            return metric.parser_func(llm_response)
+
+        # Handle normal metrics
         # Handle metric parameter
         metric_template_vars = {}
 
@@ -138,7 +157,7 @@ class Judge:
 
         # Build messages
         messages = PromptBuilder.build_messages(
-            response=response,
+            response=content,
             criteria=criteria,
             rubric=rubric,
             scale=scale,
@@ -149,14 +168,7 @@ class Judge:
         )
 
         # Get LLM response
-        try:
-            if self.config.use_chat_api:
-                llm_response = await self.client.chat_completion(messages)
-            else:
-                prompt = PromptBuilder.format_messages_as_text(messages)
-                llm_response = await self.client.completion(prompt)
-        except Exception as e:
-            raise VLLMJudgeError(f"Failed to get model response: {e}")
+        llm_response = await self._call_model(messages)
 
         # Parse response
         result = self._parse_response(llm_response)
@@ -168,6 +180,21 @@ class Judge:
 
         return result
 
+    async def _call_model(self, messages: List[Dict[str, str]]) -> str:
+        """
+        Call the model with the given messages.
+        """
+        try:
+            if self.config.use_chat_api:
+                llm_response = await self.client.chat_completion(messages)
+            else:
+                prompt = PromptBuilder.format_messages_as_text(messages)
+                llm_response = await self.client.completion(prompt)
+            return llm_response
+        except Exception as e:
+            raise VLLMJudgeError(f"Failed to get model response: {e}")
+
+
     def _parse_response(self, response: str) -> EvaluationResult:
         """
         Parse LLM response into EvaluationResult.
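With this change, `evaluate()` short-circuits any `ModelSpecificMetric` before prompt building: the raw string is sent as a single user message (so vLLM applies the model's own chat template) and the metric's `parser_func` converts the raw reply into an `EvaluationResult`. A minimal usage sketch of that path, assuming a vLLM endpoint serving a Llama Guard 3 model (the URL is a placeholder):

```python
from vllm_judge import Judge, LLAMA_GUARD_3_SAFETY

async def moderate(text: str):
    # Placeholder URL; point this at a vLLM server hosting Llama Guard 3.
    judge = Judge.from_url("http://localhost:8000")
    async with judge:
        # Bypasses PromptBuilder entirely: `text` goes out as one user message
        # and parse_llama_guard_3 interprets the model's "safe"/"unsafe" reply.
        return await judge.evaluate(content=text, metric=LLAMA_GUARD_3_SAFETY)
```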
vllm_judge/metrics.py CHANGED
@@ -1,5 +1,6 @@
 from typing import Dict
-from vllm_judge.models import Metric,TemplateEngine
+from vllm_judge.models import Metric, TemplateEngine, ModelSpecificMetric
+from vllm_judge.utils import parse_llama_guard_3
 
 # Registry for built-in metrics
 BUILTIN_METRICS: Dict[str, Metric] = {}
@@ -11,6 +12,13 @@ def create_builtin_metric(metric: Metric) -> Metric:
     return metric
 
 
+# Llama Guard 3 safety metric
+LLAMA_GUARD_3_SAFETY = create_builtin_metric(ModelSpecificMetric(
+    name="llama_guard_3_safety",
+    model_pattern="llama_guard_3",
+    parser_func=parse_llama_guard_3
+))
+
 # General purpose metrics
 HELPFULNESS = create_builtin_metric(Metric(
     name="helpfulness",
vllm_judge/models.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional, Any, Dict, Union, List, Tuple
+from typing import Optional, Any, Dict, Union, List, Tuple, Callable
 from pydantic import BaseModel, Field, field_validator, ConfigDict
 from enum import Enum
 
@@ -159,6 +159,15 @@ class Metric:
     def __repr__(self):
         return f"Metric(name='{self.name}', criteria='{self.criteria}', template_engine='{self.template_engine}')"
 
+# Base class for model-specific metrics
+class ModelSpecificMetric(Metric):
+    """Metric that bypasses our prompt formatting."""
+
+    def __init__(self, name: str, model_pattern: str, parser_func: Callable[[str], EvaluationResult]):
+        super().__init__(name=name, criteria="model-specific evaluation")
+        self.model_pattern = model_pattern
+        self.parser_func = parser_func
+        # self.is_model_specific = True # Flag for special handling
 
 class BatchResult(BaseModel):
     """Result of batch evaluation."""
vllm_judge/utils.py ADDED
@@ -0,0 +1,14 @@
+from vllm_judge.models import EvaluationResult
+
+# Llama Guard 3 parser
+def parse_llama_guard_3(response: str) -> EvaluationResult:
+    """Parse Llama Guard 3's 'safe/unsafe' format."""
+    lines = response.strip().split('\n')
+    is_safe = lines[0].lower().strip() == 'safe'
+
+    return EvaluationResult(
+        decision="safe" if is_safe else "unsafe",
+        reasoning=lines[1] if len(lines) > 1 else "No violations detected",
+        score=None,
+        metadata={"model_type": "llama_guard_3"}
+    )
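For reference, Llama Guard 3 answers with a verdict on the first line and, for unsafe content, a violated-category code (e.g. S9) on the next line, which is exactly what the parser above splits apart. A quick illustration; the two raw strings are hypothetical model outputs:

```python
from vllm_judge.utils import parse_llama_guard_3

unsafe = parse_llama_guard_3("unsafe\nS9")  # decision="unsafe", reasoning="S9"
safe = parse_llama_guard_3("safe")          # decision="safe", reasoning="No violations detected"
print(unsafe.decision, unsafe.reasoning)
print(safe.decision, safe.reasoning)
```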
vllm_judge-0.1.1.dist-info/METADATA → vllm_judge-0.1.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.4
+Metadata-Version: 2.1
 Name: vllm_judge
-Version: 0.1.1
+Version: 0.1.3
 Summary: LLM-as-a-Judge evaluations for vLLM hosted models
 Author: TrustyAI team
 Author-email: Sai Chandra Pandraju <saichandrapandraju@gmail.com>
@@ -18,6 +18,17 @@ Provides-Extra: api
 Requires-Dist: fastapi>=0.100.0; extra == "api"
 Requires-Dist: uvicorn[standard]>=0.22.0; extra == "api"
 Requires-Dist: websockets>=11.0; extra == "api"
+Provides-Extra: dev
+Requires-Dist: vllm-judge[api,docs,jinja2,test]; extra == "dev"
+Requires-Dist: black>=23.0.0; extra == "dev"
+Requires-Dist: isort>=5.12.0; extra == "dev"
+Requires-Dist: flake8>=6.0.0; extra == "dev"
+Requires-Dist: mypy>=1.0.0; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: mkdocs>=1.5.0; extra == "docs"
+Requires-Dist: mkdocs-material>=9.0.0; extra == "docs"
+Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "docs"
+Requires-Dist: mkdocs-material-extensions>=1.3.1; extra == "docs"
 Provides-Extra: jinja2
 Requires-Dist: jinja2>=3.0.0; extra == "jinja2"
 Provides-Extra: test
@@ -25,30 +36,22 @@ Requires-Dist: pytest>=7.0.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
 Requires-Dist: pytest-cov>=4.0.0; extra == "test"
 Requires-Dist: pytest-mock>=3.10.0; extra == "test"
-Provides-Extra: docs
-Requires-Dist: mkdocs>=1.5.0; extra == "docs"
-Requires-Dist: mkdocs-material>=9.0.0; extra == "docs"
-Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "docs"
-Requires-Dist: mkdocs-material-extensions>=1.3.1; extra == "docs"
-Provides-Extra: dev
-Requires-Dist: vllm_judge[api,docs,jinja2,test]; extra == "dev"
-Requires-Dist: black>=23.0.0; extra == "dev"
-Requires-Dist: isort>=5.12.0; extra == "dev"
-Requires-Dist: flake8>=6.0.0; extra == "dev"
-Requires-Dist: mypy>=1.0.0; extra == "dev"
+
+[![PyPI version](https://img.shields.io/pypi/v/vllm-judge.svg)
+](https://pypi.org/project/vllm-judge/)
 
 # vLLM Judge
 
-A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. Please refer the [documentation](https://saichandrapandraju.github.io/vllm_judge/) for usage details.
+A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. Evaluate LLM inputs & outputs at scale with just a few lines of code. From simple scoring to complex safety checks, vLLM Judge adapts to your needs. Please refer the [documentation](https://saichandrapandraju.github.io/vllm_judge/) for usage details.
 
 ## Features
 
 - 🚀 **Simple Interface**: Single `evaluate()` method that adapts to any use case
 - 🎯 **Pre-built Metrics**: 20+ ready-to-use evaluation metrics
+- 🛡️ **Model-Specific Support:** Seamlessly works with specialized models like Llama Guard without breaking their trained formats.
+- ⚡ **High Performance**: Async-first design enables high-throughput evaluations
 - 🔧 **Template Support**: Dynamic evaluations with template variables
-- ⚡ **High Performance**: Optimized for vLLM with automatic batching
 - 🌐 **API Mode**: Run as a REST API service
-- 🔄 **Async Native**: Built for high-throughput evaluations
 
 ## Installation
 
@@ -72,11 +75,11 @@ pip install vllm-judge[dev]
 from vllm_judge import Judge
 
 # Initialize with vLLM url
-judge = Judge.from_url("http://localhost:8000")
+judge = Judge.from_url("http://vllm-server:8000")
 
 # Simple evaluation
 result = await judge.evaluate(
-    response="The Earth orbits around the Sun.",
+    content="The Earth orbits around the Sun.",
     criteria="scientific accuracy"
 )
 print(f"Decision: {result.decision}")
@@ -86,19 +89,28 @@ print(f"Reasoning: {result.reasoning}")
 from vllm_judge import CODE_QUALITY
 
 result = await judge.evaluate(
-    response="def add(a, b): return a + b",
+    content="def add(a, b): return a + b",
     metric=CODE_QUALITY
 )
 
 # With template variables
 result = await judge.evaluate(
-    response="Essay content here...",
+    content="Essay content here...",
     criteria="Evaluate this {doc_type} for {audience}",
     template_vars={
        "doc_type": "essay",
        "audience": "high school students"
    }
 )
+
+# Works with specialized safety models out-of-the-box
+from vllm_judge import LLAMA_GUARD_3_SAFETY
+
+result = await judge.evaluate(
+    content="How do I make a bomb?",
+    metric=LLAMA_GUARD_3_SAFETY # Automatically uses Llama Guard format
+)
+# Result: decision="unsafe", reasoning="S9"
 ```
 
 ## API Server
@@ -106,7 +118,7 @@ result = await judge.evaluate(
 Run Judge as a REST API:
 
 ```bash
-vllm-judge serve --base-url http://localhost:8000 --port 9090 --host localhost
+vllm-judge serve --base-url http://vllm-server:8000 --port 9090
 ```
 
 Then use the HTTP API:
@@ -116,7 +128,7 @@ from vllm_judge.api import JudgeClient
 
 client = JudgeClient("http://localhost:9090")
 result = await client.evaluate(
-    response="Python is great!",
+    content="Python is great!",
     criteria="technical accuracy"
 )
 ```
vllm_judge-0.1.3.dist-info/RECORD ADDED
@@ -0,0 +1,20 @@
+vllm_judge/__init__.py,sha256=TBS7fQ4n7QEVwNtr4ErJu-T3m4c-8BwW4zDltt8S6Ko,2469
+vllm_judge/batch.py,sha256=68jKgRTMzZXw4bxAiGp73NZzHOd1tKK763nBNjrr6gg,4842
+vllm_judge/cli.py,sha256=mdoxNA5gQ1m3XBnNJYCE8uoi0RxrS9d3YIlrtdxRcME,10683
+vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
+vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
+vllm_judge/judge.py,sha256=FKMpl6ubugHqKlR-W1-arr4J2rkwnC76QM5oAFv_HyM,15220
+vllm_judge/metrics.py,sha256=lQOBaHqlX79L8yP9_YYd-dTaqvfOPo0nDMY0dtsnKvI,15960
+vllm_judge/models.py,sha256=aEXZmP2sM-9aetstzHE3ngZwvCcvnrqzcj-8oV0NCJA,7889
+vllm_judge/prompts.py,sha256=jAsBdshCCdgGF3UUAM0Wbb6MN1AB2jgHh1NmtXLbyrc,6345
+vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
+vllm_judge/utils.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
+vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
+vllm_judge/api/client.py,sha256=XRiveUw1edcknxO3zLFkYX_YbOObipx7dMFeSUjMSwk,11300
+vllm_judge/api/models.py,sha256=tPEePecZbKb9ZbjwusdJwhLiBK9Rd5xqiOqjklDKJ9s,4781
+vllm_judge/api/server.py,sha256=mbQ45YC0RYGONdy1oIcRIxUvByLtKXXrrMTpE9l2y1w,17818
+vllm_judge-0.1.3.dist-info/METADATA,sha256=L_Kf2ic1W5wn1D1Y4amZaxO6E2i6bEKjZ4JFVvh3-YA,4251
+vllm_judge-0.1.3.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+vllm_judge-0.1.3.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
+vllm_judge-0.1.3.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
+vllm_judge-0.1.3.dist-info/RECORD,,
vllm_judge-0.1.1.dist-info/WHEEL → vllm_judge-0.1.3.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (75.3.2)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
vllm_judge-0.1.1.dist-info/RECORD DELETED
@@ -1,19 +0,0 @@
-vllm_judge/__init__.py,sha256=iI-gdqNrjLwn7jzU7yjCZHCHKwbqrjbKp6OgAfl8Tu8,2363
-vllm_judge/batch.py,sha256=68jKgRTMzZXw4bxAiGp73NZzHOd1tKK763nBNjrr6gg,4842
-vllm_judge/cli.py,sha256=KQtUt_L4u5TPrS8xoyiKYt_hQ_FiHtGcrkecGEtktI8,10685
-vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
-vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
-vllm_judge/judge.py,sha256=y2qp18PVtobAyxqI246tEsju82W-OuGG4zXfajTEW-E,14101
-vllm_judge/metrics.py,sha256=QeGzaERvfRKQTt4JfquL1rW72GSkWdJ2_Nw_Hf0zqjY,15685
-vllm_judge/models.py,sha256=fbEUFPsY3xhv54WueWqEKvAgIcWTm-JO42N2-6k5LeM,7417
-vllm_judge/prompts.py,sha256=jAsBdshCCdgGF3UUAM0Wbb6MN1AB2jgHh1NmtXLbyrc,6345
-vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
-vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
-vllm_judge/api/client.py,sha256=mcpdH-9ko6aEh_JAybpPPVhHqlO3l5K-lTujTlkTw8c,11302
-vllm_judge/api/models.py,sha256=tPEePecZbKb9ZbjwusdJwhLiBK9Rd5xqiOqjklDKJ9s,4781
-vllm_judge/api/server.py,sha256=mbQ45YC0RYGONdy1oIcRIxUvByLtKXXrrMTpE9l2y1w,17818
-vllm_judge-0.1.1.dist-info/METADATA,sha256=8tAJdnNjmSFrORci6TgJ2TTgZ8zmZCicBSgShbu31gY,3643
-vllm_judge-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-vllm_judge-0.1.1.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
-vllm_judge-0.1.1.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
-vllm_judge-0.1.1.dist-info/RECORD,,