vllm-judge 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff compares the contents of two publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in the public registry.
vllm_judge/__init__.py CHANGED
@@ -5,7 +5,7 @@ A lightweight library for evaluating text responses using self-hosted language m
 via vLLM's OpenAI-compatible API.
 """
 
-__version__ = "0.1.3"
+__version__ = "0.1.5"
 
 from vllm_judge.judge import Judge
 from vllm_judge.models import (
@@ -24,10 +24,11 @@ from vllm_judge.metrics import (
     CLARITY,
     CONCISENESS,
     RELEVANCE,
-
+    COHERENCE,
     # Safety metrics
     SAFETY,
     TOXICITY,
+    BIAS_DETECTION,
     LLAMA_GUARD_3_SAFETY,
 
     # Code metrics
@@ -61,6 +62,12 @@ from vllm_judge.metrics import (
     PRODUCT_REVIEW_TEMPLATE,
     MEDICAL_INFO_TEMPLATE,
     API_DOCS_TEMPLATE,
+    RAG_EVALUATION_TEMPLATE,
+    AGENT_PERFORMANCE_TEMPLATE,
+
+    # NLP metrics
+    TRANSLATION_QUALITY,
+    SUMMARIZATION_QUALITY,
 
 )
 from vllm_judge.exceptions import (
@@ -91,8 +98,10 @@ __all__ = [
     "CLARITY",
     "CONCISENESS",
     "RELEVANCE",
+    "COHERENCE",
     "SAFETY",
     "TOXICITY",
+    "BIAS_DETECTION",
     "LLAMA_GUARD_3_SAFETY",
     "CODE_QUALITY",
     "CODE_SECURITY",
@@ -112,6 +121,11 @@ __all__ = [
     "PRODUCT_REVIEW_TEMPLATE",
     "MEDICAL_INFO_TEMPLATE",
     "API_DOCS_TEMPLATE",
+    "RAG_EVALUATION_TEMPLATE",
+    "AGENT_PERFORMANCE_TEMPLATE",
+    "TRANSLATION_QUALITY",
+    "SUMMARIZATION_QUALITY",
+
     # Exceptions
     "VLLMJudgeError",
     "ConfigurationError",
vllm_judge/api/client.py CHANGED
@@ -66,6 +66,7 @@ class JudgeClient:
     async def evaluate(
         self,
         content: Union[str, Dict[str, str]],
+        input: Optional[str] = None,
         criteria: str = None,
         rubric: Union[str, Dict[Union[int, float], str]] = None,
         scale: Optional[Tuple[int, int]] = None,
@@ -87,7 +88,8 @@ class JudgeClient:
             EvaluationResult
         """
         request = EvaluateRequest(
-            response=content,
+            content=content,
+            input=input,
             criteria=criteria,
             rubric=rubric,
             scale=list(scale) if scale else None,
@@ -277,37 +279,69 @@ class JudgeClient:
     async def score(
         self,
         criteria: str,
-        response: str,
+        content: str,
+        input: Optional[str] = None,
         scale: Tuple[int, int] = (1, 10),
         **kwargs
     ) -> EvaluationResult:
         """Quick scoring evaluation."""
         return await self.evaluate(
-            response=response,
+            content=content,
+            input=input,
+            criteria=criteria,
+            scale=scale,
+            **kwargs
+        )
+    async def qa_evaluate(
+        self,
+        question: str,
+        answer: str,
+        criteria: str = "accuracy and completeness",
+        scale: Tuple[int, int] = (1, 10),
+        **kwargs
+    ) -> EvaluationResult:
+        """
+        Convenience method for QA evaluation via API.
+
+        Args:
+            question: The question being answered
+            answer: The answer to evaluate
+            criteria: Evaluation criteria (default: "accuracy and completeness")
+            scale: Numeric scale (default 1-10)
+            **kwargs: Additional parameters
+
+        Returns:
+            EvaluationResult with QA assessment
+        """
+        return await self.evaluate(
+            content=answer,
+            input=question,
             criteria=criteria,
             scale=scale,
             **kwargs
         )
-
     async def compare(
         self,
         response_a: str,
         response_b: str,
         criteria: str,
+        input: Optional[str] = None,
         **kwargs
     ) -> EvaluationResult:
         """Quick comparison evaluation."""
         return await self.evaluate(
-            response={"a": response_a, "b": response_b},
+            content={"a": response_a, "b": response_b},
+            input=input,
             criteria=criteria,
             **kwargs
         )
 
     async def classify(
         self,
-        response: str,
+        content: str,
         categories: List[str],
         criteria: str = None,
+        input: Optional[str] = None,
         **kwargs
     ) -> EvaluationResult:
         """Quick classification evaluation."""
@@ -317,7 +351,8 @@ class JudgeClient:
             rubric = f"Classify into one of these categories: {', '.join(categories)}"
 
         return await self.evaluate(
-            response=response,
+            content=content,
+            input=input,
             criteria=criteria,
             rubric=rubric,
             **kwargs
@@ -325,7 +360,8 @@ class JudgeClient:
 
     async def evaluate_streaming(
         self,
-        response: Union[str, Dict[str, str]],
+        content: Union[str, Dict[str, str]],
+        input: Optional[str] = None,
         **kwargs
     ) -> AsyncIterator[str]:
         """
@@ -339,7 +375,8 @@ class JudgeClient:
         async with websockets.connect(ws_url) as websocket:
             # Send request
             request_data = {
-                "response": response,
+                "content": content,
+                "input": input,
                 **kwargs
            }
            await websocket.send(json.dumps(request_data))
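
A hedged usage sketch for the updated client (the module path mirrors the file path above and the URL is a placeholder; parameter names and result fields are taken from this diff):

    import asyncio
    from vllm_judge.api.client import JudgeClient  # module path matches the file shown above

    async def main():
        # "http://localhost:9090" is a placeholder Judge API URL.
        async with JudgeClient("http://localhost:9090") as client:
            # 0.1.5 renames `response` to `content` and adds the optional `input`
            # carrying the question/prompt the content responds to.
            result = await client.evaluate(
                content="Paris",
                input="What is the capital of France?",
                criteria="accuracy",
            )
            print(result.decision, result.score)

            # New convenience wrapper: routes question -> input, answer -> content.
            qa = await client.qa_evaluate(
                question="What is the capital of France?",
                answer="Paris",
            )
            print(qa.reasoning)

    asyncio.run(main())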
vllm_judge/api/models.py CHANGED
@@ -5,8 +5,15 @@ from datetime import datetime
 
 class EvaluateRequest(BaseModel):
     """Request model for single evaluation."""
-    response: Union[str, Dict[str, str]] = Field(
-        ..., description="Text to evaluate or dict with 'a' and 'b' for comparison"
+    content: Union[str, Dict[str, str]] = Field(
+        ...,
+        description="Content to evaluate (string or dict with 'a'/'b' for comparison)",
+        examples=["This is a response", {"a": "Response A", "b": "Response B"}]
+    )
+    input: Optional[str] = Field(
+        None,
+        description="Optional input/question/prompt that the content responds to",
+        examples=["What is the capital of France?", "Write a function to sort a list"]
     )
     criteria: Optional[str] = Field(
         None, description="What to evaluate for"
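
A short sketch of constructing the revised request model (the module path mirrors the file path above; field names come from this diff, and model_dump assumes the Pydantic v2 API already used elsewhere in this package):

    from vllm_judge.api.models import EvaluateRequest  # module path matches the file shown above

    # Single-response payload with the new optional `input` field.
    req = EvaluateRequest(
        content="Paris is the capital of France.",
        input="What is the capital of France?",
        criteria="accuracy",
    )
    print(req.model_dump(exclude_none=True))

    # `content` may also be an {"a": ..., "b": ...} dict for pairwise comparison.
    pair = EvaluateRequest(content={"a": "Response A", "b": "Response B"}, criteria="clarity")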
vllm_judge/api/server.py CHANGED
@@ -109,7 +109,8 @@ async def evaluate(request: EvaluateRequest):
 
     # Perform evaluation with template support
     result = await judge.evaluate(
-        response=request.response,
+        content=request.content,
+        input=request.input,
         criteria=request.criteria,
         rubric=request.rubric,
         scale=scale,
@@ -422,7 +423,8 @@ async def websocket_evaluate(websocket: WebSocket):
     scale = tuple(request.scale) if request.scale else None
 
     result = await judge.evaluate(
-        response=request.response,
+        content=request.content,
+        input=request.input,
         criteria=request.criteria,
         rubric=request.rubric,
         scale=scale,
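
For HTTP clients, only the request-body field names change. A hedged sketch of the payload (the "/evaluate" route path and address are assumptions not shown in this diff; the content, input, and criteria fields come from EvaluateRequest above):

    import httpx  # any HTTP client works; httpx is used here for brevity

    payload = {
        "content": "Paris is the capital of France.",
        "input": "What is the capital of France?",   # new optional field in 0.1.5
        "criteria": "accuracy",
    }
    # Route path and address are placeholders, not confirmed by this diff.
    resp = httpx.post("http://localhost:9090/evaluate", json=payload, timeout=60.0)
    print(resp.json())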
vllm_judge/batch.py CHANGED
@@ -83,12 +83,12 @@ class BatchProcessor:
         async with self.semaphore:
             try:
                 # Extract response from kwargs
-                response = eval_kwargs.pop('response', None)
-                if not response:
-                    raise ValueError(f"Item {index} missing 'response' field")
+                content = eval_kwargs.pop('content', None)
+                if not content:
+                    raise ValueError(f"Item {index} missing 'content' field")
 
                 # Perform evaluation
-                result = await self.judge.evaluate(response=response, **eval_kwargs)
+                result = await self.judge.evaluate(content=content, **eval_kwargs)
 
                 # Update progress
                 async with self.progress_lock:
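
Accordingly, each batch item must now carry a `content` key (previously `response`); remaining keys are passed through to Judge.evaluate. A small sketch of the item shape, assuming `judge` is an already-connected Judge instance:

    items = [
        {"content": "Text 1", "criteria": "clarity"},
        {"content": "Paris", "input": "What is the capital of France?", "criteria": "accuracy"},
        # An item without 'content' triggers the ValueError raised above.
    ]
    # results = await judge.batch_evaluate(items)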
vllm_judge/cli.py CHANGED
@@ -50,7 +50,8 @@ def serve(base_url: str, model: str, host: str, port: int, reload: bool, max_con
 @click.option('--api-url', help='Judge API URL (if using remote server)')
 @click.option('--base-url', help='vLLM server URL (if using local)')
 @click.option('--model', help='Model name (if using local)')
-@click.option('--response', required=True, help='Text to evaluate')
+@click.option('--content', required=True, help='Text to evaluate')
+@click.option('--input', help='Input/question/prompt that the content responds to')
 @click.option('--criteria', help='Evaluation criteria')
 @click.option('--metric', help='Pre-defined metric name')
 @click.option('--scale', nargs=2, type=int, help='Numeric scale (min max)')
@@ -61,7 +62,8 @@ def evaluate(
     api_url: Optional[str],
     base_url: Optional[str],
     model: Optional[str],
-    response: str,
+    content: str,
+    input: Optional[str],
     criteria: Optional[str],
     metric: Optional[str],
     scale: Optional[tuple],
@@ -75,7 +77,8 @@ def evaluate(
             # Use API client
             async with JudgeClient(api_url) as client:
                 result = await client.evaluate(
-                    content=response,
+                    content=content,
+                    input=input,
                     criteria=criteria,
                     metric=metric,
                     scale=scale,
@@ -91,7 +94,8 @@ def evaluate(
             judge = Judge.from_url(base_url, model=model)
             async with judge:
                 result = await judge.evaluate(
-                    content=response,
+                    content=content,
+                    input=input,
                     criteria=criteria,
                     metric=metric,
                     scale=scale,
@@ -110,6 +114,60 @@ def evaluate(
 
     asyncio.run(run_evaluation())
 
+@cli.command()
+@click.option('--api-url', help='Judge API URL (if using remote server)')
+@click.option('--base-url', help='vLLM server URL (if using local)')
+@click.option('--model', help='Model name (if using local)')
+@click.option('--question', required=True, help='Question to evaluate answer for')
+@click.option('--answer', required=True, help='Answer to evaluate')
+@click.option('--criteria', default='accuracy and completeness', help='Evaluation criteria')
+@click.option('--scale', nargs=2, type=int, default=[1, 10], help='Numeric scale (min max)')
+@click.option('--output', type=click.Choice(['json', 'text']), default='text', help='Output format')
+def qa_evaluate(
+    api_url: Optional[str],
+    base_url: Optional[str],
+    model: Optional[str],
+    question: str,
+    answer: str,
+    criteria: str,
+    scale: tuple,
+    output: str
+):
+    """Evaluate a QA pair (question and answer)."""
+    async def run_qa_evaluation():
+        if api_url:
+            async with JudgeClient(api_url) as client:
+                result = await client.qa_evaluate(
+                    question=question,
+                    answer=answer,
+                    criteria=criteria,
+                    scale=scale
+                )
+        else:
+            if not base_url:
+                click.echo("Error: Either --api-url or --base-url is required", err=True)
+                sys.exit(1)
+
+            judge = Judge.from_url(base_url, model=model)
+            async with judge:
+                result = await judge.qa_evaluate(
+                    question=question,
+                    answer=answer,
+                    criteria=criteria,
+                    scale=scale
+                )
+
+        if output == 'json':
+            click.echo(json.dumps(result.model_dump(), indent=2))
+        else:
+            click.echo(f"Question: {question}")
+            click.echo(f"Answer: {answer}")
+            click.echo(f"Decision: {result.decision}")
+            if result.score is not None:
+                click.echo(f"Score: {result.score}")
+            click.echo(f"Reasoning: {result.reasoning}")
+
+    asyncio.run(run_qa_evaluation())
 
 @cli.command()
 @click.option('--api-url', help='Judge API URL (if using remote server)')
@@ -118,6 +176,7 @@ def evaluate(
 @click.option('--response-a', required=True, help='First response')
 @click.option('--response-b', required=True, help='Second response')
 @click.option('--criteria', required=True, help='Comparison criteria')
+@click.option('--input', help='Input/question that both responses address')
 @click.option('--output', type=click.Choice(['json', 'text']), default='text', help='Output format')
 def compare(
     api_url: Optional[str],
@@ -126,6 +185,7 @@ def compare(
     response_a: str,
     response_b: str,
     criteria: str,
+    input: Optional[str],
     output: str
 ):
     """Compare two responses."""
@@ -135,7 +195,8 @@ def compare(
             result = await client.compare(
                 response_a=response_a,
                 response_b=response_b,
-                criteria=criteria
+                criteria=criteria,
+                input=input
             )
         else:
             if not base_url:
@@ -147,12 +208,17 @@ def compare(
             result = await judge.compare(
                 response_a=response_a,
                 response_b=response_b,
-                criteria=criteria
+                criteria=criteria,
+                input=input
             )
 
         if output == 'json':
            click.echo(json.dumps(result.model_dump(), indent=2))
        else:
+            if input:
+                click.echo(f"Input: {input}")
+            click.echo(f"Response A: {response_a}")
+            click.echo(f"Response B: {response_b}")
            click.echo(f"Winner: {result.decision}")
            click.echo(f"Reasoning: {result.reasoning}")
 
@@ -281,6 +347,16 @@ def batch(api_url: str, file, use_async: bool, max_concurrent: Optional[int], ou
 
 def main():
     """Main entry point."""
+    cli.help = """vLLM Judge - LLM-as-a-Judge evaluation tool.
+
+    Features:
+    - Single response evaluation with optional input context
+    - QA (Question-Answer) evaluation
+    - Response comparison with optional input context
+    - Batch evaluation from JSON files
+    - API server mode
+    - Built-in and custom metrics with template support
+    """
     cli()
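
For the command line, a hedged sketch of exercising the new QA command in-process with Click's test runner (the `cli` group, module path, and option names come from this file; the dashed command name "qa-evaluate" assumes Click's default underscore-to-dash conversion):

    from click.testing import CliRunner
    from vllm_judge.cli import cli  # the Click group defined in this module

    runner = CliRunner()
    result = runner.invoke(cli, [
        "qa-evaluate",                          # assumed dashed form of qa_evaluate
        "--base-url", "http://localhost:8000",  # placeholder vLLM server URL
        "--question", "What is the capital of France?",
        "--answer", "Paris",
        "--output", "json",
    ])
    print(result.output)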
 
vllm_judge/judge.py CHANGED
@@ -64,6 +64,7 @@ class Judge:
     async def evaluate(
         self,
         content: Union[str, Dict[str, str]],
+        input: Optional[str] = None,
         criteria: str = None,
         rubric: Union[str, Dict[Union[int, float], str]] = None,
         scale: Optional[Tuple[int, int]] = None,
@@ -80,6 +81,7 @@ class Judge:
 
         Args:
             content: String for single evaluation, dict {"a": ..., "b": ...} for comparison
+            input: Optional input/question/prompt that the content is responding to
             criteria: What to evaluate for (can contain template variables)
             rubric: Instructions for evaluation, can be string or dict containing mapping of score to description (can contain template variables)
             scale: Optional numeric scale (min, max)
@@ -140,6 +142,9 @@ class Judge:
 
         # Merge template variables (metric defaults + user provided)
         all_template_vars = {**metric_template_vars, **(template_vars or {})}
+        # Add input to template variables if provided
+        if input:
+            all_template_vars["input"] = input
 
         # Process templates
         criteria = TemplateProcessor.apply_template(
@@ -154,10 +159,14 @@ class Judge:
         context = TemplateProcessor.apply_template(
             context, all_template_vars, engine, strict=True
         )
+        input = TemplateProcessor.apply_template(
+            input, all_template_vars, engine, strict=True
+        )
 
         # Build messages
         messages = PromptBuilder.build_messages(
-            response=content,
+            content=content,
+            input=input,
             criteria=criteria,
             rubric=rubric,
             scale=scale,
@@ -264,7 +273,8 @@ class Judge:
     async def score(
         self,
         criteria: str,
-        response: str,
+        content: str,
+        input: Optional[str] = None,
         scale: Tuple[int, int] = (1, 10),
         **kwargs
     ) -> EvaluationResult:
@@ -273,7 +283,8 @@ class Judge:
 
         Args:
             criteria: What to evaluate
-            response: Response to evaluate
+            content: Response to evaluate
+            input: Optional input/question/prompt that the response addresses
             scale: Numeric scale (default 1-10)
             **kwargs: Additional parameters
 
@@ -281,7 +292,36 @@ class Judge:
             EvaluationResult with numeric score
         """
         return await self.evaluate(
-            response=response,
+            content=content,
+            input=input,
+            criteria=criteria,
+            scale=scale,
+            **kwargs
+        )
+    async def qa_evaluate(
+        self,
+        question: str,
+        answer: str,
+        criteria: str = "accuracy and completeness",
+        scale: Tuple[int, int] = (1, 10),
+        **kwargs
+    ) -> EvaluationResult:
+        """
+        Convenience method for QA evaluation.
+
+        Args:
+            question: The question being answered
+            answer: The answer to evaluate
+            criteria: Evaluation criteria (default: "accuracy and completeness")
+            scale: Numeric scale (default 1-10)
+            **kwargs: Additional parameters
+
+        Returns:
+            EvaluationResult with QA assessment
+        """
+        return await self.evaluate(
+            content=answer,
+            input=question,
             criteria=criteria,
             scale=scale,
             **kwargs
@@ -292,6 +332,7 @@ class Judge:
         response_a: str,
         response_b: str,
         criteria: str,
+        input: Optional[str] = None,
         **kwargs
     ) -> EvaluationResult:
         """
@@ -301,31 +342,35 @@ class Judge:
             response_a: First response
             response_b: Second response
             criteria: What to compare on
+            input: Optional input/question that both responses address
             **kwargs: Additional parameters
 
         Returns:
             EvaluationResult with decision of 'response_a' or 'response_b'
         """
         return await self.evaluate(
-            response={"a": response_a, "b": response_b},
+            content={"a": response_a, "b": response_b},
+            input=input,
             criteria=criteria,
             **kwargs
         )
 
     async def classify(
         self,
-        response: str,
+        content: str,
         categories: List[str],
         criteria: str = None,
+        input: Optional[str] = None,
         **kwargs
     ) -> EvaluationResult:
         """
         Quick classification evaluation.
 
         Args:
-            response: Response to classify
+            content: Content to classify
             categories: List of categories
             criteria: Classification criteria
+            input: Optional input/question that the response addresses
             **kwargs: Additional parameters
 
         Returns:
@@ -337,7 +382,8 @@ class Judge:
             rubric = f"Classify into one of these categories: {', '.join(categories)}"
 
         return await self.evaluate(
-            response=response,
+            content=content,
+            input=input,
             criteria=criteria,
             rubric=rubric,
             **kwargs
@@ -396,7 +442,7 @@ class Judge:
         Batch evaluation with high concurrency.
 
         Args:
-            data: List of evaluation inputs (each must have 'response' key)
+            data: List of evaluation inputs (each must have 'content' key)
             max_concurrent: Maximum concurrent requests
             progress_callback: Optional callback for progress updates
             **default_kwargs: Default parameters for all evaluations
@@ -406,9 +452,10 @@ class Judge:
 
         Example:
             results = await judge.batch_evaluate([
-                {"response": "Text 1", "criteria": "clarity"},
-                {"response": {"a": "A", "b": "B"}, "criteria": "quality"},
-                {"response": "Text 3", "metric": "safety"}
+                {"content": "Text 1", "criteria": "clarity"},
+                {"content": "Paris", "input": "What is the capital of France?", "criteria": "accuracy"},
+                {"content": {"a": "A", "b": "B"}, "criteria": "quality"},
+                {"content": "Text 3", "metric": "safety"}
            ])
        """
        processor = BatchProcessor(self, max_concurrent or self.config.max_concurrent)
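
A hedged end-to-end sketch of the 0.1.5 Judge surface (the server URL and model name are placeholders; method signatures and result fields are taken from this diff):

    import asyncio
    from vllm_judge import Judge  # exported at the package root, per __init__.py above

    async def main():
        # Placeholder vLLM server URL and model name.
        judge = Judge.from_url("http://localhost:8000", model="your-model")
        async with judge:
            # New in 0.1.5: question/answer are routed to input/content internally.
            qa = await judge.qa_evaluate(
                question="What is the capital of France?",
                answer="Paris",
            )
            print(qa.score, qa.reasoning)

            # compare() now accepts the shared input both responses address.
            duel = await judge.compare(
                response_a="Paris",
                response_b="Lyon",
                criteria="factual accuracy",
                input="What is the capital of France?",
            )
            print(duel.decision)

    asyncio.run(main())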