vllm-judge 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- vllm_judge/__init__.py +2 -2
- vllm_judge/api/__init__.py +0 -3
- vllm_judge/api/client.py +0 -3
- vllm_judge/api/server.py +1 -5
- vllm_judge/batch.py +2 -1
- vllm_judge/{metrics.py → builtin_metrics.py} +1 -1
- vllm_judge/cli.py +1 -5
- vllm_judge/client.py +1 -6
- vllm_judge/judge.py +2 -2
- vllm_judge/models.py +0 -1
- vllm_judge/{prompts.py → prompt_builder.py} +57 -37
- {vllm_judge-0.1.5.dist-info → vllm_judge-0.1.6.dist-info}/METADATA +1 -1
- vllm_judge-0.1.6.dist-info/RECORD +20 -0
- vllm_judge-0.1.5.dist-info/RECORD +0 -20
- /vllm_judge/{utils.py → parsers.py} +0 -0
- {vllm_judge-0.1.5.dist-info → vllm_judge-0.1.6.dist-info}/WHEEL +0 -0
- {vllm_judge-0.1.5.dist-info → vllm_judge-0.1.6.dist-info}/entry_points.txt +0 -0
- {vllm_judge-0.1.5.dist-info → vllm_judge-0.1.6.dist-info}/top_level.txt +0 -0
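Note on the three module renames in this release (metrics.py → builtin_metrics.py, prompts.py → prompt_builder.py, utils.py → parsers.py): callers that import these modules directly must update their import paths. A minimal migration sketch, assuming only the module paths changed and the exported symbols did not (which is what the hunks below show):

    # 0.1.5
    # from vllm_judge.metrics import BUILTIN_METRICS, HELPFULNESS
    # from vllm_judge.prompts import PromptBuilder
    # from vllm_judge.utils import parse_llama_guard_3

    # 0.1.6
    from vllm_judge.builtin_metrics import BUILTIN_METRICS, HELPFULNESS
    from vllm_judge.prompt_builder import PromptBuilder
    from vllm_judge.parsers import parse_llama_guard_3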
vllm_judge/__init__.py
CHANGED
@@ -5,7 +5,7 @@ A lightweight library for evaluating text responses using self-hosted language models
 via vLLM's OpenAI-compatible API.
 """
 
-__version__ = "0.1.5"
+__version__ = "0.1.6"
 
 from vllm_judge.judge import Judge
 from vllm_judge.models import (
@@ -17,7 +17,7 @@ from vllm_judge.models import (
     ModelSpecificMetric
 )
 from vllm_judge.templating import TemplateProcessor
-from vllm_judge.metrics import (
+from vllm_judge.builtin_metrics import (
     # General metrics
     HELPFULNESS,
     ACCURACY,
vllm_judge/api/__init__.py
CHANGED
vllm_judge/api/client.py
CHANGED
vllm_judge/api/server.py
CHANGED
@@ -1,7 +1,3 @@
-"""
-FastAPI server for vLLM Judge API.
-"""
-import asyncio
 import time
 import uuid
 from datetime import datetime
@@ -14,7 +10,7 @@ import uvicorn
 
 from vllm_judge.judge import Judge
 from vllm_judge.models import EvaluationResult, JudgeConfig
-from vllm_judge.metrics import BUILTIN_METRICS
+from vllm_judge.builtin_metrics import BUILTIN_METRICS
 from vllm_judge.exceptions import VLLMJudgeError
 from vllm_judge.api.models import (
     EvaluateRequest,
vllm_judge/batch.py
CHANGED
@@ -17,7 +17,8 @@ class BatchProcessor:
             max_concurrent: Maximum concurrent requests
         """
        self.judge = judge
-        self.semaphore = asyncio.Semaphore(max_concurrent)
+        self.max_concurrent = max_concurrent
+        self.semaphore = asyncio.Semaphore(self.max_concurrent)
         self.progress_lock = asyncio.Lock()
         self.completed = 0
 
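The batch.py change stores max_concurrent on the instance before building the semaphore from it. For reference, a standalone sketch of the bounded-concurrency pattern this code relies on (illustrative names, not BatchProcessor's actual API):

    import asyncio

    async def run_bounded(coros, max_concurrent: int):
        # At most max_concurrent coroutines hold the semaphore at once.
        semaphore = asyncio.Semaphore(max_concurrent)

        async def bounded(coro):
            async with semaphore:
                return await coro

        return await asyncio.gather(*(bounded(c) for c in coros))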
vllm_judge/{metrics.py → builtin_metrics.py}
RENAMED
@@ -1,6 +1,6 @@
 from typing import Dict
 from vllm_judge.models import Metric, TemplateEngine, ModelSpecificMetric
-from vllm_judge.utils import parse_llama_guard_3
+from vllm_judge.parsers import parse_llama_guard_3
 
 # Registry for built-in metrics
 BUILTIN_METRICS: Dict[str, Metric] = {}
vllm_judge/cli.py
CHANGED
@@ -1,6 +1,3 @@
-"""
-Command-line interface for vLLM Judge.
-"""
 import asyncio
 import json
 import sys
@@ -8,10 +5,9 @@ from typing import Optional
 import click
 
 from vllm_judge import Judge
-from vllm_judge.models import JudgeConfig
 from vllm_judge.api.server import start_server as start_api_server
 from vllm_judge.api.client import JudgeClient
-from vllm_judge.metrics import BUILTIN_METRICS
+from vllm_judge.builtin_metrics import BUILTIN_METRICS
 
 
 @click.group()
vllm_judge/client.py
CHANGED
@@ -126,12 +126,8 @@ class VLLMClient:
             "messages": messages,
             "temperature": self.config.temperature,
             "max_tokens": self.config.max_tokens,
-            # "top_p": self.config.top_p,
         }
-
-        # # Request JSON response format if supported
-        # if self.config.temperature < 0.2:  # Only for low temperature
-        #     request_data["response_format"] = {"type": "json_object"}
+
 
         try:
             response = await self._request_with_retry(
@@ -172,7 +168,6 @@ class VLLMClient:
             "prompt": prompt,
             "temperature": self.config.temperature,
             "max_tokens": self.config.max_tokens,
-            # "top_p": self.config.top_p,
         }
 
         try:
vllm_judge/judge.py
CHANGED
@@ -4,9 +4,9 @@ from typing import Union, Dict, List, Optional, Tuple, Any, Callable
 
 from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine, ModelSpecificMetric
 from vllm_judge.client import VLLMClient
-from vllm_judge.prompts import PromptBuilder
+from vllm_judge.prompt_builder import PromptBuilder
 from vllm_judge.batch import BatchProcessor
-from vllm_judge.metrics import BUILTIN_METRICS
+from vllm_judge.builtin_metrics import BUILTIN_METRICS
 from vllm_judge.templating import TemplateProcessor
 from vllm_judge.exceptions import (
     ParseError,
vllm_judge/models.py
CHANGED
@@ -59,7 +59,6 @@ class JudgeConfig(BaseModel):
     # Model parameters
     temperature: float = Field(0.0, description="Sampling temperature")
     max_tokens: int = Field(256, description="Maximum tokens in response")
-    # top_p: float = Field(0.95, description="Top-p sampling")
 
     # Batch settings
     max_concurrent: int = Field(50, description="Maximum concurrent requests")
vllm_judge/{prompts.py → prompt_builder.py}
RENAMED
@@ -1,5 +1,5 @@
 from typing import List, Dict, Union, Optional, Tuple, Any
-
+import json
 
 class PromptBuilder:
     """Builds prompts for evaluation requests."""
@@ -35,24 +35,36 @@ class PromptBuilder:
         """
         # Detect evaluation type
         is_comparison = isinstance(content, dict) and "a" in content and "b" in content
+
+        output_format = """
+# Output Format:
+
+The JSON object MUST have exactly these three fields:
+
+1. decision: (String | Boolean) This decision label should clearly state your main finding. This could be a string representing a specific class (eg., PASS, FAIL, CORRECT, INCORRECT, etc.) or a boolean value (true or false). If user provided a rubric, you should use the rubric to determine the decision label.
+2. score: (Number | null) A numerical score for the evaluation. If scoring is requested, provide the score as a number. If scoring is NOT requested or is not applicable for the specific task, you MUST use the value null for this field.
+3. reasoning: (String) A concise explanation justifying your decision and score (if a score was provided). This reasoning must directly and logically support your evaluation and refer to the specific evaluation criteria.
+
+The JSON object MUST be well-formed and adhere strictly to the following structure:
+
+{
+    "decision": <your judgment - string|boolean>,
+    "reasoning": <concise explanation of your judgment - string>,
+    "score": <numeric score if requested, otherwise null - number|null>
+}
+"""
 
         # System message
         if not system_prompt:
-
-
-
-
-
-
-
-
-            system_prompt+=
-            system_prompt+="""{
-    "decision": <your judgment - string|boolean>,
-    "reasoning": "<concise explanation of your judgment>",
-    "score": <numeric score if requested, otherwise null>
-}"""
-            system_prompt+="\nDo not include any text in your response except for the JSON object."
+            system_prompt = """You are an impartial judge and expert evaluator. Your task is to evaluate the provided content based on the specific evaluation criteria and rubric.
+# Key Instructions:
+1. Your evaluation must be objective, consistent, and based solely on the specified criteria. Do not let your own opinions or biases interfere.
+2. Focus exclusively on quality assessment.
+3. Do not be influenced by the length of the responses unless response length is explicitly relevant to the specified evaluation criteria (e.g., a task assessing conciseness or verbosity).
+4. Your entire response MUST be a single, valid JSON object and nothing else. Do not include any text or conversational filler before or after this JSON object.
+
+"""
+            system_prompt += output_format
 
         # Build user message
         user_content = PromptBuilder._build_user_prompt(
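Given the new system prompt and output format, a conforming judge reply is a single JSON object with exactly the three required fields. An illustrative example (values invented for demonstration, not output from the library):

    import json

    reply = {
        "decision": "PASS",  # string class label or boolean
        "reasoning": "Meets the stated criteria.",
        "score": None,       # null when no score was requested
    }
    print(json.dumps(reply))
    # {"decision": "PASS", "reasoning": "Meets the stated criteria.", "score": null}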
@@ -93,30 +105,30 @@ class PromptBuilder:
             parts.append(f'"{input}"')
             parts.append("")
 
+        parts.append("## Content to evaluate:")
+        if is_comparison:
+            parts.append(f"**Response A:**\n{content['a']}")
+            parts.append(f"**Response B:**\n{content['b']}")
+        else:
+            parts.append(content)
+
+        parts.append("## Evaluation Criteria:")
+
         # Task description
         if is_comparison:
-
-                parts.append(f"Compare how well these two responses address the input for: {criteria}")
-            else:
-                parts.append(f"Compare these two responses based on: {criteria}")
+            parts.append(f"Compare the two responses based on: {criteria}")
             if context:
                 parts.append(f"\nContext: {context}")
-            parts.append(f"\nResponse A:\n{content['a']}")
-            parts.append(f"\nResponse B:\n{content['b']}")
         else:
-
-                parts.append(f"Evaluate how well this content addresses the input for: {criteria}")
-            else:
-                parts.append(f"Evaluate the following content based on: {criteria}")
+            parts.append(f"Evaluate the content based on: {criteria}")
             if context:
                 parts.append(f"\nContext: {context}")
-            parts.append(f"\nContent to evaluate:\n{content}")
 
-        parts.append(f"\nYou must return a decision label/class (your judgement) for the `decision` field and a concise explanation for the `reasoning` field.")
+        parts.append(f"\nYou must return a decision label/class (your main judgement) for the `decision` field and a concise explanation for the `reasoning` field in the JSON object.")
 
         # Add scale and rubric
         if scale:
-            parts.append(f"
+            parts.append(f"In addition to these, provide a score from {scale[0]} to {scale[1]}")
 
             if isinstance(rubric, dict):
                 parts.append("\nScoring guide:")
@@ -127,14 +139,15 @@ class PromptBuilder:
             elif rubric:
                 parts.append(f"\nEvaluation guide: {rubric}")
         elif rubric:
+            parts.append("\nIn addition to these, provide a score if required by the following evaluation guide.")
             parts.append(f"\nEvaluation guide: {rubric}")
 
         # Add examples if provided
         if examples:
             parts.append("\nExample evaluations:")
-            for i, ex in enumerate(examples
-                parts.append(f"
-
+            for i, ex in enumerate(examples):
+                parts.append(f"Example {i+1}:")
+                parts.append("Request:")
                 # Handle different example formats
                 if "input" in ex:
                     parts.append(f"Input: {ex['input']}")
@@ -143,17 +156,24 @@ class PromptBuilder:
                 elif "text" in ex:
                     parts.append(f"Text: {ex['text']}")
 
-
-
+                parts.append("Response:")
+
+                response = {}
+                if "decision" not in ex or ex["decision"] is None or ex["decision"] == "":
+                    raise ValueError("Example must include a decision field")
+
+                response["decision"] = ex["decision"]
                 if "score" in ex:
-
+                    response["score"] = ex["score"]
 
                 if "reasoning" in ex:
-
+                    response["reasoning"] = ex["reasoning"]
+
+                parts.append(json.dumps(response))
 
         # Add any additional instructions
         if kwargs.get("additional_instructions"):
-            parts.append(f"
+            parts.append(f"Additional instructions: {kwargs['additional_instructions']}")
 
         # Output format instructions
         parts.append("\nYou must respond in JSON format:")
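The reworked example handling renders each few-shot example as a Request/Response pair, serializing the response side with json.dumps and rejecting examples without a decision. A sketch of an entry the new code accepts (keys taken from the diff; values invented):

    example = {
        "input": "What is 2 + 2?",    # "text" is also recognized, per the elif branch above
        "decision": "CORRECT",        # mandatory: a missing or empty decision raises ValueError
        "score": 5,                   # optional
        "reasoning": "4 is correct.", # optional
    }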
vllm_judge-0.1.6.dist-info/RECORD
ADDED
@@ -0,0 +1,20 @@
+vllm_judge/__init__.py,sha256=bl6j1TXcPFsNcOKpFGX6FrkS0dikhApOKjhsBOaXm80,2800
+vllm_judge/batch.py,sha256=GJe6d2nsUWCxcSG-j5xnyovfKAM-YklWS0PNAwTMO9s,4886
+vllm_judge/builtin_metrics.py,sha256=XAhn5a-kJgip4NYkaTmkwiIWXjYGRcHoztAmqjmDO9A,48711
+vllm_judge/cli.py,sha256=3075NrduxYz_iITQ0ZnqdjK0jJ9vGpzC6B_23lAN3wc,13598
+vllm_judge/client.py,sha256=x3LBRUjnOmX0iEWdRqz-ALzb03qezZ92aMpdMFzHRcs,8096
+vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
+vllm_judge/judge.py,sha256=X3oLXfWjmIOay5oDWBQNoEnxyDlF0sPf69HBjieW1Ug,16954
+vllm_judge/models.py,sha256=wN2JGddWAxT4EXhmfl3IjBYOpDG_9lGP125UWP4IKTw,7935
+vllm_judge/parsers.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
+vllm_judge/prompt_builder.py,sha256=miQU_mKDKkTuRfVEiQT2LfN4QEvnphLu39s2YldOvCA,8754
+vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
+vllm_judge/api/__init__.py,sha256=yUqAIcqpBDUKoq8ZLcKPQaX45oesy3Nmb2yEwy-dHyU,727
+vllm_judge/api/client.py,sha256=RgbhzRLlOR8gia8_-Kbe2_wQC4tjNPzqObPz2GPP5ec,12409
+vllm_judge/api/models.py,sha256=GXj3slwytJWg5M4f5MPZ8Ft_hrkEEAZh0qgpYDy-Qe4,5102
+vllm_judge/api/server.py,sha256=B97RVVeVHLxf69_bSZBvP69DbTQhoFW2tZOBBS0ahrQ,17838
+vllm_judge-0.1.6.dist-info/METADATA,sha256=MRGak20XswQG2-Qq_iFCIUNqZcWfMOZsSA8GRWMj6ak,4251
+vllm_judge-0.1.6.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+vllm_judge-0.1.6.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
+vllm_judge-0.1.6.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
+vllm_judge-0.1.6.dist-info/RECORD,,
vllm_judge-0.1.5.dist-info/RECORD
REMOVED
@@ -1,20 +0,0 @@
-vllm_judge/__init__.py,sha256=6OKo_RbNOov83pZIPfg12ITxiE6UZh2_UOTjQsgWbFY,2792
-vllm_judge/batch.py,sha256=3zkatZxQESCjYz99qfLhxl2Dq2tHAfhtdTiXxjVqUxE,4836
-vllm_judge/cli.py,sha256=tnMqJ2RvCFaXUY4ok4IO-d9IRNJhEck60AJNzdCaqhg,13679
-vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
-vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
-vllm_judge/judge.py,sha256=SDT_cGDZzHu8NOjG6eqHQsYqIuXR12j7ocpyrVDhHrQ,16939
-vllm_judge/metrics.py,sha256=WwtR6Bb4cc0gDplhZnysNzD1EfOMCEzFc8-3hJMqnJs,48709
-vllm_judge/models.py,sha256=o4OdRtRdsz9n5RhHrz-uA9ylG0cGQg99NJYay0RaeDE,7998
-vllm_judge/prompts.py,sha256=KC8AfiIgKKxQuhT1bnnyYXrSBbcU2-RnkSLqDJfrt8o,7251
-vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
-vllm_judge/utils.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
-vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
-vllm_judge/api/client.py,sha256=l46IpQHJxmbDfXpyCOXfir70c_3hPaIr6OEiOzOMk5Q,12449
-vllm_judge/api/models.py,sha256=GXj3slwytJWg5M4f5MPZ8Ft_hrkEEAZh0qgpYDy-Qe4,5102
-vllm_judge/api/server.py,sha256=1UQMV6MRdlqHS6NYdrQI41bi_wNb0QC8RZD4jCEeTkU,17888
-vllm_judge-0.1.5.dist-info/METADATA,sha256=5UXUqyckWp9fGLQXcBxkI6ejmFfWpCjjpyIeMx96zTI,4251
-vllm_judge-0.1.5.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
-vllm_judge-0.1.5.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
-vllm_judge-0.1.5.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
-vllm_judge-0.1.5.dist-info/RECORD,,
/vllm_judge/{utils.py → parsers.py}
File without changes
{vllm_judge-0.1.5.dist-info → vllm_judge-0.1.6.dist-info}/WHEEL
File without changes
{vllm_judge-0.1.5.dist-info → vllm_judge-0.1.6.dist-info}/entry_points.txt
File without changes
{vllm_judge-0.1.5.dist-info → vllm_judge-0.1.6.dist-info}/top_level.txt
File without changes