vllm-judge 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vllm_judge/__init__.py +17 -3
- vllm_judge/api/__init__.py +0 -3
- vllm_judge/api/client.py +0 -3
- vllm_judge/api/server.py +1 -5
- vllm_judge/batch.py +2 -1
- vllm_judge/builtin_metrics.py +907 -0
- vllm_judge/cli.py +1 -5
- vllm_judge/client.py +1 -6
- vllm_judge/judge.py +2 -2
- vllm_judge/models.py +3 -3
- vllm_judge/{prompts.py → prompt_builder.py} +60 -38
- {vllm_judge-0.1.4.dist-info → vllm_judge-0.1.6.dist-info}/METADATA +1 -1
- vllm_judge-0.1.6.dist-info/RECORD +20 -0
- vllm_judge/metrics.py +0 -582
- vllm_judge-0.1.4.dist-info/RECORD +0 -20
- /vllm_judge/{utils.py → parsers.py} +0 -0
- {vllm_judge-0.1.4.dist-info → vllm_judge-0.1.6.dist-info}/WHEEL +0 -0
- {vllm_judge-0.1.4.dist-info → vllm_judge-0.1.6.dist-info}/entry_points.txt +0 -0
- {vllm_judge-0.1.4.dist-info → vllm_judge-0.1.6.dist-info}/top_level.txt +0 -0
vllm_judge/cli.py
CHANGED
@@ -1,6 +1,3 @@
-"""
-Command-line interface for vLLM Judge.
-"""
 import asyncio
 import json
 import sys
@@ -8,10 +5,9 @@ from typing import Optional
 import click
 
 from vllm_judge import Judge
-from vllm_judge.models import JudgeConfig
 from vllm_judge.api.server import start_server as start_api_server
 from vllm_judge.api.client import JudgeClient
-from vllm_judge.metrics import BUILTIN_METRICS
+from vllm_judge.builtin_metrics import BUILTIN_METRICS
 
 
 @click.group()
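Aside from dropping the module docstring and the unused JudgeConfig import, the only functional change in cli.py is the import path: the built-in metric registry now lives in vllm_judge.builtin_metrics instead of vllm_judge.metrics. A minimal sketch of the corresponding downstream update, assuming BUILTIN_METRICS is still an iterable, name-keyed registry (this diff only confirms the import itself):

    # 0.1.4: from vllm_judge.metrics import BUILTIN_METRICS
    # 0.1.6:
    from vllm_judge.builtin_metrics import BUILTIN_METRICS

    # List the bundled metrics; assumes the registry is a name-keyed mapping.
    for name in sorted(BUILTIN_METRICS):
        print(name)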
vllm_judge/client.py
CHANGED
@@ -126,12 +126,8 @@ class VLLMClient:
             "messages": messages,
             "temperature": self.config.temperature,
             "max_tokens": self.config.max_tokens,
-            # "top_p": self.config.top_p,
         }
-
-        # # Request JSON response format if supported
-        # if self.config.temperature < 0.2:  # Only for low temperature
-        #     request_data["response_format"] = {"type": "json_object"}
+
 
         try:
             response = await self._request_with_retry(
@@ -172,7 +168,6 @@ class VLLMClient:
             "prompt": prompt,
             "temperature": self.config.temperature,
             "max_tokens": self.config.max_tokens,
-            # "top_p": self.config.top_p,
         }
 
         try:
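With the commented-out top_p and response_format experiments deleted, the request bodies the client sends are reduced to the fields kept above. A rough sketch of the resulting chat-completions payload, using the JudgeConfig defaults shown in models.py; the surrounding config object and HTTP call are assumed and not shown:

    # Illustrative payload only; real values come from self.config at runtime.
    messages = [{"role": "user", "content": "Evaluate the following response ..."}]
    request_data = {
        "messages": messages,
        "temperature": 0.0,   # JudgeConfig default
        "max_tokens": 256,    # JudgeConfig default
    }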
vllm_judge/judge.py
CHANGED
@@ -4,9 +4,9 @@ from typing import Union, Dict, List, Optional, Tuple, Any, Callable
 
 from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine, ModelSpecificMetric
 from vllm_judge.client import VLLMClient
-from vllm_judge.prompts import PromptBuilder
+from vllm_judge.prompt_builder import PromptBuilder
 from vllm_judge.batch import BatchProcessor
-from vllm_judge.metrics import BUILTIN_METRICS
+from vllm_judge.builtin_metrics import BUILTIN_METRICS
 from vllm_judge.templating import TemplateProcessor
 from vllm_judge.exceptions import (
     ParseError,
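judge.py simply tracks the two module renames from the file list (prompts.py → prompt_builder.py, metrics.py → builtin_metrics.py); code that imported the prompt builder directly needs the same one-line update:

    # 0.1.4: from vllm_judge.prompts import PromptBuilder
    # 0.1.6:
    from vllm_judge.prompt_builder import PromptBuilder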
vllm_judge/models.py
CHANGED
@@ -59,7 +59,6 @@ class JudgeConfig(BaseModel):
     # Model parameters
     temperature: float = Field(0.0, description="Sampling temperature")
     max_tokens: int = Field(256, description="Maximum tokens in response")
-    # top_p: float = Field(0.95, description="Top-p sampling")
 
     # Batch settings
     max_concurrent: int = Field(50, description="Maximum concurrent requests")
@@ -99,7 +98,8 @@ class Metric:
         system_prompt: Optional[str] = None,
         template_vars: Optional[Dict[str, Any]] = None,
         required_vars: Optional[List[str]] = None,
-        template_engine: Union[str, TemplateEngine] = TemplateEngine.FORMAT
+        template_engine: Union[str, TemplateEngine] = TemplateEngine.FORMAT,
+        additional_instructions: Optional[str] = None
     ):
         """
         Initialize a reusable metric.
@@ -125,7 +125,7 @@ class Metric:
         self.template_vars = template_vars or {}
         self.required_vars = required_vars or []
         self.template_engine = TemplateEngine(template_engine)
-
+        self.additional_instructions = additional_instructions
         # Auto-detect required variables if not specified
         if not self.required_vars and self.template_engine == TemplateEngine.FORMAT:
             self._auto_detect_required_vars()
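Metric gains an additional_instructions keyword, stored on the instance and later rendered by the prompt builder as an "Additional instructions:" line (see the prompt_builder.py hunks below). A hedged construction sketch; the leading parameters (name, criteria) are assumed from earlier releases and are not part of this hunk:

    from vllm_judge.models import Metric

    metric = Metric(
        name="safety-check",  # assumed pre-existing parameter, not shown in this diff
        criteria="response avoids unsafe or harmful instructions",  # assumed as well
        additional_instructions="Flag any medical advice that lacks a disclaimer.",
    )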
vllm_judge/{prompts.py → prompt_builder.py}
RENAMED
@@ -1,5 +1,5 @@
 from typing import List, Dict, Union, Optional, Tuple, Any
-
+import json
 
 class PromptBuilder:
     """Builds prompts for evaluation requests."""
@@ -35,24 +35,36 @@ class PromptBuilder:
         """
         # Detect evaluation type
         is_comparison = isinstance(content, dict) and "a" in content and "b" in content
+
+        output_format = """
+        # Output Format:
+
+        The JSON object MUST have exactly these three fields:
+
+        1. decision: (String | Boolean) This decision label should clearly state your main finding. This could be a string representing a specific class (eg., PASS, FAIL, CORRECT, INCORRECT, etc.) or a boolean value (true or false). If user provided a rubric, you should use the rubric to determine the decision label.
+        2. score: (Number | null) A numerical score for the evaluation. If scoring is requested, provide the score as a number. If scoring is NOT requested or is not applicable for the specific task, you MUST use the value null for this field.
+        3. reasoning: (String) A concise explanation justifying your decision and score (if a score was provided). This reasoning must directly and logically support your evaluation and refer to the specific evaluation criteria.
+
+        The JSON object MUST be well-formed and adhere strictly to the following structure:
+
+        {
+            "decision": <your judgment - string|boolean>,
+            "reasoning": <concise explanation of your judgment - string>,
+            "score": <numeric score if requested, otherwise null - number|null>
+        }
+        """
 
         # System message
         if not system_prompt:
-
-
-
-
-
-
-
-
-            system_prompt+=
-            system_prompt+="""{
-                "decision": <your judgment - string|number|boolean>,
-                "reasoning": "<concise explanation of your judgment>",
-                "score": <numeric score if requested, otherwise null>
-            }"""
-            system_prompt+="\nDo not include any text in your response except for the JSON object."
+            system_prompt = """You are an impartial judge and expert evaluator. Your task is to evaluate the provided content based on the specific evaluation criteria and rubric.
+            # Key Instructions:
+            1. Your evaluation must be objective, consistent, and based solely on the specified criteria. Do not let your own opinions or biases interfere.
+            2. Focus exclusively on quality assessment.
+            3. Do not be influenced by the length of the responses unless response length is explicitly relevant to the specified evaluation criteria (e.g., a task assessing conciseness or verbosity).
+            4. Your entire response MUST be a single, valid JSON object and nothing else. Do not include any text or conversational filler before or after this JSON object.
+
+            """
+            system_prompt += output_format
 
         # Build user message
         user_content = PromptBuilder._build_user_prompt(
@@ -93,28 +105,30 @@ class PromptBuilder:
             parts.append(f'"{input}"')
             parts.append("")
 
+        parts.append("## Content to evaluate:")
+        if is_comparison:
+            parts.append(f"**Response A:**\n{content['a']}")
+            parts.append(f"**Response B:**\n{content['b']}")
+        else:
+            parts.append(content)
+
+        parts.append("## Evaluation Criteria:")
+
         # Task description
         if is_comparison:
-
-            parts.append(f"Compare how well these two responses address the input for: {criteria}")
-            else:
-                parts.append(f"Compare these two responses based on: {criteria}")
+            parts.append(f"Compare the two responses based on: {criteria}")
             if context:
                 parts.append(f"\nContext: {context}")
-            parts.append(f"\nResponse A:\n{content['a']}")
-            parts.append(f"\nResponse B:\n{content['b']}")
         else:
-
-            parts.append(f"Evaluate how well this response addresses the input for: {criteria}")
-            else:
-                parts.append(f"Evaluate the following response based on: {criteria}")
+            parts.append(f"Evaluate the content based on: {criteria}")
             if context:
                 parts.append(f"\nContext: {context}")
-            parts.append(f"\nResponse to evaluate:\n{content}")
 
+        parts.append(f"\nYou must return a decision label/class (your main judgement) for the `decision` field and a concise explanation for the `reasoning` field in the JSON object.")
+
         # Add scale and rubric
         if scale:
-            parts.append(f"
+            parts.append(f"In addition to these, provide a score from {scale[0]} to {scale[1]}")
 
             if isinstance(rubric, dict):
                 parts.append("\nScoring guide:")
@@ -125,38 +139,46 @@ class PromptBuilder:
             elif rubric:
                 parts.append(f"\nEvaluation guide: {rubric}")
         elif rubric:
+            parts.append("\nIn addition to these, provide a score if required by the following evaluation guide.")
            parts.append(f"\nEvaluation guide: {rubric}")
 
         # Add examples if provided
         if examples:
             parts.append("\nExample evaluations:")
-            for i, ex in enumerate(examples
-            parts.append(f"
-
+            for i, ex in enumerate(examples):
+                parts.append(f"Example {i+1}:")
+                parts.append("Request:")
                # Handle different example formats
                if "input" in ex:
                    parts.append(f"Input: {ex['input']}")
                if "content" in ex:
-                   parts.append(f"
+                   parts.append(f"Content: {ex['content']}")
                elif "text" in ex:
                    parts.append(f"Text: {ex['text']}")
 
-
-
+                parts.append("Response:")
+
+                response = {}
+                if "decision" not in ex or ex["decision"] is None or ex["decision"] == "":
+                    raise ValueError("Example must include a decision field")
+
+                response["decision"] = ex["decision"]
                if "score" in ex:
-
+                    response["score"] = ex["score"]
 
                if "reasoning" in ex:
-
+                    response["reasoning"] = ex["reasoning"]
+
+                parts.append(json.dumps(response))
 
         # Add any additional instructions
         if kwargs.get("additional_instructions"):
-            parts.append(f"
+            parts.append(f"Additional instructions: {kwargs['additional_instructions']}")
 
         # Output format instructions
         parts.append("\nYou must respond in JSON format:")
         parts.append("""{
-            "decision": <your judgment - string|
+            "decision": <your judgment - string|boolean>,
             "reasoning": "<concise explanation of your judgment>",
             "score": <numeric score if requested, otherwise null>
         }""")
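Taken together, these prompt changes pin down a stricter response contract: the model must reply with a single JSON object containing exactly the decision, reasoning, and score fields, with score set to null when no scale is requested. A minimal sketch of consuming such a reply (the raw string is illustrative only):

    import json

    # Example of a reply that satisfies the contract described in the new prompt.
    raw = '{"decision": "PASS", "reasoning": "Meets all stated criteria.", "score": null}'

    result = json.loads(raw)
    assert set(result) == {"decision", "reasoning", "score"}
    print(result["decision"], result["score"])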
vllm_judge-0.1.6.dist-info/RECORD
ADDED
@@ -0,0 +1,20 @@
+vllm_judge/__init__.py,sha256=bl6j1TXcPFsNcOKpFGX6FrkS0dikhApOKjhsBOaXm80,2800
+vllm_judge/batch.py,sha256=GJe6d2nsUWCxcSG-j5xnyovfKAM-YklWS0PNAwTMO9s,4886
+vllm_judge/builtin_metrics.py,sha256=XAhn5a-kJgip4NYkaTmkwiIWXjYGRcHoztAmqjmDO9A,48711
+vllm_judge/cli.py,sha256=3075NrduxYz_iITQ0ZnqdjK0jJ9vGpzC6B_23lAN3wc,13598
+vllm_judge/client.py,sha256=x3LBRUjnOmX0iEWdRqz-ALzb03qezZ92aMpdMFzHRcs,8096
+vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
+vllm_judge/judge.py,sha256=X3oLXfWjmIOay5oDWBQNoEnxyDlF0sPf69HBjieW1Ug,16954
+vllm_judge/models.py,sha256=wN2JGddWAxT4EXhmfl3IjBYOpDG_9lGP125UWP4IKTw,7935
+vllm_judge/parsers.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
+vllm_judge/prompt_builder.py,sha256=miQU_mKDKkTuRfVEiQT2LfN4QEvnphLu39s2YldOvCA,8754
+vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
+vllm_judge/api/__init__.py,sha256=yUqAIcqpBDUKoq8ZLcKPQaX45oesy3Nmb2yEwy-dHyU,727
+vllm_judge/api/client.py,sha256=RgbhzRLlOR8gia8_-Kbe2_wQC4tjNPzqObPz2GPP5ec,12409
+vllm_judge/api/models.py,sha256=GXj3slwytJWg5M4f5MPZ8Ft_hrkEEAZh0qgpYDy-Qe4,5102
+vllm_judge/api/server.py,sha256=B97RVVeVHLxf69_bSZBvP69DbTQhoFW2tZOBBS0ahrQ,17838
+vllm_judge-0.1.6.dist-info/METADATA,sha256=MRGak20XswQG2-Qq_iFCIUNqZcWfMOZsSA8GRWMj6ak,4251
+vllm_judge-0.1.6.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+vllm_judge-0.1.6.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
+vllm_judge-0.1.6.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
+vllm_judge-0.1.6.dist-info/RECORD,,
|