vllm-judge 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
vllm_judge/models.py CHANGED
@@ -99,7 +99,8 @@ class Metric:
          system_prompt: Optional[str] = None,
          template_vars: Optional[Dict[str, Any]] = None,
          required_vars: Optional[List[str]] = None,
-         template_engine: Union[str, TemplateEngine] = TemplateEngine.FORMAT
+         template_engine: Union[str, TemplateEngine] = TemplateEngine.FORMAT,
+         additional_instructions: Optional[str] = None
      ):
          """
          Initialize a reusable metric.
@@ -125,7 +126,7 @@ class Metric:
          self.template_vars = template_vars or {}
          self.required_vars = required_vars or []
          self.template_engine = TemplateEngine(template_engine)
- 
+         self.additional_instructions = additional_instructions
          # Auto-detect required variables if not specified
          if not self.required_vars and self.template_engine == TemplateEngine.FORMAT:
              self._auto_detect_required_vars()
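
The new `additional_instructions` argument appears to let a Metric carry extra guidance for the judge. A minimal usage sketch follows; only `additional_instructions` and the keyword arguments visible in this hunk come from the diff, while `name` and `criteria` are assumed parts of Metric's signature that this diff does not show:

    from vllm_judge.models import Metric

    clarity = Metric(
        name="clarity",                      # assumed argument, not shown in this diff
        criteria="clarity and readability",  # assumed argument, not shown in this diff
        additional_instructions="Penalize answers that bury the conclusion.",  # new in 0.1.5
    )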
vllm_judge/prompts.py CHANGED
@@ -6,8 +6,9 @@ class PromptBuilder:
  
      @staticmethod
      def build_messages(
-         response: Union[str, Dict[str, str]],
+         content: Union[str, Dict[str, str]],
          criteria: str,
+         input: Optional[str] = None,
          rubric: Union[str, Dict[Union[int, float], str]] = None,
          scale: Optional[Tuple[int, int]] = None,
          examples: List[Dict[str, Any]] = None,
@@ -19,8 +20,9 @@ class PromptBuilder:
          Build chat messages for evaluation.
  
          Args:
-             response: Single response or dict with 'a' and 'b' for comparison
+             content: Single response or dict with 'a' and 'b' for comparison
              criteria: What to evaluate for
+             input: Optional input/question/prompt that the response addresses
              rubric: Evaluation guide
              scale: Numeric scale (min, max)
              examples: Few-shot examples
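
To illustrate the renamed `content` parameter and the new optional `input`, a hypothetical single-response call (values are invented; the remaining keyword arguments are left at their defaults):

    from vllm_judge.prompts import PromptBuilder

    # Single-response evaluation: `content` replaces the old `response` argument,
    # and `input` carries the question the response is answering.
    messages = PromptBuilder.build_messages(
        content="Paris is the capital of France.",
        criteria="factual accuracy",
        input="What is the capital of France?",
        scale=(1, 5),
    )
    # `messages` is the list of chat messages (system + user) to send to the judge,
    # which is instructed to reply with JSON containing decision, reasoning, and score.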
@@ -32,7 +34,7 @@ class PromptBuilder:
              List of chat messages
          """
          # Detect evaluation type
-         is_comparison = isinstance(response, dict) and "a" in response and "b" in response
+         is_comparison = isinstance(content, dict) and "a" in content and "b" in content
  
          # System message
          if not system_prompt:
@@ -46,7 +48,7 @@ class PromptBuilder:
          # Output format instructions
          system_prompt+="\nYou must respond in JSON format:\n"
          system_prompt+="""{
-     "decision": <your judgment - string|number|boolean>,
+     "decision": <your judgment - string|boolean>,
      "reasoning": "<concise explanation of your judgment>",
      "score": <numeric score if requested, otherwise null>
  }"""
@@ -54,7 +56,8 @@ class PromptBuilder:
  
          # Build user message
          user_content = PromptBuilder._build_user_prompt(
-             response=response,
+             content=content,
+             input=input,
              criteria=criteria,
              rubric=rubric,
              scale=scale,
@@ -71,34 +74,49 @@ class PromptBuilder:
  
      @staticmethod
      def _build_user_prompt(
-         response: Union[str, Dict[str, str]],
+         content: Union[str, Dict[str, str]],
          criteria: str,
          rubric: Union[str, Dict[Union[int, float], str]],
          scale: Optional[Tuple[int, int]],
          examples: List[Dict[str, Any]],
          is_comparison: bool,
          context: Optional[str] = None,
+         input: Optional[str] = None,
          **kwargs
      ) -> str:
          """Build the user message content."""
          parts = []
+ 
+         # Add input section if provided
+         if input:
+             parts.append("Given the following input/question:")
+             parts.append(f'"{input}"')
+             parts.append("")
  
          # Task description
          if is_comparison:
-             parts.append(f"Compare these two responses based on: {criteria}")
+             if input:
+                 parts.append(f"Compare how well these two responses address the input for: {criteria}")
+             else:
+                 parts.append(f"Compare these two responses based on: {criteria}")
              if context:
                  parts.append(f"\nContext: {context}")
-             parts.append(f"\nResponse A:\n{response['a']}")
-             parts.append(f"\nResponse B:\n{response['b']}")
+             parts.append(f"\nResponse A:\n{content['a']}")
+             parts.append(f"\nResponse B:\n{content['b']}")
          else:
-             parts.append(f"Evaluate the following response based on: {criteria}")
+             if input:
+                 parts.append(f"Evaluate how well this content addresses the input for: {criteria}")
+             else:
+                 parts.append(f"Evaluate the following content based on: {criteria}")
              if context:
                  parts.append(f"\nContext: {context}")
-             parts.append(f"\nResponse to evaluate:\n{response}")
+             parts.append(f"\nContent to evaluate:\n{content}")
  
+         parts.append(f"\nYou must return a decision label/class (your judgement) for the `decision` field and a concise explanation for the `reasoning` field.")
+ 
          # Add scale and rubric
          if scale:
-             parts.append(f"\nProvide a score from {scale[0]} to {scale[1]}")
+             parts.append(f"\nIn addition to these, provide a score from {scale[0]} to {scale[1]}")
  
          if isinstance(rubric, dict):
              parts.append("\nScoring guide:")
@@ -118,8 +136,10 @@ class PromptBuilder:
              parts.append(f"\nExample {i}:")
  
              # Handle different example formats
-             if "response" in ex:
-                 parts.append(f"Response: {ex['response']}")
+             if "input" in ex:
+                 parts.append(f"Input: {ex['input']}")
+             if "content" in ex:
+                 parts.append(f"Content: {ex['content']}")
              elif "text" in ex:
                  parts.append(f"Text: {ex['text']}")
  
@@ -138,7 +158,7 @@ class PromptBuilder:
          # Output format instructions
          parts.append("\nYou must respond in JSON format:")
          parts.append("""{
-     "decision": <your judgment - string|number|boolean>,
+     "decision": <your judgment - string|boolean>,
      "reasoning": "<concise explanation of your judgment>",
      "score": <numeric score if requested, otherwise null>
  }""")
vllm_judge-0.1.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vllm_judge
- Version: 0.1.3
+ Version: 0.1.5
  Summary: LLM-as-a-Judge evaluations for vLLM hosted models
  Author: TrustyAI team
  Author-email: Sai Chandra Pandraju <saichandrapandraju@gmail.com>
vllm_judge-0.1.5.dist-info/RECORD ADDED
@@ -0,0 +1,20 @@
+ vllm_judge/__init__.py,sha256=6OKo_RbNOov83pZIPfg12ITxiE6UZh2_UOTjQsgWbFY,2792
+ vllm_judge/batch.py,sha256=3zkatZxQESCjYz99qfLhxl2Dq2tHAfhtdTiXxjVqUxE,4836
+ vllm_judge/cli.py,sha256=tnMqJ2RvCFaXUY4ok4IO-d9IRNJhEck60AJNzdCaqhg,13679
+ vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
+ vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
+ vllm_judge/judge.py,sha256=SDT_cGDZzHu8NOjG6eqHQsYqIuXR12j7ocpyrVDhHrQ,16939
+ vllm_judge/metrics.py,sha256=WwtR6Bb4cc0gDplhZnysNzD1EfOMCEzFc8-3hJMqnJs,48709
+ vllm_judge/models.py,sha256=o4OdRtRdsz9n5RhHrz-uA9ylG0cGQg99NJYay0RaeDE,7998
+ vllm_judge/prompts.py,sha256=KC8AfiIgKKxQuhT1bnnyYXrSBbcU2-RnkSLqDJfrt8o,7251
+ vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
+ vllm_judge/utils.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
+ vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
+ vllm_judge/api/client.py,sha256=l46IpQHJxmbDfXpyCOXfir70c_3hPaIr6OEiOzOMk5Q,12449
+ vllm_judge/api/models.py,sha256=GXj3slwytJWg5M4f5MPZ8Ft_hrkEEAZh0qgpYDy-Qe4,5102
+ vllm_judge/api/server.py,sha256=1UQMV6MRdlqHS6NYdrQI41bi_wNb0QC8RZD4jCEeTkU,17888
+ vllm_judge-0.1.5.dist-info/METADATA,sha256=5UXUqyckWp9fGLQXcBxkI6ejmFfWpCjjpyIeMx96zTI,4251
+ vllm_judge-0.1.5.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+ vllm_judge-0.1.5.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
+ vllm_judge-0.1.5.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
+ vllm_judge-0.1.5.dist-info/RECORD,,
vllm_judge-0.1.3.dist-info/RECORD DELETED
@@ -1,20 +0,0 @@
- vllm_judge/__init__.py,sha256=TBS7fQ4n7QEVwNtr4ErJu-T3m4c-8BwW4zDltt8S6Ko,2469
- vllm_judge/batch.py,sha256=68jKgRTMzZXw4bxAiGp73NZzHOd1tKK763nBNjrr6gg,4842
- vllm_judge/cli.py,sha256=mdoxNA5gQ1m3XBnNJYCE8uoi0RxrS9d3YIlrtdxRcME,10683
- vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
- vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
- vllm_judge/judge.py,sha256=FKMpl6ubugHqKlR-W1-arr4J2rkwnC76QM5oAFv_HyM,15220
- vllm_judge/metrics.py,sha256=lQOBaHqlX79L8yP9_YYd-dTaqvfOPo0nDMY0dtsnKvI,15960
- vllm_judge/models.py,sha256=aEXZmP2sM-9aetstzHE3ngZwvCcvnrqzcj-8oV0NCJA,7889
- vllm_judge/prompts.py,sha256=jAsBdshCCdgGF3UUAM0Wbb6MN1AB2jgHh1NmtXLbyrc,6345
- vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
- vllm_judge/utils.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
- vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
- vllm_judge/api/client.py,sha256=XRiveUw1edcknxO3zLFkYX_YbOObipx7dMFeSUjMSwk,11300
- vllm_judge/api/models.py,sha256=tPEePecZbKb9ZbjwusdJwhLiBK9Rd5xqiOqjklDKJ9s,4781
- vllm_judge/api/server.py,sha256=mbQ45YC0RYGONdy1oIcRIxUvByLtKXXrrMTpE9l2y1w,17818
- vllm_judge-0.1.3.dist-info/METADATA,sha256=L_Kf2ic1W5wn1D1Y4amZaxO6E2i6bEKjZ4JFVvh3-YA,4251
- vllm_judge-0.1.3.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
- vllm_judge-0.1.3.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
- vllm_judge-0.1.3.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
- vllm_judge-0.1.3.dist-info/RECORD,,