vllm-judge 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
vllm_judge/models.py CHANGED
@@ -99,7 +99,8 @@ class Metric:
          system_prompt: Optional[str] = None,
          template_vars: Optional[Dict[str, Any]] = None,
          required_vars: Optional[List[str]] = None,
-         template_engine: Union[str, TemplateEngine] = TemplateEngine.FORMAT
+         template_engine: Union[str, TemplateEngine] = TemplateEngine.FORMAT,
+         additional_instructions: Optional[str] = None
      ):
          """
          Initialize a reusable metric.
@@ -125,7 +126,7 @@ class Metric:
          self.template_vars = template_vars or {}
          self.required_vars = required_vars or []
          self.template_engine = TemplateEngine(template_engine)
- 
+         self.additional_instructions = additional_instructions
          # Auto-detect required variables if not specified
          if not self.required_vars and self.template_engine == TemplateEngine.FORMAT:
              self._auto_detect_required_vars()
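
The new `additional_instructions` argument appears to let a Metric carry extra guidance for the judge. A minimal usage sketch follows; only `additional_instructions` and the keyword arguments visible in this hunk come from the diff, while `name` and `criteria` are assumed parts of Metric's signature that this diff does not show:

    from vllm_judge.models import Metric

    clarity = Metric(
        name="clarity",                      # assumed argument, not shown in this diff
        criteria="clarity and readability",  # assumed argument, not shown in this diff
        additional_instructions="Penalize answers that bury the conclusion.",  # new in 0.1.5
    )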
vllm_judge/prompts.py CHANGED
@@ -6,8 +6,9 @@ class PromptBuilder:
  
      @staticmethod
      def build_messages(
-         response: Union[str, Dict[str, str]],
+         content: Union[str, Dict[str, str]],
          criteria: str,
+         input: Optional[str] = None,
          rubric: Union[str, Dict[Union[int, float], str]] = None,
          scale: Optional[Tuple[int, int]] = None,
          examples: List[Dict[str, Any]] = None,
@@ -19,8 +20,9 @@ class PromptBuilder:
          Build chat messages for evaluation.
  
          Args:
-             response: Single response or dict with 'a' and 'b' for comparison
+             content: Single response or dict with 'a' and 'b' for comparison
              criteria: What to evaluate for
+             input: Optional input/question/prompt that the response addresses
              rubric: Evaluation guide
              scale: Numeric scale (min, max)
              examples: Few-shot examples
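
To illustrate the renamed `content` parameter and the new optional `input`, a hypothetical single-response call (values are invented; the remaining keyword arguments are left at their defaults):

    from vllm_judge.prompts import PromptBuilder

    # Single-response evaluation: `content` replaces the old `response` argument,
    # and `input` carries the question the response is answering.
    messages = PromptBuilder.build_messages(
        content="Paris is the capital of France.",
        criteria="factual accuracy",
        input="What is the capital of France?",
        scale=(1, 5),
    )
    # `messages` is the list of chat messages (system + user) to send to the judge,
    # which is instructed to reply with JSON containing decision, reasoning, and score.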
@@ -32,7 +34,7 @@ class PromptBuilder:
              List of chat messages
          """
          # Detect evaluation type
-         is_comparison = isinstance(response, dict) and "a" in response and "b" in response
+         is_comparison = isinstance(content, dict) and "a" in content and "b" in content
  
          # System message
          if not system_prompt:
@@ -46,7 +48,7 @@ class PromptBuilder:
          # Output format instructions
          system_prompt+="\nYou must respond in JSON format:\n"
          system_prompt+="""{
-     "decision": <your judgment - string|number|boolean>,
+     "decision": <your judgment - string|boolean>,
      "reasoning": "<concise explanation of your judgment>",
      "score": <numeric score if requested, otherwise null>
  }"""
@@ -54,7 +56,8 @@ class PromptBuilder:
  
          # Build user message
          user_content = PromptBuilder._build_user_prompt(
-             response=response,
+             content=content,
+             input=input,
              criteria=criteria,
              rubric=rubric,
              scale=scale,
@@ -71,34 +74,49 @@ class PromptBuilder:
  
      @staticmethod
      def _build_user_prompt(
-         response: Union[str, Dict[str, str]],
+         content: Union[str, Dict[str, str]],
          criteria: str,
          rubric: Union[str, Dict[Union[int, float], str]],
          scale: Optional[Tuple[int, int]],
          examples: List[Dict[str, Any]],
          is_comparison: bool,
          context: Optional[str] = None,
+         input: Optional[str] = None,
          **kwargs
      ) -> str:
          """Build the user message content."""
          parts = []
+ 
+         # Add input section if provided
+         if input:
+             parts.append("Given the following input/question:")
+             parts.append(f'"{input}"')
+             parts.append("")
  
          # Task description
          if is_comparison:
-             parts.append(f"Compare these two responses based on: {criteria}")
+             if input:
+                 parts.append(f"Compare how well these two responses address the input for: {criteria}")
+             else:
+                 parts.append(f"Compare these two responses based on: {criteria}")
              if context:
                  parts.append(f"\nContext: {context}")
-             parts.append(f"\nResponse A:\n{response['a']}")
-             parts.append(f"\nResponse B:\n{response['b']}")
+             parts.append(f"\nResponse A:\n{content['a']}")
+             parts.append(f"\nResponse B:\n{content['b']}")
          else:
-             parts.append(f"Evaluate the following response based on: {criteria}")
+             if input:
+                 parts.append(f"Evaluate how well this content addresses the input for: {criteria}")
+             else:
+                 parts.append(f"Evaluate the following content based on: {criteria}")
              if context:
                  parts.append(f"\nContext: {context}")
-             parts.append(f"\nResponse to evaluate:\n{response}")
+             parts.append(f"\nContent to evaluate:\n{content}")
  
+         parts.append(f"\nYou must return a decision label/class (your judgement) for the `decision` field and a concise explanation for the `reasoning` field.")
+ 
          # Add scale and rubric
          if scale:
-             parts.append(f"\nProvide a score from {scale[0]} to {scale[1]}")
+             parts.append(f"\nIn addition to these, provide a score from {scale[0]} to {scale[1]}")
  
          if isinstance(rubric, dict):
              parts.append("\nScoring guide:")
@@ -118,8 +136,10 @@ class PromptBuilder:
              parts.append(f"\nExample {i}:")
  
              # Handle different example formats
-             if "response" in ex:
-                 parts.append(f"Response: {ex['response']}")
+             if "input" in ex:
+                 parts.append(f"Input: {ex['input']}")
+             if "content" in ex:
+                 parts.append(f"Content: {ex['content']}")
              elif "text" in ex:
                  parts.append(f"Text: {ex['text']}")
  
@@ -138,7 +158,7 @@ class PromptBuilder:
          # Output format instructions
          parts.append("\nYou must respond in JSON format:")
          parts.append("""{
-     "decision": <your judgment - string|number|boolean>,
+     "decision": <your judgment - string|boolean>,
      "reasoning": "<concise explanation of your judgment>",
      "score": <numeric score if requested, otherwise null>
  }""")
vllm_judge-0.1.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vllm_judge
- Version: 0.1.3
+ Version: 0.1.5
  Summary: LLM-as-a-Judge evaluations for vLLM hosted models
  Author: TrustyAI team
  Author-email: Sai Chandra Pandraju <saichandrapandraju@gmail.com>
vllm_judge-0.1.5.dist-info/RECORD ADDED
@@ -0,0 +1,20 @@
+ vllm_judge/__init__.py,sha256=6OKo_RbNOov83pZIPfg12ITxiE6UZh2_UOTjQsgWbFY,2792
+ vllm_judge/batch.py,sha256=3zkatZxQESCjYz99qfLhxl2Dq2tHAfhtdTiXxjVqUxE,4836
+ vllm_judge/cli.py,sha256=tnMqJ2RvCFaXUY4ok4IO-d9IRNJhEck60AJNzdCaqhg,13679
+ vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
+ vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
+ vllm_judge/judge.py,sha256=SDT_cGDZzHu8NOjG6eqHQsYqIuXR12j7ocpyrVDhHrQ,16939
+ vllm_judge/metrics.py,sha256=WwtR6Bb4cc0gDplhZnysNzD1EfOMCEzFc8-3hJMqnJs,48709
+ vllm_judge/models.py,sha256=o4OdRtRdsz9n5RhHrz-uA9ylG0cGQg99NJYay0RaeDE,7998
+ vllm_judge/prompts.py,sha256=KC8AfiIgKKxQuhT1bnnyYXrSBbcU2-RnkSLqDJfrt8o,7251
+ vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
+ vllm_judge/utils.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
+ vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
+ vllm_judge/api/client.py,sha256=l46IpQHJxmbDfXpyCOXfir70c_3hPaIr6OEiOzOMk5Q,12449
+ vllm_judge/api/models.py,sha256=GXj3slwytJWg5M4f5MPZ8Ft_hrkEEAZh0qgpYDy-Qe4,5102
+ vllm_judge/api/server.py,sha256=1UQMV6MRdlqHS6NYdrQI41bi_wNb0QC8RZD4jCEeTkU,17888
+ vllm_judge-0.1.5.dist-info/METADATA,sha256=5UXUqyckWp9fGLQXcBxkI6ejmFfWpCjjpyIeMx96zTI,4251
+ vllm_judge-0.1.5.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+ vllm_judge-0.1.5.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
+ vllm_judge-0.1.5.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
+ vllm_judge-0.1.5.dist-info/RECORD,,
vllm_judge-0.1.3.dist-info/RECORD DELETED
@@ -1,20 +0,0 @@
- vllm_judge/__init__.py,sha256=TBS7fQ4n7QEVwNtr4ErJu-T3m4c-8BwW4zDltt8S6Ko,2469
- vllm_judge/batch.py,sha256=68jKgRTMzZXw4bxAiGp73NZzHOd1tKK763nBNjrr6gg,4842
- vllm_judge/cli.py,sha256=mdoxNA5gQ1m3XBnNJYCE8uoi0RxrS9d3YIlrtdxRcME,10683
- vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
- vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
- vllm_judge/judge.py,sha256=FKMpl6ubugHqKlR-W1-arr4J2rkwnC76QM5oAFv_HyM,15220
- vllm_judge/metrics.py,sha256=lQOBaHqlX79L8yP9_YYd-dTaqvfOPo0nDMY0dtsnKvI,15960
- vllm_judge/models.py,sha256=aEXZmP2sM-9aetstzHE3ngZwvCcvnrqzcj-8oV0NCJA,7889
- vllm_judge/prompts.py,sha256=jAsBdshCCdgGF3UUAM0Wbb6MN1AB2jgHh1NmtXLbyrc,6345
- vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
- vllm_judge/utils.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
- vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
- vllm_judge/api/client.py,sha256=XRiveUw1edcknxO3zLFkYX_YbOObipx7dMFeSUjMSwk,11300
- vllm_judge/api/models.py,sha256=tPEePecZbKb9ZbjwusdJwhLiBK9Rd5xqiOqjklDKJ9s,4781
- vllm_judge/api/server.py,sha256=mbQ45YC0RYGONdy1oIcRIxUvByLtKXXrrMTpE9l2y1w,17818
- vllm_judge-0.1.3.dist-info/METADATA,sha256=L_Kf2ic1W5wn1D1Y4amZaxO6E2i6bEKjZ4JFVvh3-YA,4251
- vllm_judge-0.1.3.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
- vllm_judge-0.1.3.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
- vllm_judge-0.1.3.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
- vllm_judge-0.1.3.dist-info/RECORD,,