vllm-judge: vllm_judge-0.1.3-py3-none-any.whl → vllm_judge-0.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vllm_judge/__init__.py +16 -2
- vllm_judge/api/client.py +46 -9
- vllm_judge/api/models.py +9 -2
- vllm_judge/api/server.py +4 -2
- vllm_judge/batch.py +4 -4
- vllm_judge/cli.py +82 -6
- vllm_judge/judge.py +59 -12
- vllm_judge/metrics.py +744 -262
- vllm_judge/models.py +3 -2
- vllm_judge/prompts.py +35 -15
- {vllm_judge-0.1.3.dist-info → vllm_judge-0.1.5.dist-info}/METADATA +1 -1
- vllm_judge-0.1.5.dist-info/RECORD +20 -0
- vllm_judge-0.1.3.dist-info/RECORD +0 -20
- {vllm_judge-0.1.3.dist-info → vllm_judge-0.1.5.dist-info}/WHEEL +0 -0
- {vllm_judge-0.1.3.dist-info → vllm_judge-0.1.5.dist-info}/entry_points.txt +0 -0
- {vllm_judge-0.1.3.dist-info → vllm_judge-0.1.5.dist-info}/top_level.txt +0 -0
vllm_judge/models.py
CHANGED
@@ -99,7 +99,8 @@ class Metric:
|
|
99
99
|
system_prompt: Optional[str] = None,
|
100
100
|
template_vars: Optional[Dict[str, Any]] = None,
|
101
101
|
required_vars: Optional[List[str]] = None,
|
102
|
-
template_engine: Union[str, TemplateEngine] = TemplateEngine.FORMAT
|
102
|
+
template_engine: Union[str, TemplateEngine] = TemplateEngine.FORMAT,
|
103
|
+
additional_instructions: Optional[str] = None
|
103
104
|
):
|
104
105
|
"""
|
105
106
|
Initialize a reusable metric.
|
@@ -125,7 +126,7 @@ class Metric:
|
|
125
126
|
self.template_vars = template_vars or {}
|
126
127
|
self.required_vars = required_vars or []
|
127
128
|
self.template_engine = TemplateEngine(template_engine)
|
128
|
-
|
129
|
+
self.additional_instructions = additional_instructions
|
129
130
|
# Auto-detect required variables if not specified
|
130
131
|
if not self.required_vars and self.template_engine == TemplateEngine.FORMAT:
|
131
132
|
self._auto_detect_required_vars()
|
vllm_judge/prompts.py
CHANGED
@@ -6,8 +6,9 @@ class PromptBuilder:
|
|
6
6
|
|
7
7
|
@staticmethod
|
8
8
|
def build_messages(
|
9
|
-
|
9
|
+
content: Union[str, Dict[str, str]],
|
10
10
|
criteria: str,
|
11
|
+
input: Optional[str] = None,
|
11
12
|
rubric: Union[str, Dict[Union[int, float], str]] = None,
|
12
13
|
scale: Optional[Tuple[int, int]] = None,
|
13
14
|
examples: List[Dict[str, Any]] = None,
|
@@ -19,8 +20,9 @@ class PromptBuilder:
|
|
19
20
|
Build chat messages for evaluation.
|
20
21
|
|
21
22
|
Args:
|
22
|
-
|
23
|
+
content: Single response or dict with 'a' and 'b' for comparison
|
23
24
|
criteria: What to evaluate for
|
25
|
+
input: Optional input/question/prompt that the response addresses
|
24
26
|
rubric: Evaluation guide
|
25
27
|
scale: Numeric scale (min, max)
|
26
28
|
examples: Few-shot examples
|
@@ -32,7 +34,7 @@ class PromptBuilder:
|
|
32
34
|
List of chat messages
|
33
35
|
"""
|
34
36
|
# Detect evaluation type
|
35
|
-
is_comparison = isinstance(
|
37
|
+
is_comparison = isinstance(content, dict) and "a" in content and "b" in content
|
36
38
|
|
37
39
|
# System message
|
38
40
|
if not system_prompt:
|
@@ -46,7 +48,7 @@ class PromptBuilder:
|
|
46
48
|
# Output format instructions
|
47
49
|
system_prompt+="\nYou must respond in JSON format:\n"
|
48
50
|
system_prompt+="""{
|
49
|
-
"decision": <your judgment - string|
|
51
|
+
"decision": <your judgment - string|boolean>,
|
50
52
|
"reasoning": "<concise explanation of your judgment>",
|
51
53
|
"score": <numeric score if requested, otherwise null>
|
52
54
|
}"""
|
@@ -54,7 +56,8 @@ class PromptBuilder:
|
|
54
56
|
|
55
57
|
# Build user message
|
56
58
|
user_content = PromptBuilder._build_user_prompt(
|
57
|
-
|
59
|
+
content=content,
|
60
|
+
input=input,
|
58
61
|
criteria=criteria,
|
59
62
|
rubric=rubric,
|
60
63
|
scale=scale,
|
@@ -71,34 +74,49 @@ class PromptBuilder:
|
|
71
74
|
|
72
75
|
@staticmethod
|
73
76
|
def _build_user_prompt(
|
74
|
-
|
77
|
+
content: Union[str, Dict[str, str]],
|
75
78
|
criteria: str,
|
76
79
|
rubric: Union[str, Dict[Union[int, float], str]],
|
77
80
|
scale: Optional[Tuple[int, int]],
|
78
81
|
examples: List[Dict[str, Any]],
|
79
82
|
is_comparison: bool,
|
80
83
|
context: Optional[str] = None,
|
84
|
+
input: Optional[str] = None,
|
81
85
|
**kwargs
|
82
86
|
) -> str:
|
83
87
|
"""Build the user message content."""
|
84
88
|
parts = []
|
89
|
+
|
90
|
+
# Add input section if provided
|
91
|
+
if input:
|
92
|
+
parts.append("Given the following input/question:")
|
93
|
+
parts.append(f'"{input}"')
|
94
|
+
parts.append("")
|
85
95
|
|
86
96
|
# Task description
|
87
97
|
if is_comparison:
|
88
|
-
|
98
|
+
if input:
|
99
|
+
parts.append(f"Compare how well these two responses address the input for: {criteria}")
|
100
|
+
else:
|
101
|
+
parts.append(f"Compare these two responses based on: {criteria}")
|
89
102
|
if context:
|
90
103
|
parts.append(f"\nContext: {context}")
|
91
|
-
parts.append(f"\nResponse A:\n{
|
92
|
-
parts.append(f"\nResponse B:\n{
|
104
|
+
parts.append(f"\nResponse A:\n{content['a']}")
|
105
|
+
parts.append(f"\nResponse B:\n{content['b']}")
|
93
106
|
else:
|
94
|
-
|
107
|
+
if input:
|
108
|
+
parts.append(f"Evaluate how well this content addresses the input for: {criteria}")
|
109
|
+
else:
|
110
|
+
parts.append(f"Evaluate the following content based on: {criteria}")
|
95
111
|
if context:
|
96
112
|
parts.append(f"\nContext: {context}")
|
97
|
-
parts.append(f"\
|
113
|
+
parts.append(f"\nContent to evaluate:\n{content}")
|
98
114
|
|
115
|
+
parts.append(f"\nYou must return a decision label/class (your judgement) for the `decision` field and a concise explanation for the `reasoning` field.")
|
116
|
+
|
99
117
|
# Add scale and rubric
|
100
118
|
if scale:
|
101
|
-
parts.append(f"\
|
119
|
+
parts.append(f"\nIn addition to these, provide a score from {scale[0]} to {scale[1]}")
|
102
120
|
|
103
121
|
if isinstance(rubric, dict):
|
104
122
|
parts.append("\nScoring guide:")
|
@@ -118,8 +136,10 @@ class PromptBuilder:
|
|
118
136
|
parts.append(f"\nExample {i}:")
|
119
137
|
|
120
138
|
# Handle different example formats
|
121
|
-
if "
|
122
|
-
parts.append(f"
|
139
|
+
if "input" in ex:
|
140
|
+
parts.append(f"Input: {ex['input']}")
|
141
|
+
if "content" in ex:
|
142
|
+
parts.append(f"Content: {ex['content']}")
|
123
143
|
elif "text" in ex:
|
124
144
|
parts.append(f"Text: {ex['text']}")
|
125
145
|
|
@@ -138,7 +158,7 @@ class PromptBuilder:
|
|
138
158
|
# Output format instructions
|
139
159
|
parts.append("\nYou must respond in JSON format:")
|
140
160
|
parts.append("""{
|
141
|
-
"decision": <your judgment - string|
|
161
|
+
"decision": <your judgment - string|boolean>,
|
142
162
|
"reasoning": "<concise explanation of your judgment>",
|
143
163
|
"score": <numeric score if requested, otherwise null>
|
144
164
|
}""")
|
vllm_judge-0.1.5.dist-info/RECORD
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
vllm_judge/__init__.py,sha256=6OKo_RbNOov83pZIPfg12ITxiE6UZh2_UOTjQsgWbFY,2792
|
2
|
+
vllm_judge/batch.py,sha256=3zkatZxQESCjYz99qfLhxl2Dq2tHAfhtdTiXxjVqUxE,4836
|
3
|
+
vllm_judge/cli.py,sha256=tnMqJ2RvCFaXUY4ok4IO-d9IRNJhEck60AJNzdCaqhg,13679
|
4
|
+
vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
|
5
|
+
vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
|
6
|
+
vllm_judge/judge.py,sha256=SDT_cGDZzHu8NOjG6eqHQsYqIuXR12j7ocpyrVDhHrQ,16939
|
7
|
+
vllm_judge/metrics.py,sha256=WwtR6Bb4cc0gDplhZnysNzD1EfOMCEzFc8-3hJMqnJs,48709
|
8
|
+
vllm_judge/models.py,sha256=o4OdRtRdsz9n5RhHrz-uA9ylG0cGQg99NJYay0RaeDE,7998
|
9
|
+
vllm_judge/prompts.py,sha256=KC8AfiIgKKxQuhT1bnnyYXrSBbcU2-RnkSLqDJfrt8o,7251
|
10
|
+
vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
|
11
|
+
vllm_judge/utils.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
|
12
|
+
vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
|
13
|
+
vllm_judge/api/client.py,sha256=l46IpQHJxmbDfXpyCOXfir70c_3hPaIr6OEiOzOMk5Q,12449
|
14
|
+
vllm_judge/api/models.py,sha256=GXj3slwytJWg5M4f5MPZ8Ft_hrkEEAZh0qgpYDy-Qe4,5102
|
15
|
+
vllm_judge/api/server.py,sha256=1UQMV6MRdlqHS6NYdrQI41bi_wNb0QC8RZD4jCEeTkU,17888
|
16
|
+
vllm_judge-0.1.5.dist-info/METADATA,sha256=5UXUqyckWp9fGLQXcBxkI6ejmFfWpCjjpyIeMx96zTI,4251
|
17
|
+
vllm_judge-0.1.5.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
|
18
|
+
vllm_judge-0.1.5.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
|
19
|
+
vllm_judge-0.1.5.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
|
20
|
+
vllm_judge-0.1.5.dist-info/RECORD,,
|
vllm_judge-0.1.3.dist-info/RECORD
REMOVED
@@ -1,20 +0,0 @@
|
|
1
|
-
vllm_judge/__init__.py,sha256=TBS7fQ4n7QEVwNtr4ErJu-T3m4c-8BwW4zDltt8S6Ko,2469
|
2
|
-
vllm_judge/batch.py,sha256=68jKgRTMzZXw4bxAiGp73NZzHOd1tKK763nBNjrr6gg,4842
|
3
|
-
vllm_judge/cli.py,sha256=mdoxNA5gQ1m3XBnNJYCE8uoi0RxrS9d3YIlrtdxRcME,10683
|
4
|
-
vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
|
5
|
-
vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
|
6
|
-
vllm_judge/judge.py,sha256=FKMpl6ubugHqKlR-W1-arr4J2rkwnC76QM5oAFv_HyM,15220
|
7
|
-
vllm_judge/metrics.py,sha256=lQOBaHqlX79L8yP9_YYd-dTaqvfOPo0nDMY0dtsnKvI,15960
|
8
|
-
vllm_judge/models.py,sha256=aEXZmP2sM-9aetstzHE3ngZwvCcvnrqzcj-8oV0NCJA,7889
|
9
|
-
vllm_judge/prompts.py,sha256=jAsBdshCCdgGF3UUAM0Wbb6MN1AB2jgHh1NmtXLbyrc,6345
|
10
|
-
vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
|
11
|
-
vllm_judge/utils.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
|
12
|
-
vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
|
13
|
-
vllm_judge/api/client.py,sha256=XRiveUw1edcknxO3zLFkYX_YbOObipx7dMFeSUjMSwk,11300
|
14
|
-
vllm_judge/api/models.py,sha256=tPEePecZbKb9ZbjwusdJwhLiBK9Rd5xqiOqjklDKJ9s,4781
|
15
|
-
vllm_judge/api/server.py,sha256=mbQ45YC0RYGONdy1oIcRIxUvByLtKXXrrMTpE9l2y1w,17818
|
16
|
-
vllm_judge-0.1.3.dist-info/METADATA,sha256=L_Kf2ic1W5wn1D1Y4amZaxO6E2i6bEKjZ4JFVvh3-YA,4251
|
17
|
-
vllm_judge-0.1.3.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
|
18
|
-
vllm_judge-0.1.3.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
|
19
|
-
vllm_judge-0.1.3.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
|
20
|
-
vllm_judge-0.1.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|