vllm-judge 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vllm_judge/__init__.py +17 -3
- vllm_judge/api/__init__.py +0 -3
- vllm_judge/api/client.py +0 -3
- vllm_judge/api/server.py +1 -5
- vllm_judge/batch.py +2 -1
- vllm_judge/builtin_metrics.py +907 -0
- vllm_judge/cli.py +1 -5
- vllm_judge/client.py +1 -6
- vllm_judge/judge.py +2 -2
- vllm_judge/models.py +3 -3
- vllm_judge/{prompts.py → prompt_builder.py} +60 -38
- {vllm_judge-0.1.4.dist-info → vllm_judge-0.1.6.dist-info}/METADATA +1 -1
- vllm_judge-0.1.6.dist-info/RECORD +20 -0
- vllm_judge/metrics.py +0 -582
- vllm_judge-0.1.4.dist-info/RECORD +0 -20
- /vllm_judge/{utils.py → parsers.py} +0 -0
- {vllm_judge-0.1.4.dist-info → vllm_judge-0.1.6.dist-info}/WHEEL +0 -0
- {vllm_judge-0.1.4.dist-info → vllm_judge-0.1.6.dist-info}/entry_points.txt +0 -0
- {vllm_judge-0.1.4.dist-info → vllm_judge-0.1.6.dist-info}/top_level.txt +0 -0
vllm_judge/cli.py
CHANGED
@@ -1,6 +1,3 @@
-"""
-Command-line interface for vLLM Judge.
-"""
 import asyncio
 import json
 import sys
@@ -8,10 +5,9 @@ from typing import Optional
 import click
 
 from vllm_judge import Judge
-from vllm_judge.models import JudgeConfig
 from vllm_judge.api.server import start_server as start_api_server
 from vllm_judge.api.client import JudgeClient
-from vllm_judge.metrics import BUILTIN_METRICS
+from vllm_judge.builtin_metrics import BUILTIN_METRICS
 
 
 @click.group()
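Aside from dropping the module docstring and the unused JudgeConfig import, the only functional change in cli.py is the import path: the built-in metric registry now lives in vllm_judge.builtin_metrics instead of vllm_judge.metrics. A minimal sketch of the corresponding downstream update, assuming BUILTIN_METRICS is still an iterable, name-keyed registry (this diff only confirms the import itself):

    # 0.1.4: from vllm_judge.metrics import BUILTIN_METRICS
    # 0.1.6:
    from vllm_judge.builtin_metrics import BUILTIN_METRICS

    # List the bundled metrics; assumes the registry is a name-keyed mapping.
    for name in sorted(BUILTIN_METRICS):
        print(name)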
vllm_judge/client.py
CHANGED
@@ -126,12 +126,8 @@ class VLLMClient:
             "messages": messages,
             "temperature": self.config.temperature,
             "max_tokens": self.config.max_tokens,
-            # "top_p": self.config.top_p,
         }
-
-        # # Request JSON response format if supported
-        # if self.config.temperature < 0.2:  # Only for low temperature
-        #     request_data["response_format"] = {"type": "json_object"}
+
 
         try:
             response = await self._request_with_retry(
@@ -172,7 +168,6 @@ class VLLMClient:
             "prompt": prompt,
             "temperature": self.config.temperature,
             "max_tokens": self.config.max_tokens,
-            # "top_p": self.config.top_p,
         }
 
         try:
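With the commented-out top_p and response_format experiments deleted, the request bodies the client sends are reduced to the fields kept above. A rough sketch of the resulting chat-completions payload, using the JudgeConfig defaults shown in models.py; the surrounding config object and HTTP call are assumed and not shown:

    # Illustrative payload only; real values come from self.config at runtime.
    messages = [{"role": "user", "content": "Evaluate the following response ..."}]
    request_data = {
        "messages": messages,
        "temperature": 0.0,   # JudgeConfig default
        "max_tokens": 256,    # JudgeConfig default
    }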
vllm_judge/judge.py
CHANGED
@@ -4,9 +4,9 @@ from typing import Union, Dict, List, Optional, Tuple, Any, Callable
 
 from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine, ModelSpecificMetric
 from vllm_judge.client import VLLMClient
-from vllm_judge.prompts import PromptBuilder
+from vllm_judge.prompt_builder import PromptBuilder
 from vllm_judge.batch import BatchProcessor
-from vllm_judge.metrics import BUILTIN_METRICS
+from vllm_judge.builtin_metrics import BUILTIN_METRICS
 from vllm_judge.templating import TemplateProcessor
 from vllm_judge.exceptions import (
     ParseError,
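judge.py simply tracks the two module renames from the file list (prompts.py → prompt_builder.py, metrics.py → builtin_metrics.py); code that imported the prompt builder directly needs the same one-line update:

    # 0.1.4: from vllm_judge.prompts import PromptBuilder
    # 0.1.6:
    from vllm_judge.prompt_builder import PromptBuilder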
vllm_judge/models.py
CHANGED
@@ -59,7 +59,6 @@ class JudgeConfig(BaseModel):
     # Model parameters
     temperature: float = Field(0.0, description="Sampling temperature")
     max_tokens: int = Field(256, description="Maximum tokens in response")
-    # top_p: float = Field(0.95, description="Top-p sampling")
 
     # Batch settings
     max_concurrent: int = Field(50, description="Maximum concurrent requests")
@@ -99,7 +98,8 @@ class Metric:
         system_prompt: Optional[str] = None,
         template_vars: Optional[Dict[str, Any]] = None,
         required_vars: Optional[List[str]] = None,
-        template_engine: Union[str, TemplateEngine] = TemplateEngine.FORMAT
+        template_engine: Union[str, TemplateEngine] = TemplateEngine.FORMAT,
+        additional_instructions: Optional[str] = None
     ):
         """
         Initialize a reusable metric.
@@ -125,7 +125,7 @@ class Metric:
         self.template_vars = template_vars or {}
         self.required_vars = required_vars or []
         self.template_engine = TemplateEngine(template_engine)
-
+        self.additional_instructions = additional_instructions
         # Auto-detect required variables if not specified
         if not self.required_vars and self.template_engine == TemplateEngine.FORMAT:
             self._auto_detect_required_vars()
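Metric gains an additional_instructions keyword, stored on the instance and later rendered by the prompt builder as an "Additional instructions:" line (see the prompt_builder.py hunks below). A hedged construction sketch; the leading parameters (name, criteria) are assumed from earlier releases and are not part of this hunk:

    from vllm_judge.models import Metric

    metric = Metric(
        name="safety-check",  # assumed pre-existing parameter, not shown in this diff
        criteria="response avoids unsafe or harmful instructions",  # assumed as well
        additional_instructions="Flag any medical advice that lacks a disclaimer.",
    )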
vllm_judge/{prompts.py → prompt_builder.py}
RENAMED
@@ -1,5 +1,5 @@
 from typing import List, Dict, Union, Optional, Tuple, Any
-
+import json
 
 class PromptBuilder:
     """Builds prompts for evaluation requests."""
@@ -35,24 +35,36 @@ class PromptBuilder:
         """
         # Detect evaluation type
         is_comparison = isinstance(content, dict) and "a" in content and "b" in content
+
+        output_format = """
+        # Output Format:
+
+        The JSON object MUST have exactly these three fields:
+
+        1. decision: (String | Boolean) This decision label should clearly state your main finding. This could be a string representing a specific class (eg., PASS, FAIL, CORRECT, INCORRECT, etc.) or a boolean value (true or false). If user provided a rubric, you should use the rubric to determine the decision label.
+        2. score: (Number | null) A numerical score for the evaluation. If scoring is requested, provide the score as a number. If scoring is NOT requested or is not applicable for the specific task, you MUST use the value null for this field.
+        3. reasoning: (String) A concise explanation justifying your decision and score (if a score was provided). This reasoning must directly and logically support your evaluation and refer to the specific evaluation criteria.
+
+        The JSON object MUST be well-formed and adhere strictly to the following structure:
+
+        {
+            "decision": <your judgment - string|boolean>,
+            "reasoning": <concise explanation of your judgment - string>,
+            "score": <numeric score if requested, otherwise null - number|null>
+        }
+        """
 
         # System message
         if not system_prompt:
-
-
-
-
-
-
-
-
-            system_prompt+=
-            system_prompt+="""{
-                "decision": <your judgment - string|number|boolean>,
-                "reasoning": "<concise explanation of your judgment>",
-                "score": <numeric score if requested, otherwise null>
-            }"""
-            system_prompt+="\nDo not include any text in your response except for the JSON object."
+            system_prompt = """You are an impartial judge and expert evaluator. Your task is to evaluate the provided content based on the specific evaluation criteria and rubric.
+            # Key Instructions:
+            1. Your evaluation must be objective, consistent, and based solely on the specified criteria. Do not let your own opinions or biases interfere.
+            2. Focus exclusively on quality assessment.
+            3. Do not be influenced by the length of the responses unless response length is explicitly relevant to the specified evaluation criteria (e.g., a task assessing conciseness or verbosity).
+            4. Your entire response MUST be a single, valid JSON object and nothing else. Do not include any text or conversational filler before or after this JSON object.
+
+            """
+            system_prompt += output_format
 
         # Build user message
         user_content = PromptBuilder._build_user_prompt(
@@ -93,28 +105,30 @@ class PromptBuilder:
             parts.append(f'"{input}"')
             parts.append("")
 
+        parts.append("## Content to evaluate:")
+        if is_comparison:
+            parts.append(f"**Response A:**\n{content['a']}")
+            parts.append(f"**Response B:**\n{content['b']}")
+        else:
+            parts.append(content)
+
+        parts.append("## Evaluation Criteria:")
+
         # Task description
         if is_comparison:
-
-            parts.append(f"Compare how well these two responses address the input for: {criteria}")
-            else:
-                parts.append(f"Compare these two responses based on: {criteria}")
+            parts.append(f"Compare the two responses based on: {criteria}")
             if context:
                 parts.append(f"\nContext: {context}")
-            parts.append(f"\nResponse A:\n{content['a']}")
-            parts.append(f"\nResponse B:\n{content['b']}")
         else:
-
-            parts.append(f"Evaluate how well this response addresses the input for: {criteria}")
-            else:
-                parts.append(f"Evaluate the following response based on: {criteria}")
+            parts.append(f"Evaluate the content based on: {criteria}")
             if context:
                 parts.append(f"\nContext: {context}")
-            parts.append(f"\nResponse to evaluate:\n{content}")
 
+        parts.append(f"\nYou must return a decision label/class (your main judgement) for the `decision` field and a concise explanation for the `reasoning` field in the JSON object.")
+
         # Add scale and rubric
         if scale:
-            parts.append(f"
+            parts.append(f"In addition to these, provide a score from {scale[0]} to {scale[1]}")
 
             if isinstance(rubric, dict):
                 parts.append("\nScoring guide:")
@@ -125,38 +139,46 @@ class PromptBuilder:
             elif rubric:
                 parts.append(f"\nEvaluation guide: {rubric}")
         elif rubric:
+            parts.append("\nIn addition to these, provide a score if required by the following evaluation guide.")
            parts.append(f"\nEvaluation guide: {rubric}")
 
         # Add examples if provided
         if examples:
             parts.append("\nExample evaluations:")
-            for i, ex in enumerate(examples
-            parts.append(f"
-
+            for i, ex in enumerate(examples):
+                parts.append(f"Example {i+1}:")
+                parts.append("Request:")
                # Handle different example formats
                if "input" in ex:
                    parts.append(f"Input: {ex['input']}")
                if "content" in ex:
-                   parts.append(f"
+                   parts.append(f"Content: {ex['content']}")
                elif "text" in ex:
                    parts.append(f"Text: {ex['text']}")
 
-
-
+                parts.append("Response:")
+
+                response = {}
+                if "decision" not in ex or ex["decision"] is None or ex["decision"] == "":
+                    raise ValueError("Example must include a decision field")
+
+                response["decision"] = ex["decision"]
                if "score" in ex:
-
+                    response["score"] = ex["score"]
 
                if "reasoning" in ex:
-
+                    response["reasoning"] = ex["reasoning"]
+
+                parts.append(json.dumps(response))
 
         # Add any additional instructions
         if kwargs.get("additional_instructions"):
-            parts.append(f"
+            parts.append(f"Additional instructions: {kwargs['additional_instructions']}")
 
         # Output format instructions
         parts.append("\nYou must respond in JSON format:")
         parts.append("""{
-            "decision": <your judgment - string|
+            "decision": <your judgment - string|boolean>,
             "reasoning": "<concise explanation of your judgment>",
             "score": <numeric score if requested, otherwise null>
         }""")
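Taken together, these prompt changes pin down a stricter response contract: the model must reply with a single JSON object containing exactly the decision, reasoning, and score fields, with score set to null when no scale is requested. A minimal sketch of consuming such a reply (the raw string is illustrative only):

    import json

    # Example of a reply that satisfies the contract described in the new prompt.
    raw = '{"decision": "PASS", "reasoning": "Meets all stated criteria.", "score": null}'

    result = json.loads(raw)
    assert set(result) == {"decision", "reasoning", "score"}
    print(result["decision"], result["score"])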
vllm_judge-0.1.6.dist-info/RECORD
ADDED
@@ -0,0 +1,20 @@
+vllm_judge/__init__.py,sha256=bl6j1TXcPFsNcOKpFGX6FrkS0dikhApOKjhsBOaXm80,2800
+vllm_judge/batch.py,sha256=GJe6d2nsUWCxcSG-j5xnyovfKAM-YklWS0PNAwTMO9s,4886
+vllm_judge/builtin_metrics.py,sha256=XAhn5a-kJgip4NYkaTmkwiIWXjYGRcHoztAmqjmDO9A,48711
+vllm_judge/cli.py,sha256=3075NrduxYz_iITQ0ZnqdjK0jJ9vGpzC6B_23lAN3wc,13598
+vllm_judge/client.py,sha256=x3LBRUjnOmX0iEWdRqz-ALzb03qezZ92aMpdMFzHRcs,8096
+vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
+vllm_judge/judge.py,sha256=X3oLXfWjmIOay5oDWBQNoEnxyDlF0sPf69HBjieW1Ug,16954
+vllm_judge/models.py,sha256=wN2JGddWAxT4EXhmfl3IjBYOpDG_9lGP125UWP4IKTw,7935
+vllm_judge/parsers.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
+vllm_judge/prompt_builder.py,sha256=miQU_mKDKkTuRfVEiQT2LfN4QEvnphLu39s2YldOvCA,8754
+vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
+vllm_judge/api/__init__.py,sha256=yUqAIcqpBDUKoq8ZLcKPQaX45oesy3Nmb2yEwy-dHyU,727
+vllm_judge/api/client.py,sha256=RgbhzRLlOR8gia8_-Kbe2_wQC4tjNPzqObPz2GPP5ec,12409
+vllm_judge/api/models.py,sha256=GXj3slwytJWg5M4f5MPZ8Ft_hrkEEAZh0qgpYDy-Qe4,5102
+vllm_judge/api/server.py,sha256=B97RVVeVHLxf69_bSZBvP69DbTQhoFW2tZOBBS0ahrQ,17838
+vllm_judge-0.1.6.dist-info/METADATA,sha256=MRGak20XswQG2-Qq_iFCIUNqZcWfMOZsSA8GRWMj6ak,4251
+vllm_judge-0.1.6.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+vllm_judge-0.1.6.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
+vllm_judge-0.1.6.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
+vllm_judge-0.1.6.dist-info/RECORD,,
|