vllm-judge 0.1.0 (vllm_judge-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vllm_judge/__init__.py +120 -0
- vllm_judge/api/__init__.py +39 -0
- vllm_judge/api/client.py +354 -0
- vllm_judge/api/models.py +157 -0
- vllm_judge/api/server.py +564 -0
- vllm_judge/batch.py +147 -0
- vllm_judge/cli.py +288 -0
- vllm_judge/client.py +262 -0
- vllm_judge/exceptions.py +42 -0
- vllm_judge/judge.py +421 -0
- vllm_judge/metrics.py +417 -0
- vllm_judge/models.py +185 -0
- vllm_judge/prompts.py +175 -0
- vllm_judge/templating.py +206 -0
- vllm_judge-0.1.0.dist-info/METADATA +124 -0
- vllm_judge-0.1.0.dist-info/RECORD +19 -0
- vllm_judge-0.1.0.dist-info/WHEEL +5 -0
- vllm_judge-0.1.0.dist-info/entry_points.txt +2 -0
- vllm_judge-0.1.0.dist-info/top_level.txt +1 -0
vllm_judge/judge.py
ADDED
@@ -0,0 +1,421 @@
import json
import re
from typing import Union, Dict, List, Optional, Tuple, Any, Callable

from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine
from vllm_judge.client import VLLMClient
from vllm_judge.prompts import PromptBuilder
from vllm_judge.batch import BatchProcessor
from vllm_judge.metrics import BUILTIN_METRICS
from vllm_judge.templating import TemplateProcessor
from vllm_judge.exceptions import (
    ParseError,
    InvalidInputError,
    MetricNotFoundError,
    VLLMJudgeError
)


class Judge:
    """Main class for LLM-as-a-Judge evaluations."""

    def __init__(self, config: JudgeConfig):
        """
        Initialize Judge with configuration.

        Args:
            config: Judge configuration
        """
        self.config = config
        self.client = VLLMClient(config)
        self.metrics: Dict[str, Metric] = {}

    @classmethod
    def from_url(cls, base_url: str, model: Optional[str] = None, **kwargs) -> 'Judge':
        """
        Create Judge from URL.

        Args:
            base_url: vLLM server URL
            model: Model name (optional, can be auto-detected)
            **kwargs: Additional configuration

        Returns:
            Judge instance
        """
        config = JudgeConfig.from_url(base_url, model=model, **kwargs)
        return cls(config)

    async def __aenter__(self):
        """Async context manager entry."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        await self.close()

    async def close(self):
        """Close client connections."""
        await self.client.close()

    async def evaluate(
        self,
        response: Union[str, Dict[str, str]],
        criteria: str = None,
        rubric: Union[str, Dict[Union[int, float], str]] = None,
        scale: Optional[Tuple[int, int]] = None,
        examples: List[Dict[str, Any]] = None,
        metric: Union[Metric, str] = None,
        system_prompt: str = None,
        context: str = None,
        template_vars: Dict[str, Any] = None,
        template_engine: Union[str, TemplateEngine] = TemplateEngine.FORMAT,
        **kwargs
    ) -> EvaluationResult:
        """
        Universal evaluation method that adapts to use case.

        Args:
            response: String for single evaluation, dict {"a": ..., "b": ...} for comparison
            criteria: What to evaluate for (can contain template variables)
            rubric: Instructions for evaluation, can be string or dict containing mapping of score to description (can contain template variables)
            scale: Optional numeric scale (min, max)
            examples: Optional few-shot examples
            metric: Pre-defined Metric object or registered metric name
            system_prompt: Optional custom system message (can contain template variables)
            context: Optional context for the evaluation
            template_vars: Variables to substitute in templates
            template_engine: Template engine to use ('format' or 'jinja2'), default is 'format'
            **kwargs: Additional parameters

        Returns:
            EvaluationResult with decision, reasoning, and optional score

        Raises:
            InvalidInputError: If inputs are invalid or template vars missing
            MetricNotFoundError: If metric name not found
            ParseError: If unable to parse model response
        """
        # Handle metric parameter
        metric_template_vars = {}

        if metric:
            if isinstance(metric, str):
                metric = self.get_metric(metric)
            # Use metric defaults but allow overrides
            criteria = criteria or metric.criteria
            rubric = rubric or metric.rubric
            scale = scale or metric.scale
            examples = examples or metric.examples
            system_prompt = system_prompt or metric.system_prompt
            metric_template_vars = metric.template_vars
            if metric.template_engine:
                template_engine = metric.template_engine

        # Validate inputs
        if not criteria:
            raise InvalidInputError("Either 'criteria' or 'metric' must be provided")

        # Determine template engine
        engine = TemplateEngine(template_engine)

        # Merge template variables (metric defaults + user provided)
        all_template_vars = {**metric_template_vars, **(template_vars or {})}

        # Process templates
        criteria = TemplateProcessor.apply_template(
            criteria, all_template_vars, engine, strict=True
        )
        rubric = TemplateProcessor.apply_template(
            rubric, all_template_vars, engine, strict=True
        )
        system_prompt = TemplateProcessor.apply_template(
            system_prompt, all_template_vars, engine, strict=True
        )
        context = TemplateProcessor.apply_template(
            context, all_template_vars, engine, strict=True
        )

        # Build messages
        messages = PromptBuilder.build_messages(
            response=response,
            criteria=criteria,
            rubric=rubric,
            scale=scale,
            examples=examples,
            system_prompt=system_prompt,
            context=context,
            **kwargs
        )

        # Get LLM response
        try:
            if self.config.use_chat_api:
                llm_response = await self.client.chat_completion(messages)
            else:
                prompt = PromptBuilder.format_messages_as_text(messages)
                llm_response = await self.client.completion(prompt)
        except Exception as e:
            raise VLLMJudgeError(f"Failed to get model response: {e}")

        # Parse response
        result = self._parse_response(llm_response)

        # Add template info to metadata if used
        if all_template_vars:
            result.metadata["template_vars"] = all_template_vars
            result.metadata["template_engine"] = engine.value

        return result

    def _parse_response(self, response: str) -> EvaluationResult:
        """
        Parse LLM response into EvaluationResult.

        Args:
            response: Raw LLM response

        Returns:
            Parsed EvaluationResult

        Raises:
            ParseError: If unable to parse response
        """
        # Try to parse as JSON
        try:
            # First attempt: direct JSON parsing
            data = json.loads(response.strip())
        except json.JSONDecodeError:
            # Second attempt: extract JSON from markdown code blocks
            json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response, re.DOTALL)
            if json_match:
                try:
                    data = json.loads(json_match.group(1))
                except json.JSONDecodeError:
                    pass
            else:
                # Third attempt: find JSON-like structure
                json_match = re.search(r'({[^{}]*"decision"[^{}]*})', response, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                    except json.JSONDecodeError:
                        raise ParseError(
                            "Failed to parse JSON from response",
                            raw_response=response
                        )
                else:
                    raise ParseError(
                        "No JSON structure found in response",
                        raw_response=response
                    )

        # Validate required fields
        if "decision" not in data:
            raise ParseError(
                "Response missing required 'decision' field",
                raw_response=response
            )

        if "reasoning" not in data:
            # Try to extract reasoning from other fields
            data["reasoning"] = data.get("reason", data.get("explanation", "No reasoning provided"))

        # Create result
        return EvaluationResult(
            decision=data["decision"],
            reasoning=data["reasoning"],
            score=data.get("score"),
            metadata={
                "model": self.config.model,
                "raw_response": response,
                **data.get("metadata", {})
            }
        )

    # Convenience methods
    async def score(
        self,
        criteria: str,
        response: str,
        scale: Tuple[int, int] = (1, 10),
        **kwargs
    ) -> EvaluationResult:
        """
        Quick scoring evaluation.

        Args:
            criteria: What to evaluate
            response: Response to evaluate
            scale: Numeric scale (default 1-10)
            **kwargs: Additional parameters

        Returns:
            EvaluationResult with numeric score
        """
        return await self.evaluate(
            response=response,
            criteria=criteria,
            scale=scale,
            **kwargs
        )

    async def compare(
        self,
        response_a: str,
        response_b: str,
        criteria: str,
        **kwargs
    ) -> EvaluationResult:
        """
        Quick comparison evaluation.

        Args:
            response_a: First response
            response_b: Second response
            criteria: What to compare on
            **kwargs: Additional parameters

        Returns:
            EvaluationResult with decision of 'response_a' or 'response_b'
        """
        return await self.evaluate(
            response={"a": response_a, "b": response_b},
            criteria=criteria,
            **kwargs
        )

    async def classify(
        self,
        response: str,
        categories: List[str],
        criteria: str = None,
        **kwargs
    ) -> EvaluationResult:
        """
        Quick classification evaluation.

        Args:
            response: Response to classify
            categories: List of categories
            criteria: Classification criteria
            **kwargs: Additional parameters

        Returns:
            EvaluationResult with category decision
        """
        if not criteria:
            criteria = "appropriate category"

        rubric = f"Classify into one of these categories: {', '.join(categories)}"

        return await self.evaluate(
            response=response,
            criteria=criteria,
            rubric=rubric,
            **kwargs
        )

    # Metric management
    def register_metric(self, metric: Metric):
        """
        Register a metric for reuse.

        Args:
            metric: Metric to register
        """
        self.metrics[metric.name] = metric

    def get_metric(self, name: str) -> Metric:
        """
        Get registered metric by name.

        Args:
            name: Metric name

        Returns:
            Metric instance

        Raises:
            MetricNotFoundError: If metric not found
        """
        # Check user-registered metrics first
        if name in self.metrics:
            return self.metrics[name]

        # Check built-in metrics
        if name in BUILTIN_METRICS:
            return BUILTIN_METRICS[name]

        # List available metrics in error
        available = list(self.metrics.keys()) + list(BUILTIN_METRICS.keys())
        raise MetricNotFoundError(
            f"Metric '{name}' not found. Available metrics: {', '.join(available)}"
        )

    def list_metrics(self) -> List[str]:
        """List all available metric names."""
        return list(self.metrics.keys()) + list(BUILTIN_METRICS.keys())

    # Batch processing
    async def batch_evaluate(
        self,
        data: List[Dict[str, Any]],
        max_concurrent: int = None,
        progress_callback: Callable[[int, int], None] = None,
        **default_kwargs
    ) -> BatchResult:
        """
        Batch evaluation with high concurrency.

        Args:
            data: List of evaluation inputs (each must have 'response' key)
            max_concurrent: Maximum concurrent requests
            progress_callback: Optional callback for progress updates
            **default_kwargs: Default parameters for all evaluations

        Returns:
            BatchResult with all results

        Example:
            results = await judge.batch_evaluate([
                {"response": "Text 1", "criteria": "clarity"},
                {"response": {"a": "A", "b": "B"}, "criteria": "quality"},
                {"response": "Text 3", "metric": "safety"}
            ])
        """
        processor = BatchProcessor(self, max_concurrent or self.config.max_concurrent)
        return await processor.process(data, progress_callback, **default_kwargs)

    async def batch_score(
        self,
        responses: List[str],
        criteria: str,
        scale: Tuple[int, int] = (1, 10),
        **kwargs
    ) -> List[EvaluationResult]:
        """
        Convenience method for batch scoring.

        Args:
            responses: List of responses to score
            criteria: Scoring criteria
            scale: Numeric scale
            **kwargs: Additional parameters

        Returns:
            List of EvaluationResults
        """
        data = [
            {"response": resp, "criteria": criteria, "scale": scale, **kwargs}
            for resp in responses
        ]
        batch_result = await self.batch_evaluate(data)

        # Extract results, raising first error if any
        results = []
        for r in batch_result.results:
            if isinstance(r, Exception):
                raise r
            results.append(r)
        return results
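
For orientation, a minimal usage sketch based only on the signatures shown above. This is illustrative, not part of the wheel: the server URL is a placeholder, `from vllm_judge import Judge` assumes the package __init__.py re-exports Judge, and "safety" is taken from the metric name in the batch_evaluate docstring example.

# Usage sketch (illustrative only; not shipped in the package).
import asyncio

from vllm_judge import Judge  # assumed re-export; otherwise: from vllm_judge.judge import Judge


async def main():
    # Placeholder URL: point at a running vLLM server.
    async with Judge.from_url("http://localhost:8000") as judge:
        # Quick scoring; scale defaults to (1, 10) in Judge.score.
        scored = await judge.score(
            criteria="factual accuracy",
            response="The Eiffel Tower is in Paris.",
        )
        print(scored.score, scored.decision, scored.reasoning)

        # Pairwise comparison; evaluate() receives {"a": ..., "b": ...} internally.
        compared = await judge.compare(
            response_a="First draft ...",
            response_b="Second draft ...",
            criteria="clarity",
        )
        print(compared.decision)

        # Built-in metric by name; available names come from judge.list_metrics().
        checked = await judge.evaluate(response="Some user-facing text", metric="safety")
        print(checked.decision)

        # Batch scoring fans out through BatchProcessor with bounded concurrency.
        results = await judge.batch_score(
            responses=["Text 1", "Text 2", "Text 3"],
            criteria="conciseness",
        )
        print([r.score for r in results])


asyncio.run(main())

Note that _parse_response expects the judge model to return a JSON object with a "decision" field (optionally "reasoning" and "score"); responses that cannot be coerced into that shape raise ParseError.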