vllm-judge 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff compares the contents of two publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in the public registry.
vllm_judge/__init__.py CHANGED
@@ -5,7 +5,7 @@ A lightweight library for evaluating text responses using self-hosted language m
 via vLLM's OpenAI-compatible API.
 """
 
-__version__ = "0.1.3"
+__version__ = "0.1.5"
 
 from vllm_judge.judge import Judge
 from vllm_judge.models import (
@@ -24,10 +24,11 @@ from vllm_judge.metrics import (
     CLARITY,
     CONCISENESS,
     RELEVANCE,
-
+    COHERENCE,
     # Safety metrics
     SAFETY,
     TOXICITY,
+    BIAS_DETECTION,
     LLAMA_GUARD_3_SAFETY,
 
     # Code metrics
@@ -61,6 +62,12 @@ from vllm_judge.metrics import (
     PRODUCT_REVIEW_TEMPLATE,
     MEDICAL_INFO_TEMPLATE,
     API_DOCS_TEMPLATE,
+    RAG_EVALUATION_TEMPLATE,
+    AGENT_PERFORMANCE_TEMPLATE,
+
+    # NLP metrics
+    TRANSLATION_QUALITY,
+    SUMMARIZATION_QUALITY,
 
 )
 from vllm_judge.exceptions import (
@@ -91,8 +98,10 @@ __all__ = [
     "CLARITY",
     "CONCISENESS",
     "RELEVANCE",
+    "COHERENCE",
     "SAFETY",
     "TOXICITY",
+    "BIAS_DETECTION",
     "LLAMA_GUARD_3_SAFETY",
     "CODE_QUALITY",
     "CODE_SECURITY",
@@ -112,6 +121,11 @@ __all__ = [
     "PRODUCT_REVIEW_TEMPLATE",
     "MEDICAL_INFO_TEMPLATE",
     "API_DOCS_TEMPLATE",
+    "RAG_EVALUATION_TEMPLATE",
+    "AGENT_PERFORMANCE_TEMPLATE",
+    "TRANSLATION_QUALITY",
+    "SUMMARIZATION_QUALITY",
+
     # Exceptions
     "VLLMJudgeError",
     "ConfigurationError",
vllm_judge/api/client.py CHANGED
@@ -66,6 +66,7 @@ class JudgeClient:
     async def evaluate(
         self,
         content: Union[str, Dict[str, str]],
+        input: Optional[str] = None,
         criteria: str = None,
         rubric: Union[str, Dict[Union[int, float], str]] = None,
         scale: Optional[Tuple[int, int]] = None,
@@ -87,7 +88,8 @@ class JudgeClient:
             EvaluationResult
         """
         request = EvaluateRequest(
-            response=content,
+            content=content,
+            input=input,
             criteria=criteria,
             rubric=rubric,
             scale=list(scale) if scale else None,
@@ -277,37 +279,69 @@ class JudgeClient:
     async def score(
         self,
         criteria: str,
-        response: str,
+        content: str,
+        input: Optional[str] = None,
         scale: Tuple[int, int] = (1, 10),
         **kwargs
     ) -> EvaluationResult:
         """Quick scoring evaluation."""
         return await self.evaluate(
-            response=response,
+            content=content,
+            input=input,
+            criteria=criteria,
+            scale=scale,
+            **kwargs
+        )
+    async def qa_evaluate(
+        self,
+        question: str,
+        answer: str,
+        criteria: str = "accuracy and completeness",
+        scale: Tuple[int, int] = (1, 10),
+        **kwargs
+    ) -> EvaluationResult:
+        """
+        Convenience method for QA evaluation via API.
+
+        Args:
+            question: The question being answered
+            answer: The answer to evaluate
+            criteria: Evaluation criteria (default: "accuracy and completeness")
+            scale: Numeric scale (default 1-10)
+            **kwargs: Additional parameters
+
+        Returns:
+            EvaluationResult with QA assessment
+        """
+        return await self.evaluate(
+            content=answer,
+            input=question,
             criteria=criteria,
             scale=scale,
             **kwargs
         )
-
     async def compare(
         self,
         response_a: str,
         response_b: str,
         criteria: str,
+        input: Optional[str] = None,
         **kwargs
     ) -> EvaluationResult:
         """Quick comparison evaluation."""
         return await self.evaluate(
-            response={"a": response_a, "b": response_b},
+            content={"a": response_a, "b": response_b},
+            input=input,
             criteria=criteria,
             **kwargs
         )
 
     async def classify(
         self,
-        response: str,
+        content: str,
         categories: List[str],
         criteria: str = None,
+        input: Optional[str] = None,
         **kwargs
     ) -> EvaluationResult:
         """Quick classification evaluation."""
@@ -317,7 +351,8 @@ class JudgeClient:
             rubric = f"Classify into one of these categories: {', '.join(categories)}"
 
         return await self.evaluate(
-            response=response,
+            content=content,
+            input=input,
             criteria=criteria,
             rubric=rubric,
             **kwargs
@@ -325,7 +360,8 @@ class JudgeClient:
 
     async def evaluate_streaming(
         self,
-        response: Union[str, Dict[str, str]],
+        content: Union[str, Dict[str, str]],
+        input: Optional[str] = None,
         **kwargs
     ) -> AsyncIterator[str]:
         """
@@ -339,7 +375,8 @@ class JudgeClient:
         async with websockets.connect(ws_url) as websocket:
             # Send request
             request_data = {
-                "response": response,
+                "content": content,
+                "input": input,
                 **kwargs
            }
            await websocket.send(json.dumps(request_data))
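
A hedged usage sketch for the updated client (the module path mirrors the file path above and the URL is a placeholder; parameter names and result fields are taken from this diff):

    import asyncio
    from vllm_judge.api.client import JudgeClient  # module path matches the file shown above

    async def main():
        # "http://localhost:9090" is a placeholder Judge API URL.
        async with JudgeClient("http://localhost:9090") as client:
            # 0.1.5 renames `response` to `content` and adds the optional `input`
            # carrying the question/prompt the content responds to.
            result = await client.evaluate(
                content="Paris",
                input="What is the capital of France?",
                criteria="accuracy",
            )
            print(result.decision, result.score)

            # New convenience wrapper: routes question -> input, answer -> content.
            qa = await client.qa_evaluate(
                question="What is the capital of France?",
                answer="Paris",
            )
            print(qa.reasoning)

    asyncio.run(main())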
vllm_judge/api/models.py CHANGED
@@ -5,8 +5,15 @@ from datetime import datetime
 
 class EvaluateRequest(BaseModel):
     """Request model for single evaluation."""
-    response: Union[str, Dict[str, str]] = Field(
-        ..., description="Text to evaluate or dict with 'a' and 'b' for comparison"
+    content: Union[str, Dict[str, str]] = Field(
+        ...,
+        description="Content to evaluate (string or dict with 'a'/'b' for comparison)",
+        examples=["This is a response", {"a": "Response A", "b": "Response B"}]
+    )
+    input: Optional[str] = Field(
+        None,
+        description="Optional input/question/prompt that the content responds to",
+        examples=["What is the capital of France?", "Write a function to sort a list"]
     )
     criteria: Optional[str] = Field(
         None, description="What to evaluate for"
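
A short sketch of constructing the revised request model (the module path mirrors the file path above; field names come from this diff, and model_dump assumes the Pydantic v2 API already used elsewhere in this package):

    from vllm_judge.api.models import EvaluateRequest  # module path matches the file shown above

    # Single-response payload with the new optional `input` field.
    req = EvaluateRequest(
        content="Paris is the capital of France.",
        input="What is the capital of France?",
        criteria="accuracy",
    )
    print(req.model_dump(exclude_none=True))

    # `content` may also be an {"a": ..., "b": ...} dict for pairwise comparison.
    pair = EvaluateRequest(content={"a": "Response A", "b": "Response B"}, criteria="clarity")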
vllm_judge/api/server.py CHANGED
@@ -109,7 +109,8 @@ async def evaluate(request: EvaluateRequest):
 
     # Perform evaluation with template support
     result = await judge.evaluate(
-        response=request.response,
+        content=request.content,
+        input=request.input,
         criteria=request.criteria,
         rubric=request.rubric,
         scale=scale,
@@ -422,7 +423,8 @@ async def websocket_evaluate(websocket: WebSocket):
     scale = tuple(request.scale) if request.scale else None
 
     result = await judge.evaluate(
-        response=request.response,
+        content=request.content,
+        input=request.input,
         criteria=request.criteria,
         rubric=request.rubric,
         scale=scale,
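
For HTTP clients, only the request-body field names change. A hedged sketch of the payload (the "/evaluate" route path and address are assumptions not shown in this diff; the content, input, and criteria fields come from EvaluateRequest above):

    import httpx  # any HTTP client works; httpx is used here for brevity

    payload = {
        "content": "Paris is the capital of France.",
        "input": "What is the capital of France?",   # new optional field in 0.1.5
        "criteria": "accuracy",
    }
    # Route path and address are placeholders, not confirmed by this diff.
    resp = httpx.post("http://localhost:9090/evaluate", json=payload, timeout=60.0)
    print(resp.json())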
vllm_judge/batch.py CHANGED
@@ -83,12 +83,12 @@ class BatchProcessor:
         async with self.semaphore:
             try:
                 # Extract response from kwargs
-                response = eval_kwargs.pop('response', None)
-                if not response:
-                    raise ValueError(f"Item {index} missing 'response' field")
+                content = eval_kwargs.pop('content', None)
+                if not content:
+                    raise ValueError(f"Item {index} missing 'content' field")
 
                 # Perform evaluation
-                result = await self.judge.evaluate(response=response, **eval_kwargs)
+                result = await self.judge.evaluate(content=content, **eval_kwargs)
 
                 # Update progress
                 async with self.progress_lock:
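
Accordingly, each batch item must now carry a `content` key (previously `response`); remaining keys are passed through to Judge.evaluate. A small sketch of the item shape, assuming `judge` is an already-connected Judge instance:

    items = [
        {"content": "Text 1", "criteria": "clarity"},
        {"content": "Paris", "input": "What is the capital of France?", "criteria": "accuracy"},
        # An item without 'content' triggers the ValueError raised above.
    ]
    # results = await judge.batch_evaluate(items)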
vllm_judge/cli.py CHANGED
@@ -50,7 +50,8 @@ def serve(base_url: str, model: str, host: str, port: int, reload: bool, max_con
 @click.option('--api-url', help='Judge API URL (if using remote server)')
 @click.option('--base-url', help='vLLM server URL (if using local)')
 @click.option('--model', help='Model name (if using local)')
-@click.option('--response', required=True, help='Text to evaluate')
+@click.option('--content', required=True, help='Text to evaluate')
+@click.option('--input', help='Input/question/prompt that the content responds to')
 @click.option('--criteria', help='Evaluation criteria')
 @click.option('--metric', help='Pre-defined metric name')
 @click.option('--scale', nargs=2, type=int, help='Numeric scale (min max)')
@@ -61,7 +62,8 @@ def evaluate(
     api_url: Optional[str],
     base_url: Optional[str],
     model: Optional[str],
-    response: str,
+    content: str,
+    input: Optional[str],
     criteria: Optional[str],
     metric: Optional[str],
     scale: Optional[tuple],
@@ -75,7 +77,8 @@ def evaluate(
             # Use API client
             async with JudgeClient(api_url) as client:
                 result = await client.evaluate(
-                    content=response,
+                    content=content,
+                    input=input,
                     criteria=criteria,
                     metric=metric,
                     scale=scale,
@@ -91,7 +94,8 @@ def evaluate(
             judge = Judge.from_url(base_url, model=model)
             async with judge:
                 result = await judge.evaluate(
-                    content=response,
+                    content=content,
+                    input=input,
                     criteria=criteria,
                     metric=metric,
                     scale=scale,
@@ -110,6 +114,60 @@ def evaluate(
 
     asyncio.run(run_evaluation())
 
+@cli.command()
+@click.option('--api-url', help='Judge API URL (if using remote server)')
+@click.option('--base-url', help='vLLM server URL (if using local)')
+@click.option('--model', help='Model name (if using local)')
+@click.option('--question', required=True, help='Question to evaluate answer for')
+@click.option('--answer', required=True, help='Answer to evaluate')
+@click.option('--criteria', default='accuracy and completeness', help='Evaluation criteria')
+@click.option('--scale', nargs=2, type=int, default=[1, 10], help='Numeric scale (min max)')
+@click.option('--output', type=click.Choice(['json', 'text']), default='text', help='Output format')
+def qa_evaluate(
+    api_url: Optional[str],
+    base_url: Optional[str],
+    model: Optional[str],
+    question: str,
+    answer: str,
+    criteria: str,
+    scale: tuple,
+    output: str
+):
+    """Evaluate a QA pair (question and answer)."""
+    async def run_qa_evaluation():
+        if api_url:
+            async with JudgeClient(api_url) as client:
+                result = await client.qa_evaluate(
+                    question=question,
+                    answer=answer,
+                    criteria=criteria,
+                    scale=scale
+                )
+        else:
+            if not base_url:
+                click.echo("Error: Either --api-url or --base-url is required", err=True)
+                sys.exit(1)
+
+            judge = Judge.from_url(base_url, model=model)
+            async with judge:
+                result = await judge.qa_evaluate(
+                    question=question,
+                    answer=answer,
+                    criteria=criteria,
+                    scale=scale
+                )
+
+        if output == 'json':
+            click.echo(json.dumps(result.model_dump(), indent=2))
+        else:
+            click.echo(f"Question: {question}")
+            click.echo(f"Answer: {answer}")
+            click.echo(f"Decision: {result.decision}")
+            if result.score is not None:
+                click.echo(f"Score: {result.score}")
+            click.echo(f"Reasoning: {result.reasoning}")
+
+    asyncio.run(run_qa_evaluation())
 
 @cli.command()
 @click.option('--api-url', help='Judge API URL (if using remote server)')
@@ -118,6 +176,7 @@ def evaluate(
 @click.option('--response-a', required=True, help='First response')
 @click.option('--response-b', required=True, help='Second response')
 @click.option('--criteria', required=True, help='Comparison criteria')
+@click.option('--input', help='Input/question that both responses address')
 @click.option('--output', type=click.Choice(['json', 'text']), default='text', help='Output format')
 def compare(
     api_url: Optional[str],
@@ -126,6 +185,7 @@ def compare(
     response_a: str,
     response_b: str,
     criteria: str,
+    input: Optional[str],
     output: str
 ):
     """Compare two responses."""
@@ -135,7 +195,8 @@ def compare(
             result = await client.compare(
                 response_a=response_a,
                 response_b=response_b,
-                criteria=criteria
+                criteria=criteria,
+                input=input
             )
         else:
             if not base_url:
@@ -147,12 +208,17 @@ def compare(
             result = await judge.compare(
                 response_a=response_a,
                 response_b=response_b,
-                criteria=criteria
+                criteria=criteria,
+                input=input
             )
 
         if output == 'json':
            click.echo(json.dumps(result.model_dump(), indent=2))
        else:
+            if input:
+                click.echo(f"Input: {input}")
+            click.echo(f"Response A: {response_a}")
+            click.echo(f"Response B: {response_b}")
            click.echo(f"Winner: {result.decision}")
            click.echo(f"Reasoning: {result.reasoning}")
 
@@ -281,6 +347,16 @@ def batch(api_url: str, file, use_async: bool, max_concurrent: Optional[int], ou
 
 def main():
     """Main entry point."""
+    cli.help = """vLLM Judge - LLM-as-a-Judge evaluation tool.
+
+    Features:
+    - Single response evaluation with optional input context
+    - QA (Question-Answer) evaluation
+    - Response comparison with optional input context
+    - Batch evaluation from JSON files
+    - API server mode
+    - Built-in and custom metrics with template support
+    """
     cli()
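
For the command line, a hedged sketch of exercising the new QA command in-process with Click's test runner (the `cli` group, module path, and option names come from this file; the dashed command name "qa-evaluate" assumes Click's default underscore-to-dash conversion):

    from click.testing import CliRunner
    from vllm_judge.cli import cli  # the Click group defined in this module

    runner = CliRunner()
    result = runner.invoke(cli, [
        "qa-evaluate",                          # assumed dashed form of qa_evaluate
        "--base-url", "http://localhost:8000",  # placeholder vLLM server URL
        "--question", "What is the capital of France?",
        "--answer", "Paris",
        "--output", "json",
    ])
    print(result.output)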
 
vllm_judge/judge.py CHANGED
@@ -64,6 +64,7 @@ class Judge:
     async def evaluate(
         self,
         content: Union[str, Dict[str, str]],
+        input: Optional[str] = None,
         criteria: str = None,
         rubric: Union[str, Dict[Union[int, float], str]] = None,
         scale: Optional[Tuple[int, int]] = None,
@@ -80,6 +81,7 @@ class Judge:
 
         Args:
             content: String for single evaluation, dict {"a": ..., "b": ...} for comparison
+            input: Optional input/question/prompt that the content is responding to
             criteria: What to evaluate for (can contain template variables)
             rubric: Instructions for evaluation, can be string or dict containing mapping of score to description (can contain template variables)
             scale: Optional numeric scale (min, max)
@@ -140,6 +142,9 @@ class Judge:
 
         # Merge template variables (metric defaults + user provided)
         all_template_vars = {**metric_template_vars, **(template_vars or {})}
+        # Add input to template variables if provided
+        if input:
+            all_template_vars["input"] = input
 
         # Process templates
         criteria = TemplateProcessor.apply_template(
@@ -154,10 +159,14 @@ class Judge:
         context = TemplateProcessor.apply_template(
             context, all_template_vars, engine, strict=True
         )
+        input = TemplateProcessor.apply_template(
+            input, all_template_vars, engine, strict=True
+        )
 
         # Build messages
         messages = PromptBuilder.build_messages(
-            response=content,
+            content=content,
+            input=input,
             criteria=criteria,
             rubric=rubric,
             scale=scale,
@@ -264,7 +273,8 @@ class Judge:
     async def score(
         self,
         criteria: str,
-        response: str,
+        content: str,
+        input: Optional[str] = None,
         scale: Tuple[int, int] = (1, 10),
         **kwargs
     ) -> EvaluationResult:
@@ -273,7 +283,8 @@ class Judge:
 
         Args:
             criteria: What to evaluate
-            response: Response to evaluate
+            content: Response to evaluate
+            input: Optional input/question/prompt that the response addresses
             scale: Numeric scale (default 1-10)
             **kwargs: Additional parameters
 
@@ -281,7 +292,36 @@ class Judge:
             EvaluationResult with numeric score
         """
         return await self.evaluate(
-            response=response,
+            content=content,
+            input=input,
+            criteria=criteria,
+            scale=scale,
+            **kwargs
+        )
+    async def qa_evaluate(
+        self,
+        question: str,
+        answer: str,
+        criteria: str = "accuracy and completeness",
+        scale: Tuple[int, int] = (1, 10),
+        **kwargs
+    ) -> EvaluationResult:
+        """
+        Convenience method for QA evaluation.
+
+        Args:
+            question: The question being answered
+            answer: The answer to evaluate
+            criteria: Evaluation criteria (default: "accuracy and completeness")
+            scale: Numeric scale (default 1-10)
+            **kwargs: Additional parameters
+
+        Returns:
+            EvaluationResult with QA assessment
+        """
+        return await self.evaluate(
+            content=answer,
+            input=question,
             criteria=criteria,
             scale=scale,
             **kwargs
@@ -292,6 +332,7 @@ class Judge:
         response_a: str,
         response_b: str,
         criteria: str,
+        input: Optional[str] = None,
         **kwargs
     ) -> EvaluationResult:
         """
@@ -301,31 +342,35 @@ class Judge:
             response_a: First response
             response_b: Second response
             criteria: What to compare on
+            input: Optional input/question that both responses address
             **kwargs: Additional parameters
 
         Returns:
             EvaluationResult with decision of 'response_a' or 'response_b'
         """
         return await self.evaluate(
-            response={"a": response_a, "b": response_b},
+            content={"a": response_a, "b": response_b},
+            input=input,
             criteria=criteria,
             **kwargs
         )
 
     async def classify(
         self,
-        response: str,
+        content: str,
         categories: List[str],
         criteria: str = None,
+        input: Optional[str] = None,
         **kwargs
     ) -> EvaluationResult:
         """
         Quick classification evaluation.
 
         Args:
-            response: Response to classify
+            content: Content to classify
             categories: List of categories
             criteria: Classification criteria
+            input: Optional input/question that the response addresses
             **kwargs: Additional parameters
 
         Returns:
@@ -337,7 +382,8 @@ class Judge:
             rubric = f"Classify into one of these categories: {', '.join(categories)}"
 
         return await self.evaluate(
-            response=response,
+            content=content,
+            input=input,
             criteria=criteria,
             rubric=rubric,
             **kwargs
@@ -396,7 +442,7 @@ class Judge:
         Batch evaluation with high concurrency.
 
         Args:
-            data: List of evaluation inputs (each must have 'response' key)
+            data: List of evaluation inputs (each must have 'content' key)
             max_concurrent: Maximum concurrent requests
             progress_callback: Optional callback for progress updates
             **default_kwargs: Default parameters for all evaluations
@@ -406,9 +452,10 @@ class Judge:
 
         Example:
             results = await judge.batch_evaluate([
-                {"response": "Text 1", "criteria": "clarity"},
-                {"response": {"a": "A", "b": "B"}, "criteria": "quality"},
-                {"response": "Text 3", "metric": "safety"}
+                {"content": "Text 1", "criteria": "clarity"},
+                {"content": "Paris", "input": "What is the capital of France?", "criteria": "accuracy"},
+                {"content": {"a": "A", "b": "B"}, "criteria": "quality"},
+                {"content": "Text 3", "metric": "safety"}
            ])
        """
        processor = BatchProcessor(self, max_concurrent or self.config.max_concurrent)
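
A hedged end-to-end sketch of the 0.1.5 Judge surface (the server URL and model name are placeholders; method signatures and result fields are taken from this diff):

    import asyncio
    from vllm_judge import Judge  # exported at the package root, per __init__.py above

    async def main():
        # Placeholder vLLM server URL and model name.
        judge = Judge.from_url("http://localhost:8000", model="your-model")
        async with judge:
            # New in 0.1.5: question/answer are routed to input/content internally.
            qa = await judge.qa_evaluate(
                question="What is the capital of France?",
                answer="Paris",
            )
            print(qa.score, qa.reasoning)

            # compare() now accepts the shared input both responses address.
            duel = await judge.compare(
                response_a="Paris",
                response_b="Lyon",
                criteria="factual accuracy",
                input="What is the capital of France?",
            )
            print(duel.decision)

    asyncio.run(main())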