vllm-judge 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only.
- vllm_judge/__init__.py +16 -2
- vllm_judge/api/client.py +46 -9
- vllm_judge/api/models.py +9 -2
- vllm_judge/api/server.py +4 -2
- vllm_judge/batch.py +4 -4
- vllm_judge/cli.py +82 -6
- vllm_judge/judge.py +59 -12
- vllm_judge/metrics.py +744 -262
- vllm_judge/models.py +3 -2
- vllm_judge/prompts.py +35 -15
- {vllm_judge-0.1.3.dist-info → vllm_judge-0.1.5.dist-info}/METADATA +1 -1
- vllm_judge-0.1.5.dist-info/RECORD +20 -0
- vllm_judge-0.1.3.dist-info/RECORD +0 -20
- {vllm_judge-0.1.3.dist-info → vllm_judge-0.1.5.dist-info}/WHEEL +0 -0
- {vllm_judge-0.1.3.dist-info → vllm_judge-0.1.5.dist-info}/entry_points.txt +0 -0
- {vllm_judge-0.1.3.dist-info → vllm_judge-0.1.5.dist-info}/top_level.txt +0 -0
vllm_judge/__init__.py  CHANGED

@@ -5,7 +5,7 @@ A lightweight library for evaluating text responses using self-hosted language models
 via vLLM's OpenAI-compatible API.
 """
 
-__version__ = "0.1.3"
+__version__ = "0.1.5"
 
 from vllm_judge.judge import Judge
 from vllm_judge.models import (

@@ -24,10 +24,11 @@ from vllm_judge.metrics import (
     CLARITY,
     CONCISENESS,
     RELEVANCE,
-
+    COHERENCE,
     # Safety metrics
     SAFETY,
     TOXICITY,
+    BIAS_DETECTION,
     LLAMA_GUARD_3_SAFETY,
 
     # Code metrics

@@ -61,6 +62,12 @@ from vllm_judge.metrics import (
     PRODUCT_REVIEW_TEMPLATE,
     MEDICAL_INFO_TEMPLATE,
     API_DOCS_TEMPLATE,
+    RAG_EVALUATION_TEMPLATE,
+    AGENT_PERFORMANCE_TEMPLATE,
+
+    # NLP metrics
+    TRANSLATION_QUALITY,
+    SUMMARIZATION_QUALITY,
 
 )
 from vllm_judge.exceptions import (

@@ -91,8 +98,10 @@ __all__ = [
     "CLARITY",
     "CONCISENESS",
     "RELEVANCE",
+    "COHERENCE",
     "SAFETY",
     "TOXICITY",
+    "BIAS_DETECTION",
     "LLAMA_GUARD_3_SAFETY",
     "CODE_QUALITY",
     "CODE_SECURITY",

@@ -112,6 +121,11 @@ __all__ = [
     "PRODUCT_REVIEW_TEMPLATE",
     "MEDICAL_INFO_TEMPLATE",
     "API_DOCS_TEMPLATE",
+    "RAG_EVALUATION_TEMPLATE",
+    "AGENT_PERFORMANCE_TEMPLATE",
+    "TRANSLATION_QUALITY",
+    "SUMMARIZATION_QUALITY",
+
     # Exceptions
     "VLLMJudgeError",
     "ConfigurationError",
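The new exports above are importable from the package root. A minimal sketch, assuming the 0.1.5 wheel is installed:

    # Sketch only: the metrics and templates newly exported in 0.1.5.
    import vllm_judge
    from vllm_judge import (
        COHERENCE,                   # new quality metric
        BIAS_DETECTION,              # new safety metric
        RAG_EVALUATION_TEMPLATE,     # new evaluation templates
        AGENT_PERFORMANCE_TEMPLATE,
        TRANSLATION_QUALITY,         # new NLP metrics
        SUMMARIZATION_QUALITY,
    )

    print(vllm_judge.__version__)  # -> "0.1.5"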
vllm_judge/api/client.py  CHANGED

@@ -66,6 +66,7 @@ class JudgeClient:
     async def evaluate(
         self,
         content: Union[str, Dict[str, str]],
+        input: Optional[str] = None,
         criteria: str = None,
         rubric: Union[str, Dict[Union[int, float], str]] = None,
         scale: Optional[Tuple[int, int]] = None,

@@ -87,7 +88,8 @@ class JudgeClient:
             EvaluationResult
         """
         request = EvaluateRequest(
-
+            content=content,
+            input=input,
             criteria=criteria,
             rubric=rubric,
             scale=list(scale) if scale else None,

@@ -277,37 +279,69 @@ class JudgeClient:
     async def score(
         self,
         criteria: str,
-
+        content: str,
+        input: Optional[str] = None,
         scale: Tuple[int, int] = (1, 10),
         **kwargs
     ) -> EvaluationResult:
         """Quick scoring evaluation."""
         return await self.evaluate(
-
+            content=content,
+            input=input,
+            criteria=criteria,
+            scale=scale,
+            **kwargs
+        )
+
+    async def qa_evaluate(
+        self,
+        question: str,
+        answer: str,
+        criteria: str = "accuracy and completeness",
+        scale: Tuple[int, int] = (1, 10),
+        **kwargs
+    ) -> EvaluationResult:
+        """
+        Convenience method for QA evaluation via API.
+
+        Args:
+            question: The question being answered
+            answer: The answer to evaluate
+            criteria: Evaluation criteria (default: "accuracy and completeness")
+            scale: Numeric scale (default 1-10)
+            **kwargs: Additional parameters
+
+        Returns:
+            EvaluationResult with QA assessment
+        """
+        return await self.evaluate(
+            content=answer,
+            input=question,
             criteria=criteria,
             scale=scale,
             **kwargs
         )
-
     async def compare(
         self,
         response_a: str,
         response_b: str,
         criteria: str,
+        input: Optional[str] = None,
         **kwargs
     ) -> EvaluationResult:
         """Quick comparison evaluation."""
         return await self.evaluate(
-
+            content={"a": response_a, "b": response_b},
+            input=input,
             criteria=criteria,
             **kwargs
         )
 
     async def classify(
         self,
-
+        content: str,
         categories: List[str],
         criteria: str = None,
+        input: Optional[str] = None,
         **kwargs
     ) -> EvaluationResult:
         """Quick classification evaluation."""

@@ -317,7 +351,8 @@ class JudgeClient:
         rubric = f"Classify into one of these categories: {', '.join(categories)}"
 
         return await self.evaluate(
-
+            content=content,
+            input=input,
             criteria=criteria,
             rubric=rubric,
             **kwargs

@@ -325,7 +360,8 @@ class JudgeClient:
 
     async def evaluate_streaming(
         self,
-
+        content: Union[str, Dict[str, str]],
+        input: Optional[str] = None,
         **kwargs
     ) -> AsyncIterator[str]:
         """

@@ -339,7 +375,8 @@ class JudgeClient:
         async with websockets.connect(ws_url) as websocket:
             # Send request
             request_data = {
-                "
+                "content": content,
+                "input": input,
                 **kwargs
             }
             await websocket.send(json.dumps(request_data))
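For orientation, a hedged sketch of the updated client surface; the server URL below is a placeholder, not something this diff specifies:

    import asyncio
    from vllm_judge.api.client import JudgeClient

    async def main():
        # Placeholder URL for a running vllm-judge API server.
        async with JudgeClient("http://localhost:9090") as client:
            # evaluate() now accepts an optional `input` alongside `content`.
            result = await client.evaluate(
                content="Paris",
                input="What is the capital of France?",
                criteria="accuracy",
            )
            print(result.decision, result.score)

            # qa_evaluate() is a thin wrapper: answer -> content, question -> input.
            qa = await client.qa_evaluate(
                question="What is the capital of France?",
                answer="Paris",
            )
            print(qa.reasoning)

    asyncio.run(main())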
vllm_judge/api/models.py  CHANGED

@@ -5,8 +5,15 @@ from datetime import datetime
 
 class EvaluateRequest(BaseModel):
     """Request model for single evaluation."""
-
-        ...,
+    content: Union[str, Dict[str, str]] = Field(
+        ...,
+        description="Content to evaluate (string or dict with 'a'/'b' for comparison)",
+        examples=["This is a response", {"a": "Response A", "b": "Response B"}]
+    )
+    input: Optional[str] = Field(
+        None,
+        description="Optional input/question/prompt that the content responds to",
+        examples=["What is the capital of France?", "Write a function to sort a list"]
     )
     criteria: Optional[str] = Field(
         None, description="What to evaluate for"
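A small sketch of building the request model with the new field; it assumes the remaining fields are optional and that Pydantic v2 is in use, consistent with the model_dump() calls elsewhere in the package:

    from vllm_judge.api.models import EvaluateRequest

    # Single-response payload with the new optional `input` field.
    req = EvaluateRequest(
        content="Paris",
        input="What is the capital of France?",
        criteria="accuracy",
    )

    # Comparison payload: a dict with 'a'/'b' keys, as the field description states.
    cmp_req = EvaluateRequest(content={"a": "Response A", "b": "Response B"}, criteria="helpfulness")

    print(req.model_dump(exclude_none=True))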
vllm_judge/api/server.py  CHANGED

@@ -109,7 +109,8 @@ async def evaluate(request: EvaluateRequest):
 
     # Perform evaluation with template support
    result = await judge.evaluate(
-
+        content=request.content,
+        input=request.input,
        criteria=request.criteria,
        rubric=request.rubric,
        scale=scale,

@@ -422,7 +423,8 @@ async def websocket_evaluate(websocket: WebSocket):
            scale = tuple(request.scale) if request.scale else None
 
            result = await judge.evaluate(
-
+                content=request.content,
+                input=request.input,
                criteria=request.criteria,
                rubric=request.rubric,
                scale=scale,
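The handler now simply forwards request.content and request.input into judge.evaluate. A hedged sketch of posting to it over HTTP; the /evaluate route path and the port are assumptions not confirmed by this diff:

    import httpx

    payload = {
        "content": "Paris",
        "input": "What is the capital of France?",  # new optional field in 0.1.5
        "criteria": "accuracy",
    }
    # Route path and port are assumptions; check how the FastAPI app mounts evaluate().
    resp = httpx.post("http://localhost:9090/evaluate", json=payload, timeout=60.0)
    print(resp.status_code, resp.json())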
vllm_judge/batch.py  CHANGED

@@ -83,12 +83,12 @@ class BatchProcessor:
         async with self.semaphore:
             try:
                 # Extract response from kwargs
-
-                if not
-                    raise ValueError(f"Item {index} missing '
+                content = eval_kwargs.pop('content', None)
+                if not content:
+                    raise ValueError(f"Item {index} missing 'content' field")
 
                 # Perform evaluation
-                result = await self.judge.evaluate(
+                result = await self.judge.evaluate(content=content, **eval_kwargs)
 
                 # Update progress
                 async with self.progress_lock:
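Each batch item must therefore carry a 'content' key; items without it raise the ValueError shown above, and the remaining keys are passed through to Judge.evaluate. A minimal sketch, with a placeholder base URL and model name:

    import asyncio
    from vllm_judge import Judge

    async def main():
        # Placeholder base URL and model name; point these at your vLLM server.
        judge = Judge.from_url("http://localhost:8000", model="my-model")
        async with judge:
            results = await judge.batch_evaluate([
                {"content": "Paris", "input": "What is the capital of France?", "criteria": "accuracy"},
                {"content": "The sky is green.", "criteria": "factual correctness"},
            ])
            print(results)

    asyncio.run(main())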
vllm_judge/cli.py  CHANGED

@@ -50,7 +50,8 @@ def serve(base_url: str, model: str, host: str, port: int, reload: bool, max_con
 @click.option('--api-url', help='Judge API URL (if using remote server)')
 @click.option('--base-url', help='vLLM server URL (if using local)')
 @click.option('--model', help='Model name (if using local)')
-@click.option('--
+@click.option('--content', required=True, help='Text to evaluate')
+@click.option('--input', help='Input/question/prompt that the content responds to')
 @click.option('--criteria', help='Evaluation criteria')
 @click.option('--metric', help='Pre-defined metric name')
 @click.option('--scale', nargs=2, type=int, help='Numeric scale (min max)')

@@ -61,7 +62,8 @@ def evaluate(
     api_url: Optional[str],
     base_url: Optional[str],
     model: Optional[str],
-
+    content: str,
+    input: Optional[str],
     criteria: Optional[str],
     metric: Optional[str],
     scale: Optional[tuple],

@@ -75,7 +77,8 @@ def evaluate(
             # Use API client
             async with JudgeClient(api_url) as client:
                 result = await client.evaluate(
-                    content=
+                    content=content,
+                    input=input,
                     criteria=criteria,
                     metric=metric,
                     scale=scale,

@@ -91,7 +94,8 @@ def evaluate(
             judge = Judge.from_url(base_url, model=model)
             async with judge:
                 result = await judge.evaluate(
-                    content=
+                    content=content,
+                    input=input,
                     criteria=criteria,
                     metric=metric,
                     scale=scale,

@@ -110,6 +114,60 @@ def evaluate(
 
     asyncio.run(run_evaluation())
 
+@cli.command()
+@click.option('--api-url', help='Judge API URL (if using remote server)')
+@click.option('--base-url', help='vLLM server URL (if using local)')
+@click.option('--model', help='Model name (if using local)')
+@click.option('--question', required=True, help='Question to evaluate answer for')
+@click.option('--answer', required=True, help='Answer to evaluate')
+@click.option('--criteria', default='accuracy and completeness', help='Evaluation criteria')
+@click.option('--scale', nargs=2, type=int, default=[1, 10], help='Numeric scale (min max)')
+@click.option('--output', type=click.Choice(['json', 'text']), default='text', help='Output format')
+def qa_evaluate(
+    api_url: Optional[str],
+    base_url: Optional[str],
+    model: Optional[str],
+    question: str,
+    answer: str,
+    criteria: str,
+    scale: tuple,
+    output: str
+):
+    """Evaluate a QA pair (question and answer)."""
+    async def run_qa_evaluation():
+        if api_url:
+            async with JudgeClient(api_url) as client:
+                result = await client.qa_evaluate(
+                    question=question,
+                    answer=answer,
+                    criteria=criteria,
+                    scale=scale
+                )
+        else:
+            if not base_url:
+                click.echo("Error: Either --api-url or --base-url is required", err=True)
+                sys.exit(1)
+
+            judge = Judge.from_url(base_url, model=model)
+            async with judge:
+                result = await judge.qa_evaluate(
+                    question=question,
+                    answer=answer,
+                    criteria=criteria,
+                    scale=scale
+                )
+
+        if output == 'json':
+            click.echo(json.dumps(result.model_dump(), indent=2))
+        else:
+            click.echo(f"Question: {question}")
+            click.echo(f"Answer: {answer}")
+            click.echo(f"Decision: {result.decision}")
+            if result.score is not None:
+                click.echo(f"Score: {result.score}")
+            click.echo(f"Reasoning: {result.reasoning}")
+
+    asyncio.run(run_qa_evaluation())
 
 @cli.command()
 @click.option('--api-url', help='Judge API URL (if using remote server)')

@@ -118,6 +176,7 @@ def evaluate(
 @click.option('--response-a', required=True, help='First response')
 @click.option('--response-b', required=True, help='Second response')
 @click.option('--criteria', required=True, help='Comparison criteria')
+@click.option('--input', help='Input/question that both responses address')
 @click.option('--output', type=click.Choice(['json', 'text']), default='text', help='Output format')
 def compare(
     api_url: Optional[str],

@@ -126,6 +185,7 @@ def compare(
     response_a: str,
     response_b: str,
     criteria: str,
+    input: Optional[str],
     output: str
 ):
     """Compare two responses."""

@@ -135,7 +195,8 @@ def compare(
                 result = await client.compare(
                     response_a=response_a,
                     response_b=response_b,
-                    criteria=criteria
+                    criteria=criteria,
+                    input=input
                 )
         else:
             if not base_url:

@@ -147,12 +208,17 @@ def compare(
                 result = await judge.compare(
                     response_a=response_a,
                     response_b=response_b,
-                    criteria=criteria
+                    criteria=criteria,
+                    input=input
                 )
 
         if output == 'json':
             click.echo(json.dumps(result.model_dump(), indent=2))
         else:
+            if input:
+                click.echo(f"Input: {input}")
+            click.echo(f"Response A: {response_a}")
+            click.echo(f"Response B: {response_b}")
             click.echo(f"Winner: {result.decision}")
             click.echo(f"Reasoning: {result.reasoning}")
 

@@ -281,6 +347,16 @@ def batch(api_url: str, file, use_async: bool, max_concurrent: Optional[int], ou
 
 def main():
     """Main entry point."""
+    cli.help = """vLLM Judge - LLM-as-a-Judge evaluation tool.
+
+    Features:
+    - Single response evaluation with optional input context
+    - QA (Question-Answer) evaluation
+    - Response comparison with optional input context
+    - Batch evaluation from JSON files
+    - API server mode
+    - Built-in and custom metrics with template support
+    """
     cli()
 
 
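A sketch of exercising the new --input option of the compare command in-process with click's test runner; the import path assumes the group object is the cli referenced by the decorators above, and the base URL is a placeholder:

    from click.testing import CliRunner
    from vllm_judge.cli import cli

    runner = CliRunner()
    result = runner.invoke(cli, [
        "compare",
        "--base-url", "http://localhost:8000",        # placeholder vLLM server
        "--response-a", "Paris",
        "--response-b", "Lyon",
        "--criteria", "accuracy",
        "--input", "What is the capital of France?",  # new in 0.1.5
    ])
    print(result.exit_code)
    print(result.output)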
vllm_judge/judge.py  CHANGED

@@ -64,6 +64,7 @@ class Judge:
     async def evaluate(
         self,
         content: Union[str, Dict[str, str]],
+        input: Optional[str] = None,
         criteria: str = None,
         rubric: Union[str, Dict[Union[int, float], str]] = None,
         scale: Optional[Tuple[int, int]] = None,

@@ -80,6 +81,7 @@ class Judge:
 
         Args:
             content: String for single evaluation, dict {"a": ..., "b": ...} for comparison
+            input: Optional input/question/prompt that the content is responding to
             criteria: What to evaluate for (can contain template variables)
             rubric: Instructions for evaluation, can be string or dict containing mapping of score to description (can contain template variables)
             scale: Optional numeric scale (min, max)

@@ -140,6 +142,9 @@ class Judge:
 
         # Merge template variables (metric defaults + user provided)
         all_template_vars = {**metric_template_vars, **(template_vars or {})}
+        # Add input to template variables if provided
+        if input:
+            all_template_vars["input"] = input
 
         # Process templates
         criteria = TemplateProcessor.apply_template(

@@ -154,10 +159,14 @@ class Judge:
         context = TemplateProcessor.apply_template(
             context, all_template_vars, engine, strict=True
         )
+        input = TemplateProcessor.apply_template(
+            input, all_template_vars, engine, strict=True
+        )
 
         # Build messages
         messages = PromptBuilder.build_messages(
-
+            content=content,
+            input=input,
             criteria=criteria,
             rubric=rubric,
             scale=scale,

@@ -264,7 +273,8 @@ class Judge:
     async def score(
         self,
         criteria: str,
-
+        content: str,
+        input: Optional[str] = None,
         scale: Tuple[int, int] = (1, 10),
         **kwargs
     ) -> EvaluationResult:

@@ -273,7 +283,8 @@ class Judge:
 
         Args:
             criteria: What to evaluate
-
+            content: Response to evaluate
+            input: Optional input/question/prompt that the response addresses
             scale: Numeric scale (default 1-10)
             **kwargs: Additional parameters
 

@@ -281,7 +292,36 @@ class Judge:
             EvaluationResult with numeric score
         """
         return await self.evaluate(
-
+            content=content,
+            input=input,
+            criteria=criteria,
+            scale=scale,
+            **kwargs
+        )
+
+    async def qa_evaluate(
+        self,
+        question: str,
+        answer: str,
+        criteria: str = "accuracy and completeness",
+        scale: Tuple[int, int] = (1, 10),
+        **kwargs
+    ) -> EvaluationResult:
+        """
+        Convenience method for QA evaluation.
+
+        Args:
+            question: The question being answered
+            answer: The answer to evaluate
+            criteria: Evaluation criteria (default: "accuracy and completeness")
+            scale: Numeric scale (default 1-10)
+            **kwargs: Additional parameters
+
+        Returns:
+            EvaluationResult with QA assessment
+        """
+        return await self.evaluate(
+            content=answer,
+            input=question,
             criteria=criteria,
             scale=scale,
             **kwargs

@@ -292,6 +332,7 @@ class Judge:
         response_a: str,
         response_b: str,
         criteria: str,
+        input: Optional[str] = None,
         **kwargs
     ) -> EvaluationResult:
         """

@@ -301,31 +342,35 @@ class Judge:
             response_a: First response
             response_b: Second response
             criteria: What to compare on
+            input: Optional input/question that both responses address
             **kwargs: Additional parameters
 
         Returns:
             EvaluationResult with decision of 'response_a' or 'response_b'
         """
         return await self.evaluate(
-
+            content={"a": response_a, "b": response_b},
+            input=input,
             criteria=criteria,
             **kwargs
         )
 
     async def classify(
         self,
-
+        content: str,
         categories: List[str],
         criteria: str = None,
+        input: Optional[str] = None,
         **kwargs
     ) -> EvaluationResult:
         """
         Quick classification evaluation.
 
         Args:
-
+            content: Content to classify
             categories: List of categories
             criteria: Classification criteria
+            input: Optional input/question that the response addresses
             **kwargs: Additional parameters
 
         Returns:

@@ -337,7 +382,8 @@ class Judge:
         rubric = f"Classify into one of these categories: {', '.join(categories)}"
 
         return await self.evaluate(
-
+            content=content,
+            input=input,
             criteria=criteria,
             rubric=rubric,
             **kwargs

@@ -396,7 +442,7 @@ class Judge:
         Batch evaluation with high concurrency.
 
         Args:
-            data: List of evaluation inputs (each must have '
+            data: List of evaluation inputs (each must have 'content' key)
             max_concurrent: Maximum concurrent requests
             progress_callback: Optional callback for progress updates
             **default_kwargs: Default parameters for all evaluations

@@ -406,9 +452,10 @@ class Judge:
 
         Example:
             results = await judge.batch_evaluate([
-                {"
-                {"
-                {"
+                {"content": "Text 1", "criteria": "clarity"},
+                {"content": "Paris", "input": "What is the capital of France?", "criteria": "accuracy"},
+                {"content": {"a": "A", "b": "B"}, "criteria": "quality"},
+                {"content": "Text 3", "metric": "safety"}
             ])
         """
         processor = BatchProcessor(self, max_concurrent or self.config.max_concurrent)
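Taken together, a hedged end-to-end sketch of the updated Judge methods; the endpoint and model name are placeholders:

    import asyncio
    from vllm_judge import Judge

    async def main():
        # Placeholder endpoint and model; adjust to your vLLM deployment.
        judge = Judge.from_url("http://localhost:8000", model="my-model")
        async with judge:
            # evaluate() threads `input` into template variables and prompt building.
            graded = await judge.evaluate(
                content="Paris",
                input="What is the capital of France?",
                criteria="accuracy",
                scale=(1, 10),
            )
            # qa_evaluate() maps answer -> content and question -> input.
            qa = await judge.qa_evaluate(
                question="What is the capital of France?",
                answer="Paris",
            )
            # compare() now accepts the shared input as well.
            duel = await judge.compare(
                response_a="Paris",
                response_b="Lyon",
                criteria="accuracy",
                input="What is the capital of France?",
            )
            for r in (graded, qa, duel):
                print(r.decision, r.score, r.reasoning)

    asyncio.run(main())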