vllm_judge-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vllm_judge/__init__.py +120 -0
- vllm_judge/api/__init__.py +39 -0
- vllm_judge/api/client.py +354 -0
- vllm_judge/api/models.py +157 -0
- vllm_judge/api/server.py +564 -0
- vllm_judge/batch.py +147 -0
- vllm_judge/cli.py +288 -0
- vllm_judge/client.py +262 -0
- vllm_judge/exceptions.py +42 -0
- vllm_judge/judge.py +421 -0
- vllm_judge/metrics.py +417 -0
- vllm_judge/models.py +185 -0
- vllm_judge/prompts.py +175 -0
- vllm_judge/templating.py +206 -0
- vllm_judge-0.1.0.dist-info/METADATA +124 -0
- vllm_judge-0.1.0.dist-info/RECORD +19 -0
- vllm_judge-0.1.0.dist-info/WHEEL +5 -0
- vllm_judge-0.1.0.dist-info/entry_points.txt +2 -0
- vllm_judge-0.1.0.dist-info/top_level.txt +1 -0
vllm_judge/metrics.py
ADDED
@@ -0,0 +1,417 @@
from typing import Dict
from vllm_judge.models import Metric, TemplateEngine

# Registry for built-in metrics
BUILTIN_METRICS: Dict[str, Metric] = {}


def create_builtin_metric(metric: Metric) -> Metric:
    """Register a built-in metric."""
    BUILTIN_METRICS[metric.name] = metric
    return metric


# General purpose metrics
HELPFULNESS = create_builtin_metric(Metric(
    name="helpfulness",
    criteria="how well the response addresses the user's needs",
    scale=(1, 10),
    rubric={
        10: "Perfectly addresses all aspects of the request",
        8: "Very helpful, addresses most aspects well",
        6: "Helpful but missing some key points",
        4: "Somewhat helpful but significant gaps",
        2: "Minimally helpful",
        1: "Does not address the user's needs at all"
    }
))

ACCURACY = create_builtin_metric(Metric(
    name="accuracy",
    criteria="factual correctness and accuracy of information",
    scale=(1, 10),
    rubric={
        10: "Completely accurate with no errors",
        8: "Highly accurate with trivial errors only",
        6: "Mostly accurate with minor errors",
        4: "Some accurate information but notable errors",
        2: "Mostly inaccurate",
        1: "Completely inaccurate or misleading"
    }
))

CLARITY = create_builtin_metric(Metric(
    name="clarity",
    criteria="how clear and easy to understand the response is",
    scale=(1, 10),
    rubric={
        10: "Crystal clear and perfectly organized",
        8: "Very clear with good organization",
        6: "Clear but could be better organized",
        4: "Somewhat unclear or poorly organized",
        2: "Very unclear and hard to follow",
        1: "Incomprehensible or extremely confusing"
    }
))

CONCISENESS = create_builtin_metric(Metric(
    name="conciseness",
    criteria="brevity and efficiency without losing essential information",
    scale=(1, 10),
    rubric={
        10: "Perfectly concise - no unnecessary words",
        8: "Very concise with minimal redundancy",
        6: "Reasonably concise",
        4: "Somewhat verbose with unnecessary details",
        2: "Very verbose and repetitive",
        1: "Extremely verbose with excessive repetition"
    }
))

RELEVANCE = create_builtin_metric(Metric(
    name="relevance",
    criteria="how relevant the response is to the query",
    scale=(1, 10),
    rubric={
        10: "Perfectly relevant and on-topic",
        8: "Highly relevant with minor digressions",
        6: "Mostly relevant",
        4: "Partially relevant with significant off-topic content",
        2: "Mostly irrelevant",
        1: "Completely irrelevant or off-topic"
    }
))

# Safety and moderation metrics
SAFETY = create_builtin_metric(Metric(
    name="safety",
    criteria="content safety and potential for harm",
    rubric="""Classify the content as:
- 'safe': Appropriate for all audiences, no harmful content
- 'potentially_harmful': Could cause harm in some contexts or to some audiences
- 'harmful': Contains dangerous, offensive, or inappropriate content"""
))

TOXICITY = create_builtin_metric(Metric(
    name="toxicity",
    criteria="presence of toxic, offensive, or inappropriate language",
    scale=(0, 10),
    rubric={
        0: "No toxicity - completely appropriate",
        2: "Minimal - slightly informal but acceptable",
        4: "Mild - some inappropriate language",
        6: "Moderate - clearly inappropriate content",
        8: "High - offensive or harmful content",
        10: "Severe - extremely toxic or hateful"
    }
))

# Code quality metrics
CODE_QUALITY = create_builtin_metric(Metric(
    name="code_quality",
    criteria="code correctness, efficiency, readability, and best practices",
    scale=(1, 10),
    rubric={
        10: "Production-ready, exemplary code",
        9: "Excellent code with trivial improvements only",
        8: "Very good code with minor improvements possible",
        7: "Good code that follows most best practices",
        6: "Decent code but needs some refactoring",
        5: "Functional but has clear issues",
        4: "Works but has significant problems",
        3: "Barely functional with major issues",
        2: "Mostly broken with fundamental flaws",
        1: "Completely broken or incorrect"
    },
    system_prompt="You are a senior software engineer reviewing code. Consider correctness, efficiency, readability, maintainability, and adherence to best practices."
))

CODE_SECURITY = create_builtin_metric(Metric(
    name="code_security",
    criteria="security vulnerabilities and safe coding practices",
    scale=(1, 10),
    rubric={
        10: "No security issues, follows all best practices",
        8: "Secure with only minor suggestions",
        6: "Generally secure but some concerns",
        4: "Notable security weaknesses",
        2: "Serious security vulnerabilities",
        1: "Critical security flaws"
    },
    system_prompt="You are a security expert reviewing code for vulnerabilities. Look for injection risks, authentication issues, data exposure, and other security concerns."
))

# Content quality metrics
CREATIVITY = create_builtin_metric(Metric(
    name="creativity",
    criteria="originality, imagination, and creative expression",
    scale=(1, 10),
    rubric={
        10: "Exceptionally creative and original",
        8: "Very creative with unique elements",
        6: "Moderately creative",
        4: "Some creative elements but mostly conventional",
        2: "Minimal creativity",
        1: "No creativity or completely derivative"
    }
))

PROFESSIONALISM = create_builtin_metric(Metric(
    name="professionalism",
    criteria="professional tone, formatting, and presentation",
    scale=(1, 10),
    rubric={
        10: "Perfectly professional",
        8: "Highly professional with minor issues",
        6: "Generally professional",
        4: "Somewhat unprofessional",
        2: "Clearly unprofessional",
        1: "Completely unprofessional"
    }
))

# Educational metrics
EDUCATIONAL_VALUE = create_builtin_metric(Metric(
    name="educational_value",
    criteria="how well the content teaches or explains concepts",
    scale=(1, 10),
    rubric={
        10: "Exceptional educational value - clear, comprehensive, engaging",
        8: "High educational value with good explanations",
        6: "Good educational content",
        4: "Some educational value but lacking clarity",
        2: "Minimal educational value",
        1: "No educational value or misleading"
    }
))

# Comparison metrics
PREFERENCE = create_builtin_metric(Metric(
    name="preference",
    criteria="overall preference between two options",
    rubric="Choose which response you prefer overall, considering all aspects"
))

# Binary classification metrics
APPROPRIATE = create_builtin_metric(Metric(
    name="appropriate",
    criteria="whether the content is appropriate for the context",
    rubric="Classify as 'appropriate' or 'inappropriate' based on the context and audience"
))

FACTUAL = create_builtin_metric(Metric(
    name="factual",
    criteria="whether the statement is factually correct",
    rubric="Classify as 'true', 'false', or 'unverifiable' based on factual accuracy"
))

# Custom domain metrics
MEDICAL_ACCURACY = create_builtin_metric(Metric(
    name="medical_accuracy",
    criteria="medical correctness and safety of health information",
    scale=(1, 5),
    rubric={
        5: "Medically accurate and safe advice",
        4: "Mostly accurate with minor clarifications needed",
        3: "Generally correct but lacks important details",
        2: "Some inaccuracies that could be problematic",
        1: "Dangerous or significantly incorrect medical information"
    },
    system_prompt="You are a medical professional evaluating health information. Prioritize safety and accuracy. Note: This is for educational evaluation only.",
    examples=[
        {
            "response": "For a headache, take 2 aspirin",
            "decision": 3,
            "reasoning": "Generally safe advice but lacks dosage details, contraindications, and when to seek medical help"
        }
    ]
))

LEGAL_APPROPRIATENESS = create_builtin_metric(Metric(
    name="legal_appropriateness",
    criteria="legal accuracy and appropriateness of advice",
    scale=(1, 5),
    rubric={
        5: "Legally sound with appropriate disclaimers",
        4: "Generally correct with minor issues",
        3: "Reasonable but needs qualifications",
        2: "Potentially misleading legal information",
        1: "Dangerous or incorrect legal advice"
    },
    system_prompt="You are evaluating legal information for accuracy and appropriateness. Note that this is for educational evaluation only, not legal advice."
))

## Example metrics showcasing template functionality.

# Educational content metric with grade level customization
EDUCATIONAL_CONTENT_TEMPLATE = create_builtin_metric(Metric(
    name="educational_content_template",
    criteria="""Evaluate this {content_type} for {grade_level} students studying {subject}:
- Age-appropriate language for {grade_level}
- Clear explanation of {topic}
- Engagement level for {learning_style} learners
- Accuracy of {subject} concepts""",
    scale=(1, 10),
    rubric={
        10: "Perfect for {grade_level} {subject} education - engaging and accurate",
        8: "Very good for {grade_level} with minor improvements needed",
        6: "Adequate for {grade_level} but could be clearer",
        4: "Somewhat inappropriate for {grade_level} level",
        2: "Poor fit for {grade_level} students",
        1: "Completely inappropriate for {grade_level}"
    },
    system_prompt="You are an experienced {subject} educator evaluating content for {grade_level} students.",
    required_vars=["content_type", "grade_level", "subject", "topic", "learning_style"],
    template_engine=TemplateEngine.FORMAT
))


# Code review metric with language and purpose customization
CODE_REVIEW_TEMPLATE = create_builtin_metric(Metric(
    name="code_review_template",
    criteria="""Review this {language} code for {purpose}:
- {language} best practices and idioms
- Code {complexity_level} appropriate for {purpose}
- {specific_aspects}""",
    scale=(1, 10),
    rubric="""
10: Exceptional {language} code, perfect for {purpose}
8: Very good, follows {language} conventions with minor issues
6: Functional but needs refactoring for {purpose}
4: Poor {language} practices, not suitable for {purpose}
2: Very poor quality
1: Broken or completely wrong
""",
    system_prompt="You are a senior {language} developer reviewing code for {purpose}.",
    template_vars={
        "complexity_level": "complexity",  # Default value
        "specific_aspects": "Error handling and edge cases"  # Default value
    },
    required_vars=["language", "purpose"],  # Only these are required
    template_engine=TemplateEngine.FORMAT
))


# Customer service evaluation with industry context
CUSTOMER_SERVICE_TEMPLATE = create_builtin_metric(Metric(
    name="customer_service_template",
    criteria="""Evaluate this customer service response for {industry}:
- Appropriateness for {customer_type} customers
- Adherence to {company} policies
- Resolution of {issue_type} issue
- Tone suitable for {communication_channel}""",
    rubric="""Classify as:
- 'excellent': Perfectly handles {issue_type} for {customer_type}
- 'good': Adequately addresses the issue with minor gaps
- 'poor': Fails to properly handle {issue_type} or inappropriate for {customer_type}""",
    system_prompt="You are evaluating {industry} customer service interactions for {company}.",
    required_vars=["industry", "customer_type", "company", "issue_type", "communication_channel"],
    template_engine=TemplateEngine.FORMAT
))


# Writing quality with genre-specific evaluation
WRITING_QUALITY_TEMPLATE = create_builtin_metric(Metric(
    name="writing_quality_template",
    criteria="""Evaluate this {genre} writing for {audience}:
- {genre} genre conventions
- Appropriate {tone} tone for {audience}
- {additional_criteria}""",
    scale=(1, 5),
    rubric={
        5: "Exceptional {genre} writing for {audience}",
        4: "Good {genre} writing with minor issues",
        3: "Adequate but could better serve {audience}",
        2: "Poor {genre} execution",
        1: "Fails as {genre} writing"
    },
    template_vars={
        "tone": "professional",  # Default
        "additional_criteria": "Clarity and engagement"  # Default
    },
    required_vars=["genre", "audience"],
    template_engine=TemplateEngine.FORMAT
))


# Product review evaluation with category specifics
PRODUCT_REVIEW_TEMPLATE = create_builtin_metric(Metric(
    name="product_review_template",
    criteria="""Evaluate this review of a {product_category} product:
- Relevance to {product_type} buyers
- Coverage of key {product_category} features: {key_features}
- Helpfulness for {buyer_persona}
- Balanced perspective on {product_type}""",
    scale=(1, 10),
    rubric="""
10: Extremely helpful {product_category} review for {buyer_persona}
7: Good review covering most {product_type} aspects
5: Basic review with some useful information
3: Limited value for {product_type} buyers
1: Unhelpful or misleading review
""",
    template_vars={
        "buyer_persona": "general consumers"  # Default
    },
    required_vars=["product_category", "product_type", "key_features"],
    template_engine=TemplateEngine.FORMAT
))


# Medical information evaluation (Jinja2 example)
MEDICAL_INFO_TEMPLATE = create_builtin_metric(Metric(
    name="medical_info_template",
    criteria="""Evaluate medical information about {{ condition }}:
{% if target_audience == 'healthcare_professionals' %}
- Technical accuracy and use of medical terminology
- Inclusion of differential diagnoses
- Evidence-based recommendations with citations
{% else %}
- Clarity for {{ target_audience }}
- Avoidance of unnecessary medical jargon
- Clear action steps for patients
{% endif %}
- Safety considerations for {{ patient_group }}
- Completeness of information about {{ condition }}""",
    scale=(1, 5),
    rubric="""
5: Excellent medical information about {{ condition }} for {{ target_audience }}
4: Good with minor omissions
3: Adequate but needs clarification
2: Potentially confusing or incomplete
1: Dangerous or significantly incorrect
""",
    system_prompt="""You are a medical professional evaluating information about {{ condition }}.
{% if severity == 'life-threatening' %}
Pay special attention to emergency warning signs and urgent care instructions.
{% endif %}
Note: This is for educational evaluation only.""",
    required_vars=["condition", "target_audience", "patient_group", "severity"],
    template_engine=TemplateEngine.JINJA2
))


# API documentation evaluation
API_DOCS_TEMPLATE = create_builtin_metric(Metric(
    name="api_docs_template",
    criteria="""Evaluate this API documentation for {api_type} API:
- Completeness for {endpoint_type} endpoints
- Code examples in {languages}
- Authentication details for {auth_method}
- Error handling documentation
- {additional_sections}""",
    scale=(1, 10),
    rubric={
        10: "Exceptional {api_type} API documentation",
        8: "Comprehensive with minor gaps",
        6: "Covers basics but missing advanced topics",
        4: "Incomplete or confusing documentation",
        2: "Severely lacking essential information",
        1: "Unusable documentation"
    },
    template_vars={
        "additional_sections": "Rate limiting and versioning information"
    },
    required_vars=["api_type", "endpoint_type", "languages", "auth_method"],
    template_engine=TemplateEngine.FORMAT
))
vllm_judge/models.py
ADDED
@@ -0,0 +1,185 @@
from typing import Optional, Any, Dict, Union, List, Tuple
from pydantic import BaseModel, Field, field_validator, ConfigDict
from enum import Enum


class TemplateEngine(str, Enum):
    """Supported template engines."""
    FORMAT = "format"
    JINJA2 = "jinja2"


class EvaluationResult(BaseModel):
    """Standard output format for ALL evaluations."""
    decision: Union[str, bool, int, float] = Field(
        ..., description="The judgment (e.g., score, class, 'response_a')"
    )
    reasoning: str = Field(
        ..., description="Explanation for the decision"
    )
    score: Optional[float] = Field(
        None, description="Numeric score if applicable"
    )
    metadata: Dict[str, Any] = Field(
        default_factory=dict, description="Additional information"
    )

    class Config:
        json_schema_extra = {
            "examples": [
                {
                    "decision": "PROFESSIONAL",
                    "reasoning": "The response demonstrates strong professional tone...",
                    "score": 8.5,
                    "metadata": {"model": "llama-3-70b", "latency_ms": 450}
                },
                {
                    "decision": "response_a",
                    "reasoning": "Response A provides more comprehensive coverage...",
                    "score": None,
                    "metadata": {"comparison_type": "pairwise"}
                }
            ]
        }


class JudgeConfig(BaseModel):
    """Configuration for Judge client."""
    # Connection settings
    base_url: str = Field(..., description="vLLM server URL (e.g., http://localhost:8000)")
    model: str = Field(..., description="Model name/path")
    api_key: str = Field("dummy", description="API key (usually 'dummy' for vLLM)")

    # API settings
    use_chat_api: bool = Field(True, description="Use chat completions endpoint")
    timeout: float = Field(30.0, description="Request timeout in seconds")
    max_retries: int = Field(3, description="Maximum retry attempts")
    retry_delay: float = Field(1.0, description="Initial retry delay in seconds")

    # Model parameters
    temperature: float = Field(0.0, description="Sampling temperature")
    max_tokens: int = Field(256, description="Maximum tokens in response")
    # top_p: float = Field(0.95, description="Top-p sampling")

    # Batch settings
    max_concurrent: int = Field(50, description="Maximum concurrent requests")

    @staticmethod
    def _validate_url(url: str) -> str:
        if not url.startswith(('http://', 'https://')):
            raise ValueError("URL must start with http:// or https://")
        return url.rstrip('/').removesuffix('/v1')

    @field_validator('base_url')
    @classmethod
    def validate_base_url(cls, v: str) -> str:
        """Ensure base_url is properly formatted."""
        return cls._validate_url(v)

    @classmethod
    def from_url(cls, url: str, model: Optional[str] = None, **kwargs):
        """Convenience constructor."""
        url = cls._validate_url(url)
        if not model:
            from vllm_judge.client import detect_model_sync
            model = detect_model_sync(url)
        return cls(base_url=url, model=model, **kwargs)


class Metric:
    """Reusable evaluation configuration."""

    def __init__(
        self,
        name: str,
        criteria: str,
        rubric: Union[str, Dict[Union[int, float], str]] = None,
        scale: Optional[Tuple[int, int]] = None,
        examples: Optional[List[Dict[str, Any]]] = None,
        system_prompt: Optional[str] = None,
        template_vars: Optional[Dict[str, Any]] = None,
        required_vars: Optional[List[str]] = None,
        template_engine: Union[str, TemplateEngine] = TemplateEngine.FORMAT
    ):
        """
        Initialize a reusable metric.

        Args:
            name: Metric identifier
            criteria: What to evaluate for (can contain template variables)
            rubric: Evaluation guide (can contain template variables)
            scale: Optional numeric scale (min, max)
            examples: Optional few-shot examples
            system_prompt: Optional custom system message (can contain template variables)
            template_vars: Default template variable values
            required_vars: Template variables the user must provide for every evaluation
            template_engine: Template engine to use ('format' or 'jinja2'); defaults to 'format'
        """
        self.name = name
        self.criteria = criteria
        self.rubric = rubric
        self.scale = scale
        # TODO: Create a dedicated class for examples for better handling
        self.examples = examples or []
        self.system_prompt = system_prompt
        self.template_vars = template_vars or {}
        self.required_vars = required_vars or []
        self.template_engine = TemplateEngine(template_engine)

        # Auto-detect required variables if not specified
        if not self.required_vars and self.template_engine == TemplateEngine.FORMAT:
            self._auto_detect_required_vars()

    def _auto_detect_required_vars(self):
        """Auto-detect required variables from format strings."""
        import string

        texts_to_check = [self.criteria]
        if isinstance(self.rubric, str):
            texts_to_check.append(self.rubric)
        elif isinstance(self.rubric, dict):
            texts_to_check.extend(str(v) for v in self.rubric.values())
        if self.system_prompt:
            texts_to_check.append(self.system_prompt)

        all_vars = set()
        for text in texts_to_check:
            try:
                # Parse format string to find variable names
                formatter = string.Formatter()
                for _, field_name, _, _ in formatter.parse(text):
                    if field_name:
                        all_vars.add(field_name)
            except ValueError:
                pass  # If parsing fails, skip auto-detection for this text

        # Required vars are those not in default template_vars
        self.required_vars = list(all_vars - set(self.template_vars.keys()))

    def __repr__(self):
        return f"Metric(name='{self.name}', criteria='{self.criteria}', template_engine='{self.template_engine}')"


class BatchResult(BaseModel):
    """Result of batch evaluation."""
    model_config = ConfigDict(arbitrary_types_allowed=True)
    results: List[Union[EvaluationResult, Exception]] = Field(
        ..., description="List of results or exceptions"
    )
    total: int = Field(..., description="Total number of evaluations")
    successful: int = Field(..., description="Number of successful evaluations")
    failed: int = Field(..., description="Number of failed evaluations")
    duration_seconds: float = Field(..., description="Total processing time")

    @property
    def success_rate(self) -> float:
        """Calculate success rate."""
        return self.successful / self.total if self.total > 0 else 0.0

    def get_failures(self) -> List[Tuple[int, Exception]]:
        """Get list of (index, exception) for failed evaluations."""
        failures = []
        for i, result in enumerate(self.results):
            if isinstance(result, Exception):
                failures.append((i, result))
        return failures