vllm_judge-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vllm_judge/__init__.py ADDED
@@ -0,0 +1,120 @@
+ """
+ vLLM Judge - LLM-as-a-Judge evaluations for vLLM hosted models.
+
+ A lightweight library for evaluating text responses using self-hosted language models
+ via vLLM's OpenAI-compatible API.
+ """
+
+ __version__ = "0.1.0"
+
+ from vllm_judge.judge import Judge
+ from vllm_judge.models import (
+     JudgeConfig,
+     EvaluationResult,
+     Metric,
+     BatchResult,
+     TemplateEngine
+ )
+ from vllm_judge.templating import TemplateProcessor
+ from vllm_judge.metrics import (
+     # General metrics
+     HELPFULNESS,
+     ACCURACY,
+     CLARITY,
+     CONCISENESS,
+     RELEVANCE,
+
+     # Safety metrics
+     SAFETY,
+     TOXICITY,
+
+     # Code metrics
+     CODE_QUALITY,
+     CODE_SECURITY,
+
+     # Content metrics
+     CREATIVITY,
+     PROFESSIONALISM,
+     EDUCATIONAL_VALUE,
+
+     # Comparison metrics
+     PREFERENCE,
+
+     # Binary metrics
+     APPROPRIATE,
+     FACTUAL,
+
+     # Domain metrics
+     MEDICAL_ACCURACY,
+     LEGAL_APPROPRIATENESS,
+
+     # Utility
+     BUILTIN_METRICS,
+
+     # Template metrics
+     EDUCATIONAL_CONTENT_TEMPLATE,
+     CODE_REVIEW_TEMPLATE,
+     CUSTOMER_SERVICE_TEMPLATE,
+     WRITING_QUALITY_TEMPLATE,
+     PRODUCT_REVIEW_TEMPLATE,
+     MEDICAL_INFO_TEMPLATE,
+     API_DOCS_TEMPLATE,
+
+ )
+ from vllm_judge.exceptions import (
+     VLLMJudgeError,
+     ConfigurationError,
+     ConnectionError,
+     TimeoutError,
+     ParseError,
+     MetricNotFoundError,
+     InvalidInputError,
+     RetryExhaustedError
+ )
+
+ __all__ = [
+     # Main classes
+     "Judge",
+     "JudgeConfig",
+     "EvaluationResult",
+     "Metric",
+     "BatchResult",
+     "TemplateEngine",
+     "TemplateProcessor",
+
+     # Metrics
+     "HELPFULNESS",
+     "ACCURACY",
+     "CLARITY",
+     "CONCISENESS",
+     "RELEVANCE",
+     "SAFETY",
+     "TOXICITY",
+     "CODE_QUALITY",
+     "CODE_SECURITY",
+     "CREATIVITY",
+     "PROFESSIONALISM",
+     "EDUCATIONAL_VALUE",
+     "PREFERENCE",
+     "APPROPRIATE",
+     "FACTUAL",
+     "MEDICAL_ACCURACY",
+     "LEGAL_APPROPRIATENESS",
+     "BUILTIN_METRICS",
+     "EDUCATIONAL_CONTENT_TEMPLATE",
+     "CODE_REVIEW_TEMPLATE",
+     "CUSTOMER_SERVICE_TEMPLATE",
+     "WRITING_QUALITY_TEMPLATE",
+     "PRODUCT_REVIEW_TEMPLATE",
+     "MEDICAL_INFO_TEMPLATE",
+     "API_DOCS_TEMPLATE",
+     # Exceptions
+     "VLLMJudgeError",
+     "ConfigurationError",
+     "ConnectionError",
+     "TimeoutError",
+     "ParseError",
+     "MetricNotFoundError",
+     "InvalidInputError",
+     "RetryExhaustedError"
+ ]
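The package root above only re-exports names; the Judge class itself lives in vllm_judge/judge.py, which is not part of this diff. As a rough orientation only, a minimal usage sketch might look like the following. The constructor and its parameters (base_url, model) are assumptions, since judge.py is not shown; the awaitable evaluate() call and its arguments are inferred from JudgeClient.evaluate() further down, which documents itself as taking the same arguments as Judge.evaluate(). The result fields (decision, score, reasoning) match how EvaluationResult is constructed throughout the client.

import asyncio
from vllm_judge import Judge

async def main():
    # Hypothetical constructor: judge.py is not included in this diff,
    # so the parameter names below (base_url, model) are assumptions.
    judge = Judge(base_url="http://localhost:8000", model="my-served-model")

    # Mirrors the argument set documented for JudgeClient.evaluate().
    result = await judge.evaluate(
        response="Paris is the capital of France.",
        criteria="factual accuracy",
        scale=(1, 10),
    )
    print(result.decision, result.score)
    print(result.reasoning)

asyncio.run(main())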
vllm_judge/api/__init__.py ADDED
@@ -0,0 +1,39 @@
+ """
+ API module for vLLM Judge.
+ """
+ from vllm_judge.api.server import app, create_app, start_server
+ from vllm_judge.api.client import JudgeClient
+ from vllm_judge.api.models import (
+     EvaluateRequest,
+     BatchEvaluateRequest,
+     AsyncBatchRequest,
+     EvaluationResponse,
+     BatchResponse,
+     AsyncBatchResponse,
+     JobStatusResponse,
+     MetricInfo,
+     HealthResponse,
+     ErrorResponse
+ )
+
+ __all__ = [
+     # Server
+     "app",
+     "create_app",
+     "start_server",
+
+     # Client
+     "JudgeClient",
+
+     # Models
+     "EvaluateRequest",
+     "BatchEvaluateRequest",
+     "AsyncBatchRequest",
+     "EvaluationResponse",
+     "BatchResponse",
+     "AsyncBatchResponse",
+     "JobStatusResponse",
+     "MetricInfo",
+     "HealthResponse",
+     "ErrorResponse"
+ ]
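The server module referenced here (vllm_judge.api.server) is not included in this diff, so only its exports (app, create_app, start_server) are visible. Assuming app is an ASGI application, which the exported names and the Pydantic request/response models below suggest, a minimal sketch for serving the API would be:

import uvicorn

# Assumption: vllm_judge.api.server:app is an ASGI app; host and port are placeholders.
uvicorn.run("vllm_judge.api.server:app", host="0.0.0.0", port=8080)

The exported start_server helper presumably wraps the same step, but its signature is not visible in this diff.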
vllm_judge/api/client.py ADDED
@@ -0,0 +1,354 @@
+ """
+ HTTP client for vLLM Judge API.
+ """
+ import asyncio
+ from typing import Union, Dict, List, Optional, Tuple, Any, AsyncIterator
+ import httpx
+ import websockets
+ import json
+
+ from vllm_judge.models import EvaluationResult, BatchResult
+ from vllm_judge.exceptions import VLLMJudgeError, ConnectionError
+ from vllm_judge.api.models import (
+     EvaluateRequest,
+     BatchEvaluateRequest,
+     AsyncBatchRequest,
+     MetricInfo
+ )
+
+
+ class JudgeClient:
+     """HTTP client for vLLM Judge API."""
+
+     def __init__(
+         self,
+         api_url: str,
+         timeout: float = 30.0,
+         max_retries: int = 3
+     ):
+         """
+         Initialize Judge API client.
+
+         Args:
+             api_url: Base URL of Judge API server
+             timeout: Request timeout in seconds
+             max_retries: Maximum retry attempts
+         """
+         self.api_url = api_url.rstrip('/')
+         self.timeout = timeout
+         self.max_retries = max_retries
+         self.session = httpx.AsyncClient(
+             base_url=self.api_url,
+             timeout=httpx.Timeout(timeout)
+         )
+
+     async def __aenter__(self):
+         """Async context manager entry."""
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         """Async context manager exit."""
+         await self.close()
+
+     async def close(self):
+         """Close HTTP session."""
+         await self.session.aclose()
+
+     async def health_check(self) -> Dict[str, Any]:
+         """Check API health status."""
+         try:
+             response = await self.session.get("/health")
+             response.raise_for_status()
+             return response.json()
+         except httpx.HTTPError as e:
+             raise ConnectionError(f"Health check failed: {e}")
+
+     async def evaluate(
+         self,
+         response: Union[str, Dict[str, str]],
+         criteria: str = None,
+         rubric: Union[str, Dict[Union[int, float], str]] = None,
+         scale: Optional[Tuple[int, int]] = None,
+         metric: str = None,
+         context: str = None,
+         system_prompt: str = None,
+         examples: List[Dict[str, Any]] = None,
+         template_vars: Dict[str, Any] = None,
+         template_engine: str = "format",
+         **kwargs
+     ) -> EvaluationResult:
+         """
+         Perform single evaluation via API.
+
+         Args:
+             Same as Judge.evaluate() including template support
+
+         Returns:
+             EvaluationResult
+         """
+         request = EvaluateRequest(
+             response=response,
+             criteria=criteria,
+             rubric=rubric,
+             scale=list(scale) if scale else None,
+             metric=metric,
+             context=context,
+             system_prompt=system_prompt,
+             examples=examples,
+             template_vars=template_vars,
+             template_engine=template_engine
+         )
+
+         try:
+             api_response = await self.session.post(
+                 "/evaluate",
+                 json=request.model_dump()
+             )
+             api_response.raise_for_status()
+             data = api_response.json()
+
+             return EvaluationResult(
+                 decision=data["decision"],
+                 reasoning=data["reasoning"],
+                 score=data.get("score"),
+                 metadata=data.get("metadata", {})
+             )
+
+         except httpx.HTTPStatusError as e:
+             error_detail = e.response.json().get("detail", str(e))
+             raise VLLMJudgeError(f"Evaluation failed: {error_detail}")
+         except httpx.HTTPError as e:
+             raise ConnectionError(f"API request failed: {e}")
+
+     async def batch_evaluate(
+         self,
+         data: List[Dict[str, Any]],
+         max_concurrent: int = None,
+         default_criteria: str = None,
+         default_metric: str = None,
+         **kwargs
+     ) -> BatchResult:
+         """
+         Perform synchronous batch evaluation.
+
+         Args:
+             data: List of evaluation inputs
+             max_concurrent: Maximum concurrent requests
+             default_criteria: Default criteria for all evaluations
+             default_metric: Default metric for all evaluations
+
+         Returns:
+             BatchResult
+         """
+         request = BatchEvaluateRequest(
+             data=data,
+             max_concurrent=max_concurrent,
+             default_criteria=default_criteria,
+             default_metric=default_metric
+         )
+
+         try:
+             response = await self.session.post(
+                 "/batch",
+                 json=request.model_dump(),
+                 timeout=None  # No timeout for batch operations
+             )
+             response.raise_for_status()
+             data = response.json()
+
+             # Convert results
+             results = []
+             for r in data["results"]:
+                 if "error" in r:
+                     results.append(VLLMJudgeError(r["error"]))
+                 else:
+                     results.append(EvaluationResult(
+                         decision=r["decision"],
+                         reasoning=r["reasoning"],
+                         score=r.get("score"),
+                         metadata=r.get("metadata", {})
+                     ))
+
+             return BatchResult(
+                 results=results,
+                 total=data["total"],
+                 successful=data["successful"],
+                 failed=data["failed"],
+                 duration_seconds=data["duration_seconds"]
+             )
+
+         except httpx.HTTPStatusError as e:
+             error_detail = e.response.json().get("detail", str(e))
+             raise VLLMJudgeError(f"Batch evaluation failed: {error_detail}")
+         except httpx.HTTPError as e:
+             raise ConnectionError(f"API request failed: {e}")
+
+     async def async_batch_evaluate(
+         self,
+         data: List[Dict[str, Any]],
+         callback_url: str = None,
+         max_concurrent: int = None,
+         poll_interval: float = 1.0
+     ) -> BatchResult:
+         """
+         Start async batch evaluation and wait for completion.
+
+         Args:
+             data: List of evaluation inputs
+             callback_url: Optional callback URL
+             max_concurrent: Maximum concurrent requests
+             poll_interval: Seconds between status checks
+
+         Returns:
+             BatchResult when complete
+         """
+         # Start async job
+         request = AsyncBatchRequest(
+             data=data,
+             callback_url=callback_url,
+             max_concurrent=max_concurrent
+         )
+
+         response = await self.session.post(
+             "/batch/async",
+             json=request.model_dump()
+         )
+         response.raise_for_status()
+         job_data = response.json()
+         job_id = job_data["job_id"]
+
+         # Poll for completion
+         while True:
+             status = await self.get_job_status(job_id)
+
+             if status["status"] == "completed":
+                 return await self.get_job_result(job_id)
+             elif status["status"] == "failed":
+                 raise VLLMJudgeError(f"Job failed: {status.get('error', 'Unknown error')}")
+
+             await asyncio.sleep(poll_interval)
+
+     async def get_job_status(self, job_id: str) -> Dict[str, Any]:
+         """Get status of async job."""
+         response = await self.session.get(f"/jobs/{job_id}")
+         response.raise_for_status()
+         return response.json()
+
+     async def get_job_result(self, job_id: str) -> BatchResult:
+         """Get result of completed async job."""
+         response = await self.session.get(f"/jobs/{job_id}/result")
+         response.raise_for_status()
+         data = response.json()
+
+         # Convert to BatchResult
+         results = []
+         for r in data["results"]:
+             if "error" in r:
+                 results.append(VLLMJudgeError(r["error"]))
+             else:
+                 results.append(EvaluationResult(
+                     decision=r["decision"],
+                     reasoning=r["reasoning"],
+                     score=r.get("score"),
+                     metadata=r.get("metadata", {})
+                 ))
+
+         return BatchResult(
+             results=results,
+             total=data["total"],
+             successful=data["successful"],
+             failed=data["failed"],
+             duration_seconds=data["duration_seconds"]
+         )
+
+     async def list_metrics(self) -> List[MetricInfo]:
+         """List all available metrics."""
+         response = await self.session.get("/metrics")
+         response.raise_for_status()
+         return [MetricInfo(**m) for m in response.json()]
+
+     async def get_metric(self, metric_name: str) -> Dict[str, Any]:
+         """Get details of a specific metric."""
+         response = await self.session.get(f"/metrics/{metric_name}")
+         response.raise_for_status()
+         return response.json()
+
+     # Convenience methods matching Judge interface
+     async def score(
+         self,
+         criteria: str,
+         response: str,
+         scale: Tuple[int, int] = (1, 10),
+         **kwargs
+     ) -> EvaluationResult:
+         """Quick scoring evaluation."""
+         return await self.evaluate(
+             response=response,
+             criteria=criteria,
+             scale=scale,
+             **kwargs
+         )
+
+     async def compare(
+         self,
+         response_a: str,
+         response_b: str,
+         criteria: str,
+         **kwargs
+     ) -> EvaluationResult:
+         """Quick comparison evaluation."""
+         return await self.evaluate(
+             response={"a": response_a, "b": response_b},
+             criteria=criteria,
+             **kwargs
+         )
+
+     async def classify(
+         self,
+         response: str,
+         categories: List[str],
+         criteria: str = None,
+         **kwargs
+     ) -> EvaluationResult:
+         """Quick classification evaluation."""
+         if not criteria:
+             criteria = "appropriate category"
+
+         rubric = f"Classify into one of these categories: {', '.join(categories)}"
+
+         return await self.evaluate(
+             response=response,
+             criteria=criteria,
+             rubric=rubric,
+             **kwargs
+         )
+
+     async def evaluate_streaming(
+         self,
+         response: Union[str, Dict[str, str]],
+         **kwargs
+     ) -> AsyncIterator[str]:
+         """
+         WebSocket-based streaming evaluation.
+
+         Yields the evaluation result as the server sends it over the WebSocket.
+         """
+         ws_url = self.api_url.replace("http://", "ws://").replace("https://", "wss://")
+         ws_url = f"{ws_url}/ws/evaluate"
+
+         async with websockets.connect(ws_url) as websocket:
+             # Send request
+             request_data = {
+                 "response": response,
+                 **kwargs
+             }
+             await websocket.send(json.dumps(request_data))
+
+             # Receive result
+             result_data = await websocket.recv()
+             result = json.loads(result_data)
+
+             if result["status"] == "success":
+                 yield json.dumps(result["result"])
+             else:
+                 raise VLLMJudgeError(f"Streaming evaluation failed: {result.get('error')}")
vllm_judge/api/models.py ADDED
@@ -0,0 +1,157 @@
+ from typing import Union, Dict, List, Optional, Any, Tuple
+ from pydantic import BaseModel, Field
+ from datetime import datetime
+
+
+ class EvaluateRequest(BaseModel):
+     """Request model for single evaluation."""
+     response: Union[str, Dict[str, str]] = Field(
+         ..., description="Text to evaluate or dict with 'a' and 'b' for comparison"
+     )
+     criteria: Optional[str] = Field(
+         None, description="What to evaluate for"
+     )
+     rubric: Optional[Union[str, Dict[Union[int, float], str]]] = Field(
+         None, description="Evaluation guide"
+     )
+     scale: Optional[List[int]] = Field(
+         None, description="Numeric scale as [min, max]"
+     )
+     metric: Optional[str] = Field(
+         None, description="Pre-defined metric name"
+     )
+     context: Optional[str] = Field(
+         None, description="Additional context"
+     )
+     system_prompt: Optional[str] = Field(
+         None, description="Custom system prompt"
+     )
+     examples: Optional[List[Dict[str, Any]]] = Field(
+         None, description="Few-shot examples"
+     )
+     template_vars: Optional[Dict[str, Any]] = Field(
+         None, description="Template variables to substitute"
+     )
+     template_engine: Optional[str] = Field(
+         None, description="Template engine to use ('format' or 'jinja2'), default is 'format'"
+     )
+
+     class Config:
+         json_schema_extra = {
+             "example": {
+                 "response": "Python is a high-level programming language...",
+                 "criteria": "technical accuracy for {audience}",
+                 "template_vars": {"audience": "beginners"},
+                 "scale": [1, 10]
+             }
+         }
+
+
+ class BatchEvaluateRequest(BaseModel):
+     """Request model for batch evaluation."""
+     data: List[Dict[str, Any]] = Field(
+         ..., description="List of evaluation inputs"
+     )
+     max_concurrent: Optional[int] = Field(
+         None, description="Maximum concurrent requests"
+     )
+     default_criteria: Optional[str] = Field(
+         None, description="Default criteria for all evaluations"
+     )
+     default_metric: Optional[str] = Field(
+         None, description="Default metric for all evaluations"
+     )
+
+
+ class AsyncBatchRequest(BaseModel):
+     """Request model for async batch evaluation."""
+     data: List[Dict[str, Any]] = Field(
+         ..., description="List of evaluation inputs"
+     )
+     callback_url: Optional[str] = Field(
+         None, description="URL to POST results when complete"
+     )
+     max_concurrent: Optional[int] = Field(
+         None, description="Maximum concurrent requests"
+     )
+
+
+ class EvaluationResponse(BaseModel):
+     """Response model for evaluation results."""
+     decision: Union[str, bool, int, float]
+     reasoning: str
+     score: Optional[float]
+     metadata: Dict[str, Any] = {}
+
+     # API-specific fields
+     evaluation_id: Optional[str] = None
+     timestamp: Optional[datetime] = None
+     duration_ms: Optional[int] = None
+
+
+ class BatchResponse(BaseModel):
+     """Response model for batch results."""
+     total: int
+     successful: int
+     failed: int
+     success_rate: float
+     duration_seconds: float
+     results: List[Union[EvaluationResponse, Dict[str, str]]]
+
+
+ class AsyncBatchResponse(BaseModel):
+     """Response model for async batch initiation."""
+     job_id: str
+     status: str = "pending"
+     total_items: int
+     created_at: datetime
+     estimated_duration_seconds: Optional[float] = None
+
+
+ class JobStatusResponse(BaseModel):
+     """Response model for job status."""
+     job_id: str
+     status: str  # "pending", "running", "completed", "failed"
+     progress: Dict[str, int]  # {"completed": 50, "total": 100}
+     created_at: datetime
+     started_at: Optional[datetime] = None
+     completed_at: Optional[datetime] = None
+     result_url: Optional[str] = None
+     error: Optional[str] = None
+
+
+ class MetricInfo(BaseModel):
+     """Information about a metric."""
+     name: str
+     criteria: str
+     has_scale: bool
+     scale: Optional[Tuple[int, int]] = None
+     has_rubric: bool
+     rubric_type: Optional[str] = None  # "string" or "dict"
+     has_examples: bool
+     example_count: int = 0
+     has_system_prompt: bool
+     has_template_vars: bool = False
+     template_vars: Optional[Dict[str, Any]] = None
+     required_vars: Optional[List[str]] = None
+     template_engine: Optional[str] = None
+
+
+ class HealthResponse(BaseModel):
+     """Health check response."""
+     status: str = "healthy"
+     version: str
+     model: str
+     base_url: str
+     uptime_seconds: float
+     total_evaluations: int
+     active_connections: int
+     metrics_available: int
+
+
+ class ErrorResponse(BaseModel):
+     """Error response model."""
+     error: str
+     detail: Optional[str] = None
+     code: Optional[str] = None
+     timestamp: datetime = Field(default_factory=datetime.utcnow)
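For reference, the request schema can be exercised directly. The values below mirror the json_schema_extra example attached to EvaluateRequest; JudgeClient posts the equivalent of model_dump() to /evaluate, and exclude_none here just trims the printout.

from vllm_judge.api.models import EvaluateRequest

req = EvaluateRequest(
    response="Python is a high-level programming language...",
    criteria="technical accuracy for {audience}",
    template_vars={"audience": "beginners"},
    scale=[1, 10],
)
print(req.model_dump(exclude_none=True))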