vllm-judge 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vllm_judge/judge.py ADDED
@@ -0,0 +1,421 @@
+ import json
+ import re
+ from typing import Union, Dict, List, Optional, Tuple, Any, Callable
+
+ from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine
+ from vllm_judge.client import VLLMClient
+ from vllm_judge.prompts import PromptBuilder
+ from vllm_judge.batch import BatchProcessor
+ from vllm_judge.metrics import BUILTIN_METRICS
+ from vllm_judge.templating import TemplateProcessor
+ from vllm_judge.exceptions import (
+     ParseError,
+     InvalidInputError,
+     MetricNotFoundError,
+     VLLMJudgeError
+ )
+
+
+ class Judge:
+     """Main class for LLM-as-a-Judge evaluations."""
+
+     def __init__(self, config: JudgeConfig):
+         """
+         Initialize Judge with configuration.
+
+         Args:
+             config: Judge configuration
+         """
+         self.config = config
+         self.client = VLLMClient(config)
+         self.metrics: Dict[str, Metric] = {}
+
+     @classmethod
+     def from_url(cls, base_url: str, model: Optional[str] = None, **kwargs) -> 'Judge':
+         """
+         Create Judge from URL.
+
+         Args:
+             base_url: vLLM server URL
+             model: Model name (optional, can be auto-detected)
+             **kwargs: Additional configuration
+
+         Returns:
+             Judge instance
+         """
+         config = JudgeConfig.from_url(base_url, model=model, **kwargs)
+         return cls(config)
+
+     async def __aenter__(self):
+         """Async context manager entry."""
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         """Async context manager exit."""
+         await self.close()
+
+     async def close(self):
+         """Close client connections."""
+         await self.client.close()
+
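The Judge above is normally constructed with `from_url` and used as an async context manager so that `close()` runs automatically. A minimal usage sketch, assuming a reachable vLLM-compatible server; the URL and model name are placeholders, not values from this diff:

    import asyncio

    from vllm_judge.judge import Judge  # import path follows this file's location

    async def main() -> None:
        # Placeholder URL and model name; point these at a real vLLM server.
        async with Judge.from_url("http://localhost:8000", model="example-model") as judge:
            result = await judge.evaluate(
                response="The capital of France is Paris.",
                criteria="factual accuracy",
            )
            print(result.decision, result.score, result.reasoning)

    asyncio.run(main())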
+     async def evaluate(
+         self,
+         response: Union[str, Dict[str, str]],
+         criteria: Optional[str] = None,
+         rubric: Optional[Union[str, Dict[Union[int, float], str]]] = None,
+         scale: Optional[Tuple[int, int]] = None,
+         examples: Optional[List[Dict[str, Any]]] = None,
+         metric: Optional[Union[Metric, str]] = None,
+         system_prompt: Optional[str] = None,
+         context: Optional[str] = None,
+         template_vars: Optional[Dict[str, Any]] = None,
+         template_engine: Union[str, TemplateEngine] = TemplateEngine.FORMAT,
+         **kwargs
+     ) -> EvaluationResult:
+         """
+         Universal evaluation method that adapts to the use case.
+
+         Args:
+             response: String for single evaluation, dict {"a": ..., "b": ...} for comparison
+             criteria: What to evaluate for (can contain template variables)
+             rubric: Evaluation instructions, either a string or a dict mapping scores to descriptions (can contain template variables)
+             scale: Optional numeric scale (min, max)
+             examples: Optional few-shot examples
+             metric: Pre-defined Metric object or registered metric name
+             system_prompt: Optional custom system message (can contain template variables)
+             context: Optional context for the evaluation
+             template_vars: Variables to substitute in templates
+             template_engine: Template engine to use ('format' or 'jinja2'); defaults to 'format'
+             **kwargs: Additional parameters
+
+         Returns:
+             EvaluationResult with decision, reasoning, and optional score
+
+         Raises:
+             InvalidInputError: If inputs are invalid or template vars are missing
+             MetricNotFoundError: If the metric name is not found
+             ParseError: If unable to parse the model response
+         """
+         # Handle metric parameter
+         metric_template_vars = {}
+
+         if metric:
+             if isinstance(metric, str):
+                 metric = self.get_metric(metric)
+             # Use metric defaults but allow overrides
+             criteria = criteria or metric.criteria
+             rubric = rubric or metric.rubric
+             scale = scale or metric.scale
+             examples = examples or metric.examples
+             system_prompt = system_prompt or metric.system_prompt
+             metric_template_vars = metric.template_vars
+             if metric.template_engine:
+                 template_engine = metric.template_engine
+
+         # Validate inputs
+         if not criteria:
+             raise InvalidInputError("Either 'criteria' or 'metric' must be provided")
+
+         # Determine template engine
+         engine = TemplateEngine(template_engine)
+
+         # Merge template variables (metric defaults + user provided)
+         all_template_vars = {**metric_template_vars, **(template_vars or {})}
+
+         # Process templates
+         criteria = TemplateProcessor.apply_template(
+             criteria, all_template_vars, engine, strict=True
+         )
+         rubric = TemplateProcessor.apply_template(
+             rubric, all_template_vars, engine, strict=True
+         )
+         system_prompt = TemplateProcessor.apply_template(
+             system_prompt, all_template_vars, engine, strict=True
+         )
+         context = TemplateProcessor.apply_template(
+             context, all_template_vars, engine, strict=True
+         )
+
+         # Build messages
+         messages = PromptBuilder.build_messages(
+             response=response,
+             criteria=criteria,
+             rubric=rubric,
+             scale=scale,
+             examples=examples,
+             system_prompt=system_prompt,
+             context=context,
+             **kwargs
+         )
+
+         # Get LLM response
+         try:
+             if self.config.use_chat_api:
+                 llm_response = await self.client.chat_completion(messages)
+             else:
+                 prompt = PromptBuilder.format_messages_as_text(messages)
+                 llm_response = await self.client.completion(prompt)
+         except Exception as e:
+             raise VLLMJudgeError(f"Failed to get model response: {e}")
+
+         # Parse response
+         result = self._parse_response(llm_response)
+
+         # Add template info to metadata if templates were used
+         if all_template_vars:
+             result.metadata["template_vars"] = all_template_vars
+             result.metadata["template_engine"] = engine.value
+
+         return result
+
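As a reading aid for `evaluate`, here is an illustrative call that exercises the rubric, scale, and template paths. The criteria text, rubric wording, and template variables are invented, and `judge` is assumed to be an instance constructed as sketched earlier:

    # Sketch: runs inside an async function; all values below are illustrative.
    async def richer_evaluation(judge) -> None:
        result = await judge.evaluate(
            response="Python is a dynamically typed language.",
            criteria="technical accuracy for a {audience} audience",  # '{audience}' filled from template_vars
            rubric={
                10: "Completely accurate and well pitched",
                5: "Mostly accurate with minor issues",
                1: "Largely inaccurate",
            },
            scale=(1, 10),
            template_vars={"audience": "beginner"},  # substituted by the default 'format' engine
        )
        print(result.decision, result.score)
        print(result.metadata.get("template_vars"))  # recorded because templates were used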
+     def _parse_response(self, response: str) -> EvaluationResult:
+         """
+         Parse LLM response into EvaluationResult.
+
+         Args:
+             response: Raw LLM response
+
+         Returns:
+             Parsed EvaluationResult
+
+         Raises:
+             ParseError: If unable to parse response
+         """
+         # Try to parse as JSON
+         try:
+             # First attempt: direct JSON parsing
+             data = json.loads(response.strip())
+         except json.JSONDecodeError:
+             # Second attempt: extract JSON from markdown code blocks
+             json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', response, re.DOTALL)
+             if json_match:
+                 try:
+                     data = json.loads(json_match.group(1))
+                 except json.JSONDecodeError:
+                     raise ParseError("Failed to parse JSON from code block", raw_response=response)
+             else:
+                 # Third attempt: find JSON-like structure
+                 json_match = re.search(r'({[^{}]*"decision"[^{}]*})', response, re.DOTALL)
+                 if json_match:
+                     try:
+                         data = json.loads(json_match.group(1))
+                     except json.JSONDecodeError:
+                         raise ParseError(
+                             "Failed to parse JSON from response",
+                             raw_response=response
+                         )
+                 else:
+                     raise ParseError(
+                         "No JSON structure found in response",
+                         raw_response=response
+                     )
+
+         # Validate required fields
+         if "decision" not in data:
+             raise ParseError(
+                 "Response missing required 'decision' field",
+                 raw_response=response
+             )
+
+         if "reasoning" not in data:
+             # Try to extract reasoning from other fields
+             data["reasoning"] = data.get("reason", data.get("explanation", "No reasoning provided"))
+
+         # Create result
+         return EvaluationResult(
+             decision=data["decision"],
+             reasoning=data["reasoning"],
+             score=data.get("score"),
+             metadata={
+                 "model": self.config.model,
+                 "raw_response": response,
+                 **data.get("metadata", {})
+             }
+         )
+
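For reference, `_parse_response` accepts either bare JSON or JSON wrapped in a markdown code fence, and it requires a `decision` field. The snippet below is a standalone re-creation of the fenced-block fallback on an invented model reply; it is illustrative only and not code from the package:

    import json
    import re

    # Invented judge output wrapped in a markdown fence, as chat models often emit.
    raw = '```json\n{"decision": "PASS", "reasoning": "Clear and accurate.", "score": 9}\n```'

    match = re.search(r'```(?:json)?\s*({.*?})\s*```', raw, re.DOTALL)
    data = json.loads(match.group(1))
    print(data["decision"], data["score"])  # -> PASS 9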
+     # Convenience methods
+     async def score(
+         self,
+         criteria: str,
+         response: str,
+         scale: Tuple[int, int] = (1, 10),
+         **kwargs
+     ) -> EvaluationResult:
+         """
+         Quick scoring evaluation.
+
+         Args:
+             criteria: What to evaluate
+             response: Response to evaluate
+             scale: Numeric scale (default 1-10)
+             **kwargs: Additional parameters
+
+         Returns:
+             EvaluationResult with numeric score
+         """
+         return await self.evaluate(
+             response=response,
+             criteria=criteria,
+             scale=scale,
+             **kwargs
+         )
+
+     async def compare(
+         self,
+         response_a: str,
+         response_b: str,
+         criteria: str,
+         **kwargs
+     ) -> EvaluationResult:
+         """
+         Quick comparison evaluation.
+
+         Args:
+             response_a: First response
+             response_b: Second response
+             criteria: What to compare on
+             **kwargs: Additional parameters
+
+         Returns:
+             EvaluationResult with decision of 'response_a' or 'response_b'
+         """
+         return await self.evaluate(
+             response={"a": response_a, "b": response_b},
+             criteria=criteria,
+             **kwargs
+         )
+
+     async def classify(
+         self,
+         response: str,
+         categories: List[str],
+         criteria: Optional[str] = None,
+         **kwargs
+     ) -> EvaluationResult:
+         """
+         Quick classification evaluation.
+
+         Args:
+             response: Response to classify
+             categories: List of categories
+             criteria: Classification criteria
+             **kwargs: Additional parameters
+
+         Returns:
+             EvaluationResult with category decision
+         """
+         if not criteria:
+             criteria = "appropriate category"
+
+         rubric = f"Classify into one of these categories: {', '.join(categories)}"
+
+         return await self.evaluate(
+             response=response,
+             criteria=criteria,
+             rubric=rubric,
+             **kwargs
+         )
+
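The three convenience wrappers above all delegate to `evaluate`. A short sketch of each, with invented inputs and `judge` assumed to be a constructed instance:

    async def demo_helpers(judge) -> None:
        # score(): numeric rating on a (min, max) scale
        scored = await judge.score("conciseness", "A short, direct answer.", scale=(1, 5))
        print(scored.score)

        # compare(): pairwise preference between two responses
        picked = await judge.compare(
            response_a="A detailed but rambling answer.",
            response_b="A brief, correct answer.",
            criteria="helpfulness",
        )
        print(picked.decision)  # expected to be 'response_a' or 'response_b'

        # classify(): pick one of the given categories
        labeled = await judge.classify(
            response="Please reset my password.",
            categories=["billing", "account", "technical"],
        )
        print(labeled.decision)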
+     # Metric management
+     def register_metric(self, metric: Metric):
+         """
+         Register a metric for reuse.
+
+         Args:
+             metric: Metric to register
+         """
+         self.metrics[metric.name] = metric
+
+     def get_metric(self, name: str) -> Metric:
+         """
+         Get registered metric by name.
+
+         Args:
+             name: Metric name
+
+         Returns:
+             Metric instance
+
+         Raises:
+             MetricNotFoundError: If metric not found
+         """
+         # Check user-registered metrics first
+         if name in self.metrics:
+             return self.metrics[name]
+
+         # Check built-in metrics
+         if name in BUILTIN_METRICS:
+             return BUILTIN_METRICS[name]
+
+         # List available metrics in error
+         available = list(self.metrics.keys()) + list(BUILTIN_METRICS.keys())
+         raise MetricNotFoundError(
+             f"Metric '{name}' not found. Available metrics: {', '.join(available)}"
+         )
+
+     def list_metrics(self) -> List[str]:
+         """List all available metric names."""
+         return list(self.metrics.keys()) + list(BUILTIN_METRICS.keys())
+
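A custom metric can be registered once and then referenced by name in `evaluate`. The sketch below assumes `Metric` (defined in `vllm_judge.models`, not shown in this diff) accepts the fields this class reads from it (name, criteria, rubric, scale) as keyword arguments; the metric itself is invented:

    from vllm_judge.models import Metric  # constructor fields assumed; see note above

    async def demo_metrics(judge) -> None:
        tone = Metric(
            name="support_tone",
            criteria="empathy and professionalism in a support reply",
            rubric={5: "Warm and professional", 3: "Neutral", 1: "Dismissive or rude"},
            scale=(1, 5),
        )
        judge.register_metric(tone)
        print(judge.list_metrics())  # user-registered names plus the built-ins

        result = await judge.evaluate(
            response="Sorry for the trouble - I've reset your password.",
            metric="support_tone",  # resolved through get_metric()
        )
        print(result.score)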
+     # Batch processing
+     async def batch_evaluate(
+         self,
+         data: List[Dict[str, Any]],
+         max_concurrent: Optional[int] = None,
+         progress_callback: Optional[Callable[[int, int], None]] = None,
+         **default_kwargs
+     ) -> BatchResult:
+         """
+         Batch evaluation with high concurrency.
+
+         Args:
+             data: List of evaluation inputs (each must have a 'response' key)
+             max_concurrent: Maximum concurrent requests
+             progress_callback: Optional callback for progress updates
+             **default_kwargs: Default parameters for all evaluations
+
+         Returns:
+             BatchResult with all results
+
+         Example:
+             results = await judge.batch_evaluate([
+                 {"response": "Text 1", "criteria": "clarity"},
+                 {"response": {"a": "A", "b": "B"}, "criteria": "quality"},
+                 {"response": "Text 3", "metric": "safety"}
+             ])
+         """
+         processor = BatchProcessor(self, max_concurrent or self.config.max_concurrent)
+         return await processor.process(data, progress_callback, **default_kwargs)
+
+     async def batch_score(
+         self,
+         responses: List[str],
+         criteria: str,
+         scale: Tuple[int, int] = (1, 10),
+         **kwargs
+     ) -> List[EvaluationResult]:
+         """
+         Convenience method for batch scoring.
+
+         Args:
+             responses: List of responses to score
+             criteria: Scoring criteria
+             scale: Numeric scale
+             **kwargs: Additional parameters
+
+         Returns:
+             List of EvaluationResults
+         """
+         data = [
+             {"response": resp, "criteria": criteria, "scale": scale, **kwargs}
+             for resp in responses
+         ]
+         batch_result = await self.batch_evaluate(data)
+
+         # Extract results, raising the first error if any
+         results = []
+         for r in batch_result.results:
+             if isinstance(r, Exception):
+                 raise r
+             results.append(r)
+         return results
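Finally, a sketch of the batch APIs defined above: `batch_evaluate` with a concurrency cap and progress callback, and `batch_score` for scoring many responses against one criterion. Inputs are invented, the callback argument meaning is assumed from the `Callable[[int, int], None]` hint, and the 'safety' metric name is taken from the docstring example above:

    async def demo_batch(judge) -> None:
        def on_progress(completed: int, total: int) -> None:  # argument order assumed
            print(f"{completed}/{total} evaluations finished")

        batch = await judge.batch_evaluate(
            [
                {"response": "First draft.", "criteria": "clarity"},
                {"response": "Second draft.", "metric": "safety"},
            ],
            max_concurrent=8,
            progress_callback=on_progress,
        )
        # BatchResult.results may mix EvaluationResult objects and Exceptions.
        for item in batch.results:
            print(f"failed: {item}" if isinstance(item, Exception) else item.decision)

        scores = await judge.batch_score(
            responses=["Answer A", "Answer B"],
            criteria="factual accuracy",
            scale=(1, 10),
        )
        print([r.score for r in scores])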