vllm_judge-0.1.0-py3-none-any.whl
- vllm_judge/__init__.py +120 -0
- vllm_judge/api/__init__.py +39 -0
- vllm_judge/api/client.py +354 -0
- vllm_judge/api/models.py +157 -0
- vllm_judge/api/server.py +564 -0
- vllm_judge/batch.py +147 -0
- vllm_judge/cli.py +288 -0
- vllm_judge/client.py +262 -0
- vllm_judge/exceptions.py +42 -0
- vllm_judge/judge.py +421 -0
- vllm_judge/metrics.py +417 -0
- vllm_judge/models.py +185 -0
- vllm_judge/prompts.py +175 -0
- vllm_judge/templating.py +206 -0
- vllm_judge-0.1.0.dist-info/METADATA +124 -0
- vllm_judge-0.1.0.dist-info/RECORD +19 -0
- vllm_judge-0.1.0.dist-info/WHEEL +5 -0
- vllm_judge-0.1.0.dist-info/entry_points.txt +2 -0
- vllm_judge-0.1.0.dist-info/top_level.txt +1 -0
vllm_judge/__init__.py
ADDED
@@ -0,0 +1,120 @@
"""
vLLM Judge - LLM-as-a-Judge evaluations for vLLM hosted models.

A lightweight library for evaluating text responses using self-hosted language models
via vLLM's OpenAI-compatible API.
"""

__version__ = "0.1.0"

from vllm_judge.judge import Judge
from vllm_judge.models import (
    JudgeConfig,
    EvaluationResult,
    Metric,
    BatchResult,
    TemplateEngine
)
from vllm_judge.templating import TemplateProcessor
from vllm_judge.metrics import (
    # General metrics
    HELPFULNESS,
    ACCURACY,
    CLARITY,
    CONCISENESS,
    RELEVANCE,

    # Safety metrics
    SAFETY,
    TOXICITY,

    # Code metrics
    CODE_QUALITY,
    CODE_SECURITY,

    # Content metrics
    CREATIVITY,
    PROFESSIONALISM,
    EDUCATIONAL_VALUE,

    # Comparison metrics
    PREFERENCE,

    # Binary metrics
    APPROPRIATE,
    FACTUAL,

    # Domain metrics
    MEDICAL_ACCURACY,
    LEGAL_APPROPRIATENESS,

    # Utility
    BUILTIN_METRICS,

    # Template metrics
    EDUCATIONAL_CONTENT_TEMPLATE,
    CODE_REVIEW_TEMPLATE,
    CUSTOMER_SERVICE_TEMPLATE,
    WRITING_QUALITY_TEMPLATE,
    PRODUCT_REVIEW_TEMPLATE,
    MEDICAL_INFO_TEMPLATE,
    API_DOCS_TEMPLATE,

)
from vllm_judge.exceptions import (
    VLLMJudgeError,
    ConfigurationError,
    ConnectionError,
    TimeoutError,
    ParseError,
    MetricNotFoundError,
    InvalidInputError,
    RetryExhaustedError
)

__all__ = [
    # Main classes
    "Judge",
    "JudgeConfig",
    "EvaluationResult",
    "Metric",
    "BatchResult",
    "TemplateEngine",
    "TemplateProcessor",

    # Metrics
    "HELPFULNESS",
    "ACCURACY",
    "CLARITY",
    "CONCISENESS",
    "RELEVANCE",
    "SAFETY",
    "TOXICITY",
    "CODE_QUALITY",
    "CODE_SECURITY",
    "CREATIVITY",
    "PROFESSIONALISM",
    "EDUCATIONAL_VALUE",
    "PREFERENCE",
    "APPROPRIATE",
    "FACTUAL",
    "MEDICAL_ACCURACY",
    "LEGAL_APPROPRIATENESS",
    "BUILTIN_METRICS",
    "EDUCATIONAL_CONTENT_TEMPLATE",
    "CODE_REVIEW_TEMPLATE",
    "CUSTOMER_SERVICE_TEMPLATE",
    "WRITING_QUALITY_TEMPLATE",
    "PRODUCT_REVIEW_TEMPLATE",
    "MEDICAL_INFO_TEMPLATE",
    "API_DOCS_TEMPLATE",
    # Exceptions
    "VLLMJudgeError",
    "ConfigurationError",
    "ConnectionError",
    "TimeoutError",
    "ParseError",
    "MetricNotFoundError",
    "InvalidInputError",
    "RetryExhaustedError"
]
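For orientation, a minimal sketch of what a consumer of this public surface would import. This is hedged: it assumes the wheel is installed from PyPI as vllm-judge, and every name used comes from the re-exports above.

import vllm_judge
from vllm_judge import Judge, HELPFULNESS, VLLMJudgeError  # imported only to illustrate the re-exported surface

print(vllm_judge.__version__)  # -> "0.1.0" for this release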
vllm_judge/api/__init__.py
ADDED
@@ -0,0 +1,39 @@
"""
API module for vLLM Judge.
"""
from vllm_judge.api.server import app, create_app, start_server
from vllm_judge.api.client import JudgeClient
from vllm_judge.api.models import (
    EvaluateRequest,
    BatchEvaluateRequest,
    AsyncBatchRequest,
    EvaluationResponse,
    BatchResponse,
    AsyncBatchResponse,
    JobStatusResponse,
    MetricInfo,
    HealthResponse,
    ErrorResponse
)

__all__ = [
    # Server
    "app",
    "create_app",
    "start_server",

    # Client
    "JudgeClient",

    # Models
    "EvaluateRequest",
    "BatchEvaluateRequest",
    "AsyncBatchRequest",
    "EvaluationResponse",
    "BatchResponse",
    "AsyncBatchResponse",
    "JobStatusResponse",
    "MetricInfo",
    "HealthResponse",
    "ErrorResponse"
]
vllm_judge/api/client.py
ADDED
@@ -0,0 +1,354 @@
"""
HTTP client for vLLM Judge API.
"""
import asyncio
from typing import Union, Dict, List, Optional, Tuple, Any, AsyncIterator
import httpx
import websockets
import json

from vllm_judge.models import EvaluationResult, BatchResult
from vllm_judge.exceptions import VLLMJudgeError, ConnectionError
from vllm_judge.api.models import (
    EvaluateRequest,
    BatchEvaluateRequest,
    AsyncBatchRequest,
    MetricInfo
)


class JudgeClient:
    """HTTP client for vLLM Judge API."""

    def __init__(
        self,
        api_url: str,
        timeout: float = 30.0,
        max_retries: int = 3
    ):
        """
        Initialize Judge API client.

        Args:
            api_url: Base URL of Judge API server
            timeout: Request timeout in seconds
            max_retries: Maximum retry attempts
        """
        self.api_url = api_url.rstrip('/')
        self.timeout = timeout
        self.max_retries = max_retries
        self.session = httpx.AsyncClient(
            base_url=self.api_url,
            timeout=httpx.Timeout(timeout)
        )

    async def __aenter__(self):
        """Async context manager entry."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        await self.close()

    async def close(self):
        """Close HTTP session."""
        await self.session.aclose()

    async def health_check(self) -> Dict[str, Any]:
        """Check API health status."""
        try:
            response = await self.session.get("/health")
            response.raise_for_status()
            return response.json()
        except httpx.HTTPError as e:
            raise ConnectionError(f"Health check failed: {e}")

    async def evaluate(
        self,
        response: Union[str, Dict[str, str]],
        criteria: str = None,
        rubric: Union[str, Dict[Union[int, float], str]] = None,
        scale: Optional[Tuple[int, int]] = None,
        metric: str = None,
        context: str = None,
        system_prompt: str = None,
        examples: List[Dict[str, Any]] = None,
        template_vars: Dict[str, Any] = None,
        template_engine: str = "format",
        **kwargs
    ) -> EvaluationResult:
        """
        Perform single evaluation via API.

        Args:
            Same as Judge.evaluate() including template support

        Returns:
            EvaluationResult
        """
        request = EvaluateRequest(
            response=response,
            criteria=criteria,
            rubric=rubric,
            scale=list(scale) if scale else None,
            metric=metric,
            context=context,
            system_prompt=system_prompt,
            examples=examples,
            template_vars=template_vars,
            template_engine=template_engine
        )

        try:
            api_response = await self.session.post(
                "/evaluate",
                json=request.model_dump()
            )
            api_response.raise_for_status()
            data = api_response.json()

            return EvaluationResult(
                decision=data["decision"],
                reasoning=data["reasoning"],
                score=data.get("score"),
                metadata=data.get("metadata", {})
            )

        except httpx.HTTPStatusError as e:
            error_detail = e.response.json().get("detail", str(e))
            raise VLLMJudgeError(f"Evaluation failed: {error_detail}")
        except httpx.HTTPError as e:
            raise ConnectionError(f"API request failed: {e}")

    async def batch_evaluate(
        self,
        data: List[Dict[str, Any]],
        max_concurrent: int = None,
        default_criteria: str = None,
        default_metric: str = None,
        **kwargs
    ) -> BatchResult:
        """
        Perform synchronous batch evaluation.

        Args:
            data: List of evaluation inputs
            max_concurrent: Maximum concurrent requests
            default_criteria: Default criteria for all evaluations
            default_metric: Default metric for all evaluations

        Returns:
            BatchResult
        """
        request = BatchEvaluateRequest(
            data=data,
            max_concurrent=max_concurrent,
            default_criteria=default_criteria,
            default_metric=default_metric
        )

        try:
            response = await self.session.post(
                "/batch",
                json=request.model_dump(),
                timeout=None  # No timeout for batch operations
            )
            response.raise_for_status()
            data = response.json()

            # Convert results
            results = []
            for r in data["results"]:
                if "error" in r:
                    results.append(VLLMJudgeError(r["error"]))
                else:
                    results.append(EvaluationResult(
                        decision=r["decision"],
                        reasoning=r["reasoning"],
                        score=r.get("score"),
                        metadata=r.get("metadata", {})
                    ))

            return BatchResult(
                results=results,
                total=data["total"],
                successful=data["successful"],
                failed=data["failed"],
                duration_seconds=data["duration_seconds"]
            )

        except httpx.HTTPStatusError as e:
            error_detail = e.response.json().get("detail", str(e))
            raise VLLMJudgeError(f"Batch evaluation failed: {error_detail}")
        except httpx.HTTPError as e:
            raise ConnectionError(f"API request failed: {e}")

    async def async_batch_evaluate(
        self,
        data: List[Dict[str, Any]],
        callback_url: str = None,
        max_concurrent: int = None,
        poll_interval: float = 1.0
    ) -> BatchResult:
        """
        Start async batch evaluation and wait for completion.

        Args:
            data: List of evaluation inputs
            callback_url: Optional callback URL
            max_concurrent: Maximum concurrent requests
            poll_interval: Seconds between status checks

        Returns:
            BatchResult when complete
        """
        # Start async job
        request = AsyncBatchRequest(
            data=data,
            callback_url=callback_url,
            max_concurrent=max_concurrent
        )

        response = await self.session.post(
            "/batch/async",
            json=request.model_dump()
        )
        response.raise_for_status()
        job_data = response.json()
        job_id = job_data["job_id"]

        # Poll for completion
        while True:
            status = await self.get_job_status(job_id)

            if status["status"] == "completed":
                return await self.get_job_result(job_id)
            elif status["status"] == "failed":
                raise VLLMJudgeError(f"Job failed: {status.get('error', 'Unknown error')}")

            await asyncio.sleep(poll_interval)

    async def get_job_status(self, job_id: str) -> Dict[str, Any]:
        """Get status of async job."""
        response = await self.session.get(f"/jobs/{job_id}")
        response.raise_for_status()
        return response.json()

    async def get_job_result(self, job_id: str) -> BatchResult:
        """Get result of completed async job."""
        response = await self.session.get(f"/jobs/{job_id}/result")
        response.raise_for_status()
        data = response.json()

        # Convert to BatchResult
        results = []
        for r in data["results"]:
            if "error" in r:
                results.append(VLLMJudgeError(r["error"]))
            else:
                results.append(EvaluationResult(
                    decision=r["decision"],
                    reasoning=r["reasoning"],
                    score=r.get("score"),
                    metadata=r.get("metadata", {})
                ))

        return BatchResult(
            results=results,
            total=data["total"],
            successful=data["successful"],
            failed=data["failed"],
            duration_seconds=data["duration_seconds"]
        )

    async def list_metrics(self) -> List[MetricInfo]:
        """List all available metrics."""
        response = await self.session.get("/metrics")
        response.raise_for_status()
        return [MetricInfo(**m) for m in response.json()]

    async def get_metric(self, metric_name: str) -> Dict[str, Any]:
        """Get details of a specific metric."""
        response = await self.session.get(f"/metrics/{metric_name}")
        response.raise_for_status()
        return response.json()

    # Convenience methods matching Judge interface
    async def score(
        self,
        criteria: str,
        response: str,
        scale: Tuple[int, int] = (1, 10),
        **kwargs
    ) -> EvaluationResult:
        """Quick scoring evaluation."""
        return await self.evaluate(
            response=response,
            criteria=criteria,
            scale=scale,
            **kwargs
        )

    async def compare(
        self,
        response_a: str,
        response_b: str,
        criteria: str,
        **kwargs
    ) -> EvaluationResult:
        """Quick comparison evaluation."""
        return await self.evaluate(
            response={"a": response_a, "b": response_b},
            criteria=criteria,
            **kwargs
        )

    async def classify(
        self,
        response: str,
        categories: List[str],
        criteria: str = None,
        **kwargs
    ) -> EvaluationResult:
        """Quick classification evaluation."""
        if not criteria:
            criteria = "appropriate category"

        rubric = f"Classify into one of these categories: {', '.join(categories)}"

        return await self.evaluate(
            response=response,
            criteria=criteria,
            rubric=rubric,
            **kwargs
        )

    async def evaluate_streaming(
        self,
        response: Union[str, Dict[str, str]],
        **kwargs
    ) -> AsyncIterator[str]:
        """
        WebSocket-based streaming evaluation.

        Yields partial results as they arrive.
        """
        ws_url = self.api_url.replace("http://", "ws://").replace("https://", "wss://")
        ws_url = f"{ws_url}/ws/evaluate"

        async with websockets.connect(ws_url) as websocket:
            # Send request
            request_data = {
                "response": response,
                **kwargs
            }
            await websocket.send(json.dumps(request_data))

            # Receive result
            result_data = await websocket.recv()
            result = json.loads(result_data)

            if result["status"] == "success":
                yield json.dumps(result["result"])
            else:
                raise VLLMJudgeError(f"Streaming evaluation failed: {result.get('error')}")
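As a usage note, the client methods above compose into a short async workflow. The sketch below is hedged: the server URL and the criteria/response strings are illustrative assumptions, while the method names, signatures, and result fields mirror the JudgeClient and EvaluationResult usage in this file.

import asyncio

from vllm_judge.api import JudgeClient


async def main():
    # Assumed URL of a running vLLM Judge API server.
    async with JudgeClient("http://localhost:9090") as client:
        print(await client.health_check())

        # score() wraps evaluate() with a numeric scale (default 1-10).
        result = await client.score(
            criteria="clarity of the explanation",
            response="Python is a high-level programming language...",
        )
        print(result.decision, result.score, result.reasoning)

        # compare() wraps evaluate() with a {"a": ..., "b": ...} response payload.
        verdict = await client.compare(
            response_a="Answer A ...",
            response_b="Answer B ...",
            criteria="helpfulness",
        )
        print(verdict.decision)


asyncio.run(main())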
vllm_judge/api/models.py
ADDED
@@ -0,0 +1,157 @@
from typing import Union, Dict, List, Optional, Any, Tuple
from pydantic import BaseModel, Field
from datetime import datetime


class EvaluateRequest(BaseModel):
    """Request model for single evaluation."""
    response: Union[str, Dict[str, str]] = Field(
        ..., description="Text to evaluate or dict with 'a' and 'b' for comparison"
    )
    criteria: Optional[str] = Field(
        None, description="What to evaluate for"
    )
    rubric: Optional[Union[str, Dict[Union[int, float], str]]] = Field(
        None, description="Evaluation guide"
    )
    scale: Optional[List[int]] = Field(
        None, description="Numeric scale as [min, max]"
    )
    metric: Optional[str] = Field(
        None, description="Pre-defined metric name"
    )
    context: Optional[str] = Field(
        None, description="Additional context"
    )
    system_prompt: Optional[str] = Field(
        None, description="Custom system prompt"
    )
    examples: Optional[List[Dict[str, Any]]] = Field(
        None, description="Few-shot examples"
    )
    template_vars: Optional[Dict[str, Any]] = Field(
        None, description="Template variables to substitute"
    )
    template_engine: Optional[str] = Field(
        None, description="Template engine to use ('format' or 'jinja2'), default is 'format'"
    )

    class Config:
        json_schema_extra = {
            "example": {
                "response": "Python is a high-level programming language...",
                "criteria": "technical accuracy for {audience}",
                "template_vars": {"audience": "beginners"},
                "scale": [1, 10]
            }
        }


class BatchEvaluateRequest(BaseModel):
    """Request model for batch evaluation."""
    data: List[Dict[str, Any]] = Field(
        ..., description="List of evaluation inputs"
    )
    max_concurrent: Optional[int] = Field(
        None, description="Maximum concurrent requests"
    )
    default_criteria: Optional[str] = Field(
        None, description="Default criteria for all evaluations"
    )
    default_metric: Optional[str] = Field(
        None, description="Default metric for all evaluations"
    )


class AsyncBatchRequest(BaseModel):
    """Request model for async batch evaluation."""
    data: List[Dict[str, Any]] = Field(
        ..., description="List of evaluation inputs"
    )
    callback_url: Optional[str] = Field(
        None, description="URL to POST results when complete"
    )
    max_concurrent: Optional[int] = Field(
        None, description="Maximum concurrent requests"
    )


class EvaluationResponse(BaseModel):
    """Response model for evaluation results."""
    decision: Union[str, bool, int, float]
    reasoning: str
    score: Optional[float]
    metadata: Dict[str, Any] = {}

    # API-specific fields
    evaluation_id: Optional[str] = None
    timestamp: Optional[datetime] = None
    duration_ms: Optional[int] = None


class BatchResponse(BaseModel):
    """Response model for batch results."""
    total: int
    successful: int
    failed: int
    success_rate: float
    duration_seconds: float
    results: List[Union[EvaluationResponse, Dict[str, str]]]


class AsyncBatchResponse(BaseModel):
    """Response model for async batch initiation."""
    job_id: str
    status: str = "pending"
    total_items: int
    created_at: datetime
    estimated_duration_seconds: Optional[float] = None


class JobStatusResponse(BaseModel):
    """Response model for job status."""
    job_id: str
    status: str  # "pending", "running", "completed", "failed"
    progress: Dict[str, int]  # {"completed": 50, "total": 100}
    created_at: datetime
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    result_url: Optional[str] = None
    error: Optional[str] = None


class MetricInfo(BaseModel):
    """Information about a metric."""
    name: str
    criteria: str
    has_scale: bool
    scale: Optional[Tuple[int, int]] = None
    has_rubric: bool
    rubric_type: Optional[str] = None  # "string" or "dict"
    has_examples: bool
    example_count: int = 0
    has_system_prompt: bool
    has_template_vars: bool = False
    template_vars: Optional[Dict[str, Any]] = None
    required_vars: Optional[List[str]] = None
    template_engine: Optional[str] = None


class HealthResponse(BaseModel):
    """Health check response."""
    status: str = "healthy"
    version: str
    model: str
    base_url: str
    uptime_seconds: float
    total_evaluations: int
    active_connections: int
    metrics_available: int


class ErrorResponse(BaseModel):
    """Error response model."""
    error: str
    detail: Optional[str] = None
    code: Optional[str] = None
    timestamp: datetime = Field(default_factory=datetime.utcnow)
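For reference, EvaluateRequest above is the JSON body that JudgeClient.evaluate() POSTs to /evaluate. A small sketch of building and serializing it; the field values simply mirror the json_schema_extra example embedded in the model.

from vllm_judge.api.models import EvaluateRequest

req = EvaluateRequest(
    response="Python is a high-level programming language...",
    criteria="technical accuracy for {audience}",
    template_vars={"audience": "beginners"},
    scale=[1, 10],
)

# model_dump() produces the payload JudgeClient sends over HTTP.
print(req.model_dump())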