vllm-judge 0.1.5__tar.gz → 0.1.6__tar.gz
This diff shows the changes between publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/PKG-INFO +1 -1
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/pyproject.toml +1 -1
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/__init__.py +2 -2
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/api/__init__.py +0 -3
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/api/client.py +0 -3
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/api/server.py +1 -5
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/batch.py +2 -1
- vllm_judge-0.1.5/src/vllm_judge/metrics.py → vllm_judge-0.1.6/src/vllm_judge/builtin_metrics.py +1 -1
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/cli.py +1 -5
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/client.py +1 -6
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/judge.py +2 -2
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/models.py +0 -1
- vllm_judge-0.1.5/src/vllm_judge/prompts.py → vllm_judge-0.1.6/src/vllm_judge/prompt_builder.py +57 -37
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge.egg-info/PKG-INFO +1 -1
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge.egg-info/SOURCES.txt +14 -4
- vllm_judge-0.1.6/tests/test_batch.py +142 -0
- vllm_judge-0.1.6/tests/test_builtin_metrics.py +71 -0
- vllm_judge-0.1.6/tests/test_exceptions.py +80 -0
- vllm_judge-0.1.6/tests/test_integration.py +270 -0
- vllm_judge-0.1.6/tests/test_judge.py +281 -0
- vllm_judge-0.1.6/tests/test_judge_client.py +140 -0
- vllm_judge-0.1.6/tests/test_models.py +222 -0
- vllm_judge-0.1.6/tests/test_prompt_builder.py +127 -0
- vllm_judge-0.1.6/tests/test_templating.py +127 -0
- vllm_judge-0.1.6/tests/test_vllm_client.py +166 -0
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/README.md +0 -0
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/setup.cfg +0 -0
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/api/models.py +0 -0
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/exceptions.py +0 -0
- vllm_judge-0.1.5/src/vllm_judge/utils.py → vllm_judge-0.1.6/src/vllm_judge/parsers.py +0 -0
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/templating.py +0 -0
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge.egg-info/dependency_links.txt +0 -0
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge.egg-info/entry_points.txt +0 -0
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge.egg-info/requires.txt +0 -0
- {vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge.egg-info/top_level.txt +0 -0
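The headline change in 0.1.6 is a set of internal module renames (`metrics.py` → `builtin_metrics.py`, `prompts.py` → `prompt_builder.py`, `utils.py` → `parsers.py`) plus a new test suite. Top-level imports such as `from vllm_judge import Judge` are unaffected; only code that imported the renamed submodules directly needs updating. A minimal migration sketch, based solely on the renames listed above:

```python
# 0.1.5 (old module paths):
# from vllm_judge.metrics import BUILTIN_METRICS, HELPFULNESS
# from vllm_judge.prompts import PromptBuilder
# from vllm_judge.utils import parse_llama_guard_3

# 0.1.6 (new module paths):
from vllm_judge.builtin_metrics import BUILTIN_METRICS, HELPFULNESS
from vllm_judge.prompt_builder import PromptBuilder
from vllm_judge.parsers import parse_llama_guard_3
```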
{vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/__init__.py
@@ -5,7 +5,7 @@ A lightweight library for evaluating text responses using self-hosted language m
 via vLLM's OpenAI-compatible API.
 """
 
-__version__ = "0.1.5"
+__version__ = "0.1.6"
 
 from vllm_judge.judge import Judge
 from vllm_judge.models import (
@@ -17,7 +17,7 @@ from vllm_judge.models import (
     ModelSpecificMetric
 )
 from vllm_judge.templating import TemplateProcessor
-from vllm_judge.metrics import (
+from vllm_judge.builtin_metrics import (
     # General metrics
     HELPFULNESS,
     ACCURACY,
{vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/api/server.py
@@ -1,7 +1,3 @@
-"""
-FastAPI server for vLLM Judge API.
-"""
-import asyncio
 import time
 import uuid
 from datetime import datetime
@@ -14,7 +10,7 @@ import uvicorn
 
 from vllm_judge.judge import Judge
 from vllm_judge.models import EvaluationResult, JudgeConfig
-from vllm_judge.metrics import BUILTIN_METRICS
+from vllm_judge.builtin_metrics import BUILTIN_METRICS
 from vllm_judge.exceptions import VLLMJudgeError
 from vllm_judge.api.models import (
     EvaluateRequest,
{vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/batch.py
@@ -17,7 +17,8 @@ class BatchProcessor:
             max_concurrent: Maximum concurrent requests
         """
         self.judge = judge
-        self.semaphore = asyncio.Semaphore(max_concurrent)
+        self.max_concurrent = max_concurrent
+        self.semaphore = asyncio.Semaphore(self.max_concurrent)
         self.progress_lock = asyncio.Lock()
         self.completed = 0
 
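The batch.py change stores `max_concurrent` on the instance (the new tests assert it directly) and builds the `asyncio.Semaphore` from it. A minimal sketch of the semaphore-bounded fan-out pattern these attributes imply; this is not the library's actual `process()` code, and `evaluate_one`/`items` are illustrative names:

```python
import asyncio

async def bounded_map(items, evaluate_one, max_concurrent: int = 50):
    # Gate concurrency with a semaphore so at most max_concurrent calls run at once.
    semaphore = asyncio.Semaphore(max_concurrent)

    async def run(item):
        async with semaphore:
            try:
                return await evaluate_one(**item)
            except Exception as exc:  # keep failures alongside successes
                return exc

    return await asyncio.gather(*(run(item) for item in items))
```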
vllm_judge-0.1.5/src/vllm_judge/metrics.py → vllm_judge-0.1.6/src/vllm_judge/builtin_metrics.py
RENAMED
@@ -1,6 +1,6 @@
 from typing import Dict
 from vllm_judge.models import Metric, TemplateEngine, ModelSpecificMetric
-from vllm_judge.utils import parse_llama_guard_3
+from vllm_judge.parsers import parse_llama_guard_3
 
 # Registry for built-in metrics
 BUILTIN_METRICS: Dict[str, Metric] = {}
{vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/cli.py
@@ -1,6 +1,3 @@
-"""
-Command-line interface for vLLM Judge.
-"""
 import asyncio
 import json
 import sys
@@ -8,10 +5,9 @@ from typing import Optional
 import click
 
 from vllm_judge import Judge
-from vllm_judge.models import JudgeConfig
 from vllm_judge.api.server import start_server as start_api_server
 from vllm_judge.api.client import JudgeClient
-from vllm_judge.metrics import BUILTIN_METRICS
+from vllm_judge.builtin_metrics import BUILTIN_METRICS
 
 
 @click.group()
{vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/client.py
@@ -126,12 +126,8 @@ class VLLMClient:
             "messages": messages,
             "temperature": self.config.temperature,
             "max_tokens": self.config.max_tokens,
-            # "top_p": self.config.top_p,
         }
-
-        # # Request JSON response format if supported
-        # if self.config.temperature < 0.2:  # Only for low temperature
-        #     request_data["response_format"] = {"type": "json_object"}
+
 
         try:
             response = await self._request_with_retry(
@@ -172,7 +168,6 @@ class VLLMClient:
             "prompt": prompt,
             "temperature": self.config.temperature,
             "max_tokens": self.config.max_tokens,
-            # "top_p": self.config.top_p,
         }
 
         try:
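These hunks drop the commented-out `top_p` and `response_format` experiments, so the request body sent to vLLM's OpenAI-compatible API carries only messages, temperature, and max_tokens. A hedged sketch of that request shape against a vLLM server; the URL, model name, and use of `httpx` are assumptions for illustration, not vllm-judge internals:

```python
import httpx

async def chat_completion(base_url: str, model: str, messages: list) -> str:
    # Same fields as the request_data dict in the hunk above.
    payload = {
        "model": model,
        "messages": messages,
        "temperature": 0.0,
        "max_tokens": 256,
    }
    async with httpx.AsyncClient() as client:
        resp = await client.post(f"{base_url}/v1/chat/completions", json=payload, timeout=30.0)
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
```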
{vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/judge.py
@@ -4,9 +4,9 @@ from typing import Union, Dict, List, Optional, Tuple, Any, Callable
 
 from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine, ModelSpecificMetric
 from vllm_judge.client import VLLMClient
-from vllm_judge.prompts import PromptBuilder
+from vllm_judge.prompt_builder import PromptBuilder
 from vllm_judge.batch import BatchProcessor
-from vllm_judge.metrics import BUILTIN_METRICS
+from vllm_judge.builtin_metrics import BUILTIN_METRICS
 from vllm_judge.templating import TemplateProcessor
 from vllm_judge.exceptions import (
     ParseError,
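Judge wires together VLLMClient, PromptBuilder, BatchProcessor, and the built-in metric registry. A hedged usage sketch, based on how the new tests drive `judge.evaluate(content=..., criteria=...)` and read `decision`/`score`/`reasoning` off the returned EvaluationResult; the `from_url` constructor and the base URL are assumptions for illustration:

```python
import asyncio
from vllm_judge import Judge

async def main():
    judge = Judge.from_url("http://localhost:8000")  # assumed constructor
    result = await judge.evaluate(
        content="Python is a dynamically typed language.",
        criteria="factual accuracy",
    )
    print(result.decision, result.score, result.reasoning)

asyncio.run(main())
```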
{vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge/models.py
@@ -59,7 +59,6 @@ class JudgeConfig(BaseModel):
     # Model parameters
     temperature: float = Field(0.0, description="Sampling temperature")
     max_tokens: int = Field(256, description="Maximum tokens in response")
-    # top_p: float = Field(0.95, description="Top-p sampling")
 
     # Batch settings
     max_concurrent: int = Field(50, description="Maximum concurrent requests")
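The commented-out `top_p` field is removed, leaving temperature (0.0), max_tokens (256), and max_concurrent (50) as the visible pydantic defaults here. A toy mirror of just these fields (not the real JudgeConfig, which has more fields), showing how the `Field` defaults behave:

```python
from pydantic import BaseModel, Field

class SamplingSettings(BaseModel):
    temperature: float = Field(0.0, description="Sampling temperature")
    max_tokens: int = Field(256, description="Maximum tokens in response")
    max_concurrent: int = Field(50, description="Maximum concurrent requests")

settings = SamplingSettings(max_tokens=512)
print(settings.temperature, settings.max_tokens, settings.max_concurrent)  # 0.0 512 50
```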
vllm_judge-0.1.5/src/vllm_judge/prompts.py → vllm_judge-0.1.6/src/vllm_judge/prompt_builder.py
RENAMED
@@ -1,5 +1,5 @@
 from typing import List, Dict, Union, Optional, Tuple, Any
-
+import json
 
 class PromptBuilder:
     """Builds prompts for evaluation requests."""
@@ -35,24 +35,36 @@ class PromptBuilder:
         """
         # Detect evaluation type
         is_comparison = isinstance(content, dict) and "a" in content and "b" in content
+
+        output_format = """
+# Output Format:
+
+The JSON object MUST have exactly these three fields:
+
+1. decision: (String | Boolean) This decision label should clearly state your main finding. This could be a string representing a specific class (eg., PASS, FAIL, CORRECT, INCORRECT, etc.) or a boolean value (true or false). If user provided a rubric, you should use the rubric to determine the decision label.
+2. score: (Number | null) A numerical score for the evaluation. If scoring is requested, provide the score as a number. If scoring is NOT requested or is not applicable for the specific task, you MUST use the value null for this field.
+3. reasoning: (String) A concise explanation justifying your decision and score (if a score was provided). This reasoning must directly and logically support your evaluation and refer to the specific evaluation criteria.
+
+The JSON object MUST be well-formed and adhere strictly to the following structure:
+
+{
+    "decision": <your judgment - string|boolean>,
+    "reasoning": <concise explanation of your judgment - string>,
+    "score": <numeric score if requested, otherwise null - number|null>
+}
+"""
 
         # System message
         if not system_prompt:
-
-
-
-
-
-
-
-
-            system_prompt+=
-            system_prompt+="""{
-    "decision": <your judgment - string|boolean>,
-    "reasoning": "<concise explanation of your judgment>",
-    "score": <numeric score if requested, otherwise null>
-}"""
-            system_prompt+="\nDo not include any text in your response except for the JSON object."
+            system_prompt = """You are an impartial judge and expert evaluator. Your task is to evaluate the provided content based on the specific evaluation criteria and rubric.
+# Key Instructions:
+1. Your evaluation must be objective, consistent, and based solely on the specified criteria. Do not let your own opinions or biases interfere.
+2. Focus exclusively on quality assessment.
+3. Do not be influenced by the length of the responses unless response length is explicitly relevant to the specified evaluation criteria (e.g., a task assessing conciseness or verbosity).
+4. Your entire response MUST be a single, valid JSON object and nothing else. Do not include any text or conversational filler before or after this JSON object.
+
+"""
+            system_prompt += output_format
 
         # Build user message
         user_content = PromptBuilder._build_user_prompt(
@@ -93,30 +105,30 @@ class PromptBuilder:
             parts.append(f'"{input}"')
             parts.append("")
 
+        parts.append("## Content to evaluate:")
+        if is_comparison:
+            parts.append(f"**Response A:**\n{content['a']}")
+            parts.append(f"**Response B:**\n{content['b']}")
+        else:
+            parts.append(content)
+
+        parts.append("## Evaluation Criteria:")
+
         # Task description
         if is_comparison:
-
-                parts.append(f"Compare how well these two responses address the input for: {criteria}")
-            else:
-                parts.append(f"Compare these two responses based on: {criteria}")
+            parts.append(f"Compare the two responses based on: {criteria}")
             if context:
                 parts.append(f"\nContext: {context}")
-            parts.append(f"\nResponse A:\n{content['a']}")
-            parts.append(f"\nResponse B:\n{content['b']}")
         else:
-
-                parts.append(f"Evaluate how well this content addresses the input for: {criteria}")
-            else:
-                parts.append(f"Evaluate the following content based on: {criteria}")
+            parts.append(f"Evaluate the content based on: {criteria}")
            if context:
                 parts.append(f"\nContext: {context}")
-            parts.append(f"\nContent to evaluate:\n{content}")
 
-        parts.append(f"\nYou must return a decision label/class (your judgement) for the `decision` field and a concise explanation for the `reasoning` field.")
+        parts.append(f"\nYou must return a decision label/class (your main judgement) for the `decision` field and a concise explanation for the `reasoning` field in the JSON object.")
 
         # Add scale and rubric
         if scale:
-            parts.append(f"
+            parts.append(f"In addition to these, provide a score from {scale[0]} to {scale[1]}")
 
             if isinstance(rubric, dict):
                 parts.append("\nScoring guide:")
@@ -127,14 +139,15 @@ class PromptBuilder:
             elif rubric:
                 parts.append(f"\nEvaluation guide: {rubric}")
         elif rubric:
+            parts.append("\nIn addition to these, provide a score if required by the following evaluation guide.")
             parts.append(f"\nEvaluation guide: {rubric}")
 
         # Add examples if provided
         if examples:
             parts.append("\nExample evaluations:")
-            for i, ex in enumerate(examples
-                parts.append(f"
-
+            for i, ex in enumerate(examples):
+                parts.append(f"Example {i+1}:")
+                parts.append("Request:")
                 # Handle different example formats
                 if "input" in ex:
                     parts.append(f"Input: {ex['input']}")
@@ -143,17 +156,24 @@ class PromptBuilder:
                 elif "text" in ex:
                     parts.append(f"Text: {ex['text']}")
 
-
-
+                parts.append("Response:")
+
+                response = {}
+                if "decision" not in ex or ex["decision"] is None or ex["decision"] == "":
+                    raise ValueError("Example must include a decision field")
+
+                response["decision"] = ex["decision"]
                 if "score" in ex:
-
+                    response["score"] = ex["score"]
 
                 if "reasoning" in ex:
-
+                    response["reasoning"] = ex["reasoning"]
+
+                parts.append(json.dumps(response))
 
         # Add any additional instructions
         if kwargs.get("additional_instructions"):
-            parts.append(f"
+            parts.append(f"Additional instructions: {kwargs['additional_instructions']}")
 
         # Output format instructions
         parts.append("\nYou must respond in JSON format:")
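The rewritten prompt builder moves the strict three-field JSON contract into a shared `output_format` block and serializes few-shot example responses with `json.dumps`. A hedged sketch of a reply that satisfies the new contract and how a caller might check it (this is not the library's own parser in parsers.py/judge.py):

```python
import json

# A reply conforming to the output contract added above: exactly the three
# fields decision, reasoning, score (null when no score was requested).
raw_reply = '{"decision": "PASS", "reasoning": "Meets the stated criteria.", "score": null}'

parsed = json.loads(raw_reply)
assert set(parsed) == {"decision", "reasoning", "score"}
assert isinstance(parsed["decision"], (str, bool))
assert parsed["score"] is None or isinstance(parsed["score"], (int, float))
print(parsed["decision"], parsed["reasoning"])
```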
{vllm_judge-0.1.5 → vllm_judge-0.1.6}/src/vllm_judge.egg-info/SOURCES.txt
@@ -2,15 +2,15 @@ README.md
 pyproject.toml
 src/vllm_judge/__init__.py
 src/vllm_judge/batch.py
+src/vllm_judge/builtin_metrics.py
 src/vllm_judge/cli.py
 src/vllm_judge/client.py
 src/vllm_judge/exceptions.py
 src/vllm_judge/judge.py
-src/vllm_judge/metrics.py
 src/vllm_judge/models.py
-src/vllm_judge/prompts.py
+src/vllm_judge/parsers.py
+src/vllm_judge/prompt_builder.py
 src/vllm_judge/templating.py
-src/vllm_judge/utils.py
 src/vllm_judge.egg-info/PKG-INFO
 src/vllm_judge.egg-info/SOURCES.txt
 src/vllm_judge.egg-info/dependency_links.txt
@@ -20,4 +20,14 @@ src/vllm_judge.egg-info/top_level.txt
 src/vllm_judge/api/__init__.py
 src/vllm_judge/api/client.py
 src/vllm_judge/api/models.py
-src/vllm_judge/api/server.py
+src/vllm_judge/api/server.py
+tests/test_batch.py
+tests/test_builtin_metrics.py
+tests/test_exceptions.py
+tests/test_integration.py
+tests/test_judge.py
+tests/test_judge_client.py
+tests/test_models.py
+tests/test_prompt_builder.py
+tests/test_templating.py
+tests/test_vllm_client.py
vllm_judge-0.1.6/tests/test_batch.py
@@ -0,0 +1,142 @@
+import pytest
+import asyncio
+from unittest.mock import AsyncMock, Mock
+from vllm_judge.batch import BatchProcessor
+from vllm_judge.models import EvaluationResult, BatchResult
+
+
+class TestBatchProcessor:
+    """Test BatchProcessor functionality."""
+
+    @pytest.fixture
+    def mock_judge(self):
+        """Mock Judge for batch processing tests."""
+        judge = Mock()
+        judge.evaluate = AsyncMock()
+        judge.evaluate.return_value = EvaluationResult(
+            decision="GOOD", reasoning="Test reasoning"
+        )
+        return judge
+
+    async def test_batch_processor_init(self, mock_judge):
+        """Test BatchProcessor initialization."""
+        processor = BatchProcessor(mock_judge, max_concurrent=10)
+        assert processor.judge == mock_judge
+        assert processor.max_concurrent == 10
+
+    async def test_batch_process_success(self, mock_judge):
+        """Test successful batch processing."""
+        processor = BatchProcessor(mock_judge, max_concurrent=2)
+
+        data = [
+            {"content": "Text 1", "criteria": "quality"},
+            {"content": "Text 2", "criteria": "accuracy"},
+            {"content": "Text 3", "criteria": "clarity"}
+        ]
+
+        result = await processor.process(data)
+
+        assert isinstance(result, BatchResult)
+        assert result.total == 3
+        assert result.successful == 3
+        assert result.failed == 0
+        assert len(result.results) == 3
+
+        # Check that all results are EvaluationResult instances
+        for res in result.results:
+            assert isinstance(res, EvaluationResult)
+
+    async def test_batch_process_with_failures(self, mock_judge):
+        """Test batch processing with some failures."""
+        processor = BatchProcessor(mock_judge, max_concurrent=2)
+
+        # Make the second call fail
+        mock_judge.evaluate.side_effect = [
+            EvaluationResult(decision="GOOD", reasoning="Success"),
+            Exception("Evaluation failed"),
+            EvaluationResult(decision="OK", reasoning="Success")
+        ]
+
+        data = [
+            {"content": "Text 1", "criteria": "quality"},
+            {"content": "Text 2", "criteria": "accuracy"},  # This will fail
+            {"content": "Text 3", "criteria": "clarity"}
+        ]
+
+        result = await processor.process(data)
+
+        assert result.total == 3
+        assert result.successful == 2
+        assert result.failed == 1
+        assert result.success_rate == 2/3
+
+        # Check failures
+        failures = result.get_failures()
+        assert len(failures) == 1
+        assert failures[0][0] == 1  # Second item (index 1) failed
+        assert isinstance(failures[0][1], Exception)
+
+    async def test_batch_process_with_progress_callback(self, mock_judge):
+        """Test batch processing with progress callback."""
+        processor = BatchProcessor(mock_judge, max_concurrent=1)
+
+        progress_calls = []
+        def progress_callback(completed, total):
+            progress_calls.append((completed, total))
+
+        data = [
+            {"content": "Text 1", "criteria": "quality"},
+            {"content": "Text 2", "criteria": "accuracy"}
+        ]
+
+        await processor.process(data, progress_callback=progress_callback)
+
+        # Should have recorded progress
+        assert len(progress_calls) >= 2
+        assert progress_calls[-1] == (2, 2)  # Final call should be (completed, total)
+
+    async def test_batch_process_default_kwargs(self, mock_judge):
+        """Test batch processing with default kwargs."""
+        processor = BatchProcessor(mock_judge, max_concurrent=1)
+
+        data = [
+            {"content": "Text 1"},  # No criteria specified
+            {"content": "Text 2", "criteria": "specific_criteria"}  # Override
+        ]
+
+        default_kwargs = {"criteria": "default_criteria"}
+
+        await processor.process(data, **default_kwargs)
+
+        # Check that evaluate was called with merged kwargs
+        calls = mock_judge.evaluate.call_args_list
+        assert len(calls) == 2
+
+        # First call should use default criteria
+        assert calls[0][1]["criteria"] == "default_criteria"
+        # Second call should use specific criteria (override)
+        assert calls[1][1]["criteria"] == "specific_criteria"
+
+    async def test_batch_process_concurrency_limit(self, mock_judge):
+        """Test that concurrency limit is respected."""
+        max_concurrent = 2
+        processor = BatchProcessor(mock_judge, max_concurrent=max_concurrent)
+
+        call_times = []
+
+        async def mock_evaluate(**kwargs):
+            call_times.append(asyncio.get_event_loop().time())
+            await asyncio.sleep(0.1)  # Simulate some processing time
+            return EvaluationResult(decision="GOOD", reasoning="Test")
+
+        mock_judge.evaluate.side_effect = mock_evaluate
+
+        data = [{"content": f"Text {i}", "criteria": "test"} for i in range(5)]
+
+        start_time = asyncio.get_event_loop().time()
+        await processor.process(data)
+        end_time = asyncio.get_event_loop().time()
+
+        # With max_concurrent=2 and 0.1s per call, 5 calls should take at least 0.3s
+        # (first 2 in parallel, then next 2 in parallel, then last 1)
+        assert end_time - start_time >= 0.25  # Allow some margin for timing
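These tests document the public surface of BatchProcessor: `process(data, progress_callback=..., **default_kwargs)`, per-item overrides of the default kwargs, and a BatchResult with `success_rate` and `get_failures()`. A hedged sketch of that call pattern outside the test suite; `judge` is assumed to be an already-constructed `vllm_judge.Judge`:

```python
from vllm_judge.batch import BatchProcessor

def show_progress(completed: int, total: int) -> None:
    print(f"{completed}/{total} evaluations done")

async def run_batch(judge, rows):
    processor = BatchProcessor(judge, max_concurrent=8)
    # Per-item dicts override the default criteria kwarg, as the tests assert.
    result = await processor.process(rows, criteria="overall quality",
                                     progress_callback=show_progress)
    print(f"success rate: {result.success_rate:.0%}, failures: {len(result.get_failures())}")
```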
vllm_judge-0.1.6/tests/test_builtin_metrics.py
@@ -0,0 +1,71 @@
+from vllm_judge.builtin_metrics import (
+    HELPFULNESS, ACCURACY, SAFETY, CODE_QUALITY,
+    BUILTIN_METRICS, LLAMA_GUARD_3_SAFETY
+)
+from vllm_judge.models import Metric, ModelSpecificMetric
+
+
+class TestBuiltinMetrics:
+    """Test built-in metrics."""
+
+    def test_builtin_metrics_are_metrics(self):
+        """Test that built-in metrics are Metric instances."""
+        assert isinstance(HELPFULNESS, Metric)
+        assert isinstance(ACCURACY, Metric)
+        assert isinstance(SAFETY, Metric)
+        assert isinstance(CODE_QUALITY, Metric)
+
+    def test_builtin_metrics_have_names(self):
+        """Test that built-in metrics have proper names."""
+        assert HELPFULNESS.name.upper() == "HELPFULNESS"
+        assert ACCURACY.name.upper() == "ACCURACY"
+        assert SAFETY.name.upper() == "SAFETY"
+        assert CODE_QUALITY.name.upper() == "CODE_QUALITY"
+
+    def test_builtin_metrics_have_criteria(self):
+        """Test that built-in metrics have criteria defined."""
+        assert HELPFULNESS.criteria is not None
+        assert len(HELPFULNESS.criteria) > 0
+
+        assert ACCURACY.criteria is not None
+        assert len(ACCURACY.criteria) > 0
+
+    def test_builtin_metrics_dict(self):
+        """Test BUILTIN_METRICS dictionary."""
+        assert isinstance(BUILTIN_METRICS, dict)
+        assert "HELPFULNESS".lower() in BUILTIN_METRICS
+        assert "ACCURACY".lower() in BUILTIN_METRICS
+        assert BUILTIN_METRICS["HELPFULNESS".lower()] == HELPFULNESS
+
+    def test_model_specific_metrics(self):
+        """Test model-specific metrics like Llama Guard."""
+        assert isinstance(LLAMA_GUARD_3_SAFETY, ModelSpecificMetric)
+        assert LLAMA_GUARD_3_SAFETY.name.upper() == "LLAMA_GUARD_3_SAFETY"
+        assert LLAMA_GUARD_3_SAFETY.model_pattern is not None
+        assert LLAMA_GUARD_3_SAFETY.parser_func is not None
+
+    def test_metrics_with_scales(self):
+        """Test metrics that have defined scales."""
+        # Some metrics should have scales defined
+        scale_metrics = [m for m in BUILTIN_METRICS.values() if m.scale is not None]
+        assert len(scale_metrics) > 0
+
+        # Check scale format
+        for metric in scale_metrics:
+            assert isinstance(metric.scale, tuple)
+            assert len(metric.scale) == 2
+            assert metric.scale[0] < metric.scale[1]
+
+    def test_template_metrics(self):
+        """Test metrics that use templates."""
+        from vllm_judge.builtin_metrics import (
+            EDUCATIONAL_CONTENT_TEMPLATE,
+            CODE_REVIEW_TEMPLATE
+        )
+
+        assert isinstance(EDUCATIONAL_CONTENT_TEMPLATE, Metric)
+        assert isinstance(CODE_REVIEW_TEMPLATE, Metric)
+
+        # These should have template variables
+        assert len(EDUCATIONAL_CONTENT_TEMPLATE.required_vars) > 0
+        assert len(CODE_REVIEW_TEMPLATE.required_vars) > 0
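As these assertions show, the registry in `builtin_metrics.py` is keyed by lowercase metric name, and each Metric exposes `name`, `criteria`, and an optional `(low, high)` scale tuple. A short lookup sketch grounded in exactly those assertions:

```python
from vllm_judge.builtin_metrics import BUILTIN_METRICS

metric = BUILTIN_METRICS["helpfulness"]  # lowercase keys, per the tests above
print(metric.name, metric.criteria)
if metric.scale is not None:
    low, high = metric.scale
    print(f"scored on a {low}-{high} scale")
```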
vllm_judge-0.1.6/tests/test_exceptions.py
@@ -0,0 +1,80 @@
+from vllm_judge.exceptions import (
+    VLLMJudgeError,
+    ConfigurationError,
+    ConnectionError,
+    TimeoutError,
+    ParseError,
+    MetricNotFoundError,
+    InvalidInputError,
+    RetryExhaustedError
+)
+
+
+class TestExceptions:
+    """Test custom exceptions."""
+
+    def test_base_exception(self):
+        """Test base VLLMJudgeError."""
+        error = VLLMJudgeError("Base error")
+        assert str(error) == "Base error"
+        assert isinstance(error, Exception)
+
+    def test_configuration_error(self):
+        """Test ConfigurationError."""
+        error = ConfigurationError("Invalid config")
+        assert str(error) == "Invalid config"
+        assert isinstance(error, VLLMJudgeError)
+
+    def test_connection_error(self):
+        """Test ConnectionError."""
+        error = ConnectionError("Cannot connect to server")
+        assert str(error) == "Cannot connect to server"
+        assert isinstance(error, VLLMJudgeError)
+
+    def test_timeout_error(self):
+        """Test TimeoutError."""
+        error = TimeoutError("Request timed out")
+        assert str(error) == "Request timed out"
+        assert isinstance(error, VLLMJudgeError)
+
+    def test_parse_error(self):
+        """Test ParseError with raw response."""
+        raw_response = "Invalid JSON response"
+        error = ParseError("Cannot parse response", raw_response=raw_response)
+
+        assert str(error) == "Cannot parse response"
+        assert error.raw_response == raw_response
+        assert isinstance(error, VLLMJudgeError)
+
+    def test_parse_error_without_raw_response(self):
+        """Test ParseError without raw response."""
+        error = ParseError("Cannot parse response")
+        assert str(error) == "Cannot parse response"
+        assert error.raw_response is None
+
+    def test_metric_not_found_error(self):
+        """Test MetricNotFoundError."""
+        error = MetricNotFoundError("Metric 'unknown' not found")
+        assert str(error) == "Metric 'unknown' not found"
+        assert isinstance(error, VLLMJudgeError)
+
+    def test_invalid_input_error(self):
+        """Test InvalidInputError."""
+        error = InvalidInputError("Invalid input parameters")
+        assert str(error) == "Invalid input parameters"
+        assert isinstance(error, VLLMJudgeError)
+
+    def test_retry_exhausted_error(self):
+        """Test RetryExhaustedError."""
+        last_error = Exception("Last attempt failed")
+        error = RetryExhaustedError("All retries failed", last_error=last_error)
+
+        assert str(error) == "All retries failed"
+        assert error.last_error == last_error
+        assert isinstance(error, VLLMJudgeError)
+
+    def test_retry_exhausted_error_without_last_error(self):
+        """Test RetryExhaustedError without last error."""
+        error = RetryExhaustedError("All retries failed")
+        assert str(error) == "All retries failed"
+        assert error.last_error is None