vllm-judge 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- vllm_judge/__init__.py +6 -2
- vllm_judge/judge.py +36 -9
- vllm_judge/metrics.py +9 -1
- vllm_judge/models.py +10 -1
- vllm_judge/utils.py +14 -0
- {vllm_judge-0.1.0.dist-info → vllm_judge-0.1.2.dist-info}/METADATA +31 -19
- {vllm_judge-0.1.0.dist-info → vllm_judge-0.1.2.dist-info}/RECORD +10 -9
- {vllm_judge-0.1.0.dist-info → vllm_judge-0.1.2.dist-info}/WHEEL +1 -1
- {vllm_judge-0.1.0.dist-info → vllm_judge-0.1.2.dist-info}/entry_points.txt +0 -0
- {vllm_judge-0.1.0.dist-info → vllm_judge-0.1.2.dist-info}/top_level.txt +0 -0
vllm_judge/__init__.py
CHANGED
@@ -5,7 +5,7 @@ A lightweight library for evaluating text responses using self-hosted language models
 via vLLM's OpenAI-compatible API.
 """
 
-__version__ = "0.1.0"
+__version__ = "0.1.2"
 
 from vllm_judge.judge import Judge
 from vllm_judge.models import (
@@ -13,7 +13,8 @@ from vllm_judge.models import (
     EvaluationResult,
     Metric,
     BatchResult,
-    TemplateEngine
+    TemplateEngine,
+    ModelSpecificMetric
 )
 from vllm_judge.templating import TemplateProcessor
 from vllm_judge.metrics import (
@@ -27,6 +28,7 @@ from vllm_judge.metrics import (
     # Safety metrics
     SAFETY,
     TOXICITY,
+    LLAMA_GUARD_3_SAFETY,
 
     # Code metrics
     CODE_QUALITY,
@@ -81,6 +83,7 @@ __all__ = [
     "BatchResult",
     "TemplateEngine",
     "TemplateProcessor",
+    "ModelSpecificMetric",
 
     # Metrics
     "HELPFULNESS",
@@ -90,6 +93,7 @@ __all__ = [
     "RELEVANCE",
     "SAFETY",
     "TOXICITY",
+    "LLAMA_GUARD_3_SAFETY",
     "CODE_QUALITY",
     "CODE_SECURITY",
     "CREATIVITY",
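The net effect of these export changes is that both the new `ModelSpecificMetric` base class and the `LLAMA_GUARD_3_SAFETY` metric become importable from the package root. A quick sanity check of the new 0.1.2 surface, runnable without any server; the printed values reflect what metrics.py sets below:

```python
# Minimal check of the new 0.1.2 exports; no vLLM server needed.
from vllm_judge import ModelSpecificMetric, LLAMA_GUARD_3_SAFETY

print(LLAMA_GUARD_3_SAFETY.name)                              # llama_guard_3_safety
print(isinstance(LLAMA_GUARD_3_SAFETY, ModelSpecificMetric))  # True
```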
vllm_judge/judge.py
CHANGED
@@ -2,7 +2,7 @@ import json
 import re
 from typing import Union, Dict, List, Optional, Tuple, Any, Callable
 
-from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine
+from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine, ModelSpecificMetric
 from vllm_judge.client import VLLMClient
 from vllm_judge.prompts import PromptBuilder
 from vllm_judge.batch import BatchProcessor
@@ -14,6 +14,9 @@ from vllm_judge.exceptions import (
     MetricNotFoundError,
     VLLMJudgeError
 )
+import logging
+
+logger = logging.getLogger(__name__)
 
 
 class Judge:
@@ -96,6 +99,22 @@
             MetricNotFoundError: If metric name not found
             ParseError: If unable to parse model response
         """
+        # Handle model-specific metrics
+        if isinstance(metric, ModelSpecificMetric):
+            assert isinstance(response, str), "Model-specific metrics only support string content for now"
+
+            # logger.info(f"Evaluating model-specific metric {metric.name}.")
+            logger.info(f"We assume you're using {metric.model_pattern} type model. If not, please do not use this metric and use a normal metric instead.")
+            # Skip ALL our formatting
+            messages = [{"role": "user", "content": response}]
+
+            # vLLM applies model's chat template automatically
+            llm_response = await self._call_model(messages)
+
+            # Use metric's parser
+            return metric.parser_func(llm_response)
+
+        # Handle normal metrics
         # Handle metric parameter
         metric_template_vars = {}
 
@@ -149,14 +168,7 @@
         )
 
         # Get LLM response
-        try:
-            if self.config.use_chat_api:
-                llm_response = await self.client.chat_completion(messages)
-            else:
-                prompt = PromptBuilder.format_messages_as_text(messages)
-                llm_response = await self.client.completion(prompt)
-        except Exception as e:
-            raise VLLMJudgeError(f"Failed to get model response: {e}")
+        llm_response = await self._call_model(messages)
 
         # Parse response
         result = self._parse_response(llm_response)
@@ -168,6 +180,21 @@
 
         return result
 
+    async def _call_model(self, messages: List[Dict[str, str]]) -> str:
+        """
+        Call the model with the given messages.
+        """
+        try:
+            if self.config.use_chat_api:
+                llm_response = await self.client.chat_completion(messages)
+            else:
+                prompt = PromptBuilder.format_messages_as_text(messages)
+                llm_response = await self.client.completion(prompt)
+            return llm_response
+        except Exception as e:
+            raise VLLMJudgeError(f"Failed to get model response: {e}")
+
+
     def _parse_response(self, response: str) -> EvaluationResult:
         """
         Parse LLM response into EvaluationResult.
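In short, `evaluate()` now short-circuits for any `ModelSpecificMetric`: the response string is sent untouched as a single user message, vLLM applies the model's own chat template, and the metric's `parser_func` interprets the raw reply. A sketch of calling this fast path, assuming a vLLM server at a placeholder URL is actually serving a Llama Guard 3 model:

```python
import asyncio

from vllm_judge import Judge, LLAMA_GUARD_3_SAFETY

async def main() -> None:
    # Placeholder URL; point it at the vLLM server hosting Llama Guard 3.
    judge = Judge.from_url("http://localhost:8000")

    # Because LLAMA_GUARD_3_SAFETY is a ModelSpecificMetric, evaluate()
    # skips vllm-judge's prompt formatting entirely and relies on the
    # metric's parser_func to build the EvaluationResult.
    result = await judge.evaluate(
        response="How do I reset my router to factory settings?",
        metric=LLAMA_GUARD_3_SAFETY,
    )
    print(result.decision, result.reasoning)

asyncio.run(main())
```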
vllm_judge/metrics.py
CHANGED
@@ -1,5 +1,6 @@
 from typing import Dict
-from vllm_judge.models import Metric,TemplateEngine
+from vllm_judge.models import Metric, TemplateEngine, ModelSpecificMetric
+from vllm_judge.utils import parse_llama_guard_3
 
 # Registry for built-in metrics
 BUILTIN_METRICS: Dict[str, Metric] = {}
@@ -11,6 +12,13 @@ def create_builtin_metric(metric: Metric) -> Metric:
     return metric
 
 
+# Llama Guard 3 safety metric
+LLAMA_GUARD_3_SAFETY = create_builtin_metric(ModelSpecificMetric(
+    name="llama_guard_3_safety",
+    model_pattern="llama_guard_3",
+    parser_func=parse_llama_guard_3
+))
+
 # General purpose metrics
 HELPFULNESS = create_builtin_metric(Metric(
     name="helpfulness",
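Assuming `create_builtin_metric()` registers each metric in `BUILTIN_METRICS` under its `name` (the registry dict strongly suggests this, but the function body is elided in the diff), the new metric is discoverable alongside the existing built-ins:

```python
from vllm_judge.metrics import BUILTIN_METRICS, LLAMA_GUARD_3_SAFETY

# Assumes create_builtin_metric() keys the registry by metric.name,
# which the elided function body above does not show explicitly.
assert BUILTIN_METRICS["llama_guard_3_safety"] is LLAMA_GUARD_3_SAFETY
print(len(BUILTIN_METRICS), "built-in metrics registered")
```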
vllm_judge/models.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional, Any, Dict, Union, List, Tuple
+from typing import Optional, Any, Dict, Union, List, Tuple, Callable
 from pydantic import BaseModel, Field, field_validator, ConfigDict
 from enum import Enum
 
@@ -159,6 +159,15 @@ class Metric:
     def __repr__(self):
         return f"Metric(name='{self.name}', criteria='{self.criteria}', template_engine='{self.template_engine}')"
 
+# Base class for model-specific metrics
+class ModelSpecificMetric(Metric):
+    """Metric that bypasses our prompt formatting."""
+
+    def __init__(self, name: str, model_pattern: str, parser_func: Callable[[str], EvaluationResult]):
+        super().__init__(name=name, criteria="model-specific evaluation")
+        self.model_pattern = model_pattern
+        self.parser_func = parser_func
+        # self.is_model_specific = True  # Flag for special handling
 
 class BatchResult(BaseModel):
     """Result of batch evaluation."""
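Because `ModelSpecificMetric` subclasses `Metric`, it can be passed anywhere a `Metric` is accepted; the `isinstance` check added to `Judge.evaluate()` is what routes it down the fast path. A hypothetical custom metric built on the same pattern (the name, model pattern, and parser below are illustrative, not part of the package):

```python
from vllm_judge.models import EvaluationResult, ModelSpecificMetric

def parse_yes_no(response: str) -> EvaluationResult:
    """Map a bare yes/no completion onto a pass/fail decision."""
    is_yes = response.strip().lower().startswith("yes")
    return EvaluationResult(
        decision="pass" if is_yes else "fail",
        reasoning=response.strip(),
        score=None,
        metadata={"model_type": "yes_no_classifier"},  # hypothetical
    )

GUARDRAIL_YES_NO = ModelSpecificMetric(
    name="guardrail_yes_no",            # hypothetical
    model_pattern="yes_no_classifier",  # hypothetical
    parser_func=parse_yes_no,
)
print(GUARDRAIL_YES_NO.name, GUARDRAIL_YES_NO.model_pattern)
```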
vllm_judge/utils.py
ADDED
@@ -0,0 +1,14 @@
+from vllm_judge.models import EvaluationResult
+
+# Llama Guard 3 parser
+def parse_llama_guard_3(response: str) -> EvaluationResult:
+    """Parse Llama Guard 3's 'safe/unsafe' format."""
+    lines = response.strip().split('\n')
+    is_safe = lines[0].lower().strip() == 'safe'
+
+    return EvaluationResult(
+        decision="safe" if is_safe else "unsafe",
+        reasoning=lines[1] if len(lines) > 1 else "No violations detected",
+        score=None,
+        metadata={"model_type": "llama_guard_3"}
+    )
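The parser treats the first line of the model's reply as the verdict and the second line, when present, as the violated category codes. It can be exercised directly with no server involved; the sample strings below mirror the Llama Guard 3 output shape shown in the README example further down:

```python
from vllm_judge.utils import parse_llama_guard_3

# Llama Guard 3 replies "safe" on its own line, or "unsafe" followed
# by a newline and the violated category codes (e.g. "S9").
print(parse_llama_guard_3("safe").decision)   # safe
result = parse_llama_guard_3("unsafe\nS9")
print(result.decision, result.reasoning)      # unsafe S9
print(result.metadata)                        # {'model_type': 'llama_guard_3'}
```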
{vllm_judge-0.1.0.dist-info → vllm_judge-0.1.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.1
 Name: vllm_judge
-Version: 0.1.0
+Version: 0.1.2
 Summary: LLM-as-a-Judge evaluations for vLLM hosted models
 Author: TrustyAI team
 Author-email: Sai Chandra Pandraju <saichandrapandraju@gmail.com>
@@ -18,53 +18,56 @@ Provides-Extra: api
 Requires-Dist: fastapi>=0.100.0; extra == "api"
 Requires-Dist: uvicorn[standard]>=0.22.0; extra == "api"
 Requires-Dist: websockets>=11.0; extra == "api"
-Provides-Extra: jinja2
-Requires-Dist: jinja2>=3.0.0; extra == "jinja2"
 Provides-Extra: dev
-Requires-Dist: pytest>=7.0.0; extra == "dev"
-Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
-Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+Requires-Dist: vllm-judge[api,docs,jinja2,test]; extra == "dev"
 Requires-Dist: black>=23.0.0; extra == "dev"
 Requires-Dist: isort>=5.12.0; extra == "dev"
 Requires-Dist: flake8>=6.0.0; extra == "dev"
 Requires-Dist: mypy>=1.0.0; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: mkdocs>=1.5.0; extra == "docs"
+Requires-Dist: mkdocs-material>=9.0.0; extra == "docs"
+Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "docs"
+Requires-Dist: mkdocs-material-extensions>=1.3.1; extra == "docs"
+Provides-Extra: jinja2
+Requires-Dist: jinja2>=3.0.0; extra == "jinja2"
 Provides-Extra: test
 Requires-Dist: pytest>=7.0.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
 Requires-Dist: pytest-cov>=4.0.0; extra == "test"
 Requires-Dist: pytest-mock>=3.10.0; extra == "test"
-
-
-
-Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "docs"
+
+[
+](https://pypi.org/project/vllm-judge/)
 
 # vLLM Judge
 
-A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models.
+A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. Evaluate LLM inputs & outputs at scale with just a few lines of code. From simple scoring to complex safety checks, vLLM Judge adapts to your needs. Please refer to the [documentation](https://saichandrapandraju.github.io/vllm_judge/) for usage details.
 
 ## Features
 
 - 🚀 **Simple Interface**: Single `evaluate()` method that adapts to any use case
 - 🎯 **Pre-built Metrics**: 20+ ready-to-use evaluation metrics
+- 🛡️ **Model-Specific Support**: Seamlessly works with specialized models like Llama Guard without breaking their trained formats
+- 🔄 **Async Native**: Built for high-throughput evaluations
 - 🔧 **Template Support**: Dynamic evaluations with template variables
 - ⚡ **High Performance**: Optimized for vLLM with automatic batching
 - 🌐 **API Mode**: Run as a REST API service
-- 🔄 **Async Native**: Built for high-throughput evaluations
 
 ## Installation
 
 ```bash
 # Basic installation
-pip install vllm_judge
+pip install vllm-judge
 
 # With API support
-pip install vllm_judge[api]
+pip install vllm-judge[api]
 
 # With Jinja2 template support
-pip install vllm_judge[jinja2]
+pip install vllm-judge[jinja2]
 
 # Everything
-pip install vllm_judge[api,jinja2]
+pip install vllm-judge[dev]
 ```
 
 ## Quick Start
@@ -73,7 +76,7 @@ pip install vllm_judge[api,jinja2]
 from vllm_judge import Judge
 
 # Initialize with vLLM url
-judge =
+judge = Judge.from_url("http://vllm-server:8000")
 
 # Simple evaluation
 result = await judge.evaluate(
@@ -100,6 +103,15 @@
         "audience": "high school students"
     }
 )
+
+# Works with specialized safety models out-of-the-box
+from vllm_judge import LLAMA_GUARD_3_SAFETY
+
+result = await judge.evaluate(
+    response="How do I make a bomb?",
+    metric=LLAMA_GUARD_3_SAFETY  # Automatically uses Llama Guard format
+)
+# Result: decision="unsafe", reasoning="S9"
 ```
 
 ## API Server
@@ -107,7 +119,7 @@
 Run Judge as a REST API:
 
 ```bash
-vllm-judge serve --base-url http://
+vllm-judge serve --base-url http://vllm-server:8000 --port 9090
 ```
 
 Then use the HTTP API:
{vllm_judge-0.1.0.dist-info → vllm_judge-0.1.2.dist-info}/RECORD
CHANGED
@@ -1,19 +1,20 @@
-vllm_judge/__init__.py,sha256=
+vllm_judge/__init__.py,sha256=TcPeBC1yv3oDT5c8NvikyOL9cZyDZRnHD2Aeu0ynGuo,2469
 vllm_judge/batch.py,sha256=68jKgRTMzZXw4bxAiGp73NZzHOd1tKK763nBNjrr6gg,4842
 vllm_judge/cli.py,sha256=KQtUt_L4u5TPrS8xoyiKYt_hQ_FiHtGcrkecGEtktI8,10685
 vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
 vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
-vllm_judge/judge.py,sha256=
-vllm_judge/metrics.py,sha256=
-vllm_judge/models.py,sha256=
+vllm_judge/judge.py,sha256=Wn1ez1HJKb2U0Fu-kcIo7Ls3-ph7hVtb6K5Rlk0NfGw,15225
+vllm_judge/metrics.py,sha256=lQOBaHqlX79L8yP9_YYd-dTaqvfOPo0nDMY0dtsnKvI,15960
+vllm_judge/models.py,sha256=aEXZmP2sM-9aetstzHE3ngZwvCcvnrqzcj-8oV0NCJA,7889
 vllm_judge/prompts.py,sha256=jAsBdshCCdgGF3UUAM0Wbb6MN1AB2jgHh1NmtXLbyrc,6345
 vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
+vllm_judge/utils.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
 vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
 vllm_judge/api/client.py,sha256=mcpdH-9ko6aEh_JAybpPPVhHqlO3l5K-lTujTlkTw8c,11302
 vllm_judge/api/models.py,sha256=tPEePecZbKb9ZbjwusdJwhLiBK9Rd5xqiOqjklDKJ9s,4781
 vllm_judge/api/server.py,sha256=mbQ45YC0RYGONdy1oIcRIxUvByLtKXXrrMTpE9l2y1w,17818
-vllm_judge-0.1.0.dist-info/METADATA,sha256=
-vllm_judge-0.1.0.dist-info/WHEEL,sha256=
-vllm_judge-0.1.0.dist-info/entry_points.txt,sha256=
-vllm_judge-0.1.0.dist-info/top_level.txt,sha256=
-vllm_judge-0.1.0.dist-info/RECORD,,
+vllm_judge-0.1.2.dist-info/METADATA,sha256=DtXmkJ_sIXp49PuIL3CZJzkPRHsR8zhhwMaJFm6bUYg,4307
+vllm_judge-0.1.2.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+vllm_judge-0.1.2.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
+vllm_judge-0.1.2.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
+vllm_judge-0.1.2.dist-info/RECORD,,
{vllm_judge-0.1.0.dist-info → vllm_judge-0.1.2.dist-info}/entry_points.txt
File without changes
{vllm_judge-0.1.0.dist-info → vllm_judge-0.1.2.dist-info}/top_level.txt
File without changes