vllm-judge 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vllm_judge/__init__.py CHANGED
@@ -5,7 +5,7 @@ A lightweight library for evaluating text responses using self-hosted language m
 via vLLM's OpenAI-compatible API.
 """
 
-__version__ = "0.1.1"
+__version__ = "0.1.3"
 
 from vllm_judge.judge import Judge
 from vllm_judge.models import (
@@ -13,7 +13,8 @@ from vllm_judge.models import (
     EvaluationResult,
     Metric,
     BatchResult,
-    TemplateEngine
+    TemplateEngine,
+    ModelSpecificMetric
 )
 from vllm_judge.templating import TemplateProcessor
 from vllm_judge.metrics import (
@@ -27,6 +28,7 @@ from vllm_judge.metrics import (
     # Safety metrics
     SAFETY,
     TOXICITY,
+    LLAMA_GUARD_3_SAFETY,
 
     # Code metrics
     CODE_QUALITY,
@@ -81,6 +83,7 @@ __all__ = [
     "BatchResult",
     "TemplateEngine",
     "TemplateProcessor",
+    "ModelSpecificMetric",
 
     # Metrics
     "HELPFULNESS",
@@ -90,6 +93,7 @@ __all__ = [
     "RELEVANCE",
     "SAFETY",
     "TOXICITY",
+    "LLAMA_GUARD_3_SAFETY",
     "CODE_QUALITY",
     "CODE_SECURITY",
     "CREATIVITY",
vllm_judge/api/client.py CHANGED
@@ -65,7 +65,7 @@ class JudgeClient:
 
     async def evaluate(
         self,
-        response: Union[str, Dict[str, str]],
+        content: Union[str, Dict[str, str]],
         criteria: str = None,
         rubric: Union[str, Dict[Union[int, float], str]] = None,
         scale: Optional[Tuple[int, int]] = None,
@@ -87,7 +87,7 @@ class JudgeClient:
             EvaluationResult
         """
         request = EvaluateRequest(
-            response=response,
+            response=content,
             criteria=criteria,
             rubric=rubric,
             scale=list(scale) if scale else None,
vllm_judge/cli.py CHANGED
@@ -75,7 +75,7 @@ def evaluate(
         # Use API client
         async with JudgeClient(api_url) as client:
             result = await client.evaluate(
-                response=response,
+                content=response,
                 criteria=criteria,
                 metric=metric,
                 scale=scale,
@@ -91,7 +91,7 @@ def evaluate(
         judge = Judge.from_url(base_url, model=model)
         async with judge:
             result = await judge.evaluate(
-                response=response,
+                content=response,
                 criteria=criteria,
                 metric=metric,
                 scale=scale,
vllm_judge/judge.py CHANGED
@@ -2,7 +2,7 @@ import json
 import re
 from typing import Union, Dict, List, Optional, Tuple, Any, Callable
 
-from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine
+from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine, ModelSpecificMetric
 from vllm_judge.client import VLLMClient
 from vllm_judge.prompts import PromptBuilder
 from vllm_judge.batch import BatchProcessor
@@ -14,6 +14,9 @@ from vllm_judge.exceptions import (
     MetricNotFoundError,
     VLLMJudgeError
 )
+import logging
+
+logger = logging.getLogger(__name__)
 
 
 class Judge:
@@ -60,7 +63,7 @@ class Judge:
 
     async def evaluate(
         self,
-        response: Union[str, Dict[str, str]],
+        content: Union[str, Dict[str, str]],
         criteria: str = None,
         rubric: Union[str, Dict[Union[int, float], str]] = None,
         scale: Optional[Tuple[int, int]] = None,
@@ -76,7 +79,7 @@ class Judge:
         Universal evaluation method that adapts to use case.
 
         Args:
-            response: String for single evaluation, dict {"a": ..., "b": ...} for comparison
+            content: String for single evaluation, dict {"a": ..., "b": ...} for comparison
             criteria: What to evaluate for (can contain template variables)
             rubric: Instructions for evaluation, can be string or dict containing mapping of score to description (can contain template variables)
             scale: Optional numeric scale (min, max)
@@ -96,6 +99,22 @@ class Judge:
             MetricNotFoundError: If metric name not found
             ParseError: If unable to parse model response
         """
+        # Handle model-specific metrics
+        if isinstance(metric, ModelSpecificMetric):
+            assert isinstance(content, str), "Model-specific metrics only support string content for now"
+
+            # logger.info(f"Evaluating model-specific metric {metric.name}.")
+            logger.info(f"We assume you're using {metric.model_pattern} type model. If not, please do not use this metric and use a normal metric instead.")
+            # Skip ALL our formatting
+            messages = [{"role": "user", "content": content}]
+
+            # vLLM applies model's chat template automatically
+            llm_response = await self._call_model(messages)
+
+            # Use metric's parser
+            return metric.parser_func(llm_response)
+
+        # Handle normal metrics
         # Handle metric parameter
         metric_template_vars = {}
 
@@ -138,7 +157,7 @@ class Judge:
 
         # Build messages
         messages = PromptBuilder.build_messages(
-            response=response,
+            response=content,
             criteria=criteria,
             rubric=rubric,
             scale=scale,
@@ -149,14 +168,7 @@ class Judge:
         )
 
         # Get LLM response
-        try:
-            if self.config.use_chat_api:
-                llm_response = await self.client.chat_completion(messages)
-            else:
-                prompt = PromptBuilder.format_messages_as_text(messages)
-                llm_response = await self.client.completion(prompt)
-        except Exception as e:
-            raise VLLMJudgeError(f"Failed to get model response: {e}")
+        llm_response = await self._call_model(messages)
 
         # Parse response
         result = self._parse_response(llm_response)
@@ -168,6 +180,21 @@ class Judge:
 
         return result
 
+    async def _call_model(self, messages: List[Dict[str, str]]) -> str:
+        """
+        Call the model with the given messages.
+        """
+        try:
+            if self.config.use_chat_api:
+                llm_response = await self.client.chat_completion(messages)
+            else:
+                prompt = PromptBuilder.format_messages_as_text(messages)
+                llm_response = await self.client.completion(prompt)
+            return llm_response
+        except Exception as e:
+            raise VLLMJudgeError(f"Failed to get model response: {e}")
+
+
     def _parse_response(self, response: str) -> EvaluationResult:
         """
         Parse LLM response into EvaluationResult.
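With this change, `evaluate()` short-circuits any `ModelSpecificMetric` before prompt building: the raw string is sent as a single user message (so vLLM applies the model's own chat template) and the metric's `parser_func` converts the raw reply into an `EvaluationResult`. A minimal usage sketch of that path, assuming a vLLM endpoint serving a Llama Guard 3 model (the URL is a placeholder):

```python
from vllm_judge import Judge, LLAMA_GUARD_3_SAFETY

async def moderate(text: str):
    # Placeholder URL; point this at a vLLM server hosting Llama Guard 3.
    judge = Judge.from_url("http://localhost:8000")
    async with judge:
        # Bypasses PromptBuilder entirely: `text` goes out as one user message
        # and parse_llama_guard_3 interprets the model's "safe"/"unsafe" reply.
        return await judge.evaluate(content=text, metric=LLAMA_GUARD_3_SAFETY)
```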
vllm_judge/metrics.py CHANGED
@@ -1,5 +1,6 @@
 from typing import Dict
-from vllm_judge.models import Metric,TemplateEngine
+from vllm_judge.models import Metric, TemplateEngine, ModelSpecificMetric
+from vllm_judge.utils import parse_llama_guard_3
 
 # Registry for built-in metrics
 BUILTIN_METRICS: Dict[str, Metric] = {}
@@ -11,6 +12,13 @@ def create_builtin_metric(metric: Metric) -> Metric:
     return metric
 
 
+# Llama Guard 3 safety metric
+LLAMA_GUARD_3_SAFETY = create_builtin_metric(ModelSpecificMetric(
+    name="llama_guard_3_safety",
+    model_pattern="llama_guard_3",
+    parser_func=parse_llama_guard_3
+))
+
 # General purpose metrics
 HELPFULNESS = create_builtin_metric(Metric(
     name="helpfulness",
vllm_judge/models.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional, Any, Dict, Union, List, Tuple
+from typing import Optional, Any, Dict, Union, List, Tuple, Callable
 from pydantic import BaseModel, Field, field_validator, ConfigDict
 from enum import Enum
 
@@ -159,6 +159,15 @@ class Metric:
     def __repr__(self):
         return f"Metric(name='{self.name}', criteria='{self.criteria}', template_engine='{self.template_engine}')"
 
+# Base class for model-specific metrics
+class ModelSpecificMetric(Metric):
+    """Metric that bypasses our prompt formatting."""
+
+    def __init__(self, name: str, model_pattern: str, parser_func: Callable[[str], EvaluationResult]):
+        super().__init__(name=name, criteria="model-specific evaluation")
+        self.model_pattern = model_pattern
+        self.parser_func = parser_func
+        # self.is_model_specific = True # Flag for special handling
 
 class BatchResult(BaseModel):
     """Result of batch evaluation."""
vllm_judge/utils.py ADDED
@@ -0,0 +1,14 @@
+from vllm_judge.models import EvaluationResult
+
+# Llama Guard 3 parser
+def parse_llama_guard_3(response: str) -> EvaluationResult:
+    """Parse Llama Guard 3's 'safe/unsafe' format."""
+    lines = response.strip().split('\n')
+    is_safe = lines[0].lower().strip() == 'safe'
+
+    return EvaluationResult(
+        decision="safe" if is_safe else "unsafe",
+        reasoning=lines[1] if len(lines) > 1 else "No violations detected",
+        score=None,
+        metadata={"model_type": "llama_guard_3"}
+    )
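For reference, Llama Guard 3 answers with a verdict on the first line and, for unsafe content, a violated-category code (e.g. S9) on the next line, which is exactly what the parser above splits apart. A quick illustration; the two raw strings are hypothetical model outputs:

```python
from vllm_judge.utils import parse_llama_guard_3

unsafe = parse_llama_guard_3("unsafe\nS9")  # decision="unsafe", reasoning="S9"
safe = parse_llama_guard_3("safe")          # decision="safe", reasoning="No violations detected"
print(unsafe.decision, unsafe.reasoning)
print(safe.decision, safe.reasoning)
```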
vllm_judge-0.1.1.dist-info/METADATA → vllm_judge-0.1.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.4
+Metadata-Version: 2.1
 Name: vllm_judge
-Version: 0.1.1
+Version: 0.1.3
 Summary: LLM-as-a-Judge evaluations for vLLM hosted models
 Author: TrustyAI team
 Author-email: Sai Chandra Pandraju <saichandrapandraju@gmail.com>
@@ -18,6 +18,17 @@ Provides-Extra: api
 Requires-Dist: fastapi>=0.100.0; extra == "api"
 Requires-Dist: uvicorn[standard]>=0.22.0; extra == "api"
 Requires-Dist: websockets>=11.0; extra == "api"
+Provides-Extra: dev
+Requires-Dist: vllm-judge[api,docs,jinja2,test]; extra == "dev"
+Requires-Dist: black>=23.0.0; extra == "dev"
+Requires-Dist: isort>=5.12.0; extra == "dev"
+Requires-Dist: flake8>=6.0.0; extra == "dev"
+Requires-Dist: mypy>=1.0.0; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: mkdocs>=1.5.0; extra == "docs"
+Requires-Dist: mkdocs-material>=9.0.0; extra == "docs"
+Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "docs"
+Requires-Dist: mkdocs-material-extensions>=1.3.1; extra == "docs"
 Provides-Extra: jinja2
 Requires-Dist: jinja2>=3.0.0; extra == "jinja2"
 Provides-Extra: test
@@ -25,30 +36,22 @@ Requires-Dist: pytest>=7.0.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
 Requires-Dist: pytest-cov>=4.0.0; extra == "test"
 Requires-Dist: pytest-mock>=3.10.0; extra == "test"
-Provides-Extra: docs
-Requires-Dist: mkdocs>=1.5.0; extra == "docs"
-Requires-Dist: mkdocs-material>=9.0.0; extra == "docs"
-Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "docs"
-Requires-Dist: mkdocs-material-extensions>=1.3.1; extra == "docs"
-Provides-Extra: dev
-Requires-Dist: vllm_judge[api,docs,jinja2,test]; extra == "dev"
-Requires-Dist: black>=23.0.0; extra == "dev"
-Requires-Dist: isort>=5.12.0; extra == "dev"
-Requires-Dist: flake8>=6.0.0; extra == "dev"
-Requires-Dist: mypy>=1.0.0; extra == "dev"
+
+[![PyPI version](https://img.shields.io/pypi/v/vllm-judge.svg)
+](https://pypi.org/project/vllm-judge/)
 
 # vLLM Judge
 
-A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. Please refer the [documentation](https://saichandrapandraju.github.io/vllm_judge/) for usage details.
+A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. Evaluate LLM inputs & outputs at scale with just a few lines of code. From simple scoring to complex safety checks, vLLM Judge adapts to your needs. Please refer the [documentation](https://saichandrapandraju.github.io/vllm_judge/) for usage details.
 
 ## Features
 
 - 🚀 **Simple Interface**: Single `evaluate()` method that adapts to any use case
 - 🎯 **Pre-built Metrics**: 20+ ready-to-use evaluation metrics
+- 🛡️ **Model-Specific Support:** Seamlessly works with specialized models like Llama Guard without breaking their trained formats.
+- ⚡ **High Performance**: Async-first design enables high-throughput evaluations
 - 🔧 **Template Support**: Dynamic evaluations with template variables
-- ⚡ **High Performance**: Optimized for vLLM with automatic batching
 - 🌐 **API Mode**: Run as a REST API service
-- 🔄 **Async Native**: Built for high-throughput evaluations
 
 ## Installation
 
@@ -72,11 +75,11 @@ pip install vllm-judge[dev]
 from vllm_judge import Judge
 
 # Initialize with vLLM url
-judge = Judge.from_url("http://localhost:8000")
+judge = Judge.from_url("http://vllm-server:8000")
 
 # Simple evaluation
 result = await judge.evaluate(
-    response="The Earth orbits around the Sun.",
+    content="The Earth orbits around the Sun.",
     criteria="scientific accuracy"
 )
 print(f"Decision: {result.decision}")
@@ -86,19 +89,28 @@ print(f"Reasoning: {result.reasoning}")
 from vllm_judge import CODE_QUALITY
 
 result = await judge.evaluate(
-    response="def add(a, b): return a + b",
+    content="def add(a, b): return a + b",
     metric=CODE_QUALITY
 )
 
 # With template variables
 result = await judge.evaluate(
-    response="Essay content here...",
+    content="Essay content here...",
     criteria="Evaluate this {doc_type} for {audience}",
     template_vars={
        "doc_type": "essay",
        "audience": "high school students"
    }
 )
+
+# Works with specialized safety models out-of-the-box
+from vllm_judge import LLAMA_GUARD_3_SAFETY
+
+result = await judge.evaluate(
+    content="How do I make a bomb?",
+    metric=LLAMA_GUARD_3_SAFETY # Automatically uses Llama Guard format
+)
+# Result: decision="unsafe", reasoning="S9"
 ```
 
 ## API Server
@@ -106,7 +118,7 @@ result = await judge.evaluate(
 Run Judge as a REST API:
 
 ```bash
-vllm-judge serve --base-url http://localhost:8000 --port 9090 --host localhost
+vllm-judge serve --base-url http://vllm-server:8000 --port 9090
 ```
 
 Then use the HTTP API:
@@ -116,7 +128,7 @@ from vllm_judge.api import JudgeClient
 
 client = JudgeClient("http://localhost:9090")
 result = await client.evaluate(
-    response="Python is great!",
+    content="Python is great!",
     criteria="technical accuracy"
 )
 ```
vllm_judge-0.1.3.dist-info/RECORD ADDED
@@ -0,0 +1,20 @@
+vllm_judge/__init__.py,sha256=TBS7fQ4n7QEVwNtr4ErJu-T3m4c-8BwW4zDltt8S6Ko,2469
+vllm_judge/batch.py,sha256=68jKgRTMzZXw4bxAiGp73NZzHOd1tKK763nBNjrr6gg,4842
+vllm_judge/cli.py,sha256=mdoxNA5gQ1m3XBnNJYCE8uoi0RxrS9d3YIlrtdxRcME,10683
+vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
+vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
+vllm_judge/judge.py,sha256=FKMpl6ubugHqKlR-W1-arr4J2rkwnC76QM5oAFv_HyM,15220
+vllm_judge/metrics.py,sha256=lQOBaHqlX79L8yP9_YYd-dTaqvfOPo0nDMY0dtsnKvI,15960
+vllm_judge/models.py,sha256=aEXZmP2sM-9aetstzHE3ngZwvCcvnrqzcj-8oV0NCJA,7889
+vllm_judge/prompts.py,sha256=jAsBdshCCdgGF3UUAM0Wbb6MN1AB2jgHh1NmtXLbyrc,6345
+vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
+vllm_judge/utils.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
+vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
+vllm_judge/api/client.py,sha256=XRiveUw1edcknxO3zLFkYX_YbOObipx7dMFeSUjMSwk,11300
+vllm_judge/api/models.py,sha256=tPEePecZbKb9ZbjwusdJwhLiBK9Rd5xqiOqjklDKJ9s,4781
+vllm_judge/api/server.py,sha256=mbQ45YC0RYGONdy1oIcRIxUvByLtKXXrrMTpE9l2y1w,17818
+vllm_judge-0.1.3.dist-info/METADATA,sha256=L_Kf2ic1W5wn1D1Y4amZaxO6E2i6bEKjZ4JFVvh3-YA,4251
+vllm_judge-0.1.3.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+vllm_judge-0.1.3.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
+vllm_judge-0.1.3.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
+vllm_judge-0.1.3.dist-info/RECORD,,
vllm_judge-0.1.1.dist-info/WHEEL → vllm_judge-0.1.3.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (75.3.2)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
vllm_judge-0.1.1.dist-info/RECORD DELETED
@@ -1,19 +0,0 @@
-vllm_judge/__init__.py,sha256=iI-gdqNrjLwn7jzU7yjCZHCHKwbqrjbKp6OgAfl8Tu8,2363
-vllm_judge/batch.py,sha256=68jKgRTMzZXw4bxAiGp73NZzHOd1tKK763nBNjrr6gg,4842
-vllm_judge/cli.py,sha256=KQtUt_L4u5TPrS8xoyiKYt_hQ_FiHtGcrkecGEtktI8,10685
-vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
-vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
-vllm_judge/judge.py,sha256=y2qp18PVtobAyxqI246tEsju82W-OuGG4zXfajTEW-E,14101
-vllm_judge/metrics.py,sha256=QeGzaERvfRKQTt4JfquL1rW72GSkWdJ2_Nw_Hf0zqjY,15685
-vllm_judge/models.py,sha256=fbEUFPsY3xhv54WueWqEKvAgIcWTm-JO42N2-6k5LeM,7417
-vllm_judge/prompts.py,sha256=jAsBdshCCdgGF3UUAM0Wbb6MN1AB2jgHh1NmtXLbyrc,6345
-vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
-vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
-vllm_judge/api/client.py,sha256=mcpdH-9ko6aEh_JAybpPPVhHqlO3l5K-lTujTlkTw8c,11302
-vllm_judge/api/models.py,sha256=tPEePecZbKb9ZbjwusdJwhLiBK9Rd5xqiOqjklDKJ9s,4781
-vllm_judge/api/server.py,sha256=mbQ45YC0RYGONdy1oIcRIxUvByLtKXXrrMTpE9l2y1w,17818
-vllm_judge-0.1.1.dist-info/METADATA,sha256=8tAJdnNjmSFrORci6TgJ2TTgZ8zmZCicBSgShbu31gY,3643
-vllm_judge-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-vllm_judge-0.1.1.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
-vllm_judge-0.1.1.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
-vllm_judge-0.1.1.dist-info/RECORD,,