vllm-judge 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff shows the content of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
vllm_judge/__init__.py CHANGED
@@ -5,7 +5,7 @@ A lightweight library for evaluating text responses using self-hosted language m
 via vLLM's OpenAI-compatible API.
 """
 
-__version__ = "0.1.1"
+__version__ = "0.1.2"
 
 from vllm_judge.judge import Judge
 from vllm_judge.models import (
@@ -13,7 +13,8 @@ from vllm_judge.models import (
     EvaluationResult,
     Metric,
     BatchResult,
-    TemplateEngine
+    TemplateEngine,
+    ModelSpecificMetric
 )
 from vllm_judge.templating import TemplateProcessor
 from vllm_judge.metrics import (
@@ -27,6 +28,7 @@ from vllm_judge.metrics import (
     # Safety metrics
     SAFETY,
     TOXICITY,
+    LLAMA_GUARD_3_SAFETY,
 
     # Code metrics
     CODE_QUALITY,
@@ -81,6 +83,7 @@ __all__ = [
     "BatchResult",
     "TemplateEngine",
     "TemplateProcessor",
+    "ModelSpecificMetric",
 
     # Metrics
     "HELPFULNESS",
@@ -90,6 +93,7 @@ __all__ = [
     "RELEVANCE",
     "SAFETY",
     "TOXICITY",
+    "LLAMA_GUARD_3_SAFETY",
     "CODE_QUALITY",
     "CODE_SECURITY",
     "CREATIVITY",
vllm_judge/judge.py CHANGED
@@ -2,7 +2,7 @@ import json
 import re
 from typing import Union, Dict, List, Optional, Tuple, Any, Callable
 
-from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine
+from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine, ModelSpecificMetric
 from vllm_judge.client import VLLMClient
 from vllm_judge.prompts import PromptBuilder
 from vllm_judge.batch import BatchProcessor
@@ -14,6 +14,9 @@ from vllm_judge.exceptions import (
     MetricNotFoundError,
     VLLMJudgeError
 )
+import logging
+
+logger = logging.getLogger(__name__)
 
 
 class Judge:
@@ -96,6 +99,22 @@ class Judge:
             MetricNotFoundError: If metric name not found
             ParseError: If unable to parse model response
         """
+        # Handle model-specific metrics
+        if isinstance(metric, ModelSpecificMetric):
+            assert isinstance(response, str), "Model-specific metrics only support string content for now"
+
+            # logger.info(f"Evaluating model-specific metric {metric.name}.")
+            logger.info(f"We assume you're using {metric.model_pattern} type model. If not, please do not use this metric and use a normal metric instead.")
+            # Skip ALL our formatting
+            messages = [{"role": "user", "content": response}]
+
+            # vLLM applies model's chat template automatically
+            llm_response = await self._call_model(messages)
+
+            # Use metric's parser
+            return metric.parser_func(llm_response)
+
+        # Handle normal metrics
         # Handle metric parameter
         metric_template_vars = {}
 
@@ -149,14 +168,7 @@ class Judge:
         )
 
         # Get LLM response
-        try:
-            if self.config.use_chat_api:
-                llm_response = await self.client.chat_completion(messages)
-            else:
-                prompt = PromptBuilder.format_messages_as_text(messages)
-                llm_response = await self.client.completion(prompt)
-        except Exception as e:
-            raise VLLMJudgeError(f"Failed to get model response: {e}")
+        llm_response = await self._call_model(messages)
 
         # Parse response
         result = self._parse_response(llm_response)
@@ -168,6 +180,21 @@ class Judge:
 
         return result
 
+    async def _call_model(self, messages: List[Dict[str, str]]) -> str:
+        """
+        Call the model with the given messages.
+        """
+        try:
+            if self.config.use_chat_api:
+                llm_response = await self.client.chat_completion(messages)
+            else:
+                prompt = PromptBuilder.format_messages_as_text(messages)
+                llm_response = await self.client.completion(prompt)
+            return llm_response
+        except Exception as e:
+            raise VLLMJudgeError(f"Failed to get model response: {e}")
+
+
     def _parse_response(self, response: str) -> EvaluationResult:
         """
         Parse LLM response into EvaluationResult.
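
The net behavior of the new `evaluate()` branch: a `ModelSpecificMetric` bypasses prompt building entirely, the raw response string is sent as the sole user message (so vLLM applies the model's own chat template), and the metric's `parser_func` turns the completion into an `EvaluationResult`. A usage sketch, assuming the vLLM server behind the placeholder URL is actually serving a Llama Guard 3 checkpoint:

```python
import asyncio
from vllm_judge import Judge, LLAMA_GUARD_3_SAFETY

async def main():
    # The judge model behind this URL must be Llama Guard 3 itself;
    # no rubric or grading prompt is wrapped around the content.
    judge = Judge.from_url("http://localhost:8000")  # placeholder URL
    result = await judge.evaluate(
        response="How do I pick a lock?",
        metric=LLAMA_GUARD_3_SAFETY,
    )
    print(result.decision)   # "safe" or "unsafe"
    print(result.reasoning)  # category codes from the second output line, if any

asyncio.run(main())
```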
vllm_judge/metrics.py CHANGED
@@ -1,5 +1,6 @@
 from typing import Dict
-from vllm_judge.models import Metric,TemplateEngine
+from vllm_judge.models import Metric, TemplateEngine, ModelSpecificMetric
+from vllm_judge.utils import parse_llama_guard_3
 
 # Registry for built-in metrics
 BUILTIN_METRICS: Dict[str, Metric] = {}
@@ -11,6 +12,13 @@ def create_builtin_metric(metric: Metric) -> Metric:
     return metric
 
 
+# Llama Guard 3 safety metric
+LLAMA_GUARD_3_SAFETY = create_builtin_metric(ModelSpecificMetric(
+    name="llama_guard_3_safety",
+    model_pattern="llama_guard_3",
+    parser_func=parse_llama_guard_3
+))
+
 # General purpose metrics
 HELPFULNESS = create_builtin_metric(Metric(
     name="helpfulness",
vllm_judge/models.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional, Any, Dict, Union, List, Tuple
+from typing import Optional, Any, Dict, Union, List, Tuple, Callable
 from pydantic import BaseModel, Field, field_validator, ConfigDict
 from enum import Enum
 
@@ -159,6 +159,15 @@ class Metric:
     def __repr__(self):
         return f"Metric(name='{self.name}', criteria='{self.criteria}', template_engine='{self.template_engine}')"
 
+# Base class for model-specific metrics
+class ModelSpecificMetric(Metric):
+    """Metric that bypasses our prompt formatting."""
+
+    def __init__(self, name: str, model_pattern: str, parser_func: Callable[[str], EvaluationResult]):
+        super().__init__(name=name, criteria="model-specific evaluation")
+        self.model_pattern = model_pattern
+        self.parser_func = parser_func
+        # self.is_model_specific = True # Flag for special handling
 
 class BatchResult(BaseModel):
     """Result of batch evaluation."""
vllm_judge/utils.py ADDED
@@ -0,0 +1,14 @@
+from vllm_judge.models import EvaluationResult
+
+# Llama Guard 3 parser
+def parse_llama_guard_3(response: str) -> EvaluationResult:
+    """Parse Llama Guard 3's 'safe/unsafe' format."""
+    lines = response.strip().split('\n')
+    is_safe = lines[0].lower().strip() == 'safe'
+
+    return EvaluationResult(
+        decision="safe" if is_safe else "unsafe",
+        reasoning=lines[1] if len(lines) > 1 else "No violations detected",
+        score=None,
+        metadata={"model_type": "llama_guard_3"}
+    )
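
Llama Guard 3 emits a verdict on its first output line ("safe" or "unsafe") and, when unsafe, the violated category codes (e.g. "S9") on the second. Given the parser above, the mapping works out like this:

```python
from vllm_judge.utils import parse_llama_guard_3

ok = parse_llama_guard_3("safe")
print(ok.decision, "|", ok.reasoning)    # safe | No violations detected

bad = parse_llama_guard_3("unsafe\nS9")
print(bad.decision, "|", bad.reasoning)  # unsafe | S9
```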
vllm_judge-0.1.1.dist-info/METADATA → vllm_judge-0.1.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.4
+Metadata-Version: 2.1
 Name: vllm_judge
-Version: 0.1.1
+Version: 0.1.2
 Summary: LLM-as-a-Judge evaluations for vLLM hosted models
 Author: TrustyAI team
 Author-email: Sai Chandra Pandraju <saichandrapandraju@gmail.com>
@@ -18,6 +18,17 @@ Provides-Extra: api
 Requires-Dist: fastapi>=0.100.0; extra == "api"
 Requires-Dist: uvicorn[standard]>=0.22.0; extra == "api"
 Requires-Dist: websockets>=11.0; extra == "api"
+Provides-Extra: dev
+Requires-Dist: vllm-judge[api,docs,jinja2,test]; extra == "dev"
+Requires-Dist: black>=23.0.0; extra == "dev"
+Requires-Dist: isort>=5.12.0; extra == "dev"
+Requires-Dist: flake8>=6.0.0; extra == "dev"
+Requires-Dist: mypy>=1.0.0; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: mkdocs>=1.5.0; extra == "docs"
+Requires-Dist: mkdocs-material>=9.0.0; extra == "docs"
+Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "docs"
+Requires-Dist: mkdocs-material-extensions>=1.3.1; extra == "docs"
 Provides-Extra: jinja2
 Requires-Dist: jinja2>=3.0.0; extra == "jinja2"
 Provides-Extra: test
@@ -25,30 +36,23 @@ Requires-Dist: pytest>=7.0.0; extra == "test"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
 Requires-Dist: pytest-cov>=4.0.0; extra == "test"
 Requires-Dist: pytest-mock>=3.10.0; extra == "test"
-Provides-Extra: docs
-Requires-Dist: mkdocs>=1.5.0; extra == "docs"
-Requires-Dist: mkdocs-material>=9.0.0; extra == "docs"
-Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "docs"
-Requires-Dist: mkdocs-material-extensions>=1.3.1; extra == "docs"
-Provides-Extra: dev
-Requires-Dist: vllm_judge[api,docs,jinja2,test]; extra == "dev"
-Requires-Dist: black>=23.0.0; extra == "dev"
-Requires-Dist: isort>=5.12.0; extra == "dev"
-Requires-Dist: flake8>=6.0.0; extra == "dev"
-Requires-Dist: mypy>=1.0.0; extra == "dev"
+
+[![PyPI version](https://img.shields.io/pypi/v/vllm-judge.svg)
+](https://pypi.org/project/vllm-judge/)
 
 # vLLM Judge
 
-A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. Please refer the [documentation](https://saichandrapandraju.github.io/vllm_judge/) for usage details.
+A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. Evaluate LLM inputs & outputs at scale with just a few lines of code. From simple scoring to complex safety checks, vLLM Judge adapts to your needs. Please refer the [documentation](https://saichandrapandraju.github.io/vllm_judge/) for usage details.
 
 ## Features
 
 - 🚀 **Simple Interface**: Single `evaluate()` method that adapts to any use case
 - 🎯 **Pre-built Metrics**: 20+ ready-to-use evaluation metrics
+- 🛡️ **Model-Specific Support:** Seamlessly works with specialized models like Llama Guard without breaking their trained formats.
+- 🔄 **Async Native**: Built for high-throughput evaluations
 - 🔧 **Template Support**: Dynamic evaluations with template variables
 - ⚡ **High Performance**: Optimized for vLLM with automatic batching
 - 🌐 **API Mode**: Run as a REST API service
-- 🔄 **Async Native**: Built for high-throughput evaluations
 
 ## Installation
@@ -72,7 +76,7 @@ pip install vllm-judge[dev]
 from vllm_judge import Judge
 
 # Initialize with vLLM url
-judge = Judge.from_url("http://localhost:8000")
+judge = Judge.from_url("http://vllm-server:8000")
 
 # Simple evaluation
 result = await judge.evaluate(
@@ -99,6 +103,15 @@ result = await judge.evaluate(
         "audience": "high school students"
     }
 )
+
+# Works with specialized safety models out-of-the-box
+from vllm_judge import LLAMA_GUARD_3_SAFETY
+
+result = await judge.evaluate(
+    response="How do I make a bomb?",
+    metric=LLAMA_GUARD_3_SAFETY  # Automatically uses Llama Guard format
+)
+# Result: decision="unsafe", reasoning="S9"
 ```
 
 ## API Server
@@ -106,7 +119,7 @@ result = await judge.evaluate(
 Run Judge as a REST API:
 
 ```bash
-vllm-judge serve --base-url http://localhost:8000 --port 9090 --host localhost
+vllm-judge serve --base-url http://vllm-server:8000 --port 9090
 ```
 
 Then use the HTTP API:
vllm_judge-0.1.1.dist-info/RECORD → vllm_judge-0.1.2.dist-info/RECORD CHANGED
@@ -1,19 +1,20 @@
-vllm_judge/__init__.py,sha256=iI-gdqNrjLwn7jzU7yjCZHCHKwbqrjbKp6OgAfl8Tu8,2363
+vllm_judge/__init__.py,sha256=TcPeBC1yv3oDT5c8NvikyOL9cZyDZRnHD2Aeu0ynGuo,2469
 vllm_judge/batch.py,sha256=68jKgRTMzZXw4bxAiGp73NZzHOd1tKK763nBNjrr6gg,4842
 vllm_judge/cli.py,sha256=KQtUt_L4u5TPrS8xoyiKYt_hQ_FiHtGcrkecGEtktI8,10685
 vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
 vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
-vllm_judge/judge.py,sha256=y2qp18PVtobAyxqI246tEsju82W-OuGG4zXfajTEW-E,14101
-vllm_judge/metrics.py,sha256=QeGzaERvfRKQTt4JfquL1rW72GSkWdJ2_Nw_Hf0zqjY,15685
-vllm_judge/models.py,sha256=fbEUFPsY3xhv54WueWqEKvAgIcWTm-JO42N2-6k5LeM,7417
+vllm_judge/judge.py,sha256=Wn1ez1HJKb2U0Fu-kcIo7Ls3-ph7hVtb6K5Rlk0NfGw,15225
+vllm_judge/metrics.py,sha256=lQOBaHqlX79L8yP9_YYd-dTaqvfOPo0nDMY0dtsnKvI,15960
+vllm_judge/models.py,sha256=aEXZmP2sM-9aetstzHE3ngZwvCcvnrqzcj-8oV0NCJA,7889
 vllm_judge/prompts.py,sha256=jAsBdshCCdgGF3UUAM0Wbb6MN1AB2jgHh1NmtXLbyrc,6345
 vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
+vllm_judge/utils.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
 vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
 vllm_judge/api/client.py,sha256=mcpdH-9ko6aEh_JAybpPPVhHqlO3l5K-lTujTlkTw8c,11302
 vllm_judge/api/models.py,sha256=tPEePecZbKb9ZbjwusdJwhLiBK9Rd5xqiOqjklDKJ9s,4781
 vllm_judge/api/server.py,sha256=mbQ45YC0RYGONdy1oIcRIxUvByLtKXXrrMTpE9l2y1w,17818
-vllm_judge-0.1.1.dist-info/METADATA,sha256=8tAJdnNjmSFrORci6TgJ2TTgZ8zmZCicBSgShbu31gY,3643
-vllm_judge-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-vllm_judge-0.1.1.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
-vllm_judge-0.1.1.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
-vllm_judge-0.1.1.dist-info/RECORD,,
+vllm_judge-0.1.2.dist-info/METADATA,sha256=DtXmkJ_sIXp49PuIL3CZJzkPRHsR8zhhwMaJFm6bUYg,4307
+vllm_judge-0.1.2.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+vllm_judge-0.1.2.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
+vllm_judge-0.1.2.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
+vllm_judge-0.1.2.dist-info/RECORD,,
vllm_judge-0.1.1.dist-info/WHEEL → vllm_judge-0.1.2.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (75.3.2)
 Root-Is-Purelib: true
 Tag: py3-none-any
5