vllm-judge 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/PKG-INFO +23 -11
  2. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/README.md +21 -9
  3. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/pyproject.toml +1 -1
  4. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/__init__.py +6 -2
  5. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/api/client.py +2 -2
  6. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/cli.py +2 -2
  7. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/judge.py +39 -12
  8. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/metrics.py +9 -1
  9. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/models.py +10 -1
  10. vllm_judge-0.1.3/src/vllm_judge/utils.py +14 -0
  11. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge.egg-info/PKG-INFO +23 -11
  12. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge.egg-info/SOURCES.txt +1 -0
  13. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/setup.cfg +0 -0
  14. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/api/__init__.py +0 -0
  15. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/api/models.py +0 -0
  16. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/api/server.py +0 -0
  17. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/batch.py +0 -0
  18. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/client.py +0 -0
  19. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/exceptions.py +0 -0
  20. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/prompts.py +0 -0
  21. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/templating.py +0 -0
  22. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge.egg-info/dependency_links.txt +0 -0
  23. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge.egg-info/entry_points.txt +0 -0
  24. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge.egg-info/requires.txt +0 -0
  25. {vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge.egg-info/top_level.txt +0 -0

{vllm_judge-0.1.1 → vllm_judge-0.1.3}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.4
+Metadata-Version: 2.1
 Name: vllm_judge
-Version: 0.1.1
+Version: 0.1.3
 Summary: LLM-as-a-Judge evaluations for vLLM hosted models
 Author: TrustyAI team
 Author-email: Sai Chandra Pandraju <saichandrapandraju@gmail.com>
@@ -37,18 +37,21 @@ Requires-Dist: isort>=5.12.0; extra == "dev"
 Requires-Dist: flake8>=6.0.0; extra == "dev"
 Requires-Dist: mypy>=1.0.0; extra == "dev"
 
+[![PyPI version](https://img.shields.io/pypi/v/vllm-judge.svg)
+](https://pypi.org/project/vllm-judge/)
+
 # vLLM Judge
 
-A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. Please refer the [documentation](https://saichandrapandraju.github.io/vllm_judge/) for usage details.
+A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. Evaluate LLM inputs & outputs at scale with just a few lines of code. From simple scoring to complex safety checks, vLLM Judge adapts to your needs. Please refer the [documentation](https://saichandrapandraju.github.io/vllm_judge/) for usage details.
 
 ## Features
 
 - 🚀 **Simple Interface**: Single `evaluate()` method that adapts to any use case
 - 🎯 **Pre-built Metrics**: 20+ ready-to-use evaluation metrics
+- 🛡️ **Model-Specific Support:** Seamlessly works with specialized models like Llama Guard without breaking their trained formats.
+- ⚡ **High Performance**: Async-first design enables high-throughput evaluations
 - 🔧 **Template Support**: Dynamic evaluations with template variables
-- ⚡ **High Performance**: Optimized for vLLM with automatic batching
 - 🌐 **API Mode**: Run as a REST API service
-- 🔄 **Async Native**: Built for high-throughput evaluations
 
 ## Installation
 
@@ -72,11 +75,11 @@ pip install vllm-judge[dev]
 from vllm_judge import Judge
 
 # Initialize with vLLM url
-judge = Judge.from_url("http://localhost:8000")
+judge = Judge.from_url("http://vllm-server:8000")
 
 # Simple evaluation
 result = await judge.evaluate(
-    response="The Earth orbits around the Sun.",
+    content="The Earth orbits around the Sun.",
     criteria="scientific accuracy"
 )
 print(f"Decision: {result.decision}")
@@ -86,19 +89,28 @@ print(f"Reasoning: {result.reasoning}")
 from vllm_judge import CODE_QUALITY
 
 result = await judge.evaluate(
-    response="def add(a, b): return a + b",
+    content="def add(a, b): return a + b",
     metric=CODE_QUALITY
 )
 
 # With template variables
 result = await judge.evaluate(
-    response="Essay content here...",
+    content="Essay content here...",
     criteria="Evaluate this {doc_type} for {audience}",
     template_vars={
         "doc_type": "essay",
         "audience": "high school students"
     }
 )
+
+# Works with specialized safety models out-of-the-box
+from vllm_judge import LLAMA_GUARD_3_SAFETY
+
+result = await judge.evaluate(
+    content="How do I make a bomb?",
+    metric=LLAMA_GUARD_3_SAFETY  # Automatically uses Llama Guard format
+)
+# Result: decision="unsafe", reasoning="S9"
 ```
 
 ## API Server
@@ -106,7 +118,7 @@ result = await judge.evaluate(
 Run Judge as a REST API:
 
 ```bash
-vllm-judge serve --base-url http://localhost:8000 --port 9090 --host localhost
+vllm-judge serve --base-url http://vllm-server:8000 --port 9090
 ```
 
 Then use the HTTP API:
@@ -116,7 +128,7 @@ from vllm_judge.api import JudgeClient
 
 client = JudgeClient("http://localhost:9090")
 result = await client.evaluate(
-    response="Python is great!",
+    content="Python is great!",
     criteria="technical accuracy"
 )
 ```

{vllm_judge-0.1.1 → vllm_judge-0.1.3}/README.md

@@ -1,15 +1,18 @@
+[![PyPI version](https://img.shields.io/pypi/v/vllm-judge.svg)
+](https://pypi.org/project/vllm-judge/)
+
 # vLLM Judge
 
-A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. Please refer the [documentation](https://saichandrapandraju.github.io/vllm_judge/) for usage details.
+A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. Evaluate LLM inputs & outputs at scale with just a few lines of code. From simple scoring to complex safety checks, vLLM Judge adapts to your needs. Please refer the [documentation](https://saichandrapandraju.github.io/vllm_judge/) for usage details.
 
 ## Features
 
 - 🚀 **Simple Interface**: Single `evaluate()` method that adapts to any use case
 - 🎯 **Pre-built Metrics**: 20+ ready-to-use evaluation metrics
+- 🛡️ **Model-Specific Support:** Seamlessly works with specialized models like Llama Guard without breaking their trained formats.
+- ⚡ **High Performance**: Async-first design enables high-throughput evaluations
 - 🔧 **Template Support**: Dynamic evaluations with template variables
-- ⚡ **High Performance**: Optimized for vLLM with automatic batching
 - 🌐 **API Mode**: Run as a REST API service
-- 🔄 **Async Native**: Built for high-throughput evaluations
 
 ## Installation
 
@@ -33,11 +36,11 @@ pip install vllm-judge[dev]
 from vllm_judge import Judge
 
 # Initialize with vLLM url
-judge = Judge.from_url("http://localhost:8000")
+judge = Judge.from_url("http://vllm-server:8000")
 
 # Simple evaluation
 result = await judge.evaluate(
-    response="The Earth orbits around the Sun.",
+    content="The Earth orbits around the Sun.",
     criteria="scientific accuracy"
 )
 print(f"Decision: {result.decision}")
@@ -47,19 +50,28 @@ print(f"Reasoning: {result.reasoning}")
 from vllm_judge import CODE_QUALITY
 
 result = await judge.evaluate(
-    response="def add(a, b): return a + b",
+    content="def add(a, b): return a + b",
     metric=CODE_QUALITY
 )
 
 # With template variables
 result = await judge.evaluate(
-    response="Essay content here...",
+    content="Essay content here...",
     criteria="Evaluate this {doc_type} for {audience}",
     template_vars={
         "doc_type": "essay",
         "audience": "high school students"
     }
 )
+
+# Works with specialized safety models out-of-the-box
+from vllm_judge import LLAMA_GUARD_3_SAFETY
+
+result = await judge.evaluate(
+    content="How do I make a bomb?",
+    metric=LLAMA_GUARD_3_SAFETY  # Automatically uses Llama Guard format
+)
+# Result: decision="unsafe", reasoning="S9"
 ```
 
 ## API Server
@@ -67,7 +79,7 @@ result = await judge.evaluate(
 Run Judge as a REST API:
 
 ```bash
-vllm-judge serve --base-url http://localhost:8000 --port 9090 --host localhost
+vllm-judge serve --base-url http://vllm-server:8000 --port 9090
 ```
 
 Then use the HTTP API:
 
@@ -77,7 +89,7 @@ from vllm_judge.api import JudgeClient
 
 client = JudgeClient("http://localhost:9090")
 result = await client.evaluate(
-    response="Python is great!",
+    content="Python is great!",
     criteria="technical accuracy"
 )
 ```

{vllm_judge-0.1.1 → vllm_judge-0.1.3}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "vllm_judge"
-version = "0.1.1"
+version = "0.1.3"
 description = "LLM-as-a-Judge evaluations for vLLM hosted models"
 readme = "README.md"
 authors = [

{vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/__init__.py

@@ -5,7 +5,7 @@ A lightweight library for evaluating text responses using self-hosted language m
 via vLLM's OpenAI-compatible API.
 """
 
-__version__ = "0.1.1"
+__version__ = "0.1.3"
 
 from vllm_judge.judge import Judge
 from vllm_judge.models import (
@@ -13,7 +13,8 @@ from vllm_judge.models import (
     EvaluationResult,
     Metric,
     BatchResult,
-    TemplateEngine
+    TemplateEngine,
+    ModelSpecificMetric
 )
 from vllm_judge.templating import TemplateProcessor
 from vllm_judge.metrics import (
@@ -27,6 +28,7 @@ from vllm_judge.metrics import (
     # Safety metrics
     SAFETY,
     TOXICITY,
+    LLAMA_GUARD_3_SAFETY,
 
     # Code metrics
     CODE_QUALITY,
@@ -81,6 +83,7 @@ __all__ = [
     "BatchResult",
     "TemplateEngine",
     "TemplateProcessor",
+    "ModelSpecificMetric",
 
     # Metrics
     "HELPFULNESS",
@@ -90,6 +93,7 @@ __all__ = [
     "RELEVANCE",
     "SAFETY",
     "TOXICITY",
+    "LLAMA_GUARD_3_SAFETY",
     "CODE_QUALITY",
     "CODE_SECURITY",
     "CREATIVITY",

{vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/api/client.py

@@ -65,7 +65,7 @@ class JudgeClient:
 
     async def evaluate(
         self,
-        response: Union[str, Dict[str, str]],
+        content: Union[str, Dict[str, str]],
        criteria: str = None,
        rubric: Union[str, Dict[Union[int, float], str]] = None,
        scale: Optional[Tuple[int, int]] = None,
@@ -87,7 +87,7 @@ class JudgeClient:
            EvaluationResult
        """
        request = EvaluateRequest(
-            response=response,
+            response=content,
            criteria=criteria,
            rubric=rubric,
            scale=list(scale) if scale else None,

{vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/cli.py

@@ -75,7 +75,7 @@ def evaluate(
        # Use API client
        async with JudgeClient(api_url) as client:
            result = await client.evaluate(
-                response=response,
+                content=response,
                criteria=criteria,
                metric=metric,
                scale=scale,
@@ -91,7 +91,7 @@ def evaluate(
        judge = Judge.from_url(base_url, model=model)
        async with judge:
            result = await judge.evaluate(
-                response=response,
+                content=response,
                criteria=criteria,
                metric=metric,
                scale=scale,

{vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/judge.py

@@ -2,7 +2,7 @@ import json
 import re
 from typing import Union, Dict, List, Optional, Tuple, Any, Callable
 
-from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine
+from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine, ModelSpecificMetric
 from vllm_judge.client import VLLMClient
 from vllm_judge.prompts import PromptBuilder
 from vllm_judge.batch import BatchProcessor
@@ -14,6 +14,9 @@ from vllm_judge.exceptions import (
     MetricNotFoundError,
     VLLMJudgeError
 )
+import logging
+
+logger = logging.getLogger(__name__)
 
 
 class Judge:
@@ -60,7 +63,7 @@ class Judge:
 
     async def evaluate(
         self,
-        response: Union[str, Dict[str, str]],
+        content: Union[str, Dict[str, str]],
        criteria: str = None,
        rubric: Union[str, Dict[Union[int, float], str]] = None,
        scale: Optional[Tuple[int, int]] = None,
@@ -76,7 +79,7 @@ class Judge:
        Universal evaluation method that adapts to use case.
 
        Args:
-            response: String for single evaluation, dict {"a": ..., "b": ...} for comparison
+            content: String for single evaluation, dict {"a": ..., "b": ...} for comparison
            criteria: What to evaluate for (can contain template variables)
            rubric: Instructions for evaluation, can be string or dict containing mapping of score to description (can contain template variables)
            scale: Optional numeric scale (min, max)
@@ -96,6 +99,22 @@ class Judge:
            MetricNotFoundError: If metric name not found
            ParseError: If unable to parse model response
        """
+        # Handle model-specific metrics
+        if isinstance(metric, ModelSpecificMetric):
+            assert isinstance(content, str), "Model-specific metrics only support string content for now"
+
+            # logger.info(f"Evaluating model-specific metric {metric.name}.")
+            logger.info(f"We assume you're using {metric.model_pattern} type model. If not, please do not use this metric and use a normal metric instead.")
+            # Skip ALL our formatting
+            messages = [{"role": "user", "content": content}]
+
+            # vLLM applies model's chat template automatically
+            llm_response = await self._call_model(messages)
+
+            # Use metric's parser
+            return metric.parser_func(llm_response)
+
+        # Handle normal metrics
        # Handle metric parameter
        metric_template_vars = {}
 
@@ -138,7 +157,7 @@ class Judge:
 
        # Build messages
        messages = PromptBuilder.build_messages(
-            response=response,
+            response=content,
            criteria=criteria,
            rubric=rubric,
            scale=scale,
@@ -149,14 +168,7 @@ class Judge:
        )
 
        # Get LLM response
-        try:
-            if self.config.use_chat_api:
-                llm_response = await self.client.chat_completion(messages)
-            else:
-                prompt = PromptBuilder.format_messages_as_text(messages)
-                llm_response = await self.client.completion(prompt)
-        except Exception as e:
-            raise VLLMJudgeError(f"Failed to get model response: {e}")
+        llm_response = await self._call_model(messages)
 
        # Parse response
        result = self._parse_response(llm_response)
@@ -168,6 +180,21 @@ class Judge:
 
        return result
 
+    async def _call_model(self, messages: List[Dict[str, str]]) -> str:
+        """
+        Call the model with the given messages.
+        """
+        try:
+            if self.config.use_chat_api:
+                llm_response = await self.client.chat_completion(messages)
+            else:
+                prompt = PromptBuilder.format_messages_as_text(messages)
+                llm_response = await self.client.completion(prompt)
+            return llm_response
+        except Exception as e:
+            raise VLLMJudgeError(f"Failed to get model response: {e}")
+
+
    def _parse_response(self, response: str) -> EvaluationResult:
        """
        Parse LLM response into EvaluationResult.

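The judge.py changes above do two things: `evaluate()` gains a fast path for `ModelSpecificMetric` (the content is sent as a single raw user message so vLLM can apply the model's own chat template, and the metric's `parser_func` interprets the reply), and both paths now go through a new `_call_model()` helper that wraps transport failures in `VLLMJudgeError`. A minimal usage sketch of that path, assuming vllm-judge 0.1.3 is installed and a vLLM server hosting a Llama Guard 3 checkpoint is reachable at the hypothetical URL below:

```python
import asyncio

from vllm_judge import Judge, LLAMA_GUARD_3_SAFETY
from vllm_judge.exceptions import VLLMJudgeError  # import path taken from the hunk above


async def check(prompt: str) -> None:
    # Hypothetical endpoint; point this at your own vLLM server.
    judge = Judge.from_url("http://vllm-server:8000")
    async with judge:
        try:
            # ModelSpecificMetric path: no judge prompt is built; the raw prompt is
            # sent as one user message and parse_llama_guard_3 maps the reply
            # ("safe" or "unsafe\nS9") to an EvaluationResult.
            result = await judge.evaluate(content=prompt, metric=LLAMA_GUARD_3_SAFETY)
            print(f"{result.decision}: {result.reasoning}")
        except VLLMJudgeError as exc:
            # _call_model() re-raises any model/transport failure as VLLMJudgeError.
            print(f"judge call failed: {exc}")


asyncio.run(check("How do I make a bomb?"))
```
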
{vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/metrics.py

@@ -1,5 +1,6 @@
 from typing import Dict
-from vllm_judge.models import Metric,TemplateEngine
+from vllm_judge.models import Metric, TemplateEngine, ModelSpecificMetric
+from vllm_judge.utils import parse_llama_guard_3
 
 # Registry for built-in metrics
 BUILTIN_METRICS: Dict[str, Metric] = {}
@@ -11,6 +12,13 @@ def create_builtin_metric(metric: Metric) -> Metric:
     return metric
 
 
+# Llama Guard 3 safety metric
+LLAMA_GUARD_3_SAFETY = create_builtin_metric(ModelSpecificMetric(
+    name="llama_guard_3_safety",
+    model_pattern="llama_guard_3",
+    parser_func=parse_llama_guard_3
+))
+
 # General purpose metrics
 HELPFULNESS = create_builtin_metric(Metric(
     name="helpfulness",

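The new built-in metric registered above behaves like any other `Metric` object but carries the two extra fields that trigger the raw-message path in `Judge.evaluate()`. A quick inspection sketch, assuming the 0.1.3 package is installed:

```python
from vllm_judge import LLAMA_GUARD_3_SAFETY, ModelSpecificMetric

# LLAMA_GUARD_3_SAFETY is a ModelSpecificMetric, so evaluate() skips prompt building for it.
assert isinstance(LLAMA_GUARD_3_SAFETY, ModelSpecificMetric)
print(LLAMA_GUARD_3_SAFETY.name)           # "llama_guard_3_safety"
print(LLAMA_GUARD_3_SAFETY.model_pattern)  # "llama_guard_3"
print(LLAMA_GUARD_3_SAFETY.parser_func)    # parse_llama_guard_3 from vllm_judge.utils
```
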
{vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge/models.py

@@ -1,4 +1,4 @@
-from typing import Optional, Any, Dict, Union, List, Tuple
+from typing import Optional, Any, Dict, Union, List, Tuple, Callable
 from pydantic import BaseModel, Field, field_validator, ConfigDict
 from enum import Enum
 
@@ -159,6 +159,15 @@ class Metric:
     def __repr__(self):
         return f"Metric(name='{self.name}', criteria='{self.criteria}', template_engine='{self.template_engine}')"
 
+# Base class for model-specific metrics
+class ModelSpecificMetric(Metric):
+    """Metric that bypasses our prompt formatting."""
+
+    def __init__(self, name: str, model_pattern: str, parser_func: Callable[[str], EvaluationResult]):
+        super().__init__(name=name, criteria="model-specific evaluation")
+        self.model_pattern = model_pattern
+        self.parser_func = parser_func
+        # self.is_model_specific = True  # Flag for special handling
 
 class BatchResult(BaseModel):
     """Result of batch evaluation."""

vllm_judge-0.1.3/src/vllm_judge/utils.py (new file)

@@ -0,0 +1,14 @@
+from vllm_judge.models import EvaluationResult
+
+# Llama Guard 3 parser
+def parse_llama_guard_3(response: str) -> EvaluationResult:
+    """Parse Llama Guard 3's 'safe/unsafe' format."""
+    lines = response.strip().split('\n')
+    is_safe = lines[0].lower().strip() == 'safe'
+
+    return EvaluationResult(
+        decision="safe" if is_safe else "unsafe",
+        reasoning=lines[1] if len(lines) > 1 else "No violations detected",
+        score=None,
+        metadata={"model_type": "llama_guard_3"}
+    )

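The parser above assumes Llama Guard 3's two-line reply format: `safe`, or `unsafe` followed by the violated category code. A small sketch exercising it directly (the sample replies are illustrative):

```python
from vllm_judge.utils import parse_llama_guard_3

ok = parse_llama_guard_3("safe")
bad = parse_llama_guard_3("unsafe\nS9")

print(ok.decision, "-", ok.reasoning)    # safe - No violations detected
print(bad.decision, "-", bad.reasoning)  # unsafe - S9
print(bad.metadata)                      # {'model_type': 'llama_guard_3'}
```
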
{vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.4
+Metadata-Version: 2.1
 Name: vllm_judge
-Version: 0.1.1
+Version: 0.1.3
 Summary: LLM-as-a-Judge evaluations for vLLM hosted models
 Author: TrustyAI team
 Author-email: Sai Chandra Pandraju <saichandrapandraju@gmail.com>
@@ -37,18 +37,21 @@ Requires-Dist: isort>=5.12.0; extra == "dev"
 Requires-Dist: flake8>=6.0.0; extra == "dev"
 Requires-Dist: mypy>=1.0.0; extra == "dev"
 
+[![PyPI version](https://img.shields.io/pypi/v/vllm-judge.svg)
+](https://pypi.org/project/vllm-judge/)
+
 # vLLM Judge
 
-A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. Please refer the [documentation](https://saichandrapandraju.github.io/vllm_judge/) for usage details.
+A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. Evaluate LLM inputs & outputs at scale with just a few lines of code. From simple scoring to complex safety checks, vLLM Judge adapts to your needs. Please refer the [documentation](https://saichandrapandraju.github.io/vllm_judge/) for usage details.
 
 ## Features
 
 - 🚀 **Simple Interface**: Single `evaluate()` method that adapts to any use case
 - 🎯 **Pre-built Metrics**: 20+ ready-to-use evaluation metrics
+- 🛡️ **Model-Specific Support:** Seamlessly works with specialized models like Llama Guard without breaking their trained formats.
+- ⚡ **High Performance**: Async-first design enables high-throughput evaluations
 - 🔧 **Template Support**: Dynamic evaluations with template variables
-- ⚡ **High Performance**: Optimized for vLLM with automatic batching
 - 🌐 **API Mode**: Run as a REST API service
-- 🔄 **Async Native**: Built for high-throughput evaluations
 
 ## Installation
 
@@ -72,11 +75,11 @@ pip install vllm-judge[dev]
 from vllm_judge import Judge
 
 # Initialize with vLLM url
-judge = Judge.from_url("http://localhost:8000")
+judge = Judge.from_url("http://vllm-server:8000")
 
 # Simple evaluation
 result = await judge.evaluate(
-    response="The Earth orbits around the Sun.",
+    content="The Earth orbits around the Sun.",
     criteria="scientific accuracy"
 )
 print(f"Decision: {result.decision}")
@@ -86,19 +89,28 @@ print(f"Reasoning: {result.reasoning}")
 from vllm_judge import CODE_QUALITY
 
 result = await judge.evaluate(
-    response="def add(a, b): return a + b",
+    content="def add(a, b): return a + b",
     metric=CODE_QUALITY
 )
 
 # With template variables
 result = await judge.evaluate(
-    response="Essay content here...",
+    content="Essay content here...",
     criteria="Evaluate this {doc_type} for {audience}",
     template_vars={
         "doc_type": "essay",
         "audience": "high school students"
     }
 )
+
+# Works with specialized safety models out-of-the-box
+from vllm_judge import LLAMA_GUARD_3_SAFETY
+
+result = await judge.evaluate(
+    content="How do I make a bomb?",
+    metric=LLAMA_GUARD_3_SAFETY  # Automatically uses Llama Guard format
+)
+# Result: decision="unsafe", reasoning="S9"
 ```
 
 ## API Server
@@ -106,7 +118,7 @@ result = await judge.evaluate(
 Run Judge as a REST API:
 
 ```bash
-vllm-judge serve --base-url http://localhost:8000 --port 9090 --host localhost
+vllm-judge serve --base-url http://vllm-server:8000 --port 9090
 ```
 
 Then use the HTTP API:
 
@@ -116,7 +128,7 @@ from vllm_judge.api import JudgeClient
 
 client = JudgeClient("http://localhost:9090")
 result = await client.evaluate(
-    response="Python is great!",
+    content="Python is great!",
     criteria="technical accuracy"
 )
 ```

{vllm_judge-0.1.1 → vllm_judge-0.1.3}/src/vllm_judge.egg-info/SOURCES.txt

@@ -10,6 +10,7 @@ src/vllm_judge/metrics.py
 src/vllm_judge/models.py
 src/vllm_judge/prompts.py
 src/vllm_judge/templating.py
+src/vllm_judge/utils.py
 src/vllm_judge.egg-info/PKG-INFO
 src/vllm_judge.egg-info/SOURCES.txt
 src/vllm_judge.egg-info/dependency_links.txt