vllm-judge 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
- vllm_judge/__init__.py +1 -1
- vllm_judge/api/client.py +2 -2
- vllm_judge/cli.py +2 -2
- vllm_judge/judge.py +5 -5
- {vllm_judge-0.1.2.dist-info → vllm_judge-0.1.3.dist-info}/METADATA +7 -8
- {vllm_judge-0.1.2.dist-info → vllm_judge-0.1.3.dist-info}/RECORD +9 -9
- {vllm_judge-0.1.2.dist-info → vllm_judge-0.1.3.dist-info}/WHEEL +0 -0
- {vllm_judge-0.1.2.dist-info → vllm_judge-0.1.3.dist-info}/entry_points.txt +0 -0
- {vllm_judge-0.1.2.dist-info → vllm_judge-0.1.3.dist-info}/top_level.txt +0 -0
vllm_judge/__init__.py
CHANGED
vllm_judge/api/client.py
CHANGED
@@ -65,7 +65,7 @@ class JudgeClient:
 
     async def evaluate(
         self,
-        response: Union[str, Dict[str, str]],
+        content: Union[str, Dict[str, str]],
         criteria: str = None,
         rubric: Union[str, Dict[Union[int, float], str]] = None,
         scale: Optional[Tuple[int, int]] = None,
@@ -87,7 +87,7 @@ class JudgeClient:
             EvaluationResult
         """
         request = EvaluateRequest(
-            response=response,
+            response=content,
             criteria=criteria,
             rubric=rubric,
             scale=list(scale) if scale else None,
vllm_judge/cli.py
CHANGED
@@ -75,7 +75,7 @@ def evaluate(
         # Use API client
         async with JudgeClient(api_url) as client:
             result = await client.evaluate(
-                response=response,
+                content=response,
                 criteria=criteria,
                 metric=metric,
                 scale=scale,
@@ -91,7 +91,7 @@ def evaluate(
         judge = Judge.from_url(base_url, model=model)
         async with judge:
             result = await judge.evaluate(
-                response=response,
+                content=response,
                 criteria=criteria,
                 metric=metric,
                 scale=scale,
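Both cli.py hunks make the same mechanical change: the CLI's `response` string is now passed to `evaluate()` through the renamed `content` parameter. For orientation, here is a condensed sketch of the dispatch these hunks sit inside, reconstructed from the context lines above; the helper name and the omitted option parsing are assumptions, not the actual cli.py source.

```python
import asyncio
from vllm_judge import Judge
from vllm_judge.api import JudgeClient

# Hypothetical helper mirroring the CLI's two code paths; everything except
# JudgeClient, Judge.from_url, and evaluate(content=...) is illustrative.
async def run_evaluation(response: str, criteria: str,
                         api_url: str = None,
                         base_url: str = None, model: str = None):
    if api_url:
        # Remote mode: send the evaluation to a running judge API server.
        async with JudgeClient(api_url) as client:
            return await client.evaluate(content=response, criteria=criteria)
    # Local mode: talk to the vLLM server directly.
    judge = Judge.from_url(base_url, model=model)
    async with judge:
        return await judge.evaluate(content=response, criteria=criteria)

# e.g. asyncio.run(run_evaluation("2 + 2 = 4", "arithmetic accuracy",
#                                 base_url="http://vllm-server:8000"))
```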
vllm_judge/judge.py
CHANGED
@@ -63,7 +63,7 @@ class Judge:
 
     async def evaluate(
         self,
-        response: Union[str, Dict[str, str]],
+        content: Union[str, Dict[str, str]],
         criteria: str = None,
         rubric: Union[str, Dict[Union[int, float], str]] = None,
         scale: Optional[Tuple[int, int]] = None,
@@ -79,7 +79,7 @@ class Judge:
         Universal evaluation method that adapts to use case.
 
         Args:
-            response: String for single evaluation, dict {"a": ..., "b": ...} for comparison
+            content: String for single evaluation, dict {"a": ..., "b": ...} for comparison
             criteria: What to evaluate for (can contain template variables)
             rubric: Instructions for evaluation, can be string or dict containing mapping of score to description (can contain template variables)
             scale: Optional numeric scale (min, max)
@@ -101,12 +101,12 @@ class Judge:
         """
         # Handle model-specific metrics
         if isinstance(metric, ModelSpecificMetric):
-            assert isinstance(response, str), "Model-specific metrics only support string content for now"
+            assert isinstance(content, str), "Model-specific metrics only support string content for now"
 
             # logger.info(f"Evaluating model-specific metric {metric.name}.")
             logger.info(f"We assume you're using {metric.model_pattern} type model. If not, please do not use this metric and use a normal metric instead.")
             # Skip ALL our formatting
-            messages = [{"role": "user", "content": response}]
+            messages = [{"role": "user", "content": content}]
 
             # vLLM applies model's chat template automatically
             llm_response = await self._call_model(messages)
@@ -157,7 +157,7 @@ class Judge:
 
         # Build messages
         messages = PromptBuilder.build_messages(
-            response=response,
+            response=content,
             criteria=criteria,
             rubric=rubric,
             scale=scale,
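Taken together, the four judge.py hunks are a single rename: `Judge.evaluate()`'s first parameter is now `content` instead of `response`, reflecting that it can carry either one string or an {"a": ..., "b": ...} pair. A minimal sketch of both call shapes, assuming the imports and result fields shown in the README diff below; the comparison texts are made-up examples.

```python
import asyncio
from vllm_judge import Judge

async def main():
    judge = Judge.from_url("http://vllm-server:8000")
    async with judge:
        # Single evaluation: content is one string.
        single = await judge.evaluate(
            content="The Earth orbits around the Sun.",
            criteria="scientific accuracy",
        )
        # Comparison: content is a dict with sides "a" and "b".
        pair = await judge.evaluate(
            content={"a": "Paris is the capital of France.",
                     "b": "Paris is the capital of Italy."},
            criteria="factual accuracy",
        )
        print(single.decision, single.reasoning)
        print(pair.decision, pair.reasoning)

asyncio.run(main())
```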
{vllm_judge-0.1.2.dist-info → vllm_judge-0.1.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vllm_judge
-Version: 0.1.2
+Version: 0.1.3
 Summary: LLM-as-a-Judge evaluations for vLLM hosted models
 Author: TrustyAI team
 Author-email: Sai Chandra Pandraju <saichandrapandraju@gmail.com>
@@ -49,9 +49,8 @@ A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. E
 - 🚀 **Simple Interface**: Single `evaluate()` method that adapts to any use case
 - 🎯 **Pre-built Metrics**: 20+ ready-to-use evaluation metrics
 - 🛡️ **Model-Specific Support:** Seamlessly works with specialized models like Llama Guard without breaking their trained formats.
--
+- ⚡ **High Performance**: Async-first design enables high-throughput evaluations
 - 🔧 **Template Support**: Dynamic evaluations with template variables
-- ⚡ **High Performance**: Optimized for vLLM with automatic batching
 - 🌐 **API Mode**: Run as a REST API service
 
 ## Installation
@@ -80,7 +79,7 @@ judge = Judge.from_url("http://vllm-server:8000")
 
 # Simple evaluation
 result = await judge.evaluate(
-    response="The Earth orbits around the Sun.",
+    content="The Earth orbits around the Sun.",
     criteria="scientific accuracy"
 )
 print(f"Decision: {result.decision}")
@@ -90,13 +89,13 @@ print(f"Reasoning: {result.reasoning}")
 from vllm_judge import CODE_QUALITY
 
 result = await judge.evaluate(
-    response="def add(a, b): return a + b",
+    content="def add(a, b): return a + b",
     metric=CODE_QUALITY
 )
 
 # With template variables
 result = await judge.evaluate(
-    response="Essay content here...",
+    content="Essay content here...",
     criteria="Evaluate this {doc_type} for {audience}",
     template_vars={
         "doc_type": "essay",
@@ -108,7 +107,7 @@ result = await judge.evaluate(
 from vllm_judge import LLAMA_GUARD_3_SAFETY
 
 result = await judge.evaluate(
-    response="How do I make a bomb?",
+    content="How do I make a bomb?",
     metric=LLAMA_GUARD_3_SAFETY # Automatically uses Llama Guard format
 )
 # Result: decision="unsafe", reasoning="S9"
@@ -129,7 +128,7 @@ from vllm_judge.api import JudgeClient
 
 client = JudgeClient("http://localhost:9090")
 result = await client.evaluate(
-    response="Python is great!",
+    content="Python is great!",
     criteria="technical accuracy"
 )
 ```
{vllm_judge-0.1.2.dist-info → vllm_judge-0.1.3.dist-info}/RECORD
CHANGED
@@ -1,20 +1,20 @@
-vllm_judge/__init__.py,sha256=
+vllm_judge/__init__.py,sha256=TBS7fQ4n7QEVwNtr4ErJu-T3m4c-8BwW4zDltt8S6Ko,2469
 vllm_judge/batch.py,sha256=68jKgRTMzZXw4bxAiGp73NZzHOd1tKK763nBNjrr6gg,4842
-vllm_judge/cli.py,sha256=
+vllm_judge/cli.py,sha256=mdoxNA5gQ1m3XBnNJYCE8uoi0RxrS9d3YIlrtdxRcME,10683
 vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
 vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
-vllm_judge/judge.py,sha256=
+vllm_judge/judge.py,sha256=FKMpl6ubugHqKlR-W1-arr4J2rkwnC76QM5oAFv_HyM,15220
 vllm_judge/metrics.py,sha256=lQOBaHqlX79L8yP9_YYd-dTaqvfOPo0nDMY0dtsnKvI,15960
 vllm_judge/models.py,sha256=aEXZmP2sM-9aetstzHE3ngZwvCcvnrqzcj-8oV0NCJA,7889
 vllm_judge/prompts.py,sha256=jAsBdshCCdgGF3UUAM0Wbb6MN1AB2jgHh1NmtXLbyrc,6345
 vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
 vllm_judge/utils.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
 vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
-vllm_judge/api/client.py,sha256=
+vllm_judge/api/client.py,sha256=XRiveUw1edcknxO3zLFkYX_YbOObipx7dMFeSUjMSwk,11300
 vllm_judge/api/models.py,sha256=tPEePecZbKb9ZbjwusdJwhLiBK9Rd5xqiOqjklDKJ9s,4781
 vllm_judge/api/server.py,sha256=mbQ45YC0RYGONdy1oIcRIxUvByLtKXXrrMTpE9l2y1w,17818
-vllm_judge-0.1.
-vllm_judge-0.1.
-vllm_judge-0.1.
-vllm_judge-0.1.
-vllm_judge-0.1.
+vllm_judge-0.1.3.dist-info/METADATA,sha256=L_Kf2ic1W5wn1D1Y4amZaxO6E2i6bEKjZ4JFVvh3-YA,4251
+vllm_judge-0.1.3.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+vllm_judge-0.1.3.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
+vllm_judge-0.1.3.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
+vllm_judge-0.1.3.dist-info/RECORD,,
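The five truncated `-` rows are the 0.1.2 dist-info entries whose hashes were cut off in extraction; the 0.1.3 replacements are intact. Each RECORD row has the form `path,sha256=<digest>,<size>`, where the digest is the unpadded urlsafe-base64 SHA-256 of the file, per the wheel spec. A small sketch that recomputes an entry for comparison; the path is just an example taken from the list above.

```python
import base64
import hashlib

def record_entry(path: str) -> str:
    # Read the installed file and recompute its RECORD row.
    with open(path, "rb") as f:
        data = f.read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest())
    return f"{path},sha256={digest.rstrip(b'=').decode()},{len(data)}"

# record_entry("vllm_judge/batch.py") should reproduce the row above:
# vllm_judge/batch.py,sha256=68jKgRTMzZXw4bxAiGp73NZzHOd1tKK763nBNjrr6gg,4842
```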
{vllm_judge-0.1.2.dist-info → vllm_judge-0.1.3.dist-info}/WHEEL
File without changes
{vllm_judge-0.1.2.dist-info → vllm_judge-0.1.3.dist-info}/entry_points.txt
File without changes
{vllm_judge-0.1.2.dist-info → vllm_judge-0.1.3.dist-info}/top_level.txt
File without changes