vllm-judge 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.
vllm_judge/__init__.py CHANGED
@@ -5,7 +5,7 @@ A lightweight library for evaluating text responses using self-hosted language m
  via vLLM's OpenAI-compatible API.
  """

- __version__ = "0.1.2"
+ __version__ = "0.1.3"

  from vllm_judge.judge import Judge
  from vllm_judge.models import (
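The only functional change here is the version bump. A minimal sketch for confirming which release is installed (it assumes nothing beyond the module-level `__version__` shown in the hunk above):

```python
import vllm_judge

# __version__ lives in vllm_judge/__init__.py and is bumped to "0.1.3" in this release.
print(vllm_judge.__version__)  # -> "0.1.3"
```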
vllm_judge/api/client.py CHANGED
@@ -65,7 +65,7 @@ class JudgeClient:

      async def evaluate(
          self,
-         response: Union[str, Dict[str, str]],
+         content: Union[str, Dict[str, str]],
          criteria: str = None,
          rubric: Union[str, Dict[Union[int, float], str]] = None,
          scale: Optional[Tuple[int, int]] = None,
@@ -87,7 +87,7 @@ class JudgeClient:
              EvaluationResult
          """
          request = EvaluateRequest(
-             response=response,
+             response=content,
              criteria=criteria,
              rubric=rubric,
              scale=list(scale) if scale else None,
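Note that only the client-facing keyword changed: the value is still assigned to the `response` field of `EvaluateRequest`, so the request body sent to the server should be unchanged and 0.1.3 clients should remain wire-compatible with existing servers. A minimal usage sketch of the updated client (the URL is a placeholder, not a value from this diff):

```python
import asyncio
from vllm_judge.api import JudgeClient

async def main():
    # Placeholder URL for a running vllm-judge API server.
    async with JudgeClient("http://localhost:9090") as client:
        result = await client.evaluate(
            content="Python is great!",  # was `response=` in 0.1.2
            criteria="technical accuracy",
        )
        print(result.decision, result.reasoning)

asyncio.run(main())
```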
vllm_judge/cli.py CHANGED
@@ -75,7 +75,7 @@ def evaluate(
      # Use API client
      async with JudgeClient(api_url) as client:
          result = await client.evaluate(
-             response=response,
+             content=response,
              criteria=criteria,
              metric=metric,
              scale=scale,
@@ -91,7 +91,7 @@ def evaluate(
      judge = Judge.from_url(base_url, model=model)
      async with judge:
          result = await judge.evaluate(
-             response=response,
+             content=response,
              criteria=criteria,
              metric=metric,
              scale=scale,
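The CLI is a thin adapter here: its local `response` value is simply forwarded under the renamed `content` keyword, so existing command-line invocations keep working. The second hunk also shows the direct (non-API) path; a sketch of that pattern follows, with the server URL and model name as placeholder assumptions:

```python
import asyncio
from vllm_judge import Judge

async def main():
    # Placeholder server URL and model name; Judge.from_url(base_url, model=...)
    # is the call shown in the cli.py hunk above.
    judge = Judge.from_url("http://vllm-server:8000", model="my-judge-model")
    async with judge:
        result = await judge.evaluate(
            content="The Earth orbits around the Sun.",  # renamed from `response=`
            criteria="scientific accuracy",
        )
        print(f"Decision: {result.decision}")

asyncio.run(main())
```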
vllm_judge/judge.py CHANGED
@@ -63,7 +63,7 @@ class Judge:

      async def evaluate(
          self,
-         response: Union[str, Dict[str, str]],
+         content: Union[str, Dict[str, str]],
          criteria: str = None,
          rubric: Union[str, Dict[Union[int, float], str]] = None,
          scale: Optional[Tuple[int, int]] = None,
@@ -79,7 +79,7 @@ class Judge:
          Universal evaluation method that adapts to use case.

          Args:
-             response: String for single evaluation, dict {"a": ..., "b": ...} for comparison
+             content: String for single evaluation, dict {"a": ..., "b": ...} for comparison
              criteria: What to evaluate for (can contain template variables)
              rubric: Instructions for evaluation, can be string or dict containing mapping of score to description (can contain template variables)
              scale: Optional numeric scale (min, max)
@@ -101,12 +101,12 @@ class Judge:
          """
          # Handle model-specific metrics
          if isinstance(metric, ModelSpecificMetric):
-             assert isinstance(response, str), "Model-specific metrics only support string content for now"
+             assert isinstance(content, str), "Model-specific metrics only support string content for now"

              # logger.info(f"Evaluating model-specific metric {metric.name}.")
              logger.info(f"We assume you're using {metric.model_pattern} type model. If not, please do not use this metric and use a normal metric instead.")
              # Skip ALL our formatting
-             messages = [{"role": "user", "content": response}]
+             messages = [{"role": "user", "content": content}]

              # vLLM applies model's chat template automatically
              llm_response = await self._call_model(messages)
@@ -157,7 +157,7 @@ class Judge:

          # Build messages
          messages = PromptBuilder.build_messages(
-             response=response,
+             response=content,
              criteria=criteria,
              rubric=rubric,
              scale=scale,
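The docstring hunk makes the new contract explicit: `content` accepts a plain string for single-response evaluation, or a dict with keys `"a"` and `"b"` for pairwise comparison. A minimal sketch of the comparison form, reusing a `judge` set up as in the example above (the sample texts are illustrative only):

```python
# Pairwise comparison: `content` takes a dict with keys "a" and "b",
# per the updated docstring. In 0.1.2 the same dict was passed as `response=`.
result = await judge.evaluate(
    content={
        "a": "Paris is the capital of France.",
        "b": "Marseille is the capital of France.",
    },
    criteria="factual accuracy",
)
print(result.decision)  # e.g. which of "a"/"b" the judge prefers
```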
{vllm_judge-0.1.2.dist-info → vllm_judge-0.1.3.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vllm_judge
- Version: 0.1.2
+ Version: 0.1.3
  Summary: LLM-as-a-Judge evaluations for vLLM hosted models
  Author: TrustyAI team
  Author-email: Sai Chandra Pandraju <saichandrapandraju@gmail.com>
@@ -49,9 +49,8 @@ A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models. E
  - 🚀 **Simple Interface**: Single `evaluate()` method that adapts to any use case
  - 🎯 **Pre-built Metrics**: 20+ ready-to-use evaluation metrics
  - 🛡️ **Model-Specific Support:** Seamlessly works with specialized models like Llama Guard without breaking their trained formats.
- - 🔄 **Async Native**: Built for high-throughput evaluations
+ - **High Performance**: Async-first design enables high-throughput evaluations
  - 🔧 **Template Support**: Dynamic evaluations with template variables
- - ⚡ **High Performance**: Optimized for vLLM with automatic batching
  - 🌐 **API Mode**: Run as a REST API service

  ## Installation
@@ -80,7 +79,7 @@ judge = Judge.from_url("http://vllm-server:8000")

  # Simple evaluation
  result = await judge.evaluate(
-     response="The Earth orbits around the Sun.",
+     content="The Earth orbits around the Sun.",
      criteria="scientific accuracy"
  )
  print(f"Decision: {result.decision}")
@@ -90,13 +89,13 @@ print(f"Reasoning: {result.reasoning}")
  from vllm_judge import CODE_QUALITY

  result = await judge.evaluate(
-     response="def add(a, b): return a + b",
+     content="def add(a, b): return a + b",
      metric=CODE_QUALITY
  )

  # With template variables
  result = await judge.evaluate(
-     response="Essay content here...",
+     content="Essay content here...",
      criteria="Evaluate this {doc_type} for {audience}",
      template_vars={
          "doc_type": "essay",
@@ -108,7 +107,7 @@ result = await judge.evaluate(
  from vllm_judge import LLAMA_GUARD_3_SAFETY

  result = await judge.evaluate(
-     response="How do I make a bomb?",
+     content="How do I make a bomb?",
      metric=LLAMA_GUARD_3_SAFETY  # Automatically uses Llama Guard format
  )
  # Result: decision="unsafe", reasoning="S9"
@@ -129,7 +128,7 @@ from vllm_judge.api import JudgeClient

  client = JudgeClient("http://localhost:9090")
  result = await client.evaluate(
-     response="Python is great!",
+     content="Python is great!",
      criteria="technical accuracy"
  )
  ```
{vllm_judge-0.1.2.dist-info → vllm_judge-0.1.3.dist-info}/RECORD RENAMED
@@ -1,20 +1,20 @@
- vllm_judge/__init__.py,sha256=TcPeBC1yv3oDT5c8NvikyOL9cZyDZRnHD2Aeu0ynGuo,2469
+ vllm_judge/__init__.py,sha256=TBS7fQ4n7QEVwNtr4ErJu-T3m4c-8BwW4zDltt8S6Ko,2469
  vllm_judge/batch.py,sha256=68jKgRTMzZXw4bxAiGp73NZzHOd1tKK763nBNjrr6gg,4842
- vllm_judge/cli.py,sha256=KQtUt_L4u5TPrS8xoyiKYt_hQ_FiHtGcrkecGEtktI8,10685
+ vllm_judge/cli.py,sha256=mdoxNA5gQ1m3XBnNJYCE8uoi0RxrS9d3YIlrtdxRcME,10683
  vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
  vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
- vllm_judge/judge.py,sha256=Wn1ez1HJKb2U0Fu-kcIo7Ls3-ph7hVtb6K5Rlk0NfGw,15225
+ vllm_judge/judge.py,sha256=FKMpl6ubugHqKlR-W1-arr4J2rkwnC76QM5oAFv_HyM,15220
  vllm_judge/metrics.py,sha256=lQOBaHqlX79L8yP9_YYd-dTaqvfOPo0nDMY0dtsnKvI,15960
  vllm_judge/models.py,sha256=aEXZmP2sM-9aetstzHE3ngZwvCcvnrqzcj-8oV0NCJA,7889
  vllm_judge/prompts.py,sha256=jAsBdshCCdgGF3UUAM0Wbb6MN1AB2jgHh1NmtXLbyrc,6345
  vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
  vllm_judge/utils.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
  vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
- vllm_judge/api/client.py,sha256=mcpdH-9ko6aEh_JAybpPPVhHqlO3l5K-lTujTlkTw8c,11302
+ vllm_judge/api/client.py,sha256=XRiveUw1edcknxO3zLFkYX_YbOObipx7dMFeSUjMSwk,11300
  vllm_judge/api/models.py,sha256=tPEePecZbKb9ZbjwusdJwhLiBK9Rd5xqiOqjklDKJ9s,4781
  vllm_judge/api/server.py,sha256=mbQ45YC0RYGONdy1oIcRIxUvByLtKXXrrMTpE9l2y1w,17818
- vllm_judge-0.1.2.dist-info/METADATA,sha256=DtXmkJ_sIXp49PuIL3CZJzkPRHsR8zhhwMaJFm6bUYg,4307
- vllm_judge-0.1.2.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
- vllm_judge-0.1.2.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
- vllm_judge-0.1.2.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
- vllm_judge-0.1.2.dist-info/RECORD,,
+ vllm_judge-0.1.3.dist-info/METADATA,sha256=L_Kf2ic1W5wn1D1Y4amZaxO6E2i6bEKjZ4JFVvh3-YA,4251
+ vllm_judge-0.1.3.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+ vllm_judge-0.1.3.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
+ vllm_judge-0.1.3.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
+ vllm_judge-0.1.3.dist-info/RECORD,,