vllm_judge-0.1.0-py3-none-any.whl

vllm_judge/prompts.py ADDED
@@ -0,0 +1,175 @@
+ from typing import List, Dict, Union, Optional, Tuple, Any
+
+
+ class PromptBuilder:
+     """Builds prompts for evaluation requests."""
+
+     @staticmethod
+     def build_messages(
+         response: Union[str, Dict[str, str]],
+         criteria: str,
+         rubric: Optional[Union[str, Dict[Union[int, float], str]]] = None,
+         scale: Optional[Tuple[int, int]] = None,
+         examples: Optional[List[Dict[str, Any]]] = None,
+         system_prompt: Optional[str] = None,
+         context: Optional[str] = None,
+         **kwargs
+     ) -> List[Dict[str, str]]:
+         """
+         Build chat messages for evaluation.
+
+         Args:
+             response: Single response or dict with 'a' and 'b' for comparison
+             criteria: What to evaluate for
+             rubric: Evaluation guide
+             scale: Numeric scale (min, max)
+             examples: Few-shot examples
+             system_prompt: Custom system message
+             context: Additional context
+             **kwargs: Additional parameters
+
+         Returns:
+             List of chat messages
+         """
+         # Detect evaluation type
+         is_comparison = isinstance(response, dict) and "a" in response and "b" in response
+
+         # System message
+         if not system_prompt:
+             # TODO: Add more detailed system prompts
+             system_prompt = "You are an impartial judge and expert evaluator "
+             if is_comparison:
+                 system_prompt += "comparing responses objectively."
+             else:
+                 system_prompt += "providing objective assessments."
+
+         # Output format instructions
+         system_prompt += "\nYou must respond in JSON format:\n"
+         system_prompt += """{
+     "decision": <your judgment - string|number|boolean>,
+     "reasoning": "<concise explanation of your judgment>",
+     "score": <numeric score if requested, otherwise null>
+ }"""
+         system_prompt += "\nDo not include any text in your response except for the JSON object."
+
+         # Build user message
+         user_content = PromptBuilder._build_user_prompt(
+             response=response,
+             criteria=criteria,
+             rubric=rubric,
+             scale=scale,
+             examples=examples,
+             is_comparison=is_comparison,
+             context=context,
+             **kwargs
+         )
+
+         return [
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": user_content}
+         ]
+
+     @staticmethod
+     def _build_user_prompt(
+         response: Union[str, Dict[str, str]],
+         criteria: str,
+         rubric: Optional[Union[str, Dict[Union[int, float], str]]],
+         scale: Optional[Tuple[int, int]],
+         examples: Optional[List[Dict[str, Any]]],
+         is_comparison: bool,
+         context: Optional[str] = None,
+         **kwargs
+     ) -> str:
+         """Build the user message content."""
+         parts = []
+
+         # Task description
+         if is_comparison:
+             parts.append(f"Compare these two responses based on: {criteria}")
+             if context:
+                 parts.append(f"\nContext: {context}")
+             parts.append(f"\nResponse A:\n{response['a']}")
+             parts.append(f"\nResponse B:\n{response['b']}")
+         else:
+             parts.append(f"Evaluate the following response based on: {criteria}")
+             if context:
+                 parts.append(f"\nContext: {context}")
+             parts.append(f"\nResponse to evaluate:\n{response}")
+
+         # Add scale and rubric
+         if scale:
+             parts.append(f"\nProvide a score from {scale[0]} to {scale[1]}")
+
+             if isinstance(rubric, dict):
+                 parts.append("\nScoring guide:")
+                 # Sort by score in descending order
+                 sorted_items = sorted(rubric.items(), key=lambda x: float(x[0]), reverse=True)
+                 for score, description in sorted_items:
+                     parts.append(f"- {score}: {description}")
+             elif rubric:
+                 parts.append(f"\nEvaluation guide: {rubric}")
+         elif rubric:
+             parts.append(f"\nEvaluation guide: {rubric}")
+
+         # Add examples if provided
+         if examples:
+             parts.append("\nExample evaluations:")
+             for i, ex in enumerate(examples, 1):
+                 parts.append(f"\nExample {i}:")
+
+                 # Handle different example formats
+                 if "response" in ex:
+                     parts.append(f"Response: {ex['response']}")
+                 elif "text" in ex:
+                     parts.append(f"Text: {ex['text']}")
+
+                 if "decision" in ex:
+                     parts.append(f"Decision: {ex['decision']}")
+                 if "score" in ex:
+                     parts.append(f"Score: {ex['score']}")
+
+                 if "reasoning" in ex:
+                     parts.append(f"Reasoning: {ex['reasoning']}")
+
+         # Add any additional instructions
+         if kwargs.get("additional_instructions"):
+             parts.append(f"\nAdditional instructions: {kwargs['additional_instructions']}")
+
+         # Output format instructions
+         parts.append("\nYou must respond in JSON format:")
+         parts.append("""{
+     "decision": <your judgment - string|number|boolean>,
+     "reasoning": "<concise explanation of your judgment>",
+     "score": <numeric score if requested, otherwise null>
+ }""")
+
+         return "\n".join(parts)
+
+     @staticmethod
+     def format_messages_as_text(messages: List[Dict[str, str]]) -> str:
+         """
+         Format chat messages as plain text for completion API.
+
+         Args:
+             messages: List of chat messages
+
+         Returns:
+             Formatted text prompt
+         """
+         parts = []
+
+         for message in messages:
+             role = message["role"]
+             content = message["content"]
+
+             if role == "system":
+                 parts.append(f"System: {content}")
+             elif role == "user":
+                 parts.append(f"\nUser: {content}")
+             elif role == "assistant":
+                 parts.append(f"\nAssistant: {content}")
+
+         # Add a prompt for the assistant to respond
+         parts.append("\nAssistant:")
+
+         return "\n".join(parts)
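For orientation, here is a minimal usage sketch of `PromptBuilder` as added above; the criteria, rubric, and response texts are illustrative values, not part of the package:

```python
from vllm_judge.prompts import PromptBuilder

# Single-response evaluation: a numeric scale plus a dict rubric keyed by score.
messages = PromptBuilder.build_messages(
    response="The Earth orbits around the Sun.",
    criteria="scientific accuracy",
    rubric={1: "factually wrong", 5: "fully accurate"},
    scale=(1, 5),
)
# `messages` is [{"role": "system", ...}, {"role": "user", ...}]; the system
# message pins the judge to the JSON schema {"decision", "reasoning", "score"}.

# Passing a dict with "a" and "b" keys switches to pairwise comparison mode.
comparison = PromptBuilder.build_messages(
    response={"a": "First answer...", "b": "Second answer..."},
    criteria="clarity and completeness",
)

# For completion-style (non-chat) endpoints, flatten the messages to one prompt.
print(PromptBuilder.format_messages_as_text(messages))
```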
vllm_judge/templating.py ADDED
@@ -0,0 +1,206 @@
+ import string
+ from typing import Dict, Any, List, Union, Set, Optional
+ from vllm_judge.models import TemplateEngine
+ from vllm_judge.exceptions import InvalidInputError
+
+
+ class TemplateProcessor:
+     """Template processing for dynamic prompts.
+     Handles template variable substitution."""
+
+     @staticmethod
+     def apply_template(
+         template: Optional[Union[str, Dict]],
+         template_vars: Dict[str, Any],
+         engine: TemplateEngine = TemplateEngine.FORMAT,
+         strict: bool = True
+     ) -> Optional[Union[str, Dict]]:
+         """
+         Apply template variables to a template string or dict.
+
+         Args:
+             template: Template string, dict, or None
+             template_vars: Variables to substitute
+             engine: Template engine to use
+             strict: If True, raise error for missing variables
+
+         Returns:
+             Processed template
+
+         Raises:
+             InvalidInputError: If required variables are missing
+         """
+         if isinstance(template, dict):
+             # Process dict values recursively
+             return {
+                 k: TemplateProcessor.apply_template(v, template_vars, engine, strict)
+                 for k, v in template.items()
+             }
+
+         if not isinstance(template, str):
+             return template
+
+         if engine == TemplateEngine.FORMAT:
+             return TemplateProcessor._apply_format_template(
+                 template, template_vars, strict
+             )
+         elif engine == TemplateEngine.JINJA2:
+             return TemplateProcessor._apply_jinja2_template(
+                 template, template_vars, strict
+             )
+
+     @staticmethod
+     def _apply_format_template(
+         template: str,
+         template_vars: Dict[str, Any],
+         strict: bool
+     ) -> str:
+         """Apply str.format() style template."""
+         try:
+             # First check for missing variables if strict
+             if strict:
+                 missing = TemplateProcessor.get_required_vars_format(template) - set(template_vars.keys())
+                 if missing:
+                     raise InvalidInputError(
+                         f"Missing required template variables: {', '.join(sorted(missing))}"
+                     )
+
+             return template.format(**template_vars)
+         except KeyError as e:
+             if strict:
+                 raise InvalidInputError(f"Missing template variable: {e}")
+             else:
+                 # Partial formatting - leave missing variables as-is
+                 return template.format_map(SafeDict(template_vars))
+
+     @staticmethod
+     def _apply_jinja2_template(
+         template: str,
+         template_vars: Dict[str, Any],
+         strict: bool
+     ) -> str:
+         """Apply Jinja2 template."""
+         try:
+             from jinja2 import Template, Environment, StrictUndefined, UndefinedError
+         except ImportError:
+             raise ImportError(
+                 "Jinja2 is required for jinja2 template engine. "
+                 "Install with: pip install vllm-judge[jinja2]"
+             )
+
+         try:
+             if strict:
+                 # Use StrictUndefined to catch missing variables
+                 env = Environment(undefined=StrictUndefined)
+                 jinja_template = env.from_string(template)
+             else:
+                 # Default behavior - missing variables render as empty
+                 jinja_template = Template(template)
+
+             return jinja_template.render(**template_vars)
+         except UndefinedError as e:
+             raise InvalidInputError(f"Missing template variable in Jinja2 template: {e}")
+
+     @staticmethod
+     def get_required_vars(
+         template: Union[str, Dict, None],
+         engine: TemplateEngine = TemplateEngine.FORMAT
+     ) -> Set[str]:
+         """
+         Extract required variables from a template.
+
+         Args:
+             template: Template to analyze
+             engine: Template engine being used
+
+         Returns:
+             Set of required variable names
+         """
+         if isinstance(template, dict):
+             # Collect from all dict values
+             all_vars = set()
+             for v in template.values():
+                 all_vars.update(TemplateProcessor.get_required_vars(v, engine))
+             return all_vars
+
+         if not isinstance(template, str):
+             return set()
+
+         if engine == TemplateEngine.FORMAT:
+             return TemplateProcessor.get_required_vars_format(template)
+         elif engine == TemplateEngine.JINJA2:
+             return TemplateProcessor.get_required_vars_jinja2(template)
+
+     @staticmethod
+     def get_required_vars_format(template: str) -> Set[str]:
+         """Extract variables from format string."""
+         formatter = string.Formatter()
+         variables = set()
+
+         try:
+             for _, field_name, _, _ in formatter.parse(template):
+                 if field_name:
+                     # Handle nested fields like {user.name}
+                     base_var = field_name.split('.')[0].split('[')[0]
+                     variables.add(base_var)
+         except Exception:
+             pass  # If parsing fails, return empty set
+
+         return variables
+
+     @staticmethod
+     def get_required_vars_jinja2(template: str) -> Set[str]:
+         """Extract variables from Jinja2 template."""
+         try:
+             from jinja2 import Environment, meta
+         except ImportError:
+             return set()  # Can't analyze without Jinja2
+
+         try:
+             env = Environment()
+             ast = env.parse(template)
+             return meta.find_undeclared_variables(ast)
+         except Exception:
+             return set()
+
+     @staticmethod
+     def validate_template_vars(
+         provided_vars: Dict[str, Any],
+         required_vars: List[str],
+         template_defaults: Optional[Dict[str, Any]] = None
+     ) -> Dict[str, Any]:
+         """
+         Validate and merge template variables.
+
+         Args:
+             provided_vars: User-provided variables
+             required_vars: Required variable names
+             template_defaults: Default values
+
+         Returns:
+             Merged template variables
+
+         Raises:
+             InvalidInputError: If required variables are missing
+         """
+         # Start with defaults
+         final_vars = dict(template_defaults or {})
+
+         # Override with provided vars
+         final_vars.update(provided_vars)
+
+         # Check required vars
+         missing = set(required_vars) - set(final_vars.keys())
+         if missing:
+             raise InvalidInputError(
+                 f"Missing required template variables: {', '.join(sorted(missing))}"
+             )
+
+         return final_vars
+
+
+ class SafeDict(dict):
+     """Dictionary that returns {key} for missing keys in format strings."""
+
+     def __missing__(self, key):
+         return f"{{{key}}}"
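Similarly, a short sketch of how `TemplateProcessor` behaves in strict versus non-strict mode; the template string and variables are illustrative:

```python
from vllm_judge.exceptions import InvalidInputError
from vllm_judge.models import TemplateEngine
from vllm_judge.templating import TemplateProcessor

template = "Evaluate this {doc_type} for {audience}"

# Introspect which variables the template requires.
required = TemplateProcessor.get_required_vars(template, engine=TemplateEngine.FORMAT)
assert required == {"doc_type", "audience"}

# Strict mode: a missing variable raises InvalidInputError up front.
try:
    TemplateProcessor.apply_template(template, {"doc_type": "essay"}, strict=True)
except InvalidInputError as e:
    print(e)  # Missing required template variables: audience

# Non-strict mode: SafeDict leaves unresolved placeholders intact,
# so the string can be formatted again in a later pass.
partial = TemplateProcessor.apply_template(template, {"doc_type": "essay"}, strict=False)
assert partial == "Evaluate this essay for {audience}"
```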
vllm_judge-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,124 @@
+ Metadata-Version: 2.4
+ Name: vllm_judge
+ Version: 0.1.0
+ Summary: LLM-as-a-Judge evaluations for vLLM hosted models
+ Author: TrustyAI team
+ Author-email: Sai Chandra Pandraju <saichandrapandraju@gmail.com>
+ Project-URL: Homepage, https://github.com/saichandrapandraju/vllm_judge
+ Project-URL: Repository, https://github.com/saichandrapandraju/vllm_judge
+ Project-URL: Issues, https://github.com/saichandrapandraju/vllm_judge/issues
+ Keywords: llm,evaluation,vllm,judge,ai,machine-learning,nlp,llm-evaluation,llm-as-judge
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: httpx>=0.24.0
+ Requires-Dist: pydantic>=2.0.0
+ Requires-Dist: tenacity>=8.0.0
+ Requires-Dist: click>=8.0.0
+ Provides-Extra: api
+ Requires-Dist: fastapi>=0.100.0; extra == "api"
+ Requires-Dist: uvicorn[standard]>=0.22.0; extra == "api"
+ Requires-Dist: websockets>=11.0; extra == "api"
+ Provides-Extra: jinja2
+ Requires-Dist: jinja2>=3.0.0; extra == "jinja2"
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+ Requires-Dist: black>=23.0.0; extra == "dev"
+ Requires-Dist: isort>=5.12.0; extra == "dev"
+ Requires-Dist: flake8>=6.0.0; extra == "dev"
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
+ Provides-Extra: test
+ Requires-Dist: pytest>=7.0.0; extra == "test"
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
+ Requires-Dist: pytest-cov>=4.0.0; extra == "test"
+ Requires-Dist: pytest-mock>=3.10.0; extra == "test"
+ Provides-Extra: docs
+ Requires-Dist: mkdocs>=1.5.0; extra == "docs"
+ Requires-Dist: mkdocs-material>=9.0.0; extra == "docs"
+ Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "docs"
+
+ # vLLM Judge
+
+ A lightweight library for LLM-as-a-Judge evaluations using vLLM-hosted models.
+
+ ## Features
+
+ - 🚀 **Simple Interface**: Single `evaluate()` method that adapts to any use case
+ - 🎯 **Pre-built Metrics**: 20+ ready-to-use evaluation metrics
+ - 🔧 **Template Support**: Dynamic evaluations with template variables
+ - ⚡ **High Performance**: Optimized for vLLM with automatic batching
+ - 🌐 **API Mode**: Run as a REST API service
+ - 🔄 **Async Native**: Built for high-throughput evaluations
+
+ ## Installation
+
+ ```bash
+ # Basic installation
+ pip install vllm_judge
+
+ # With API support
+ pip install vllm_judge[api]
+
+ # With Jinja2 template support
+ pip install vllm_judge[jinja2]
+
+ # Everything
+ pip install vllm_judge[api,jinja2]
+ ```
+
+ ## Quick Start
+
+ ```python
+ from vllm_judge import Judge
+
+ # Initialize with vLLM URL
+ judge = await Judge.from_url("http://localhost:8000")
+
+ # Simple evaluation
+ result = await judge.evaluate(
+     response="The Earth orbits around the Sun.",
+     criteria="scientific accuracy"
+ )
+ print(f"Decision: {result.decision}")
+ print(f"Reasoning: {result.reasoning}")
+
+ # Using pre-built metrics
+ from vllm_judge import CODE_QUALITY
+
+ result = await judge.evaluate(
+     response="def add(a, b): return a + b",
+     metric=CODE_QUALITY
+ )
+
+ # With template variables
+ result = await judge.evaluate(
+     response="Essay content here...",
+     criteria="Evaluate this {doc_type} for {audience}",
+     template_vars={
+         "doc_type": "essay",
+         "audience": "high school students"
+     }
+ )
+ ```
+
+ ## API Server
+
+ Run Judge as a REST API:
+
+ ```bash
+ vllm-judge serve --base-url http://localhost:8000 --port 9090 --host localhost
+ ```
+
+ Then use the HTTP API:
+
+ ```python
+ from vllm_judge.api import JudgeClient
+
+ client = JudgeClient("http://localhost:9090")
+ result = await client.evaluate(
+     response="Python is great!",
+     criteria="technical accuracy"
+ )
+ ```
+
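The README's Quick Start ties back to `prompts.py`: `PromptBuilder` pins the judge model to a single JSON object, `{"decision": ..., "reasoning": ..., "score": ...}`, which is what `result.decision` and `result.reasoning` above are parsed from. A hedged sketch of reading a numeric score back, assuming `evaluate()` forwards the same `scale` tuple that `PromptBuilder.build_messages` accepts and that the result object exposes a `score` field:

```python
# Assumption: `scale` is forwarded to PromptBuilder.build_messages, and the
# parsed JSON reply populates a numeric `score` field on the result object.
result = await judge.evaluate(
    response="The Earth orbits around the Sun.",
    criteria="scientific accuracy",
    scale=(1, 10),
)
print(result.score)
```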
vllm_judge-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,19 @@
+ vllm_judge/__init__.py,sha256=Sx6sERXfksr1eubHxXj_uTiVrXmHXINoY9-nP20EiSg,2363
+ vllm_judge/batch.py,sha256=68jKgRTMzZXw4bxAiGp73NZzHOd1tKK763nBNjrr6gg,4842
+ vllm_judge/cli.py,sha256=KQtUt_L4u5TPrS8xoyiKYt_hQ_FiHtGcrkecGEtktI8,10685
+ vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
+ vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
+ vllm_judge/judge.py,sha256=y2qp18PVtobAyxqI246tEsju82W-OuGG4zXfajTEW-E,14101
+ vllm_judge/metrics.py,sha256=QeGzaERvfRKQTt4JfquL1rW72GSkWdJ2_Nw_Hf0zqjY,15685
+ vllm_judge/models.py,sha256=fbEUFPsY3xhv54WueWqEKvAgIcWTm-JO42N2-6k5LeM,7417
+ vllm_judge/prompts.py,sha256=jAsBdshCCdgGF3UUAM0Wbb6MN1AB2jgHh1NmtXLbyrc,6345
+ vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
+ vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
+ vllm_judge/api/client.py,sha256=mcpdH-9ko6aEh_JAybpPPVhHqlO3l5K-lTujTlkTw8c,11302
+ vllm_judge/api/models.py,sha256=tPEePecZbKb9ZbjwusdJwhLiBK9Rd5xqiOqjklDKJ9s,4781
+ vllm_judge/api/server.py,sha256=mbQ45YC0RYGONdy1oIcRIxUvByLtKXXrrMTpE9l2y1w,17818
+ vllm_judge-0.1.0.dist-info/METADATA,sha256=W0_-H1J-KEDOzAV8ZNgM6z8gkKxodsebmH3lBVR2jU4,3572
+ vllm_judge-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ vllm_judge-0.1.0.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
+ vllm_judge-0.1.0.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
+ vllm_judge-0.1.0.dist-info/RECORD,,
vllm_judge-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
vllm_judge-0.1.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ vllm-judge = vllm_judge.cli:main
vllm_judge-0.1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ vllm_judge