vllm-judge 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vllm_judge/__init__.py CHANGED
@@ -5,7 +5,7 @@ A lightweight library for evaluating text responses using self-hosted language m
  via vLLM's OpenAI-compatible API.
  """
 
- __version__ = "0.1.5"
+ __version__ = "0.1.6"
 
  from vllm_judge.judge import Judge
  from vllm_judge.models import (
@@ -17,7 +17,7 @@ from vllm_judge.models import (
  ModelSpecificMetric
  )
  from vllm_judge.templating import TemplateProcessor
- from vllm_judge.metrics import (
+ from vllm_judge.builtin_metrics import (
  # General metrics
  HELPFULNESS,
  ACCURACY,
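
The import change in this file comes from an internal module rename in 0.1.6: vllm_judge.metrics becomes vllm_judge.builtin_metrics (and, later in this diff, prompts becomes prompt_builder and utils becomes parsers). A minimal sketch of imports after the rename, assuming the package root keeps re-exporting the built-in metrics as the hunk above shows:

    # Hedged sketch, not part of the diff: import paths after the 0.1.6 rename.
    from vllm_judge import HELPFULNESS, ACCURACY             # re-exported via __init__.py
    from vllm_judge.builtin_metrics import BUILTIN_METRICS   # was vllm_judge.metrics in 0.1.5
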
vllm_judge/api/__init__.py CHANGED
@@ -1,6 +1,3 @@
- """
- API module for vLLM Judge.
- """
  from vllm_judge.api.server import app, create_app, start_server
  from vllm_judge.api.client import JudgeClient
  from vllm_judge.api.models import (
vllm_judge/api/client.py CHANGED
@@ -1,6 +1,3 @@
- """
- HTTP client for vLLM Judge API.
- """
  import asyncio
  from typing import Union, Dict, List, Optional, Tuple, Any, AsyncIterator
  import httpx
vllm_judge/api/server.py CHANGED
@@ -1,7 +1,3 @@
- """
- FastAPI server for vLLM Judge API.
- """
- import asyncio
  import time
  import uuid
  from datetime import datetime
@@ -14,7 +10,7 @@ import uvicorn
 
  from vllm_judge.judge import Judge
  from vllm_judge.models import EvaluationResult, JudgeConfig
- from vllm_judge.metrics import BUILTIN_METRICS
+ from vllm_judge.builtin_metrics import BUILTIN_METRICS
  from vllm_judge.exceptions import VLLMJudgeError
  from vllm_judge.api.models import (
  EvaluateRequest,
vllm_judge/batch.py CHANGED
@@ -17,7 +17,8 @@ class BatchProcessor:
  max_concurrent: Maximum concurrent requests
  """
  self.judge = judge
- self.semaphore = asyncio.Semaphore(max_concurrent)
+ self.max_concurrent = max_concurrent
+ self.semaphore = asyncio.Semaphore(self.max_concurrent)
  self.progress_lock = asyncio.Lock()
  self.completed = 0
 
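The change above stores the concurrency cap on the processor before building its semaphore. As a rough sketch of the general pattern (not the library's actual code), an asyncio.Semaphore bounds how many evaluations are in flight at once:

    # Hedged sketch of semaphore-bounded concurrency, analogous to BatchProcessor.
    import asyncio

    async def _bounded(sem: asyncio.Semaphore, coro):
        async with sem:              # at most max_concurrent coroutines proceed at once
            return await coro

    async def run_all(coros, max_concurrent: int = 50):
        sem = asyncio.Semaphore(max_concurrent)
        return await asyncio.gather(*(_bounded(sem, c) for c in coros))
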
vllm_judge/metrics.py → vllm_judge/builtin_metrics.py RENAMED
@@ -1,6 +1,6 @@
  from typing import Dict
  from vllm_judge.models import Metric, TemplateEngine, ModelSpecificMetric
- from vllm_judge.utils import parse_llama_guard_3
+ from vllm_judge.parsers import parse_llama_guard_3
 
  # Registry for built-in metrics
  BUILTIN_METRICS: Dict[str, Metric] = {}
vllm_judge/cli.py CHANGED
@@ -1,6 +1,3 @@
- """
- Command-line interface for vLLM Judge.
- """
  import asyncio
  import json
  import sys
@@ -8,10 +5,9 @@ from typing import Optional
  import click
 
  from vllm_judge import Judge
- from vllm_judge.models import JudgeConfig
  from vllm_judge.api.server import start_server as start_api_server
  from vllm_judge.api.client import JudgeClient
- from vllm_judge.metrics import BUILTIN_METRICS
+ from vllm_judge.builtin_metrics import BUILTIN_METRICS
 
 
  @click.group()
vllm_judge/client.py CHANGED
@@ -126,12 +126,8 @@ class VLLMClient:
  "messages": messages,
  "temperature": self.config.temperature,
  "max_tokens": self.config.max_tokens,
- # "top_p": self.config.top_p,
  }
-
- # # Request JSON response format if supported
- # if self.config.temperature < 0.2: # Only for low temperature
- # request_data["response_format"] = {"type": "json_object"}
+
 
  try:
  response = await self._request_with_retry(
@@ -172,7 +168,6 @@ class VLLMClient:
  "prompt": prompt,
  "temperature": self.config.temperature,
  "max_tokens": self.config.max_tokens,
- # "top_p": self.config.top_p,
  }
 
  try:
vllm_judge/judge.py CHANGED
@@ -4,9 +4,9 @@ from typing import Union, Dict, List, Optional, Tuple, Any, Callable
 
  from vllm_judge.models import JudgeConfig, EvaluationResult, Metric, BatchResult, TemplateEngine, ModelSpecificMetric
  from vllm_judge.client import VLLMClient
- from vllm_judge.prompts import PromptBuilder
+ from vllm_judge.prompt_builder import PromptBuilder
  from vllm_judge.batch import BatchProcessor
- from vllm_judge.metrics import BUILTIN_METRICS
+ from vllm_judge.builtin_metrics import BUILTIN_METRICS
  from vllm_judge.templating import TemplateProcessor
  from vllm_judge.exceptions import (
  ParseError,
vllm_judge/models.py CHANGED
@@ -59,7 +59,6 @@ class JudgeConfig(BaseModel):
  # Model parameters
  temperature: float = Field(0.0, description="Sampling temperature")
  max_tokens: int = Field(256, description="Maximum tokens in response")
- # top_p: float = Field(0.95, description="Top-p sampling")
 
  # Batch settings
  max_concurrent: int = Field(50, description="Maximum concurrent requests")
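
For orientation, these are the model-parameter fields that remain on JudgeConfig in this hunk (0.1.6 drops the commented-out top_p). A hedged construction sketch follows; the endpoint and model field names are assumptions not shown in this diff:

    # Hedged sketch: only temperature/max_tokens/max_concurrent are visible in this hunk.
    # base_url and model are assumed field names for illustration, not taken from the diff.
    from vllm_judge.models import JudgeConfig

    config = JudgeConfig(
        base_url="http://localhost:8000/v1",  # assumption, not in this diff
        model="my-judge-model",               # assumption, not in this diff
        temperature=0.0,                      # default per this hunk
        max_tokens=256,                       # default per this hunk
        max_concurrent=50,                    # default per this hunk
    )
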
vllm_judge/prompts.py → vllm_judge/prompt_builder.py RENAMED
@@ -1,5 +1,5 @@
  from typing import List, Dict, Union, Optional, Tuple, Any
-
+ import json
 
  class PromptBuilder:
  """Builds prompts for evaluation requests."""
@@ -35,24 +35,36 @@ class PromptBuilder:
  """
  # Detect evaluation type
  is_comparison = isinstance(content, dict) and "a" in content and "b" in content
+
+ output_format = """
+ # Output Format:
+
+ The JSON object MUST have exactly these three fields:
+
+ 1. decision: (String | Boolean) This decision label should clearly state your main finding. This could be a string representing a specific class (eg., PASS, FAIL, CORRECT, INCORRECT, etc.) or a boolean value (true or false). If user provided a rubric, you should use the rubric to determine the decision label.
+ 2. score: (Number | null) A numerical score for the evaluation. If scoring is requested, provide the score as a number. If scoring is NOT requested or is not applicable for the specific task, you MUST use the value null for this field.
+ 3. reasoning: (String) A concise explanation justifying your decision and score (if a score was provided). This reasoning must directly and logically support your evaluation and refer to the specific evaluation criteria.
+
+ The JSON object MUST be well-formed and adhere strictly to the following structure:
+
+ {
+ "decision": <your judgment - string|boolean>,
+ "reasoning": <concise explanation of your judgment - string>,
+ "score": <numeric score if requested, otherwise null - number|null>
+ }
+ """
 
  # System message
  if not system_prompt:
- # TODO: Add more detailed system prompts
- system_prompt = "You are an impartial judge and expert evaluator "
- if is_comparison:
- system_prompt+="comparing responses objectively."
- else:
- system_prompt+="providing objective assessments."
-
- # Output format instructions
- system_prompt+="\nYou must respond in JSON format:\n"
- system_prompt+="""{
- "decision": <your judgment - string|boolean>,
- "reasoning": "<concise explanation of your judgment>",
- "score": <numeric score if requested, otherwise null>
- }"""
- system_prompt+="\nDo not include any text in your response except for the JSON object."
+ system_prompt = """You are an impartial judge and expert evaluator. Your task is to evaluate the provided content based on the specific evaluation criteria and rubric.
+ # Key Instructions:
+ 1. Your evaluation must be objective, consistent, and based solely on the specified criteria. Do not let your own opinions or biases interfere.
+ 2. Focus exclusively on quality assessment.
+ 3. Do not be influenced by the length of the responses unless response length is explicitly relevant to the specified evaluation criteria (e.g., a task assessing conciseness or verbosity).
+ 4. Your entire response MUST be a single, valid JSON object and nothing else. Do not include any text or conversational filler before or after this JSON object.
+
+ """
+ system_prompt += output_format
 
  # Build user message
  user_content = PromptBuilder._build_user_prompt(
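
For reference, a quick sketch of a reply that satisfies the output format introduced above: exactly the three required fields, with score set to null when no score is requested. The concrete values are illustrative only:

    import json

    # Illustrative judge reply matching the required three-field structure (values made up).
    raw = '{"decision": "PASS", "reasoning": "Meets the stated criteria.", "score": null}'
    result = json.loads(raw)
    assert set(result) == {"decision", "reasoning", "score"}
    assert result["score"] is None   # null score when scoring was not requested
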
@@ -93,30 +105,30 @@ class PromptBuilder:
  parts.append(f'"{input}"')
  parts.append("")
 
+ parts.append("## Content to evaluate:")
+ if is_comparison:
+ parts.append(f"**Response A:**\n{content['a']}")
+ parts.append(f"**Response B:**\n{content['b']}")
+ else:
+ parts.append(content)
+
+ parts.append("## Evaluation Criteria:")
+
  # Task description
  if is_comparison:
- if input:
- parts.append(f"Compare how well these two responses address the input for: {criteria}")
- else:
- parts.append(f"Compare these two responses based on: {criteria}")
+ parts.append(f"Compare the two responses based on: {criteria}")
  if context:
  parts.append(f"\nContext: {context}")
- parts.append(f"\nResponse A:\n{content['a']}")
- parts.append(f"\nResponse B:\n{content['b']}")
  else:
- if input:
- parts.append(f"Evaluate how well this content addresses the input for: {criteria}")
- else:
- parts.append(f"Evaluate the following content based on: {criteria}")
+ parts.append(f"Evaluate the content based on: {criteria}")
  if context:
  parts.append(f"\nContext: {context}")
- parts.append(f"\nContent to evaluate:\n{content}")
 
- parts.append(f"\nYou must return a decision label/class (your judgement) for the `decision` field and a concise explanation for the `reasoning` field.")
+ parts.append(f"\nYou must return a decision label/class (your main judgement) for the `decision` field and a concise explanation for the `reasoning` field in the JSON object.")
 
  # Add scale and rubric
  if scale:
- parts.append(f"\nIn addition to these, provide a score from {scale[0]} to {scale[1]}")
+ parts.append(f"In addition to these, provide a score from {scale[0]} to {scale[1]}")
 
  if isinstance(rubric, dict):
  parts.append("\nScoring guide:")
@@ -127,14 +139,15 @@ class PromptBuilder:
  elif rubric:
  parts.append(f"\nEvaluation guide: {rubric}")
  elif rubric:
+ parts.append("\nIn addition to these, provide a score if required by the following evaluation guide.")
  parts.append(f"\nEvaluation guide: {rubric}")
 
  # Add examples if provided
  if examples:
  parts.append("\nExample evaluations:")
- for i, ex in enumerate(examples, 1):
- parts.append(f"\nExample {i}:")
-
+ for i, ex in enumerate(examples):
+ parts.append(f"Example {i+1}:")
+ parts.append("Request:")
  # Handle different example formats
  if "input" in ex:
  parts.append(f"Input: {ex['input']}")
@@ -143,17 +156,24 @@ class PromptBuilder:
  elif "text" in ex:
  parts.append(f"Text: {ex['text']}")
 
- if "decision" in ex:
- parts.append(f"Decision: {ex['decision']}")
+ parts.append("Response:")
+
+ response = {}
+ if "decision" not in ex or ex["decision"] is None or ex["decision"] == "":
+ raise ValueError("Example must include a decision field")
+
+ response["decision"] = ex["decision"]
  if "score" in ex:
- parts.append(f"Score: {ex['score']}")
+ response["score"] = ex["score"]
 
  if "reasoning" in ex:
- parts.append(f"Reasoning: {ex['reasoning']}")
+ response["reasoning"] = ex["reasoning"]
+
+ parts.append(json.dumps(response))
 
  # Add any additional instructions
  if kwargs.get("additional_instructions"):
- parts.append(f"\nAdditional instructions: {kwargs['additional_instructions']}")
+ parts.append(f"Additional instructions: {kwargs['additional_instructions']}")
 
  # Output format instructions
  parts.append("\nYou must respond in JSON format:")
vllm_judge-0.1.5.dist-info/METADATA → vllm_judge-0.1.6.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vllm_judge
- Version: 0.1.5
+ Version: 0.1.6
  Summary: LLM-as-a-Judge evaluations for vLLM hosted models
  Author: TrustyAI team
  Author-email: Sai Chandra Pandraju <saichandrapandraju@gmail.com>
vllm_judge-0.1.5.dist-info/RECORD → vllm_judge-0.1.6.dist-info/RECORD RENAMED
@@ -0,0 +1,20 @@
+ vllm_judge/__init__.py,sha256=bl6j1TXcPFsNcOKpFGX6FrkS0dikhApOKjhsBOaXm80,2800
+ vllm_judge/batch.py,sha256=GJe6d2nsUWCxcSG-j5xnyovfKAM-YklWS0PNAwTMO9s,4886
+ vllm_judge/builtin_metrics.py,sha256=XAhn5a-kJgip4NYkaTmkwiIWXjYGRcHoztAmqjmDO9A,48711
+ vllm_judge/cli.py,sha256=3075NrduxYz_iITQ0ZnqdjK0jJ9vGpzC6B_23lAN3wc,13598
+ vllm_judge/client.py,sha256=x3LBRUjnOmX0iEWdRqz-ALzb03qezZ92aMpdMFzHRcs,8096
+ vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
+ vllm_judge/judge.py,sha256=X3oLXfWjmIOay5oDWBQNoEnxyDlF0sPf69HBjieW1Ug,16954
+ vllm_judge/models.py,sha256=wN2JGddWAxT4EXhmfl3IjBYOpDG_9lGP125UWP4IKTw,7935
+ vllm_judge/parsers.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
+ vllm_judge/prompt_builder.py,sha256=miQU_mKDKkTuRfVEiQT2LfN4QEvnphLu39s2YldOvCA,8754
+ vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
+ vllm_judge/api/__init__.py,sha256=yUqAIcqpBDUKoq8ZLcKPQaX45oesy3Nmb2yEwy-dHyU,727
+ vllm_judge/api/client.py,sha256=RgbhzRLlOR8gia8_-Kbe2_wQC4tjNPzqObPz2GPP5ec,12409
+ vllm_judge/api/models.py,sha256=GXj3slwytJWg5M4f5MPZ8Ft_hrkEEAZh0qgpYDy-Qe4,5102
+ vllm_judge/api/server.py,sha256=B97RVVeVHLxf69_bSZBvP69DbTQhoFW2tZOBBS0ahrQ,17838
+ vllm_judge-0.1.6.dist-info/METADATA,sha256=MRGak20XswQG2-Qq_iFCIUNqZcWfMOZsSA8GRWMj6ak,4251
+ vllm_judge-0.1.6.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+ vllm_judge-0.1.6.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
+ vllm_judge-0.1.6.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
+ vllm_judge-0.1.6.dist-info/RECORD,,
@@ -1,20 +0,0 @@
- vllm_judge/__init__.py,sha256=6OKo_RbNOov83pZIPfg12ITxiE6UZh2_UOTjQsgWbFY,2792
- vllm_judge/batch.py,sha256=3zkatZxQESCjYz99qfLhxl2Dq2tHAfhtdTiXxjVqUxE,4836
- vllm_judge/cli.py,sha256=tnMqJ2RvCFaXUY4ok4IO-d9IRNJhEck60AJNzdCaqhg,13679
- vllm_judge/client.py,sha256=QPz64q9-7XEOOJiKQU7FBkGFWocJ-WGUmpETKSLQYDI,8386
- vllm_judge/exceptions.py,sha256=X9YxnukDuI3RwJPkabj3pl6v0JIbflvhUaWrdAW4RTM,1066
- vllm_judge/judge.py,sha256=SDT_cGDZzHu8NOjG6eqHQsYqIuXR12j7ocpyrVDhHrQ,16939
- vllm_judge/metrics.py,sha256=WwtR6Bb4cc0gDplhZnysNzD1EfOMCEzFc8-3hJMqnJs,48709
- vllm_judge/models.py,sha256=o4OdRtRdsz9n5RhHrz-uA9ylG0cGQg99NJYay0RaeDE,7998
- vllm_judge/prompts.py,sha256=KC8AfiIgKKxQuhT1bnnyYXrSBbcU2-RnkSLqDJfrt8o,7251
- vllm_judge/templating.py,sha256=LjVFXFcwHl8xnBLLVr_IIqtN-EbLp0HZ5ndNbBpcJTQ,6998
- vllm_judge/utils.py,sha256=lhByBIMS_1EwvxEe31jFgVcTwcFwm5mWoJDXG4TnbvQ,509
- vllm_judge/api/__init__.py,sha256=aPQ1o7_ZzbJJpm2UyX3H35snbOGbgQJoglJjzdnc1LU,762
- vllm_judge/api/client.py,sha256=l46IpQHJxmbDfXpyCOXfir70c_3hPaIr6OEiOzOMk5Q,12449
- vllm_judge/api/models.py,sha256=GXj3slwytJWg5M4f5MPZ8Ft_hrkEEAZh0qgpYDy-Qe4,5102
- vllm_judge/api/server.py,sha256=1UQMV6MRdlqHS6NYdrQI41bi_wNb0QC8RZD4jCEeTkU,17888
- vllm_judge-0.1.5.dist-info/METADATA,sha256=5UXUqyckWp9fGLQXcBxkI6ejmFfWpCjjpyIeMx96zTI,4251
- vllm_judge-0.1.5.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
- vllm_judge-0.1.5.dist-info/entry_points.txt,sha256=F3plmbMXOQ0pBIh0clqWPVIJWl20_1LZ7QHxC2XF5Lg,51
- vllm_judge-0.1.5.dist-info/top_level.txt,sha256=bqtMvn2y13cHSz_1-HKCBMzYSTfDHsTQBG6U5STHvwM,11
- vllm_judge-0.1.5.dist-info/RECORD,,
File without changes