vllm-judge 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,124 @@
1
+ Metadata-Version: 2.4
2
+ Name: vllm_judge
3
+ Version: 0.1.0
4
+ Summary: LLM-as-a-Judge evaluations for vLLM hosted models
5
+ Author: TrustyAI team
6
+ Author-email: Sai Chandra Pandraju <saichandrapandraju@gmail.com>
7
+ Project-URL: Homepage, https://github.com/saichandrapandraju/vllm_judge
8
+ Project-URL: Repository, https://github.com/saichandrapandraju/vllm_judge
9
+ Project-URL: Issues, https://github.com/saichandrapandraju/vllm_judge/issues
10
+ Keywords: llm,evaluation,vllm,judge,ai,machine-learning,nlp,llm-evaluation,llm-as-judge
11
+ Requires-Python: >=3.8
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: httpx>=0.24.0
14
+ Requires-Dist: pydantic>=2.0.0
15
+ Requires-Dist: tenacity>=8.0.0
16
+ Requires-Dist: click>=8.0.0
17
+ Provides-Extra: api
18
+ Requires-Dist: fastapi>=0.100.0; extra == "api"
19
+ Requires-Dist: uvicorn[standard]>=0.22.0; extra == "api"
20
+ Requires-Dist: websockets>=11.0; extra == "api"
21
+ Provides-Extra: jinja2
22
+ Requires-Dist: jinja2>=3.0.0; extra == "jinja2"
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
25
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
26
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
27
+ Requires-Dist: black>=23.0.0; extra == "dev"
28
+ Requires-Dist: isort>=5.12.0; extra == "dev"
29
+ Requires-Dist: flake8>=6.0.0; extra == "dev"
30
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
31
+ Provides-Extra: test
32
+ Requires-Dist: pytest>=7.0.0; extra == "test"
33
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
34
+ Requires-Dist: pytest-cov>=4.0.0; extra == "test"
35
+ Requires-Dist: pytest-mock>=3.10.0; extra == "test"
36
+ Provides-Extra: docs
37
+ Requires-Dist: mkdocs>=1.5.0; extra == "docs"
38
+ Requires-Dist: mkdocs-material>=9.0.0; extra == "docs"
39
+ Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "docs"
40
+
41
+ # vLLM Judge
42
+
43
+ A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models.
44
+
45
+ ## Features
46
+
47
+ - 🚀 **Simple Interface**: Single `evaluate()` method that adapts to any use case
48
+ - 🎯 **Pre-built Metrics**: 20+ ready-to-use evaluation metrics
49
+ - 🔧 **Template Support**: Dynamic evaluations with template variables
50
+ - ⚡ **High Performance**: Optimized for vLLM with automatic batching
51
+ - 🌐 **API Mode**: Run as a REST API service
52
+ - 🔄 **Async Native**: Built for high-throughput evaluations
53
+
54
+ ## Installation
55
+
56
+ ```bash
57
+ # Basic installation
58
+ pip install vllm_judge
59
+
60
+ # With API support
61
+ pip install vllm_judge[api]
62
+
63
+ # With Jinja2 template support
64
+ pip install vllm_judge[jinja2]
65
+
66
+ # All optional runtime extras (API support and Jinja2 templates)
67
+ pip install vllm_judge[api,jinja2]
68
+ ```
69
+
70
+ ## Quick Start
71
+
72
+ ```python
73
+ from vllm_judge import Judge
74
+
75
+ # Initialize with the vLLM server URL (run this inside an async function: `await` is not valid at the top level of a script)
76
+ judge = await Judge.from_url("http://localhost:8000")
77
+
78
+ # Simple evaluation
79
+ result = await judge.evaluate(
80
+ response="The Earth orbits around the Sun.",
81
+ criteria="scientific accuracy"
82
+ )
83
+ print(f"Decision: {result.decision}")
84
+ print(f"Reasoning: {result.reasoning}")
85
+
86
+ # Using pre-built metrics
87
+ from vllm_judge import CODE_QUALITY
88
+
89
+ result = await judge.evaluate(
90
+ response="def add(a, b): return a + b",
91
+ metric=CODE_QUALITY
92
+ )
93
+
94
+ # With template variables
95
+ result = await judge.evaluate(
96
+ response="Essay content here...",
97
+ criteria="Evaluate this {doc_type} for {audience}",
98
+ template_vars={
99
+ "doc_type": "essay",
100
+ "audience": "high school students"
101
+ }
102
+ )
103
+ ```
104
+
105
+ ## API Server
106
+
107
+ Run Judge as a REST API:
108
+
109
+ ```bash
110
+ vllm-judge serve --base-url http://localhost:8000 --port 9090 --host localhost
111
+ ```
112
+
113
+ Then call the HTTP API with the bundled Python client (from inside an async function):
114
+
115
+ ```python
116
+ from vllm_judge.api import JudgeClient
117
+
118
+ client = JudgeClient("http://localhost:9090")
119
+ result = await client.evaluate(
120
+ response="Python is great!",
121
+ criteria="technical accuracy"
122
+ )
123
+ ```
124
+
@@ -0,0 +1,84 @@
1
+ # vLLM Judge
2
+
3
+ A lightweight library for LLM-as-a-Judge evaluations using vLLM hosted models.
4
+
5
+ ## Features
6
+
7
+ - 🚀 **Simple Interface**: Single `evaluate()` method that adapts to any use case
8
+ - 🎯 **Pre-built Metrics**: 20+ ready-to-use evaluation metrics
9
+ - 🔧 **Template Support**: Dynamic evaluations with template variables
10
+ - ⚡ **High Performance**: Optimized for vLLM with automatic batching
11
+ - 🌐 **API Mode**: Run as a REST API service
12
+ - 🔄 **Async Native**: Built for high-throughput evaluations
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ # Basic installation
18
+ pip install vllm_judge
19
+
20
+ # With API support
21
+ pip install vllm_judge[api]
22
+
23
+ # With Jinja2 template support
24
+ pip install vllm_judge[jinja2]
25
+
26
+ # All optional runtime extras (API support and Jinja2 templates)
27
+ pip install vllm_judge[api,jinja2]
28
+ ```
29
+
30
+ ## Quick Start
31
+
32
+ ```python
33
+ from vllm_judge import Judge
34
+
35
+ # Initialize with the vLLM server URL (run this inside an async function: `await` is not valid at the top level of a script)
36
+ judge = await Judge.from_url("http://localhost:8000")
37
+
38
+ # Simple evaluation
39
+ result = await judge.evaluate(
40
+ response="The Earth orbits around the Sun.",
41
+ criteria="scientific accuracy"
42
+ )
43
+ print(f"Decision: {result.decision}")
44
+ print(f"Reasoning: {result.reasoning}")
45
+
46
+ # Using pre-built metrics
47
+ from vllm_judge import CODE_QUALITY
48
+
49
+ result = await judge.evaluate(
50
+ response="def add(a, b): return a + b",
51
+ metric=CODE_QUALITY
52
+ )
53
+
54
+ # With template variables
55
+ result = await judge.evaluate(
56
+ response="Essay content here...",
57
+ criteria="Evaluate this {doc_type} for {audience}",
58
+ template_vars={
59
+ "doc_type": "essay",
60
+ "audience": "high school students"
61
+ }
62
+ )
63
+ ```
64
+
65
+ ## API Server
66
+
67
+ Run Judge as a REST API:
68
+
69
+ ```bash
70
+ vllm-judge serve --base-url http://localhost:8000 --port 9090 --host localhost
71
+ ```
72
+
73
+ Then call the HTTP API with the bundled Python client (from inside an async function):
74
+
75
+ ```python
76
+ from vllm_judge.api import JudgeClient
77
+
78
+ client = JudgeClient("http://localhost:9090")
79
+ result = await client.evaluate(
80
+ response="Python is great!",
81
+ criteria="technical accuracy"
82
+ )
83
+ ```
84
+
@@ -0,0 +1,161 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "vllm_judge"
7
+ version = "0.1.0"
8
+ description = "LLM-as-a-Judge evaluations for vLLM hosted models"
9
+ readme = "README.md"
10
+ authors = [
11
+ {name = "Sai Chandra Pandraju", email = "saichandrapandraju@gmail.com"},
12
+ {name = "TrustyAI team"}
13
+ ]
14
+
15
+ keywords = [
16
+ "llm",
17
+ "evaluation",
18
+ "vllm",
19
+ "judge",
20
+ "ai",
21
+ "machine-learning",
22
+ "nlp",
23
+ "llm-evaluation",
24
+ "llm-as-judge"
25
+ ]
26
+ requires-python = ">=3.8"
27
+ dependencies = [
28
+ "httpx>=0.24.0",
29
+ "pydantic>=2.0.0",
30
+ "tenacity>=8.0.0",
31
+ "click>=8.0.0",
32
+ ]
33
+
34
+ [project.optional-dependencies]
35
+ api = [
36
+ "fastapi>=0.100.0",
37
+ "uvicorn[standard]>=0.22.0",
38
+ "websockets>=11.0",
39
+ ]
40
+ jinja2 = [
41
+ "jinja2>=3.0.0",
42
+ ]
43
+ dev = [
44
+ "pytest>=7.0.0",
45
+ "pytest-asyncio>=0.21.0",
46
+ "pytest-cov>=4.0.0",
47
+ "black>=23.0.0",
48
+ "isort>=5.12.0",
49
+ "flake8>=6.0.0",
50
+ "mypy>=1.0.0",
51
+ ]
52
+ test = [
53
+ "pytest>=7.0.0",
54
+ "pytest-asyncio>=0.21.0",
55
+ "pytest-cov>=4.0.0",
56
+ "pytest-mock>=3.10.0",
57
+ ]
58
+ docs = [
59
+ "mkdocs>=1.5.0",
60
+ "mkdocs-material>=9.0.0",
61
+ "mkdocstrings[python]>=0.24.0",
62
+ ]
63
+
64
+ [project.scripts]
65
+ vllm-judge = "vllm_judge.cli:main"
66
+
67
+ [project.urls]
68
+ Homepage = "https://github.com/saichandrapandraju/vllm_judge"
69
+ Repository = "https://github.com/saichandrapandraju/vllm_judge"
70
+ Issues = "https://github.com/saichandrapandraju/vllm_judge/issues"
71
+
72
+ [tool.setuptools.packages.find]
73
+ where = ["src"]
74
+
75
+ [tool.setuptools.package-data]
76
+ vllm_judge = ["py.typed"]
77
+
78
+ [tool.black]
79
+ line-length = 88
80
+ target-version = ['py38', 'py39', 'py310', 'py311', 'py312']
81
+ include = '\.pyi?$'
82
+ exclude = '''
83
+ /(
84
+ \.eggs
85
+ | \.git
86
+ | \.hg
87
+ | \.mypy_cache
88
+ | \.tox
89
+ | \.venv
90
+ | _build
91
+ | buck-out
92
+ | build
93
+ | dist
94
+ )/
95
+ '''
96
+
97
+ [tool.isort]
98
+ profile = "black"
99
+ line_length = 88
100
+
101
+ [tool.pytest.ini_options]
102
+ testpaths = ["tests"]
103
+ python_files = ["test_*.py", "*_test.py"]
104
+ python_classes = ["Test*"]
105
+ python_functions = ["test_*"]
106
+ asyncio_mode = "auto"
107
+ addopts = [
108
+ "--verbose",
109
+ "--cov=vllm_judge",
110
+ "--cov-report=term-missing",
111
+ "--cov-report=html",
112
+ "--cov-report=xml",
113
+ ]
114
+
115
+ [tool.mypy]
116
+ python_version = "3.8"
117
+ warn_return_any = true
118
+ warn_unused_configs = true
119
+ disallow_untyped_defs = true
120
+ disallow_incomplete_defs = true
121
+ check_untyped_defs = true
122
+ disallow_untyped_decorators = true
123
+ no_implicit_optional = true
124
+ warn_redundant_casts = true
125
+ warn_unused_ignores = true
126
+ warn_no_return = true
127
+ warn_unreachable = true
128
+ strict_equality = true
129
+
130
+ [tool.coverage.run]
131
+ source = ["src/vllm_judge"]
132
+ omit = ["*/tests/*", "*/test_*"]
133
+
134
+ [tool.coverage.report]
135
+ precision = 2
136
+ show_missing = true
137
+ skip_covered = false
138
+
139
+ [tool.ruff]
140
+ target-version = "py38"
141
+ line-length = 88
142
+ select = [
143
+ "E", # pycodestyle errors
144
+ "W", # pycodestyle warnings
145
+ "F", # pyflakes
146
+ "I", # isort
147
+ "B", # flake8-bugbear
148
+ "C4", # flake8-comprehensions
149
+ "UP", # pyupgrade
150
+ ]
151
+ ignore = [
152
+ "E501", # line too long, handled by black
153
+ "B008", # do not perform function calls in argument defaults
154
+ "C901", # too complex
155
+ ]
156
+
157
+ [tool.ruff.per-file-ignores]
158
+ "__init__.py" = ["F401"]
159
+
160
+ [tool.ruff.isort]
161
+ known-third-party = ["httpx", "pydantic", "tenacity"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,120 @@
1
+ """
2
+ vLLM Judge - LLM-as-a-Judge evaluations for vLLM hosted models.
3
+
4
+ A lightweight library for evaluating text responses using self-hosted language models
5
+ via vLLM's OpenAI-compatible API.
6
+ """
7
+
8
+ __version__ = "0.1.0"
9
+
10
+ from vllm_judge.judge import Judge
11
+ from vllm_judge.models import (
12
+ JudgeConfig,
13
+ EvaluationResult,
14
+ Metric,
15
+ BatchResult,
16
+ TemplateEngine
17
+ )
18
+ from vllm_judge.templating import TemplateProcessor
19
+ from vllm_judge.metrics import (
20
+ # General metrics
21
+ HELPFULNESS,
22
+ ACCURACY,
23
+ CLARITY,
24
+ CONCISENESS,
25
+ RELEVANCE,
26
+
27
+ # Safety metrics
28
+ SAFETY,
29
+ TOXICITY,
30
+
31
+ # Code metrics
32
+ CODE_QUALITY,
33
+ CODE_SECURITY,
34
+
35
+ # Content metrics
36
+ CREATIVITY,
37
+ PROFESSIONALISM,
38
+ EDUCATIONAL_VALUE,
39
+
40
+ # Comparison metrics
41
+ PREFERENCE,
42
+
43
+ # Binary metrics
44
+ APPROPRIATE,
45
+ FACTUAL,
46
+
47
+ # Domain metrics
48
+ MEDICAL_ACCURACY,
49
+ LEGAL_APPROPRIATENESS,
50
+
51
+ # Utility
52
+ BUILTIN_METRICS,
53
+
54
+ # Template metrics
55
+ EDUCATIONAL_CONTENT_TEMPLATE,
56
+ CODE_REVIEW_TEMPLATE,
57
+ CUSTOMER_SERVICE_TEMPLATE,
58
+ WRITING_QUALITY_TEMPLATE,
59
+ PRODUCT_REVIEW_TEMPLATE,
60
+ MEDICAL_INFO_TEMPLATE,
61
+ API_DOCS_TEMPLATE,
62
+
63
+ )
64
+ from vllm_judge.exceptions import (
65
+ VLLMJudgeError,
66
+ ConfigurationError,
67
+ ConnectionError,
68
+ TimeoutError,
69
+ ParseError,
70
+ MetricNotFoundError,
71
+ InvalidInputError,
72
+ RetryExhaustedError
73
+ )
74
+
75
+ __all__ = [
76
+ # Main classes
77
+ "Judge",
78
+ "JudgeConfig",
79
+ "EvaluationResult",
80
+ "Metric",
81
+ "BatchResult",
82
+ "TemplateEngine",
83
+ "TemplateProcessor",
84
+
85
+ # Metrics
86
+ "HELPFULNESS",
87
+ "ACCURACY",
88
+ "CLARITY",
89
+ "CONCISENESS",
90
+ "RELEVANCE",
91
+ "SAFETY",
92
+ "TOXICITY",
93
+ "CODE_QUALITY",
94
+ "CODE_SECURITY",
95
+ "CREATIVITY",
96
+ "PROFESSIONALISM",
97
+ "EDUCATIONAL_VALUE",
98
+ "PREFERENCE",
99
+ "APPROPRIATE",
100
+ "FACTUAL",
101
+ "MEDICAL_ACCURACY",
102
+ "LEGAL_APPROPRIATENESS",
103
+ "BUILTIN_METRICS",
104
+ "EDUCATIONAL_CONTENT_TEMPLATE",
105
+ "CODE_REVIEW_TEMPLATE",
106
+ "CUSTOMER_SERVICE_TEMPLATE",
107
+ "WRITING_QUALITY_TEMPLATE",
108
+ "PRODUCT_REVIEW_TEMPLATE",
109
+ "MEDICAL_INFO_TEMPLATE",
110
+ "API_DOCS_TEMPLATE",
111
+ # Exceptions
112
+ "VLLMJudgeError",
113
+ "ConfigurationError",
114
+ "ConnectionError",
115
+ "TimeoutError",
116
+ "ParseError",
117
+ "MetricNotFoundError",
118
+ "InvalidInputError",
119
+ "RetryExhaustedError"
120
+ ]
@@ -0,0 +1,39 @@
1
+ """
2
+ API module for vLLM Judge.
3
+ """
4
+ from vllm_judge.api.server import app, create_app, start_server
5
+ from vllm_judge.api.client import JudgeClient
6
+ from vllm_judge.api.models import (
7
+ EvaluateRequest,
8
+ BatchEvaluateRequest,
9
+ AsyncBatchRequest,
10
+ EvaluationResponse,
11
+ BatchResponse,
12
+ AsyncBatchResponse,
13
+ JobStatusResponse,
14
+ MetricInfo,
15
+ HealthResponse,
16
+ ErrorResponse
17
+ )
18
+
19
+ __all__ = [
20
+ # Server
21
+ "app",
22
+ "create_app",
23
+ "start_server",
24
+
25
+ # Client
26
+ "JudgeClient",
27
+
28
+ # Models
29
+ "EvaluateRequest",
30
+ "BatchEvaluateRequest",
31
+ "AsyncBatchRequest",
32
+ "EvaluationResponse",
33
+ "BatchResponse",
34
+ "AsyncBatchResponse",
35
+ "JobStatusResponse",
36
+ "MetricInfo",
37
+ "HealthResponse",
38
+ "ErrorResponse"
39
+ ]