evaluator-service 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evaluator_service-0.1.0/.env.example +10 -0
- evaluator_service-0.1.0/LICENSE +21 -0
- evaluator_service-0.1.0/MANIFEST.in +5 -0
- evaluator_service-0.1.0/PKG-INFO +237 -0
- evaluator_service-0.1.0/README.md +197 -0
- evaluator_service-0.1.0/evaluator_service/__init__.py +49 -0
- evaluator_service-0.1.0/evaluator_service/api/__init__.py +4 -0
- evaluator_service-0.1.0/evaluator_service/api/routes.py +98 -0
- evaluator_service-0.1.0/evaluator_service/clients/__init__.py +5 -0
- evaluator_service-0.1.0/evaluator_service/clients/mongo_observability_client.py +87 -0
- evaluator_service-0.1.0/evaluator_service/clients/pepgnix_client.py +64 -0
- evaluator_service-0.1.0/evaluator_service/main.py +11 -0
- evaluator_service-0.1.0/evaluator_service/models/__init__.py +24 -0
- evaluator_service-0.1.0/evaluator_service/models/models.py +98 -0
- evaluator_service-0.1.0/evaluator_service/services/__init__.py +13 -0
- evaluator_service-0.1.0/evaluator_service/services/evaluator_service.py +136 -0
- evaluator_service-0.1.0/evaluator_service/services/llm_judge.py +60 -0
- evaluator_service-0.1.0/evaluator_service/services/orchestrator.py +86 -0
- evaluator_service-0.1.0/evaluator_service/services/winner_selector.py +58 -0
- evaluator_service-0.1.0/evaluator_service/utils/__init__.py +5 -0
- evaluator_service-0.1.0/evaluator_service/utils/response_formatter.py +45 -0
- evaluator_service-0.1.0/evaluator_service/utils/score_aggregator.py +31 -0
- evaluator_service-0.1.0/evaluator_service.egg-info/PKG-INFO +237 -0
- evaluator_service-0.1.0/evaluator_service.egg-info/SOURCES.txt +28 -0
- evaluator_service-0.1.0/evaluator_service.egg-info/dependency_links.txt +1 -0
- evaluator_service-0.1.0/evaluator_service.egg-info/entry_points.txt +2 -0
- evaluator_service-0.1.0/evaluator_service.egg-info/requires.txt +13 -0
- evaluator_service-0.1.0/evaluator_service.egg-info/top_level.txt +1 -0
- evaluator_service-0.1.0/pyproject.toml +86 -0
- evaluator_service-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Pepgnix LLM Service Configuration
|
|
2
|
+
PEPGNIX_SERVICE_URL=https://pepgnix-service.example.com/api/v1/llm
|
|
3
|
+
PEPGNIX_TEAM_ID=your-team-id
|
|
4
|
+
PEPGNIX_PROJECT_ID=your-project-id
|
|
5
|
+
PEPGNIX_API_KEY=your-pepgnix-api-key
|
|
6
|
+
|
|
7
|
+
# MongoDB Configuration (for observability)
|
|
8
|
+
MONGODB_CONNECTION_STRING=mongodb://localhost:27017
|
|
9
|
+
MONGODB_DATABASE_NAME=evaluator_service
|
|
10
|
+
MONGODB_COLLECTION_NAME=evaluation_traces
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 PepsiCo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: evaluator-service
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Multi-LLM Response Validation & Selection Framework with RAG metrics evaluation
|
|
5
|
+
Author-email: PepsiCo <tech@pepsico.com>
|
|
6
|
+
Maintainer-email: PepsiCo <tech@pepsico.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/pepsico/evaluator-service
|
|
9
|
+
Project-URL: Documentation, https://github.com/pepsico/evaluator-service#readme
|
|
10
|
+
Project-URL: Repository, https://github.com/pepsico/evaluator-service.git
|
|
11
|
+
Project-URL: Issues, https://github.com/pepsico/evaluator-service/issues
|
|
12
|
+
Keywords: llm,rag,evaluation,multi-llm,response-selection,observability
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Framework :: FastAPI
|
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: fastapi>=0.115.0
|
|
28
|
+
Requires-Dist: uvicorn[standard]>=0.32.0
|
|
29
|
+
Requires-Dist: pydantic>=2.9.2
|
|
30
|
+
Requires-Dist: httpx>=0.27.2
|
|
31
|
+
Requires-Dist: python-dotenv>=1.0.1
|
|
32
|
+
Requires-Dist: pymongo>=4.10.0
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
35
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
36
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
37
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
38
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
39
|
+
Dynamic: license-file
|
|
40
|
+
|
|
41
|
+
# Evaluator Service
|
|
42
|
+
|
|
43
|
+
Multi-LLM Response Validation & Selection Framework with RAG metrics evaluation.
|
|
44
|
+
|
|
45
|
+
## Features
|
|
46
|
+
|
|
47
|
+
- **Single-call Custom Evaluator**: Evaluates RAG metrics (faithfulness, context precision, context recall, relevance, hallucination risk) in a single LLM call
|
|
48
|
+
- **Score Aggregation**: Weighted scoring formula to combine multiple metrics into a final score
|
|
49
|
+
- **LLM-as-a-Judge**: Tie-breaking mechanism using LLM comparison when scores are close
|
|
50
|
+
- **Parallel Processing**: Evaluates multiple candidate responses concurrently
|
|
51
|
+
- **Observability**: MongoDB integration for storing evaluation traces
|
|
52
|
+
- **FastAPI**: RESTful API for easy integration
|
|
53
|
+
- **Extensible**: Pluggable architecture for different storage backends (MongoDB, Azure Blob, etc.)
|
|
54
|
+
|
|
55
|
+
## Installation
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install evaluator-service
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Configuration
|
|
62
|
+
|
|
63
|
+
Set the following environment variables:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
# Pepgnix LLM Service Configuration
|
|
67
|
+
PEPGNIX_SERVICE_URL=https://pepgnix-service.example.com/api/v1/llm
|
|
68
|
+
PEPGNIX_TEAM_ID=your-team-id
|
|
69
|
+
PEPGNIX_PROJECT_ID=your-project-id
|
|
70
|
+
PEPGNIX_API_KEY=your-pepgnix-api-key
|
|
71
|
+
|
|
72
|
+
# MongoDB Configuration (for observability)
|
|
73
|
+
MONGODB_CONNECTION_STRING=mongodb://localhost:27017
|
|
74
|
+
MONGODB_DATABASE_NAME=evaluator_service
|
|
75
|
+
MONGODB_COLLECTION_NAME=evaluation_traces
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Usage
|
|
79
|
+
|
|
80
|
+
### As a Library
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from evaluator_service import EvaluationOrchestrator, EvaluatorService, LLMJudge, WinnerSelector
|
|
84
|
+
from evaluator_service.clients import PepgnixClient, MongoObservabilityClient
|
|
85
|
+
from evaluator_service.models import EvalRequest, Candidate, ContextChunk
|
|
86
|
+
|
|
87
|
+
# Initialize clients
|
|
88
|
+
llm_client = PepgnixClient()
|
|
89
|
+
observability_client = MongoObservabilityClient()
|
|
90
|
+
|
|
91
|
+
# Initialize services
|
|
92
|
+
evaluator_service = EvaluatorService(llm_client)
|
|
93
|
+
llm_judge = LLMJudge(llm_client)
|
|
94
|
+
winner_selector = WinnerSelector(llm_judge)
|
|
95
|
+
orchestrator = EvaluationOrchestrator(evaluator_service, winner_selector, observability_client)
|
|
96
|
+
|
|
97
|
+
# Create evaluation request
|
|
98
|
+
request = EvalRequest(
|
|
99
|
+
request_id="req-123",
|
|
100
|
+
user_query="What was PepsiCo revenue in 2024?",
|
|
101
|
+
context_chunks=[
|
|
102
|
+
ContextChunk(
|
|
103
|
+
chunk_id="doc-001-chunk-04",
|
|
104
|
+
text="PepsiCo reported revenue of 91.8 billion USD in FY2024.",
|
|
105
|
+
retrieval_score=0.94
|
|
106
|
+
)
|
|
107
|
+
],
|
|
108
|
+
candidates=[
|
|
109
|
+
Candidate(model="gpt", response="PepsiCo reported revenue of 91.8 billion USD in FY2024."),
|
|
110
|
+
Candidate(model="claude", response="According to the annual report, PepsiCo reported total revenue of 91.8B for FY2024.")
|
|
111
|
+
]
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
# Run evaluation
|
|
115
|
+
result = orchestrator.evaluate(request)
|
|
116
|
+
print(f"Winner: {result.winner.model}, Score: {result.score}")
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### As a REST API
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
# Start the server
|
|
123
|
+
evaluator-service
|
|
124
|
+
|
|
125
|
+
# Or using python
|
|
126
|
+
python -m evaluator_service.main
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
The API will be available at `http://localhost:8080`
|
|
130
|
+
|
|
131
|
+
#### API Endpoint
|
|
132
|
+
|
|
133
|
+
**POST /api/v1/evaluate**
|
|
134
|
+
|
|
135
|
+
Request body:
|
|
136
|
+
```json
|
|
137
|
+
{
|
|
138
|
+
"request_id": "req-123",
|
|
139
|
+
"user_query": "What was PepsiCo revenue in 2024?",
|
|
140
|
+
"context_chunks": [
|
|
141
|
+
{
|
|
142
|
+
"chunk_id": "doc-001-chunk-04",
|
|
143
|
+
"text": "PepsiCo reported revenue of 91.8 billion USD in FY2024.",
|
|
144
|
+
"retrieval_score": 0.94
|
|
145
|
+
}
|
|
146
|
+
],
|
|
147
|
+
"candidates": [
|
|
148
|
+
{
|
|
149
|
+
"model": "gpt",
|
|
150
|
+
"response": "PepsiCo reported revenue of 91.8 billion USD in FY2024."
|
|
151
|
+
},
|
|
152
|
+
{
|
|
153
|
+
"model": "claude",
|
|
154
|
+
"response": "According to the annual report, PepsiCo reported total revenue of 91.8B for FY2024."
|
|
155
|
+
}
|
|
156
|
+
]
|
|
157
|
+
}
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Response:
|
|
161
|
+
```json
|
|
162
|
+
{
|
|
163
|
+
"request_id": "req-123",
|
|
164
|
+
"winner": {
|
|
165
|
+
"model": "claude",
|
|
166
|
+
"response": "According to the annual report, PepsiCo reported total revenue of 91.8B for FY2024.",
|
|
167
|
+
"score": 0.85,
|
|
168
|
+
"selection_method": "score_winner"
|
|
169
|
+
},
|
|
170
|
+
"all_scores": {
|
|
171
|
+
"gpt": {
|
|
172
|
+
"final": 0.82,
|
|
173
|
+
"faithfulness": 0.9,
|
|
174
|
+
"context_precision": 0.85,
|
|
175
|
+
"context_recall": 0.8,
|
|
176
|
+
"relevance": 0.95,
|
|
177
|
+
"hallucination_risk": 0.1
|
|
178
|
+
},
|
|
179
|
+
"claude": {
|
|
180
|
+
"final": 0.85,
|
|
181
|
+
"faithfulness": 0.95,
|
|
182
|
+
"context_precision": 0.9,
|
|
183
|
+
"context_recall": 0.85,
|
|
184
|
+
"relevance": 0.9,
|
|
185
|
+
"hallucination_risk": 0.05
|
|
186
|
+
}
|
|
187
|
+
},
|
|
188
|
+
"trace_id": "abc-123-def-456",
|
|
189
|
+
"evaluated_at": "2024-01-15T10:30:00Z",
|
|
190
|
+
"latency_ms": 2340
|
|
191
|
+
}
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Scoring Formula
|
|
195
|
+
|
|
196
|
+
The final score is calculated using the following weighted formula:
|
|
197
|
+
|
|
198
|
+
```
|
|
199
|
+
Final Score =
|
|
200
|
+
(0.35 × faithfulness)
|
|
201
|
+
+ (0.25 × context_recall)
|
|
202
|
+
+ (0.20 × relevance)
|
|
203
|
+
+ (0.20 × context_precision)
|
|
204
|
+
- (0.30 × hallucination_risk)
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## Tie-Breaking
|
|
208
|
+
|
|
209
|
+
When the difference between the top two scores is less than 0.05, the LLM Judge is invoked to compare the two answers based on:
|
|
210
|
+
- Accuracy
|
|
211
|
+
- Completeness
|
|
212
|
+
- Grounding
|
|
213
|
+
- Clarity
|
|
214
|
+
|
|
215
|
+
## Development
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
# Install development dependencies
|
|
219
|
+
pip install -e ".[dev]"
|
|
220
|
+
|
|
221
|
+
# Run tests
|
|
222
|
+
pytest
|
|
223
|
+
|
|
224
|
+
# Format code
|
|
225
|
+
black .
|
|
226
|
+
|
|
227
|
+
# Lint
|
|
228
|
+
ruff check .
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
## License
|
|
232
|
+
|
|
233
|
+
MIT License - see LICENSE file for details.
|
|
234
|
+
|
|
235
|
+
## Contributing
|
|
236
|
+
|
|
237
|
+
Contributions are welcome! Please open an issue or submit a pull request.
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
# Evaluator Service
|
|
2
|
+
|
|
3
|
+
Multi-LLM Response Validation & Selection Framework with RAG metrics evaluation.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Single-call Custom Evaluator**: Evaluates RAG metrics (faithfulness, context precision, context recall, relevance, hallucination risk) in a single LLM call
|
|
8
|
+
- **Score Aggregation**: Weighted scoring formula to combine multiple metrics into a final score
|
|
9
|
+
- **LLM-as-a-Judge**: Tie-breaking mechanism using LLM comparison when scores are close
|
|
10
|
+
- **Parallel Processing**: Evaluates multiple candidate responses concurrently
|
|
11
|
+
- **Observability**: MongoDB integration for storing evaluation traces
|
|
12
|
+
- **FastAPI**: RESTful API for easy integration
|
|
13
|
+
- **Extensible**: Pluggable architecture for different storage backends (MongoDB, Azure Blob, etc.)
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install evaluator-service
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Configuration
|
|
22
|
+
|
|
23
|
+
Set the following environment variables:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
# Pepgnix LLM Service Configuration
|
|
27
|
+
PEPGNIX_SERVICE_URL=https://pepgnix-service.example.com/api/v1/llm
|
|
28
|
+
PEPGNIX_TEAM_ID=your-team-id
|
|
29
|
+
PEPGNIX_PROJECT_ID=your-project-id
|
|
30
|
+
PEPGNIX_API_KEY=your-pepgnix-api-key
|
|
31
|
+
|
|
32
|
+
# MongoDB Configuration (for observability)
|
|
33
|
+
MONGODB_CONNECTION_STRING=mongodb://localhost:27017
|
|
34
|
+
MONGODB_DATABASE_NAME=evaluator_service
|
|
35
|
+
MONGODB_COLLECTION_NAME=evaluation_traces
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Usage
|
|
39
|
+
|
|
40
|
+
### As a Library
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from evaluator_service import EvaluationOrchestrator, EvaluatorService, LLMJudge, WinnerSelector
|
|
44
|
+
from evaluator_service.clients import PepgnixClient, MongoObservabilityClient
|
|
45
|
+
from evaluator_service.models import EvalRequest, Candidate, ContextChunk
|
|
46
|
+
|
|
47
|
+
# Initialize clients
|
|
48
|
+
llm_client = PepgnixClient()
|
|
49
|
+
observability_client = MongoObservabilityClient()
|
|
50
|
+
|
|
51
|
+
# Initialize services
|
|
52
|
+
evaluator_service = EvaluatorService(llm_client)
|
|
53
|
+
llm_judge = LLMJudge(llm_client)
|
|
54
|
+
winner_selector = WinnerSelector(llm_judge)
|
|
55
|
+
orchestrator = EvaluationOrchestrator(evaluator_service, winner_selector, observability_client)
|
|
56
|
+
|
|
57
|
+
# Create evaluation request
|
|
58
|
+
request = EvalRequest(
|
|
59
|
+
request_id="req-123",
|
|
60
|
+
user_query="What was PepsiCo revenue in 2024?",
|
|
61
|
+
context_chunks=[
|
|
62
|
+
ContextChunk(
|
|
63
|
+
chunk_id="doc-001-chunk-04",
|
|
64
|
+
text="PepsiCo reported revenue of 91.8 billion USD in FY2024.",
|
|
65
|
+
retrieval_score=0.94
|
|
66
|
+
)
|
|
67
|
+
],
|
|
68
|
+
candidates=[
|
|
69
|
+
Candidate(model="gpt", response="PepsiCo reported revenue of 91.8 billion USD in FY2024."),
|
|
70
|
+
Candidate(model="claude", response="According to the annual report, PepsiCo reported total revenue of 91.8B for FY2024.")
|
|
71
|
+
]
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
# Run evaluation
|
|
75
|
+
result = orchestrator.evaluate(request)
|
|
76
|
+
print(f"Winner: {result.winner.model}, Score: {result.score}")
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### As a REST API
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
# Start the server
|
|
83
|
+
evaluator-service
|
|
84
|
+
|
|
85
|
+
# Or using python
|
|
86
|
+
python -m evaluator_service.main
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
The API will be available at `http://localhost:8080`
|
|
90
|
+
|
|
91
|
+
#### API Endpoint
|
|
92
|
+
|
|
93
|
+
**POST /api/v1/evaluate**
|
|
94
|
+
|
|
95
|
+
Request body:
|
|
96
|
+
```json
|
|
97
|
+
{
|
|
98
|
+
"request_id": "req-123",
|
|
99
|
+
"user_query": "What was PepsiCo revenue in 2024?",
|
|
100
|
+
"context_chunks": [
|
|
101
|
+
{
|
|
102
|
+
"chunk_id": "doc-001-chunk-04",
|
|
103
|
+
"text": "PepsiCo reported revenue of 91.8 billion USD in FY2024.",
|
|
104
|
+
"retrieval_score": 0.94
|
|
105
|
+
}
|
|
106
|
+
],
|
|
107
|
+
"candidates": [
|
|
108
|
+
{
|
|
109
|
+
"model": "gpt",
|
|
110
|
+
"response": "PepsiCo reported revenue of 91.8 billion USD in FY2024."
|
|
111
|
+
},
|
|
112
|
+
{
|
|
113
|
+
"model": "claude",
|
|
114
|
+
"response": "According to the annual report, PepsiCo reported total revenue of 91.8B for FY2024."
|
|
115
|
+
}
|
|
116
|
+
]
|
|
117
|
+
}
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Response:
|
|
121
|
+
```json
|
|
122
|
+
{
|
|
123
|
+
"request_id": "req-123",
|
|
124
|
+
"winner": {
|
|
125
|
+
"model": "claude",
|
|
126
|
+
"response": "According to the annual report, PepsiCo reported total revenue of 91.8B for FY2024.",
|
|
127
|
+
"score": 0.85,
|
|
128
|
+
"selection_method": "score_winner"
|
|
129
|
+
},
|
|
130
|
+
"all_scores": {
|
|
131
|
+
"gpt": {
|
|
132
|
+
"final": 0.82,
|
|
133
|
+
"faithfulness": 0.9,
|
|
134
|
+
"context_precision": 0.85,
|
|
135
|
+
"context_recall": 0.8,
|
|
136
|
+
"relevance": 0.95,
|
|
137
|
+
"hallucination_risk": 0.1
|
|
138
|
+
},
|
|
139
|
+
"claude": {
|
|
140
|
+
"final": 0.85,
|
|
141
|
+
"faithfulness": 0.95,
|
|
142
|
+
"context_precision": 0.9,
|
|
143
|
+
"context_recall": 0.85,
|
|
144
|
+
"relevance": 0.9,
|
|
145
|
+
"hallucination_risk": 0.05
|
|
146
|
+
}
|
|
147
|
+
},
|
|
148
|
+
"trace_id": "abc-123-def-456",
|
|
149
|
+
"evaluated_at": "2024-01-15T10:30:00Z",
|
|
150
|
+
"latency_ms": 2340
|
|
151
|
+
}
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## Scoring Formula
|
|
155
|
+
|
|
156
|
+
The final score is calculated using the following weighted formula:
|
|
157
|
+
|
|
158
|
+
```
|
|
159
|
+
Final Score =
|
|
160
|
+
(0.35 × faithfulness)
|
|
161
|
+
+ (0.25 × context_recall)
|
|
162
|
+
+ (0.20 × relevance)
|
|
163
|
+
+ (0.20 × context_precision)
|
|
164
|
+
- (0.30 × hallucination_risk)
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Tie-Breaking
|
|
168
|
+
|
|
169
|
+
When the difference between the top two scores is less than 0.05, the LLM Judge is invoked to compare the two answers based on:
|
|
170
|
+
- Accuracy
|
|
171
|
+
- Completeness
|
|
172
|
+
- Grounding
|
|
173
|
+
- Clarity
|
|
174
|
+
|
|
175
|
+
## Development
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
# Install development dependencies
|
|
179
|
+
pip install -e ".[dev]"
|
|
180
|
+
|
|
181
|
+
# Run tests
|
|
182
|
+
pytest
|
|
183
|
+
|
|
184
|
+
# Format code
|
|
185
|
+
black .
|
|
186
|
+
|
|
187
|
+
# Lint
|
|
188
|
+
ruff check .
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## License
|
|
192
|
+
|
|
193
|
+
MIT License - see LICENSE file for details.
|
|
194
|
+
|
|
195
|
+
## Contributing
|
|
196
|
+
|
|
197
|
+
Contributions are welcome! Please open an issue or submit a pull request.
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# [AI-Generated] model: SWE-1.6 | tid: ff216a85
|
|
2
|
+
"""Evaluator Service - Multi-LLM Response Validation & Selection Framework"""
|
|
3
|
+
|
|
4
|
+
__version__ = "0.1.0"
|
|
5
|
+
|
|
6
|
+
from evaluator_service.api import app
|
|
7
|
+
from evaluator_service.models import (
|
|
8
|
+
ContextChunk,
|
|
9
|
+
Candidate,
|
|
10
|
+
EvalRequest,
|
|
11
|
+
EvalResult,
|
|
12
|
+
SelectionResult,
|
|
13
|
+
ScoreBreakdown,
|
|
14
|
+
WinnerInfo,
|
|
15
|
+
EvalResponse,
|
|
16
|
+
ErrorResponse,
|
|
17
|
+
)
|
|
18
|
+
from evaluator_service.clients import PepgnixClient, MongoObservabilityClient
|
|
19
|
+
from evaluator_service.services import (
|
|
20
|
+
EvaluatorService,
|
|
21
|
+
LLMJudge,
|
|
22
|
+
JudgeResult,
|
|
23
|
+
WinnerSelector,
|
|
24
|
+
EvaluationOrchestrator,
|
|
25
|
+
)
|
|
26
|
+
from evaluator_service.utils import ScoreAggregator, ResponseFormatter
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"__version__",
|
|
30
|
+
"app",
|
|
31
|
+
"ContextChunk",
|
|
32
|
+
"Candidate",
|
|
33
|
+
"EvalRequest",
|
|
34
|
+
"EvalResult",
|
|
35
|
+
"SelectionResult",
|
|
36
|
+
"ScoreBreakdown",
|
|
37
|
+
"WinnerInfo",
|
|
38
|
+
"EvalResponse",
|
|
39
|
+
"ErrorResponse",
|
|
40
|
+
"PepgnixClient",
|
|
41
|
+
"MongoObservabilityClient",
|
|
42
|
+
"EvaluatorService",
|
|
43
|
+
"LLMJudge",
|
|
44
|
+
"JudgeResult",
|
|
45
|
+
"WinnerSelector",
|
|
46
|
+
"EvaluationOrchestrator",
|
|
47
|
+
"ScoreAggregator",
|
|
48
|
+
"ResponseFormatter",
|
|
49
|
+
]
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# [AI-Generated] model: SWE-1.6 | tid: ff216a85
|
|
2
|
+
import logging
|
|
3
|
+
import time
|
|
4
|
+
import uuid
|
|
5
|
+
from fastapi import FastAPI, HTTPException, status
|
|
6
|
+
from fastapi.responses import JSONResponse
|
|
7
|
+
|
|
8
|
+
from evaluator_service.models import EvalRequest, ErrorResponse
|
|
9
|
+
from evaluator_service.clients import PepgnixClient, MongoObservabilityClient
|
|
10
|
+
from evaluator_service.services import EvaluatorService, LLMJudge, WinnerSelector, EvaluationOrchestrator
|
|
11
|
+
from evaluator_service.utils import ResponseFormatter, ScoreAggregator
|
|
12
|
+
|
|
13
|
+
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The title is passed to FastAPI as the application's display name.
# NOTE(review): version "2.1" differs from the package version (0.1.0);
# confirm whether this is meant to be an independent API version.
app = FastAPI(title="Evaluator Service", version="2.1")

# [AI-Generated] model: SWE-1.6 | tid: ff216a85
# Initialize dependencies
# All collaborators are built once at import time and shared by every request.
# Any failure is logged with its traceback and re-raised so the module fails
# to import instead of starting half-configured.
try:
    llm_client = PepgnixClient()
    evaluator_service = EvaluatorService(llm_client)
    llm_judge = LLMJudge(llm_client)
    winner_selector = WinnerSelector(llm_judge)
    observability_client = MongoObservabilityClient()
    orchestrator = EvaluationOrchestrator(evaluator_service, winner_selector, observability_client)
    response_formatter = ResponseFormatter()
except Exception as e:
    # logger.exception records the traceback alongside the message.
    logger.exception("Failed to initialize service: %s", e)
    raise
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _invalid_request(message: str) -> HTTPException:
    """Build the 400 INVALID_REQUEST error used by the request validation."""
    return HTTPException(
        status_code=status.HTTP_400_BAD_REQUEST,
        detail=ErrorResponse(error="INVALID_REQUEST", message=message).model_dump(),
    )


@app.post("/api/v1/evaluate", status_code=status.HTTP_200_OK)
async def evaluate(request: EvalRequest):
    """Evaluate the candidate responses of *request* and return the formatted result.

    Args:
        request: Evaluation request; must carry at least one candidate and at
            least one context chunk.

    Returns:
        Whatever ``response_formatter.format`` produces for the selection
        result, per-candidate evaluation results and aggregated scores.

    Raises:
        HTTPException: 400 (``INVALID_REQUEST``) when ``candidates`` or
            ``context_chunks`` is empty; 503 (``EVALUATION_ERROR``) when any
            step of the evaluation raises.
    """
    start_time = time.time()
    trace_id = str(uuid.uuid4())

    # Validation
    if not request.candidates:
        raise _invalid_request("candidates[] must contain at least 1 item")

    if not request.context_chunks:
        raise _invalid_request("context_chunks[] must contain at least 1 item")

    # NOTE(review): the calls below are made without ``await`` inside an
    # ``async def`` handler; if they perform blocking I/O they will stall the
    # event loop for their duration. Consider a threadpool once the clients'
    # thread-safety is confirmed.
    try:
        # Run evaluation
        selection_result = orchestrator.evaluate(request)

        # Get eval results and scores for response formatting.
        # NOTE(review): the orchestrator presumably already evaluates every
        # candidate; re-evaluating here likely doubles the LLM calls and the
        # reported scores may differ from those used for selection - TODO
        # confirm and reuse the orchestrator's results if it exposes them.
        context = "\n\n".join(chunk.text for chunk in request.context_chunks)
        # Keyed by model name: candidates sharing a model name overwrite each
        # other (last one wins), exactly as in the original loop.
        eval_results = {
            candidate.model: evaluator_service.evaluate(
                request.user_query,
                context,
                candidate.response,
            )
            for candidate in request.candidates
        }

        scores = ScoreAggregator.aggregate(eval_results)

        # Format response
        return response_formatter.format(
            request_id=request.request_id,
            selection_result=selection_result,
            eval_results=eval_results,
            scores=scores,
            trace_id=trace_id,
            start_time=start_time,
        )

    except Exception as e:
        # logger.exception keeps the traceback; ``from e`` keeps the cause chain.
        logger.exception("Evaluation failed: %s", e)
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail=ErrorResponse(
                error="EVALUATION_ERROR",
                message=str(e),
            ).model_dump(),
        ) from e
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@app.get("/health")
async def health_check():
    """Health endpoint: unconditionally reports the service as healthy."""
    payload = {"status": "healthy"}
    return payload
|