evaluator-service 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. evaluator_service-0.1.0/.env.example +10 -0
  2. evaluator_service-0.1.0/LICENSE +21 -0
  3. evaluator_service-0.1.0/MANIFEST.in +5 -0
  4. evaluator_service-0.1.0/PKG-INFO +237 -0
  5. evaluator_service-0.1.0/README.md +197 -0
  6. evaluator_service-0.1.0/evaluator_service/__init__.py +49 -0
  7. evaluator_service-0.1.0/evaluator_service/api/__init__.py +4 -0
  8. evaluator_service-0.1.0/evaluator_service/api/routes.py +98 -0
  9. evaluator_service-0.1.0/evaluator_service/clients/__init__.py +5 -0
  10. evaluator_service-0.1.0/evaluator_service/clients/mongo_observability_client.py +87 -0
  11. evaluator_service-0.1.0/evaluator_service/clients/pepgnix_client.py +64 -0
  12. evaluator_service-0.1.0/evaluator_service/main.py +11 -0
  13. evaluator_service-0.1.0/evaluator_service/models/__init__.py +24 -0
  14. evaluator_service-0.1.0/evaluator_service/models/models.py +98 -0
  15. evaluator_service-0.1.0/evaluator_service/services/__init__.py +13 -0
  16. evaluator_service-0.1.0/evaluator_service/services/evaluator_service.py +136 -0
  17. evaluator_service-0.1.0/evaluator_service/services/llm_judge.py +60 -0
  18. evaluator_service-0.1.0/evaluator_service/services/orchestrator.py +86 -0
  19. evaluator_service-0.1.0/evaluator_service/services/winner_selector.py +58 -0
  20. evaluator_service-0.1.0/evaluator_service/utils/__init__.py +5 -0
  21. evaluator_service-0.1.0/evaluator_service/utils/response_formatter.py +45 -0
  22. evaluator_service-0.1.0/evaluator_service/utils/score_aggregator.py +31 -0
  23. evaluator_service-0.1.0/evaluator_service.egg-info/PKG-INFO +237 -0
  24. evaluator_service-0.1.0/evaluator_service.egg-info/SOURCES.txt +28 -0
  25. evaluator_service-0.1.0/evaluator_service.egg-info/dependency_links.txt +1 -0
  26. evaluator_service-0.1.0/evaluator_service.egg-info/entry_points.txt +2 -0
  27. evaluator_service-0.1.0/evaluator_service.egg-info/requires.txt +13 -0
  28. evaluator_service-0.1.0/evaluator_service.egg-info/top_level.txt +1 -0
  29. evaluator_service-0.1.0/pyproject.toml +86 -0
  30. evaluator_service-0.1.0/setup.cfg +4 -0
@@ -0,0 +1,10 @@
1
+ # Pepgnix LLM Service Configuration
2
+ PEPGNIX_SERVICE_URL=https://pepgnix-service.example.com/api/v1/llm
3
+ PEPGNIX_TEAM_ID=your-team-id
4
+ PEPGNIX_PROJECT_ID=your-project-id
5
+ PEPGNIX_API_KEY=your-pepgnix-api-key
6
+
7
+ # MongoDB Configuration (for observability)
8
+ MONGODB_CONNECTION_STRING=mongodb://localhost:27017
9
+ MONGODB_DATABASE_NAME=evaluator_service
10
+ MONGODB_COLLECTION_NAME=evaluation_traces
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 PepsiCo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,5 @@
1
+ include README.md
2
+ include LICENSE
3
+ include pyproject.toml
4
+ include .env.example
5
+ recursive-include evaluator_service *.py
@@ -0,0 +1,237 @@
1
+ Metadata-Version: 2.4
2
+ Name: evaluator-service
3
+ Version: 0.1.0
4
+ Summary: Multi-LLM Response Validation & Selection Framework with RAG metrics evaluation
5
+ Author-email: PepsiCo <tech@pepsico.com>
6
+ Maintainer-email: PepsiCo <tech@pepsico.com>
7
+ License: MIT
8
+ Project-URL: Homepage, https://github.com/pepsico/evaluator-service
9
+ Project-URL: Documentation, https://github.com/pepsico/evaluator-service#readme
10
+ Project-URL: Repository, https://github.com/pepsico/evaluator-service.git
11
+ Project-URL: Issues, https://github.com/pepsico/evaluator-service/issues
12
+ Keywords: llm,rag,evaluation,multi-llm,response-selection,observability
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.14
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Framework :: FastAPI
23
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
+ Requires-Python: >=3.10
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: fastapi>=0.115.0
28
+ Requires-Dist: uvicorn[standard]>=0.32.0
29
+ Requires-Dist: pydantic>=2.9.2
30
+ Requires-Dist: httpx>=0.27.2
31
+ Requires-Dist: python-dotenv>=1.0.1
32
+ Requires-Dist: pymongo>=4.10.0
33
+ Provides-Extra: dev
34
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
35
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
36
+ Requires-Dist: black>=23.0.0; extra == "dev"
37
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
38
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
39
+ Dynamic: license-file
40
+
41
+ # Evaluator Service
42
+
43
+ Multi-LLM Response Validation & Selection Framework with RAG metrics evaluation.
44
+
45
+ ## Features
46
+
47
+ - **Single-call Custom Evaluator**: Evaluates RAG metrics (faithfulness, context precision, context recall, relevance, hallucination risk) in a single LLM call
48
+ - **Score Aggregation**: Weighted scoring formula to combine multiple metrics into a final score
49
+ - **LLM-as-a-Judge**: Tie-breaking mechanism using LLM comparison when scores are close
50
+ - **Parallel Processing**: Evaluates multiple candidate responses concurrently
51
+ - **Observability**: MongoDB integration for storing evaluation traces
52
+ - **FastAPI**: RESTful API for easy integration
53
+ - **Extensible**: Pluggable architecture for different storage backends (MongoDB, Azure Blob, etc.)
54
+
55
+ ## Installation
56
+
57
+ ```bash
58
+ pip install evaluator-service
59
+ ```
60
+
61
+ ## Configuration
62
+
63
+ Set the following environment variables:
64
+
65
+ ```bash
66
+ # Pepgnix LLM Service Configuration
67
+ PEPGNIX_SERVICE_URL=https://pepgnix-service.example.com/api/v1/llm
68
+ PEPGNIX_TEAM_ID=your-team-id
69
+ PEPGNIX_PROJECT_ID=your-project-id
70
+ PEPGNIX_API_KEY=your-pepgnix-api-key
71
+
72
+ # MongoDB Configuration (for observability)
73
+ MONGODB_CONNECTION_STRING=mongodb://localhost:27017
74
+ MONGODB_DATABASE_NAME=evaluator_service
75
+ MONGODB_COLLECTION_NAME=evaluation_traces
76
+ ```
77
+
78
+ ## Usage
79
+
80
+ ### As a Library
81
+
82
+ ```python
83
+ from evaluator_service import EvaluationOrchestrator, EvaluatorService, LLMJudge, WinnerSelector
84
+ from evaluator_service.clients import PepgnixClient, MongoObservabilityClient
85
+ from evaluator_service.models import EvalRequest, Candidate, ContextChunk
86
+
87
+ # Initialize clients
88
+ llm_client = PepgnixClient()
89
+ observability_client = MongoObservabilityClient()
90
+
91
+ # Initialize services
92
+ evaluator_service = EvaluatorService(llm_client)
93
+ llm_judge = LLMJudge(llm_client)
94
+ winner_selector = WinnerSelector(llm_judge)
95
+ orchestrator = EvaluationOrchestrator(evaluator_service, winner_selector, observability_client)
96
+
97
+ # Create evaluation request
98
+ request = EvalRequest(
99
+ request_id="req-123",
100
+ user_query="What was PepsiCo revenue in 2024?",
101
+ context_chunks=[
102
+ ContextChunk(
103
+ chunk_id="doc-001-chunk-04",
104
+ text="PepsiCo reported revenue of 91.8 billion USD in FY2024.",
105
+ retrieval_score=0.94
106
+ )
107
+ ],
108
+ candidates=[
109
+ Candidate(model="gpt", response="PepsiCo reported revenue of 91.8 billion USD in FY2024."),
110
+ Candidate(model="claude", response="According to the annual report, PepsiCo reported total revenue of 91.8B for FY2024.")
111
+ ]
112
+ )
113
+
114
+ # Run evaluation
115
+ result = orchestrator.evaluate(request)
116
+ print(f"Winner: {result.winner.model}, Score: {result.score}")
117
+ ```
118
+
119
+ ### As a REST API
120
+
121
+ ```bash
122
+ # Start the server
123
+ evaluator-service
124
+
125
+ # Or using python
126
+ python -m evaluator_service.main
127
+ ```
128
+
129
+ The API will be available at `http://localhost:8080`
130
+
131
+ #### API Endpoint
132
+
133
+ **POST /api/v1/evaluate**
134
+
135
+ Request body:
136
+ ```json
137
+ {
138
+ "request_id": "req-123",
139
+ "user_query": "What was PepsiCo revenue in 2024?",
140
+ "context_chunks": [
141
+ {
142
+ "chunk_id": "doc-001-chunk-04",
143
+ "text": "PepsiCo reported revenue of 91.8 billion USD in FY2024.",
144
+ "retrieval_score": 0.94
145
+ }
146
+ ],
147
+ "candidates": [
148
+ {
149
+ "model": "gpt",
150
+ "response": "PepsiCo reported revenue of 91.8 billion USD in FY2024."
151
+ },
152
+ {
153
+ "model": "claude",
154
+ "response": "According to the annual report, PepsiCo reported total revenue of 91.8B for FY2024."
155
+ }
156
+ ]
157
+ }
158
+ ```
159
+
160
+ Response:
161
+ ```json
162
+ {
163
+ "request_id": "req-123",
164
+ "winner": {
165
+ "model": "claude",
166
+ "response": "According to the annual report, PepsiCo reported total revenue of 91.8B for FY2024.",
167
+ "score": 0.85,
168
+ "selection_method": "score_winner"
169
+ },
170
+ "all_scores": {
171
+ "gpt": {
172
+ "final": 0.82,
173
+ "faithfulness": 0.9,
174
+ "context_precision": 0.85,
175
+ "context_recall": 0.8,
176
+ "relevance": 0.95,
177
+ "hallucination_risk": 0.1
178
+ },
179
+ "claude": {
180
+ "final": 0.85,
181
+ "faithfulness": 0.95,
182
+ "context_precision": 0.9,
183
+ "context_recall": 0.85,
184
+ "relevance": 0.9,
185
+ "hallucination_risk": 0.05
186
+ }
187
+ },
188
+ "trace_id": "abc-123-def-456",
189
+ "evaluated_at": "2024-01-15T10:30:00Z",
190
+ "latency_ms": 2340
191
+ }
192
+ ```
193
+
194
+ ## Scoring Formula
195
+
196
+ The final score is calculated using the following weighted formula:
197
+
198
+ ```
199
+ Final Score =
200
+ (0.35 × faithfulness)
201
+ + (0.25 × context_recall)
202
+ + (0.20 × relevance)
203
+ + (0.20 × context_precision)
204
+ - (0.30 × hallucination_risk)
205
+ ```
206
+
207
+ ## Tie-Breaking
208
+
209
+ When the difference between the top two scores is less than 0.05, the LLM Judge is invoked to compare the two answers based on:
210
+ - Accuracy
211
+ - Completeness
212
+ - Grounding
213
+ - Clarity
214
+
215
+ ## Development
216
+
217
+ ```bash
218
+ # Install development dependencies
219
+ pip install -e ".[dev]"
220
+
221
+ # Run tests
222
+ pytest
223
+
224
+ # Format code
225
+ black .
226
+
227
+ # Lint
228
+ ruff check .
229
+ ```
230
+
231
+ ## License
232
+
233
+ MIT License - see LICENSE file for details.
234
+
235
+ ## Contributing
236
+
237
+ Contributions are welcome! Please open an issue or submit a pull request.
@@ -0,0 +1,197 @@
1
+ # Evaluator Service
2
+
3
+ Multi-LLM Response Validation & Selection Framework with RAG metrics evaluation.
4
+
5
+ ## Features
6
+
7
+ - **Single-call Custom Evaluator**: Evaluates RAG metrics (faithfulness, context precision, context recall, relevance, hallucination risk) in a single LLM call
8
+ - **Score Aggregation**: Weighted scoring formula to combine multiple metrics into a final score
9
+ - **LLM-as-a-Judge**: Tie-breaking mechanism using LLM comparison when scores are close
10
+ - **Parallel Processing**: Evaluates multiple candidate responses concurrently
11
+ - **Observability**: MongoDB integration for storing evaluation traces
12
+ - **FastAPI**: RESTful API for easy integration
13
+ - **Extensible**: Pluggable architecture for different storage backends (MongoDB, Azure Blob, etc.)
14
+
15
+ ## Installation
16
+
17
+ ```bash
18
+ pip install evaluator-service
19
+ ```
20
+
21
+ ## Configuration
22
+
23
+ Set the following environment variables:
24
+
25
+ ```bash
26
+ # Pepgnix LLM Service Configuration
27
+ PEPGNIX_SERVICE_URL=https://pepgnix-service.example.com/api/v1/llm
28
+ PEPGNIX_TEAM_ID=your-team-id
29
+ PEPGNIX_PROJECT_ID=your-project-id
30
+ PEPGNIX_API_KEY=your-pepgnix-api-key
31
+
32
+ # MongoDB Configuration (for observability)
33
+ MONGODB_CONNECTION_STRING=mongodb://localhost:27017
34
+ MONGODB_DATABASE_NAME=evaluator_service
35
+ MONGODB_COLLECTION_NAME=evaluation_traces
36
+ ```
37
+
38
+ ## Usage
39
+
40
+ ### As a Library
41
+
42
+ ```python
43
+ from evaluator_service import EvaluationOrchestrator, EvaluatorService, LLMJudge, WinnerSelector
44
+ from evaluator_service.clients import PepgnixClient, MongoObservabilityClient
45
+ from evaluator_service.models import EvalRequest, Candidate, ContextChunk
46
+
47
+ # Initialize clients
48
+ llm_client = PepgnixClient()
49
+ observability_client = MongoObservabilityClient()
50
+
51
+ # Initialize services
52
+ evaluator_service = EvaluatorService(llm_client)
53
+ llm_judge = LLMJudge(llm_client)
54
+ winner_selector = WinnerSelector(llm_judge)
55
+ orchestrator = EvaluationOrchestrator(evaluator_service, winner_selector, observability_client)
56
+
57
+ # Create evaluation request
58
+ request = EvalRequest(
59
+ request_id="req-123",
60
+ user_query="What was PepsiCo revenue in 2024?",
61
+ context_chunks=[
62
+ ContextChunk(
63
+ chunk_id="doc-001-chunk-04",
64
+ text="PepsiCo reported revenue of 91.8 billion USD in FY2024.",
65
+ retrieval_score=0.94
66
+ )
67
+ ],
68
+ candidates=[
69
+ Candidate(model="gpt", response="PepsiCo reported revenue of 91.8 billion USD in FY2024."),
70
+ Candidate(model="claude", response="According to the annual report, PepsiCo reported total revenue of 91.8B for FY2024.")
71
+ ]
72
+ )
73
+
74
+ # Run evaluation
75
+ result = orchestrator.evaluate(request)
76
+ print(f"Winner: {result.winner.model}, Score: {result.score}")
77
+ ```
78
+
79
+ ### As a REST API
80
+
81
+ ```bash
82
+ # Start the server
83
+ evaluator-service
84
+
85
+ # Or using python
86
+ python -m evaluator_service.main
87
+ ```
88
+
89
+ The API will be available at `http://localhost:8080`
90
+
91
+ #### API Endpoint
92
+
93
+ **POST /api/v1/evaluate**
94
+
95
+ Request body:
96
+ ```json
97
+ {
98
+ "request_id": "req-123",
99
+ "user_query": "What was PepsiCo revenue in 2024?",
100
+ "context_chunks": [
101
+ {
102
+ "chunk_id": "doc-001-chunk-04",
103
+ "text": "PepsiCo reported revenue of 91.8 billion USD in FY2024.",
104
+ "retrieval_score": 0.94
105
+ }
106
+ ],
107
+ "candidates": [
108
+ {
109
+ "model": "gpt",
110
+ "response": "PepsiCo reported revenue of 91.8 billion USD in FY2024."
111
+ },
112
+ {
113
+ "model": "claude",
114
+ "response": "According to the annual report, PepsiCo reported total revenue of 91.8B for FY2024."
115
+ }
116
+ ]
117
+ }
118
+ ```
119
+
120
+ Response:
121
+ ```json
122
+ {
123
+ "request_id": "req-123",
124
+ "winner": {
125
+ "model": "claude",
126
+ "response": "According to the annual report, PepsiCo reported total revenue of 91.8B for FY2024.",
127
+ "score": 0.85,
128
+ "selection_method": "score_winner"
129
+ },
130
+ "all_scores": {
131
+ "gpt": {
132
+ "final": 0.82,
133
+ "faithfulness": 0.9,
134
+ "context_precision": 0.85,
135
+ "context_recall": 0.8,
136
+ "relevance": 0.95,
137
+ "hallucination_risk": 0.1
138
+ },
139
+ "claude": {
140
+ "final": 0.85,
141
+ "faithfulness": 0.95,
142
+ "context_precision": 0.9,
143
+ "context_recall": 0.85,
144
+ "relevance": 0.9,
145
+ "hallucination_risk": 0.05
146
+ }
147
+ },
148
+ "trace_id": "abc-123-def-456",
149
+ "evaluated_at": "2024-01-15T10:30:00Z",
150
+ "latency_ms": 2340
151
+ }
152
+ ```
153
+
154
+ ## Scoring Formula
155
+
156
+ The final score is calculated using the following weighted formula:
157
+
158
+ ```
159
+ Final Score =
160
+ (0.35 × faithfulness)
161
+ + (0.25 × context_recall)
162
+ + (0.20 × relevance)
163
+ + (0.20 × context_precision)
164
+ - (0.30 × hallucination_risk)
165
+ ```
166
+
167
+ ## Tie-Breaking
168
+
169
+ When the difference between the top two scores is less than 0.05, the LLM Judge is invoked to compare the two answers based on:
170
+ - Accuracy
171
+ - Completeness
172
+ - Grounding
173
+ - Clarity
174
+
175
+ ## Development
176
+
177
+ ```bash
178
+ # Install development dependencies
179
+ pip install -e ".[dev]"
180
+
181
+ # Run tests
182
+ pytest
183
+
184
+ # Format code
185
+ black .
186
+
187
+ # Lint
188
+ ruff check .
189
+ ```
190
+
191
+ ## License
192
+
193
+ MIT License - see LICENSE file for details.
194
+
195
+ ## Contributing
196
+
197
+ Contributions are welcome! Please open an issue or submit a pull request.
@@ -0,0 +1,49 @@
1
+ # [AI-Generated] model: SWE-1.6 | tid: ff216a85
2
+ """Evaluator Service - Multi-LLM Response Validation & Selection Framework"""
3
+
4
+ __version__ = "0.1.0"
5
+
6
+ from evaluator_service.api import app
7
+ from evaluator_service.models import (
8
+ ContextChunk,
9
+ Candidate,
10
+ EvalRequest,
11
+ EvalResult,
12
+ SelectionResult,
13
+ ScoreBreakdown,
14
+ WinnerInfo,
15
+ EvalResponse,
16
+ ErrorResponse,
17
+ )
18
+ from evaluator_service.clients import PepgnixClient, MongoObservabilityClient
19
+ from evaluator_service.services import (
20
+ EvaluatorService,
21
+ LLMJudge,
22
+ JudgeResult,
23
+ WinnerSelector,
24
+ EvaluationOrchestrator,
25
+ )
26
+ from evaluator_service.utils import ScoreAggregator, ResponseFormatter
27
+
28
+ __all__ = [
29
+ "__version__",
30
+ "app",
31
+ "ContextChunk",
32
+ "Candidate",
33
+ "EvalRequest",
34
+ "EvalResult",
35
+ "SelectionResult",
36
+ "ScoreBreakdown",
37
+ "WinnerInfo",
38
+ "EvalResponse",
39
+ "ErrorResponse",
40
+ "PepgnixClient",
41
+ "MongoObservabilityClient",
42
+ "EvaluatorService",
43
+ "LLMJudge",
44
+ "JudgeResult",
45
+ "WinnerSelector",
46
+ "EvaluationOrchestrator",
47
+ "ScoreAggregator",
48
+ "ResponseFormatter",
49
+ ]
@@ -0,0 +1,4 @@
1
+ # [AI-Generated] model: SWE-1.6 | tid: ff216a85
2
+ from evaluator_service.api.routes import app
3
+
4
+ __all__ = ["app"]
@@ -0,0 +1,98 @@
1
+ # [AI-Generated] model: SWE-1.6 | tid: ff216a85
2
+ import logging
3
+ import time
4
+ import uuid
5
+ from fastapi import FastAPI, HTTPException, status
6
+ from fastapi.responses import JSONResponse
7
+
8
+ from evaluator_service.models import EvalRequest, ErrorResponse
9
+ from evaluator_service.clients import PepgnixClient, MongoObservabilityClient
10
+ from evaluator_service.services import EvaluatorService, LLMJudge, WinnerSelector, EvaluationOrchestrator
11
+ from evaluator_service.utils import ResponseFormatter, ScoreAggregator
12
+
13
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Title corrected from the misspelled "Evulate Service" so the generated
# API docs match the package name used in the README.
# NOTE(review): version "2.1" matches neither the package version (0.1.0)
# nor the /api/v1 route prefix -- confirm which one is intended.
app = FastAPI(title="Evaluator Service", version="2.1")

# [AI-Generated] model: SWE-1.6 | tid: ff216a85
# Initialize dependencies
# These objects are created once at import time and shared by every request
# handled by this module. The judge and the evaluator share one LLM client.
try:
    llm_client = PepgnixClient()
    evaluator_service = EvaluatorService(llm_client)
    llm_judge = LLMJudge(llm_client)
    winner_selector = WinnerSelector(llm_judge)
    observability_client = MongoObservabilityClient()
    orchestrator = EvaluationOrchestrator(evaluator_service, winner_selector, observability_client)
    response_formatter = ResponseFormatter()
except Exception as e:
    # logger.exception records the traceback with the message; the error is
    # re-raised so that importing this module fails instead of serving
    # requests with half-initialized dependencies.
    logger.exception("Failed to initialize service: %s", e)
    raise
31
+
32
+
33
+ @app.post("/api/v1/evaluate", status_code=status.HTTP_200_OK)
34
+ async def evaluate(request: EvalRequest):
35
+ start_time = time.time()
36
+ trace_id = str(uuid.uuid4())
37
+
38
+ # Validation
39
+ if not request.candidates:
40
+ raise HTTPException(
41
+ status_code=status.HTTP_400_BAD_REQUEST,
42
+ detail=ErrorResponse(
43
+ error="INVALID_REQUEST",
44
+ message="candidates[] must contain at least 1 item"
45
+ ).model_dump()
46
+ )
47
+
48
+ if not request.context_chunks:
49
+ raise HTTPException(
50
+ status_code=status.HTTP_400_BAD_REQUEST,
51
+ detail=ErrorResponse(
52
+ error="INVALID_REQUEST",
53
+ message="context_chunks[] must contain at least 1 item"
54
+ ).model_dump()
55
+ )
56
+
57
+ try:
58
+ # Run evaluation
59
+ selection_result = orchestrator.evaluate(request)
60
+
61
+ # Get eval results and scores for response formatting
62
+ context = "\n\n".join([chunk.text for chunk in request.context_chunks])
63
+ eval_results = {}
64
+ for candidate in request.candidates:
65
+ eval_results[candidate.model] = evaluator_service.evaluate(
66
+ request.user_query,
67
+ context,
68
+ candidate.response
69
+ )
70
+
71
+ scores = ScoreAggregator.aggregate(eval_results)
72
+
73
+ # Format response
74
+ response = response_formatter.format(
75
+ request_id=request.request_id,
76
+ selection_result=selection_result,
77
+ eval_results=eval_results,
78
+ scores=scores,
79
+ trace_id=trace_id,
80
+ start_time=start_time
81
+ )
82
+
83
+ return response
84
+
85
+ except Exception as e:
86
+ logger.error(f"Evaluation failed: {e}")
87
+ raise HTTPException(
88
+ status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
89
+ detail=ErrorResponse(
90
+ error="EVALUATION_ERROR",
91
+ message=str(e)
92
+ ).model_dump()
93
+ )
94
+
95
+
96
@app.get("/health")
async def health_check():
    """Report that the service process is up.

    Returns a constant payload; no downstream dependency is checked.
    """
    payload = {"status": "healthy"}
    return payload
@@ -0,0 +1,5 @@
1
+ # [AI-Generated] model: SWE-1.6 | tid: ff216a85
2
+ from .pepgnix_client import PepgnixClient
3
+ from .mongo_observability_client import MongoObservabilityClient
4
+
5
+ __all__ = ["PepgnixClient", "MongoObservabilityClient"]