eval-hub-sdk 0.1.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_hub_sdk-0.1.0a0.dist-info/METADATA +711 -0
- eval_hub_sdk-0.1.0a0.dist-info/RECORD +27 -0
- eval_hub_sdk-0.1.0a0.dist-info/WHEEL +5 -0
- eval_hub_sdk-0.1.0a0.dist-info/entry_points.txt +2 -0
- eval_hub_sdk-0.1.0a0.dist-info/licenses/LICENSE +201 -0
- eval_hub_sdk-0.1.0a0.dist-info/top_level.txt +1 -0
- evalhub/__init__.py +84 -0
- evalhub/adapter/__init__.py +28 -0
- evalhub/adapter/api/__init__.py +6 -0
- evalhub/adapter/api/endpoints.py +342 -0
- evalhub/adapter/api/router.py +135 -0
- evalhub/adapter/cli.py +331 -0
- evalhub/adapter/client/__init__.py +6 -0
- evalhub/adapter/client/adapter_client.py +418 -0
- evalhub/adapter/client/discovery.py +275 -0
- evalhub/adapter/models/__init__.py +9 -0
- evalhub/adapter/models/framework.py +404 -0
- evalhub/adapter/oci/__init__.py +5 -0
- evalhub/adapter/oci/persister.py +76 -0
- evalhub/adapter/server/__init__.py +5 -0
- evalhub/adapter/server/app.py +157 -0
- evalhub/cli.py +331 -0
- evalhub/models/__init__.py +32 -0
- evalhub/models/api.py +388 -0
- evalhub/py.typed +0 -0
- evalhub/utils/__init__.py +5 -0
- evalhub/utils/logging.py +41 -0
@@ -0,0 +1,711 @@
Metadata-Version: 2.4
Name: eval-hub-sdk
Version: 0.1.0a0
Summary: SDK for building framework adapters that integrate with TrustyAI EvalHub
Author-email: Rui Vieira <rui@redhat.com>
Project-URL: Homepage, https://github.com/eval-hub
Project-URL: Repository, https://github.com/eval-hub/eval-hub-sdk
Project-URL: Documentation, https://github.com/eval-hub/eval-hub-sdk
Project-URL: Bug Tracker, https://github.com/eval-hub/eval-hub-sdk/issues
Keywords: ai,evaluation,framework,sdk,trustyai
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: pydantic>=2.0.0
Requires-Dist: importlib-metadata>=6.0.0
Provides-Extra: core
Requires-Dist: httpx>=0.25.0; extra == "core"
Provides-Extra: adapter
Requires-Dist: eval-hub-sdk[core]; extra == "adapter"
Requires-Dist: fastapi>=0.104.0; extra == "adapter"
Requires-Dist: uvicorn[standard]>=0.24.0; extra == "adapter"
Provides-Extra: client
Requires-Dist: eval-hub-sdk[core]; extra == "client"
Provides-Extra: cli
Requires-Dist: eval-hub-sdk[core]; extra == "cli"
Requires-Dist: typer>=0.9.0; extra == "cli"
Requires-Dist: rich>=13.0.0; extra == "cli"
Provides-Extra: dev
Requires-Dist: pytest>=7.4.0; extra == "dev"
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
Requires-Dist: ruff==0.1.6; extra == "dev"
Requires-Dist: mypy==1.7.1; extra == "dev"
Requires-Dist: pre-commit>=3.4.0; extra == "dev"
Provides-Extra: examples
Requires-Dist: lm-eval>=0.4.0; extra == "examples"
Requires-Dist: ragas>=0.1.0; extra == "examples"
Provides-Extra: all
Requires-Dist: eval-hub-sdk[adapter,cli,client,core,dev]; extra == "all"
Dynamic: license-file

# EvalHub SDK

**Framework Adapter SDK for TrustyAI EvalHub Integration**

The EvalHub SDK provides a standardized way to create framework adapters that can be consumed by EvalHub, enabling a "Bring Your Own Framework" (BYOF) approach for evaluation frameworks.

## Overview

The SDK creates a common API layer that allows EvalHub to communicate with ANY evaluation framework. Users only need to write minimal "glue" code to connect their framework to the standardized interface.

```
EvalHub → (Standard API) → Your Framework Adapter → Your Evaluation Framework
```

## Architecture

```mermaid
graph LR
    EH[EvalHub]
    FA[Framework Adapter<br/>SDK + Glue Code]
    YF[Your Framework<br/>LMEval, Custom,<br/>RAGAS, etc.]
    API[Standard API<br/>─────────────<br/>/health<br/>/info<br/>/benchmarks<br/>/evaluations]

    EH <--> FA
    FA <--> YF
    EH --> API
    FA --> API
```

### Package Organization

The SDK is organized into distinct, focused packages:

**🏗️ Core (`evalhub.models`)** - Shared data models and utilities
- Request/response models for API communication
- Common data structures used by both clients and adapters

**🔧 Adapter SDK (`evalhub.adapter`)** - Components for building framework adapters
- Framework adapter base class and configuration
- Server components for hosting your adapter
- API routing and endpoint implementations
- CLI tools for running and managing adapters

**📡 Client SDK (`evalhub.adapter.client`)** - Components for communicating with adapters
- HTTP client for connecting to framework adapters
- Discovery service for finding and managing multiple adapters
- Async communication patterns

### Key Components

1. **Standard API**: Common REST endpoints that all adapters must implement
2. **Framework Adapter Base Class**: Abstract base class with the adapter contract (`evalhub.adapter.models`)
3. **Server Components**: FastAPI-based server for exposing the standard API (`evalhub.adapter.server`)
4. **Client Components**: HTTP client for EvalHub to communicate with adapters (`evalhub.adapter.client`)
5. **Data Models**: Pydantic models for requests, responses, and metadata (`evalhub.models`)

## Quick Start

### 1. Installation

```bash
# Install from PyPI (when available)
pip install eval-hub-sdk

# Install from source
git clone https://github.com/trustyai-explainability/evalhub-sdk.git
cd evalhub-sdk
pip install -e .[dev]
```

### 2. Create Your Adapter

Create a new Python file for your adapter:

```python
# my_framework_adapter.py
from typing import List

from evalhub.adapter import FrameworkAdapter, AdapterConfig
from evalhub.models import *

class MyFrameworkAdapter(FrameworkAdapter):
    async def initialize(self):
        """Initialize your framework here"""
        # Load your evaluation framework
        pass

    async def list_benchmarks(self) -> List[BenchmarkInfo]:
        """Return available benchmarks from your framework"""
        return [
            BenchmarkInfo(
                benchmark_id="my_benchmark",
                name="My Custom Benchmark",
                description="A custom benchmark",
                category="reasoning",
                metrics=["accuracy", "f1_score"]
            )
        ]

    async def submit_evaluation(self, request: EvaluationRequest) -> EvaluationJob:
        """Submit evaluation to your framework"""
        # Translate request to your framework's format
        # Run evaluation
        # Return job information
        pass

    # Implement other required methods...
```
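
The `submit_evaluation` stub above is where most of the glue code goes. A minimal sketch of one way to fill it in is shown below, keeping an in-memory job store and handing the actual work to a background task. It assumes `JobStatus` is importable from `evalhub.models` alongside the other models; `_run_with_my_framework` is a hypothetical coroutine you would write to drive your framework and update the stored job as it progresses. Only the job-handling methods are shown.

```python
# my_framework_adapter.py (continued) -- a sketch of the job-handling methods.
# _run_with_my_framework is a hypothetical hook into your framework; it should
# update job.status, job.progress and the timestamps as the evaluation runs.
import asyncio
import uuid
from datetime import datetime, timezone
from typing import Optional

from evalhub.adapter import FrameworkAdapter
from evalhub.models import EvaluationJob, EvaluationRequest, JobStatus

class MyFrameworkAdapter(FrameworkAdapter):
    async def initialize(self):
        self._jobs: dict[str, EvaluationJob] = {}  # in-memory job store

    async def submit_evaluation(self, request: EvaluationRequest) -> EvaluationJob:
        job = EvaluationJob(
            job_id=str(uuid.uuid4()),
            status=JobStatus.PENDING,
            request=request,
            submitted_at=datetime.now(timezone.utc),
        )
        self._jobs[job.job_id] = job
        # Return immediately; the evaluation itself runs in the background.
        asyncio.create_task(self._run_with_my_framework(job))
        return job

    async def get_job_status(self, job_id: str) -> Optional[EvaluationJob]:
        return self._jobs.get(job_id)
```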

### 3. Run Your Adapter

```python
# run_adapter.py
from evalhub.adapter import AdapterServer, AdapterConfig
from my_framework_adapter import MyFrameworkAdapter

config = AdapterConfig(
    framework_id="my_framework",
    adapter_name="My Framework Adapter",
    port=8080
)

adapter = MyFrameworkAdapter(config)
server = AdapterServer(adapter)
server.run()
```

### 4. Test Your Adapter

```bash
# Run your adapter
python run_adapter.py

# Test health check
curl http://localhost:8080/api/v1/health

# Get framework info
curl http://localhost:8080/api/v1/info

# List benchmarks
curl http://localhost:8080/api/v1/benchmarks
```

## Package Organization Guide

The EvalHub SDK is organized into distinct packages based on your use case:

### 📦 **Which Package Should I Use?**

| Use Case | Primary Package | Description |
|----------|-----------------|-------------|
| **Building an Adapter** | `evalhub.adapter` | You're creating a new framework adapter |
| **Connecting to Adapters** | `evalhub.adapter.client` | You're building a client to communicate with adapters |
| **Data Models** | `evalhub.models` | You need request/response models for API communication |
| **CLI Tools** | `evalhub.adapter.cli` | You want to run/manage adapters from the command line |

### 🎯 **Import Patterns by Role**

**Framework Adapter Developer:**
```python
# Building your adapter
from evalhub.adapter.models import FrameworkAdapter, AdapterConfig
from evalhub.adapter.server import AdapterServer
from evalhub.models.api import EvaluationRequest, EvaluationJob

# Running your adapter
from evalhub.adapter import *  # Everything you need
```

**Client Developer (EvalHub team):**
```python
# Communicating with adapters
from evalhub.adapter.client import AdapterClient, AdapterDiscovery
from evalhub.models.api import EvaluationRequest, ModelConfig
```

**Integration Developer:**
```python
# Using both sides of the API
from evalhub.adapter.client import AdapterClient  # Client side
from evalhub.adapter.models import FrameworkAdapter  # Adapter side
from evalhub.models.api import *  # Shared models
```

## Complete Examples

### LightEval Framework Example

See [examples/lighteval_adapter/](examples/lighteval_adapter/) for a production-ready example of a containerized LightEval adapter.

Try the demo (notebook runs **outside** the container):

```bash
# Container: LightEval + adapter
# Notebook: External HTTP client
cd examples/
jupyter notebook lighteval_demo_external.ipynb
```

## Standard API Endpoints

All framework adapters expose the same REST API:

| Endpoint | Method | Description |
|----------|--------|-------------|
| `/health` | GET | Health check |
| `/info` | GET | Framework information |
| `/benchmarks` | GET | List available benchmarks |
| `/benchmarks/{id}` | GET | Get benchmark details |
| `/evaluations` | POST | Submit evaluation job |
| `/evaluations/{job_id}` | GET | Get job status |
| `/evaluations/{job_id}/results` | GET | Get evaluation results |
| `/evaluations/{job_id}` | DELETE | Cancel job |
| `/evaluations/{job_id}/stream` | GET | Stream job updates |

### Example API Usage

```bash
# Submit evaluation
curl -X POST http://localhost:8080/api/v1/evaluations \
  -H "Content-Type: application/json" \
  -d '{
    "benchmark_id": "my_benchmark",
    "model": {
      "name": "gpt-4",
      "provider": "openai",
      "parameters": {
        "temperature": 0.1,
        "max_tokens": 100
      }
    },
    "num_examples": 100,
    "experiment_name": "test_evaluation"
  }'

# Check job status
curl http://localhost:8080/api/v1/evaluations/{job_id}

# Get results
curl http://localhost:8080/api/v1/evaluations/{job_id}/results
```
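
Because `httpx` is already a core dependency of the SDK, the same calls can also be made from Python without the SDK's client classes. A minimal sketch, assuming an adapter listening on `localhost:8080` as in the curl examples above and a JSON response shaped like the `EvaluationJob` model shown below:

```python
# Submit an evaluation and poll its status over plain HTTP with httpx.
import httpx

payload = {
    "benchmark_id": "my_benchmark",
    "model": {
        "name": "gpt-4",
        "provider": "openai",
        "parameters": {"temperature": 0.1, "max_tokens": 100},
    },
    "num_examples": 100,
    "experiment_name": "test_evaluation",
}

with httpx.Client(base_url="http://localhost:8080/api/v1") as client:
    resp = client.post("/evaluations", json=payload)
    resp.raise_for_status()
    job = resp.json()
    print("submitted:", job["job_id"])

    # Poll the job once; field names follow the EvaluationJob model.
    status = client.get(f"/evaluations/{job['job_id']}")
    status.raise_for_status()
    print("status:", status.json()["status"])
```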

## Framework Adapter Interface

### Required Methods

Your adapter must implement these abstract methods:

```python
from abc import ABC, abstractmethod
from typing import List, Optional

# BenchmarkInfo, EvaluationRequest, EvaluationJob, EvaluationResponse,
# FrameworkInfo and HealthResponse are the SDK data models (see "Data Models" below).

class FrameworkAdapter(ABC):
    @abstractmethod
    async def initialize(self) -> None:
        """Initialize the framework"""

    @abstractmethod
    async def get_framework_info(self) -> FrameworkInfo:
        """Get framework information"""

    @abstractmethod
    async def list_benchmarks(self) -> List[BenchmarkInfo]:
        """List available benchmarks"""

    @abstractmethod
    async def get_benchmark_info(self, benchmark_id: str) -> Optional[BenchmarkInfo]:
        """Get benchmark details"""

    @abstractmethod
    async def submit_evaluation(self, request: EvaluationRequest) -> EvaluationJob:
        """Submit evaluation job"""

    @abstractmethod
    async def get_job_status(self, job_id: str) -> Optional[EvaluationJob]:
        """Get job status"""

    @abstractmethod
    async def get_evaluation_results(self, job_id: str) -> Optional[EvaluationResponse]:
        """Get evaluation results"""

    @abstractmethod
    async def cancel_job(self, job_id: str) -> bool:
        """Cancel job"""

    @abstractmethod
    async def health_check(self) -> HealthResponse:
        """Perform health check"""

    @abstractmethod
    async def shutdown(self) -> None:
        """Graceful shutdown"""
```

### Data Models

Key data models for requests and responses:

```python
from datetime import datetime
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel

# Evaluation request from EvalHub
class EvaluationRequest(BaseModel):
    benchmark_id: str
    model: ModelConfig
    num_examples: Optional[int] = None
    num_few_shot: Optional[int] = None
    benchmark_config: Dict[str, Any] = {}
    experiment_name: Optional[str] = None

# Model configuration
class ModelConfig(BaseModel):
    name: str
    provider: Optional[str] = None
    parameters: Dict[str, Any] = {}
    device: Optional[str] = None
    batch_size: Optional[int] = None

# Evaluation job tracking
class EvaluationJob(BaseModel):
    job_id: str
    status: JobStatus  # PENDING, RUNNING, COMPLETED, FAILED, CANCELLED
    request: EvaluationRequest
    submitted_at: datetime
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    progress: Optional[float] = None  # 0.0 to 1.0
    error_message: Optional[str] = None

# Evaluation results
class EvaluationResponse(BaseModel):
    job_id: str
    benchmark_id: str
    model_name: str
    results: List[EvaluationResult]
    overall_score: Optional[float] = None
    num_examples_evaluated: int
    completed_at: datetime
    duration_seconds: float

# Individual metric result
class EvaluationResult(BaseModel):
    metric_name: str
    metric_value: Union[float, int, str, bool]
    metric_type: str = "float"
    num_samples: Optional[int] = None
```
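
Since these are Pydantic v2 models, a request can be constructed and serialized directly. For example, the JSON payload from the curl example above can be produced like this (assuming `EvaluationRequest` and `ModelConfig` are exported from `evalhub.models`, as in the client examples below):

```python
from evalhub.models import EvaluationRequest, ModelConfig

request = EvaluationRequest(
    benchmark_id="my_benchmark",
    model=ModelConfig(
        name="gpt-4",
        provider="openai",
        parameters={"temperature": 0.1, "max_tokens": 100},
    ),
    num_examples=100,
    experiment_name="test_evaluation",
)

# Pydantic v2: dump to JSON for the POST /evaluations body.
print(request.model_dump_json(indent=2))
```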

## CLI Usage

The SDK includes a CLI tool for running and testing adapters:

```bash
# Run an adapter
evalhub-adapter run my_adapter:MyAdapter --port 8080

# Get adapter info
evalhub-adapter info http://localhost:8080

# Check adapter health
evalhub-adapter health http://localhost:8080

# Discover multiple adapters
evalhub-adapter discover http://adapter1:8080 http://adapter2:8081
```

## EvalHub Integration

### Client Usage

EvalHub uses the provided client to communicate with adapters:

```python
from evalhub.adapter.client import AdapterClient
from evalhub.models import EvaluationRequest, JobStatus, ModelConfig

async with AdapterClient("http://adapter:8080") as client:
    # Get framework info
    info = await client.get_framework_info()
    print(f"Framework: {info.name}")

    # List benchmarks
    benchmarks = await client.list_benchmarks()
    print(f"Available benchmarks: {len(benchmarks)}")

    # Submit evaluation
    request = EvaluationRequest(
        benchmark_id="custom_benchmark",
        model=ModelConfig(
            name="llama-7b",
            provider="vllm",
            parameters={"temperature": 0.1}
        ),
        num_examples=100
    )

    job = await client.submit_evaluation(request)
    print(f"Job submitted: {job.job_id}")

    # Wait for completion
    final_job = await client.wait_for_completion(job.job_id)

    # Get results
    if final_job.status == JobStatus.COMPLETED:
        results = await client.get_evaluation_results(job.job_id)
        print(f"Results: {len(results.results)} metrics")
```

### Discovery Service

EvalHub can automatically discover and manage multiple adapters:

```python
from evalhub.adapter.client import AdapterDiscovery

discovery = AdapterDiscovery()

# Register adapters
discovery.register_adapter("http://lmeval-adapter:8080")
discovery.register_adapter("http://ragas-adapter:8081")

# Start health monitoring
await discovery.start_health_monitoring()

# Get healthy adapters
healthy_adapters = discovery.get_healthy_adapters()

# Find adapter for specific framework
lmeval_adapter = discovery.get_adapter_for_framework("lm_evaluation_harness")
```

## Configuration

### Adapter Configuration

```python
from evalhub.adapter import AdapterConfig

config = AdapterConfig(
    framework_id="my_framework",
    adapter_name="My Framework Adapter",
    version="1.0.0",
    host="0.0.0.0",
    port=8080,
    max_concurrent_jobs=5,
    job_timeout_seconds=3600,
    log_level="INFO",
    framework_config={
        # Framework-specific settings
        "model_cache_dir": "/models",
        "device": "cuda",
        "batch_size": 8
    }
)
```

### Configuration File

```yaml
# adapter_config.yaml
framework_id: "my_framework"
adapter_name: "My Framework Adapter"
version: "1.0.0"
host: "0.0.0.0"
port: 8080
max_concurrent_jobs: 10
job_timeout_seconds: 7200
log_level: "DEBUG"

framework_config:
  model_cache_dir: "/data/models"
  device: "cuda:0"
  batch_size: 16
  enable_caching: true
```
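
How the YAML file gets loaded is up to your entry point; since `AdapterConfig` is a Pydantic model, one straightforward approach is to parse the file yourself and pass the values through. A sketch, assuming PyYAML is installed and that the keys map one-to-one onto `AdapterConfig` fields:

```python
# load_config.py -- build an AdapterConfig from adapter_config.yaml.
import yaml

from evalhub.adapter import AdapterConfig

with open("adapter_config.yaml") as fh:
    raw = yaml.safe_load(fh)

config = AdapterConfig(**raw)
print(config.framework_id, config.port)
```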

## Deployment

### Podman with Red Hat UBI

```dockerfile
# Framework Adapter Container
FROM registry.access.redhat.com/ubi9/python-311:latest

# Set environment variables for Python optimization
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1

WORKDIR /app

# Copy source code
COPY . ./

# Install dependencies
RUN pip install -e .

EXPOSE 8080

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
    CMD curl -f http://localhost:8080/api/v1/health || exit 1

CMD ["evalhub-adapter", "run", "my_adapter:MyAdapter", "--port", "8080"]
```

### Building and Running with Podman

```bash
# Build the image
podman build -t your-adapter:latest .

# Run the container
podman run -d \
  --name your-adapter \
  -p 8080:8080 \
  --health-cmd='curl -f http://localhost:8080/api/v1/health || exit 1' \
  --health-interval=30s \
  --health-timeout=10s \
  --health-start-period=30s \
  --health-retries=3 \
  your-adapter:latest

# Check container health
podman ps

# View logs
podman logs your-adapter

# Stop and clean up
podman stop your-adapter
podman rm your-adapter
```

**Note**: For frameworks requiring additional build tools, see [examples/lighteval_adapter/](examples/lighteval_adapter/) for a production deployment example with UBI minimal and custom dependencies.

## Development

### Project Structure

The SDK uses a modern Python project structure with clear separation of concerns:

```
evalhub-sdk/
├── src/evalhub/                  # Source code (src layout)
│   ├── models/                   # 🏗️ Core: Shared data models
│   │   ├── api.py                # Request/response models
│   │   └── __init__.py
│   ├── adapter/                  # 🔧 Adapter SDK: Framework adapter components
│   │   ├── models/               # Adapter-specific models (FrameworkAdapter, AdapterConfig)
│   │   ├── server/               # FastAPI server for hosting adapters
│   │   ├── api/                  # API endpoints and routing
│   │   ├── client/               # 📡 Client SDK: Communication with adapters
│   │   ├── cli.py                # Command-line interface for adapters
│   │   └── __init__.py
│   ├── utils/                    # 🛠️ Utilities and helpers
│   ├── cli.py                    # Main CLI interface
│   └── __init__.py               # Public API exports
├── tests/                        # Test suite
│   ├── unit/                     # Unit tests
│   └── integration/              # Integration tests
├── examples/                     # Example adapters
│   ├── custom_framework_adapter.py
│   └── lighteval_adapter/
└── pyproject.toml                # Project configuration
```

### Package Usage Patterns

**🏗️ Building an Adapter:**
```python
from evalhub.adapter import FrameworkAdapter, AdapterConfig, AdapterServer
from evalhub.models import EvaluationRequest, EvaluationJob
```

**📡 Connecting to Adapters:**
```python
from evalhub.adapter.client import AdapterClient, AdapterDiscovery
from evalhub.models import EvaluationRequest, ModelConfig
```

**🛠️ Framework Development:**
```python
# Access everything through the main package
from evalhub.adapter import *  # All adapter components
from evalhub.models import *   # All data models
```

### Development Setup

```bash
# Clone the repository
git clone https://github.com/trustyai-explainability/evalhub-sdk.git
cd evalhub-sdk

# Install in development mode with all dependencies
pip install -e .[dev]

# Install pre-commit hooks
pre-commit install

# Run tests
pytest

# Run tests with coverage
pytest --cov=src/evalhub --cov-report=html

# Run type checking
mypy src/evalhub

# Run linting
ruff check src/ tests/
ruff format src/ tests/
```

### Testing Your Adapter

```python
import pytest
from evalhub.adapter.client import AdapterClient

@pytest.mark.asyncio
async def test_adapter_health():
    async with AdapterClient("http://localhost:8080") as client:
        health = await client.health_check()
        assert health.status == "healthy"

@pytest.mark.asyncio
async def test_list_benchmarks():
    async with AdapterClient("http://localhost:8080") as client:
        benchmarks = await client.list_benchmarks()
        assert len(benchmarks) > 0
        assert all(b.benchmark_id for b in benchmarks)
```

### Development Server

```bash
# Run with auto-reload for development
evalhub-adapter run my_adapter:MyAdapter --reload --log-level DEBUG
```

### Quality Assurance

Run all quality checks:

```bash
# Format code
ruff format .

# Lint and fix issues
ruff check --fix .

# Type check
mypy src/evalhub

# Run full test suite
pytest -v --cov=src/evalhub
```

## Contributing

1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Add tests for your changes
5. Run the test suite
6. Submit a pull request

## License

This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.