eval-hub-sdk 0.1.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_hub_sdk-0.1.0a0.dist-info/METADATA +711 -0
- eval_hub_sdk-0.1.0a0.dist-info/RECORD +27 -0
- eval_hub_sdk-0.1.0a0.dist-info/WHEEL +5 -0
- eval_hub_sdk-0.1.0a0.dist-info/entry_points.txt +2 -0
- eval_hub_sdk-0.1.0a0.dist-info/licenses/LICENSE +201 -0
- eval_hub_sdk-0.1.0a0.dist-info/top_level.txt +1 -0
- evalhub/__init__.py +84 -0
- evalhub/adapter/__init__.py +28 -0
- evalhub/adapter/api/__init__.py +6 -0
- evalhub/adapter/api/endpoints.py +342 -0
- evalhub/adapter/api/router.py +135 -0
- evalhub/adapter/cli.py +331 -0
- evalhub/adapter/client/__init__.py +6 -0
- evalhub/adapter/client/adapter_client.py +418 -0
- evalhub/adapter/client/discovery.py +275 -0
- evalhub/adapter/models/__init__.py +9 -0
- evalhub/adapter/models/framework.py +404 -0
- evalhub/adapter/oci/__init__.py +5 -0
- evalhub/adapter/oci/persister.py +76 -0
- evalhub/adapter/server/__init__.py +5 -0
- evalhub/adapter/server/app.py +157 -0
- evalhub/cli.py +331 -0
- evalhub/models/__init__.py +32 -0
- evalhub/models/api.py +388 -0
- evalhub/py.typed +0 -0
- evalhub/utils/__init__.py +5 -0
- evalhub/utils/logging.py +41 -0
evalhub/models/api.py
ADDED
@@ -0,0 +1,388 @@
"""Core API models for the EvalHub SDK common interface."""

from datetime import datetime
from enum import Enum
from typing import Any

from pydantic import BaseModel, ConfigDict, Field, field_validator


class JobStatus(str, Enum):
    """Standard job status values."""

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"


class EvaluationStatus(str, Enum):
    """Evaluation-specific status values."""

    QUEUED = "queued"
    INITIALIZING = "initializing"
    RUNNING = "running"
    POST_PROCESSING = "post_processing"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"


class ModelConfig(BaseModel):
    """Configuration for the model being evaluated."""

    model_config = ConfigDict(extra="allow")

    name: str = Field(..., description="Model name or identifier")
    provider: str | None = Field(
        default=None, description="Model provider (e.g., 'vllm', 'transformers')"
    )
    parameters: dict[str, Any] = Field(
        default_factory=dict,
        description="Model-specific parameters (temperature, max_tokens, etc.)",
    )
    device: str | None = Field(default=None, description="Device specification")
    batch_size: int | None = Field(
        default=None, description="Batch size for evaluation"
    )

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str) -> str:
        if not v.strip():
            raise ValueError("Model name cannot be empty")
        return v

    def merge_with_defaults(self, defaults: dict[str, Any]) -> "ModelConfig":
        """Merge configuration with default values."""
        merged_params = {**defaults, **self.parameters}
        return self.model_copy(update={"parameters": merged_params})


class BenchmarkInfo(BaseModel):
    """Information about an available benchmark."""

    benchmark_id: str = Field(..., description="Unique benchmark identifier")
    name: str = Field(..., description="Human-readable benchmark name")
    description: str | None = Field(default=None, description="Benchmark description")
    category: str | None = Field(default=None, description="Benchmark category")
    tags: list[str] = Field(default_factory=list, description="Benchmark tags")
    metrics: list[str] = Field(default_factory=list, description="Available metrics")
    dataset_size: int | None = Field(
        default=None, description="Number of examples in dataset"
    )
    supports_few_shot: bool = Field(
        default=True, description="Whether benchmark supports few-shot evaluation"
    )
    default_few_shot: int | None = Field(
        default=None, description="Default number of few-shot examples"
    )
    custom_config_schema: dict[str, Any] | None = Field(
        default=None, description="JSON schema for custom benchmark configuration"
    )

    @field_validator("benchmark_id", "name")
    @classmethod
    def validate_non_empty_strings(cls, v: str) -> str:
        if not v.strip():
            raise ValueError("String fields cannot be empty")
        return v


class EvaluationRequest(BaseModel):
    """Request to run an evaluation."""

    benchmark_id: str = Field(..., description="Benchmark to evaluate on")
    model: ModelConfig = Field(..., description="Model configuration")

    # Evaluation parameters
    num_examples: int | None = Field(
        default=None, description="Number of examples to evaluate (None = all)"
    )
    num_few_shot: int | None = Field(
        default=None, description="Number of few-shot examples"
    )
    random_seed: int | None = Field(
        default=42, description="Random seed for reproducibility"
    )

    # Custom benchmark configuration
    benchmark_config: dict[str, Any] = Field(
        default_factory=dict, description="Benchmark-specific configuration"
    )

    # Job metadata
    experiment_name: str | None = Field(
        default=None, description="Name for this evaluation experiment"
    )
    tags: dict[str, str] = Field(
        default_factory=dict, description="Custom tags for the job"
    )
    priority: int = Field(
        default=0, description="Job priority (higher = more priority)"
    )


class EvaluationResult(BaseModel):
    """Individual evaluation result."""

    metric_name: str = Field(..., description="Name of the metric")
    metric_value: float | int | str | bool = Field(..., description="Metric value")
    metric_type: str = Field(
        default="float", description="Type of metric (float, int, accuracy, etc.)"
    )
    confidence_interval: tuple[float, float] | None = Field(
        default=None, description="95% confidence interval if available"
    )

    # Additional metadata
    num_samples: int | None = Field(
        default=None, description="Number of samples used for this metric"
    )
    metadata: dict[str, Any] = Field(
        default_factory=dict, description="Additional metric-specific metadata"
    )


class EvaluationJob(BaseModel):
    """Evaluation job information."""

    job_id: str = Field(..., description="Unique job identifier")
    status: JobStatus = Field(..., description="Current job status")
    evaluation_status: EvaluationStatus | None = Field(
        default=None, description="Detailed evaluation status"
    )

    # Request information
    request: EvaluationRequest = Field(..., description="Original evaluation request")

    # Timing information
    submitted_at: datetime = Field(..., description="When the job was submitted")
    started_at: datetime | None = Field(
        default=None, description="When evaluation started"
    )
    completed_at: datetime | None = Field(
        default=None, description="When evaluation completed"
    )

    # Progress information
    progress: float | None = Field(
        default=None, description="Progress percentage (0.0 to 1.0)"
    )
    current_step: str | None = Field(
        default=None, description="Current step description"
    )
    total_steps: int | None = Field(default=None, description="Total number of steps")
    completed_steps: int | None = Field(
        default=None, description="Number of completed steps"
    )

    # Error information
    error_message: str | None = Field(
        default=None, description="Error message if failed"
    )
    error_details: dict[str, Any] | None = Field(
        default=None, description="Detailed error information"
    )

    # Resource usage
    estimated_duration: int | None = Field(
        default=None, description="Estimated duration in seconds"
    )
    actual_duration: int | None = Field(
        default=None, description="Actual duration in seconds"
    )


class EvaluationResponse(BaseModel):
    """Response containing evaluation results."""

    job_id: str = Field(..., description="Job identifier")
    benchmark_id: str = Field(..., description="Benchmark that was evaluated")
    model_name: str = Field(..., description="Model that was evaluated")

    # Results
    results: list[EvaluationResult] = Field(..., description="Evaluation results")

    # Summary statistics
    overall_score: float | None = Field(
        default=None, description="Overall score if applicable"
    )
    num_examples_evaluated: int = Field(
        ..., description="Number of examples actually evaluated"
    )

    # Metadata
    evaluation_metadata: dict[str, Any] = Field(
        default_factory=dict, description="Framework-specific evaluation metadata"
    )
    completed_at: datetime = Field(..., description="When evaluation was completed")
    duration_seconds: float = Field(..., description="Total evaluation time")


class OCICoordinate(BaseModel):
    """OCI artifact coordinates for persistence."""

    oci_ref: str = Field(
        ..., description="OCI reference (e.g., 'ghcr.io/org/repo:tag')"
    )
    oci_subject: str | None = Field(
        default=None, description="Optional OCI subject identifier"
    )

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "oci_ref": "ghcr.io/my-org/eval-results:latest",
                "oci_subject": "not used atm",
            }
        }
    )


class EvaluationJobFilesLocation(BaseModel):
    """Files location for persisting as OCI artifacts for an evaluation job."""

    job_id: str = Field(..., description="Job identifier")
    path: str | None = Field(
        default=None,
        description="Directory path containing files to persist. None if no files to persist.",
    )
    metadata: dict[str, str] = Field(
        default_factory=dict,
        description="Framework-specific metadata (e.g., OCI annotations)",
    )

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "job_id": "job_123",
                "path": "/tmp/lighteval_output/job_123",
                "metadata": {
                    "framework": "lighteval",
                    "benchmark_id": "benchmark_id_value",
                },
            }
        }
    )


class PersistResponse(BaseModel):
    """Response from OCI artifact persistence operation."""

    job_id: str = Field(..., description="Job identifier")
    oci_ref: str = Field(..., description="Full OCI reference including digest")
    digest: str = Field(..., description="SHA256 digest of artifact")
    files_count: int = Field(..., description="Number of files persisted")
    metadata: dict[str, Any] = Field(
        default_factory=dict, description="Additional persistence metadata"
    )

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "job_id": "job_123",
                "oci_ref": "ghcr.io/org/repo:latest@sha256:abc123...",
                "digest": "sha256:abc123...",
                "files_count": 42,
                "metadata": {"placeholder": True},
            }
        }
    )


class FrameworkInfo(BaseModel):
    """Information about a framework adapter."""

    framework_id: str = Field(..., description="Unique framework identifier")
    name: str = Field(..., description="Framework display name")
    version: str = Field(..., description="Framework version")
    description: str | None = Field(default=None, description="Framework description")

    # Capabilities
    supported_benchmarks: list[BenchmarkInfo] = Field(
        default_factory=list, description="Benchmarks supported by this framework"
    )
    supported_model_types: list[str] = Field(
        default_factory=list,
        description="Model types supported (e.g., 'transformers', 'vllm')",
    )
    capabilities: list[str] = Field(
        default_factory=list,
        description="Framework capabilities (e.g., 'text-generation', 'classification')",
    )

    # Configuration schema
    default_model_config: dict[str, Any] = Field(
        default_factory=dict, description="Default model configuration"
    )
    config_schema: dict[str, Any] | None = Field(
        default=None, description="JSON schema for framework configuration"
    )

    # Metadata
    author: str | None = Field(default=None, description="Framework adapter author")
    contact: str | None = Field(default=None, description="Contact information")
    documentation_url: str | None = Field(default=None, description="Documentation URL")
    repository_url: str | None = Field(
        default=None, description="Source repository URL"
    )


class ErrorResponse(BaseModel):
    """Standard error response."""

    error: str = Field(..., description="Type of error")
    message: str = Field(..., description="Human-readable error message")
    error_code: str | None = Field(
        default=None, description="Framework-specific error code"
    )
    details: dict[str, Any] | None = Field(
        default=None, description="Additional error details"
    )
    timestamp: datetime = Field(
        default_factory=datetime.now, description="When error occurred"
    )
    request_id: str | None = Field(default=None, description="Request ID for debugging")


class HealthResponse(BaseModel):
    """Health check response."""

    status: str = Field(
        ..., description="Health status ('healthy', 'unhealthy', 'degraded')"
    )
    framework_id: str = Field(..., description="Framework identifier")
    version: str = Field(..., description="Framework adapter version")

    # Dependency status
    dependencies: dict[str, dict[str, Any]] | None = Field(
        default=None, description="Status of framework dependencies"
    )

    # Resource information
    memory_usage: dict[str, Any] | None = Field(
        default=None, description="Memory usage information"
    )
    gpu_usage: dict[str, Any] | None = Field(
        default=None, description="GPU usage information"
    )

    # Timing
    uptime_seconds: float | None = Field(
        default=None, description="Adapter uptime in seconds"
    )
    last_evaluation_time: datetime | None = Field(
        default=None, description="Time of last evaluation"
    )

    # Error information for unhealthy status
    error_message: str | None = Field(
        default=None, description="Error message when status is unhealthy"
    )

    # Additional info
    metadata: dict[str, Any] = Field(
        default_factory=dict, description="Additional health metadata"
    )
evalhub/py.typed
ADDED
File without changes
evalhub/utils/logging.py
ADDED
@@ -0,0 +1,41 @@
"""Logging utilities for the EvalHub SDK."""

import logging
import sys
from typing import Any


def setup_logging(
    level: str = "INFO",
    format_string: str | None = None,
    stream: Any = None,
) -> logging.Logger:
    """Set up logging configuration for the EvalHub SDK.

    Args:
        level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        format_string: Custom format string for log messages
        stream: Output stream for logging (defaults to stdout)

    Returns:
        Configured logger instance
    """
    if format_string is None:
        format_string = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

    if stream is None:
        stream = sys.stdout

    # Configure root logger
    logging.basicConfig(
        level=getattr(logging, level.upper()),
        format=format_string,
        stream=stream,
        force=True,
    )

    # Return logger for evalhub package
    logger = logging.getLogger("evalhub")
    logger.setLevel(getattr(logging, level.upper()))

    return logger
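A brief usage sketch for the logging helper above, assuming the module path evalhub.utils.logging from the RECORD; the level and message are illustrative.

from evalhub.utils.logging import setup_logging

# Reconfigures the root logger (force=True) and returns the "evalhub" logger.
logger = setup_logging(level="DEBUG")
logger.debug("adapter starting")  # emitted to stdout with the default format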