eval-hub-sdk 0.1.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
evalhub/models/api.py ADDED
@@ -0,0 +1,388 @@
+ """Core API models for the EvalHub SDK common interface."""
+
+ from datetime import datetime
+ from enum import Enum
+ from typing import Any
+
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+
+ class JobStatus(str, Enum):
+     """Standard job status values."""
+
+     PENDING = "pending"
+     RUNNING = "running"
+     COMPLETED = "completed"
+     FAILED = "failed"
+     CANCELLED = "cancelled"
+
+
+ class EvaluationStatus(str, Enum):
+     """Evaluation-specific status values."""
+
+     QUEUED = "queued"
+     INITIALIZING = "initializing"
+     RUNNING = "running"
+     POST_PROCESSING = "post_processing"
+     COMPLETED = "completed"
+     FAILED = "failed"
+     CANCELLED = "cancelled"
+
+
+ class ModelConfig(BaseModel):
+     """Configuration for the model being evaluated."""
+
+     model_config = ConfigDict(extra="allow")
+
+     name: str = Field(..., description="Model name or identifier")
+     provider: str | None = Field(
+         default=None, description="Model provider (e.g., 'vllm', 'transformers')"
+     )
+     parameters: dict[str, Any] = Field(
+         default_factory=dict,
+         description="Model-specific parameters (temperature, max_tokens, etc.)",
+     )
+     device: str | None = Field(default=None, description="Device specification")
+     batch_size: int | None = Field(
+         default=None, description="Batch size for evaluation"
+     )
+
+     @field_validator("name")
+     @classmethod
+     def validate_name(cls, v: str) -> str:
+         if not v.strip():
+             raise ValueError("Model name cannot be empty")
+         return v
+
+     def merge_with_defaults(self, defaults: dict[str, Any]) -> "ModelConfig":
+         """Merge configuration with default values."""
+         merged_params = {**defaults, **self.parameters}
+         return self.model_copy(update={"parameters": merged_params})
+
+
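A minimal usage sketch (editorial, not part of the packaged module) showing how ModelConfig.merge_with_defaults combines framework defaults with user-supplied parameters; because self.parameters is unpacked last, explicit values win. The model name used here is invented for illustration.

# Sketch only: assumes pydantic v2 semantics as declared in the file above.
from evalhub.models.api import ModelConfig

config = ModelConfig(
    name="example-org/example-model",      # hypothetical model identifier
    provider="vllm",
    parameters={"temperature": 0.0},        # explicit value overrides the default below
)

merged = config.merge_with_defaults({"temperature": 0.7, "max_tokens": 512})
print(merged.parameters)  # {'temperature': 0.0, 'max_tokens': 512}
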
+ class BenchmarkInfo(BaseModel):
+     """Information about an available benchmark."""
+
+     benchmark_id: str = Field(..., description="Unique benchmark identifier")
+     name: str = Field(..., description="Human-readable benchmark name")
+     description: str | None = Field(default=None, description="Benchmark description")
+     category: str | None = Field(default=None, description="Benchmark category")
+     tags: list[str] = Field(default_factory=list, description="Benchmark tags")
+     metrics: list[str] = Field(default_factory=list, description="Available metrics")
+     dataset_size: int | None = Field(
+         default=None, description="Number of examples in dataset"
+     )
+     supports_few_shot: bool = Field(
+         default=True, description="Whether benchmark supports few-shot evaluation"
+     )
+     default_few_shot: int | None = Field(
+         default=None, description="Default number of few-shot examples"
+     )
+     custom_config_schema: dict[str, Any] | None = Field(
+         default=None, description="JSON schema for custom benchmark configuration"
+     )
+
+     @field_validator("benchmark_id", "name")
+     @classmethod
+     def validate_non_empty_strings(cls, v: str) -> str:
+         if not v.strip():
+             raise ValueError("String fields cannot be empty")
+         return v
+
+
+ class EvaluationRequest(BaseModel):
+     """Request to run an evaluation."""
+
+     benchmark_id: str = Field(..., description="Benchmark to evaluate on")
+     model: ModelConfig = Field(..., description="Model configuration")
+
+     # Evaluation parameters
+     num_examples: int | None = Field(
+         default=None, description="Number of examples to evaluate (None = all)"
+     )
+     num_few_shot: int | None = Field(
+         default=None, description="Number of few-shot examples"
+     )
+     random_seed: int | None = Field(
+         default=42, description="Random seed for reproducibility"
+     )
+
+     # Custom benchmark configuration
+     benchmark_config: dict[str, Any] = Field(
+         default_factory=dict, description="Benchmark-specific configuration"
+     )
+
+     # Job metadata
+     experiment_name: str | None = Field(
+         default=None, description="Name for this evaluation experiment"
+     )
+     tags: dict[str, str] = Field(
+         default_factory=dict, description="Custom tags for the job"
+     )
+     priority: int = Field(
+         default=0, description="Job priority (higher = more priority)"
+     )
+
+
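A short sketch (editorial, not part of the packaged module) of constructing an EvaluationRequest; the benchmark identifier, model name, and experiment name are invented, and unset optional fields keep their declared defaults such as random_seed=42.

# Sketch only: field names match the EvaluationRequest model above.
from evalhub.models.api import EvaluationRequest, ModelConfig

request = EvaluationRequest(
    benchmark_id="example-benchmark",       # hypothetical benchmark identifier
    model=ModelConfig(name="example-model", provider="transformers"),
    num_examples=100,
    num_few_shot=5,
    experiment_name="smoke-test",
    tags={"team": "evals"},
)

print(request.random_seed)                   # 42 (default)
print(request.model_dump(exclude_none=True)) # serializable request payload
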
+ class EvaluationResult(BaseModel):
+     """Individual evaluation result."""
+
+     metric_name: str = Field(..., description="Name of the metric")
+     metric_value: float | int | str | bool = Field(..., description="Metric value")
+     metric_type: str = Field(
+         default="float", description="Type of metric (float, int, accuracy, etc.)"
+     )
+     confidence_interval: tuple[float, float] | None = Field(
+         default=None, description="95% confidence interval if available"
+     )
+
+     # Additional metadata
+     num_samples: int | None = Field(
+         default=None, description="Number of samples used for this metric"
+     )
+     metadata: dict[str, Any] = Field(
+         default_factory=dict, description="Additional metric-specific metadata"
+     )
+
+
+ class EvaluationJob(BaseModel):
+     """Evaluation job information."""
+
+     job_id: str = Field(..., description="Unique job identifier")
+     status: JobStatus = Field(..., description="Current job status")
+     evaluation_status: EvaluationStatus | None = Field(
+         default=None, description="Detailed evaluation status"
+     )
+
+     # Request information
+     request: EvaluationRequest = Field(..., description="Original evaluation request")
+
+     # Timing information
+     submitted_at: datetime = Field(..., description="When the job was submitted")
+     started_at: datetime | None = Field(
+         default=None, description="When evaluation started"
+     )
+     completed_at: datetime | None = Field(
+         default=None, description="When evaluation completed"
+     )
+
+     # Progress information
+     progress: float | None = Field(
+         default=None, description="Progress percentage (0.0 to 1.0)"
+     )
+     current_step: str | None = Field(
+         default=None, description="Current step description"
+     )
+     total_steps: int | None = Field(default=None, description="Total number of steps")
+     completed_steps: int | None = Field(
+         default=None, description="Number of completed steps"
+     )
+
+     # Error information
+     error_message: str | None = Field(
+         default=None, description="Error message if failed"
+     )
+     error_details: dict[str, Any] | None = Field(
+         default=None, description="Detailed error information"
+     )
+
+     # Resource usage
+     estimated_duration: int | None = Field(
+         default=None, description="Estimated duration in seconds"
+     )
+     actual_duration: int | None = Field(
+         default=None, description="Actual duration in seconds"
+     )
+
+
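A client-side sketch (editorial, not part of the packaged module) of how EvaluationJob status and progress might be interpreted; the helper functions are illustrative, and a real job instance would come from whatever service serves these models.

# Sketch only: uses the JobStatus enum and EvaluationJob fields defined above.
from evalhub.models.api import EvaluationJob, JobStatus

def is_terminal(job: EvaluationJob) -> bool:
    """A job is finished once it is completed, failed, or cancelled."""
    return job.status in {JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED}

def progress_fraction(job: EvaluationJob) -> float | None:
    """Prefer the explicit progress field; otherwise derive it from step counts."""
    if job.progress is not None:
        return job.progress
    if job.total_steps and job.completed_steps is not None:
        return job.completed_steps / job.total_steps
    return None
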
+ class EvaluationResponse(BaseModel):
+     """Response containing evaluation results."""
+
+     job_id: str = Field(..., description="Job identifier")
+     benchmark_id: str = Field(..., description="Benchmark that was evaluated")
+     model_name: str = Field(..., description="Model that was evaluated")
+
+     # Results
+     results: list[EvaluationResult] = Field(..., description="Evaluation results")
+
+     # Summary statistics
+     overall_score: float | None = Field(
+         default=None, description="Overall score if applicable"
+     )
+     num_examples_evaluated: int = Field(
+         ..., description="Number of examples actually evaluated"
+     )
+
+     # Metadata
+     evaluation_metadata: dict[str, Any] = Field(
+         default_factory=dict, description="Framework-specific evaluation metadata"
+     )
+     completed_at: datetime = Field(..., description="When evaluation was completed")
+     duration_seconds: float = Field(..., description="Total evaluation time")
+
+
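A sketch (editorial, not part of the packaged module) of assembling an EvaluationResponse from EvaluationResult entries and round-tripping it through JSON; the identifiers and metric values are invented.

# Sketch only: assumes standard pydantic v2 serialization helpers.
from datetime import datetime
from evalhub.models.api import EvaluationResponse, EvaluationResult

response = EvaluationResponse(
    job_id="job_123",                      # hypothetical identifiers and scores
    benchmark_id="example-benchmark",
    model_name="example-model",
    results=[
        EvaluationResult(metric_name="accuracy", metric_value=0.81, num_samples=100),
    ],
    overall_score=0.81,
    num_examples_evaluated=100,
    completed_at=datetime.now(),
    duration_seconds=123.4,
)

payload = response.model_dump_json()
restored = EvaluationResponse.model_validate_json(payload)
assert restored.results[0].metric_name == "accuracy"
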
+ class OCICoordinate(BaseModel):
+     """OCI artifact coordinates for persistence."""
+
+     oci_ref: str = Field(
+         ..., description="OCI reference (e.g., 'ghcr.io/org/repo:tag')"
+     )
+     oci_subject: str | None = Field(
+         default=None, description="Optional OCI subject identifier"
+     )
+
+     model_config = ConfigDict(
+         json_schema_extra={
+             "example": {
+                 "oci_ref": "ghcr.io/my-org/eval-results:latest",
+                 "oci_subject": "not used atm",
+             }
+         }
+     )
+
+
+ class EvaluationJobFilesLocation(BaseModel):
+     """Files location for persisting as OCI artifacts for an evaluation job."""
+
+     job_id: str = Field(..., description="Job identifier")
+     path: str | None = Field(
+         default=None,
+         description="Directory path containing files to persist. None if no files to persist.",
+     )
+     metadata: dict[str, str] = Field(
+         default_factory=dict,
+         description="Framework-specific metadata (e.g., OCI annotations)",
+     )
+
+     model_config = ConfigDict(
+         json_schema_extra={
+             "example": {
+                 "job_id": "job_123",
+                 "path": "/tmp/lighteval_output/job_123",
+                 "metadata": {
+                     "framework": "lighteval",
+                     "benchmark_id": "benchmark_id_value",
+                 },
+             }
+         }
+     )
+
+
+ class PersistResponse(BaseModel):
+     """Response from OCI artifact persistence operation."""
+
+     job_id: str = Field(..., description="Job identifier")
+     oci_ref: str = Field(..., description="Full OCI reference including digest")
+     digest: str = Field(..., description="SHA256 digest of artifact")
+     files_count: int = Field(..., description="Number of files persisted")
+     metadata: dict[str, Any] = Field(
+         default_factory=dict, description="Additional persistence metadata"
+     )
+
+     model_config = ConfigDict(
+         json_schema_extra={
+             "example": {
+                 "job_id": "job_123",
+                 "oci_ref": "ghcr.io/org/repo:latest@sha256:abc123...",
+                 "digest": "sha256:abc123...",
+                 "files_count": 42,
+                 "metadata": {"placeholder": True},
+             }
+         }
+     )
+
+
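A sketch (editorial, not part of the packaged module) of the persistence payloads as an adapter might exchange them; the values simply mirror the json_schema_extra examples declared above.

# Sketch only: constructs the OCI persistence models from example data.
from evalhub.models.api import EvaluationJobFilesLocation, PersistResponse

location = EvaluationJobFilesLocation(
    job_id="job_123",
    path="/tmp/lighteval_output/job_123",
    metadata={"framework": "lighteval", "benchmark_id": "benchmark_id_value"},
)

persisted = PersistResponse.model_validate(
    {
        "job_id": "job_123",
        "oci_ref": "ghcr.io/org/repo:latest@sha256:abc123...",
        "digest": "sha256:abc123...",
        "files_count": 42,
    }
)
print(persisted.files_count)  # 42
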
+ class FrameworkInfo(BaseModel):
+     """Information about a framework adapter."""
+
+     framework_id: str = Field(..., description="Unique framework identifier")
+     name: str = Field(..., description="Framework display name")
+     version: str = Field(..., description="Framework version")
+     description: str | None = Field(default=None, description="Framework description")
+
+     # Capabilities
+     supported_benchmarks: list[BenchmarkInfo] = Field(
+         default_factory=list, description="Benchmarks supported by this framework"
+     )
+     supported_model_types: list[str] = Field(
+         default_factory=list,
+         description="Model types supported (e.g., 'transformers', 'vllm')",
+     )
+     capabilities: list[str] = Field(
+         default_factory=list,
+         description="Framework capabilities (e.g., 'text-generation', 'classification')",
+     )
+
+     # Configuration schema
+     default_model_config: dict[str, Any] = Field(
+         default_factory=dict, description="Default model configuration"
+     )
+     config_schema: dict[str, Any] | None = Field(
+         default=None, description="JSON schema for framework configuration"
+     )
+
+     # Metadata
+     author: str | None = Field(default=None, description="Framework adapter author")
+     contact: str | None = Field(default=None, description="Contact information")
+     documentation_url: str | None = Field(default=None, description="Documentation URL")
+     repository_url: str | None = Field(
+         default=None, description="Source repository URL"
+     )
+
+
+ class ErrorResponse(BaseModel):
+     """Standard error response."""
+
+     error: str = Field(..., description="Type of error")
+     message: str = Field(..., description="Human-readable error message")
+     error_code: str | None = Field(
+         default=None, description="Framework-specific error code"
+     )
+     details: dict[str, Any] | None = Field(
+         default=None, description="Additional error details"
+     )
+     timestamp: datetime = Field(
+         default_factory=datetime.now, description="When error occurred"
+     )
+     request_id: str | None = Field(default=None, description="Request ID for debugging")
+
+
+ class HealthResponse(BaseModel):
+     """Health check response."""
+
+     status: str = Field(
+         ..., description="Health status ('healthy', 'unhealthy', 'degraded')"
+     )
+     framework_id: str = Field(..., description="Framework identifier")
+     version: str = Field(..., description="Framework adapter version")
+
+     # Dependency status
+     dependencies: dict[str, dict[str, Any]] | None = Field(
+         default=None, description="Status of framework dependencies"
+     )
+
+     # Resource information
+     memory_usage: dict[str, Any] | None = Field(
+         default=None, description="Memory usage information"
+     )
+     gpu_usage: dict[str, Any] | None = Field(
+         default=None, description="GPU usage information"
+     )
+
+     # Timing
+     uptime_seconds: float | None = Field(
+         default=None, description="Adapter uptime in seconds"
+     )
+     last_evaluation_time: datetime | None = Field(
+         default=None, description="Time of last evaluation"
+     )
+
+     # Error information for unhealthy status
+     error_message: str | None = Field(
+         default=None, description="Error message when status is unhealthy"
+     )
+
+     # Additional info
+     metadata: dict[str, Any] = Field(
+         default_factory=dict, description="Additional health metadata"
+     )
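
A brief sketch (editorial, not part of the packaged module) of a health payload and a matching error response an adapter might return; the framework identifier and messages are invented.

# Sketch only: shows the default_factory behavior of ErrorResponse.timestamp.
from evalhub.models.api import ErrorResponse, HealthResponse

health = HealthResponse(status="healthy", framework_id="example-framework", version="0.1.0")

error = ErrorResponse(
    error="BenchmarkNotFound",
    message="Benchmark 'unknown-task' is not supported by this adapter",
)
# timestamp is filled in automatically via default_factory=datetime.now
print(error.timestamp.isoformat())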
evalhub/py.typed ADDED
File without changes
@@ -0,0 +1,5 @@
+ """Utility functions and helpers for the EvalHub SDK."""
+
+ from .logging import setup_logging
+
+ __all__ = ["setup_logging"]
@@ -0,0 +1,41 @@
+ """Logging utilities for the EvalHub SDK."""
+
+ import logging
+ import sys
+ from typing import Any
+
+
+ def setup_logging(
+     level: str = "INFO",
+     format_string: str | None = None,
+     stream: Any = None,
+ ) -> logging.Logger:
+     """Set up logging configuration for the EvalHub SDK.
+
+     Args:
+         level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+         format_string: Custom format string for log messages
+         stream: Output stream for logging (defaults to stdout)
+
+     Returns:
+         Configured logger instance
+     """
+     if format_string is None:
+         format_string = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+     if stream is None:
+         stream = sys.stdout
+
+     # Configure root logger
+     logging.basicConfig(
+         level=getattr(logging, level.upper()),
+         format=format_string,
+         stream=stream,
+         force=True,
+     )
+
+     # Return logger for evalhub package
+     logger = logging.getLogger("evalhub")
+     logger.setLevel(getattr(logging, level.upper()))
+
+     return logger
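
A usage sketch (editorial, not part of the packaged module): setup_logging reconfigures the root logger (force=True) and returns the "evalhub" package logger. The import path is an assumption, since the hunk headers above omit the file names for the utility module.

# Sketch only: "evalhub.utils" is an assumed package path for the __init__ shown above.
from evalhub.utils import setup_logging

logger = setup_logging(level="DEBUG")
logger.debug("EvalHub SDK logging configured")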