prime-evals 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- prime_evals-0.1.5/.gitignore +26 -0
- prime_evals-0.1.5/PKG-INFO +307 -0
- prime_evals-0.1.5/README.md +279 -0
- prime_evals-0.1.5/examples/basic_usage.py +193 -0
- prime_evals-0.1.5/pyproject.toml +56 -0
- prime_evals-0.1.5/src/prime_evals/__init__.py +68 -0
- prime_evals-0.1.5/src/prime_evals/core/__init__.py +21 -0
- prime_evals-0.1.5/src/prime_evals/core/client.py +233 -0
- prime_evals-0.1.5/src/prime_evals/core/config.py +56 -0
- prime_evals-0.1.5/src/prime_evals/evals.py +377 -0
- prime_evals-0.1.5/src/prime_evals/exceptions.py +31 -0
- prime_evals-0.1.5/src/prime_evals/models.py +135 -0
- prime_evals-0.1.5/src/prime_evals/py.typed +0 -0
- prime_evals-0.1.5/tests/test_evals.py +163 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*$py.class
|
|
4
|
+
*.so
|
|
5
|
+
.Python
|
|
6
|
+
build/
|
|
7
|
+
develop-eggs/
|
|
8
|
+
dist/
|
|
9
|
+
downloads/
|
|
10
|
+
eggs/
|
|
11
|
+
.eggs/
|
|
12
|
+
lib/
|
|
13
|
+
lib64/
|
|
14
|
+
parts/
|
|
15
|
+
sdist/
|
|
16
|
+
var/
|
|
17
|
+
wheels/
|
|
18
|
+
*.egg-info/
|
|
19
|
+
.installed.cfg
|
|
20
|
+
*.egg
|
|
21
|
+
.env
|
|
22
|
+
venv/
|
|
23
|
+
.venv/
|
|
24
|
+
ENV/
|
|
25
|
+
test_env/
|
|
26
|
+
.DS_Store
|
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: prime-evals
|
|
3
|
+
Version: 0.1.5
|
|
4
|
+
Summary: Prime Intellect Evals SDK - Push and manage evaluations
|
|
5
|
+
Project-URL: Homepage, https://github.com/PrimeIntellect-ai/prime-cli
|
|
6
|
+
Project-URL: Documentation, https://github.com/PrimeIntellect-ai/prime-cli/tree/main/packages/prime-evals
|
|
7
|
+
Project-URL: Repository, https://github.com/PrimeIntellect-ai/prime-cli.git
|
|
8
|
+
Author-email: Prime Intellect <contact@primeintellect.ai>
|
|
9
|
+
License: MIT
|
|
10
|
+
Keywords: evals,evaluations
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: httpx>=0.25.0
|
|
23
|
+
Requires-Dist: pydantic>=2.0.0
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: ruff>=0.13.1; extra == 'dev'
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# Prime Evals SDK
|
|
30
|
+
|
|
31
|
+
Lightweight Python SDK for managing Prime Intellect evaluations - push, track, and analyze your model evaluation results.
|
|
32
|
+
|
|
33
|
+
## Features
|
|
34
|
+
|
|
35
|
+
- **Simple evaluation management** - Create, push samples, and finalize evaluations
|
|
36
|
+
- **Type-safe** - Full type hints and Pydantic models
|
|
37
|
+
- **Authentication caching** - Automatic token management
|
|
38
|
+
- **Environment checking** - Validate environments before pushing
|
|
39
|
+
- **No CLI dependencies** - Pure SDK, lightweight installation
|
|
40
|
+
- **Context manager support** - Automatic resource cleanup
|
|
41
|
+
|
|
42
|
+
## Installation
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
uv pip install prime-evals
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Or with pip:
|
|
49
|
+
```bash
|
|
50
|
+
pip install prime-evals
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Quick Start
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from prime_evals import APIClient, EvalsClient
|
|
57
|
+
|
|
58
|
+
# Initialize client
|
|
59
|
+
api_client = APIClient(api_key="your-api-key")
|
|
60
|
+
client = EvalsClient(api_client)
|
|
61
|
+
|
|
62
|
+
# Create an evaluation
|
|
63
|
+
eval_response = client.create_evaluation(
|
|
64
|
+
name="gsm8k-gpt4o-baseline",
|
|
65
|
+
model_name="gpt-4o-mini",
|
|
66
|
+
dataset="gsm8k",
|
|
67
|
+
framework="verifiers",
|
|
68
|
+
metadata={
|
|
69
|
+
"version": "1.0",
|
|
70
|
+
"num_examples": 10,
|
|
71
|
+
"temperature": 0.7,
|
|
72
|
+
}
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
eval_id = eval_response["evaluation_id"]
|
|
76
|
+
print(f"Created evaluation: {eval_id}")
|
|
77
|
+
|
|
78
|
+
# Push samples
|
|
79
|
+
samples = [
|
|
80
|
+
{
|
|
81
|
+
"example_id": 0,
|
|
82
|
+
"reward": 1.0,
|
|
83
|
+
"correct": True,
|
|
84
|
+
"answer": "18",
|
|
85
|
+
"prompt": [{"role": "user", "content": "What is 9+9?"}],
|
|
86
|
+
"completion": [{"role": "assistant", "content": "The answer is 18."}],
|
|
87
|
+
}
|
|
88
|
+
]
|
|
89
|
+
|
|
90
|
+
client.push_samples(eval_id, samples)
|
|
91
|
+
|
|
92
|
+
# Finalize with metrics
|
|
93
|
+
metrics = {
|
|
94
|
+
"avg_reward": 0.87,
|
|
95
|
+
"avg_correctness": 0.82,
|
|
96
|
+
"success_rate": 0.87,
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
client.finalize_evaluation(eval_id, metrics=metrics)
|
|
100
|
+
print("Evaluation finalized!")
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Async Usage
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
import asyncio
|
|
107
|
+
from prime_evals import AsyncEvalsClient
|
|
108
|
+
|
|
109
|
+
async def main():
|
|
110
|
+
async with AsyncEvalsClient(api_key="your-api-key") as client:
|
|
111
|
+
# Create evaluation
|
|
112
|
+
eval_response = await client.create_evaluation(
|
|
113
|
+
name="my-evaluation",
|
|
114
|
+
model_name="gpt-4o-mini",
|
|
115
|
+
dataset="gsm8k",
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
eval_id = eval_response["evaluation_id"]
|
|
119
|
+
|
|
120
|
+
# Push samples
|
|
121
|
+
await client.push_samples(eval_id, samples)
|
|
122
|
+
|
|
123
|
+
# Finalize
|
|
124
|
+
await client.finalize_evaluation(eval_id)
|
|
125
|
+
|
|
126
|
+
# Client automatically closed
|
|
127
|
+
|
|
128
|
+
asyncio.run(main())
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Authentication
|
|
132
|
+
|
|
133
|
+
The SDK looks for credentials in this order:
|
|
134
|
+
|
|
135
|
+
1. **Direct parameter**: `APIClient(api_key="sk-...")`
|
|
136
|
+
2. **Environment variable**: `export PRIME_API_KEY="sk-..."`
|
|
137
|
+
3. **Config file**: `~/.prime/config.json` (created by `prime login` CLI command)
|
|
138
|
+
|
|
139
|
+
## Complete Example
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from prime_evals import APIClient, EvalsClient
|
|
143
|
+
|
|
144
|
+
# Initialize
|
|
145
|
+
api_client = APIClient(api_key="your-api-key")
|
|
146
|
+
client = EvalsClient(api_client)
|
|
147
|
+
|
|
148
|
+
# Create evaluation with full metadata
|
|
149
|
+
eval_response = client.create_evaluation(
|
|
150
|
+
name="gsm8k-experiment-1",
|
|
151
|
+
model_name="gpt-4o-mini",
|
|
152
|
+
dataset="gsm8k",
|
|
153
|
+
framework="verifiers",
|
|
154
|
+
task_type="math",
|
|
155
|
+
description="Baseline evaluation on GSM8K dataset",
|
|
156
|
+
tags=["baseline", "math", "gsm8k"],
|
|
157
|
+
metadata={
|
|
158
|
+
"version": "1.0",
|
|
159
|
+
"timestamp": "2025-10-09T12:00:00Z",
|
|
160
|
+
"num_examples": 100,
|
|
161
|
+
"temperature": 0.7,
|
|
162
|
+
"max_tokens": 2048,
|
|
163
|
+
}
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
eval_id = eval_response["evaluation_id"]
|
|
167
|
+
|
|
168
|
+
# Push samples in batches
|
|
169
|
+
samples_batch = [
|
|
170
|
+
{
|
|
171
|
+
"example_id": i,
|
|
172
|
+
"task": "gsm8k",
|
|
173
|
+
"reward": 1.0 if i % 2 == 0 else 0.5,
|
|
174
|
+
"correct": i % 2 == 0,
|
|
175
|
+
"format_reward": 1.0,
|
|
176
|
+
"correctness": 1.0 if i % 2 == 0 else 0.0,
|
|
177
|
+
"answer": str(i * 2),
|
|
178
|
+
"prompt": [
|
|
179
|
+
{"role": "system", "content": "Solve the math problem."},
|
|
180
|
+
{"role": "user", "content": f"What is {i} + {i}?"}
|
|
181
|
+
],
|
|
182
|
+
"completion": [
|
|
183
|
+
{"role": "assistant", "content": f"The answer is {i * 2}."}
|
|
184
|
+
],
|
|
185
|
+
"metadata": {"batch": 1}
|
|
186
|
+
}
|
|
187
|
+
for i in range(10)
|
|
188
|
+
]
|
|
189
|
+
|
|
190
|
+
client.push_samples(eval_id, samples_batch)
|
|
191
|
+
|
|
192
|
+
# Finalize with computed metrics
|
|
193
|
+
final_metrics = {
|
|
194
|
+
"avg_reward": 0.75,
|
|
195
|
+
"avg_format_reward": 1.0,
|
|
196
|
+
"avg_correctness": 0.50,
|
|
197
|
+
"success_rate": 0.75,
|
|
198
|
+
"total_samples": len(samples_batch),
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
client.finalize_evaluation(eval_id, metrics=final_metrics)
|
|
202
|
+
|
|
203
|
+
# Retrieve evaluation details
|
|
204
|
+
eval_details = client.get_evaluation(eval_id)
|
|
205
|
+
print(f"Evaluation Status: {eval_details.get('status')}")
|
|
206
|
+
|
|
207
|
+
# List all evaluations
|
|
208
|
+
evaluations = client.list_evaluations(limit=10)
|
|
209
|
+
for evaluation in evaluations.get("evaluations", []):
|
|
210
|
+
print(f"{evaluation['name']}: {evaluation.get('total_samples', 0)} samples")
|
|
211
|
+
|
|
212
|
+
# Get samples
|
|
213
|
+
samples_response = client.get_samples(eval_id, page=1, limit=100)
|
|
214
|
+
print(f"Retrieved {len(samples_response.get('samples', []))} samples")
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## Push from JSON File
|
|
218
|
+
|
|
219
|
+
You can also push evaluations from a JSON file:
|
|
220
|
+
|
|
221
|
+
```python
|
|
222
|
+
import json
|
|
223
|
+
from prime_evals import APIClient, EvalsClient
|
|
224
|
+
|
|
225
|
+
with open("eval_results.json") as f:
|
|
226
|
+
eval_data = json.load(f)
|
|
227
|
+
|
|
228
|
+
api_client = APIClient()
|
|
229
|
+
client = EvalsClient(api_client)
|
|
230
|
+
# Create
|
|
231
|
+
eval_response = client.create_evaluation(
|
|
232
|
+
name=eval_data["eval_name"],
|
|
233
|
+
model_name=eval_data["model_name"],
|
|
234
|
+
dataset=eval_data["dataset"],
|
|
235
|
+
metadata=eval_data.get("metadata"),
|
|
236
|
+
metrics=eval_data.get("metrics"),
|
|
237
|
+
)
|
|
238
|
+
|
|
239
|
+
eval_id = eval_response["evaluation_id"]
|
|
240
|
+
|
|
241
|
+
# Push samples
|
|
242
|
+
if "results" in eval_data:
|
|
243
|
+
client.push_samples(eval_id, eval_data["results"])
|
|
244
|
+
|
|
245
|
+
# Finalize
|
|
246
|
+
client.finalize_evaluation(eval_id, metrics=eval_data.get("metrics"))
|
|
247
|
+
|
|
248
|
+
print(f"Successfully pushed evaluation: {eval_id}")
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
## API Reference
|
|
252
|
+
|
|
253
|
+
### EvalsClient
|
|
254
|
+
|
|
255
|
+
Main client for interacting with the Prime Evals API.
|
|
256
|
+
|
|
257
|
+
**Methods:**
|
|
258
|
+
|
|
259
|
+
- `create_evaluation()` - Create a new evaluation
|
|
260
|
+
- `push_samples()` - Push evaluation samples
|
|
261
|
+
- `finalize_evaluation()` - Finalize an evaluation with final metrics
|
|
262
|
+
- `get_evaluation()` - Get evaluation details by ID
|
|
263
|
+
- `list_evaluations()` - List evaluations with optional filters
|
|
264
|
+
- `get_samples()` - Get samples for an evaluation
|
|
265
|
+
|
|
266
|
+
### AsyncEvalsClient
|
|
267
|
+
|
|
268
|
+
Async version of EvalsClient with the same methods (all async).
|
|
269
|
+
|
|
270
|
+
### Models
|
|
271
|
+
|
|
272
|
+
**Evaluation**
|
|
273
|
+
- Full evaluation object with metadata
|
|
274
|
+
|
|
275
|
+
**Sample**
|
|
276
|
+
- Individual evaluation sample with prompt/completion/scores
|
|
277
|
+
|
|
278
|
+
**CreateEvaluationRequest**
|
|
279
|
+
- Request model for creating evaluations
|
|
280
|
+
|
|
281
|
+
**EvaluationStatus**
|
|
282
|
+
- Enum: PENDING, RUNNING, COMPLETED, FAILED, CANCELLED
|
|
283
|
+
|
|
284
|
+
## Error Handling
|
|
285
|
+
|
|
286
|
+
```python
|
|
287
|
+
from prime_evals import APIClient, EvalsClient, EvalsAPIError, EvaluationNotFoundError
|
|
288
|
+
|
|
289
|
+
try:
|
|
290
|
+
api_client = APIClient()
|
|
291
|
+
client = EvalsClient(api_client)
|
|
292
|
+
client.get_evaluation("non-existent-id")
|
|
293
|
+
except EvaluationNotFoundError:
|
|
294
|
+
print("Evaluation not found")
|
|
295
|
+
except EvalsAPIError as e:
|
|
296
|
+
print(f"API error: {e}")
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
## Related Packages
|
|
300
|
+
|
|
301
|
+
- **`prime`** - Full CLI + SDK with pods, sandboxes, inference, and more (includes this package)
|
|
302
|
+
- **`prime-sandboxes`** - SDK for managing remote code execution environments
|
|
303
|
+
|
|
304
|
+
## License
|
|
305
|
+
|
|
306
|
+
MIT License - see LICENSE file for details
|
|
307
|
+
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
# Prime Evals SDK
|
|
2
|
+
|
|
3
|
+
Lightweight Python SDK for managing Prime Intellect evaluations - push, track, and analyze your model evaluation results.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Simple evaluation management** - Create, push samples, and finalize evaluations
|
|
8
|
+
- **Type-safe** - Full type hints and Pydantic models
|
|
9
|
+
- **Authentication caching** - Automatic token management
|
|
10
|
+
- **Environment checking** - Validate environments before pushing
|
|
11
|
+
- **No CLI dependencies** - Pure SDK, lightweight installation
|
|
12
|
+
- **Context manager support** - Automatic resource cleanup
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
uv pip install prime-evals
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Or with pip:
|
|
21
|
+
```bash
|
|
22
|
+
pip install prime-evals
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from prime_evals import APIClient, EvalsClient
|
|
29
|
+
|
|
30
|
+
# Initialize client
|
|
31
|
+
api_client = APIClient(api_key="your-api-key")
|
|
32
|
+
client = EvalsClient(api_client)
|
|
33
|
+
|
|
34
|
+
# Create an evaluation
|
|
35
|
+
eval_response = client.create_evaluation(
|
|
36
|
+
name="gsm8k-gpt4o-baseline",
|
|
37
|
+
model_name="gpt-4o-mini",
|
|
38
|
+
dataset="gsm8k",
|
|
39
|
+
framework="verifiers",
|
|
40
|
+
metadata={
|
|
41
|
+
"version": "1.0",
|
|
42
|
+
"num_examples": 10,
|
|
43
|
+
"temperature": 0.7,
|
|
44
|
+
}
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
eval_id = eval_response["evaluation_id"]
|
|
48
|
+
print(f"Created evaluation: {eval_id}")
|
|
49
|
+
|
|
50
|
+
# Push samples
|
|
51
|
+
samples = [
|
|
52
|
+
{
|
|
53
|
+
"example_id": 0,
|
|
54
|
+
"reward": 1.0,
|
|
55
|
+
"correct": True,
|
|
56
|
+
"answer": "18",
|
|
57
|
+
"prompt": [{"role": "user", "content": "What is 9+9?"}],
|
|
58
|
+
"completion": [{"role": "assistant", "content": "The answer is 18."}],
|
|
59
|
+
}
|
|
60
|
+
]
|
|
61
|
+
|
|
62
|
+
client.push_samples(eval_id, samples)
|
|
63
|
+
|
|
64
|
+
# Finalize with metrics
|
|
65
|
+
metrics = {
|
|
66
|
+
"avg_reward": 0.87,
|
|
67
|
+
"avg_correctness": 0.82,
|
|
68
|
+
"success_rate": 0.87,
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
client.finalize_evaluation(eval_id, metrics=metrics)
|
|
72
|
+
print("Evaluation finalized!")
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Async Usage
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
import asyncio
|
|
79
|
+
from prime_evals import AsyncEvalsClient
|
|
80
|
+
|
|
81
|
+
async def main():
|
|
82
|
+
async with AsyncEvalsClient(api_key="your-api-key") as client:
|
|
83
|
+
# Create evaluation
|
|
84
|
+
eval_response = await client.create_evaluation(
|
|
85
|
+
name="my-evaluation",
|
|
86
|
+
model_name="gpt-4o-mini",
|
|
87
|
+
dataset="gsm8k",
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
eval_id = eval_response["evaluation_id"]
|
|
91
|
+
|
|
92
|
+
# Push samples
|
|
93
|
+
await client.push_samples(eval_id, samples)
|
|
94
|
+
|
|
95
|
+
# Finalize
|
|
96
|
+
await client.finalize_evaluation(eval_id)
|
|
97
|
+
|
|
98
|
+
# Client automatically closed
|
|
99
|
+
|
|
100
|
+
asyncio.run(main())
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Authentication
|
|
104
|
+
|
|
105
|
+
The SDK looks for credentials in this order:
|
|
106
|
+
|
|
107
|
+
1. **Direct parameter**: `APIClient(api_key="sk-...")`
|
|
108
|
+
2. **Environment variable**: `export PRIME_API_KEY="sk-..."`
|
|
109
|
+
3. **Config file**: `~/.prime/config.json` (created by `prime login` CLI command)
|
|
110
|
+
|
|
111
|
+
## Complete Example
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from prime_evals import APIClient, EvalsClient
|
|
115
|
+
|
|
116
|
+
# Initialize
|
|
117
|
+
api_client = APIClient(api_key="your-api-key")
|
|
118
|
+
client = EvalsClient(api_client)
|
|
119
|
+
|
|
120
|
+
# Create evaluation with full metadata
|
|
121
|
+
eval_response = client.create_evaluation(
|
|
122
|
+
name="gsm8k-experiment-1",
|
|
123
|
+
model_name="gpt-4o-mini",
|
|
124
|
+
dataset="gsm8k",
|
|
125
|
+
framework="verifiers",
|
|
126
|
+
task_type="math",
|
|
127
|
+
description="Baseline evaluation on GSM8K dataset",
|
|
128
|
+
tags=["baseline", "math", "gsm8k"],
|
|
129
|
+
metadata={
|
|
130
|
+
"version": "1.0",
|
|
131
|
+
"timestamp": "2025-10-09T12:00:00Z",
|
|
132
|
+
"num_examples": 100,
|
|
133
|
+
"temperature": 0.7,
|
|
134
|
+
"max_tokens": 2048,
|
|
135
|
+
}
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
eval_id = eval_response["evaluation_id"]
|
|
139
|
+
|
|
140
|
+
# Push samples in batches
|
|
141
|
+
samples_batch = [
|
|
142
|
+
{
|
|
143
|
+
"example_id": i,
|
|
144
|
+
"task": "gsm8k",
|
|
145
|
+
"reward": 1.0 if i % 2 == 0 else 0.5,
|
|
146
|
+
"correct": i % 2 == 0,
|
|
147
|
+
"format_reward": 1.0,
|
|
148
|
+
"correctness": 1.0 if i % 2 == 0 else 0.0,
|
|
149
|
+
"answer": str(i * 2),
|
|
150
|
+
"prompt": [
|
|
151
|
+
{"role": "system", "content": "Solve the math problem."},
|
|
152
|
+
{"role": "user", "content": f"What is {i} + {i}?"}
|
|
153
|
+
],
|
|
154
|
+
"completion": [
|
|
155
|
+
{"role": "assistant", "content": f"The answer is {i * 2}."}
|
|
156
|
+
],
|
|
157
|
+
"metadata": {"batch": 1}
|
|
158
|
+
}
|
|
159
|
+
for i in range(10)
|
|
160
|
+
]
|
|
161
|
+
|
|
162
|
+
client.push_samples(eval_id, samples_batch)
|
|
163
|
+
|
|
164
|
+
# Finalize with computed metrics
|
|
165
|
+
final_metrics = {
|
|
166
|
+
"avg_reward": 0.75,
|
|
167
|
+
"avg_format_reward": 1.0,
|
|
168
|
+
"avg_correctness": 0.50,
|
|
169
|
+
"success_rate": 0.75,
|
|
170
|
+
"total_samples": len(samples_batch),
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
client.finalize_evaluation(eval_id, metrics=final_metrics)
|
|
174
|
+
|
|
175
|
+
# Retrieve evaluation details
|
|
176
|
+
eval_details = client.get_evaluation(eval_id)
|
|
177
|
+
print(f"Evaluation Status: {eval_details.get('status')}")
|
|
178
|
+
|
|
179
|
+
# List all evaluations
|
|
180
|
+
evaluations = client.list_evaluations(limit=10)
|
|
181
|
+
for evaluation in evaluations.get("evaluations", []):
|
|
182
|
+
print(f"{evaluation['name']}: {evaluation.get('total_samples', 0)} samples")
|
|
183
|
+
|
|
184
|
+
# Get samples
|
|
185
|
+
samples_response = client.get_samples(eval_id, page=1, limit=100)
|
|
186
|
+
print(f"Retrieved {len(samples_response.get('samples', []))} samples")
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## Push from JSON File
|
|
190
|
+
|
|
191
|
+
You can also push evaluations from a JSON file:
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
import json
|
|
195
|
+
from prime_evals import APIClient, EvalsClient
|
|
196
|
+
|
|
197
|
+
with open("eval_results.json") as f:
|
|
198
|
+
eval_data = json.load(f)
|
|
199
|
+
|
|
200
|
+
api_client = APIClient()
|
|
201
|
+
client = EvalsClient(api_client)
|
|
202
|
+
# Create
|
|
203
|
+
eval_response = client.create_evaluation(
|
|
204
|
+
name=eval_data["eval_name"],
|
|
205
|
+
model_name=eval_data["model_name"],
|
|
206
|
+
dataset=eval_data["dataset"],
|
|
207
|
+
metadata=eval_data.get("metadata"),
|
|
208
|
+
metrics=eval_data.get("metrics"),
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
eval_id = eval_response["evaluation_id"]
|
|
212
|
+
|
|
213
|
+
# Push samples
|
|
214
|
+
if "results" in eval_data:
|
|
215
|
+
client.push_samples(eval_id, eval_data["results"])
|
|
216
|
+
|
|
217
|
+
# Finalize
|
|
218
|
+
client.finalize_evaluation(eval_id, metrics=eval_data.get("metrics"))
|
|
219
|
+
|
|
220
|
+
print(f"Successfully pushed evaluation: {eval_id}")
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
## API Reference
|
|
224
|
+
|
|
225
|
+
### EvalsClient
|
|
226
|
+
|
|
227
|
+
Main client for interacting with the Prime Evals API.
|
|
228
|
+
|
|
229
|
+
**Methods:**
|
|
230
|
+
|
|
231
|
+
- `create_evaluation()` - Create a new evaluation
|
|
232
|
+
- `push_samples()` - Push evaluation samples
|
|
233
|
+
- `finalize_evaluation()` - Finalize an evaluation with final metrics
|
|
234
|
+
- `get_evaluation()` - Get evaluation details by ID
|
|
235
|
+
- `list_evaluations()` - List evaluations with optional filters
|
|
236
|
+
- `get_samples()` - Get samples for an evaluation
|
|
237
|
+
|
|
238
|
+
### AsyncEvalsClient
|
|
239
|
+
|
|
240
|
+
Async version of EvalsClient with the same methods (all async).
|
|
241
|
+
|
|
242
|
+
### Models
|
|
243
|
+
|
|
244
|
+
**Evaluation**
|
|
245
|
+
- Full evaluation object with metadata
|
|
246
|
+
|
|
247
|
+
**Sample**
|
|
248
|
+
- Individual evaluation sample with prompt/completion/scores
|
|
249
|
+
|
|
250
|
+
**CreateEvaluationRequest**
|
|
251
|
+
- Request model for creating evaluations
|
|
252
|
+
|
|
253
|
+
**EvaluationStatus**
|
|
254
|
+
- Enum: PENDING, RUNNING, COMPLETED, FAILED, CANCELLED
|
|
255
|
+
|
|
256
|
+
## Error Handling
|
|
257
|
+
|
|
258
|
+
```python
|
|
259
|
+
from prime_evals import APIClient, EvalsClient, EvalsAPIError, EvaluationNotFoundError
|
|
260
|
+
|
|
261
|
+
try:
|
|
262
|
+
api_client = APIClient()
|
|
263
|
+
client = EvalsClient(api_client)
|
|
264
|
+
client.get_evaluation("non-existent-id")
|
|
265
|
+
except EvaluationNotFoundError:
|
|
266
|
+
print("Evaluation not found")
|
|
267
|
+
except EvalsAPIError as e:
|
|
268
|
+
print(f"API error: {e}")
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
## Related Packages
|
|
272
|
+
|
|
273
|
+
- **`prime`** - Full CLI + SDK with pods, sandboxes, inference, and more (includes this package)
|
|
274
|
+
- **`prime-sandboxes`** - SDK for managing remote code execution environments
|
|
275
|
+
|
|
276
|
+
## License
|
|
277
|
+
|
|
278
|
+
MIT License - see LICENSE file for details
|
|
279
|
+
|