prime-evals 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.so
5
+ .Python
6
+ build/
7
+ develop-eggs/
8
+ dist/
9
+ downloads/
10
+ eggs/
11
+ .eggs/
12
+ lib/
13
+ lib64/
14
+ parts/
15
+ sdist/
16
+ var/
17
+ wheels/
18
+ *.egg-info/
19
+ .installed.cfg
20
+ *.egg
21
+ .env
22
+ venv/
23
+ .venv/
24
+ ENV/
25
+ test_env/
26
+ .DS_Store
@@ -0,0 +1,307 @@
1
+ Metadata-Version: 2.4
2
+ Name: prime-evals
3
+ Version: 0.1.5
4
+ Summary: Prime Intellect Evals SDK - Push and manage evaluations
5
+ Project-URL: Homepage, https://github.com/PrimeIntellect-ai/prime-cli
6
+ Project-URL: Documentation, https://github.com/PrimeIntellect-ai/prime-cli/tree/main/packages/prime-evals
7
+ Project-URL: Repository, https://github.com/PrimeIntellect-ai/prime-cli.git
8
+ Author-email: Prime Intellect <contact@primeintellect.ai>
9
+ License: MIT
10
+ Keywords: evals,evaluations
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Requires-Python: >=3.10
22
+ Requires-Dist: httpx>=0.25.0
23
+ Requires-Dist: pydantic>=2.0.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
26
+ Requires-Dist: ruff>=0.13.1; extra == 'dev'
27
+ Description-Content-Type: text/markdown
28
+
29
+ # Prime Evals SDK
30
+
31
+ Lightweight Python SDK for managing Prime Intellect evaluations - push, track, and analyze your model evaluation results.
32
+
33
+ ## Features
34
+
35
+ - **Simple evaluation management** - Create, push samples, and finalize evaluations
36
+ - **Type-safe** - Full type hints and Pydantic models
37
+ - **Authentication caching** - Automatic token management
38
+ - **Environment checking** - Validate environments before pushing
39
+ - **No CLI dependencies** - Pure SDK, lightweight installation
40
+ - **Context manager support** - Automatic resource cleanup
41
+
42
+ ## Installation
43
+
44
+ ```bash
45
+ uv pip install prime-evals
46
+ ```
47
+
48
+ Or with pip:
49
+ ```bash
50
+ pip install prime-evals
51
+ ```
52
+
53
+ ## Quick Start
54
+
55
+ ```python
56
+ from prime_evals import APIClient, EvalsClient
57
+
58
+ # Initialize client
59
+ api_client = APIClient(api_key="your-api-key")
60
+ client = EvalsClient(api_client)
61
+
62
+ # Create an evaluation
63
+ eval_response = client.create_evaluation(
64
+ name="gsm8k-gpt4o-baseline",
65
+ model_name="gpt-4o-mini",
66
+ dataset="gsm8k",
67
+ framework="verifiers",
68
+ metadata={
69
+ "version": "1.0",
70
+ "num_examples": 10,
71
+ "temperature": 0.7,
72
+ }
73
+ )
74
+
75
+ eval_id = eval_response["evaluation_id"]
76
+ print(f"Created evaluation: {eval_id}")
77
+
78
+ # Push samples
79
+ samples = [
80
+ {
81
+ "example_id": 0,
82
+ "reward": 1.0,
83
+ "correct": True,
84
+ "answer": "18",
85
+ "prompt": [{"role": "user", "content": "What is 9+9?"}],
86
+ "completion": [{"role": "assistant", "content": "The answer is 18."}],
87
+ }
88
+ ]
89
+
90
+ client.push_samples(eval_id, samples)
91
+
92
+ # Finalize with metrics
93
+ metrics = {
94
+ "avg_reward": 0.87,
95
+ "avg_correctness": 0.82,
96
+ "success_rate": 0.87,
97
+ }
98
+
99
+ client.finalize_evaluation(eval_id, metrics=metrics)
100
+ print("Evaluation finalized!")
101
+ ```
102
+
103
+ ## Async Usage
104
+
105
+ ```python
106
+ import asyncio
107
+ from prime_evals import AsyncEvalsClient
108
+
109
+ async def main():
110
+ async with AsyncEvalsClient(api_key="your-api-key") as client:
111
+ # Create evaluation
112
+ eval_response = await client.create_evaluation(
113
+ name="my-evaluation",
114
+ model_name="gpt-4o-mini",
115
+ dataset="gsm8k",
116
+ )
117
+
118
+ eval_id = eval_response["evaluation_id"]
119
+
120
+ # Push samples (`samples` is a list of sample dicts, as defined in Quick Start)
121
+ await client.push_samples(eval_id, samples)
122
+
123
+ # Finalize
124
+ await client.finalize_evaluation(eval_id)
125
+
126
+ # Client automatically closed
127
+
128
+ asyncio.run(main())
129
+ ```
130
+
131
+ ## Authentication
132
+
133
+ The SDK looks for credentials in this order:
134
+
135
+ 1. **Direct parameter**: `APIClient(api_key="sk-...")`
136
+ 2. **Environment variable**: `export PRIME_API_KEY="sk-..."`
137
+ 3. **Config file**: `~/.prime/config.json` (created by `prime login` CLI command)
138
+
139
+ ## Complete Example
140
+
141
+ ```python
142
+ from prime_evals import APIClient, EvalsClient
143
+
144
+ # Initialize
145
+ api_client = APIClient(api_key="your-api-key")
146
+ client = EvalsClient(api_client)
147
+
148
+ # Create evaluation with full metadata
149
+ eval_response = client.create_evaluation(
150
+ name="gsm8k-experiment-1",
151
+ model_name="gpt-4o-mini",
152
+ dataset="gsm8k",
153
+ framework="verifiers",
154
+ task_type="math",
155
+ description="Baseline evaluation on GSM8K dataset",
156
+ tags=["baseline", "math", "gsm8k"],
157
+ metadata={
158
+ "version": "1.0",
159
+ "timestamp": "2025-10-09T12:00:00Z",
160
+ "num_examples": 100,
161
+ "temperature": 0.7,
162
+ "max_tokens": 2048,
163
+ }
164
+ )
165
+
166
+ eval_id = eval_response["evaluation_id"]
167
+
168
+ # Push samples in batches
169
+ samples_batch = [
170
+ {
171
+ "example_id": i,
172
+ "task": "gsm8k",
173
+ "reward": 1.0 if i % 2 == 0 else 0.5,
174
+ "correct": i % 2 == 0,
175
+ "format_reward": 1.0,
176
+ "correctness": 1.0 if i % 2 == 0 else 0.0,
177
+ "answer": str(i * 2),
178
+ "prompt": [
179
+ {"role": "system", "content": "Solve the math problem."},
180
+ {"role": "user", "content": f"What is {i} + {i}?"}
181
+ ],
182
+ "completion": [
183
+ {"role": "assistant", "content": f"The answer is {i * 2}."}
184
+ ],
185
+ "metadata": {"batch": 1}
186
+ }
187
+ for i in range(10)
188
+ ]
189
+
190
+ client.push_samples(eval_id, samples_batch)
191
+
192
+ # Finalize with computed metrics
193
+ final_metrics = {
194
+ "avg_reward": 0.75,
195
+ "avg_format_reward": 1.0,
196
+ "avg_correctness": 0.50,
197
+ "success_rate": 0.75,
198
+ "total_samples": len(samples_batch),
199
+ }
200
+
201
+ client.finalize_evaluation(eval_id, metrics=final_metrics)
202
+
203
+ # Retrieve evaluation details
204
+ eval_details = client.get_evaluation(eval_id)
205
+ print(f"Evaluation Status: {eval_details.get('status')}")
206
+
207
+ # List all evaluations
208
+ evaluations = client.list_evaluations(limit=10)
209
+ for evaluation in evaluations.get("evaluations", []):
210
+ print(f"{evaluation['name']}: {evaluation.get('total_samples', 0)} samples")
211
+
212
+ # Get samples
213
+ samples_response = client.get_samples(eval_id, page=1, limit=100)
214
+ print(f"Retrieved {len(samples_response.get('samples', []))} samples")
215
+ ```
216
+
217
+ ## Push from JSON File
218
+
219
+ You can also push evaluations from a JSON file:
220
+
221
+ ```python
222
+ import json
223
+ from prime_evals import APIClient, EvalsClient
224
+
225
+ with open("eval_results.json") as f:
226
+ eval_data = json.load(f)
227
+
228
+ api_client = APIClient()
229
+ client = EvalsClient(api_client)
230
+ # Create
231
+ eval_response = client.create_evaluation(
232
+ name=eval_data["eval_name"],
233
+ model_name=eval_data["model_name"],
234
+ dataset=eval_data["dataset"],
235
+ metadata=eval_data.get("metadata"),
236
+ metrics=eval_data.get("metrics"),
237
+ )
238
+
239
+ eval_id = eval_response["evaluation_id"]
240
+
241
+ # Push samples
242
+ if "results" in eval_data:
243
+ client.push_samples(eval_id, eval_data["results"])
244
+
245
+ # Finalize
246
+ client.finalize_evaluation(eval_id, metrics=eval_data.get("metrics"))
247
+
248
+ print(f"Successfully pushed evaluation: {eval_id}")
249
+ ```
250
+
251
+ ## API Reference
252
+
253
+ ### EvalsClient
254
+
255
+ Main client for interacting with the Prime Evals API.
256
+
257
+ **Methods:**
258
+
259
+ - `create_evaluation()` - Create a new evaluation
260
+ - `push_samples()` - Push evaluation samples
261
+ - `finalize_evaluation()` - Finalize an evaluation with final metrics
262
+ - `get_evaluation()` - Get evaluation details by ID
263
+ - `list_evaluations()` - List evaluations with optional filters
264
+ - `get_samples()` - Get samples for an evaluation
265
+
266
+ ### AsyncEvalsClient
267
+
268
+ Async version of EvalsClient with the same methods (all async).
269
+
270
+ ### Models
271
+
272
+ **Evaluation**
273
+ - Full evaluation object with metadata
274
+
275
+ **Sample**
276
+ - Individual evaluation sample with prompt/completion/scores
277
+
278
+ **CreateEvaluationRequest**
279
+ - Request model for creating evaluations
280
+
281
+ **EvaluationStatus**
282
+ - Enum: PENDING, RUNNING, COMPLETED, FAILED, CANCELLED
283
+
284
+ ## Error Handling
285
+
286
+ ```python
287
+ from prime_evals import APIClient, EvalsClient, EvalsAPIError, EvaluationNotFoundError
288
+
289
+ try:
290
+ api_client = APIClient()
291
+ client = EvalsClient(api_client)
292
+ client.get_evaluation("non-existent-id")
293
+ except EvaluationNotFoundError:
294
+ print("Evaluation not found")
295
+ except EvalsAPIError as e:
296
+ print(f"API error: {e}")
297
+ ```
298
+
299
+ ## Related Packages
300
+
301
+ - **`prime`** - Full CLI + SDK with pods, sandboxes, inference, and more (includes this package)
302
+ - **`prime-sandboxes`** - SDK for managing remote code execution environments
303
+
304
+ ## License
305
+
306
+ MIT License - see LICENSE file for details
307
+
@@ -0,0 +1,279 @@
1
+ # Prime Evals SDK
2
+
3
+ Lightweight Python SDK for managing Prime Intellect evaluations - push, track, and analyze your model evaluation results.
4
+
5
+ ## Features
6
+
7
+ - **Simple evaluation management** - Create, push samples, and finalize evaluations
8
+ - **Type-safe** - Full type hints and Pydantic models
9
+ - **Authentication caching** - Automatic token management
10
+ - **Environment checking** - Validate environments before pushing
11
+ - **No CLI dependencies** - Pure SDK, lightweight installation
12
+ - **Context manager support** - Automatic resource cleanup
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ uv pip install prime-evals
18
+ ```
19
+
20
+ Or with pip:
21
+ ```bash
22
+ pip install prime-evals
23
+ ```
24
+
25
+ ## Quick Start
26
+
27
+ ```python
28
+ from prime_evals import APIClient, EvalsClient
29
+
30
+ # Initialize client
31
+ api_client = APIClient(api_key="your-api-key")
32
+ client = EvalsClient(api_client)
33
+
34
+ # Create an evaluation
35
+ eval_response = client.create_evaluation(
36
+ name="gsm8k-gpt4o-baseline",
37
+ model_name="gpt-4o-mini",
38
+ dataset="gsm8k",
39
+ framework="verifiers",
40
+ metadata={
41
+ "version": "1.0",
42
+ "num_examples": 10,
43
+ "temperature": 0.7,
44
+ }
45
+ )
46
+
47
+ eval_id = eval_response["evaluation_id"]
48
+ print(f"Created evaluation: {eval_id}")
49
+
50
+ # Push samples
51
+ samples = [
52
+ {
53
+ "example_id": 0,
54
+ "reward": 1.0,
55
+ "correct": True,
56
+ "answer": "18",
57
+ "prompt": [{"role": "user", "content": "What is 9+9?"}],
58
+ "completion": [{"role": "assistant", "content": "The answer is 18."}],
59
+ }
60
+ ]
61
+
62
+ client.push_samples(eval_id, samples)
63
+
64
+ # Finalize with metrics
65
+ metrics = {
66
+ "avg_reward": 0.87,
67
+ "avg_correctness": 0.82,
68
+ "success_rate": 0.87,
69
+ }
70
+
71
+ client.finalize_evaluation(eval_id, metrics=metrics)
72
+ print("Evaluation finalized!")
73
+ ```
74
+
75
+ ## Async Usage
76
+
77
+ ```python
78
+ import asyncio
79
+ from prime_evals import AsyncEvalsClient
80
+
81
+ async def main():
82
+ async with AsyncEvalsClient(api_key="your-api-key") as client:
83
+ # Create evaluation
84
+ eval_response = await client.create_evaluation(
85
+ name="my-evaluation",
86
+ model_name="gpt-4o-mini",
87
+ dataset="gsm8k",
88
+ )
89
+
90
+ eval_id = eval_response["evaluation_id"]
91
+
92
+ # Push samples (`samples` is a list of sample dicts, as defined in Quick Start)
93
+ await client.push_samples(eval_id, samples)
94
+
95
+ # Finalize
96
+ await client.finalize_evaluation(eval_id)
97
+
98
+ # Client automatically closed
99
+
100
+ asyncio.run(main())
101
+ ```
102
+
103
+ ## Authentication
104
+
105
+ The SDK looks for credentials in this order:
106
+
107
+ 1. **Direct parameter**: `APIClient(api_key="sk-...")`
108
+ 2. **Environment variable**: `export PRIME_API_KEY="sk-..."`
109
+ 3. **Config file**: `~/.prime/config.json` (created by `prime login` CLI command)
110
+
111
+ ## Complete Example
112
+
113
+ ```python
114
+ from prime_evals import APIClient, EvalsClient
115
+
116
+ # Initialize
117
+ api_client = APIClient(api_key="your-api-key")
118
+ client = EvalsClient(api_client)
119
+
120
+ # Create evaluation with full metadata
121
+ eval_response = client.create_evaluation(
122
+ name="gsm8k-experiment-1",
123
+ model_name="gpt-4o-mini",
124
+ dataset="gsm8k",
125
+ framework="verifiers",
126
+ task_type="math",
127
+ description="Baseline evaluation on GSM8K dataset",
128
+ tags=["baseline", "math", "gsm8k"],
129
+ metadata={
130
+ "version": "1.0",
131
+ "timestamp": "2025-10-09T12:00:00Z",
132
+ "num_examples": 100,
133
+ "temperature": 0.7,
134
+ "max_tokens": 2048,
135
+ }
136
+ )
137
+
138
+ eval_id = eval_response["evaluation_id"]
139
+
140
+ # Push samples in batches
141
+ samples_batch = [
142
+ {
143
+ "example_id": i,
144
+ "task": "gsm8k",
145
+ "reward": 1.0 if i % 2 == 0 else 0.5,
146
+ "correct": i % 2 == 0,
147
+ "format_reward": 1.0,
148
+ "correctness": 1.0 if i % 2 == 0 else 0.0,
149
+ "answer": str(i * 2),
150
+ "prompt": [
151
+ {"role": "system", "content": "Solve the math problem."},
152
+ {"role": "user", "content": f"What is {i} + {i}?"}
153
+ ],
154
+ "completion": [
155
+ {"role": "assistant", "content": f"The answer is {i * 2}."}
156
+ ],
157
+ "metadata": {"batch": 1}
158
+ }
159
+ for i in range(10)
160
+ ]
161
+
162
+ client.push_samples(eval_id, samples_batch)
163
+
164
+ # Finalize with computed metrics
165
+ final_metrics = {
166
+ "avg_reward": 0.75,
167
+ "avg_format_reward": 1.0,
168
+ "avg_correctness": 0.50,
169
+ "success_rate": 0.75,
170
+ "total_samples": len(samples_batch),
171
+ }
172
+
173
+ client.finalize_evaluation(eval_id, metrics=final_metrics)
174
+
175
+ # Retrieve evaluation details
176
+ eval_details = client.get_evaluation(eval_id)
177
+ print(f"Evaluation Status: {eval_details.get('status')}")
178
+
179
+ # List all evaluations
180
+ evaluations = client.list_evaluations(limit=10)
181
+ for evaluation in evaluations.get("evaluations", []):
182
+ print(f"{evaluation['name']}: {evaluation.get('total_samples', 0)} samples")
183
+
184
+ # Get samples
185
+ samples_response = client.get_samples(eval_id, page=1, limit=100)
186
+ print(f"Retrieved {len(samples_response.get('samples', []))} samples")
187
+ ```
188
+
189
+ ## Push from JSON File
190
+
191
+ You can also push evaluations from a JSON file:
192
+
193
+ ```python
194
+ import json
195
+ from prime_evals import APIClient, EvalsClient
196
+
197
+ with open("eval_results.json") as f:
198
+ eval_data = json.load(f)
199
+
200
+ api_client = APIClient()
201
+ client = EvalsClient(api_client)
202
+ # Create
203
+ eval_response = client.create_evaluation(
204
+ name=eval_data["eval_name"],
205
+ model_name=eval_data["model_name"],
206
+ dataset=eval_data["dataset"],
207
+ metadata=eval_data.get("metadata"),
208
+ metrics=eval_data.get("metrics"),
209
+ )
210
+
211
+ eval_id = eval_response["evaluation_id"]
212
+
213
+ # Push samples
214
+ if "results" in eval_data:
215
+ client.push_samples(eval_id, eval_data["results"])
216
+
217
+ # Finalize
218
+ client.finalize_evaluation(eval_id, metrics=eval_data.get("metrics"))
219
+
220
+ print(f"Successfully pushed evaluation: {eval_id}")
221
+ ```
222
+
223
+ ## API Reference
224
+
225
+ ### EvalsClient
226
+
227
+ Main client for interacting with the Prime Evals API.
228
+
229
+ **Methods:**
230
+
231
+ - `create_evaluation()` - Create a new evaluation
232
+ - `push_samples()` - Push evaluation samples
233
+ - `finalize_evaluation()` - Finalize an evaluation with final metrics
234
+ - `get_evaluation()` - Get evaluation details by ID
235
+ - `list_evaluations()` - List evaluations with optional filters
236
+ - `get_samples()` - Get samples for an evaluation
237
+
238
+ ### AsyncEvalsClient
239
+
240
+ Async version of EvalsClient with the same methods (all async).
241
+
242
+ ### Models
243
+
244
+ **Evaluation**
245
+ - Full evaluation object with metadata
246
+
247
+ **Sample**
248
+ - Individual evaluation sample with prompt/completion/scores
249
+
250
+ **CreateEvaluationRequest**
251
+ - Request model for creating evaluations
252
+
253
+ **EvaluationStatus**
254
+ - Enum: PENDING, RUNNING, COMPLETED, FAILED, CANCELLED
255
+
256
+ ## Error Handling
257
+
258
+ ```python
259
+ from prime_evals import APIClient, EvalsClient, EvalsAPIError, EvaluationNotFoundError
260
+
261
+ try:
262
+ api_client = APIClient()
263
+ client = EvalsClient(api_client)
264
+ client.get_evaluation("non-existent-id")
265
+ except EvaluationNotFoundError:
266
+ print("Evaluation not found")
267
+ except EvalsAPIError as e:
268
+ print(f"API error: {e}")
269
+ ```
270
+
271
+ ## Related Packages
272
+
273
+ - **`prime`** - Full CLI + SDK with pods, sandboxes, inference, and more (includes this package)
274
+ - **`prime-sandboxes`** - SDK for managing remote code execution environments
275
+
276
+ ## License
277
+
278
+ MIT License - see LICENSE file for details
279
+