mlops-python-sdk 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlops_python_sdk-0.0.1/PKG-INFO +416 -0
- mlops_python_sdk-0.0.1/README.md +390 -0
- mlops_python_sdk-0.0.1/mlops/__init__.py +46 -0
- mlops_python_sdk-0.0.1/mlops/api/client/__init__.py +8 -0
- mlops_python_sdk-0.0.1/mlops/api/client/api/__init__.py +1 -0
- mlops_python_sdk-0.0.1/mlops/api/client/api/tasks/__init__.py +1 -0
- mlops_python_sdk-0.0.1/mlops/api/client/api/tasks/cancel_task.py +196 -0
- mlops_python_sdk-0.0.1/mlops/api/client/api/tasks/delete_task.py +204 -0
- mlops_python_sdk-0.0.1/mlops/api/client/api/tasks/get_task.py +196 -0
- mlops_python_sdk-0.0.1/mlops/api/client/api/tasks/list_tasks.py +255 -0
- mlops_python_sdk-0.0.1/mlops/api/client/api/tasks/submit_task.py +188 -0
- mlops_python_sdk-0.0.1/mlops/api/client/client.py +268 -0
- mlops_python_sdk-0.0.1/mlops/api/client/errors.py +16 -0
- mlops_python_sdk-0.0.1/mlops/api/client/models/__init__.py +33 -0
- mlops_python_sdk-0.0.1/mlops/api/client/models/error_response.py +68 -0
- mlops_python_sdk-0.0.1/mlops/api/client/models/message_response.py +59 -0
- mlops_python_sdk-0.0.1/mlops/api/client/models/task.py +1629 -0
- mlops_python_sdk-0.0.1/mlops/api/client/models/task_alloc_tres_type_0.py +49 -0
- mlops_python_sdk-0.0.1/mlops/api/client/models/task_gres_detail_type_0_item.py +44 -0
- mlops_python_sdk-0.0.1/mlops/api/client/models/task_job_resources_type_0.py +49 -0
- mlops_python_sdk-0.0.1/mlops/api/client/models/task_list_response.py +102 -0
- mlops_python_sdk-0.0.1/mlops/api/client/models/task_resources_type_0.py +49 -0
- mlops_python_sdk-0.0.1/mlops/api/client/models/task_status.py +15 -0
- mlops_python_sdk-0.0.1/mlops/api/client/models/task_submit_request.py +640 -0
- mlops_python_sdk-0.0.1/mlops/api/client/models/task_submit_request_environment_type_0.py +49 -0
- mlops_python_sdk-0.0.1/mlops/api/client/models/task_submit_response.py +78 -0
- mlops_python_sdk-0.0.1/mlops/api/client/models/task_tres_type_0.py +49 -0
- mlops_python_sdk-0.0.1/mlops/api/client/models/task_tres_used_type_0.py +49 -0
- mlops_python_sdk-0.0.1/mlops/api/client/py.typed +1 -0
- mlops_python_sdk-0.0.1/mlops/api/client/types.py +54 -0
- mlops_python_sdk-0.0.1/mlops/connection_config.py +106 -0
- mlops_python_sdk-0.0.1/mlops/exceptions.py +82 -0
- mlops_python_sdk-0.0.1/mlops/task/__init__.py +10 -0
- mlops_python_sdk-0.0.1/mlops/task/client.py +146 -0
- mlops_python_sdk-0.0.1/mlops/task/task.py +464 -0
- mlops_python_sdk-0.0.1/pyproject.toml +45 -0
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: mlops-python-sdk
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: MLOps Python SDK for XCloud Service API
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: mlops
|
|
7
|
+
Author-email: mlops@example.com
|
|
8
|
+
Requires-Python: >=3.9,<4.0
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Dist: attrs (>=23.2.0)
|
|
17
|
+
Requires-Dist: httpx (>=0.27.0,<1.0.0)
|
|
18
|
+
Requires-Dist: packaging (>=24.1)
|
|
19
|
+
Requires-Dist: python-dateutil (>=2.8.2)
|
|
20
|
+
Requires-Dist: typing-extensions (>=4.1.0)
|
|
21
|
+
Project-URL: Bug Tracker, https://github.com/xcloud-service/xservice/issues
|
|
22
|
+
Project-URL: Homepage, https://mlops.cloud/
|
|
23
|
+
Project-URL: Repository, https://github.com/xcloud-service/xservice
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# MLOps Python SDK
|
|
27
|
+
|
|
28
|
+
[MLOps](https://mlops.cloud) Python SDK for XCloud Service API. Manage and execute tasks with confidence.
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
Install the SDK from PyPI:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install mlops-python-sdk
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Quick Start
|
|
39
|
+
|
|
40
|
+
### 1. Setup Authentication
|
|
41
|
+
|
|
42
|
+
You can authenticate using either an API Key or an Access Token.
|
|
43
|
+
|
|
44
|
+
#### Option 1: API Key (Recommended for programmatic access)
|
|
45
|
+
|
|
46
|
+
1. Sign up at [MLOps](https://xcloud-service.com)
|
|
47
|
+
2. Create an API key from [API Keys](https://xcloud-service.com/home/api-keys)
|
|
48
|
+
3. Set environment variables:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
export MLOPS_API_KEY=xck_******
|
|
52
|
+
export MLOPS_DOMAIN=localhost:8090 # optional, default is localhost:8090
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
#### Option 2: Access Token (For user authentication)
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
export MLOPS_ACCESS_TOKEN=your_access_token
|
|
59
|
+
export MLOPS_DOMAIN=localhost:8090 # optional
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### 2. Basic Usage
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from mlops import Task, ConnectionConfig
|
|
66
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
67
|
+
|
|
68
|
+
# Initialize Task client (uses environment variables by default)
|
|
69
|
+
task = Task()
|
|
70
|
+
|
|
71
|
+
# Or initialize with explicit configuration
|
|
72
|
+
config = ConnectionConfig(
|
|
73
|
+
api_key="xck_******",
|
|
74
|
+
domain="localhost:8090",
|
|
75
|
+
debug=False
|
|
76
|
+
)
|
|
77
|
+
task = Task(config=config)
|
|
78
|
+
|
|
79
|
+
# Submit a task with script
|
|
80
|
+
result = task.submit(
|
|
81
|
+
name="my-training-task",
|
|
82
|
+
cluster_id=1,
|
|
83
|
+
script="#!/bin/bash\necho 'Hello World'",
|
|
84
|
+
resources={"cpu": 4, "memory": "8GB", "gpu": 1}
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
# Or submit with command
|
|
88
|
+
result = task.submit(
|
|
89
|
+
name="my-task",
|
|
90
|
+
cluster_id=1,
|
|
91
|
+
command="python train.py",
|
|
92
|
+
resources={"cpu": 4, "memory": "8GB"}
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
# Get task details
|
|
96
|
+
task_info = task.get(task_id=result.job_id, cluster_id=1)
|
|
97
|
+
|
|
98
|
+
# List tasks with filters
|
|
99
|
+
running_tasks = task.list(
|
|
100
|
+
status=TaskStatus.RUNNING,
|
|
101
|
+
cluster_id=1,
|
|
102
|
+
page=1,
|
|
103
|
+
page_size=20
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
# Cancel a task
|
|
107
|
+
task.cancel(task_id=result.job_id, cluster_id=1)
|
|
108
|
+
|
|
109
|
+
# Delete a task
|
|
110
|
+
task.delete(task_id=result.job_id, cluster_id=1)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## API Reference
|
|
114
|
+
|
|
115
|
+
### Task Class
|
|
116
|
+
|
|
117
|
+
The `Task` class provides a high-level interface for managing tasks.
|
|
118
|
+
|
|
119
|
+
#### Initialization
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
from mlops import Task, ConnectionConfig
|
|
123
|
+
|
|
124
|
+
# Using environment variables
|
|
125
|
+
task = Task()
|
|
126
|
+
|
|
127
|
+
# With explicit configuration
|
|
128
|
+
config = ConnectionConfig(
|
|
129
|
+
api_key="xck_******", # API key for authentication
|
|
130
|
+
access_token="token_******", # Access token (alternative to API key)
|
|
131
|
+
domain="localhost:8090", # API domain
|
|
132
|
+
debug=False, # Enable debug mode
|
|
133
|
+
request_timeout=30.0 # Request timeout in seconds
|
|
134
|
+
)
|
|
135
|
+
task = Task(config=config)
|
|
136
|
+
|
|
137
|
+
# Or pass parameters directly
|
|
138
|
+
task = Task(
|
|
139
|
+
api_key="xck_******",
|
|
140
|
+
domain="localhost:8090"
|
|
141
|
+
)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
#### Methods
|
|
145
|
+
|
|
146
|
+
##### `submit()`
|
|
147
|
+
|
|
148
|
+
Submit a new task to the cluster.
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
result = task.submit(
|
|
152
|
+
name: str, # Task name (required)
|
|
153
|
+
cluster_id: int, # Cluster ID (required)
|
|
154
|
+
script: Optional[str] = None, # Script content (script or command required)
|
|
155
|
+
command: Optional[str] = None,# Command to execute (script or command required)
|
|
156
|
+
resources: Optional[dict] = None, # Resource requirements
|
|
157
|
+
team_id: Optional[int] = None # Team ID (optional)
|
|
158
|
+
) -> TaskSubmitResponse
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
**Resources dictionary** can contain:
|
|
162
|
+
- `cpu` or `cpus_per_task`: Number of CPUs
|
|
163
|
+
- `memory`: Memory requirement (e.g., "8GB", "4096M")
|
|
164
|
+
- `nodes`: Number of nodes
|
|
165
|
+
- `gres`: GPU resources (e.g., "gpu:1")
|
|
166
|
+
- `time`: Time limit (e.g., "1-00:00:00" for 1 day)
|
|
167
|
+
- `partition`: Partition name
|
|
168
|
+
- `tres`: TRES specification
|
|
169
|
+
|
|
170
|
+
**Example:**
|
|
171
|
+
|
|
172
|
+
```python
|
|
173
|
+
result = task.submit(
|
|
174
|
+
name="ml-training",
|
|
175
|
+
cluster_id=1,
|
|
176
|
+
script="#!/bin/bash\npython train.py --epochs 100",
|
|
177
|
+
resources={
|
|
178
|
+
"cpu": 8,
|
|
179
|
+
"memory": "16GB",
|
|
180
|
+
"gpu": 1,
|
|
181
|
+
"time": "2-00:00:00", # 2 days
|
|
182
|
+
"partition": "gpu"
|
|
183
|
+
}
|
|
184
|
+
)
|
|
185
|
+
print(f"Task submitted: Job ID = {result.job_id}")
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
##### `get()`
|
|
189
|
+
|
|
190
|
+
Get task details by task ID.
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
task_info = task.get(
|
|
194
|
+
task_id: int, # Task ID (Slurm job ID)
|
|
195
|
+
cluster_id: int # Cluster ID (required)
|
|
196
|
+
) -> Task
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
**Example:**
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
task_info = task.get(task_id=12345, cluster_id=1)
|
|
203
|
+
print(f"Task status: {task_info.status}")
|
|
204
|
+
print(f"Task name: {task_info.name}")
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
##### `list()`
|
|
208
|
+
|
|
209
|
+
List tasks with optional filters and pagination.
|
|
210
|
+
|
|
211
|
+
```python
|
|
212
|
+
tasks = task.list(
|
|
213
|
+
page: int = 1, # Page number
|
|
214
|
+
page_size: int = 20, # Items per page
|
|
215
|
+
status: Optional[TaskStatus] = None, # Filter by status
|
|
216
|
+
cluster_id: Optional[int] = None, # Filter by cluster ID
|
|
217
|
+
team_id: Optional[int] = None, # Filter by team ID
|
|
218
|
+
user_id: Optional[int] = None # Filter by user ID
|
|
219
|
+
) -> TaskListResponse
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
**Example:**
|
|
223
|
+
|
|
224
|
+
```python
|
|
225
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
226
|
+
|
|
227
|
+
# List all running tasks
|
|
228
|
+
running_tasks = task.list(status=TaskStatus.RUNNING)
|
|
229
|
+
|
|
230
|
+
# List tasks in a specific cluster
|
|
231
|
+
cluster_tasks = task.list(cluster_id=1, page=1, page_size=10)
|
|
232
|
+
|
|
233
|
+
# List completed tasks with pagination
|
|
234
|
+
completed = task.list(
|
|
235
|
+
status=TaskStatus.COMPLETED,
|
|
236
|
+
cluster_id=1,
|
|
237
|
+
page=1,
|
|
238
|
+
page_size=50
|
|
239
|
+
)
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
##### `cancel()`
|
|
243
|
+
|
|
244
|
+
Cancel a running task.
|
|
245
|
+
|
|
246
|
+
```python
|
|
247
|
+
task.cancel(
|
|
248
|
+
task_id: int, # Task ID (Slurm job ID)
|
|
249
|
+
cluster_id: int # Cluster ID (required)
|
|
250
|
+
)
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
**Example:**
|
|
254
|
+
|
|
255
|
+
```python
|
|
256
|
+
task.cancel(task_id=12345, cluster_id=1)
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
### TaskStatus Enum
|
|
260
|
+
|
|
261
|
+
Task status values for filtering:
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
265
|
+
|
|
266
|
+
TaskStatus.PENDING # Task is pending
|
|
267
|
+
TaskStatus.QUEUED # Task is queued
|
|
268
|
+
TaskStatus.RUNNING # Task is running
|
|
269
|
+
TaskStatus.COMPLETED # Task completed successfully
|
|
270
|
+
TaskStatus.SUCCEEDED # Task succeeded
|
|
271
|
+
TaskStatus.FAILED # Task failed
|
|
272
|
+
TaskStatus.CANCELLED # Task was cancelled
|
|
273
|
+
TaskStatus.CREATED # Task was created
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
## Configuration
|
|
277
|
+
|
|
278
|
+
### Environment Variables
|
|
279
|
+
|
|
280
|
+
The SDK reads configuration from environment variables:
|
|
281
|
+
|
|
282
|
+
- `MLOPS_API_KEY`: API key for authentication
|
|
283
|
+
- `MLOPS_ACCESS_TOKEN`: Access token for authentication (alternative to API key)
|
|
284
|
+
- `MLOPS_DOMAIN`: API domain (default: `localhost:8090`)
|
|
285
|
+
- `MLOPS_DEBUG`: Enable debug mode (`true`/`false`, default: `false`)
|
|
286
|
+
- `MLOPS_API_PATH`: API path prefix (default: `/api/v1`)
|
|
287
|
+
|
|
288
|
+
### ConnectionConfig
|
|
289
|
+
|
|
290
|
+
You can also configure the connection programmatically:
|
|
291
|
+
|
|
292
|
+
```python
|
|
293
|
+
from mlops import ConnectionConfig
|
|
294
|
+
|
|
295
|
+
config = ConnectionConfig(
|
|
296
|
+
domain="api.example.com",
|
|
297
|
+
api_key="xck_******",
|
|
298
|
+
debug=True,
|
|
299
|
+
request_timeout=60.0,
|
|
300
|
+
api_path="/api/v1"
|
|
301
|
+
)
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
## Error Handling
|
|
305
|
+
|
|
306
|
+
The SDK provides specific exception types:
|
|
307
|
+
|
|
308
|
+
```python
|
|
309
|
+
from mlops.exceptions import (
|
|
310
|
+
APIException, # General API errors
|
|
311
|
+
AuthenticationException, # Authentication failures
|
|
312
|
+
NotFoundException, # Resource not found
|
|
313
|
+
RateLimitException, # Rate limit exceeded
|
|
314
|
+
TimeoutException, # Request timeout
|
|
315
|
+
InvalidArgumentException # Invalid arguments
|
|
316
|
+
)
|
|
317
|
+
|
|
318
|
+
try:
|
|
319
|
+
result = task.submit(name="test", cluster_id=1, command="echo hello")
|
|
320
|
+
except AuthenticationException as e:
|
|
321
|
+
print(f"Authentication failed: {e}")
|
|
322
|
+
except NotFoundException as e:
|
|
323
|
+
print(f"Resource not found: {e}")
|
|
324
|
+
except APIException as e:
|
|
325
|
+
print(f"API error: {e}")
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
## Examples
|
|
329
|
+
|
|
330
|
+
### Submit a Machine Learning Training Job
|
|
331
|
+
|
|
332
|
+
```python
|
|
333
|
+
from mlops import Task
|
|
334
|
+
|
|
335
|
+
task = Task()
|
|
336
|
+
|
|
337
|
+
result = task.submit(
|
|
338
|
+
name="pytorch-training",
|
|
339
|
+
cluster_id=1,
|
|
340
|
+
script="""#!/bin/bash
|
|
341
|
+
#SBATCH --gres=gpu:1
|
|
342
|
+
#SBATCH --cpus-per-task=8
|
|
343
|
+
#SBATCH --mem=32GB
|
|
344
|
+
|
|
345
|
+
python train.py --config config.yaml
|
|
346
|
+
""",
|
|
347
|
+
resources={
|
|
348
|
+
"cpus_per_task": 8,
|
|
349
|
+
"memory": "32GB",
|
|
350
|
+
"gres": "gpu:1",
|
|
351
|
+
"time": "4-00:00:00", # 4 days
|
|
352
|
+
"partition": "gpu"
|
|
353
|
+
}
|
|
354
|
+
)
|
|
355
|
+
|
|
356
|
+
print(f"Training job submitted: {result.job_id}")
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
### Monitor Task Status
|
|
360
|
+
|
|
361
|
+
```python
|
|
362
|
+
from mlops import Task
|
|
363
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
364
|
+
import time
|
|
365
|
+
|
|
366
|
+
task = Task()
|
|
367
|
+
job_id = 12345
|
|
368
|
+
cluster_id = 1
|
|
369
|
+
|
|
370
|
+
while True:
|
|
371
|
+
task_info = task.get(task_id=job_id, cluster_id=cluster_id)
|
|
372
|
+
print(f"Status: {task_info.status}")
|
|
373
|
+
|
|
374
|
+
if task_info.status in [TaskStatus.COMPLETED, TaskStatus.SUCCEEDED, TaskStatus.FAILED, TaskStatus.CANCELLED]:
|
|
375
|
+
break
|
|
376
|
+
|
|
377
|
+
time.sleep(10) # Check every 10 seconds
|
|
378
|
+
```
|
|
379
|
+
|
|
380
|
+
### List and Filter Tasks
|
|
381
|
+
|
|
382
|
+
```python
|
|
383
|
+
from mlops import Task
|
|
384
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
385
|
+
|
|
386
|
+
task = Task()
|
|
387
|
+
|
|
388
|
+
# Get all running tasks in cluster 1
|
|
389
|
+
running = task.list(
|
|
390
|
+
status=TaskStatus.RUNNING,
|
|
391
|
+
cluster_id=1
|
|
392
|
+
)
|
|
393
|
+
|
|
394
|
+
for t in running.tasks:
|
|
395
|
+
print(f"{t.name}: {t.status} (Job ID: {t.job_id})")
|
|
396
|
+
|
|
397
|
+
# Get failed tasks
|
|
398
|
+
failed = task.list(status=TaskStatus.FAILED)
|
|
399
|
+
|
|
400
|
+
print(f"Total failed tasks: {failed.total}")
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
## Documentation
|
|
404
|
+
|
|
405
|
+
- [MLOps Documentation](https://xcloud-service.com/docs)
|
|
406
|
+
- [API Reference](https://xcloud-service.com/docs/api)
|
|
407
|
+
|
|
408
|
+
## License
|
|
409
|
+
|
|
410
|
+
MIT
|
|
411
|
+
|
|
412
|
+
## Support
|
|
413
|
+
|
|
414
|
+
- [GitHub Issues](https://github.com/xcloud-service/xservice/issues)
|
|
415
|
+
- [Documentation](https://xcloud-service.com/docs)
|
|
416
|
+
|