mlops-python-sdk 1.0.2__tar.gz → 1.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlops_python_sdk-1.0.3/PKG-INFO +235 -0
- mlops_python_sdk-1.0.3/README.md +209 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/task/task.py +100 -1
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/pyproject.toml +1 -1
- mlops_python_sdk-1.0.2/PKG-INFO +0 -254
- mlops_python_sdk-1.0.2/README.md +0 -228
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/__init__.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/__init__.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/api/__init__.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/api/storage/__init__.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/api/storage/get_storage_presign_download.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/api/storage/get_storage_presign_upload.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/api/tasks/__init__.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/api/tasks/cancel_task.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/api/tasks/delete_task.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/api/tasks/get_task.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/api/tasks/get_task_by_task_id.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/api/tasks/get_task_logs.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/api/tasks/list_tasks.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/api/tasks/submit_task.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/client.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/errors.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/__init__.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/error_response.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/get_storage_presign_download_response_200.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/get_storage_presign_upload_response_200.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/get_task_logs_direction.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/get_task_logs_log_type.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/job_spec.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/job_spec_env.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/job_spec_master_strategy.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/log_pagination.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/message_response.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_alloc_tres_type_0.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_gres_detail_type_0_item.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_job_resources_type_0.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_list_response.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_log_entry.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_log_entry_log_type.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_logs_response.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_resources_type_0.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_status.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_submit_request.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_submit_request_environment_type_0.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_submit_response.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_tres_type_0.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_tres_used_type_0.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/py.typed +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/types.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/connection_config.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/exceptions.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/task/__init__.py +0 -0
- {mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/task/client.py +0 -0
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: mlops-python-sdk
|
|
3
|
+
Version: 1.0.3
|
|
4
|
+
Summary: MLOps Python SDK for XCloud Service API
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: mlops
|
|
7
|
+
Author-email: mlops@example.com
|
|
8
|
+
Requires-Python: >=3.9,<4.0
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Dist: attrs (>=23.2.0)
|
|
17
|
+
Requires-Dist: httpx (>=0.27.0,<1.0.0)
|
|
18
|
+
Requires-Dist: packaging (>=24.1)
|
|
19
|
+
Requires-Dist: python-dateutil (>=2.8.2)
|
|
20
|
+
Requires-Dist: typing-extensions (>=4.1.0)
|
|
21
|
+
Project-URL: Bug Tracker, https://github.com/xcloud-service/xservice/issues
|
|
22
|
+
Project-URL: Homepage, https://mlops.cloud/
|
|
23
|
+
Project-URL: Repository, https://github.com/xcloud-service/xservice
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# SDK
|
|
27
|
+
|
|
28
|
+
Software Development Kits for integrating with the XCloud Service API.
|
|
29
|
+
|
|
30
|
+
> [!NOTE] SDK Support
|
|
31
|
+
> SDKs provide type-safe, high-level interfaces for interacting with the platform API. They handle authentication, typed response parsing, and structured error handling.
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
Install the Python SDK with pip:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install mlops-python-sdk
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### Configuration
|
|
43
|
+
|
|
44
|
+
The SDK reads configuration from environment variables by default:
|
|
45
|
+
|
|
46
|
+
- `MLOPS_API_KEY`: API key (required)
|
|
47
|
+
- `MLOPS_DOMAIN`: API domain, e.g. `localhost:8090` or `https://example.com`
|
|
48
|
+
- `MLOPS_API_PATH`: API path prefix (default: `/api/v1`)
|
|
49
|
+
- `MLOPS_DEBUG`: `true|false` (default: `false`)
|
|
50
|
+
|
|
51
|
+
Or configure in code:
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from mlops import ConnectionConfig, Task
|
|
55
|
+
|
|
56
|
+
config = ConnectionConfig(
|
|
57
|
+
api_key="xck_...",
|
|
58
|
+
domain="https://example.com",
|
|
59
|
+
api_path="/api/v1",
|
|
60
|
+
debug=False,
|
|
61
|
+
)
|
|
62
|
+
task = Task(config=config)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## SDK Usage
|
|
66
|
+
|
|
67
|
+
### Initialize client
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from mlops import Task
|
|
71
|
+
|
|
72
|
+
task = Task() # uses environment variables by default
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Submit a GPU task
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from mlops import Task
|
|
79
|
+
|
|
80
|
+
task = Task()
|
|
81
|
+
resp = task.submit(
|
|
82
|
+
name="gpu-task-from-sdk",
|
|
83
|
+
cluster_name="slurm-cn",
|
|
84
|
+
team_id=1,
|
|
85
|
+
image="/mnt/minio/images/01ai-registry.cn-shanghai.cr.aliyuncs.com+public+llamafactory+0.9.3.sqsh",
|
|
86
|
+
entry_command="llamafactory-cli train /workspace/config/test_lora.yaml",
|
|
87
|
+
resources={
|
|
88
|
+
"partition": "gpu",
|
|
89
|
+
"nodes": 2,
|
|
90
|
+
"ntasks": 2,
|
|
91
|
+
"cpus_per_task": 2,
|
|
92
|
+
"memory": "4G",
|
|
93
|
+
"time": "01:00:00",
|
|
94
|
+
"gres": "gpu:nvidia_a10:1",
|
|
95
|
+
"qos": "qos_xcloud",
|
|
96
|
+
},
|
|
97
|
+
file_path="/path/to/xservice.zip", # optional: .zip/.tar.gz/.tgz
|
|
98
|
+
)
|
|
99
|
+
print(resp.job_id)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Submit a CPU task
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from mlops import Task
|
|
106
|
+
|
|
107
|
+
task = Task()
|
|
108
|
+
resp = task.submit(
|
|
109
|
+
name="cpu-task-from-sdk",
|
|
110
|
+
cluster_name="slurm-cn",
|
|
111
|
+
team_id=1,
|
|
112
|
+
image="docker://01ai-registry.cn-shanghai.cr.aliyuncs.com/01-ai/xcs/v2/alpine:3.23.0",
|
|
113
|
+
entry_command="echo hello",
|
|
114
|
+
resources={
|
|
115
|
+
"partition": "cpu",
|
|
116
|
+
"nodes": 1,
|
|
117
|
+
"ntasks": 1,
|
|
118
|
+
"cpus_per_task": 1,
|
|
119
|
+
"memory": "1G",
|
|
120
|
+
"time": "01:00:00",
|
|
121
|
+
"qos": "qos_xcloud",
|
|
122
|
+
},
|
|
123
|
+
)
|
|
124
|
+
print(resp.job_id)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### List tasks
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from mlops import Task
|
|
131
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
132
|
+
|
|
133
|
+
task = Task()
|
|
134
|
+
resp = task.list(status=TaskStatus.COMPLETED, cluster_name="slurm-cn", page=1, page_size=20)
|
|
135
|
+
print(len(resp.tasks or []))
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Get task details
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from mlops import Task
|
|
142
|
+
|
|
143
|
+
task = Task()
|
|
144
|
+
task_info = task.get(task_id=12345, cluster_name="slurm-cn")
|
|
145
|
+
print(task_info)
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Cancel a task
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
from mlops import Task
|
|
152
|
+
|
|
153
|
+
task = Task()
|
|
154
|
+
task.cancel(task_id=12345, cluster_name="slurm-cn")
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Delete a task
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from mlops import Task
|
|
161
|
+
|
|
162
|
+
task = Task()
|
|
163
|
+
task.delete(task_id=12345, cluster_name="slurm-cn")
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
**Task Management Methods:**
|
|
167
|
+
|
|
168
|
+
- `submit()` - Submit a new task with container image and entry command
|
|
169
|
+
- `get()` - Get task details by task ID
|
|
170
|
+
- `list()` - List tasks with optional filters (status, cluster_name, team_id, user_id)
|
|
171
|
+
- `cancel()` - Cancel a running task
|
|
172
|
+
- `delete()` - Delete a task record
|
|
173
|
+
|
|
174
|
+
**Task Status Values:**
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
178
|
+
|
|
179
|
+
TaskStatus.PENDING # Task is pending
|
|
180
|
+
TaskStatus.QUEUED # Task is queued
|
|
181
|
+
TaskStatus.RUNNING # Task is running
|
|
182
|
+
TaskStatus.COMPLETED # Task completed successfully
|
|
183
|
+
TaskStatus.SUCCEEDED # Task succeeded
|
|
184
|
+
TaskStatus.FAILED # Task failed
|
|
185
|
+
TaskStatus.CANCELLED # Task was cancelled
|
|
186
|
+
TaskStatus.CREATED # Task was created
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
**Error Handling:**
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
from mlops.exceptions import (
|
|
193
|
+
APIException,
|
|
194
|
+
AuthenticationException,
|
|
195
|
+
NotFoundException,
|
|
196
|
+
RateLimitException,
|
|
197
|
+
TimeoutException,
|
|
198
|
+
InvalidArgumentException,
|
|
199
|
+
NotEnoughSpaceException
|
|
200
|
+
)
|
|
201
|
+
from mlops import Task
|
|
202
|
+
|
|
203
|
+
task = Task()
|
|
204
|
+
|
|
205
|
+
try:
|
|
206
|
+
result = task.submit(
|
|
207
|
+
name="test",
|
|
208
|
+
cluster_name="slurm-cn",
|
|
209
|
+
image="docker://alpine:3.23.0",
|
|
210
|
+
entry_command="echo hello",
|
|
211
|
+
)
|
|
212
|
+
except AuthenticationException as e:
|
|
213
|
+
print(f"Authentication failed: {e}")
|
|
214
|
+
except NotFoundException as e:
|
|
215
|
+
print(f"Resource not found: {e}")
|
|
216
|
+
except APIException as e:
|
|
217
|
+
print(f"API error: {e}")
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
> [!TIP] Error Handling
|
|
221
|
+
> SDKs automatically parse typed responses and raise structured exceptions.
|
|
222
|
+
|
|
223
|
+
## Features
|
|
224
|
+
|
|
225
|
+
- Type-safe API clients
|
|
226
|
+
- Automatic authentication
|
|
227
|
+
- Error handling
|
|
228
|
+
- Typed response parsing (generated models)
|
|
229
|
+
- Unexpected-status guard (optional)
|
|
230
|
+
|
|
231
|
+
## Resources
|
|
232
|
+
|
|
233
|
+
- [Python SDK Documentation](https://github.com/xcloud-service/xservice/tree/main/client/python-sdk)
|
|
234
|
+
- [API Reference](https://xcloud-service.com/docs/api)
|
|
235
|
+
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
# SDK
|
|
2
|
+
|
|
3
|
+
Software Development Kits for integrating with the XCloud Service API.
|
|
4
|
+
|
|
5
|
+
> [!NOTE] SDK Support
|
|
6
|
+
> SDKs provide type-safe, high-level interfaces for interacting with the platform API. They handle authentication, typed response parsing, and structured error handling.
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
Install the Python SDK with pip:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install mlops-python-sdk
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
### Configuration
|
|
18
|
+
|
|
19
|
+
The SDK reads configuration from environment variables by default:
|
|
20
|
+
|
|
21
|
+
- `MLOPS_API_KEY`: API key (required)
|
|
22
|
+
- `MLOPS_DOMAIN`: API domain, e.g. `localhost:8090` or `https://example.com`
|
|
23
|
+
- `MLOPS_API_PATH`: API path prefix (default: `/api/v1`)
|
|
24
|
+
- `MLOPS_DEBUG`: `true|false` (default: `false`)
|
|
25
|
+
|
|
26
|
+
Or configure in code:
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
from mlops import ConnectionConfig, Task
|
|
30
|
+
|
|
31
|
+
config = ConnectionConfig(
|
|
32
|
+
api_key="xck_...",
|
|
33
|
+
domain="https://example.com",
|
|
34
|
+
api_path="/api/v1",
|
|
35
|
+
debug=False,
|
|
36
|
+
)
|
|
37
|
+
task = Task(config=config)
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## SDK Usage
|
|
41
|
+
|
|
42
|
+
### Initialize client
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from mlops import Task
|
|
46
|
+
|
|
47
|
+
task = Task() # uses environment variables by default
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Submit a GPU task
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
from mlops import Task
|
|
54
|
+
|
|
55
|
+
task = Task()
|
|
56
|
+
resp = task.submit(
|
|
57
|
+
name="gpu-task-from-sdk",
|
|
58
|
+
cluster_name="slurm-cn",
|
|
59
|
+
team_id=1,
|
|
60
|
+
image="/mnt/minio/images/01ai-registry.cn-shanghai.cr.aliyuncs.com+public+llamafactory+0.9.3.sqsh",
|
|
61
|
+
entry_command="llamafactory-cli train /workspace/config/test_lora.yaml",
|
|
62
|
+
resources={
|
|
63
|
+
"partition": "gpu",
|
|
64
|
+
"nodes": 2,
|
|
65
|
+
"ntasks": 2,
|
|
66
|
+
"cpus_per_task": 2,
|
|
67
|
+
"memory": "4G",
|
|
68
|
+
"time": "01:00:00",
|
|
69
|
+
"gres": "gpu:nvidia_a10:1",
|
|
70
|
+
"qos": "qos_xcloud",
|
|
71
|
+
},
|
|
72
|
+
file_path="/path/to/xservice.zip", # optional: .zip/.tar.gz/.tgz
|
|
73
|
+
)
|
|
74
|
+
print(resp.job_id)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Submit a CPU task
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from mlops import Task
|
|
81
|
+
|
|
82
|
+
task = Task()
|
|
83
|
+
resp = task.submit(
|
|
84
|
+
name="cpu-task-from-sdk",
|
|
85
|
+
cluster_name="slurm-cn",
|
|
86
|
+
team_id=1,
|
|
87
|
+
image="docker://01ai-registry.cn-shanghai.cr.aliyuncs.com/01-ai/xcs/v2/alpine:3.23.0",
|
|
88
|
+
entry_command="echo hello",
|
|
89
|
+
resources={
|
|
90
|
+
"partition": "cpu",
|
|
91
|
+
"nodes": 1,
|
|
92
|
+
"ntasks": 1,
|
|
93
|
+
"cpus_per_task": 1,
|
|
94
|
+
"memory": "1G",
|
|
95
|
+
"time": "01:00:00",
|
|
96
|
+
"qos": "qos_xcloud",
|
|
97
|
+
},
|
|
98
|
+
)
|
|
99
|
+
print(resp.job_id)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### List tasks
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from mlops import Task
|
|
106
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
107
|
+
|
|
108
|
+
task = Task()
|
|
109
|
+
resp = task.list(status=TaskStatus.COMPLETED, cluster_name="slurm-cn", page=1, page_size=20)
|
|
110
|
+
print(len(resp.tasks or []))
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Get task details
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from mlops import Task
|
|
117
|
+
|
|
118
|
+
task = Task()
|
|
119
|
+
task_info = task.get(task_id=12345, cluster_name="slurm-cn")
|
|
120
|
+
print(task_info)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Cancel a task
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
from mlops import Task
|
|
127
|
+
|
|
128
|
+
task = Task()
|
|
129
|
+
task.cancel(task_id=12345, cluster_name="slurm-cn")
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Delete a task
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
from mlops import Task
|
|
136
|
+
|
|
137
|
+
task = Task()
|
|
138
|
+
task.delete(task_id=12345, cluster_name="slurm-cn")
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
**Task Management Methods:**
|
|
142
|
+
|
|
143
|
+
- `submit()` - Submit a new task with container image and entry command
|
|
144
|
+
- `get()` - Get task details by task ID
|
|
145
|
+
- `list()` - List tasks with optional filters (status, cluster_name, team_id, user_id)
|
|
146
|
+
- `cancel()` - Cancel a running task
|
|
147
|
+
- `delete()` - Delete a task record
|
|
148
|
+
|
|
149
|
+
**Task Status Values:**
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
153
|
+
|
|
154
|
+
TaskStatus.PENDING # Task is pending
|
|
155
|
+
TaskStatus.QUEUED # Task is queued
|
|
156
|
+
TaskStatus.RUNNING # Task is running
|
|
157
|
+
TaskStatus.COMPLETED # Task completed successfully
|
|
158
|
+
TaskStatus.SUCCEEDED # Task succeeded
|
|
159
|
+
TaskStatus.FAILED # Task failed
|
|
160
|
+
TaskStatus.CANCELLED # Task was cancelled
|
|
161
|
+
TaskStatus.CREATED # Task was created
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
**Error Handling:**
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
from mlops.exceptions import (
|
|
168
|
+
APIException,
|
|
169
|
+
AuthenticationException,
|
|
170
|
+
NotFoundException,
|
|
171
|
+
RateLimitException,
|
|
172
|
+
TimeoutException,
|
|
173
|
+
InvalidArgumentException,
|
|
174
|
+
NotEnoughSpaceException
|
|
175
|
+
)
|
|
176
|
+
from mlops import Task
|
|
177
|
+
|
|
178
|
+
task = Task()
|
|
179
|
+
|
|
180
|
+
try:
|
|
181
|
+
result = task.submit(
|
|
182
|
+
name="test",
|
|
183
|
+
cluster_name="slurm-cn",
|
|
184
|
+
image="docker://alpine:3.23.0",
|
|
185
|
+
entry_command="echo hello",
|
|
186
|
+
)
|
|
187
|
+
except AuthenticationException as e:
|
|
188
|
+
print(f"Authentication failed: {e}")
|
|
189
|
+
except NotFoundException as e:
|
|
190
|
+
print(f"Resource not found: {e}")
|
|
191
|
+
except APIException as e:
|
|
192
|
+
print(f"API error: {e}")
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
> [!TIP] Error Handling
|
|
196
|
+
> SDKs automatically parse typed responses and raise structured exceptions.
|
|
197
|
+
|
|
198
|
+
## Features
|
|
199
|
+
|
|
200
|
+
- Type-safe API clients
|
|
201
|
+
- Automatic authentication
|
|
202
|
+
- Error handling
|
|
203
|
+
- Typed response parsing (generated models)
|
|
204
|
+
- Unexpected-status guard (optional)
|
|
205
|
+
|
|
206
|
+
## Resources
|
|
207
|
+
|
|
208
|
+
- [Python SDK Documentation](https://github.com/xcloud-service/xservice/tree/main/client/python-sdk)
|
|
209
|
+
- [API Reference](https://xcloud-service.com/docs/api)
|
|
@@ -6,6 +6,9 @@ This module provides a convenient interface for managing tasks through the MLOps
|
|
|
6
6
|
|
|
7
7
|
import json
|
|
8
8
|
import os
|
|
9
|
+
import sys
|
|
10
|
+
import threading
|
|
11
|
+
import time
|
|
9
12
|
from http import HTTPStatus
|
|
10
13
|
from pathlib import Path
|
|
11
14
|
from typing import Optional
|
|
@@ -55,13 +58,109 @@ def _validate_archive_file_path(file_path: str) -> Path:
|
|
|
55
58
|
|
|
56
59
|
|
|
57
60
|
def _upload_file_to_presigned_url(url: str, file_path: Path, timeout: Optional[float]) -> None:
|
|
61
|
+
def _format_bytes_iec(n: int) -> str:
    """Format a byte count using IEC binary units.

    Counts below 1024 are rendered as a plain integer with a ``B`` suffix.
    Larger counts are scaled by powers of 1024 and rendered with one decimal
    place and a ``KiB``/``MiB``/``GiB``/``TiB``/``PiB`` suffix. Values beyond
    the PiB range are still expressed in PiB.

    Args:
        n: Number of bytes.

    Returns:
        A compact string such as ``"512B"``, ``"1.5KiB"`` or ``"2.0GiB"``.
    """
    if n < 1024:
        return f"{n}B"
    unit = 1024.0
    suffixes = ("KiB", "MiB", "GiB", "TiB", "PiB")
    last = len(suffixes) - 1
    value = float(n)
    index = -1
    while value >= unit and index < last:
        value /= unit
        index += 1
    # Rounding to one decimal can turn e.g. 1023.99 into "1024.0"; promote the
    # value to the next unit so the output reads "1.0MiB", not "1024.0KiB".
    if index < last and round(value, 1) >= unit:
        value /= unit
        index += 1
    return f"{value:.1f}{suffixes[index]}"
|
|
72
|
+
|
|
73
|
+
def _render_bar(done: int, total: int, width: int = 28) -> str:
    """Render a fixed-width ASCII progress bar.

    The bar consists of ``=`` for the completed part, a single ``>`` head and
    spaces for the remainder. A fully completed bar is ``width`` ``=``
    characters with no head.

    Args:
        done: Units completed so far; clamped into ``[0, total]``.
        total: Total number of units.
        width: Number of characters in the bar.

    Returns:
        The bar string, ``width`` characters long, or just ``">"`` when
        ``total <= 0`` or ``width <= 1``.
    """
    if total <= 0 or width <= 1:
        return ">"
    done = max(0, min(done, total))
    # Integer floor division instead of int(width * (done / total)): the float
    # form can land just below a whole number (e.g. 100 * 0.29) and drop a cell.
    filled = int((width * done) // total)
    if filled >= width:
        return "=" * width
    if filled <= 0:
        return ">" + " " * (width - 1)
    return "=" * filled + ">" + " " * (width - filled - 1)
|
|
83
|
+
|
|
84
|
+
def _format_elapsed_seconds(start: float) -> str:
    """Return the whole seconds elapsed since ``start`` as e.g. ``"12s"``.

    Args:
        start: A reference timestamp taken from ``time.monotonic()``.

    Returns:
        The elapsed time truncated to whole seconds with an ``s`` suffix;
        a ``start`` in the future is reported as ``"0s"``.
    """
    delta = max(0.0, time.monotonic() - start)
    whole_seconds = int(delta)
    return f"{whole_seconds}s"
|
|
87
|
+
|
|
88
|
+
class _ProgressIterable:
    """Chunked reader over a file-like object that prints upload progress.

    Iterating yields successive ``chunk_size`` reads from ``f``. While the
    iteration is running, a daemon thread prints a progress line to
    ``sys.stdout``: once immediately, then each time the whole-second elapsed
    time changes. On a TTY the line is redrawn in place using a carriage
    return; otherwise every update is printed as a separate line.

    Args:
        f: File-like object exposing ``read(size)``.
        total: Expected total number of bytes; negative values become 0.
        name: Label shown in the progress line.
        chunk_size: Bytes requested per ``read`` call; values below 1 become 1.
    """

    def __init__(self, f, total: int, name: str, chunk_size: int = 64 * 1024):
        self._f = f  # file-like object
        self._total = max(0, int(total))
        self._name = name
        self._chunk_size = max(1, int(chunk_size))
        self._read = 0  # bytes handed out so far
        self._start = time.monotonic()
        self._completed = False  # True only once EOF has been reached
        self._out = sys.stdout
        try:
            self._is_tty = bool(self._out.isatty())
        except Exception:
            # Best effort: any stream that cannot answer isatty() is treated
            # as a non-TTY so progress is still printed line by line.
            self._is_tty = False

    def _render_line(self, display_read: int) -> str:
        """Build the progress text for ``display_read`` bytes (clamped to total)."""
        display_read = max(0, min(int(display_read), self._total))
        pct = (display_read / self._total) * 100.0 if self._total > 0 else 0.0
        bar = _render_bar(display_read, self._total, width=28)
        elapsed = _format_elapsed_seconds(self._start)
        return (
            f"uploading {self._name} [{bar}] {pct:6.2f}% "
            f"({_format_bytes_iec(display_read)}/{_format_bytes_iec(self._total)}) "
            f"elapsed {elapsed}"
        )

    def _print_line(self, line: str, final: bool = False) -> None:
        """Write one progress line; ``final`` terminates the in-place TTY line."""
        if self._is_tty:
            # Refresh same line in terminal.
            print("\r" + line, end="" if not final else "\n", file=self._out, flush=True)
        else:
            # Always visible in non-TTY environments.
            print(line, file=self._out, flush=True)

    def __iter__(self):
        """Yield file chunks until EOF while a ticker thread reports progress."""
        stop_event = threading.Event()
        self._completed = False

        def ticker() -> None:
            last_sec = -1
            # Print immediately so users see something right away.
            self._print_line(self._render_line(self._read))
            while not stop_event.is_set():
                sec = int(max(0.0, time.monotonic() - self._start))
                if sec != last_sec:
                    last_sec = sec
                    self._print_line(self._render_line(self._read))
                # check frequently to avoid skipping seconds
                stop_event.wait(0.05)

        t = threading.Thread(target=ticker, name="mlops-upload-progress", daemon=True)
        t.start()
        try:
            while True:
                chunk = self._f.read(self._chunk_size)
                if not chunk:
                    break
                self._read += len(chunk)
                yield chunk
            self._completed = True
        finally:
            # Snap to 100% only when EOF was actually reached. If iteration
            # stopped early (an exception propagated or the consumer closed
            # the generator), report the bytes really read instead of
            # claiming a finished upload.
            if self._completed:
                self._read = self._total
            stop_event.set()
            t.join(timeout=0.2)
            self._print_line(self._render_line(self._read), final=True)
|
|
153
|
+
|
|
58
154
|
size = file_path.stat().st_size
|
|
59
155
|
# Use a dedicated client for S3 presigned upload (avoid leaking API auth headers).
|
|
60
156
|
with httpx.Client(timeout=timeout) as client:
|
|
61
157
|
with file_path.open("rb") as f:
|
|
158
|
+
content = f
|
|
159
|
+
if size > 0:
|
|
160
|
+
content = _ProgressIterable(f, total=size, name=file_path.name)
|
|
62
161
|
resp = client.put(
|
|
63
162
|
url,
|
|
64
|
-
content=
|
|
163
|
+
content=content,
|
|
65
164
|
headers={
|
|
66
165
|
"Content-Length": str(size),
|
|
67
166
|
"Content-Type": "application/octet-stream",
|
mlops_python_sdk-1.0.2/PKG-INFO
DELETED
|
@@ -1,254 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.3
|
|
2
|
-
Name: mlops-python-sdk
|
|
3
|
-
Version: 1.0.2
|
|
4
|
-
Summary: MLOps Python SDK for XCloud Service API
|
|
5
|
-
License: MIT
|
|
6
|
-
Author: mlops
|
|
7
|
-
Author-email: mlops@example.com
|
|
8
|
-
Requires-Python: >=3.9,<4.0
|
|
9
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
-
Classifier: Programming Language :: Python :: 3
|
|
11
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
-
Requires-Dist: attrs (>=23.2.0)
|
|
17
|
-
Requires-Dist: httpx (>=0.27.0,<1.0.0)
|
|
18
|
-
Requires-Dist: packaging (>=24.1)
|
|
19
|
-
Requires-Dist: python-dateutil (>=2.8.2)
|
|
20
|
-
Requires-Dist: typing-extensions (>=4.1.0)
|
|
21
|
-
Project-URL: Bug Tracker, https://github.com/xcloud-service/xservice/issues
|
|
22
|
-
Project-URL: Homepage, https://mlops.cloud/
|
|
23
|
-
Project-URL: Repository, https://github.com/xcloud-service/xservice
|
|
24
|
-
Description-Content-Type: text/markdown
|
|
25
|
-
|
|
26
|
-
# SDK
|
|
27
|
-
|
|
28
|
-
Software Development Kits for integrating with the XCloud Service API.
|
|
29
|
-
|
|
30
|
-
> [!NOTE] SDK Support
|
|
31
|
-
> SDKs provide type-safe, high-level interfaces for interacting with the platform API. They handle authentication, error handling, and request retries automatically.
|
|
32
|
-
|
|
33
|
-
## Available SDKs
|
|
34
|
-
|
|
35
|
-
### Python SDK
|
|
36
|
-
|
|
37
|
-
### Installation
|
|
38
|
-
|
|
39
|
-
The Python SDK installation.
|
|
40
|
-
|
|
41
|
-
```bash
|
|
42
|
-
pip install mlops-python-sdk
|
|
43
|
-
```
|
|
44
|
-
|
|
45
|
-
### Configuration
|
|
46
|
-
|
|
47
|
-
The SDK reads configuration from environment variables by default:
|
|
48
|
-
|
|
49
|
-
- `MLOPS_API_KEY`: API key (required)
|
|
50
|
-
- `MLOPS_DOMAIN`: API domain, e.g. `localhost:8090` or `https://example.com`
|
|
51
|
-
- `MLOPS_API_PATH`: API path prefix (default: `/api/v1`)
|
|
52
|
-
- `MLOPS_DEBUG`: `true|false` (default: `false`)
|
|
53
|
-
|
|
54
|
-
Or configure in code:
|
|
55
|
-
|
|
56
|
-
```python
|
|
57
|
-
from mlops import ConnectionConfig, Task
|
|
58
|
-
|
|
59
|
-
config = ConnectionConfig(
|
|
60
|
-
api_key="xck_...",
|
|
61
|
-
domain="https://example.com",
|
|
62
|
-
api_path="/api/v1",
|
|
63
|
-
debug=False,
|
|
64
|
-
)
|
|
65
|
-
task = Task(config=config)
|
|
66
|
-
```
|
|
67
|
-
|
|
68
|
-
### Usage
|
|
69
|
-
|
|
70
|
-
```python
|
|
71
|
-
from mlops import Task
|
|
72
|
-
from mlops.api.client.models.task_status import TaskStatus
|
|
73
|
-
from pathlib import Path
|
|
74
|
-
|
|
75
|
-
# Initialize Task client (uses environment variables by default)
|
|
76
|
-
task = Task()
|
|
77
|
-
|
|
78
|
-
# Submit a task with gpu type
|
|
79
|
-
try:
|
|
80
|
-
result = task.submit(
|
|
81
|
-
name="gpu-task-from-sdk",
|
|
82
|
-
image="/mnt/minio/images/01ai-registry.cn-shanghai.cr.aliyuncs.com+public+llamafactory+0.9.3.sqsh",
|
|
83
|
-
entry_command="llamafactory-cli train /workspace/config/test_lora.yaml",
|
|
84
|
-
resources={
|
|
85
|
-
"partition": "gpu",
|
|
86
|
-
"nodes": 2,
|
|
87
|
-
"ntasks": 2,
|
|
88
|
-
"cpus_per_task": 2,
|
|
89
|
-
"memory": "4G",
|
|
90
|
-
"time": "01:00:00",
|
|
91
|
-
"gres": "gpu:nvidia_a10:1",
|
|
92
|
-
"qos": "qos_xcloud",
|
|
93
|
-
},
|
|
94
|
-
cluster_name="slurm-cn",
|
|
95
|
-
team_id=1,
|
|
96
|
-
file_path="your file path", # optional, support for .zip, .tar.gz, .tgz
|
|
97
|
-
)
|
|
98
|
-
|
|
99
|
-
if result is not None:
|
|
100
|
-
print("==== gpu task submitted successfully ====")
|
|
101
|
-
job_id = result.job_id
|
|
102
|
-
else:
|
|
103
|
-
print("==== gpu task submitted failed ====")
|
|
104
|
-
except Exception as e:
|
|
105
|
-
print("==== gpu task submitted failed error ====", e)
|
|
106
|
-
|
|
107
|
-
# Submit a task with cpu type
|
|
108
|
-
try:
|
|
109
|
-
entry_content = Path("entry.sh").read_text(encoding="utf-8")
|
|
110
|
-
result = task.submit(
|
|
111
|
-
name="cpu-task-from-sdk",
|
|
112
|
-
image="docker://01ai-registry.cn-shanghai.cr.aliyuncs.com/01-ai/xcs/v2/alpine:3.23.0",
|
|
113
|
-
entry_command=entry_content,
|
|
114
|
-
resources={
|
|
115
|
-
"partition": "cpu",
|
|
116
|
-
"nodes": 1,
|
|
117
|
-
"ntasks": 1,
|
|
118
|
-
"cpus_per_task": 1,
|
|
119
|
-
"memory": "1G",
|
|
120
|
-
"time": "01:00:00",
|
|
121
|
-
"qos": "qos_xcloud",
|
|
122
|
-
},
|
|
123
|
-
cluster_name="slurm-cn",
|
|
124
|
-
team_id=1,
|
|
125
|
-
)
|
|
126
|
-
|
|
127
|
-
if result is not None:
|
|
128
|
-
print("==== cpu task submitted successfully ====")
|
|
129
|
-
job_id = result.job_id
|
|
130
|
-
else:
|
|
131
|
-
print("==== cpu task submitted failed ====")
|
|
132
|
-
except Exception as e:
|
|
133
|
-
print("==== cpu task submitted failed error ====", e)
|
|
134
|
-
|
|
135
|
-
# List tasks with filters
|
|
136
|
-
try:
|
|
137
|
-
completed_tasks = task.list(
|
|
138
|
-
status=TaskStatus.COMPLETED,
|
|
139
|
-
cluster_name="slurm-cn",
|
|
140
|
-
page=1,
|
|
141
|
-
page_size=20
|
|
142
|
-
)
|
|
143
|
-
|
|
144
|
-
# Get task details
|
|
145
|
-
if completed_tasks is not None and len(completed_tasks.tasks) > 0:
|
|
146
|
-
print("==== completed_tasks number ====", len(completed_tasks.tasks))
|
|
147
|
-
task_info = task.get(task_id=completed_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
148
|
-
print("==== task_info ====", task_info)
|
|
149
|
-
else:
|
|
150
|
-
print("==== no completed tasks to get details ====")
|
|
151
|
-
except Exception as e:
|
|
152
|
-
print("==== get task details failed error ====", e)
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
# Cancel a running task
|
|
156
|
-
try:
|
|
157
|
-
running_tasks = task.list(
|
|
158
|
-
status=TaskStatus.RUNNING,
|
|
159
|
-
cluster_name="slurm-cn",
|
|
160
|
-
page=1,
|
|
161
|
-
page_size=20
|
|
162
|
-
)
|
|
163
|
-
if running_tasks is not None and len(running_tasks.tasks) > 0:
|
|
164
|
-
print("==== running_tasks number ====", len(running_tasks.tasks))
|
|
165
|
-
# Cancel a task
|
|
166
|
-
result = task.cancel(task_id=running_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
167
|
-
print("==== task cancelled ====", running_tasks.tasks[0].job_id, result)
|
|
168
|
-
else:
|
|
169
|
-
print("==== no running tasks to cancel ====")
|
|
170
|
-
except Exception as e:
|
|
171
|
-
print("==== cancel running task failed error ====", e)
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
# Delete a task
|
|
175
|
-
try:
|
|
176
|
-
completed_tasks = task.list(
|
|
177
|
-
status=TaskStatus.COMPLETED,
|
|
178
|
-
cluster_name="slurm-cn",
|
|
179
|
-
page=1,
|
|
180
|
-
page_size=20
|
|
181
|
-
)
|
|
182
|
-
if completed_tasks is not None and len(completed_tasks.tasks) > 0:
|
|
183
|
-
print("==== completed_tasks number ====", len(completed_tasks.tasks))
|
|
184
|
-
# Delete a task
|
|
185
|
-
result = task.delete(task_id=completed_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
186
|
-
print("==== task deleted ====", completed_tasks.tasks[0].job_id, result)
|
|
187
|
-
else:
|
|
188
|
-
print("==== no completed tasks to delete ====")
|
|
189
|
-
except Exception as e:
|
|
190
|
-
print("==== delete completed task failed error ====", e)
|
|
191
|
-
```
|
|
192
|
-
|
|
193
|
-
**Task Management Methods:**
|
|
194
|
-
|
|
195
|
-
- `submit()` - Submit a new task with container image and entry command
|
|
196
|
-
- `get()` - Get task details by task ID
|
|
197
|
-
- `list()` - List tasks with optional filters (status, cluster_name, team_id, user_id)
|
|
198
|
-
- `cancel()` - Cancel a running task
|
|
199
|
-
- `delete()` - Delete a task record
|
|
200
|
-
|
|
201
|
-
**Task Status Values:**
|
|
202
|
-
|
|
203
|
-
```python
|
|
204
|
-
from mlops.api.client.models.task_status import TaskStatus
|
|
205
|
-
|
|
206
|
-
TaskStatus.PENDING # Task is pending
|
|
207
|
-
TaskStatus.QUEUED # Task is queued
|
|
208
|
-
TaskStatus.RUNNING # Task is running
|
|
209
|
-
TaskStatus.COMPLETED # Task completed successfully
|
|
210
|
-
TaskStatus.SUCCEEDED # Task succeeded
|
|
211
|
-
TaskStatus.FAILED # Task failed
|
|
212
|
-
TaskStatus.CANCELLED # Task was cancelled
|
|
213
|
-
TaskStatus.CREATED # Task was created
|
|
214
|
-
```
|
|
215
|
-
|
|
216
|
-
**Error Handling:**
|
|
217
|
-
|
|
218
|
-
```python
|
|
219
|
-
from mlops.exceptions import (
|
|
220
|
-
APIException,
|
|
221
|
-
AuthenticationException,
|
|
222
|
-
NotFoundException,
|
|
223
|
-
RateLimitException,
|
|
224
|
-
TimeoutException,
|
|
225
|
-
InvalidArgumentException,
|
|
226
|
-
NotEnoughSpaceException
|
|
227
|
-
)
|
|
228
|
-
|
|
229
|
-
try:
|
|
230
|
-
result = task.submit(name="test", cluster_name="slurm-cn", entry_command="echo hello")
|
|
231
|
-
except AuthenticationException as e:
|
|
232
|
-
print(f"Authentication failed: {e}")
|
|
233
|
-
except NotFoundException as e:
|
|
234
|
-
print(f"Resource not found: {e}")
|
|
235
|
-
except APIException as e:
|
|
236
|
-
print(f"API error: {e}")
|
|
237
|
-
```
|
|
238
|
-
|
|
239
|
-
> [!TIP] Error Handling
|
|
240
|
-
> SDKs automatically handle common errors and retry failed requests. Check the SDK documentation for error-handling best practices.
|
|
241
|
-
|
|
242
|
-
## Features
|
|
243
|
-
|
|
244
|
-
- Type-safe API clients
|
|
245
|
-
- Automatic authentication
|
|
246
|
-
- Error handling
|
|
247
|
-
- Request retry logic
|
|
248
|
-
- Response validation
|
|
249
|
-
|
|
250
|
-
## Resources
|
|
251
|
-
|
|
252
|
-
- [Python SDK Documentation](https://github.com/xcloud-service/xservice/tree/main/client/python-sdk)
|
|
253
|
-
- [API Reference](https://xcloud-service.com/docs/api)
|
|
254
|
-
|
mlops_python_sdk-1.0.2/README.md
DELETED
|
@@ -1,228 +0,0 @@
|
|
|
1
|
-
# SDK
|
|
2
|
-
|
|
3
|
-
Software Development Kits for integrating with the XCloud Service API.
|
|
4
|
-
|
|
5
|
-
> [!NOTE] SDK Support
|
|
6
|
-
> SDKs provide type-safe, high-level interfaces for interacting with the platform API. They handle authentication, error handling, and request retries automatically.
|
|
7
|
-
|
|
8
|
-
## Available SDKs
|
|
9
|
-
|
|
10
|
-
### Python SDK
|
|
11
|
-
|
|
12
|
-
### Installation
|
|
13
|
-
|
|
14
|
-
Install the Python SDK with pip:
|
|
15
|
-
|
|
16
|
-
```bash
|
|
17
|
-
pip install mlops-python-sdk
|
|
18
|
-
```
|
|
19
|
-
|
|
20
|
-
### Configuration
|
|
21
|
-
|
|
22
|
-
The SDK reads configuration from environment variables by default:
|
|
23
|
-
|
|
24
|
-
- `MLOPS_API_KEY`: API key (required)
|
|
25
|
-
- `MLOPS_DOMAIN`: API domain, e.g. `localhost:8090` or `https://example.com`
|
|
26
|
-
- `MLOPS_API_PATH`: API path prefix (default: `/api/v1`)
|
|
27
|
-
- `MLOPS_DEBUG`: `true|false` (default: `false`)
|
|
28
|
-
|
|
29
|
-
Or configure in code:
|
|
30
|
-
|
|
31
|
-
```python
|
|
32
|
-
from mlops import ConnectionConfig, Task
|
|
33
|
-
|
|
34
|
-
config = ConnectionConfig(
|
|
35
|
-
api_key="xck_...",
|
|
36
|
-
domain="https://example.com",
|
|
37
|
-
api_path="/api/v1",
|
|
38
|
-
debug=False,
|
|
39
|
-
)
|
|
40
|
-
task = Task(config=config)
|
|
41
|
-
```
|
|
42
|
-
|
|
43
|
-
### Usage
|
|
44
|
-
|
|
45
|
-
```python
|
|
46
|
-
from mlops import Task
|
|
47
|
-
from mlops.api.client.models.task_status import TaskStatus
|
|
48
|
-
from pathlib import Path
|
|
49
|
-
|
|
50
|
-
# Initialize Task client (uses environment variables by default)
|
|
51
|
-
task = Task()
|
|
52
|
-
|
|
53
|
-
# Submit a task with gpu type
|
|
54
|
-
try:
|
|
55
|
-
result = task.submit(
|
|
56
|
-
name="gpu-task-from-sdk",
|
|
57
|
-
image="/mnt/minio/images/01ai-registry.cn-shanghai.cr.aliyuncs.com+public+llamafactory+0.9.3.sqsh",
|
|
58
|
-
entry_command="llamafactory-cli train /workspace/config/test_lora.yaml",
|
|
59
|
-
resources={
|
|
60
|
-
"partition": "gpu",
|
|
61
|
-
"nodes": 2,
|
|
62
|
-
"ntasks": 2,
|
|
63
|
-
"cpus_per_task": 2,
|
|
64
|
-
"memory": "4G",
|
|
65
|
-
"time": "01:00:00",
|
|
66
|
-
"gres": "gpu:nvidia_a10:1",
|
|
67
|
-
"qos": "qos_xcloud",
|
|
68
|
-
},
|
|
69
|
-
cluster_name="slurm-cn",
|
|
70
|
-
team_id=1,
|
|
71
|
-
file_path="your file path", # optional; supports .zip, .tar.gz, and .tgz archives
|
|
72
|
-
)
|
|
73
|
-
|
|
74
|
-
if result is not None:
|
|
75
|
-
print("==== gpu task submitted successfully ====")
|
|
76
|
-
job_id = result.job_id
|
|
77
|
-
else:
|
|
78
|
-
print("==== gpu task submitted failed ====")
|
|
79
|
-
except Exception as e:
|
|
80
|
-
print("==== gpu task submitted failed error ====", e)
|
|
81
|
-
|
|
82
|
-
# Submit a task with cpu type
|
|
83
|
-
try:
|
|
84
|
-
entry_content = Path("entry.sh").read_text(encoding="utf-8")
|
|
85
|
-
result = task.submit(
|
|
86
|
-
name="cpu-task-from-sdk",
|
|
87
|
-
image="docker://01ai-registry.cn-shanghai.cr.aliyuncs.com/01-ai/xcs/v2/alpine:3.23.0",
|
|
88
|
-
entry_command=entry_content,
|
|
89
|
-
resources={
|
|
90
|
-
"partition": "cpu",
|
|
91
|
-
"nodes": 1,
|
|
92
|
-
"ntasks": 1,
|
|
93
|
-
"cpus_per_task": 1,
|
|
94
|
-
"memory": "1G",
|
|
95
|
-
"time": "01:00:00",
|
|
96
|
-
"qos": "qos_xcloud",
|
|
97
|
-
},
|
|
98
|
-
cluster_name="slurm-cn",
|
|
99
|
-
team_id=1,
|
|
100
|
-
)
|
|
101
|
-
|
|
102
|
-
if result is not None:
|
|
103
|
-
print("==== cpu task submitted successfully ====")
|
|
104
|
-
job_id = result.job_id
|
|
105
|
-
else:
|
|
106
|
-
print("==== cpu task submitted failed ====")
|
|
107
|
-
except Exception as e:
|
|
108
|
-
print("==== cpu task submitted failed error ====", e)
|
|
109
|
-
|
|
110
|
-
# List tasks with filters
|
|
111
|
-
try:
|
|
112
|
-
completed_tasks = task.list(
|
|
113
|
-
status=TaskStatus.COMPLETED,
|
|
114
|
-
cluster_name="slurm-cn",
|
|
115
|
-
page=1,
|
|
116
|
-
page_size=20
|
|
117
|
-
)
|
|
118
|
-
|
|
119
|
-
# Get task details
|
|
120
|
-
if completed_tasks is not None and len(completed_tasks.tasks) > 0:
|
|
121
|
-
print("==== completed_tasks number ====", len(completed_tasks.tasks))
|
|
122
|
-
task_info = task.get(task_id=completed_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
123
|
-
print("==== task_info ====", task_info)
|
|
124
|
-
else:
|
|
125
|
-
print("==== no completed tasks to get details ====")
|
|
126
|
-
except Exception as e:
|
|
127
|
-
print("==== get task details failed error ====", e)
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
# Cancel a running task
|
|
131
|
-
try:
|
|
132
|
-
running_tasks = task.list(
|
|
133
|
-
status=TaskStatus.RUNNING,
|
|
134
|
-
cluster_name="slurm-cn",
|
|
135
|
-
page=1,
|
|
136
|
-
page_size=20
|
|
137
|
-
)
|
|
138
|
-
if running_tasks is not None and len(running_tasks.tasks) > 0:
|
|
139
|
-
print("==== running_tasks number ====", len(running_tasks.tasks))
|
|
140
|
-
# Cancel a task
|
|
141
|
-
result = task.cancel(task_id=running_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
142
|
-
print("==== task cancelled ====", running_tasks.tasks[0].job_id, result)
|
|
143
|
-
else:
|
|
144
|
-
print("==== no running tasks to cancel ====")
|
|
145
|
-
except Exception as e:
|
|
146
|
-
print("==== cancel running task failed error ====", e)
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
# Delete a task
|
|
150
|
-
try:
|
|
151
|
-
completed_tasks = task.list(
|
|
152
|
-
status=TaskStatus.COMPLETED,
|
|
153
|
-
cluster_name="slurm-cn",
|
|
154
|
-
page=1,
|
|
155
|
-
page_size=20
|
|
156
|
-
)
|
|
157
|
-
if completed_tasks is not None and len(completed_tasks.tasks) > 0:
|
|
158
|
-
print("==== completed_tasks number ====", len(completed_tasks.tasks))
|
|
159
|
-
# Delete a task
|
|
160
|
-
result = task.delete(task_id=completed_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
161
|
-
print("==== task deleted ====", completed_tasks.tasks[0].job_id, result)
|
|
162
|
-
else:
|
|
163
|
-
print("==== no completed tasks to delete ====")
|
|
164
|
-
except Exception as e:
|
|
165
|
-
print("==== delete completed task failed error ====", e)
|
|
166
|
-
```
|
|
167
|
-
|
|
168
|
-
**Task Management Methods:**
|
|
169
|
-
|
|
170
|
-
- `submit()` - Submit a new task with container image and entry command
|
|
171
|
-
- `get()` - Get task details by task ID
|
|
172
|
-
- `list()` - List tasks with optional filters (status, cluster_name, team_id, user_id)
|
|
173
|
-
- `cancel()` - Cancel a running task
|
|
174
|
-
- `delete()` - Delete a task record
|
|
175
|
-
|
|
176
|
-
**Task Status Values:**
|
|
177
|
-
|
|
178
|
-
```python
|
|
179
|
-
from mlops.api.client.models.task_status import TaskStatus
|
|
180
|
-
|
|
181
|
-
TaskStatus.PENDING # Task is pending
|
|
182
|
-
TaskStatus.QUEUED # Task is queued
|
|
183
|
-
TaskStatus.RUNNING # Task is running
|
|
184
|
-
TaskStatus.COMPLETED # Task completed successfully
|
|
185
|
-
TaskStatus.SUCCEEDED # Task succeeded
|
|
186
|
-
TaskStatus.FAILED # Task failed
|
|
187
|
-
TaskStatus.CANCELLED # Task was cancelled
|
|
188
|
-
TaskStatus.CREATED # Task was created
|
|
189
|
-
```
|
|
190
|
-
|
|
191
|
-
**Error Handling:**
|
|
192
|
-
|
|
193
|
-
```python
|
|
194
|
-
from mlops.exceptions import (
|
|
195
|
-
APIException,
|
|
196
|
-
AuthenticationException,
|
|
197
|
-
NotFoundException,
|
|
198
|
-
RateLimitException,
|
|
199
|
-
TimeoutException,
|
|
200
|
-
InvalidArgumentException,
|
|
201
|
-
NotEnoughSpaceException
|
|
202
|
-
)
|
|
203
|
-
|
|
204
|
-
try:
|
|
205
|
-
result = task.submit(name="test", cluster_name="slurm-cn", entry_command="echo hello")
|
|
206
|
-
except AuthenticationException as e:
|
|
207
|
-
print(f"Authentication failed: {e}")
|
|
208
|
-
except NotFoundException as e:
|
|
209
|
-
print(f"Resource not found: {e}")
|
|
210
|
-
except APIException as e:
|
|
211
|
-
print(f"API error: {e}")
|
|
212
|
-
```
|
|
213
|
-
|
|
214
|
-
> [!TIP] Error Handling
|
|
215
|
-
> SDKs automatically handle common errors and retry failed requests. Check the SDK documentation for error-handling best practices.
|
|
216
|
-
|
|
217
|
-
## Features
|
|
218
|
-
|
|
219
|
-
- Type-safe API clients
|
|
220
|
-
- Automatic authentication
|
|
221
|
-
- Error handling
|
|
222
|
-
- Request retry logic
|
|
223
|
-
- Response validation
|
|
224
|
-
|
|
225
|
-
## Resources
|
|
226
|
-
|
|
227
|
-
- [Python SDK Documentation](https://github.com/xcloud-service/xservice/tree/main/client/python-sdk)
|
|
228
|
-
- [API Reference](https://xcloud-service.com/docs/api)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/api/tasks/get_task_by_task_id.py
RENAMED
|
File without changes
|
{mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/api/tasks/get_task_logs.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/get_task_logs_direction.py
RENAMED
|
File without changes
|
{mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/get_task_logs_log_type.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/message_response.py
RENAMED
|
File without changes
|
|
File without changes
|
{mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_alloc_tres_type_0.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_list_response.py
RENAMED
|
File without changes
|
|
File without changes
|
{mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_log_entry_log_type.py
RENAMED
|
File without changes
|
{mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_logs_response.py
RENAMED
|
File without changes
|
{mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_resources_type_0.py
RENAMED
|
File without changes
|
|
File without changes
|
{mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_submit_request.py
RENAMED
|
File without changes
|
|
File without changes
|
{mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_submit_response.py
RENAMED
|
File without changes
|
{mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_tres_type_0.py
RENAMED
|
File without changes
|
{mlops_python_sdk-1.0.2 → mlops_python_sdk-1.0.3}/mlops/api/client/models/task_tres_used_type_0.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|