mlops-python-sdk 1.0.1__tar.gz → 1.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlops_python_sdk-1.0.2/PKG-INFO +254 -0
- mlops_python_sdk-1.0.2/README.md +228 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_submit_request.py +44 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/connection_config.py +2 -2
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/task/task.py +44 -32
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/pyproject.toml +1 -1
- mlops_python_sdk-1.0.1/PKG-INFO +0 -407
- mlops_python_sdk-1.0.1/README.md +0 -381
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/__init__.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/__init__.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/api/__init__.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/api/storage/__init__.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/api/storage/get_storage_presign_download.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/api/storage/get_storage_presign_upload.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/api/tasks/__init__.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/api/tasks/cancel_task.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/api/tasks/delete_task.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/api/tasks/get_task.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/api/tasks/get_task_by_task_id.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/api/tasks/get_task_logs.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/api/tasks/list_tasks.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/api/tasks/submit_task.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/client.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/errors.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/__init__.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/error_response.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/get_storage_presign_download_response_200.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/get_storage_presign_upload_response_200.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/get_task_logs_direction.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/get_task_logs_log_type.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/job_spec.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/job_spec_env.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/job_spec_master_strategy.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/log_pagination.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/message_response.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_alloc_tres_type_0.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_gres_detail_type_0_item.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_job_resources_type_0.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_list_response.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_log_entry.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_log_entry_log_type.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_logs_response.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_resources_type_0.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_status.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_submit_request_environment_type_0.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_submit_response.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_tres_type_0.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_tres_used_type_0.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/py.typed +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/types.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/exceptions.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/task/__init__.py +0 -0
- {mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/task/client.py +0 -0
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: mlops-python-sdk
|
|
3
|
+
Version: 1.0.2
|
|
4
|
+
Summary: MLOps Python SDK for XCloud Service API
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: mlops
|
|
7
|
+
Author-email: mlops@example.com
|
|
8
|
+
Requires-Python: >=3.9,<4.0
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Dist: attrs (>=23.2.0)
|
|
17
|
+
Requires-Dist: httpx (>=0.27.0,<1.0.0)
|
|
18
|
+
Requires-Dist: packaging (>=24.1)
|
|
19
|
+
Requires-Dist: python-dateutil (>=2.8.2)
|
|
20
|
+
Requires-Dist: typing-extensions (>=4.1.0)
|
|
21
|
+
Project-URL: Bug Tracker, https://github.com/xcloud-service/xservice/issues
|
|
22
|
+
Project-URL: Homepage, https://mlops.cloud/
|
|
23
|
+
Project-URL: Repository, https://github.com/xcloud-service/xservice
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# SDK
|
|
27
|
+
|
|
28
|
+
Software Development Kits for integrating with the XCloud Service API.
|
|
29
|
+
|
|
30
|
+
> [!NOTE] SDK Support
|
|
31
|
+
> SDKs provide type-safe, high-level interfaces for interacting with the platform API. They handle authentication, error handling, and request retries automatically.
|
|
32
|
+
|
|
33
|
+
## Available SDKs
|
|
34
|
+
|
|
35
|
+
### Python SDK
|
|
36
|
+
|
|
37
|
+
### Installation
|
|
38
|
+
|
|
39
|
+
Install the Python SDK from PyPI:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install mlops-python-sdk
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Configuration
|
|
46
|
+
|
|
47
|
+
The SDK reads configuration from environment variables by default:
|
|
48
|
+
|
|
49
|
+
- `MLOPS_API_KEY`: API key (required)
|
|
50
|
+
- `MLOPS_DOMAIN`: API domain, e.g. `localhost:8090` or `https://example.com`
|
|
51
|
+
- `MLOPS_API_PATH`: API path prefix (default: `/api/v1`)
|
|
52
|
+
- `MLOPS_DEBUG`: `true|false` (default: `false`)
|
|
53
|
+
|
|
54
|
+
Or configure in code:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from mlops import ConnectionConfig, Task
|
|
58
|
+
|
|
59
|
+
config = ConnectionConfig(
|
|
60
|
+
api_key="xck_...",
|
|
61
|
+
domain="https://example.com",
|
|
62
|
+
api_path="/api/v1",
|
|
63
|
+
debug=False,
|
|
64
|
+
)
|
|
65
|
+
task = Task(config=config)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Usage
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from mlops import Task
|
|
72
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
73
|
+
from pathlib import Path
|
|
74
|
+
|
|
75
|
+
# Initialize Task client (uses environment variables by default)
|
|
76
|
+
task = Task()
|
|
77
|
+
|
|
78
|
+
# Submit a task with gpu type
|
|
79
|
+
try:
|
|
80
|
+
result = task.submit(
|
|
81
|
+
name="gpu-task-from-sdk",
|
|
82
|
+
image="/mnt/minio/images/01ai-registry.cn-shanghai.cr.aliyuncs.com+public+llamafactory+0.9.3.sqsh",
|
|
83
|
+
entry_command="llamafactory-cli train /workspace/config/test_lora.yaml",
|
|
84
|
+
resources={
|
|
85
|
+
"partition": "gpu",
|
|
86
|
+
"nodes": 2,
|
|
87
|
+
"ntasks": 2,
|
|
88
|
+
"cpus_per_task": 2,
|
|
89
|
+
"memory": "4G",
|
|
90
|
+
"time": "01:00:00",
|
|
91
|
+
"gres": "gpu:nvidia_a10:1",
|
|
92
|
+
"qos": "qos_xcloud",
|
|
93
|
+
},
|
|
94
|
+
cluster_name="slurm-cn",
|
|
95
|
+
team_id=1,
|
|
96
|
+
file_path="your file path", # optional; supported archive formats: .zip, .tar.gz, .tgz
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
if result is not None:
|
|
100
|
+
print("==== gpu task submitted successfully ====")
|
|
101
|
+
job_id = result.job_id
|
|
102
|
+
else:
|
|
103
|
+
print("==== gpu task submitted failed ====")
|
|
104
|
+
except Exception as e:
|
|
105
|
+
print("==== gpu task submitted failed error ====", e)
|
|
106
|
+
|
|
107
|
+
# Submit a task with cpu type
|
|
108
|
+
try:
|
|
109
|
+
entry_content = Path("entry.sh").read_text(encoding="utf-8")
|
|
110
|
+
result = task.submit(
|
|
111
|
+
name="cpu-task-from-sdk",
|
|
112
|
+
image="docker://01ai-registry.cn-shanghai.cr.aliyuncs.com/01-ai/xcs/v2/alpine:3.23.0",
|
|
113
|
+
entry_command=entry_content,
|
|
114
|
+
resources={
|
|
115
|
+
"partition": "cpu",
|
|
116
|
+
"nodes": 1,
|
|
117
|
+
"ntasks": 1,
|
|
118
|
+
"cpus_per_task": 1,
|
|
119
|
+
"memory": "1G",
|
|
120
|
+
"time": "01:00:00",
|
|
121
|
+
"qos": "qos_xcloud",
|
|
122
|
+
},
|
|
123
|
+
cluster_name="slurm-cn",
|
|
124
|
+
team_id=1,
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
if result is not None:
|
|
128
|
+
print("==== cpu task submitted successfully ====")
|
|
129
|
+
job_id = result.job_id
|
|
130
|
+
else:
|
|
131
|
+
print("==== cpu task submitted failed ====")
|
|
132
|
+
except Exception as e:
|
|
133
|
+
print("==== cpu task submitted failed error ====", e)
|
|
134
|
+
|
|
135
|
+
# List tasks with filters
|
|
136
|
+
try:
|
|
137
|
+
completed_tasks = task.list(
|
|
138
|
+
status=TaskStatus.COMPLETED,
|
|
139
|
+
cluster_name="slurm-cn",
|
|
140
|
+
page=1,
|
|
141
|
+
page_size=20
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
# Get task details
|
|
145
|
+
if completed_tasks is not None and len(completed_tasks.tasks) > 0:
|
|
146
|
+
print("==== completed_tasks number ====", len(completed_tasks.tasks))
|
|
147
|
+
task_info = task.get(task_id=completed_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
148
|
+
print("==== task_info ====", task_info)
|
|
149
|
+
else:
|
|
150
|
+
print("==== no completed tasks to get details ====")
|
|
151
|
+
except Exception as e:
|
|
152
|
+
print("==== get task details failed error ====", e)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# Cancel a running task
|
|
156
|
+
try:
|
|
157
|
+
running_tasks = task.list(
|
|
158
|
+
status=TaskStatus.RUNNING,
|
|
159
|
+
cluster_name="slurm-cn",
|
|
160
|
+
page=1,
|
|
161
|
+
page_size=20
|
|
162
|
+
)
|
|
163
|
+
if running_tasks is not None and len(running_tasks.tasks) > 0:
|
|
164
|
+
print("==== running_tasks number ====", len(running_tasks.tasks))
|
|
165
|
+
# Cancel a task
|
|
166
|
+
result = task.cancel(task_id=running_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
167
|
+
print("==== task cancelled ====", running_tasks.tasks[0].job_id, result)
|
|
168
|
+
else:
|
|
169
|
+
print("==== no running tasks to cancel ====")
|
|
170
|
+
except Exception as e:
|
|
171
|
+
print("==== cancel running task failed error ====", e)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
# Delete a task
|
|
175
|
+
try:
|
|
176
|
+
completed_tasks = task.list(
|
|
177
|
+
status=TaskStatus.COMPLETED,
|
|
178
|
+
cluster_name="slurm-cn",
|
|
179
|
+
page=1,
|
|
180
|
+
page_size=20
|
|
181
|
+
)
|
|
182
|
+
if completed_tasks is not None and len(completed_tasks.tasks) > 0:
|
|
183
|
+
print("==== completed_tasks number ====", len(completed_tasks.tasks))
|
|
184
|
+
# Delete a task
|
|
185
|
+
result = task.delete(task_id=completed_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
186
|
+
print("==== task deleted ====", completed_tasks.tasks[0].job_id, result)
|
|
187
|
+
else:
|
|
188
|
+
print("==== no completed tasks to delete ====")
|
|
189
|
+
except Exception as e:
|
|
190
|
+
print("==== delete completed task failed error ====", e)
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
**Task Management Methods:**
|
|
194
|
+
|
|
195
|
+
- `submit()` - Submit a new task with container image and entry command
|
|
196
|
+
- `get()` - Get task details by task ID
|
|
197
|
+
- `list()` - List tasks with optional filters (status, cluster_name, team_id, user_id)
|
|
198
|
+
- `cancel()` - Cancel a running task
|
|
199
|
+
- `delete()` - Delete a task record
|
|
200
|
+
|
|
201
|
+
**Task Status Values:**
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
205
|
+
|
|
206
|
+
TaskStatus.PENDING # Task is pending
|
|
207
|
+
TaskStatus.QUEUED # Task is queued
|
|
208
|
+
TaskStatus.RUNNING # Task is running
|
|
209
|
+
TaskStatus.COMPLETED # Task completed successfully
|
|
210
|
+
TaskStatus.SUCCEEDED # Task succeeded
|
|
211
|
+
TaskStatus.FAILED # Task failed
|
|
212
|
+
TaskStatus.CANCELLED # Task was cancelled
|
|
213
|
+
TaskStatus.CREATED # Task was created
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
**Error Handling:**
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
from mlops.exceptions import (
|
|
220
|
+
APIException,
|
|
221
|
+
AuthenticationException,
|
|
222
|
+
NotFoundException,
|
|
223
|
+
RateLimitException,
|
|
224
|
+
TimeoutException,
|
|
225
|
+
InvalidArgumentException,
|
|
226
|
+
NotEnoughSpaceException
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
try:
|
|
230
|
+
result = task.submit(name="test", cluster_name="slurm-cn", command="echo hello")
|
|
231
|
+
except AuthenticationException as e:
|
|
232
|
+
print(f"Authentication failed: {e}")
|
|
233
|
+
except NotFoundException as e:
|
|
234
|
+
print(f"Resource not found: {e}")
|
|
235
|
+
except APIException as e:
|
|
236
|
+
print(f"API error: {e}")
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
> [!TIP] Error Handling
|
|
240
|
+
> SDKs automatically handle common errors and retry failed requests. Check SDK documentation for error handling best practices.
|
|
241
|
+
|
|
242
|
+
## Features
|
|
243
|
+
|
|
244
|
+
- Type-safe API clients
|
|
245
|
+
- Automatic authentication
|
|
246
|
+
- Error handling
|
|
247
|
+
- Request retry logic
|
|
248
|
+
- Response validation
|
|
249
|
+
|
|
250
|
+
## Resources
|
|
251
|
+
|
|
252
|
+
- [Python SDK Documentation](https://github.com/xcloud-service/xservice/tree/main/client/python-sdk)
|
|
253
|
+
- [API Reference](https://xcloud-service.com/docs/api)
|
|
254
|
+
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
# SDK
|
|
2
|
+
|
|
3
|
+
Software Development Kits for integrating with the XCloud Service API.
|
|
4
|
+
|
|
5
|
+
> [!NOTE] SDK Support
|
|
6
|
+
> SDKs provide type-safe, high-level interfaces for interacting with the platform API. They handle authentication, error handling, and request retries automatically.
|
|
7
|
+
|
|
8
|
+
## Available SDKs
|
|
9
|
+
|
|
10
|
+
### Python SDK
|
|
11
|
+
|
|
12
|
+
### Installation
|
|
13
|
+
|
|
14
|
+
Install the Python SDK from PyPI:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install mlops-python-sdk
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
### Configuration
|
|
21
|
+
|
|
22
|
+
The SDK reads configuration from environment variables by default:
|
|
23
|
+
|
|
24
|
+
- `MLOPS_API_KEY`: API key (required)
|
|
25
|
+
- `MLOPS_DOMAIN`: API domain, e.g. `localhost:8090` or `https://example.com`
|
|
26
|
+
- `MLOPS_API_PATH`: API path prefix (default: `/api/v1`)
|
|
27
|
+
- `MLOPS_DEBUG`: `true|false` (default: `false`)
|
|
28
|
+
|
|
29
|
+
Or configure in code:
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from mlops import ConnectionConfig, Task
|
|
33
|
+
|
|
34
|
+
config = ConnectionConfig(
|
|
35
|
+
api_key="xck_...",
|
|
36
|
+
domain="https://example.com",
|
|
37
|
+
api_path="/api/v1",
|
|
38
|
+
debug=False,
|
|
39
|
+
)
|
|
40
|
+
task = Task(config=config)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### Usage
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from mlops import Task
|
|
47
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
48
|
+
from pathlib import Path
|
|
49
|
+
|
|
50
|
+
# Initialize Task client (uses environment variables by default)
|
|
51
|
+
task = Task()
|
|
52
|
+
|
|
53
|
+
# Submit a task with gpu type
|
|
54
|
+
try:
|
|
55
|
+
result = task.submit(
|
|
56
|
+
name="gpu-task-from-sdk",
|
|
57
|
+
image="/mnt/minio/images/01ai-registry.cn-shanghai.cr.aliyuncs.com+public+llamafactory+0.9.3.sqsh",
|
|
58
|
+
entry_command="llamafactory-cli train /workspace/config/test_lora.yaml",
|
|
59
|
+
resources={
|
|
60
|
+
"partition": "gpu",
|
|
61
|
+
"nodes": 2,
|
|
62
|
+
"ntasks": 2,
|
|
63
|
+
"cpus_per_task": 2,
|
|
64
|
+
"memory": "4G",
|
|
65
|
+
"time": "01:00:00",
|
|
66
|
+
"gres": "gpu:nvidia_a10:1",
|
|
67
|
+
"qos": "qos_xcloud",
|
|
68
|
+
},
|
|
69
|
+
cluster_name="slurm-cn",
|
|
70
|
+
team_id=1,
|
|
71
|
+
file_path="your file path", # optional; supported archive formats: .zip, .tar.gz, .tgz
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
if result is not None:
|
|
75
|
+
print("==== gpu task submitted successfully ====")
|
|
76
|
+
job_id = result.job_id
|
|
77
|
+
else:
|
|
78
|
+
print("==== gpu task submitted failed ====")
|
|
79
|
+
except Exception as e:
|
|
80
|
+
print("==== gpu task submitted failed error ====", e)
|
|
81
|
+
|
|
82
|
+
# Submit a task with cpu type
|
|
83
|
+
try:
|
|
84
|
+
entry_content = Path("entry.sh").read_text(encoding="utf-8")
|
|
85
|
+
result = task.submit(
|
|
86
|
+
name="cpu-task-from-sdk",
|
|
87
|
+
image="docker://01ai-registry.cn-shanghai.cr.aliyuncs.com/01-ai/xcs/v2/alpine:3.23.0",
|
|
88
|
+
entry_command=entry_content,
|
|
89
|
+
resources={
|
|
90
|
+
"partition": "cpu",
|
|
91
|
+
"nodes": 1,
|
|
92
|
+
"ntasks": 1,
|
|
93
|
+
"cpus_per_task": 1,
|
|
94
|
+
"memory": "1G",
|
|
95
|
+
"time": "01:00:00",
|
|
96
|
+
"qos": "qos_xcloud",
|
|
97
|
+
},
|
|
98
|
+
cluster_name="slurm-cn",
|
|
99
|
+
team_id=1,
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
if result is not None:
|
|
103
|
+
print("==== cpu task submitted successfully ====")
|
|
104
|
+
job_id = result.job_id
|
|
105
|
+
else:
|
|
106
|
+
print("==== cpu task submitted failed ====")
|
|
107
|
+
except Exception as e:
|
|
108
|
+
print("==== cpu task submitted failed error ====", e)
|
|
109
|
+
|
|
110
|
+
# List tasks with filters
|
|
111
|
+
try:
|
|
112
|
+
completed_tasks = task.list(
|
|
113
|
+
status=TaskStatus.COMPLETED,
|
|
114
|
+
cluster_name="slurm-cn",
|
|
115
|
+
page=1,
|
|
116
|
+
page_size=20
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
# Get task details
|
|
120
|
+
if completed_tasks is not None and len(completed_tasks.tasks) > 0:
|
|
121
|
+
print("==== completed_tasks number ====", len(completed_tasks.tasks))
|
|
122
|
+
task_info = task.get(task_id=completed_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
123
|
+
print("==== task_info ====", task_info)
|
|
124
|
+
else:
|
|
125
|
+
print("==== no completed tasks to get details ====")
|
|
126
|
+
except Exception as e:
|
|
127
|
+
print("==== get task details failed error ====", e)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
# Cancel a running task
|
|
131
|
+
try:
|
|
132
|
+
running_tasks = task.list(
|
|
133
|
+
status=TaskStatus.RUNNING,
|
|
134
|
+
cluster_name="slurm-cn",
|
|
135
|
+
page=1,
|
|
136
|
+
page_size=20
|
|
137
|
+
)
|
|
138
|
+
if running_tasks is not None and len(running_tasks.tasks) > 0:
|
|
139
|
+
print("==== running_tasks number ====", len(running_tasks.tasks))
|
|
140
|
+
# Cancel a task
|
|
141
|
+
result = task.cancel(task_id=running_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
142
|
+
print("==== task cancelled ====", running_tasks.tasks[0].job_id, result)
|
|
143
|
+
else:
|
|
144
|
+
print("==== no running tasks to cancel ====")
|
|
145
|
+
except Exception as e:
|
|
146
|
+
print("==== cancel running task failed error ====", e)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# Delete a task
|
|
150
|
+
try:
|
|
151
|
+
completed_tasks = task.list(
|
|
152
|
+
status=TaskStatus.COMPLETED,
|
|
153
|
+
cluster_name="slurm-cn",
|
|
154
|
+
page=1,
|
|
155
|
+
page_size=20
|
|
156
|
+
)
|
|
157
|
+
if completed_tasks is not None and len(completed_tasks.tasks) > 0:
|
|
158
|
+
print("==== completed_tasks number ====", len(completed_tasks.tasks))
|
|
159
|
+
# Delete a task
|
|
160
|
+
result = task.delete(task_id=completed_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
161
|
+
print("==== task deleted ====", completed_tasks.tasks[0].job_id, result)
|
|
162
|
+
else:
|
|
163
|
+
print("==== no completed tasks to delete ====")
|
|
164
|
+
except Exception as e:
|
|
165
|
+
print("==== delete completed task failed error ====", e)
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
**Task Management Methods:**
|
|
169
|
+
|
|
170
|
+
- `submit()` - Submit a new task with container image and entry command
|
|
171
|
+
- `get()` - Get task details by task ID
|
|
172
|
+
- `list()` - List tasks with optional filters (status, cluster_name, team_id, user_id)
|
|
173
|
+
- `cancel()` - Cancel a running task
|
|
174
|
+
- `delete()` - Delete a task record
|
|
175
|
+
|
|
176
|
+
**Task Status Values:**
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
180
|
+
|
|
181
|
+
TaskStatus.PENDING # Task is pending
|
|
182
|
+
TaskStatus.QUEUED # Task is queued
|
|
183
|
+
TaskStatus.RUNNING # Task is running
|
|
184
|
+
TaskStatus.COMPLETED # Task completed successfully
|
|
185
|
+
TaskStatus.SUCCEEDED # Task succeeded
|
|
186
|
+
TaskStatus.FAILED # Task failed
|
|
187
|
+
TaskStatus.CANCELLED # Task was cancelled
|
|
188
|
+
TaskStatus.CREATED # Task was created
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
**Error Handling:**
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
from mlops.exceptions import (
|
|
195
|
+
APIException,
|
|
196
|
+
AuthenticationException,
|
|
197
|
+
NotFoundException,
|
|
198
|
+
RateLimitException,
|
|
199
|
+
TimeoutException,
|
|
200
|
+
InvalidArgumentException,
|
|
201
|
+
NotEnoughSpaceException
|
|
202
|
+
)
|
|
203
|
+
|
|
204
|
+
try:
|
|
205
|
+
result = task.submit(name="test", cluster_name="slurm-cn", command="echo hello")
|
|
206
|
+
except AuthenticationException as e:
|
|
207
|
+
print(f"Authentication failed: {e}")
|
|
208
|
+
except NotFoundException as e:
|
|
209
|
+
print(f"Resource not found: {e}")
|
|
210
|
+
except APIException as e:
|
|
211
|
+
print(f"API error: {e}")
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
> [!TIP] Error Handling
|
|
215
|
+
> SDKs automatically handle common errors and retry failed requests. Check SDK documentation for error handling best practices.
|
|
216
|
+
|
|
217
|
+
## Features
|
|
218
|
+
|
|
219
|
+
- Type-safe API clients
|
|
220
|
+
- Automatic authentication
|
|
221
|
+
- Error handling
|
|
222
|
+
- Request retry logic
|
|
223
|
+
- Response validation
|
|
224
|
+
|
|
225
|
+
## Resources
|
|
226
|
+
|
|
227
|
+
- [Python SDK Documentation](https://github.com/xcloud-service/xservice/tree/main/client/python-sdk)
|
|
228
|
+
- [API Reference](https://xcloud-service.com/docs/api)
|
{mlops_python_sdk-1.0.1 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_submit_request.py
RENAMED
|
@@ -29,12 +29,18 @@ class TaskSubmitRequest:
|
|
|
29
29
|
cpus_per_task (Union[None, Unset, int]): CPUs per task Example: 1.
|
|
30
30
|
dependency (Union[None, Unset, str]): Job dependencies Example: afterok:12345.
|
|
31
31
|
distribution (Union[None, Unset, str]): Task distribution Example: block.
|
|
32
|
+
entry_command (Union[None, Unset, str]): Container entry command/script (bash snippet) executed inside the
|
|
33
|
+
container. The platform runs it under /workspace.
|
|
34
|
+
Example: python -V && ls -la.
|
|
32
35
|
environment (Union['TaskSubmitRequestEnvironmentType0', None, Unset]): Environment variables as key-value pairs
|
|
33
36
|
Example: {'CUDA_VISIBLE_DEVICES': '0,1', 'PYTHONPATH': '/opt/python/lib'}.
|
|
34
37
|
error (Union[None, Unset, str]): Standard error file pattern Example: error_%j.log.
|
|
35
38
|
exclude (Union[None, Unset, str]): Nodes to exclude
|
|
36
39
|
export (Union[None, Unset, str]): Environment export Example: ALL.
|
|
37
40
|
gres (Union[None, Unset, str]): Generic resources (e.g., "gpu:1", "gpu:tesla:2") Example: gpu:1.
|
|
41
|
+
image (Union[None, Unset, str]): Container image reference. Can be a Slurm container plugin supported reference
|
|
42
|
+
(e.g. "docker://..."), or a registry reference which will be mapped to a local .sqsh image path by the platform.
|
|
43
|
+
Example: 01ai-registry.cn-shanghai.cr.aliyuncs.com/public/llamafactory:0.9.3.
|
|
38
44
|
input_ (Union[None, Unset, str]): Standard input file
|
|
39
45
|
job_spec (Union[Unset, JobSpec]): Domain-specific job specification (rendered into slurm script)
|
|
40
46
|
mem_bind (Union[None, Unset, str]): Memory binding
|
|
@@ -65,11 +71,13 @@ class TaskSubmitRequest:
|
|
|
65
71
|
cpus_per_task: Union[None, Unset, int] = UNSET
|
|
66
72
|
dependency: Union[None, Unset, str] = UNSET
|
|
67
73
|
distribution: Union[None, Unset, str] = UNSET
|
|
74
|
+
entry_command: Union[None, Unset, str] = UNSET
|
|
68
75
|
environment: Union["TaskSubmitRequestEnvironmentType0", None, Unset] = UNSET
|
|
69
76
|
error: Union[None, Unset, str] = UNSET
|
|
70
77
|
exclude: Union[None, Unset, str] = UNSET
|
|
71
78
|
export: Union[None, Unset, str] = UNSET
|
|
72
79
|
gres: Union[None, Unset, str] = UNSET
|
|
80
|
+
image: Union[None, Unset, str] = UNSET
|
|
73
81
|
input_: Union[None, Unset, str] = UNSET
|
|
74
82
|
job_spec: Union[Unset, "JobSpec"] = UNSET
|
|
75
83
|
mem_bind: Union[None, Unset, str] = UNSET
|
|
@@ -143,6 +151,12 @@ class TaskSubmitRequest:
|
|
|
143
151
|
else:
|
|
144
152
|
distribution = self.distribution
|
|
145
153
|
|
|
154
|
+
entry_command: Union[None, Unset, str]
|
|
155
|
+
if isinstance(self.entry_command, Unset):
|
|
156
|
+
entry_command = UNSET
|
|
157
|
+
else:
|
|
158
|
+
entry_command = self.entry_command
|
|
159
|
+
|
|
146
160
|
environment: Union[None, Unset, dict[str, Any]]
|
|
147
161
|
if isinstance(self.environment, Unset):
|
|
148
162
|
environment = UNSET
|
|
@@ -175,6 +189,12 @@ class TaskSubmitRequest:
|
|
|
175
189
|
else:
|
|
176
190
|
gres = self.gres
|
|
177
191
|
|
|
192
|
+
image: Union[None, Unset, str]
|
|
193
|
+
if isinstance(self.image, Unset):
|
|
194
|
+
image = UNSET
|
|
195
|
+
else:
|
|
196
|
+
image = self.image
|
|
197
|
+
|
|
178
198
|
input_: Union[None, Unset, str]
|
|
179
199
|
if isinstance(self.input_, Unset):
|
|
180
200
|
input_ = UNSET
|
|
@@ -289,6 +309,8 @@ class TaskSubmitRequest:
|
|
|
289
309
|
field_dict["dependency"] = dependency
|
|
290
310
|
if distribution is not UNSET:
|
|
291
311
|
field_dict["distribution"] = distribution
|
|
312
|
+
if entry_command is not UNSET:
|
|
313
|
+
field_dict["entry_command"] = entry_command
|
|
292
314
|
if environment is not UNSET:
|
|
293
315
|
field_dict["environment"] = environment
|
|
294
316
|
if error is not UNSET:
|
|
@@ -299,6 +321,8 @@ class TaskSubmitRequest:
|
|
|
299
321
|
field_dict["export"] = export
|
|
300
322
|
if gres is not UNSET:
|
|
301
323
|
field_dict["gres"] = gres
|
|
324
|
+
if image is not UNSET:
|
|
325
|
+
field_dict["image"] = image
|
|
302
326
|
if input_ is not UNSET:
|
|
303
327
|
field_dict["input"] = input_
|
|
304
328
|
if job_spec is not UNSET:
|
|
@@ -416,6 +440,15 @@ class TaskSubmitRequest:
|
|
|
416
440
|
|
|
417
441
|
distribution = _parse_distribution(d.pop("distribution", UNSET))
|
|
418
442
|
|
|
443
|
+
def _parse_entry_command(data: object) -> Union[None, Unset, str]:
|
|
444
|
+
if data is None:
|
|
445
|
+
return data
|
|
446
|
+
if isinstance(data, Unset):
|
|
447
|
+
return data
|
|
448
|
+
return cast(Union[None, Unset, str], data)
|
|
449
|
+
|
|
450
|
+
entry_command = _parse_entry_command(d.pop("entry_command", UNSET))
|
|
451
|
+
|
|
419
452
|
def _parse_environment(data: object) -> Union["TaskSubmitRequestEnvironmentType0", None, Unset]:
|
|
420
453
|
if data is None:
|
|
421
454
|
return data
|
|
@@ -469,6 +502,15 @@ class TaskSubmitRequest:
|
|
|
469
502
|
|
|
470
503
|
gres = _parse_gres(d.pop("gres", UNSET))
|
|
471
504
|
|
|
505
|
+
def _parse_image(data: object) -> Union[None, Unset, str]:
|
|
506
|
+
if data is None:
|
|
507
|
+
return data
|
|
508
|
+
if isinstance(data, Unset):
|
|
509
|
+
return data
|
|
510
|
+
return cast(Union[None, Unset, str], data)
|
|
511
|
+
|
|
512
|
+
image = _parse_image(d.pop("image", UNSET))
|
|
513
|
+
|
|
472
514
|
def _parse_input_(data: object) -> Union[None, Unset, str]:
|
|
473
515
|
if data is None:
|
|
474
516
|
return data
|
|
@@ -615,11 +657,13 @@ class TaskSubmitRequest:
|
|
|
615
657
|
cpus_per_task=cpus_per_task,
|
|
616
658
|
dependency=dependency,
|
|
617
659
|
distribution=distribution,
|
|
660
|
+
entry_command=entry_command,
|
|
618
661
|
environment=environment,
|
|
619
662
|
error=error,
|
|
620
663
|
exclude=exclude,
|
|
621
664
|
export=export,
|
|
622
665
|
gres=gres,
|
|
666
|
+
image=image,
|
|
623
667
|
input_=input_,
|
|
624
668
|
job_spec=job_spec,
|
|
625
669
|
mem_bind=mem_bind,
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import os
|
|
2
2
|
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import Optional, Dict
|
|
4
4
|
from httpx._types import ProxyTypes
|
|
5
5
|
|
|
6
|
-
REQUEST_TIMEOUT: float =
|
|
6
|
+
REQUEST_TIMEOUT: float = 120.0 # 120 seconds
|
|
7
7
|
|
|
8
8
|
KEEPALIVE_PING_INTERVAL_SEC = 50 # 50 seconds
|
|
9
9
|
KEEPALIVE_PING_HEADER = "Keepalive-Ping-Interval"
|