mlops-python-sdk 1.0.0__tar.gz → 1.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlops_python_sdk-1.0.2/PKG-INFO +254 -0
- mlops_python_sdk-1.0.2/README.md +228 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/__init__.py +3 -3
- mlops_python_sdk-1.0.2/mlops/api/client/api/storage/get_storage_presign_download.py +175 -0
- mlops_python_sdk-1.0.2/mlops/api/client/api/storage/get_storage_presign_upload.py +175 -0
- mlops_python_sdk-1.0.2/mlops/api/client/api/tasks/__init__.py +1 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/api/tasks/cancel_task.py +14 -14
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/api/tasks/delete_task.py +14 -14
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/api/tasks/get_task.py +15 -15
- mlops_python_sdk-1.0.2/mlops/api/client/api/tasks/get_task_by_task_id.py +204 -0
- mlops_python_sdk-1.0.2/mlops/api/client/api/tasks/get_task_logs.py +300 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/api/tasks/list_tasks.py +14 -14
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/__init__.py +16 -0
- mlops_python_sdk-1.0.2/mlops/api/client/models/get_storage_presign_download_response_200.py +60 -0
- mlops_python_sdk-1.0.2/mlops/api/client/models/get_storage_presign_upload_response_200.py +79 -0
- mlops_python_sdk-1.0.2/mlops/api/client/models/get_task_logs_direction.py +9 -0
- mlops_python_sdk-1.0.2/mlops/api/client/models/get_task_logs_log_type.py +10 -0
- mlops_python_sdk-1.0.2/mlops/api/client/models/log_pagination.py +90 -0
- mlops_python_sdk-1.0.2/mlops/api/client/models/task_log_entry.py +105 -0
- mlops_python_sdk-1.0.2/mlops/api/client/models/task_log_entry_log_type.py +9 -0
- mlops_python_sdk-1.0.2/mlops/api/client/models/task_logs_response.py +112 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_submit_request.py +50 -6
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/connection_config.py +2 -9
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/exceptions.py +10 -10
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/task/__init__.py +1 -1
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/task/client.py +11 -35
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/task/task.py +194 -64
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/pyproject.toml +1 -1
- mlops_python_sdk-1.0.0/PKG-INFO +0 -416
- mlops_python_sdk-1.0.0/README.md +0 -390
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/__init__.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/api/__init__.py +0 -0
- {mlops_python_sdk-1.0.0/mlops/api/client/api/tasks → mlops_python_sdk-1.0.2/mlops/api/client/api/storage}/__init__.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/api/tasks/submit_task.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/client.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/errors.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/error_response.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/job_spec.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/job_spec_env.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/job_spec_master_strategy.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/message_response.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_alloc_tres_type_0.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_gres_detail_type_0_item.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_job_resources_type_0.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_list_response.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_resources_type_0.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_status.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_submit_request_environment_type_0.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_submit_response.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_tres_type_0.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/models/task_tres_used_type_0.py +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/py.typed +0 -0
- {mlops_python_sdk-1.0.0 → mlops_python_sdk-1.0.2}/mlops/api/client/types.py +0 -0
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: mlops-python-sdk
|
|
3
|
+
Version: 1.0.2
|
|
4
|
+
Summary: MLOps Python SDK for XCloud Service API
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: mlops
|
|
7
|
+
Author-email: mlops@example.com
|
|
8
|
+
Requires-Python: >=3.9,<4.0
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Dist: attrs (>=23.2.0)
|
|
17
|
+
Requires-Dist: httpx (>=0.27.0,<1.0.0)
|
|
18
|
+
Requires-Dist: packaging (>=24.1)
|
|
19
|
+
Requires-Dist: python-dateutil (>=2.8.2)
|
|
20
|
+
Requires-Dist: typing-extensions (>=4.1.0)
|
|
21
|
+
Project-URL: Bug Tracker, https://github.com/xcloud-service/xservice/issues
|
|
22
|
+
Project-URL: Homepage, https://mlops.cloud/
|
|
23
|
+
Project-URL: Repository, https://github.com/xcloud-service/xservice
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# SDK
|
|
27
|
+
|
|
28
|
+
Software Development Kits for integrating with the XCloud Service API.
|
|
29
|
+
|
|
30
|
+
> [!NOTE] SDK Support
|
|
31
|
+
> SDKs provide type-safe, high-level interfaces for interacting with the platform API. They handle authentication, error handling, and request retries automatically.
|
|
32
|
+
|
|
33
|
+
## Available SDKs
|
|
34
|
+
|
|
35
|
+
### Python SDK
|
|
36
|
+
|
|
37
|
+
### Installation
|
|
38
|
+
|
|
39
|
+
Install the Python SDK with pip:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install mlops-python-sdk
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Configuration
|
|
46
|
+
|
|
47
|
+
The SDK reads configuration from environment variables by default:
|
|
48
|
+
|
|
49
|
+
- `MLOPS_API_KEY`: API key (required)
|
|
50
|
+
- `MLOPS_DOMAIN`: API domain, e.g. `localhost:8090` or `https://example.com`
|
|
51
|
+
- `MLOPS_API_PATH`: API path prefix (default: `/api/v1`)
|
|
52
|
+
- `MLOPS_DEBUG`: `true|false` (default: `false`)
|
|
53
|
+
|
|
54
|
+
Or configure in code:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from mlops import ConnectionConfig, Task
|
|
58
|
+
|
|
59
|
+
config = ConnectionConfig(
|
|
60
|
+
api_key="xck_...",
|
|
61
|
+
domain="https://example.com",
|
|
62
|
+
api_path="/api/v1",
|
|
63
|
+
debug=False,
|
|
64
|
+
)
|
|
65
|
+
task = Task(config=config)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Usage
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from mlops import Task
|
|
72
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
73
|
+
from pathlib import Path
|
|
74
|
+
|
|
75
|
+
# Initialize Task client (uses environment variables by default)
|
|
76
|
+
task = Task()
|
|
77
|
+
|
|
78
|
+
# Submit a task with gpu type
|
|
79
|
+
try:
|
|
80
|
+
result = task.submit(
|
|
81
|
+
name="gpu-task-from-sdk",
|
|
82
|
+
image="/mnt/minio/images/01ai-registry.cn-shanghai.cr.aliyuncs.com+public+llamafactory+0.9.3.sqsh",
|
|
83
|
+
entry_command="llamafactory-cli train /workspace/config/test_lora.yaml",
|
|
84
|
+
resources={
|
|
85
|
+
"partition": "gpu",
|
|
86
|
+
"nodes": 2,
|
|
87
|
+
"ntasks": 2,
|
|
88
|
+
"cpus_per_task": 2,
|
|
89
|
+
"memory": "4G",
|
|
90
|
+
"time": "01:00:00",
|
|
91
|
+
"gres": "gpu:nvidia_a10:1",
|
|
92
|
+
"qos": "qos_xcloud",
|
|
93
|
+
},
|
|
94
|
+
cluster_name="slurm-cn",
|
|
95
|
+
team_id=1,
|
|
96
|
+
file_path="your file path", # optional, support for .zip, .tar.gz, .tgz
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
if result is not None:
|
|
100
|
+
print("==== gpu task submitted successfully ====")
|
|
101
|
+
job_id = result.job_id
|
|
102
|
+
else:
|
|
103
|
+
print("==== gpu task submitted failed ====")
|
|
104
|
+
except Exception as e:
|
|
105
|
+
print("==== gpu task submitted failed error ====", e)
|
|
106
|
+
|
|
107
|
+
# Submit a task with cpu type
|
|
108
|
+
try:
|
|
109
|
+
entry_content = Path("entry.sh").read_text(encoding="utf-8")
|
|
110
|
+
result = task.submit(
|
|
111
|
+
name="cpu-task-from-sdk",
|
|
112
|
+
image="docker://01ai-registry.cn-shanghai.cr.aliyuncs.com/01-ai/xcs/v2/alpine:3.23.0",
|
|
113
|
+
entry_command=entry_content,
|
|
114
|
+
resources={
|
|
115
|
+
"partition": "cpu",
|
|
116
|
+
"nodes": 1,
|
|
117
|
+
"ntasks": 1,
|
|
118
|
+
"cpus_per_task": 1,
|
|
119
|
+
"memory": "1G",
|
|
120
|
+
"time": "01:00:00",
|
|
121
|
+
"qos": "qos_xcloud",
|
|
122
|
+
},
|
|
123
|
+
cluster_name="slurm-cn",
|
|
124
|
+
team_id=1,
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
if result is not None:
|
|
128
|
+
print("==== cpu task submitted successfully ====")
|
|
129
|
+
job_id = result.job_id
|
|
130
|
+
else:
|
|
131
|
+
print("==== cpu task submitted failed ====")
|
|
132
|
+
except Exception as e:
|
|
133
|
+
print("==== cpu task submitted failed error ====", e)
|
|
134
|
+
|
|
135
|
+
# List tasks with filters
|
|
136
|
+
try:
|
|
137
|
+
completed_tasks = task.list(
|
|
138
|
+
status=TaskStatus.COMPLETED,
|
|
139
|
+
cluster_name="slurm-cn",
|
|
140
|
+
page=1,
|
|
141
|
+
page_size=20
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
# Get task details
|
|
145
|
+
if completed_tasks is not None and len(completed_tasks.tasks) > 0:
|
|
146
|
+
print("==== completed_tasks number ====", len(completed_tasks.tasks))
|
|
147
|
+
task_info = task.get(task_id=completed_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
148
|
+
print("==== task_info ====", task_info)
|
|
149
|
+
else:
|
|
150
|
+
print("==== no completed tasks to get details ====")
|
|
151
|
+
except Exception as e:
|
|
152
|
+
print("==== get task details failed error ====", e)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# Cancel a running task
|
|
156
|
+
try:
|
|
157
|
+
running_tasks = task.list(
|
|
158
|
+
status=TaskStatus.RUNNING,
|
|
159
|
+
cluster_name="slurm-cn",
|
|
160
|
+
page=1,
|
|
161
|
+
page_size=20
|
|
162
|
+
)
|
|
163
|
+
if running_tasks is not None and len(running_tasks.tasks) > 0:
|
|
164
|
+
print("==== running_tasks number ====", len(running_tasks.tasks))
|
|
165
|
+
# Cancel a task
|
|
166
|
+
result = task.cancel(task_id=running_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
167
|
+
print("==== task cancelled ====", running_tasks.tasks[0].job_id, result)
|
|
168
|
+
else:
|
|
169
|
+
print("==== no running tasks to cancel ====")
|
|
170
|
+
except Exception as e:
|
|
171
|
+
print("==== cancel running task failed error ====", e)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
# Delete a task
|
|
175
|
+
try:
|
|
176
|
+
completed_tasks = task.list(
|
|
177
|
+
status=TaskStatus.COMPLETED,
|
|
178
|
+
cluster_name="slurm-cn",
|
|
179
|
+
page=1,
|
|
180
|
+
page_size=20
|
|
181
|
+
)
|
|
182
|
+
if completed_tasks is not None and len(completed_tasks.tasks) > 0:
|
|
183
|
+
print("==== completed_tasks number ====", len(completed_tasks.tasks))
|
|
184
|
+
# Delete a task
|
|
185
|
+
result = task.delete(task_id=completed_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
186
|
+
print("==== task deleted ====", completed_tasks.tasks[0].job_id, result)
|
|
187
|
+
else:
|
|
188
|
+
print("==== no completed tasks to delete ====")
|
|
189
|
+
except Exception as e:
|
|
190
|
+
print("==== delete completed task failed error ====", e)
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
**Task Management Methods:**
|
|
194
|
+
|
|
195
|
+
- `submit()` - Submit a new task with container image and entry command
|
|
196
|
+
- `get()` - Get task details by task ID
|
|
197
|
+
- `list()` - List tasks with optional filters (status, cluster_name, team_id, user_id)
|
|
198
|
+
- `cancel()` - Cancel a running task
|
|
199
|
+
- `delete()` - Delete a task record
|
|
200
|
+
|
|
201
|
+
**Task Status Values:**
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
205
|
+
|
|
206
|
+
TaskStatus.PENDING # Task is pending
|
|
207
|
+
TaskStatus.QUEUED # Task is queued
|
|
208
|
+
TaskStatus.RUNNING # Task is running
|
|
209
|
+
TaskStatus.COMPLETED # Task completed successfully
|
|
210
|
+
TaskStatus.SUCCEEDED # Task succeeded
|
|
211
|
+
TaskStatus.FAILED # Task failed
|
|
212
|
+
TaskStatus.CANCELLED # Task was cancelled
|
|
213
|
+
TaskStatus.CREATED # Task was created
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
**Error Handling:**
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
from mlops.exceptions import (
|
|
220
|
+
APIException,
|
|
221
|
+
AuthenticationException,
|
|
222
|
+
NotFoundException,
|
|
223
|
+
RateLimitException,
|
|
224
|
+
TimeoutException,
|
|
225
|
+
InvalidArgumentException,
|
|
226
|
+
NotEnoughSpaceException
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
try:
|
|
230
|
+
result = task.submit(name="test", cluster_name="slurm-cn", command="echo hello")
|
|
231
|
+
except AuthenticationException as e:
|
|
232
|
+
print(f"Authentication failed: {e}")
|
|
233
|
+
except NotFoundException as e:
|
|
234
|
+
print(f"Resource not found: {e}")
|
|
235
|
+
except APIException as e:
|
|
236
|
+
print(f"API error: {e}")
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
> [!TIP] Error Handling
|
|
240
|
+
> SDKs automatically handle common errors and retry failed requests. Check the SDK documentation for error-handling best practices.
|
|
241
|
+
|
|
242
|
+
## Features
|
|
243
|
+
|
|
244
|
+
- Type-safe API clients
|
|
245
|
+
- Automatic authentication
|
|
246
|
+
- Error handling
|
|
247
|
+
- Request retry logic
|
|
248
|
+
- Response validation
|
|
249
|
+
|
|
250
|
+
## Resources
|
|
251
|
+
|
|
252
|
+
- [Python SDK Documentation](https://github.com/xcloud-service/xservice/tree/main/client/python-sdk)
|
|
253
|
+
- [API Reference](https://xcloud-service.com/docs/api)
|
|
254
|
+
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
# SDK
|
|
2
|
+
|
|
3
|
+
Software Development Kits for integrating with the XCloud Service API.
|
|
4
|
+
|
|
5
|
+
> [!NOTE] SDK Support
|
|
6
|
+
> SDKs provide type-safe, high-level interfaces for interacting with the platform API. They handle authentication, error handling, and request retries automatically.
|
|
7
|
+
|
|
8
|
+
## Available SDKs
|
|
9
|
+
|
|
10
|
+
### Python SDK
|
|
11
|
+
|
|
12
|
+
### Installation
|
|
13
|
+
|
|
14
|
+
Install the Python SDK with pip:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install mlops-python-sdk
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
### Configuration
|
|
21
|
+
|
|
22
|
+
The SDK reads configuration from environment variables by default:
|
|
23
|
+
|
|
24
|
+
- `MLOPS_API_KEY`: API key (required)
|
|
25
|
+
- `MLOPS_DOMAIN`: API domain, e.g. `localhost:8090` or `https://example.com`
|
|
26
|
+
- `MLOPS_API_PATH`: API path prefix (default: `/api/v1`)
|
|
27
|
+
- `MLOPS_DEBUG`: `true|false` (default: `false`)
|
|
28
|
+
|
|
29
|
+
Or configure in code:
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from mlops import ConnectionConfig, Task
|
|
33
|
+
|
|
34
|
+
config = ConnectionConfig(
|
|
35
|
+
api_key="xck_...",
|
|
36
|
+
domain="https://example.com",
|
|
37
|
+
api_path="/api/v1",
|
|
38
|
+
debug=False,
|
|
39
|
+
)
|
|
40
|
+
task = Task(config=config)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### Usage
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from mlops import Task
|
|
47
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
48
|
+
from pathlib import Path
|
|
49
|
+
|
|
50
|
+
# Initialize Task client (uses environment variables by default)
|
|
51
|
+
task = Task()
|
|
52
|
+
|
|
53
|
+
# Submit a task with gpu type
|
|
54
|
+
try:
|
|
55
|
+
result = task.submit(
|
|
56
|
+
name="gpu-task-from-sdk",
|
|
57
|
+
image="/mnt/minio/images/01ai-registry.cn-shanghai.cr.aliyuncs.com+public+llamafactory+0.9.3.sqsh",
|
|
58
|
+
entry_command="llamafactory-cli train /workspace/config/test_lora.yaml",
|
|
59
|
+
resources={
|
|
60
|
+
"partition": "gpu",
|
|
61
|
+
"nodes": 2,
|
|
62
|
+
"ntasks": 2,
|
|
63
|
+
"cpus_per_task": 2,
|
|
64
|
+
"memory": "4G",
|
|
65
|
+
"time": "01:00:00",
|
|
66
|
+
"gres": "gpu:nvidia_a10:1",
|
|
67
|
+
"qos": "qos_xcloud",
|
|
68
|
+
},
|
|
69
|
+
cluster_name="slurm-cn",
|
|
70
|
+
team_id=1,
|
|
71
|
+
file_path="your file path", # optional, support for .zip, .tar.gz, .tgz
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
if result is not None:
|
|
75
|
+
print("==== gpu task submitted successfully ====")
|
|
76
|
+
job_id = result.job_id
|
|
77
|
+
else:
|
|
78
|
+
print("==== gpu task submitted failed ====")
|
|
79
|
+
except Exception as e:
|
|
80
|
+
print("==== gpu task submitted failed error ====", e)
|
|
81
|
+
|
|
82
|
+
# Submit a task with cpu type
|
|
83
|
+
try:
|
|
84
|
+
entry_content = Path("entry.sh").read_text(encoding="utf-8")
|
|
85
|
+
result = task.submit(
|
|
86
|
+
name="cpu-task-from-sdk",
|
|
87
|
+
image="docker://01ai-registry.cn-shanghai.cr.aliyuncs.com/01-ai/xcs/v2/alpine:3.23.0",
|
|
88
|
+
entry_command=entry_content,
|
|
89
|
+
resources={
|
|
90
|
+
"partition": "cpu",
|
|
91
|
+
"nodes": 1,
|
|
92
|
+
"ntasks": 1,
|
|
93
|
+
"cpus_per_task": 1,
|
|
94
|
+
"memory": "1G",
|
|
95
|
+
"time": "01:00:00",
|
|
96
|
+
"qos": "qos_xcloud",
|
|
97
|
+
},
|
|
98
|
+
cluster_name="slurm-cn",
|
|
99
|
+
team_id=1,
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
if result is not None:
|
|
103
|
+
print("==== cpu task submitted successfully ====")
|
|
104
|
+
job_id = result.job_id
|
|
105
|
+
else:
|
|
106
|
+
print("==== cpu task submitted failed ====")
|
|
107
|
+
except Exception as e:
|
|
108
|
+
print("==== cpu task submitted failed error ====", e)
|
|
109
|
+
|
|
110
|
+
# List tasks with filters
|
|
111
|
+
try:
|
|
112
|
+
completed_tasks = task.list(
|
|
113
|
+
status=TaskStatus.COMPLETED,
|
|
114
|
+
cluster_name="slurm-cn",
|
|
115
|
+
page=1,
|
|
116
|
+
page_size=20
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
# Get task details
|
|
120
|
+
if completed_tasks is not None and len(completed_tasks.tasks) > 0:
|
|
121
|
+
print("==== completed_tasks number ====", len(completed_tasks.tasks))
|
|
122
|
+
task_info = task.get(task_id=completed_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
123
|
+
print("==== task_info ====", task_info)
|
|
124
|
+
else:
|
|
125
|
+
print("==== no completed tasks to get details ====")
|
|
126
|
+
except Exception as e:
|
|
127
|
+
print("==== get task details failed error ====", e)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
# Cancel a running task
|
|
131
|
+
try:
|
|
132
|
+
running_tasks = task.list(
|
|
133
|
+
status=TaskStatus.RUNNING,
|
|
134
|
+
cluster_name="slurm-cn",
|
|
135
|
+
page=1,
|
|
136
|
+
page_size=20
|
|
137
|
+
)
|
|
138
|
+
if running_tasks is not None and len(running_tasks.tasks) > 0:
|
|
139
|
+
print("==== running_tasks number ====", len(running_tasks.tasks))
|
|
140
|
+
# Cancel a task
|
|
141
|
+
result = task.cancel(task_id=running_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
142
|
+
print("==== task cancelled ====", running_tasks.tasks[0].job_id, result)
|
|
143
|
+
else:
|
|
144
|
+
print("==== no running tasks to cancel ====")
|
|
145
|
+
except Exception as e:
|
|
146
|
+
print("==== cancel running task failed error ====", e)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# Delete a task
|
|
150
|
+
try:
|
|
151
|
+
completed_tasks = task.list(
|
|
152
|
+
status=TaskStatus.COMPLETED,
|
|
153
|
+
cluster_name="slurm-cn",
|
|
154
|
+
page=1,
|
|
155
|
+
page_size=20
|
|
156
|
+
)
|
|
157
|
+
if completed_tasks is not None and len(completed_tasks.tasks) > 0:
|
|
158
|
+
print("==== completed_tasks number ====", len(completed_tasks.tasks))
|
|
159
|
+
# Delete a task
|
|
160
|
+
result = task.delete(task_id=completed_tasks.tasks[0].job_id, cluster_name="slurm-cn")
|
|
161
|
+
print("==== task deleted ====", completed_tasks.tasks[0].job_id, result)
|
|
162
|
+
else:
|
|
163
|
+
print("==== no completed tasks to delete ====")
|
|
164
|
+
except Exception as e:
|
|
165
|
+
print("==== delete completed task failed error ====", e)
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
**Task Management Methods:**
|
|
169
|
+
|
|
170
|
+
- `submit()` - Submit a new task with container image and entry command
|
|
171
|
+
- `get()` - Get task details by task ID
|
|
172
|
+
- `list()` - List tasks with optional filters (status, cluster_name, team_id, user_id)
|
|
173
|
+
- `cancel()` - Cancel a running task
|
|
174
|
+
- `delete()` - Delete a task record
|
|
175
|
+
|
|
176
|
+
**Task Status Values:**
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
180
|
+
|
|
181
|
+
TaskStatus.PENDING # Task is pending
|
|
182
|
+
TaskStatus.QUEUED # Task is queued
|
|
183
|
+
TaskStatus.RUNNING # Task is running
|
|
184
|
+
TaskStatus.COMPLETED # Task completed successfully
|
|
185
|
+
TaskStatus.SUCCEEDED # Task succeeded
|
|
186
|
+
TaskStatus.FAILED # Task failed
|
|
187
|
+
TaskStatus.CANCELLED # Task was cancelled
|
|
188
|
+
TaskStatus.CREATED # Task was created
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
**Error Handling:**
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
from mlops.exceptions import (
|
|
195
|
+
APIException,
|
|
196
|
+
AuthenticationException,
|
|
197
|
+
NotFoundException,
|
|
198
|
+
RateLimitException,
|
|
199
|
+
TimeoutException,
|
|
200
|
+
InvalidArgumentException,
|
|
201
|
+
NotEnoughSpaceException
|
|
202
|
+
)
|
|
203
|
+
|
|
204
|
+
try:
|
|
205
|
+
result = task.submit(name="test", cluster_name="slurm-cn", command="echo hello")
|
|
206
|
+
except AuthenticationException as e:
|
|
207
|
+
print(f"Authentication failed: {e}")
|
|
208
|
+
except NotFoundException as e:
|
|
209
|
+
print(f"Resource not found: {e}")
|
|
210
|
+
except APIException as e:
|
|
211
|
+
print(f"API error: {e}")
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
> [!TIP] Error Handling
|
|
215
|
+
> SDKs automatically handle common errors and retry failed requests. Check the SDK documentation for error-handling best practices.
|
|
216
|
+
|
|
217
|
+
## Features
|
|
218
|
+
|
|
219
|
+
- Type-safe API clients
|
|
220
|
+
- Automatic authentication
|
|
221
|
+
- Error handling
|
|
222
|
+
- Request retry logic
|
|
223
|
+
- Response validation
|
|
224
|
+
|
|
225
|
+
## Resources
|
|
226
|
+
|
|
227
|
+
- [Python SDK Documentation](https://github.com/xcloud-service/xservice/tree/main/client/python-sdk)
|
|
228
|
+
- [API Reference](https://xcloud-service.com/docs/api)
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
MLOps Python SDK for accessing MLOps API.
|
|
3
3
|
|
|
4
4
|
This package provides a client library for interacting with the XCloud Service API.
|
|
5
5
|
"""
|
|
@@ -10,7 +10,7 @@ from .connection_config import (
|
|
|
10
10
|
ProxyTypes,
|
|
11
11
|
)
|
|
12
12
|
from .exceptions import (
|
|
13
|
-
|
|
13
|
+
MLOpsException,
|
|
14
14
|
TimeoutException,
|
|
15
15
|
NotFoundException,
|
|
16
16
|
AuthenticationException,
|
|
@@ -29,7 +29,7 @@ __all__ = [
|
|
|
29
29
|
"ConnectionConfig",
|
|
30
30
|
"ProxyTypes",
|
|
31
31
|
# Exceptions
|
|
32
|
-
"
|
|
32
|
+
"MLOpsException",
|
|
33
33
|
"TimeoutException",
|
|
34
34
|
"NotFoundException",
|
|
35
35
|
"AuthenticationException",
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
from http import HTTPStatus
|
|
2
|
+
from typing import Any, Optional, Union
|
|
3
|
+
|
|
4
|
+
import httpx
|
|
5
|
+
|
|
6
|
+
from ... import errors
|
|
7
|
+
from ...client import AuthenticatedClient, Client
|
|
8
|
+
from ...models.error_response import ErrorResponse
|
|
9
|
+
from ...models.get_storage_presign_download_response_200 import GetStoragePresignDownloadResponse200
|
|
10
|
+
from ...types import UNSET, Response
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _get_kwargs(
|
|
14
|
+
*,
|
|
15
|
+
key: str,
|
|
16
|
+
) -> dict[str, Any]:
|
|
17
|
+
params: dict[str, Any] = {}
|
|
18
|
+
|
|
19
|
+
params["key"] = key
|
|
20
|
+
|
|
21
|
+
params = {k: v for k, v in params.items() if v is not UNSET and v is not None}
|
|
22
|
+
|
|
23
|
+
_kwargs: dict[str, Any] = {
|
|
24
|
+
"method": "get",
|
|
25
|
+
"url": "/storage/presign_download",
|
|
26
|
+
"params": params,
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
return _kwargs
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _parse_response(
|
|
33
|
+
*, client: Union[AuthenticatedClient, Client], response: httpx.Response
|
|
34
|
+
) -> Optional[Union[ErrorResponse, GetStoragePresignDownloadResponse200]]:
|
|
35
|
+
if response.status_code == 200:
|
|
36
|
+
response_200 = GetStoragePresignDownloadResponse200.from_dict(response.json())
|
|
37
|
+
|
|
38
|
+
return response_200
|
|
39
|
+
if response.status_code == 400:
|
|
40
|
+
response_400 = ErrorResponse.from_dict(response.json())
|
|
41
|
+
|
|
42
|
+
return response_400
|
|
43
|
+
if response.status_code == 500:
|
|
44
|
+
response_500 = ErrorResponse.from_dict(response.json())
|
|
45
|
+
|
|
46
|
+
return response_500
|
|
47
|
+
if client.raise_on_unexpected_status:
|
|
48
|
+
raise errors.UnexpectedStatus(response.status_code, response.content)
|
|
49
|
+
else:
|
|
50
|
+
return None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _build_response(
|
|
54
|
+
*, client: Union[AuthenticatedClient, Client], response: httpx.Response
|
|
55
|
+
) -> Response[Union[ErrorResponse, GetStoragePresignDownloadResponse200]]:
|
|
56
|
+
return Response(
|
|
57
|
+
status_code=HTTPStatus(response.status_code),
|
|
58
|
+
content=response.content,
|
|
59
|
+
headers=response.headers,
|
|
60
|
+
parsed=_parse_response(client=client, response=response),
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def sync_detailed(
|
|
65
|
+
*,
|
|
66
|
+
client: AuthenticatedClient,
|
|
67
|
+
key: str,
|
|
68
|
+
) -> Response[Union[ErrorResponse, GetStoragePresignDownloadResponse200]]:
|
|
69
|
+
"""Get presigned URL for file download
|
|
70
|
+
|
|
71
|
+
Generates a presigned URL for downloading a file from S3.
|
|
72
|
+
|
|
73
|
+
Args:
|
|
74
|
+
key (str):
|
|
75
|
+
|
|
76
|
+
Raises:
|
|
77
|
+
errors.UnexpectedStatus: If the server returns an undocumented status code and Client.raise_on_unexpected_status is True.
|
|
78
|
+
httpx.TimeoutException: If the request takes longer than Client.timeout.
|
|
79
|
+
|
|
80
|
+
Returns:
|
|
81
|
+
Response[Union[ErrorResponse, GetStoragePresignDownloadResponse200]]
|
|
82
|
+
"""
|
|
83
|
+
|
|
84
|
+
kwargs = _get_kwargs(
|
|
85
|
+
key=key,
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
response = client.get_httpx_client().request(
|
|
89
|
+
**kwargs,
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
return _build_response(client=client, response=response)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def sync(
|
|
96
|
+
*,
|
|
97
|
+
client: AuthenticatedClient,
|
|
98
|
+
key: str,
|
|
99
|
+
) -> Optional[Union[ErrorResponse, GetStoragePresignDownloadResponse200]]:
|
|
100
|
+
"""Get presigned URL for file download
|
|
101
|
+
|
|
102
|
+
Generates a presigned URL for downloading a file from S3.
|
|
103
|
+
|
|
104
|
+
Args:
|
|
105
|
+
key (str):
|
|
106
|
+
|
|
107
|
+
Raises:
|
|
108
|
+
errors.UnexpectedStatus: If the server returns an undocumented status code and Client.raise_on_unexpected_status is True.
|
|
109
|
+
httpx.TimeoutException: If the request takes longer than Client.timeout.
|
|
110
|
+
|
|
111
|
+
Returns:
|
|
112
|
+
Union[ErrorResponse, GetStoragePresignDownloadResponse200]
|
|
113
|
+
"""
|
|
114
|
+
|
|
115
|
+
return sync_detailed(
|
|
116
|
+
client=client,
|
|
117
|
+
key=key,
|
|
118
|
+
).parsed
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
async def asyncio_detailed(
|
|
122
|
+
*,
|
|
123
|
+
client: AuthenticatedClient,
|
|
124
|
+
key: str,
|
|
125
|
+
) -> Response[Union[ErrorResponse, GetStoragePresignDownloadResponse200]]:
|
|
126
|
+
"""Get presigned URL for file download
|
|
127
|
+
|
|
128
|
+
Generates a presigned URL for downloading a file from S3.
|
|
129
|
+
|
|
130
|
+
Args:
|
|
131
|
+
key (str):
|
|
132
|
+
|
|
133
|
+
Raises:
|
|
134
|
+
errors.UnexpectedStatus: If the server returns an undocumented status code and Client.raise_on_unexpected_status is True.
|
|
135
|
+
httpx.TimeoutException: If the request takes longer than Client.timeout.
|
|
136
|
+
|
|
137
|
+
Returns:
|
|
138
|
+
Response[Union[ErrorResponse, GetStoragePresignDownloadResponse200]]
|
|
139
|
+
"""
|
|
140
|
+
|
|
141
|
+
kwargs = _get_kwargs(
|
|
142
|
+
key=key,
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
response = await client.get_async_httpx_client().request(**kwargs)
|
|
146
|
+
|
|
147
|
+
return _build_response(client=client, response=response)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
async def asyncio(
|
|
151
|
+
*,
|
|
152
|
+
client: AuthenticatedClient,
|
|
153
|
+
key: str,
|
|
154
|
+
) -> Optional[Union[ErrorResponse, GetStoragePresignDownloadResponse200]]:
|
|
155
|
+
"""Get presigned URL for file download
|
|
156
|
+
|
|
157
|
+
Generates a presigned URL for downloading a file from S3.
|
|
158
|
+
|
|
159
|
+
Args:
|
|
160
|
+
key (str):
|
|
161
|
+
|
|
162
|
+
Raises:
|
|
163
|
+
errors.UnexpectedStatus: If the server returns an undocumented status code and Client.raise_on_unexpected_status is True.
|
|
164
|
+
httpx.TimeoutException: If the request takes longer than Client.timeout.
|
|
165
|
+
|
|
166
|
+
Returns:
|
|
167
|
+
Union[ErrorResponse, GetStoragePresignDownloadResponse200]
|
|
168
|
+
"""
|
|
169
|
+
|
|
170
|
+
return (
|
|
171
|
+
await asyncio_detailed(
|
|
172
|
+
client=client,
|
|
173
|
+
key=key,
|
|
174
|
+
)
|
|
175
|
+
).parsed
|