mlops-python-sdk 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlops/api/client/models/task_submit_request.py +44 -0
- mlops/connection_config.py +2 -2
- mlops/task/task.py +144 -33
- mlops_python_sdk-1.0.3.dist-info/METADATA +235 -0
- {mlops_python_sdk-1.0.1.dist-info → mlops_python_sdk-1.0.3.dist-info}/RECORD +6 -6
- mlops_python_sdk-1.0.1.dist-info/METADATA +0 -407
- {mlops_python_sdk-1.0.1.dist-info → mlops_python_sdk-1.0.3.dist-info}/WHEEL +0 -0
|
@@ -29,12 +29,18 @@ class TaskSubmitRequest:
|
|
|
29
29
|
cpus_per_task (Union[None, Unset, int]): CPUs per task Example: 1.
|
|
30
30
|
dependency (Union[None, Unset, str]): Job dependencies Example: afterok:12345.
|
|
31
31
|
distribution (Union[None, Unset, str]): Task distribution Example: block.
|
|
32
|
+
entry_command (Union[None, Unset, str]): Container entry command/script (bash snippet) executed inside the
|
|
33
|
+
container. The platform runs it under /workspace.
|
|
34
|
+
Example: python -V && ls -la.
|
|
32
35
|
environment (Union['TaskSubmitRequestEnvironmentType0', None, Unset]): Environment variables as key-value pairs
|
|
33
36
|
Example: {'CUDA_VISIBLE_DEVICES': '0,1', 'PYTHONPATH': '/opt/python/lib'}.
|
|
34
37
|
error (Union[None, Unset, str]): Standard error file pattern Example: error_%j.log.
|
|
35
38
|
exclude (Union[None, Unset, str]): Nodes to exclude
|
|
36
39
|
export (Union[None, Unset, str]): Environment export Example: ALL.
|
|
37
40
|
gres (Union[None, Unset, str]): Generic resources (e.g., "gpu:1", "gpu:tesla:2") Example: gpu:1.
|
|
41
|
+
image (Union[None, Unset, str]): Container image reference. Can be a Slurm container plugin supported reference
|
|
42
|
+
(e.g. "docker://..."), or a registry reference which will be mapped to a local .sqsh image path by the platform.
|
|
43
|
+
Example: 01ai-registry.cn-shanghai.cr.aliyuncs.com/public/llamafactory:0.9.3.
|
|
38
44
|
input_ (Union[None, Unset, str]): Standard input file
|
|
39
45
|
job_spec (Union[Unset, JobSpec]): Domain-specific job specification (rendered into slurm script)
|
|
40
46
|
mem_bind (Union[None, Unset, str]): Memory binding
|
|
@@ -65,11 +71,13 @@ class TaskSubmitRequest:
|
|
|
65
71
|
cpus_per_task: Union[None, Unset, int] = UNSET
|
|
66
72
|
dependency: Union[None, Unset, str] = UNSET
|
|
67
73
|
distribution: Union[None, Unset, str] = UNSET
|
|
74
|
+
entry_command: Union[None, Unset, str] = UNSET
|
|
68
75
|
environment: Union["TaskSubmitRequestEnvironmentType0", None, Unset] = UNSET
|
|
69
76
|
error: Union[None, Unset, str] = UNSET
|
|
70
77
|
exclude: Union[None, Unset, str] = UNSET
|
|
71
78
|
export: Union[None, Unset, str] = UNSET
|
|
72
79
|
gres: Union[None, Unset, str] = UNSET
|
|
80
|
+
image: Union[None, Unset, str] = UNSET
|
|
73
81
|
input_: Union[None, Unset, str] = UNSET
|
|
74
82
|
job_spec: Union[Unset, "JobSpec"] = UNSET
|
|
75
83
|
mem_bind: Union[None, Unset, str] = UNSET
|
|
@@ -143,6 +151,12 @@ class TaskSubmitRequest:
|
|
|
143
151
|
else:
|
|
144
152
|
distribution = self.distribution
|
|
145
153
|
|
|
154
|
+
entry_command: Union[None, Unset, str]
|
|
155
|
+
if isinstance(self.entry_command, Unset):
|
|
156
|
+
entry_command = UNSET
|
|
157
|
+
else:
|
|
158
|
+
entry_command = self.entry_command
|
|
159
|
+
|
|
146
160
|
environment: Union[None, Unset, dict[str, Any]]
|
|
147
161
|
if isinstance(self.environment, Unset):
|
|
148
162
|
environment = UNSET
|
|
@@ -175,6 +189,12 @@ class TaskSubmitRequest:
|
|
|
175
189
|
else:
|
|
176
190
|
gres = self.gres
|
|
177
191
|
|
|
192
|
+
image: Union[None, Unset, str]
|
|
193
|
+
if isinstance(self.image, Unset):
|
|
194
|
+
image = UNSET
|
|
195
|
+
else:
|
|
196
|
+
image = self.image
|
|
197
|
+
|
|
178
198
|
input_: Union[None, Unset, str]
|
|
179
199
|
if isinstance(self.input_, Unset):
|
|
180
200
|
input_ = UNSET
|
|
@@ -289,6 +309,8 @@ class TaskSubmitRequest:
|
|
|
289
309
|
field_dict["dependency"] = dependency
|
|
290
310
|
if distribution is not UNSET:
|
|
291
311
|
field_dict["distribution"] = distribution
|
|
312
|
+
if entry_command is not UNSET:
|
|
313
|
+
field_dict["entry_command"] = entry_command
|
|
292
314
|
if environment is not UNSET:
|
|
293
315
|
field_dict["environment"] = environment
|
|
294
316
|
if error is not UNSET:
|
|
@@ -299,6 +321,8 @@ class TaskSubmitRequest:
|
|
|
299
321
|
field_dict["export"] = export
|
|
300
322
|
if gres is not UNSET:
|
|
301
323
|
field_dict["gres"] = gres
|
|
324
|
+
if image is not UNSET:
|
|
325
|
+
field_dict["image"] = image
|
|
302
326
|
if input_ is not UNSET:
|
|
303
327
|
field_dict["input"] = input_
|
|
304
328
|
if job_spec is not UNSET:
|
|
@@ -416,6 +440,15 @@ class TaskSubmitRequest:
|
|
|
416
440
|
|
|
417
441
|
distribution = _parse_distribution(d.pop("distribution", UNSET))
|
|
418
442
|
|
|
443
|
+
def _parse_entry_command(data: object) -> Union[None, Unset, str]:
    """Pass the raw ``entry_command`` value through with the field's declared type.

    ``None`` and ``Unset`` instances are returned as they are; any other value
    is returned unchanged as well, only re-typed for the type checker via
    ``cast`` (no runtime validation or conversion takes place).
    """
    if data is None or isinstance(data, Unset):
        return data
    return cast(Union[None, Unset, str], data)
|
|
449
|
+
|
|
450
|
+
entry_command = _parse_entry_command(d.pop("entry_command", UNSET))
|
|
451
|
+
|
|
419
452
|
def _parse_environment(data: object) -> Union["TaskSubmitRequestEnvironmentType0", None, Unset]:
|
|
420
453
|
if data is None:
|
|
421
454
|
return data
|
|
@@ -469,6 +502,15 @@ class TaskSubmitRequest:
|
|
|
469
502
|
|
|
470
503
|
gres = _parse_gres(d.pop("gres", UNSET))
|
|
471
504
|
|
|
505
|
+
def _parse_image(data: object) -> Union[None, Unset, str]:
    """Pass the raw ``image`` value through with the field's declared type.

    ``None`` and ``Unset`` instances are returned as they are; any other value
    is returned unchanged as well, only re-typed for the type checker via
    ``cast`` (no runtime validation or conversion takes place).
    """
    if data is None or isinstance(data, Unset):
        return data
    return cast(Union[None, Unset, str], data)
|
|
511
|
+
|
|
512
|
+
image = _parse_image(d.pop("image", UNSET))
|
|
513
|
+
|
|
472
514
|
def _parse_input_(data: object) -> Union[None, Unset, str]:
|
|
473
515
|
if data is None:
|
|
474
516
|
return data
|
|
@@ -615,11 +657,13 @@ class TaskSubmitRequest:
|
|
|
615
657
|
cpus_per_task=cpus_per_task,
|
|
616
658
|
dependency=dependency,
|
|
617
659
|
distribution=distribution,
|
|
660
|
+
entry_command=entry_command,
|
|
618
661
|
environment=environment,
|
|
619
662
|
error=error,
|
|
620
663
|
exclude=exclude,
|
|
621
664
|
export=export,
|
|
622
665
|
gres=gres,
|
|
666
|
+
image=image,
|
|
623
667
|
input_=input_,
|
|
624
668
|
job_spec=job_spec,
|
|
625
669
|
mem_bind=mem_bind,
|
mlops/connection_config.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import os
|
|
2
2
|
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import Optional, Dict
|
|
4
4
|
from httpx._types import ProxyTypes
|
|
5
5
|
|
|
6
|
-
REQUEST_TIMEOUT: float =
|
|
6
|
+
REQUEST_TIMEOUT: float = 120.0 # 120 seconds
|
|
7
7
|
|
|
8
8
|
KEEPALIVE_PING_INTERVAL_SEC = 50 # 50 seconds
|
|
9
9
|
KEEPALIVE_PING_HEADER = "Keepalive-Ping-Interval"
|
mlops/task/task.py
CHANGED
|
@@ -6,6 +6,9 @@ This module provides a convenient interface for managing tasks through the MLOps
|
|
|
6
6
|
|
|
7
7
|
import json
|
|
8
8
|
import os
|
|
9
|
+
import sys
|
|
10
|
+
import threading
|
|
11
|
+
import time
|
|
9
12
|
from http import HTTPStatus
|
|
10
13
|
from pathlib import Path
|
|
11
14
|
from typing import Optional
|
|
@@ -55,13 +58,109 @@ def _validate_archive_file_path(file_path: str) -> Path:
|
|
|
55
58
|
|
|
56
59
|
|
|
57
60
|
def _upload_file_to_presigned_url(url: str, file_path: Path, timeout: Optional[float]) -> None:
|
|
61
|
+
def _format_bytes_iec(n: int) -> str:
    """Format a byte count using IEC binary units (KiB, MiB, GiB, TiB, PiB).

    Args:
        n: Number of bytes.

    Returns:
        ``"<n>B"`` for values below 1024, otherwise the value scaled by powers
        of 1024 with one decimal place and the matching suffix. Values beyond
        the PiB range stay expressed in PiB.
    """
    if n < 1024:
        return f"{n}B"

    unit = 1024.0
    suffixes = ("KiB", "MiB", "GiB", "TiB", "PiB")
    last = len(suffixes) - 1

    # n >= 1024 here, so the value is always scaled at least once.
    value = n / unit
    index = 0
    while value >= unit and index < last:
        value /= unit
        index += 1

    # A value just under the next unit (e.g. 1023.96 KiB) would otherwise be
    # printed as "1024.0KiB" once rounded to one decimal; promote it instead.
    if index < last and round(value, 1) >= unit:
        value /= unit
        index += 1

    return f"{value:.1f}{suffixes[index]}"
|
|
72
|
+
|
|
73
|
+
def _render_bar(done: int, total: int, width: int = 28) -> str:
    """Render a fixed-width text progress bar such as ``"=====>    "``.

    Args:
        done: Amount completed; clamped into the range ``[0, total]``.
        total: Amount representing 100%.
        width: Number of characters in the bar.

    Returns:
        ``">"`` alone when ``total <= 0`` or ``width <= 1``; ``width`` ``"="``
        characters when the bar is full; otherwise the filled part, a ``">"``
        head and space padding up to ``width`` characters.
    """
    if total <= 0 or width <= 1:
        return ">"

    clamped = max(0, min(done, total))
    filled = int(width * (clamped / total))
    if filled >= width:
        return "=" * width

    # Covers the empty bar too: zero "=" characters, then the head and padding.
    return ("=" * filled + ">").ljust(width)
|
|
83
|
+
|
|
84
|
+
def _format_elapsed_seconds(start: float) -> str:
    """Return the whole seconds elapsed since ``start`` as ``"<n>s"``.

    ``start`` is compared against ``time.monotonic()``; a start value that lies
    in the future is reported as ``"0s"``.
    """
    delta = time.monotonic() - start
    elapsed = delta if delta > 0.0 else 0.0
    return f"{int(elapsed)}s"
|
|
87
|
+
|
|
88
|
+
class _ProgressIterable:
    """Iterate over a file-like object in chunks while printing upload progress.

    Iterating yields successive reads of ``chunk_size`` from ``f``. While the
    iteration is active, a daemon thread prints a progress line to
    ``sys.stdout`` whenever the elapsed whole-second count changes: rewritten
    in place with a carriage return when stdout is a TTY, printed as a new
    line each time otherwise.

    Args:
        f: File-like object whose ``read(size)`` returns sized chunks and a
            falsy value at end of input.
        total: Expected total number of bytes; negative values are treated as 0.
        name: Label shown in the progress line.
        chunk_size: Size requested per read; values below 1 are raised to 1.
    """

    def __init__(self, f, total: int, name: str, chunk_size: int = 64 * 1024):
        self._f = f  # file-like object
        self._total = max(0, int(total))
        self._name = name
        self._chunk_size = max(1, int(chunk_size))
        self._read = 0
        self._start = time.monotonic()
        self._completed = False
        self._out = sys.stdout
        try:
            self._is_tty = bool(self._out.isatty())
        except Exception:
            # Best effort: any stdout object that cannot answer isatty() is
            # treated as a non-TTY stream.
            self._is_tty = False

    def _render_line(self, display_read: int) -> str:
        """Build the progress line for ``display_read`` bytes (clamped to the total)."""
        display_read = max(0, min(int(display_read), self._total))
        pct = (display_read / self._total) * 100.0 if self._total > 0 else 0.0
        bar = _render_bar(display_read, self._total, width=28)
        elapsed = _format_elapsed_seconds(self._start)
        return (
            f"uploading {self._name} [{bar}] {pct:6.2f}% "
            f"({_format_bytes_iec(display_read)}/{_format_bytes_iec(self._total)}) "
            f"elapsed {elapsed}"
        )

    def _print_line(self, line: str, final: bool = False) -> None:
        """Write one progress line; ``final`` ends the in-place TTY line with a newline."""
        if self._is_tty:
            # Refresh same line in terminal.
            print("\r" + line, end="" if not final else "\n", file=self._out, flush=True)
        else:
            # Always visible in non-TTY environments.
            print(line, file=self._out, flush=True)

    def __iter__(self):
        """Yield chunks read from the file while a ticker thread reports progress.

        When the generator finishes or is closed, the ticker is stopped and one
        final line is printed. That line shows 100% only if the source was read
        to the end; after an error or early close it shows the bytes actually
        read, so an interrupted upload is not reported as complete.
        """
        stop_event = threading.Event()

        def ticker() -> None:
            last_sec = -1
            # Print immediately so users see something right away.
            self._print_line(self._render_line(self._read))
            while not stop_event.is_set():
                sec = int(max(0.0, time.monotonic() - self._start))
                if sec != last_sec:
                    last_sec = sec
                    self._print_line(self._render_line(self._read))
                # check frequently to avoid skipping seconds
                stop_event.wait(0.05)

        t = threading.Thread(target=ticker, name="mlops-upload-progress", daemon=True)
        t.start()
        reached_eof = False
        try:
            while True:
                chunk = self._f.read(self._chunk_size)
                if not chunk:
                    reached_eof = True
                    break
                self._read += len(chunk)
                yield chunk
        finally:
            if reached_eof:
                # Snap to the declared total so the final line reads 100%.
                self._read = self._total
                self._completed = True
            stop_event.set()
            t.join(timeout=0.2)
            self._print_line(self._render_line(self._read), final=True)
|
|
153
|
+
|
|
58
154
|
size = file_path.stat().st_size
|
|
59
155
|
# Use a dedicated client for S3 presigned upload (avoid leaking API auth headers).
|
|
60
156
|
with httpx.Client(timeout=timeout) as client:
|
|
61
157
|
with file_path.open("rb") as f:
|
|
158
|
+
content = f
|
|
159
|
+
if size > 0:
|
|
160
|
+
content = _ProgressIterable(f, total=size, name=file_path.name)
|
|
62
161
|
resp = client.put(
|
|
63
162
|
url,
|
|
64
|
-
content=
|
|
163
|
+
content=content,
|
|
65
164
|
headers={
|
|
66
165
|
"Content-Length": str(size),
|
|
67
166
|
"Content-Type": "application/octet-stream",
|
|
@@ -85,18 +184,25 @@ class Task:
|
|
|
85
184
|
config = ConnectionConfig(api_key="your_api_key")
|
|
86
185
|
task = Task(config=config)
|
|
87
186
|
|
|
88
|
-
# Submit a task with
|
|
89
|
-
result = task.submit(
|
|
90
|
-
name="my-task",
|
|
91
|
-
cluster_name="slurm-cn",
|
|
92
|
-
script="#!/bin/bash\\necho 'Hello World'"
|
|
93
|
-
)
|
|
94
|
-
|
|
95
|
-
# Or submit with command
|
|
187
|
+
# Submit a task with gpu type
|
|
96
188
|
result = task.submit(
|
|
97
|
-
name="
|
|
189
|
+
name="gpu-task-from-sdk",
|
|
98
190
|
cluster_name="slurm-cn",
|
|
99
|
-
|
|
191
|
+
image="/mnt/minio/images/01ai-registry.cn-shanghai.cr.aliyuncs.com+public+llamafactory+0.9.3.sqsh",
|
|
192
|
+
entry_command="llamafactory-cli train /workspace/config/test_lora.yaml",
|
|
193
|
+
resources={
|
|
194
|
+
"partition": "gpu",
|
|
195
|
+
"nodes": 2,
|
|
196
|
+
"ntasks": 2,
|
|
197
|
+
"cpus_per_task": 2,
|
|
198
|
+
"memory": "4G",
|
|
199
|
+
"time": "01:00:00",
|
|
200
|
+
"gres": "gpu:nvidia_a10:1",
|
|
201
|
+
"qos": "qos_xcloud",
|
|
202
|
+
"job_type": "batch",
|
|
203
|
+
},
|
|
204
|
+
team_id=1,
|
|
205
|
+
file_path="your file path",
|
|
100
206
|
)
|
|
101
207
|
|
|
102
208
|
# Get task details
|
|
@@ -151,8 +257,8 @@ class Task:
|
|
|
151
257
|
self,
|
|
152
258
|
name: str,
|
|
153
259
|
cluster_name: str,
|
|
154
|
-
|
|
155
|
-
|
|
260
|
+
image: str,
|
|
261
|
+
entry_command: str,
|
|
156
262
|
resources: Optional[dict] = None,
|
|
157
263
|
team_id: Optional[int] = None,
|
|
158
264
|
file_path: Optional[str] = None,
|
|
@@ -163,11 +269,11 @@ class Task:
|
|
|
163
269
|
Args:
|
|
164
270
|
name: Task name
|
|
165
271
|
cluster_name: Cluster name to submit the task to
|
|
166
|
-
|
|
167
|
-
|
|
272
|
+
image: Container image reference
|
|
273
|
+
entry_command: Container entry command/script
|
|
168
274
|
resources: Resource requirements dict (optional)
|
|
169
275
|
team_id: Team ID (optional)
|
|
170
|
-
|
|
276
|
+
file_path: Local file path to upload (optional, support for .zip, .tar.gz, .tgz)
|
|
171
277
|
Returns:
|
|
172
278
|
TaskSubmitResponse containing the submitted task information
|
|
173
279
|
|
|
@@ -175,25 +281,14 @@ class Task:
|
|
|
175
281
|
APIException: If the API returns an error
|
|
176
282
|
AuthenticationException: If authentication fails
|
|
177
283
|
"""
|
|
178
|
-
# At least one of script or command must be provided
|
|
179
|
-
if not script and not command:
|
|
180
|
-
raise APIException("At least one of 'script' or 'command' must be provided")
|
|
181
|
-
|
|
182
|
-
# Map resources dict to individual fields
|
|
183
|
-
# resources dict can contain: cpu, cpus_per_task, memory, nodes, gres, time, partition, etc.
|
|
184
|
-
|
|
185
284
|
request_kwargs = {
|
|
186
285
|
"name": name,
|
|
187
286
|
"cluster_name": cluster_name,
|
|
287
|
+
"image": image,
|
|
288
|
+
"entry_command": entry_command,
|
|
188
289
|
}
|
|
189
|
-
|
|
190
|
-
#
|
|
191
|
-
# script is Union[Unset, str], so we need to set it or leave as UNSET
|
|
192
|
-
if script:
|
|
193
|
-
request_kwargs["script"] = script
|
|
194
|
-
# command is Union[None, Unset, str], so we can set it or leave as UNSET
|
|
195
|
-
if command:
|
|
196
|
-
request_kwargs["command"] = command
|
|
290
|
+
# Map resources dict to individual fields
|
|
291
|
+
# resources dict can contain: cpu, cpus_per_task, memory, nodes, gres, time, partition, etc.
|
|
197
292
|
|
|
198
293
|
# team_id is Union[None, Unset, int]
|
|
199
294
|
if team_id is not None:
|
|
@@ -203,18 +298,34 @@ class Task:
|
|
|
203
298
|
if resources:
|
|
204
299
|
if "cpu" in resources or "cpus_per_task" in resources:
|
|
205
300
|
request_kwargs["cpus_per_task"] = resources.get("cpus_per_task") or resources.get("cpu")
|
|
301
|
+
else:
|
|
302
|
+
request_kwargs["cpus_per_task"] = 1
|
|
206
303
|
if "memory" in resources:
|
|
207
304
|
request_kwargs["memory"] = resources.get("memory")
|
|
305
|
+
else:
|
|
306
|
+
request_kwargs["memory"] = "1G"
|
|
208
307
|
if "nodes" in resources:
|
|
209
308
|
request_kwargs["nodes"] = resources.get("nodes")
|
|
309
|
+
else:
|
|
310
|
+
request_kwargs["nodes"] = 1
|
|
210
311
|
if "gres" in resources:
|
|
211
312
|
request_kwargs["gres"] = resources.get("gres")
|
|
212
313
|
if "time" in resources:
|
|
213
314
|
request_kwargs["time"] = resources.get("time")
|
|
315
|
+
else:
|
|
316
|
+
request_kwargs["time"] = "01:00:00"
|
|
214
317
|
if "partition" in resources:
|
|
215
318
|
request_kwargs["partition"] = resources.get("partition")
|
|
216
|
-
|
|
217
|
-
request_kwargs["
|
|
319
|
+
else:
|
|
320
|
+
request_kwargs["partition"] = "debug"
|
|
321
|
+
if "qos" in resources:
|
|
322
|
+
request_kwargs["qos"] = resources.get("qos")
|
|
323
|
+
else:
|
|
324
|
+
request_kwargs["qos"] = "qos_xcloud"
|
|
325
|
+
if "ntasks" in resources:
|
|
326
|
+
request_kwargs["ntasks"] = resources.get("ntasks")
|
|
327
|
+
else:
|
|
328
|
+
request_kwargs["ntasks"] = 1
|
|
218
329
|
|
|
219
330
|
if file_path:
|
|
220
331
|
local_path = _validate_archive_file_path(file_path)
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: mlops-python-sdk
|
|
3
|
+
Version: 1.0.3
|
|
4
|
+
Summary: MLOps Python SDK for XCloud Service API
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: mlops
|
|
7
|
+
Author-email: mlops@example.com
|
|
8
|
+
Requires-Python: >=3.9,<4.0
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Dist: attrs (>=23.2.0)
|
|
17
|
+
Requires-Dist: httpx (>=0.27.0,<1.0.0)
|
|
18
|
+
Requires-Dist: packaging (>=24.1)
|
|
19
|
+
Requires-Dist: python-dateutil (>=2.8.2)
|
|
20
|
+
Requires-Dist: typing-extensions (>=4.1.0)
|
|
21
|
+
Project-URL: Bug Tracker, https://github.com/xcloud-service/xservice/issues
|
|
22
|
+
Project-URL: Homepage, https://mlops.cloud/
|
|
23
|
+
Project-URL: Repository, https://github.com/xcloud-service/xservice
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# SDK
|
|
27
|
+
|
|
28
|
+
Software Development Kits for integrating with the XCloud Service API.
|
|
29
|
+
|
|
30
|
+
> [!NOTE] SDK Support
|
|
31
|
+
> SDKs provide type-safe, high-level interfaces for interacting with the platform API. They handle authentication, error handling, and request retries automatically.
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
Install the Python SDK from PyPI:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install mlops-python-sdk
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### Configuration
|
|
43
|
+
|
|
44
|
+
The SDK reads configuration from environment variables by default:
|
|
45
|
+
|
|
46
|
+
- `MLOPS_API_KEY`: API key (required)
|
|
47
|
+
- `MLOPS_DOMAIN`: API domain, e.g. `localhost:8090` or `https://example.com`
|
|
48
|
+
- `MLOPS_API_PATH`: API path prefix (default: `/api/v1`)
|
|
49
|
+
- `MLOPS_DEBUG`: `true|false` (default: `false`)
|
|
50
|
+
|
|
51
|
+
Or configure in code:
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from mlops import ConnectionConfig, Task
|
|
55
|
+
|
|
56
|
+
config = ConnectionConfig(
|
|
57
|
+
api_key="xck_...",
|
|
58
|
+
domain="https://example.com",
|
|
59
|
+
api_path="/api/v1",
|
|
60
|
+
debug=False,
|
|
61
|
+
)
|
|
62
|
+
task = Task(config=config)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## SDK Usage
|
|
66
|
+
|
|
67
|
+
### Initialize client
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from mlops import Task
|
|
71
|
+
|
|
72
|
+
task = Task() # uses environment variables by default
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Submit a GPU task
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from mlops import Task
|
|
79
|
+
|
|
80
|
+
task = Task()
|
|
81
|
+
resp = task.submit(
|
|
82
|
+
name="gpu-task-from-sdk",
|
|
83
|
+
cluster_name="slurm-cn",
|
|
84
|
+
team_id=1,
|
|
85
|
+
image="/mnt/minio/images/01ai-registry.cn-shanghai.cr.aliyuncs.com+public+llamafactory+0.9.3.sqsh",
|
|
86
|
+
entry_command="llamafactory-cli train /workspace/config/test_lora.yaml",
|
|
87
|
+
resources={
|
|
88
|
+
"partition": "gpu",
|
|
89
|
+
"nodes": 2,
|
|
90
|
+
"ntasks": 2,
|
|
91
|
+
"cpus_per_task": 2,
|
|
92
|
+
"memory": "4G",
|
|
93
|
+
"time": "01:00:00",
|
|
94
|
+
"gres": "gpu:nvidia_a10:1",
|
|
95
|
+
"qos": "qos_xcloud",
|
|
96
|
+
},
|
|
97
|
+
file_path="/path/to/xservice.zip", # optional: .zip/.tar.gz/.tgz
|
|
98
|
+
)
|
|
99
|
+
print(resp.job_id)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Submit a CPU task
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from mlops import Task
|
|
106
|
+
|
|
107
|
+
task = Task()
|
|
108
|
+
resp = task.submit(
|
|
109
|
+
name="cpu-task-from-sdk",
|
|
110
|
+
cluster_name="slurm-cn",
|
|
111
|
+
team_id=1,
|
|
112
|
+
image="docker://01ai-registry.cn-shanghai.cr.aliyuncs.com/01-ai/xcs/v2/alpine:3.23.0",
|
|
113
|
+
entry_command="echo hello",
|
|
114
|
+
resources={
|
|
115
|
+
"partition": "cpu",
|
|
116
|
+
"nodes": 1,
|
|
117
|
+
"ntasks": 1,
|
|
118
|
+
"cpus_per_task": 1,
|
|
119
|
+
"memory": "1G",
|
|
120
|
+
"time": "01:00:00",
|
|
121
|
+
"qos": "qos_xcloud",
|
|
122
|
+
},
|
|
123
|
+
)
|
|
124
|
+
print(resp.job_id)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### List tasks
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from mlops import Task
|
|
131
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
132
|
+
|
|
133
|
+
task = Task()
|
|
134
|
+
resp = task.list(status=TaskStatus.COMPLETED, cluster_name="slurm-cn", page=1, page_size=20)
|
|
135
|
+
print(len(resp.tasks or []))
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Get task details
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from mlops import Task
|
|
142
|
+
|
|
143
|
+
task = Task()
|
|
144
|
+
task_info = task.get(task_id=12345, cluster_name="slurm-cn")
|
|
145
|
+
print(task_info)
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Cancel a task
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
from mlops import Task
|
|
152
|
+
|
|
153
|
+
task = Task()
|
|
154
|
+
task.cancel(task_id=12345, cluster_name="slurm-cn")
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Delete a task
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from mlops import Task
|
|
161
|
+
|
|
162
|
+
task = Task()
|
|
163
|
+
task.delete(task_id=12345, cluster_name="slurm-cn")
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
**Task Management Methods:**
|
|
167
|
+
|
|
168
|
+
- `submit()` - Submit a new task with container image and entry command
|
|
169
|
+
- `get()` - Get task details by task ID
|
|
170
|
+
- `list()` - List tasks with optional filters (status, cluster_name, team_id, user_id)
|
|
171
|
+
- `cancel()` - Cancel a running task
|
|
172
|
+
- `delete()` - Delete a task record
|
|
173
|
+
|
|
174
|
+
**Task Status Values:**
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
from mlops.api.client.models.task_status import TaskStatus
|
|
178
|
+
|
|
179
|
+
TaskStatus.PENDING # Task is pending
|
|
180
|
+
TaskStatus.QUEUED # Task is queued
|
|
181
|
+
TaskStatus.RUNNING # Task is running
|
|
182
|
+
TaskStatus.COMPLETED # Task completed successfully
|
|
183
|
+
TaskStatus.SUCCEEDED # Task succeeded
|
|
184
|
+
TaskStatus.FAILED # Task failed
|
|
185
|
+
TaskStatus.CANCELLED # Task was cancelled
|
|
186
|
+
TaskStatus.CREATED # Task was created
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
**Error Handling:**
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
from mlops.exceptions import (
|
|
193
|
+
APIException,
|
|
194
|
+
AuthenticationException,
|
|
195
|
+
NotFoundException,
|
|
196
|
+
RateLimitException,
|
|
197
|
+
TimeoutException,
|
|
198
|
+
InvalidArgumentException,
|
|
199
|
+
NotEnoughSpaceException
|
|
200
|
+
)
|
|
201
|
+
from mlops import Task
|
|
202
|
+
|
|
203
|
+
task = Task()
|
|
204
|
+
|
|
205
|
+
try:
|
|
206
|
+
result = task.submit(
|
|
207
|
+
name="test",
|
|
208
|
+
cluster_name="slurm-cn",
|
|
209
|
+
image="docker://alpine:3.23.0",
|
|
210
|
+
entry_command="echo hello",
|
|
211
|
+
)
|
|
212
|
+
except AuthenticationException as e:
|
|
213
|
+
print(f"Authentication failed: {e}")
|
|
214
|
+
except NotFoundException as e:
|
|
215
|
+
print(f"Resource not found: {e}")
|
|
216
|
+
except APIException as e:
|
|
217
|
+
print(f"API error: {e}")
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
> [!TIP] Error Handling
|
|
221
|
+
> SDKs automatically parse typed responses and raise structured exceptions.
|
|
222
|
+
|
|
223
|
+
## Features
|
|
224
|
+
|
|
225
|
+
- Type-safe API clients
|
|
226
|
+
- Automatic authentication
|
|
227
|
+
- Error handling
|
|
228
|
+
- Typed response parsing (generated models)
|
|
229
|
+
- Unexpected-status guard (optional)
|
|
230
|
+
|
|
231
|
+
## Resources
|
|
232
|
+
|
|
233
|
+
- [Python SDK Documentation](https://github.com/xcloud-service/xservice/tree/main/client/python-sdk)
|
|
234
|
+
- [API Reference](https://xcloud-service.com/docs/api)
|
|
235
|
+
|
|
@@ -35,18 +35,18 @@ mlops/api/client/models/task_log_entry_log_type.py,sha256=uVqbF8RewyFkezY6sy28He
|
|
|
35
35
|
mlops/api/client/models/task_logs_response.py,sha256=QEGRy51qB7t0K-EGusxzDmkDlAjdKkwHF92em3dLb1c,3557
|
|
36
36
|
mlops/api/client/models/task_resources_type_0.py,sha256=36nxeOqAJS4ksfQtzoXigWVMhEV1Tnq5Z_64sHa3gGQ,1341
|
|
37
37
|
mlops/api/client/models/task_status.py,sha256=Tht4F2UeBp-QBLhh-z0fEw45r5cBCfkFUro-la42BPY,315
|
|
38
|
-
mlops/api/client/models/task_submit_request.py,sha256=
|
|
38
|
+
mlops/api/client/models/task_submit_request.py,sha256=g8THqxUjn0VD4fw8eo6I6qe9Eym6q9vmSSTFrhcUlbc,24803
|
|
39
39
|
mlops/api/client/models/task_submit_request_environment_type_0.py,sha256=Wx6ye6vVHytSex186AeUm27-XMWMmZe6lbL2Ons2mkw,1454
|
|
40
40
|
mlops/api/client/models/task_submit_response.py,sha256=EK3ZXxo_XO5Yn2zdOrR-VMPKg9om49qQ1ywS2Smgink,2200
|
|
41
41
|
mlops/api/client/models/task_tres_type_0.py,sha256=rEaiQG7A19mlTIHDppzxuWa4oPfh9qsKjPhhVOlBf4g,1292
|
|
42
42
|
mlops/api/client/models/task_tres_used_type_0.py,sha256=4w6An7-ZCqa8cc3SPi7mcwGK-ekT6AYq_dEdf8KzoYA,1320
|
|
43
43
|
mlops/api/client/py.typed,sha256=8ZJUsxZiuOy1oJeVhsTWQhTG_6pTVHVXk5hJL79ebTk,25
|
|
44
44
|
mlops/api/client/types.py,sha256=AX4orxQZQJat3vZrgjJ-TYb2sNBL8kNo9yqYDT-n8y8,1391
|
|
45
|
-
mlops/connection_config.py,sha256=
|
|
45
|
+
mlops/connection_config.py,sha256=_b9sVFGJtf1GynmIB4NtKCzg7kkgE-wSrsG3LwzlOqk,2946
|
|
46
46
|
mlops/exceptions.py,sha256=3kfda-Rz0km9kV-gvnPCw7ueemWkXIGGdT0NXx6z9Xk,1680
|
|
47
47
|
mlops/task/__init__.py,sha256=M983vMPLj3tZQNFXQyTP5I2RsRorFElezLeppr3WLsw,133
|
|
48
48
|
mlops/task/client.py,sha256=V131WLVJl1raGAVixUhJCX8s1neN15mxAjQwO01qlIg,3552
|
|
49
|
-
mlops/task/task.py,sha256=
|
|
50
|
-
mlops_python_sdk-1.0.
|
|
51
|
-
mlops_python_sdk-1.0.
|
|
52
|
-
mlops_python_sdk-1.0.
|
|
49
|
+
mlops/task/task.py,sha256=Eqb4XGMlFLjelg3js9Twoulf0Nlyn0pz5isuGl916vs,27756
|
|
50
|
+
mlops_python_sdk-1.0.3.dist-info/METADATA,sha256=KwMwLVAYfXBjKXXiU_p5TibVXGbli5gaxCCa0Wap9h4,5679
|
|
51
|
+
mlops_python_sdk-1.0.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
52
|
+
mlops_python_sdk-1.0.3.dist-info/RECORD,,
|
|
@@ -1,407 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.3
|
|
2
|
-
Name: mlops-python-sdk
|
|
3
|
-
Version: 1.0.1
|
|
4
|
-
Summary: MLOps Python SDK for XCloud Service API
|
|
5
|
-
License: MIT
|
|
6
|
-
Author: mlops
|
|
7
|
-
Author-email: mlops@example.com
|
|
8
|
-
Requires-Python: >=3.9,<4.0
|
|
9
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
-
Classifier: Programming Language :: Python :: 3
|
|
11
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
-
Requires-Dist: attrs (>=23.2.0)
|
|
17
|
-
Requires-Dist: httpx (>=0.27.0,<1.0.0)
|
|
18
|
-
Requires-Dist: packaging (>=24.1)
|
|
19
|
-
Requires-Dist: python-dateutil (>=2.8.2)
|
|
20
|
-
Requires-Dist: typing-extensions (>=4.1.0)
|
|
21
|
-
Project-URL: Bug Tracker, https://github.com/xcloud-service/xservice/issues
|
|
22
|
-
Project-URL: Homepage, https://mlops.cloud/
|
|
23
|
-
Project-URL: Repository, https://github.com/xcloud-service/xservice
|
|
24
|
-
Description-Content-Type: text/markdown
|
|
25
|
-
|
|
26
|
-
# MLOps Python SDK
|
|
27
|
-
|
|
28
|
-
[MLOps](https://xcloud-service.com) Python SDK for XCloud Service API. Manage and execute tasks with confidence.
|
|
29
|
-
|
|
30
|
-
## Installation
|
|
31
|
-
|
|
32
|
-
Install the SDK from PyPI:
|
|
33
|
-
|
|
34
|
-
```bash
|
|
35
|
-
pip install mlops-python-sdk
|
|
36
|
-
```
|
|
37
|
-
|
|
38
|
-
## Quick Start
|
|
39
|
-
|
|
40
|
-
### 1. Setup Authentication
|
|
41
|
-
|
|
42
|
-
You can authenticate using an API key.
|
|
43
|
-
|
|
44
|
-
#### API Key (Recommended for programmatic access)
|
|
45
|
-
|
|
46
|
-
1. Sign up at [MLOps](https://xcloud-service.com)
|
|
47
|
-
2. Create an API key from [API Keys](https://xcloud-service.com/home/api-keys)
|
|
48
|
-
3. Set environment variables:
|
|
49
|
-
|
|
50
|
-
```bash
|
|
51
|
-
export MLOPS_API_KEY=xck_******
|
|
52
|
-
export MLOPS_DOMAIN=localhost:8090 # optional, default is localhost:8090
|
|
53
|
-
```
|
|
54
|
-
|
|
55
|
-
### 2. Basic Usage
|
|
56
|
-
|
|
57
|
-
```python
|
|
58
|
-
from mlops import Task, ConnectionConfig
|
|
59
|
-
from mlops.api.client.models.task_status import TaskStatus
|
|
60
|
-
|
|
61
|
-
# Initialize Task client (uses environment variables by default)
|
|
62
|
-
task = Task()
|
|
63
|
-
|
|
64
|
-
# Or initialize with explicit configuration
|
|
65
|
-
config = ConnectionConfig(
|
|
66
|
-
api_key="xck_******",
|
|
67
|
-
domain="localhost:8090",
|
|
68
|
-
debug=False
|
|
69
|
-
)
|
|
70
|
-
task = Task(config=config)
|
|
71
|
-
|
|
72
|
-
# Submit a task with script
|
|
73
|
-
result = task.submit(
|
|
74
|
-
name="my-training-task",
|
|
75
|
-
cluster_id=1,
|
|
76
|
-
script="#!/bin/bash\necho 'Hello World'",
|
|
77
|
-
resources={"cpu": 4, "memory": "8GB", "gpu": 1}
|
|
78
|
-
)
|
|
79
|
-
|
|
80
|
-
# Or submit with command
|
|
81
|
-
result = task.submit(
|
|
82
|
-
name="my-task",
|
|
83
|
-
cluster_id=1,
|
|
84
|
-
command="python train.py",
|
|
85
|
-
resources={"cpu": 4, "memory": "8GB"}
|
|
86
|
-
)
|
|
87
|
-
|
|
88
|
-
# Get task details
|
|
89
|
-
task_info = task.get(task_id=result.job_id, cluster_id=1)
|
|
90
|
-
|
|
91
|
-
# List tasks with filters
|
|
92
|
-
running_tasks = task.list(
|
|
93
|
-
status=TaskStatus.RUNNING,
|
|
94
|
-
cluster_id=1,
|
|
95
|
-
page=1,
|
|
96
|
-
page_size=20
|
|
97
|
-
)
|
|
98
|
-
|
|
99
|
-
# Cancel a task
|
|
100
|
-
task.cancel(task_id=result.job_id, cluster_id=1)
|
|
101
|
-
|
|
102
|
-
# Delete a task
|
|
103
|
-
task.delete(task_id=task_id, cluster_id=1)
|
|
104
|
-
```
|
|
105
|
-
|
|
106
|
-
## API Reference
|
|
107
|
-
|
|
108
|
-
### Task Class
|
|
109
|
-
|
|
110
|
-
The `Task` class provides a high-level interface for managing tasks.
|
|
111
|
-
|
|
112
|
-
#### Initialization
|
|
113
|
-
|
|
114
|
-
```python
|
|
115
|
-
from mlops import Task, ConnectionConfig
|
|
116
|
-
|
|
117
|
-
# Using environment variables
|
|
118
|
-
task = Task()
|
|
119
|
-
|
|
120
|
-
# With explicit configuration
|
|
121
|
-
config = ConnectionConfig(
|
|
122
|
-
api_key="xck_******", # API key for authentication
|
|
123
|
-
domain="localhost:8090", # API domain
|
|
124
|
-
debug=False, # Enable debug mode
|
|
125
|
-
request_timeout=30.0 # Request timeout in seconds
|
|
126
|
-
)
|
|
127
|
-
task = Task(config=config)
|
|
128
|
-
|
|
129
|
-
# Or pass parameters directly
|
|
130
|
-
task = Task(
|
|
131
|
-
api_key="xck_******",
|
|
132
|
-
domain="localhost:8090"
|
|
133
|
-
)
|
|
134
|
-
```
|
|
135
|
-
|
|
136
|
-
#### Methods
|
|
137
|
-
|
|
138
|
-
##### `submit()`
|
|
139
|
-
|
|
140
|
-
Submit a new task to the cluster.
|
|
141
|
-
|
|
142
|
-
```python
|
|
143
|
-
result = task.submit(
|
|
144
|
-
name: str, # Task name (required)
|
|
145
|
-
cluster_id: int, # Cluster ID (required)
|
|
146
|
-
script: Optional[str] = None, # Script content (script or command required)
|
|
147
|
-
command: Optional[str] = None, # Command to execute (script or command required)
|
|
148
|
-
resources: Optional[dict] = None, # Resource requirements
|
|
149
|
-
team_id: Optional[int] = None # Team ID (optional)
|
|
150
|
-
) -> TaskSubmitResponse
|
|
151
|
-
```
|
|
152
|
-
|
|
153
|
-
**Resources dictionary** can contain:
|
|
154
|
-
- `cpu` or `cpus_per_task`: Number of CPUs
|
|
155
|
-
- `memory`: Memory requirement (e.g., "8GB", "4096M")
|
|
156
|
-
- `nodes`: Number of nodes
|
|
157
|
-
- `gres`: GPU resources (e.g., "gpu:1")
|
|
158
|
-
- `time`: Time limit (e.g., "1-00:00:00" for 1 day)
|
|
159
|
-
- `partition`: Partition name
|
|
160
|
-
- `tres`: TRES specification
|
|
161
|
-
|
|
162
|
-
**Example:**
|
|
163
|
-
|
|
164
|
-
```python
|
|
165
|
-
result = task.submit(
|
|
166
|
-
name="ml-training",
|
|
167
|
-
cluster_id=1,
|
|
168
|
-
script="#!/bin/bash\npython train.py --epochs 100",
|
|
169
|
-
resources={
|
|
170
|
-
"cpu": 8,
|
|
171
|
-
"memory": "16GB",
|
|
172
|
-
"gres": "gpu:1",
|
|
173
|
-
"time": "2-00:00:00", # 2 days
|
|
174
|
-
"partition": "gpu"
|
|
175
|
-
}
|
|
176
|
-
)
|
|
177
|
-
print(f"Task submitted: Job ID = {result.job_id}")
|
|
178
|
-
```
|
|
179
|
-
|
|
180
|
-
##### `get()`
|
|
181
|
-
|
|
182
|
-
Get task details by task ID.
|
|
183
|
-
|
|
184
|
-
```python
|
|
185
|
-
task_info = task.get(
|
|
186
|
-
task_id: int, # Task ID (Slurm job ID)
|
|
187
|
-
cluster_id: int # Cluster ID (required)
|
|
188
|
-
) -> Task
|
|
189
|
-
```
|
|
190
|
-
|
|
191
|
-
**Example:**
|
|
192
|
-
|
|
193
|
-
```python
|
|
194
|
-
task_info = task.get(task_id=12345, cluster_id=1)
|
|
195
|
-
print(f"Task status: {task_info.status}")
|
|
196
|
-
print(f"Task name: {task_info.name}")
|
|
197
|
-
```
|
|
198
|
-
|
|
199
|
-
##### `list()`
|
|
200
|
-
|
|
201
|
-
List tasks with optional filters and pagination.
|
|
202
|
-
|
|
203
|
-
```python
|
|
204
|
-
tasks = task.list(
|
|
205
|
-
page: int = 1, # Page number
|
|
206
|
-
page_size: int = 20, # Items per page
|
|
207
|
-
status: Optional[TaskStatus] = None, # Filter by status
|
|
208
|
-
cluster_id: Optional[int] = None, # Filter by cluster ID
|
|
209
|
-
team_id: Optional[int] = None, # Filter by team ID
|
|
210
|
-
user_id: Optional[int] = None # Filter by user ID
|
|
211
|
-
) -> TaskListResponse
|
|
212
|
-
```
|
|
213
|
-
|
|
214
|
-
**Example:**
|
|
215
|
-
|
|
216
|
-
```python
|
|
217
|
-
from mlops.api.client.models.task_status import TaskStatus
|
|
218
|
-
|
|
219
|
-
# List all running tasks
|
|
220
|
-
running_tasks = task.list(status=TaskStatus.RUNNING)
|
|
221
|
-
|
|
222
|
-
# List tasks in a specific cluster
|
|
223
|
-
cluster_tasks = task.list(cluster_id=1, page=1, page_size=10)
|
|
224
|
-
|
|
225
|
-
# List completed tasks with pagination
|
|
226
|
-
completed = task.list(
|
|
227
|
-
status=TaskStatus.COMPLETED,
|
|
228
|
-
cluster_id=1,
|
|
229
|
-
page=1,
|
|
230
|
-
page_size=50
|
|
231
|
-
)
|
|
232
|
-
```
|
|
233
|
-
|
|
234
|
-
##### `cancel()`
|
|
235
|
-
|
|
236
|
-
Cancel a running task.
|
|
237
|
-
|
|
238
|
-
```python
|
|
239
|
-
task.cancel(
|
|
240
|
-
task_id: int, # Task ID (Slurm job ID)
|
|
241
|
-
cluster_id: int # Cluster ID (required)
|
|
242
|
-
)
|
|
243
|
-
```
|
|
244
|
-
|
|
245
|
-
**Example:**
|
|
246
|
-
|
|
247
|
-
```python
|
|
248
|
-
task.cancel(task_id=12345, cluster_id=1)
|
|
249
|
-
```
|
|
250
|
-
|
|
251
|
-
### TaskStatus Enum
|
|
252
|
-
|
|
253
|
-
Task status values for filtering:
|
|
254
|
-
|
|
255
|
-
```python
|
|
256
|
-
from mlops.api.client.models.task_status import TaskStatus
|
|
257
|
-
|
|
258
|
-
TaskStatus.PENDING # Task is pending
|
|
259
|
-
TaskStatus.QUEUED # Task is queued
|
|
260
|
-
TaskStatus.RUNNING # Task is running
|
|
261
|
-
TaskStatus.COMPLETED # Task completed successfully
|
|
262
|
-
TaskStatus.SUCCEEDED # Task succeeded
|
|
263
|
-
TaskStatus.FAILED # Task failed
|
|
264
|
-
TaskStatus.CANCELLED # Task was cancelled
|
|
265
|
-
TaskStatus.CREATED # Task was created
|
|
266
|
-
```
|
|
267
|
-
|
|
268
|
-
## Configuration
|
|
269
|
-
|
|
270
|
-
### Environment Variables
|
|
271
|
-
|
|
272
|
-
The SDK reads configuration from environment variables:
|
|
273
|
-
|
|
274
|
-
- `MLOPS_API_KEY`: API key for authentication
|
|
275
|
-
- `MLOPS_DOMAIN`: API domain (default: `localhost:8090`)
|
|
276
|
-
- `MLOPS_DEBUG`: Enable debug mode (`true`/`false`, default: `false`)
|
|
277
|
-
- `MLOPS_API_PATH`: API path prefix (default: `/api/v1`)
|
|
278
|
-
|
|
279
|
-
### ConnectionConfig
|
|
280
|
-
|
|
281
|
-
You can also configure the connection programmatically:
|
|
282
|
-
|
|
283
|
-
```python
|
|
284
|
-
from mlops import ConnectionConfig
|
|
285
|
-
|
|
286
|
-
config = ConnectionConfig(
|
|
287
|
-
domain="api.example.com",
|
|
288
|
-
api_key="xck_******",
|
|
289
|
-
debug=True,
|
|
290
|
-
request_timeout=60.0,
|
|
291
|
-
api_path="/api/v1"
|
|
292
|
-
)
|
|
293
|
-
```
|
|
294
|
-
|
|
295
|
-
## Error Handling
|
|
296
|
-
|
|
297
|
-
The SDK provides specific exception types:
|
|
298
|
-
|
|
299
|
-
```python
|
|
300
|
-
from mlops.exceptions import (
|
|
301
|
-
APIException, # General API errors
|
|
302
|
-
AuthenticationException, # Authentication failures
|
|
303
|
-
NotFoundException, # Resource not found
|
|
304
|
-
RateLimitException, # Rate limit exceeded
|
|
305
|
-
TimeoutException, # Request timeout
|
|
306
|
-
InvalidArgumentException # Invalid arguments
|
|
307
|
-
)
|
|
308
|
-
|
|
309
|
-
try:
|
|
310
|
-
result = task.submit(name="test", cluster_id=1, command="echo hello")
|
|
311
|
-
except AuthenticationException as e:
|
|
312
|
-
print(f"Authentication failed: {e}")
|
|
313
|
-
except NotFoundException as e:
|
|
314
|
-
print(f"Resource not found: {e}")
|
|
315
|
-
except APIException as e:
|
|
316
|
-
print(f"API error: {e}")
|
|
317
|
-
```
|
|
318
|
-
|
|
319
|
-
## Examples
|
|
320
|
-
|
|
321
|
-
### Submit a Machine Learning Training Job
|
|
322
|
-
|
|
323
|
-
```python
|
|
324
|
-
from mlops import Task
|
|
325
|
-
|
|
326
|
-
task = Task()
|
|
327
|
-
|
|
328
|
-
result = task.submit(
|
|
329
|
-
name="pytorch-training",
|
|
330
|
-
cluster_id=1,
|
|
331
|
-
script="""#!/bin/bash
|
|
332
|
-
#SBATCH --gres=gpu:1
|
|
333
|
-
#SBATCH --cpus-per-task=2
|
|
334
|
-
#SBATCH --mem=4GB
|
|
335
|
-
|
|
336
|
-
python train.py --config config.yaml
|
|
337
|
-
""",
|
|
338
|
-
resources={
|
|
339
|
-
"cpus_per_task": 2,
|
|
340
|
-
"memory": "4GB",
|
|
341
|
-
"gres": "gpu:1",
|
|
342
|
-
"time": "1-00:00:00", # 1 day
|
|
343
|
-
"partition": "gpu"
|
|
344
|
-
}
|
|
345
|
-
)
|
|
346
|
-
|
|
347
|
-
print(f"Training job submitted: {result.job_id}")
|
|
348
|
-
```
|
|
349
|
-
|
|
350
|
-
### Monitor Task Status
|
|
351
|
-
|
|
352
|
-
```python
|
|
353
|
-
from mlops import Task
|
|
354
|
-
from mlops.api.client.models.task_status import TaskStatus
|
|
355
|
-
import time
|
|
356
|
-
|
|
357
|
-
task = Task()
|
|
358
|
-
job_id = 12345
|
|
359
|
-
cluster_id = 1
|
|
360
|
-
|
|
361
|
-
while True:
|
|
362
|
-
task_info = task.get(task_id=job_id, cluster_id=cluster_id)
|
|
363
|
-
print(f"Status: {task_info.status}")
|
|
364
|
-
|
|
365
|
-
if task_info.status in [TaskStatus.COMPLETED, TaskStatus.SUCCEEDED, TaskStatus.FAILED, TaskStatus.CANCELLED]:
|
|
366
|
-
break
|
|
367
|
-
|
|
368
|
-
time.sleep(10) # Check every 10 seconds
|
|
369
|
-
```
|
|
370
|
-
|
|
371
|
-
### List and Filter Tasks
|
|
372
|
-
|
|
373
|
-
```python
|
|
374
|
-
from mlops import Task
|
|
375
|
-
from mlops.api.client.models.task_status import TaskStatus
|
|
376
|
-
|
|
377
|
-
task = Task()
|
|
378
|
-
|
|
379
|
-
# Get all running tasks in cluster 1
|
|
380
|
-
running = task.list(
|
|
381
|
-
status=TaskStatus.RUNNING,
|
|
382
|
-
cluster_id=1
|
|
383
|
-
)
|
|
384
|
-
|
|
385
|
-
for t in running.tasks:
|
|
386
|
-
print(f"{t.name}: {t.status} (Job ID: {t.job_id})")
|
|
387
|
-
|
|
388
|
-
# Get failed tasks
|
|
389
|
-
failed = task.list(status=TaskStatus.FAILED)
|
|
390
|
-
|
|
391
|
-
print(f"Total failed tasks: {failed.total}")
|
|
392
|
-
```
|
|
393
|
-
|
|
394
|
-
## Documentation
|
|
395
|
-
|
|
396
|
-
- [MLOPS Documentation](https://xcloud-service.com/docs)
|
|
397
|
-
- [API Reference](https://xcloud-service.com/docs/api)
|
|
398
|
-
|
|
399
|
-
## License
|
|
400
|
-
|
|
401
|
-
MIT
|
|
402
|
-
|
|
403
|
-
## Support
|
|
404
|
-
|
|
405
|
-
- [GitHub Issues](https://github.com/xcloud-service/xservice/issues)
|
|
406
|
-
- [Documentation](https://xcloud-service.com/docs)
|
|
407
|
-
|
|
File without changes
|