mlops-python-sdk 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,12 +29,18 @@ class TaskSubmitRequest:
29
29
  cpus_per_task (Union[None, Unset, int]): CPUs per task Example: 1.
30
30
  dependency (Union[None, Unset, str]): Job dependencies Example: afterok:12345.
31
31
  distribution (Union[None, Unset, str]): Task distribution Example: block.
32
+ entry_command (Union[None, Unset, str]): Container entry command/script (bash snippet) executed inside the
33
+ container. The platform runs it under /workspace.
34
+ Example: python -V && ls -la.
32
35
  environment (Union['TaskSubmitRequestEnvironmentType0', None, Unset]): Environment variables as key-value pairs
33
36
  Example: {'CUDA_VISIBLE_DEVICES': '0,1', 'PYTHONPATH': '/opt/python/lib'}.
34
37
  error (Union[None, Unset, str]): Standard error file pattern Example: error_%j.log.
35
38
  exclude (Union[None, Unset, str]): Nodes to exclude
36
39
  export (Union[None, Unset, str]): Environment export Example: ALL.
37
40
  gres (Union[None, Unset, str]): Generic resources (e.g., "gpu:1", "gpu:tesla:2") Example: gpu:1.
41
+ image (Union[None, Unset, str]): Container image reference. Can be a Slurm container plugin supported reference
42
+ (e.g. "docker://..."), or a registry reference which will be mapped to a local .sqsh image path by the platform.
43
+ Example: 01ai-registry.cn-shanghai.cr.aliyuncs.com/public/llamafactory:0.9.3.
38
44
  input_ (Union[None, Unset, str]): Standard input file
39
45
  job_spec (Union[Unset, JobSpec]): Domain-specific job specification (rendered into slurm script)
40
46
  mem_bind (Union[None, Unset, str]): Memory binding
@@ -65,11 +71,13 @@ class TaskSubmitRequest:
65
71
  cpus_per_task: Union[None, Unset, int] = UNSET
66
72
  dependency: Union[None, Unset, str] = UNSET
67
73
  distribution: Union[None, Unset, str] = UNSET
74
+ entry_command: Union[None, Unset, str] = UNSET
68
75
  environment: Union["TaskSubmitRequestEnvironmentType0", None, Unset] = UNSET
69
76
  error: Union[None, Unset, str] = UNSET
70
77
  exclude: Union[None, Unset, str] = UNSET
71
78
  export: Union[None, Unset, str] = UNSET
72
79
  gres: Union[None, Unset, str] = UNSET
80
+ image: Union[None, Unset, str] = UNSET
73
81
  input_: Union[None, Unset, str] = UNSET
74
82
  job_spec: Union[Unset, "JobSpec"] = UNSET
75
83
  mem_bind: Union[None, Unset, str] = UNSET
@@ -143,6 +151,12 @@ class TaskSubmitRequest:
143
151
  else:
144
152
  distribution = self.distribution
145
153
 
154
+ entry_command: Union[None, Unset, str]
155
+ if isinstance(self.entry_command, Unset):
156
+ entry_command = UNSET
157
+ else:
158
+ entry_command = self.entry_command
159
+
146
160
  environment: Union[None, Unset, dict[str, Any]]
147
161
  if isinstance(self.environment, Unset):
148
162
  environment = UNSET
@@ -175,6 +189,12 @@ class TaskSubmitRequest:
175
189
  else:
176
190
  gres = self.gres
177
191
 
192
+ image: Union[None, Unset, str]
193
+ if isinstance(self.image, Unset):
194
+ image = UNSET
195
+ else:
196
+ image = self.image
197
+
178
198
  input_: Union[None, Unset, str]
179
199
  if isinstance(self.input_, Unset):
180
200
  input_ = UNSET
@@ -289,6 +309,8 @@ class TaskSubmitRequest:
289
309
  field_dict["dependency"] = dependency
290
310
  if distribution is not UNSET:
291
311
  field_dict["distribution"] = distribution
312
+ if entry_command is not UNSET:
313
+ field_dict["entry_command"] = entry_command
292
314
  if environment is not UNSET:
293
315
  field_dict["environment"] = environment
294
316
  if error is not UNSET:
@@ -299,6 +321,8 @@ class TaskSubmitRequest:
299
321
  field_dict["export"] = export
300
322
  if gres is not UNSET:
301
323
  field_dict["gres"] = gres
324
+ if image is not UNSET:
325
+ field_dict["image"] = image
302
326
  if input_ is not UNSET:
303
327
  field_dict["input"] = input_
304
328
  if job_spec is not UNSET:
@@ -416,6 +440,15 @@ class TaskSubmitRequest:
416
440
 
417
441
  distribution = _parse_distribution(d.pop("distribution", UNSET))
418
442
 
443
+ def _parse_entry_command(data: object) -> Union[None, Unset, str]:
444
+ if data is None:
445
+ return data
446
+ if isinstance(data, Unset):
447
+ return data
448
+ return cast(Union[None, Unset, str], data)
449
+
450
+ entry_command = _parse_entry_command(d.pop("entry_command", UNSET))
451
+
419
452
  def _parse_environment(data: object) -> Union["TaskSubmitRequestEnvironmentType0", None, Unset]:
420
453
  if data is None:
421
454
  return data
@@ -469,6 +502,15 @@ class TaskSubmitRequest:
469
502
 
470
503
  gres = _parse_gres(d.pop("gres", UNSET))
471
504
 
505
+ def _parse_image(data: object) -> Union[None, Unset, str]:
506
+ if data is None:
507
+ return data
508
+ if isinstance(data, Unset):
509
+ return data
510
+ return cast(Union[None, Unset, str], data)
511
+
512
+ image = _parse_image(d.pop("image", UNSET))
513
+
472
514
  def _parse_input_(data: object) -> Union[None, Unset, str]:
473
515
  if data is None:
474
516
  return data
@@ -615,11 +657,13 @@ class TaskSubmitRequest:
615
657
  cpus_per_task=cpus_per_task,
616
658
  dependency=dependency,
617
659
  distribution=distribution,
660
+ entry_command=entry_command,
618
661
  environment=environment,
619
662
  error=error,
620
663
  exclude=exclude,
621
664
  export=export,
622
665
  gres=gres,
666
+ image=image,
623
667
  input_=input_,
624
668
  job_spec=job_spec,
625
669
  mem_bind=mem_bind,
@@ -1,9 +1,9 @@
1
1
  import os
2
2
 
3
- from typing import Literal, Optional, Dict
3
+ from typing import Optional, Dict
4
4
  from httpx._types import ProxyTypes
5
5
 
6
- REQUEST_TIMEOUT: float = 30.0 # 30 seconds
6
+ REQUEST_TIMEOUT: float = 120.0 # 120 seconds
7
7
 
8
8
  KEEPALIVE_PING_INTERVAL_SEC = 50 # 50 seconds
9
9
  KEEPALIVE_PING_HEADER = "Keepalive-Ping-Interval"
mlops/task/task.py CHANGED
@@ -6,6 +6,9 @@ This module provides a convenient interface for managing tasks through the MLOps
6
6
 
7
7
  import json
8
8
  import os
9
+ import sys
10
+ import threading
11
+ import time
9
12
  from http import HTTPStatus
10
13
  from pathlib import Path
11
14
  from typing import Optional
@@ -55,13 +58,109 @@ def _validate_archive_file_path(file_path: str) -> Path:
55
58
 
56
59
 
57
60
  def _upload_file_to_presigned_url(url: str, file_path: Path, timeout: Optional[float]) -> None:
61
+ def _format_bytes_iec(n: int) -> str:
62
+ if n < 1024:
63
+ return f"{n}B"
64
+ unit = 1024.0
65
+ suffixes = ["KiB", "MiB", "GiB", "TiB", "PiB"]
66
+ v = float(n)
67
+ i = -1
68
+ while v >= unit and i < len(suffixes) - 1:
69
+ v /= unit
70
+ i += 1
71
+ return f"{v:.1f}{suffixes[i]}"
72
+
73
+ def _render_bar(done: int, total: int, width: int = 28) -> str:
74
+ if total <= 0 or width <= 1:
75
+ return ">"
76
+ done = max(0, min(done, total))
77
+ filled = int(width * (done / total))
78
+ if filled >= width:
79
+ return "=" * width
80
+ if filled <= 0:
81
+ return ">" + (" " * (width - 1))
82
+ return ("=" * filled) + ">" + (" " * (width - filled - 1))
83
+
84
+ def _format_elapsed_seconds(start: float) -> str:
85
+ sec = int(max(0.0, time.monotonic() - start))
86
+ return f"{sec}s"
87
+
88
+ class _ProgressIterable:
89
+ def __init__(self, f, total: int, name: str, chunk_size: int = 64 * 1024):
90
+ self._f = f # file-like object
91
+ self._total = max(0, int(total))
92
+ self._name = name
93
+ self._chunk_size = max(1, int(chunk_size))
94
+ self._read = 0
95
+ self._start = time.monotonic()
96
+ self._completed = False
97
+ self._out = sys.stdout
98
+ try:
99
+ self._is_tty = bool(self._out.isatty())
100
+ except Exception:
101
+ self._is_tty = False
102
+
103
+ def _render_line(self, display_read: int) -> str:
104
+ display_read = max(0, min(int(display_read), self._total))
105
+ pct = (display_read / self._total) * 100.0 if self._total > 0 else 0.0
106
+ bar = _render_bar(display_read, self._total, width=28)
107
+ elapsed = _format_elapsed_seconds(self._start)
108
+ return (
109
+ f"uploading {self._name} [{bar}] {pct:6.2f}% "
110
+ f"({_format_bytes_iec(display_read)}/{_format_bytes_iec(self._total)}) "
111
+ f"elapsed {elapsed}"
112
+ )
113
+
114
+ def _print_line(self, line: str, final: bool = False) -> None:
115
+ if self._is_tty:
116
+ # Refresh same line in terminal.
117
+ print("\r" + line, end="" if not final else "\n", file=self._out, flush=True)
118
+ else:
119
+ # Always visible in non-TTY environments.
120
+ print(line, file=self._out, flush=True)
121
+
122
+ def __iter__(self):
123
+ stop_event = threading.Event()
124
+
125
+ def ticker() -> None:
126
+ last_sec = -1
127
+ # Print immediately so users see something right away.
128
+ self._print_line(self._render_line(self._read))
129
+ while not stop_event.is_set():
130
+ sec = int(max(0.0, time.monotonic() - self._start))
131
+ if sec != last_sec:
132
+ last_sec = sec
133
+ self._print_line(self._render_line(self._read))
134
+ # check frequently to avoid skipping seconds
135
+ stop_event.wait(0.05)
136
+
137
+ t = threading.Thread(target=ticker, name="mlops-upload-progress", daemon=True)
138
+ t.start()
139
+ try:
140
+ while True:
141
+ chunk = self._f.read(self._chunk_size)
142
+ if not chunk:
143
+ break
144
+ self._read += len(chunk)
145
+ yield chunk
146
+ finally:
147
+ # Ensure a final 100% line and stop ticker.
148
+ self._read = self._total
149
+ self._completed = True
150
+ stop_event.set()
151
+ t.join(timeout=0.2)
152
+ self._print_line(self._render_line(self._read), final=True)
153
+
58
154
  size = file_path.stat().st_size
59
155
  # Use a dedicated client for S3 presigned upload (avoid leaking API auth headers).
60
156
  with httpx.Client(timeout=timeout) as client:
61
157
  with file_path.open("rb") as f:
158
+ content = f
159
+ if size > 0:
160
+ content = _ProgressIterable(f, total=size, name=file_path.name)
62
161
  resp = client.put(
63
162
  url,
64
- content=f,
163
+ content=content,
65
164
  headers={
66
165
  "Content-Length": str(size),
67
166
  "Content-Type": "application/octet-stream",
@@ -85,18 +184,25 @@ class Task:
85
184
  config = ConnectionConfig(api_key="your_api_key")
86
185
  task = Task(config=config)
87
186
 
88
- # Submit a task with script
89
- result = task.submit(
90
- name="my-task",
91
- cluster_name="slurm-cn",
92
- script="#!/bin/bash\\necho 'Hello World'"
93
- )
94
-
95
- # Or submit with command
187
+ # Submit a GPU task (container image + entry command)
96
188
  result = task.submit(
97
- name="my-task",
189
+ name="gpu-task-from-sdk",
98
190
  cluster_name="slurm-cn",
99
- command="echo 'Hello World'"
191
+ image="/mnt/minio/images/01ai-registry.cn-shanghai.cr.aliyuncs.com+public+llamafactory+0.9.3.sqsh",
192
+ entry_command="llamafactory-cli train /workspace/config/test_lora.yaml",
193
+ resources={
194
+ "partition": "gpu",
195
+ "nodes": 2,
196
+ "ntasks": 2,
197
+ "cpus_per_task": 2,
198
+ "memory": "4G",
199
+ "time": "01:00:00",
200
+ "gres": "gpu:nvidia_a10:1",
201
+ "qos": "qos_xcloud",
202
+ "job_type": "batch",
203
+ },
204
+ team_id=1,
205
+ file_path="your file path",
100
206
  )
101
207
 
102
208
  # Get task details
@@ -151,8 +257,8 @@ class Task:
151
257
  self,
152
258
  name: str,
153
259
  cluster_name: str,
154
- script: Optional[str] = None,
155
- command: Optional[str] = None,
260
+ image: str,
261
+ entry_command: str,
156
262
  resources: Optional[dict] = None,
157
263
  team_id: Optional[int] = None,
158
264
  file_path: Optional[str] = None,
@@ -163,11 +269,11 @@ class Task:
163
269
  Args:
164
270
  name: Task name
165
271
  cluster_name: Cluster name to submit the task to
166
- script: Task script content (optional, but at least one of script or command is required)
167
- command: Command to execute (optional, but at least one of script or command is required)
272
+ image: Container image reference
273
+ entry_command: Container entry command/script
168
274
  resources: Resource requirements dict (optional)
169
275
  team_id: Team ID (optional)
170
-
276
+ file_path: Local archive file path to upload (optional; supports .zip, .tar.gz, and .tgz)
171
277
  Returns:
172
278
  TaskSubmitResponse containing the submitted task information
173
279
 
@@ -175,25 +281,14 @@ class Task:
175
281
  APIException: If the API returns an error
176
282
  AuthenticationException: If authentication fails
177
283
  """
178
- # At least one of script or command must be provided
179
- if not script and not command:
180
- raise APIException("At least one of 'script' or 'command' must be provided")
181
-
182
- # Map resources dict to individual fields
183
- # resources dict can contain: cpu, cpus_per_task, memory, nodes, gres, time, partition, etc.
184
-
185
284
  request_kwargs = {
186
285
  "name": name,
187
286
  "cluster_name": cluster_name,
287
+ "image": image,
288
+ "entry_command": entry_command,
188
289
  }
189
-
190
- # Handle script and command (at least one is required)
191
- # script is Union[Unset, str], so we need to set it or leave as UNSET
192
- if script:
193
- request_kwargs["script"] = script
194
- # command is Union[None, Unset, str], so we can set it or leave as UNSET
195
- if command:
196
- request_kwargs["command"] = command
290
+ # Map resources dict to individual fields
291
+ # resources dict can contain: cpu, cpus_per_task, memory, nodes, gres, time, partition, etc.
197
292
 
198
293
  # team_id is Union[None, Unset, int]
199
294
  if team_id is not None:
@@ -203,18 +298,34 @@ class Task:
203
298
  if resources:
204
299
  if "cpu" in resources or "cpus_per_task" in resources:
205
300
  request_kwargs["cpus_per_task"] = resources.get("cpus_per_task") or resources.get("cpu")
301
+ else:
302
+ request_kwargs["cpus_per_task"] = 1
206
303
  if "memory" in resources:
207
304
  request_kwargs["memory"] = resources.get("memory")
305
+ else:
306
+ request_kwargs["memory"] = "1G"
208
307
  if "nodes" in resources:
209
308
  request_kwargs["nodes"] = resources.get("nodes")
309
+ else:
310
+ request_kwargs["nodes"] = 1
210
311
  if "gres" in resources:
211
312
  request_kwargs["gres"] = resources.get("gres")
212
313
  if "time" in resources:
213
314
  request_kwargs["time"] = resources.get("time")
315
+ else:
316
+ request_kwargs["time"] = "01:00:00"
214
317
  if "partition" in resources:
215
318
  request_kwargs["partition"] = resources.get("partition")
216
- if "tres" in resources:
217
- request_kwargs["tres"] = resources.get("tres")
319
+ else:
320
+ request_kwargs["partition"] = "debug"
321
+ if "qos" in resources:
322
+ request_kwargs["qos"] = resources.get("qos")
323
+ else:
324
+ request_kwargs["qos"] = "qos_xcloud"
325
+ if "ntasks" in resources:
326
+ request_kwargs["ntasks"] = resources.get("ntasks")
327
+ else:
328
+ request_kwargs["ntasks"] = 1
218
329
 
219
330
  if file_path:
220
331
  local_path = _validate_archive_file_path(file_path)
@@ -0,0 +1,235 @@
1
+ Metadata-Version: 2.3
2
+ Name: mlops-python-sdk
3
+ Version: 1.0.3
4
+ Summary: MLOps Python SDK for XCloud Service API
5
+ License: MIT
6
+ Author: mlops
7
+ Author-email: mlops@example.com
8
+ Requires-Python: >=3.9,<4.0
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Requires-Dist: attrs (>=23.2.0)
17
+ Requires-Dist: httpx (>=0.27.0,<1.0.0)
18
+ Requires-Dist: packaging (>=24.1)
19
+ Requires-Dist: python-dateutil (>=2.8.2)
20
+ Requires-Dist: typing-extensions (>=4.1.0)
21
+ Project-URL: Bug Tracker, https://github.com/xcloud-service/xservice/issues
22
+ Project-URL: Homepage, https://mlops.cloud/
23
+ Project-URL: Repository, https://github.com/xcloud-service/xservice
24
+ Description-Content-Type: text/markdown
25
+
26
+ # SDK
27
+
28
+ Software Development Kits for integrating with the XCloud Service API.
29
+
30
+ > [!NOTE] SDK Support
31
+ > SDKs provide type-safe, high-level interfaces for interacting with the platform API. They handle authentication, error handling, and request retries automatically.
32
+
33
+
34
+ ## Installation
35
+
36
+ Install the Python SDK from PyPI:
37
+
38
+ ```bash
39
+ pip install mlops-python-sdk
40
+ ```
41
+
42
+ ### Configuration
43
+
44
+ The SDK reads configuration from environment variables by default:
45
+
46
+ - `MLOPS_API_KEY`: API key (required)
47
+ - `MLOPS_DOMAIN`: API domain, e.g. `localhost:8090` or `https://example.com`
48
+ - `MLOPS_API_PATH`: API path prefix (default: `/api/v1`)
49
+ - `MLOPS_DEBUG`: `true|false` (default: `false`)
50
+
51
+ Or configure in code:
52
+
53
+ ```python
54
+ from mlops import ConnectionConfig, Task
55
+
56
+ config = ConnectionConfig(
57
+ api_key="xck_...",
58
+ domain="https://example.com",
59
+ api_path="/api/v1",
60
+ debug=False,
61
+ )
62
+ task = Task(config=config)
63
+ ```
64
+
65
+ ## SDK Usage
66
+
67
+ ### Initialize client
68
+
69
+ ```python
70
+ from mlops import Task
71
+
72
+ task = Task() # uses environment variables by default
73
+ ```
74
+
75
+ ### Submit a GPU task
76
+
77
+ ```python
78
+ from mlops import Task
79
+
80
+ task = Task()
81
+ resp = task.submit(
82
+ name="gpu-task-from-sdk",
83
+ cluster_name="slurm-cn",
84
+ team_id=1,
85
+ image="/mnt/minio/images/01ai-registry.cn-shanghai.cr.aliyuncs.com+public+llamafactory+0.9.3.sqsh",
86
+ entry_command="llamafactory-cli train /workspace/config/test_lora.yaml",
87
+ resources={
88
+ "partition": "gpu",
89
+ "nodes": 2,
90
+ "ntasks": 2,
91
+ "cpus_per_task": 2,
92
+ "memory": "4G",
93
+ "time": "01:00:00",
94
+ "gres": "gpu:nvidia_a10:1",
95
+ "qos": "qos_xcloud",
96
+ },
97
+ file_path="/path/to/xservice.zip", # optional: .zip/.tar.gz/.tgz
98
+ )
99
+ print(resp.job_id)
100
+ ```
101
+
102
+ ### Submit a CPU task
103
+
104
+ ```python
105
+ from mlops import Task
106
+
107
+ task = Task()
108
+ resp = task.submit(
109
+ name="cpu-task-from-sdk",
110
+ cluster_name="slurm-cn",
111
+ team_id=1,
112
+ image="docker://01ai-registry.cn-shanghai.cr.aliyuncs.com/01-ai/xcs/v2/alpine:3.23.0",
113
+ entry_command="echo hello",
114
+ resources={
115
+ "partition": "cpu",
116
+ "nodes": 1,
117
+ "ntasks": 1,
118
+ "cpus_per_task": 1,
119
+ "memory": "1G",
120
+ "time": "01:00:00",
121
+ "qos": "qos_xcloud",
122
+ },
123
+ )
124
+ print(resp.job_id)
125
+ ```
126
+
127
+ ### List tasks
128
+
129
+ ```python
130
+ from mlops import Task
131
+ from mlops.api.client.models.task_status import TaskStatus
132
+
133
+ task = Task()
134
+ resp = task.list(status=TaskStatus.COMPLETED, cluster_name="slurm-cn", page=1, page_size=20)
135
+ print(len(resp.tasks or []))
136
+ ```
137
+
138
+ ### Get task details
139
+
140
+ ```python
141
+ from mlops import Task
142
+
143
+ task = Task()
144
+ task_info = task.get(task_id=12345, cluster_name="slurm-cn")
145
+ print(task_info)
146
+ ```
147
+
148
+ ### Cancel a task
149
+
150
+ ```python
151
+ from mlops import Task
152
+
153
+ task = Task()
154
+ task.cancel(task_id=12345, cluster_name="slurm-cn")
155
+ ```
156
+
157
+ ### Delete a task
158
+
159
+ ```python
160
+ from mlops import Task
161
+
162
+ task = Task()
163
+ task.delete(task_id=12345, cluster_name="slurm-cn")
164
+ ```
165
+
166
+ **Task Management Methods:**
167
+
168
+ - `submit()` - Submit a new task with container image and entry command
169
+ - `get()` - Get task details by task ID
170
+ - `list()` - List tasks with optional filters (status, cluster_name, team_id, user_id)
171
+ - `cancel()` - Cancel a running task
172
+ - `delete()` - Delete a task record
173
+
174
+ **Task Status Values:**
175
+
176
+ ```python
177
+ from mlops.api.client.models.task_status import TaskStatus
178
+
179
+ TaskStatus.PENDING # Task is pending
180
+ TaskStatus.QUEUED # Task is queued
181
+ TaskStatus.RUNNING # Task is running
182
+ TaskStatus.COMPLETED # Task completed successfully
183
+ TaskStatus.SUCCEEDED # Task succeeded
184
+ TaskStatus.FAILED # Task failed
185
+ TaskStatus.CANCELLED # Task was cancelled
186
+ TaskStatus.CREATED # Task was created
187
+ ```
188
+
189
+ **Error Handling:**
190
+
191
+ ```python
192
+ from mlops.exceptions import (
193
+ APIException,
194
+ AuthenticationException,
195
+ NotFoundException,
196
+ RateLimitException,
197
+ TimeoutException,
198
+ InvalidArgumentException,
199
+ NotEnoughSpaceException
200
+ )
201
+ from mlops import Task
202
+
203
+ task = Task()
204
+
205
+ try:
206
+ result = task.submit(
207
+ name="test",
208
+ cluster_name="slurm-cn",
209
+ image="docker://alpine:3.23.0",
210
+ entry_command="echo hello",
211
+ )
212
+ except AuthenticationException as e:
213
+ print(f"Authentication failed: {e}")
214
+ except NotFoundException as e:
215
+ print(f"Resource not found: {e}")
216
+ except APIException as e:
217
+ print(f"API error: {e}")
218
+ ```
219
+
220
+ > [!TIP] Error Handling
221
+ > SDKs automatically parse typed responses and raise structured exceptions.
222
+
223
+ ## Features
224
+
225
+ - Type-safe API clients
226
+ - Automatic authentication
227
+ - Error handling
228
+ - Typed response parsing (generated models)
229
+ - Unexpected-status guard (optional)
230
+
231
+ ## Resources
232
+
233
+ - [Python SDK Documentation](https://github.com/xcloud-service/xservice/tree/main/client/python-sdk)
234
+ - [API Reference](https://xcloud-service.com/docs/api)
235
+
@@ -35,18 +35,18 @@ mlops/api/client/models/task_log_entry_log_type.py,sha256=uVqbF8RewyFkezY6sy28He
35
35
  mlops/api/client/models/task_logs_response.py,sha256=QEGRy51qB7t0K-EGusxzDmkDlAjdKkwHF92em3dLb1c,3557
36
36
  mlops/api/client/models/task_resources_type_0.py,sha256=36nxeOqAJS4ksfQtzoXigWVMhEV1Tnq5Z_64sHa3gGQ,1341
37
37
  mlops/api/client/models/task_status.py,sha256=Tht4F2UeBp-QBLhh-z0fEw45r5cBCfkFUro-la42BPY,315
38
- mlops/api/client/models/task_submit_request.py,sha256=8zbEK2Y_dT4S6Wflm6WTNn9f_f-SLQ7Sl92bS2H_T0c,22941
38
+ mlops/api/client/models/task_submit_request.py,sha256=g8THqxUjn0VD4fw8eo6I6qe9Eym6q9vmSSTFrhcUlbc,24803
39
39
  mlops/api/client/models/task_submit_request_environment_type_0.py,sha256=Wx6ye6vVHytSex186AeUm27-XMWMmZe6lbL2Ons2mkw,1454
40
40
  mlops/api/client/models/task_submit_response.py,sha256=EK3ZXxo_XO5Yn2zdOrR-VMPKg9om49qQ1ywS2Smgink,2200
41
41
  mlops/api/client/models/task_tres_type_0.py,sha256=rEaiQG7A19mlTIHDppzxuWa4oPfh9qsKjPhhVOlBf4g,1292
42
42
  mlops/api/client/models/task_tres_used_type_0.py,sha256=4w6An7-ZCqa8cc3SPi7mcwGK-ekT6AYq_dEdf8KzoYA,1320
43
43
  mlops/api/client/py.typed,sha256=8ZJUsxZiuOy1oJeVhsTWQhTG_6pTVHVXk5hJL79ebTk,25
44
44
  mlops/api/client/types.py,sha256=AX4orxQZQJat3vZrgjJ-TYb2sNBL8kNo9yqYDT-n8y8,1391
45
- mlops/connection_config.py,sha256=aU_8WwkMcomjt4dDyRk1Oyr92ywwuIhFLmv0oQ29KkM,2953
45
+ mlops/connection_config.py,sha256=_b9sVFGJtf1GynmIB4NtKCzg7kkgE-wSrsG3LwzlOqk,2946
46
46
  mlops/exceptions.py,sha256=3kfda-Rz0km9kV-gvnPCw7ueemWkXIGGdT0NXx6z9Xk,1680
47
47
  mlops/task/__init__.py,sha256=M983vMPLj3tZQNFXQyTP5I2RsRorFElezLeppr3WLsw,133
48
48
  mlops/task/client.py,sha256=V131WLVJl1raGAVixUhJCX8s1neN15mxAjQwO01qlIg,3552
49
- mlops/task/task.py,sha256=7QBSNpmI4jacWep2FaSZyA86wgsx-BiNAlBrKi-Razg,23450
50
- mlops_python_sdk-1.0.1.dist-info/METADATA,sha256=afpVJjsJ-TN-lmCym3ScsMJhrOdFoId06MBfLEfFbY4,9525
51
- mlops_python_sdk-1.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
52
- mlops_python_sdk-1.0.1.dist-info/RECORD,,
49
+ mlops/task/task.py,sha256=Eqb4XGMlFLjelg3js9Twoulf0Nlyn0pz5isuGl916vs,27756
50
+ mlops_python_sdk-1.0.3.dist-info/METADATA,sha256=KwMwLVAYfXBjKXXiU_p5TibVXGbli5gaxCCa0Wap9h4,5679
51
+ mlops_python_sdk-1.0.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
52
+ mlops_python_sdk-1.0.3.dist-info/RECORD,,
@@ -1,407 +0,0 @@
1
- Metadata-Version: 2.3
2
- Name: mlops-python-sdk
3
- Version: 1.0.1
4
- Summary: MLOps Python SDK for XCloud Service API
5
- License: MIT
6
- Author: mlops
7
- Author-email: mlops@example.com
8
- Requires-Python: >=3.9,<4.0
9
- Classifier: License :: OSI Approved :: MIT License
10
- Classifier: Programming Language :: Python :: 3
11
- Classifier: Programming Language :: Python :: 3.9
12
- Classifier: Programming Language :: Python :: 3.10
13
- Classifier: Programming Language :: Python :: 3.11
14
- Classifier: Programming Language :: Python :: 3.12
15
- Classifier: Programming Language :: Python :: 3.13
16
- Requires-Dist: attrs (>=23.2.0)
17
- Requires-Dist: httpx (>=0.27.0,<1.0.0)
18
- Requires-Dist: packaging (>=24.1)
19
- Requires-Dist: python-dateutil (>=2.8.2)
20
- Requires-Dist: typing-extensions (>=4.1.0)
21
- Project-URL: Bug Tracker, https://github.com/xcloud-service/xservice/issues
22
- Project-URL: Homepage, https://mlops.cloud/
23
- Project-URL: Repository, https://github.com/xcloud-service/xservice
24
- Description-Content-Type: text/markdown
25
-
26
- # MLOps Python SDK
27
-
28
- [MLOps](https://xcloud-service.com) Python SDK for XCloud Service API. Manage and execute tasks with confidence.
29
-
30
- ## Installation
31
-
32
- Install the SDK from PyPI:
33
-
34
- ```bash
35
- pip install mlops-python-sdk
36
- ```
37
-
38
- ## Quick Start
39
-
40
- ### 1. Setup Authentication
41
-
42
- You can authenticate using an API key.
43
-
44
- #### API Key (Recommended for programmatic access)
45
-
46
- 1. Sign up at [MLOps](https://xcloud-service.com)
47
- 2. Create an API key from [API Keys](https://xcloud-service.com/home/api-keys)
48
- 3. Set environment variables:
49
-
50
- ```bash
51
- export MLOPS_API_KEY=xck_******
52
- export MLOPS_DOMAIN=localhost:8090 # optional, default is localhost:8090
53
- ```
54
-
55
- ### 2. Basic Usage
56
-
57
- ```python
58
- from mlops import Task, ConnectionConfig
59
- from mlops.api.client.models.task_status import TaskStatus
60
-
61
- # Initialize Task client (uses environment variables by default)
62
- task = Task()
63
-
64
- # Or initialize with explicit configuration
65
- config = ConnectionConfig(
66
- api_key="xck_******",
67
- domain="localhost:8090",
68
- debug=False
69
- )
70
- task = Task(config=config)
71
-
72
- # Submit a task with script
73
- result = task.submit(
74
- name="my-training-task",
75
- cluster_id=1,
76
- script="#!/bin/bash\necho 'Hello World'",
77
- resources={"cpu": 4, "memory": "8GB", "gpu": 1}
78
- )
79
-
80
- # Or submit with command
81
- result = task.submit(
82
- name="my-task",
83
- cluster_id=1,
84
- command="python train.py",
85
- resources={"cpu": 4, "memory": "8GB"}
86
- )
87
-
88
- # Get task details
89
- task_info = task.get(task_id=result.job_id, cluster_id=1)
90
-
91
- # List tasks with filters
92
- running_tasks = task.list(
93
- status=TaskStatus.RUNNING,
94
- cluster_id=1,
95
- page=1,
96
- page_size=20
97
- )
98
-
99
- # Cancel a task
100
- task.cancel(task_id=result.job_id, cluster_id=1)
101
-
102
- # Delete a task
103
- task.delete(task_id=task_id, cluster_id=1)
104
- ```
105
-
106
- ## API Reference
107
-
108
- ### Task Class
109
-
110
- The `Task` class provides a high-level interface for managing tasks.
111
-
112
- #### Initialization
113
-
114
- ```python
115
- from mlops import Task, ConnectionConfig
116
-
117
- # Using environment variables
118
- task = Task()
119
-
120
- # With explicit configuration
121
- config = ConnectionConfig(
122
- api_key="xck_******", # API key for authentication
123
- domain="localhost:8090", # API domain
124
- debug=False, # Enable debug mode
125
- request_timeout=30.0 # Request timeout in seconds
126
- )
127
- task = Task(config=config)
128
-
129
- # Or pass parameters directly
130
- task = Task(
131
- api_key="xck_******",
132
- domain="localhost:8090"
133
- )
134
- ```
135
-
136
- #### Methods
137
-
138
- ##### `submit()`
139
-
140
- Submit a new task to the cluster.
141
-
142
- ```python
143
- result = task.submit(
144
- name: str, # Task name (required)
145
- cluster_id: int, # Cluster ID (required)
146
- script: Optional[str] = None, # Script content (script or command required)
147
- command: Optional[str] = None,# Command to execute (script or command required)
148
- resources: Optional[dict] = None, # Resource requirements
149
- team_id: Optional[int] = None # Team ID (optional)
150
- ) -> TaskSubmitResponse
151
- ```
152
-
153
- **Resources dictionary** can contain:
154
- - `cpu` or `cpus_per_task`: Number of CPUs
155
- - `memory`: Memory requirement (e.g., "8GB", "4096M")
156
- - `nodes`: Number of nodes
157
- - `gres`: GPU resources (e.g., "gpu:1")
158
- - `time`: Time limit (e.g., "1-00:00:00" for 1 day)
159
- - `partition`: Partition name
160
- - `tres`: TRES specification
161
-
162
- **Example:**
163
-
164
- ```python
165
- result = task.submit(
166
- name="ml-training",
167
- cluster_id=1,
168
- script="#!/bin/bash\npython train.py --epochs 100",
169
- resources={
170
- "cpu": 8,
171
- "memory": "16GB",
172
- "gpu": 1,
173
- "time": "2-00:00:00", # 2 days
174
- "partition": "gpu"
175
- }
176
- )
177
- print(f"Task submitted: Job ID = {result.job_id}")
178
- ```
179
-
180
- ##### `get()`
181
-
182
- Get task details by task ID.
183
-
184
- ```python
185
- task_info = task.get(
186
- task_id: int, # Task ID (Slurm job ID)
187
- cluster_id: int # Cluster ID (required)
188
- ) -> Task
189
- ```
190
-
191
- **Example:**
192
-
193
- ```python
194
- task_info = task.get(task_id=12345, cluster_id=1)
195
- print(f"Task status: {task_info.status}")
196
- print(f"Task name: {task_info.name}")
197
- ```
198
-
199
- ##### `list()`
200
-
201
- List tasks with optional filters and pagination.
202
-
203
- ```python
204
- tasks = task.list(
205
- page: int = 1, # Page number
206
- page_size: int = 20, # Items per page
207
- status: Optional[TaskStatus] = None, # Filter by status
208
- cluster_id: Optional[int] = None, # Filter by cluster ID
209
- team_id: Optional[int] = None, # Filter by team ID
210
- user_id: Optional[int] = None # Filter by user ID
211
- ) -> TaskListResponse
212
- ```
213
-
214
- **Example:**
215
-
216
- ```python
217
- from mlops.api.client.models.task_status import TaskStatus
218
-
219
- # List all running tasks
220
- running_tasks = task.list(status=TaskStatus.RUNNING)
221
-
222
- # List tasks in a specific cluster
223
- cluster_tasks = task.list(cluster_id=1, page=1, page_size=10)
224
-
225
- # List completed tasks with pagination
226
- completed = task.list(
227
- status=TaskStatus.COMPLETED,
228
- cluster_id=1,
229
- page=1,
230
- page_size=50
231
- )
232
- ```
233
-
234
- ##### `cancel()`
235
-
236
- Cancel a running task.
237
-
238
- ```python
239
- task.cancel(
240
- task_id: int, # Task ID (Slurm job ID)
241
- cluster_id: int # Cluster ID (required)
242
- )
243
- ```
244
-
245
- **Example:**
246
-
247
- ```python
248
- task.cancel(task_id=12345, cluster_id=1)
249
- ```
250
-
251
- ### TaskStatus Enum
252
-
253
- Task status values for filtering:
254
-
255
- ```python
256
- from mlops.api.client.models.task_status import TaskStatus
257
-
258
- TaskStatus.PENDING # Task is pending
259
- TaskStatus.QUEUED # Task is queued
260
- TaskStatus.RUNNING # Task is running
261
- TaskStatus.COMPLETED # Task completed successfully
262
- TaskStatus.SUCCEEDED # Task succeeded
263
- TaskStatus.FAILED # Task failed
264
- TaskStatus.CANCELLED # Task was cancelled
265
- TaskStatus.CREATED # Task was created
266
- ```
267
-
268
- ## Configuration
269
-
270
- ### Environment Variables
271
-
272
- The SDK reads configuration from environment variables:
273
-
274
- - `MLOPS_API_KEY`: API key for authentication
275
- - `MLOPS_DOMAIN`: API domain (default: `localhost:8090`)
276
- - `MLOPS_DEBUG`: Enable debug mode (`true`/`false`, default: `false`)
277
- - `MLOPS_API_PATH`: API path prefix (default: `/api/v1`)
278
-
279
- ### ConnectionConfig
280
-
281
- You can also configure the connection programmatically:
282
-
283
- ```python
284
- from mlops import ConnectionConfig
285
-
286
- config = ConnectionConfig(
287
- domain="api.example.com",
288
- api_key="xck_******",
289
- debug=True,
290
- request_timeout=60.0,
291
- api_path="/api/v1"
292
- )
293
- ```
294
-
295
- ## Error Handling
296
-
297
- The SDK provides specific exception types:
298
-
299
- ```python
300
- from mlops.exceptions import (
301
- APIException, # General API errors
302
- AuthenticationException, # Authentication failures
303
- NotFoundException, # Resource not found
304
- RateLimitException, # Rate limit exceeded
305
- TimeoutException, # Request timeout
306
- InvalidArgumentException # Invalid arguments
307
- )
308
-
309
- try:
310
- result = task.submit(name="test", cluster_id=1, command="echo hello")
311
- except AuthenticationException as e:
312
- print(f"Authentication failed: {e}")
313
- except NotFoundException as e:
314
- print(f"Resource not found: {e}")
315
- except APIException as e:
316
- print(f"API error: {e}")
317
- ```
318
-
319
- ## Examples
320
-
321
- ### Submit a Machine Learning Training Job
322
-
323
- ```python
324
- from mlops import Task
325
-
326
- task = Task()
327
-
328
- result = task.submit(
329
- name="pytorch-training",
330
- cluster_id=1,
331
- script="""#!/bin/bash
332
- #SBATCH --gres=gpu:1
333
- #SBATCH --cpus-per-task=2
334
- #SBATCH --mem=4GB
335
-
336
- python train.py --config config.yaml
337
- """,
338
- resources={
339
- "cpus_per_task": 2,
340
- "memory": "4GB",
341
- "gres": "gpu:1",
342
- "time": "1-00:00:00", # 1 days
343
- "partition": "gpu"
344
- }
345
- )
346
-
347
- print(f"Training job submitted: {result.job_id}")
348
- ```
349
-
350
- ### Monitor Task Status
351
-
352
- ```python
353
- from mlops import Task
354
- from mlops.api.client.models.task_status import TaskStatus
355
- import time
356
-
357
- task = Task()
358
- job_id = 12345
359
- cluster_id = 1
360
-
361
- while True:
362
- task_info = task.get(task_id=job_id, cluster_id=cluster_id)
363
- print(f"Status: {task_info.status}")
364
-
365
- if task_info.status in [TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.CANCELLED]:
366
- break
367
-
368
- time.sleep(10) # Check every 10 seconds
369
- ```
370
-
371
- ### List and Filter Tasks
372
-
373
- ```python
374
- from mlops import Task
375
- from mlops.api.client.models.task_status import TaskStatus
376
-
377
- task = Task()
378
-
379
- # Get all running tasks in cluster 1
380
- running = task.list(
381
- status=TaskStatus.RUNNING,
382
- cluster_id=1
383
- )
384
-
385
- for t in running.tasks:
386
- print(f"{t.name}: {t.status} (Job ID: {t.job_id})")
387
-
388
- # Get failed tasks
389
- failed = task.list(status=TaskStatus.FAILED)
390
-
391
- print(f"Total failed tasks: {failed.total}")
392
- ```
393
-
394
- ## Documentation
395
-
396
- - [MLOPS Documentation](https://xcloud-service.com/docs)
397
- - [API Reference](https://xcloud-service.com/docs/api)
398
-
399
- ## License
400
-
401
- MIT
402
-
403
- ## Support
404
-
405
- - [GitHub Issues](https://github.com/xcloud-service/xservice/issues)
406
- - [Documentation](https://xcloud-service.com/docs)
407
-