mlops-python-sdk 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mlops/api/client/models/task_submit_request.py CHANGED
@@ -29,12 +29,18 @@ class TaskSubmitRequest:
  cpus_per_task (Union[None, Unset, int]): CPUs per task Example: 1.
  dependency (Union[None, Unset, str]): Job dependencies Example: afterok:12345.
  distribution (Union[None, Unset, str]): Task distribution Example: block.
+ entry_command (Union[None, Unset, str]): Container entry command/script (bash snippet) executed inside the
+ container. The platform runs it under /workspace.
+ Example: python -V && ls -la.
  environment (Union['TaskSubmitRequestEnvironmentType0', None, Unset]): Environment variables as key-value pairs
  Example: {'CUDA_VISIBLE_DEVICES': '0,1', 'PYTHONPATH': '/opt/python/lib'}.
  error (Union[None, Unset, str]): Standard error file pattern Example: error_%j.log.
  exclude (Union[None, Unset, str]): Nodes to exclude
  export (Union[None, Unset, str]): Environment export Example: ALL.
  gres (Union[None, Unset, str]): Generic resources (e.g., "gpu:1", "gpu:tesla:2") Example: gpu:1.
+ image (Union[None, Unset, str]): Container image reference. Can be a Slurm container plugin supported reference
+ (e.g. "docker://..."), or a registry reference which will be mapped to a local .sqsh image path by the platform.
+ Example: 01ai-registry.cn-shanghai.cr.aliyuncs.com/public/llamafactory:0.9.3.
  input_ (Union[None, Unset, str]): Standard input file
  job_spec (Union[Unset, JobSpec]): Domain-specific job specification (rendered into slurm script)
  mem_bind (Union[None, Unset, str]): Memory binding
@@ -65,11 +71,13 @@ class TaskSubmitRequest:
  cpus_per_task: Union[None, Unset, int] = UNSET
  dependency: Union[None, Unset, str] = UNSET
  distribution: Union[None, Unset, str] = UNSET
+ entry_command: Union[None, Unset, str] = UNSET
  environment: Union["TaskSubmitRequestEnvironmentType0", None, Unset] = UNSET
  error: Union[None, Unset, str] = UNSET
  exclude: Union[None, Unset, str] = UNSET
  export: Union[None, Unset, str] = UNSET
  gres: Union[None, Unset, str] = UNSET
+ image: Union[None, Unset, str] = UNSET
  input_: Union[None, Unset, str] = UNSET
  job_spec: Union[Unset, "JobSpec"] = UNSET
  mem_bind: Union[None, Unset, str] = UNSET
@@ -143,6 +151,12 @@ class TaskSubmitRequest:
  else:
  distribution = self.distribution

+ entry_command: Union[None, Unset, str]
+ if isinstance(self.entry_command, Unset):
+ entry_command = UNSET
+ else:
+ entry_command = self.entry_command
+
  environment: Union[None, Unset, dict[str, Any]]
  if isinstance(self.environment, Unset):
  environment = UNSET
@@ -175,6 +189,12 @@ class TaskSubmitRequest:
  else:
  gres = self.gres

+ image: Union[None, Unset, str]
+ if isinstance(self.image, Unset):
+ image = UNSET
+ else:
+ image = self.image
+
  input_: Union[None, Unset, str]
  if isinstance(self.input_, Unset):
  input_ = UNSET
@@ -289,6 +309,8 @@ class TaskSubmitRequest:
  field_dict["dependency"] = dependency
  if distribution is not UNSET:
  field_dict["distribution"] = distribution
+ if entry_command is not UNSET:
+ field_dict["entry_command"] = entry_command
  if environment is not UNSET:
  field_dict["environment"] = environment
  if error is not UNSET:
@@ -299,6 +321,8 @@ class TaskSubmitRequest:
  field_dict["export"] = export
  if gres is not UNSET:
  field_dict["gres"] = gres
+ if image is not UNSET:
+ field_dict["image"] = image
  if input_ is not UNSET:
  field_dict["input"] = input_
  if job_spec is not UNSET:
@@ -416,6 +440,15 @@ class TaskSubmitRequest:

  distribution = _parse_distribution(d.pop("distribution", UNSET))

+ def _parse_entry_command(data: object) -> Union[None, Unset, str]:
+ if data is None:
+ return data
+ if isinstance(data, Unset):
+ return data
+ return cast(Union[None, Unset, str], data)
+
+ entry_command = _parse_entry_command(d.pop("entry_command", UNSET))
+
  def _parse_environment(data: object) -> Union["TaskSubmitRequestEnvironmentType0", None, Unset]:
  if data is None:
  return data
@@ -469,6 +502,15 @@ class TaskSubmitRequest:

  gres = _parse_gres(d.pop("gres", UNSET))

+ def _parse_image(data: object) -> Union[None, Unset, str]:
+ if data is None:
+ return data
+ if isinstance(data, Unset):
+ return data
+ return cast(Union[None, Unset, str], data)
+
+ image = _parse_image(d.pop("image", UNSET))
+
  def _parse_input_(data: object) -> Union[None, Unset, str]:
  if data is None:
  return data
@@ -615,11 +657,13 @@ class TaskSubmitRequest:
  cpus_per_task=cpus_per_task,
  dependency=dependency,
  distribution=distribution,
+ entry_command=entry_command,
  environment=environment,
  error=error,
  exclude=exclude,
  export=export,
  gres=gres,
+ image=image,
  input_=input_,
  job_spec=job_spec,
  mem_bind=mem_bind,
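The hunks above add two optional fields, `entry_command` and `image`, to the generated `TaskSubmitRequest` model; both default to `UNSET` and are only serialized when explicitly set. A minimal sketch of how they surface in the request payload, assuming the usual attrs-style constructor and `to_dict()` behaviour that the generated code above implies (field values are illustrative, not taken from this diff):

```python
# Sketch under assumptions: TaskSubmitRequest accepts its fields as keyword
# arguments and to_dict() omits UNSET fields, as the generated code above implies.
from mlops.api.client.models.task_submit_request import TaskSubmitRequest

req = TaskSubmitRequest(
    name="gpu-task-from-sdk",             # illustrative values
    cluster_name="slurm-cn",
    image="docker://alpine:3.19",         # new in 1.0.2
    entry_command="python -V && ls -la",  # new in 1.0.2, runs under /workspace
)

payload = req.to_dict()
# Only explicitly set fields appear, so the new keys are present here while
# unset ones (dependency, gres, ...) are omitted from the payload.
assert payload["image"] == "docker://alpine:3.19"
assert payload["entry_command"] == "python -V && ls -la"
```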
mlops/connection_config.py CHANGED
@@ -1,9 +1,9 @@
  import os

- from typing import Literal, Optional, Dict
+ from typing import Optional, Dict
  from httpx._types import ProxyTypes

- REQUEST_TIMEOUT: float = 30.0 # 30 seconds
+ REQUEST_TIMEOUT: float = 120.0 # 120 seconds

  KEEPALIVE_PING_INTERVAL_SEC = 50 # 50 seconds
  KEEPALIVE_PING_HEADER = "Keepalive-Ping-Interval"
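The functional change here is the default request timeout, raised from 30 to 120 seconds (the unused `Literal` import is also dropped). Callers that prefer the old, tighter timeout can presumably keep it via the `request_timeout` option that the 1.0.1 README documents on `ConnectionConfig`; a hedged sketch:

```python
from mlops import ConnectionConfig, Task

# 1.0.2 defaults to a 120-second request timeout (1.0.1 used 30 seconds).
# request_timeout is taken from the ConnectionConfig usage shown in the
# 1.0.1 README further down; treat it as an assumption, not a verified 1.0.2 API.
config = ConnectionConfig(
    api_key="xck_******",
    domain="https://example.com",
    request_timeout=30.0,  # seconds; explicit override of the new default
)
task = Task(config=config)
```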
mlops/task/task.py CHANGED
@@ -85,18 +85,25 @@ class Task:
  config = ConnectionConfig(api_key="your_api_key")
  task = Task(config=config)

- # Submit a task with script
+ # Submit a task with gpu type
  result = task.submit(
- name="my-task",
+ name="gpu-task-from-sdk",
  cluster_name="slurm-cn",
- script="#!/bin/bash\\necho 'Hello World'"
- )
-
- # Or submit with command
- result = task.submit(
- name="my-task",
- cluster_name="slurm-cn",
- command="echo 'Hello World'"
+ image="/mnt/minio/images/01ai-registry.cn-shanghai.cr.aliyuncs.com+public+llamafactory+0.9.3.sqsh",
+ entry_command="llamafactory-cli train /workspace/config/test_lora.yaml",
+ resources={
+ "partition": "gpu",
+ "nodes": 2,
+ "ntasks": 2,
+ "cpus_per_task": 2,
+ "memory": "4G",
+ "time": "01:00:00",
+ "gres": "gpu:nvidia_a10:1",
+ "qos": "qos_xcloud",
+ "job_type": "batch",
+ },
+ team_id=1,
+ file_path="your file path",
  )

  # Get task details
@@ -151,8 +158,8 @@ class Task:
  self,
  name: str,
  cluster_name: str,
- script: Optional[str] = None,
- command: Optional[str] = None,
+ image: str,
+ entry_command: str,
  resources: Optional[dict] = None,
  team_id: Optional[int] = None,
  file_path: Optional[str] = None,
@@ -163,11 +170,11 @@ class Task:
  Args:
  name: Task name
  cluster_name: Cluster name to submit the task to
- script: Task script content (optional, but at least one of script or command is required)
- command: Command to execute (optional, but at least one of script or command is required)
+ image: Container image reference
+ entry_command: Container entry command/script
  resources: Resource requirements dict (optional)
  team_id: Team ID (optional)
-
+ file_path: Local file path to upload (optional, support for .zip, .tar.gz, .tgz)
  Returns:
  TaskSubmitResponse containing the submitted task information

@@ -175,25 +182,14 @@ class Task:
  APIException: If the API returns an error
  AuthenticationException: If authentication fails
  """
- # At least one of script or command must be provided
- if not script and not command:
- raise APIException("At least one of 'script' or 'command' must be provided")
-
- # Map resources dict to individual fields
- # resources dict can contain: cpu, cpus_per_task, memory, nodes, gres, time, partition, etc.
-
  request_kwargs = {
  "name": name,
  "cluster_name": cluster_name,
+ "image": image,
+ "entry_command": entry_command,
  }
-
- # Handle script and command (at least one is required)
- # script is Union[Unset, str], so we need to set it or leave as UNSET
- if script:
- request_kwargs["script"] = script
- # command is Union[None, Unset, str], so we can set it or leave as UNSET
- if command:
- request_kwargs["command"] = command
+ # Map resources dict to individual fields
+ # resources dict can contain: cpu, cpus_per_task, memory, nodes, gres, time, partition, etc.

  # team_id is Union[None, Unset, int]
  if team_id is not None:
@@ -203,18 +199,34 @@ class Task:
  if resources:
  if "cpu" in resources or "cpus_per_task" in resources:
  request_kwargs["cpus_per_task"] = resources.get("cpus_per_task") or resources.get("cpu")
+ else:
+ request_kwargs["cpus_per_task"] = 1
  if "memory" in resources:
  request_kwargs["memory"] = resources.get("memory")
+ else:
+ request_kwargs["memory"] = "1G"
  if "nodes" in resources:
  request_kwargs["nodes"] = resources.get("nodes")
+ else:
+ request_kwargs["nodes"] = 1
  if "gres" in resources:
  request_kwargs["gres"] = resources.get("gres")
  if "time" in resources:
  request_kwargs["time"] = resources.get("time")
+ else:
+ request_kwargs["time"] = "01:00:00"
  if "partition" in resources:
  request_kwargs["partition"] = resources.get("partition")
- if "tres" in resources:
- request_kwargs["tres"] = resources.get("tres")
+ else:
+ request_kwargs["partition"] = "debug"
+ if "qos" in resources:
+ request_kwargs["qos"] = resources.get("qos")
+ else:
+ request_kwargs["qos"] = "qos_xcloud"
+ if "ntasks" in resources:
+ request_kwargs["ntasks"] = resources.get("ntasks")
+ else:
+ request_kwargs["ntasks"] = 1

  if file_path:
  local_path = _validate_archive_file_path(file_path)
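Two behavioural changes stand out in this file: `submit()` now requires `image` and `entry_command` instead of the old `script`/`command` pair, and it back-fills Slurm defaults for keys missing from a provided `resources` dict (`tres` support is dropped; `qos` and `ntasks` are added). A sketch of the defaulting as read from the hunk above, with illustrative values:

```python
from mlops import Task

task = Task()  # configured via environment variables

# Defaults applied by 1.0.2 when the corresponding key is absent from a
# non-empty resources dict (taken directly from the else branches above):
#   cpus_per_task=1, memory="1G", nodes=1, time="01:00:00",
#   partition="debug", qos="qos_xcloud", ntasks=1
result = task.submit(
    name="smoke-test",                          # illustrative values
    cluster_name="slurm-cn",
    image="docker://alpine:3.19",
    entry_command="echo hello from /workspace",
    resources={"gres": "gpu:1"},                # only gres is specified...
)
# ...so the request is filled out as a 1-node, 1-task, 1-CPU, 1G, 1-hour job
# on partition "debug" with qos "qos_xcloud". Note the `if resources:` guard:
# passing no resources dict (or an empty one) skips this defaulting entirely.
```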
mlops_python_sdk-1.0.2.dist-info/METADATA ADDED
@@ -0,0 +1,254 @@
+ Metadata-Version: 2.3
+ Name: mlops-python-sdk
+ Version: 1.0.2
+ Summary: MLOps Python SDK for XCloud Service API
+ License: MIT
+ Author: mlops
+ Author-email: mlops@example.com
+ Requires-Python: >=3.9,<4.0
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: attrs (>=23.2.0)
+ Requires-Dist: httpx (>=0.27.0,<1.0.0)
+ Requires-Dist: packaging (>=24.1)
+ Requires-Dist: python-dateutil (>=2.8.2)
+ Requires-Dist: typing-extensions (>=4.1.0)
+ Project-URL: Bug Tracker, https://github.com/xcloud-service/xservice/issues
+ Project-URL: Homepage, https://mlops.cloud/
+ Project-URL: Repository, https://github.com/xcloud-service/xservice
+ Description-Content-Type: text/markdown
+
+ # SDK
+
+ Software Development Kits for integrating with the XCloud Service API.
+
+ > [!NOTE] SDK Support
+ > SDKs provide type-safe, high-level interfaces for interacting with the platform API. They handle authentication, error handling, and request retries automatically.
+
+ ## Available SDKs
+
+ ### Python SDK
+
+ ### Installation
+
+ The Python SDK installation.
+
+ ```bash
+ pip install mlops-python-sdk
+ ```
+
+ ### Configuration
+
+ The SDK reads configuration from environment variables by default:
+
+ - `MLOPS_API_KEY`: API key (required)
+ - `MLOPS_DOMAIN`: API domain, e.g. `localhost:8090` or `https://example.com`
+ - `MLOPS_API_PATH`: API path prefix (default: `/api/v1`)
+ - `MLOPS_DEBUG`: `true|false` (default: `false`)
+
+ Or configure in code:
+
+ ```python
+ from mlops import ConnectionConfig, Task
+
+ config = ConnectionConfig(
+ api_key="xck_...",
+ domain="https://example.com",
+ api_path="/api/v1",
+ debug=False,
+ )
+ task = Task(config=config)
+ ```
+
+ ### Usage
+
+ ```python
+ from mlops import Task
+ from mlops.api.client.models.task_status import TaskStatus
+ from pathlib import Path
+
+ # Initialize Task client (uses environment variables by default)
+ task = Task()
+
+ # Submit a task with gpu type
+ try:
+ result = task.submit(
+ name="gpu-task-from-sdk",
+ image="/mnt/minio/images/01ai-registry.cn-shanghai.cr.aliyuncs.com+public+llamafactory+0.9.3.sqsh",
+ entry_command="llamafactory-cli train /workspace/config/test_lora.yaml",
+ resources={
+ "partition": "gpu",
+ "nodes": 2,
+ "ntasks": 2,
+ "cpus_per_task": 2,
+ "memory": "4G",
+ "time": "01:00:00",
+ "gres": "gpu:nvidia_a10:1",
+ "qos": "qos_xcloud",
+ },
+ cluster_name="slurm-cn",
+ team_id=1,
+ file_path="your file path", # optional, support for .zip, .tar.gz, .tgz
+ )
+
+ if result is not None:
+ print("==== gpu task submitted successfully ====")
+ job_id = result.job_id
+ else:
+ print("==== gpu task submitted failed ====")
+ except Exception as e:
+ print("==== gpu task submitted failed error ====", e)
+
+ # Submit a task with cpu type
+ try:
+ entry_content = Path("entry.sh").read_text(encoding="utf-8")
+ result = task.submit(
+ name="cpu-task-from-sdk",
+ image="docker://01ai-registry.cn-shanghai.cr.aliyuncs.com/01-ai/xcs/v2/alpine:3.23.0",
+ entry_command=entry_content,
+ resources={
+ "partition": "cpu",
+ "nodes": 1,
+ "ntasks": 1,
+ "cpus_per_task": 1,
+ "memory": "1G",
+ "time": "01:00:00",
+ "qos": "qos_xcloud",
+ },
+ cluster_name="slurm-cn",
+ team_id=1,
+ )
+
+ if result is not None:
+ print("==== cpu task submitted successfully ====")
+ job_id = result.job_id
+ else:
+ print("==== cpu task submitted failed ====")
+ except Exception as e:
+ print("==== cpu task submitted failed error ====", e)
+
+ # List tasks with filters
+ try:
+ completed_tasks = task.list(
+ status=TaskStatus.COMPLETED,
+ cluster_name="slurm-cn",
+ page=1,
+ page_size=20
+ )
+
+ # Get task details
+ if completed_tasks is not None and len(completed_tasks.tasks) > 0:
+ print("==== completed_tasks number ====", len(completed_tasks.tasks))
+ task_info = task.get(task_id=completed_tasks.tasks[0].job_id, cluster_name="slurm-cn")
+ print("==== task_info ====", task_info)
+ else:
+ print("==== no completed tasks to get details ====")
+ except Exception as e:
+ print("==== get task details failed error ====", e)
+
+
+ # Cancel a running task
+ try:
+ running_tasks = task.list(
+ status=TaskStatus.RUNNING,
+ cluster_name="slurm-cn",
+ page=1,
+ page_size=20
+ )
+ if running_tasks is not None and len(running_tasks.tasks) > 0:
+ print("==== running_tasks number ====", len(running_tasks.tasks))
+ # Cancel a task
+ result = task.cancel(task_id=running_tasks.tasks[0].job_id, cluster_name="slurm-cn")
+ print("==== task cancelled ====", running_tasks.tasks[0].job_id, result)
+ else:
+ print("==== no running tasks to cancel ====")
+ except Exception as e:
+ print("==== cancel running task failed error ====", e)
+
+
+ # Delete a task
+ try:
+ completed_tasks = task.list(
+ status=TaskStatus.COMPLETED,
+ cluster_name="slurm-cn",
+ page=1,
+ page_size=20
+ )
+ if completed_tasks is not None and len(completed_tasks.tasks) > 0:
+ print("==== completed_tasks number ====", len(completed_tasks.tasks))
+ # Delete a task
+ result = task.delete(task_id=completed_tasks.tasks[0].job_id, cluster_name="slurm-cn")
+ print("==== task deleted ====", completed_tasks.tasks[0].job_id, result)
+ else:
+ print("==== no completed tasks to delete ====")
+ except Exception as e:
+ print("==== delete completed task failed error ====", e)
+ ```
+
+ **Task Management Methods:**
+
+ - `submit()` - Submit a new task with container image and entry command
+ - `get()` - Get task details by task ID
+ - `list()` - List tasks with optional filters (status, cluster_name, team_id, user_id)
+ - `cancel()` - Cancel a running task
+ - `delete()` - Delete a task record
+
+ **Task Status Values:**
+
+ ```python
+ from mlops.api.client.models.task_status import TaskStatus
+
+ TaskStatus.PENDING # Task is pending
+ TaskStatus.QUEUED # Task is queued
+ TaskStatus.RUNNING # Task is running
+ TaskStatus.COMPLETED # Task completed successfully
+ TaskStatus.SUCCEEDED # Task succeeded
+ TaskStatus.FAILED # Task failed
+ TaskStatus.CANCELLED # Task was cancelled
+ TaskStatus.CREATED # Task was created
+ ```
+
+ **Error Handling:**
+
+ ```python
+ from mlops.exceptions import (
+ APIException,
+ AuthenticationException,
+ NotFoundException,
+ RateLimitException,
+ TimeoutException,
+ InvalidArgumentException,
+ NotEnoughSpaceException
+ )
+
+ try:
+ result = task.submit(name="test", cluster_name="slurm-cn", command="echo hello")
+ except AuthenticationException as e:
+ print(f"Authentication failed: {e}")
+ except NotFoundException as e:
+ print(f"Resource not found: {e}")
+ except APIException as e:
+ print(f"API error: {e}")
+ ```
+
+ > [!TIP] Error Handling
+ > SDKs automatically handle common errors and retry failed requests. Check SDK documentation for error handling best practices.
+
+ ## Features
+
+ - Type-safe API clients
+ - Automatic authentication
+ - Error handling
+ - Request retry logic
+ - Response validation
+
+ ## Resources
+
+ - [Python SDK Documentation](https://github.com/xcloud-service/xservice/tree/main/client/python-sdk)
+ - [API Reference](https://xcloud-service.com/docs/api)
+
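One inconsistency worth noting in the new README: its error-handling snippet still calls `task.submit(..., command="echo hello")`, while the 1.0.2 `submit()` signature in the task.py diff above takes `image` and `entry_command` and no longer accepts `command`. A corrected sketch of that snippet (the image reference is illustrative):

```python
from mlops import Task
from mlops.exceptions import APIException, AuthenticationException, NotFoundException

task = Task()

try:
    # Signature per the 1.0.2 task.py diff; values are illustrative.
    result = task.submit(
        name="test",
        cluster_name="slurm-cn",
        image="docker://alpine:3.19",
        entry_command="echo hello",
    )
except AuthenticationException as e:
    print(f"Authentication failed: {e}")
except NotFoundException as e:
    print(f"Resource not found: {e}")
except APIException as e:
    print(f"API error: {e}")
```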
mlops_python_sdk-1.0.1.dist-info/RECORD → mlops_python_sdk-1.0.2.dist-info/RECORD CHANGED
@@ -35,18 +35,18 @@ mlops/api/client/models/task_log_entry_log_type.py,sha256=uVqbF8RewyFkezY6sy28He
  mlops/api/client/models/task_logs_response.py,sha256=QEGRy51qB7t0K-EGusxzDmkDlAjdKkwHF92em3dLb1c,3557
  mlops/api/client/models/task_resources_type_0.py,sha256=36nxeOqAJS4ksfQtzoXigWVMhEV1Tnq5Z_64sHa3gGQ,1341
  mlops/api/client/models/task_status.py,sha256=Tht4F2UeBp-QBLhh-z0fEw45r5cBCfkFUro-la42BPY,315
- mlops/api/client/models/task_submit_request.py,sha256=8zbEK2Y_dT4S6Wflm6WTNn9f_f-SLQ7Sl92bS2H_T0c,22941
+ mlops/api/client/models/task_submit_request.py,sha256=g8THqxUjn0VD4fw8eo6I6qe9Eym6q9vmSSTFrhcUlbc,24803
  mlops/api/client/models/task_submit_request_environment_type_0.py,sha256=Wx6ye6vVHytSex186AeUm27-XMWMmZe6lbL2Ons2mkw,1454
  mlops/api/client/models/task_submit_response.py,sha256=EK3ZXxo_XO5Yn2zdOrR-VMPKg9om49qQ1ywS2Smgink,2200
  mlops/api/client/models/task_tres_type_0.py,sha256=rEaiQG7A19mlTIHDppzxuWa4oPfh9qsKjPhhVOlBf4g,1292
  mlops/api/client/models/task_tres_used_type_0.py,sha256=4w6An7-ZCqa8cc3SPi7mcwGK-ekT6AYq_dEdf8KzoYA,1320
  mlops/api/client/py.typed,sha256=8ZJUsxZiuOy1oJeVhsTWQhTG_6pTVHVXk5hJL79ebTk,25
  mlops/api/client/types.py,sha256=AX4orxQZQJat3vZrgjJ-TYb2sNBL8kNo9yqYDT-n8y8,1391
- mlops/connection_config.py,sha256=aU_8WwkMcomjt4dDyRk1Oyr92ywwuIhFLmv0oQ29KkM,2953
+ mlops/connection_config.py,sha256=_b9sVFGJtf1GynmIB4NtKCzg7kkgE-wSrsG3LwzlOqk,2946
  mlops/exceptions.py,sha256=3kfda-Rz0km9kV-gvnPCw7ueemWkXIGGdT0NXx6z9Xk,1680
  mlops/task/__init__.py,sha256=M983vMPLj3tZQNFXQyTP5I2RsRorFElezLeppr3WLsw,133
  mlops/task/client.py,sha256=V131WLVJl1raGAVixUhJCX8s1neN15mxAjQwO01qlIg,3552
- mlops/task/task.py,sha256=7QBSNpmI4jacWep2FaSZyA86wgsx-BiNAlBrKi-Razg,23450
- mlops_python_sdk-1.0.1.dist-info/METADATA,sha256=afpVJjsJ-TN-lmCym3ScsMJhrOdFoId06MBfLEfFbY4,9525
- mlops_python_sdk-1.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
- mlops_python_sdk-1.0.1.dist-info/RECORD,,
+ mlops/task/task.py,sha256=Y_lWpIVY9Wq-2iuaoZYuskcWHasUzLSpXi9fkwn7S3s,23882
+ mlops_python_sdk-1.0.2.dist-info/METADATA,sha256=lBkRytOiRISGMHHzk93fijbmF9EC9iKSpHm-6I9QNsM,7637
+ mlops_python_sdk-1.0.2.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ mlops_python_sdk-1.0.2.dist-info/RECORD,,
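For reference, each RECORD row is `path,sha256=<digest>,size`, where the digest is the URL-safe base64 of the file's SHA-256 with padding stripped. This is the standard wheel RECORD convention, not anything specific to this SDK; a small sketch for checking an installed file against its row:

```python
import base64
import hashlib
from pathlib import Path


def record_digest(path: str) -> str:
    """RECORD-style digest: urlsafe base64 of the SHA-256, '=' padding removed."""
    raw = hashlib.sha256(Path(path).read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii")


# Compare against the 1.0.2 row above, e.g.:
# mlops/task/task.py,sha256=Y_lWpIVY9Wq-2iuaoZYuskcWHasUzLSpXi9fkwn7S3s,23882
print(record_digest("mlops/task/task.py"))
```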
mlops_python_sdk-1.0.1.dist-info/METADATA DELETED
@@ -1,407 +0,0 @@
- Metadata-Version: 2.3
- Name: mlops-python-sdk
- Version: 1.0.1
- Summary: MLOps Python SDK for XCloud Service API
- License: MIT
- Author: mlops
- Author-email: mlops@example.com
- Requires-Python: >=3.9,<4.0
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
- Requires-Dist: attrs (>=23.2.0)
- Requires-Dist: httpx (>=0.27.0,<1.0.0)
- Requires-Dist: packaging (>=24.1)
- Requires-Dist: python-dateutil (>=2.8.2)
- Requires-Dist: typing-extensions (>=4.1.0)
- Project-URL: Bug Tracker, https://github.com/xcloud-service/xservice/issues
- Project-URL: Homepage, https://mlops.cloud/
- Project-URL: Repository, https://github.com/xcloud-service/xservice
- Description-Content-Type: text/markdown
-
- # MLOps Python SDK
-
- [MLOps](https://xcloud-service.com) Python SDK for XCloud Service API. Manage and execute tasks with confidence.
-
- ## Installation
-
- Install the SDK from PyPI:
-
- ```bash
- pip install mlops-python-sdk
- ```
-
- ## Quick Start
-
- ### 1. Setup Authentication
-
- You can authenticate using either an API Key.
-
- #### API Key (Recommended for programmatic access)
-
- 1. Sign up at [MLOps](https://xcloud-service.com)
- 2. Create an API key from [API Keys](https://xcloud-service.com/home/api-keys)
- 3. Set environment variables:
-
- ```bash
- export MLOPS_API_KEY=xck_******
- export MLOPS_DOMAIN=localhost:8090 # optional, default is localhost:8090
- ```
-
- ### 2. Basic Usage
-
- ```python
- from mlops import Task, ConnectionConfig
- from mlops.api.client.models.task_status import TaskStatus
-
- # Initialize Task client (uses environment variables by default)
- task = Task()
-
- # Or initialize with explicit configuration
- config = ConnectionConfig(
- api_key="xck_******",
- domain="localhost:8090",
- debug=False
- )
- task = Task(config=config)
-
- # Submit a task with script
- result = task.submit(
- name="my-training-task",
- cluster_id=1,
- script="#!/bin/bash\necho 'Hello World'",
- resources={"cpu": 4, "memory": "8GB", "gpu": 1}
- )
-
- # Or submit with command
- result = task.submit(
- name="my-task",
- cluster_id=1,
- command="python train.py",
- resources={"cpu": 4, "memory": "8GB"}
- )
-
- # Get task details
- task_info = task.get(task_id=result.job_id, cluster_id=1)
-
- # List tasks with filters
- running_tasks = task.list(
- status=TaskStatus.RUNNING,
- cluster_id=1,
- page=1,
- page_size=20
- )
-
- # Cancel a task
- task.cancel(task_id=result.job_id, cluster_id=1)
-
- # Delete a task
- task.delete(task_id=task_id, cluster_id=1)
- ```
-
- ## API Reference
-
- ### Task Class
-
- The `Task` class provides a high-level interface for managing tasks.
-
- #### Initialization
-
- ```python
- from mlops import Task, ConnectionConfig
-
- # Using environment variables
- task = Task()
-
- # With explicit configuration
- config = ConnectionConfig(
- api_key="xck_******", # API key for authentication
- domain="localhost:8090", # API domain
- debug=False, # Enable debug mode
- request_timeout=30.0 # Request timeout in seconds
- )
- task = Task(config=config)
-
- # Or pass parameters directly
- task = Task(
- api_key="xck_******",
- domain="localhost:8090"
- )
- ```
-
- #### Methods
-
- ##### `submit()`
-
- Submit a new task to the cluster.
-
- ```python
- result = task.submit(
- name: str, # Task name (required)
- cluster_id: int, # Cluster ID (required)
- script: Optional[str] = None, # Script content (script or command required)
- command: Optional[str] = None,# Command to execute (script or command required)
- resources: Optional[dict] = None, # Resource requirements
- team_id: Optional[int] = None # Team ID (optional)
- ) -> TaskSubmitResponse
- ```
-
- **Resources dictionary** can contain:
- - `cpu` or `cpus_per_task`: Number of CPUs
- - `memory`: Memory requirement (e.g., "8GB", "4096M")
- - `nodes`: Number of nodes
- - `gres`: GPU resources (e.g., "gpu:1")
- - `time`: Time limit (e.g., "1-00:00:00" for 1 day)
- - `partition`: Partition name
- - `tres`: TRES specification
-
- **Example:**
-
- ```python
- result = task.submit(
- name="ml-training",
- cluster_id=1,
- script="#!/bin/bash\npython train.py --epochs 100",
- resources={
- "cpu": 8,
- "memory": "16GB",
- "gpu": 1,
- "time": "2-00:00:00", # 2 days
- "partition": "gpu"
- }
- )
- print(f"Task submitted: Job ID = {result.job_id}")
- ```
-
- ##### `get()`
-
- Get task details by task ID.
-
- ```python
- task_info = task.get(
- task_id: int, # Task ID (Slurm job ID)
- cluster_id: int # Cluster ID (required)
- ) -> Task
- ```
-
- **Example:**
-
- ```python
- task_info = task.get(task_id=12345, cluster_id=1)
- print(f"Task status: {task_info.status}")
- print(f"Task name: {task_info.name}")
- ```
-
- ##### `list()`
-
- List tasks with optional filters and pagination.
-
- ```python
- tasks = task.list(
- page: int = 1, # Page number
- page_size: int = 20, # Items per page
- status: Optional[TaskStatus] = None, # Filter by status
- cluster_id: Optional[int] = None, # Filter by cluster ID
- team_id: Optional[int] = None, # Filter by team ID
- user_id: Optional[int] = None # Filter by user ID
- ) -> TaskListResponse
- ```
-
- **Example:**
-
- ```python
- from mlops.api.client.models.task_status import TaskStatus
-
- # List all running tasks
- running_tasks = task.list(status=TaskStatus.RUNNING)
-
- # List tasks in a specific cluster
- cluster_tasks = task.list(cluster_id=1, page=1, page_size=10)
-
- # List completed tasks with pagination
- completed = task.list(
- status=TaskStatus.COMPLETED,
- cluster_id=1,
- page=1,
- page_size=50
- )
- ```
-
- ##### `cancel()`
-
- Cancel a running task.
-
- ```python
- task.cancel(
- task_id: int, # Task ID (Slurm job ID)
- cluster_id: int # Cluster ID (required)
- )
- ```
-
- **Example:**
-
- ```python
- task.cancel(task_id=12345, cluster_id=1)
- ```
-
- ### TaskStatus Enum
-
- Task status values for filtering:
-
- ```python
- from mlops.api.client.models.task_status import TaskStatus
-
- TaskStatus.PENDING # Task is pending
- TaskStatus.QUEUED # Task is queued
- TaskStatus.RUNNING # Task is running
- TaskStatus.COMPLETED # Task completed successfully
- TaskStatus.SUCCEEDED # Task succeeded
- TaskStatus.FAILED # Task failed
- TaskStatus.CANCELLED # Task was cancelled
- TaskStatus.CREATED # Task was created
- ```
-
- ## Configuration
-
- ### Environment Variables
-
- The SDK reads configuration from environment variables:
-
- - `MLOPS_API_KEY`: API key for authentication
- - `MLOPS_DOMAIN`: API domain (default: `localhost:8090`)
- - `MLOPS_DEBUG`: Enable debug mode (`true`/`false`, default: `false`)
- - `MLOPS_API_PATH`: API path prefix (default: `/api/v1`)
-
- ### ConnectionConfig
-
- You can also configure the connection programmatically:
-
- ```python
- from mlops import ConnectionConfig
-
- config = ConnectionConfig(
- domain="api.example.com",
- api_key="xck_******",
- debug=True,
- request_timeout=60.0,
- api_path="/api/v1"
- )
- ```
-
- ## Error Handling
-
- The SDK provides specific exception types:
-
- ```python
- from mlops.exceptions import (
- APIException, # General API errors
- AuthenticationException, # Authentication failures
- NotFoundException, # Resource not found
- RateLimitException, # Rate limit exceeded
- TimeoutException, # Request timeout
- InvalidArgumentException # Invalid arguments
- )
-
- try:
- result = task.submit(name="test", cluster_id=1, command="echo hello")
- except AuthenticationException as e:
- print(f"Authentication failed: {e}")
- except NotFoundException as e:
- print(f"Resource not found: {e}")
- except APIException as e:
- print(f"API error: {e}")
- ```
-
- ## Examples
-
- ### Submit a Machine Learning Training Job
-
- ```python
- from mlops import Task
-
- task = Task()
-
- result = task.submit(
- name="pytorch-training",
- cluster_id=1,
- script="""#!/bin/bash
- #SBATCH --gres=gpu:1
- #SBATCH --cpus-per-task=2
- #SBATCH --mem=4GB
-
- python train.py --config config.yaml
- """,
- resources={
- "cpus_per_task": 2,
- "memory": "4GB",
- "gres": "gpu:1",
- "time": "1-00:00:00", # 1 days
- "partition": "gpu"
- }
- )
-
- print(f"Training job submitted: {result.job_id}")
- ```
-
- ### Monitor Task Status
-
- ```python
- from mlops import Task
- from mlops.api.client.models.task_status import TaskStatus
- import time
-
- task = Task()
- job_id = 12345
- cluster_id = 1
-
- while True:
- task_info = task.get(task_id=job_id, cluster_id=cluster_id)
- print(f"Status: {task_info.status}")
-
- if task_info.status in [TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.CANCELLED]:
- break
-
- time.sleep(10) # Check every 10 seconds
- ```
-
- ### List and Filter Tasks
-
- ```python
- from mlops import Task
- from mlops.api.client.models.task_status import TaskStatus
-
- task = Task()
-
- # Get all running tasks in cluster 1
- running = task.list(
- status=TaskStatus.RUNNING,
- cluster_id=1
- )
-
- for t in running.tasks:
- print(f"{t.name}: {t.status} (Job ID: {t.job_id})")
-
- # Get failed tasks
- failed = task.list(status=TaskStatus.FAILED)
-
- print(f"Total failed tasks: {failed.total}")
- ```
-
- ## Documentation
-
- - [MLOPS Documentation](https://xcloud-service.com/docs)
- - [API Reference](https://xcloud-service.com/docs/api)
-
- ## License
-
- MIT
-
- ## Support
-
- - [GitHub Issues](https://github.com/xcloud-service/xservice/issues)
- - [Documentation](https://xcloud-service.com/docs)