mlops-python-sdk 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. mlops/__init__.py +3 -3
  2. mlops/api/client/api/storage/__init__.py +1 -0
  3. mlops/api/client/api/storage/get_storage_presign_download.py +175 -0
  4. mlops/api/client/api/storage/get_storage_presign_upload.py +175 -0
  5. mlops/api/client/api/tasks/cancel_task.py +14 -14
  6. mlops/api/client/api/tasks/delete_task.py +14 -14
  7. mlops/api/client/api/tasks/get_task.py +15 -15
  8. mlops/api/client/api/tasks/get_task_by_task_id.py +204 -0
  9. mlops/api/client/api/tasks/get_task_logs.py +300 -0
  10. mlops/api/client/api/tasks/list_tasks.py +14 -14
  11. mlops/api/client/models/__init__.py +16 -0
  12. mlops/api/client/models/get_storage_presign_download_response_200.py +60 -0
  13. mlops/api/client/models/get_storage_presign_upload_response_200.py +79 -0
  14. mlops/api/client/models/get_task_logs_direction.py +9 -0
  15. mlops/api/client/models/get_task_logs_log_type.py +10 -0
  16. mlops/api/client/models/log_pagination.py +90 -0
  17. mlops/api/client/models/task_log_entry.py +105 -0
  18. mlops/api/client/models/task_log_entry_log_type.py +9 -0
  19. mlops/api/client/models/task_logs_response.py +112 -0
  20. mlops/api/client/models/task_submit_request.py +50 -6
  21. mlops/connection_config.py +2 -9
  22. mlops/exceptions.py +10 -10
  23. mlops/task/__init__.py +1 -1
  24. mlops/task/client.py +11 -35
  25. mlops/task/task.py +194 -64
  26. mlops_python_sdk-1.0.2.dist-info/METADATA +254 -0
  27. mlops_python_sdk-1.0.2.dist-info/RECORD +52 -0
  28. mlops_python_sdk-1.0.0.dist-info/METADATA +0 -416
  29. mlops_python_sdk-1.0.0.dist-info/RECORD +0 -39
  30. {mlops_python_sdk-1.0.0.dist-info → mlops_python_sdk-1.0.2.dist-info}/WHEEL +0 -0
mlops/task/task.py CHANGED
@@ -1,12 +1,17 @@
1
1
  """
2
- High-level Task SDK interface for XClient.
2
+ High-level Task SDK interface for MLOps.
3
3
 
4
- This module provides a convenient interface for managing tasks through the XClient API.
5
- """
4
+ This module provides a convenient interface for managing tasks through the MLOps API.
5
+ """
6
6
 
7
7
  import json
8
+ import os
8
9
  from http import HTTPStatus
10
+ from pathlib import Path
9
11
  from typing import Optional
12
+
13
+ import httpx
14
+
10
15
  from ..api.client.api.tasks import (
11
16
  submit_task,
12
17
  get_task,
@@ -14,8 +19,15 @@ from ..api.client.api.tasks import (
14
19
  cancel_task,
15
20
  delete_task,
16
21
  )
22
+ from ..api.client.api.storage import (
23
+ get_storage_presign_upload,
24
+ get_storage_presign_download,
25
+ )
17
26
  from ..api.client.models.task import Task as TaskModel
18
27
  from ..api.client.models.task_submit_request import TaskSubmitRequest
28
+ from ..api.client.models.task_submit_request_environment_type_0 import (
29
+ TaskSubmitRequestEnvironmentType0,
30
+ )
19
31
  from ..api.client.models.task_submit_response import TaskSubmitResponse
20
32
  from ..api.client.models.task_list_response import TaskListResponse
21
33
  from ..api.client.models.task_status import TaskStatus
@@ -29,42 +41,82 @@ from ..exceptions import (
29
41
  from .client import TaskClient, handle_api_exception
30
42
 
31
43
 
44
+ def _validate_archive_file_path(file_path: str) -> Path:
45
+ p = Path(os.path.expanduser(file_path)).resolve()
46
+ if not p.exists():
47
+ raise APIException(f"File not found: {p}")
48
+ if not p.is_file():
49
+ raise APIException(f"file_path must be a file: {p}")
50
+
51
+ lower = p.name.lower()
52
+ if not (lower.endswith(".zip") or lower.endswith(".tar.gz") or lower.endswith(".tgz")):
53
+ raise APIException(f"file_path must be one of .zip, .tar.gz, .tgz: {p}")
54
+ return p
55
+
56
+
57
+ def _upload_file_to_presigned_url(url: str, file_path: Path, timeout: Optional[float]) -> None:
58
+ size = file_path.stat().st_size
59
+ # Use a dedicated client for S3 presigned upload (avoid leaking API auth headers).
60
+ with httpx.Client(timeout=timeout) as client:
61
+ with file_path.open("rb") as f:
62
+ resp = client.put(
63
+ url,
64
+ content=f,
65
+ headers={
66
+ "Content-Length": str(size),
67
+ "Content-Type": "application/octet-stream",
68
+ },
69
+ )
70
+ if resp.status_code < 200 or resp.status_code >= 300:
71
+ body = (resp.text or "")[:2048]
72
+ raise APIException(
73
+ f"Failed to upload file to presigned url: HTTP {resp.status_code}: {body}"
74
+ )
75
+
76
+
32
77
  class Task:
33
78
  """
34
79
  High-level interface for managing tasks.
35
80
 
36
81
  Example:
37
82
  ```python
38
- from xclient import Task, ConnectionConfig
83
+ from mlops import Task, ConnectionConfig
39
84
 
40
85
  config = ConnectionConfig(api_key="your_api_key")
41
86
  task = Task(config=config)
42
87
 
43
- # Submit a task with script
88
+ # Submit a task with gpu type
44
89
  result = task.submit(
45
- name="my-task",
46
- cluster_id=1,
47
- script="#!/bin/bash\\necho 'Hello World'"
48
- )
49
-
50
- # Or submit with command
51
- result = task.submit(
52
- name="my-task",
53
- cluster_id=1,
54
- command="echo 'Hello World'"
90
+ name="gpu-task-from-sdk",
91
+ cluster_name="slurm-cn",
92
+ image="/mnt/minio/images/01ai-registry.cn-shanghai.cr.aliyuncs.com+public+llamafactory+0.9.3.sqsh",
93
+ entry_command="llamafactory-cli train /workspace/config/test_lora.yaml",
94
+ resources={
95
+ "partition": "gpu",
96
+ "nodes": 2,
97
+ "ntasks": 2,
98
+ "cpus_per_task": 2,
99
+ "memory": "4G",
100
+ "time": "01:00:00",
101
+ "gres": "gpu:nvidia_a10:1",
102
+ "qos": "qos_xcloud",
103
+ "job_type": "batch",
104
+ },
105
+ team_id=1,
106
+ file_path="your file path",
55
107
  )
56
108
 
57
109
  # Get task details
58
- task_info = task.get(task_id=result.job_id, cluster_id=1)
110
+ task_info = task.get(task_id=result.job_id, cluster_name="slurm-cn")
59
111
 
60
112
  # List tasks
61
113
  tasks = task.list(status=TaskStatus.RUNNING)
62
114
 
63
115
  # Cancel a task
64
- task.cancel(task_id=result.job_id, cluster_id=1)
116
+ task.cancel(task_id=result.job_id, cluster_name="slurm-cn")
65
117
 
66
118
  # Delete a task
67
- task.delete(task_id=result.job_id, cluster_id=1)
119
+ task.delete(task_id=result.job_id, cluster_name="slurm-cn")
68
120
  ```
69
121
  """
70
122
 
@@ -72,7 +124,6 @@ class Task:
72
124
  self,
73
125
  config: Optional["ConnectionConfig"] = None,
74
126
  api_key: Optional[str] = None,
75
- access_token: Optional[str] = None,
76
127
  domain: Optional[str] = None,
77
128
  debug: Optional[bool] = None,
78
129
  request_timeout: Optional[float] = None,
@@ -83,7 +134,6 @@ class Task:
83
134
  Args:
84
135
  config: ConnectionConfig instance. If not provided, a new one will be created.
85
136
  api_key: API key for authentication. Overrides config.api_key.
86
- access_token: Access token for authentication. Overrides config.access_token.
87
137
  domain: API domain. Overrides config.domain.
88
138
  debug: Enable debug mode. Overrides config.debug.
89
139
  request_timeout: Request timeout in seconds. Overrides config.request_timeout.
@@ -95,8 +145,6 @@ class Task:
95
145
  # Override config values if provided
96
146
  if api_key is not None:
97
147
  config.api_key = api_key
98
- if access_token is not None:
99
- config.access_token = access_token
100
148
  if domain is not None:
101
149
  config.domain = domain
102
150
  if debug is not None:
@@ -106,27 +154,27 @@ class Task:
106
154
 
107
155
  self._config = config
108
156
  self._client = TaskClient(config=config)
109
-
110
157
  def submit(
111
158
  self,
112
159
  name: str,
113
- cluster_id: Optional[int] = None,
114
- script: Optional[str] = None,
115
- command: Optional[str] = None,
160
+ cluster_name: str,
161
+ image: str,
162
+ entry_command: str,
116
163
  resources: Optional[dict] = None,
117
164
  team_id: Optional[int] = None,
165
+ file_path: Optional[str] = None,
118
166
  ) -> TaskSubmitResponse:
119
167
  """
120
168
  Submit a new task.
121
169
 
122
170
  Args:
123
171
  name: Task name
124
- cluster_id: Cluster ID to submit the task to
125
- script: Task script content (optional, but at least one of script or command is required)
126
- command: Command to execute (optional, but at least one of script or command is required)
172
+ cluster_name: Cluster name to submit the task to
173
+ image: Container image reference
174
+ entry_command: Container entry command/script
127
175
  resources: Resource requirements dict (optional)
128
176
  team_id: Team ID (optional)
129
-
177
+ file_path: Local file path to upload (optional, support for .zip, .tar.gz, .tgz)
130
178
  Returns:
131
179
  TaskSubmitResponse containing the submitted task information
132
180
 
@@ -134,28 +182,14 @@ class Task:
134
182
  APIException: If the API returns an error
135
183
  AuthenticationException: If authentication fails
136
184
  """
137
- # Validate required fields
138
- if cluster_id is None:
139
- raise APIException("cluster_id is required")
140
-
141
- # At least one of script or command must be provided
142
- if not script and not command:
143
- raise APIException("At least one of 'script' or 'command' must be provided")
144
-
145
- # Map resources dict to individual fields
146
- # resources dict can contain: cpu, cpus_per_task, memory, nodes, gres, time, partition, etc.
147
185
  request_kwargs = {
148
186
  "name": name,
149
- "cluster_id": cluster_id,
187
+ "cluster_name": cluster_name,
188
+ "image": image,
189
+ "entry_command": entry_command,
150
190
  }
151
-
152
- # Handle script and command (at least one is required)
153
- # script is Union[Unset, str], so we need to set it or leave as UNSET
154
- if script:
155
- request_kwargs["script"] = script
156
- # command is Union[None, Unset, str], so we can set it or leave as UNSET
157
- if command:
158
- request_kwargs["command"] = command
191
+ # Map resources dict to individual fields
192
+ # resources dict can contain: cpu, cpus_per_task, memory, nodes, gres, time, partition, etc.
159
193
 
160
194
  # team_id is Union[None, Unset, int]
161
195
  if team_id is not None:
@@ -165,19 +199,115 @@ class Task:
165
199
  if resources:
166
200
  if "cpu" in resources or "cpus_per_task" in resources:
167
201
  request_kwargs["cpus_per_task"] = resources.get("cpus_per_task") or resources.get("cpu")
202
+ else:
203
+ request_kwargs["cpus_per_task"] = 1
168
204
  if "memory" in resources:
169
205
  request_kwargs["memory"] = resources.get("memory")
206
+ else:
207
+ request_kwargs["memory"] = "1G"
170
208
  if "nodes" in resources:
171
209
  request_kwargs["nodes"] = resources.get("nodes")
210
+ else:
211
+ request_kwargs["nodes"] = 1
172
212
  if "gres" in resources:
173
213
  request_kwargs["gres"] = resources.get("gres")
174
214
  if "time" in resources:
175
215
  request_kwargs["time"] = resources.get("time")
216
+ else:
217
+ request_kwargs["time"] = "01:00:00"
176
218
  if "partition" in resources:
177
219
  request_kwargs["partition"] = resources.get("partition")
178
- if "tres" in resources:
179
- request_kwargs["tres"] = resources.get("tres")
180
-
220
+ else:
221
+ request_kwargs["partition"] = "debug"
222
+ if "qos" in resources:
223
+ request_kwargs["qos"] = resources.get("qos")
224
+ else:
225
+ request_kwargs["qos"] = "qos_xcloud"
226
+ if "ntasks" in resources:
227
+ request_kwargs["ntasks"] = resources.get("ntasks")
228
+ else:
229
+ request_kwargs["ntasks"] = 1
230
+
231
+ if file_path:
232
+ local_path = _validate_archive_file_path(file_path)
233
+ timeout = self._config.get_request_timeout()
234
+
235
+ # 1) Get presigned upload URL
236
+ presign_upload_obj = get_storage_presign_upload.sync_detailed(
237
+ client=self._client,
238
+ filename=local_path.name,
239
+ )
240
+ presign_upload = presign_upload_obj.parsed
241
+ if isinstance(presign_upload, ErrorResponse):
242
+ status_code = (
243
+ presign_upload.code
244
+ if presign_upload.code != UNSET and presign_upload.code != 0
245
+ else presign_upload_obj.status_code.value
246
+ )
247
+ exception = handle_api_exception(
248
+ Response(
249
+ status_code=HTTPStatus(status_code),
250
+ content=presign_upload_obj.content,
251
+ headers=presign_upload_obj.headers,
252
+ parsed=None,
253
+ )
254
+ )
255
+ raise exception
256
+
257
+ if (
258
+ presign_upload is None
259
+ or presign_upload.url in (UNSET, None)
260
+ or presign_upload.key in (UNSET, None)
261
+ ):
262
+ raise APIException("Failed to get presigned upload url: empty response")
263
+
264
+ # 2) Upload file to S3 (presigned URL)
265
+ _upload_file_to_presigned_url(
266
+ url=str(presign_upload.url),
267
+ file_path=local_path,
268
+ timeout=timeout,
269
+ )
270
+
271
+ # 3) Get presigned download URL
272
+ presign_download_obj = get_storage_presign_download.sync_detailed(
273
+ client=self._client,
274
+ key=str(presign_upload.key),
275
+ )
276
+ presign_download = presign_download_obj.parsed
277
+ if isinstance(presign_download, ErrorResponse):
278
+ status_code = (
279
+ presign_download.code
280
+ if presign_download.code != UNSET and presign_download.code != 0
281
+ else presign_download_obj.status_code.value
282
+ )
283
+ exception = handle_api_exception(
284
+ Response(
285
+ status_code=HTTPStatus(status_code),
286
+ content=presign_download_obj.content,
287
+ headers=presign_download_obj.headers,
288
+ parsed=None,
289
+ )
290
+ )
291
+ raise exception
292
+
293
+ if presign_download is None or presign_download.url in (UNSET, None):
294
+ raise APIException(
295
+ "Failed to get presigned download url: empty response"
296
+ )
297
+
298
+ # 4) Set env var (merge if user already provided environment)
299
+ env: dict[str, str] = {}
300
+ existing_env = request_kwargs.get("environment")
301
+ if isinstance(existing_env, TaskSubmitRequestEnvironmentType0):
302
+ env.update(existing_env.additional_properties)
303
+ elif isinstance(existing_env, dict):
304
+ env.update(existing_env)
305
+
306
+ env["SYSTEM_DOWNLOAD_ARCHIVE_URL"] = str(presign_download.url)
307
+ request_kwargs["environment"] = TaskSubmitRequestEnvironmentType0.from_dict(
308
+ env
309
+ )
310
+
181
311
  request = TaskSubmitRequest(**request_kwargs)
182
312
 
183
313
  # Use sync_detailed to get full response information
@@ -230,14 +360,14 @@ class Task:
230
360
  def get(
231
361
  self,
232
362
  task_id: int,
233
- cluster_id: int,
363
+ cluster_name: str,
234
364
  ) -> TaskModel:
235
365
  """
236
366
  Get task details by task ID.
237
367
 
238
368
  Args:
239
369
  task_id: Task ID
240
- cluster_id: Cluster ID
370
+ cluster_name: Cluster name
241
371
 
242
372
  Returns:
243
373
  Task model with task details
@@ -250,7 +380,7 @@ class Task:
250
380
  response_obj = get_task.sync_detailed(
251
381
  id=task_id,
252
382
  client=self._client,
253
- cluster_id=cluster_id,
383
+ cluster_name=cluster_name,
254
384
  )
255
385
  response = response_obj.parsed
256
386
 
@@ -302,7 +432,7 @@ class Task:
302
432
  status: Optional[TaskStatus] = None,
303
433
  user_id: Optional[int] = None,
304
434
  team_id: Optional[int] = None,
305
- cluster_id: Optional[int] = None,
435
+ cluster_name: Optional[str] = None,
306
436
  ) -> TaskListResponse:
307
437
  """
308
438
  List tasks with optional filtering.
@@ -313,7 +443,7 @@ class Task:
313
443
  status: Filter by task status (optional)
314
444
  user_id: Filter by user ID (optional)
315
445
  team_id: Filter by team ID (optional)
316
- cluster_id: Filter by cluster ID (optional)
446
+ cluster_name: Filter by cluster name (optional)
317
447
 
318
448
  Returns:
319
449
  TaskListResponse containing the list of tasks
@@ -329,7 +459,7 @@ class Task:
329
459
  status=status if status is not None else UNSET,
330
460
  user_id=user_id if user_id is not None else UNSET,
331
461
  team_id=team_id if team_id is not None else UNSET,
332
- cluster_id=cluster_id if cluster_id is not None else UNSET,
462
+ cluster_name=cluster_name if cluster_name is not None else UNSET,
333
463
  )
334
464
  response = response_obj.parsed
335
465
 
@@ -377,14 +507,14 @@ class Task:
377
507
  def cancel(
378
508
  self,
379
509
  task_id: int,
380
- cluster_id: int,
510
+ cluster_name: str,
381
511
  ) -> bool:
382
512
  """
383
513
  Cancel a task.
384
514
 
385
515
  Args:
386
516
  task_id: Task ID to cancel
387
- cluster_id: Cluster ID where the task is running
517
+ cluster_name: Cluster name where the task is running
388
518
 
389
519
  Returns:
390
520
  True if the task was cancelled successfully
@@ -397,7 +527,7 @@ class Task:
397
527
  response_obj = cancel_task.sync_detailed(
398
528
  id=task_id,
399
529
  client=self._client,
400
- cluster_id=cluster_id,
530
+ cluster_name=cluster_name,
401
531
  )
402
532
  response = response_obj.parsed
403
533
 
@@ -434,14 +564,14 @@ class Task:
434
564
  def delete(
435
565
  self,
436
566
  task_id: int,
437
- cluster_id: int,
567
+ cluster_name: str,
438
568
  ) -> bool:
439
569
  """
440
570
  Delete a task.
441
571
 
442
572
  Args:
443
573
  task_id: Task ID to delete
444
- cluster_id: Cluster ID where the task is running
574
+ cluster_name: Cluster name where the task is running
445
575
 
446
576
  Returns:
447
577
  True if the task was deleted successfully
@@ -454,7 +584,7 @@ class Task:
454
584
  response_obj = delete_task.sync_detailed(
455
585
  id=task_id,
456
586
  client=self._client,
457
- cluster_id=cluster_id,
587
+ cluster_name=cluster_name,
458
588
  )
459
589
  response = response_obj.parsed
460
590