gmicloud 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gmicloud/__init__.py +2 -2
- gmicloud/_internal/_client/_artifact_client.py +40 -7
- gmicloud/_internal/_client/_file_upload_client.py +10 -7
- gmicloud/_internal/_config.py +9 -3
- gmicloud/_internal/_enums.py +5 -0
- gmicloud/_internal/_manager/_artifact_manager.py +198 -17
- gmicloud/_internal/_manager/_task_manager.py +76 -2
- gmicloud/_internal/_manager/serve_command_utils.py +121 -0
- gmicloud/_internal/_models.py +154 -34
- gmicloud/client.py +179 -75
- gmicloud/tests/test_artifacts.py +6 -22
- gmicloud-0.1.7.dist-info/METADATA +237 -0
- gmicloud-0.1.7.dist-info/RECORD +28 -0
- {gmicloud-0.1.5.dist-info → gmicloud-0.1.7.dist-info}/WHEEL +1 -1
- gmicloud-0.1.5.dist-info/METADATA +0 -246
- gmicloud-0.1.5.dist-info/RECORD +0 -27
- {gmicloud-0.1.5.dist-info → gmicloud-0.1.7.dist-info}/top_level.txt +0 -0
gmicloud/__init__.py
CHANGED
@@ -15,7 +15,7 @@ from ._internal._models import (
|
|
15
15
|
OneOffScheduling,
|
16
16
|
DailyScheduling,
|
17
17
|
DailyTrigger,
|
18
|
-
|
18
|
+
Template,
|
19
19
|
)
|
20
20
|
from ._internal._enums import (
|
21
21
|
BuildStatus,
|
@@ -39,7 +39,7 @@ __all__ = [
|
|
39
39
|
"OneOffScheduling",
|
40
40
|
"DailyScheduling",
|
41
41
|
"DailyTrigger",
|
42
|
-
"
|
42
|
+
"Template",
|
43
43
|
"BuildStatus",
|
44
44
|
"TaskEndpointStatus",
|
45
45
|
]
|
@@ -1,7 +1,7 @@
|
|
1
1
|
from typing import List
|
2
2
|
import logging
|
3
3
|
from requests.exceptions import RequestException
|
4
|
-
|
4
|
+
import json
|
5
5
|
from ._http_client import HTTPClient
|
6
6
|
from ._iam_client import IAMClient
|
7
7
|
from ._decorator import handle_refresh_token
|
@@ -120,6 +120,39 @@ class ArtifactClient:
|
|
120
120
|
logger.error(f"Failed to rebuild artifact {artifact_id}: {e}")
|
121
121
|
return None
|
122
122
|
|
123
|
+
@handle_refresh_token
def add_env_parameters_to_artifact(self, artifact_id: str, env_parameters: dict[str, str]) -> None:
    """
    Adds environment parameters to an existing artifact.

    The current artifact is fetched, its metadata and parameters are copied into an
    update request together with the merged environment parameters, and the request
    is sent to ``/update_artifact``. Request and validation failures are logged and
    not raised.

    :param artifact_id: The ID of the artifact to update.
    :param env_parameters: Mapping of environment variable name to value. A key that
        already exists on the artifact replaces the existing entry instead of being
        appended a second time.
    """
    try:
        old_artifact = self.get_artifact(artifact_id)
        if not old_artifact:
            logger.error(f"Artifact {artifact_id} not found")
            return

        # Keep existing entries whose key is not being overridden, then append the
        # new ones, so the request never carries two entries for the same variable.
        existing_env_parameters = old_artifact.artifact_parameters.env_parameters or []
        merged_env_parameters = [
            param for param in existing_env_parameters if param.key not in env_parameters
        ]
        merged_env_parameters.extend(
            EnvParameter(key=k, value=v) for k, v in env_parameters.items()
        )

        request = UpdateArtifactRequestBody(
            artifact_description=old_artifact.artifact_metadata.artifact_description,
            artifact_name=old_artifact.artifact_metadata.artifact_name,
            artifact_tags=old_artifact.artifact_metadata.artifact_tags,
            env_parameters=merged_env_parameters,
            model_parameters=old_artifact.artifact_parameters.model_parameters
        )
        # The response body is not used; only failures (handled below) matter here.
        self.client.put(
            f"/update_artifact?artifact_id={artifact_id}",
            self.iam_client.get_custom_headers(),
            request.model_dump()
        )
    except (RequestException, ValueError) as e:
        logger.error(f"Failed to add env parameters to artifact {artifact_id}: {e}")
|
155
|
+
|
123
156
|
@handle_refresh_token
|
124
157
|
def delete_artifact(self, artifact_id: str) -> Optional[DeleteArtifactResponse]:
|
125
158
|
"""
|
@@ -140,7 +173,7 @@ class ArtifactClient:
|
|
140
173
|
return None
|
141
174
|
|
142
175
|
@handle_refresh_token
|
143
|
-
def get_bigfile_upload_url(self, request:
|
176
|
+
def get_bigfile_upload_url(self, request: ResumableUploadLinkRequest) -> Optional[ResumableUploadLinkResponse]:
|
144
177
|
"""
|
145
178
|
Generates a pre-signed URL for uploading a large file.
|
146
179
|
|
@@ -156,7 +189,7 @@ class ArtifactClient:
|
|
156
189
|
logger.error("Empty response from /get_bigfile_upload_url")
|
157
190
|
return None
|
158
191
|
|
159
|
-
return
|
192
|
+
return ResumableUploadLinkResponse.model_validate(response)
|
160
193
|
|
161
194
|
except (RequestException, ValueError) as e:
|
162
195
|
logger.error(f"Failed to generate upload URL: {e}")
|
@@ -186,12 +219,12 @@ class ArtifactClient:
|
|
186
219
|
return None
|
187
220
|
|
188
221
|
@handle_refresh_token
|
189
|
-
def get_public_templates(self) -> List[
|
222
|
+
def get_public_templates(self) -> List[Template]:
|
190
223
|
"""
|
191
224
|
Fetches all artifact templates.
|
192
225
|
|
193
|
-
:return: A list of
|
194
|
-
:rtype: List[
|
226
|
+
:return: A list of Template objects.
|
227
|
+
:rtype: List[Template]
|
195
228
|
"""
|
196
229
|
try:
|
197
230
|
response = self.client.get("/get_public_templates", self.iam_client.get_custom_headers())
|
@@ -201,7 +234,7 @@ class ArtifactClient:
|
|
201
234
|
return []
|
202
235
|
|
203
236
|
try:
|
204
|
-
result =
|
237
|
+
result = GetTemplatesResponse.model_validate(response)
|
205
238
|
return result.artifact_templates
|
206
239
|
except ValueError as ve:
|
207
240
|
logger.error(f"Failed to validate response data: {ve}")
|
@@ -1,8 +1,10 @@
|
|
1
1
|
import os
|
2
2
|
import requests
|
3
|
+
import logging
|
3
4
|
|
4
5
|
from .._exceptions import UploadFileError
|
5
6
|
|
7
|
+
# Module-scoped logger (not the root logger) so records carry this module's name
# and can be filtered or configured per module, like the other gmicloud modules.
logger = logging.getLogger(__name__)
|
6
8
|
|
7
9
|
class FileUploadClient:
|
8
10
|
CHUNK_SIZE = 10 * 1024 * 1024 # 10MB Default Chunk Size
|
@@ -45,13 +47,13 @@ class FileUploadClient:
|
|
45
47
|
"""
|
46
48
|
try:
|
47
49
|
file_size = os.path.getsize(file_path)
|
48
|
-
|
50
|
+
logger.info(f"File {file_path} size: {file_size} bytes")
|
49
51
|
|
50
52
|
start_byte = 0
|
51
53
|
uploaded_range = FileUploadClient._check_file_status(upload_url, file_size)
|
52
54
|
if uploaded_range:
|
53
55
|
start_byte = int(uploaded_range.split("-")[1]) + 1
|
54
|
-
|
56
|
+
logger.info(f"Resuming uploading {file_path} from {start_byte} bytes")
|
55
57
|
|
56
58
|
with open(file_path, "rb") as file:
|
57
59
|
while start_byte < file_size:
|
@@ -74,14 +76,15 @@ class FileUploadClient:
|
|
74
76
|
# Ensure upload is successful for this chunk
|
75
77
|
if resp.status_code not in (200, 201, 308):
|
76
78
|
raise UploadFileError(
|
77
|
-
f"Failed to upload file, code:{resp.status_code} ,message: {resp.text}")
|
79
|
+
f"Failed to upload file {file_path}, code:{resp.status_code} ,message: {resp.text}")
|
78
80
|
|
79
81
|
start_byte = end_byte + 1
|
80
|
-
|
82
|
+
percentage = (start_byte / file_size) * 100
|
83
|
+
logger.info(f"File {file_path} uploaded {end_byte + 1:,}/{file_size:,} bytes ({percentage:.2f}%)")
|
81
84
|
|
82
|
-
|
85
|
+
logger.info(f"File {file_path} uploaded successfully.")
|
83
86
|
except Exception as e:
|
84
|
-
raise UploadFileError(f"Failed to upload file: {str(e)}")
|
87
|
+
raise UploadFileError(f"Failed to upload file {file_path}, got error: {str(e)}")
|
85
88
|
|
86
89
|
@staticmethod
|
87
90
|
def _check_file_status(upload_url: str, file_size: int) -> str:
|
@@ -104,7 +107,7 @@ class FileUploadClient:
|
|
104
107
|
if resp.status_code == 308:
|
105
108
|
range_header = resp.headers.get("Range")
|
106
109
|
if range_header:
|
107
|
-
|
110
|
+
logger.info(f"Server reports partial upload range: {range_header}")
|
108
111
|
return range_header
|
109
112
|
|
110
113
|
if resp.status_code in (200, 201):
|
gmicloud/_internal/_config.py
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
# Dev environment
|
2
|
+
# ARTIFACT_SERVICE_BASE_URL = "https://ce-tot.gmicloud-dev.com/api/v1/ie/artifact"
|
3
|
+
# TASK_SERVICE_BASE_URL = "https://ce-tot.gmicloud-dev.com/api/v1/ie/task"
|
4
|
+
# IAM_SERVICE_BASE_URL = "https://ce-tot.gmicloud-dev.com/api/v1"
|
5
|
+
|
6
|
+
# Prod environment
|
7
|
+
ARTIFACT_SERVICE_BASE_URL = "https://inference-engine.gmicloud.ai/api/v1/ie/artifact"
|
8
|
+
TASK_SERVICE_BASE_URL = "https://inference-engine.gmicloud.ai/api/v1/ie/task"
|
9
|
+
IAM_SERVICE_BASE_URL = "https://inference-engine.gmicloud.ai/api/v1"
|
gmicloud/_internal/_enums.py
CHANGED
@@ -1,12 +1,21 @@
|
|
1
1
|
import os
|
2
|
+
import time
|
2
3
|
from typing import List
|
3
4
|
import mimetypes
|
5
|
+
import concurrent.futures
|
6
|
+
import re
|
7
|
+
from tqdm import tqdm
|
8
|
+
from tqdm.contrib.logging import logging_redirect_tqdm
|
4
9
|
|
5
10
|
from .._client._iam_client import IAMClient
|
6
11
|
from .._client._artifact_client import ArtifactClient
|
7
12
|
from .._client._file_upload_client import FileUploadClient
|
8
13
|
from .._models import *
|
14
|
+
from .._manager.serve_command_utils import parse_server_command, extract_gpu_num_from_serve_command
|
9
15
|
|
16
|
+
import logging
|
17
|
+
|
18
|
+
logger = logging.getLogger(__name__)
|
10
19
|
|
11
20
|
class ArtifactManager:
|
12
21
|
"""
|
@@ -49,7 +58,12 @@ class ArtifactManager:
|
|
49
58
|
self,
|
50
59
|
artifact_name: str,
|
51
60
|
description: Optional[str] = "",
|
52
|
-
tags: Optional[List[str]] = None
|
61
|
+
tags: Optional[List[str]] = None,
|
62
|
+
deployment_type: Optional[str] = "",
|
63
|
+
template_id: Optional[str] = "",
|
64
|
+
env_parameters: Optional[List["EnvParameter"]] = None,
|
65
|
+
model_description: Optional[str] = "",
|
66
|
+
model_parameters: Optional[List["ModelParameter"]] = None,
|
53
67
|
) -> CreateArtifactResponse:
|
54
68
|
"""
|
55
69
|
Create a new artifact for a user.
|
@@ -65,11 +79,16 @@ class ArtifactManager:
|
|
65
79
|
|
66
80
|
req = CreateArtifactRequest(artifact_name=artifact_name,
|
67
81
|
artifact_description=description,
|
68
|
-
artifact_tags=tags,
|
82
|
+
artifact_tags=tags,
|
83
|
+
deployment_type=deployment_type,
|
84
|
+
template_id=template_id,
|
85
|
+
env_parameters=env_parameters,
|
86
|
+
model_description=model_description,
|
87
|
+
model_parameters=model_parameters)
|
69
88
|
|
70
89
|
return self.artifact_client.create_artifact(req)
|
71
90
|
|
72
|
-
def create_artifact_from_template(self, artifact_template_id: str) -> str:
|
91
|
+
def create_artifact_from_template(self, artifact_template_id: str, env_parameters: Optional[dict[str, str]] = None) -> str:
|
73
92
|
"""
|
74
93
|
Create a new artifact for a user using a template.
|
75
94
|
|
@@ -81,12 +100,102 @@ class ArtifactManager:
|
|
81
100
|
if not artifact_template_id or not artifact_template_id.strip():
|
82
101
|
raise ValueError("Artifact template ID is required and cannot be empty.")
|
83
102
|
|
103
|
+
|
84
104
|
resp = self.artifact_client.create_artifact_from_template(artifact_template_id)
|
85
105
|
if not resp or not resp.artifact_id:
|
86
106
|
raise ValueError("Failed to create artifact from template.")
|
87
107
|
|
108
|
+
if env_parameters:
|
109
|
+
self.artifact_client.add_env_parameters_to_artifact(resp.artifact_id, env_parameters)
|
110
|
+
|
88
111
|
return resp.artifact_id
|
89
112
|
|
113
|
+
|
114
|
+
def create_artifact_from_template_name(self, artifact_template_name: str) -> tuple[str, ReplicaResource]:
    """
    Create an artifact from the public template with the given name and wait until it is ready.

    :param artifact_template_name: The name of the template to use.
    :return: A tuple containing the artifact ID and the recommended replica resources
        built from the matched template's resource settings.
    :rtype: tuple[str, ReplicaResource]
    :raises ValueError: If no public template with the given name (and a template ID) is found.
    """
    try:
        templates = self.get_public_templates()
    except Exception as e:
        logger.error(f"Failed to get artifact templates, Error: {e}")
        # Re-raise: continuing would iterate over an unbound `templates` below.
        raise

    recommended_replica_resources = None
    template_id = None
    for template in templates:
        if template.template_data and template.template_data.name == artifact_template_name:
            resources_template = template.template_data.resources
            recommended_replica_resources = ReplicaResource(
                cpu=resources_template.cpu,
                ram_gb=resources_template.memory,
                gpu=resources_template.gpu,
                gpu_name=resources_template.gpu_name,
            )
            template_id = template.template_id
            break
    if not template_id:
        raise ValueError(f"Template with name {artifact_template_name} not found.")

    try:
        artifact_id = self.create_artifact_from_template(template_id)
        self.wait_for_artifact_ready(artifact_id)
        return artifact_id, recommended_replica_resources
    except Exception as e:
        logger.error(f"Failed to create artifact from template, Error: {e}")
        raise
|
148
|
+
|
149
|
+
def create_artifact_for_serve_command_and_custom_model(self, template_name: str, artifact_name: str, serve_command: str, gpu_type: str, artifact_description: str = "") -> tuple[str, ReplicaResource]:
    """
    Create an artifact from a public template for a custom model launched by a serve command.

    :param template_name: The name of the public template to use.
    :param artifact_name: The name of the artifact to create.
    :param serve_command: The serve command; it is parsed for the inference backend and
        GPU count, and stored on the artifact as the ``SERVE_COMMAND`` env parameter.
    :param gpu_type: The GPU type; only "H100" and "H200" are accepted.
    :param artifact_description: An optional description for the artifact.
    :return: A tuple containing the artifact ID and the recommended replica resources.
    :rtype: tuple[str, ReplicaResource]
    :raises ValueError: If the template is not found, the GPU type is unsupported, or the
        serve command cannot be parsed or does not match the template.
    """
    try:
        templates = self.get_public_templates()
    except Exception as e:
        logger.error(f"Failed to get artifact templates, Error: {e}")
        # Re-raise: continuing would iterate over an unbound `templates` below.
        raise

    picked_template = None
    for template in templates:
        if template.template_data and template.template_data.name == template_name:
            picked_template = template
            break
    if not picked_template:
        raise ValueError(f"Template with name {template_name} not found.")

    # Validated outside the parse block so the error is not reported as a parse failure.
    if gpu_type not in ("H100", "H200"):
        raise ValueError("Only support H100 and H200 for now")

    try:
        # The parsed env vars are not used here; the full command is forwarded
        # to the artifact unchanged through the SERVE_COMMAND env parameter.
        backend, _, serve_args_dict = parse_server_command(serve_command)
        if backend.lower() not in template_name.lower():
            raise ValueError(f"Template {template_name} does not support inference with {backend}.")
        num_gpus = extract_gpu_num_from_serve_command(serve_args_dict)
        # Per-GPU sizing: 16 CPUs and 100 GB of RAM for each requested GPU.
        recommended_replica_resources = ReplicaResource(
            cpu=num_gpus * 16,
            ram_gb=num_gpus * 100,
            gpu=num_gpus,
            gpu_name=gpu_type,
        )
    except Exception as e:
        raise ValueError(f"Failed to parse serve command, Error: {e}") from e

    try:
        env_vars = [
            EnvParameter(key="SERVE_COMMAND", value=serve_command),
            EnvParameter(key="GPU_TYPE", value=gpu_type),
        ]
        resp = self.create_artifact(artifact_name, artifact_description, deployment_type="template", template_id=picked_template.template_id, env_parameters=env_vars)
        # Assume Artifact is already with BuildStatus.SUCCESS status
        return resp.artifact_id, recommended_replica_resources
    except Exception as e:
        logger.error(f"Failed to create artifact from template, Error: {e}")
        raise
|
198
|
+
|
90
199
|
def rebuild_artifact(self, artifact_id: str) -> RebuildArtifactResponse:
|
91
200
|
"""
|
92
201
|
Rebuild an existing artifact.
|
@@ -172,7 +281,7 @@ class ArtifactManager:
|
|
172
281
|
model_file_name = os.path.basename(model_file_path)
|
173
282
|
model_file_type = mimetypes.guess_type(model_file_path)[0]
|
174
283
|
|
175
|
-
req =
|
284
|
+
req = ResumableUploadLinkRequest(artifact_id=artifact_id, file_name=model_file_name, file_type=model_file_type)
|
176
285
|
|
177
286
|
resp = self.artifact_client.get_bigfile_upload_url(req)
|
178
287
|
if not resp or not resp.upload_link:
|
@@ -211,44 +320,116 @@ class ArtifactManager:
|
|
211
320
|
|
212
321
|
FileUploadClient.upload_large_file(upload_link, file_path)
|
213
322
|
|
323
|
+
|
324
|
+
def upload_model_files_to_artifact(self, artifact_id: str, model_directory: str) -> None:
    """
    Upload every file under a model directory to an existing artifact.

    Files are discovered recursively and uploaded in parallel through a thread pool,
    with a progress bar counting finished files. A failure for one file is logged
    and does not stop the remaining uploads.

    :param artifact_id: The ID of the artifact to upload the model files to.
    :param model_directory: The path to the model directory.
    """

    # List all files in the model directory recursively
    model_file_paths = []
    for root, _, files in os.walk(model_directory):
        for file in files:
            model_file_paths.append(os.path.join(root, file))

    def upload_file(model_file_path):
        self._validate_file_path(model_file_path)
        # NOTE(review): only the base name is sent, so files with the same name in
        # different subdirectories request the same file_name — confirm server behaviour.
        bigfile_upload_url_resp = self.artifact_client.get_bigfile_upload_url(
            ResumableUploadLinkRequest(artifact_id=artifact_id, file_name=os.path.basename(model_file_path))
        )
        # The client call is typed Optional; fail with a clear message instead of an
        # AttributeError on a missing response.
        if not bigfile_upload_url_resp or not bigfile_upload_url_resp.upload_link:
            raise ValueError(f"Failed to get upload URL for file {model_file_path}")
        FileUploadClient.upload_large_file(bigfile_upload_url_resp.upload_link, model_file_path)

    # Upload files in parallel with progress bar
    with tqdm(total=len(model_file_paths), desc="Uploading model files") as progress_bar:
        with logging_redirect_tqdm():
            with concurrent.futures.ThreadPoolExecutor() as executor:
                futures = {executor.submit(upload_file, path): path for path in model_file_paths}
                for future in concurrent.futures.as_completed(futures):
                    try:
                        future.result()
                    except Exception as e:
                        logger.error(f"Failed to upload file {futures[future]}, Error: {e}")
                    # Count the file as processed whether it succeeded or failed.
                    progress_bar.update(1)
|
356
|
+
|
214
357
|
def create_artifact_with_model_files(
|
215
358
|
self,
|
216
359
|
artifact_name: str,
|
217
360
|
artifact_file_path: str,
|
218
|
-
|
361
|
+
model_directory: str,
|
219
362
|
description: Optional[str] = "",
|
220
363
|
tags: Optional[str] = None
|
221
364
|
) -> str:
|
222
365
|
"""
|
223
366
|
Create a new artifact for a user and upload model files associated with the artifact.
|
224
|
-
|
225
367
|
:param artifact_name: The name of the artifact.
|
226
368
|
:param artifact_file_path: The path to the artifact file(Dockerfile+serve.py).
|
227
|
-
:param
|
369
|
+
:param model_directory: The path to the model directory.
|
228
370
|
:param description: An optional description for the artifact.
|
229
371
|
:param tags: Optional tags associated with the artifact, as a comma-separated string.
|
230
372
|
:return: The `artifact_id` of the created artifact.
|
231
|
-
:raises FileNotFoundError: If the provided `file_path` does not exist.
|
232
373
|
"""
|
233
374
|
artifact_id = self.create_artifact_with_file(artifact_name, artifact_file_path, description, tags)
|
375
|
+
logger.info(f"Artifact created: {artifact_id}")
|
234
376
|
|
235
|
-
|
236
|
-
self._validate_file_path(model_file_path)
|
237
|
-
bigfile_upload_url_resp = self.artifact_client.get_bigfile_upload_url(
|
238
|
-
GetBigFileUploadUrlRequest(artifact_id=artifact_id, model_file_path=model_file_path)
|
239
|
-
)
|
240
|
-
FileUploadClient.upload_large_file(bigfile_upload_url_resp.upload_link, model_file_path)
|
377
|
+
self.upload_model_files_to_artifact(artifact_id, model_directory)
|
241
378
|
|
242
379
|
return artifact_id
|
243
380
|
|
244
|
-
|
381
|
+
|
382
|
+
def wait_for_artifact_ready(self, artifact_id: str, timeout_s: int = 900) -> None:
    """
    Poll an artifact every 10 seconds until its build succeeds.

    :param artifact_id: The ID of the artifact to wait for.
    :param timeout_s: The timeout in seconds.
    :return: None
    :raises Exception: If the build ends as FAILED, TIMEOUT or CANCELLED, or if the
        artifact is not ready within ``timeout_s`` seconds.
    """
    failed_statuses = (BuildStatus.FAILED, BuildStatus.TIMEOUT, BuildStatus.CANCELLED)
    # Monotonic clock: elapsed time is unaffected by system clock adjustments.
    start_time = time.monotonic()
    while True:
        build_status = None
        try:
            build_status = self.get_artifact(artifact_id).build_status
        except Exception as e:
            # A failed lookup is logged and polling continues until the timeout.
            logger.error(f"Failed to get artifact, Error: {e}")

        if build_status == BuildStatus.SUCCESS:
            return
        # Raised outside the try block so a terminal build failure aborts the wait
        # immediately instead of being swallowed by the lookup error handler.
        if build_status in failed_statuses:
            raise Exception(f"Artifact build failed, status: {build_status}")

        if time.monotonic() - start_time > timeout_s:
            raise Exception(f"Artifact build takes more than {timeout_s // 60} minutes. Testing aborted.")
        time.sleep(10)
|
403
|
+
|
404
|
+
|
405
|
+
def get_public_templates(self) -> List[Template]:
|
245
406
|
"""
|
246
407
|
Fetch all artifact templates.
|
247
408
|
|
248
|
-
:return: A list of
|
249
|
-
:rtype: List[
|
409
|
+
:return: A list of Template objects.
|
410
|
+
:rtype: List[Template]
|
250
411
|
"""
|
251
412
|
return self.artifact_client.get_public_templates()
|
413
|
+
|
414
|
+
|
415
|
+
def list_public_template_names(self) -> list[str]:
    """
    Collect the names of all public templates.

    Templates without template data or with an empty name are left out. If the
    templates cannot be fetched or read, the error is logged and an empty list
    is returned.

    :return: A list of template names.
    :rtype: list[str]
    """
    try:
        return [
            entry.template_data.name
            for entry in self.get_public_templates()
            if entry.template_data and entry.template_data.name
        ]
    except Exception as fetch_error:
        logger.error(f"Failed to get artifact templates, Error: {fetch_error}")
        return []
|
432
|
+
|
252
433
|
|
253
434
|
@staticmethod
|
254
435
|
def _validate_file_name(file_name: str) -> None:
|
@@ -4,6 +4,10 @@ from .._client._iam_client import IAMClient
|
|
4
4
|
from .._client._task_client import TaskClient
|
5
5
|
from .._models import *
|
6
6
|
|
7
|
+
import time
|
8
|
+
import logging
|
9
|
+
|
10
|
+
logger = logging.getLogger(__name__)
|
7
11
|
|
8
12
|
class TaskManager:
|
9
13
|
"""
|
@@ -37,7 +41,7 @@ class TaskManager:
|
|
37
41
|
|
38
42
|
:return: A list of `Task` objects.
|
39
43
|
"""
|
40
|
-
resp = self.task_client.get_all_tasks(
|
44
|
+
resp = self.task_client.get_all_tasks()
|
41
45
|
if not resp or not resp.tasks:
|
42
46
|
return []
|
43
47
|
|
@@ -59,6 +63,7 @@ class TaskManager:
|
|
59
63
|
if not resp or not resp.task:
|
60
64
|
raise ValueError("Failed to create task.")
|
61
65
|
|
66
|
+
logger.info(f"Task created: {resp.task.task_id}")
|
62
67
|
return resp.task
|
63
68
|
|
64
69
|
def create_task_from_file(self, artifact_id: str, config_file_path: str, trigger_timestamp: int = None) -> Task:
|
@@ -132,6 +137,56 @@ class TaskManager:
|
|
132
137
|
self._validate_not_empty(task_id, "Task ID")
|
133
138
|
|
134
139
|
return self.task_client.start_task(task_id)
|
140
|
+
|
141
|
+
|
142
|
+
def wait_for_task(self, task_id: str, timeout_s: int = 900) -> Task:
    """
    Poll a task every 10 seconds until it is running with a running endpoint.

    The task is considered ready when its status is RUNNING and either its
    ``endpoint_info`` or one of its ``cluster_endpoints`` reports RUNNING.

    :param task_id: The ID of the task to wait for.
    :param timeout_s: The timeout in seconds.
    :return: The task object.
    :rtype: Task
    :raises Exception: If the task reaches NEEDSTOP or ARCHIVED, or is not ready
        within ``timeout_s`` seconds.
    """
    # Monotonic clock: elapsed time is unaffected by system clock adjustments.
    start_time = time.monotonic()
    while True:
        terminal_status = None
        try:
            task = self.get_task(task_id)
            if task.task_status == TaskStatus.RUNNING:
                endpoint_info = task.endpoint_info
                if endpoint_info is not None and endpoint_info.endpoint_status == TaskEndpointStatus.RUNNING:
                    return task
                if any(ce.endpoint_status == TaskEndpointStatus.RUNNING for ce in task.cluster_endpoints or []):
                    return task
            if task.task_status in (TaskStatus.NEEDSTOP, TaskStatus.ARCHIVED):
                terminal_status = task.task_status
            else:
                logger.info(f"Pending task starting. Task status: {task.task_status}")
        except Exception as e:
            # A failed lookup is logged and polling continues until the timeout.
            logger.error(f"Failed to get task, Error: {e}")

        # Raised outside the try block so an unexpected terminal status aborts the
        # wait immediately instead of being swallowed by the lookup error handler.
        if terminal_status is not None:
            raise Exception(f"Unexpected task status after starting: {terminal_status}")

        if time.monotonic() - start_time > timeout_s:
            raise Exception(f"Task creation takes more than {timeout_s // 60} minutes. Testing aborted.")
        time.sleep(10)
|
172
|
+
|
173
|
+
def start_task_and_wait(self, task_id: str, timeout_s: int = 3600) -> Task:
    """
    Issue a start request for a task, then block until the task is ready.

    A failure of the start request is logged and re-raised; the readiness wait
    is delegated to ``wait_for_task``.

    :param task_id: The ID of the task to start.
    :param timeout_s: The timeout in seconds, passed on to ``wait_for_task``.
    :return: The task object.
    :rtype: Task
    """
    try:
        self.start_task(task_id)
        logger.info(f"Started task ID: {task_id}")
    except Exception as start_error:
        logger.error(f"Failed to start task, Error: {start_error}")
        raise start_error

    ready_task = self.wait_for_task(task_id, timeout_s)
    return ready_task
|
135
190
|
|
136
191
|
def stop_task(self, task_id: str) -> bool:
|
137
192
|
"""
|
@@ -142,9 +197,28 @@ class TaskManager:
|
|
142
197
|
:raises ValueError: If `task_id` is invalid (None or empty string).
|
143
198
|
"""
|
144
199
|
self._validate_not_empty(task_id, "Task ID")
|
145
|
-
|
146
200
|
return self.task_client.stop_task(task_id)
|
147
201
|
|
202
|
+
|
203
|
+
def stop_task_and_wait(self, task_id: str, timeout_s: int = 3600):
    """
    Request a task stop, then poll every 10 seconds until the task is IDLE.

    A failure of the stop request itself is only logged; polling still takes place.

    :param task_id: The ID of the task to stop.
    :param timeout_s: The timeout in seconds.
    :raises Exception: If the task is not IDLE within ``timeout_s`` seconds.
    """
    try:
        self.stop_task(task_id)
        logger.info(f"Stopping task ID: {task_id}")
    except Exception as stop_error:
        logger.error(f"Failed to stop task, Error: {stop_error}")

    began_at = time.time()
    while True:
        try:
            current = self.get_task(task_id)
            if current.task_status == TaskStatus.IDLE:
                return
        except Exception as poll_error:
            logger.error(f"Failed to get task, Error: {poll_error}")

        elapsed = time.time() - began_at
        if elapsed > timeout_s:
            raise Exception(f"Task stopping takes more than {timeout_s // 60} minutes. Testing aborted.")
        time.sleep(10)
|
220
|
+
|
221
|
+
|
148
222
|
def get_usage_data(self, start_timestamp: str, end_timestamp: str) -> GetUsageDataResponse:
|
149
223
|
"""
|
150
224
|
Retrieve the usage data of a task within a given time range.
|