viettelcloud-aiplatform 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- viettelcloud/__init__.py +1 -0
- viettelcloud/aiplatform/__init__.py +15 -0
- viettelcloud/aiplatform/common/__init__.py +0 -0
- viettelcloud/aiplatform/common/constants.py +22 -0
- viettelcloud/aiplatform/common/types.py +28 -0
- viettelcloud/aiplatform/common/utils.py +40 -0
- viettelcloud/aiplatform/hub/OWNERS +14 -0
- viettelcloud/aiplatform/hub/__init__.py +25 -0
- viettelcloud/aiplatform/hub/api/__init__.py +13 -0
- viettelcloud/aiplatform/hub/api/_proxy_client.py +355 -0
- viettelcloud/aiplatform/hub/api/model_registry_client.py +561 -0
- viettelcloud/aiplatform/hub/api/model_registry_client_test.py +462 -0
- viettelcloud/aiplatform/optimizer/__init__.py +45 -0
- viettelcloud/aiplatform/optimizer/api/__init__.py +0 -0
- viettelcloud/aiplatform/optimizer/api/optimizer_client.py +248 -0
- viettelcloud/aiplatform/optimizer/backends/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/backends/base.py +77 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/backend.py +563 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/utils.py +112 -0
- viettelcloud/aiplatform/optimizer/constants/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/constants/constants.py +59 -0
- viettelcloud/aiplatform/optimizer/types/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/types/algorithm_types.py +87 -0
- viettelcloud/aiplatform/optimizer/types/optimization_types.py +135 -0
- viettelcloud/aiplatform/optimizer/types/search_types.py +95 -0
- viettelcloud/aiplatform/py.typed +0 -0
- viettelcloud/aiplatform/trainer/__init__.py +82 -0
- viettelcloud/aiplatform/trainer/api/__init__.py +3 -0
- viettelcloud/aiplatform/trainer/api/trainer_client.py +277 -0
- viettelcloud/aiplatform/trainer/api/trainer_client_test.py +72 -0
- viettelcloud/aiplatform/trainer/backends/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/base.py +94 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/base.py +195 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/docker.py +231 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/podman.py +258 -0
- viettelcloud/aiplatform/trainer/backends/container/backend.py +668 -0
- viettelcloud/aiplatform/trainer/backends/container/backend_test.py +867 -0
- viettelcloud/aiplatform/trainer/backends/container/runtime_loader.py +631 -0
- viettelcloud/aiplatform/trainer/backends/container/runtime_loader_test.py +637 -0
- viettelcloud/aiplatform/trainer/backends/container/types.py +67 -0
- viettelcloud/aiplatform/trainer/backends/container/utils.py +213 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/backend.py +710 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/backend_test.py +1344 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/constants.py +15 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/utils.py +636 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/utils_test.py +582 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/backend.py +306 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/backend_test.py +501 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/constants.py +90 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/job.py +184 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/types.py +52 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/utils.py +302 -0
- viettelcloud/aiplatform/trainer/constants/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/constants/constants.py +179 -0
- viettelcloud/aiplatform/trainer/options/__init__.py +52 -0
- viettelcloud/aiplatform/trainer/options/common.py +55 -0
- viettelcloud/aiplatform/trainer/options/kubernetes.py +502 -0
- viettelcloud/aiplatform/trainer/options/kubernetes_test.py +259 -0
- viettelcloud/aiplatform/trainer/options/localprocess.py +20 -0
- viettelcloud/aiplatform/trainer/test/common.py +22 -0
- viettelcloud/aiplatform/trainer/types/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/types/types.py +517 -0
- viettelcloud/aiplatform/trainer/types/types_test.py +115 -0
- viettelcloud_aiplatform-0.3.0.dist-info/METADATA +226 -0
- viettelcloud_aiplatform-0.3.0.dist-info/RECORD +71 -0
- viettelcloud_aiplatform-0.3.0.dist-info/WHEEL +4 -0
- viettelcloud_aiplatform-0.3.0.dist-info/licenses/LICENSE +201 -0
- viettelcloud_aiplatform-0.3.0.dist-info/licenses/NOTICE +36 -0
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
# Copyright 2025 The Kubeflow Authors.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
from collections.abc import Callable, Iterator
|
|
15
|
+
from datetime import datetime
|
|
16
|
+
import logging
|
|
17
|
+
import random
|
|
18
|
+
import string
|
|
19
|
+
import tempfile
|
|
20
|
+
import time
|
|
21
|
+
from typing import Optional, Union
|
|
22
|
+
import uuid
|
|
23
|
+
|
|
24
|
+
from viettelcloud.aiplatform.trainer.backends.base import RuntimeBackend
|
|
25
|
+
from viettelcloud.aiplatform.trainer.backends.localprocess import utils as local_utils
|
|
26
|
+
from viettelcloud.aiplatform.trainer.backends.localprocess.constants import local_runtimes
|
|
27
|
+
from viettelcloud.aiplatform.trainer.backends.localprocess.job import LocalJob
|
|
28
|
+
from viettelcloud.aiplatform.trainer.backends.localprocess.types import (
|
|
29
|
+
LocalBackendJobs,
|
|
30
|
+
LocalBackendStep,
|
|
31
|
+
LocalProcessBackendConfig,
|
|
32
|
+
)
|
|
33
|
+
from viettelcloud.aiplatform.trainer.constants import constants
|
|
34
|
+
from viettelcloud.aiplatform.trainer.types import types
|
|
35
|
+
|
|
36
|
+
# Module-level logger, named after this module so log records can be filtered per backend.
logger = logging.getLogger(__name__)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class LocalProcessBackend(RuntimeBackend):
    """Runtime backend that runs TrainJobs as local jobs on the current machine.

    Submitted jobs are tracked in an in-memory list owned by this backend
    instance; nothing in this class persists them, so a new instance starts
    with an empty registry.
    """

    def __init__(
        self,
        cfg: LocalProcessBackendConfig,
    ):
        """Create a backend with an empty job registry.

        Args:
            cfg: Backend configuration. ``cfg.cleanup_venv`` is forwarded to
                the training script builder in :meth:`train`.
        """
        # Registry of every TrainJob submitted through this instance.
        self.__local_jobs: list[LocalBackendJobs] = []
        self.cfg = cfg

    def list_runtimes(self) -> list[types.Runtime]:
        """Return every entry of ``local_runtimes`` converted to ``types.Runtime``."""
        return [self.__convert_local_runtime_to_runtime(local_runtime=rt) for rt in local_runtimes]

    def get_runtime(self, name: str) -> types.Runtime:
        """Return the local runtime called ``name``.

        Args:
            name: Runtime name to look up in ``local_runtimes``.

        Raises:
            ValueError: If no local runtime has that name.
        """
        runtime = next(
            (
                self.__convert_local_runtime_to_runtime(rt)
                for rt in local_runtimes
                if rt.name == name
            ),
            None,
        )
        if not runtime:
            raise ValueError(f"Runtime '{name}' not found.")

        return runtime

    def get_runtime_packages(self, runtime: types.Runtime):
        """Return the ``trainer.packages`` of the local runtime matching ``runtime.name``.

        Args:
            runtime: Runtime whose name selects the local runtime definition.

        Raises:
            ValueError: If no local runtime has that name.
        """
        local_runtime = next((rt for rt in local_runtimes if rt.name == runtime.name), None)
        if not local_runtime:
            raise ValueError(f"Runtime '{runtime.name}' not found.")

        return local_runtime.trainer.packages

    def train(
        self,
        runtime: Optional[Union[str, types.Runtime]] = None,
        initializer: Optional[types.Initializer] = None,
        trainer: Optional[
            Union[types.CustomTrainer, types.CustomTrainerContainer, types.BuiltinTrainer]
        ] = None,
        options: Optional[list] = None,
    ) -> str:
        """Create, register and start a local TrainJob.

        Args:
            runtime: Runtime object or runtime name (resolved via
                :meth:`get_runtime`). Required by this backend.
            initializer: Accepted for interface compatibility; not used by
                this method.
            trainer: Must be a ``types.CustomTrainer``; other trainer kinds
                are rejected.
            options: Optional callables invoked as
                ``option(job_spec, trainer, self)``. Only
                ``job_spec["metadata"]["name"]`` is read back, as the job name.

        Returns:
            The TrainJob name (taken from options, or generated).

        Raises:
            ValueError: If ``runtime`` is missing or unknown, or ``trainer``
                is not a ``types.CustomTrainer``.
        """
        if runtime is None:
            raise ValueError("Runtime must be provided for LocalProcessBackend")
        if isinstance(runtime, str):
            runtime = self.get_runtime(runtime)

        # Let each option populate a scratch job spec; only the name is consumed here.
        name = None
        if options:
            job_spec = {}
            for option in options:
                option(job_spec, trainer, self)

            name = job_spec.get("metadata", {}).get("name")

        # Generated names start with a letter followed by a truncated UUID.
        trainjob_name = name or (
            random.choice(string.ascii_lowercase)
            + uuid.uuid4().hex[: constants.JOB_NAME_UUID_LENGTH]
        )

        # The localprocess backend only supports CustomTrainer.
        if not isinstance(trainer, types.CustomTrainer):
            raise ValueError("CustomTrainer must be set with LocalProcessBackend")

        # Working directory for the job; also handed to the helpers as the venv location.
        venv_dir = tempfile.mkdtemp(prefix=trainjob_name)
        logger.debug("operating in %s", venv_dir)

        # NOTE: this replaces ``trainer`` on the Runtime object that was passed in
        # (or resolved above), so a caller-supplied Runtime is mutated.
        runtime.trainer = local_utils.get_local_runtime_trainer(
            runtime_name=runtime.name,
            venv_dir=venv_dir,
            framework=runtime.trainer.framework,
        )

        training_command = local_utils.get_local_train_job_script(
            trainer=trainer,
            runtime=runtime,
            train_job_name=trainjob_name,
            venv_dir=venv_dir,
            cleanup_venv=self.cfg.cleanup_venv,
        )
        runtime.trainer.set_command(training_command)

        train_job = LocalJob(
            name=f"{trainjob_name}-train",
            command=training_command,
            execution_dir=venv_dir,
            env=trainer.env,
            dependencies=[],
        )

        # Register before starting so the job is discoverable as soon as it runs.
        self.__register_job(
            train_job_name=trainjob_name,
            step_name="train",
            job=train_job,
            runtime=runtime,
        )
        train_job.start()

        return trainjob_name

    def list_jobs(self, runtime: Optional[types.Runtime] = None) -> list[types.TrainJob]:
        """List the TrainJobs registered on this backend instance.

        Args:
            runtime: When given, only jobs whose runtime has the same name
                are returned.

        Returns:
            One ``types.TrainJob`` per matching job, each carrying the job's
            own runtime and its aggregated status.
        """
        result = []

        for _job in self.__local_jobs:
            if runtime and _job.runtime.name != runtime.name:
                continue
            result.append(
                types.TrainJob(
                    name=_job.name,
                    creation_timestamp=_job.created,
                    # Report the runtime stored with the job, not the filter
                    # argument, which is None when listing without a filter.
                    runtime=_job.runtime,
                    num_nodes=1,
                    steps=[
                        types.Step(name=s.step_name, pod_name=s.step_name, status=s.job.status)
                        for s in _job.steps
                    ],
                    status=self.__get_job_status(_job),
                )
            )
        return result

    def get_job(self, name: str) -> types.TrainJob:
        """Return the registered TrainJob called ``name``.

        Args:
            name: TrainJob name as returned by :meth:`train`.

        Raises:
            ValueError: If no job with that name is registered.
        """
        _job = next((j for j in self.__local_jobs if j.name == name), None)
        if _job is None:
            raise ValueError(f"No TrainJob with name {name}")

        # Collapse the per-step statuses into one job-level status.
        status = self.__get_job_status(_job)

        return types.TrainJob(
            name=_job.name,
            creation_timestamp=_job.created,
            steps=[
                types.Step(
                    name=_step.step_name,
                    pod_name=_step.step_name,
                    status=_step.job.status,
                )
                for _step in _job.steps
            ],
            runtime=_job.runtime,
            num_nodes=1,
            status=status,
        )

    def get_job_logs(
        self,
        name: str,
        follow: bool = False,
        step: str = constants.NODE + "-0",
    ) -> Iterator[str]:
        """Yield log output of a TrainJob's steps.

        This is a generator, so the lookup (and any ``ValueError``) happens
        when iteration starts, not when the method is called.

        Args:
            name: TrainJob name.
            follow: Forwarded to each step job's ``logs(follow=...)``.
            step: Step name to read. The default value
                (``constants.NODE + "-0"``) selects every step of the job.

        Raises:
            ValueError: If no job with that name is registered.
        """
        _job = next((j for j in self.__local_jobs if j.name == name), None)
        if _job is None:
            raise ValueError(f"No TrainJob with name {name}")

        want_all_steps = step == constants.NODE + "-0"

        for _step in _job.steps:
            if not want_all_steps and _step.step_name != step:
                continue
            yield from _step.job.logs(follow=follow)

    def wait_for_job_status(
        self,
        name: str,
        status: set[str] = {constants.TRAINJOB_COMPLETE},
        timeout: int = 600,
        polling_interval: int = 2,
        callbacks: Optional[list[Callable[[types.TrainJob], None]]] = None,
    ) -> types.TrainJob:
        """Poll a TrainJob until it reaches one of the wanted statuses.

        Args:
            name: TrainJob name.
            status: Set of acceptable job statuses. The default set is shared
                between calls and is only read here.
            timeout: Total time budget in seconds.
            polling_interval: Seconds to sleep between polls.
            callbacks: Optional callables invoked with the current TrainJob on
                every poll, before the status check.

        Returns:
            The TrainJob as observed on the poll that matched.

        Raises:
            ValueError: If the job is unknown or ``polling_interval`` exceeds
                ``timeout``.
            TimeoutError: If no wanted status is seen within the poll budget.
        """
        _job = next((_job for _job in self.__local_jobs if _job.name == name), None)

        if _job is None:
            raise ValueError(f"No TrainJob with name {name}")

        if polling_interval > timeout:
            raise ValueError(
                f"Polling interval {polling_interval} must be less than timeout: {timeout}"
            )

        for _ in range(round(timeout / polling_interval)):
            trainjob = self.get_job(name)

            if callbacks:
                for callback in callbacks:
                    callback(trainjob)

            if trainjob.status in status:
                return trainjob

            time.sleep(polling_interval)

        raise TimeoutError(f"Timeout waiting for TrainJob {name} to reach status: {status}")

    def delete_job(self, name: str):
        """Cancel every step of a TrainJob and remove it from the registry.

        Args:
            name: TrainJob name.

        Raises:
            ValueError: If no job with that name is registered.
        """
        _job = next((j for j in self.__local_jobs if j.name == name), None)
        if _job is None:
            raise ValueError(f"No TrainJob with name {name}")

        # Cancel all nested step jobs before forgetting the TrainJob.
        for step in _job.steps:
            step.job.cancel()

        self.__local_jobs.remove(_job)

    def __get_job_status(self, job: LocalBackendJobs) -> str:
        """Aggregate the step statuses of ``job`` into one TrainJob status.

        Precedence: any failed step -> failed; else any running step ->
        running; else all steps complete -> complete; otherwise (no steps,
        steps still created, or unrecognised statuses) -> created.
        """
        statuses = [_step.job.status for _step in job.steps]

        if constants.TRAINJOB_FAILED in statuses:
            return constants.TRAINJOB_FAILED
        if constants.TRAINJOB_RUNNING in statuses:
            return constants.TRAINJOB_RUNNING
        # Without this branch a finished job could never report completion.
        # NOTE(review): assumes step jobs report constants.TRAINJOB_COMPLETE on
        # success - confirm against LocalJob.
        if statuses and all(s == constants.TRAINJOB_COMPLETE for s in statuses):
            return constants.TRAINJOB_COMPLETE
        return constants.TRAINJOB_CREATED

    def __register_job(
        self,
        train_job_name: str,
        step_name: str,
        job: LocalJob,
        runtime: types.Runtime,
    ):
        """Record ``job`` as step ``step_name`` of TrainJob ``train_job_name``.

        Creates the TrainJob registry entry on first use. If a step with the
        same name already exists, the new job is NOT added and a warning is
        logged.
        """
        _job = next((j for j in self.__local_jobs if j.name == train_job_name), None)
        if _job is None:
            _job = LocalBackendJobs(name=train_job_name, runtime=runtime, created=datetime.now())
            self.__local_jobs.append(_job)

        if any(s.step_name == step_name for s in _job.steps):
            logger.warning("Step '%s' already registered.", step_name)
            return

        _job.steps.append(LocalBackendStep(step_name=step_name, job=job))

    def __convert_local_runtime_to_runtime(self, local_runtime) -> types.Runtime:
        """Build a ``types.Runtime`` by copying the fields of a local runtime definition."""
        local_trainer = local_runtime.trainer
        return types.Runtime(
            name=local_runtime.name,
            trainer=types.RuntimeTrainer(
                trainer_type=local_trainer.trainer_type,
                framework=local_trainer.framework,
                num_nodes=local_trainer.num_nodes,
                device_count=local_trainer.device_count,
                device=local_trainer.device,
                image=local_trainer.image,
            ),
        )
|