viettelcloud-aiplatform 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. viettelcloud/__init__.py +1 -0
  2. viettelcloud/aiplatform/__init__.py +15 -0
  3. viettelcloud/aiplatform/common/__init__.py +0 -0
  4. viettelcloud/aiplatform/common/constants.py +22 -0
  5. viettelcloud/aiplatform/common/types.py +28 -0
  6. viettelcloud/aiplatform/common/utils.py +40 -0
  7. viettelcloud/aiplatform/hub/OWNERS +14 -0
  8. viettelcloud/aiplatform/hub/__init__.py +25 -0
  9. viettelcloud/aiplatform/hub/api/__init__.py +13 -0
  10. viettelcloud/aiplatform/hub/api/_proxy_client.py +355 -0
  11. viettelcloud/aiplatform/hub/api/model_registry_client.py +561 -0
  12. viettelcloud/aiplatform/hub/api/model_registry_client_test.py +462 -0
  13. viettelcloud/aiplatform/optimizer/__init__.py +45 -0
  14. viettelcloud/aiplatform/optimizer/api/__init__.py +0 -0
  15. viettelcloud/aiplatform/optimizer/api/optimizer_client.py +248 -0
  16. viettelcloud/aiplatform/optimizer/backends/__init__.py +13 -0
  17. viettelcloud/aiplatform/optimizer/backends/base.py +77 -0
  18. viettelcloud/aiplatform/optimizer/backends/kubernetes/__init__.py +13 -0
  19. viettelcloud/aiplatform/optimizer/backends/kubernetes/backend.py +563 -0
  20. viettelcloud/aiplatform/optimizer/backends/kubernetes/utils.py +112 -0
  21. viettelcloud/aiplatform/optimizer/constants/__init__.py +13 -0
  22. viettelcloud/aiplatform/optimizer/constants/constants.py +59 -0
  23. viettelcloud/aiplatform/optimizer/types/__init__.py +13 -0
  24. viettelcloud/aiplatform/optimizer/types/algorithm_types.py +87 -0
  25. viettelcloud/aiplatform/optimizer/types/optimization_types.py +135 -0
  26. viettelcloud/aiplatform/optimizer/types/search_types.py +95 -0
  27. viettelcloud/aiplatform/py.typed +0 -0
  28. viettelcloud/aiplatform/trainer/__init__.py +82 -0
  29. viettelcloud/aiplatform/trainer/api/__init__.py +3 -0
  30. viettelcloud/aiplatform/trainer/api/trainer_client.py +277 -0
  31. viettelcloud/aiplatform/trainer/api/trainer_client_test.py +72 -0
  32. viettelcloud/aiplatform/trainer/backends/__init__.py +0 -0
  33. viettelcloud/aiplatform/trainer/backends/base.py +94 -0
  34. viettelcloud/aiplatform/trainer/backends/container/adapters/base.py +195 -0
  35. viettelcloud/aiplatform/trainer/backends/container/adapters/docker.py +231 -0
  36. viettelcloud/aiplatform/trainer/backends/container/adapters/podman.py +258 -0
  37. viettelcloud/aiplatform/trainer/backends/container/backend.py +668 -0
  38. viettelcloud/aiplatform/trainer/backends/container/backend_test.py +867 -0
  39. viettelcloud/aiplatform/trainer/backends/container/runtime_loader.py +631 -0
  40. viettelcloud/aiplatform/trainer/backends/container/runtime_loader_test.py +637 -0
  41. viettelcloud/aiplatform/trainer/backends/container/types.py +67 -0
  42. viettelcloud/aiplatform/trainer/backends/container/utils.py +213 -0
  43. viettelcloud/aiplatform/trainer/backends/kubernetes/__init__.py +0 -0
  44. viettelcloud/aiplatform/trainer/backends/kubernetes/backend.py +710 -0
  45. viettelcloud/aiplatform/trainer/backends/kubernetes/backend_test.py +1344 -0
  46. viettelcloud/aiplatform/trainer/backends/kubernetes/constants.py +15 -0
  47. viettelcloud/aiplatform/trainer/backends/kubernetes/utils.py +636 -0
  48. viettelcloud/aiplatform/trainer/backends/kubernetes/utils_test.py +582 -0
  49. viettelcloud/aiplatform/trainer/backends/localprocess/__init__.py +0 -0
  50. viettelcloud/aiplatform/trainer/backends/localprocess/backend.py +306 -0
  51. viettelcloud/aiplatform/trainer/backends/localprocess/backend_test.py +501 -0
  52. viettelcloud/aiplatform/trainer/backends/localprocess/constants.py +90 -0
  53. viettelcloud/aiplatform/trainer/backends/localprocess/job.py +184 -0
  54. viettelcloud/aiplatform/trainer/backends/localprocess/types.py +52 -0
  55. viettelcloud/aiplatform/trainer/backends/localprocess/utils.py +302 -0
  56. viettelcloud/aiplatform/trainer/constants/__init__.py +0 -0
  57. viettelcloud/aiplatform/trainer/constants/constants.py +179 -0
  58. viettelcloud/aiplatform/trainer/options/__init__.py +52 -0
  59. viettelcloud/aiplatform/trainer/options/common.py +55 -0
  60. viettelcloud/aiplatform/trainer/options/kubernetes.py +502 -0
  61. viettelcloud/aiplatform/trainer/options/kubernetes_test.py +259 -0
  62. viettelcloud/aiplatform/trainer/options/localprocess.py +20 -0
  63. viettelcloud/aiplatform/trainer/test/common.py +22 -0
  64. viettelcloud/aiplatform/trainer/types/__init__.py +0 -0
  65. viettelcloud/aiplatform/trainer/types/types.py +517 -0
  66. viettelcloud/aiplatform/trainer/types/types_test.py +115 -0
  67. viettelcloud_aiplatform-0.3.0.dist-info/METADATA +226 -0
  68. viettelcloud_aiplatform-0.3.0.dist-info/RECORD +71 -0
  69. viettelcloud_aiplatform-0.3.0.dist-info/WHEEL +4 -0
  70. viettelcloud_aiplatform-0.3.0.dist-info/licenses/LICENSE +201 -0
  71. viettelcloud_aiplatform-0.3.0.dist-info/licenses/NOTICE +36 -0
@@ -0,0 +1,306 @@
1
+ # Copyright 2025 The Kubeflow Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from collections.abc import Callable, Iterator
15
+ from datetime import datetime
16
+ import logging
17
+ import random
18
+ import string
19
+ import tempfile
20
+ import time
21
+ from typing import Optional, Union
22
+ import uuid
23
+
24
+ from viettelcloud.aiplatform.trainer.backends.base import RuntimeBackend
25
+ from viettelcloud.aiplatform.trainer.backends.localprocess import utils as local_utils
26
+ from viettelcloud.aiplatform.trainer.backends.localprocess.constants import local_runtimes
27
+ from viettelcloud.aiplatform.trainer.backends.localprocess.job import LocalJob
28
+ from viettelcloud.aiplatform.trainer.backends.localprocess.types import (
29
+ LocalBackendJobs,
30
+ LocalBackendStep,
31
+ LocalProcessBackendConfig,
32
+ )
33
+ from viettelcloud.aiplatform.trainer.constants import constants
34
+ from viettelcloud.aiplatform.trainer.types import types
35
+
36
+ logger = logging.getLogger(__name__)  # module-level logger, named after this module
37
+
38
+
39
class LocalProcessBackend(RuntimeBackend):
    """Runtime backend that executes TrainJobs through ``LocalJob`` objects.

    Jobs are tracked only in an in-memory list owned by this instance, so a job
    is visible solely to the backend object that created it.
    """

    def __init__(
        self,
        cfg: LocalProcessBackendConfig,
    ):
        """Create a backend with an empty job registry.

        Args:
            cfg: Backend configuration. ``cfg.cleanup_venv`` is forwarded to the
                training script builder in :meth:`train`.
        """
        # Registry of the jobs started through this backend instance.
        self.__local_jobs: list[LocalBackendJobs] = []
        self.cfg = cfg

    def list_runtimes(self) -> list[types.Runtime]:
        """Return every entry of ``local_runtimes`` converted to ``types.Runtime``."""
        return [self.__convert_local_runtime_to_runtime(local_runtime=rt) for rt in local_runtimes]

    def get_runtime(self, name: str) -> types.Runtime:
        """Return the runtime called ``name``.

        Raises:
            ValueError: If no entry of ``local_runtimes`` has that name.
        """
        runtime = next(
            (
                self.__convert_local_runtime_to_runtime(rt)
                for rt in local_runtimes
                if rt.name == name
            ),
            None,
        )
        if not runtime:
            raise ValueError(f"Runtime '{name}' not found.")

        return runtime

    def get_runtime_packages(self, runtime: types.Runtime):
        """Return ``trainer.packages`` of the local runtime matching ``runtime.name``.

        Raises:
            ValueError: If no entry of ``local_runtimes`` has that name.
        """
        local_runtime = next((rt for rt in local_runtimes if rt.name == runtime.name), None)
        if not local_runtime:
            raise ValueError(f"Runtime '{runtime.name}' not found.")

        return local_runtime.trainer.packages

    def train(
        self,
        runtime: Optional[Union[str, types.Runtime]] = None,
        initializer: Optional[types.Initializer] = None,
        trainer: Optional[
            Union[types.CustomTrainer, types.CustomTrainerContainer, types.BuiltinTrainer]
        ] = None,
        options: Optional[list] = None,
    ) -> str:
        """Create, register and start a TrainJob, returning its name.

        Args:
            runtime: Runtime object, or the name of a runtime to look up with
                :meth:`get_runtime`. Required.
            initializer: Accepted for interface compatibility; this method does
                not use it.
            trainer: Must be a ``types.CustomTrainer``.
            options: Optional callables invoked as ``option(job_spec, trainer, self)``.
                After all of them ran, ``job_spec["metadata"]["name"]`` (if set)
                is used as the job name.

        Returns:
            The TrainJob name: the name taken from ``options`` or a generated one.

        Raises:
            ValueError: If ``runtime`` is missing or unknown, or if ``trainer`` is
                not a ``types.CustomTrainer``.
        """
        if runtime is None:
            raise ValueError("Runtime must be provided for LocalProcessBackend")
        if isinstance(runtime, str):
            runtime = self.get_runtime(runtime)

        # Process options to extract configuration.
        name = None
        if options:
            job_spec = {}
            for option in options:
                option(job_spec, trainer, self)

            metadata_section = job_spec.get("metadata", {})
            name = metadata_section.get("name")

        # Generate the TrainJob name if it was not provided via options: one
        # random lowercase letter followed by a truncated UUID hex string.
        trainjob_name = name or (
            random.choice(string.ascii_lowercase)
            + uuid.uuid4().hex[: constants.JOB_NAME_UUID_LENGTH]
        )

        # The localprocess backend only supports CustomTrainer.
        if not isinstance(trainer, types.CustomTrainer):
            raise ValueError("CustomTrainer must be set with LocalProcessBackend")

        # Working directory of the job; its name starts with the job name.
        venv_dir = tempfile.mkdtemp(prefix=trainjob_name)
        logger.debug("operating in %s", venv_dir)

        # NOTE(review): this replaces ``trainer`` on the runtime object itself, so a
        # Runtime instance passed in by the caller is modified in place.
        runtime.trainer = local_utils.get_local_runtime_trainer(
            runtime_name=runtime.name,
            venv_dir=venv_dir,
            framework=runtime.trainer.framework,
        )

        # Build the training job command.
        training_command = local_utils.get_local_train_job_script(
            trainer=trainer,
            runtime=runtime,
            train_job_name=trainjob_name,
            venv_dir=venv_dir,
            cleanup_venv=self.cfg.cleanup_venv,
        )

        # Record the command on the runtime trainer.
        runtime.trainer.set_command(training_command)

        train_job = LocalJob(
            name=f"{trainjob_name}-train",
            command=training_command,
            execution_dir=venv_dir,
            env=trainer.env,
            dependencies=[],
        )

        # Register before starting so the job can be looked up by name right away.
        self.__register_job(
            train_job_name=trainjob_name,
            step_name="train",
            job=train_job,
            runtime=runtime,
        )
        train_job.start()

        return trainjob_name

    def list_jobs(self, runtime: Optional[types.Runtime] = None) -> list[types.TrainJob]:
        """Return the registered TrainJobs, optionally filtered by runtime name.

        Each TrainJob carries the runtime it was registered with and its
        aggregated status, exactly as :meth:`get_job` reports them.

        Args:
            runtime: When given, only jobs whose runtime has the same ``name``
                are returned.
        """
        return [
            self.__to_train_job(_job)
            for _job in self.__local_jobs
            if not runtime or _job.runtime.name == runtime.name
        ]

    def get_job(self, name: str) -> types.TrainJob:
        """Return the TrainJob called ``name`` with its aggregated status.

        Raises:
            ValueError: If no job with that name is registered.
        """
        return self.__to_train_job(self.__find_job(name))

    def get_job_logs(
        self,
        name: str,
        follow: bool = False,
        step: str = constants.NODE + "-0",
    ) -> Iterator[str]:
        """Yield log lines of a TrainJob.

        With the default ``step`` value the logs of every step are yielded one
        step after another; any other value selects the step with that name.

        This is a generator, so the lookup (and a possible ``ValueError``) happens
        when iteration starts, not when the method is called.

        Raises:
            ValueError: If no job with that name is registered.
        """
        _job = self.__find_job(name)

        want_all_steps = step == constants.NODE + "-0"

        for _step in _job.steps:
            if not want_all_steps and _step.step_name != step:
                continue
            # Flatten the per-step log iterator into this generator.
            yield from _step.job.logs(follow=follow)

    def wait_for_job_status(
        self,
        name: str,
        status: set[str] = {constants.TRAINJOB_COMPLETE},
        timeout: int = 600,
        polling_interval: int = 2,
        callbacks: Optional[list[Callable[[types.TrainJob], None]]] = None,
    ) -> types.TrainJob:
        """Poll a TrainJob until its status is one of ``status``.

        Args:
            name: Name of the TrainJob.
            status: Accepted statuses. The default set is shared between calls
                but is only read here, never modified.
            timeout: Polling budget in seconds.
            polling_interval: Seconds slept between two polls; must be positive
                and not greater than ``timeout``.
            callbacks: Optional callables invoked with the current TrainJob on
                every poll, before the status is checked.

        Returns:
            The TrainJob as observed on the poll where the status matched.

        Raises:
            ValueError: If the job is unknown or ``polling_interval`` is invalid.
            TimeoutError: If the status was not reached within the polling budget.
        """
        # Fail fast on an unknown job before validating the polling parameters.
        self.__find_job(name)

        if polling_interval <= 0:
            # Guards the division below and an empty polling loop.
            raise ValueError(f"Polling interval must be positive, got: {polling_interval}")

        if polling_interval > timeout:
            raise ValueError(
                f"Polling interval {polling_interval} must be less than timeout: {timeout}"
            )

        for _ in range(round(timeout / polling_interval)):
            trainjob = self.get_job(name)

            for callback in callbacks or []:
                callback(trainjob)

            if trainjob.status in status:
                return trainjob

            time.sleep(polling_interval)

        raise TimeoutError(f"Timeout waiting for TrainJob {name} to reach status: {status}")

    def delete_job(self, name: str):
        """Cancel every step of a TrainJob and remove it from the registry.

        Raises:
            ValueError: If no job with that name is registered.
        """
        _job = self.__find_job(name)

        # Cancel all nested step jobs before dropping the job itself.
        for step in _job.steps:
            step.job.cancel()

        self.__local_jobs.remove(_job)

    def __find_job(self, name: str) -> LocalBackendJobs:
        """Return the first registered job called ``name`` or raise ``ValueError``."""
        _job = next((j for j in self.__local_jobs if j.name == name), None)
        if _job is None:
            raise ValueError(f"No TrainJob with name {name}")
        return _job

    def __to_train_job(self, job: LocalBackendJobs) -> types.TrainJob:
        """Convert a registry entry into the public ``types.TrainJob`` form."""
        return types.TrainJob(
            name=job.name,
            creation_timestamp=job.created,
            steps=[
                types.Step(
                    name=_step.step_name,
                    pod_name=_step.step_name,
                    status=_step.job.status,
                )
                for _step in job.steps
            ],
            runtime=job.runtime,
            num_nodes=1,
            status=self.__get_job_status(job),
        )

    def __get_job_status(self, job: LocalBackendJobs) -> str:
        """Aggregate the step statuses of ``job`` into one TrainJob status.

        Precedence: failed, then running, then created. The job is reported as
        complete only when it has at least one step and every step reports
        ``constants.TRAINJOB_COMPLETE``. Anything else falls back to created.
        """
        statuses = [_step.job.status for _step in job.steps]

        if constants.TRAINJOB_FAILED in statuses:
            return constants.TRAINJOB_FAILED
        if constants.TRAINJOB_RUNNING in statuses:
            return constants.TRAINJOB_RUNNING
        if constants.TRAINJOB_CREATED in statuses:
            return constants.TRAINJOB_CREATED
        # NOTE(review): assumes LocalJob reports finished steps with
        # constants.TRAINJOB_COMPLETE -- confirm against LocalJob.
        if statuses and all(s == constants.TRAINJOB_COMPLETE for s in statuses):
            return constants.TRAINJOB_COMPLETE

        # No steps, or statuses this backend does not recognise.
        return constants.TRAINJOB_CREATED

    def __register_job(
        self,
        train_job_name: str,
        step_name: str,
        job: LocalJob,
        runtime: types.Runtime,
    ):
        """Attach ``job`` as step ``step_name`` of the TrainJob ``train_job_name``.

        The TrainJob entry is created on first use. If a step with the same name
        is already registered, a warning is logged and ``job`` is not added.
        """
        _job = next((j for j in self.__local_jobs if j.name == train_job_name), None)
        if _job is None:
            _job = LocalBackendJobs(name=train_job_name, runtime=runtime, created=datetime.now())
            self.__local_jobs.append(_job)

        if any(s.step_name == step_name for s in _job.steps):
            logger.warning(f"Step '{step_name}' already registered.")
            return

        _job.steps.append(LocalBackendStep(step_name=step_name, job=job))

    def __convert_local_runtime_to_runtime(self, local_runtime) -> types.Runtime:
        """Copy the name and trainer fields of a local runtime into ``types.Runtime``."""
        return types.Runtime(
            name=local_runtime.name,
            trainer=types.RuntimeTrainer(
                trainer_type=local_runtime.trainer.trainer_type,
                framework=local_runtime.trainer.framework,
                num_nodes=local_runtime.trainer.num_nodes,
                device_count=local_runtime.trainer.device_count,
                device=local_runtime.trainer.device,
                image=local_runtime.trainer.image,
            ),
        )