viettelcloud-aiplatform 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- viettelcloud/__init__.py +1 -0
- viettelcloud/aiplatform/__init__.py +15 -0
- viettelcloud/aiplatform/common/__init__.py +0 -0
- viettelcloud/aiplatform/common/constants.py +22 -0
- viettelcloud/aiplatform/common/types.py +28 -0
- viettelcloud/aiplatform/common/utils.py +40 -0
- viettelcloud/aiplatform/hub/OWNERS +14 -0
- viettelcloud/aiplatform/hub/__init__.py +25 -0
- viettelcloud/aiplatform/hub/api/__init__.py +13 -0
- viettelcloud/aiplatform/hub/api/_proxy_client.py +355 -0
- viettelcloud/aiplatform/hub/api/model_registry_client.py +561 -0
- viettelcloud/aiplatform/hub/api/model_registry_client_test.py +462 -0
- viettelcloud/aiplatform/optimizer/__init__.py +45 -0
- viettelcloud/aiplatform/optimizer/api/__init__.py +0 -0
- viettelcloud/aiplatform/optimizer/api/optimizer_client.py +248 -0
- viettelcloud/aiplatform/optimizer/backends/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/backends/base.py +77 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/backend.py +563 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/utils.py +112 -0
- viettelcloud/aiplatform/optimizer/constants/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/constants/constants.py +59 -0
- viettelcloud/aiplatform/optimizer/types/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/types/algorithm_types.py +87 -0
- viettelcloud/aiplatform/optimizer/types/optimization_types.py +135 -0
- viettelcloud/aiplatform/optimizer/types/search_types.py +95 -0
- viettelcloud/aiplatform/py.typed +0 -0
- viettelcloud/aiplatform/trainer/__init__.py +82 -0
- viettelcloud/aiplatform/trainer/api/__init__.py +3 -0
- viettelcloud/aiplatform/trainer/api/trainer_client.py +277 -0
- viettelcloud/aiplatform/trainer/api/trainer_client_test.py +72 -0
- viettelcloud/aiplatform/trainer/backends/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/base.py +94 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/base.py +195 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/docker.py +231 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/podman.py +258 -0
- viettelcloud/aiplatform/trainer/backends/container/backend.py +668 -0
- viettelcloud/aiplatform/trainer/backends/container/backend_test.py +867 -0
- viettelcloud/aiplatform/trainer/backends/container/runtime_loader.py +631 -0
- viettelcloud/aiplatform/trainer/backends/container/runtime_loader_test.py +637 -0
- viettelcloud/aiplatform/trainer/backends/container/types.py +67 -0
- viettelcloud/aiplatform/trainer/backends/container/utils.py +213 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/backend.py +710 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/backend_test.py +1344 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/constants.py +15 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/utils.py +636 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/utils_test.py +582 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/backend.py +306 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/backend_test.py +501 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/constants.py +90 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/job.py +184 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/types.py +52 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/utils.py +302 -0
- viettelcloud/aiplatform/trainer/constants/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/constants/constants.py +179 -0
- viettelcloud/aiplatform/trainer/options/__init__.py +52 -0
- viettelcloud/aiplatform/trainer/options/common.py +55 -0
- viettelcloud/aiplatform/trainer/options/kubernetes.py +502 -0
- viettelcloud/aiplatform/trainer/options/kubernetes_test.py +259 -0
- viettelcloud/aiplatform/trainer/options/localprocess.py +20 -0
- viettelcloud/aiplatform/trainer/test/common.py +22 -0
- viettelcloud/aiplatform/trainer/types/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/types/types.py +517 -0
- viettelcloud/aiplatform/trainer/types/types_test.py +115 -0
- viettelcloud_aiplatform-0.3.0.dist-info/METADATA +226 -0
- viettelcloud_aiplatform-0.3.0.dist-info/RECORD +71 -0
- viettelcloud_aiplatform-0.3.0.dist-info/WHEEL +4 -0
- viettelcloud_aiplatform-0.3.0.dist-info/licenses/LICENSE +201 -0
- viettelcloud_aiplatform-0.3.0.dist-info/licenses/NOTICE +36 -0
@@ -0,0 +1,563 @@

# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections.abc import Callable, Iterator
import logging
import multiprocessing
import random
import string
import time
from typing import Any, Optional
import uuid

from kubeflow_katib_api import models
from kubernetes import client, config

import viettelcloud.aiplatform.common.constants as common_constants
from viettelcloud.aiplatform.common.types import KubernetesBackendConfig
import viettelcloud.aiplatform.common.utils as common_utils
from viettelcloud.aiplatform.optimizer.backends.base import RuntimeBackend
from viettelcloud.aiplatform.optimizer.backends.kubernetes import utils
from viettelcloud.aiplatform.optimizer.constants import constants
from viettelcloud.aiplatform.optimizer.types.algorithm_types import BaseAlgorithm, RandomSearch
from viettelcloud.aiplatform.optimizer.types.optimization_types import (
    Metric,
    Objective,
    OptimizationJob,
    Result,
    Trial,
    TrialConfig,
)
from viettelcloud.aiplatform.trainer.backends.kubernetes.backend import (
    KubernetesBackend as TrainerBackend,
)
import viettelcloud.aiplatform.trainer.constants.constants as trainer_constants
from viettelcloud.aiplatform.trainer.types.types import Event, TrainJobTemplate

logger = logging.getLogger(__name__)


class KubernetesBackend(RuntimeBackend):
    def __init__(self, cfg: KubernetesBackendConfig):
        if cfg.namespace is None:
            cfg.namespace = common_utils.get_default_target_namespace(cfg.context)

        # If client configuration is not set, use kube-config to access Kubernetes APIs.
        if cfg.client_configuration is None:
            # Load kube-config or in-cluster config.
            if cfg.config_file or not common_utils.is_running_in_k8s():
                config.load_kube_config(config_file=cfg.config_file, context=cfg.context)
            else:
                config.load_incluster_config()

        k8s_client = client.ApiClient(cfg.client_configuration)
        self.custom_api = client.CustomObjectsApi(k8s_client)
        self.core_api = client.CoreV1Api(k8s_client)

        self.namespace = cfg.namespace
        self.trainer_backend = TrainerBackend(cfg)

    def optimize(
        self,
        trial_template: TrainJobTemplate,
        *,
        search_space: dict[str, Any],
        trial_config: Optional[TrialConfig] = None,
        objectives: Optional[list[Objective]] = None,
        algorithm: Optional[BaseAlgorithm] = None,
    ) -> str:
        # Generate unique name for the OptimizationJob.
        optimization_job_name = random.choice(string.ascii_lowercase) + uuid.uuid4().hex[:11]

        # Validate search_space
        if not search_space:
            raise ValueError("Search space must be set.")

        # Set defaults.
        objectives = objectives or [Objective()]
        algorithm = algorithm or RandomSearch()
        trial_config = trial_config or TrialConfig()

        # Iterate over search space to build the following values:
        # experiment.spec.parameters to define distribution and feasible space.
        # experiment.spec.trialTemplate.trialParameters to reference parameters in Trials.
        # Trainer function arguments for the appropriate substitution.
        parameters_spec = []
        trial_parameters = []
        if trial_template.trainer.func_args is None:
            trial_template.trainer.func_args = {}

        for param_name, param_spec in search_space.items():
            param_spec.name = param_name
            parameters_spec.append(param_spec)

            trial_parameters.append(
                models.V1beta1TrialParameterSpec(
                    name=param_name,
                    reference=param_name,
                )
            )

            trial_template.trainer.func_args[param_name] = f"${{trialParameters.{param_name}}}"

        # Build the Experiment.
        experiment = models.V1beta1Experiment(
            apiVersion=constants.API_VERSION,
            kind=constants.EXPERIMENT_KIND,
            metadata=models.IoK8sApimachineryPkgApisMetaV1ObjectMeta(name=optimization_job_name),
            spec=models.V1beta1ExperimentSpec(
                # Trial template and parameters.
                trialTemplate=models.V1beta1TrialTemplate(
                    retain=True,
                    primaryContainerName=trainer_constants.NODE,
                    trialParameters=trial_parameters,
                    trialSpec={
                        "apiVersion": trainer_constants.API_VERSION,
                        "kind": trainer_constants.TRAINJOB_KIND,
                        "spec": self.trainer_backend._get_trainjob_spec(
                            runtime=trial_template.runtime,
                            trainer=trial_template.trainer,
                            initializer=trial_template.initializer,
                        ).to_dict(),
                    },
                ),
                parameters=parameters_spec,
                # Trial Configs.
                maxTrialCount=trial_config.num_trials,
                parallelTrialCount=trial_config.parallel_trials,
                maxFailedTrialCount=trial_config.max_failed_trials,
                # Objective specification.
                objective=models.V1beta1ObjectiveSpec(
                    objectiveMetricName=objectives[0].metric,
                    type=objectives[0].direction.value,
                    additionalMetricNames=[obj.metric for obj in objectives[1:]]
                    if len(objectives) > 1
                    else None,
                ),
                # Algorithm specification.
                algorithm=algorithm._to_katib_spec(),
            ),
        )

        # Create the Experiment.
        try:
            self.custom_api.create_namespaced_custom_object(
                constants.GROUP,
                constants.VERSION,
                self.namespace,
                constants.EXPERIMENT_PLURAL,
                experiment.to_dict(),
            )
        except multiprocessing.TimeoutError as e:
            raise TimeoutError(
                f"Timeout to create {constants.OPTIMIZATION_JOB_KIND}: "
                f"{self.namespace}/{optimization_job_name}"
            ) from e
        except Exception as e:
            raise RuntimeError(
                f"Failed to create {constants.OPTIMIZATION_JOB_KIND}: "
                f"{self.namespace}/{optimization_job_name}"
            ) from e

        logger.debug(
            f"{constants.OPTIMIZATION_JOB_KIND} {self.namespace}/{optimization_job_name} "
            "has been created"
        )

        return optimization_job_name

    def list_jobs(self) -> list[OptimizationJob]:
        """List of the created OptimizationJobs"""
        result = []

        try:
            thread = self.custom_api.list_namespaced_custom_object(
                constants.GROUP,
                constants.VERSION,
                self.namespace,
                constants.EXPERIMENT_PLURAL,
                async_req=True,
            )

            optimization_job_list = models.V1beta1ExperimentList.from_dict(
                thread.get(common_constants.DEFAULT_TIMEOUT)
            )

            if not optimization_job_list:
                return result

            for optimization_job in optimization_job_list.items:
                result.append(self.__get_optimization_job_from_cr(optimization_job))

        except multiprocessing.TimeoutError as e:
            raise TimeoutError(
                f"Timeout to list {constants.OPTIMIZATION_JOB_KIND}s in namespace: {self.namespace}"
            ) from e
        except Exception as e:
            raise RuntimeError(
                f"Failed to list {constants.OPTIMIZATION_JOB_KIND}s in namespace: {self.namespace}"
            ) from e

        return result

    def get_job(self, name: str) -> OptimizationJob:
        """Get the OptimizationJob object"""
        optimization_job = self.__get_experiment_cr(name)
        return self.__get_optimization_job_from_cr(optimization_job)

    def get_job_logs(
        self,
        name: str,
        trial_name: Optional[str] = None,
        follow: bool = False,
    ) -> Iterator[str]:
        """Get the OptimizationJob logs from a Trial"""
        # Determine what trial to get logs from.
        if trial_name is None:
            # Get logs from the best current trial.
            best_trial = self._get_best_trial(name)
            if best_trial is None:
                # Get first trial if available.
                optimization_job = self.get_job(name)
                if not optimization_job.trials:
                    return
                trial_name = optimization_job.trials[0].name
            else:
                trial_name = best_trial.name
        logger.debug(f"Getting logs from trial: {trial_name}")

        # Get the Trial's Pod name.
        pod_name = None
        step = trainer_constants.NODE + "-0"
        for c in self.trainer_backend.get_job(trial_name).steps:
            if c.status != trainer_constants.POD_PENDING and c.name == step:
                pod_name = c.pod_name
                break
        if pod_name is None:
            return

        container_name = constants.METRICS_COLLECTOR_CONTAINER
        yield from self.trainer_backend._read_pod_logs(
            pod_name=pod_name, container_name=container_name, follow=follow
        )

    def get_best_results(self, name: str) -> Optional[Result]:
        """Get the best hyperparameters and metrics from an OptimizationJob"""
        best_trial = self._get_best_trial(name)

        if best_trial is None:
            return None

        return Result(
            parameters=best_trial.parameters,
            metrics=best_trial.metrics,
        )

    def wait_for_job_status(
        self,
        name: str,
        status: set[str] = {constants.OPTIMIZATION_JOB_COMPLETE},
        timeout: int = 3600,
        polling_interval: int = 2,
        callbacks: Optional[list[Callable[[OptimizationJob], None]]] = None,
    ) -> OptimizationJob:
        job_statuses = {
            constants.OPTIMIZATION_JOB_CREATED,
            constants.OPTIMIZATION_JOB_RUNNING,
            constants.OPTIMIZATION_JOB_COMPLETE,
            constants.OPTIMIZATION_JOB_FAILED,
        }

        if not status.issubset(job_statuses):
            raise ValueError(f"Expected status {status} must be a subset of {job_statuses}")

        if polling_interval > timeout:
            raise ValueError(
                f"Polling interval {polling_interval} must be less than timeout: {timeout}"
            )

        for _ in range(round(timeout / polling_interval)):
            optimization_job = self.get_job(name)
            logger.debug(
                f"{constants.OPTIMIZATION_JOB_KIND} {name}, status {optimization_job.status}"
            )

            # Invoke callbacks if provided
            if callbacks:
                for callback in callbacks:
                    callback(optimization_job)

            if (
                constants.OPTIMIZATION_JOB_FAILED not in status
                and optimization_job.status == constants.OPTIMIZATION_JOB_FAILED
            ):
                raise RuntimeError(f"{constants.OPTIMIZATION_JOB_KIND} {name} is Failed")

            if optimization_job.status in status:
                return optimization_job

            time.sleep(polling_interval)

        raise TimeoutError(
            f"Timeout waiting for {constants.OPTIMIZATION_JOB_KIND} {name} to reach status: "
            f"{status}"
        )

    def delete_job(self, name: str):
        """Delete the OptimizationJob"""

        try:
            self.custom_api.delete_namespaced_custom_object(
                constants.GROUP,
                constants.VERSION,
                self.namespace,
                constants.EXPERIMENT_PLURAL,
                name=name,
            )
        except multiprocessing.TimeoutError as e:
            raise TimeoutError(
                f"Timeout to delete {constants.OPTIMIZATION_JOB_KIND}: {self.namespace}/{name}"
            ) from e
        except Exception as e:
            raise RuntimeError(
                f"Failed to delete {constants.OPTIMIZATION_JOB_KIND}: {self.namespace}/{name}"
            ) from e

        logger.debug(f"{constants.OPTIMIZATION_JOB_KIND} {self.namespace}/{name} has been deleted")

    def get_job_events(self, name: str) -> list[Event]:
        # Get the OptimizationJob to ensure it exists
        job = self.get_job(name)

        # Create set of all OptimizationJob-related resource names
        optimization_job_resources = {name}
        for trial in job.trials:
            optimization_job_resources.add(trial.name)

        events = []
        try:
            # Retrieve events from the namespace
            event_response: models.IoK8sApiCoreV1EventList = self.core_api.list_namespaced_event(
                namespace=self.namespace,
                async_req=True,
            ).get(common_constants.DEFAULT_TIMEOUT)

            # Filter events related to OptimizationJob resources
            for event in event_response.items:
                if not (event.metadata and event.involved_object and event.first_timestamp):
                    continue

                involved_object = event.involved_object

                # Check if event is related to OptimizationJob resources
                if (
                    involved_object.kind in {constants.EXPERIMENT_KIND, constants.TRIAL_KIND}
                    and involved_object.name in optimization_job_resources
                ):
                    events.append(
                        Event(
                            involved_object_kind=involved_object.kind,
                            involved_object_name=involved_object.name,
                            message=event.message or "",
                            reason=event.reason or "",
                            event_time=event.first_timestamp,
                        )
                    )

            # Sort events by first occurrence time
            events.sort(key=lambda e: e.event_time)
            return events
        except multiprocessing.TimeoutError as e:
            raise TimeoutError(
                f"Timeout getting {constants.OPTIMIZATION_JOB_KIND} events: {self.namespace}/{name}"
            ) from e

    def _get_best_trial(self, name: str) -> Optional[Trial]:
        """Get the best current Trial for the OptimizationJob"""
        optimization_job = self.__get_experiment_cr(name)

        # Get the best trial from currentOptimalTrial
        if (
            optimization_job.status
            and optimization_job.status.current_optimal_trial
            and optimization_job.status.current_optimal_trial.best_trial_name
        ):
            best_trial_name = optimization_job.status.current_optimal_trial.best_trial_name

            parameters = {}
            if optimization_job.status.current_optimal_trial.parameter_assignments:
                parameters = {
                    pa.name: pa.value
                    for pa in optimization_job.status.current_optimal_trial.parameter_assignments
                    if pa.name is not None and pa.value is not None
                }

            metrics = []
            if (
                optimization_job.status.current_optimal_trial.observation
                and optimization_job.status.current_optimal_trial.observation.metrics
            ):
                metrics = [
                    Metric(name=m.name, latest=m.latest, max=m.max, min=m.min)
                    for m in optimization_job.status.current_optimal_trial.observation.metrics
                    if m.name is not None
                    and m.latest is not None
                    and m.max is not None
                    and m.min is not None
                ]

            trainjob = self.trainer_backend.get_job(name=best_trial_name)

            return Trial(
                name=best_trial_name,
                parameters=parameters,
                metrics=metrics,
                trainjob=trainjob,
            )

        return None

    def __get_experiment_cr(self, name: str) -> models.V1beta1Experiment:
        """Get the Experiment CR from Kubernetes API"""
        try:
            thread = self.custom_api.get_namespaced_custom_object(
                constants.GROUP,
                constants.VERSION,
                self.namespace,
                constants.EXPERIMENT_PLURAL,
                name,
                async_req=True,
            )

            optimization_job = models.V1beta1Experiment.from_dict(
                thread.get(common_constants.DEFAULT_TIMEOUT)  # type: ignore
            )

        except multiprocessing.TimeoutError as e:
            raise TimeoutError(
                f"Timeout to get {constants.OPTIMIZATION_JOB_KIND}: {self.namespace}/{name}"
            ) from e
        except Exception as e:
            raise RuntimeError(
                f"Failed to get {constants.OPTIMIZATION_JOB_KIND}: {self.namespace}/{name}"
            ) from e

        return optimization_job

    def __get_optimization_job_from_cr(
        self,
        optimization_job_cr: models.V1beta1Experiment,
    ) -> OptimizationJob:
        if not (
            optimization_job_cr.metadata
            and optimization_job_cr.metadata.name
            and optimization_job_cr.metadata.namespace
            and optimization_job_cr.spec
            and optimization_job_cr.spec.parameters
            and optimization_job_cr.spec.objective
            and optimization_job_cr.spec.algorithm
            and optimization_job_cr.spec.max_trial_count
            and optimization_job_cr.spec.parallel_trial_count
            and optimization_job_cr.metadata.creation_timestamp
        ):
            raise Exception(
                f"{constants.OPTIMIZATION_JOB_KIND} CR is invalid: {optimization_job_cr}"
            )

        optimization_job = OptimizationJob(
            name=optimization_job_cr.metadata.name,
            search_space=utils.get_search_space_from_katib_spec(
                optimization_job_cr.spec.parameters
            ),
            objectives=utils.get_objectives_from_katib_spec(optimization_job_cr.spec.objective),
            algorithm=utils.get_algorithm_from_katib_spec(optimization_job_cr.spec.algorithm),
            trial_config=TrialConfig(
                num_trials=optimization_job_cr.spec.max_trial_count,
                parallel_trials=optimization_job_cr.spec.parallel_trial_count,
                max_failed_trials=optimization_job_cr.spec.max_failed_trial_count,
            ),
            trials=self.__get_trials_from_job(optimization_job_cr.metadata.name),
            creation_timestamp=optimization_job_cr.metadata.creation_timestamp,
            status=constants.OPTIMIZATION_JOB_CREATED,  # The default OptimizationJob status.
        )

        # Update the OptimizationJob status from Experiment conditions.
        if optimization_job_cr.status and optimization_job_cr.status.conditions:
            for c in optimization_job_cr.status.conditions:
                if c.type == constants.EXPERIMENT_SUCCEEDED and c.status == "True":
                    optimization_job.status = constants.OPTIMIZATION_JOB_COMPLETE
                elif c.type == constants.OPTIMIZATION_JOB_FAILED and c.status == "True":
                    optimization_job.status = constants.OPTIMIZATION_JOB_FAILED
                else:
                    for trial in optimization_job.trials:
                        if trial.trainjob.status == trainer_constants.TRAINJOB_RUNNING:
                            optimization_job.status = constants.OPTIMIZATION_JOB_RUNNING

        return optimization_job

    def __get_trials_from_job(self, optimization_job_name: str) -> list[Trial]:
        result = []
        try:
            thread = self.custom_api.list_namespaced_custom_object(
                constants.GROUP,
                constants.VERSION,
                self.namespace,
                constants.TRIAL_PLURAL,
                label_selector=f"{constants.EXPERIMENT_LABEL}={optimization_job_name}",
                async_req=True,
            )

            trial_list = models.V1beta1TrialList.from_dict(
                thread.get(common_constants.DEFAULT_TIMEOUT)
            )

            if not trial_list:
                return result

            for t in trial_list.items:
                if not (t.metadata and t.metadata.name and t.spec and t.spec.parameter_assignments):
                    raise ValueError(f"{constants.TRIAL_KIND} CR is invalid: {t}")

                # Trial name is equal to the TrainJob name.
                trial = Trial(
                    name=t.metadata.name,
                    parameters={
                        pa.name: pa.value
                        for pa in t.spec.parameter_assignments
                        if pa.name is not None and pa.value is not None
                    },
                    trainjob=self.trainer_backend.get_job(name=t.metadata.name),
                )
                if t.status and t.status.observation and t.status.observation.metrics:
                    trial.metrics = [
                        Metric(name=m.name, latest=m.latest, max=m.max, min=m.min)
                        for m in t.status.observation.metrics
                        if m.name is not None
                        and m.latest is not None
                        and m.max is not None
                        and m.min is not None
                    ]

                result.append(trial)

        except multiprocessing.TimeoutError as e:
            raise TimeoutError(
                f"Timeout to list {constants.TRIAL_KIND}s in namespace: {self.namespace}"
            ) from e
        except Exception as e:
            raise RuntimeError(
                f"Failed to list {constants.TRIAL_KIND}s in namespace: {self.namespace}"
            ) from e

        return result
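By line count (+563), the hunk above appears to be viettelcloud/aiplatform/optimizer/backends/kubernetes/backend.py from the file list: a Katib-backed optimizer backend that creates an Experiment custom resource, reuses the trainer's Kubernetes backend for TrainJob specs and logs, and maps Experiment/Trial CRs back into OptimizationJob and Trial objects. The sketch below shows how this backend might be driven, using only methods visible in the hunk. It assumes KubernetesBackendConfig can be constructed with defaults, that `template` is a TrainJobTemplate built elsewhere in the SDK, and that `lr_space` is a parameter spec of whatever type the optimizer's search helpers produce; none of those constructors appear in this diff.

from viettelcloud.aiplatform.common.types import KubernetesBackendConfig
from viettelcloud.aiplatform.optimizer.backends.kubernetes.backend import KubernetesBackend
from viettelcloud.aiplatform.optimizer.types.optimization_types import TrialConfig

# Hypothetical inputs, built elsewhere in the SDK (not part of this diff):
#   template: a TrainJobTemplate describing the training function and runtime
#   lr_space: a search-space/parameter spec for the "lr" hyperparameter
backend = KubernetesBackend(KubernetesBackendConfig())  # loads kube-config or in-cluster config

job_name = backend.optimize(
    template,
    search_space={"lr": lr_space},
    trial_config=TrialConfig(),  # default trial counts
)

# Block until the underlying Experiment reports completion, then read the best trial.
backend.wait_for_job_status(job_name)
best = backend.get_best_results(job_name)
if best is not None:
    print(best.parameters, best.metrics)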
@@ -0,0 +1,112 @@

# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import fields
from typing import Any, Optional, Union, get_args, get_origin

from kubeflow_katib_api import models

from viettelcloud.aiplatform.optimizer.constants import constants
from viettelcloud.aiplatform.optimizer.types.algorithm_types import (
    ALGORITHM_REGISTRY,
    GridSearch,
    RandomSearch,
)
from viettelcloud.aiplatform.optimizer.types.optimization_types import Direction, Objective
from viettelcloud.aiplatform.optimizer.types.search_types import (
    CategoricalSearchSpace,
    ContinuousSearchSpace,
    Distribution,
)


def convert_value(raw_value: str, target_type: Any):
    origin = get_origin(target_type)
    args = get_args(target_type)

    if origin is Optional:
        target_type = args[0]

    if target_type is int:
        return int(raw_value)
    elif target_type is float:
        return float(raw_value)
    elif target_type is bool:
        return raw_value.lower() in ("True", "1")
    return raw_value


def get_algorithm_from_katib_spec(
    algorithm: models.V1beta1AlgorithmSpec,
) -> Union[GridSearch, RandomSearch]:
    alg_cls = ALGORITHM_REGISTRY.get(algorithm.algorithm_name or "")

    if alg_cls is None:
        raise ValueError(f"Kubeflow SDK doesn't support {algorithm.algorithm_name} algorithm.")

    kwargs = {}
    settings = {s.name: s.value for s in algorithm.algorithm_settings or []}

    for f in fields(alg_cls):
        raw_value = settings.get(f.name)
        if raw_value is None:
            continue

        if f.name in settings:
            kwargs[f.name] = convert_value(raw_value, f.type)

    return alg_cls(**kwargs)


def get_objectives_from_katib_spec(objective: models.V1beta1ObjectiveSpec) -> list[Objective]:
    if objective.objective_metric_name is None:
        raise ValueError("Objective metric name cannot be empty")

    # TODO (andreyvelich): Katib doesn't support multi-objective optimization.
    # Currently, the first metric is objective, and the rest is additional metrics.
    direction = Direction(objective.type)
    metrics = [objective.objective_metric_name] + (objective.additional_metric_names or [])

    return [Objective(metric=m, direction=direction) for m in metrics]


def get_search_space_from_katib_spec(
    parameters: list[models.V1beta1ParameterSpec],
) -> dict[str, Union[ContinuousSearchSpace, CategoricalSearchSpace]]:
    search_space = {}

    for p in parameters:
        if p.parameter_type == constants.CATEGORICAL_PARAMETERS:
            if not (p.feasible_space and p.feasible_space.list):
                raise ValueError(f"Katib categorical parameters are invalid: {parameters}")

            search_space[p.name] = CategoricalSearchSpace(
                choices=[str(v) for v in p.feasible_space.list]
            )
        else:
            if not (
                p.feasible_space
                and p.feasible_space.min
                and p.feasible_space.max
                and p.feasible_space.distribution
            ):
                raise ValueError(f"Katib continuous parameters are invalid: {parameters}")

            search_space[p.name] = ContinuousSearchSpace(
                min=float(p.feasible_space.min),
                max=float(p.feasible_space.max),
                distribution=Distribution(p.feasible_space.distribution),
            )

    return search_space
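By line count (+112), the hunk above lines up with viettelcloud/aiplatform/optimizer/backends/kubernetes/utils.py: helpers that translate Katib algorithm, objective, and parameter specs back into the SDK's own types. The one piece that can be exercised on its own is convert_value, which coerces Katib's string-valued algorithm settings into the types declared on the SDK's algorithm dataclasses. A small illustration, assuming the module path matches that file in the listing:

from viettelcloud.aiplatform.optimizer.backends.kubernetes.utils import convert_value

print(convert_value("42", int))     # -> 42
print(convert_value("0.1", float))  # -> 0.1
print(convert_value("1", bool))     # -> True
# Note: the bool branch lowercases the input before comparing against ("True", "1"),
# so only "1" (not "True") yields True as written.
print(convert_value("adam", str))   # -> "adam" (falls through unchanged)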
@@ -0,0 +1,13 @@

# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.