viettelcloud-aiplatform 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. viettelcloud/__init__.py +1 -0
  2. viettelcloud/aiplatform/__init__.py +15 -0
  3. viettelcloud/aiplatform/common/__init__.py +0 -0
  4. viettelcloud/aiplatform/common/constants.py +22 -0
  5. viettelcloud/aiplatform/common/types.py +28 -0
  6. viettelcloud/aiplatform/common/utils.py +40 -0
  7. viettelcloud/aiplatform/hub/OWNERS +14 -0
  8. viettelcloud/aiplatform/hub/__init__.py +25 -0
  9. viettelcloud/aiplatform/hub/api/__init__.py +13 -0
  10. viettelcloud/aiplatform/hub/api/_proxy_client.py +355 -0
  11. viettelcloud/aiplatform/hub/api/model_registry_client.py +561 -0
  12. viettelcloud/aiplatform/hub/api/model_registry_client_test.py +462 -0
  13. viettelcloud/aiplatform/optimizer/__init__.py +45 -0
  14. viettelcloud/aiplatform/optimizer/api/__init__.py +0 -0
  15. viettelcloud/aiplatform/optimizer/api/optimizer_client.py +248 -0
  16. viettelcloud/aiplatform/optimizer/backends/__init__.py +13 -0
  17. viettelcloud/aiplatform/optimizer/backends/base.py +77 -0
  18. viettelcloud/aiplatform/optimizer/backends/kubernetes/__init__.py +13 -0
  19. viettelcloud/aiplatform/optimizer/backends/kubernetes/backend.py +563 -0
  20. viettelcloud/aiplatform/optimizer/backends/kubernetes/utils.py +112 -0
  21. viettelcloud/aiplatform/optimizer/constants/__init__.py +13 -0
  22. viettelcloud/aiplatform/optimizer/constants/constants.py +59 -0
  23. viettelcloud/aiplatform/optimizer/types/__init__.py +13 -0
  24. viettelcloud/aiplatform/optimizer/types/algorithm_types.py +87 -0
  25. viettelcloud/aiplatform/optimizer/types/optimization_types.py +135 -0
  26. viettelcloud/aiplatform/optimizer/types/search_types.py +95 -0
  27. viettelcloud/aiplatform/py.typed +0 -0
  28. viettelcloud/aiplatform/trainer/__init__.py +82 -0
  29. viettelcloud/aiplatform/trainer/api/__init__.py +3 -0
  30. viettelcloud/aiplatform/trainer/api/trainer_client.py +277 -0
  31. viettelcloud/aiplatform/trainer/api/trainer_client_test.py +72 -0
  32. viettelcloud/aiplatform/trainer/backends/__init__.py +0 -0
  33. viettelcloud/aiplatform/trainer/backends/base.py +94 -0
  34. viettelcloud/aiplatform/trainer/backends/container/adapters/base.py +195 -0
  35. viettelcloud/aiplatform/trainer/backends/container/adapters/docker.py +231 -0
  36. viettelcloud/aiplatform/trainer/backends/container/adapters/podman.py +258 -0
  37. viettelcloud/aiplatform/trainer/backends/container/backend.py +668 -0
  38. viettelcloud/aiplatform/trainer/backends/container/backend_test.py +867 -0
  39. viettelcloud/aiplatform/trainer/backends/container/runtime_loader.py +631 -0
  40. viettelcloud/aiplatform/trainer/backends/container/runtime_loader_test.py +637 -0
  41. viettelcloud/aiplatform/trainer/backends/container/types.py +67 -0
  42. viettelcloud/aiplatform/trainer/backends/container/utils.py +213 -0
  43. viettelcloud/aiplatform/trainer/backends/kubernetes/__init__.py +0 -0
  44. viettelcloud/aiplatform/trainer/backends/kubernetes/backend.py +710 -0
  45. viettelcloud/aiplatform/trainer/backends/kubernetes/backend_test.py +1344 -0
  46. viettelcloud/aiplatform/trainer/backends/kubernetes/constants.py +15 -0
  47. viettelcloud/aiplatform/trainer/backends/kubernetes/utils.py +636 -0
  48. viettelcloud/aiplatform/trainer/backends/kubernetes/utils_test.py +582 -0
  49. viettelcloud/aiplatform/trainer/backends/localprocess/__init__.py +0 -0
  50. viettelcloud/aiplatform/trainer/backends/localprocess/backend.py +306 -0
  51. viettelcloud/aiplatform/trainer/backends/localprocess/backend_test.py +501 -0
  52. viettelcloud/aiplatform/trainer/backends/localprocess/constants.py +90 -0
  53. viettelcloud/aiplatform/trainer/backends/localprocess/job.py +184 -0
  54. viettelcloud/aiplatform/trainer/backends/localprocess/types.py +52 -0
  55. viettelcloud/aiplatform/trainer/backends/localprocess/utils.py +302 -0
  56. viettelcloud/aiplatform/trainer/constants/__init__.py +0 -0
  57. viettelcloud/aiplatform/trainer/constants/constants.py +179 -0
  58. viettelcloud/aiplatform/trainer/options/__init__.py +52 -0
  59. viettelcloud/aiplatform/trainer/options/common.py +55 -0
  60. viettelcloud/aiplatform/trainer/options/kubernetes.py +502 -0
  61. viettelcloud/aiplatform/trainer/options/kubernetes_test.py +259 -0
  62. viettelcloud/aiplatform/trainer/options/localprocess.py +20 -0
  63. viettelcloud/aiplatform/trainer/test/common.py +22 -0
  64. viettelcloud/aiplatform/trainer/types/__init__.py +0 -0
  65. viettelcloud/aiplatform/trainer/types/types.py +517 -0
  66. viettelcloud/aiplatform/trainer/types/types_test.py +115 -0
  67. viettelcloud_aiplatform-0.3.0.dist-info/METADATA +226 -0
  68. viettelcloud_aiplatform-0.3.0.dist-info/RECORD +71 -0
  69. viettelcloud_aiplatform-0.3.0.dist-info/WHEEL +4 -0
  70. viettelcloud_aiplatform-0.3.0.dist-info/licenses/LICENSE +201 -0
  71. viettelcloud_aiplatform-0.3.0.dist-info/licenses/NOTICE +36 -0
viettelcloud/aiplatform/optimizer/backends/kubernetes/backend.py
@@ -0,0 +1,563 @@
+# Copyright 2025 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections.abc import Callable, Iterator
+import logging
+import multiprocessing
+import random
+import string
+import time
+from typing import Any, Optional
+import uuid
+
+from kubeflow_katib_api import models
+from kubernetes import client, config
+
+import viettelcloud.aiplatform.common.constants as common_constants
+from viettelcloud.aiplatform.common.types import KubernetesBackendConfig
+import viettelcloud.aiplatform.common.utils as common_utils
+from viettelcloud.aiplatform.optimizer.backends.base import RuntimeBackend
+from viettelcloud.aiplatform.optimizer.backends.kubernetes import utils
+from viettelcloud.aiplatform.optimizer.constants import constants
+from viettelcloud.aiplatform.optimizer.types.algorithm_types import BaseAlgorithm, RandomSearch
+from viettelcloud.aiplatform.optimizer.types.optimization_types import (
+    Metric,
+    Objective,
+    OptimizationJob,
+    Result,
+    Trial,
+    TrialConfig,
+)
+from viettelcloud.aiplatform.trainer.backends.kubernetes.backend import (
+    KubernetesBackend as TrainerBackend,
+)
+import viettelcloud.aiplatform.trainer.constants.constants as trainer_constants
+from viettelcloud.aiplatform.trainer.types.types import Event, TrainJobTemplate
+
+logger = logging.getLogger(__name__)
+
+
+class KubernetesBackend(RuntimeBackend):
+    def __init__(self, cfg: KubernetesBackendConfig):
+        if cfg.namespace is None:
+            cfg.namespace = common_utils.get_default_target_namespace(cfg.context)
+
+        # If client configuration is not set, use kube-config to access Kubernetes APIs.
+        if cfg.client_configuration is None:
+            # Load kube-config or in-cluster config.
+            if cfg.config_file or not common_utils.is_running_in_k8s():
+                config.load_kube_config(config_file=cfg.config_file, context=cfg.context)
+            else:
+                config.load_incluster_config()
+
+        k8s_client = client.ApiClient(cfg.client_configuration)
+        self.custom_api = client.CustomObjectsApi(k8s_client)
+        self.core_api = client.CoreV1Api(k8s_client)
+
+        self.namespace = cfg.namespace
+        self.trainer_backend = TrainerBackend(cfg)
+
+    def optimize(
+        self,
+        trial_template: TrainJobTemplate,
+        *,
+        search_space: dict[str, Any],
+        trial_config: Optional[TrialConfig] = None,
+        objectives: Optional[list[Objective]] = None,
+        algorithm: Optional[BaseAlgorithm] = None,
+    ) -> str:
+        # Generate unique name for the OptimizationJob.
+        optimization_job_name = random.choice(string.ascii_lowercase) + uuid.uuid4().hex[:11]
+
+        # Validate search_space
+        if not search_space:
+            raise ValueError("Search space must be set.")
+
+        # Set defaults.
+        objectives = objectives or [Objective()]
+        algorithm = algorithm or RandomSearch()
+        trial_config = trial_config or TrialConfig()
+
+        # Iterate over search space to build the following values:
+        # experiment.spec.parameters to define distribution and feasible space.
+        # experiment.spec.trialTemplate.trialParameters to reference parameters in Trials.
+        # Trainer function arguments for the appropriate substitution.
+        parameters_spec = []
+        trial_parameters = []
+        if trial_template.trainer.func_args is None:
+            trial_template.trainer.func_args = {}
+
+        for param_name, param_spec in search_space.items():
+            param_spec.name = param_name
+            parameters_spec.append(param_spec)
+
+            trial_parameters.append(
+                models.V1beta1TrialParameterSpec(
+                    name=param_name,
+                    reference=param_name,
+                )
+            )
+
+            trial_template.trainer.func_args[param_name] = f"${{trialParameters.{param_name}}}"
+
+        # Build the Experiment.
+        experiment = models.V1beta1Experiment(
+            apiVersion=constants.API_VERSION,
+            kind=constants.EXPERIMENT_KIND,
+            metadata=models.IoK8sApimachineryPkgApisMetaV1ObjectMeta(name=optimization_job_name),
+            spec=models.V1beta1ExperimentSpec(
+                # Trial template and parameters.
+                trialTemplate=models.V1beta1TrialTemplate(
+                    retain=True,
+                    primaryContainerName=trainer_constants.NODE,
+                    trialParameters=trial_parameters,
+                    trialSpec={
+                        "apiVersion": trainer_constants.API_VERSION,
+                        "kind": trainer_constants.TRAINJOB_KIND,
+                        "spec": self.trainer_backend._get_trainjob_spec(
+                            runtime=trial_template.runtime,
+                            trainer=trial_template.trainer,
+                            initializer=trial_template.initializer,
+                        ).to_dict(),
+                    },
+                ),
+                parameters=parameters_spec,
+                # Trial Configs.
+                maxTrialCount=trial_config.num_trials,
+                parallelTrialCount=trial_config.parallel_trials,
+                maxFailedTrialCount=trial_config.max_failed_trials,
+                # Objective specification.
+                objective=models.V1beta1ObjectiveSpec(
+                    objectiveMetricName=objectives[0].metric,
+                    type=objectives[0].direction.value,
+                    additionalMetricNames=[obj.metric for obj in objectives[1:]]
+                    if len(objectives) > 1
+                    else None,
+                ),
+                # Algorithm specification.
+                algorithm=algorithm._to_katib_spec(),
+            ),
+        )
+
+        # Create the Experiment.
+        try:
+            self.custom_api.create_namespaced_custom_object(
+                constants.GROUP,
+                constants.VERSION,
+                self.namespace,
+                constants.EXPERIMENT_PLURAL,
+                experiment.to_dict(),
+            )
+        except multiprocessing.TimeoutError as e:
+            raise TimeoutError(
+                f"Timeout to create {constants.OPTIMIZATION_JOB_KIND}: "
+                f"{self.namespace}/{optimization_job_name}"
+            ) from e
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to create {constants.OPTIMIZATION_JOB_KIND}: "
+                f"{self.namespace}/{optimization_job_name}"
+            ) from e
+
+        logger.debug(
+            f"{constants.OPTIMIZATION_JOB_KIND} {self.namespace}/{optimization_job_name} "
+            "has been created"
+        )
+
+        return optimization_job_name
+
+    def list_jobs(self) -> list[OptimizationJob]:
+        """List of the created OptimizationJobs"""
+        result = []
+
+        try:
+            thread = self.custom_api.list_namespaced_custom_object(
+                constants.GROUP,
+                constants.VERSION,
+                self.namespace,
+                constants.EXPERIMENT_PLURAL,
+                async_req=True,
+            )
+
+            optimization_job_list = models.V1beta1ExperimentList.from_dict(
+                thread.get(common_constants.DEFAULT_TIMEOUT)
+            )
+
+            if not optimization_job_list:
+                return result
+
+            for optimization_job in optimization_job_list.items:
+                result.append(self.__get_optimization_job_from_cr(optimization_job))
+
+        except multiprocessing.TimeoutError as e:
+            raise TimeoutError(
+                f"Timeout to list {constants.OPTIMIZATION_JOB_KIND}s in namespace: {self.namespace}"
+            ) from e
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to list {constants.OPTIMIZATION_JOB_KIND}s in namespace: {self.namespace}"
+            ) from e
+
+        return result
+
+    def get_job(self, name: str) -> OptimizationJob:
+        """Get the OptimizationJob object"""
+        optimization_job = self.__get_experiment_cr(name)
+        return self.__get_optimization_job_from_cr(optimization_job)
+
+    def get_job_logs(
+        self,
+        name: str,
+        trial_name: Optional[str] = None,
+        follow: bool = False,
+    ) -> Iterator[str]:
+        """Get the OptimizationJob logs from a Trial"""
+        # Determine what trial to get logs from.
+        if trial_name is None:
+            # Get logs from the best current trial.
+            best_trial = self._get_best_trial(name)
+            if best_trial is None:
+                # Get first trial if available.
+                optimization_job = self.get_job(name)
+                if not optimization_job.trials:
+                    return
+                trial_name = optimization_job.trials[0].name
+            else:
+                trial_name = best_trial.name
+            logger.debug(f"Getting logs from trial: {trial_name}")
+
+        # Get the Trial's Pod name.
+        pod_name = None
+        step = trainer_constants.NODE + "-0"
+        for c in self.trainer_backend.get_job(trial_name).steps:
+            if c.status != trainer_constants.POD_PENDING and c.name == step:
+                pod_name = c.pod_name
+                break
+        if pod_name is None:
+            return
+
+        container_name = constants.METRICS_COLLECTOR_CONTAINER
+        yield from self.trainer_backend._read_pod_logs(
+            pod_name=pod_name, container_name=container_name, follow=follow
+        )
+
+    def get_best_results(self, name: str) -> Optional[Result]:
+        """Get the best hyperparameters and metrics from an OptimizationJob"""
+        best_trial = self._get_best_trial(name)
+
+        if best_trial is None:
+            return None
+
+        return Result(
+            parameters=best_trial.parameters,
+            metrics=best_trial.metrics,
+        )
+
+    def wait_for_job_status(
+        self,
+        name: str,
+        status: set[str] = {constants.OPTIMIZATION_JOB_COMPLETE},
+        timeout: int = 3600,
+        polling_interval: int = 2,
+        callbacks: Optional[list[Callable[[OptimizationJob], None]]] = None,
+    ) -> OptimizationJob:
+        job_statuses = {
+            constants.OPTIMIZATION_JOB_CREATED,
+            constants.OPTIMIZATION_JOB_RUNNING,
+            constants.OPTIMIZATION_JOB_COMPLETE,
+            constants.OPTIMIZATION_JOB_FAILED,
+        }
+
+        if not status.issubset(job_statuses):
+            raise ValueError(f"Expected status {status} must be a subset of {job_statuses}")
+
+        if polling_interval > timeout:
+            raise ValueError(
+                f"Polling interval {polling_interval} must be less than timeout: {timeout}"
+            )
+
+        for _ in range(round(timeout / polling_interval)):
+            optimization_job = self.get_job(name)
+            logger.debug(
+                f"{constants.OPTIMIZATION_JOB_KIND} {name}, status {optimization_job.status}"
+            )
+
+            # Invoke callbacks if provided
+            if callbacks:
+                for callback in callbacks:
+                    callback(optimization_job)
+
+            if (
+                constants.OPTIMIZATION_JOB_FAILED not in status
+                and optimization_job.status == constants.OPTIMIZATION_JOB_FAILED
+            ):
+                raise RuntimeError(f"{constants.OPTIMIZATION_JOB_KIND} {name} is Failed")
+
+            if optimization_job.status in status:
+                return optimization_job
+
+            time.sleep(polling_interval)
+
+        raise TimeoutError(
+            f"Timeout waiting for {constants.OPTIMIZATION_JOB_KIND} {name} to reach status: "
+            f"{status}"
+        )
+
+    def delete_job(self, name: str):
+        """Delete the OptimizationJob"""
+
+        try:
+            self.custom_api.delete_namespaced_custom_object(
+                constants.GROUP,
+                constants.VERSION,
+                self.namespace,
+                constants.EXPERIMENT_PLURAL,
+                name=name,
+            )
+        except multiprocessing.TimeoutError as e:
+            raise TimeoutError(
+                f"Timeout to delete {constants.OPTIMIZATION_JOB_KIND}: {self.namespace}/{name}"
+            ) from e
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to delete {constants.OPTIMIZATION_JOB_KIND}: {self.namespace}/{name}"
+            ) from e
+
+        logger.debug(f"{constants.OPTIMIZATION_JOB_KIND} {self.namespace}/{name} has been deleted")
+
+    def get_job_events(self, name: str) -> list[Event]:
+        # Get the OptimizationJob to ensure it exists
+        job = self.get_job(name)
+
+        # Create set of all OptimizationJob-related resource names
+        optimization_job_resources = {name}
+        for trial in job.trials:
+            optimization_job_resources.add(trial.name)
+
+        events = []
+        try:
+            # Retrieve events from the namespace
+            event_response: models.IoK8sApiCoreV1EventList = self.core_api.list_namespaced_event(
+                namespace=self.namespace,
+                async_req=True,
+            ).get(common_constants.DEFAULT_TIMEOUT)
+
+            # Filter events related to OptimizationJob resources
+            for event in event_response.items:
+                if not (event.metadata and event.involved_object and event.first_timestamp):
+                    continue
+
+                involved_object = event.involved_object
+
+                # Check if event is related to OptimizationJob resources
+                if (
+                    involved_object.kind in {constants.EXPERIMENT_KIND, constants.TRIAL_KIND}
+                    and involved_object.name in optimization_job_resources
+                ):
+                    events.append(
+                        Event(
+                            involved_object_kind=involved_object.kind,
+                            involved_object_name=involved_object.name,
+                            message=event.message or "",
+                            reason=event.reason or "",
+                            event_time=event.first_timestamp,
+                        )
+                    )
+
+            # Sort events by first occurrence time
+            events.sort(key=lambda e: e.event_time)
+            return events
+        except multiprocessing.TimeoutError as e:
+            raise TimeoutError(
+                f"Timeout getting {constants.OPTIMIZATION_JOB_KIND} events: {self.namespace}/{name}"
+            ) from e
+
+    def _get_best_trial(self, name: str) -> Optional[Trial]:
+        """Get the best current Trial for the OptimizationJob"""
+        optimization_job = self.__get_experiment_cr(name)
+
+        # Get the best trial from currentOptimalTrial
+        if (
+            optimization_job.status
+            and optimization_job.status.current_optimal_trial
+            and optimization_job.status.current_optimal_trial.best_trial_name
+        ):
+            best_trial_name = optimization_job.status.current_optimal_trial.best_trial_name
+
+            parameters = {}
+            if optimization_job.status.current_optimal_trial.parameter_assignments:
+                parameters = {
+                    pa.name: pa.value
+                    for pa in optimization_job.status.current_optimal_trial.parameter_assignments
+                    if pa.name is not None and pa.value is not None
+                }
+
+            metrics = []
+            if (
+                optimization_job.status.current_optimal_trial.observation
+                and optimization_job.status.current_optimal_trial.observation.metrics
+            ):
+                metrics = [
+                    Metric(name=m.name, latest=m.latest, max=m.max, min=m.min)
+                    for m in optimization_job.status.current_optimal_trial.observation.metrics
+                    if m.name is not None
+                    and m.latest is not None
+                    and m.max is not None
+                    and m.min is not None
+                ]
+
+            trainjob = self.trainer_backend.get_job(name=best_trial_name)
+
+            return Trial(
+                name=best_trial_name,
+                parameters=parameters,
+                metrics=metrics,
+                trainjob=trainjob,
+            )
+
+        return None
+
+    def __get_experiment_cr(self, name: str) -> models.V1beta1Experiment:
+        """Get the Experiment CR from Kubernetes API"""
+        try:
+            thread = self.custom_api.get_namespaced_custom_object(
+                constants.GROUP,
+                constants.VERSION,
+                self.namespace,
+                constants.EXPERIMENT_PLURAL,
+                name,
+                async_req=True,
+            )
+
+            optimization_job = models.V1beta1Experiment.from_dict(
+                thread.get(common_constants.DEFAULT_TIMEOUT)  # type: ignore
+            )
+
+        except multiprocessing.TimeoutError as e:
+            raise TimeoutError(
+                f"Timeout to get {constants.OPTIMIZATION_JOB_KIND}: {self.namespace}/{name}"
+            ) from e
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to get {constants.OPTIMIZATION_JOB_KIND}: {self.namespace}/{name}"
+            ) from e
+
+        return optimization_job
+
+    def __get_optimization_job_from_cr(
+        self,
+        optimization_job_cr: models.V1beta1Experiment,
+    ) -> OptimizationJob:
+        if not (
+            optimization_job_cr.metadata
+            and optimization_job_cr.metadata.name
+            and optimization_job_cr.metadata.namespace
+            and optimization_job_cr.spec
+            and optimization_job_cr.spec.parameters
+            and optimization_job_cr.spec.objective
+            and optimization_job_cr.spec.algorithm
+            and optimization_job_cr.spec.max_trial_count
+            and optimization_job_cr.spec.parallel_trial_count
+            and optimization_job_cr.metadata.creation_timestamp
+        ):
+            raise Exception(
+                f"{constants.OPTIMIZATION_JOB_KIND} CR is invalid: {optimization_job_cr}"
+            )
+
+        optimization_job = OptimizationJob(
+            name=optimization_job_cr.metadata.name,
+            search_space=utils.get_search_space_from_katib_spec(
+                optimization_job_cr.spec.parameters
+            ),
+            objectives=utils.get_objectives_from_katib_spec(optimization_job_cr.spec.objective),
+            algorithm=utils.get_algorithm_from_katib_spec(optimization_job_cr.spec.algorithm),
+            trial_config=TrialConfig(
+                num_trials=optimization_job_cr.spec.max_trial_count,
+                parallel_trials=optimization_job_cr.spec.parallel_trial_count,
+                max_failed_trials=optimization_job_cr.spec.max_failed_trial_count,
+            ),
+            trials=self.__get_trials_from_job(optimization_job_cr.metadata.name),
+            creation_timestamp=optimization_job_cr.metadata.creation_timestamp,
+            status=constants.OPTIMIZATION_JOB_CREATED,  # The default OptimizationJob status.
+        )
+
+        # Update the OptimizationJob status from Experiment conditions.
+        if optimization_job_cr.status and optimization_job_cr.status.conditions:
+            for c in optimization_job_cr.status.conditions:
+                if c.type == constants.EXPERIMENT_SUCCEEDED and c.status == "True":
+                    optimization_job.status = constants.OPTIMIZATION_JOB_COMPLETE
+                elif c.type == constants.OPTIMIZATION_JOB_FAILED and c.status == "True":
+                    optimization_job.status = constants.OPTIMIZATION_JOB_FAILED
+                else:
+                    for trial in optimization_job.trials:
+                        if trial.trainjob.status == trainer_constants.TRAINJOB_RUNNING:
+                            optimization_job.status = constants.OPTIMIZATION_JOB_RUNNING
+
+        return optimization_job
+
+    def __get_trials_from_job(self, optimization_job_name: str) -> list[Trial]:
+        result = []
+        try:
+            thread = self.custom_api.list_namespaced_custom_object(
+                constants.GROUP,
+                constants.VERSION,
+                self.namespace,
+                constants.TRIAL_PLURAL,
+                label_selector=f"{constants.EXPERIMENT_LABEL}={optimization_job_name}",
+                async_req=True,
+            )
+
+            trial_list = models.V1beta1TrialList.from_dict(
+                thread.get(common_constants.DEFAULT_TIMEOUT)
+            )
+
+            if not trial_list:
+                return result
+
+            for t in trial_list.items:
+                if not (t.metadata and t.metadata.name and t.spec and t.spec.parameter_assignments):
+                    raise ValueError(f"{constants.TRIAL_KIND} CR is invalid: {t}")
+
+                # Trial name is equal to the TrainJob name.
+                trial = Trial(
+                    name=t.metadata.name,
+                    parameters={
+                        pa.name: pa.value
+                        for pa in t.spec.parameter_assignments
+                        if pa.name is not None and pa.value is not None
+                    },
+                    trainjob=self.trainer_backend.get_job(name=t.metadata.name),
+                )
+                if t.status and t.status.observation and t.status.observation.metrics:
+                    trial.metrics = [
+                        Metric(name=m.name, latest=m.latest, max=m.max, min=m.min)
+                        for m in t.status.observation.metrics
+                        if m.name is not None
+                        and m.latest is not None
+                        and m.max is not None
+                        and m.min is not None
+                    ]
+
+                result.append(trial)
+
+        except multiprocessing.TimeoutError as e:
+            raise TimeoutError(
+                f"Timeout to list {constants.TRIAL_KIND}s in namespace: {self.namespace}"
+            ) from e
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to list {constants.TRIAL_KIND}s in namespace: {self.namespace}"
+            ) from e
+
+        return result
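For orientation, here is a minimal monitoring sketch against the backend above. It is not part of the packaged sources: it assumes KubernetesBackendConfig can be constructed with its defaults, that a kube-config or in-cluster config is available (as handled in __init__), and it uses a placeholder job name standing in for a value returned by optimize().

# Hypothetical usage sketch -- not shipped in the wheel.
from viettelcloud.aiplatform.common.types import KubernetesBackendConfig
from viettelcloud.aiplatform.optimizer.backends.kubernetes.backend import KubernetesBackend

backend = KubernetesBackend(KubernetesBackendConfig())  # loads kube-config or in-cluster config

job_name = "my-optimization-job"  # placeholder for a name returned by optimize()

# Block until the Experiment reaches the Complete status (raises on Failed or timeout).
job = backend.wait_for_job_status(job_name)

# Stream metrics-collector logs from the best (or first) Trial.
for line in backend.get_job_logs(job_name):
    print(line)

# Read back the best hyperparameters and metrics, if a best Trial exists.
best = backend.get_best_results(job_name)
if best is not None:
    print(best.parameters, best.metrics)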
viettelcloud/aiplatform/optimizer/backends/kubernetes/utils.py
@@ -0,0 +1,112 @@
+# Copyright 2025 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import fields
+from typing import Any, Optional, Union, get_args, get_origin
+
+from kubeflow_katib_api import models
+
+from viettelcloud.aiplatform.optimizer.constants import constants
+from viettelcloud.aiplatform.optimizer.types.algorithm_types import (
+    ALGORITHM_REGISTRY,
+    GridSearch,
+    RandomSearch,
+)
+from viettelcloud.aiplatform.optimizer.types.optimization_types import Direction, Objective
+from viettelcloud.aiplatform.optimizer.types.search_types import (
+    CategoricalSearchSpace,
+    ContinuousSearchSpace,
+    Distribution,
+)
+
+
+def convert_value(raw_value: str, target_type: Any):
+    origin = get_origin(target_type)
+    args = get_args(target_type)
+
+    if origin is Union:
+        target_type = args[0]
+
+    if target_type is int:
+        return int(raw_value)
+    elif target_type is float:
+        return float(raw_value)
+    elif target_type is bool:
+        return raw_value.lower() in ("true", "1")
+    return raw_value
+
+
+def get_algorithm_from_katib_spec(
+    algorithm: models.V1beta1AlgorithmSpec,
+) -> Union[GridSearch, RandomSearch]:
+    alg_cls = ALGORITHM_REGISTRY.get(algorithm.algorithm_name or "")
+
+    if alg_cls is None:
+        raise ValueError(f"Kubeflow SDK doesn't support {algorithm.algorithm_name} algorithm.")
+
+    kwargs = {}
+    settings = {s.name: s.value for s in algorithm.algorithm_settings or []}
+
+    for f in fields(alg_cls):
+        raw_value = settings.get(f.name)
+        if raw_value is None:
+            continue
+
+        if f.name in settings:
+            kwargs[f.name] = convert_value(raw_value, f.type)
+
+    return alg_cls(**kwargs)
+
+
+def get_objectives_from_katib_spec(objective: models.V1beta1ObjectiveSpec) -> list[Objective]:
+    if objective.objective_metric_name is None:
+        raise ValueError("Objective metric name cannot be empty")
+
+    # TODO (andreyvelich): Katib doesn't support multi-objective optimization.
+    # Currently, the first metric is the objective, and the rest are additional metrics.
+    direction = Direction(objective.type)
+    metrics = [objective.objective_metric_name] + (objective.additional_metric_names or [])
+
+    return [Objective(metric=m, direction=direction) for m in metrics]
+
+
+def get_search_space_from_katib_spec(
+    parameters: list[models.V1beta1ParameterSpec],
+) -> dict[str, Union[ContinuousSearchSpace, CategoricalSearchSpace]]:
+    search_space = {}
+
+    for p in parameters:
+        if p.parameter_type == constants.CATEGORICAL_PARAMETERS:
+            if not (p.feasible_space and p.feasible_space.list):
+                raise ValueError(f"Katib categorical parameters are invalid: {parameters}")
+
+            search_space[p.name] = CategoricalSearchSpace(
+                choices=[str(v) for v in p.feasible_space.list]
+            )
+        else:
+            if not (
+                p.feasible_space
+                and p.feasible_space.min
+                and p.feasible_space.max
+                and p.feasible_space.distribution
+            ):
+                raise ValueError(f"Katib continuous parameters are invalid: {parameters}")
+
+            search_space[p.name] = ContinuousSearchSpace(
+                min=float(p.feasible_space.min),
+                max=float(p.feasible_space.max),
+                distribution=Distribution(p.feasible_space.distribution),
+            )
+
+    return search_space
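As a quick illustration of the helpers above, here is a small round-trip sketch. It is not part of the packaged sources: the ObjectiveSpec is built with the same keyword arguments backend.py uses, and it assumes the Direction enum takes Katib's "maximize"/"minimize" objective type strings as its values.

# Hypothetical round-trip sketch -- not shipped in the wheel.
from kubeflow_katib_api import models

from viettelcloud.aiplatform.optimizer.backends.kubernetes import utils

spec = models.V1beta1ObjectiveSpec(
    objectiveMetricName="accuracy",
    type="maximize",
    additionalMetricNames=["loss"],
)

# The first metric becomes the objective; the rest become additional metrics.
objectives = utils.get_objectives_from_katib_spec(spec)
print([o.metric for o in objectives])  # ['accuracy', 'loss']

# Katib algorithm settings arrive as strings; convert_value restores typed values.
print(utils.convert_value("42", int), utils.convert_value("0.05", float))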
@@ -0,0 +1,13 @@
+# Copyright 2025 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.