viettelcloud-aiplatform 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- viettelcloud/__init__.py +1 -0
- viettelcloud/aiplatform/__init__.py +15 -0
- viettelcloud/aiplatform/common/__init__.py +0 -0
- viettelcloud/aiplatform/common/constants.py +22 -0
- viettelcloud/aiplatform/common/types.py +28 -0
- viettelcloud/aiplatform/common/utils.py +40 -0
- viettelcloud/aiplatform/hub/OWNERS +14 -0
- viettelcloud/aiplatform/hub/__init__.py +25 -0
- viettelcloud/aiplatform/hub/api/__init__.py +13 -0
- viettelcloud/aiplatform/hub/api/_proxy_client.py +355 -0
- viettelcloud/aiplatform/hub/api/model_registry_client.py +561 -0
- viettelcloud/aiplatform/hub/api/model_registry_client_test.py +462 -0
- viettelcloud/aiplatform/optimizer/__init__.py +45 -0
- viettelcloud/aiplatform/optimizer/api/__init__.py +0 -0
- viettelcloud/aiplatform/optimizer/api/optimizer_client.py +248 -0
- viettelcloud/aiplatform/optimizer/backends/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/backends/base.py +77 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/backend.py +563 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/utils.py +112 -0
- viettelcloud/aiplatform/optimizer/constants/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/constants/constants.py +59 -0
- viettelcloud/aiplatform/optimizer/types/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/types/algorithm_types.py +87 -0
- viettelcloud/aiplatform/optimizer/types/optimization_types.py +135 -0
- viettelcloud/aiplatform/optimizer/types/search_types.py +95 -0
- viettelcloud/aiplatform/py.typed +0 -0
- viettelcloud/aiplatform/trainer/__init__.py +82 -0
- viettelcloud/aiplatform/trainer/api/__init__.py +3 -0
- viettelcloud/aiplatform/trainer/api/trainer_client.py +277 -0
- viettelcloud/aiplatform/trainer/api/trainer_client_test.py +72 -0
- viettelcloud/aiplatform/trainer/backends/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/base.py +94 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/base.py +195 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/docker.py +231 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/podman.py +258 -0
- viettelcloud/aiplatform/trainer/backends/container/backend.py +668 -0
- viettelcloud/aiplatform/trainer/backends/container/backend_test.py +867 -0
- viettelcloud/aiplatform/trainer/backends/container/runtime_loader.py +631 -0
- viettelcloud/aiplatform/trainer/backends/container/runtime_loader_test.py +637 -0
- viettelcloud/aiplatform/trainer/backends/container/types.py +67 -0
- viettelcloud/aiplatform/trainer/backends/container/utils.py +213 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/backend.py +710 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/backend_test.py +1344 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/constants.py +15 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/utils.py +636 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/utils_test.py +582 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/backend.py +306 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/backend_test.py +501 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/constants.py +90 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/job.py +184 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/types.py +52 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/utils.py +302 -0
- viettelcloud/aiplatform/trainer/constants/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/constants/constants.py +179 -0
- viettelcloud/aiplatform/trainer/options/__init__.py +52 -0
- viettelcloud/aiplatform/trainer/options/common.py +55 -0
- viettelcloud/aiplatform/trainer/options/kubernetes.py +502 -0
- viettelcloud/aiplatform/trainer/options/kubernetes_test.py +259 -0
- viettelcloud/aiplatform/trainer/options/localprocess.py +20 -0
- viettelcloud/aiplatform/trainer/test/common.py +22 -0
- viettelcloud/aiplatform/trainer/types/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/types/types.py +517 -0
- viettelcloud/aiplatform/trainer/types/types_test.py +115 -0
- viettelcloud_aiplatform-0.3.0.dist-info/METADATA +226 -0
- viettelcloud_aiplatform-0.3.0.dist-info/RECORD +71 -0
- viettelcloud_aiplatform-0.3.0.dist-info/WHEEL +4 -0
- viettelcloud_aiplatform-0.3.0.dist-info/licenses/LICENSE +201 -0
- viettelcloud_aiplatform-0.3.0.dist-info/licenses/NOTICE +36 -0
|
@@ -0,0 +1,502 @@
|
|
|
1
|
+
# Copyright 2025 The Kubeflow Authors.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""Kubernetes-specific training options for the Kubeflow Trainer SDK."""
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from typing import Any, Optional, Union
|
|
19
|
+
|
|
20
|
+
from viettelcloud.aiplatform.trainer.backends.base import RuntimeBackend
|
|
21
|
+
from viettelcloud.aiplatform.trainer.types.types import (
|
|
22
|
+
BuiltinTrainer,
|
|
23
|
+
CustomTrainer,
|
|
24
|
+
CustomTrainerContainer,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class ContainerOverride:
    """Configuration for overriding a specific container in a pod.

    All fields are validated in ``__post_init__``; the first problem found is
    reported as a ``ValueError``.

    Args:
        name: Name of the container to override (must exist in TrainingRuntime).
        env: Environment variables to add/merge with the container.
            Each dict should have 'name' and 'value' or 'valueFrom' keys.
        volume_mounts: Volume mounts to add/merge with the container.
            Each dict should have 'name' and 'mountPath' keys at minimum.
    """

    name: str
    env: Optional[list[dict]] = None
    volume_mounts: Optional[list[dict]] = None

    def __post_init__(self):
        """Validate the container override configuration.

        Raises:
            ValueError: If ``name`` is not a non-empty string, or if ``env`` /
                ``volume_mounts`` is given and is malformed.
        """
        # The isinstance check comes first so that a truthy non-string name
        # (e.g. 123) is reported as ValueError rather than failing on .strip().
        if not isinstance(self.name, str) or not self.name.strip():
            raise ValueError("Container name must be a non-empty string")

        if self.env is not None:
            self._validate_env(self.env)

        if self.volume_mounts is not None:
            self._validate_volume_mounts(self.volume_mounts)

    @staticmethod
    def _validate_env(env: object) -> None:
        """Check that ``env`` is a list of well-formed env-var dictionaries."""
        if not isinstance(env, list):
            raise ValueError("env must be a list of dictionaries")

        # A tuple (not a set) keeps the order in the error message stable
        # from one interpreter run to the next.
        value_from_keys = ("configMapKeyRef", "secretKeyRef", "fieldRef", "resourceFieldRef")

        for env_var in env:
            if not isinstance(env_var, dict):
                raise ValueError("Each env entry must be a dictionary")
            if "name" not in env_var:
                raise ValueError("Each env entry must have a 'name' key")
            env_name = env_var["name"]
            if not env_name or not isinstance(env_name, str):
                raise ValueError("env 'name' must be a non-empty string")
            if "value" not in env_var and "valueFrom" not in env_var:
                raise ValueError("Each env entry must have either 'value' or 'valueFrom' key")

            # Validate valueFrom structure if present
            if "valueFrom" in env_var:
                value_from = env_var["valueFrom"]
                if not isinstance(value_from, dict):
                    raise ValueError("env 'valueFrom' must be a dictionary")
                # valueFrom must name at least one supported source
                if not any(key in value_from for key in value_from_keys):
                    raise ValueError(
                        f"env 'valueFrom' must contain one of: {', '.join(value_from_keys)}"
                    )

    @staticmethod
    def _validate_volume_mounts(volume_mounts: object) -> None:
        """Check that ``volume_mounts`` is a list of well-formed mount dictionaries."""
        if not isinstance(volume_mounts, list):
            raise ValueError("volume_mounts must be a list of dictionaries")

        for mount in volume_mounts:
            if not isinstance(mount, dict):
                raise ValueError("Each volume_mounts entry must be a dictionary")
            if "name" not in mount:
                raise ValueError("Each volume_mounts entry must have a 'name' key")
            mount_name = mount["name"]
            if not mount_name or not isinstance(mount_name, str):
                raise ValueError("volume_mounts 'name' must be a non-empty string")
            if "mountPath" not in mount:
                raise ValueError("Each volume_mounts entry must have a 'mountPath' key")
            mount_path = mount["mountPath"]
            if not mount_path or not isinstance(mount_path, str):
                raise ValueError("volume_mounts 'mountPath' must be a non-empty string")
            if not mount_path.startswith("/"):
                raise ValueError(
                    f"volume_mounts 'mountPath' must be an absolute path "
                    f"(start with /): {mount_path}"
                )
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@dataclass
class PodSpecOverride:
    """Pod-spec level settings that can be overridden on a pod template.

    Every field is optional and defaults to ``None``.

    Args:
        service_account_name: Service account the pods should run as.
        node_selector: Node selector used to place pods on specific nodes.
        affinity: Affinity rules for pod scheduling.
        tolerations: Tolerations for pod scheduling.
        volumes: Volumes to add/merge with the pod.
        init_containers: Init containers to add/merge with the pod.
        containers: Containers to add/merge with the pod.
        scheduling_gates: Scheduling gates for the pods.
        image_pull_secrets: Image pull secrets for the pods.
    """

    # Scheduling and identity.
    service_account_name: Optional[str] = None
    node_selector: Optional[dict[str, str]] = None
    affinity: Optional[dict] = None
    tolerations: Optional[list[dict]] = None

    # Storage.
    volumes: Optional[list[dict]] = None

    # Per-container overrides.
    init_containers: Optional[list[ContainerOverride]] = None
    containers: Optional[list[ContainerOverride]] = None

    # Admission and image access.
    scheduling_gates: Optional[list[dict]] = None
    image_pull_secrets: Optional[list[dict]] = None
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@dataclass
class PodTemplateOverride:
    """A set of pod template overrides aimed at specific job types.

    Args:
        target_jobs: Names of the jobs the overrides apply to
            (e.g., ["node", "launcher"]).
        metadata: Metadata overrides for the pod template (labels, annotations).
        spec: Spec overrides for the pod template.
    """

    # Required: the jobs this override is aimed at.
    target_jobs: list[str]

    # Optional override payloads; both default to None.
    metadata: Optional[dict] = None
    spec: Optional[PodSpecOverride] = None
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@dataclass
class Labels:
    """Set labels on the TrainJob resource metadata (.metadata.labels).

    Supported backends:
        - Kubernetes

    Args:
        labels: Dictionary of label key-value pairs to store in TrainJob metadata.
    """

    labels: dict[str, str]

    def __call__(
        self,
        job_spec: dict[str, Any],
        trainer: Optional[Union[CustomTrainer, BuiltinTrainer]],
        backend: RuntimeBackend,
    ) -> None:
        """Write the configured labels into ``job_spec["metadata"]["labels"]``.

        The ``metadata`` mapping is created when absent. Any value already
        stored under ``labels`` is replaced, and the option's own dictionary
        object is stored (no copy is made).

        Args:
            job_spec: Job specification dictionary, modified in place.
            trainer: Optional trainer instance; not used by this option.
            backend: Backend instance, checked for compatibility.

        Raises:
            ValueError: If ``backend`` is not a KubernetesBackend.
        """
        # NOTE(review): imported inside the call rather than at module top,
        # presumably to avoid an import cycle with the backend package - confirm.
        from viettelcloud.aiplatform.trainer.backends.kubernetes.backend import KubernetesBackend

        if isinstance(backend, KubernetesBackend):
            job_spec.setdefault("metadata", {})["labels"] = self.labels
            return

        raise ValueError(
            f"Labels option is not compatible with {type(backend).__name__}. "
            f"Supported backends: KubernetesBackend"
        )
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
@dataclass
class Annotations:
    """Set annotations on the TrainJob resource metadata (.metadata.annotations).

    Supported backends:
        - Kubernetes

    Args:
        annotations: Dictionary of annotation key-value pairs to store in
            TrainJob metadata.
    """

    annotations: dict[str, str]

    def __call__(
        self,
        job_spec: dict[str, Any],
        trainer: Optional[Union[CustomTrainer, BuiltinTrainer]],
        backend: RuntimeBackend,
    ) -> None:
        """Write the configured annotations into ``job_spec["metadata"]["annotations"]``.

        The ``metadata`` mapping is created when absent. Any value already
        stored under ``annotations`` is replaced, and the option's own
        dictionary object is stored (no copy is made).

        Args:
            job_spec: Job specification dictionary, modified in place.
            trainer: Optional trainer instance; not used by this option.
            backend: Backend instance, checked for compatibility.

        Raises:
            ValueError: If ``backend`` is not a KubernetesBackend.
        """
        # NOTE(review): imported inside the call rather than at module top,
        # presumably to avoid an import cycle with the backend package - confirm.
        from viettelcloud.aiplatform.trainer.backends.kubernetes.backend import KubernetesBackend

        backend_supported = isinstance(backend, KubernetesBackend)
        if not backend_supported:
            raise ValueError(
                f"Annotations option is not compatible with {type(backend).__name__}. "
                f"Supported backends: KubernetesBackend"
            )

        job_metadata = job_spec.setdefault("metadata", {})
        job_metadata["annotations"] = self.annotations
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
@dataclass
class SpecLabels:
    """Set labels for the derivative JobSet and Jobs (.spec.labels).

    These labels will be merged with the TrainingRuntime values and applied to
    the JobSet and Jobs created by the TrainJob.

    Supported backends:
        - Kubernetes

    Args:
        labels: Dictionary of label key-value pairs for the JobSet and Jobs.
    """

    labels: dict[str, str]

    def __call__(
        self,
        job_spec: dict[str, Any],
        trainer: Optional[Union[CustomTrainer, BuiltinTrainer]],
        backend: RuntimeBackend,
    ) -> None:
        """Write the configured labels into ``job_spec["spec"]["labels"]``.

        The ``spec`` mapping is created when absent. Any value already stored
        under ``labels`` is replaced, and the option's own dictionary object
        is stored (no copy is made).

        Args:
            job_spec: Job specification dictionary, modified in place.
            trainer: Optional trainer instance; not used by this option.
            backend: Backend instance, checked for compatibility.

        Raises:
            ValueError: If ``backend`` is not a KubernetesBackend.
        """
        # NOTE(review): imported inside the call rather than at module top,
        # presumably to avoid an import cycle with the backend package - confirm.
        from viettelcloud.aiplatform.trainer.backends.kubernetes.backend import KubernetesBackend

        if isinstance(backend, KubernetesBackend):
            job_spec.setdefault("spec", {})["labels"] = self.labels
        else:
            raise ValueError(
                f"SpecLabels option is not compatible with {type(backend).__name__}. "
                f"Supported backends: KubernetesBackend"
            )
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
@dataclass
class SpecAnnotations:
    """Set annotations for the derivative JobSet and Jobs (.spec.annotations).

    These annotations will be merged with the TrainingRuntime values and applied
    to the JobSet and Jobs created by the TrainJob.

    Supported backends:
        - Kubernetes

    Args:
        annotations: Dictionary of annotation key-value pairs for the JobSet
            and Jobs.
    """

    annotations: dict[str, str]

    def __call__(
        self,
        job_spec: dict[str, Any],
        trainer: Optional[Union[CustomTrainer, BuiltinTrainer]],
        backend: RuntimeBackend,
    ) -> None:
        """Write the configured annotations into ``job_spec["spec"]["annotations"]``.

        The ``spec`` mapping is created when absent. Any value already stored
        under ``annotations`` is replaced, and the option's own dictionary
        object is stored (no copy is made).

        Args:
            job_spec: Job specification dictionary, modified in place.
            trainer: Optional trainer instance; not used by this option.
            backend: Backend instance, checked for compatibility.

        Raises:
            ValueError: If ``backend`` is not a KubernetesBackend.
        """
        # NOTE(review): imported inside the call rather than at module top,
        # presumably to avoid an import cycle with the backend package - confirm.
        from viettelcloud.aiplatform.trainer.backends.kubernetes.backend import KubernetesBackend

        if not isinstance(backend, KubernetesBackend):
            raise ValueError(
                f"SpecAnnotations option is not compatible with {type(backend).__name__}. "
                f"Supported backends: KubernetesBackend"
            )

        target = job_spec.setdefault("spec", {})
        target["annotations"] = self.annotations
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
class PodTemplateOverrides:
    """Add pod template overrides to the TrainJob (.spec.podTemplateOverrides).

    Supported backends:
        - Kubernetes

    Args:
        *overrides: One or more PodTemplateOverride objects.
    """

    # (PodSpecOverride attribute, API key) pairs that are copied through
    # unchanged when the attribute is truthy. Order fixes the key order of
    # the generated spec dictionary.
    _PASSTHROUGH_SPEC_FIELDS = (
        ("service_account_name", "serviceAccountName"),
        ("node_selector", "nodeSelector"),
        ("affinity", "affinity"),
        ("tolerations", "tolerations"),
        ("volumes", "volumes"),
        ("scheduling_gates", "schedulingGates"),
        ("image_pull_secrets", "imagePullSecrets"),
    )

    def __init__(self, *overrides: PodTemplateOverride):
        """Initialize with variable number of PodTemplateOverride objects.

        Raises:
            ValueError: If no override is given.
        """
        if not overrides:
            raise ValueError("At least one PodTemplateOverride must be provided")
        self.pod_overrides = list(overrides)

    def __call__(
        self,
        job_spec: dict[str, Any],
        trainer: Optional[Union[CustomTrainer, BuiltinTrainer]],
        backend: RuntimeBackend,
    ) -> None:
        """Append the overrides to ``job_spec["spec"]["podTemplateOverrides"]``.

        The ``spec`` mapping and the ``podTemplateOverrides`` list are created
        when absent; entries already in the list are kept and the new ones are
        appended after them.

        Args:
            job_spec: Job specification dictionary, modified in place.
            trainer: Optional trainer instance; not used by this option.
            backend: Backend instance, checked for compatibility.

        Raises:
            ValueError: If ``backend`` is not a KubernetesBackend.
        """
        # NOTE(review): imported inside the call rather than at module top,
        # presumably to avoid an import cycle with the backend package - confirm.
        from viettelcloud.aiplatform.trainer.backends.kubernetes.backend import KubernetesBackend

        if not isinstance(backend, KubernetesBackend):
            raise ValueError(
                f"PodTemplateOverrides option is not compatible with {type(backend).__name__}. "
                f"Supported backends: KubernetesBackend"
            )

        spec = job_spec.setdefault("spec", {})
        pod_overrides = spec.setdefault("podTemplateOverrides", [])

        for override in self.pod_overrides:
            pod_overrides.append(self._override_to_api(override))

    @classmethod
    def _override_to_api(cls, override: PodTemplateOverride) -> dict[str, Any]:
        """Convert one PodTemplateOverride into its API dictionary form."""
        api_override: dict[str, Any] = {
            "targetJobs": [{"name": job} for job in override.target_jobs]
        }

        if override.metadata:
            api_override["metadata"] = override.metadata

        if override.spec:
            spec_dict = cls._spec_to_api(override.spec)
            # An override spec with nothing set produces no "spec" key at all.
            if spec_dict:
                api_override["spec"] = spec_dict

        return api_override

    @classmethod
    def _spec_to_api(cls, pod_spec: PodSpecOverride) -> dict[str, Any]:
        """Convert a PodSpecOverride into a camelCase dict, skipping falsy fields."""
        spec_dict: dict[str, Any] = {}

        for attr_name, api_key in cls._PASSTHROUGH_SPEC_FIELDS:
            value = getattr(pod_spec, attr_name)
            if value:
                spec_dict[api_key] = value

        # Handle container overrides
        if pod_spec.init_containers:
            spec_dict["initContainers"] = [
                cls._container_to_api(container) for container in pod_spec.init_containers
            ]

        if pod_spec.containers:
            spec_dict["containers"] = [
                cls._container_to_api(container) for container in pod_spec.containers
            ]

        return spec_dict

    @staticmethod
    def _container_to_api(container: ContainerOverride) -> dict[str, Any]:
        """Convert a ContainerOverride into a dict with name and any set env/volumeMounts."""
        container_dict: dict[str, Any] = {"name": container.name}
        if container.env:
            container_dict["env"] = container.env
        if container.volume_mounts:
            container_dict["volumeMounts"] = container.volume_mounts
        return container_dict
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
@dataclass
class TrainerCommand:
    """Override the trainer container command (.spec.trainer.command).

    Can only be used with CustomTrainerContainer. CustomTrainer generates its own
    command from the function, and BuiltinTrainer uses pre-configured commands.

    Supported backends:
        - Kubernetes

    Args:
        command: List of command strings that replaces the default trainer command.
    """

    command: list[str]

    def __call__(
        self,
        job_spec: dict[str, Any],
        trainer: Optional[Union[CustomTrainer, BuiltinTrainer, CustomTrainerContainer]],
        backend: RuntimeBackend,
    ) -> None:
        """Write the command override into ``job_spec["spec"]["trainer"]["command"]``.

        The ``spec`` and ``trainer`` mappings are created when absent. The
        backend is checked before the trainer type.

        Args:
            job_spec: The job specification to modify in place.
            trainer: Optional trainer used for validation; ``None`` is accepted.
            backend: Backend instance, checked for compatibility.

        Raises:
            ValueError: If ``backend`` is not a KubernetesBackend, or if
                ``trainer`` is given and is not a CustomTrainerContainer.
        """
        # NOTE(review): imported inside the call rather than at module top,
        # presumably to avoid an import cycle with the backend package - confirm.
        from viettelcloud.aiplatform.trainer.backends.kubernetes.backend import KubernetesBackend

        if not isinstance(backend, KubernetesBackend):
            raise ValueError(
                f"TrainerCommand option is not compatible with {type(backend).__name__}. "
                f"Supported backends: KubernetesBackend"
            )

        trainer_allowed = trainer is None or isinstance(trainer, CustomTrainerContainer)
        if not trainer_allowed:
            raise ValueError(
                "TrainerCommand can only be used with CustomTrainerContainer. "
                "CustomTrainer generates its own command from the function, and "
                "BuiltinTrainer uses pre-configured commands."
            )

        job_spec.setdefault("spec", {}).setdefault("trainer", {})["command"] = self.command
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
@dataclass
class TrainerArgs:
    """Override the trainer container arguments (.spec.trainer.args).

    Can only be used with CustomTrainerContainer. CustomTrainer generates its own
    arguments from the function, and BuiltinTrainer uses pre-configured arguments.

    Supported backends:
        - Kubernetes

    Args:
        args: List of argument strings that replaces the default trainer arguments.
    """

    args: list[str]

    def __call__(
        self,
        job_spec: dict[str, Any],
        trainer: Optional[Union[CustomTrainer, BuiltinTrainer, CustomTrainerContainer]],
        backend: RuntimeBackend,
    ) -> None:
        """Write the args override into ``job_spec["spec"]["trainer"]["args"]``.

        The ``spec`` and ``trainer`` mappings are created when absent. The
        backend is checked before the trainer type.

        Args:
            job_spec: The job specification to modify in place.
            trainer: Optional trainer used for validation; ``None`` is accepted.
            backend: Backend instance, checked for compatibility.

        Raises:
            ValueError: If ``backend`` is not a KubernetesBackend, or if
                ``trainer`` is given and is not a CustomTrainerContainer.
        """
        # NOTE(review): imported inside the call rather than at module top,
        # presumably to avoid an import cycle with the backend package - confirm.
        from viettelcloud.aiplatform.trainer.backends.kubernetes.backend import KubernetesBackend

        if not isinstance(backend, KubernetesBackend):
            raise ValueError(
                f"TrainerArgs option is not compatible with {type(backend).__name__}. "
                f"Supported backends: KubernetesBackend"
            )

        if not (trainer is None or isinstance(trainer, CustomTrainerContainer)):
            raise ValueError(
                "TrainerArgs can only be used with CustomTrainerContainer. "
                "CustomTrainer generates its own arguments from the function, and "
                "BuiltinTrainer uses pre-configured arguments."
            )

        trainer_section = job_spec.setdefault("spec", {}).setdefault("trainer", {})
        trainer_section["args"] = self.args
|