viettelcloud-aiplatform 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- viettelcloud/__init__.py +1 -0
- viettelcloud/aiplatform/__init__.py +15 -0
- viettelcloud/aiplatform/common/__init__.py +0 -0
- viettelcloud/aiplatform/common/constants.py +22 -0
- viettelcloud/aiplatform/common/types.py +28 -0
- viettelcloud/aiplatform/common/utils.py +40 -0
- viettelcloud/aiplatform/hub/OWNERS +14 -0
- viettelcloud/aiplatform/hub/__init__.py +25 -0
- viettelcloud/aiplatform/hub/api/__init__.py +13 -0
- viettelcloud/aiplatform/hub/api/_proxy_client.py +355 -0
- viettelcloud/aiplatform/hub/api/model_registry_client.py +561 -0
- viettelcloud/aiplatform/hub/api/model_registry_client_test.py +462 -0
- viettelcloud/aiplatform/optimizer/__init__.py +45 -0
- viettelcloud/aiplatform/optimizer/api/__init__.py +0 -0
- viettelcloud/aiplatform/optimizer/api/optimizer_client.py +248 -0
- viettelcloud/aiplatform/optimizer/backends/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/backends/base.py +77 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/backend.py +563 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/utils.py +112 -0
- viettelcloud/aiplatform/optimizer/constants/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/constants/constants.py +59 -0
- viettelcloud/aiplatform/optimizer/types/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/types/algorithm_types.py +87 -0
- viettelcloud/aiplatform/optimizer/types/optimization_types.py +135 -0
- viettelcloud/aiplatform/optimizer/types/search_types.py +95 -0
- viettelcloud/aiplatform/py.typed +0 -0
- viettelcloud/aiplatform/trainer/__init__.py +82 -0
- viettelcloud/aiplatform/trainer/api/__init__.py +3 -0
- viettelcloud/aiplatform/trainer/api/trainer_client.py +277 -0
- viettelcloud/aiplatform/trainer/api/trainer_client_test.py +72 -0
- viettelcloud/aiplatform/trainer/backends/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/base.py +94 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/base.py +195 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/docker.py +231 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/podman.py +258 -0
- viettelcloud/aiplatform/trainer/backends/container/backend.py +668 -0
- viettelcloud/aiplatform/trainer/backends/container/backend_test.py +867 -0
- viettelcloud/aiplatform/trainer/backends/container/runtime_loader.py +631 -0
- viettelcloud/aiplatform/trainer/backends/container/runtime_loader_test.py +637 -0
- viettelcloud/aiplatform/trainer/backends/container/types.py +67 -0
- viettelcloud/aiplatform/trainer/backends/container/utils.py +213 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/backend.py +710 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/backend_test.py +1344 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/constants.py +15 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/utils.py +636 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/utils_test.py +582 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/backend.py +306 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/backend_test.py +501 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/constants.py +90 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/job.py +184 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/types.py +52 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/utils.py +302 -0
- viettelcloud/aiplatform/trainer/constants/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/constants/constants.py +179 -0
- viettelcloud/aiplatform/trainer/options/__init__.py +52 -0
- viettelcloud/aiplatform/trainer/options/common.py +55 -0
- viettelcloud/aiplatform/trainer/options/kubernetes.py +502 -0
- viettelcloud/aiplatform/trainer/options/kubernetes_test.py +259 -0
- viettelcloud/aiplatform/trainer/options/localprocess.py +20 -0
- viettelcloud/aiplatform/trainer/test/common.py +22 -0
- viettelcloud/aiplatform/trainer/types/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/types/types.py +517 -0
- viettelcloud/aiplatform/trainer/types/types_test.py +115 -0
- viettelcloud_aiplatform-0.3.0.dist-info/METADATA +226 -0
- viettelcloud_aiplatform-0.3.0.dist-info/RECORD +71 -0
- viettelcloud_aiplatform-0.3.0.dist-info/WHEEL +4 -0
- viettelcloud_aiplatform-0.3.0.dist-info/licenses/LICENSE +201 -0
- viettelcloud_aiplatform-0.3.0.dist-info/licenses/NOTICE +36 -0
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
# Copyright 2024 The Kubeflow Authors.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from collections.abc import Callable, Iterator
|
|
16
|
+
import logging
|
|
17
|
+
from typing import Optional, Union
|
|
18
|
+
|
|
19
|
+
from viettelcloud.aiplatform.common.types import KubernetesBackendConfig
|
|
20
|
+
from viettelcloud.aiplatform.trainer.backends.container.backend import ContainerBackend
|
|
21
|
+
from viettelcloud.aiplatform.trainer.backends.container.types import ContainerBackendConfig
|
|
22
|
+
from viettelcloud.aiplatform.trainer.backends.kubernetes.backend import KubernetesBackend
|
|
23
|
+
from viettelcloud.aiplatform.trainer.backends.localprocess.backend import (
|
|
24
|
+
LocalProcessBackend,
|
|
25
|
+
LocalProcessBackendConfig,
|
|
26
|
+
)
|
|
27
|
+
from viettelcloud.aiplatform.trainer.constants import constants
|
|
28
|
+
from viettelcloud.aiplatform.trainer.types import types
|
|
29
|
+
|
|
30
|
+
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class TrainerClient:
    """Entry point for managing runtimes and TrainJobs through a pluggable backend.

    The constructor picks a backend from the type of ``backend_config`` and stores it
    on ``self.backend``. Every other method forwards its arguments to the method of
    the same name on that backend and returns the backend's result unchanged.
    """

    def __init__(
        self,
        backend_config: Optional[
            Union[
                KubernetesBackendConfig,
                LocalProcessBackendConfig,
                ContainerBackendConfig,
            ]
        ] = None,
    ):
        """Initialize a Kubeflow Trainer client.

        Args:
            backend_config: Backend configuration. Either KubernetesBackendConfig,
                LocalProcessBackendConfig, ContainerBackendConfig, or None.
                When None, a KubernetesBackendConfig built with no arguments is used.

        Raises:
            ValueError: Invalid backend configuration.
        """
        # Only an omitted (None) config selects the default. An identity check is used
        # instead of truthiness so that an unsupported falsy value (e.g. "" or {}) is
        # reported as invalid below rather than silently replaced by the default.
        if backend_config is None:
            backend_config = KubernetesBackendConfig()

        # The config type decides which backend implementation serves this client.
        if isinstance(backend_config, KubernetesBackendConfig):
            self.backend = KubernetesBackend(backend_config)
        elif isinstance(backend_config, LocalProcessBackendConfig):
            self.backend = LocalProcessBackend(backend_config)
        elif isinstance(backend_config, ContainerBackendConfig):
            self.backend = ContainerBackend(backend_config)
        else:
            raise ValueError(f"Invalid backend config '{backend_config}'")

    def list_runtimes(self) -> list[types.Runtime]:
        """List the available runtimes.

        Returns:
            A list of available training runtimes. If no runtimes exist, an empty list
            is returned.

        Raises:
            TimeoutError: Timeout to list runtimes.
            RuntimeError: Failed to list runtimes.
        """
        return self.backend.list_runtimes()

    def get_runtime(self, name: str) -> types.Runtime:
        """Get the runtime object.

        Args:
            name: Name of the runtime.

        Returns:
            A runtime object.
        """
        return self.backend.get_runtime(name=name)

    def get_runtime_packages(self, runtime: types.Runtime):
        """Print the installed Python packages for the given runtime.

        If a runtime has GPUs it also prints available GPUs on the single training node.
        Whatever the backend returns is passed back to the caller.

        Args:
            runtime: Reference to one of the existing runtimes.

        Raises:
            ValueError: Input arguments are invalid.
            RuntimeError: Failed to get Runtime.
        """
        return self.backend.get_runtime_packages(runtime=runtime)

    def train(
        self,
        runtime: Optional[Union[str, types.Runtime]] = None,
        initializer: Optional[types.Initializer] = None,
        trainer: Optional[
            Union[types.CustomTrainer, types.CustomTrainerContainer, types.BuiltinTrainer]
        ] = None,
        options: Optional[list] = None,
    ) -> str:
        """Create a TrainJob.

        You can configure the TrainJob using one of these trainers:

        - CustomTrainer: Runs training with a user-defined function that fully
          encapsulates the training process.
        - CustomTrainerContainer: Runs training with a user-defined image that fully
          encapsulates the training process.
        - BuiltinTrainer: Uses a predefined trainer with built-in post-training logic,
          requiring only parameter configuration.

        Args:
            runtime: Optional reference to one of the existing runtimes. It can accept
                the runtime name or Runtime object from the `get_runtime()` API.
                Defaults to the torch-distributed runtime if not provided.
            initializer: Optional configuration for the dataset and model initializers.
            trainer: Optional configuration for a CustomTrainer, CustomTrainerContainer,
                or BuiltinTrainer. If not specified, the TrainJob will use the
                runtime's default values.
            options: Optional list of configuration options to apply to the TrainJob.
                Options can be imported from viettelcloud.aiplatform.trainer.options.

        Returns:
            The unique name of the TrainJob that has been generated.

        Raises:
            ValueError: Input arguments are invalid.
            TimeoutError: Timeout to create TrainJobs.
            RuntimeError: Failed to create TrainJobs.
        """
        return self.backend.train(
            runtime=runtime,
            initializer=initializer,
            trainer=trainer,
            options=options,
        )

    def list_jobs(self, runtime: Optional[types.Runtime] = None) -> list[types.TrainJob]:
        """List the created TrainJobs.

        If a runtime is specified, only TrainJobs associated with that runtime are
        returned.

        Args:
            runtime: Reference to one of the existing runtimes.

        Returns:
            List of created TrainJobs. If no TrainJob exists, an empty list is returned.

        Raises:
            TimeoutError: Timeout to list TrainJobs.
            RuntimeError: Failed to list TrainJobs.
        """
        return self.backend.list_jobs(runtime=runtime)

    def get_job(self, name: str) -> types.TrainJob:
        """Get the TrainJob object.

        Args:
            name: Name of the TrainJob.

        Returns:
            A TrainJob object.

        Raises:
            TimeoutError: Timeout to get a TrainJob.
            RuntimeError: Failed to get a TrainJob.
        """
        return self.backend.get_job(name=name)

    def get_job_logs(
        self,
        name: str,
        step: str = constants.NODE + "-0",
        follow: Optional[bool] = False,
    ) -> Iterator[str]:
        """Get logs from a specific step of a TrainJob.

        You can watch for the logs in realtime as follows:
        ```python
        from viettelcloud.aiplatform.trainer import TrainerClient

        for logline in TrainerClient().get_job_logs(name="s8d44aa4fb6d", follow=True):
            print(logline)
        ```

        Args:
            name: Name of the TrainJob.
            step: Step of the TrainJob to collect logs from, like dataset-initializer
                or node-0.
            follow: Whether to stream logs in realtime as they are produced.

        Returns:
            Iterator of log lines.

        Raises:
            TimeoutError: Timeout to get a TrainJob.
            RuntimeError: Failed to get a TrainJob.
        """
        # Forwarded by keyword, so the backend's own parameter order does not matter.
        return self.backend.get_job_logs(name=name, follow=follow, step=step)

    def get_job_events(self, name: str) -> list[types.Event]:
        """Get events for a TrainJob.

        This provides additional clarity about the state of the TrainJob
        when logs alone are not sufficient. Events include information about
        pod state changes, errors, and other significant occurrences.

        Args:
            name: Name of the TrainJob.

        Returns:
            A list of Event objects associated with the TrainJob.

        Raises:
            TimeoutError: Timeout to get a TrainJob events.
            RuntimeError: Failed to get a TrainJob events.
        """
        return self.backend.get_job_events(name=name)

    def wait_for_job_status(
        self,
        name: str,
        status: set[str] = {constants.TRAINJOB_COMPLETE},
        timeout: int = 600,
        polling_interval: int = 2,
        callbacks: Optional[list[Callable[[types.TrainJob], None]]] = None,
    ) -> types.TrainJob:
        """Wait for a TrainJob to reach a desired status.

        Args:
            name: Name of the TrainJob.
            status: Expected statuses. Must be a subset of Created, Running, Complete,
                and Failed statuses.
            timeout: Maximum number of seconds to wait for the TrainJob to reach one of
                the expected statuses.
            polling_interval: The polling interval in seconds to check TrainJob status.
            callbacks: Optional list of callback functions to be invoked after each
                polling interval. Each callback should accept a single argument: the
                TrainJob object.

        Returns:
            A TrainJob object that reaches the desired status.

        Raises:
            ValueError: The input values are incorrect.
            RuntimeError: Failed to get TrainJob or TrainJob reaches unexpected Failed
                status.
            TimeoutError: Timeout to wait for TrainJob status.
        """
        # NOTE(review): the default ``status`` set is created once at definition time
        # and the same object is forwarded on every defaulted call; backends should
        # treat it as read-only.
        return self.backend.wait_for_job_status(
            name=name,
            status=status,
            timeout=timeout,
            polling_interval=polling_interval,
            callbacks=callbacks,
        )

    def delete_job(self, name: str):
        """Delete the TrainJob.

        Args:
            name: Name of the TrainJob.

        Raises:
            TimeoutError: Timeout to delete TrainJob.
            RuntimeError: Failed to delete TrainJob.
        """
        return self.backend.delete_job(name=name)
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# Copyright 2025 The Kubeflow Authors.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
Unit tests for TrainerClient backend selection.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from unittest.mock import Mock, patch
|
|
20
|
+
|
|
21
|
+
import pytest
|
|
22
|
+
|
|
23
|
+
from viettelcloud.aiplatform.common.types import KubernetesBackendConfig
|
|
24
|
+
from viettelcloud.aiplatform.trainer.api.trainer_client import TrainerClient
|
|
25
|
+
from viettelcloud.aiplatform.trainer.backends.localprocess.types import LocalProcessBackendConfig
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@pytest.mark.parametrize(
    "test_case",
    [
        {
            "name": "default_backend_is_kubernetes",
            "backend_config": None,
            "expected_backend": "KubernetesBackend",
            "use_k8s_mocks": True,
        },
        {
            "name": "local_process_backend_selection",
            "backend_config": LocalProcessBackendConfig(),
            "expected_backend": "LocalProcessBackend",
            "use_k8s_mocks": False,
        },
        {
            "name": "kubernetes_backend_selection",
            "backend_config": KubernetesBackendConfig(),
            "expected_backend": "KubernetesBackend",
            "use_k8s_mocks": True,
        },
    ],
)
def test_backend_selection(test_case):
    """Check that TrainerClient builds the backend class matching the given config."""
    backend_config = test_case["backend_config"]
    expected_backend = test_case["expected_backend"]

    # Cases that need no Kubernetes patching build the client directly.
    if not test_case["use_k8s_mocks"]:
        client = TrainerClient(backend_config=backend_config)
        backend_name = client.backend.__class__.__name__
        assert backend_name == expected_backend
        return

    # Patch kube-config loading and the two Kubernetes API client classes for the
    # duration of the client construction.
    with (
        patch("kubernetes.config.load_kube_config"),
        patch("kubernetes.client.CustomObjectsApi") as custom_objects_api,
        patch("kubernetes.client.CoreV1Api") as core_v1_api,
    ):
        custom_objects_api.return_value = Mock()
        core_v1_api.return_value = Mock()

        # With no config, call the constructor without arguments to exercise the
        # default-backend path.
        client = (
            TrainerClient(backend_config=backend_config) if backend_config else TrainerClient()
        )

        backend_name = client.backend.__class__.__name__
        assert backend_name == expected_backend
|
|
File without changes
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Copyright 2025 The Kubeflow Authors.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import abc
|
|
16
|
+
from collections.abc import Callable, Iterator
|
|
17
|
+
from typing import Optional, Union
|
|
18
|
+
|
|
19
|
+
from viettelcloud.aiplatform.trainer.constants import constants
|
|
20
|
+
from viettelcloud.aiplatform.trainer.types import types
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class RuntimeBackend(abc.ABC):
    """Abstract interface that every trainer runtime backend implements.

    Options self-validate by checking the backend instance type in their __call__ method.

    All methods except ``get_job_events`` are abstract; their base implementations
    only raise ``NotImplementedError``.
    """

    @abc.abstractmethod
    def list_runtimes(self) -> list[types.Runtime]:
        """Return the runtimes offered by this backend."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_runtime(self, name: str) -> types.Runtime:
        """Return the runtime identified by ``name``.

        Args:
            name: Name of the runtime.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def get_runtime_packages(self, runtime: types.Runtime):
        """Report the packages of the given runtime.

        Args:
            runtime: Runtime to inspect.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def train(
        self,
        runtime: Optional[Union[str, types.Runtime]] = None,
        initializer: Optional[types.Initializer] = None,
        trainer: Optional[
            Union[types.CustomTrainer, types.CustomTrainerContainer, types.BuiltinTrainer]
        ] = None,
        options: Optional[list] = None,
    ) -> str:
        """Create a TrainJob and return a string identifying it.

        Args:
            runtime: Runtime name or Runtime object; may be omitted.
            initializer: Optional initializer configuration.
            trainer: Optional CustomTrainer, CustomTrainerContainer, or BuiltinTrainer.
            options: Optional list of options to apply.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def list_jobs(self, runtime: Optional[types.Runtime] = None) -> list[types.TrainJob]:
        """Return the TrainJobs known to this backend.

        Args:
            runtime: Optional runtime reference passed by the caller.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def get_job(self, name: str) -> types.TrainJob:
        """Return the TrainJob identified by ``name``.

        Args:
            name: Name of the TrainJob.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def get_job_logs(
        self,
        name: str,
        follow: bool = False,
        step: str = constants.NODE + "-0",
    ) -> Iterator[str]:
        """Return an iterator over log lines of a TrainJob step.

        Args:
            name: Name of the TrainJob.
            follow: Flag requesting that logs be followed.
            step: Step to read logs from; defaults to ``constants.NODE + "-0"``.
        """
        raise NotImplementedError

    def get_job_events(self, name: str) -> list[types.Event]:
        """Return the events of a TrainJob.

        This method is not abstract: unless a subclass overrides it, the result is
        always an empty list.

        Args:
            name: Name of the TrainJob.

        Returns:
            A list of Event objects associated with the TrainJob.
        """
        return []

    @abc.abstractmethod
    def wait_for_job_status(
        self,
        name: str,
        status: set[str] = {constants.TRAINJOB_COMPLETE},
        timeout: int = 600,
        polling_interval: int = 2,
        callbacks: Optional[list[Callable[[types.TrainJob], None]]] = None,
    ) -> types.TrainJob:
        """Wait for a TrainJob to reach one of the given statuses and return it.

        Args:
            name: Name of the TrainJob.
            status: Set of expected statuses; defaults to
                ``{constants.TRAINJOB_COMPLETE}``.
            timeout: Timeout value, 600 by default.
            polling_interval: Polling interval value, 2 by default.
            callbacks: Optional callables that each take a TrainJob.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def delete_job(self, name: str):
        """Delete the TrainJob identified by ``name``.

        Args:
            name: Name of the TrainJob.
        """
        raise NotImplementedError
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# Copyright 2025 The Kubeflow Authors.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
Container client adapters for Docker and Podman.
|
|
17
|
+
|
|
18
|
+
This module implements the adapter pattern to abstract away differences between
|
|
19
|
+
Docker and Podman APIs, allowing the backend to work with either runtime through
|
|
20
|
+
a common interface.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import abc
|
|
26
|
+
from collections.abc import Iterator
|
|
27
|
+
from typing import Optional
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class BaseContainerClientAdapter(abc.ABC):
    """
    Common abstract interface over container runtime clients.

    Concrete adapters hide the API differences between Docker and Podman so the
    backend can drive either runtime through the same set of methods. Every method
    here is abstract and its base implementation only raises NotImplementedError.
    """

    @abc.abstractmethod
    def ping(self):
        """Check that the container runtime can be reached."""
        raise NotImplementedError

    @abc.abstractmethod
    def create_network(
        self,
        name: str,
        labels: dict[str, str],
    ) -> str:
        """
        Create a network for containers.

        Args:
            name: Name to give the network
            labels: Labels to attach to the network

        Returns:
            The network's ID or name
        """
        raise NotImplementedError

    @abc.abstractmethod
    def delete_network(self, network_id: str):
        """Remove the given network."""
        raise NotImplementedError

    @abc.abstractmethod
    def create_and_start_container(
        self,
        image: str,
        command: list[str],
        name: str,
        network_id: str,
        environment: dict[str, str],
        labels: dict[str, str],
        volumes: dict[str, dict[str, str]],
        working_dir: str,
    ) -> str:
        """
        Create a container and start it.

        Args:
            image: Image to run
            command: Command executed in the container
            name: Name of the container
            network_id: Network the container is attached to
            environment: Environment variables for the container
            labels: Labels placed on the container
            volumes: Volume mounts
            working_dir: Working directory inside the container

        Returns:
            ID of the container
        """
        raise NotImplementedError

    @abc.abstractmethod
    def get_container(self, container_id: str):
        """Look up a container object by its ID."""
        raise NotImplementedError

    @abc.abstractmethod
    def container_logs(self, container_id: str, follow: bool) -> Iterator[str]:
        """Yield log output of a container as an iterator of strings."""
        raise NotImplementedError

    @abc.abstractmethod
    def stop_container(self, container_id: str, timeout: int = 10):
        """Stop the given container; ``timeout`` defaults to 10."""
        raise NotImplementedError

    @abc.abstractmethod
    def remove_container(self, container_id: str, force: bool = True):
        """Remove the given container; ``force`` defaults to True."""
        raise NotImplementedError

    @abc.abstractmethod
    def pull_image(self, image: str):
        """Pull the given image."""
        raise NotImplementedError

    @abc.abstractmethod
    def image_exists(self, image: str) -> bool:
        """Report whether the image is available locally."""
        raise NotImplementedError

    @abc.abstractmethod
    def run_oneoff_container(self, image: str, command: list[str]) -> str:
        """
        Run a short-lived container and hand back what it printed.

        Args:
            image: Image to run
            command: Command executed in the container

        Returns:
            The container's output as a string
        """
        raise NotImplementedError

    @abc.abstractmethod
    def container_status(self, container_id: str) -> tuple[str, Optional[int]]:
        """
        Report the status of a container.

        Returns:
            A (status_string, exit_code) tuple.
            Status strings: "running", "created", "exited", etc.
            The exit code is None while the container has not exited.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def get_container_ip(self, container_id: str, network_id: str) -> Optional[str]:
        """
        Look up a container's IP address on one network.

        Args:
            container_id: ID of the container
            network_id: Name or ID of the network

        Returns:
            The IP address as a string, or None if not found
        """
        raise NotImplementedError

    @abc.abstractmethod
    def list_containers(self, filters: Optional[dict[str, list[str]]] = None) -> list[dict]:
        """
        List containers, optionally narrowed by label filters.

        Args:
            filters: Dictionary of filters (e.g., {"label": ["key=value"]})

        Returns:
            A list of container info dictionaries with keys:
            - id: Container ID
            - name: Container name
            - labels: Dictionary of labels
            - status: Container status
            - created: Creation timestamp
        """
        raise NotImplementedError

    @abc.abstractmethod
    def get_network(self, network_id: str) -> Optional[dict]:
        """
        Fetch information about a network by ID or name.

        Args:
            network_id: ID or name of the network

        Returns:
            A dictionary of network info including labels, or None if not found
        """
        raise NotImplementedError
|