viettelcloud-aiplatform 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- viettelcloud/__init__.py +1 -0
- viettelcloud/aiplatform/__init__.py +15 -0
- viettelcloud/aiplatform/common/__init__.py +0 -0
- viettelcloud/aiplatform/common/constants.py +22 -0
- viettelcloud/aiplatform/common/types.py +28 -0
- viettelcloud/aiplatform/common/utils.py +40 -0
- viettelcloud/aiplatform/hub/OWNERS +14 -0
- viettelcloud/aiplatform/hub/__init__.py +25 -0
- viettelcloud/aiplatform/hub/api/__init__.py +13 -0
- viettelcloud/aiplatform/hub/api/_proxy_client.py +355 -0
- viettelcloud/aiplatform/hub/api/model_registry_client.py +561 -0
- viettelcloud/aiplatform/hub/api/model_registry_client_test.py +462 -0
- viettelcloud/aiplatform/optimizer/__init__.py +45 -0
- viettelcloud/aiplatform/optimizer/api/__init__.py +0 -0
- viettelcloud/aiplatform/optimizer/api/optimizer_client.py +248 -0
- viettelcloud/aiplatform/optimizer/backends/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/backends/base.py +77 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/backend.py +563 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/utils.py +112 -0
- viettelcloud/aiplatform/optimizer/constants/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/constants/constants.py +59 -0
- viettelcloud/aiplatform/optimizer/types/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/types/algorithm_types.py +87 -0
- viettelcloud/aiplatform/optimizer/types/optimization_types.py +135 -0
- viettelcloud/aiplatform/optimizer/types/search_types.py +95 -0
- viettelcloud/aiplatform/py.typed +0 -0
- viettelcloud/aiplatform/trainer/__init__.py +82 -0
- viettelcloud/aiplatform/trainer/api/__init__.py +3 -0
- viettelcloud/aiplatform/trainer/api/trainer_client.py +277 -0
- viettelcloud/aiplatform/trainer/api/trainer_client_test.py +72 -0
- viettelcloud/aiplatform/trainer/backends/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/base.py +94 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/base.py +195 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/docker.py +231 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/podman.py +258 -0
- viettelcloud/aiplatform/trainer/backends/container/backend.py +668 -0
- viettelcloud/aiplatform/trainer/backends/container/backend_test.py +867 -0
- viettelcloud/aiplatform/trainer/backends/container/runtime_loader.py +631 -0
- viettelcloud/aiplatform/trainer/backends/container/runtime_loader_test.py +637 -0
- viettelcloud/aiplatform/trainer/backends/container/types.py +67 -0
- viettelcloud/aiplatform/trainer/backends/container/utils.py +213 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/backend.py +710 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/backend_test.py +1344 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/constants.py +15 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/utils.py +636 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/utils_test.py +582 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/backend.py +306 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/backend_test.py +501 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/constants.py +90 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/job.py +184 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/types.py +52 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/utils.py +302 -0
- viettelcloud/aiplatform/trainer/constants/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/constants/constants.py +179 -0
- viettelcloud/aiplatform/trainer/options/__init__.py +52 -0
- viettelcloud/aiplatform/trainer/options/common.py +55 -0
- viettelcloud/aiplatform/trainer/options/kubernetes.py +502 -0
- viettelcloud/aiplatform/trainer/options/kubernetes_test.py +259 -0
- viettelcloud/aiplatform/trainer/options/localprocess.py +20 -0
- viettelcloud/aiplatform/trainer/test/common.py +22 -0
- viettelcloud/aiplatform/trainer/types/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/types/types.py +517 -0
- viettelcloud/aiplatform/trainer/types/types_test.py +115 -0
- viettelcloud_aiplatform-0.3.0.dist-info/METADATA +226 -0
- viettelcloud_aiplatform-0.3.0.dist-info/RECORD +71 -0
- viettelcloud_aiplatform-0.3.0.dist-info/WHEEL +4 -0
- viettelcloud_aiplatform-0.3.0.dist-info/licenses/LICENSE +201 -0
- viettelcloud_aiplatform-0.3.0.dist-info/licenses/NOTICE +36 -0
|
@@ -0,0 +1,867 @@
|
|
|
1
|
+
# Copyright 2025 The Kubeflow Authors.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
Unit tests for ContainerBackend.
|
|
17
|
+
|
|
18
|
+
Tests the ContainerBackend class with mocked container adapters.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from collections.abc import Iterator
|
|
22
|
+
from contextlib import nullcontext
|
|
23
|
+
import os
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
import shutil
|
|
26
|
+
import tempfile
|
|
27
|
+
from typing import Optional
|
|
28
|
+
from unittest.mock import Mock, patch
|
|
29
|
+
|
|
30
|
+
import pytest
|
|
31
|
+
|
|
32
|
+
from viettelcloud.aiplatform.trainer.backends.container.adapters.base import (
|
|
33
|
+
BaseContainerClientAdapter,
|
|
34
|
+
)
|
|
35
|
+
from viettelcloud.aiplatform.trainer.backends.container.backend import ContainerBackend
|
|
36
|
+
from viettelcloud.aiplatform.trainer.backends.container.types import ContainerBackendConfig
|
|
37
|
+
from viettelcloud.aiplatform.trainer.constants import constants
|
|
38
|
+
from viettelcloud.aiplatform.trainer.test.common import FAILED, SUCCESS, TestCase
|
|
39
|
+
from viettelcloud.aiplatform.trainer.types import types
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Mock Container Adapter
class MockContainerAdapter(BaseContainerClientAdapter):
    """Mock adapter for testing ContainerBackend without Docker/Podman.

    Every adapter call is recorded in plain lists (networks/containers
    created, stopped, removed, images pulled) so tests can assert on the
    exact sequence of operations the backend performed.
    """

    def __init__(self):
        self._runtime_type = "mock"
        # Call-recording state inspected directly by the tests.
        self.networks_created = []
        self.containers_created = []
        self.containers_stopped = []
        self.containers_removed = []
        self.networks_deleted = []
        self.images_pulled = []
        self.ping_called = False

    def ping(self):
        """Record the connectivity check; never raises (runtime is 'available')."""
        self.ping_called = True

    def create_network(self, name: str, labels: dict[str, str]) -> str:
        """Record a fake network and return its deterministic id."""
        network_id = f"net-{name}"
        self.networks_created.append({"id": network_id, "name": name, "labels": labels})
        return network_id

    def delete_network(self, network_id: str):
        """Record the network deletion."""
        self.networks_deleted.append(network_id)

    def create_and_start_container(
        self,
        image: str,
        command: list[str],
        name: str,
        network_id: str,
        environment: dict[str, str],
        labels: dict[str, str],
        volumes: dict[str, dict[str, str]],
        working_dir: str,
    ) -> str:
        """Record a new container in 'running' state and return its id.

        Ids are sequential ("container-0", "container-1", ...) so tests can
        address containers by creation order.
        """
        container_id = f"container-{len(self.containers_created)}"
        self.containers_created.append(
            {
                "id": container_id,
                "name": name,
                "image": image,
                "command": command,
                "network": network_id,
                "environment": environment,
                "labels": labels,
                "volumes": volumes,
                "working_dir": working_dir,
                "status": "running",
                "exit_code": None,
            }
        )
        return container_id

    def get_container(self, container_id: str):
        """Return a Mock mirroring the stored container's id/status, or None."""
        for container in self.containers_created:
            if container["id"] == container_id:
                return Mock(id=container_id, status=container["status"])
        return None

    def container_logs(self, container_id: str, follow: bool) -> Iterator[str]:
        """Yield canned log lines; follow mode yields two lines, one-shot yields one."""
        if follow:
            yield f"Log line 1 from {container_id}\n"
            yield f"Log line 2 from {container_id}\n"
        else:
            yield f"Complete log from {container_id}\n"

    def stop_container(self, container_id: str, timeout: int = 10):
        """Record the stop and mark the container exited with code 0."""
        self.containers_stopped.append(container_id)
        for container in self.containers_created:
            if container["id"] == container_id:
                container["status"] = "exited"
                container["exit_code"] = 0
                break  # ids are unique; no need to keep scanning

    def remove_container(self, container_id: str, force: bool = True):
        """Record the container removal."""
        self.containers_removed.append(container_id)

    def pull_image(self, image: str):
        """Record the image pull."""
        self.images_pulled.append(image)

    def image_exists(self, image: str) -> bool:
        """Treat images containing 'local' and previously pulled images as present."""
        return "local" in image or image in self.images_pulled

    def run_oneoff_container(self, image: str, command: list[str]) -> str:
        """Return canned tool-version output regardless of image/command."""
        return "Python 3.9.0\npip 21.0.1\nnvidia-smi not found\n"

    def container_status(self, container_id: str) -> tuple[str, Optional[int]]:
        """Return (status, exit_code) for a known container, else ('unknown', None)."""
        for container in self.containers_created:
            if container["id"] == container_id:
                return (container["status"], container.get("exit_code"))
        return ("unknown", None)

    def set_container_status(self, container_id: str, status: str, exit_code: Optional[int] = None):
        """Helper method to set container status for testing."""
        for container in self.containers_created:
            if container["id"] == container_id:
                container["status"] = status
                container["exit_code"] = exit_code
                break  # ids are unique

    def get_container_ip(self, container_id: str, network_id: str) -> Optional[str]:
        """Get container IP address on a specific network.

        Fixed: derive the address from the container's creation index so each
        container gets a distinct, deterministic IP. The previous version
        returned ``192.168.1.<total container count>`` for *every* container,
        so all containers in a multi-node job shared one address.
        """
        for index, container in enumerate(self.containers_created):
            if container["id"] == container_id:
                return f"192.168.1.{index + 1}"
        return None

    def list_containers(self, filters: Optional[dict[str, list[str]]] = None) -> list[dict]:
        """List containers with optional filters.

        Only the "label" filter key is supported; each entry is either
        "key=value" (exact match) or a bare key (presence check).
        """
        if not filters:
            return [
                {
                    "id": c["id"],
                    "name": c["name"],
                    "labels": c["labels"],
                    "status": c["status"],
                    "created": "2025-01-01T00:00:00Z",
                }
                for c in self.containers_created
            ]

        # Simple label filtering
        result = []
        for container in self.containers_created:
            if "label" in filters:
                match = True
                for label_filter in filters["label"]:
                    if "=" in label_filter:
                        key, value = label_filter.split("=", 1)
                        if container["labels"].get(key) != value:
                            match = False
                            break
                    else:
                        if label_filter not in container["labels"]:
                            match = False
                            break
                if match:
                    result.append(
                        {
                            "id": container["id"],
                            "name": container["name"],
                            "labels": container["labels"],
                            "status": container["status"],
                            "created": "2025-01-01T00:00:00Z",
                        }
                    )
        return result

    def get_network(self, network_id: str) -> Optional[dict]:
        """Get network information by id or name; None if unknown."""
        for network in self.networks_created:
            if network["id"] == network_id or network["name"] == network_id:
                return {
                    "id": network["id"],
                    "name": network["name"],
                    "labels": network["labels"],
                }
        return None
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# Fixtures
@pytest.fixture
def container_backend():
    """ContainerBackend wired to a MockContainerAdapter (no real Docker)."""
    target = "viettelcloud.aiplatform.trainer.backends.container.backend.DockerClientAdapter"
    with patch(target) as docker_adapter_cls:
        docker_adapter_cls.return_value = MockContainerAdapter()
        return ContainerBackend(ContainerBackendConfig())
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
@pytest.fixture
def temp_workdir():
    """Yield a scratch directory path; remove it after the test finishes."""
    workdir = tempfile.mkdtemp()
    yield workdir
    # Teardown: a test may already have deleted the directory itself.
    if os.path.exists(workdir):
        shutil.rmtree(workdir)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
# Helper Function
def simple_train_func():
    """Minimal training entry point passed as the CustomTrainer func in tests."""
    message = "Training"
    print(message)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# Tests
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(
            name="auto-detect docker first",
            expected_status=SUCCESS,
        ),
        TestCase(
            name="auto-detect falls back to podman",
            expected_status=SUCCESS,
        ),
        TestCase(
            name="both unavailable raises error",
            expected_status=FAILED,
            expected_error=RuntimeError,
        ),
    ],
)
def test_backend_initialization(test_case):
    """Test ContainerBackend initialization and adapter creation.

    Covers runtime auto-detection: Docker is tried first, Podman is the
    fallback, and a RuntimeError is expected when neither runtime responds
    to ping. Both adapter classes are patched so no real daemon is needed.
    """
    print("Executing test:", test_case.name)
    try:
        if test_case.name == "auto-detect docker first":
            with (
                patch(
                    "viettelcloud.aiplatform.trainer.backends.container.backend.DockerClientAdapter"
                ) as mock_docker,
                patch(
                    "viettelcloud.aiplatform.trainer.backends.container.backend.PodmanClientAdapter"
                ) as mock_podman,
            ):
                # Docker ping succeeds, so Podman must never be consulted.
                mock_docker_instance = Mock()
                mock_docker.return_value = mock_docker_instance

                _ = ContainerBackend(ContainerBackendConfig())

                # Docker should be called (could be with Colima socket or None)
                assert mock_docker.call_count == 1
                mock_docker_instance.ping.assert_called_once()
                mock_podman.assert_not_called()
                assert test_case.expected_status == SUCCESS

        elif test_case.name == "auto-detect falls back to podman":
            with (
                patch(
                    "viettelcloud.aiplatform.trainer.backends.container.backend.DockerClientAdapter"
                ) as mock_docker,
                patch(
                    "viettelcloud.aiplatform.trainer.backends.container.backend.PodmanClientAdapter"
                ) as mock_podman,
            ):
                # Docker ping raises, which should trigger the Podman fallback.
                mock_docker_instance = Mock()
                mock_docker_instance.ping.side_effect = Exception("Docker not available")
                mock_docker.return_value = mock_docker_instance

                mock_podman_instance = Mock()
                mock_podman.return_value = mock_podman_instance

                _ = ContainerBackend(ContainerBackendConfig())

                # Docker may be tried multiple times (different socket locations)
                assert mock_docker.call_count >= 1
                mock_podman.assert_called_once_with(None)
                mock_podman_instance.ping.assert_called_once()
                assert test_case.expected_status == SUCCESS

        elif test_case.name == "both unavailable raises error":
            with (
                patch(
                    "viettelcloud.aiplatform.trainer.backends.container.backend.DockerClientAdapter"
                ) as mock_docker,
                patch(
                    "viettelcloud.aiplatform.trainer.backends.container.backend.PodmanClientAdapter"
                ) as mock_podman,
            ):
                # Both pings raise; the constructor is expected to raise
                # RuntimeError, which is validated in the except block below.
                mock_docker_instance = Mock()
                mock_docker_instance.ping.side_effect = Exception("Docker not available")
                mock_docker.return_value = mock_docker_instance

                mock_podman_instance = Mock()
                mock_podman_instance.ping.side_effect = Exception("Podman not available")
                mock_podman.return_value = mock_podman_instance

                ContainerBackend(ContainerBackendConfig())

    except Exception as e:
        # NOTE(review): this also catches AssertionError from the SUCCESS
        # branches, which can make genuine assertion failures harder to read.
        assert type(e) is test_case.expected_error
    print("test execution complete")
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def test_list_runtimes(container_backend):
    """Listing local runtimes returns a non-empty list containing the default."""
    print("Executing test: list_runtimes")
    available = container_backend.list_runtimes()

    assert isinstance(available, list)
    assert len(available) > 0
    names = {runtime.name for runtime in available}
    assert constants.DEFAULT_TRAINING_RUNTIME in names
    print("test execution complete")
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(
            name="get valid runtime",
            expected_status=SUCCESS,
            config={"name": constants.DEFAULT_TRAINING_RUNTIME},
        ),
        TestCase(
            name="get invalid runtime",
            expected_status=FAILED,
            config={"name": "nonexistent-runtime"},
            expected_error=ValueError,
        ),
    ],
)
def test_get_runtime(container_backend, test_case):
    """Fetching a runtime by name works for known names and raises otherwise."""
    print("Executing test:", test_case.name)
    requested_name = test_case.config["name"]
    try:
        fetched = container_backend.get_runtime(**test_case.config)

        assert test_case.expected_status == SUCCESS
        assert isinstance(fetched, types.Runtime)
        assert fetched.name == requested_name
    except Exception as e:
        assert type(e) is test_case.expected_error
    print("test execution complete")
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def test_get_runtime_packages(container_backend):
    """Querying runtime packages pulls the image unless it is already present."""
    print("Executing test: get_runtime_packages")
    runtime = container_backend.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
    container_backend.get_runtime_packages(runtime)

    # Either the adapter pulled the trainer image, or it already existed.
    adapter = container_backend._adapter
    assert len(adapter.images_pulled) > 0 or adapter.image_exists(runtime.trainer.image)
    print("test execution complete")
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(
            name="train single node",
            expected_status=SUCCESS,
            config={"num_nodes": 1, "expected_containers": 1},
        ),
        TestCase(
            name="train multi-node",
            expected_status=SUCCESS,
            config={"num_nodes": 3, "expected_containers": 3},
        ),
        TestCase(
            name="train with custom env",
            expected_status=SUCCESS,
            config={
                "num_nodes": 1,
                "env": {"MY_VAR": "my_value", "DEBUG": "true"},
                "expected_containers": 1,
            },
        ),
        TestCase(
            name="train with packages",
            expected_status=SUCCESS,
            config={
                "num_nodes": 1,
                "packages": ["numpy", "pandas"],
                "expected_containers": 1,
            },
        ),
        TestCase(
            name="train with single GPU",
            expected_status=SUCCESS,
            config={
                "num_nodes": 1,
                "resources_per_node": {"gpu": "1"},
                "expected_containers": 1,
                "expected_nproc_per_node": 1,
            },
        ),
        TestCase(
            name="train with multiple GPUs",
            expected_status=SUCCESS,
            config={
                "num_nodes": 1,
                "resources_per_node": {"gpu": "4"},
                "expected_containers": 1,
                "expected_nproc_per_node": 4,
            },
        ),
        TestCase(
            name="train multi-node with GPUs",
            expected_status=SUCCESS,
            config={
                "num_nodes": 2,
                "resources_per_node": {"gpu": "2"},
                "expected_containers": 2,
                "expected_nproc_per_node": 2,
            },
        ),
        TestCase(
            name="train with CPU resources (nproc=1)",
            expected_status=SUCCESS,
            config={
                "num_nodes": 1,
                "resources_per_node": {"cpu": "16"},
                "expected_containers": 1,
                "expected_nproc_per_node": 1,
            },
        ),
    ],
)
def test_train(container_backend, test_case):
    """Test training job creation.

    Verifies one container per node plus one network is created, and that
    env vars, pip packages, and the torchrun --nproc_per_node value derived
    from per-node resources all end up in the container spec.
    """
    print("Executing test:", test_case.name)
    try:
        trainer = types.CustomTrainer(
            func=simple_train_func,
            num_nodes=test_case.config.get("num_nodes", 1),
            env=test_case.config.get("env"),
            packages_to_install=test_case.config.get("packages"),
            resources_per_node=test_case.config.get("resources_per_node"),
        )
        runtime = container_backend.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)

        job_name = container_backend.train(runtime=runtime, trainer=trainer)

        assert test_case.expected_status == SUCCESS
        assert job_name is not None
        # Job names generated by the backend are 12-character identifiers.
        assert len(job_name) == 12
        assert (
            len(container_backend._adapter.containers_created)
            == test_case.config["expected_containers"]
        )
        # All nodes of one job share a single network.
        assert len(container_backend._adapter.networks_created) == 1

        # Check environment if specified
        if "env" in test_case.config:
            container = container_backend._adapter.containers_created[0]
            for key, value in test_case.config["env"].items():
                assert container["environment"][key] == value

        # Check packages if specified
        if "packages" in test_case.config:
            container = container_backend._adapter.containers_created[0]
            command_str = " ".join(container["command"])
            assert "pip install" in command_str
            for package in test_case.config["packages"]:
                assert package in command_str

        # Check nproc_per_node if specified
        if "expected_nproc_per_node" in test_case.config:
            container = container_backend._adapter.containers_created[0]
            # The command is assumed to be ["bash", "-c", "<script>"]; index 2
            # is the script text containing the torchrun invocation.
            command_str = container["command"][2]  # Get bash script content
            expected_nproc = test_case.config["expected_nproc_per_node"]
            assert f"--nproc_per_node={expected_nproc}" in command_str, (
                f"Expected --nproc_per_node={expected_nproc} in command, but got: {command_str}"
            )

    except Exception as e:
        assert type(e) is test_case.expected_error
    print("test execution complete")
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(
            name="list all jobs",
            expected_status=SUCCESS,
            config={"num_jobs": 2},
        ),
        TestCase(
            name="list empty jobs",
            expected_status=SUCCESS,
            config={"num_jobs": 0},
        ),
    ],
)
def test_list_jobs(container_backend, test_case):
    """list_jobs returns exactly the jobs previously submitted via train()."""
    print("Executing test:", test_case.name)
    try:
        runtime = container_backend.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
        expected_count = test_case.config["num_jobs"]
        submitted = [
            container_backend.train(
                runtime=runtime,
                trainer=types.CustomTrainer(func=simple_train_func, num_nodes=1),
            )
            for _ in range(expected_count)
        ]

        jobs = container_backend.list_jobs()

        assert test_case.expected_status == SUCCESS
        assert len(jobs) == expected_count
        if expected_count > 0:
            listed_names = {job.name for job in jobs}
            for name in submitted:
                assert name in listed_names

    except Exception as e:
        assert type(e) is test_case.expected_error
    print("test execution complete")
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(
            name="get existing job",
            expected_status=SUCCESS,
            config={"num_nodes": 2},
        ),
        TestCase(
            name="get nonexistent job",
            expected_status=FAILED,
            config={"job_name": "nonexistent-job"},
            expected_error=ValueError,
        ),
    ],
)
def test_get_job(container_backend, test_case):
    """get_job returns a matching job for known names, raises for unknown ones."""
    print("Executing test:", test_case.name)
    try:
        if test_case.name == "get existing job":
            node_count = test_case.config["num_nodes"]
            runtime = container_backend.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
            trainer = types.CustomTrainer(func=simple_train_func, num_nodes=node_count)
            submitted_name = container_backend.train(runtime=runtime, trainer=trainer)

            fetched = container_backend.get_job(submitted_name)

            assert test_case.expected_status == SUCCESS
            assert fetched.name == submitted_name
            assert fetched.num_nodes == node_count
            assert len(fetched.steps) == node_count

        elif test_case.name == "get nonexistent job":
            container_backend.get_job(test_case.config["job_name"])

    except Exception as e:
        assert type(e) is test_case.expected_error
    print("test execution complete")
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(
            name="get logs no follow",
            expected_status=SUCCESS,
            config={"follow": False},
        ),
        TestCase(
            name="get logs with follow",
            expected_status=SUCCESS,
            config={"follow": True},
        ),
    ],
)
def test_get_job_logs(container_backend, test_case):
    """Job logs stream in follow mode and come back complete otherwise."""
    print("Executing test:", test_case.name)
    try:
        follow = test_case.config["follow"]
        trainer = types.CustomTrainer(func=simple_train_func, num_nodes=1)
        runtime = container_backend.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
        job_name = container_backend.train(runtime=runtime, trainer=trainer)

        collected = list(container_backend.get_job_logs(job_name, follow=follow))

        assert test_case.expected_status == SUCCESS
        assert len(collected) > 0
        # The mock adapter emits distinct markers per mode.
        marker = "Log line" if follow else "Complete log"
        assert any(marker in line for line in collected)

    except Exception as e:
        assert type(e) is test_case.expected_error
    print("test execution complete")
|
|
617
|
+
|
|
618
|
+
|
|
619
|
+
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(
            name="wait for complete",
            expected_status=SUCCESS,
            config={"wait_status": constants.TRAINJOB_COMPLETE, "container_exit_code": 0},
        ),
        TestCase(
            name="wait timeout",
            expected_status=FAILED,
            config={"wait_status": constants.TRAINJOB_COMPLETE, "timeout": 2},
            expected_error=TimeoutError,
        ),
        TestCase(
            name="job fails",
            expected_status=FAILED,
            config={"wait_status": constants.TRAINJOB_COMPLETE, "container_exit_code": 1},
            expected_error=RuntimeError,
        ),
    ],
)
def test_wait_for_job_status(container_backend, test_case):
    """Test waiting for job status.

    Drives outcomes by mutating the mock container's status/exit code:
    exit 0 -> job completes, never exiting -> TimeoutError, exit 1 ->
    RuntimeError. Uses short real timeouts (<=5s) with 1s polling.
    """
    print("Executing test:", test_case.name)
    try:
        trainer = types.CustomTrainer(func=simple_train_func, num_nodes=1)
        runtime = container_backend.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
        job_name = container_backend.train(runtime=runtime, trainer=trainer)

        if test_case.name == "wait for complete":
            # Simulate a clean exit before waiting.
            container_id = container_backend._adapter.containers_created[0]["id"]
            container_backend._adapter.set_container_status(
                container_id, "exited", test_case.config["container_exit_code"]
            )

            completed_job = container_backend.wait_for_job_status(
                job_name, status={test_case.config["wait_status"]}, timeout=5, polling_interval=1
            )

            assert test_case.expected_status == SUCCESS
            assert completed_job.status == constants.TRAINJOB_COMPLETE

        elif test_case.name == "wait timeout":
            # Container stays "running", so the wait must hit its timeout.
            container_backend.wait_for_job_status(
                job_name,
                status={test_case.config["wait_status"]},
                timeout=test_case.config["timeout"],
                polling_interval=1,
            )

        elif test_case.name == "job fails":
            # Non-zero exit code should surface as RuntimeError.
            container_id = container_backend._adapter.containers_created[0]["id"]
            container_backend._adapter.set_container_status(
                container_id, "exited", test_case.config["container_exit_code"]
            )

            container_backend.wait_for_job_status(
                job_name, status={test_case.config["wait_status"]}, timeout=5, polling_interval=1
            )

    except Exception as e:
        assert type(e) is test_case.expected_error
    print("test execution complete")
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(
            name="delete with auto_remove true",
            expected_status=SUCCESS,
            config={"auto_remove": True, "num_nodes": 2},
        ),
        TestCase(
            name="delete with auto_remove false",
            expected_status=SUCCESS,
            config={"auto_remove": False, "num_nodes": 2},
        ),
    ],
)
def test_delete_job(container_backend, temp_workdir, test_case):
    """Test deleting a job.

    Checks that delete_job stops and removes every node container, deletes
    the job network, and removes the job workdir only when auto_remove is on.

    NOTE(review): the job workdir lives under the real home directory
    (~/.kubeflow/trainer/containers/<job>), not under the temp_workdir
    fixture — temp_workdir is requested but never used here; confirm whether
    the backend's workdir should be redirected into it.
    """
    print("Executing test:", test_case.name)
    try:
        container_backend.cfg.auto_remove = test_case.config["auto_remove"]

        trainer = types.CustomTrainer(
            func=simple_train_func, num_nodes=test_case.config["num_nodes"]
        )
        runtime = container_backend.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
        job_name = container_backend.train(runtime=runtime, trainer=trainer)

        # train() is expected to have created the per-job workdir on disk.
        job_workdir = Path.home() / ".kubeflow" / "trainer" / "containers" / job_name
        assert job_workdir.exists()

        container_backend.delete_job(job_name)

        assert test_case.expected_status == SUCCESS
        # One stop and one remove per node container, one network teardown.
        assert len(container_backend._adapter.containers_stopped) == test_case.config["num_nodes"]
        assert len(container_backend._adapter.containers_removed) == test_case.config["num_nodes"]
        assert len(container_backend._adapter.networks_deleted) == 1

        if test_case.config["auto_remove"]:
            assert not job_workdir.exists()
        else:
            assert job_workdir.exists()

    except Exception as e:
        assert type(e) is test_case.expected_error
    print("test execution complete")
|
|
730
|
+
|
|
731
|
+
|
|
732
|
+
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(
            name="running container",
            expected_status=SUCCESS,
            config={
                "container_status": "running",
                "exit_code": None,
                "expected_job_status": constants.TRAINJOB_RUNNING,
            },
        ),
        TestCase(
            name="exited success",
            expected_status=SUCCESS,
            config={
                "container_status": "exited",
                "exit_code": 0,
                "expected_job_status": constants.TRAINJOB_COMPLETE,
            },
        ),
        TestCase(
            name="exited failure",
            expected_status=SUCCESS,
            config={
                "container_status": "exited",
                "exit_code": 1,
                "expected_job_status": constants.TRAINJOB_FAILED,
            },
        ),
    ],
)
def test_container_status_mapping(container_backend, test_case):
    """Test container status mapping to TrainJob status.

    Drives the fake adapter's container status/exit code and checks that
    get_job() translates it to the expected TrainJob status constant.
    """
    print("Executing test:", test_case.name)
    try:
        trainer = types.CustomTrainer(func=simple_train_func, num_nodes=1)
        runtime = container_backend.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
        job_name = container_backend.train(runtime=runtime, trainer=trainer)

        # Force the single node's container into the desired state.
        container_id = container_backend._adapter.containers_created[0]["id"]
        container_backend._adapter.set_container_status(
            container_id, test_case.config["container_status"], test_case.config["exit_code"]
        )

        job = container_backend.get_job(job_name)

        assert test_case.expected_status == SUCCESS
        assert job.status == test_case.config["expected_job_status"]

    except Exception as e:
        # Re-raise unexpected failures (including failed asserts above) so
        # the original traceback is preserved instead of being masked by the
        # type comparison against expected_error.
        if test_case.expected_error is None:
            raise
        assert type(e) is test_case.expected_error
    print("test execution complete")
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(
            name="docker socket locations with colima",
            expected_status=SUCCESS,
            config={
                "runtime_name": "docker",
                "container_host": None,
                "create_colima_socket": True,
                "expected_contains_none": True,
                "expected_has_colima": True,
            },
        ),
        TestCase(
            name="custom host has priority",
            expected_status=SUCCESS,
            config={
                "runtime_name": "docker",
                "container_host": "unix:///custom/path/docker.sock",
                "create_colima_socket": False,
                "expected_first": "unix:///custom/path/docker.sock",
            },
        ),
    ],
)
def test_get_common_socket_locations(test_case, tmp_path):
    """Test common socket location detection."""
    print("Executing test:", test_case.name)
    cfg = test_case.config

    # Setup: optionally fake a Colima socket under a temporary home dir.
    wants_colima = cfg.get("create_colima_socket")
    if wants_colima:
        colima_socket = tmp_path / ".colima" / "default" / "docker.sock"
        colima_socket.parent.mkdir(parents=True)
        colima_socket.touch()

    backend_cfg = ContainerBackendConfig(container_host=cfg["container_host"])

    # Only patch Path.home when the Colima socket should be discovered;
    # otherwise run under a no-op context manager.
    if wants_colima:
        home_patch = patch("pathlib.Path.home", return_value=tmp_path)
    else:
        home_patch = nullcontext()

    # Exercise the helper directly, bypassing __init__ (no adapter needed).
    with home_patch:
        backend = ContainerBackend.__new__(ContainerBackend)
        backend.cfg = backend_cfg
        locations = backend._get_common_socket_locations(cfg["runtime_name"])

    # Assertions
    if "expected_contains_none" in cfg:
        assert None in locations

    if "expected_has_colima" in cfg:
        assert f"unix://{colima_socket}" in locations

    if "expected_first" in cfg:
        assert locations[0] == cfg["expected_first"]

    print("test execution complete")
def test_create_adapter_error_message_format():
|
|
852
|
+
"""Test that error message includes attempted connections."""
|
|
853
|
+
cfg = ContainerBackendConfig(container_runtime="docker")
|
|
854
|
+
|
|
855
|
+
docker_adapter = (
|
|
856
|
+
"viettelcloud.aiplatform.trainer.backends.container.adapters.docker.DockerClientAdapter"
|
|
857
|
+
)
|
|
858
|
+
with patch(docker_adapter) as mock_docker:
|
|
859
|
+
mock_docker.side_effect = Exception("Connection failed")
|
|
860
|
+
|
|
861
|
+
with pytest.raises(RuntimeError) as exc_info:
|
|
862
|
+
ContainerBackend(cfg)
|
|
863
|
+
|
|
864
|
+
# Error message should be helpful
|
|
865
|
+
error_msg = str(exc_info.value)
|
|
866
|
+
assert "Could not connect" in error_msg
|
|
867
|
+
assert "tried:" in error_msg
|