viettelcloud_aiplatform-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. viettelcloud/__init__.py +1 -0
  2. viettelcloud/aiplatform/__init__.py +15 -0
  3. viettelcloud/aiplatform/common/__init__.py +0 -0
  4. viettelcloud/aiplatform/common/constants.py +22 -0
  5. viettelcloud/aiplatform/common/types.py +28 -0
  6. viettelcloud/aiplatform/common/utils.py +40 -0
  7. viettelcloud/aiplatform/hub/OWNERS +14 -0
  8. viettelcloud/aiplatform/hub/__init__.py +25 -0
  9. viettelcloud/aiplatform/hub/api/__init__.py +13 -0
  10. viettelcloud/aiplatform/hub/api/_proxy_client.py +355 -0
  11. viettelcloud/aiplatform/hub/api/model_registry_client.py +561 -0
  12. viettelcloud/aiplatform/hub/api/model_registry_client_test.py +462 -0
  13. viettelcloud/aiplatform/optimizer/__init__.py +45 -0
  14. viettelcloud/aiplatform/optimizer/api/__init__.py +0 -0
  15. viettelcloud/aiplatform/optimizer/api/optimizer_client.py +248 -0
  16. viettelcloud/aiplatform/optimizer/backends/__init__.py +13 -0
  17. viettelcloud/aiplatform/optimizer/backends/base.py +77 -0
  18. viettelcloud/aiplatform/optimizer/backends/kubernetes/__init__.py +13 -0
  19. viettelcloud/aiplatform/optimizer/backends/kubernetes/backend.py +563 -0
  20. viettelcloud/aiplatform/optimizer/backends/kubernetes/utils.py +112 -0
  21. viettelcloud/aiplatform/optimizer/constants/__init__.py +13 -0
  22. viettelcloud/aiplatform/optimizer/constants/constants.py +59 -0
  23. viettelcloud/aiplatform/optimizer/types/__init__.py +13 -0
  24. viettelcloud/aiplatform/optimizer/types/algorithm_types.py +87 -0
  25. viettelcloud/aiplatform/optimizer/types/optimization_types.py +135 -0
  26. viettelcloud/aiplatform/optimizer/types/search_types.py +95 -0
  27. viettelcloud/aiplatform/py.typed +0 -0
  28. viettelcloud/aiplatform/trainer/__init__.py +82 -0
  29. viettelcloud/aiplatform/trainer/api/__init__.py +3 -0
  30. viettelcloud/aiplatform/trainer/api/trainer_client.py +277 -0
  31. viettelcloud/aiplatform/trainer/api/trainer_client_test.py +72 -0
  32. viettelcloud/aiplatform/trainer/backends/__init__.py +0 -0
  33. viettelcloud/aiplatform/trainer/backends/base.py +94 -0
  34. viettelcloud/aiplatform/trainer/backends/container/adapters/base.py +195 -0
  35. viettelcloud/aiplatform/trainer/backends/container/adapters/docker.py +231 -0
  36. viettelcloud/aiplatform/trainer/backends/container/adapters/podman.py +258 -0
  37. viettelcloud/aiplatform/trainer/backends/container/backend.py +668 -0
  38. viettelcloud/aiplatform/trainer/backends/container/backend_test.py +867 -0
  39. viettelcloud/aiplatform/trainer/backends/container/runtime_loader.py +631 -0
  40. viettelcloud/aiplatform/trainer/backends/container/runtime_loader_test.py +637 -0
  41. viettelcloud/aiplatform/trainer/backends/container/types.py +67 -0
  42. viettelcloud/aiplatform/trainer/backends/container/utils.py +213 -0
  43. viettelcloud/aiplatform/trainer/backends/kubernetes/__init__.py +0 -0
  44. viettelcloud/aiplatform/trainer/backends/kubernetes/backend.py +710 -0
  45. viettelcloud/aiplatform/trainer/backends/kubernetes/backend_test.py +1344 -0
  46. viettelcloud/aiplatform/trainer/backends/kubernetes/constants.py +15 -0
  47. viettelcloud/aiplatform/trainer/backends/kubernetes/utils.py +636 -0
  48. viettelcloud/aiplatform/trainer/backends/kubernetes/utils_test.py +582 -0
  49. viettelcloud/aiplatform/trainer/backends/localprocess/__init__.py +0 -0
  50. viettelcloud/aiplatform/trainer/backends/localprocess/backend.py +306 -0
  51. viettelcloud/aiplatform/trainer/backends/localprocess/backend_test.py +501 -0
  52. viettelcloud/aiplatform/trainer/backends/localprocess/constants.py +90 -0
  53. viettelcloud/aiplatform/trainer/backends/localprocess/job.py +184 -0
  54. viettelcloud/aiplatform/trainer/backends/localprocess/types.py +52 -0
  55. viettelcloud/aiplatform/trainer/backends/localprocess/utils.py +302 -0
  56. viettelcloud/aiplatform/trainer/constants/__init__.py +0 -0
  57. viettelcloud/aiplatform/trainer/constants/constants.py +179 -0
  58. viettelcloud/aiplatform/trainer/options/__init__.py +52 -0
  59. viettelcloud/aiplatform/trainer/options/common.py +55 -0
  60. viettelcloud/aiplatform/trainer/options/kubernetes.py +502 -0
  61. viettelcloud/aiplatform/trainer/options/kubernetes_test.py +259 -0
  62. viettelcloud/aiplatform/trainer/options/localprocess.py +20 -0
  63. viettelcloud/aiplatform/trainer/test/common.py +22 -0
  64. viettelcloud/aiplatform/trainer/types/__init__.py +0 -0
  65. viettelcloud/aiplatform/trainer/types/types.py +517 -0
  66. viettelcloud/aiplatform/trainer/types/types_test.py +115 -0
  67. viettelcloud_aiplatform-0.3.0.dist-info/METADATA +226 -0
  68. viettelcloud_aiplatform-0.3.0.dist-info/RECORD +71 -0
  69. viettelcloud_aiplatform-0.3.0.dist-info/WHEEL +4 -0
  70. viettelcloud_aiplatform-0.3.0.dist-info/licenses/LICENSE +201 -0
  71. viettelcloud_aiplatform-0.3.0.dist-info/licenses/NOTICE +36 -0
viettelcloud/aiplatform/trainer/backends/container/backend_test.py
@@ -0,0 +1,867 @@
+# Copyright 2025 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Unit tests for ContainerBackend.
+
+Tests the ContainerBackend class with mocked container adapters.
+"""
+
+from collections.abc import Iterator
+from contextlib import nullcontext
+import os
+from pathlib import Path
+import shutil
+import tempfile
+from typing import Optional
+from unittest.mock import Mock, patch
+
+import pytest
+
+from viettelcloud.aiplatform.trainer.backends.container.adapters.base import (
+    BaseContainerClientAdapter,
+)
+from viettelcloud.aiplatform.trainer.backends.container.backend import ContainerBackend
+from viettelcloud.aiplatform.trainer.backends.container.types import ContainerBackendConfig
+from viettelcloud.aiplatform.trainer.constants import constants
+from viettelcloud.aiplatform.trainer.test.common import FAILED, SUCCESS, TestCase
+from viettelcloud.aiplatform.trainer.types import types
+
+
+# Mock Container Adapter
+class MockContainerAdapter(BaseContainerClientAdapter):
+    """Mock adapter for testing ContainerBackend without Docker/Podman."""
+
+    def __init__(self):
+        self._runtime_type = "mock"
+        self.networks_created = []
+        self.containers_created = []
+        self.containers_stopped = []
+        self.containers_removed = []
+        self.networks_deleted = []
+        self.images_pulled = []
+        self.ping_called = False
+
+    def ping(self):
+        self.ping_called = True
+
+    def create_network(self, name: str, labels: dict[str, str]) -> str:
+        network_id = f"net-{name}"
+        self.networks_created.append({"id": network_id, "name": name, "labels": labels})
+        return network_id
+
+    def delete_network(self, network_id: str):
+        self.networks_deleted.append(network_id)
+
+    def create_and_start_container(
+        self,
+        image: str,
+        command: list[str],
+        name: str,
+        network_id: str,
+        environment: dict[str, str],
+        labels: dict[str, str],
+        volumes: dict[str, dict[str, str]],
+        working_dir: str,
+    ) -> str:
+        container_id = f"container-{len(self.containers_created)}"
+        self.containers_created.append(
+            {
+                "id": container_id,
+                "name": name,
+                "image": image,
+                "command": command,
+                "network": network_id,
+                "environment": environment,
+                "labels": labels,
+                "volumes": volumes,
+                "working_dir": working_dir,
+                "status": "running",
+                "exit_code": None,
+            }
+        )
+        return container_id
+
+    def get_container(self, container_id: str):
+        for container in self.containers_created:
+            if container["id"] == container_id:
+                return Mock(id=container_id, status=container["status"])
+        return None
+
+    def container_logs(self, container_id: str, follow: bool) -> Iterator[str]:
+        if follow:
+            yield f"Log line 1 from {container_id}\n"
+            yield f"Log line 2 from {container_id}\n"
+        else:
+            yield f"Complete log from {container_id}\n"
+
+    def stop_container(self, container_id: str, timeout: int = 10):
+        self.containers_stopped.append(container_id)
+        for container in self.containers_created:
+            if container["id"] == container_id:
+                container["status"] = "exited"
+                container["exit_code"] = 0
+
+    def remove_container(self, container_id: str, force: bool = True):
+        self.containers_removed.append(container_id)
+
+    def pull_image(self, image: str):
+        self.images_pulled.append(image)
+
+    def image_exists(self, image: str) -> bool:
+        return "local" in image or image in self.images_pulled
+
+    def run_oneoff_container(self, image: str, command: list[str]) -> str:
+        return "Python 3.9.0\npip 21.0.1\nnvidia-smi not found\n"
+
+    def container_status(self, container_id: str) -> tuple[str, Optional[int]]:
+        for container in self.containers_created:
+            if container["id"] == container_id:
+                return (container["status"], container.get("exit_code"))
+        return ("unknown", None)
+
+    def set_container_status(self, container_id: str, status: str, exit_code: Optional[int] = None):
+        """Helper method to set container status for testing."""
+        for container in self.containers_created:
+            if container["id"] == container_id:
+                container["status"] = status
+                container["exit_code"] = exit_code
+
+    def get_container_ip(self, container_id: str, network_id: str) -> Optional[str]:
+        """Get container IP address on a specific network."""
+        for container in self.containers_created:
+            if container["id"] == container_id:
+                return f"192.168.1.{len(self.containers_created)}"
+        return None
+
+    def list_containers(self, filters: Optional[dict[str, list[str]]] = None) -> list[dict]:
+        """List containers with optional filters."""
+        if not filters:
+            return [
+                {
+                    "id": c["id"],
+                    "name": c["name"],
+                    "labels": c["labels"],
+                    "status": c["status"],
+                    "created": "2025-01-01T00:00:00Z",
+                }
+                for c in self.containers_created
+            ]
+
+        # Simple label filtering
+        result = []
+        for container in self.containers_created:
+            if "label" in filters:
+                match = True
+                for label_filter in filters["label"]:
+                    if "=" in label_filter:
+                        key, value = label_filter.split("=", 1)
+                        if container["labels"].get(key) != value:
+                            match = False
+                            break
+                    else:
+                        if label_filter not in container["labels"]:
+                            match = False
+                            break
+                if match:
+                    result.append(
+                        {
+                            "id": container["id"],
+                            "name": container["name"],
+                            "labels": container["labels"],
+                            "status": container["status"],
+                            "created": "2025-01-01T00:00:00Z",
+                        }
+                    )
+        return result
+
+    def get_network(self, network_id: str) -> Optional[dict]:
+        """Get network information."""
+        for network in self.networks_created:
+            if network["id"] == network_id or network["name"] == network_id:
+                return {
+                    "id": network["id"],
+                    "name": network["name"],
+                    "labels": network["labels"],
+                }
+        return None
+
+
+# Fixtures
+@pytest.fixture
+def container_backend():
+    """Provide ContainerBackend with mocked adapter."""
+    with patch(
+        "viettelcloud.aiplatform.trainer.backends.container.backend.DockerClientAdapter"
+    ) as mock_docker:
+        mock_docker.return_value = MockContainerAdapter()
+        backend = ContainerBackend(ContainerBackendConfig())
+        return backend
+
+
+@pytest.fixture
+def temp_workdir():
+    """Provide a temporary working directory."""
+    tmpdir = tempfile.mkdtemp()
+    yield tmpdir
+    if os.path.exists(tmpdir):
+        shutil.rmtree(tmpdir)
+
+
+# Helper Function
+def simple_train_func():
+    """Simple training function for tests."""
+    print("Training")
+
+
+# Tests
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        TestCase(
+            name="auto-detect docker first",
+            expected_status=SUCCESS,
+        ),
+        TestCase(
+            name="auto-detect falls back to podman",
+            expected_status=SUCCESS,
+        ),
+        TestCase(
+            name="both unavailable raises error",
+            expected_status=FAILED,
+            expected_error=RuntimeError,
+        ),
+    ],
+)
+def test_backend_initialization(test_case):
+    """Test ContainerBackend initialization and adapter creation."""
+    print("Executing test:", test_case.name)
+    try:
+        if test_case.name == "auto-detect docker first":
+            with (
+                patch(
+                    "viettelcloud.aiplatform.trainer.backends.container.backend.DockerClientAdapter"
+                ) as mock_docker,
+                patch(
+                    "viettelcloud.aiplatform.trainer.backends.container.backend.PodmanClientAdapter"
+                ) as mock_podman,
+            ):
+                mock_docker_instance = Mock()
+                mock_docker.return_value = mock_docker_instance
+
+                _ = ContainerBackend(ContainerBackendConfig())
+
+                # Docker should be called (could be with Colima socket or None)
+                assert mock_docker.call_count == 1
+                mock_docker_instance.ping.assert_called_once()
+                mock_podman.assert_not_called()
+                assert test_case.expected_status == SUCCESS
+
+        elif test_case.name == "auto-detect falls back to podman":
+            with (
+                patch(
+                    "viettelcloud.aiplatform.trainer.backends.container.backend.DockerClientAdapter"
+                ) as mock_docker,
+                patch(
+                    "viettelcloud.aiplatform.trainer.backends.container.backend.PodmanClientAdapter"
+                ) as mock_podman,
+            ):
+                mock_docker_instance = Mock()
+                mock_docker_instance.ping.side_effect = Exception("Docker not available")
+                mock_docker.return_value = mock_docker_instance
+
+                mock_podman_instance = Mock()
+                mock_podman.return_value = mock_podman_instance
+
+                _ = ContainerBackend(ContainerBackendConfig())
+
+                # Docker may be tried multiple times (different socket locations)
+                assert mock_docker.call_count >= 1
+                mock_podman.assert_called_once_with(None)
+                mock_podman_instance.ping.assert_called_once()
+                assert test_case.expected_status == SUCCESS
+
+        elif test_case.name == "both unavailable raises error":
+            with (
+                patch(
+                    "viettelcloud.aiplatform.trainer.backends.container.backend.DockerClientAdapter"
+                ) as mock_docker,
+                patch(
+                    "viettelcloud.aiplatform.trainer.backends.container.backend.PodmanClientAdapter"
+                ) as mock_podman,
+            ):
+                mock_docker_instance = Mock()
+                mock_docker_instance.ping.side_effect = Exception("Docker not available")
+                mock_docker.return_value = mock_docker_instance
+
+                mock_podman_instance = Mock()
+                mock_podman_instance.ping.side_effect = Exception("Podman not available")
+                mock_podman.return_value = mock_podman_instance
+
+                ContainerBackend(ContainerBackendConfig())
+
+    except Exception as e:
+        assert type(e) is test_case.expected_error
+    print("test execution complete")
+
+
+def test_list_runtimes(container_backend):
+    """Test listing available local runtimes."""
+    print("Executing test: list_runtimes")
+    runtimes = container_backend.list_runtimes()
+
+    assert isinstance(runtimes, list)
+    assert len(runtimes) > 0
+    runtime_names = [r.name for r in runtimes]
+    assert constants.DEFAULT_TRAINING_RUNTIME in runtime_names
+    print("test execution complete")
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        TestCase(
+            name="get valid runtime",
+            expected_status=SUCCESS,
+            config={"name": constants.DEFAULT_TRAINING_RUNTIME},
+        ),
+        TestCase(
+            name="get invalid runtime",
+            expected_status=FAILED,
+            config={"name": "nonexistent-runtime"},
+            expected_error=ValueError,
+        ),
+    ],
+)
+def test_get_runtime(container_backend, test_case):
+    """Test getting a specific runtime."""
+    print("Executing test:", test_case.name)
+    try:
+        runtime = container_backend.get_runtime(**test_case.config)
+
+        assert test_case.expected_status == SUCCESS
+        assert isinstance(runtime, types.Runtime)
+        assert runtime.name == test_case.config["name"]
+
+    except Exception as e:
+        assert type(e) is test_case.expected_error
+    print("test execution complete")
+
+
+def test_get_runtime_packages(container_backend):
+    """Test getting runtime packages."""
+    print("Executing test: get_runtime_packages")
+    runtime = container_backend.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
+    container_backend.get_runtime_packages(runtime)
+
+    assert len(
+        container_backend._adapter.images_pulled
+    ) > 0 or container_backend._adapter.image_exists(runtime.trainer.image)
+    print("test execution complete")
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        TestCase(
+            name="train single node",
+            expected_status=SUCCESS,
+            config={"num_nodes": 1, "expected_containers": 1},
+        ),
+        TestCase(
+            name="train multi-node",
+            expected_status=SUCCESS,
+            config={"num_nodes": 3, "expected_containers": 3},
+        ),
+        TestCase(
+            name="train with custom env",
+            expected_status=SUCCESS,
+            config={
+                "num_nodes": 1,
+                "env": {"MY_VAR": "my_value", "DEBUG": "true"},
+                "expected_containers": 1,
+            },
+        ),
+        TestCase(
+            name="train with packages",
+            expected_status=SUCCESS,
+            config={
+                "num_nodes": 1,
+                "packages": ["numpy", "pandas"],
+                "expected_containers": 1,
+            },
+        ),
+        TestCase(
+            name="train with single GPU",
+            expected_status=SUCCESS,
+            config={
+                "num_nodes": 1,
+                "resources_per_node": {"gpu": "1"},
+                "expected_containers": 1,
+                "expected_nproc_per_node": 1,
+            },
+        ),
+        TestCase(
+            name="train with multiple GPUs",
+            expected_status=SUCCESS,
+            config={
+                "num_nodes": 1,
+                "resources_per_node": {"gpu": "4"},
+                "expected_containers": 1,
+                "expected_nproc_per_node": 4,
+            },
+        ),
+        TestCase(
+            name="train multi-node with GPUs",
+            expected_status=SUCCESS,
+            config={
+                "num_nodes": 2,
+                "resources_per_node": {"gpu": "2"},
+                "expected_containers": 2,
+                "expected_nproc_per_node": 2,
+            },
+        ),
+        TestCase(
+            name="train with CPU resources (nproc=1)",
+            expected_status=SUCCESS,
+            config={
+                "num_nodes": 1,
+                "resources_per_node": {"cpu": "16"},
+                "expected_containers": 1,
+                "expected_nproc_per_node": 1,
+            },
+        ),
+    ],
+)
+def test_train(container_backend, test_case):
+    """Test training job creation."""
+    print("Executing test:", test_case.name)
+    try:
+        trainer = types.CustomTrainer(
+            func=simple_train_func,
+            num_nodes=test_case.config.get("num_nodes", 1),
+            env=test_case.config.get("env"),
+            packages_to_install=test_case.config.get("packages"),
+            resources_per_node=test_case.config.get("resources_per_node"),
+        )
+        runtime = container_backend.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
+
+        job_name = container_backend.train(runtime=runtime, trainer=trainer)
+
+        assert test_case.expected_status == SUCCESS
+        assert job_name is not None
+        assert len(job_name) == 12
+        assert (
+            len(container_backend._adapter.containers_created)
+            == test_case.config["expected_containers"]
+        )
+        assert len(container_backend._adapter.networks_created) == 1
+
+        # Check environment if specified
+        if "env" in test_case.config:
+            container = container_backend._adapter.containers_created[0]
+            for key, value in test_case.config["env"].items():
+                assert container["environment"][key] == value
+
+        # Check packages if specified
+        if "packages" in test_case.config:
+            container = container_backend._adapter.containers_created[0]
+            command_str = " ".join(container["command"])
+            assert "pip install" in command_str
+            for package in test_case.config["packages"]:
+                assert package in command_str
+
+        # Check nproc_per_node if specified
+        if "expected_nproc_per_node" in test_case.config:
+            container = container_backend._adapter.containers_created[0]
+            command_str = container["command"][2]  # Get bash script content
+            expected_nproc = test_case.config["expected_nproc_per_node"]
+            assert f"--nproc_per_node={expected_nproc}" in command_str, (
+                f"Expected --nproc_per_node={expected_nproc} in command, but got: {command_str}"
+            )
+
+    except Exception as e:
+        assert type(e) is test_case.expected_error
+    print("test execution complete")
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        TestCase(
+            name="list all jobs",
+            expected_status=SUCCESS,
+            config={"num_jobs": 2},
+        ),
+        TestCase(
+            name="list empty jobs",
+            expected_status=SUCCESS,
+            config={"num_jobs": 0},
+        ),
+    ],
+)
+def test_list_jobs(container_backend, test_case):
+    """Test listing training jobs."""
+    print("Executing test:", test_case.name)
+    try:
+        runtime = container_backend.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
+        created_jobs = []
+
+        for _ in range(test_case.config["num_jobs"]):
+            trainer = types.CustomTrainer(func=simple_train_func, num_nodes=1)
+            job_name = container_backend.train(runtime=runtime, trainer=trainer)
+            created_jobs.append(job_name)
+
+        jobs = container_backend.list_jobs()
+
+        assert test_case.expected_status == SUCCESS
+        assert len(jobs) == test_case.config["num_jobs"]
+        if test_case.config["num_jobs"] > 0:
+            job_names = [job.name for job in jobs]
+            for created_job in created_jobs:
+                assert created_job in job_names
+
+    except Exception as e:
+        assert type(e) is test_case.expected_error
+    print("test execution complete")
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        TestCase(
+            name="get existing job",
+            expected_status=SUCCESS,
+            config={"num_nodes": 2},
+        ),
+        TestCase(
+            name="get nonexistent job",
+            expected_status=FAILED,
+            config={"job_name": "nonexistent-job"},
+            expected_error=ValueError,
+        ),
+    ],
+)
+def test_get_job(container_backend, test_case):
+    """Test getting a specific job."""
+    print("Executing test:", test_case.name)
+    try:
+        if test_case.name == "get existing job":
+            trainer = types.CustomTrainer(
+                func=simple_train_func, num_nodes=test_case.config["num_nodes"]
+            )
+            runtime = container_backend.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
+            job_name = container_backend.train(runtime=runtime, trainer=trainer)
+
+            job = container_backend.get_job(job_name)
+
+            assert test_case.expected_status == SUCCESS
+            assert job.name == job_name
+            assert job.num_nodes == test_case.config["num_nodes"]
+            assert len(job.steps) == test_case.config["num_nodes"]
+
+        elif test_case.name == "get nonexistent job":
+            container_backend.get_job(test_case.config["job_name"])
+
+    except Exception as e:
+        assert type(e) is test_case.expected_error
+    print("test execution complete")
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        TestCase(
+            name="get logs no follow",
+            expected_status=SUCCESS,
+            config={"follow": False},
+        ),
+        TestCase(
+            name="get logs with follow",
+            expected_status=SUCCESS,
+            config={"follow": True},
+        ),
+    ],
+)
+def test_get_job_logs(container_backend, test_case):
+    """Test getting job logs."""
+    print("Executing test:", test_case.name)
+    try:
+        trainer = types.CustomTrainer(func=simple_train_func, num_nodes=1)
+        runtime = container_backend.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
+        job_name = container_backend.train(runtime=runtime, trainer=trainer)
+
+        logs = list(container_backend.get_job_logs(job_name, follow=test_case.config["follow"]))
+
+        assert test_case.expected_status == SUCCESS
+        assert len(logs) > 0
+        if test_case.config["follow"]:
+            assert any("Log line" in log for log in logs)
+        else:
+            assert any("Complete log" in log for log in logs)
+
+    except Exception as e:
+        assert type(e) is test_case.expected_error
+    print("test execution complete")
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        TestCase(
+            name="wait for complete",
+            expected_status=SUCCESS,
+            config={"wait_status": constants.TRAINJOB_COMPLETE, "container_exit_code": 0},
+        ),
+        TestCase(
+            name="wait timeout",
+            expected_status=FAILED,
+            config={"wait_status": constants.TRAINJOB_COMPLETE, "timeout": 2},
+            expected_error=TimeoutError,
+        ),
+        TestCase(
+            name="job fails",
+            expected_status=FAILED,
+            config={"wait_status": constants.TRAINJOB_COMPLETE, "container_exit_code": 1},
+            expected_error=RuntimeError,
+        ),
+    ],
+)
+def test_wait_for_job_status(container_backend, test_case):
+    """Test waiting for job status."""
+    print("Executing test:", test_case.name)
+    try:
+        trainer = types.CustomTrainer(func=simple_train_func, num_nodes=1)
+        runtime = container_backend.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
+        job_name = container_backend.train(runtime=runtime, trainer=trainer)
+
+        if test_case.name == "wait for complete":
+            container_id = container_backend._adapter.containers_created[0]["id"]
+            container_backend._adapter.set_container_status(
+                container_id, "exited", test_case.config["container_exit_code"]
+            )
+
+            completed_job = container_backend.wait_for_job_status(
+                job_name, status={test_case.config["wait_status"]}, timeout=5, polling_interval=1
+            )
+
+            assert test_case.expected_status == SUCCESS
+            assert completed_job.status == constants.TRAINJOB_COMPLETE
+
+        elif test_case.name == "wait timeout":
+            container_backend.wait_for_job_status(
+                job_name,
+                status={test_case.config["wait_status"]},
+                timeout=test_case.config["timeout"],
+                polling_interval=1,
+            )
+
+        elif test_case.name == "job fails":
+            container_id = container_backend._adapter.containers_created[0]["id"]
+            container_backend._adapter.set_container_status(
+                container_id, "exited", test_case.config["container_exit_code"]
+            )
+
+            container_backend.wait_for_job_status(
+                job_name, status={test_case.config["wait_status"]}, timeout=5, polling_interval=1
+            )
+
+    except Exception as e:
+        assert type(e) is test_case.expected_error
+    print("test execution complete")
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        TestCase(
+            name="delete with auto_remove true",
+            expected_status=SUCCESS,
+            config={"auto_remove": True, "num_nodes": 2},
+        ),
+        TestCase(
+            name="delete with auto_remove false",
+            expected_status=SUCCESS,
+            config={"auto_remove": False, "num_nodes": 2},
+        ),
+    ],
+)
+def test_delete_job(container_backend, temp_workdir, test_case):
+    """Test deleting a job."""
+    print("Executing test:", test_case.name)
+    try:
+        container_backend.cfg.auto_remove = test_case.config["auto_remove"]
+
+        trainer = types.CustomTrainer(
+            func=simple_train_func, num_nodes=test_case.config["num_nodes"]
+        )
+        runtime = container_backend.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
+        job_name = container_backend.train(runtime=runtime, trainer=trainer)
+
+        job_workdir = Path.home() / ".kubeflow" / "trainer" / "containers" / job_name
+        assert job_workdir.exists()
+
+        container_backend.delete_job(job_name)
+
+        assert test_case.expected_status == SUCCESS
+        assert len(container_backend._adapter.containers_stopped) == test_case.config["num_nodes"]
+        assert len(container_backend._adapter.containers_removed) == test_case.config["num_nodes"]
+        assert len(container_backend._adapter.networks_deleted) == 1
+
+        if test_case.config["auto_remove"]:
+            assert not job_workdir.exists()
+        else:
+            assert job_workdir.exists()
+
+    except Exception as e:
+        assert type(e) is test_case.expected_error
+    print("test execution complete")
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        TestCase(
+            name="running container",
+            expected_status=SUCCESS,
+            config={
+                "container_status": "running",
+                "exit_code": None,
+                "expected_job_status": constants.TRAINJOB_RUNNING,
+            },
+        ),
+        TestCase(
+            name="exited success",
+            expected_status=SUCCESS,
+            config={
+                "container_status": "exited",
+                "exit_code": 0,
+                "expected_job_status": constants.TRAINJOB_COMPLETE,
+            },
+        ),
+        TestCase(
+            name="exited failure",
+            expected_status=SUCCESS,
+            config={
+                "container_status": "exited",
+                "exit_code": 1,
+                "expected_job_status": constants.TRAINJOB_FAILED,
+            },
+        ),
+    ],
+)
+def test_container_status_mapping(container_backend, test_case):
+    """Test container status mapping to TrainJob status."""
+    print("Executing test:", test_case.name)
+    try:
+        trainer = types.CustomTrainer(func=simple_train_func, num_nodes=1)
+        runtime = container_backend.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
+        job_name = container_backend.train(runtime=runtime, trainer=trainer)
+
+        container_id = container_backend._adapter.containers_created[0]["id"]
+        container_backend._adapter.set_container_status(
+            container_id, test_case.config["container_status"], test_case.config["exit_code"]
+        )
+
+        job = container_backend.get_job(job_name)
+
+        assert test_case.expected_status == SUCCESS
+        assert job.status == test_case.config["expected_job_status"]
+
+    except Exception as e:
+        assert type(e) is test_case.expected_error
+    print("test execution complete")
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        TestCase(
+            name="docker socket locations with colima",
+            expected_status=SUCCESS,
+            config={
+                "runtime_name": "docker",
+                "container_host": None,
+                "create_colima_socket": True,
+                "expected_contains_none": True,
+                "expected_has_colima": True,
+            },
+        ),
+        TestCase(
+            name="custom host has priority",
+            expected_status=SUCCESS,
+            config={
+                "runtime_name": "docker",
+                "container_host": "unix:///custom/path/docker.sock",
+                "create_colima_socket": False,
+                "expected_first": "unix:///custom/path/docker.sock",
+            },
+        ),
+    ],
+)
+def test_get_common_socket_locations(test_case, tmp_path):
+    """Test common socket location detection."""
+    print("Executing test:", test_case.name)
+
+    # Setup
+    if test_case.config.get("create_colima_socket"):
+        colima_dir = tmp_path / ".colima" / "default"
+        colima_dir.mkdir(parents=True)
+        colima_sock = colima_dir / "docker.sock"
+        colima_sock.touch()
+
+    cfg = ContainerBackendConfig(container_host=test_case.config["container_host"])
+
+    # Test the method directly without creating the backend
+    context_manager = (
+        patch("pathlib.Path.home", return_value=tmp_path)
+        if test_case.config.get("create_colima_socket")
+        else nullcontext()
+    )
+
+    with context_manager:
+        backend = ContainerBackend.__new__(ContainerBackend)
+        backend.cfg = cfg
+        locations = backend._get_common_socket_locations(test_case.config["runtime_name"])
+
+    # Assertions
+    if "expected_contains_none" in test_case.config:
+        assert None in locations
+
+    if "expected_has_colima" in test_case.config:
+        assert f"unix://{colima_sock}" in locations
+
+    if "expected_first" in test_case.config:
+        assert locations[0] == test_case.config["expected_first"]
+
+    print("test execution complete")
+
+
+def test_create_adapter_error_message_format():
+    """Test that error message includes attempted connections."""
+    cfg = ContainerBackendConfig(container_runtime="docker")
+
+    docker_adapter = (
+        "viettelcloud.aiplatform.trainer.backends.container.adapters.docker.DockerClientAdapter"
+    )
+    with patch(docker_adapter) as mock_docker:
+        mock_docker.side_effect = Exception("Connection failed")
+
+        with pytest.raises(RuntimeError) as exc_info:
+            ContainerBackend(cfg)
+
+        # Error message should be helpful
+        error_msg = str(exc_info.value)
+        assert "Could not connect" in error_msg
+        assert "tried:" in error_msg