viettelcloud-aiplatform 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- viettelcloud/__init__.py +1 -0
- viettelcloud/aiplatform/__init__.py +15 -0
- viettelcloud/aiplatform/common/__init__.py +0 -0
- viettelcloud/aiplatform/common/constants.py +22 -0
- viettelcloud/aiplatform/common/types.py +28 -0
- viettelcloud/aiplatform/common/utils.py +40 -0
- viettelcloud/aiplatform/hub/OWNERS +14 -0
- viettelcloud/aiplatform/hub/__init__.py +25 -0
- viettelcloud/aiplatform/hub/api/__init__.py +13 -0
- viettelcloud/aiplatform/hub/api/_proxy_client.py +355 -0
- viettelcloud/aiplatform/hub/api/model_registry_client.py +561 -0
- viettelcloud/aiplatform/hub/api/model_registry_client_test.py +462 -0
- viettelcloud/aiplatform/optimizer/__init__.py +45 -0
- viettelcloud/aiplatform/optimizer/api/__init__.py +0 -0
- viettelcloud/aiplatform/optimizer/api/optimizer_client.py +248 -0
- viettelcloud/aiplatform/optimizer/backends/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/backends/base.py +77 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/backend.py +563 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/utils.py +112 -0
- viettelcloud/aiplatform/optimizer/constants/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/constants/constants.py +59 -0
- viettelcloud/aiplatform/optimizer/types/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/types/algorithm_types.py +87 -0
- viettelcloud/aiplatform/optimizer/types/optimization_types.py +135 -0
- viettelcloud/aiplatform/optimizer/types/search_types.py +95 -0
- viettelcloud/aiplatform/py.typed +0 -0
- viettelcloud/aiplatform/trainer/__init__.py +82 -0
- viettelcloud/aiplatform/trainer/api/__init__.py +3 -0
- viettelcloud/aiplatform/trainer/api/trainer_client.py +277 -0
- viettelcloud/aiplatform/trainer/api/trainer_client_test.py +72 -0
- viettelcloud/aiplatform/trainer/backends/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/base.py +94 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/base.py +195 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/docker.py +231 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/podman.py +258 -0
- viettelcloud/aiplatform/trainer/backends/container/backend.py +668 -0
- viettelcloud/aiplatform/trainer/backends/container/backend_test.py +867 -0
- viettelcloud/aiplatform/trainer/backends/container/runtime_loader.py +631 -0
- viettelcloud/aiplatform/trainer/backends/container/runtime_loader_test.py +637 -0
- viettelcloud/aiplatform/trainer/backends/container/types.py +67 -0
- viettelcloud/aiplatform/trainer/backends/container/utils.py +213 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/backend.py +710 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/backend_test.py +1344 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/constants.py +15 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/utils.py +636 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/utils_test.py +582 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/backend.py +306 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/backend_test.py +501 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/constants.py +90 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/job.py +184 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/types.py +52 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/utils.py +302 -0
- viettelcloud/aiplatform/trainer/constants/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/constants/constants.py +179 -0
- viettelcloud/aiplatform/trainer/options/__init__.py +52 -0
- viettelcloud/aiplatform/trainer/options/common.py +55 -0
- viettelcloud/aiplatform/trainer/options/kubernetes.py +502 -0
- viettelcloud/aiplatform/trainer/options/kubernetes_test.py +259 -0
- viettelcloud/aiplatform/trainer/options/localprocess.py +20 -0
- viettelcloud/aiplatform/trainer/test/common.py +22 -0
- viettelcloud/aiplatform/trainer/types/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/types/types.py +517 -0
- viettelcloud/aiplatform/trainer/types/types_test.py +115 -0
- viettelcloud_aiplatform-0.3.0.dist-info/METADATA +226 -0
- viettelcloud_aiplatform-0.3.0.dist-info/RECORD +71 -0
- viettelcloud_aiplatform-0.3.0.dist-info/WHEEL +4 -0
- viettelcloud_aiplatform-0.3.0.dist-info/licenses/LICENSE +201 -0
- viettelcloud_aiplatform-0.3.0.dist-info/licenses/NOTICE +36 -0
|
@@ -0,0 +1,668 @@
|
|
|
1
|
+
# Copyright 2025 The Kubeflow Authors.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
ContainerBackend
|
|
17
|
+
----------------
|
|
18
|
+
|
|
19
|
+
Unified local execution backend for `CustomTrainer` jobs using containers.
|
|
20
|
+
|
|
21
|
+
This backend automatically detects and uses either Docker or Podman.
|
|
22
|
+
It provides a single interface regardless of the underlying container runtime.
|
|
23
|
+
|
|
24
|
+
Key behaviors:
|
|
25
|
+
- Auto-detection: Tries Docker first, then Podman. Can be overridden via config.
|
|
26
|
+
- Multi-node jobs: one container per node connected via a per-job network.
|
|
27
|
+
- Entry script generation: we serialize the user's training function and embed it
|
|
28
|
+
inline in the container command using a heredoc (no file I/O on the host). The
|
|
29
|
+
script is created inside the container at /tmp/train.py and invoked using
|
|
30
|
+
`torchrun` (preferred) or `python` as a fallback.
|
|
31
|
+
- Runtimes: we use `config/training_runtimes` to define runtime images and
|
|
32
|
+
characteristics (e.g., torch). Defaults to `torch-distributed` if no runtime
|
|
33
|
+
is provided.
|
|
34
|
+
- Image pulling: controlled via `pull_policy` and performed automatically if
|
|
35
|
+
needed.
|
|
36
|
+
- Logs and lifecycle: streaming logs and deletion semantics similar to the
|
|
37
|
+
Docker/Podman backends, but with automatic runtime detection.
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
from collections.abc import Callable, Iterator
|
|
41
|
+
from datetime import datetime
|
|
42
|
+
import logging
|
|
43
|
+
import os
|
|
44
|
+
import random
|
|
45
|
+
import shutil
|
|
46
|
+
import string
|
|
47
|
+
from typing import Optional, Union
|
|
48
|
+
import uuid
|
|
49
|
+
|
|
50
|
+
from viettelcloud.aiplatform.trainer.backends.base import RuntimeBackend
|
|
51
|
+
from viettelcloud.aiplatform.trainer.backends.container import utils as container_utils
|
|
52
|
+
from viettelcloud.aiplatform.trainer.backends.container.adapters.base import (
|
|
53
|
+
BaseContainerClientAdapter,
|
|
54
|
+
)
|
|
55
|
+
from viettelcloud.aiplatform.trainer.backends.container.adapters.docker import DockerClientAdapter
|
|
56
|
+
from viettelcloud.aiplatform.trainer.backends.container.adapters.podman import PodmanClientAdapter
|
|
57
|
+
from viettelcloud.aiplatform.trainer.backends.container.runtime_loader import (
|
|
58
|
+
get_training_runtime_from_sources,
|
|
59
|
+
list_training_runtimes_from_sources,
|
|
60
|
+
)
|
|
61
|
+
from viettelcloud.aiplatform.trainer.backends.container.types import ContainerBackendConfig
|
|
62
|
+
from viettelcloud.aiplatform.trainer.constants import constants
|
|
63
|
+
from viettelcloud.aiplatform.trainer.types import types
|
|
64
|
+
|
|
65
|
+
logger = logging.getLogger(__name__)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class ContainerBackend(RuntimeBackend):
|
|
69
|
+
"""
|
|
70
|
+
Unified container backend that auto-detects Docker or Podman.
|
|
71
|
+
|
|
72
|
+
This backend uses the adapter pattern to abstract away differences between
|
|
73
|
+
Docker and Podman, providing a single consistent interface.
|
|
74
|
+
"""
|
|
75
|
+
|
|
76
|
+
def __init__(self, cfg: ContainerBackendConfig):
|
|
77
|
+
self.cfg = cfg
|
|
78
|
+
self.label_prefix = "trainer.kubeflow.org"
|
|
79
|
+
|
|
80
|
+
# Initialize the container client adapter
|
|
81
|
+
self._adapter = self._create_adapter()
|
|
82
|
+
|
|
83
|
+
def _get_common_socket_locations(self, runtime_name: str) -> list[Optional[str]]:
|
|
84
|
+
"""
|
|
85
|
+
Get common socket locations to try for the given runtime.
|
|
86
|
+
|
|
87
|
+
Args:
|
|
88
|
+
runtime_name: "docker" or "podman"
|
|
89
|
+
|
|
90
|
+
Returns:
|
|
91
|
+
List of socket URLs to try, including None (for default)
|
|
92
|
+
"""
|
|
93
|
+
import os
|
|
94
|
+
from pathlib import Path
|
|
95
|
+
|
|
96
|
+
locations = [self.cfg.container_host] if self.cfg.container_host else []
|
|
97
|
+
|
|
98
|
+
if runtime_name == "docker":
|
|
99
|
+
# Common Docker socket locations
|
|
100
|
+
colima_sock = Path.home() / ".colima/default/docker.sock"
|
|
101
|
+
if colima_sock.exists():
|
|
102
|
+
locations.append(f"unix://{colima_sock}")
|
|
103
|
+
# Standard Docker socket
|
|
104
|
+
locations.append(None) # Use docker.from_env() default
|
|
105
|
+
|
|
106
|
+
elif runtime_name == "podman":
|
|
107
|
+
# Common Podman socket locations on macOS
|
|
108
|
+
uid = os.getuid() if hasattr(os, "getuid") else None
|
|
109
|
+
if uid:
|
|
110
|
+
user_sock = f"/run/user/{uid}/podman/podman.sock"
|
|
111
|
+
if Path(user_sock).exists():
|
|
112
|
+
locations.append(f"unix://{user_sock}")
|
|
113
|
+
# Standard Podman socket
|
|
114
|
+
locations.append(None) # Use PodmanClient() default
|
|
115
|
+
|
|
116
|
+
# Remove duplicates while preserving order
|
|
117
|
+
seen = set()
|
|
118
|
+
unique_locations = []
|
|
119
|
+
for loc in locations:
|
|
120
|
+
if loc not in seen:
|
|
121
|
+
unique_locations.append(loc)
|
|
122
|
+
seen.add(loc)
|
|
123
|
+
|
|
124
|
+
return unique_locations
|
|
125
|
+
|
|
126
|
+
    def _create_adapter(self) -> BaseContainerClientAdapter:
        """
        Create the appropriate container client adapter.

        Tries Docker first, then Podman if Docker fails, unless a specific
        runtime is requested in the config. Automatically tries common socket
        locations (e.g., Colima for Docker on macOS, user socket for Podman).

        Returns:
            The first adapter whose ``ping()`` succeeds.

        Raises:
            RuntimeError: If neither Docker nor Podman are available; the
                message lists every connection attempted and chains the last
                underlying error.
        """
        # Map runtime name -> adapter class implementing the common interface.
        runtime_map = {
            "docker": DockerClientAdapter,
            "podman": PodmanClientAdapter,
        }

        # Determine which runtimes to try: honor an explicit config choice,
        # otherwise fall back to Docker-then-Podman auto-detection.
        runtimes_to_try = (
            [self.cfg.container_runtime] if self.cfg.container_runtime else ["docker", "podman"]
        )

        attempted_connections = []
        last_error = None

        for runtime_name in runtimes_to_try:
            # Unknown runtime names from config are skipped silently.
            if runtime_name not in runtime_map:
                continue

            # Try common socket locations for this runtime
            socket_locations = self._get_common_socket_locations(runtime_name)

            for host in socket_locations:
                try:
                    adapter = runtime_map[runtime_name](host)
                    # ping() verifies the daemon is actually reachable, not
                    # just that the client object could be constructed.
                    adapter.ping()
                    host_display = host or "default"
                    logger.debug(
                        f"Using {runtime_name} as container runtime (host: {host_display})"
                    )
                    return adapter
                except Exception as e:
                    # Record the failure and keep trying remaining locations.
                    host_str = host or "default"
                    logger.debug(f"{runtime_name} initialization failed at {host_str}: {e}")
                    attempted_connections.append(f"{runtime_name} at {host_str}")
                    last_error = e

        # Build helpful error message tailored to the host platform.
        import platform

        system = platform.system()

        attempted = ", ".join(attempted_connections)
        error_msg = f"Could not connect to Docker or Podman (tried: {attempted}).\n"

        if system == "Darwin":  # macOS
            error_msg += (
                "Ensure Docker/Podman is running "
                "(e.g., 'colima start' or 'podman machine start').\n"
            )
        else:
            error_msg += "Ensure Docker/Podman is installed and running.\n"

        error_msg += (
            "To specify a custom socket: ContainerBackendConfig(container_host='unix:///path/to/socket')\n"
            "Or use LocalProcessBackendConfig for non-containerized execution."
        )

        raise RuntimeError(error_msg) from last_error
|
|
193
|
+
|
|
194
|
+
@property
|
|
195
|
+
def _runtime_type(self) -> str:
|
|
196
|
+
"""Get the runtime type for debugging/logging."""
|
|
197
|
+
return self._adapter._runtime_type
|
|
198
|
+
|
|
199
|
+
# ---- Runtime APIs ----
|
|
200
|
+
def list_runtimes(self) -> list[types.Runtime]:
|
|
201
|
+
return list_training_runtimes_from_sources(self.cfg.runtime_source.sources)
|
|
202
|
+
|
|
203
|
+
def get_runtime(self, name: str) -> types.Runtime:
|
|
204
|
+
return get_training_runtime_from_sources(name, self.cfg.runtime_source.sources)
|
|
205
|
+
|
|
206
|
+
def get_runtime_packages(self, runtime: types.Runtime):
|
|
207
|
+
"""
|
|
208
|
+
Spawn a short-lived container to report Python version, pip list, and nvidia-smi.
|
|
209
|
+
"""
|
|
210
|
+
container_utils.maybe_pull_image(self._adapter, runtime.trainer.image, self.cfg.pull_policy)
|
|
211
|
+
|
|
212
|
+
command = [
|
|
213
|
+
"bash",
|
|
214
|
+
"-lc",
|
|
215
|
+
"python -c \"import sys; print(f'Python: {sys.version}')\" && "
|
|
216
|
+
"(pip list || echo 'pip not found') && "
|
|
217
|
+
"(nvidia-smi || echo 'nvidia-smi not found')",
|
|
218
|
+
]
|
|
219
|
+
|
|
220
|
+
logs = self._adapter.run_oneoff_container(image=runtime.trainer.image, command=command)
|
|
221
|
+
print(logs)
|
|
222
|
+
|
|
223
|
+
    def train(
        self,
        runtime: Optional[Union[str, types.Runtime]] = None,
        initializer: Optional[types.Initializer] = None,
        trainer: Optional[
            Union[types.CustomTrainer, types.CustomTrainerContainer, types.BuiltinTrainer]
        ] = None,
        options: Optional[list] = None,
    ) -> str:
        """Create and start a (possibly multi-node) training job in containers.

        Args:
            runtime: Runtime name or object; defaults to the default training
                runtime when omitted.
            initializer: Accepted for interface parity.
                # NOTE(review): not referenced anywhere in this method —
                # confirm it is intentionally ignored by this backend.
            trainer: Only ``types.CustomTrainer`` is supported in v1.
            options: Optional callables invoked as
                ``option(job_spec, trainer, self)``; an option may set
                ``job_spec["metadata"]["name"]`` to control the job name.

        Returns:
            The TrainJob name (caller-provided via options, or generated).

        Raises:
            ValueError: If ``trainer`` is not a CustomTrainer.
            Exception: Any creation failure is re-raised after best-effort
                cleanup of containers, network, and workdir created so far.
        """
        if runtime is None:
            runtime = self.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
        elif isinstance(runtime, str):
            runtime = self.get_runtime(runtime)

        # Process options to extract configuration
        name = None
        if options:
            job_spec = {}
            for option in options:
                option(job_spec, trainer, self)

            metadata_section = job_spec.get("metadata", {})
            name = metadata_section.get("name")

        if not isinstance(trainer, types.CustomTrainer):
            raise ValueError(f"{self.__class__.__name__} supports only CustomTrainer in v1")

        # Generate train job name if not provided via options.
        # Leading lowercase letter keeps the name DNS-label-friendly.
        trainjob_name = name or (
            random.choice(string.ascii_lowercase)
            + uuid.uuid4().hex[: constants.JOB_NAME_UUID_LENGTH]
        )

        logger.debug(f"Starting training job: {trainjob_name}")
        try:
            # Create per-job working directory on host (for outputs, checkpoints, etc.)
            workdir = container_utils.create_workdir(trainjob_name)
            logger.debug(f"Created working directory: {workdir}")

            # Generate training script code (inline, not written to disk)
            training_script_code = container_utils.get_training_script_code(trainer)
            logger.debug("Generated training script code")

            # Get the image from the trainer or runtime.
            image = trainer.image if trainer.image else runtime.trainer.image
            logger.debug(f"Using image: {image}")

            container_utils.maybe_pull_image(self._adapter, image, self.cfg.pull_policy)
            logger.debug(f"Image ready: {image}")

            # Build base environment
            env = container_utils.build_environment(trainer)

            # Construct pre-run command to install packages
            pre_install_cmd = container_utils.build_pip_install_cmd(trainer)

            # Create network for multi-node communication
            num_nodes = trainer.num_nodes or runtime.trainer.num_nodes or 1
            logger.debug(f"Creating network for {num_nodes} nodes")

            # Determine number of processes per node from GPU count
            # For GPU training: spawn one process per GPU for optimal utilization
            # For CPU training: use single process (PyTorch parallelizes internally via threads)
            nproc_per_node = 1  # Default for CPU training
            if trainer.resources_per_node and "gpu" in trainer.resources_per_node:
                try:
                    nproc_per_node = int(trainer.resources_per_node["gpu"])
                    logger.debug(f"Using {nproc_per_node} processes per node (1 per GPU)")
                except (ValueError, TypeError):
                    logger.warning(
                        f"Invalid GPU count in resources_per_node: "
                        f"{trainer.resources_per_node['gpu']}, defaulting to 1 process per node"
                    )
            else:
                logger.debug("No GPU specified, using 1 process per node")

            # The network's labels also carry job metadata (runtime, workdir)
            # so other APIs can rebuild TrainJob state from the runtime alone.
            network_id = self._adapter.create_network(
                name=f"{trainjob_name}-net",
                labels={
                    f"{self.label_prefix}/trainjob-name": trainjob_name,
                    f"{self.label_prefix}/runtime-name": runtime.name,
                    f"{self.label_prefix}/workdir": workdir,
                },
            )
            logger.debug(f"Created network: {network_id}")

            # Create N containers (one per node)
            container_ids: list[str] = []
            master_container_id = None
            master_ip = None

            for rank in range(num_nodes):
                container_name = f"{trainjob_name}-node-{rank}"

                # Get master address and port for torchrun
                master_port = 29500

                # For Podman: use IP address to avoid DNS timing issues
                # For Docker: use hostname (DNS is reliable)
                if rank == 0:
                    # Master node - will be created first
                    master_addr = f"{trainjob_name}-node-0"
                else:
                    # Worker nodes - determine master address based on runtime
                    if self._runtime_type == "podman" and master_ip:
                        master_addr = master_ip
                        logger.debug(f"Using master IP address for Podman: {master_ip}")
                    else:
                        master_addr = f"{trainjob_name}-node-0"
                        logger.debug(f"Using master hostname: {master_addr}")

                # Prefer torchrun; fall back to python if torchrun is unavailable
                # For worker nodes, wait for master to be reachable before starting torchrun
                # (polls the master's TCP port via bash's /dev/tcp, up to 60 x 2s).
                wait_for_master = ""
                if rank > 0:
                    wait_for_master = (
                        f"echo 'Waiting for master node {master_addr}:{master_port}...'; "
                        f"for i in {{1..60}}; do "
                        f"  if timeout 1 bash -c 'cat < /dev/null > "
                        f"/dev/tcp/{master_addr}/{master_port}' 2>/dev/null; then "
                        f"    echo 'Master node is reachable'; break; "
                        f"  fi; "
                        f"  if [ $i -eq 60 ]; then "
                        f"echo 'Timeout waiting for master node'; exit 1; fi; "
                        f"  sleep 2; "
                        f"done; "
                    )

                # Embed training script inline using heredoc (no file I/O on host)
                entry_cmd = (
                    f"{pre_install_cmd}"
                    f"{wait_for_master}"
                    f"cat > /tmp/train.py << 'TRAINING_SCRIPT_EOF'\n"
                    f"{training_script_code}\n"
                    f"TRAINING_SCRIPT_EOF\n"
                    "if command -v torchrun >/dev/null 2>&1; then "
                    f"  torchrun --nproc_per_node={nproc_per_node} --nnodes={num_nodes} "
                    f"  --node-rank={rank} --rdzv-backend=static "
                    f"  --rdzv-endpoint={master_addr}:{master_port} "
                    f"  /tmp/train.py; "
                    "else "
                    f"  python /tmp/train.py; "
                    "fi"
                )

                full_cmd = ["bash", "-lc", entry_cmd]

                labels = {
                    f"{self.label_prefix}/trainjob-name": trainjob_name,
                    f"{self.label_prefix}/step": f"node-{rank}",
                    f"{self.label_prefix}/network-id": network_id,
                }

                # Mount the per-job host workdir into the container workspace.
                volumes = {
                    workdir: {
                        "bind": constants.WORKSPACE_PATH,
                        "mode": "rw",
                    }
                }

                logger.debug(f"Creating container {rank}/{num_nodes}: {container_name}")

                container_id = self._adapter.create_and_start_container(
                    image=image,
                    command=full_cmd,
                    name=container_name,
                    network_id=network_id,
                    environment=env,
                    labels=labels,
                    volumes=volumes,
                    working_dir=constants.WORKSPACE_PATH,
                )

                logger.debug(f"Started container {container_name} (ID: {container_id[:12]})")
                container_ids.append(container_id)

                # If this is the master node and we're using Podman, get its IP address
                if rank == 0:
                    master_container_id = container_id
                    if self._runtime_type == "podman":
                        # Get master IP for worker nodes to use
                        master_ip = self._adapter.get_container_ip(master_container_id, network_id)
                        if master_ip:
                            logger.debug(f"Master node IP address: {master_ip}")
                        else:
                            logger.warning(
                                "Could not retrieve master IP address. "
                                "Worker nodes will fall back to DNS resolution."
                            )

            logger.debug(
                f"Training job {trainjob_name} created successfully with "
                f"{len(container_ids)} container(s)"
            )
            return trainjob_name

        except Exception as e:
            # Clean up on failure
            logger.error(f"Failed to create training job {trainjob_name}: {e}")
            logger.exception("Full traceback:")

            # Try to clean up any resources that were created
            from contextlib import suppress

            try:
                # Stop and remove any containers that were created.
                # locals() checks guard against failures before each resource
                # was assigned (e.g. network creation failed).
                if "container_ids" in locals():
                    for container_id in container_ids:
                        with suppress(Exception):
                            self._adapter.stop_container(container_id, timeout=5)
                            self._adapter.remove_container(container_id, force=True)

                # Remove network if it was created
                if "network_id" in locals():
                    with suppress(Exception):
                        self._adapter.delete_network(network_id)

                # Remove working directory if it was created
                if "workdir" in locals() and os.path.isdir(workdir):
                    shutil.rmtree(workdir, ignore_errors=True)

            except Exception as cleanup_error:
                logger.error(f"Error during cleanup: {cleanup_error}")

            # Re-raise the original exception
            raise
|
|
449
|
+
|
|
450
|
+
def _get_job_containers(self, name: str) -> list[dict]:
|
|
451
|
+
"""
|
|
452
|
+
Get containers for a specific training job.
|
|
453
|
+
|
|
454
|
+
Args:
|
|
455
|
+
name: Name of the training job
|
|
456
|
+
|
|
457
|
+
Returns:
|
|
458
|
+
List of container dictionaries for this job
|
|
459
|
+
|
|
460
|
+
Raises:
|
|
461
|
+
ValueError: If no containers found for the job
|
|
462
|
+
"""
|
|
463
|
+
filters = {"label": [f"{self.label_prefix}/trainjob-name={name}"]}
|
|
464
|
+
containers = self._adapter.list_containers(filters=filters)
|
|
465
|
+
|
|
466
|
+
if not containers:
|
|
467
|
+
raise ValueError(f"No TrainJob with name {name}")
|
|
468
|
+
|
|
469
|
+
return containers
|
|
470
|
+
|
|
471
|
+
    def __get_trainjob_from_containers(
        self, job_name: str, containers: list[dict]
    ) -> types.TrainJob:
        """
        Build a TrainJob object from a list of containers.

        Job-level metadata (runtime name) lives on the per-job network's
        labels, so the network is resolved via the first container's
        network-id label.

        Args:
            job_name: Name of the training job
            containers: List of container dictionaries for this job

        Returns:
            TrainJob object

        Raises:
            ValueError: If network metadata is missing or runtime not found
        """
        if not containers:
            raise ValueError(f"No containers found for TrainJob {job_name}")

        # Get metadata from network
        network_id = containers[0]["labels"].get(f"{self.label_prefix}/network-id")
        if not network_id:
            raise ValueError(f"TrainJob {job_name} is missing network metadata")

        network_info = self._adapter.get_network(network_id)
        if not network_info:
            raise ValueError(f"TrainJob {job_name} network not found")

        network_labels = network_info.get("labels", {})
        runtime_name = network_labels.get(f"{self.label_prefix}/runtime-name")

        # Get runtime object; any lookup failure is surfaced as ValueError
        # so callers get a uniform error type.
        try:
            job_runtime = self.get_runtime(runtime_name) if runtime_name else None
        except Exception as e:
            raise ValueError(f"Runtime {runtime_name} not found for job {job_name}") from e

        if not job_runtime:
            raise ValueError(f"Runtime {runtime_name} not found for job {job_name}")

        # Parse creation timestamp from first container; fall back to "now"
        # when the runtime reports an unparseable timestamp.
        created_str = containers[0].get("created", "")
        try:
            from dateutil import parser

            creation_timestamp = parser.isoparse(created_str)
        except Exception:
            creation_timestamp = datetime.now()

        # Build steps from containers, sorted by name so node ranks are ordered
        steps = []
        for container in sorted(containers, key=lambda c: c["name"]):
            step_name = container["labels"].get(f"{self.label_prefix}/step", "")
            steps.append(
                types.Step(
                    name=step_name,
                    pod_name=container["name"],
                    status=container_utils.get_container_status(self._adapter, container["id"]),
                )
            )

        # Get num_nodes from container count
        num_nodes = len(containers)

        return types.TrainJob(
            name=job_name,
            creation_timestamp=creation_timestamp,
            runtime=job_runtime,
            steps=steps,
            num_nodes=num_nodes,
            status=container_utils.aggregate_container_statuses(self._adapter, containers),
        )
|
|
543
|
+
|
|
544
|
+
def list_jobs(self, runtime: Optional[types.Runtime] = None) -> list[types.TrainJob]:
|
|
545
|
+
"""List all training jobs by querying container runtime."""
|
|
546
|
+
# Get all containers with our label prefix
|
|
547
|
+
filters = {"label": [f"{self.label_prefix}/trainjob-name"]}
|
|
548
|
+
containers = self._adapter.list_containers(filters=filters)
|
|
549
|
+
|
|
550
|
+
# Group containers by job name
|
|
551
|
+
jobs_map: dict[str, list[dict]] = {}
|
|
552
|
+
for container in containers:
|
|
553
|
+
job_name = container["labels"].get(f"{self.label_prefix}/trainjob-name")
|
|
554
|
+
if job_name:
|
|
555
|
+
if job_name not in jobs_map:
|
|
556
|
+
jobs_map[job_name] = []
|
|
557
|
+
jobs_map[job_name].append(container)
|
|
558
|
+
|
|
559
|
+
result: list[types.TrainJob] = []
|
|
560
|
+
for job_name, job_containers in jobs_map.items():
|
|
561
|
+
# Skip jobs with no containers
|
|
562
|
+
if not job_containers:
|
|
563
|
+
continue
|
|
564
|
+
|
|
565
|
+
# Filter by runtime if specified
|
|
566
|
+
if runtime:
|
|
567
|
+
network_id = job_containers[0]["labels"].get(f"{self.label_prefix}/network-id")
|
|
568
|
+
if network_id:
|
|
569
|
+
network_info = self._adapter.get_network(network_id)
|
|
570
|
+
if network_info:
|
|
571
|
+
network_labels = network_info.get("labels", {})
|
|
572
|
+
runtime_name = network_labels.get(f"{self.label_prefix}/runtime-name")
|
|
573
|
+
if runtime_name != runtime.name:
|
|
574
|
+
continue
|
|
575
|
+
|
|
576
|
+
# Build TrainJob from containers
|
|
577
|
+
try:
|
|
578
|
+
result.append(self.__get_trainjob_from_containers(job_name, job_containers))
|
|
579
|
+
except Exception as e:
|
|
580
|
+
logger.warning(f"Failed to get TrainJob {job_name}: {e}")
|
|
581
|
+
continue
|
|
582
|
+
|
|
583
|
+
return result
|
|
584
|
+
|
|
585
|
+
def get_job(self, name: str) -> types.TrainJob:
|
|
586
|
+
"""Get a specific training job by querying container runtime."""
|
|
587
|
+
containers = self._get_job_containers(name)
|
|
588
|
+
return self.__get_trainjob_from_containers(name, containers)
|
|
589
|
+
|
|
590
|
+
    def get_job_logs(
        self,
        name: str,
        follow: bool = False,
        step: str = constants.NODE + "-0",
    ) -> Iterator[str]:
        """Get logs for a training job by querying container runtime.

        Args:
            name: TrainJob name.
            follow: If True, stream logs as they are produced.
            step: Step (node) to fetch logs for.
                # NOTE(review): the default value ("node-0") is treated as an
                # "all steps" sentinel below, so explicitly requesting node-0
                # also yields every node's logs — confirm this is intended.

        Yields:
            Log lines; if fetching a container's logs fails, a single error
            line is yielded for it instead of raising.
        """
        containers = self._get_job_containers(name)

        # The default step value doubles as the "stream all nodes" sentinel.
        want_all = step == constants.NODE + "-0"
        for container in sorted(containers, key=lambda c: c["name"]):
            container_step = container["labels"].get(f"{self.label_prefix}/step", "")
            if not want_all and container_step != step:
                continue
            try:
                yield from self._adapter.container_logs(container["id"], follow)
            except Exception as e:
                logger.warning(f"Failed to get logs for {container['name']}: {e}")
                yield f"Error getting logs: {e}\n"
|
|
609
|
+
|
|
610
|
+
def wait_for_job_status(
|
|
611
|
+
self,
|
|
612
|
+
name: str,
|
|
613
|
+
status: set[str] = {constants.TRAINJOB_COMPLETE},
|
|
614
|
+
timeout: int = 600,
|
|
615
|
+
polling_interval: int = 2,
|
|
616
|
+
callbacks: Optional[list[Callable[[types.TrainJob], None]]] = None,
|
|
617
|
+
) -> types.TrainJob:
|
|
618
|
+
import time
|
|
619
|
+
|
|
620
|
+
end = time.time() + timeout
|
|
621
|
+
while time.time() < end:
|
|
622
|
+
tj = self.get_job(name)
|
|
623
|
+
logger.debug(f"TrainJob {name}, status {tj.status}")
|
|
624
|
+
|
|
625
|
+
# Invoke callbacks if provided
|
|
626
|
+
if callbacks:
|
|
627
|
+
for callback in callbacks:
|
|
628
|
+
callback(tj)
|
|
629
|
+
|
|
630
|
+
if tj.status in status:
|
|
631
|
+
return tj
|
|
632
|
+
if constants.TRAINJOB_FAILED not in status and tj.status == constants.TRAINJOB_FAILED:
|
|
633
|
+
raise RuntimeError(f"TrainJob {name} is Failed")
|
|
634
|
+
time.sleep(polling_interval)
|
|
635
|
+
raise TimeoutError(f"Timeout waiting for TrainJob {name} to reach status: {status}")
|
|
636
|
+
|
|
637
|
+
def delete_job(self, name: str):
|
|
638
|
+
"""Delete a training job by querying container runtime."""
|
|
639
|
+
containers = self._get_job_containers(name)
|
|
640
|
+
|
|
641
|
+
# Get network_id and workdir from labels
|
|
642
|
+
network_id = containers[0]["labels"].get(f"{self.label_prefix}/network-id")
|
|
643
|
+
|
|
644
|
+
# Get workdir from network labels
|
|
645
|
+
workdir_host = None
|
|
646
|
+
if network_id:
|
|
647
|
+
network_info = self._adapter.get_network(network_id)
|
|
648
|
+
if network_info:
|
|
649
|
+
network_labels = network_info.get("labels", {})
|
|
650
|
+
workdir_host = network_labels.get(f"{self.label_prefix}/workdir")
|
|
651
|
+
|
|
652
|
+
# Stop containers and remove
|
|
653
|
+
from contextlib import suppress
|
|
654
|
+
|
|
655
|
+
for container in containers:
|
|
656
|
+
with suppress(Exception):
|
|
657
|
+
self._adapter.stop_container(container["id"], timeout=10)
|
|
658
|
+
with suppress(Exception):
|
|
659
|
+
self._adapter.remove_container(container["id"], force=True)
|
|
660
|
+
|
|
661
|
+
# Remove network (best-effort)
|
|
662
|
+
if network_id:
|
|
663
|
+
with suppress(Exception):
|
|
664
|
+
self._adapter.delete_network(network_id)
|
|
665
|
+
|
|
666
|
+
# Remove working directory if configured
|
|
667
|
+
if self.cfg.auto_remove and workdir_host and os.path.isdir(workdir_host):
|
|
668
|
+
shutil.rmtree(workdir_host, ignore_errors=True)
|