viettelcloud-aiplatform 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. viettelcloud/__init__.py +1 -0
  2. viettelcloud/aiplatform/__init__.py +15 -0
  3. viettelcloud/aiplatform/common/__init__.py +0 -0
  4. viettelcloud/aiplatform/common/constants.py +22 -0
  5. viettelcloud/aiplatform/common/types.py +28 -0
  6. viettelcloud/aiplatform/common/utils.py +40 -0
  7. viettelcloud/aiplatform/hub/OWNERS +14 -0
  8. viettelcloud/aiplatform/hub/__init__.py +25 -0
  9. viettelcloud/aiplatform/hub/api/__init__.py +13 -0
  10. viettelcloud/aiplatform/hub/api/_proxy_client.py +355 -0
  11. viettelcloud/aiplatform/hub/api/model_registry_client.py +561 -0
  12. viettelcloud/aiplatform/hub/api/model_registry_client_test.py +462 -0
  13. viettelcloud/aiplatform/optimizer/__init__.py +45 -0
  14. viettelcloud/aiplatform/optimizer/api/__init__.py +0 -0
  15. viettelcloud/aiplatform/optimizer/api/optimizer_client.py +248 -0
  16. viettelcloud/aiplatform/optimizer/backends/__init__.py +13 -0
  17. viettelcloud/aiplatform/optimizer/backends/base.py +77 -0
  18. viettelcloud/aiplatform/optimizer/backends/kubernetes/__init__.py +13 -0
  19. viettelcloud/aiplatform/optimizer/backends/kubernetes/backend.py +563 -0
  20. viettelcloud/aiplatform/optimizer/backends/kubernetes/utils.py +112 -0
  21. viettelcloud/aiplatform/optimizer/constants/__init__.py +13 -0
  22. viettelcloud/aiplatform/optimizer/constants/constants.py +59 -0
  23. viettelcloud/aiplatform/optimizer/types/__init__.py +13 -0
  24. viettelcloud/aiplatform/optimizer/types/algorithm_types.py +87 -0
  25. viettelcloud/aiplatform/optimizer/types/optimization_types.py +135 -0
  26. viettelcloud/aiplatform/optimizer/types/search_types.py +95 -0
  27. viettelcloud/aiplatform/py.typed +0 -0
  28. viettelcloud/aiplatform/trainer/__init__.py +82 -0
  29. viettelcloud/aiplatform/trainer/api/__init__.py +3 -0
  30. viettelcloud/aiplatform/trainer/api/trainer_client.py +277 -0
  31. viettelcloud/aiplatform/trainer/api/trainer_client_test.py +72 -0
  32. viettelcloud/aiplatform/trainer/backends/__init__.py +0 -0
  33. viettelcloud/aiplatform/trainer/backends/base.py +94 -0
  34. viettelcloud/aiplatform/trainer/backends/container/adapters/base.py +195 -0
  35. viettelcloud/aiplatform/trainer/backends/container/adapters/docker.py +231 -0
  36. viettelcloud/aiplatform/trainer/backends/container/adapters/podman.py +258 -0
  37. viettelcloud/aiplatform/trainer/backends/container/backend.py +668 -0
  38. viettelcloud/aiplatform/trainer/backends/container/backend_test.py +867 -0
  39. viettelcloud/aiplatform/trainer/backends/container/runtime_loader.py +631 -0
  40. viettelcloud/aiplatform/trainer/backends/container/runtime_loader_test.py +637 -0
  41. viettelcloud/aiplatform/trainer/backends/container/types.py +67 -0
  42. viettelcloud/aiplatform/trainer/backends/container/utils.py +213 -0
  43. viettelcloud/aiplatform/trainer/backends/kubernetes/__init__.py +0 -0
  44. viettelcloud/aiplatform/trainer/backends/kubernetes/backend.py +710 -0
  45. viettelcloud/aiplatform/trainer/backends/kubernetes/backend_test.py +1344 -0
  46. viettelcloud/aiplatform/trainer/backends/kubernetes/constants.py +15 -0
  47. viettelcloud/aiplatform/trainer/backends/kubernetes/utils.py +636 -0
  48. viettelcloud/aiplatform/trainer/backends/kubernetes/utils_test.py +582 -0
  49. viettelcloud/aiplatform/trainer/backends/localprocess/__init__.py +0 -0
  50. viettelcloud/aiplatform/trainer/backends/localprocess/backend.py +306 -0
  51. viettelcloud/aiplatform/trainer/backends/localprocess/backend_test.py +501 -0
  52. viettelcloud/aiplatform/trainer/backends/localprocess/constants.py +90 -0
  53. viettelcloud/aiplatform/trainer/backends/localprocess/job.py +184 -0
  54. viettelcloud/aiplatform/trainer/backends/localprocess/types.py +52 -0
  55. viettelcloud/aiplatform/trainer/backends/localprocess/utils.py +302 -0
  56. viettelcloud/aiplatform/trainer/constants/__init__.py +0 -0
  57. viettelcloud/aiplatform/trainer/constants/constants.py +179 -0
  58. viettelcloud/aiplatform/trainer/options/__init__.py +52 -0
  59. viettelcloud/aiplatform/trainer/options/common.py +55 -0
  60. viettelcloud/aiplatform/trainer/options/kubernetes.py +502 -0
  61. viettelcloud/aiplatform/trainer/options/kubernetes_test.py +259 -0
  62. viettelcloud/aiplatform/trainer/options/localprocess.py +20 -0
  63. viettelcloud/aiplatform/trainer/test/common.py +22 -0
  64. viettelcloud/aiplatform/trainer/types/__init__.py +0 -0
  65. viettelcloud/aiplatform/trainer/types/types.py +517 -0
  66. viettelcloud/aiplatform/trainer/types/types_test.py +115 -0
  67. viettelcloud_aiplatform-0.3.0.dist-info/METADATA +226 -0
  68. viettelcloud_aiplatform-0.3.0.dist-info/RECORD +71 -0
  69. viettelcloud_aiplatform-0.3.0.dist-info/WHEEL +4 -0
  70. viettelcloud_aiplatform-0.3.0.dist-info/licenses/LICENSE +201 -0
  71. viettelcloud_aiplatform-0.3.0.dist-info/licenses/NOTICE +36 -0
viettelcloud/aiplatform/trainer/backends/container/backend.py
@@ -0,0 +1,668 @@
+ # Copyright 2025 The Kubeflow Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """
+ ContainerBackend
+ ----------------
+
+ Unified local execution backend for `CustomTrainer` jobs using containers.
+
+ This backend automatically detects and uses either Docker or Podman.
+ It provides a single interface regardless of the underlying container runtime.
+
+ Key behaviors:
+ - Auto-detection: Tries Docker first, then Podman. Can be overridden via config.
+ - Multi-node jobs: one container per node connected via a per-job network.
+ - Entry script generation: we serialize the user's training function and embed it
+   inline in the container command using a heredoc (no file I/O on the host). The
+   script is created inside the container at /tmp/train.py and invoked using
+   `torchrun` (preferred) or `python` as a fallback.
+ - Runtimes: we use `config/training_runtimes` to define runtime images and
+   characteristics (e.g., torch). Defaults to `torch-distributed` if no runtime
+   is provided.
+ - Image pulling: controlled via `pull_policy` and performed automatically if
+   needed.
+ - Logs and lifecycle: streaming logs and deletion semantics similar to the
+   Docker/Podman backends, but with automatic runtime detection.
+ """
+
+ from collections.abc import Callable, Iterator
+ from datetime import datetime
+ import logging
+ import os
+ import random
+ import shutil
+ import string
+ from typing import Optional, Union
+ import uuid
+
+ from viettelcloud.aiplatform.trainer.backends.base import RuntimeBackend
+ from viettelcloud.aiplatform.trainer.backends.container import utils as container_utils
+ from viettelcloud.aiplatform.trainer.backends.container.adapters.base import (
+     BaseContainerClientAdapter,
+ )
+ from viettelcloud.aiplatform.trainer.backends.container.adapters.docker import DockerClientAdapter
+ from viettelcloud.aiplatform.trainer.backends.container.adapters.podman import PodmanClientAdapter
+ from viettelcloud.aiplatform.trainer.backends.container.runtime_loader import (
+     get_training_runtime_from_sources,
+     list_training_runtimes_from_sources,
+ )
+ from viettelcloud.aiplatform.trainer.backends.container.types import ContainerBackendConfig
+ from viettelcloud.aiplatform.trainer.constants import constants
+ from viettelcloud.aiplatform.trainer.types import types
+
+ logger = logging.getLogger(__name__)
+
+
+ class ContainerBackend(RuntimeBackend):
+     """
+     Unified container backend that auto-detects Docker or Podman.
+
+     This backend uses the adapter pattern to abstract away differences between
+     Docker and Podman, providing a single consistent interface.
+     """
+
+     def __init__(self, cfg: ContainerBackendConfig):
+         self.cfg = cfg
+         self.label_prefix = "trainer.kubeflow.org"
+
+         # Initialize the container client adapter
+         self._adapter = self._create_adapter()
+
+     def _get_common_socket_locations(self, runtime_name: str) -> list[Optional[str]]:
+         """
+         Get common socket locations to try for the given runtime.
+
+         Args:
+             runtime_name: "docker" or "podman"
+
+         Returns:
+             List of socket URLs to try, including None (for default)
+         """
+         import os
+         from pathlib import Path
+
+         locations = [self.cfg.container_host] if self.cfg.container_host else []
+
+         if runtime_name == "docker":
+             # Common Docker socket locations
+             colima_sock = Path.home() / ".colima/default/docker.sock"
+             if colima_sock.exists():
+                 locations.append(f"unix://{colima_sock}")
+             # Standard Docker socket
+             locations.append(None)  # Use docker.from_env() default
+
+         elif runtime_name == "podman":
+             # Common Podman socket locations on macOS
+             uid = os.getuid() if hasattr(os, "getuid") else None
+             if uid:
+                 user_sock = f"/run/user/{uid}/podman/podman.sock"
+                 if Path(user_sock).exists():
+                     locations.append(f"unix://{user_sock}")
+             # Standard Podman socket
+             locations.append(None)  # Use PodmanClient() default
+
+         # Remove duplicates while preserving order
+         seen = set()
+         unique_locations = []
+         for loc in locations:
+             if loc not in seen:
+                 unique_locations.append(loc)
+                 seen.add(loc)
+
+         return unique_locations
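For orientation, a hedged sketch of the probe order this method produces; it assumes a Linux host with uid 1000, a running rootless Podman socket, and a config whose container_host is left unset.

# Illustrative only: inspect the probe order without connecting to a runtime.
from viettelcloud.aiplatform.trainer.backends.container.backend import ContainerBackend
from viettelcloud.aiplatform.trainer.backends.container.types import ContainerBackendConfig

backend = ContainerBackend.__new__(ContainerBackend)  # bypass __init__ so no daemon is needed
backend.cfg = ContainerBackendConfig()                # assumes container_host defaults to None

print(backend._get_common_socket_locations("podman"))
# e.g. ['unix:///run/user/1000/podman/podman.sock', None]   (None = client library default)
print(backend._get_common_socket_locations("docker"))
# e.g. ['unix:///home/me/.colima/default/docker.sock', None] when a Colima socket exists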
+
+     def _create_adapter(self) -> BaseContainerClientAdapter:
+         """
+         Create the appropriate container client adapter.
+
+         Tries Docker first, then Podman if Docker fails, unless a specific
+         runtime is requested in the config. Automatically tries common socket
+         locations (e.g., Colima for Docker on macOS, user socket for Podman).
+
+         Raises RuntimeError if neither Docker nor Podman are available.
+         """
+         runtime_map = {
+             "docker": DockerClientAdapter,
+             "podman": PodmanClientAdapter,
+         }
+
+         # Determine which runtimes to try
+         runtimes_to_try = (
+             [self.cfg.container_runtime] if self.cfg.container_runtime else ["docker", "podman"]
+         )
+
+         attempted_connections = []
+         last_error = None
+
+         for runtime_name in runtimes_to_try:
+             if runtime_name not in runtime_map:
+                 continue
+
+             # Try common socket locations for this runtime
+             socket_locations = self._get_common_socket_locations(runtime_name)
+
+             for host in socket_locations:
+                 try:
+                     adapter = runtime_map[runtime_name](host)
+                     adapter.ping()
+                     host_display = host or "default"
+                     logger.debug(
+                         f"Using {runtime_name} as container runtime (host: {host_display})"
+                     )
+                     return adapter
+                 except Exception as e:
+                     host_str = host or "default"
+                     logger.debug(f"{runtime_name} initialization failed at {host_str}: {e}")
+                     attempted_connections.append(f"{runtime_name} at {host_str}")
+                     last_error = e
+
+         # Build helpful error message
+         import platform
+
+         system = platform.system()
+
+         attempted = ", ".join(attempted_connections)
+         error_msg = f"Could not connect to Docker or Podman (tried: {attempted}).\n"
+
+         if system == "Darwin":  # macOS
+             error_msg += (
+                 "Ensure Docker/Podman is running "
+                 "(e.g., 'colima start' or 'podman machine start').\n"
+             )
+         else:
+             error_msg += "Ensure Docker/Podman is installed and running.\n"
+
+         error_msg += (
+             "To specify a custom socket: ContainerBackendConfig(container_host='unix:///path/to/socket')\n"
+             "Or use LocalProcessBackendConfig for non-containerized execution."
+         )
+
+         raise RuntimeError(error_msg) from last_error
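When auto-detection picks the wrong runtime or socket, the config fields read above (container_runtime, container_host) let you pin the choice explicitly; a hedged sketch, assuming the config type accepts these as keyword arguments:

# Hedged sketch: force Podman over a specific rootless socket instead of auto-detection.
from viettelcloud.aiplatform.trainer.backends.container.backend import ContainerBackend
from viettelcloud.aiplatform.trainer.backends.container.types import ContainerBackendConfig

cfg = ContainerBackendConfig(
    container_runtime="podman",                                 # skip the Docker attempt entirely
    container_host="unix:///run/user/1000/podman/podman.sock",  # probed before the defaults
)
backend = ContainerBackend(cfg)  # raises RuntimeError if the socket is not reachable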
+
+     @property
+     def _runtime_type(self) -> str:
+         """Get the runtime type for debugging/logging."""
+         return self._adapter._runtime_type
+
+     # ---- Runtime APIs ----
+     def list_runtimes(self) -> list[types.Runtime]:
+         return list_training_runtimes_from_sources(self.cfg.runtime_source.sources)
+
+     def get_runtime(self, name: str) -> types.Runtime:
+         return get_training_runtime_from_sources(name, self.cfg.runtime_source.sources)
+
+     def get_runtime_packages(self, runtime: types.Runtime):
+         """
+         Spawn a short-lived container to report Python version, pip list, and nvidia-smi.
+         """
+         container_utils.maybe_pull_image(self._adapter, runtime.trainer.image, self.cfg.pull_policy)
+
+         command = [
+             "bash",
+             "-lc",
+             "python -c \"import sys; print(f'Python: {sys.version}')\" && "
+             "(pip list || echo 'pip not found') && "
+             "(nvidia-smi || echo 'nvidia-smi not found')",
+         ]
+
+         logs = self._adapter.run_oneoff_container(image=runtime.trainer.image, command=command)
+         print(logs)
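A short hedged example of inspecting what a runtime image ships before launching a job, reusing `backend` from the earlier sketch; the exact report depends on the image.

# Hedged usage sketch: print Python version, pip packages, and nvidia-smi output
# for the default runtime image (pulled first if pull_policy requires it).
runtime = backend.get_runtime("torch-distributed")
backend.get_runtime_packages(runtime)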
+
+     def train(
+         self,
+         runtime: Optional[Union[str, types.Runtime]] = None,
+         initializer: Optional[types.Initializer] = None,
+         trainer: Optional[
+             Union[types.CustomTrainer, types.CustomTrainerContainer, types.BuiltinTrainer]
+         ] = None,
+         options: Optional[list] = None,
+     ) -> str:
+         if runtime is None:
+             runtime = self.get_runtime(constants.DEFAULT_TRAINING_RUNTIME)
+         elif isinstance(runtime, str):
+             runtime = self.get_runtime(runtime)
+
+         # Process options to extract configuration
+         name = None
+         if options:
+             job_spec = {}
+             for option in options:
+                 option(job_spec, trainer, self)
+
+             metadata_section = job_spec.get("metadata", {})
+             name = metadata_section.get("name")
+
+         if not isinstance(trainer, types.CustomTrainer):
+             raise ValueError(f"{self.__class__.__name__} supports only CustomTrainer in v1")
+
+         # Generate train job name if not provided via options
+         trainjob_name = name or (
+             random.choice(string.ascii_lowercase)
+             + uuid.uuid4().hex[: constants.JOB_NAME_UUID_LENGTH]
+         )
+
+         logger.debug(f"Starting training job: {trainjob_name}")
+         try:
+             # Create per-job working directory on host (for outputs, checkpoints, etc.)
+             workdir = container_utils.create_workdir(trainjob_name)
+             logger.debug(f"Created working directory: {workdir}")
+
+             # Generate training script code (inline, not written to disk)
+             training_script_code = container_utils.get_training_script_code(trainer)
+             logger.debug("Generated training script code")
+
+             # Get the image from the trainer or runtime.
+             image = trainer.image if trainer.image else runtime.trainer.image
+             logger.debug(f"Using image: {image}")
+
+             container_utils.maybe_pull_image(self._adapter, image, self.cfg.pull_policy)
+             logger.debug(f"Image ready: {image}")
+
+             # Build base environment
+             env = container_utils.build_environment(trainer)
+
+             # Construct pre-run command to install packages
+             pre_install_cmd = container_utils.build_pip_install_cmd(trainer)
+
+             # Create network for multi-node communication
+             num_nodes = trainer.num_nodes or runtime.trainer.num_nodes or 1
+             logger.debug(f"Creating network for {num_nodes} nodes")
+
+             # Determine number of processes per node from GPU count
+             # For GPU training: spawn one process per GPU for optimal utilization
+             # For CPU training: use single process (PyTorch parallelizes internally via threads)
+             nproc_per_node = 1  # Default for CPU training
+             if trainer.resources_per_node and "gpu" in trainer.resources_per_node:
+                 try:
+                     nproc_per_node = int(trainer.resources_per_node["gpu"])
+                     logger.debug(f"Using {nproc_per_node} processes per node (1 per GPU)")
+                 except (ValueError, TypeError):
+                     logger.warning(
+                         f"Invalid GPU count in resources_per_node: "
+                         f"{trainer.resources_per_node['gpu']}, defaulting to 1 process per node"
+                     )
+             else:
+                 logger.debug("No GPU specified, using 1 process per node")
+
+             network_id = self._adapter.create_network(
+                 name=f"{trainjob_name}-net",
+                 labels={
+                     f"{self.label_prefix}/trainjob-name": trainjob_name,
+                     f"{self.label_prefix}/runtime-name": runtime.name,
+                     f"{self.label_prefix}/workdir": workdir,
+                 },
+             )
+             logger.debug(f"Created network: {network_id}")
+
+             # Create N containers (one per node)
+             container_ids: list[str] = []
+             master_container_id = None
+             master_ip = None
+
+             for rank in range(num_nodes):
+                 container_name = f"{trainjob_name}-node-{rank}"
+
+                 # Get master address and port for torchrun
+                 master_port = 29500
+
+                 # For Podman: use IP address to avoid DNS timing issues
+                 # For Docker: use hostname (DNS is reliable)
+                 if rank == 0:
+                     # Master node - will be created first
+                     master_addr = f"{trainjob_name}-node-0"
+                 else:
+                     # Worker nodes - determine master address based on runtime
+                     if self._runtime_type == "podman" and master_ip:
+                         master_addr = master_ip
+                         logger.debug(f"Using master IP address for Podman: {master_ip}")
+                     else:
+                         master_addr = f"{trainjob_name}-node-0"
+                         logger.debug(f"Using master hostname: {master_addr}")
+
+                 # Prefer torchrun; fall back to python if torchrun is unavailable
+                 # For worker nodes, wait for master to be reachable before starting torchrun
+                 wait_for_master = ""
+                 if rank > 0:
+                     wait_for_master = (
+                         f"echo 'Waiting for master node {master_addr}:{master_port}...'; "
+                         f"for i in {{1..60}}; do "
+                         f"  if timeout 1 bash -c 'cat < /dev/null > "
+                         f"/dev/tcp/{master_addr}/{master_port}' 2>/dev/null; then "
+                         f"    echo 'Master node is reachable'; break; "
+                         f"  fi; "
+                         f"  if [ $i -eq 60 ]; then "
+                         f"echo 'Timeout waiting for master node'; exit 1; fi; "
+                         f"  sleep 2; "
+                         f"done; "
+                     )
+
+                 # Embed training script inline using heredoc (no file I/O on host)
+                 entry_cmd = (
+                     f"{pre_install_cmd}"
+                     f"{wait_for_master}"
+                     f"cat > /tmp/train.py << 'TRAINING_SCRIPT_EOF'\n"
+                     f"{training_script_code}\n"
+                     f"TRAINING_SCRIPT_EOF\n"
+                     "if command -v torchrun >/dev/null 2>&1; then "
+                     f"  torchrun --nproc_per_node={nproc_per_node} --nnodes={num_nodes} "
+                     f"  --node-rank={rank} --rdzv-backend=static "
+                     f"  --rdzv-endpoint={master_addr}:{master_port} "
+                     f"  /tmp/train.py; "
+                     "else "
+                     f"  python /tmp/train.py; "
+                     "fi"
+                 )
+
+                 full_cmd = ["bash", "-lc", entry_cmd]
+
+                 labels = {
+                     f"{self.label_prefix}/trainjob-name": trainjob_name,
+                     f"{self.label_prefix}/step": f"node-{rank}",
+                     f"{self.label_prefix}/network-id": network_id,
+                 }
+
+                 volumes = {
+                     workdir: {
+                         "bind": constants.WORKSPACE_PATH,
+                         "mode": "rw",
+                     }
+                 }
+
+                 logger.debug(f"Creating container {rank}/{num_nodes}: {container_name}")
+
+                 container_id = self._adapter.create_and_start_container(
+                     image=image,
+                     command=full_cmd,
+                     name=container_name,
+                     network_id=network_id,
+                     environment=env,
+                     labels=labels,
+                     volumes=volumes,
+                     working_dir=constants.WORKSPACE_PATH,
+                 )
+
+                 logger.debug(f"Started container {container_name} (ID: {container_id[:12]})")
+                 container_ids.append(container_id)
+
+                 # If this is the master node and we're using Podman, get its IP address
+                 if rank == 0:
+                     master_container_id = container_id
+                     if self._runtime_type == "podman":
+                         # Get master IP for worker nodes to use
+                         master_ip = self._adapter.get_container_ip(master_container_id, network_id)
+                         if master_ip:
+                             logger.debug(f"Master node IP address: {master_ip}")
+                         else:
+                             logger.warning(
+                                 "Could not retrieve master IP address. "
+                                 "Worker nodes will fall back to DNS resolution."
+                             )
+
+             logger.debug(
+                 f"Training job {trainjob_name} created successfully with "
+                 f"{len(container_ids)} container(s)"
+             )
+             return trainjob_name
+
+         except Exception as e:
+             # Clean up on failure
+             logger.error(f"Failed to create training job {trainjob_name}: {e}")
+             logger.exception("Full traceback:")
+
+             # Try to clean up any resources that were created
+             from contextlib import suppress
+
+             try:
+                 # Stop and remove any containers that were created
+                 if "container_ids" in locals():
+                     for container_id in container_ids:
+                         with suppress(Exception):
+                             self._adapter.stop_container(container_id, timeout=5)
+                             self._adapter.remove_container(container_id, force=True)
+
+                 # Remove network if it was created
+                 if "network_id" in locals():
+                     with suppress(Exception):
+                         self._adapter.delete_network(network_id)
+
+                 # Remove working directory if it was created
+                 if "workdir" in locals() and os.path.isdir(workdir):
+                     shutil.rmtree(workdir, ignore_errors=True)
+
+             except Exception as cleanup_error:
+                 logger.error(f"Error during cleanup: {cleanup_error}")
+
+             # Re-raise the original exception
+             raise
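Putting the pieces above together, a hedged end-to-end sketch of launching a two-node job on the default runtime, reusing `backend` and `types` from the earlier sketch; the CustomTrainer field names (`func`, `num_nodes`) are inferred from their use in this file and in container_utils, so treat them as assumptions.

# Hedged sketch: a two-node CPU job; each node runs
#   [pip install step] -> [workers wait for node-0] -> heredoc /tmp/train.py -> torchrun (or python)
def train_fn():
    import torch.distributed as dist

    dist.init_process_group(backend="gloo")
    print(f"rank={dist.get_rank()} world_size={dist.get_world_size()}")
    dist.destroy_process_group()


job = backend.train(
    trainer=types.CustomTrainer(  # 'func' / 'num_nodes' are assumed field names
        func=train_fn,
        num_nodes=2,              # one container per node on a per-job network
    ),
)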
+
+     def _get_job_containers(self, name: str) -> list[dict]:
+         """
+         Get containers for a specific training job.
+
+         Args:
+             name: Name of the training job
+
+         Returns:
+             List of container dictionaries for this job
+
+         Raises:
+             ValueError: If no containers found for the job
+         """
+         filters = {"label": [f"{self.label_prefix}/trainjob-name={name}"]}
+         containers = self._adapter.list_containers(filters=filters)
+
+         if not containers:
+             raise ValueError(f"No TrainJob with name {name}")
+
+         return containers
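Because discovery is purely label-based, a job's containers are also visible to the container CLI itself (roughly `docker ps --filter "label=trainer.kubeflow.org/trainjob-name=<name>"`). The hedged sketch below issues the same filter through the backend's adapter, reusing `backend` from the earlier sketch and a hypothetical job name.

# Hedged sketch: list the containers of a (hypothetical) job named "a1b2c3d4".
containers = backend._adapter.list_containers(
    filters={"label": ["trainer.kubeflow.org/trainjob-name=a1b2c3d4"]}
)
print([c["name"] for c in containers])  # e.g. ['a1b2c3d4-node-0', 'a1b2c3d4-node-1']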
+
+     def __get_trainjob_from_containers(
+         self, job_name: str, containers: list[dict]
+     ) -> types.TrainJob:
+         """
+         Build a TrainJob object from a list of containers.
+
+         Args:
+             job_name: Name of the training job
+             containers: List of container dictionaries for this job
+
+         Returns:
+             TrainJob object
+
+         Raises:
+             ValueError: If network metadata is missing or runtime not found
+         """
+         if not containers:
+             raise ValueError(f"No containers found for TrainJob {job_name}")
+
+         # Get metadata from network
+         network_id = containers[0]["labels"].get(f"{self.label_prefix}/network-id")
+         if not network_id:
+             raise ValueError(f"TrainJob {job_name} is missing network metadata")
+
+         network_info = self._adapter.get_network(network_id)
+         if not network_info:
+             raise ValueError(f"TrainJob {job_name} network not found")
+
+         network_labels = network_info.get("labels", {})
+         runtime_name = network_labels.get(f"{self.label_prefix}/runtime-name")
+
+         # Get runtime object
+         try:
+             job_runtime = self.get_runtime(runtime_name) if runtime_name else None
+         except Exception as e:
+             raise ValueError(f"Runtime {runtime_name} not found for job {job_name}") from e
+
+         if not job_runtime:
+             raise ValueError(f"Runtime {runtime_name} not found for job {job_name}")
+
+         # Parse creation timestamp from first container
+         created_str = containers[0].get("created", "")
+         try:
+             from dateutil import parser
+
+             creation_timestamp = parser.isoparse(created_str)
+         except Exception:
+             creation_timestamp = datetime.now()
+
+         # Build steps from containers
+         steps = []
+         for container in sorted(containers, key=lambda c: c["name"]):
+             step_name = container["labels"].get(f"{self.label_prefix}/step", "")
+             steps.append(
+                 types.Step(
+                     name=step_name,
+                     pod_name=container["name"],
+                     status=container_utils.get_container_status(self._adapter, container["id"]),
+                 )
+             )
+
+         # Get num_nodes from container count
+         num_nodes = len(containers)
+
+         return types.TrainJob(
+             name=job_name,
+             creation_timestamp=creation_timestamp,
+             runtime=job_runtime,
+             steps=steps,
+             num_nodes=num_nodes,
+             status=container_utils.aggregate_container_statuses(self._adapter, containers),
+         )
+
+     def list_jobs(self, runtime: Optional[types.Runtime] = None) -> list[types.TrainJob]:
+         """List all training jobs by querying container runtime."""
+         # Get all containers with our label prefix
+         filters = {"label": [f"{self.label_prefix}/trainjob-name"]}
+         containers = self._adapter.list_containers(filters=filters)
+
+         # Group containers by job name
+         jobs_map: dict[str, list[dict]] = {}
+         for container in containers:
+             job_name = container["labels"].get(f"{self.label_prefix}/trainjob-name")
+             if job_name:
+                 if job_name not in jobs_map:
+                     jobs_map[job_name] = []
+                 jobs_map[job_name].append(container)
+
+         result: list[types.TrainJob] = []
+         for job_name, job_containers in jobs_map.items():
+             # Skip jobs with no containers
+             if not job_containers:
+                 continue
+
+             # Filter by runtime if specified
+             if runtime:
+                 network_id = job_containers[0]["labels"].get(f"{self.label_prefix}/network-id")
+                 if network_id:
+                     network_info = self._adapter.get_network(network_id)
+                     if network_info:
+                         network_labels = network_info.get("labels", {})
+                         runtime_name = network_labels.get(f"{self.label_prefix}/runtime-name")
+                         if runtime_name != runtime.name:
+                             continue
+
+             # Build TrainJob from containers
+             try:
+                 result.append(self.__get_trainjob_from_containers(job_name, job_containers))
+             except Exception as e:
+                 logger.warning(f"Failed to get TrainJob {job_name}: {e}")
+                 continue
+
+         return result
+
+     def get_job(self, name: str) -> types.TrainJob:
+         """Get a specific training job by querying container runtime."""
+         containers = self._get_job_containers(name)
+         return self.__get_trainjob_from_containers(name, containers)
+
+     def get_job_logs(
+         self,
+         name: str,
+         follow: bool = False,
+         step: str = constants.NODE + "-0",
+     ) -> Iterator[str]:
+         """Get logs for a training job by querying container runtime."""
+         containers = self._get_job_containers(name)
+
+         want_all = step == constants.NODE + "-0"
+         for container in sorted(containers, key=lambda c: c["name"]):
+             container_step = container["labels"].get(f"{self.label_prefix}/step", "")
+             if not want_all and container_step != step:
+                 continue
+             try:
+                 yield from self._adapter.container_logs(container["id"], follow)
+             except Exception as e:
+                 logger.warning(f"Failed to get logs for {container['name']}: {e}")
+                 yield f"Error getting logs: {e}\n"
+
+     def wait_for_job_status(
+         self,
+         name: str,
+         status: set[str] = {constants.TRAINJOB_COMPLETE},
+         timeout: int = 600,
+         polling_interval: int = 2,
+         callbacks: Optional[list[Callable[[types.TrainJob], None]]] = None,
+     ) -> types.TrainJob:
+         import time
+
+         end = time.time() + timeout
+         while time.time() < end:
+             tj = self.get_job(name)
+             logger.debug(f"TrainJob {name}, status {tj.status}")
+
+             # Invoke callbacks if provided
+             if callbacks:
+                 for callback in callbacks:
+                     callback(tj)
+
+             if tj.status in status:
+                 return tj
+             if constants.TRAINJOB_FAILED not in status and tj.status == constants.TRAINJOB_FAILED:
+                 raise RuntimeError(f"TrainJob {name} is Failed")
+             time.sleep(polling_interval)
+         raise TimeoutError(f"Timeout waiting for TrainJob {name} to reach status: {status}")
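A hedged sketch of blocking until completion with a progress callback, reusing `backend`; the method raises RuntimeError if the job fails (unless Failed is in the accepted set) and TimeoutError after `timeout` seconds.

# Hedged sketch: poll every 5 seconds for up to 30 minutes, printing each observed status.
def on_poll(tj):
    print(f"{tj.name}: {tj.status}")


finished = backend.wait_for_job_status(
    "a1b2c3d4",            # hypothetical job name returned by train()
    timeout=1800,
    polling_interval=5,
    callbacks=[on_poll],
)
print("final status:", finished.status)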
+
+     def delete_job(self, name: str):
+         """Delete a training job by querying container runtime."""
+         containers = self._get_job_containers(name)
+
+         # Get network_id and workdir from labels
+         network_id = containers[0]["labels"].get(f"{self.label_prefix}/network-id")
+
+         # Get workdir from network labels
+         workdir_host = None
+         if network_id:
+             network_info = self._adapter.get_network(network_id)
+             if network_info:
+                 network_labels = network_info.get("labels", {})
+                 workdir_host = network_labels.get(f"{self.label_prefix}/workdir")
+
+         # Stop containers and remove
+         from contextlib import suppress
+
+         for container in containers:
+             with suppress(Exception):
+                 self._adapter.stop_container(container["id"], timeout=10)
+             with suppress(Exception):
+                 self._adapter.remove_container(container["id"], force=True)
+
+         # Remove network (best-effort)
+         if network_id:
+             with suppress(Exception):
+                 self._adapter.delete_network(network_id)
+
+         # Remove working directory if configured
+         if self.cfg.auto_remove and workdir_host and os.path.isdir(workdir_host):
+             shutil.rmtree(workdir_host, ignore_errors=True)
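Finally, a hedged sketch of the lifecycle tail using the same `backend` and hypothetical job name: wait for completion, dump the logs, then tear down the containers, the per-job network, and (when cfg.auto_remove is set) the host working directory.

# Hedged sketch: clean up after a finished job.
backend.wait_for_job_status("a1b2c3d4")  # waits for Complete by default
for line in backend.get_job_logs("a1b2c3d4"):
    print(line, end="")
backend.delete_job("a1b2c3d4")           # removes containers, network, and workdir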