viettelcloud-aiplatform 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. viettelcloud/__init__.py +1 -0
  2. viettelcloud/aiplatform/__init__.py +15 -0
  3. viettelcloud/aiplatform/common/__init__.py +0 -0
  4. viettelcloud/aiplatform/common/constants.py +22 -0
  5. viettelcloud/aiplatform/common/types.py +28 -0
  6. viettelcloud/aiplatform/common/utils.py +40 -0
  7. viettelcloud/aiplatform/hub/OWNERS +14 -0
  8. viettelcloud/aiplatform/hub/__init__.py +25 -0
  9. viettelcloud/aiplatform/hub/api/__init__.py +13 -0
  10. viettelcloud/aiplatform/hub/api/_proxy_client.py +355 -0
  11. viettelcloud/aiplatform/hub/api/model_registry_client.py +561 -0
  12. viettelcloud/aiplatform/hub/api/model_registry_client_test.py +462 -0
  13. viettelcloud/aiplatform/optimizer/__init__.py +45 -0
  14. viettelcloud/aiplatform/optimizer/api/__init__.py +0 -0
  15. viettelcloud/aiplatform/optimizer/api/optimizer_client.py +248 -0
  16. viettelcloud/aiplatform/optimizer/backends/__init__.py +13 -0
  17. viettelcloud/aiplatform/optimizer/backends/base.py +77 -0
  18. viettelcloud/aiplatform/optimizer/backends/kubernetes/__init__.py +13 -0
  19. viettelcloud/aiplatform/optimizer/backends/kubernetes/backend.py +563 -0
  20. viettelcloud/aiplatform/optimizer/backends/kubernetes/utils.py +112 -0
  21. viettelcloud/aiplatform/optimizer/constants/__init__.py +13 -0
  22. viettelcloud/aiplatform/optimizer/constants/constants.py +59 -0
  23. viettelcloud/aiplatform/optimizer/types/__init__.py +13 -0
  24. viettelcloud/aiplatform/optimizer/types/algorithm_types.py +87 -0
  25. viettelcloud/aiplatform/optimizer/types/optimization_types.py +135 -0
  26. viettelcloud/aiplatform/optimizer/types/search_types.py +95 -0
  27. viettelcloud/aiplatform/py.typed +0 -0
  28. viettelcloud/aiplatform/trainer/__init__.py +82 -0
  29. viettelcloud/aiplatform/trainer/api/__init__.py +3 -0
  30. viettelcloud/aiplatform/trainer/api/trainer_client.py +277 -0
  31. viettelcloud/aiplatform/trainer/api/trainer_client_test.py +72 -0
  32. viettelcloud/aiplatform/trainer/backends/__init__.py +0 -0
  33. viettelcloud/aiplatform/trainer/backends/base.py +94 -0
  34. viettelcloud/aiplatform/trainer/backends/container/adapters/base.py +195 -0
  35. viettelcloud/aiplatform/trainer/backends/container/adapters/docker.py +231 -0
  36. viettelcloud/aiplatform/trainer/backends/container/adapters/podman.py +258 -0
  37. viettelcloud/aiplatform/trainer/backends/container/backend.py +668 -0
  38. viettelcloud/aiplatform/trainer/backends/container/backend_test.py +867 -0
  39. viettelcloud/aiplatform/trainer/backends/container/runtime_loader.py +631 -0
  40. viettelcloud/aiplatform/trainer/backends/container/runtime_loader_test.py +637 -0
  41. viettelcloud/aiplatform/trainer/backends/container/types.py +67 -0
  42. viettelcloud/aiplatform/trainer/backends/container/utils.py +213 -0
  43. viettelcloud/aiplatform/trainer/backends/kubernetes/__init__.py +0 -0
  44. viettelcloud/aiplatform/trainer/backends/kubernetes/backend.py +710 -0
  45. viettelcloud/aiplatform/trainer/backends/kubernetes/backend_test.py +1344 -0
  46. viettelcloud/aiplatform/trainer/backends/kubernetes/constants.py +15 -0
  47. viettelcloud/aiplatform/trainer/backends/kubernetes/utils.py +636 -0
  48. viettelcloud/aiplatform/trainer/backends/kubernetes/utils_test.py +582 -0
  49. viettelcloud/aiplatform/trainer/backends/localprocess/__init__.py +0 -0
  50. viettelcloud/aiplatform/trainer/backends/localprocess/backend.py +306 -0
  51. viettelcloud/aiplatform/trainer/backends/localprocess/backend_test.py +501 -0
  52. viettelcloud/aiplatform/trainer/backends/localprocess/constants.py +90 -0
  53. viettelcloud/aiplatform/trainer/backends/localprocess/job.py +184 -0
  54. viettelcloud/aiplatform/trainer/backends/localprocess/types.py +52 -0
  55. viettelcloud/aiplatform/trainer/backends/localprocess/utils.py +302 -0
  56. viettelcloud/aiplatform/trainer/constants/__init__.py +0 -0
  57. viettelcloud/aiplatform/trainer/constants/constants.py +179 -0
  58. viettelcloud/aiplatform/trainer/options/__init__.py +52 -0
  59. viettelcloud/aiplatform/trainer/options/common.py +55 -0
  60. viettelcloud/aiplatform/trainer/options/kubernetes.py +502 -0
  61. viettelcloud/aiplatform/trainer/options/kubernetes_test.py +259 -0
  62. viettelcloud/aiplatform/trainer/options/localprocess.py +20 -0
  63. viettelcloud/aiplatform/trainer/test/common.py +22 -0
  64. viettelcloud/aiplatform/trainer/types/__init__.py +0 -0
  65. viettelcloud/aiplatform/trainer/types/types.py +517 -0
  66. viettelcloud/aiplatform/trainer/types/types_test.py +115 -0
  67. viettelcloud_aiplatform-0.3.0.dist-info/METADATA +226 -0
  68. viettelcloud_aiplatform-0.3.0.dist-info/RECORD +71 -0
  69. viettelcloud_aiplatform-0.3.0.dist-info/WHEEL +4 -0
  70. viettelcloud_aiplatform-0.3.0.dist-info/licenses/LICENSE +201 -0
  71. viettelcloud_aiplatform-0.3.0.dist-info/licenses/NOTICE +36 -0
@@ -0,0 +1,67 @@
1
+ # Copyright 2025 The Kubeflow Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Types and configuration for the unified Container backend.
17
+
18
+ This backend automatically detects and uses either Docker or Podman.
19
+ It provides a single interface for container-based execution regardless
20
+ of the underlying runtime.
21
+
22
+ Configuration options:
23
+ - pull_policy: Controls image pulling. Supported values: "IfNotPresent",
24
+ "Always", "Never". The default is "IfNotPresent".
25
+ - auto_remove: Whether to remove containers and networks when jobs are deleted.
26
+ Defaults to True.
27
+ - container_host: Optional override for connecting to a remote/local container
28
+ daemon. By default, auto-detects from environment or uses system defaults.
29
+ For Docker: uses DOCKER_HOST or default socket.
30
+ For Podman: uses CONTAINER_HOST or default socket.
31
+ - container_runtime: Force use of a specific container runtime ("docker" or "podman").
32
+ If not set, auto-detects based on availability (tries Docker first, then Podman).
33
+ - runtime_source: Configuration for training runtime sources using URL schemes.
34
+ Supports github://, https://, http://, file://, and absolute paths.
35
+ Built-in runtimes packaged with kubeflow-trainer are used as default fallback.
36
+ """
37
+
38
+ from typing import Literal, Optional
39
+
40
+ from pydantic import BaseModel, Field
41
+
42
+
43
class TrainingRuntimeSource(BaseModel):
    """Configuration for training runtime sources using URL schemes."""

    # Ordered list of runtime locations. Per the field description below,
    # entries are checked in priority order. The default is produced by a
    # factory, so every instance gets its own fresh list rather than sharing
    # one mutable default.
    # NOTE(review): entries are only typed as `str` here; no scheme validation
    # is visible in this model - presumably the runtime loader validates them.
    sources: list[str] = Field(
        default_factory=lambda: ["github://kubeflow/trainer"],
        description=(
            "Runtime sources with URL schemes (checked in priority order):\n"
            " - github://owner/repo[/path] - GitHub repository\n"
            " - https://url or http://url - HTTP(S) endpoint\n"
            " - file:///path or /absolute/path - Local filesystem\n"
            "If a runtime is not found in configured sources, built-in runtimes "
            "packaged with kubeflow-trainer are used as default."
        ),
    )
57
+
58
+
59
class ContainerBackendConfig(BaseModel):
    # Settings for the unified Container backend (Docker or Podman).

    # Image pull policy. The module documentation lists "IfNotPresent",
    # "Always" and "Never" as the supported values. The field is a plain
    # `str`, so this model itself does not reject other values.
    pull_policy: str = Field(default="IfNotPresent")
    # Whether to remove containers and networks when jobs are deleted.
    auto_remove: bool = Field(default=True)
    # Optional override for the container daemon endpoint. None means the
    # backend auto-detects it from the environment or system defaults.
    container_host: Optional[str] = Field(default=None)
    # Force a specific runtime. None means auto-detect (documented as trying
    # Docker first, then Podman).
    container_runtime: Optional[Literal["docker", "podman"]] = Field(default=None)
    # Where training runtimes are looked up; a fresh TrainingRuntimeSource
    # (with its own defaults) is created per config instance.
    runtime_source: TrainingRuntimeSource = Field(
        default_factory=TrainingRuntimeSource,
        description="Configuration for training runtime sources",
    )
@@ -0,0 +1,213 @@
1
+ # Copyright 2025 The Kubeflow Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Utility functions for the Container backend.
17
+ """
18
+
19
+ import logging
20
+ import os
21
+ from pathlib import Path
22
+
23
+ from viettelcloud.aiplatform.common.constants import UNKNOWN
24
+ from viettelcloud.aiplatform.trainer.constants import constants
25
+ from viettelcloud.aiplatform.trainer.types import types
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
def create_workdir(job_name: str) -> str:
    """
    Ensure the host-side working directory for a job exists.

    The directory lives at ~/.kubeflow/trainer/containers/<job_name>. Both the
    shared containers root and the per-job directory are created when missing;
    calling this again for the same job is harmless.

    Args:
        job_name: Name of the training job.

    Returns:
        Resolved absolute path of the job's working directory.
    """
    containers_root = Path.home().joinpath(".kubeflow", "trainer", "containers")
    containers_root.mkdir(parents=True, exist_ok=True)

    # Resolve before creating so the returned path is the canonical one.
    job_dir = containers_root.joinpath(f"{job_name}").resolve()
    resolved = str(job_dir)
    os.makedirs(resolved, exist_ok=True)
    return resolved
47
+
48
+
49
def get_training_script_code(trainer: types.CustomTrainer) -> str:
    """
    Render the trainer function as a standalone, runnable script.

    The function's source is dedented and followed by a single call to it,
    mirroring how the Kubernetes backend builds training scripts. When
    func_args is set, it is interpolated as a keyword-argument expansion.

    Args:
        trainer: CustomTrainer configuration.

    Returns:
        Complete Python code as a string to execute.
    """
    import inspect
    import textwrap

    func = trainer.func
    source = textwrap.dedent(inspect.getsource(func))

    if trainer.func_args is None:
        invocation = f"{func.__name__}()"
    else:
        # The arguments are embedded through their string form, so they must
        # read back as valid Python in the generated script.
        invocation = f"{func.__name__}(**{trainer.func_args})"

    return f"{source}\n{invocation}\n"
72
+
73
+
74
def build_environment(trainer: types.CustomTrainer) -> dict[str, str]:
    """
    Collect the environment variables to pass to containers.

    Args:
        trainer: CustomTrainer configuration.

    Returns:
        A new dictionary of environment variables; empty when the trainer
        defines none. The trainer's own mapping is never returned directly.
    """
    configured = trainer.env
    if not configured:
        return {}
    return dict(configured)
85
+
86
+
87
def build_pip_install_cmd(trainer: types.CustomTrainer) -> str:
    """
    Assemble the shell prefix that installs the trainer's extra packages.

    The first index URL becomes --index-url and any further ones become
    --extra-index-url flags. The result ends with "&& " so that another
    command can be appended directly after it.

    Args:
        trainer: CustomTrainer configuration.

    Returns:
        Pip install command string (empty if no packages to install).
    """
    packages = trainer.packages_to_install or []
    if not packages:
        return ""

    # Fall back to the project defaults only when the trainer supplies none.
    urls = trainer.pip_index_urls or list(constants.DEFAULT_PIP_INDEX_URLS)
    primary_url = urls[0]
    extra_flags = " ".join([f"--extra-index-url {url}" for url in urls[1:]])
    quoted_packages = " ".join([f'"{pkg}"' for pkg in packages])

    pip_prefix = "PIP_DISABLE_PIP_VERSION_CHECK=1 pip install --no-warn-script-location "
    pip_args = f"--index-url {primary_url} {extra_flags} {quoted_packages} && "
    return pip_prefix + pip_args
109
+
110
+
111
def container_status_to_trainjob_status(status: str, exit_code: int) -> str:
    """
    Map a raw container state onto the matching TrainJob status.

    Args:
        status: Container status (e.g., "running", "exited", "created").
        exit_code: Container exit code; only consulted for exited containers.

    Returns:
        TrainJob status constant, or UNKNOWN for any state not handled here.
    """
    if status == "exited":
        # A finished container succeeded only if it exited with code 0.
        if exit_code == 0:
            return constants.TRAINJOB_COMPLETE
        return constants.TRAINJOB_FAILED
    if status == "created":
        return constants.TRAINJOB_CREATED
    if status == "running":
        return constants.TRAINJOB_RUNNING
    return UNKNOWN
130
+
131
+
132
def aggregate_status_from_containers(container_statuses: list[str]) -> str:
    """
    Aggregate status from multiple container statuses.

    Precedence: any failed container makes the job failed; otherwise any
    running container makes it running; otherwise the job is complete when
    every container with a known status is complete; otherwise it is created
    if any container is created. UNKNOWN entries are ignored when deciding
    completion, but at least one known status is required - an empty list or
    a list of only UNKNOWN entries yields UNKNOWN rather than complete.

    Args:
        container_statuses: List of container status strings.

    Returns:
        Aggregated TrainJob status.
    """
    if constants.TRAINJOB_FAILED in container_statuses:
        return constants.TRAINJOB_FAILED
    if constants.TRAINJOB_RUNNING in container_statuses:
        return constants.TRAINJOB_RUNNING

    known = [s for s in container_statuses if s != UNKNOWN]
    # all() is True for an empty iterable, so guard against "no known status"
    # being mistaken for "every container completed".
    if known and all(s == constants.TRAINJOB_COMPLETE for s in known):
        return constants.TRAINJOB_COMPLETE
    if constants.TRAINJOB_CREATED in known:
        return constants.TRAINJOB_CREATED
    return UNKNOWN
151
+
152
+
153
def maybe_pull_image(adapter, image: str, pull_policy: str):
    """
    Pull image based on pull policy.

    The policy is matched case-insensitively. "Always" pulls unconditionally,
    "Never" only verifies that the image is present locally, and any other
    value is treated as "IfNotPresent" (pull only when the image is missing).

    Args:
        adapter: Container client adapter (DockerClientAdapter or PodmanClientAdapter).
        image: Container image name.
        pull_policy: Pull policy ("IfNotPresent", "Always", or "Never").

    Raises:
        RuntimeError: If the image is missing under the "Never" policy, or if
            the adapter fails while checking for or pulling the image.
    """
    policy = pull_policy.lower()
    try:
        if policy == "always":
            logger.debug("Pulling image (Always): %s", image)
            adapter.pull_image(image)
            return

        image_present = adapter.image_exists(image)
        if not image_present and policy != "never":
            # IfNotPresent
            logger.debug("Pulling image (IfNotPresent): %s", image)
            adapter.pull_image(image)
    except Exception as e:
        raise RuntimeError(f"Failed to ensure image '{image}': {e}") from e

    # Raised outside the try block so the policy violation is reported as-is
    # instead of being re-wrapped as an adapter failure.
    if not image_present and policy == "never":
        raise RuntimeError(f"Image '{image}' not found locally and pull policy is Never")
181
+
182
+
183
def get_container_status(adapter, container_id: str) -> str:
    """
    Look up a container and report its state as a TrainJob status.

    This is best-effort: any error raised while querying the adapter or
    translating the result is swallowed and reported as UNKNOWN.

    Args:
        adapter: Container client adapter (DockerClientAdapter or PodmanClientAdapter).
        container_id: Container ID.

    Returns:
        TrainJob status constant.
    """
    try:
        raw_state, code = adapter.container_status(container_id)
        job_status = container_status_to_trainjob_status(raw_state, code)
    except Exception:
        job_status = UNKNOWN
    return job_status
199
+
200
+
201
def aggregate_container_statuses(adapter, containers: list[dict]) -> str:
    """
    Compute one TrainJob status for a group of containers.

    Each container is looked up by its 'id' entry and the individual results
    are combined into a single status.

    Args:
        adapter: Container client adapter (DockerClientAdapter or PodmanClientAdapter).
        containers: List of container info dicts with 'id' key.

    Returns:
        Aggregated TrainJob status.
    """
    per_container = []
    for info in containers:
        per_container.append(get_container_status(adapter, info["id"]))
    return aggregate_status_from_containers(per_container)