viettelcloud-aiplatform 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- viettelcloud/__init__.py +1 -0
- viettelcloud/aiplatform/__init__.py +15 -0
- viettelcloud/aiplatform/common/__init__.py +0 -0
- viettelcloud/aiplatform/common/constants.py +22 -0
- viettelcloud/aiplatform/common/types.py +28 -0
- viettelcloud/aiplatform/common/utils.py +40 -0
- viettelcloud/aiplatform/hub/OWNERS +14 -0
- viettelcloud/aiplatform/hub/__init__.py +25 -0
- viettelcloud/aiplatform/hub/api/__init__.py +13 -0
- viettelcloud/aiplatform/hub/api/_proxy_client.py +355 -0
- viettelcloud/aiplatform/hub/api/model_registry_client.py +561 -0
- viettelcloud/aiplatform/hub/api/model_registry_client_test.py +462 -0
- viettelcloud/aiplatform/optimizer/__init__.py +45 -0
- viettelcloud/aiplatform/optimizer/api/__init__.py +0 -0
- viettelcloud/aiplatform/optimizer/api/optimizer_client.py +248 -0
- viettelcloud/aiplatform/optimizer/backends/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/backends/base.py +77 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/backend.py +563 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/utils.py +112 -0
- viettelcloud/aiplatform/optimizer/constants/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/constants/constants.py +59 -0
- viettelcloud/aiplatform/optimizer/types/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/types/algorithm_types.py +87 -0
- viettelcloud/aiplatform/optimizer/types/optimization_types.py +135 -0
- viettelcloud/aiplatform/optimizer/types/search_types.py +95 -0
- viettelcloud/aiplatform/py.typed +0 -0
- viettelcloud/aiplatform/trainer/__init__.py +82 -0
- viettelcloud/aiplatform/trainer/api/__init__.py +3 -0
- viettelcloud/aiplatform/trainer/api/trainer_client.py +277 -0
- viettelcloud/aiplatform/trainer/api/trainer_client_test.py +72 -0
- viettelcloud/aiplatform/trainer/backends/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/base.py +94 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/base.py +195 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/docker.py +231 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/podman.py +258 -0
- viettelcloud/aiplatform/trainer/backends/container/backend.py +668 -0
- viettelcloud/aiplatform/trainer/backends/container/backend_test.py +867 -0
- viettelcloud/aiplatform/trainer/backends/container/runtime_loader.py +631 -0
- viettelcloud/aiplatform/trainer/backends/container/runtime_loader_test.py +637 -0
- viettelcloud/aiplatform/trainer/backends/container/types.py +67 -0
- viettelcloud/aiplatform/trainer/backends/container/utils.py +213 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/backend.py +710 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/backend_test.py +1344 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/constants.py +15 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/utils.py +636 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/utils_test.py +582 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/backend.py +306 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/backend_test.py +501 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/constants.py +90 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/job.py +184 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/types.py +52 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/utils.py +302 -0
- viettelcloud/aiplatform/trainer/constants/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/constants/constants.py +179 -0
- viettelcloud/aiplatform/trainer/options/__init__.py +52 -0
- viettelcloud/aiplatform/trainer/options/common.py +55 -0
- viettelcloud/aiplatform/trainer/options/kubernetes.py +502 -0
- viettelcloud/aiplatform/trainer/options/kubernetes_test.py +259 -0
- viettelcloud/aiplatform/trainer/options/localprocess.py +20 -0
- viettelcloud/aiplatform/trainer/test/common.py +22 -0
- viettelcloud/aiplatform/trainer/types/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/types/types.py +517 -0
- viettelcloud/aiplatform/trainer/types/types_test.py +115 -0
- viettelcloud_aiplatform-0.3.0.dist-info/METADATA +226 -0
- viettelcloud_aiplatform-0.3.0.dist-info/RECORD +71 -0
- viettelcloud_aiplatform-0.3.0.dist-info/WHEEL +4 -0
- viettelcloud_aiplatform-0.3.0.dist-info/licenses/LICENSE +201 -0
- viettelcloud_aiplatform-0.3.0.dist-info/licenses/NOTICE +36 -0
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# Copyright 2025 The Kubeflow Authors.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
Types and configuration for the unified Container backend.
|
|
17
|
+
|
|
18
|
+
This backend automatically detects and uses either Docker or Podman.
|
|
19
|
+
It provides a single interface for container-based execution regardless
|
|
20
|
+
of the underlying runtime.
|
|
21
|
+
|
|
22
|
+
Configuration options:
|
|
23
|
+
- pull_policy: Controls image pulling. Supported values: "IfNotPresent",
|
|
24
|
+
"Always", "Never". The default is "IfNotPresent".
|
|
25
|
+
- auto_remove: Whether to remove containers and networks when jobs are deleted.
|
|
26
|
+
Defaults to True.
|
|
27
|
+
- container_host: Optional override for connecting to a remote/local container
|
|
28
|
+
daemon. By default, auto-detects from environment or uses system defaults.
|
|
29
|
+
For Docker: uses DOCKER_HOST or default socket.
|
|
30
|
+
For Podman: uses CONTAINER_HOST or default socket.
|
|
31
|
+
- container_runtime: Force use of a specific container runtime ("docker" or "podman").
|
|
32
|
+
If not set, auto-detects based on availability (tries Docker first, then Podman).
|
|
33
|
+
- runtime_source: Configuration for training runtime sources using URL schemes.
|
|
34
|
+
Supports github://, https://, http://, file://, and absolute paths.
|
|
35
|
+
Built-in runtimes packaged with kubeflow-trainer are used as default fallback.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
from typing import Literal, Optional
|
|
39
|
+
|
|
40
|
+
from pydantic import BaseModel, Field
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class TrainingRuntimeSource(BaseModel):
    """Configuration for training runtime sources using URL schemes."""

    # Ordered list of source locations. The default is a fresh single-entry
    # list pointing at the kubeflow/trainer GitHub repository; a factory is
    # used so model instances never share one mutable list.
    sources: list[str] = Field(
        description="\n".join(
            (
                "Runtime sources with URL schemes (checked in priority order):",
                " - github://owner/repo[/path] - GitHub repository",
                " - https://url or http://url - HTTP(S) endpoint",
                " - file:///path or /absolute/path - Local filesystem",
                "If a runtime is not found in configured sources, built-in runtimes "
                "packaged with kubeflow-trainer are used as default.",
            )
        ),
        default_factory=lambda: ["github://kubeflow/trainer"],
    )
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class ContainerBackendConfig(BaseModel):
    # Settings for the unified container backend (Docker or Podman).

    # Image pull policy. The module documentation lists "IfNotPresent",
    # "Always" and "Never" as the supported values.
    pull_policy: str = "IfNotPresent"

    # Whether containers and networks are removed when jobs are deleted.
    auto_remove: bool = True

    # Optional override for the container daemon endpoint; None leaves the
    # choice to the environment / system defaults.
    container_host: Optional[str] = None

    # Force a specific runtime; None means auto-detect.
    container_runtime: Optional[Literal["docker", "podman"]] = None

    # Where training runtime definitions are looked up. Each config instance
    # gets its own TrainingRuntimeSource built with that model's defaults.
    runtime_source: TrainingRuntimeSource = Field(
        description="Configuration for training runtime sources",
        default_factory=TrainingRuntimeSource,
    )
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
# Copyright 2025 The Kubeflow Authors.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
Utility functions for the Container backend.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
import os
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
from viettelcloud.aiplatform.common.constants import UNKNOWN
|
|
24
|
+
from viettelcloud.aiplatform.trainer.constants import constants
|
|
25
|
+
from viettelcloud.aiplatform.trainer.types import types
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def create_workdir(job_name: str) -> str:
    """
    Ensure the host-side working directory for a job exists and return it.

    The directory is ~/.kubeflow/trainer/containers/<job_name>. Missing parent
    directories are created and an already existing directory is reused.

    Args:
        job_name: Name of the training job; used as the directory name.

    Returns:
        Resolved absolute path of the job's working directory, as a string.
    """
    containers_root = Path.home().joinpath(".kubeflow", "trainer", "containers")
    containers_root.mkdir(parents=True, exist_ok=True)

    # Resolve before creating so the returned string is the canonical path.
    job_dir = (containers_root / f"{job_name}").resolve()
    job_dir.mkdir(parents=True, exist_ok=True)
    return str(job_dir)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def get_training_script_code(trainer: types.CustomTrainer) -> str:
    """
    Render a runnable script from the trainer's function.

    The script is the dedented source of ``trainer.func`` followed by one
    line that calls the function by name.

    Args:
        trainer: CustomTrainer configuration; ``func`` and ``func_args`` are read.

    Returns:
        Complete Python code as a string to execute.
    """
    from inspect import getsource
    from textwrap import dedent

    entrypoint = trainer.func
    kwargs = trainer.func_args

    # With no arguments the call is bare; otherwise the dict's string form is
    # spliced in behind ``**`` so it is unpacked as keyword arguments.
    call_args = "" if kwargs is None else f"**{kwargs}"
    return f"{dedent(getsource(entrypoint))}\n{entrypoint.__name__}({call_args})\n"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def build_environment(trainer: types.CustomTrainer) -> dict[str, str]:
    """
    Collect the environment variables to pass to containers.

    Args:
        trainer: CustomTrainer configuration; only ``env`` is read.

    Returns:
        A new dictionary of environment variables (empty when none are set).
    """
    user_env = trainer.env
    if not user_env:
        return {}
    # Copy so callers can mutate the result without touching the trainer.
    return dict(user_env)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def build_pip_install_cmd(trainer: types.CustomTrainer) -> str:
    """
    Assemble the shell prefix that installs the trainer's extra packages.

    Args:
        trainer: CustomTrainer configuration; ``packages_to_install`` and
            ``pip_index_urls`` are read.

    Returns:
        Pip install command string ending in `` && `` so another command can
        be appended, or an empty string if there are no packages to install.
    """
    packages = trainer.packages_to_install
    if not packages:
        return ""

    # The first URL is the primary index; the rest become extra indexes.
    urls = trainer.pip_index_urls or list(constants.DEFAULT_PIP_INDEX_URLS)
    primary_index = urls[0]
    extra_flags = " ".join(f"--extra-index-url {url}" for url in urls[1:])
    package_args = " ".join(f'"{pkg}"' for pkg in packages)

    prefix = "PIP_DISABLE_PIP_VERSION_CHECK=1 pip install --no-warn-script-location "
    return prefix + f"--index-url {primary_index} {extra_flags} {package_args} && "
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def container_status_to_trainjob_status(status: str, exit_code: int) -> str:
    """
    Map a container runtime status onto a TrainJob status.

    Args:
        status: Container status (e.g., "running", "exited", "created").
        exit_code: Container exit code; only consulted for "exited".

    Returns:
        TrainJob status constant, or UNKNOWN for any other container status.
    """
    if status == "exited":
        # A finished container succeeded only if it exited with code 0.
        if exit_code == 0:
            return constants.TRAINJOB_COMPLETE
        return constants.TRAINJOB_FAILED
    if status == "created":
        return constants.TRAINJOB_CREATED
    if status == "running":
        return constants.TRAINJOB_RUNNING
    return UNKNOWN
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def aggregate_status_from_containers(container_statuses: list[str]) -> str:
    """
    Aggregate status from multiple container statuses.

    Precedence, highest first: any Failed -> Failed; any Running -> Running;
    every known status Complete -> Complete; any Created -> Created;
    otherwise UNKNOWN. UNKNOWN entries are ignored when checking for
    completion, but at least one known status is required.

    Args:
        container_statuses: List of TrainJob status strings, one per container.

    Returns:
        Aggregated TrainJob status.
    """
    if constants.TRAINJOB_FAILED in container_statuses:
        return constants.TRAINJOB_FAILED
    if constants.TRAINJOB_RUNNING in container_statuses:
        return constants.TRAINJOB_RUNNING

    # all() is vacuously True for an empty sequence, so without the emptiness
    # guard an empty list (or one holding only UNKNOWN) would be reported as
    # Complete even though nothing is known to have finished.
    known = [s for s in container_statuses if s != UNKNOWN]
    if known and all(s == constants.TRAINJOB_COMPLETE for s in known):
        return constants.TRAINJOB_COMPLETE
    if any(s == constants.TRAINJOB_CREATED for s in container_statuses):
        return constants.TRAINJOB_CREATED
    return UNKNOWN
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def maybe_pull_image(adapter, image: str, pull_policy: str):
    """
    Pull image based on pull policy.

    The policy is compared case-insensitively. "Always" pulls
    unconditionally, "Never" only checks that the image exists locally, and
    any other value behaves as "IfNotPresent" (pull only when missing).

    Args:
        adapter: Container client adapter (DockerClientAdapter or PodmanClientAdapter).
        image: Container image name.
        pull_policy: Pull policy ("IfNotPresent", "Always", or "Never").

    Raises:
        RuntimeError: If the image is missing locally under the "Never"
            policy, or if the adapter fails while checking or pulling.
    """
    policy = pull_policy.lower()
    try:
        if policy == "always":
            logger.debug("Pulling image (Always): %s", image)
            adapter.pull_image(image)
            return
        if adapter.image_exists(image):
            return
        if policy != "never":
            # IfNotPresent
            logger.debug("Pulling image (IfNotPresent): %s", image)
            adapter.pull_image(image)
            return
    except Exception as e:
        raise RuntimeError(f"Failed to ensure image '{image}': {e}") from e

    # Only reached for policy "never" with the image absent. Raised outside
    # the try block so it is not re-wrapped as a "Failed to ensure image"
    # error with itself as the cause.
    raise RuntimeError(f"Image '{image}' not found locally and pull policy is Never")
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def get_container_status(adapter, container_id: str) -> str:
    """
    Get the TrainJob status of a container.

    This is a best-effort lookup: any failure while querying the adapter is
    logged at debug level and reported as UNKNOWN instead of being raised.

    Args:
        adapter: Container client adapter (DockerClientAdapter or PodmanClientAdapter).
        container_id: Container ID.

    Returns:
        TrainJob status constant, or UNKNOWN if the status could not be read.
    """
    try:
        status, exit_code = adapter.container_status(container_id)
        return container_status_to_trainjob_status(status, exit_code)
    except Exception as e:
        # Keep the fallback, but record why the lookup failed so it can be
        # diagnosed instead of disappearing silently.
        logger.debug("Could not get status of container %s: %s", container_id, e)
        return UNKNOWN
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def aggregate_container_statuses(adapter, containers: list[dict]) -> str:
    """
    Compute one TrainJob status for a group of containers.

    Args:
        adapter: Container client adapter (DockerClientAdapter or PodmanClientAdapter).
        containers: List of container info dicts with 'id' key.

    Returns:
        Aggregated TrainJob status.
    """
    per_container = []
    for info in containers:
        # Look up each container's status through the adapter, in list order.
        per_container.append(get_container_status(adapter, info["id"]))
    return aggregate_status_from_containers(per_container)
|
|
File without changes
|