viettelcloud-aiplatform 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- viettelcloud/__init__.py +1 -0
- viettelcloud/aiplatform/__init__.py +15 -0
- viettelcloud/aiplatform/common/__init__.py +0 -0
- viettelcloud/aiplatform/common/constants.py +22 -0
- viettelcloud/aiplatform/common/types.py +28 -0
- viettelcloud/aiplatform/common/utils.py +40 -0
- viettelcloud/aiplatform/hub/OWNERS +14 -0
- viettelcloud/aiplatform/hub/__init__.py +25 -0
- viettelcloud/aiplatform/hub/api/__init__.py +13 -0
- viettelcloud/aiplatform/hub/api/_proxy_client.py +355 -0
- viettelcloud/aiplatform/hub/api/model_registry_client.py +561 -0
- viettelcloud/aiplatform/hub/api/model_registry_client_test.py +462 -0
- viettelcloud/aiplatform/optimizer/__init__.py +45 -0
- viettelcloud/aiplatform/optimizer/api/__init__.py +0 -0
- viettelcloud/aiplatform/optimizer/api/optimizer_client.py +248 -0
- viettelcloud/aiplatform/optimizer/backends/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/backends/base.py +77 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/backend.py +563 -0
- viettelcloud/aiplatform/optimizer/backends/kubernetes/utils.py +112 -0
- viettelcloud/aiplatform/optimizer/constants/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/constants/constants.py +59 -0
- viettelcloud/aiplatform/optimizer/types/__init__.py +13 -0
- viettelcloud/aiplatform/optimizer/types/algorithm_types.py +87 -0
- viettelcloud/aiplatform/optimizer/types/optimization_types.py +135 -0
- viettelcloud/aiplatform/optimizer/types/search_types.py +95 -0
- viettelcloud/aiplatform/py.typed +0 -0
- viettelcloud/aiplatform/trainer/__init__.py +82 -0
- viettelcloud/aiplatform/trainer/api/__init__.py +3 -0
- viettelcloud/aiplatform/trainer/api/trainer_client.py +277 -0
- viettelcloud/aiplatform/trainer/api/trainer_client_test.py +72 -0
- viettelcloud/aiplatform/trainer/backends/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/base.py +94 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/base.py +195 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/docker.py +231 -0
- viettelcloud/aiplatform/trainer/backends/container/adapters/podman.py +258 -0
- viettelcloud/aiplatform/trainer/backends/container/backend.py +668 -0
- viettelcloud/aiplatform/trainer/backends/container/backend_test.py +867 -0
- viettelcloud/aiplatform/trainer/backends/container/runtime_loader.py +631 -0
- viettelcloud/aiplatform/trainer/backends/container/runtime_loader_test.py +637 -0
- viettelcloud/aiplatform/trainer/backends/container/types.py +67 -0
- viettelcloud/aiplatform/trainer/backends/container/utils.py +213 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/backend.py +710 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/backend_test.py +1344 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/constants.py +15 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/utils.py +636 -0
- viettelcloud/aiplatform/trainer/backends/kubernetes/utils_test.py +582 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/backend.py +306 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/backend_test.py +501 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/constants.py +90 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/job.py +184 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/types.py +52 -0
- viettelcloud/aiplatform/trainer/backends/localprocess/utils.py +302 -0
- viettelcloud/aiplatform/trainer/constants/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/constants/constants.py +179 -0
- viettelcloud/aiplatform/trainer/options/__init__.py +52 -0
- viettelcloud/aiplatform/trainer/options/common.py +55 -0
- viettelcloud/aiplatform/trainer/options/kubernetes.py +502 -0
- viettelcloud/aiplatform/trainer/options/kubernetes_test.py +259 -0
- viettelcloud/aiplatform/trainer/options/localprocess.py +20 -0
- viettelcloud/aiplatform/trainer/test/common.py +22 -0
- viettelcloud/aiplatform/trainer/types/__init__.py +0 -0
- viettelcloud/aiplatform/trainer/types/types.py +517 -0
- viettelcloud/aiplatform/trainer/types/types_test.py +115 -0
- viettelcloud_aiplatform-0.3.0.dist-info/METADATA +226 -0
- viettelcloud_aiplatform-0.3.0.dist-info/RECORD +71 -0
- viettelcloud_aiplatform-0.3.0.dist-info/WHEEL +4 -0
- viettelcloud_aiplatform-0.3.0.dist-info/licenses/LICENSE +201 -0
- viettelcloud_aiplatform-0.3.0.dist-info/licenses/NOTICE +36 -0
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# Copyright 2024 The Kubeflow Authors.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
import textwrap
|
|
17
|
+
|
|
18
|
+
# Common constants.
|
|
19
|
+
GROUP = "trainer.kubeflow.org"
|
|
20
|
+
VERSION = "v1alpha1"
|
|
21
|
+
API_VERSION = f"{GROUP}/{VERSION}"
|
|
22
|
+
|
|
23
|
+
# The Kind name for the ClusterTrainingRuntime.
|
|
24
|
+
CLUSTER_TRAINING_RUNTIME_KIND = "ClusterTrainingRuntime"
|
|
25
|
+
|
|
26
|
+
# The plural for the ClusterTrainingRuntime.
|
|
27
|
+
CLUSTER_TRAINING_RUNTIME_PLURAL = "clustertrainingruntimes"
|
|
28
|
+
|
|
29
|
+
# The Kind name for the TrainJob.
|
|
30
|
+
TRAINJOB_KIND = "TrainJob"
|
|
31
|
+
|
|
32
|
+
# The plural for the TrainJob.
|
|
33
|
+
TRAINJOB_PLURAL = "trainjobs"
|
|
34
|
+
|
|
35
|
+
# The default status for the TrainJob once users create it.
|
|
36
|
+
TRAINJOB_CREATED = "Created"
|
|
37
|
+
|
|
38
|
+
# The running status of the TrainJob, defined when all training nodes (e.g. Pods) are
|
|
39
|
+
# running or succeeded.
|
|
40
|
+
TRAINJOB_RUNNING = "Running"
|
|
41
|
+
|
|
42
|
+
# The complete status of the TrainJob, defined when TrainJob CR has complete condition.
|
|
43
|
+
TRAINJOB_COMPLETE = "Complete"
|
|
44
|
+
|
|
45
|
+
# The failed status of the TrainJob, defined when TrainJob CR has failed condition.
|
|
46
|
+
TRAINJOB_FAILED = "Failed"
|
|
47
|
+
|
|
48
|
+
# The succeeded phase of the Pods.
|
|
49
|
+
POD_SUCCEEDED = "Succeeded"
|
|
50
|
+
|
|
51
|
+
# The label key to identify the relationship between TrainJob and Pod template in the runtime.
|
|
52
|
+
# For example, what PodTemplate must be overridden by TrainJob's .spec.trainer APIs.
|
|
53
|
+
TRAINJOB_ANCESTOR_LABEL = "trainer.kubeflow.org/trainjob-ancestor-step"
|
|
54
|
+
|
|
55
|
+
# The label key to identify the ML framework that the runtime uses (e.g. torch, deepspeed, torchtune, etc.)
|
|
56
|
+
RUNTIME_FRAMEWORK_LABEL = "trainer.kubeflow.org/framework"
|
|
57
|
+
|
|
58
|
+
# The name of the ReplicatedJob and container of the dataset initializer.
|
|
59
|
+
# Also, it represents the `trainjob-ancestor-step` label value for the dataset initializer step.
|
|
60
|
+
DATASET_INITIALIZER = "dataset-initializer"
|
|
61
|
+
|
|
62
|
+
# The name of the ReplicatedJob and container of the model initializer.
|
|
63
|
+
# Also, it represents the `trainjob-ancestor-step` label value for the model initializer step.
|
|
64
|
+
MODEL_INITIALIZER = "model-initializer"
|
|
65
|
+
|
|
66
|
+
# The env name for the access token of dataset/model initializer.
|
|
67
|
+
INITIALIZER_ENV_ACCESS_TOKEN = "ACCESS_TOKEN"
|
|
68
|
+
|
|
69
|
+
# The default value for initializer to ignore files.
|
|
70
|
+
# Every entry is a glob pattern: a bare ".pt" would only match a file literally named ".pt",
# so the PyTorch checkpoint extensions need the leading "*" like the other entries.
INITIALIZER_DEFAULT_IGNORE_PATTERNS = ["*.msgpack", "*.h5", "*.bin", "*.pt", "*.pth"]
|
|
71
|
+
|
|
72
|
+
# The default path to the users' workspace.
|
|
73
|
+
# TODO (andreyvelich): Discuss how to keep this path in sync with pkg.initializers.constants
|
|
74
|
+
WORKSPACE_PATH = "/workspace"
|
|
75
|
+
|
|
76
|
+
# The path where initializer downloads dataset.
|
|
77
|
+
DATASET_PATH = os.path.join(WORKSPACE_PATH, "dataset")
|
|
78
|
+
|
|
79
|
+
# The path where initializer downloads model.
|
|
80
|
+
MODEL_PATH = os.path.join(WORKSPACE_PATH, "model")
|
|
81
|
+
|
|
82
|
+
# The name of the ReplicatedJob to launch mpirun.
|
|
83
|
+
LAUNCHER = "launcher"
|
|
84
|
+
|
|
85
|
+
# The name of the ReplicatedJob and container of the node. The node usually represents
|
|
86
|
+
# single VM where distributed training code is executed.
|
|
87
|
+
NODE = "node"
|
|
88
|
+
|
|
89
|
+
# The label for cpu in the container resources.
|
|
90
|
+
CPU_LABEL = "cpu"
|
|
91
|
+
|
|
92
|
+
# The label for NVIDIA GPU in the container resources.
|
|
93
|
+
GPU_LABEL = "nvidia.com/gpu"
|
|
94
|
+
|
|
95
|
+
# The prefix for NVIDIA MIG device in the container resources.
|
|
96
|
+
GPU_MIG_PREFIX = "nvidia.com/mig-"
|
|
97
|
+
|
|
98
|
+
# The label for TPU in the container resources.
|
|
99
|
+
TPU_LABEL = "google.com/tpu"
|
|
100
|
+
|
|
101
|
+
# The label key to identify the JobSet name of the Pod.
|
|
102
|
+
JOBSET_NAME_LABEL = "jobset.sigs.k8s.io/jobset-name"
|
|
103
|
+
|
|
104
|
+
# The label key to identify the JobSet's ReplicatedJob of the Pod.
|
|
105
|
+
JOBSET_RJOB_NAME_LABEL = "jobset.sigs.k8s.io/replicatedjob-name"
|
|
106
|
+
|
|
107
|
+
# The label key to identify the Job completion index of the Pod.
|
|
108
|
+
JOB_INDEX_LABEL = "batch.kubernetes.io/job-completion-index"
|
|
109
|
+
|
|
110
|
+
# The Pod pending phase indicates that Pod has been accepted by the Kubernetes cluster,
|
|
111
|
+
# but one or more of the containers has not been made ready to run.
|
|
112
|
+
POD_PENDING = "Pending"
|
|
113
|
+
|
|
114
|
+
# The label selector for Pods created by the TrainJob.
|
|
115
|
+
# It checks the following rJob.name: dataset-initializer, model-initializer, launcher, node.
|
|
116
|
+
POD_LABEL_SELECTOR = (
|
|
117
|
+
f"{JOBSET_NAME_LABEL}={{trainjob_name}},{JOBSET_RJOB_NAME_LABEL} "
|
|
118
|
+
f"in ({DATASET_INITIALIZER}, {MODEL_INITIALIZER}, {LAUNCHER}, {NODE})"
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
# Handle environment variable for multiple URLs (comma-separated).
|
|
122
|
+
# The first URL will be the index-url, and remaining ones are extra-index-urls.
|
|
123
|
+
# Entries are stripped and empty ones dropped so values such as "a, b," or "" never
# produce blank or whitespace-padded URLs; if nothing usable remains, fall back to PyPI.
DEFAULT_PIP_INDEX_URLS = [
    url.strip()
    for url in os.getenv("DEFAULT_PIP_INDEX_URLS", "https://pypi.org/simple").split(",")
    if url.strip()
] or ["https://pypi.org/simple"]
|
|
124
|
+
|
|
125
|
+
# The exec script to embed training function into container command.
|
|
126
|
+
# __ENTRYPOINT__ depends on the MLPolicy; func_code and func_file are substituted in the `train` API.
|
|
127
|
+
EXEC_FUNC_SCRIPT = textwrap.dedent(
|
|
128
|
+
"""
|
|
129
|
+
read -r -d '' SCRIPT << EOM\n
|
|
130
|
+
{func_code}
|
|
131
|
+
EOM
|
|
132
|
+
printf "%s" \"$SCRIPT\" > \"{func_file}\"
|
|
133
|
+
__ENTRYPOINT__ \"{func_file}\""""
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
# The default command for the PlainML CustomTrainer.
|
|
137
|
+
DEFAULT_COMMAND = (
|
|
138
|
+
"bash",
|
|
139
|
+
"-c",
|
|
140
|
+
EXEC_FUNC_SCRIPT.replace("__ENTRYPOINT__", "python"),
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
# The default home directory for the MPI user.
|
|
144
|
+
DEFAULT_MPI_USER_HOME = os.getenv("DEFAULT_MPI_USER_HOME", "/home/mpiuser")
|
|
145
|
+
|
|
146
|
+
# The default command for the OpenMPI CustomTrainer.
|
|
147
|
+
MPI_COMMAND = (
|
|
148
|
+
"mpirun",
|
|
149
|
+
"--hostfile",
|
|
150
|
+
"/etc/mpi/hostfile",
|
|
151
|
+
*DEFAULT_COMMAND,
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
# The default name for the ClusterTrainingRuntime.
|
|
155
|
+
DEFAULT_TRAINING_RUNTIME = os.getenv("DEFAULT_TRAINING_RUNTIME", "torch-distributed")
|
|
156
|
+
|
|
157
|
+
# The default container command for the Torch CustomTrainer
|
|
158
|
+
TORCH_COMMAND = (
|
|
159
|
+
"bash",
|
|
160
|
+
"-c",
|
|
161
|
+
EXEC_FUNC_SCRIPT.replace("__ENTRYPOINT__", "torchrun"),
|
|
162
|
+
)
|
|
163
|
+
# The Torch env name for the number of procs per node (e.g. number of GPUs per Pod).
|
|
164
|
+
TORCH_ENV_NUM_PROC_PER_NODE = "PET_NPROC_PER_NODE"
|
|
165
|
+
|
|
166
|
+
# The default command for the TorchTune BuiltinTrainer.
|
|
167
|
+
TORCH_TUNE_COMMAND = ("tune", "run")
|
|
168
|
+
|
|
169
|
+
# The Instruct Datasets class in torchtune
|
|
170
|
+
TORCH_TUNE_INSTRUCT_DATASET = "torchtune.datasets.instruct_dataset"
|
|
171
|
+
|
|
172
|
+
# Default container images for each framework (used as fallback when runtime not provided)
|
|
173
|
+
DEFAULT_FRAMEWORK_IMAGES = {
|
|
174
|
+
"torch": "pytorch/pytorch:2.7.1-cuda12.8-cudnn9-runtime",
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
# The length of the UUID suffix for auto-generated job names.
|
|
178
|
+
# Total name length = 1 (random letter) + 11 (UUID hex) = 12 characters
|
|
179
|
+
JOB_NAME_UUID_LENGTH = 11
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Copyright 2025 The Kubeflow Authors.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""Training options for the Viettel Cloud AI Platform SDK.
|
|
16
|
+
|
|
17
|
+
All options are available from this single import location:
|
|
18
|
+
from viettelcloud.aiplatform.trainer.options import Name, Labels, PodTemplateOverrides, ...
|
|
19
|
+
|
|
20
|
+
Options self-validate their backend compatibility at runtime.
|
|
21
|
+
Check each option's docstring for supported backends.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from viettelcloud.aiplatform.trainer.options.common import Name
|
|
25
|
+
from viettelcloud.aiplatform.trainer.options.kubernetes import (
|
|
26
|
+
Annotations,
|
|
27
|
+
ContainerOverride,
|
|
28
|
+
Labels,
|
|
29
|
+
PodSpecOverride,
|
|
30
|
+
PodTemplateOverride,
|
|
31
|
+
PodTemplateOverrides,
|
|
32
|
+
SpecAnnotations,
|
|
33
|
+
SpecLabels,
|
|
34
|
+
TrainerArgs,
|
|
35
|
+
TrainerCommand,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
__all__ = [
|
|
39
|
+
# Common options (all backends)
|
|
40
|
+
"Name",
|
|
41
|
+
# Kubernetes options
|
|
42
|
+
"Annotations",
|
|
43
|
+
"ContainerOverride",
|
|
44
|
+
"Labels",
|
|
45
|
+
"PodSpecOverride",
|
|
46
|
+
"PodTemplateOverride",
|
|
47
|
+
"PodTemplateOverrides",
|
|
48
|
+
"SpecAnnotations",
|
|
49
|
+
"SpecLabels",
|
|
50
|
+
"TrainerArgs",
|
|
51
|
+
"TrainerCommand",
|
|
52
|
+
]
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Copyright 2025 The Kubeflow Authors.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""Common options and helper classes used across multiple backends."""
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from typing import Any, Optional, Union
|
|
19
|
+
|
|
20
|
+
from viettelcloud.aiplatform.trainer.backends.base import RuntimeBackend
|
|
21
|
+
from viettelcloud.aiplatform.trainer.types.types import (
|
|
22
|
+
BuiltinTrainer,
|
|
23
|
+
CustomTrainer,
|
|
24
|
+
CustomTrainerContainer,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class Name:
    """Set a custom name for the TrainJob resource.

    This option works with all backends: it performs no backend-specific checks.

    Args:
        name: Custom name for the job. Must be a valid identifier; the value is
            written as-is and is not validated by this option.
    """

    name: str

    def __call__(
        self,
        job_spec: dict[str, Any],
        trainer: Optional[Union[BuiltinTrainer, CustomTrainer, CustomTrainerContainer]],
        backend: RuntimeBackend,
    ) -> None:
        """Apply the custom name to the job specification in place.

        Args:
            job_spec: Job specification dictionary to modify. Its ``metadata``
                mapping is created when it is missing or ``None``.
            trainer: Optional trainer instance for context; unused by this option.
            backend: Backend instance for context; unused by this option.
        """
        metadata = job_spec.get("metadata")
        if metadata is None:
            # A spec may carry an explicit ``"metadata": None``; ``setdefault`` would
            # hand back that None and the assignment below would raise TypeError.
            metadata = job_spec["metadata"] = {}
        metadata["name"] = self.name
|