xpk 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +3 -3
- xpk/commands/cluster.py +22 -1
- xpk/commands/cluster_gcluster.py +27 -0
- xpk/commands/common.py +12 -5
- xpk/commands/kjob_common.py +4 -1
- xpk/commands/run.py +2 -2
- xpk/commands/shell.py +2 -2
- xpk/commands/storage.py +10 -3
- xpk/commands/workload.py +64 -27
- xpk/core/blueprint/blueprint_generator.py +108 -40
- xpk/core/capacity.py +66 -6
- xpk/core/cluster.py +165 -7
- xpk/core/config.py +1 -65
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +145 -72
- xpk/core/jobset.py +143 -0
- xpk/core/kjob.py +2 -6
- xpk/core/kueue.py +165 -5
- xpk/core/nodepool.py +17 -4
- xpk/core/pathways.py +1 -2
- xpk/core/storage.py +1 -95
- xpk/core/system_characteristics.py +1 -1
- xpk/core/workload.py +0 -44
- xpk/core/workload_decorators/rdma_decorator.py +2 -0
- xpk/core/workload_decorators/tcpx_decorator.py +10 -4
- xpk/core/workload_decorators/tcpxo_decorator.py +7 -0
- xpk/parser/cluster.py +23 -7
- xpk/parser/storage.py +2 -2
- xpk/parser/workload.py +21 -3
- {xpk-0.9.0.dist-info → xpk-0.10.0.dist-info}/METADATA +45 -6
- {xpk-0.9.0.dist-info → xpk-0.10.0.dist-info}/RECORD +35 -34
- {xpk-0.9.0.dist-info → xpk-0.10.0.dist-info}/WHEEL +0 -0
- {xpk-0.9.0.dist-info → xpk-0.10.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.9.0.dist-info → xpk-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.9.0.dist-info → xpk-0.10.0.dist-info}/top_level.txt +0 -0
xpk/commands/batch.py
CHANGED
|
@@ -18,7 +18,7 @@ import re
|
|
|
18
18
|
from argparse import Namespace
|
|
19
19
|
|
|
20
20
|
from ..core.cluster import (
|
|
21
|
-
|
|
21
|
+
setup_k8s_service_accounts,
|
|
22
22
|
get_cluster_credentials,
|
|
23
23
|
)
|
|
24
24
|
from ..core.commands import run_command_for_value
|
|
@@ -54,14 +54,14 @@ def batch(args: Namespace) -> None:
|
|
|
54
54
|
err_code = prepare_kjob(args)
|
|
55
55
|
if err_code > 0:
|
|
56
56
|
xpk_exit(err_code)
|
|
57
|
-
|
|
57
|
+
setup_k8s_service_accounts()
|
|
58
58
|
|
|
59
59
|
submit_job(args)
|
|
60
60
|
|
|
61
61
|
|
|
62
62
|
def submit_job(args: Namespace) -> None:
|
|
63
63
|
|
|
64
|
-
|
|
64
|
+
setup_k8s_service_accounts()
|
|
65
65
|
|
|
66
66
|
cmd = (
|
|
67
67
|
'kubectl kjob create slurm'
|
xpk/commands/cluster.py
CHANGED
|
@@ -31,6 +31,7 @@ from ..core.cluster import (
|
|
|
31
31
|
update_cluster_with_gcsfuse_driver_if_necessary,
|
|
32
32
|
update_cluster_with_parallelstore_driver_if_necessary,
|
|
33
33
|
update_cluster_with_pd_driver_if_necessary,
|
|
34
|
+
update_cluster_with_lustre_driver_if_necessary,
|
|
34
35
|
update_cluster_with_workload_identity_if_necessary,
|
|
35
36
|
)
|
|
36
37
|
from ..core.cluster_private import authorize_private_cluster_access_if_necessary
|
|
@@ -42,12 +43,14 @@ from ..core.gcloud_context import (
|
|
|
42
43
|
get_gke_server_config,
|
|
43
44
|
zone_to_region,
|
|
44
45
|
)
|
|
46
|
+
from ..core.jobset import update_jobset_resources_if_necessary
|
|
45
47
|
from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
|
|
46
48
|
from ..core.kueue import (
|
|
47
49
|
cluster_preheat_yml,
|
|
48
50
|
install_kueue_crs,
|
|
49
51
|
install_kueue_on_cluster,
|
|
50
52
|
wait_for_kueue_available,
|
|
53
|
+
update_kueue_resources_if_necessary,
|
|
51
54
|
)
|
|
52
55
|
from ..core.nap import enable_autoprovisioning_on_cluster
|
|
53
56
|
from ..core.network import (
|
|
@@ -170,7 +173,6 @@ def cluster_adapt(args) -> None:
|
|
|
170
173
|
install_kueue(args, system, autoprovisioning_config)
|
|
171
174
|
|
|
172
175
|
install_kjob(args)
|
|
173
|
-
|
|
174
176
|
if system.accelerator_type == AcceleratorType['GPU']:
|
|
175
177
|
prepare_gpus(args, system)
|
|
176
178
|
|
|
@@ -308,6 +310,9 @@ def cluster_create(args) -> None:
|
|
|
308
310
|
set_jobset_on_cluster_code = set_jobset_on_cluster(args)
|
|
309
311
|
if set_jobset_on_cluster_code != 0:
|
|
310
312
|
xpk_exit(set_jobset_on_cluster_code)
|
|
313
|
+
update_jobset_resources_code = update_jobset_resources_if_necessary(args)
|
|
314
|
+
if update_jobset_resources_code != 0:
|
|
315
|
+
xpk_exit(update_jobset_resources_code)
|
|
311
316
|
|
|
312
317
|
set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
|
|
313
318
|
if set_pathways_job_on_cluster_code != 0:
|
|
@@ -879,6 +884,10 @@ def run_gke_cluster_create_command(
|
|
|
879
884
|
if args.enable_pd_csi_driver:
|
|
880
885
|
addons.append('GcePersistentDiskCsiDriver')
|
|
881
886
|
|
|
887
|
+
if args.enable_lustre_csi_driver:
|
|
888
|
+
addons.append('LustreCsiDriver')
|
|
889
|
+
command += ' --enable-legacy-lustre-port'
|
|
890
|
+
|
|
882
891
|
if hasattr(args, 'enable_mtc') and args.enable_mtc:
|
|
883
892
|
addons.append('HighScaleCheckpointing')
|
|
884
893
|
|
|
@@ -922,6 +931,13 @@ def install_storage_csis(args):
|
|
|
922
931
|
if update_cluster_command_code != 0:
|
|
923
932
|
xpk_exit(update_cluster_command_code)
|
|
924
933
|
|
|
934
|
+
if args.enable_lustre_csi_driver:
|
|
935
|
+
update_cluster_command_code = (
|
|
936
|
+
update_cluster_with_lustre_driver_if_necessary(args)
|
|
937
|
+
)
|
|
938
|
+
if update_cluster_command_code != 0:
|
|
939
|
+
xpk_exit(update_cluster_command_code)
|
|
940
|
+
|
|
925
941
|
|
|
926
942
|
def install_kjob(args):
|
|
927
943
|
xpk_print('Verifying kjob installation')
|
|
@@ -957,6 +973,11 @@ def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
|
|
|
957
973
|
if enable_kueue_credentials_code != 0:
|
|
958
974
|
xpk_exit(enable_kueue_credentials_code)
|
|
959
975
|
|
|
976
|
+
xpk_print('Update Kueue Controller Manager resources')
|
|
977
|
+
update_kueue_resources_code = update_kueue_resources_if_necessary(args)
|
|
978
|
+
if update_kueue_resources_code != 0:
|
|
979
|
+
xpk_exit(update_kueue_resources_code)
|
|
980
|
+
|
|
960
981
|
|
|
961
982
|
def prepare_gpus(args, system: SystemCharacteristics):
|
|
962
983
|
xpk_print('Installing NCCL Plugin for cluster')
|
xpk/commands/cluster_gcluster.py
CHANGED
|
@@ -37,6 +37,7 @@ from ..utils.console import xpk_exit, xpk_print
|
|
|
37
37
|
from ..utils.file import ensure_directory_exists
|
|
38
38
|
from ..utils.network import all_IPs_cidr
|
|
39
39
|
from ..utils.objects import hash_string
|
|
40
|
+
from ..core.capacity import get_reservation_maintenance_interval, get_reservation_placement_policy
|
|
40
41
|
|
|
41
42
|
blueprints_path = os.path.abspath('xpkclusters/blueprints')
|
|
42
43
|
gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out')
|
|
@@ -234,6 +235,30 @@ def generate_blueprint(
|
|
|
234
235
|
if args.device_type in supported_device_types:
|
|
235
236
|
if args.device_type == a3mega_device_type:
|
|
236
237
|
num_nodes = args.num_nodes if not args.num_nodes is None else 2
|
|
238
|
+
|
|
239
|
+
maintenance_interval = (
|
|
240
|
+
get_reservation_maintenance_interval(
|
|
241
|
+
args.reservation, args.zone, args.project
|
|
242
|
+
)
|
|
243
|
+
if args.reservation is not None
|
|
244
|
+
else 'PERIODIC'
|
|
245
|
+
)
|
|
246
|
+
placement_policy_name = (
|
|
247
|
+
get_reservation_placement_policy(
|
|
248
|
+
args.reservation, args.zone, args.project
|
|
249
|
+
)
|
|
250
|
+
if args.reservation is not None
|
|
251
|
+
else None
|
|
252
|
+
)
|
|
253
|
+
placement_policy = (
|
|
254
|
+
{
|
|
255
|
+
'type': 'COMPACT',
|
|
256
|
+
'name': placement_policy_name.split('/')[-1],
|
|
257
|
+
}
|
|
258
|
+
if placement_policy_name is not None
|
|
259
|
+
and len(placement_policy_name) > 0
|
|
260
|
+
else None
|
|
261
|
+
)
|
|
237
262
|
return bpg.generate_a3_mega_blueprint(
|
|
238
263
|
blueprint_name=blueprint_name,
|
|
239
264
|
prefix=prefix,
|
|
@@ -243,6 +268,8 @@ def generate_blueprint(
|
|
|
243
268
|
zone=args.zone,
|
|
244
269
|
auth_cidr=all_IPs_cidr,
|
|
245
270
|
num_nodes=num_nodes,
|
|
271
|
+
reservation_maintenance_interval=maintenance_interval,
|
|
272
|
+
reservation_placement_policy=placement_policy,
|
|
246
273
|
reservation=args.reservation if args.reservation else None,
|
|
247
274
|
capacity_type=capacity_type,
|
|
248
275
|
system_node_pool_machine_type=args.default_pool_cpu_machine_type,
|
xpk/commands/common.py
CHANGED
|
@@ -15,10 +15,12 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
from ..core.commands import run_command_with_updates_retry
|
|
18
|
-
from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
|
|
19
18
|
from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
|
|
20
19
|
from ..core.gcloud_context import zone_to_region
|
|
21
20
|
from ..utils.console import xpk_print, xpk_exit
|
|
21
|
+
from ..core.system_characteristics import (
|
|
22
|
+
SystemCharacteristics,
|
|
23
|
+
)
|
|
22
24
|
|
|
23
25
|
|
|
24
26
|
def set_cluster_command(args) -> int:
|
|
@@ -47,7 +49,11 @@ def set_cluster_command(args) -> int:
|
|
|
47
49
|
return return_code
|
|
48
50
|
|
|
49
51
|
|
|
50
|
-
def is_TAS_possible(args) -> bool:
|
|
52
|
+
def is_TAS_possible(
|
|
53
|
+
system_characteristics: SystemCharacteristics,
|
|
54
|
+
capacity_type: CapacityType,
|
|
55
|
+
flex: bool,
|
|
56
|
+
) -> bool:
|
|
51
57
|
"""Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
|
|
52
58
|
|
|
53
59
|
Args:
|
|
@@ -56,8 +62,6 @@ def is_TAS_possible(args) -> bool:
|
|
|
56
62
|
Returns:
|
|
57
63
|
True if possible and False otherwise.
|
|
58
64
|
"""
|
|
59
|
-
system_characteristics = get_cluster_system_characteristics(args)
|
|
60
|
-
capacity_type = get_cluster_capacity_type(args)
|
|
61
65
|
|
|
62
66
|
if system_characteristics is None:
|
|
63
67
|
xpk_print('system_characteristics data was not found in configmaps.')
|
|
@@ -67,9 +71,12 @@ def is_TAS_possible(args) -> bool:
|
|
|
67
71
|
xpk_print('capacity_type data was not found in configmaps.')
|
|
68
72
|
xpk_exit(1)
|
|
69
73
|
|
|
74
|
+
if flex:
|
|
75
|
+
return False
|
|
76
|
+
|
|
70
77
|
if (
|
|
71
78
|
system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
|
|
72
|
-
and capacity_type
|
|
79
|
+
and capacity_type != CapacityType.RESERVATION
|
|
73
80
|
):
|
|
74
81
|
return False
|
|
75
82
|
|
xpk/commands/kjob_common.py
CHANGED
|
@@ -27,6 +27,7 @@ from ..core.kjob import (
|
|
|
27
27
|
Kueue_TAS_annotation,
|
|
28
28
|
)
|
|
29
29
|
from .common import is_TAS_possible
|
|
30
|
+
from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
|
|
30
31
|
|
|
31
32
|
|
|
32
33
|
def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
|
|
@@ -50,7 +51,9 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
|
|
|
50
51
|
|
|
51
52
|
|
|
52
53
|
def add_TAS_annotations_to_command(args, cmd: str) -> str:
|
|
53
|
-
|
|
54
|
+
system_characteristics = get_cluster_system_characteristics(args)
|
|
55
|
+
capacity_type = get_cluster_capacity_type(args)
|
|
56
|
+
if is_TAS_possible(system_characteristics, capacity_type, flex=False):
|
|
54
57
|
cmd += f" --pod-template-annotation {Kueue_TAS_annotation}"
|
|
55
58
|
|
|
56
59
|
return cmd
|
xpk/commands/run.py
CHANGED
|
@@ -17,7 +17,7 @@ limitations under the License.
|
|
|
17
17
|
from argparse import Namespace
|
|
18
18
|
|
|
19
19
|
from ..core.cluster import (
|
|
20
|
-
|
|
20
|
+
setup_k8s_service_accounts,
|
|
21
21
|
get_cluster_credentials,
|
|
22
22
|
)
|
|
23
23
|
from ..core.commands import run_command_with_full_controls
|
|
@@ -53,7 +53,7 @@ def run(args: Namespace) -> None:
|
|
|
53
53
|
err_code = prepare_kjob(args)
|
|
54
54
|
if err_code > 0:
|
|
55
55
|
xpk_exit(err_code)
|
|
56
|
-
|
|
56
|
+
setup_k8s_service_accounts()
|
|
57
57
|
|
|
58
58
|
submit_job(args)
|
|
59
59
|
|
xpk/commands/shell.py
CHANGED
|
@@ -12,7 +12,7 @@ limitations under the License.
|
|
|
12
12
|
"""
|
|
13
13
|
|
|
14
14
|
from ..core.commands import run_command_with_full_controls, run_command_for_value, run_command_with_updates
|
|
15
|
-
from ..core.cluster import get_cluster_credentials, add_zone_and_project,
|
|
15
|
+
from ..core.cluster import get_cluster_credentials, add_zone_and_project, setup_k8s_service_accounts
|
|
16
16
|
from ..utils.console import xpk_exit, xpk_print
|
|
17
17
|
from argparse import Namespace
|
|
18
18
|
|
|
@@ -82,7 +82,7 @@ def connect_to_new_interactive_shell(args: Namespace) -> int:
|
|
|
82
82
|
err_code = prepare_kjob(args)
|
|
83
83
|
if err_code > 0:
|
|
84
84
|
xpk_exit(err_code)
|
|
85
|
-
|
|
85
|
+
setup_k8s_service_accounts()
|
|
86
86
|
|
|
87
87
|
cmd = (
|
|
88
88
|
'kubectl-kjob create interactive --profile'
|
xpk/commands/storage.py
CHANGED
|
@@ -29,6 +29,7 @@ from ..core.cluster import (
|
|
|
29
29
|
setup_k8s_env,
|
|
30
30
|
update_cluster_with_parallelstore_driver_if_necessary,
|
|
31
31
|
update_cluster_with_pd_driver_if_necessary,
|
|
32
|
+
update_cluster_with_lustre_driver_if_necessary,
|
|
32
33
|
update_cluster_with_gcpfilestore_driver_if_necessary,
|
|
33
34
|
update_cluster_with_gcsfuse_driver_if_necessary,
|
|
34
35
|
update_cluster_with_workload_identity_if_necessary,
|
|
@@ -45,6 +46,7 @@ from ..core.storage import (
|
|
|
45
46
|
GCS_FUSE_TYPE,
|
|
46
47
|
GCE_PD_TYPE,
|
|
47
48
|
PARALLELSTORE_TYPE,
|
|
49
|
+
LUSTRE_TYPE,
|
|
48
50
|
STORAGE_CRD_PLURAL,
|
|
49
51
|
XPK_API_GROUP_NAME,
|
|
50
52
|
XPK_API_GROUP_VERSION,
|
|
@@ -183,11 +185,11 @@ def storage_attach(args: Namespace) -> None:
|
|
|
183
185
|
args.prefetch_metadata,
|
|
184
186
|
)
|
|
185
187
|
|
|
186
|
-
elif args.type in [PARALLELSTORE_TYPE, GCE_PD_TYPE]:
|
|
188
|
+
elif args.type in [PARALLELSTORE_TYPE, GCE_PD_TYPE, LUSTRE_TYPE]:
|
|
187
189
|
if args.manifest is None:
|
|
188
190
|
xpk_print(
|
|
189
|
-
"Parallelstore and
|
|
190
|
-
" --manifest"
|
|
191
|
+
"Parallelstore, PersistentDisk, and Lustre are currently supported"
|
|
192
|
+
" only with --manifest"
|
|
191
193
|
)
|
|
192
194
|
xpk_exit(1)
|
|
193
195
|
|
|
@@ -234,6 +236,11 @@ def enable_csi_drivers_if_necessary(args: Namespace) -> None:
|
|
|
234
236
|
if return_code > 0:
|
|
235
237
|
xpk_exit(return_code)
|
|
236
238
|
|
|
239
|
+
if args.type == LUSTRE_TYPE:
|
|
240
|
+
return_code = update_cluster_with_lustre_driver_if_necessary(args)
|
|
241
|
+
if return_code > 0:
|
|
242
|
+
xpk_exit(return_code)
|
|
243
|
+
|
|
237
244
|
|
|
238
245
|
def storage_list(args: Namespace) -> None:
|
|
239
246
|
k8s_api_client = setup_k8s_env(args)
|
xpk/commands/workload.py
CHANGED
|
@@ -14,23 +14,25 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
from ..core.blueprint.blueprint_generator import (
|
|
18
|
+
a3high_device_type,
|
|
19
|
+
a3mega_device_type,
|
|
20
|
+
a3ultra_device_type,
|
|
21
|
+
a4_device_type,
|
|
22
|
+
)
|
|
17
23
|
from ..core.cluster import (
|
|
18
24
|
XPK_SA,
|
|
19
|
-
|
|
25
|
+
setup_k8s_service_accounts,
|
|
20
26
|
get_cluster_credentials,
|
|
21
27
|
setup_k8s_env,
|
|
22
28
|
)
|
|
23
29
|
from ..core.commands import run_command_with_updates, run_commands
|
|
24
|
-
from ..core.config import (
|
|
25
|
-
VERTEX_TENSORBOARD_FEATURE_FLAG,
|
|
26
|
-
XPK_CURRENT_VERSION,
|
|
27
|
-
parse_env_config,
|
|
28
|
-
)
|
|
30
|
+
from ..core.config import (VERTEX_TENSORBOARD_FEATURE_FLAG, XPK_CURRENT_VERSION)
|
|
29
31
|
from ..core.docker_container import (
|
|
30
32
|
get_main_container_docker_image,
|
|
31
33
|
get_user_workload_container,
|
|
32
34
|
)
|
|
33
|
-
from ..core.docker_resources import get_volumes
|
|
35
|
+
from ..core.docker_resources import get_volumes, parse_env_config
|
|
34
36
|
from ..core.gcloud_context import add_zone_and_project
|
|
35
37
|
from ..core.kueue import LOCAL_QUEUE_NAME
|
|
36
38
|
from ..core.monitoring import get_gke_outlier_dashboard
|
|
@@ -50,6 +52,10 @@ from ..core.pathways import (
|
|
|
50
52
|
get_user_workload_for_pathways,
|
|
51
53
|
try_to_delete_pathwaysjob_first,
|
|
52
54
|
)
|
|
55
|
+
from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
|
|
56
|
+
from ..core.capacity import (
|
|
57
|
+
CapacityType,
|
|
58
|
+
)
|
|
53
59
|
from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
|
|
54
60
|
from ..core.scheduling import (
|
|
55
61
|
check_if_workload_can_schedule,
|
|
@@ -65,6 +71,7 @@ from ..core.storage import (
|
|
|
65
71
|
GCP_FILESTORE_TYPE,
|
|
66
72
|
GCS_FUSE_TYPE,
|
|
67
73
|
PARALLELSTORE_TYPE,
|
|
74
|
+
LUSTRE_TYPE,
|
|
68
75
|
Storage,
|
|
69
76
|
add_bucket_iam_members,
|
|
70
77
|
get_storage_annotations,
|
|
@@ -76,7 +83,6 @@ from ..core.system_characteristics import (
|
|
|
76
83
|
)
|
|
77
84
|
from ..core.vertex import create_vertex_experiment
|
|
78
85
|
from ..core.workload import (
|
|
79
|
-
add_gpu_rxdm_container,
|
|
80
86
|
check_if_workload_exists,
|
|
81
87
|
get_workload_list,
|
|
82
88
|
wait_for_job_completion,
|
|
@@ -85,12 +91,13 @@ from ..core.workload import (
|
|
|
85
91
|
from ..core.workload_decorators import (
|
|
86
92
|
rdma_decorator,
|
|
87
93
|
storage_decorator,
|
|
94
|
+
tcpx_decorator,
|
|
88
95
|
tcpxo_decorator,
|
|
89
96
|
)
|
|
90
97
|
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
91
98
|
from ..utils.file import write_tmp_file
|
|
92
|
-
from .common import is_TAS_possible
|
|
93
99
|
from . import cluster_gcluster
|
|
100
|
+
from .common import is_TAS_possible
|
|
94
101
|
|
|
95
102
|
WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
|
|
96
103
|
kind: JobSet
|
|
@@ -123,6 +130,8 @@ spec:
|
|
|
123
130
|
{storage_annotations}
|
|
124
131
|
spec:
|
|
125
132
|
schedulerName: {args.scheduler}
|
|
133
|
+
imagePullSecrets:
|
|
134
|
+
- name: {args.docker_image_pull_secret}
|
|
126
135
|
restartPolicy: Never
|
|
127
136
|
{affinity}
|
|
128
137
|
nodeSelector:
|
|
@@ -136,6 +145,8 @@ spec:
|
|
|
136
145
|
containers:
|
|
137
146
|
{container}
|
|
138
147
|
serviceAccountName: {service_account}
|
|
148
|
+
tolerations:
|
|
149
|
+
{tpu_toleration}
|
|
139
150
|
volumes:
|
|
140
151
|
{volumes}
|
|
141
152
|
"""
|
|
@@ -175,6 +186,8 @@ spec:
|
|
|
175
186
|
{gpu_scheduler}
|
|
176
187
|
priorityClassName: {args.priority}
|
|
177
188
|
restartPolicy: Never
|
|
189
|
+
imagePullSecrets:
|
|
190
|
+
- name: {args.docker_image_pull_secret}
|
|
178
191
|
hostNetwork: true
|
|
179
192
|
dnsPolicy: ClusterFirstWithHostNet
|
|
180
193
|
terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
|
|
@@ -213,11 +226,12 @@ spec:
|
|
|
213
226
|
metadata:
|
|
214
227
|
labels:
|
|
215
228
|
xpk.google.com/workload: {args.workload}
|
|
216
|
-
annotations:
|
|
217
|
-
{kueue_TAS_annotation}
|
|
229
|
+
annotations: {annotations}
|
|
218
230
|
spec:
|
|
219
231
|
priorityClassName: {args.priority}
|
|
220
232
|
restartPolicy: Never
|
|
233
|
+
imagePullSecrets:
|
|
234
|
+
- name: {args.docker_image_pull_secret}
|
|
221
235
|
dnsPolicy: ClusterFirstWithHostNet
|
|
222
236
|
terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
|
|
223
237
|
serviceAccountName: {service_account}
|
|
@@ -291,7 +305,7 @@ def workload_create(args) -> None:
|
|
|
291
305
|
0 if successful and 1 otherwise.
|
|
292
306
|
"""
|
|
293
307
|
k8s_api_client = setup_k8s_env(args)
|
|
294
|
-
|
|
308
|
+
setup_k8s_service_accounts()
|
|
295
309
|
|
|
296
310
|
workload_exists = check_if_workload_exists(args)
|
|
297
311
|
|
|
@@ -347,7 +361,7 @@ def workload_create(args) -> None:
|
|
|
347
361
|
if not tensorboard_config:
|
|
348
362
|
xpk_exit(1)
|
|
349
363
|
|
|
350
|
-
parse_env_config(args, tensorboard_config
|
|
364
|
+
parse_env_config(args, tensorboard_config)
|
|
351
365
|
|
|
352
366
|
autoprovisioning_args = ''
|
|
353
367
|
autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
|
|
@@ -382,6 +396,9 @@ def workload_create(args) -> None:
|
|
|
382
396
|
pd_storages: list[Storage] = list(
|
|
383
397
|
filter(lambda storage: storage.type == GCE_PD_TYPE, storages)
|
|
384
398
|
)
|
|
399
|
+
lustre_storages: list[Storage] = list(
|
|
400
|
+
filter(lambda storage: storage.type == LUSTRE_TYPE, storages)
|
|
401
|
+
)
|
|
385
402
|
if len(gcs_fuse_storages) > 0:
|
|
386
403
|
service_account = XPK_SA
|
|
387
404
|
xpk_print(f'Detected gcsfuse Storages to add: {gcs_fuse_storages}')
|
|
@@ -411,11 +428,18 @@ def workload_create(args) -> None:
|
|
|
411
428
|
else:
|
|
412
429
|
xpk_print('No gce persistent disk instances to add detected.')
|
|
413
430
|
|
|
431
|
+
if len(lustre_storages) > 0:
|
|
432
|
+
service_account = XPK_SA
|
|
433
|
+
xpk_print(f'Detected managed lustre instances to add: {lustre_storages}')
|
|
434
|
+
else:
|
|
435
|
+
xpk_print('No managed lustre instances to add detected.')
|
|
436
|
+
|
|
414
437
|
all_storages = (
|
|
415
438
|
gcs_fuse_storages
|
|
416
439
|
+ gcpfilestore_storages
|
|
417
440
|
+ parallelstore_storages
|
|
418
441
|
+ pd_storages
|
|
442
|
+
+ lustre_storages
|
|
419
443
|
)
|
|
420
444
|
|
|
421
445
|
# Currently failure policy rules are supported for Pathways workloads. b/408465881
|
|
@@ -447,31 +471,41 @@ def workload_create(args) -> None:
|
|
|
447
471
|
)
|
|
448
472
|
if return_code != 0:
|
|
449
473
|
xpk_exit(return_code)
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
474
|
+
system_characteristics = get_cluster_system_characteristics(args)
|
|
475
|
+
capacity_type = get_cluster_capacity_type(args)
|
|
476
|
+
|
|
477
|
+
annotations = (
|
|
478
|
+
''
|
|
479
|
+
if not is_TAS_possible(
|
|
480
|
+
system_characteristics,
|
|
481
|
+
capacity_type,
|
|
482
|
+
flex=True if capacity_type == CapacityType.FLEX_START else False,
|
|
483
|
+
)
|
|
484
|
+
else (
|
|
485
|
+
'kueue.x-k8s.io/podset-preferred-topology:'
|
|
486
|
+
' "cloud.google.com/gce-topology-host"'
|
|
487
|
+
)
|
|
454
488
|
)
|
|
455
|
-
if not is_TAS_possible(args):
|
|
456
|
-
kueue_TAS_annotation = ''
|
|
457
489
|
|
|
458
|
-
if
|
|
490
|
+
if (
|
|
491
|
+
system.device_type in cluster_gcluster.supported_device_types
|
|
492
|
+
or system.device_type == a3high_device_type
|
|
493
|
+
):
|
|
459
494
|
yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format(
|
|
460
495
|
args=args,
|
|
461
496
|
container=container,
|
|
462
497
|
service_account=XPK_SA,
|
|
463
498
|
failure_policy_rules=failure_policy_rules,
|
|
464
499
|
pod_failure_policy=pod_failure_policy,
|
|
465
|
-
|
|
500
|
+
annotations=annotations,
|
|
466
501
|
)
|
|
467
502
|
|
|
468
503
|
sub_networks = get_cluster_subnetworks(args)
|
|
469
|
-
if args.device_type ==
|
|
504
|
+
if args.device_type == a3high_device_type:
|
|
505
|
+
yml_string = tcpx_decorator.decorate_jobset(yml_string)
|
|
506
|
+
elif args.device_type == a3mega_device_type:
|
|
470
507
|
yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)
|
|
471
|
-
elif args.device_type in [
|
|
472
|
-
cluster_gcluster.a3ultra_device_type,
|
|
473
|
-
cluster_gcluster.a4_device_type,
|
|
474
|
-
]:
|
|
508
|
+
elif args.device_type in [a3ultra_device_type, a4_device_type]:
|
|
475
509
|
yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
|
|
476
510
|
|
|
477
511
|
if all_storages:
|
|
@@ -489,7 +523,6 @@ def workload_create(args) -> None:
|
|
|
489
523
|
failure_policy_rules=failure_policy_rules,
|
|
490
524
|
pod_failure_policy=pod_failure_policy,
|
|
491
525
|
)
|
|
492
|
-
yml_string = add_gpu_rxdm_container(yml_string, system, all_storages)
|
|
493
526
|
|
|
494
527
|
elif args.use_pathways and ensure_pathways_workload_prerequisites(
|
|
495
528
|
args, system
|
|
@@ -526,6 +559,10 @@ def workload_create(args) -> None:
|
|
|
526
559
|
get_storage_annotations(all_storages)
|
|
527
560
|
),
|
|
528
561
|
service_account=service_account,
|
|
562
|
+
tpu_toleration="""
|
|
563
|
+
- operator: "Exists"
|
|
564
|
+
key: google.com/tpu
|
|
565
|
+
""" if system.accelerator_type == AcceleratorType['TPU'] else '',
|
|
529
566
|
failure_policy_rules=failure_policy_rules,
|
|
530
567
|
pod_failure_policy=pod_failure_policy,
|
|
531
568
|
)
|