xpk 0.8.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. xpk/commands/batch.py +5 -6
  2. xpk/commands/cluster.py +246 -73
  3. xpk/commands/cluster_gcluster.py +27 -0
  4. xpk/commands/common.py +40 -1
  5. xpk/commands/kjob_common.py +13 -1
  6. xpk/commands/run.py +4 -5
  7. xpk/commands/shell.py +2 -2
  8. xpk/commands/storage.py +24 -6
  9. xpk/commands/workload.py +66 -27
  10. xpk/core/blueprint/blueprint_generator.py +115 -47
  11. xpk/core/capacity.py +66 -6
  12. xpk/core/cluster.py +282 -13
  13. xpk/core/config.py +1 -65
  14. xpk/core/docker_manager.py +1 -1
  15. xpk/core/docker_resources.py +145 -72
  16. xpk/core/filestore.py +2 -6
  17. xpk/core/gcsfuse.py +22 -4
  18. xpk/core/jobset.py +143 -0
  19. xpk/core/kjob.py +21 -18
  20. xpk/core/kueue.py +194 -4
  21. xpk/core/mtc.py +195 -0
  22. xpk/core/network.py +23 -1
  23. xpk/core/nodepool.py +17 -4
  24. xpk/core/pathways.py +2 -3
  25. xpk/core/resources.py +21 -0
  26. xpk/core/storage.py +1 -95
  27. xpk/core/system_characteristics.py +1 -1
  28. xpk/core/workload.py +1 -45
  29. xpk/core/workload_decorators/rdma_decorator.py +8 -10
  30. xpk/core/workload_decorators/tcpx_decorator.py +185 -0
  31. xpk/core/workload_decorators/tcpxo_decorator.py +22 -14
  32. xpk/parser/cluster.py +589 -389
  33. xpk/parser/storage.py +12 -3
  34. xpk/parser/workload.py +21 -3
  35. xpk/utils/kubectl.py +4 -1
  36. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/METADATA +178 -96
  37. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/RECORD +41 -38
  38. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/WHEEL +1 -1
  39. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/entry_points.txt +0 -0
  40. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/licenses/LICENSE +0 -0
  41. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/top_level.txt +0 -0
xpk/commands/batch.py CHANGED
@@ -18,7 +18,7 @@ import re
18
18
  from argparse import Namespace
19
19
 
20
20
  from ..core.cluster import (
21
- create_xpk_k8s_service_account,
21
+ setup_k8s_service_accounts,
22
22
  get_cluster_credentials,
23
23
  )
24
24
  from ..core.commands import run_command_for_value
@@ -26,14 +26,13 @@ from ..core.gcloud_context import add_zone_and_project
26
26
  from ..core.kjob import (
27
27
  AppProfileDefaults,
28
28
  JobTemplateDefaults,
29
- Kueue_TAS_annotation,
30
29
  get_storage_annotations,
31
30
  prepare_kjob,
32
31
  )
33
32
  from ..core.kueue import LOCAL_QUEUE_NAME
34
33
  from ..utils.console import xpk_exit, xpk_print
35
34
  from .kind import set_local_cluster_command
36
- from .kjob_common import add_gpu_networking_annotations_to_command
35
+ from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
37
36
 
38
37
 
39
38
  def batch(args: Namespace) -> None:
@@ -55,24 +54,24 @@ def batch(args: Namespace) -> None:
55
54
  err_code = prepare_kjob(args)
56
55
  if err_code > 0:
57
56
  xpk_exit(err_code)
58
- create_xpk_k8s_service_account()
57
+ setup_k8s_service_accounts()
59
58
 
60
59
  submit_job(args)
61
60
 
62
61
 
63
62
  def submit_job(args: Namespace) -> None:
64
63
 
65
- create_xpk_k8s_service_account()
64
+ setup_k8s_service_accounts()
66
65
 
67
66
  cmd = (
68
67
  'kubectl kjob create slurm'
69
68
  f' --profile {AppProfileDefaults.NAME.value}'
70
69
  f' --localqueue {LOCAL_QUEUE_NAME}'
71
- f' --pod-template-annotation {Kueue_TAS_annotation}'
72
70
  f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
73
71
  ' --first-node-ip'
74
72
  )
75
73
  cmd = add_gpu_networking_annotations_to_command(args, cmd)
74
+ cmd = add_TAS_annotations_to_command(args, cmd)
76
75
 
77
76
  for annotation in get_storage_annotations(args):
78
77
  cmd += f' --pod-template-annotation {annotation}'
xpk/commands/cluster.py CHANGED
@@ -16,19 +16,23 @@ limitations under the License.
16
16
 
17
17
  from tabulate import tabulate
18
18
 
19
- from ..core.capacity import H100_DEVICE_TYPE
19
+ from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE
20
20
  from ..core.cluster import (
21
21
  get_all_clusters_programmatic,
22
22
  get_cluster_credentials,
23
23
  install_nccl_on_cluster,
24
+ install_nri_on_cluster,
24
25
  set_jobset_on_cluster,
25
26
  set_pathways_job_on_cluster,
26
27
  setup_k8s_env,
27
- update_cluster_with_gcsfuse_driver_if_necessary,
28
- update_cluster_with_workload_identity_if_necessary,
28
+ disable_mglru_on_cluster,
29
+ count_nodes_on_cluster,
29
30
  update_cluster_with_gcpfilestore_driver_if_necessary,
31
+ update_cluster_with_gcsfuse_driver_if_necessary,
30
32
  update_cluster_with_parallelstore_driver_if_necessary,
31
33
  update_cluster_with_pd_driver_if_necessary,
34
+ update_cluster_with_lustre_driver_if_necessary,
35
+ update_cluster_with_workload_identity_if_necessary,
32
36
  )
33
37
  from ..core.cluster_private import authorize_private_cluster_access_if_necessary
34
38
  from ..core.commands import run_command_for_value, run_command_with_updates
@@ -39,12 +43,14 @@ from ..core.gcloud_context import (
39
43
  get_gke_server_config,
40
44
  zone_to_region,
41
45
  )
46
+ from ..core.jobset import update_jobset_resources_if_necessary
42
47
  from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
43
48
  from ..core.kueue import (
44
49
  cluster_preheat_yml,
45
50
  install_kueue_crs,
46
51
  install_kueue_on_cluster,
47
52
  wait_for_kueue_available,
53
+ update_kueue_resources_if_necessary,
48
54
  )
49
55
  from ..core.nap import enable_autoprovisioning_on_cluster
50
56
  from ..core.network import (
@@ -52,8 +58,12 @@ from ..core.network import (
52
58
  delete_cluster_subnets,
53
59
  set_up_cluster_network_for_a3,
54
60
  )
55
- from ..core.nodepool import get_gke_node_pool_version, run_gke_node_pool_create_command
61
+ from ..core.nodepool import (
62
+ get_gke_node_pool_version,
63
+ run_gke_node_pool_create_command,
64
+ )
56
65
  from ..core.ray import install_ray_cluster
66
+ from ..core.mtc import install_mtc_on_cluster
57
67
  from ..core.resources import create_cluster_configmaps
58
68
  from ..core.storage import install_storage_crd
59
69
  from ..core.system_characteristics import (
@@ -70,14 +80,122 @@ from . import cluster_gcluster
70
80
  from .common import set_cluster_command
71
81
 
72
82
 
83
+ def cluster_adapt(args) -> None:
84
+ """Function that performs cluster adaptation.
85
+
86
+ Args:
87
+ args: user provided arguments for running the command.
88
+ """
89
+ args.enable_pathways = False
90
+
91
+ system, return_code = get_system_characteristics(args)
92
+
93
+ if return_code > 0:
94
+ xpk_print('Fetching system characteristics failed!')
95
+ xpk_exit(return_code)
96
+
97
+ xpk_print(
98
+ f'Starting cluster adaptation for cluster {args.cluster}:', flush=True
99
+ )
100
+ add_zone_and_project(args)
101
+
102
+ if system.accelerator_type == AcceleratorType['GPU'] and not getattr(
103
+ args, 'num_nodes'
104
+ ):
105
+ xpk_print(
106
+ 'Argument --num-nodes was not provided, trying to determine number of'
107
+ ' nodes based on the available nodes in the cluster...'
108
+ )
109
+ args.num_nodes = count_nodes_on_cluster(args, system)
110
+ if args.num_nodes == 0:
111
+ xpk_print(
112
+ 'Found unexpected number of nodes. Is the --device-type correct?'
113
+ )
114
+ xpk_exit(1)
115
+ else:
116
+ xpk_print(f'Using {args.num_nodes} nodes.')
117
+
118
+ # ToDo(roshanin@) - Re-enable CloudDNS on Pathways clusters conditionally.
119
+ # Enable WorkloadIdentity if not enabled already.
120
+ if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
121
+ update_cluster_command_code = (
122
+ update_cluster_with_workload_identity_if_necessary(args)
123
+ )
124
+ if update_cluster_command_code != 0:
125
+ xpk_exit(update_cluster_command_code)
126
+
127
+ get_cluster_credentials(args)
128
+
129
+ k8s_client = setup_k8s_env(args)
130
+
131
+ install_storage_crd(k8s_client)
132
+ install_storage_csis(args)
133
+
134
+ # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
135
+ tensorboard_config = {}
136
+ if VERTEX_TENSORBOARD_FEATURE_FLAG and args.create_vertex_tensorboard:
137
+ tensorboard_config = create_vertex_tensorboard(args)
138
+ # exit if failed to create Tensorboard in Vertex AI
139
+ if not tensorboard_config:
140
+ xpk_exit(1)
141
+
142
+ # Provision node pools dynamically based on incoming workloads:
143
+ # Currently autoprovisioning is not supported with Pathways.
144
+ autoprovisioning_config = None
145
+ if args.enable_autoprovisioning:
146
+ xpk_print('Enabling Autoprovisioning')
147
+ autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
148
+ args, system
149
+ )
150
+ if return_code != 0:
151
+ xpk_exit(return_code)
152
+
153
+ xpk_print('Creating ConfigMap for cluster')
154
+ create_cluster_configmaps_code = create_cluster_configmaps(
155
+ args, system, tensorboard_config, autoprovisioning_config
156
+ )
157
+ if create_cluster_configmaps_code != 0:
158
+ xpk_exit(create_cluster_configmaps_code)
159
+
160
+ xpk_print(
161
+ 'Enabling the jobset API on our cluster, to be deprecated when Jobset is'
162
+ ' globally available'
163
+ )
164
+ set_jobset_on_cluster_code = set_jobset_on_cluster(args)
165
+ if set_jobset_on_cluster_code != 0:
166
+ xpk_exit(set_jobset_on_cluster_code)
167
+
168
+ # TODO: Uncomment when cluster_adapt will support TPU cluters
169
+ # set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
170
+ # if set_pathways_job_on_cluster_code != 0:
171
+ # xpk_exit(set_pathways_job_on_cluster_code)
172
+
173
+ install_kueue(args, system, autoprovisioning_config)
174
+
175
+ install_kjob(args)
176
+ if system.accelerator_type == AcceleratorType['GPU']:
177
+ prepare_gpus(args, system)
178
+
179
+ if args.enable_ray_cluster:
180
+ return_code = install_ray_cluster(args, system)
181
+ if return_code != 0:
182
+ xpk_print('Installation of RayCluster failed.')
183
+ xpk_exit(return_code)
184
+
185
+ xpk_print('GKE commands done! Resources are created.')
186
+ xpk_print(
187
+ 'See your GKE Cluster here:'
188
+ # pylint: disable=line-too-long
189
+ f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
190
+ )
191
+ xpk_exit(0)
192
+
193
+
73
194
  def cluster_create(args) -> None:
74
195
  """Function around cluster creation.
75
196
 
76
197
  Args:
77
198
  args: user provided arguments for running the command.
78
-
79
- Returns:
80
- 0 if successful and 1 otherwise.
81
199
  """
82
200
  system, return_code = get_system_characteristics(args)
83
201
 
@@ -127,38 +245,12 @@ def cluster_create(args) -> None:
127
245
  if update_cluster_command_code != 0:
128
246
  xpk_exit(update_cluster_command_code)
129
247
 
130
- # Enable GCSFuse CSI Driver if not enabled already.
131
- if args.enable_gcsfuse_csi_driver:
132
- update_cluster_command_code = (
133
- update_cluster_with_gcsfuse_driver_if_necessary(args)
134
- )
135
- if update_cluster_command_code != 0:
136
- xpk_exit(update_cluster_command_code)
137
-
138
- if args.enable_gcpfilestore_csi_driver:
139
- update_cluster_command_code = (
140
- update_cluster_with_gcpfilestore_driver_if_necessary(args)
141
- )
142
- if update_cluster_command_code != 0:
143
- xpk_exit(update_cluster_command_code)
144
-
145
- if args.enable_parallelstore_csi_driver:
146
- update_cluster_command_code = (
147
- update_cluster_with_parallelstore_driver_if_necessary(args)
148
- )
149
- if update_cluster_command_code != 0:
150
- xpk_exit(update_cluster_command_code)
151
-
152
- if args.enable_pd_csi_driver:
153
- update_cluster_command_code = update_cluster_with_pd_driver_if_necessary(
154
- args
155
- )
156
- if update_cluster_command_code != 0:
157
- xpk_exit(update_cluster_command_code)
248
+ get_cluster_credentials(args)
158
249
 
159
- # Update Pathways clusters with CloudDNS if not enabled already.
250
+ k8s_client = setup_k8s_env(args)
160
251
 
161
- get_cluster_credentials(args)
252
+ install_storage_crd(k8s_client)
253
+ install_storage_csis(args)
162
254
 
163
255
  # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
164
256
  tensorboard_config = {}
@@ -218,50 +310,20 @@ def cluster_create(args) -> None:
218
310
  set_jobset_on_cluster_code = set_jobset_on_cluster(args)
219
311
  if set_jobset_on_cluster_code != 0:
220
312
  xpk_exit(set_jobset_on_cluster_code)
313
+ update_jobset_resources_code = update_jobset_resources_if_necessary(args)
314
+ if update_jobset_resources_code != 0:
315
+ xpk_exit(update_jobset_resources_code)
221
316
 
222
317
  set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
223
318
  if set_pathways_job_on_cluster_code != 0:
224
319
  xpk_exit(set_pathways_job_on_cluster_code)
225
320
 
226
- xpk_print('Enabling Kueue on the cluster')
227
- install_kueue_on_cluster_code = install_kueue_on_cluster(args)
228
- if install_kueue_on_cluster_code != 0:
229
- xpk_exit(install_kueue_on_cluster_code)
230
-
231
- xpk_print('Verifying kjob installation')
232
- err_code = verify_kjob_installed(args)
233
- if err_code > 0:
234
- xpk_exit(err_code)
235
-
236
- xpk_print('Applying kjob CDRs')
237
- err_code = apply_kjob_crds(args)
238
- if err_code > 0:
239
- xpk_exit(err_code)
321
+ install_kueue(args, system, autoprovisioning_config)
240
322
 
241
- err_code = prepare_kjob(args)
242
- if err_code > 0:
243
- xpk_exit(err_code)
244
-
245
- k8s_client = setup_k8s_env(args)
246
- install_storage_crd(k8s_client)
247
-
248
- xpk_print('Wait for Kueue to be fully available')
249
- wait_for_kueue_available_code = wait_for_kueue_available(args)
250
- if wait_for_kueue_available_code != 0:
251
- xpk_exit(wait_for_kueue_available_code)
252
-
253
- xpk_print('Install Kueue Custom Resources')
254
- enable_kueue_credentials_code = install_kueue_crs(
255
- args, system, autoprovisioning_config
256
- )
257
- if enable_kueue_credentials_code != 0:
258
- xpk_exit(enable_kueue_credentials_code)
323
+ install_kjob(args)
259
324
 
260
325
  if system.accelerator_type == AcceleratorType['GPU']:
261
- xpk_print('Installing NCCL Plugin for cluster')
262
- install_nccl_code = install_nccl_on_cluster(args, system)
263
- if install_nccl_code != 0:
264
- xpk_exit(install_nccl_code)
326
+ prepare_gpus(args, system)
265
327
 
266
328
  if args.enable_ray_cluster:
267
329
  return_code = install_ray_cluster(args, system)
@@ -269,6 +331,12 @@ def cluster_create(args) -> None:
269
331
  xpk_print('Installation of RayCluster failed.')
270
332
  xpk_exit(return_code)
271
333
 
334
+ if hasattr(args, 'enable_mtc') and args.enable_mtc:
335
+ return_code = install_mtc_on_cluster(args, system)
336
+ if return_code != 0:
337
+ xpk_print('Installation of MTC failed.')
338
+ xpk_exit(return_code)
339
+
272
340
  xpk_print('GKE commands done! Resources are created.')
273
341
  xpk_print(
274
342
  'See your GKE Cluster here:'
@@ -773,6 +841,7 @@ def run_gke_cluster_create_command(
773
841
  f' --num-nodes {args.default_pool_cpu_num_nodes}'
774
842
  f' {args.custom_cluster_arguments}'
775
843
  f' {rapid_release_cmd}'
844
+ ' --enable-dns-access'
776
845
  )
777
846
 
778
847
  enable_ip_alias = False
@@ -805,6 +874,7 @@ def run_gke_cluster_create_command(
805
874
  addons = []
806
875
  if args.enable_gcsfuse_csi_driver:
807
876
  addons.append('GcsFuseCsiDriver')
877
+
808
878
  if args.enable_gcpfilestore_csi_driver:
809
879
  addons.append('GcpFilestoreCsiDriver')
810
880
 
@@ -814,6 +884,13 @@ def run_gke_cluster_create_command(
814
884
  if args.enable_pd_csi_driver:
815
885
  addons.append('GcePersistentDiskCsiDriver')
816
886
 
887
+ if args.enable_lustre_csi_driver:
888
+ addons.append('LustreCsiDriver')
889
+ command += ' --enable-legacy-lustre-port'
890
+
891
+ if hasattr(args, 'enable_mtc') and args.enable_mtc:
892
+ addons.append('HighScaleCheckpointing')
893
+
817
894
  if len(addons) > 0:
818
895
  addons_str = ','.join(addons)
819
896
  command += f' --addons={addons_str}'
@@ -823,3 +900,99 @@ def run_gke_cluster_create_command(
823
900
  xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
824
901
  return 1
825
902
  return 0
903
+
904
+
905
+ def install_storage_csis(args):
906
+ if args.enable_gcsfuse_csi_driver:
907
+ update_cluster_command_code = (
908
+ update_cluster_with_gcsfuse_driver_if_necessary(args)
909
+ )
910
+ if update_cluster_command_code != 0:
911
+ xpk_exit(update_cluster_command_code)
912
+
913
+ if args.enable_gcpfilestore_csi_driver:
914
+ update_cluster_command_code = (
915
+ update_cluster_with_gcpfilestore_driver_if_necessary(args)
916
+ )
917
+ if update_cluster_command_code != 0:
918
+ xpk_exit(update_cluster_command_code)
919
+
920
+ if args.enable_parallelstore_csi_driver:
921
+ update_cluster_command_code = (
922
+ update_cluster_with_parallelstore_driver_if_necessary(args)
923
+ )
924
+ if update_cluster_command_code != 0:
925
+ xpk_exit(update_cluster_command_code)
926
+
927
+ if args.enable_pd_csi_driver:
928
+ update_cluster_command_code = update_cluster_with_pd_driver_if_necessary(
929
+ args
930
+ )
931
+ if update_cluster_command_code != 0:
932
+ xpk_exit(update_cluster_command_code)
933
+
934
+ if args.enable_lustre_csi_driver:
935
+ update_cluster_command_code = (
936
+ update_cluster_with_lustre_driver_if_necessary(args)
937
+ )
938
+ if update_cluster_command_code != 0:
939
+ xpk_exit(update_cluster_command_code)
940
+
941
+
942
+ def install_kjob(args):
943
+ xpk_print('Verifying kjob installation')
944
+ err_code = verify_kjob_installed(args)
945
+ if err_code > 0:
946
+ xpk_exit(err_code)
947
+
948
+ xpk_print('Applying kjob CDRs')
949
+ err_code = apply_kjob_crds(args)
950
+ if err_code > 0:
951
+ xpk_exit(err_code)
952
+
953
+ err_code = prepare_kjob(args)
954
+ if err_code > 0:
955
+ xpk_exit(err_code)
956
+
957
+
958
+ def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
959
+ xpk_print('Enabling Kueue on the cluster')
960
+ install_kueue_on_cluster_code = install_kueue_on_cluster(args)
961
+ if install_kueue_on_cluster_code != 0:
962
+ xpk_exit(install_kueue_on_cluster_code)
963
+
964
+ xpk_print('Wait for Kueue to be fully available')
965
+ wait_for_kueue_available_code = wait_for_kueue_available(args)
966
+ if wait_for_kueue_available_code != 0:
967
+ xpk_exit(wait_for_kueue_available_code)
968
+
969
+ xpk_print('Install Kueue Custom Resources')
970
+ enable_kueue_credentials_code = install_kueue_crs(
971
+ args, system, autoprovisioning_config
972
+ )
973
+ if enable_kueue_credentials_code != 0:
974
+ xpk_exit(enable_kueue_credentials_code)
975
+
976
+ xpk_print('Update Kueue Controller Manager resources')
977
+ update_kueue_resources_code = update_kueue_resources_if_necessary(args)
978
+ if update_kueue_resources_code != 0:
979
+ xpk_exit(update_kueue_resources_code)
980
+
981
+
982
+ def prepare_gpus(args, system: SystemCharacteristics):
983
+ xpk_print('Installing NCCL Plugin for cluster')
984
+ install_nccl_code = install_nccl_on_cluster(args, system)
985
+ if install_nccl_code != 0:
986
+ xpk_exit(install_nccl_code)
987
+
988
+ if system.device_type == H100_DEVICE_TYPE:
989
+ xpk_print('Installing NRI device injector for cluster')
990
+ install_nri_code = install_nri_on_cluster(args)
991
+ if install_nri_code != 0:
992
+ xpk_exit(install_nri_code)
993
+
994
+ if system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
995
+ xpk_print('Disabling MGLRU')
996
+ err_code = disable_mglru_on_cluster(args)
997
+ if err_code > 0:
998
+ xpk_exit(err_code)
@@ -37,6 +37,7 @@ from ..utils.console import xpk_exit, xpk_print
37
37
  from ..utils.file import ensure_directory_exists
38
38
  from ..utils.network import all_IPs_cidr
39
39
  from ..utils.objects import hash_string
40
+ from ..core.capacity import get_reservation_maintenance_interval, get_reservation_placement_policy
40
41
 
41
42
  blueprints_path = os.path.abspath('xpkclusters/blueprints')
42
43
  gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out')
@@ -234,6 +235,30 @@ def generate_blueprint(
234
235
  if args.device_type in supported_device_types:
235
236
  if args.device_type == a3mega_device_type:
236
237
  num_nodes = args.num_nodes if not args.num_nodes is None else 2
238
+
239
+ maintenance_interval = (
240
+ get_reservation_maintenance_interval(
241
+ args.reservation, args.zone, args.project
242
+ )
243
+ if args.reservation is not None
244
+ else 'PERIODIC'
245
+ )
246
+ placement_policy_name = (
247
+ get_reservation_placement_policy(
248
+ args.reservation, args.zone, args.project
249
+ )
250
+ if args.reservation is not None
251
+ else None
252
+ )
253
+ placement_policy = (
254
+ {
255
+ 'type': 'COMPACT',
256
+ 'name': placement_policy_name.split('/')[-1],
257
+ }
258
+ if placement_policy_name is not None
259
+ and len(placement_policy_name) > 0
260
+ else None
261
+ )
237
262
  return bpg.generate_a3_mega_blueprint(
238
263
  blueprint_name=blueprint_name,
239
264
  prefix=prefix,
@@ -243,6 +268,8 @@ def generate_blueprint(
243
268
  zone=args.zone,
244
269
  auth_cidr=all_IPs_cidr,
245
270
  num_nodes=num_nodes,
271
+ reservation_maintenance_interval=maintenance_interval,
272
+ reservation_placement_policy=placement_policy,
246
273
  reservation=args.reservation if args.reservation else None,
247
274
  capacity_type=capacity_type,
248
275
  system_node_pool_machine_type=args.default_pool_cpu_machine_type,
xpk/commands/common.py CHANGED
@@ -15,8 +15,12 @@ limitations under the License.
15
15
  """
16
16
 
17
17
  from ..core.commands import run_command_with_updates_retry
18
+ from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
18
19
  from ..core.gcloud_context import zone_to_region
19
- from ..utils.console import xpk_print
20
+ from ..utils.console import xpk_print, xpk_exit
21
+ from ..core.system_characteristics import (
22
+ SystemCharacteristics,
23
+ )
20
24
 
21
25
 
22
26
  def set_cluster_command(args) -> int:
@@ -31,6 +35,7 @@ def set_cluster_command(args) -> int:
31
35
  command = (
32
36
  'gcloud container clusters get-credentials'
33
37
  f' {args.cluster} --region={zone_to_region(args.zone)}'
38
+ ' --dns-endpoint'
34
39
  f' --project={args.project} &&'
35
40
  ' kubectl config view && kubectl config set-context --current'
36
41
  ' --namespace=default'
@@ -42,3 +47,37 @@ def set_cluster_command(args) -> int:
42
47
  if return_code != 0:
43
48
  xpk_print(f'{task} returned ERROR {return_code}')
44
49
  return return_code
50
+
51
+
52
+ def is_TAS_possible(
53
+ system_characteristics: SystemCharacteristics,
54
+ capacity_type: CapacityType,
55
+ flex: bool,
56
+ ) -> bool:
57
+ """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
58
+
59
+ Args:
60
+ args: user provided arguments for running the command.
61
+
62
+ Returns:
63
+ True if possible and False otherwise.
64
+ """
65
+
66
+ if system_characteristics is None:
67
+ xpk_print('system_characteristics data was not found in configmaps.')
68
+ xpk_exit(1)
69
+
70
+ if capacity_type is None:
71
+ xpk_print('capacity_type data was not found in configmaps.')
72
+ xpk_exit(1)
73
+
74
+ if flex:
75
+ return False
76
+
77
+ if (
78
+ system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
79
+ and capacity_type != CapacityType.RESERVATION
80
+ ):
81
+ return False
82
+
83
+ return True
@@ -24,7 +24,10 @@ from ..core.kjob import (
24
24
  get_a3mega_pod_template_annotations,
25
25
  get_a3ultra_pod_template_annotations,
26
26
  get_a4_pod_template_annotations,
27
+ Kueue_TAS_annotation,
27
28
  )
29
+ from .common import is_TAS_possible
30
+ from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
28
31
 
29
32
 
30
33
  def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
@@ -35,7 +38,7 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
35
38
  elif gpu_type == H200_DEVICE_TYPE:
36
39
  annotations = get_a3ultra_pod_template_annotations(args)
37
40
  elif gpu_type == B200_DEVICE_TYPE:
38
- annotations = get_a4_pod_template_annotations()
41
+ annotations = get_a4_pod_template_annotations(args)
39
42
  else:
40
43
  annotations = []
41
44
 
@@ -45,3 +48,12 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
45
48
  cmd += "\\\n".join(flags)
46
49
 
47
50
  return cmd
51
+
52
+
53
+ def add_TAS_annotations_to_command(args, cmd: str) -> str:
54
+ system_characteristics = get_cluster_system_characteristics(args)
55
+ capacity_type = get_cluster_capacity_type(args)
56
+ if is_TAS_possible(system_characteristics, capacity_type, flex=False):
57
+ cmd += f" --pod-template-annotation {Kueue_TAS_annotation}"
58
+
59
+ return cmd
xpk/commands/run.py CHANGED
@@ -17,7 +17,7 @@ limitations under the License.
17
17
  from argparse import Namespace
18
18
 
19
19
  from ..core.cluster import (
20
- create_xpk_k8s_service_account,
20
+ setup_k8s_service_accounts,
21
21
  get_cluster_credentials,
22
22
  )
23
23
  from ..core.commands import run_command_with_full_controls
@@ -25,14 +25,13 @@ from ..core.gcloud_context import add_zone_and_project
25
25
  from ..core.kjob import (
26
26
  AppProfileDefaults,
27
27
  JobTemplateDefaults,
28
- Kueue_TAS_annotation,
29
28
  get_storage_annotations,
30
29
  prepare_kjob,
31
30
  )
32
31
  from ..core.kueue import LOCAL_QUEUE_NAME
33
32
  from ..utils.console import xpk_exit, xpk_print
34
33
  from .kind import set_local_cluster_command
35
- from .kjob_common import add_gpu_networking_annotations_to_command
34
+ from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
36
35
 
37
36
 
38
37
  def run(args: Namespace) -> None:
@@ -54,7 +53,7 @@ def run(args: Namespace) -> None:
54
53
  err_code = prepare_kjob(args)
55
54
  if err_code > 0:
56
55
  xpk_exit(err_code)
57
- create_xpk_k8s_service_account()
56
+ setup_k8s_service_accounts()
58
57
 
59
58
  submit_job(args)
60
59
 
@@ -64,12 +63,12 @@ def submit_job(args: Namespace) -> None:
64
63
  'kubectl kjob create slurm --profile'
65
64
  f' {AppProfileDefaults.NAME.value} '
66
65
  f' --localqueue {LOCAL_QUEUE_NAME} '
67
- f" --pod-template-annotation '{Kueue_TAS_annotation}'"
68
66
  f' --stream-container {JobTemplateDefaults.CONTAINER_NAME.value}'
69
67
  f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
70
68
  ' --wait --rm --first-node-ip'
71
69
  )
72
70
  cmd = add_gpu_networking_annotations_to_command(args, cmd)
71
+ cmd = add_TAS_annotations_to_command(args, cmd)
73
72
 
74
73
  for annotation in get_storage_annotations(args):
75
74
  cmd += f' --pod-template-annotation {annotation}'
xpk/commands/shell.py CHANGED
@@ -12,7 +12,7 @@ limitations under the License.
12
12
  """
13
13
 
14
14
  from ..core.commands import run_command_with_full_controls, run_command_for_value, run_command_with_updates
15
- from ..core.cluster import get_cluster_credentials, add_zone_and_project, create_xpk_k8s_service_account
15
+ from ..core.cluster import get_cluster_credentials, add_zone_and_project, setup_k8s_service_accounts
16
16
  from ..utils.console import xpk_exit, xpk_print
17
17
  from argparse import Namespace
18
18
 
@@ -82,7 +82,7 @@ def connect_to_new_interactive_shell(args: Namespace) -> int:
82
82
  err_code = prepare_kjob(args)
83
83
  if err_code > 0:
84
84
  xpk_exit(err_code)
85
- create_xpk_k8s_service_account()
85
+ setup_k8s_service_accounts()
86
86
 
87
87
  cmd = (
88
88
  'kubectl-kjob create interactive --profile'