xpk 0.7.2__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. xpk/commands/batch.py +19 -13
  2. xpk/commands/cluster.py +240 -71
  3. xpk/commands/cluster_gcluster.py +22 -5
  4. xpk/commands/common.py +33 -1
  5. xpk/commands/info.py +2 -4
  6. xpk/commands/job.py +7 -8
  7. xpk/commands/kjob_common.py +30 -18
  8. xpk/commands/run.py +17 -12
  9. xpk/commands/shell.py +3 -4
  10. xpk/commands/storage.py +75 -19
  11. xpk/commands/workload.py +161 -324
  12. xpk/core/blueprint/blueprint_definitions.py +2 -0
  13. xpk/core/blueprint/blueprint_generator.py +335 -45
  14. xpk/core/capacity.py +1 -0
  15. xpk/core/cluster.py +193 -12
  16. xpk/core/config.py +3 -1
  17. xpk/core/docker_manager.py +1 -1
  18. xpk/core/docker_resources.py +9 -21
  19. xpk/core/filestore.py +5 -1
  20. xpk/core/gcsfuse.py +27 -6
  21. xpk/core/kjob.py +66 -20
  22. xpk/core/kueue.py +30 -0
  23. xpk/core/mtc.py +195 -0
  24. xpk/core/nap.py +4 -0
  25. xpk/core/network.py +34 -22
  26. xpk/core/nodepool.py +28 -26
  27. xpk/core/pathways.py +165 -210
  28. xpk/core/resources.py +21 -0
  29. xpk/core/scheduling.py +36 -0
  30. xpk/core/storage.py +66 -12
  31. xpk/core/system_characteristics.py +9 -0
  32. xpk/core/workload.py +28 -83
  33. xpk/core/workload_decorators/rdma_decorator.py +11 -15
  34. xpk/core/workload_decorators/storage_decorator.py +8 -3
  35. xpk/core/workload_decorators/tcpx_decorator.py +179 -0
  36. xpk/core/workload_decorators/tcpxo_decorator.py +17 -16
  37. xpk/parser/cluster.py +574 -381
  38. xpk/parser/storage.py +25 -5
  39. xpk/parser/workload.py +59 -31
  40. xpk/utils/kubectl.py +4 -1
  41. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/METADATA +192 -93
  42. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/RECORD +46 -44
  43. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
  44. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
  45. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
  46. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
xpk/commands/batch.py CHANGED
@@ -14,18 +14,25 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
+ import re
17
18
  from argparse import Namespace
18
19
 
19
- from ..core.cluster import create_xpk_k8s_service_account
20
+ from ..core.cluster import (
21
+ create_xpk_k8s_service_account,
22
+ get_cluster_credentials,
23
+ )
20
24
  from ..core.commands import run_command_for_value
21
25
  from ..core.gcloud_context import add_zone_and_project
26
+ from ..core.kjob import (
27
+ AppProfileDefaults,
28
+ JobTemplateDefaults,
29
+ get_storage_annotations,
30
+ prepare_kjob,
31
+ )
22
32
  from ..core.kueue import LOCAL_QUEUE_NAME
23
33
  from ..utils.console import xpk_exit, xpk_print
24
- from .common import set_cluster_command
25
- from ..core.kjob import AppProfileDefaults, JobTemplateDefaults, prepare_kjob, Kueue_TAS_annotation, get_gcsfuse_annotation
26
- from .kjob_common import add_gpu_networking_annotations_to_command
27
34
  from .kind import set_local_cluster_command
28
- import re
35
+ from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
29
36
 
30
37
 
31
38
  def batch(args: Namespace) -> None:
@@ -38,12 +45,11 @@ def batch(args: Namespace) -> None:
38
45
  """
39
46
  if not args.kind_cluster:
40
47
  add_zone_and_project(args)
41
- set_cluster_command_code = set_cluster_command(args)
48
+ get_cluster_credentials(args)
42
49
  else:
43
50
  set_cluster_command_code = set_local_cluster_command(args)
44
-
45
- if set_cluster_command_code != 0:
46
- xpk_exit(set_cluster_command_code)
51
+ if set_cluster_command_code != 0:
52
+ xpk_exit(set_cluster_command_code)
47
53
 
48
54
  err_code = prepare_kjob(args)
49
55
  if err_code > 0:
@@ -61,14 +67,14 @@ def submit_job(args: Namespace) -> None:
61
67
  'kubectl kjob create slurm'
62
68
  f' --profile {AppProfileDefaults.NAME.value}'
63
69
  f' --localqueue {LOCAL_QUEUE_NAME}'
64
- f' --pod-template-annotation {Kueue_TAS_annotation}'
65
70
  f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
66
71
  ' --first-node-ip'
67
72
  )
68
73
  cmd = add_gpu_networking_annotations_to_command(args, cmd)
69
- gcsfuse_annotation = get_gcsfuse_annotation(args)
70
- if gcsfuse_annotation is not None:
71
- cmd += f' --pod-template-annotation {gcsfuse_annotation}'
74
+ cmd = add_TAS_annotations_to_command(args, cmd)
75
+
76
+ for annotation in get_storage_annotations(args):
77
+ cmd += f' --pod-template-annotation {annotation}'
72
78
 
73
79
  if args.ignore_unknown_flags:
74
80
  cmd += ' --ignore-unknown-flags'
xpk/commands/cluster.py CHANGED
@@ -16,14 +16,21 @@ limitations under the License.
16
16
 
17
17
  from tabulate import tabulate
18
18
 
19
- from ..core.capacity import H100_DEVICE_TYPE
19
+ from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE
20
20
  from ..core.cluster import (
21
21
  get_all_clusters_programmatic,
22
22
  get_cluster_credentials,
23
23
  install_nccl_on_cluster,
24
+ install_nri_on_cluster,
24
25
  set_jobset_on_cluster,
26
+ set_pathways_job_on_cluster,
25
27
  setup_k8s_env,
28
+ disable_mglru_on_cluster,
29
+ count_nodes_on_cluster,
30
+ update_cluster_with_gcpfilestore_driver_if_necessary,
26
31
  update_cluster_with_gcsfuse_driver_if_necessary,
32
+ update_cluster_with_parallelstore_driver_if_necessary,
33
+ update_cluster_with_pd_driver_if_necessary,
27
34
  update_cluster_with_workload_identity_if_necessary,
28
35
  )
29
36
  from ..core.cluster_private import authorize_private_cluster_access_if_necessary
@@ -46,10 +53,14 @@ from ..core.nap import enable_autoprovisioning_on_cluster
46
53
  from ..core.network import (
47
54
  create_cluster_network_config,
48
55
  delete_cluster_subnets,
49
- set_up_cluster_network_for_gpu,
56
+ set_up_cluster_network_for_a3,
57
+ )
58
+ from ..core.nodepool import (
59
+ get_gke_node_pool_version,
60
+ run_gke_node_pool_create_command,
50
61
  )
51
- from ..core.nodepool import get_gke_node_pool_version, run_gke_node_pool_create_command
52
62
  from ..core.ray import install_ray_cluster
63
+ from ..core.mtc import install_mtc_on_cluster
53
64
  from ..core.resources import create_cluster_configmaps
54
65
  from ..core.storage import install_storage_crd
55
66
  from ..core.system_characteristics import (
@@ -64,7 +75,118 @@ from ..utils.console import get_user_input, xpk_exit, xpk_print
64
75
  from ..utils.file import write_tmp_file
65
76
  from . import cluster_gcluster
66
77
  from .common import set_cluster_command
67
- from ..core.cluster import update_cluster_with_gcpfilestore_driver_if_necessary
78
+
79
+
80
+ def cluster_adapt(args) -> None:
81
+ """Function that performs cluster adaptation.
82
+
83
+ Args:
84
+ args: user provided arguments for running the command.
85
+ """
86
+ args.enable_pathways = False
87
+
88
+ system, return_code = get_system_characteristics(args)
89
+
90
+ if return_code > 0:
91
+ xpk_print('Fetching system characteristics failed!')
92
+ xpk_exit(return_code)
93
+
94
+ xpk_print(
95
+ f'Starting cluster adaptation for cluster {args.cluster}:', flush=True
96
+ )
97
+ add_zone_and_project(args)
98
+
99
+ if system.accelerator_type == AcceleratorType['GPU'] and not getattr(
100
+ args, 'num_nodes'
101
+ ):
102
+ xpk_print(
103
+ 'Argument --num-nodes was not provided, trying to determine number of'
104
+ ' nodes based on the available nodes in the cluster...'
105
+ )
106
+ args.num_nodes = count_nodes_on_cluster(args, system)
107
+ if args.num_nodes == 0:
108
+ xpk_print(
109
+ 'Found unexpected number of nodes. Is the --device-type correct?'
110
+ )
111
+ xpk_exit(1)
112
+ else:
113
+ xpk_print(f'Using {args.num_nodes} nodes.')
114
+
115
+ # ToDo(roshanin@) - Re-enable CloudDNS on Pathways clusters conditionally.
116
+ # Enable WorkloadIdentity if not enabled already.
117
+ if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
118
+ update_cluster_command_code = (
119
+ update_cluster_with_workload_identity_if_necessary(args)
120
+ )
121
+ if update_cluster_command_code != 0:
122
+ xpk_exit(update_cluster_command_code)
123
+
124
+ get_cluster_credentials(args)
125
+
126
+ k8s_client = setup_k8s_env(args)
127
+
128
+ install_storage_crd(k8s_client)
129
+ install_storage_csis(args)
130
+
131
+ # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
132
+ tensorboard_config = {}
133
+ if VERTEX_TENSORBOARD_FEATURE_FLAG and args.create_vertex_tensorboard:
134
+ tensorboard_config = create_vertex_tensorboard(args)
135
+ # exit if failed to create Tensorboard in Vertex AI
136
+ if not tensorboard_config:
137
+ xpk_exit(1)
138
+
139
+ # Provision node pools dynamically based on incoming workloads:
140
+ # Currently autoprovisioning is not supported with Pathways.
141
+ autoprovisioning_config = None
142
+ if args.enable_autoprovisioning:
143
+ xpk_print('Enabling Autoprovisioning')
144
+ autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
145
+ args, system
146
+ )
147
+ if return_code != 0:
148
+ xpk_exit(return_code)
149
+
150
+ xpk_print('Creating ConfigMap for cluster')
151
+ create_cluster_configmaps_code = create_cluster_configmaps(
152
+ args, system, tensorboard_config, autoprovisioning_config
153
+ )
154
+ if create_cluster_configmaps_code != 0:
155
+ xpk_exit(create_cluster_configmaps_code)
156
+
157
+ xpk_print(
158
+ 'Enabling the jobset API on our cluster, to be deprecated when Jobset is'
159
+ ' globally available'
160
+ )
161
+ set_jobset_on_cluster_code = set_jobset_on_cluster(args)
162
+ if set_jobset_on_cluster_code != 0:
163
+ xpk_exit(set_jobset_on_cluster_code)
164
+
165
+ # TODO: Uncomment when cluster_adapt will support TPU cluters
166
+ # set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
167
+ # if set_pathways_job_on_cluster_code != 0:
168
+ # xpk_exit(set_pathways_job_on_cluster_code)
169
+
170
+ install_kueue(args, system, autoprovisioning_config)
171
+
172
+ install_kjob(args)
173
+
174
+ if system.accelerator_type == AcceleratorType['GPU']:
175
+ prepare_gpus(args, system)
176
+
177
+ if args.enable_ray_cluster:
178
+ return_code = install_ray_cluster(args, system)
179
+ if return_code != 0:
180
+ xpk_print('Installation of RayCluster failed.')
181
+ xpk_exit(return_code)
182
+
183
+ xpk_print('GKE commands done! Resources are created.')
184
+ xpk_print(
185
+ 'See your GKE Cluster here:'
186
+ # pylint: disable=line-too-long
187
+ f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
188
+ )
189
+ xpk_exit(0)
68
190
 
69
191
 
70
192
  def cluster_create(args) -> None:
@@ -72,9 +194,6 @@ def cluster_create(args) -> None:
72
194
 
73
195
  Args:
74
196
  args: user provided arguments for running the command.
75
-
76
- Returns:
77
- 0 if successful and 1 otherwise.
78
197
  """
79
198
  system, return_code = get_system_characteristics(args)
80
199
 
@@ -117,35 +236,19 @@ def cluster_create(args) -> None:
117
236
 
118
237
  # ToDo(roshanin@) - Re-enable CloudDNS on Pathways clusters conditionally.
119
238
  # Enable WorkloadIdentity if not enabled already.
120
- if (
121
- args.enable_workload_identity
122
- or args.enable_gcsfuse_csi_driver
123
- or args.enable_gcpfilestore_csi_driver
124
- ):
239
+ if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
125
240
  update_cluster_command_code = (
126
241
  update_cluster_with_workload_identity_if_necessary(args)
127
242
  )
128
243
  if update_cluster_command_code != 0:
129
244
  xpk_exit(update_cluster_command_code)
130
245
 
131
- # Enable GCSFuse CSI Driver if not enabled already.
132
- if args.enable_gcsfuse_csi_driver:
133
- update_cluster_command_code = (
134
- update_cluster_with_gcsfuse_driver_if_necessary(args)
135
- )
136
- if update_cluster_command_code != 0:
137
- xpk_exit(update_cluster_command_code)
138
-
139
- if args.enable_gcpfilestore_csi_driver:
140
- update_cluster_command_code = (
141
- update_cluster_with_gcpfilestore_driver_if_necessary(args)
142
- )
143
- if update_cluster_command_code != 0:
144
- xpk_exit(update_cluster_command_code)
246
+ get_cluster_credentials(args)
145
247
 
146
- # Update Pathways clusters with CloudDNS if not enabled already.
248
+ k8s_client = setup_k8s_env(args)
147
249
 
148
- get_cluster_credentials(args)
250
+ install_storage_crd(k8s_client)
251
+ install_storage_csis(args)
149
252
 
150
253
  # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
151
254
  tensorboard_config = {}
@@ -155,13 +258,12 @@ def cluster_create(args) -> None:
155
258
  if not tensorboard_config:
156
259
  xpk_exit(1)
157
260
 
158
- if system.accelerator_type == AcceleratorType['GPU']:
261
+ if system.device_type == H100_DEVICE_TYPE:
159
262
  xpk_print('Setting up Network for cluster')
160
- set_up_cluster_network_code = set_up_cluster_network_for_gpu(args, system)
263
+ set_up_cluster_network_code = set_up_cluster_network_for_a3(args)
161
264
  if set_up_cluster_network_code != 0:
162
265
  xpk_exit(set_up_cluster_network_code)
163
266
 
164
- if system.device_type == H100_DEVICE_TYPE:
165
267
  xpk_print('Creating Network Config for cluster')
166
268
  create_cluster_network_config_code = create_cluster_network_config(args)
167
269
  if create_cluster_network_config_code != 0:
@@ -207,45 +309,16 @@ def cluster_create(args) -> None:
207
309
  if set_jobset_on_cluster_code != 0:
208
310
  xpk_exit(set_jobset_on_cluster_code)
209
311
 
210
- xpk_print('Enabling Kueue on the cluster')
211
- install_kueue_on_cluster_code = install_kueue_on_cluster(args)
212
- if install_kueue_on_cluster_code != 0:
213
- xpk_exit(install_kueue_on_cluster_code)
312
+ set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
313
+ if set_pathways_job_on_cluster_code != 0:
314
+ xpk_exit(set_pathways_job_on_cluster_code)
214
315
 
215
- xpk_print('Verifying kjob installation')
216
- err_code = verify_kjob_installed(args)
217
- if err_code > 0:
218
- xpk_exit(err_code)
316
+ install_kueue(args, system, autoprovisioning_config)
219
317
 
220
- xpk_print('Applying kjob CDRs')
221
- err_code = apply_kjob_crds(args)
222
- if err_code > 0:
223
- xpk_exit(err_code)
224
-
225
- err_code = prepare_kjob(args)
226
- if err_code > 0:
227
- xpk_exit(err_code)
228
-
229
- k8s_client = setup_k8s_env(args)
230
- install_storage_crd(k8s_client)
231
-
232
- xpk_print('Wait for Kueue to be fully available')
233
- wait_for_kueue_available_code = wait_for_kueue_available(args)
234
- if wait_for_kueue_available_code != 0:
235
- xpk_exit(wait_for_kueue_available_code)
236
-
237
- xpk_print('Install Kueue Custom Resources')
238
- enable_kueue_credentials_code = install_kueue_crs(
239
- args, system, autoprovisioning_config
240
- )
241
- if enable_kueue_credentials_code != 0:
242
- xpk_exit(enable_kueue_credentials_code)
318
+ install_kjob(args)
243
319
 
244
320
  if system.accelerator_type == AcceleratorType['GPU']:
245
- xpk_print('Installing NCCL Plugin for cluster')
246
- install_nccl_code = install_nccl_on_cluster(args, system)
247
- if install_nccl_code != 0:
248
- xpk_exit(install_nccl_code)
321
+ prepare_gpus(args, system)
249
322
 
250
323
  if args.enable_ray_cluster:
251
324
  return_code = install_ray_cluster(args, system)
@@ -253,6 +326,12 @@ def cluster_create(args) -> None:
253
326
  xpk_print('Installation of RayCluster failed.')
254
327
  xpk_exit(return_code)
255
328
 
329
+ if hasattr(args, 'enable_mtc') and args.enable_mtc:
330
+ return_code = install_mtc_on_cluster(args, system)
331
+ if return_code != 0:
332
+ xpk_print('Installation of MTC failed.')
333
+ xpk_exit(return_code)
334
+
256
335
  xpk_print('GKE commands done! Resources are created.')
257
336
  xpk_print(
258
337
  'See your GKE Cluster here:'
@@ -757,6 +836,7 @@ def run_gke_cluster_create_command(
757
836
  f' --num-nodes {args.default_pool_cpu_num_nodes}'
758
837
  f' {args.custom_cluster_arguments}'
759
838
  f' {rapid_release_cmd}'
839
+ ' --enable-dns-access'
760
840
  )
761
841
 
762
842
  enable_ip_alias = False
@@ -783,11 +863,7 @@ def run_gke_cluster_create_command(
783
863
  if args.enable_ray_cluster:
784
864
  command += ' --addons RayOperator'
785
865
 
786
- if (
787
- args.enable_workload_identity
788
- or args.enable_gcsfuse_csi_driver
789
- or args.enable_gcpfilestore_csi_driver
790
- ):
866
+ if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
791
867
  command += f' --workload-pool={args.project}.svc.id.goog'
792
868
 
793
869
  addons = []
@@ -797,6 +873,15 @@ def run_gke_cluster_create_command(
797
873
  if args.enable_gcpfilestore_csi_driver:
798
874
  addons.append('GcpFilestoreCsiDriver')
799
875
 
876
+ if args.enable_parallelstore_csi_driver:
877
+ addons.append('ParallelstoreCsiDriver')
878
+
879
+ if args.enable_pd_csi_driver:
880
+ addons.append('GcePersistentDiskCsiDriver')
881
+
882
+ if hasattr(args, 'enable_mtc') and args.enable_mtc:
883
+ addons.append('HighScaleCheckpointing')
884
+
800
885
  if len(addons) > 0:
801
886
  addons_str = ','.join(addons)
802
887
  command += f' --addons={addons_str}'
@@ -806,3 +891,87 @@ def run_gke_cluster_create_command(
806
891
  xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
807
892
  return 1
808
893
  return 0
894
+
895
+
896
+ def install_storage_csis(args):
897
+ if args.enable_gcsfuse_csi_driver:
898
+ update_cluster_command_code = (
899
+ update_cluster_with_gcsfuse_driver_if_necessary(args)
900
+ )
901
+ if update_cluster_command_code != 0:
902
+ xpk_exit(update_cluster_command_code)
903
+
904
+ if args.enable_gcpfilestore_csi_driver:
905
+ update_cluster_command_code = (
906
+ update_cluster_with_gcpfilestore_driver_if_necessary(args)
907
+ )
908
+ if update_cluster_command_code != 0:
909
+ xpk_exit(update_cluster_command_code)
910
+
911
+ if args.enable_parallelstore_csi_driver:
912
+ update_cluster_command_code = (
913
+ update_cluster_with_parallelstore_driver_if_necessary(args)
914
+ )
915
+ if update_cluster_command_code != 0:
916
+ xpk_exit(update_cluster_command_code)
917
+
918
+ if args.enable_pd_csi_driver:
919
+ update_cluster_command_code = update_cluster_with_pd_driver_if_necessary(
920
+ args
921
+ )
922
+ if update_cluster_command_code != 0:
923
+ xpk_exit(update_cluster_command_code)
924
+
925
+
926
+ def install_kjob(args):
927
+ xpk_print('Verifying kjob installation')
928
+ err_code = verify_kjob_installed(args)
929
+ if err_code > 0:
930
+ xpk_exit(err_code)
931
+
932
+ xpk_print('Applying kjob CDRs')
933
+ err_code = apply_kjob_crds(args)
934
+ if err_code > 0:
935
+ xpk_exit(err_code)
936
+
937
+ err_code = prepare_kjob(args)
938
+ if err_code > 0:
939
+ xpk_exit(err_code)
940
+
941
+
942
+ def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
943
+ xpk_print('Enabling Kueue on the cluster')
944
+ install_kueue_on_cluster_code = install_kueue_on_cluster(args)
945
+ if install_kueue_on_cluster_code != 0:
946
+ xpk_exit(install_kueue_on_cluster_code)
947
+
948
+ xpk_print('Wait for Kueue to be fully available')
949
+ wait_for_kueue_available_code = wait_for_kueue_available(args)
950
+ if wait_for_kueue_available_code != 0:
951
+ xpk_exit(wait_for_kueue_available_code)
952
+
953
+ xpk_print('Install Kueue Custom Resources')
954
+ enable_kueue_credentials_code = install_kueue_crs(
955
+ args, system, autoprovisioning_config
956
+ )
957
+ if enable_kueue_credentials_code != 0:
958
+ xpk_exit(enable_kueue_credentials_code)
959
+
960
+
961
+ def prepare_gpus(args, system: SystemCharacteristics):
962
+ xpk_print('Installing NCCL Plugin for cluster')
963
+ install_nccl_code = install_nccl_on_cluster(args, system)
964
+ if install_nccl_code != 0:
965
+ xpk_exit(install_nccl_code)
966
+
967
+ if system.device_type == H100_DEVICE_TYPE:
968
+ xpk_print('Installing NRI device injector for cluster')
969
+ install_nri_code = install_nri_on_cluster(args)
970
+ if install_nri_code != 0:
971
+ xpk_exit(install_nri_code)
972
+
973
+ if system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
974
+ xpk_print('Disabling MGLRU')
975
+ err_code = disable_mglru_on_cluster(args)
976
+ if err_code > 0:
977
+ xpk_exit(err_code)
xpk/commands/cluster_gcluster.py CHANGED
@@ -16,26 +16,27 @@ limitations under the License.
16
16
 
17
17
  import os
18
18
 
19
- from ..core.remote_state.remote_state_client import RemoteStateClient
20
- from ..core.remote_state.fuse_remote_state import FuseStateClient
21
19
  from ..core.blueprint.blueprint_generator import (
22
20
  BlueprintGenerator,
23
21
  BlueprintGeneratorOutput,
24
22
  a3mega_device_type,
25
23
  a3ultra_device_type,
24
+ a4_device_type,
26
25
  supported_device_types,
27
26
  )
28
- from ..core.commands import run_command_for_value
29
27
  from ..core.capacity import get_capacity_type
28
+ from ..core.cluster import get_cluster_credentials
29
+ from ..core.commands import run_command_for_value
30
30
  from ..core.docker_manager import DockerManager
31
31
  from ..core.gcloud_context import zone_to_region
32
32
  from ..core.gcluster_manager import GclusterManager
33
+ from ..core.kjob import apply_kjob_crds, prepare_kjob
34
+ from ..core.remote_state.fuse_remote_state import FuseStateClient
35
+ from ..core.remote_state.remote_state_client import RemoteStateClient
33
36
  from ..utils.console import xpk_exit, xpk_print
34
37
  from ..utils.file import ensure_directory_exists
35
38
  from ..utils.network import all_IPs_cidr
36
39
  from ..utils.objects import hash_string
37
- from ..core.cluster import get_cluster_credentials
38
- from ..core.kjob import apply_kjob_crds, prepare_kjob
39
40
 
40
41
  blueprints_path = os.path.abspath('xpkclusters/blueprints')
41
42
  gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out')
@@ -266,4 +267,20 @@ def generate_blueprint(
266
267
  system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
267
268
  gcs_bucket=args.cluster_state_gcs_bucket,
268
269
  )
270
+ if args.device_type == a4_device_type:
271
+ num_nodes = args.num_nodes if not args.num_nodes is None else 2
272
+ return bpg.generate_a4_blueprint(
273
+ blueprint_name=blueprint_name,
274
+ prefix=prefix,
275
+ cluster_name=args.cluster,
276
+ region=zone_to_region(args.zone),
277
+ project_id=args.project,
278
+ zone=args.zone,
279
+ auth_cidr=all_IPs_cidr,
280
+ num_nodes=num_nodes,
281
+ reservation=args.reservation if args.reservation else None,
282
+ capacity_type=capacity_type,
283
+ system_node_pool_machine_type=args.default_pool_cpu_machine_type,
284
+ system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
285
+ )
269
286
  return None
xpk/commands/common.py CHANGED
@@ -15,8 +15,10 @@ limitations under the License.
15
15
  """
16
16
 
17
17
  from ..core.commands import run_command_with_updates_retry
18
+ from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
19
+ from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
18
20
  from ..core.gcloud_context import zone_to_region
19
- from ..utils.console import xpk_print
21
+ from ..utils.console import xpk_print, xpk_exit
20
22
 
21
23
 
22
24
  def set_cluster_command(args) -> int:
@@ -31,6 +33,7 @@ def set_cluster_command(args) -> int:
31
33
  command = (
32
34
  'gcloud container clusters get-credentials'
33
35
  f' {args.cluster} --region={zone_to_region(args.zone)}'
36
+ ' --dns-endpoint'
34
37
  f' --project={args.project} &&'
35
38
  ' kubectl config view && kubectl config set-context --current'
36
39
  ' --namespace=default'
@@ -42,3 +45,32 @@ def set_cluster_command(args) -> int:
42
45
  if return_code != 0:
43
46
  xpk_print(f'{task} returned ERROR {return_code}')
44
47
  return return_code
48
+
49
+
50
+ def is_TAS_possible(args) -> bool:
51
+ """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
52
+
53
+ Args:
54
+ args: user provided arguments for running the command.
55
+
56
+ Returns:
57
+ True if possible and False otherwise.
58
+ """
59
+ system_characteristics = get_cluster_system_characteristics(args)
60
+ capacity_type = get_cluster_capacity_type(args)
61
+
62
+ if system_characteristics is None:
63
+ xpk_print('system_characteristics data was not found in configmaps.')
64
+ xpk_exit(1)
65
+
66
+ if capacity_type is None:
67
+ xpk_print('capacity_type data was not found in configmaps.')
68
+ xpk_exit(1)
69
+
70
+ if (
71
+ system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
72
+ and capacity_type == CapacityType.SPOT
73
+ ):
74
+ return False
75
+
76
+ return True
xpk/commands/info.py CHANGED
@@ -20,10 +20,10 @@ from argparse import Namespace
20
20
  from tabulate import tabulate
21
21
 
22
22
  from ..core.commands import run_command_for_value
23
+ from ..core.cluster import get_cluster_credentials
23
24
  from ..core.gcloud_context import add_zone_and_project
24
25
  from ..core.kueue import verify_kueuectl
25
26
  from ..utils.console import xpk_exit, xpk_print
26
- from .common import set_cluster_command
27
27
 
28
28
  table_fmt = 'plain'
29
29
 
@@ -37,9 +37,7 @@ def info(args: Namespace) -> None:
37
37
  None
38
38
  """
39
39
  add_zone_and_project(args)
40
- set_cluster_command_code = set_cluster_command(args)
41
- if set_cluster_command_code != 0:
42
- xpk_exit(set_cluster_command_code)
40
+ get_cluster_credentials(args)
43
41
 
44
42
  verify_kueuectl(args)
45
43
  lq, cq = bool(args.localqueue), bool(args.clusterqueue)
xpk/commands/job.py CHANGED
@@ -20,10 +20,10 @@ import sys
20
20
  from ruamel.yaml import YAML
21
21
 
22
22
  from ..core.commands import run_command_for_value, run_command_with_updates
23
+ from ..core.cluster import get_cluster_credentials
23
24
  from ..core.gcloud_context import add_zone_and_project
24
25
  from ..core.kjob import AppProfileDefaults
25
26
  from ..utils.console import xpk_exit, xpk_print
26
- from .common import set_cluster_command
27
27
  from .kind import set_local_cluster_command
28
28
 
29
29
 
@@ -143,14 +143,14 @@ def job_list(args) -> None:
143
143
  """
144
144
  if not args.kind_cluster:
145
145
  add_zone_and_project(args)
146
- set_cluster_command_code = set_cluster_command(args)
146
+ get_cluster_credentials(args)
147
147
  msg = f'Listing jobs for project {args.project} and zone {args.zone}:'
148
148
  else:
149
149
  set_cluster_command_code = set_local_cluster_command(args)
150
150
  msg = 'Listing jobs:'
151
+ if set_cluster_command_code != 0:
152
+ xpk_exit(set_cluster_command_code)
151
153
 
152
- if set_cluster_command_code != 0:
153
- xpk_exit(set_cluster_command_code)
154
154
  xpk_print(msg, flush=True)
155
155
 
156
156
  return_code = run_slurm_job_list_command(args)
@@ -178,12 +178,11 @@ def job_cancel(args) -> None:
178
178
  xpk_print(f'Starting job cancel for job: {args.name}', flush=True)
179
179
  if not args.kind_cluster:
180
180
  add_zone_and_project(args)
181
- set_cluster_command_code = set_cluster_command(args)
181
+ get_cluster_credentials(args)
182
182
  else:
183
183
  set_cluster_command_code = set_local_cluster_command(args)
184
-
185
- if set_cluster_command_code != 0:
186
- xpk_exit(set_cluster_command_code)
184
+ if set_cluster_command_code != 0:
185
+ xpk_exit(set_cluster_command_code)
187
186
 
188
187
  return_code = run_slurm_job_delete_command(args)
189
188
  xpk_exit(return_code)