xpk 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xpk/commands/batch.py CHANGED
@@ -26,14 +26,13 @@ from ..core.gcloud_context import add_zone_and_project
26
26
  from ..core.kjob import (
27
27
  AppProfileDefaults,
28
28
  JobTemplateDefaults,
29
- Kueue_TAS_annotation,
30
29
  get_storage_annotations,
31
30
  prepare_kjob,
32
31
  )
33
32
  from ..core.kueue import LOCAL_QUEUE_NAME
34
33
  from ..utils.console import xpk_exit, xpk_print
35
34
  from .kind import set_local_cluster_command
36
- from .kjob_common import add_gpu_networking_annotations_to_command
35
+ from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
37
36
 
38
37
 
39
38
  def batch(args: Namespace) -> None:
@@ -68,11 +67,11 @@ def submit_job(args: Namespace) -> None:
68
67
  'kubectl kjob create slurm'
69
68
  f' --profile {AppProfileDefaults.NAME.value}'
70
69
  f' --localqueue {LOCAL_QUEUE_NAME}'
71
- f' --pod-template-annotation {Kueue_TAS_annotation}'
72
70
  f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
73
71
  ' --first-node-ip'
74
72
  )
75
73
  cmd = add_gpu_networking_annotations_to_command(args, cmd)
74
+ cmd = add_TAS_annotations_to_command(args, cmd)
76
75
 
77
76
  for annotation in get_storage_annotations(args):
78
77
  cmd += f' --pod-template-annotation {annotation}'
xpk/commands/cluster.py CHANGED
@@ -16,19 +16,22 @@ limitations under the License.
16
16
 
17
17
  from tabulate import tabulate
18
18
 
19
- from ..core.capacity import H100_DEVICE_TYPE
19
+ from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE
20
20
  from ..core.cluster import (
21
21
  get_all_clusters_programmatic,
22
22
  get_cluster_credentials,
23
23
  install_nccl_on_cluster,
24
+ install_nri_on_cluster,
24
25
  set_jobset_on_cluster,
25
26
  set_pathways_job_on_cluster,
26
27
  setup_k8s_env,
27
- update_cluster_with_gcsfuse_driver_if_necessary,
28
- update_cluster_with_workload_identity_if_necessary,
28
+ disable_mglru_on_cluster,
29
+ count_nodes_on_cluster,
29
30
  update_cluster_with_gcpfilestore_driver_if_necessary,
31
+ update_cluster_with_gcsfuse_driver_if_necessary,
30
32
  update_cluster_with_parallelstore_driver_if_necessary,
31
33
  update_cluster_with_pd_driver_if_necessary,
34
+ update_cluster_with_workload_identity_if_necessary,
32
35
  )
33
36
  from ..core.cluster_private import authorize_private_cluster_access_if_necessary
34
37
  from ..core.commands import run_command_for_value, run_command_with_updates
@@ -52,8 +55,12 @@ from ..core.network import (
52
55
  delete_cluster_subnets,
53
56
  set_up_cluster_network_for_a3,
54
57
  )
55
- from ..core.nodepool import get_gke_node_pool_version, run_gke_node_pool_create_command
58
+ from ..core.nodepool import (
59
+ get_gke_node_pool_version,
60
+ run_gke_node_pool_create_command,
61
+ )
56
62
  from ..core.ray import install_ray_cluster
63
+ from ..core.mtc import install_mtc_on_cluster
57
64
  from ..core.resources import create_cluster_configmaps
58
65
  from ..core.storage import install_storage_crd
59
66
  from ..core.system_characteristics import (
@@ -70,14 +77,123 @@ from . import cluster_gcluster
70
77
  from .common import set_cluster_command
71
78
 
72
79
 
80
+ def cluster_adapt(args) -> None:
81
+ """Function that performs cluster adaptation.
82
+
83
+ Args:
84
+ args: user provided arguments for running the command.
85
+ """
86
+ args.enable_pathways = False
87
+
88
+ system, return_code = get_system_characteristics(args)
89
+
90
+ if return_code > 0:
91
+ xpk_print('Fetching system characteristics failed!')
92
+ xpk_exit(return_code)
93
+
94
+ xpk_print(
95
+ f'Starting cluster adaptation for cluster {args.cluster}:', flush=True
96
+ )
97
+ add_zone_and_project(args)
98
+
99
+ if system.accelerator_type == AcceleratorType['GPU'] and not getattr(
100
+ args, 'num_nodes'
101
+ ):
102
+ xpk_print(
103
+ 'Argument --num-nodes was not provided, trying to determine number of'
104
+ ' nodes based on the available nodes in the cluster...'
105
+ )
106
+ args.num_nodes = count_nodes_on_cluster(args, system)
107
+ if args.num_nodes == 0:
108
+ xpk_print(
109
+ 'Found unexpected number of nodes. Is the --device-type correct?'
110
+ )
111
+ xpk_exit(1)
112
+ else:
113
+ xpk_print(f'Using {args.num_nodes} nodes.')
114
+
115
+ # ToDo(roshanin@) - Re-enable CloudDNS on Pathways clusters conditionally.
116
+ # Enable WorkloadIdentity if not enabled already.
117
+ if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
118
+ update_cluster_command_code = (
119
+ update_cluster_with_workload_identity_if_necessary(args)
120
+ )
121
+ if update_cluster_command_code != 0:
122
+ xpk_exit(update_cluster_command_code)
123
+
124
+ get_cluster_credentials(args)
125
+
126
+ k8s_client = setup_k8s_env(args)
127
+
128
+ install_storage_crd(k8s_client)
129
+ install_storage_csis(args)
130
+
131
+ # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
132
+ tensorboard_config = {}
133
+ if VERTEX_TENSORBOARD_FEATURE_FLAG and args.create_vertex_tensorboard:
134
+ tensorboard_config = create_vertex_tensorboard(args)
135
+ # exit if failed to create Tensorboard in Vertex AI
136
+ if not tensorboard_config:
137
+ xpk_exit(1)
138
+
139
+ # Provision node pools dynamically based on incoming workloads:
140
+ # Currently autoprovisioning is not supported with Pathways.
141
+ autoprovisioning_config = None
142
+ if args.enable_autoprovisioning:
143
+ xpk_print('Enabling Autoprovisioning')
144
+ autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
145
+ args, system
146
+ )
147
+ if return_code != 0:
148
+ xpk_exit(return_code)
149
+
150
+ xpk_print('Creating ConfigMap for cluster')
151
+ create_cluster_configmaps_code = create_cluster_configmaps(
152
+ args, system, tensorboard_config, autoprovisioning_config
153
+ )
154
+ if create_cluster_configmaps_code != 0:
155
+ xpk_exit(create_cluster_configmaps_code)
156
+
157
+ xpk_print(
158
+ 'Enabling the jobset API on our cluster, to be deprecated when Jobset is'
159
+ ' globally available'
160
+ )
161
+ set_jobset_on_cluster_code = set_jobset_on_cluster(args)
162
+ if set_jobset_on_cluster_code != 0:
163
+ xpk_exit(set_jobset_on_cluster_code)
164
+
165
 + # TODO: Uncomment when cluster_adapt will support TPU clusters
166
+ # set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
167
+ # if set_pathways_job_on_cluster_code != 0:
168
+ # xpk_exit(set_pathways_job_on_cluster_code)
169
+
170
+ install_kueue(args, system, autoprovisioning_config)
171
+
172
+ install_kjob(args)
173
+
174
+ if system.accelerator_type == AcceleratorType['GPU']:
175
+ prepare_gpus(args, system)
176
+
177
+ if args.enable_ray_cluster:
178
+ return_code = install_ray_cluster(args, system)
179
+ if return_code != 0:
180
+ xpk_print('Installation of RayCluster failed.')
181
+ xpk_exit(return_code)
182
+
183
+ xpk_print('GKE commands done! Resources are created.')
184
+ xpk_print(
185
+ 'See your GKE Cluster here:'
186
+ # pylint: disable=line-too-long
187
+ f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
188
+ )
189
+ xpk_exit(0)
190
+
191
+
73
192
  def cluster_create(args) -> None:
74
193
  """Function around cluster creation.
75
194
 
76
195
  Args:
77
196
  args: user provided arguments for running the command.
78
-
79
- Returns:
80
- 0 if successful and 1 otherwise.
81
197
  """
82
198
  system, return_code = get_system_characteristics(args)
83
199
 
@@ -127,38 +243,12 @@ def cluster_create(args) -> None:
127
243
  if update_cluster_command_code != 0:
128
244
  xpk_exit(update_cluster_command_code)
129
245
 
130
- # Enable GCSFuse CSI Driver if not enabled already.
131
- if args.enable_gcsfuse_csi_driver:
132
- update_cluster_command_code = (
133
- update_cluster_with_gcsfuse_driver_if_necessary(args)
134
- )
135
- if update_cluster_command_code != 0:
136
- xpk_exit(update_cluster_command_code)
137
-
138
- if args.enable_gcpfilestore_csi_driver:
139
- update_cluster_command_code = (
140
- update_cluster_with_gcpfilestore_driver_if_necessary(args)
141
- )
142
- if update_cluster_command_code != 0:
143
- xpk_exit(update_cluster_command_code)
144
-
145
- if args.enable_parallelstore_csi_driver:
146
- update_cluster_command_code = (
147
- update_cluster_with_parallelstore_driver_if_necessary(args)
148
- )
149
- if update_cluster_command_code != 0:
150
- xpk_exit(update_cluster_command_code)
151
-
152
- if args.enable_pd_csi_driver:
153
- update_cluster_command_code = update_cluster_with_pd_driver_if_necessary(
154
- args
155
- )
156
- if update_cluster_command_code != 0:
157
- xpk_exit(update_cluster_command_code)
246
+ get_cluster_credentials(args)
158
247
 
159
- # Update Pathways clusters with CloudDNS if not enabled already.
248
+ k8s_client = setup_k8s_env(args)
160
249
 
161
- get_cluster_credentials(args)
250
+ install_storage_crd(k8s_client)
251
+ install_storage_csis(args)
162
252
 
163
253
  # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
164
254
  tensorboard_config = {}
@@ -223,45 +313,12 @@ def cluster_create(args) -> None:
223
313
  if set_pathways_job_on_cluster_code != 0:
224
314
  xpk_exit(set_pathways_job_on_cluster_code)
225
315
 
226
- xpk_print('Enabling Kueue on the cluster')
227
- install_kueue_on_cluster_code = install_kueue_on_cluster(args)
228
- if install_kueue_on_cluster_code != 0:
229
- xpk_exit(install_kueue_on_cluster_code)
316
+ install_kueue(args, system, autoprovisioning_config)
230
317
 
231
- xpk_print('Verifying kjob installation')
232
- err_code = verify_kjob_installed(args)
233
- if err_code > 0:
234
- xpk_exit(err_code)
235
-
236
- xpk_print('Applying kjob CDRs')
237
- err_code = apply_kjob_crds(args)
238
- if err_code > 0:
239
- xpk_exit(err_code)
240
-
241
- err_code = prepare_kjob(args)
242
- if err_code > 0:
243
- xpk_exit(err_code)
244
-
245
- k8s_client = setup_k8s_env(args)
246
- install_storage_crd(k8s_client)
247
-
248
- xpk_print('Wait for Kueue to be fully available')
249
- wait_for_kueue_available_code = wait_for_kueue_available(args)
250
- if wait_for_kueue_available_code != 0:
251
- xpk_exit(wait_for_kueue_available_code)
252
-
253
- xpk_print('Install Kueue Custom Resources')
254
- enable_kueue_credentials_code = install_kueue_crs(
255
- args, system, autoprovisioning_config
256
- )
257
- if enable_kueue_credentials_code != 0:
258
- xpk_exit(enable_kueue_credentials_code)
318
+ install_kjob(args)
259
319
 
260
320
  if system.accelerator_type == AcceleratorType['GPU']:
261
- xpk_print('Installing NCCL Plugin for cluster')
262
- install_nccl_code = install_nccl_on_cluster(args, system)
263
- if install_nccl_code != 0:
264
- xpk_exit(install_nccl_code)
321
+ prepare_gpus(args, system)
265
322
 
266
323
  if args.enable_ray_cluster:
267
324
  return_code = install_ray_cluster(args, system)
@@ -269,6 +326,12 @@ def cluster_create(args) -> None:
269
326
  xpk_print('Installation of RayCluster failed.')
270
327
  xpk_exit(return_code)
271
328
 
329
+ if hasattr(args, 'enable_mtc') and args.enable_mtc:
330
+ return_code = install_mtc_on_cluster(args, system)
331
+ if return_code != 0:
332
+ xpk_print('Installation of MTC failed.')
333
+ xpk_exit(return_code)
334
+
272
335
  xpk_print('GKE commands done! Resources are created.')
273
336
  xpk_print(
274
337
  'See your GKE Cluster here:'
@@ -773,6 +836,7 @@ def run_gke_cluster_create_command(
773
836
  f' --num-nodes {args.default_pool_cpu_num_nodes}'
774
837
  f' {args.custom_cluster_arguments}'
775
838
  f' {rapid_release_cmd}'
839
+ ' --enable-dns-access'
776
840
  )
777
841
 
778
842
  enable_ip_alias = False
@@ -805,6 +869,7 @@ def run_gke_cluster_create_command(
805
869
  addons = []
806
870
  if args.enable_gcsfuse_csi_driver:
807
871
  addons.append('GcsFuseCsiDriver')
872
+
808
873
  if args.enable_gcpfilestore_csi_driver:
809
874
  addons.append('GcpFilestoreCsiDriver')
810
875
 
@@ -814,6 +879,9 @@ def run_gke_cluster_create_command(
814
879
  if args.enable_pd_csi_driver:
815
880
  addons.append('GcePersistentDiskCsiDriver')
816
881
 
882
+ if hasattr(args, 'enable_mtc') and args.enable_mtc:
883
+ addons.append('HighScaleCheckpointing')
884
+
817
885
  if len(addons) > 0:
818
886
  addons_str = ','.join(addons)
819
887
  command += f' --addons={addons_str}'
@@ -823,3 +891,87 @@ def run_gke_cluster_create_command(
823
891
  xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
824
892
  return 1
825
893
  return 0
894
+
895
+
896
+ def install_storage_csis(args):
897
+ if args.enable_gcsfuse_csi_driver:
898
+ update_cluster_command_code = (
899
+ update_cluster_with_gcsfuse_driver_if_necessary(args)
900
+ )
901
+ if update_cluster_command_code != 0:
902
+ xpk_exit(update_cluster_command_code)
903
+
904
+ if args.enable_gcpfilestore_csi_driver:
905
+ update_cluster_command_code = (
906
+ update_cluster_with_gcpfilestore_driver_if_necessary(args)
907
+ )
908
+ if update_cluster_command_code != 0:
909
+ xpk_exit(update_cluster_command_code)
910
+
911
+ if args.enable_parallelstore_csi_driver:
912
+ update_cluster_command_code = (
913
+ update_cluster_with_parallelstore_driver_if_necessary(args)
914
+ )
915
+ if update_cluster_command_code != 0:
916
+ xpk_exit(update_cluster_command_code)
917
+
918
+ if args.enable_pd_csi_driver:
919
+ update_cluster_command_code = update_cluster_with_pd_driver_if_necessary(
920
+ args
921
+ )
922
+ if update_cluster_command_code != 0:
923
+ xpk_exit(update_cluster_command_code)
924
+
925
+
926
+ def install_kjob(args):
927
+ xpk_print('Verifying kjob installation')
928
+ err_code = verify_kjob_installed(args)
929
+ if err_code > 0:
930
+ xpk_exit(err_code)
931
+
932
+ xpk_print('Applying kjob CDRs')
933
+ err_code = apply_kjob_crds(args)
934
+ if err_code > 0:
935
+ xpk_exit(err_code)
936
+
937
+ err_code = prepare_kjob(args)
938
+ if err_code > 0:
939
+ xpk_exit(err_code)
940
+
941
+
942
+ def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
943
+ xpk_print('Enabling Kueue on the cluster')
944
+ install_kueue_on_cluster_code = install_kueue_on_cluster(args)
945
+ if install_kueue_on_cluster_code != 0:
946
+ xpk_exit(install_kueue_on_cluster_code)
947
+
948
+ xpk_print('Wait for Kueue to be fully available')
949
+ wait_for_kueue_available_code = wait_for_kueue_available(args)
950
+ if wait_for_kueue_available_code != 0:
951
+ xpk_exit(wait_for_kueue_available_code)
952
+
953
+ xpk_print('Install Kueue Custom Resources')
954
+ enable_kueue_credentials_code = install_kueue_crs(
955
+ args, system, autoprovisioning_config
956
+ )
957
+ if enable_kueue_credentials_code != 0:
958
+ xpk_exit(enable_kueue_credentials_code)
959
+
960
+
961
+ def prepare_gpus(args, system: SystemCharacteristics):
962
+ xpk_print('Installing NCCL Plugin for cluster')
963
+ install_nccl_code = install_nccl_on_cluster(args, system)
964
+ if install_nccl_code != 0:
965
+ xpk_exit(install_nccl_code)
966
+
967
+ if system.device_type == H100_DEVICE_TYPE:
968
+ xpk_print('Installing NRI device injector for cluster')
969
+ install_nri_code = install_nri_on_cluster(args)
970
+ if install_nri_code != 0:
971
+ xpk_exit(install_nri_code)
972
+
973
+ if system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
974
+ xpk_print('Disabling MGLRU')
975
+ err_code = disable_mglru_on_cluster(args)
976
+ if err_code > 0:
977
+ xpk_exit(err_code)
xpk/commands/common.py CHANGED
@@ -15,8 +15,10 @@ limitations under the License.
15
15
  """
16
16
 
17
17
  from ..core.commands import run_command_with_updates_retry
18
+ from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
19
+ from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
18
20
  from ..core.gcloud_context import zone_to_region
19
- from ..utils.console import xpk_print
21
+ from ..utils.console import xpk_print, xpk_exit
20
22
 
21
23
 
22
24
  def set_cluster_command(args) -> int:
@@ -31,6 +33,7 @@ def set_cluster_command(args) -> int:
31
33
  command = (
32
34
  'gcloud container clusters get-credentials'
33
35
  f' {args.cluster} --region={zone_to_region(args.zone)}'
36
+ ' --dns-endpoint'
34
37
  f' --project={args.project} &&'
35
38
  ' kubectl config view && kubectl config set-context --current'
36
39
  ' --namespace=default'
@@ -42,3 +45,32 @@ def set_cluster_command(args) -> int:
42
45
  if return_code != 0:
43
46
  xpk_print(f'{task} returned ERROR {return_code}')
44
47
  return return_code
48
+
49
+
50
+ def is_TAS_possible(args) -> bool:
51
+ """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
52
+
53
+ Args:
54
+ args: user provided arguments for running the command.
55
+
56
+ Returns:
57
+ True if possible and False otherwise.
58
+ """
59
+ system_characteristics = get_cluster_system_characteristics(args)
60
+ capacity_type = get_cluster_capacity_type(args)
61
+
62
+ if system_characteristics is None:
63
+ xpk_print('system_characteristics data was not found in configmaps.')
64
+ xpk_exit(1)
65
+
66
+ if capacity_type is None:
67
+ xpk_print('capacity_type data was not found in configmaps.')
68
+ xpk_exit(1)
69
+
70
+ if (
71
+ system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
72
+ and capacity_type == CapacityType.SPOT
73
+ ):
74
+ return False
75
+
76
+ return True
@@ -24,7 +24,9 @@ from ..core.kjob import (
24
24
  get_a3mega_pod_template_annotations,
25
25
  get_a3ultra_pod_template_annotations,
26
26
  get_a4_pod_template_annotations,
27
+ Kueue_TAS_annotation,
27
28
  )
29
+ from .common import is_TAS_possible
28
30
 
29
31
 
30
32
  def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
@@ -35,7 +37,7 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
35
37
  elif gpu_type == H200_DEVICE_TYPE:
36
38
  annotations = get_a3ultra_pod_template_annotations(args)
37
39
  elif gpu_type == B200_DEVICE_TYPE:
38
- annotations = get_a4_pod_template_annotations()
40
+ annotations = get_a4_pod_template_annotations(args)
39
41
  else:
40
42
  annotations = []
41
43
 
@@ -45,3 +47,10 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
45
47
  cmd += "\\\n".join(flags)
46
48
 
47
49
  return cmd
50
+
51
+
52
+ def add_TAS_annotations_to_command(args, cmd: str) -> str:
53
+ if is_TAS_possible(args):
54
+ cmd += f" --pod-template-annotation {Kueue_TAS_annotation}"
55
+
56
+ return cmd
xpk/commands/run.py CHANGED
@@ -25,14 +25,13 @@ from ..core.gcloud_context import add_zone_and_project
25
25
  from ..core.kjob import (
26
26
  AppProfileDefaults,
27
27
  JobTemplateDefaults,
28
- Kueue_TAS_annotation,
29
28
  get_storage_annotations,
30
29
  prepare_kjob,
31
30
  )
32
31
  from ..core.kueue import LOCAL_QUEUE_NAME
33
32
  from ..utils.console import xpk_exit, xpk_print
34
33
  from .kind import set_local_cluster_command
35
- from .kjob_common import add_gpu_networking_annotations_to_command
34
+ from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
36
35
 
37
36
 
38
37
  def run(args: Namespace) -> None:
@@ -64,12 +63,12 @@ def submit_job(args: Namespace) -> None:
64
63
  'kubectl kjob create slurm --profile'
65
64
  f' {AppProfileDefaults.NAME.value} '
66
65
  f' --localqueue {LOCAL_QUEUE_NAME} '
67
- f" --pod-template-annotation '{Kueue_TAS_annotation}'"
68
66
  f' --stream-container {JobTemplateDefaults.CONTAINER_NAME.value}'
69
67
  f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
70
68
  ' --wait --rm --first-node-ip'
71
69
  )
72
70
  cmd = add_gpu_networking_annotations_to_command(args, cmd)
71
+ cmd = add_TAS_annotations_to_command(args, cmd)
73
72
 
74
73
  for annotation in get_storage_annotations(args):
75
74
  cmd += f' --pod-template-annotation {annotation}'
xpk/commands/storage.py CHANGED
@@ -86,7 +86,6 @@ def storage_create(args: Namespace) -> None:
86
86
  args.vol,
87
87
  args.access_mode,
88
88
  filestore_network,
89
- args.mount_options,
90
89
  )
91
90
 
92
91
  k8s_api_client = setup_k8s_env(args)
@@ -162,7 +161,6 @@ def storage_attach(args: Namespace) -> None:
162
161
  args.vol,
163
162
  args.access_mode,
164
163
  filestore_network,
165
- args.mount_options,
166
164
  )
167
165
 
168
166
  elif args.type == GCS_FUSE_TYPE:
@@ -178,7 +176,11 @@ def storage_attach(args: Namespace) -> None:
178
176
  manifest = list(yaml.safe_load_all(f))
179
177
  else:
180
178
  manifest = gcsfuse.manifest(
181
- args.name, args.bucket, args.size, args.mount_options
179
+ args.name,
180
+ args.bucket,
181
+ args.size,
182
+ args.mount_options,
183
+ args.prefetch_metadata,
182
184
  )
183
185
 
184
186
  elif args.type in [PARALLELSTORE_TYPE, GCE_PD_TYPE]:
@@ -323,3 +325,12 @@ def delete_storage_resources(k8s_api_client: ApiClient, storage: Storage):
323
325
  storage.name,
324
326
  "Storage",
325
327
  )
328
+
329
+ # remove kubernetes.io/pvc-protection
330
+ delete_resource(
331
+ lambda name: core_api.patch_namespaced_persistent_volume_claim(
332
+ name, "default", {"metadata": {"finalizers": None}}
333
+ ),
334
+ storage.pvc,
335
+ "Persistent Volume Claim finalizers",
336
+ )
xpk/commands/workload.py CHANGED
@@ -14,11 +14,6 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
- from ..core.blueprint.blueprint_generator import (
18
- get_subnetworks_for_a3mega,
19
- get_subnetworks_for_a3ultra,
20
- get_subnetworks_for_a4,
21
- )
22
17
  from ..core.cluster import (
23
18
  XPK_SA,
24
19
  create_xpk_k8s_service_account,
@@ -43,6 +38,7 @@ from ..core.nap import (
43
38
  get_autoprovisioning_node_selector_args,
44
39
  is_autoprovisioning_enabled,
45
40
  )
41
+ from ..core.network import get_cluster_subnetworks
46
42
  from ..core.pathways import (
47
43
  append_custom_colocated_python_sidecar,
48
44
  append_custom_pathways_proxy_server,
@@ -93,6 +89,7 @@ from ..core.workload_decorators import (
93
89
  )
94
90
  from ..utils.console import get_user_input, xpk_exit, xpk_print
95
91
  from ..utils.file import write_tmp_file
92
+ from .common import is_TAS_possible
96
93
  from . import cluster_gcluster
97
94
 
98
95
  WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
@@ -217,7 +214,7 @@ spec:
217
214
  labels:
218
215
  xpk.google.com/workload: {args.workload}
219
216
  annotations:
220
- kueue.x-k8s.io/podset-preferred-topology: "cloud.google.com/gce-topology-host"
217
+ {kueue_TAS_annotation}
221
218
  spec:
222
219
  priorityClassName: {args.priority}
223
220
  restartPolicy: Never
@@ -406,7 +403,7 @@ def workload_create(args) -> None:
406
403
  f' {parallelstore_storages}'
407
404
  )
408
405
  else:
409
- xpk_print('No gcp filestore instances to add detected.')
406
+ xpk_print('No gcp parallelstore instances to add detected.')
410
407
 
411
408
  if len(pd_storages) > 0:
412
409
  service_account = XPK_SA
@@ -451,6 +448,13 @@ def workload_create(args) -> None:
451
448
  if return_code != 0:
452
449
  xpk_exit(return_code)
453
450
 
451
+ kueue_TAS_annotation = (
452
+ 'kueue.x-k8s.io/podset-preferred-topology:'
453
+ ' "cloud.google.com/gce-topology-host"'
454
+ )
455
+ if not is_TAS_possible(args):
456
+ kueue_TAS_annotation = ''
457
+
454
458
  if system.device_type in cluster_gcluster.supported_device_types:
455
459
  yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format(
456
460
  args=args,
@@ -458,18 +462,16 @@ def workload_create(args) -> None:
458
462
  service_account=XPK_SA,
459
463
  failure_policy_rules=failure_policy_rules,
460
464
  pod_failure_policy=pod_failure_policy,
465
+ kueue_TAS_annotation=kueue_TAS_annotation,
461
466
  )
462
467
 
468
+ sub_networks = get_cluster_subnetworks(args)
463
469
  if args.device_type == cluster_gcluster.a3mega_device_type:
464
- sub_networks = get_subnetworks_for_a3mega(args.cluster)
465
470
  yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)
466
-
467
- if args.device_type == cluster_gcluster.a3ultra_device_type:
468
- sub_networks = get_subnetworks_for_a3ultra(args.cluster)
469
- yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
470
-
471
- if args.device_type == cluster_gcluster.a4_device_type:
472
- sub_networks = get_subnetworks_for_a4()
471
+ elif args.device_type in [
472
+ cluster_gcluster.a3ultra_device_type,
473
+ cluster_gcluster.a4_device_type,
474
+ ]:
473
475
  yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
474
476
 
475
477
  if all_storages: