xpk 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. xpk/api/__init__.py +15 -0
  2. xpk/api/storage_crd.yaml +52 -0
  3. xpk/commands/batch.py +27 -5
  4. xpk/commands/cluster.py +104 -80
  5. xpk/commands/cluster_gcluster.py +94 -10
  6. xpk/commands/common.py +44 -0
  7. xpk/commands/config.py +29 -0
  8. xpk/commands/info.py +8 -10
  9. xpk/commands/inspector.py +5 -11
  10. xpk/commands/job.py +9 -7
  11. xpk/commands/kind.py +34 -4
  12. xpk/commands/kjob_common.py +44 -0
  13. xpk/commands/run.py +128 -0
  14. xpk/commands/shell.py +27 -7
  15. xpk/commands/storage.py +280 -0
  16. xpk/commands/version.py +6 -18
  17. xpk/commands/workload.py +381 -184
  18. xpk/core/blueprint/blueprint_definitions.py +1 -0
  19. xpk/core/blueprint/blueprint_generator.py +132 -76
  20. xpk/core/capacity.py +185 -0
  21. xpk/core/cluster.py +564 -0
  22. xpk/core/cluster_private.py +6 -3
  23. xpk/core/commands.py +18 -14
  24. xpk/core/config.py +179 -0
  25. xpk/core/docker_container.py +225 -0
  26. xpk/core/docker_image.py +210 -0
  27. xpk/core/docker_resources.py +350 -0
  28. xpk/core/filestore.py +251 -0
  29. xpk/core/gcloud_context.py +196 -0
  30. xpk/core/gcluster_manager.py +20 -2
  31. xpk/core/gcsfuse.py +50 -0
  32. xpk/core/kjob.py +257 -18
  33. xpk/core/kueue.py +12 -6
  34. xpk/core/monitoring.py +134 -0
  35. xpk/core/nap.py +32 -20
  36. xpk/core/network.py +377 -0
  37. xpk/core/nodepool.py +581 -0
  38. xpk/core/pathways.py +124 -45
  39. xpk/core/remote_state/__init__.py +15 -0
  40. xpk/core/remote_state/fuse_remote_state.py +99 -0
  41. xpk/core/remote_state/remote_state_client.py +38 -0
  42. xpk/core/resources.py +238 -0
  43. xpk/core/scheduling.py +253 -0
  44. xpk/core/storage.py +581 -0
  45. xpk/core/system_characteristics.py +38 -1
  46. xpk/core/vertex.py +105 -0
  47. xpk/core/workload.py +209 -1
  48. xpk/core/workload_decorators/rdma_decorator.py +25 -5
  49. xpk/core/workload_decorators/storage_decorator.py +52 -0
  50. xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
  51. xpk/main.py +3 -1
  52. xpk/parser/batch.py +10 -151
  53. xpk/parser/cluster.py +49 -8
  54. xpk/parser/common.py +189 -1
  55. xpk/parser/config.py +49 -0
  56. xpk/parser/core.py +27 -1
  57. xpk/parser/info.py +2 -1
  58. xpk/parser/inspector.py +3 -3
  59. xpk/parser/job.py +25 -4
  60. xpk/parser/kind.py +3 -2
  61. xpk/parser/run.py +47 -0
  62. xpk/parser/shell.py +10 -1
  63. xpk/parser/storage.py +326 -0
  64. xpk/parser/validators.py +3 -3
  65. xpk/parser/workload.py +118 -76
  66. xpk/templates/__init__.py +15 -0
  67. xpk/templates/storage.yaml +13 -0
  68. xpk/utils/gcs_utils.py +125 -0
  69. xpk/utils/kubectl.py +57 -0
  70. xpk/utils/objects.py +8 -5
  71. xpk/utils/templates.py +28 -0
  72. xpk/utils/validation.py +80 -0
  73. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/METADATA +169 -15
  74. xpk-0.7.1.dist-info/RECORD +92 -0
  75. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/WHEEL +1 -1
  76. xpk/core/core.py +0 -2824
  77. xpk-0.6.0.dist-info/RECORD +0 -57
  78. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/entry_points.txt +0 -0
  79. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info/licenses}/LICENSE +0 -0
  80. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/top_level.txt +0 -0
xpk/core/kueue.py CHANGED
@@ -15,18 +15,24 @@ limitations under the License.
15
15
  """
16
16
 
17
17
  from argparse import Namespace
18
- from packaging.version import Version
18
+
19
19
  import packaging
20
+ from packaging.version import Version
21
+
22
+ from ..utils.console import xpk_exit, xpk_print
20
23
  from ..utils.file import write_tmp_file
21
- from ..utils.console import xpk_print, xpk_exit
22
- from .commands import run_command_with_updates, run_command_with_updates_retry, run_command_for_value
23
- from .core import (
24
- AutoprovisioningConfig,
24
+ from .commands import (
25
+ run_command_for_value,
26
+ run_command_with_updates,
27
+ run_command_with_updates_retry,
28
+ )
29
+ from .pathways import add_pw_resource_flavors, add_pw_resources_to_kueue
30
+ from .resources import AutoprovisioningConfig
31
+ from .scheduling import (
25
32
  create_accelerator_label,
26
33
  create_machine_label,
27
34
  get_total_chips_requested_from_args,
28
35
  )
29
- from .pathways import add_pw_resource_flavors, add_pw_resources_to_kueue
30
36
  from .system_characteristics import (
31
37
  AcceleratorTypeToAcceleratorCharacteristics,
32
38
  SystemCharacteristics,
xpk/core/monitoring.py ADDED
@@ -0,0 +1,134 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from ..utils.console import xpk_print
18
+ from .commands import run_command_for_value
19
+
20
+
21
def get_gke_dashboard(args, dashboard_filter) -> tuple[bool, str | None]:
  """Get the identifier of GKE dashboard deployed in the project.

  Args:
    args: user provided arguments for running the command.
    dashboard_filter: filter expression passed to the '--filter' flag of
      'gcloud monitoring dashboards list'.

  Returns:
    bool:
      True if 'gcloud monitoring dashboards list' returned an error or
      multiple dashboards with same filter exist in the project,
      False otherwise.
    str:
      identifier of dashboard (the last '/'-separated segment of its name)
      if exactly one is deployed in project,
      None otherwise.
  """
  command = (
      'gcloud monitoring dashboards list'
      f' --project={args.project} --filter="{dashboard_filter}"'
      ' --format="value(name)" --verbosity=error'
  )

  return_code, return_value = run_command_for_value(
      command, 'GKE Dashboard List', args
  )

  if return_code != 0:
    xpk_print(
        f'GKE Dashboard List request returned ERROR {return_code}. If there is'
        ' a permissions error, please check'
        ' https://github.com/google/xpk/blob/main/README.md#roles-needed-based-on-permission-errors'
        ' for possible solutions.'
    )
    return True, None

  # Drop blank lines so that empty or whitespace-only output is reported as
  # "no dashboard found" rather than as a silent error.
  dashboards = [
      line.strip() for line in (return_value or '').splitlines() if line.strip()
  ]

  if not dashboards:
    xpk_print(
        f'No dashboard with {dashboard_filter} found in the'
        f' project:{args.project}.'
    )
    # Always None (never an empty string) so the result matches the
    # documented 'str | None' contract.
    return False, None

  if len(dashboards) > 1:
    xpk_print(
        f'Multiple dashboards with same {dashboard_filter} exist in the'
        f' project:{args.project}. Delete all but one dashboard deployed using'
        ' https://github.com/google/cloud-tpu-monitoring-debugging.'
    )
    return True, None

  # The command prints the dashboard name; only its final path segment is
  # returned as the identifier.
  return False, dashboards[0].split('/')[-1]
75
+
76
+
77
def get_gke_outlier_dashboard(args) -> str | None:
  """Look up the GKE outlier (TPU monitoring) dashboard in the project.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str:
      identifier of the outlier dashboard when one is deployed in the project,
      None when the lookup failed, was ambiguous, or found nothing.
  """
  failed, identifier = get_gke_dashboard(
      args, "displayName:'GKE - TPU Monitoring Dashboard'"
  )

  # The listing returned an error or several dashboards matched the filter.
  if failed:
    return None

  if identifier:
    return str(identifier)

  # The listing succeeded but no dashboard matches: point the user at the
  # deployment instructions.
  xpk_print(
      'Follow https://github.com/google/cloud-tpu-monitoring-debugging to'
      ' deploy monitoring dashboard to view statistics and outlier mode of'
      ' GKE metrics.'
  )
  return None
105
+
106
+
107
def get_gke_debugging_dashboard(args) -> str | None:
  """Look up the GKE debugging (TPU logging) dashboard in the project.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str:
      identifier of the debugging dashboard when one is deployed in the
      project, None when the lookup failed, was ambiguous, or found nothing.
  """
  failed, identifier = get_gke_dashboard(
      args, "displayName:'GKE - TPU Logging Dashboard'"
  )

  # The listing returned an error or several dashboards matched the filter.
  if failed:
    return None

  if identifier:
    return str(identifier)

  # The listing succeeded but no dashboard matches: point the user at the
  # deployment instructions.
  xpk_print(
      'Follow https://github.com/google/cloud-tpu-monitoring-debugging to'
      ' deploy debugging dashboard to view stack traces collected in Cloud'
      ' Logging.'
  )
  return None
xpk/core/nap.py CHANGED
@@ -14,29 +14,31 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
- from ..core.core import (
17
+ from ..utils.console import xpk_print
18
+ from ..utils.file import write_tmp_file
19
+ from ..utils.objects import get_value_from_map
20
+ from .capacity import (
18
21
  AUTOPROVISIONING_CONFIG_VALUE,
19
22
  CAPACITY_TYPE_CONFIG_KEY,
20
- CLUSTER_METADATA_CONFIGMAP,
21
- CLUSTER_RESOURCES_CONFIGMAP,
22
23
  RESERVATION_CONFIG_KEY,
23
- AutoprovisioningConfig,
24
24
  CapacityType,
25
- get_all_nodepools_programmatic,
26
25
  get_capacity_node_selectors_from_capacity_type,
27
26
  get_capacity_type,
28
- get_cluster_configmap,
29
- get_total_chips_requested_from_args,
30
27
  verify_reservation_exists,
31
- zone_to_region,
32
28
  )
33
- from ..utils.objects import get_value_from_map
34
- from ..utils.file import write_tmp_file
35
- from ..utils.console import xpk_print
36
29
  from .commands import run_command_with_updates, run_commands
30
+ from .gcloud_context import zone_to_region
31
+ from .nodepool import get_all_nodepools_programmatic
32
+ from .resources import (
33
+ CLUSTER_METADATA_CONFIGMAP,
34
+ CLUSTER_RESOURCES_CONFIGMAP,
35
+ AutoprovisioningConfig,
36
+ get_cluster_configmap,
37
+ )
38
+ from .scheduling import get_total_chips_requested_from_args
37
39
  from .system_characteristics import AcceleratorType, SystemCharacteristics
38
40
 
39
- autoprovisioning_config_file = """
41
+ AUTOPROVISIONING_CONFIG_FILE = """
40
42
  management:
41
43
  autoRepair: true
42
44
  autoUpgrade: true
@@ -44,8 +46,7 @@ autoprovisioningLocations:
44
46
  {zones}
45
47
  {resource_limits}
46
48
  """
47
-
48
- autoprovisioning_resource_limits = """
49
+ AUTOPROVISIONING_RESOURCE_LIMITS = """
49
50
  resourceLimits:
50
51
  - resourceType: 'cpu'
51
52
  {cpu_limits}
@@ -53,8 +54,7 @@ resourceLimits:
53
54
  {memory_limits}
54
55
  {custom_resource_type}
55
56
  """
56
-
57
- autoprovisioning_custom_resource_type = """
57
+ AUTOPROVISIONING_CUSTOM_RESOURCE_TYPE = """
58
58
  - resourceType: {resource_type}
59
59
  minimum: {minimum}
60
60
  maximum: {maximum}
@@ -218,19 +218,19 @@ def create_autoprovisioning_config(
218
218
  ' small, rescaling will not work well.'
219
219
  )
220
220
 
221
- custom_resource_string = autoprovisioning_custom_resource_type.format(
221
+ custom_resource_string = AUTOPROVISIONING_CUSTOM_RESOURCE_TYPE.format(
222
222
  resource_type=system.gke_accelerator,
223
223
  minimum=minimum,
224
224
  maximum=maximum,
225
225
  )
226
226
 
227
- resource_limits = autoprovisioning_resource_limits.format(
227
+ resource_limits = AUTOPROVISIONING_RESOURCE_LIMITS.format(
228
228
  cpu_limits=cpu_limits,
229
229
  memory_limits=memory_limits,
230
230
  custom_resource_type=custom_resource_string,
231
231
  )
232
232
 
233
- yml_string = autoprovisioning_config_file.format(
233
+ yml_string = AUTOPROVISIONING_CONFIG_FILE.format(
234
234
  resource_limits=resource_limits,
235
235
  zones=f'- {args.zone}',
236
236
  )
@@ -266,7 +266,7 @@ def is_autoprovisioning_enabled(
266
266
  return False, 0
267
267
 
268
268
  return_code, autoprovisioning_value = get_value_from_map(
269
- system.gke_accelerator, cluster_config_map
269
+ system.gke_accelerator, cluster_config_map, verbose=False
270
270
  )
271
271
  if return_code != 0:
272
272
  xpk_print(
@@ -347,3 +347,15 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
347
347
  return node_selector_args, return_code
348
348
 
349
349
  return node_selector_args, return_code
350
+
351
+
352
def get_cluster_provisioner(args) -> str:
  """Report which provisioner is recorded for the cluster.

  Reads the '<cluster>-<CLUSTER_METADATA_CONFIGMAP>' ConfigMap and looks up
  its 'provisioner' entry.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The recorded provisioner, or 'gcloud' when the ConfigMap or its
    'provisioner' entry is missing.
  """
  configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
  metadata = get_cluster_configmap(args, configmap_name)

  recorded = None if metadata is None else metadata.get('provisioner')
  cluster_provisioner = 'gcloud' if recorded is None else recorded

  xpk_print(f'Cluster provisioner: {cluster_provisioner}')
  return cluster_provisioner
xpk/core/network.py ADDED
@@ -0,0 +1,377 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from ..utils.console import xpk_print
18
+ from ..utils.file import write_tmp_file
19
+ from .capacity import H100_DEVICE_TYPE
20
+ from .commands import run_command_for_value, run_command_with_updates
21
+ from .gcloud_context import zone_to_region
22
+ from .system_characteristics import SystemCharacteristics
23
+
24
# CLUSTER_NETWORK_YAML: manifests applied with kubectl when creating the
# network config for an a3 cluster. It declares four Network objects
# (vpc1..vpc4) and one GKENetworkParamSet per network; the {cluster_name}
# placeholder is filled in by create_cluster_network_config.
CLUSTER_NETWORK_YAML = """
apiVersion: networking.gke.io/v1
kind: Network
metadata:
  name: vpc1
spec:
  parametersRef:
    group: networking.gke.io
    kind: GKENetworkParamSet
    name: vpc1
  type: Device
---
apiVersion: networking.gke.io/v1
kind: Network
metadata:
  name: vpc2
spec:
  parametersRef:
    group: networking.gke.io
    kind: GKENetworkParamSet
    name: vpc2
  type: Device
---
apiVersion: networking.gke.io/v1
kind: Network
metadata:
  name: vpc3
spec:
  parametersRef:
    group: networking.gke.io
    kind: GKENetworkParamSet
    name: vpc3
  type: Device
---
apiVersion: networking.gke.io/v1
kind: Network
metadata:
  name: vpc4
spec:
  parametersRef:
    group: networking.gke.io
    kind: GKENetworkParamSet
    name: vpc4
  type: Device
---
apiVersion: networking.gke.io/v1
kind: GKENetworkParamSet
metadata:
  name: vpc1
spec:
  vpc: {cluster_name}-net-1
  vpcSubnet: {cluster_name}-sub-1
  deviceMode: NetDevice
---
apiVersion: networking.gke.io/v1
kind: GKENetworkParamSet
metadata:
  name: vpc2
spec:
  vpc: {cluster_name}-net-2
  vpcSubnet: {cluster_name}-sub-2
  deviceMode: NetDevice
---
apiVersion: networking.gke.io/v1
kind: GKENetworkParamSet
metadata:
  name: vpc3
spec:
  vpc: {cluster_name}-net-3
  vpcSubnet: {cluster_name}-sub-3
  deviceMode: NetDevice
---
apiVersion: networking.gke.io/v1
kind: GKENetworkParamSet
metadata:
  name: vpc4
spec:
  vpc: {cluster_name}-net-4
  vpcSubnet: {cluster_name}-sub-4
  deviceMode: NetDevice
"""
106
+
107
+
108
def create_cluster_network(args, index) -> int:
  """Ensure the '<cluster>-net-<index>' network exists, creating it if needed.

  Args:
    args: user provided arguments for running the command.
    index: index number for the network to be created.

  Returns:
    0 if the network already exists or was created; the listing's return
    code if listing networks failed; 1 if the create request failed.
  """
  known_networks, list_code = get_all_networks_programmatic(args)
  if list_code > 0:
    xpk_print('Listing all networks failed!')
    return list_code

  network_name = f'{args.cluster}-net-{index}'

  # Nothing to create when a network with this name is already listed.
  if network_name in known_networks:
    xpk_print(f'Reusing existing network {network_name}')
    return 0

  create_command = (
      f'gcloud compute --project={args.project}'
      f' networks create {network_name}'
      ' --subnet-mode=custom --mtu=8244'
  )
  return_code = run_command_with_updates(
      create_command, 'Create Cluster Network', args, verbose=False
  )
  if return_code != 0:
    xpk_print(f'Create Cluster Network request returned ERROR {return_code}')
    return 1

  return 0
141
+
142
+
143
def create_cluster_subnet(args, index) -> int:
  """Ensure the '<cluster>-<region>-sub-<index>' subnet exists.

  The subnet is created on network '<cluster>-net-<index>' with the range
  192.168.<index>.0/24 when it is not already listed.

  Args:
    args: user provided arguments for running the command.
    index: index number for the subnet to be created.

  Returns:
    0 if the subnet already exists or was created; the listing's return code
    if listing subnets failed; 1 if the create request failed.
  """
  known_subnets, list_code = get_all_subnets_programmatic(args)
  if list_code > 0:
    xpk_print('Listing all subnets failed!')
    return list_code

  region = zone_to_region(args.zone)
  subnet_name = f'{args.cluster}-{region}-sub-{index}'

  # Nothing to create when a subnet with this name is already listed.
  if subnet_name in known_subnets:
    xpk_print(f'Reusing existing subnet {subnet_name}')
    return 0

  create_command = (
      f'gcloud compute --project={args.project}'
      f' networks subnets create {subnet_name}'
      f' --network={args.cluster}-net-{index}'
      f' --region={region} --range=192.168.{index}.0/24'
  )
  return_code = run_command_with_updates(
      create_command, 'Create Cluster Subnet', args, verbose=False
  )
  if return_code != 0:
    xpk_print(f'Create Cluster Subnet request returned ERROR {return_code}')
    return 1

  return 0
176
+
177
+
178
def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
  """Build the subnetwork names for an a3mega cluster.

  Args:
    cluster_name: name of the cluster the subnetworks belong to.

  Returns:
    Eight names of the form '<cluster_name>-gpunet-<i>-subnet', i in 0..7.
  """
  subnetworks = []
  for gpunet_index in range(8):
    subnetworks.append(f'{cluster_name}-gpunet-{gpunet_index}-subnet')
  return subnetworks
180
+
181
+
182
def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
  """Build the subnetwork names for an a3ultra cluster.

  Args:
    cluster_name: name of the cluster the subnetworks belong to.

  Returns:
    '<cluster_name>-sub-1' followed by eight names of the form
    '<cluster_name>-rdma-sub-<i>', i in 0..7.
  """
  subnetworks = [f'{cluster_name}-sub-1']
  subnetworks.extend(
      f'{cluster_name}-rdma-sub-{rdma_index}' for rdma_index in range(8)
  )
  return subnetworks
186
+
187
+
188
def create_cluster_firewall_rule(args, index) -> int:
  """Ensure the '<cluster>-internal-<index>' firewall rule exists.

  The rule is created on network '<cluster>-net-<index>' when it is not
  already listed.

  Args:
    args: user provided arguments for running the command.
    index: index number for the firewall rule to be created.

  Returns:
    0 if the rule already exists or was created; the listing's return code if
    listing firewall rules failed; 1 if the create request failed.
  """
  known_rules, list_code = get_all_firewall_rules_programmatic(args)
  if list_code > 0:
    xpk_print('Listing all firewall rules failed!')
    return list_code

  firewall_rule_name = f'{args.cluster}-internal-{index}'

  # Nothing to create when a rule with this name is already listed.
  if firewall_rule_name in known_rules:
    xpk_print(f'Reusing existing firewall rule {firewall_rule_name}')
    return 0

  create_command = (
      f'gcloud compute --project={args.project} firewall-rules create'
      f' {firewall_rule_name} --network={args.cluster}-net-{index} --action=ALLOW'
      ' --rules=tcp:0-65535,udp:0-65535,icmp --source-ranges=192.168.0.0/16'
  )
  return_code = run_command_with_updates(
      create_command, 'Create Cluster Firewall Rule', args, verbose=False
  )
  if return_code != 0:
    xpk_print(
        f'Create Cluster Firewall Rule request returned ERROR {return_code}'
    )
    return 1

  return 0
223
+
224
+
225
def create_cluster_network_config(args) -> int:
  """Run the Create GKE Cluster Network Config request.

  Renders CLUSTER_NETWORK_YAML for the cluster, writes it to a temporary
  file and applies it with 'kubectl apply -f'.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  yml_string = CLUSTER_NETWORK_YAML.format(cluster_name=args.cluster)
  tmp = write_tmp_file(yml_string)
  command = f'kubectl apply -f {str(tmp.file.name)}'

  return_code = run_command_with_updates(
      command, 'GKE Cluster Create Network Config', args
  )
  if return_code != 0:
    # Name the step that actually failed; the message previously said
    # "Create ConfigMap", which is a different operation.
    xpk_print(
        'GKE Cluster Create Network Config request returned ERROR'
        f' {return_code}'
    )
    return 1

  return 0
248
+
249
+
250
def set_up_cluster_network_for_gpu(args, system: SystemCharacteristics) -> int:
  """Set up GKE Cluster networks, subnets and firewall rules for A3/A3+.

  Note: there are 4 NICs for GPU-GPU bw and 1 NIC for host in an A3 node,
  and there are 8 NICs for GPU-GPU bw and 1 NIC for host in an A3+ node.
  One network, subnet and firewall rule is set up per GPU-GPU NIC, using
  indices starting at 1.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    0 if successful and 1 otherwise.
  """
  last_index = 4 if system.device_type == H100_DEVICE_TYPE else 8
  setup_steps = (
      create_cluster_network,
      create_cluster_subnet,
      create_cluster_firewall_rule,
  )

  for network_index in range(1, last_index + 1):
    # Steps run in order for each index; stop at the first failure.
    for step in setup_steps:
      if step(args, network_index) != 0:
        return 1

  return 0
274
+
275
+
276
def delete_cluster_subnets(args) -> int:
  """Delete every subnet returned by get_all_subnets_programmatic.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if all listed subnets were deleted; the listing's return code if
    listing subnets failed; 1 as soon as one delete request fails.
  """
  subnets_to_delete, list_code = get_all_subnets_programmatic(args)
  if list_code > 0:
    xpk_print('Listing all subnets failed!')
    return list_code

  for subnet_name in subnets_to_delete:
    delete_command = (
        f'gcloud compute networks subnets delete {subnet_name}'
        f' --region={zone_to_region(args.zone)} --project={args.project} --quiet'
    )
    return_code = run_command_with_updates(
        delete_command, 'Delete Cluster Subnet', args, verbose=False
    )

    # Abort on the first failed deletion; remaining subnets are left as-is.
    if return_code != 0:
      xpk_print(f'Delete Cluster Subnet request returned ERROR {return_code}')
      return 1

    xpk_print(f'Deleted existing subnet {subnet_name}')

  return 0
307
+
308
+
309
def get_all_networks_programmatic(args) -> tuple[list[str], int]:
  """Gets all the networks associated with the project.

  Args:
    args: user provided arguments for running the command.

  Returns:
    List of networks and 0 if successful and 1 otherwise.
  """
  # List in the same project that networks are created in. Without an explicit
  # --project, gcloud uses its configured default project, so the existence
  # check could look at a different project than the create request.
  command = (
      'gcloud compute networks list'
      f' --project={args.project} --format="csv[no-heading](name)"'
  )
  return_code, raw_network_output = run_command_for_value(
      command, 'Get All Networks', args
  )
  if return_code != 0:
    xpk_print(f'Get All Networks returned ERROR {return_code}')
    return [], 1

  # One network name per output line.
  return raw_network_output.splitlines(), 0
327
+
328
+
329
def get_all_subnets_programmatic(args) -> tuple[list[str], int]:
  """Gets the cluster's subnets in the project.

  Only subnets whose name matches '<cluster>-<region>-sub-*' are requested.

  Args:
    args: user provided arguments for running the command.

  Returns:
    List of subnets and 0 if successful and 1 otherwise.
  """
  region = zone_to_region(args.zone)
  name_pattern = f'{args.cluster}-{region}-sub-*'
  list_command = (
      'gcloud compute networks subnets list'
      f' --filter=name~"{name_pattern}" --project={args.project}'
  )

  return_code, raw_output = run_command_for_value(
      list_command, 'Get All Subnets', args
  )
  if return_code != 0:
    xpk_print(f'Get All Subnets returned ERROR {return_code}')
    return [], 1

  # The first output line is skipped (presumably the header row of gcloud's
  # default table output); the name is the first space-delimited field.
  rows = raw_output.splitlines()[1:]
  subnet_names = [row.split(' ')[0] for row in rows]
  return subnet_names, 0
356
+
357
+
358
def get_all_firewall_rules_programmatic(args) -> tuple[list[str], int]:
  """Gets all the firewall rules associated with the project.

  Args:
    args: user provided arguments for running the command.

  Returns:
    List of firewall rules and 0 if successful and 1 otherwise.
  """
  # List in the same project that firewall rules are created in. Without an
  # explicit --project, gcloud uses its configured default project, so the
  # existence check could look at a different project than the create request.
  command = (
      'gcloud compute firewall-rules list'
      f' --project={args.project} --format="csv[no-heading](name)"'
  )
  return_code, raw_firewall_rules_output = run_command_for_value(
      command, 'Get All Firewall Rules', args
  )
  if return_code != 0:
    xpk_print(f'Get All Firewall Rules returned ERROR {return_code}')
    return [], 1

  # One firewall rule name per output line.
  return raw_firewall_rules_output.splitlines(), 0