xpk 0.7.2__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. xpk/commands/batch.py +19 -13
  2. xpk/commands/cluster.py +240 -71
  3. xpk/commands/cluster_gcluster.py +22 -5
  4. xpk/commands/common.py +33 -1
  5. xpk/commands/info.py +2 -4
  6. xpk/commands/job.py +7 -8
  7. xpk/commands/kjob_common.py +30 -18
  8. xpk/commands/run.py +17 -12
  9. xpk/commands/shell.py +3 -4
  10. xpk/commands/storage.py +75 -19
  11. xpk/commands/workload.py +161 -324
  12. xpk/core/blueprint/blueprint_definitions.py +2 -0
  13. xpk/core/blueprint/blueprint_generator.py +335 -45
  14. xpk/core/capacity.py +1 -0
  15. xpk/core/cluster.py +193 -12
  16. xpk/core/config.py +3 -1
  17. xpk/core/docker_manager.py +1 -1
  18. xpk/core/docker_resources.py +9 -21
  19. xpk/core/filestore.py +5 -1
  20. xpk/core/gcsfuse.py +27 -6
  21. xpk/core/kjob.py +66 -20
  22. xpk/core/kueue.py +30 -0
  23. xpk/core/mtc.py +195 -0
  24. xpk/core/nap.py +4 -0
  25. xpk/core/network.py +34 -22
  26. xpk/core/nodepool.py +28 -26
  27. xpk/core/pathways.py +165 -210
  28. xpk/core/resources.py +21 -0
  29. xpk/core/scheduling.py +36 -0
  30. xpk/core/storage.py +66 -12
  31. xpk/core/system_characteristics.py +9 -0
  32. xpk/core/workload.py +28 -83
  33. xpk/core/workload_decorators/rdma_decorator.py +11 -15
  34. xpk/core/workload_decorators/storage_decorator.py +8 -3
  35. xpk/core/workload_decorators/tcpx_decorator.py +179 -0
  36. xpk/core/workload_decorators/tcpxo_decorator.py +17 -16
  37. xpk/parser/cluster.py +574 -381
  38. xpk/parser/storage.py +25 -5
  39. xpk/parser/workload.py +59 -31
  40. xpk/utils/kubectl.py +4 -1
  41. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/METADATA +192 -93
  42. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/RECORD +46 -44
  43. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
  44. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
  45. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
  46. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
xpk/core/mtc.py ADDED
@@ -0,0 +1,195 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import requests
18
+ import yaml
19
+
20
+ from ..core.cluster import JOBSET_VERSION
21
+ from ..core.cluster import setup_k8s_env
22
+ from ..utils import templates
23
+ from ..utils.console import xpk_exit
24
+ from ..utils.console import xpk_print
25
+ from ..utils.kubectl import apply_kubectl_manifest
26
+
27
+
28
# Template for the MTC CheckpointConfiguration custom resource.
# NOTE(review): the leading "/../" suggests the path is resolved relative to
# a directory inside the package by templates.load() — confirm against the
# templates helper before changing.
MTC_CPC_PATH = "/../templates/mtc-cpc.yaml"
29
+
30
+
31
def create_mtc_cpc(
    mtc_gcs_bucket: str,
    mtc_machine_type: str,
    mtc_toleration_key: str,
    mtc_ramdisk_size: str,
) -> dict:
  """Create MTC Checkpoint Configuration.

  Loads the bundled CheckpointConfiguration template and fills in the
  caller-supplied bucket, machine type, toleration key and ramdisk size.

  Args:
    mtc_gcs_bucket: GCS bucket for MTC
    mtc_machine_type: Machine type for MTC
    mtc_toleration_key: Toleration key for MTC
    mtc_ramdisk_size: Ramdisk size for MTC

  Returns:
    MTC Checkpoint Configuration
  """
  manifest = templates.load(MTC_CPC_PATH)

  spec = manifest["spec"]
  spec["cloudStorageBucketName"] = mtc_gcs_bucket
  spec["nodeSelector"]["node.kubernetes.io/instance-type"] = mtc_machine_type
  # Only the first toleration entry in the template is parameterized.
  spec["tolerations"][0]["key"] = mtc_toleration_key
  spec["inMemoryVolumeSize"] = mtc_ramdisk_size

  return manifest
58
+
59
+
60
def install_mtc_on_cluster(args, system) -> int:
  """Install MTC on the cluster.

  Validates the MTC arguments, updates the jobset controller resources if
  needed, and applies the MTC CheckpointConfiguration.

  Args:
    args: user provided arguments for running the command.
    system: system related information.

  Returns:
    return code of the command.
  """
  if args.mtc_gcs_bucket is None:
    xpk_print("MTC GCS bucket is required.")
    xpk_exit(1)
  if args.mtc_gcs_bucket.startswith("gs://"):
    # Strip only the leading scheme; str.replace would also mangle any later
    # occurrence of "gs://" inside the bucket path.
    args.mtc_gcs_bucket = args.mtc_gcs_bucket.removeprefix("gs://")

  if args.mtc_ramdisk_size is None:
    xpk_print("MTC ramdisk size is required.")
    xpk_exit(1)

  if args.mtc_toleration_key is None:
    args.mtc_toleration_key = "google.com/tpu"

  k8s_api_client = setup_k8s_env(args)
  jobset_manifest = update_jobset_manifest()
  if jobset_manifest is None:
    # Nothing to apply for the jobset controller; do NOT pass a None manifest
    # to kubectl (the previous code applied [None] here).
    xpk_print(
        "Updated jobset manifest is empty, not updating the jobset controller."
    )
  else:
    xpk_print("Applying Jobset with MTC Configuration")
    return_code = apply_kubectl_manifest(k8s_api_client, [jobset_manifest])
    if return_code != 0:
      return return_code

  mtc_checkpoint_configuration_crd_data = create_mtc_cpc(
      args.mtc_gcs_bucket,
      system.gce_machine_type,
      args.mtc_toleration_key,
      args.mtc_ramdisk_size,
  )
  xpk_print("Applying MTC Checkpoint Configuration")
  return_code = apply_kubectl_manifest(
      k8s_api_client, [mtc_checkpoint_configuration_crd_data]
  )

  return return_code
107
+
108
+
109
def update_jobset_manifest():
  """Update the jobset manifest to increase the resources for the jobset controller manager.

  Fetches the upstream jobset release manifest for JOBSET_VERSION and, if the
  'manager' container of the jobset-controller-manager Deployment requests
  fewer resources than MTC needs, raises them.

  Returns:
    The updated Deployment document if a resource increase was required,
    otherwise None (no update necessary, or the Deployment was not found).
  """
  manifest_url = f"https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml"
  manifest_content = None
  # Fetch the manifest content
  try:
    response = requests.get(manifest_url, timeout=10)
    response.raise_for_status()  # Raise an exception for HTTP errors
    manifest_content = response.text
  except requests.exceptions.Timeout as e:
    # Fixed message: the original read "Request to {url} after 10 seconds".
    xpk_print(f"Error: Request to {manifest_url} timed out after 10 seconds: {e}")
    xpk_exit(1)
  except requests.exceptions.RequestException as e:
    xpk_print(f"Error fetching manifest from {manifest_url}: {e}")
    xpk_exit(1)

  if manifest_content is None:
    xpk_print("Manifest content not found.")
    xpk_exit(1)

  # Minimum resources the jobset controller manager needs under MTC.
  new_cpu_request = "1000m"
  new_memory_request = "1Gi"
  new_memory_limit = "2Gi"

  update_manifest = False
  # Iterate through all YAML documents to find the Deployment for
  # jobset-controller-manager.
  for yaml_data in yaml.safe_load_all(manifest_content):
    if not (
        yaml_data
        and yaml_data.get("apiVersion") == "apps/v1"
        and yaml_data.get("kind") == "Deployment"
        and yaml_data.get("metadata", {}).get("name")
        == "jobset-controller-manager"
    ):
      continue

    # Found the Deployment, now modify the resources of its 'manager'
    # container.
    for container in yaml_data["spec"]["template"]["spec"]["containers"]:
      if container["name"] != "manager":
        continue
      resources = container["resources"]
      # setdefault ensures the nested dicts exist before we assign into
      # them; the original code raised KeyError when the upstream manifest
      # omitted 'requests' or 'limits'.
      requests_section = resources.setdefault("requests", {})
      limits_section = resources.setdefault("limits", {})

      if parse_resource_value(
          requests_section.get("cpu", "0m")
      ) < parse_resource_value(new_cpu_request):
        requests_section["cpu"] = new_cpu_request
        update_manifest = True
      if parse_resource_value(
          requests_section.get("memory", "0Mi")
      ) < parse_resource_value(new_memory_request):
        requests_section["memory"] = new_memory_request
        update_manifest = True
      if parse_resource_value(
          limits_section.get("memory", "0Mi")
      ) < parse_resource_value(new_memory_limit):
        limits_section["memory"] = new_memory_limit
        update_manifest = True
      break

    if update_manifest:
      xpk_print("Jobset controller updation required.")
      return yaml_data

  xpk_print("Jobset controller no updation required.")
  return None
187
+
188
def parse_resource_value(value) -> int:
  """Convert a Kubernetes resource quantity string to a comparable integer.

  CPU quantities are compared in millicores ("500m" -> 500); memory
  quantities in Mi ("512Mi" -> 512, "2Gi" -> 2048). Bare numbers are
  returned unchanged, so only compare values of the same kind and unit.

  Uses float parsing so fractional quantities Kubernetes accepts
  (e.g. "0.5Gi", "1.5") no longer raise ValueError.

  Args:
    value: resource quantity string, e.g. "1000m", "1Gi", "512Mi", "2".

  Returns:
    Integer magnitude usable for same-unit comparisons.
  """
  if value.endswith("m"):
    return int(float(value[:-1]))
  if value.endswith("Mi"):
    return int(float(value[:-2]))
  if value.endswith("Gi"):
    return int(float(value[:-2]) * 1024)
  return int(float(value))
xpk/core/nap.py CHANGED
@@ -255,6 +255,10 @@ def is_autoprovisioning_enabled(
255
255
  bool is true if autoprovisioning is enabled, false otherwise.
256
256
  int of 0 if successful and 1 otherwise.
257
257
  """
258
+ # Currently autoprovisioning is not enabled for Pathways workloads. b/360898087
259
+ if args.use_pathways:
260
+ return False, 0
261
+
258
262
  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
259
263
  cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
260
264
 
xpk/core/network.py CHANGED
@@ -14,12 +14,10 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
- from ..utils.console import xpk_print
17
+ from ..utils.console import xpk_exit, xpk_print
18
18
  from ..utils.file import write_tmp_file
19
- from .capacity import H100_DEVICE_TYPE
20
19
  from .commands import run_command_for_value, run_command_with_updates
21
20
  from .gcloud_context import zone_to_region
22
- from .system_characteristics import SystemCharacteristics
23
21
 
24
22
  # cluster_network_yaml: the config when creating the network for a3 cluster
25
23
  CLUSTER_NETWORK_YAML = """
@@ -175,16 +173,6 @@ def create_cluster_subnet(args, index) -> int:
175
173
  return 0
176
174
 
177
175
 
178
- def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
179
- return [f'{cluster_name}-gpunet-{i}-subnet' for i in range(8)]
180
-
181
-
182
- def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
183
- return [f'{cluster_name}-sub-1'] + [
184
- f'{cluster_name}-rdma-sub-{i}' for i in range(8)
185
- ]
186
-
187
-
188
176
  def create_cluster_firewall_rule(args, index) -> int:
189
177
  """Create one GKE Cluster firewall rule.
190
178
 
@@ -247,20 +235,40 @@ def create_cluster_network_config(args) -> int:
247
235
  return 0
248
236
 
249
237
 
250
- def set_up_cluster_network_for_gpu(args, system: SystemCharacteristics) -> int:
251
- """Set up GKE Cluster networks, subnets and firewall rules for A3/A3+.
252
- Note: there are 4 NICs for GPU-GPU bw and 1 NIC for host in an A3 node,
253
- and there are 8 NICs for GPU-GPU bw and 1 NIC for host in an A3+ node.
238
+ def get_cluster_subnetworks(args) -> list[str]:
239
+ """Gets the list of cluster networks.
240
+
241
+ Args:
242
+ args: user provided arguments for running the command.
243
+
244
+ Returns:
245
+ list[str]: list of cluster networks
246
+ """
247
+ command = 'kubectl get GKENetworkParamSet'
248
+ return_code, stdout = run_command_for_value(
249
+ command, 'Get Cluster Networks', args
250
+ )
251
+ if return_code != 0:
252
+ xpk_print('GKE Cluster Get NetworkParamSet failed')
253
+ xpk_exit(return_code)
254
+
255
+ networks = [line.split()[0] for line in stdout.splitlines()][1:]
256
+
257
+ return networks
258
+
259
+
260
+ def set_up_cluster_network_for_a3(args) -> int:
261
+ """Set up GKE Cluster networks, subnets and firewall rules for A3.
262
+ Note: there are 4 NICs for GPU-GPU bw and 1 NIC for host in an A3 node.
254
263
 
255
264
  Args:
256
265
  args: user provided arguments for running the command.
257
- system: system characteristics.
258
266
 
259
267
  Returns:
260
268
  0 if successful and 1 otherwise.
261
269
  """
262
- num_networks = 5 if system.device_type == H100_DEVICE_TYPE else 9
263
- for i in range(1, num_networks):
270
+ num_networks = 4
271
+ for i in range(1, num_networks + 1):
264
272
  return_code = create_cluster_network(args, i)
265
273
  if return_code != 0:
266
274
  return 1
@@ -315,7 +323,10 @@ def get_all_networks_programmatic(args) -> tuple[list[str], int]:
315
323
  Returns:
316
324
  List of networks and 0 if successful and 1 otherwise.
317
325
  """
318
- command = 'gcloud compute networks list --format="csv[no-heading](name)"'
326
+ command = (
327
+ 'gcloud compute networks list --format="csv[no-heading](name)" '
328
+ f' --project={args.project}'
329
+ )
319
330
  return_code, raw_network_output = run_command_for_value(
320
331
  command, 'Get All Networks', args
321
332
  )
@@ -365,7 +376,8 @@ def get_all_firewall_rules_programmatic(args) -> tuple[list[str], int]:
365
376
  List of firewall rules and 0 if successful and 1 otherwise.
366
377
  """
367
378
  command = (
368
- 'gcloud compute firewall-rules list --format="csv[no-heading](name)"'
379
+ 'gcloud compute firewall-rules list --format="csv[no-heading](name)" '
380
+ f' --project={args.project}'
369
381
  )
370
382
  return_code, raw_subnets_output = run_command_for_value(
371
383
  command, 'Get All Firewall Rules', args
xpk/core/nodepool.py CHANGED
@@ -37,6 +37,8 @@ CLOUD_PLATFORM_AUTH_SCOPE_URL = (
37
37
  '"https://www.googleapis.com/auth/cloud-platform"'
38
38
  )
39
39
 
40
+ OLDER_PATHWAYS_CPU_NP_TO_DELETE = ['cpu-rm-np', 'cpu-proxy-np', 'cpu-user-np']
41
+
40
42
 
41
43
  def run_gke_node_pool_create_command(
42
44
  args, system, gke_node_pool_version
@@ -122,7 +124,10 @@ def run_gke_node_pool_create_command(
122
124
  args, system, existing_node_pool_names, desired_node_pool_names
123
125
  )
124
126
  for node_pool_name in existing_node_pool_names:
125
- if node_pool_name.find(f'{args.cluster}-np-') != 0:
127
+ if (
128
+ node_pool_name.find(f'{args.cluster}-np-') != 0
129
+ and node_pool_name not in OLDER_PATHWAYS_CPU_NP_TO_DELETE
130
+ ):
126
131
  continue
127
132
 
128
133
  if node_pool_name in node_pools_to_delete:
@@ -283,28 +288,15 @@ def run_gke_node_pool_create_command(
283
288
  command += (
284
289
  ' --accelerator'
285
290
  f' type={system.gke_accelerator},count={str(system.chips_per_vm)},gpu-driver-version=latest'
286
- ' --no-enable-autoupgrade '
287
- f' --scopes={CLOUD_PLATFORM_AUTH_SCOPE_URL} --additional-node-network'
288
- f' network={args.cluster}-net-1,subnetwork={subnet_prefix}-sub-1'
289
- ' --additional-node-network'
290
- f' network={args.cluster}-net-2,subnetwork={subnet_prefix}-sub-2'
291
- ' --additional-node-network'
292
- f' network={args.cluster}-net-3,subnetwork={subnet_prefix}-sub-3'
293
- ' --additional-node-network'
294
- f' network={args.cluster}-net-4,subnetwork={subnet_prefix}-sub-4'
291
+ f' --no-enable-autoupgrade --scopes={CLOUD_PLATFORM_AUTH_SCOPE_URL}'
295
292
  )
296
293
  if device_type == H100_MEGA_DEVICE_TYPE:
297
- command += (
298
- ' --additional-node-network'
299
- f' network={args.cluster}-net-5,subnetwork={subnet_prefix}-sub-5'
300
- ' --additional-node-network'
301
- f' network={args.cluster}-net-6,subnetwork={subnet_prefix}-sub-6'
302
- ' --additional-node-network'
303
- f' network={args.cluster}-net-7,subnetwork={subnet_prefix}-sub-7'
304
- ' --additional-node-network'
305
- f' network={args.cluster}-net-8,subnetwork={subnet_prefix}-sub-8'
306
- ' --max-pods-per-node=32'
307
- )
294
+ for i in range(1, 9):
295
+ command += (
296
+ ' --additional-node-network'
297
+ f' network={args.cluster}-net-{i},subnetwork={subnet_prefix}-sub-{i}'
298
+ )
299
+ command += ' --max-pods-per-node=32'
308
300
  elif system.accelerator_type == AcceleratorType['CPU']:
309
301
  command += f' --num-nodes={system.vms_per_slice}'
310
302
  command += (
@@ -318,7 +310,7 @@ def run_gke_node_pool_create_command(
318
310
  create_commands.append(command)
319
311
  create_task_names.append(task)
320
312
 
321
- desired_pw_cpu_node_pools = ['cpu-user-np', 'cpu-rm-np', 'cpu-proxy-np']
313
+ desired_pw_cpu_node_pools = ['cpu-np']
322
314
  if args.enable_pathways:
323
315
  # Pathways needs CPU nodepools in addition to TPU nodepools
324
316
  for node_pool_name in desired_pw_cpu_node_pools:
@@ -368,11 +360,9 @@ def get_node_pools_to_delete(
368
360
  check_resource, is_requested_resource_in_cluster = check_cluster_resources(
369
361
  args, system
370
362
  )
371
- for existing_node_pool_name in existing_node_pool_names:
372
- # Deletion logic would leave behind any Pathways CPU nodepools.
373
- if existing_node_pool_name.find(f'{args.cluster}-np-') != 0:
374
- continue
363
+ xpk_print('Existing node pool names ', existing_node_pool_names)
375
364
 
365
+ for existing_node_pool_name in existing_node_pool_names:
376
366
  # Nodepools will be deleted in two scenarios:
377
367
  # Scenario 1: Cluster exists with 3 nodepools of 'x' device_type/gke_accelerator and now we are updating
378
368
  # the cluster to 2 nodepools of 'x' device_type/gke_accelerator. In this case, we will delete
@@ -380,6 +370,18 @@ def get_node_pools_to_delete(
380
370
  # Scenario 2: Cluster exists with 2 nodepools of 'x' device_type/gke_accelerator and now we are updating
381
371
  # the cluster to 2 nodepools of 'y' device_type/gke_accelerator. In this case, we will delete
382
372
  # '{args.cluster}-np-0' and '{args.cluster}-np-1' from the cluster.
373
+ # Scenario 3: Deletes older Pathways CPU nodepools named cpu-rm-np, cpu-proxy-np and cpu-user-np
374
+
375
+ if existing_node_pool_name in OLDER_PATHWAYS_CPU_NP_TO_DELETE:
376
+ node_pools_to_delete.append(existing_node_pool_name)
377
+ xpk_print(
378
+ 'Upgrading Pathways version on the cluster. Deleting older pathways'
379
+ ' nodepool ',
380
+ existing_node_pool_name,
381
+ )
382
+
383
+ if existing_node_pool_name.find(f'{args.cluster}-np-') != 0:
384
+ continue
383
385
  if existing_node_pool_name not in desired_node_pool_names or (
384
386
  check_resource and not is_requested_resource_in_cluster
385
387
  ):