xpk 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xpk/core/network.py CHANGED
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
- from ..utils.console import xpk_print
17
+ from ..utils.console import xpk_exit, xpk_print
18
18
  from ..utils.file import write_tmp_file
19
19
  from .commands import run_command_for_value, run_command_with_updates
20
20
  from .gcloud_context import zone_to_region
@@ -235,6 +235,28 @@ def create_cluster_network_config(args) -> int:
235
235
  return 0
236
236
 
237
237
 
238
def get_cluster_subnetworks(args) -> list[str]:
  """Gets the list of cluster networks.

  Runs `kubectl get GKENetworkParamSet` and returns the NAME column of the
  output, skipping the kubectl header row. Exits xpk when kubectl fails.

  Args:
    args: user provided arguments for running the command.

  Returns:
    list[str]: list of cluster networks
  """
  command = 'kubectl get GKENetworkParamSet'
  return_code, stdout = run_command_for_value(
      command, 'Get Cluster Networks', args
  )
  if return_code != 0:
    xpk_print('GKE Cluster Get NetworkParamSet failed')
    xpk_exit(return_code)

  # Drop the header row before splitting, and ignore blank lines so a
  # trailing newline in kubectl's output cannot raise IndexError.
  networks = [
      line.split()[0] for line in stdout.splitlines()[1:] if line.strip()
  ]

  return networks
258
+
259
+
238
260
  def set_up_cluster_network_for_a3(args) -> int:
239
261
  """Set up GKE Cluster networks, subnets and firewall rules for A3.
240
262
  Note: there are 4 NICs for GPU-GPU bw and 1 NIC for host in an A3 node.
xpk/core/pathways.py CHANGED
@@ -211,7 +211,7 @@ def append_custom_pathways_worker(args) -> str:
211
211
  """
212
212
  yaml = """"""
213
213
  if args.server_image or args.custom_pathways_worker_args:
214
- yaml = """- componentType: pathways_worker"""
214
+ yaml = """- componentType: worker"""
215
215
  indentation = (
216
216
  ' ' * 8
217
217
  ) # Currently 8, based on the YAML, may need to update in the future.
xpk/core/resources.py CHANGED
@@ -236,3 +236,24 @@ def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
236
236
  return system
237
237
 
238
238
  return None
239
+
240
+
241
def get_cluster_capacity_type(args) -> CapacityType | None:
  """Get the capacity type recorded in the cluster metadata configMap.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The cluster's CapacityType, or None when the metadata configMap is
    missing or has no 'capacity_type' entry.
  """
  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
  cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)

  if cluster_config_map is None:
    return None

  capacity_value = cluster_config_map.get('capacity_type')
  if capacity_value is not None:
    # NOTE(review): raises KeyError if the stored value is not a CapacityType
    # member name — assumed impossible for configMaps written by xpk; confirm.
    return CapacityType[capacity_value.upper()]

  return None
xpk/core/workload.py CHANGED
@@ -131,7 +131,7 @@ def get_workload_list(args) -> tuple[int, str]:
131
131
  )
132
132
  workload_list_filter_job_cmd = determine_workload_list_filter_by_job(args)
133
133
  command = (
134
- f'kubectl get workloads -o=custom-columns="{s}" '
134
+ f'kubectl get workloads --ignore-not-found -o=custom-columns="{s}" '
135
135
  f'{workload_list_filter_status_cmd} {workload_list_filter_job_cmd}'
136
136
  )
137
137
 
@@ -68,16 +68,12 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
68
68
 
69
69
 
70
70
  def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
71
- interfaces = [
72
- '[',
73
- ' {"interfaceName":"eth0","network":"default"},',
74
- *[
75
- f' {{"interfaceName":"eth{i + 1}","network":"{sub_networks[i]}"}}{"," if i<8 else ""}'
76
- for i in range(9)
77
- ],
78
- ']',
79
- ]
80
- return 'networking.gke.io/interfaces', literal_string('\n'.join(interfaces))
71
+ entries = ',\n'.join([
72
+ f' {{"interfaceName":"eth{i}","network":"{network}"}}'
73
+ for i, network in enumerate(sub_networks)
74
+ ])
75
+ interfaces = f'[\n{entries}\n]'
76
+ return 'networking.gke.io/interfaces', literal_string(interfaces)
81
77
 
82
78
 
83
79
  def add_annotations(job_manifest: dict, sub_networks: list[str]):
@@ -0,0 +1,179 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import yaml
18
+
19
+ from ...utils.yaml import literal_string
20
+
21
# Component version: image tag of the GPUDirect-TCPX receive-data-path
# manager (tcpgpudmarxd) pulled by add_tcpxo_daemon_container below.
tcpx = 'v2.0.11'
23
+
24
+
25
def decorate_kjob_template(job_manifest: dict) -> dict:
  """Decorates a kjob Job template with the tcpx-daemon components.

  Applies, in order: host volumes, scheduling tolerations, the tcpx-daemon
  init container, and env/volumeMount updates on GPU containers. Mutates
  job_manifest in place and returns it.
  """
  for decorate in (
      add_volumes,
      add_tolerations,
      add_tcpxo_daemon_container,
      update_gpu_containers,
  ):
    decorate(job_manifest)
  return job_manifest
31
+
32
+
33
def decorate_job(job_manifest: dict) -> dict:
  """Decorates a Job manifest with the tcpx-daemon components.

  Same as decorate_kjob_template but additionally sets the Pod networking
  annotations. Mutates job_manifest in place and returns it.
  """
  for decorate in (
      add_annotations,
      add_volumes,
      add_tolerations,
      add_tcpxo_daemon_container,
      update_gpu_containers,
  ):
    decorate(job_manifest)
  return job_manifest
40
+
41
+
42
def decorate_jobset(jobset_manifest_str: str) -> str:
  """
  Decorates a JobSet manifest with the necessary components for tcpx-daemon.

  Args:
    jobset_manifest_str: The JobSet manifest as a YAML string.

  Returns:
    The modified JobSet manifest as a YAML string.
  """

  manifest = yaml.safe_load(jobset_manifest_str)

  # decorate_job mutates the job template in place, so there is no need to
  # re-bind the result (the original assigned it to a dead local).
  for job in manifest['spec']['replicatedJobs']:
    decorate_job(job['template'])
  return yaml.dump(manifest, sort_keys=False)
59
+
60
+
61
def get_interfaces_annotation() -> dict:
  """Returns the multi-NIC interfaces annotation for the Pod template.

  eth0 is bound to the default network; eth1-eth4 to the four GPU VPCs.
  """
  networks = ['default', 'vpc1', 'vpc2', 'vpc3', 'vpc4']
  entries = ',\n'.join(
      ' {"interfaceName":"eth%d","network":"%s"}' % (index, network)
      for index, network in enumerate(networks)
  )
  value = literal_string('[\n' + entries + '\n]')
  return {'networking.gke.io/interfaces': value}
72
+
73
+
74
def get_tcpx_deamon_annotation() -> dict:
  """Returns the annotation exposing the NVIDIA devices to tcpx-daemon."""
  devices = ['/dev/nvidia%d' % i for i in range(8)]
  devices += ['/dev/nvidiactl', '/dev/nvidia-uvm']
  paths = ''.join('- path: %s\n' % device for device in devices)
  return {'devices.gke.io/container.tcpx-daemon': literal_string(paths)}
89
+
90
+
91
def add_annotations(job_manifest: dict):
  """Adds or updates annotations in the Pod template."""
  metadata = (
      job_manifest.setdefault('spec', {})
      .setdefault('template', {})
      .setdefault('metadata', {})
  )
  annotations: dict = metadata.setdefault('annotations', {})
  # Insertion order mirrors the original: device list, default interface,
  # then the multi-NIC interface map.
  annotations.update(get_tcpx_deamon_annotation())
  annotations['networking.gke.io/default-interface'] = 'eth0'
  annotations.update(get_interfaces_annotation())
102
+
103
+
104
def add_tolerations(job_manifest: dict):
  """Adds the user-workload toleration to the Pod spec."""
  pod_spec = (
      job_manifest.setdefault('spec', {})
      .setdefault('template', {})
      .setdefault('spec', {})
  )
  toleration = {
      'key': 'user-workload',
      'operator': 'Equal',
      'value': 'true',
      'effect': 'NoSchedule',
  }
  pod_spec.setdefault('tolerations', []).append(toleration)
118
+
119
+
120
def add_volumes(job_manifest: dict):
  """Adds the volumes required by tcpx-daemon to the Pod spec.

  Also declares the 'tcpx-socket' volume that both add_tcpxo_daemon_container
  and update_gpu_containers mount; the original omitted it, leaving the Pod
  spec invalid (volumeMounts referencing an undeclared volume). An emptyDir
  is used, matching the GKE GPUDirect-TCPX reference manifest.
  """
  volumes: list = (
      job_manifest.setdefault('spec', {})
      .setdefault('template', {})
      .setdefault('spec', {})
      .setdefault('volumes', [])
  )
  volumes.append({
      'name': 'libraries',
      'hostPath': {'path': '/home/kubernetes/bin/nvidia/lib64'},
  })
  # Shared Unix-domain-socket directory between tcpx-daemon (/run/tcpx)
  # and the GPU containers (/tmp).
  volumes.append({'name': 'tcpx-socket', 'emptyDir': {}})
  volumes.append({'name': 'sys', 'hostPath': {'path': '/sys'}})
  volumes.append({'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}})
134
+
135
+
136
def add_tcpxo_daemon_container(job_manifest):
  """Adds the tcpx-daemon container to the Pod spec.

  The daemon is appended to initContainers with restartPolicy 'Always',
  i.e. it runs as a sidecar alongside the workload containers.
  """
  image = (
      'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:'
      f'{tcpx}'
  )
  command = [
      '/tcpgpudmarxd/build/app/tcpgpudmarxd',
      '--gpu_nic_preset',
      'a3vm',
      '--gpu_shmem_type',
      'fd',
      '--uds_path',
      '/run/tcpx',
      '--setup_param',
      '"--verbose 128 2 0 "',
  ]
  mounts = [
      {'name': 'libraries', 'mountPath': '/usr/local/nvidia/lib64'},
      {'name': 'tcpx-socket', 'mountPath': '/run/tcpx'},
      {'name': 'sys', 'mountPath': '/hostsysfs'},
      {'name': 'proc-sys', 'mountPath': '/hostprocsysfs'},
  ]
  daemon = {
      'name': 'tcpx-daemon',
      'image': image,
      'imagePullPolicy': 'Always',
      'restartPolicy': 'Always',
      'command': command,
      'securityContext': {'capabilities': {'add': ['NET_ADMIN']}},
      'volumeMounts': mounts,
      'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
  }
  pod_spec = job_manifest['spec']['template']['spec']
  pod_spec.setdefault('initContainers', []).append(daemon)
166
+
167
+
168
def update_gpu_containers(job_manifest):
  """Wires every GPU-requesting container up to tcpx-daemon.

  Containers whose resource limits request 'nvidia.com/gpu' get the NVIDIA
  library path in LD_LIBRARY_PATH plus the tcpx socket and library mounts.
  """
  ld_env = {'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}
  socket_mount = {'name': 'tcpx-socket', 'mountPath': '/tmp'}
  lib_mount = {'name': 'libraries', 'mountPath': '/usr/local/nvidia/lib64'}
  for container in job_manifest['spec']['template']['spec']['containers']:
    limits = container.get('resources', {}).get('limits', {})
    if 'nvidia.com/gpu' not in limits:
      continue
    container.setdefault('env', []).append(dict(ld_env))
    mounts: list = container.setdefault('volumeMounts', [])
    mounts.append(dict(socket_mount))
    mounts.append(dict(lib_mount))
@@ -77,16 +77,12 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
77
77
 
78
78
 
79
79
  def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
80
- interfaces = [
81
- '[',
82
- ' {"interfaceName":"eth0","network":"default"},',
83
- *[
84
- f' {{"interfaceName":"eth{i + 1}","network":"{sub_networks[i]}"}}{"," if i<7 else ""}'
85
- for i in range(8)
86
- ],
87
- ']',
88
- ]
89
- return 'networking.gke.io/interfaces', literal_string('\n'.join(interfaces))
80
+ entries = ',\n'.join([
81
+ f' {{"interfaceName":"eth{i}","network":"{network}"}}'
82
+ for i, network in enumerate(sub_networks)
83
+ ])
84
+ interfaces = f'[\n{entries}\n]'
85
+ return 'networking.gke.io/interfaces', literal_string(interfaces)
90
86
 
91
87
 
92
88
  def get_tcpxo_deamon_entry() -> tuple[str, str]:
@@ -107,7 +103,11 @@ def get_tcpxo_deamon_entry() -> tuple[str, str]:
107
103
 
108
104
  def add_annotations(job_manifest: dict, sub_networks: list[str]):
109
105
  """Adds or updates annotations in the Pod template."""
110
- annotations = job_manifest['spec']['template']['metadata']['annotations']
106
+ metadata = job_manifest['spec']['template']['metadata']
107
+ annotations = metadata.get('annotations')
108
+ if annotations is None:
109
+ annotations = {}
110
+ metadata['annotations'] = annotations
111
111
  tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry()
112
112
  interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
113
113
  annotations.update({
@@ -149,6 +149,7 @@ def add_tcpxo_daemon_container(job_manifest):
149
149
  'name': 'tcpxo-daemon',
150
150
  'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:{rxdm}',
151
151
  'imagePullPolicy': 'Always',
152
+ 'restartPolicy': 'Always',
152
153
  'command': ['/bin/sh', '-c'],
153
154
  'args': [
154
155
  'set -ex\nchmod 755'
@@ -165,9 +166,9 @@ def add_tcpxo_daemon_container(job_manifest):
165
166
  ],
166
167
  'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
167
168
  }
168
- job_manifest['spec']['template']['spec']['containers'].append(
169
- tcpxo_daemon_container
170
- )
169
+ spec = job_manifest['spec']['template']['spec']
170
+ spec.setdefault('initContainers', [])
171
+ spec['initContainers'].append(tcpxo_daemon_container)
171
172
 
172
173
 
173
174
  def update_gpu_containers(job_manifest):