xpk 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. xpk/__init__.py +15 -0
  2. xpk/commands/__init__.py +15 -0
  3. xpk/commands/batch.py +109 -0
  4. xpk/commands/cluster.py +784 -0
  5. xpk/commands/cluster_gcluster.py +185 -0
  6. xpk/commands/info.py +245 -0
  7. xpk/commands/inspector.py +363 -0
  8. xpk/commands/job.py +197 -0
  9. xpk/commands/kind.py +253 -0
  10. xpk/commands/shell.py +120 -0
  11. xpk/commands/version.py +39 -0
  12. xpk/commands/workload.py +692 -0
  13. xpk/core/__init__.py +15 -0
  14. xpk/core/blueprint/__init__.py +15 -0
  15. xpk/core/blueprint/blueprint_definitions.py +61 -0
  16. xpk/core/blueprint/blueprint_generator.py +652 -0
  17. xpk/core/cluster_private.py +197 -0
  18. xpk/core/commands.py +352 -0
  19. xpk/core/core.py +2824 -0
  20. xpk/core/docker_manager.py +308 -0
  21. xpk/core/gcluster_manager.py +158 -0
  22. xpk/core/kjob.py +205 -0
  23. xpk/core/kueue.py +352 -0
  24. xpk/core/nap.py +349 -0
  25. xpk/core/pathways.py +298 -0
  26. xpk/core/ray.py +222 -0
  27. xpk/core/system_characteristics.py +1395 -0
  28. xpk/core/workload.py +133 -0
  29. xpk/core/workload_decorators/__init__.py +15 -0
  30. xpk/core/workload_decorators/rdma_decorator.py +109 -0
  31. xpk/core/workload_decorators/tcpxo_decorator.py +157 -0
  32. xpk/main.py +73 -0
  33. xpk/parser/__init__.py +15 -0
  34. xpk/parser/batch.py +184 -0
  35. xpk/parser/cluster.py +621 -0
  36. xpk/parser/common.py +71 -0
  37. xpk/parser/core.py +109 -0
  38. xpk/parser/info.py +63 -0
  39. xpk/parser/inspector.py +65 -0
  40. xpk/parser/job.py +126 -0
  41. xpk/parser/kind.py +94 -0
  42. xpk/parser/shell.py +50 -0
  43. xpk/parser/validators.py +39 -0
  44. xpk/parser/version.py +23 -0
  45. xpk/parser/workload.py +684 -0
  46. xpk/utils/__init__.py +15 -0
  47. xpk/utils/console.py +55 -0
  48. xpk/utils/file.py +82 -0
  49. xpk/utils/network.py +168 -0
  50. xpk/utils/objects.py +85 -0
  51. xpk/utils/yaml.py +30 -0
  52. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/METADATA +301 -28
  53. xpk-0.6.0.dist-info/RECORD +57 -0
  54. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/WHEEL +1 -1
  55. xpk-0.6.0.dist-info/entry_points.txt +2 -0
  56. xpk-0.5.0.dist-info/RECORD +0 -7
  57. xpk-0.5.0.dist-info/entry_points.txt +0 -2
  58. xpk.py +0 -7282
  59. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/LICENSE +0 -0
  60. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/top_level.txt +0 -0
xpk/core/pathways.py ADDED
@@ -0,0 +1,298 @@
+ """
+ Copyright 2024 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ from ..utils.console import xpk_exit, xpk_print
+ from .core import (
+     AcceleratorType,
+     get_all_nodepools_programmatic,
+     get_user_workload_container,
+     zone_to_region,
+ )
+ from .system_characteristics import SystemCharacteristics
+
+ PathwaysExpectedInstancesMap = {
+     'v6e': 'tpuv6e',
+     'v5p': 'tpuv5',
+     'v5litepod': 'tpuv5e',
+     'v4': 'tpuv4',
+     'v3': 'tpuv3',
+ }
+
+
+ def get_pathways_worker_args(args) -> str:
+   """Arguments for the Pathways workers.
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     str: yaml containing arguments for the Pathways workers.
+   """
+   yaml = """- --server_port=29001
+ - --resource_manager_address={rm_address}
+ - --gcs_scratch_location={args.pathways_gcs_location}"""
+   if args.use_pathways:
+     return yaml.format(args=args, rm_address=get_rm_address(args))
+   else:
+     return ''
+
+
+ def get_pathways_proxy_args(args) -> str:
+   """Arguments for the Pathways proxy.
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     str: yaml containing arguments for the Pathways proxy.
+   """
+   yaml = """- --server_port=29000
+ - --resource_manager_address={rm_address}
+ - --gcs_scratch_location={args.pathways_gcs_location}"""
+
+   if args.use_pathways:
+     return yaml.format(args=args, rm_address=get_rm_address(args))
+   else:
+     return ''
+
+
+ def add_pw_resource_flavors(args):
+   """Add resource flavors required for Pathways enabled clusters."""
+   resource_flavor_yaml = """apiVersion: kueue.x-k8s.io/v1beta1
+ kind: ResourceFlavor
+ metadata:
+   name: cpu-rm
+ spec:
+   nodeLabels:
+     cloud.google.com/gke-nodepool: cpu-rm-np
+ ---
+ apiVersion: kueue.x-k8s.io/v1beta1
+ kind: ResourceFlavor
+ metadata:
+   name: cpu-proxy
+ spec:
+   nodeLabels:
+     cloud.google.com/gke-nodepool: cpu-proxy-np
+ ---
+ apiVersion: kueue.x-k8s.io/v1beta1
+ kind: ResourceFlavor
+ metadata:
+   name: cpu-user
+ spec:
+   nodeLabels:
+     cloud.google.com/gke-nodepool: cpu-user-np
+ ---"""
+   if args.enable_pathways:
+     return resource_flavor_yaml
+   return ''
+
+
+ def add_pw_resources_to_kueue(args):
+   """Add resource flavors required for Pathways, to the cluster queue."""
+   resources_yaml = """- coveredResources: ["cpu", "memory"]
+   flavors:
+   - name: cpu-rm
+     resources:
+     - name: "cpu"
+       nominalQuota: 80
+     - name: "memory"
+       nominalQuota: 160G
+   - name: cpu-proxy
+     resources:
+     - name: "cpu"
+       nominalQuota: 480
+     - name: "memory"
+       nominalQuota: 2000G
+   - name: cpu-user
+     resources:
+     - name: "cpu"
+       nominalQuota: 480
+     - name: "memory"
+       nominalQuota: 2000G"""
+   if args.enable_pathways:
+     return resources_yaml
+   return ''
+
+
+ def ensure_pathways_workload_prerequisites(args, system) -> bool:
+   """Check all Pathways workload prerequisites and set necessary args.
+
+   Args:
+     args: user provided arguments for running the command.
+     system: system characteristics.
+
+   Returns:
+     True once conditions satisfy and variables are set. Exits otherwise.
+   """
+   # Ensure command is provided if not using Pathways in headless mode
+   if args.command is None and not args.headless:
+     xpk_print(
+         'Please provide a command using "--command" for the docker container to'
+         ' execute. Command is not required if you wish to run Pathways'
+         ' workloads in headless mode (`xpk workload create-pathways'
+         ' --headless`).'
+     )
+     xpk_exit(1)
+
+   # Ensure the cluster and CPU nodepools were created with create-pathways
+   all_node_pools = get_all_nodepools_programmatic(args)
+   desired_pw_cpu_node_pools = {'cpu-user-np', 'cpu-rm-np', 'cpu-proxy-np'}
+   if not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0])):
+     xpk_print(
+         'Cluster needs to be created with `xpk create-pathways` to run'
+         ' Pathways workloads.'
+     )
+     xpk_exit(1)
+
+   # Ensure device type is TPUs - currently Pathways supports TPUs only.
+   if system.accelerator_type != AcceleratorType['TPU']:
+     xpk_print('Currently, Pathways workloads can only be run on TPUs.')
+     xpk_exit(1)
+
+   # Set proxy address to be consumed in helper methods and displayed to user.
+   args.pathways_proxy_address = get_proxy_address(args)
+
+   # Set the job which determines the life of other Pathways jobs
+   args.targetReplicatedJob = 'proxy' if args.headless else 'main'
+
+   # Always report user code failures back to JobSet.
+   args.restart_on_user_code_failure = True
+
+   return True
+
+
+ def get_pathways_unified_query_link(args) -> str:
+   """Get the unified query link for the pathways workload."""
+   pw_suffixes = ['main', 'rm', 'proxy']
+   pw_pod_names = [f'"{args.workload}-{suffix}-0"' for suffix in pw_suffixes]
+   pw_pod_names_query = '%20OR%20'.join(pw_pod_names + ['worker-0-0'])
+   query_params = (
+       'resource.type%3D"k8s_container"%0A'
+       f'resource.labels.project_id%3D"{args.project}"%0A'
+       f'resource.labels.location%3D"{zone_to_region(args.zone)}"%0A'
+       f'resource.labels.cluster_name%3D"{args.cluster}"%0A'
+       f'resource.labels.pod_name:{pw_pod_names_query}%0A'
+       'severity>%3DDEFAULT'
+   )
+
+   return f'https://console.cloud.google.com/logs/query;query={query_params}'
+
+
+ def get_pathways_rm_args(args, system: SystemCharacteristics) -> str:
+   """Arguments for the Pathways resource manager.
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     str: yaml containing arguments for the Pathways resource manager.
+   """
+   yaml = """- --server_port=29001
+ - --gcs_scratch_location={args.pathways_gcs_location}
+ - --node_type=resource_manager
+ - --instance_count={instance_count}
+ - --instance_type={instance_type}"""
+   if args.use_pathways:
+     return yaml.format(
+         args=args,
+         instance_count=args.num_slices,
+         instance_type=f'{get_pathways_expected_tpu_type(system.device_type)}:{system.topology}',
+     )
+   else:
+     return ''
+
+
+ def get_user_workload_for_pathways(args, system: SystemCharacteristics) -> str:
+   """
+   Create a user workload container for Pathways.
+   Don't create one for Pathways headless mode.
+
+   Args:
+     args: user provided args.
+     system: system characteristics.
+
+
+   Returns:
+     str:
+       The user workload job as a YAML string ('' in headless mode).
+   """
+   user_workload_yaml = """- name: main
+   replicas: 1
+   template:
+     metadata:
+       labels:
+         xpk.google.com/workload: {args.workload}
+     spec:
+       backoffLimit: 0
+       completions: 1
+       parallelism: 1
+       template:
+         spec:
+           containers:
+           {container}
+           nodeSelector:
+             cloud.google.com/gke-nodepool: cpu-user-np
+           restartPolicy: OnFailure
+           volumes:
+           - hostPath:
+               path: /tmp
+               type: DirectoryOrCreate
+             name: shared-tmp"""
+   if args.headless:
+     return ''
+   else:
+     container, _ = get_user_workload_container(args, system)
+     return user_workload_yaml.format(args=args, container=container)
+
+
+ def get_rm_address(args) -> str:
+   """Generates the Pathways resource manager address.
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     str: Fully qualified RM address.
+   """
+   rm_address = f'{args.workload}-rm-0-0.{args.workload}:29001'
+   return rm_address
+
+
+ def get_proxy_address(args) -> str:
+   """Generates the Pathways proxy address.
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     str: Fully qualified proxy address.
+   """
+   proxy_address = f'grpc://{args.workload}-proxy-0-0.{args.workload}:29000'
+   return proxy_address
+
+
+ def get_pathways_expected_tpu_type(device_type: str) -> str:
+   """Returns the device type expected by Pathways
+   Args:
+     device_type: the system characteristic device type
+
+   Returns:
+     str: the device type expected by pathways.
+   """
+   raw_type = device_type.split('-')[0].lower()
+   pathways_expected_instance = PathwaysExpectedInstancesMap.get(raw_type)
+   if not pathways_expected_instance:
+     xpk_print(
+         f'Passed in device_type {device_type} is incorrect. Please pass in a'
+         ' valid device type'
+     )
+     xpk_exit(1)
+   return pathways_expected_instance
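For orientation, the naming helpers above are plain string formatting. A minimal sketch of how they compose, assuming xpk 0.6.0 is installed and using a types.SimpleNamespace as a hypothetical stand-in for the argparse Namespace the real commands pass in:

from types import SimpleNamespace

from xpk.core.pathways import (
    get_pathways_expected_tpu_type,
    get_proxy_address,
    get_rm_address,
)

# Hypothetical stand-in for the parsed CLI args; only the fields used here are set.
args = SimpleNamespace(workload='demo-run')

# Resource manager and proxy are reached through per-workload headless service names.
print(get_rm_address(args))     # demo-run-rm-0-0.demo-run:29001
print(get_proxy_address(args))  # grpc://demo-run-proxy-0-0.demo-run:29000

# Device types map to the instance prefix Pathways expects, e.g. 'v5litepod-16' -> 'tpuv5e'.
print(get_pathways_expected_tpu_type('v5litepod-16'))  # tpuv5e

get_pathways_rm_args then combines that prefix with system.topology to form the instance type (for example tpuv5e:4x4 for a v5litepod-16 slice).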
xpk/core/ray.py ADDED
@@ -0,0 +1,222 @@
+ """
+ Copyright 2024 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ import re
+ from ..utils.console import xpk_exit, xpk_print
+ from ..utils.file import write_tmp_file
+ from .commands import run_command_for_value, run_command_with_updates_retry
+
+
+ HEAD_CPU = 0.5
+ WORKER_CPU = 0.9
+ GCS_SERVER = 6379
+ DASHBOARD = 8265
+ CLIENT = 10001
+ MULTISLICE = 8081
+
+ ray_cluster_crd_yaml = """apiVersion: v1
+ kind: Namespace
+ metadata:
+   name: ray
+ ---
+ apiVersion: ray.io/v1
+ kind: RayCluster
+ metadata:
+   name: raycluster
+   namespace: ray
+ spec:
+   rayVersion: '{version}'
+   headGroupSpec:
+     rayStartParams: {{}}
+     #pod template
+     template:
+       spec:
+         containers:
+         - name: ray-head
+           image: rayproject/ray:{version}
+           resources:
+             limits:
+               cpu: {head_cpu}
+               memory: {head_mem}
+             requests:
+               cpu: {head_cpu}
+               memory: {head_mem}
+           ports:
+           - containerPort: {gcs_server}
+             name: gcs-server
+           - containerPort: {dashboard} # Ray dashboard
+             name: dashboard
+           - containerPort: {client}
+             name: client
+           - containerPort: {multislice}
+             name: multislice
+   workerGroupSpecs:
+   - replicas: {replicas} # TODO: Set min and max replicas
+     numOfHosts: {num_hosts}
+     minReplicas: {replicas}
+     maxReplicas: {replicas}
+     groupName: workergroup0
+     rayStartParams:
+       block: 'true'
+     template:
+       spec:
+         containers:
+         - name: ray-worker
+           image: rayproject/ray:{version}
+           resources:
+             limits:
+               cpu: {worker_cpu}
+               google.com/tpu: {chips_per_vm}
+               memory: {worker_mem}
+             requests:
+               cpu: {worker_cpu}
+               google.com/tpu: {chips_per_vm}
+               memory: {worker_mem}
+         nodeSelector:
+           cloud.google.com/gke-tpu-accelerator: {accelerator}
+           cloud.google.com/gke-tpu-topology: {topology}
+ """
+
+
+ def install_ray_cluster(args, system) -> int:
+   """Install a RayCluster on the cluster
+
+   Args:
+     args: user provided arguments for running the command.
+     system: system characteristics.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+
+   delete_ray_cluster(args)
+
+   label = 'cloud.google.com/gke-nodepool=default-pool'
+   available_head_cpu, available_head_mem = generate_available_resources(
+       label, args, HEAD_CPU
+   )
+
+   label = f'cloud.google.com/gke-tpu-accelerator={system.gke_accelerator}'
+   available_worker_cpu, available_worker_mem = generate_available_resources(
+       label, args, WORKER_CPU
+   )
+
+   yml_string = ray_cluster_crd_yaml.format(
+       accelerator=system.gke_accelerator,
+       topology=system.topology,
+       chips_per_vm=system.chips_per_vm,
+       num_hosts=system.vms_per_slice,
+       replicas=args.num_slices,
+       version=args.ray_version,
+       worker_cpu=available_worker_cpu,
+       worker_mem=available_worker_mem,
+       head_cpu=available_head_cpu,
+       head_mem=available_head_mem,
+       gcs_server=GCS_SERVER,
+       dashboard=DASHBOARD,
+       client=CLIENT,
+       multislice=MULTISLICE,
+   )
+
+   tmp = write_tmp_file(yml_string)
+   command = f'kubectl apply -f {str(tmp.file.name)}'
+   task = 'Applying RayCluster'
+   retry_attempts = 1
+   return_code = run_command_with_updates_retry(
+       command, task, args, num_retry_attempts=retry_attempts
+   )
+   if return_code != 0:
+     xpk_print(f'{task} not successful.')
+     xpk_exit(return_code)
+   return return_code
+
+
+ def delete_ray_cluster(args) -> None:
+   """Delete all RayClusters on the cluster
+
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     None
+   """
+
+   command = 'kubectl delete rayclusters -n ray --all'
+   task = 'Deleting old RayCluster'
+   retry_attempts = 1
+   return_code = run_command_with_updates_retry(
+       command, task, args, num_retry_attempts=retry_attempts
+   )
+
+   if return_code != 0:
+     xpk_print(f'{task} not successful.')
+     xpk_exit(return_code)
+
+   return
+
+
+ def generate_available_resources(label, args, percent) -> tuple:
+   """Generate the available resources for the nodes that match the given label
+
+   Args:
+     label: the label used to match the appropriate nodes
+     args: user provided arguments for running the command
+     percent: the percent of the available resources to use
+
+   Returns:
+     A tuple with the available cpu and memory
+   """
+
+   command = (
+       f"kubectl get nodes -l {label} -o jsonpath='{{.items[0].metadata.name}}'"
+   )
+   task = f'Getting nodes with label {label}'
+   _, node_name = run_command_for_value(command, task, args)
+
+   command = (
+       f"kubectl get node {node_name} -o jsonpath='{{.status.allocatable.cpu}}'"
+   )
+   task = 'Fetching available CPU on node'
+   _, available_cpu = run_command_for_value(command, task, args)
+   match = re.match(r'(\d+)([a-zA-Z]+)', available_cpu)
+   if not match:
+     xpk_print(
+         'Could not find a regex match for allocatable cpu on TPU node'
+         f' {node_name}'
+     )
+     xpk_exit(1)
+   value, units = match.group(1), match.group(2)
+   cpu_value = int(int(value) * percent)
+   adjusted_available_cpu = str(cpu_value) + units
+
+   command = (
+       f'kubectl get node {node_name} -o'
+       " jsonpath='{.status.allocatable.memory}'"
+   )
+   task = 'Fetching available memory on node'
+   _, available_memory = run_command_for_value(command, task, args)
+   match = re.match(r'(\d+)([a-zA-Z]+)', available_memory)
+   if not match:
+     xpk_print(
+         'Could not find a regex match for allocatable memory on TPU node'
+         f' {node_name}'
+     )
+     xpk_exit(1)
+   value, units = match.group(1), match.group(2)
+   memory_value = int(int(value) * percent)
+   adjusted_available_memory = str(memory_value) + units
+
+   return adjusted_available_cpu, adjusted_available_memory
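The sizing step in generate_available_resources reduces to splitting kubectl's allocatable quantity into a number and a unit and scaling the number by the HEAD_CPU or WORKER_CPU fraction. A self-contained sketch of that arithmetic; the quantities are invented for illustration, and the sketch raises ValueError where the original calls xpk_exit:

import re

WORKER_CPU = 0.9  # fraction of allocatable resources handed to Ray workers, as above


def scale_allocatable(quantity: str, percent: float) -> str:
  """Scale a Kubernetes quantity like '7910m' or '186Gi' by `percent`, keeping its unit."""
  match = re.match(r'(\d+)([a-zA-Z]+)', quantity)
  if not match:
    raise ValueError(f'unexpected quantity format: {quantity}')
  value, units = match.group(1), match.group(2)
  # Integer math, exactly as the module does: truncate toward zero after scaling.
  return str(int(int(value) * percent)) + units


# Example allocatable values (illustrative, not read from a real node).
print(scale_allocatable('7910m', WORKER_CPU))  # 7119m
print(scale_allocatable('186Gi', WORKER_CPU))  # 167Gi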