xpk 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. xpk/__init__.py +15 -0
  2. xpk/api/__init__.py +15 -0
  3. xpk/api/storage_crd.yaml +52 -0
  4. xpk/commands/__init__.py +15 -0
  5. xpk/commands/batch.py +131 -0
  6. xpk/commands/cluster.py +808 -0
  7. xpk/commands/cluster_gcluster.py +269 -0
  8. xpk/commands/common.py +44 -0
  9. xpk/commands/config.py +29 -0
  10. xpk/commands/info.py +243 -0
  11. xpk/commands/inspector.py +357 -0
  12. xpk/commands/job.py +199 -0
  13. xpk/commands/kind.py +283 -0
  14. xpk/commands/kjob_common.py +44 -0
  15. xpk/commands/run.py +128 -0
  16. xpk/commands/shell.py +140 -0
  17. xpk/commands/storage.py +267 -0
  18. xpk/commands/version.py +27 -0
  19. xpk/commands/workload.py +889 -0
  20. xpk/core/__init__.py +15 -0
  21. xpk/core/blueprint/__init__.py +15 -0
  22. xpk/core/blueprint/blueprint_definitions.py +62 -0
  23. xpk/core/blueprint/blueprint_generator.py +708 -0
  24. xpk/core/capacity.py +185 -0
  25. xpk/core/cluster.py +564 -0
  26. xpk/core/cluster_private.py +200 -0
  27. xpk/core/commands.py +356 -0
  28. xpk/core/config.py +179 -0
  29. xpk/core/docker_container.py +225 -0
  30. xpk/core/docker_image.py +210 -0
  31. xpk/core/docker_manager.py +308 -0
  32. xpk/core/docker_resources.py +350 -0
  33. xpk/core/filestore.py +251 -0
  34. xpk/core/gcloud_context.py +196 -0
  35. xpk/core/gcluster_manager.py +176 -0
  36. xpk/core/gcsfuse.py +50 -0
  37. xpk/core/kjob.py +444 -0
  38. xpk/core/kueue.py +358 -0
  39. xpk/core/monitoring.py +134 -0
  40. xpk/core/nap.py +361 -0
  41. xpk/core/network.py +377 -0
  42. xpk/core/nodepool.py +581 -0
  43. xpk/core/pathways.py +377 -0
  44. xpk/core/ray.py +222 -0
  45. xpk/core/remote_state/__init__.py +15 -0
  46. xpk/core/remote_state/fuse_remote_state.py +99 -0
  47. xpk/core/remote_state/remote_state_client.py +38 -0
  48. xpk/core/resources.py +238 -0
  49. xpk/core/scheduling.py +253 -0
  50. xpk/core/storage.py +581 -0
  51. xpk/core/system_characteristics.py +1432 -0
  52. xpk/core/vertex.py +105 -0
  53. xpk/core/workload.py +341 -0
  54. xpk/core/workload_decorators/__init__.py +15 -0
  55. xpk/core/workload_decorators/rdma_decorator.py +129 -0
  56. xpk/core/workload_decorators/storage_decorator.py +52 -0
  57. xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
  58. xpk/main.py +75 -0
  59. xpk/parser/__init__.py +15 -0
  60. xpk/parser/batch.py +43 -0
  61. xpk/parser/cluster.py +662 -0
  62. xpk/parser/common.py +259 -0
  63. xpk/parser/config.py +49 -0
  64. xpk/parser/core.py +135 -0
  65. xpk/parser/info.py +64 -0
  66. xpk/parser/inspector.py +65 -0
  67. xpk/parser/job.py +147 -0
  68. xpk/parser/kind.py +95 -0
  69. xpk/parser/run.py +47 -0
  70. xpk/parser/shell.py +59 -0
  71. xpk/parser/storage.py +316 -0
  72. xpk/parser/validators.py +39 -0
  73. xpk/parser/version.py +23 -0
  74. xpk/parser/workload.py +726 -0
  75. xpk/templates/__init__.py +15 -0
  76. xpk/templates/storage.yaml +13 -0
  77. xpk/utils/__init__.py +15 -0
  78. xpk/utils/console.py +55 -0
  79. xpk/utils/file.py +82 -0
  80. xpk/utils/gcs_utils.py +125 -0
  81. xpk/utils/kubectl.py +57 -0
  82. xpk/utils/network.py +168 -0
  83. xpk/utils/objects.py +88 -0
  84. xpk/utils/templates.py +28 -0
  85. xpk/utils/validation.py +80 -0
  86. xpk/utils/yaml.py +30 -0
  87. xpk-0.0.1.dist-info/LICENSE +202 -0
  88. xpk-0.0.1.dist-info/METADATA +1498 -0
  89. xpk-0.0.1.dist-info/RECORD +92 -0
  90. xpk-0.0.1.dist-info/WHEEL +5 -0
  91. xpk-0.0.1.dist-info/entry_points.txt +2 -0
  92. xpk-0.0.1.dist-info/top_level.txt +1 -0
xpk/core/pathways.py ADDED
@@ -0,0 +1,377 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from .cluster import XPK_SA
18
+ from ..core.docker_container import get_user_workload_container
19
+ from ..core.gcloud_context import zone_to_region
20
+ from ..core.nodepool import get_all_nodepools_programmatic
21
+ from ..utils.console import xpk_exit, xpk_print
22
+ from .config import AcceleratorType
23
+ from .storage import Storage, get_storage_volumes_yaml, GCS_FUSE_ANNOTATION
24
+ from .system_characteristics import SystemCharacteristics
25
+
26
# Maps the TPU generation prefix of a device type (e.g. 'v5p' from 'v5p-8')
# to the instance-type name the Pathways resource manager expects.
PathwaysExpectedInstancesMap = {
    'v6e': 'tpuv6e',
    'v5p': 'tpuv5',
    'v5litepod': 'tpuv5e',
    'v4': 'tpuv4',
    'v3': 'tpuv3',
}
33
+
34
+
35
def get_pathways_worker_args(args) -> str:
  """Arguments for the Pathways workers.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str: yaml containing arguments for the Pathways workers.
    Empty string when the workload is not a Pathways workload.
  """
  yaml = """- --server_port=29001
              - --resource_manager_address={rm_address}
              - --gcs_scratch_location={args.pathways_gcs_location}"""
  if args.use_pathways:
    # Custom worker args are appended before formatting so they inherit the
    # indentation of the template above.
    if args.custom_pathways_worker_args:
      yaml = append_custom_pathways_args(yaml, args.custom_pathways_worker_args)
    return yaml.format(args=args, rm_address=get_rm_address(args))
  else:
    return ''
52
+
53
+
54
def get_pathways_proxy_args(args) -> str:
  """Arguments for the Pathways proxy.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str: yaml containing arguments for the Pathways proxy.
    Empty string when the workload is not a Pathways workload.
  """
  yaml = """- --server_port=29000
              - --resource_manager_address={rm_address}
              - --gcs_scratch_location={args.pathways_gcs_location}"""

  if args.use_pathways:
    # Custom proxy args are appended before formatting so they inherit the
    # indentation of the template above.
    if args.custom_pathways_proxy_server_args:
      yaml = append_custom_pathways_args(
          yaml, args.custom_pathways_proxy_server_args
      )
    return yaml.format(args=args, rm_address=get_rm_address(args))
  else:
    return ''
74
+
75
+
76
def get_pathways_sidecar_container(args) -> str:
  """This is a sidecar container that runs the remote python server.

  It is a special case of the initContainer (designated by restartPolicy:
  Always)
  See https://kubernetes.io/docs/concepts/workloads/pods/sidecar-containers/
  for more details.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str: yaml containing arguments for the Pathways sidecar container.
    Empty string unless Pathways is in use and a sidecar image was given.
  """
  yaml = """initContainers:
              - name: remote-python-sidecar
                image: {args.remote_python_sidecar_image}
                imagePullPolicy: Always
                securityContext:
                  privileged: true
                volumeMounts:
                - mountPath: /tmp # Shared volume mount with the main container.
                  name: shared-tmp
                restartPolicy: Always
                ports:
                - containerPort: 50051
                env:
                - name: GRPC_SERVER_ADDRESS
                  value: '0.0.0.0:50051'"""
  # Only emitted when the user explicitly provided a sidecar image.
  if args.use_pathways and args.remote_python_sidecar_image is not None:
    return yaml.format(args=args)
  else:
    return ''
108
+
109
+
110
def add_pw_resource_flavors(args):
  """Add resource flavors required for Pathways enabled clusters.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str: Kueue ResourceFlavor yaml documents (one per Pathways CPU
    nodepool) terminated with a '---' separator, or '' when Pathways is
    not enabled.
  """
  # One flavor per dedicated Pathways CPU nodepool (resource manager,
  # proxy, and user workload).
  resource_flavor_yaml = """apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: cpu-rm
spec:
  nodeLabels:
    cloud.google.com/gke-nodepool: cpu-rm-np
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: cpu-proxy
spec:
  nodeLabels:
    cloud.google.com/gke-nodepool: cpu-proxy-np
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: cpu-user
spec:
  nodeLabels:
    cloud.google.com/gke-nodepool: cpu-user-np
---"""
  if args.enable_pathways:
    return resource_flavor_yaml
  return ''
139
+
140
+
141
def add_pw_resources_to_kueue(args):
  """Add resource flavors required for Pathways, to the cluster queue.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str: a coveredResources entry (cpu/memory quotas for the three
    Pathways CPU flavors) to splice into the ClusterQueue spec, or ''
    when Pathways is not enabled.
  """
  resources_yaml = """- coveredResources: ["cpu", "memory"]
    flavors:
    - name: cpu-rm
      resources:
      - name: "cpu"
        nominalQuota: 480
      - name: "memory"
        nominalQuota: 2000G
    - name: cpu-proxy
      resources:
      - name: "cpu"
        nominalQuota: 480
      - name: "memory"
        nominalQuota: 2000G
    - name: cpu-user
      resources:
      - name: "cpu"
        nominalQuota: 480
      - name: "memory"
        nominalQuota: 2000G"""
  if args.enable_pathways:
    return resources_yaml
  return ''
166
+
167
+
168
def ensure_pathways_workload_prerequisites(args, system) -> bool:
  """Check all Pathways workload prerequisites and set necessary args.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    True once conditions satisfy and variables are set. Exits otherwise.
  """
  # Ensure command is provided if not using Pathways in headless mode
  if args.command is None and not args.headless:
    xpk_print(
        'Please provide a command using "--command" for the docker container to'
        ' execute. Command is not required if you wish to run Pathways'
        ' workloads in headless mode (`xpk workload create-pathways'
        ' --headless`).'
    )
    xpk_exit(1)

  # Ensure the cluster and CPU nodepools were created with create-pathways
  all_node_pools = get_all_nodepools_programmatic(args)
  desired_pw_cpu_node_pools = {'cpu-user-np', 'cpu-rm-np', 'cpu-proxy-np'}
  # NOTE(review): all_node_pools[0] is presumably the list of nodepool names
  # in the tuple returned by get_all_nodepools_programmatic — confirm there.
  if not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0])):
    xpk_print(
        'Cluster needs to be created with `xpk create-pathways` to run'
        ' Pathways workloads.'
    )
    xpk_exit(1)

  # Ensure device type is TPUs - currently Pathways supports TPUs only.
  if system.accelerator_type != AcceleratorType['TPU']:
    xpk_print('Currently, Pathways workloads can only be run on TPUs.')
    xpk_exit(1)

  # Set proxy address to be consumed in helper methods and displayed to user.
  args.pathways_proxy_address = get_proxy_address(args)

  # Set the job which determines the life of other Pathways jobs:
  # in headless mode the proxy job anchors the JobSet, otherwise 'main'.
  args.targetReplicatedJob = 'proxy' if args.headless else 'main'

  return True
210
+
211
+
212
def get_pathways_unified_query_link(args) -> str:
  """Build the Cloud Logging unified query link for the pathways workload.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str: a console.cloud.google.com logs-query URL scoped to the
    workload's containers (URL-encoded filter).
  """
  # Each element is one URL-encoded filter line ('%3D' = '=', '%0A' = '\n').
  filter_lines = (
      'resource.type%3D"k8s_container"%0A',
      f'resource.labels.project_id%3D"{args.project}"%0A',
      f'resource.labels.location%3D"{zone_to_region(args.zone)}"%0A',
      f'resource.labels.cluster_name%3D"{args.cluster}"%0A',
      f'resource.labels.pod_name:"{args.workload}-"%0A',
      'severity>%3DDEFAULT',
  )
  query = ''.join(filter_lines)
  return f'https://console.cloud.google.com/logs/query;query={query}'
224
+
225
+
226
def get_pathways_rm_args(args, system: SystemCharacteristics) -> str:
  """Arguments for the Pathways resource manager.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics, used to derive the TPU instance type.

  Returns:
    str: yaml containing arguments for the Pathways resource manager.
    Empty string when the workload is not a Pathways workload.
  """
  yaml = """- --server_port=29001
              - --gcs_scratch_location={args.pathways_gcs_location}
              - --node_type=resource_manager
              - --instance_count={instance_count}
              - --instance_type={instance_type}"""
  if args.use_pathways:
    # Custom server args are appended before formatting so they inherit the
    # indentation of the template above.
    if args.custom_pathways_server_args:
      yaml = append_custom_pathways_args(yaml, args.custom_pathways_server_args)
    return yaml.format(
        args=args,
        instance_count=args.num_slices,
        # e.g. 'tpuv5:2x2x2' — Pathways instance type is '<tpu name>:<topology>'.
        instance_type=f'{get_pathways_expected_tpu_type(system.device_type)}:{system.topology}',
    )
  else:
    return ''
249
+
250
+
251
def append_custom_pathways_args(yaml, custom_args) -> str:
  """Append custom Pathways args to the YAML with proper indentation.

  The indentation for the appended entries is measured from the second line
  of the existing YAML, so the new '- <arg>' entries line up with it.

  Args:
    yaml (string): existing yaml containing list-style args.
    custom_args (string): space-separated args to append.

  Returns:
    yaml (string): yaml with additional args appended.
  """
  lines = yaml.split('\n')
  # A one-line YAML (or an empty second line) gives no reference line to
  # measure indentation from; return unchanged. The original indexed
  # lines[1] unconditionally, which raised IndexError for one-line input.
  if len(lines) < 2 or not lines[1]:
    return yaml
  second_line = lines[1]
  # Calculate the indentation based on the second line of existing YAML.
  indentation = ' ' * (len(second_line) - len(second_line.lstrip()))
  for arg in custom_args.split(' '):
    yaml += '\n' + indentation + '- ' + arg
  return yaml
271
+
272
+
273
def get_user_workload_for_pathways(
    args,
    system: SystemCharacteristics,
    pod_failure_policy,
    storages: list[Storage],
) -> str:
  """
  Create a user workload container for Pathways.
  Don't create one for Pathways headless mode.

  Args:
    args: user provided args.
    system: system characteristics.
    pod_failure_policy: yaml snippet spliced into the Job spec.
    storages: storages whose volumes should be mounted into the pod.

  Returns:
    str: the 'main' replicated-job yaml for the user workload, or ''
    in headless mode (no user container is run).
  """
  user_workload_yaml = """- name: main
    replicas: 1
    template:
      metadata:
        labels:
          xpk.google.com/workload: {args.workload}
      spec:
        backoffLimit: 0
        completions: 1
        parallelism: 1
        {pod_failure_policy}
        template:
          metadata:
            annotations:
              {gcs_fuse_annotation}
          spec:
            containers:
            {container}
            serviceAccountName: {service_account}
            nodeSelector:
              cloud.google.com/gke-nodepool: cpu-user-np
            hostNetwork: true
            dnsPolicy: ClusterFirstWithHostNet
            restartPolicy: Never
            volumes:
            - hostPath:
                path: /tmp
                type: DirectoryOrCreate
              name: shared-tmp
            {storage_volumes}"""
  if args.headless:
    return ''
  else:
    # Second element of the returned tuple (debugging dump dir) is unused here.
    container, _ = get_user_workload_container(args, system)
    storage_volumes = get_storage_volumes_yaml(storages)
    return user_workload_yaml.format(
        args=args,
        container=container,
        storage_volumes=storage_volumes,
        pod_failure_policy=pod_failure_policy,
        service_account=XPK_SA,
        gcs_fuse_annotation=GCS_FUSE_ANNOTATION,
    )
335
+
336
+
337
def get_rm_address(args) -> str:
  """Generates the Pathways resource manager address.

  The RM runs as pod 0-0 of the '<workload>-rm' replicated job and listens
  on port 29001.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str: Fully qualified RM address.
  """
  workload = args.workload
  return f'{workload}-rm-0-0.{workload}:29001'
347
+
348
+
349
def get_proxy_address(args) -> str:
  """Generates the Pathways proxy address.

  The proxy runs as pod 0-0 of the '<workload>-proxy' replicated job and
  serves gRPC on port 29000.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str: Fully qualified proxy address.
  """
  workload = args.workload
  return f'grpc://{workload}-proxy-0-0.{workload}:29000'
359
+
360
+
361
def get_pathways_expected_tpu_type(device_type: str) -> str:
  """Returns the device type expected by Pathways.

  Args:
    device_type: the system characteristic device type (e.g. 'v5p-8').

  Returns:
    str: the device type expected by pathways. Exits with an error
    message for unrecognized generations.
  """
  raw_type = device_type.split('-')[0].lower()
  # Use .get() so an unknown generation falls through to the friendly error
  # below; the original indexed the map directly, raising an uncaught
  # KeyError before the validation could ever run.
  pathways_expected_instance = PathwaysExpectedInstancesMap.get(raw_type)
  if not pathways_expected_instance:
    xpk_print(
        f'Passed in device_type {device_type} is incorrect. Please pass in a'
        ' valid device type'
    )
    xpk_exit(1)
  return pathways_expected_instance
xpk/core/ray.py ADDED
@@ -0,0 +1,222 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import re
18
+ from ..utils.console import xpk_exit, xpk_print
19
+ from ..utils.file import write_tmp_file
20
+ from .commands import run_command_for_value, run_command_with_updates_retry
21
+
22
+
23
# Fraction of a default-pool node's allocatable resources given to the head pod.
HEAD_CPU = 0.5
# Fraction of a TPU node's allocatable resources given to each worker pod.
WORKER_CPU = 0.9
# Standard Ray ports: GCS server, dashboard, client server, multislice port.
GCS_SERVER = 6379
DASHBOARD = 8265
CLIENT = 10001
MULTISLICE = 8081

# RayCluster manifest template; placeholders are filled by install_ray_cluster.
# Doubled braces ({{}}) survive str.format as literal '{}'.
ray_cluster_crd_yaml = """apiVersion: v1
kind: Namespace
metadata:
  name: ray
---
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: raycluster
  namespace: ray
spec:
  rayVersion: '{version}'
  headGroupSpec:
    rayStartParams: {{}}
    #pod template
    template:
      spec:
        containers:
        - name: ray-head
          image: rayproject/ray:{version}
          resources:
            limits:
              cpu: {head_cpu}
              memory: {head_mem}
            requests:
              cpu: {head_cpu}
              memory: {head_mem}
          ports:
          - containerPort: {gcs_server}
            name: gcs-server
          - containerPort: {dashboard} # Ray dashboard
            name: dashboard
          - containerPort: {client}
            name: client
          - containerPort: {multislice}
            name: multislice
  workerGroupSpecs:
  - replicas: {replicas} # TODO: Set min and max replicas
    numOfHosts: {num_hosts}
    minReplicas: {replicas}
    maxReplicas: {replicas}
    groupName: workergroup0
    rayStartParams:
      block: 'true'
    template:
      spec:
        containers:
        - name: ray-worker
          image: rayproject/ray:{version}
          resources:
            limits:
              cpu: {worker_cpu}
              google.com/tpu: {chips_per_vm}
              memory: {worker_mem}
            requests:
              cpu: {worker_cpu}
              google.com/tpu: {chips_per_vm}
              memory: {worker_mem}
        nodeSelector:
          cloud.google.com/gke-tpu-accelerator: {accelerator}
          cloud.google.com/gke-tpu-topology: {topology}
"""
92
+
93
+
94
def install_ray_cluster(args, system) -> int:
  """Install a RayCluster on the cluster.

  Any pre-existing RayClusters in the 'ray' namespace are deleted first,
  then a fresh manifest is rendered from ray_cluster_crd_yaml and applied.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    0 if successful and 1 otherwise.
  """

  # Start from a clean slate so the apply below does not conflict.
  delete_ray_cluster(args)

  # Head pod runs on the default CPU nodepool; reserve HEAD_CPU (50%)
  # of that node's allocatable cpu/memory for it.
  label = 'cloud.google.com/gke-nodepool=default-pool'
  available_head_cpu, available_head_mem = generate_available_resources(
      label, args, HEAD_CPU
  )

  # Workers run on the TPU nodepool matching the requested accelerator;
  # give each WORKER_CPU (90%) of the node's allocatable cpu/memory.
  label = f'cloud.google.com/gke-tpu-accelerator={system.gke_accelerator}'
  available_worker_cpu, available_worker_mem = generate_available_resources(
      label, args, WORKER_CPU
  )

  yml_string = ray_cluster_crd_yaml.format(
      accelerator=system.gke_accelerator,
      topology=system.topology,
      chips_per_vm=system.chips_per_vm,
      num_hosts=system.vms_per_slice,
      replicas=args.num_slices,
      version=args.ray_version,
      worker_cpu=available_worker_cpu,
      worker_mem=available_worker_mem,
      head_cpu=available_head_cpu,
      head_mem=available_head_mem,
      gcs_server=GCS_SERVER,
      dashboard=DASHBOARD,
      client=CLIENT,
      multislice=MULTISLICE,
  )

  # Write the rendered manifest to a temp file and apply it with kubectl.
  tmp = write_tmp_file(yml_string)
  command = f'kubectl apply -f {str(tmp.file.name)}'
  task = 'Applying RayCluster'
  retry_attempts = 1
  return_code = run_command_with_updates_retry(
      command, task, args, num_retry_attempts=retry_attempts
  )
  if return_code != 0:
    xpk_print(f'{task} not successful.')
    xpk_exit(return_code)
  return return_code
145
+
146
+
147
def delete_ray_cluster(args) -> None:
  """Delete all RayClusters on the cluster.

  Removes every RayCluster resource in the 'ray' namespace; exits the
  process if the kubectl delete fails.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None
  """
  task = 'Deleting old RayCluster'
  return_code = run_command_with_updates_retry(
      'kubectl delete rayclusters -n ray --all',
      task,
      args,
      num_retry_attempts=1,
  )
  if return_code != 0:
    xpk_print(f'{task} not successful.')
    xpk_exit(return_code)
169
+
170
+
171
def _scale_allocatable_quantity(raw, percent, node_name, resource) -> str:
  """Scale a Kubernetes quantity string (e.g. '7910m', '112Gi', '8') by percent.

  Exits if the value cannot be parsed. The unit suffix is optional:
  allocatable cpu is commonly reported as a bare core count ('8'), which
  the previous mandatory-suffix regex rejected.
  """
  match = re.match(r'(\d+)([a-zA-Z]*)', raw)
  if not match:
    xpk_print(
        f'Could not find a regex match for allocatable {resource} on TPU node'
        f' {node_name}'
    )
    xpk_exit(1)
  value, units = match.group(1), match.group(2)
  return str(int(int(value) * percent)) + units


def generate_available_resources(label, args, percent) -> tuple:
  """Generate the available resources for the nodes that match the given label

  Queries the first matching node's allocatable cpu and memory via kubectl
  and scales both down by `percent`.

  Args:
    label: the label used to match the appropriate nodes
    args: user provided arguments for running the command
    percent: the percent of the available resources to use

  Returns:
    A tuple with the available cpu and memory
  """

  # Only the first matching node is inspected; nodes in a pool are
  # assumed to be homogeneous.
  command = (
      f"kubectl get nodes -l {label} -o jsonpath='{{.items[0].metadata.name}}'"
  )
  task = f'Getting nodes with label {label}'
  _, node_name = run_command_for_value(command, task, args)

  command = (
      f"kubectl get node {node_name} -o jsonpath='{{.status.allocatable.cpu}}'"
  )
  task = 'Fetching available CPU on node'
  _, available_cpu = run_command_for_value(command, task, args)
  adjusted_available_cpu = _scale_allocatable_quantity(
      available_cpu, percent, node_name, 'cpu'
  )

  command = (
      f'kubectl get node {node_name} -o'
      " jsonpath='{.status.allocatable.memory}'"
  )
  task = 'Fetching available memory on node'
  _, available_memory = run_command_for_value(command, task, args)
  adjusted_available_memory = _scale_allocatable_quantity(
      available_memory, percent, node_name, 'memory'
  )

  return adjusted_available_cpu, adjusted_available_memory
@@ -0,0 +1,15 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
@@ -0,0 +1,99 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from .remote_state_client import RemoteStateClient
18
+ from ...utils.gcs_utils import upload_directory_to_gcs, check_file_exists, download_bucket_to_dir, upload_file_to_gcs
19
+ from ...utils.console import xpk_print
20
+ from google.cloud.storage import Client
21
+ import os
22
+
23
+
24
class FuseStateClient(RemoteStateClient):
  """FuseStateClient is a class for managing remote xpk state stored in GCS Fuse.

  State files are kept under
  'xpk_terraform_state/<prefix>/blueprints/<deployment_name>/' in the
  bucket, with the blueprint yaml stored one level up as
  '<deployment_name>.yaml'.
  """

  def __init__(
      self,
      bucket: str,
      state_directory: str,
      cluster: str,
      deployment_name: str,
      prefix: str,
  ) -> None:
    self.bucket = bucket
    # Local directory holding the deployment state that is synced with GCS.
    self.state_dir = state_directory
    self.storage_client = Client()
    self.cluster = cluster
    self.prefix = prefix
    self.deployment_name = deployment_name

  def _get_bucket_path(self) -> str:
    """Bucket path of this deployment's state directory."""
    return (
        f'xpk_terraform_state/{self.prefix}/blueprints/{self.deployment_name}/'
    )

  def _get_bucket_path_blueprint(self) -> str:
    """Bucket path of the directory holding blueprint yaml files."""
    return f'xpk_terraform_state/{self.prefix}/blueprints/'

  def _get_deployment_filename(self) -> str:
    """Filename of the blueprint yaml for this deployment."""
    return f'{self.deployment_name}.yaml'

  def _get_blueprint_path(self) -> str:
    """Local path of the blueprint yaml (sibling of the state directory)."""
    blueprint_dir = '/'.join(self.state_dir.split('/')[:-1])
    return os.path.join(blueprint_dir, self.deployment_name) + '.yaml'

  def upload_state(self) -> None:
    """Upload the state directory and the blueprint yaml to the bucket."""
    # Typo fix: message previously read 'dependecies'.
    xpk_print(
        f'Uploading dependencies from directory {self.state_dir} to bucket:'
        f' {self.bucket}. Path within bucket is: {self._get_bucket_path()}'
    )
    upload_directory_to_gcs(
        storage_client=self.storage_client,
        bucket_name=self.bucket,
        bucket_path=self._get_bucket_path(),
        source_directory=self.state_dir,
    )
    blueprint_bucket_path = (
        self._get_bucket_path_blueprint() + self._get_deployment_filename()
    )
    xpk_print(
        f'Uploading blueprint file: {self._get_blueprint_path()} to bucket'
        f' {self.bucket}. Path within bucket is: {blueprint_bucket_path}'
    )
    upload_file_to_gcs(
        storage_client=self.storage_client,
        bucket_name=self.bucket,
        bucket_path=blueprint_bucket_path,
        file=self._get_blueprint_path(),
    )

  def download_state(self) -> None:
    """Download the deployment's remote state into the local state directory."""
    xpk_print(
        f'Downloading from bucket: {self.bucket}, from path:'
        f' {self._get_bucket_path()} to directory: {self.state_dir}'
    )
    download_bucket_to_dir(
        self.storage_client,
        self.bucket,
        self._get_bucket_path(),
        destination_directory=self.state_dir,
    )

  def check_remote_state_exists(self) -> bool:
    """Return True if the blueprint yaml for this deployment exists in GCS."""
    return check_file_exists(
        self.storage_client,
        self.bucket,
        self._get_bucket_path_blueprint() + self._get_deployment_filename(),
    )