xpk 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. xpk/api/__init__.py +15 -0
  2. xpk/api/storage_crd.yaml +52 -0
  3. xpk/commands/batch.py +27 -5
  4. xpk/commands/cluster.py +104 -80
  5. xpk/commands/cluster_gcluster.py +94 -10
  6. xpk/commands/common.py +44 -0
  7. xpk/commands/config.py +29 -0
  8. xpk/commands/info.py +8 -10
  9. xpk/commands/inspector.py +5 -11
  10. xpk/commands/job.py +9 -7
  11. xpk/commands/kind.py +34 -4
  12. xpk/commands/kjob_common.py +44 -0
  13. xpk/commands/run.py +128 -0
  14. xpk/commands/shell.py +27 -7
  15. xpk/commands/storage.py +267 -0
  16. xpk/commands/version.py +6 -18
  17. xpk/commands/workload.py +381 -184
  18. xpk/core/blueprint/blueprint_definitions.py +1 -0
  19. xpk/core/blueprint/blueprint_generator.py +132 -76
  20. xpk/core/capacity.py +185 -0
  21. xpk/core/cluster.py +564 -0
  22. xpk/core/cluster_private.py +6 -3
  23. xpk/core/commands.py +18 -14
  24. xpk/core/config.py +179 -0
  25. xpk/core/docker_container.py +225 -0
  26. xpk/core/docker_image.py +210 -0
  27. xpk/core/docker_resources.py +350 -0
  28. xpk/core/filestore.py +251 -0
  29. xpk/core/gcloud_context.py +196 -0
  30. xpk/core/gcluster_manager.py +20 -2
  31. xpk/core/gcsfuse.py +50 -0
  32. xpk/core/kjob.py +257 -18
  33. xpk/core/kueue.py +12 -6
  34. xpk/core/monitoring.py +134 -0
  35. xpk/core/nap.py +32 -20
  36. xpk/core/network.py +377 -0
  37. xpk/core/nodepool.py +581 -0
  38. xpk/core/pathways.py +124 -45
  39. xpk/core/remote_state/__init__.py +15 -0
  40. xpk/core/remote_state/fuse_remote_state.py +99 -0
  41. xpk/core/remote_state/remote_state_client.py +38 -0
  42. xpk/core/resources.py +238 -0
  43. xpk/core/scheduling.py +253 -0
  44. xpk/core/storage.py +581 -0
  45. xpk/core/system_characteristics.py +38 -1
  46. xpk/core/vertex.py +105 -0
  47. xpk/core/workload.py +209 -1
  48. xpk/core/workload_decorators/rdma_decorator.py +25 -5
  49. xpk/core/workload_decorators/storage_decorator.py +52 -0
  50. xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
  51. xpk/main.py +3 -1
  52. xpk/parser/batch.py +10 -151
  53. xpk/parser/cluster.py +49 -8
  54. xpk/parser/common.py +189 -1
  55. xpk/parser/config.py +49 -0
  56. xpk/parser/core.py +27 -1
  57. xpk/parser/info.py +2 -1
  58. xpk/parser/inspector.py +3 -3
  59. xpk/parser/job.py +25 -4
  60. xpk/parser/kind.py +3 -2
  61. xpk/parser/run.py +47 -0
  62. xpk/parser/shell.py +10 -1
  63. xpk/parser/storage.py +316 -0
  64. xpk/parser/validators.py +3 -3
  65. xpk/parser/workload.py +118 -76
  66. xpk/templates/__init__.py +15 -0
  67. xpk/templates/storage.yaml +13 -0
  68. xpk/utils/gcs_utils.py +125 -0
  69. xpk/utils/kubectl.py +57 -0
  70. xpk/utils/objects.py +8 -5
  71. xpk/utils/templates.py +28 -0
  72. xpk/utils/validation.py +80 -0
  73. {xpk-0.6.0.dist-info → xpk-0.7.0.dist-info}/METADATA +165 -14
  74. xpk-0.7.0.dist-info/RECORD +92 -0
  75. {xpk-0.6.0.dist-info → xpk-0.7.0.dist-info}/WHEEL +1 -1
  76. xpk/core/core.py +0 -2824
  77. xpk-0.6.0.dist-info/RECORD +0 -57
  78. {xpk-0.6.0.dist-info → xpk-0.7.0.dist-info}/LICENSE +0 -0
  79. {xpk-0.6.0.dist-info → xpk-0.7.0.dist-info}/entry_points.txt +0 -0
  80. {xpk-0.6.0.dist-info → xpk-0.7.0.dist-info}/top_level.txt +0 -0
xpk/api/__init__.py ADDED
@@ -0,0 +1,15 @@
+ """
+ Copyright 2024 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
xpk/api/storage_crd.yaml ADDED
@@ -0,0 +1,52 @@
+ apiVersion: apiextensions.k8s.io/v1
+ kind: CustomResourceDefinition
+ metadata:
+ name: storages.xpk.x-k8s.io
+ spec:
+ group: xpk.x-k8s.io
+ versions:
+ - name: v1
+ served: true
+ storage: true
+ schema:
+ openAPIV3Schema:
+ type: object
+ properties:
+ spec:
+ type: object
+ properties:
+ type:
+ type: string
+ cluster:
+ type: string
+ auto_mount:
+ type: boolean
+ mount_point:
+ type: string
+ readonly:
+ type: boolean
+ manifest:
+ type: string
+ pv:
+ type: string
+ pvc:
+ type: string
+ required:
+ - type
+ - cluster
+ - auto_mount
+ - mount_point
+ - readonly
+ - manifest
+ - pvc
+ - pv
+ x-kubernetes-validations:
+ - message: Value is immutable
+ rule: self == oldSelf
+ scope: Cluster
+ names:
+ plural: storages
+ singular: storage
+ kind: Storage
+ shortNames:
+ - stg
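
With this CRD installed (cluster_create now calls install_storage_crd), the new storage commands create Storage objects that satisfy the schema above. As a rough illustration only, a conforming object could be created with the Kubernetes Python client; every field value below is a made-up placeholder, not something xpk itself generates:

from kubernetes import client, config

config.load_kube_config()
api = client.CustomObjectsApi()

storage = {
    'apiVersion': 'xpk.x-k8s.io/v1',
    'kind': 'Storage',
    'metadata': {'name': 'my-gcsfuse-storage'},  # placeholder name
    'spec': {
        'type': 'gcsfuse',               # placeholder storage type
        'cluster': 'my-cluster',         # placeholder cluster name
        'auto_mount': True,
        'mount_point': '/data',
        'readonly': False,
        'manifest': 'pv-pvc-manifest.yaml',
        'pv': 'my-storage-pv',
        'pvc': 'my-storage-pvc',
    },
}

# The CRD is cluster-scoped (scope: Cluster), so no namespace is passed.
api.create_cluster_custom_object(
    group='xpk.x-k8s.io', version='v1', plural='storages', body=storage
)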
xpk/commands/batch.py CHANGED
@@ -16,13 +16,16 @@ limitations under the License.
 
  from argparse import Namespace
 
+ from ..core.cluster import create_xpk_k8s_service_account
+ from ..core.commands import run_command_for_value
+ from ..core.gcloud_context import add_zone_and_project
  from ..core.kueue import LOCAL_QUEUE_NAME
  from ..utils.console import xpk_exit, xpk_print
- from .cluster import set_cluster_command
- from ..core.core import add_zone_and_project
- from ..core.kjob import AppProfileDefaults
- from ..core.commands import run_command_for_value
+ from .common import set_cluster_command
+ from ..core.kjob import AppProfileDefaults, JobTemplateDefaults, prepare_kjob, Kueue_TAS_annotation, get_gcsfuse_annotation
+ from .kjob_common import add_gpu_networking_annotations_to_command
  from .kind import set_local_cluster_command
+ import re
 
 
  def batch(args: Namespace) -> None:
@@ -42,15 +45,30 @@ def batch(args: Namespace) -> None:
  if set_cluster_command_code != 0:
  xpk_exit(set_cluster_command_code)
 
+ err_code = prepare_kjob(args)
+ if err_code > 0:
+ xpk_exit(err_code)
+ create_xpk_k8s_service_account()
+
  submit_job(args)
 
 
  def submit_job(args: Namespace) -> None:
+
+ create_xpk_k8s_service_account()
+
  cmd = (
  'kubectl kjob create slurm'
  f' --profile {AppProfileDefaults.NAME.value}'
  f' --localqueue {LOCAL_QUEUE_NAME}'
+ f' --pod-template-annotation {Kueue_TAS_annotation}'
+ f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
+ ' --first-node-ip'
  )
+ cmd = add_gpu_networking_annotations_to_command(args, cmd)
+ gcsfuse_annotation = get_gcsfuse_annotation(args)
+ if gcsfuse_annotation is not None:
+ cmd += f' --pod-template-annotation {gcsfuse_annotation}'
 
  if args.ignore_unknown_flags:
  cmd += ' --ignore-unknown-flags'
@@ -102,8 +120,12 @@ def submit_job(args: Namespace) -> None:
  if args.time is not None:
  cmd += f' --time {args.time}'
 
- return_code, _ = run_command_for_value(cmd, 'submit job', args)
+ return_code, return_value = run_command_for_value(cmd, 'submit job', args)
 
  if return_code != 0:
  xpk_print(f'Running batch job returned ERROR {return_code}')
  xpk_exit(return_code)
+
+ m = re.match(r'job\.batch/([-a-z0-9]+)', return_value)
+ if m:
+ xpk_print(f'Job name: {m.group(1)}')
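
submit_job now captures kjob's stdout and reports the created job name. A minimal sketch of that extraction, using a hypothetical output line in the usual kubectl "<resource>/<name> created" form:

import re

return_value = 'job.batch/xpk-def-app-profile-slurm-ab12c created'  # hypothetical kjob output
m = re.match(r'job\.batch/([-a-z0-9]+)', return_value)
if m:
    print(f'Job name: {m.group(1)}')  # Job name: xpk-def-app-profile-slurm-ab12c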
xpk/commands/cluster.py CHANGED
@@ -14,36 +14,28 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """
 
- from ..core.commands import (
- run_command_for_value,
- run_command_with_updates,
- run_command_with_updates_retry,
- )
- from ..core.core import (
- VERTEX_TENSORBOARD_FEATURE_FLAG,
- add_zone_and_project,
- create_cluster_configmaps,
- create_cluster_network_config,
- create_vertex_tensorboard,
- delete_cluster_subnets,
+ from tabulate import tabulate
+
+ from ..core.capacity import H100_DEVICE_TYPE
+ from ..core.cluster import (
  get_all_clusters_programmatic,
- get_gke_control_plane_version,
- get_gke_node_pool_version,
- get_gke_server_config,
- h100_device_type,
+ get_cluster_credentials,
  install_nccl_on_cluster,
- run_gke_node_pool_create_command,
  set_jobset_on_cluster,
- set_up_cluster_network_for_gpu,
- zone_to_region,
- get_user_input,
+ setup_k8s_env,
+ update_cluster_with_gcsfuse_driver_if_necessary,
+ update_cluster_with_workload_identity_if_necessary,
  )
  from ..core.cluster_private import authorize_private_cluster_access_if_necessary
- from ..core.kjob import (
- verify_kjob_installed,
- prepare_kjob,
- apply_kjob_crds,
+ from ..core.commands import run_command_for_value, run_command_with_updates
+ from ..core.config import VERTEX_TENSORBOARD_FEATURE_FLAG
+ from ..core.gcloud_context import (
+ add_zone_and_project,
+ get_gke_control_plane_version,
+ get_gke_server_config,
+ zone_to_region,
  )
+ from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
  from ..core.kueue import (
  cluster_preheat_yml,
  install_kueue_crs,
@@ -51,19 +43,28 @@ from ..core.kueue import (
  wait_for_kueue_available,
  )
  from ..core.nap import enable_autoprovisioning_on_cluster
+ from ..core.network import (
+ create_cluster_network_config,
+ delete_cluster_subnets,
+ set_up_cluster_network_for_gpu,
+ )
+ from ..core.nodepool import get_gke_node_pool_version, run_gke_node_pool_create_command
  from ..core.ray import install_ray_cluster
+ from ..core.resources import create_cluster_configmaps
+ from ..core.storage import install_storage_crd
  from ..core.system_characteristics import (
  AcceleratorType,
  AcceleratorTypeToAcceleratorCharacteristics,
  SystemCharacteristics,
  get_system_characteristics,
  )
+ from ..core.vertex import create_vertex_tensorboard
  from ..core.workload import get_workload_list
+ from ..utils.console import get_user_input, xpk_exit, xpk_print
  from ..utils.file import write_tmp_file
- from ..utils.console import xpk_exit, xpk_print
  from . import cluster_gcluster
-
+ from .common import set_cluster_command
+ from ..core.cluster import update_cluster_with_gcpfilestore_driver_if_necessary
 
 
  def cluster_create(args) -> None:
@@ -115,10 +116,36 @@ def cluster_create(args) -> None:
  xpk_exit(authorize_private_cluster_access_command_code)
 
  # ToDo(roshanin@) - Re-enable CloudDNS on Pathways clusters conditionally.
+ # Enable WorkloadIdentity if not enabled already.
+ if (
+ args.enable_workload_identity
+ or args.enable_gcsfuse_csi_driver
+ or args.enable_gcpfilestore_csi_driver
+ ):
+ update_cluster_command_code = (
+ update_cluster_with_workload_identity_if_necessary(args)
+ )
+ if update_cluster_command_code != 0:
+ xpk_exit(update_cluster_command_code)
 
- set_cluster_command_code = set_cluster_command(args)
- if set_cluster_command_code != 0:
- xpk_exit(set_cluster_command_code)
+ # Enable GCSFuse CSI Driver if not enabled already.
+ if args.enable_gcsfuse_csi_driver:
+ update_cluster_command_code = (
+ update_cluster_with_gcsfuse_driver_if_necessary(args)
+ )
+ if update_cluster_command_code != 0:
+ xpk_exit(update_cluster_command_code)
+
+ if args.enable_gcpfilestore_csi_driver:
+ update_cluster_command_code = (
+ update_cluster_with_gcpfilestore_driver_if_necessary(args)
+ )
+ if update_cluster_command_code != 0:
+ xpk_exit(update_cluster_command_code)
+
+ # Update Pathways clusters with CloudDNS if not enabled already.
+
+ get_cluster_credentials(args)
 
  # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
  tensorboard_config = {}
@@ -134,7 +161,7 @@ def cluster_create(args) -> None:
  if set_up_cluster_network_code != 0:
  xpk_exit(set_up_cluster_network_code)
 
- if system.device_type == h100_device_type:
+ if system.device_type == H100_DEVICE_TYPE:
  xpk_print('Creating Network Config for cluster')
  create_cluster_network_config_code = create_cluster_network_config(args)
  if create_cluster_network_config_code != 0:
@@ -154,6 +181,24 @@ def cluster_create(args) -> None:
  if run_gke_node_pool_create_command_code != 0:
  xpk_exit(run_gke_node_pool_create_command_code)
 
+ # Provision node pools dynamically based on incoming workloads:
+ # Currently autoprovisioning is not supported with Pathways.
+ autoprovisioning_config = None
+ if not args.enable_pathways and args.enable_autoprovisioning:
+ xpk_print('Enabling Autoprovisioning')
+ autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
+ args, system
+ )
+ if return_code != 0:
+ xpk_exit(return_code)
+
+ xpk_print('Creating ConfigMap for cluster')
+ create_cluster_configmaps_code = create_cluster_configmaps(
+ args, system, tensorboard_config, autoprovisioning_config
+ )
+ if create_cluster_configmaps_code != 0:
+ xpk_exit(create_cluster_configmaps_code)
+
  xpk_print(
  'Enabling the jobset API on our cluster, to be deprecated when Jobset is'
  ' globally available'
@@ -177,20 +222,12 @@ def cluster_create(args) -> None:
  if err_code > 0:
  xpk_exit(err_code)
 
- xpk_print('Preparing kjob')
  err_code = prepare_kjob(args)
  if err_code > 0:
  xpk_exit(err_code)
- # Provision node pools dynamically based on incoming workloads:
- # Currently autoprovisioning is not supported with Pathways.
- autoprovisioning_config = None
- if not args.enable_pathways and args.enable_autoprovisioning:
- xpk_print('Enabling Autoprovisioning')
- autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
- args, system
- )
- if return_code != 0:
- xpk_exit(return_code)
+
+ k8s_client = setup_k8s_env(args)
+ install_storage_crd(k8s_client)
 
  xpk_print('Wait for Kueue to be fully available')
  wait_for_kueue_available_code = wait_for_kueue_available(args)
@@ -210,13 +247,6 @@ def cluster_create(args) -> None:
  if install_nccl_code != 0:
  xpk_exit(install_nccl_code)
 
- xpk_print('Creating ConfigMap for cluster')
- create_cluster_configmaps_code = create_cluster_configmaps(
- args, system, tensorboard_config, autoprovisioning_config
- )
- if create_cluster_configmaps_code != 0:
- xpk_exit(create_cluster_configmaps_code)
-
  if args.enable_ray_cluster:
  return_code = install_ray_cluster(args, system)
  if return_code != 0:
@@ -249,7 +279,12 @@ def cluster_delete(args) -> None:
  cluster_gcluster.cluster_delete(args)
  xpk_exit(0)
 
+ set_cluster_command_code = set_cluster_command(args)
+ if set_cluster_command_code != 0:
+ xpk_exit(set_cluster_command_code)
+
  run_gke_cluster_delete_command_code = run_gke_cluster_delete_command(args)
+
  if run_gke_cluster_delete_command_code != 0:
  xpk_exit(run_gke_cluster_delete_command_code)
  xpk_print(f'GKE commands done! Cluster {args.cluster} deleted.\n')
@@ -270,9 +305,7 @@ def cluster_cacheimage(args) -> None:
  )
  add_zone_and_project(args)
 
- set_cluster_command_code = set_cluster_command(args)
- if set_cluster_command_code != 0:
- xpk_exit(set_cluster_command_code)
+ get_cluster_credentials(args)
  system, return_code = get_system_characteristics(args)
 
  if return_code > 0:
@@ -321,9 +354,7 @@ def cluster_describe(args) -> None:
  xpk_print(f'Starting nodepool list for cluster: {args.cluster}', flush=True)
  add_zone_and_project(args)
 
- set_cluster_command_code = set_cluster_command(args)
- if set_cluster_command_code != 0:
- xpk_exit(set_cluster_command_code)
+ get_cluster_credentials(args)
 
  return_code, data_table = nodepools_build_table(args)
  if return_code != 0:
@@ -752,33 +783,26 @@ def run_gke_cluster_create_command(
  if args.enable_ray_cluster:
  command += ' --addons RayOperator'
 
- return_code = run_command_with_updates(command, 'GKE Cluster Create', args)
- if return_code != 0:
- xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
- return 1
- return 0
+ if (
+ args.enable_workload_identity
+ or args.enable_gcsfuse_csi_driver
+ or args.enable_gcpfilestore_csi_driver
+ ):
+ command += f' --workload-pool={args.project}.svc.id.goog'
 
+ addons = []
+ if args.enable_gcsfuse_csi_driver:
+ addons.append('GcsFuseCsiDriver')
 
- def set_cluster_command(args) -> int:
- """Run cluster configuration command to set the kubectl config.
+ if args.enable_gcpfilestore_csi_driver:
+ addons.append('GcpFilestoreCsiDriver')
 
- Args:
- args: user provided arguments for running the command.
+ if len(addons) > 0:
+ addons_str = ','.join(addons)
+ command += f' --addons={addons_str}'
 
- Returns:
- 0 if successful and 1 otherwise.
- """
- command = (
- 'gcloud container clusters get-credentials'
- f' {args.cluster} --region={zone_to_region(args.zone)}'
- f' --project={args.project} &&'
- ' kubectl config view && kubectl config set-context --current'
- ' --namespace=default'
- )
- task = f'get-credentials to cluster {args.cluster}'
- return_code = run_command_with_updates_retry(
- command, task, args, verbose=False
- )
+ return_code = run_command_with_updates(command, 'GKE Cluster Create', args)
  if return_code != 0:
- xpk_print(f'{task} returned ERROR {return_code}')
- return return_code
+ xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
+ return 1
+ return 0
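
As a rough sketch of the new addon handling in run_gke_cluster_create_command (project name is a placeholder), a cluster created with both CSI drivers enabled picks up these extra flags:

project = 'my-project'  # placeholder project id
command = 'gcloud container clusters create ...'  # abbreviated base command
command += f' --workload-pool={project}.svc.id.goog'
addons = ['GcsFuseCsiDriver', 'GcpFilestoreCsiDriver']
command += f' --addons={",".join(addons)}'
# command now ends with:
#   --workload-pool=my-project.svc.id.goog --addons=GcsFuseCsiDriver,GcpFilestoreCsiDriver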
xpk/commands/cluster_gcluster.py CHANGED
@@ -14,15 +14,28 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """
 
- from ..core.blueprint.blueprint_generator import BlueprintGenerator, BlueprintGeneratorOutput, supported_device_types, a3mega_device_type, a3ultra_device_type
+ import os
+
+ from ..core.remote_state.remote_state_client import RemoteStateClient
+ from ..core.remote_state.fuse_remote_state import FuseStateClient
+ from ..core.blueprint.blueprint_generator import (
+ BlueprintGenerator,
+ BlueprintGeneratorOutput,
+ a3mega_device_type,
+ a3ultra_device_type,
+ supported_device_types,
+ )
+ from ..core.commands import run_command_for_value
+ from ..core.capacity import get_capacity_type
  from ..core.docker_manager import DockerManager
+ from ..core.gcloud_context import zone_to_region
  from ..core.gcluster_manager import GclusterManager
- from ..core.core import zone_to_region, get_capacity_type
  from ..utils.console import xpk_exit, xpk_print
- from ..utils.network import all_IPs_cidr
  from ..utils.file import ensure_directory_exists
+ from ..utils.network import all_IPs_cidr
  from ..utils.objects import hash_string
- import os
+ from ..core.cluster import get_cluster_credentials
+ from ..core.kjob import apply_kjob_crds, prepare_kjob
 
  blueprints_path = os.path.abspath('xpkclusters/blueprints')
  gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out')
@@ -40,13 +53,22 @@ def cluster_create(args) -> None:
  """
  check_gcloud_authenticated()
  prepare_directories()
- gcm = prepare_gcluster_manager()
  region = zone_to_region(args.zone)
 
  # unique_name uses shortened hash string, so still name collision is possible
  unique_name = get_unique_name(args.project, region, args.cluster)
  # prefix is to prevent name collisions for blueprints and also deployments by storing them in prefix directory. Ex.: blueprints/{prefix}/cluster_name_hash
  prefix = get_prefix_path(args.project, region)
+ remote_state_client = None
+ if args.cluster_state_gcs_bucket is not None:
+ remote_state_client = FuseStateClient(
+ bucket=args.cluster_state_gcs_bucket,
+ state_directory=os.path.join(blueprints_path, prefix, unique_name),
+ prefix=prefix,
+ cluster=args.cluster,
+ deployment_name=unique_name,
+ )
+ gcm = prepare_gcluster_manager(remote_state_client)
 
  bp = generate_blueprint(blueprint_name=unique_name, args=args, prefix=prefix)
 
@@ -61,6 +83,18 @@ def cluster_create(args) -> None:
  deployment_name=unique_name,
  prefix=prefix,
  )
+ if args.cluster_state_gcs_bucket is not None:
+ gcm.upload_state()
+
+ get_cluster_credentials(args)
+
+ err_code = apply_kjob_crds(args)
+ if err_code > 0:
+ xpk_exit(err_code)
+
+ err_code = prepare_kjob(args)
+ if err_code > 0:
+ xpk_exit(err_code)
 
  xpk_exit(0)
 
@@ -76,15 +110,42 @@ def cluster_delete(args) -> None:
  """
  check_gcloud_authenticated()
  prepare_directories()
- gcm = prepare_gcluster_manager()
  region = zone_to_region(args.zone)
+ unique_name = get_unique_name(args.project, region, args.cluster)
+ # prefix is to prevent name collisions for blueprints and also deployments by storing them in prefix directory. Ex.: blueprints/{prefix}/cluster_name_hash
+ prefix = get_prefix_path(args.project, region)
+ remote_state_client = None
+ if args.cluster_state_gcs_bucket is not None:
+ remote_state_client = FuseStateClient(
+ bucket=args.cluster_state_gcs_bucket,
+ state_directory=os.path.join(blueprints_path, prefix, unique_name),
+ prefix=prefix,
+ cluster=args.cluster,
+ deployment_name=unique_name,
+ )
+ gcm = prepare_gcluster_manager(remote_state_client)
 
  # unique_name uses shortened hash string, so still name collision is possible
  unique_name = get_unique_name(args.project, region, args.cluster)
  # prefix is to prevent name collisions for blueprints and also deployments by storing them in prefix directory. Ex.: blueprints/{prefix}/cluster_name_hash
- prefix_path = get_prefix_path(args.project, region)
+ prefix = get_prefix_path(args.project, region)
+ if args.cluster_state_gcs_bucket is not None:
+ gcm.download_state()
+
+ bp = BlueprintGeneratorOutput(
+ blueprint_file=os.path.join(blueprints_path, prefix, unique_name)
+ + '.yaml',
+ blueprint_dependencies=os.path.join(
+ blueprints_path, prefix, unique_name
+ ),
+ )
 
- gcm.destroy_deployment(deployment_name=unique_name, prefix=prefix_path)
+ gcm.stage_files(
+ blueprint_file=bp.blueprint_file,
+ blueprint_dependencies=bp.blueprint_dependencies,
+ prefix=prefix,
+ )
+ gcm.destroy_deployment(deployment_name=unique_name, prefix=prefix)
 
  xpk_exit(0)
 
@@ -127,18 +188,35 @@ def check_gcloud_authenticated():
  xpk_exit(1)
 
 
- def prepare_gcluster_manager() -> GclusterManager:
+ def prepare_gcluster_manager(
+ remote_state_client: RemoteStateClient | None,
+ ) -> GclusterManager:
  dm = DockerManager(
  working_dir=gcluster_working_dir, gcloud_cfg_path=gcloud_cfg_path
  )
  dm.initialize()
- return GclusterManager(gcluster_command_runner=dm)
+ return GclusterManager(
+ gcluster_command_runner=dm, remote_state_client=remote_state_client
+ )
 
 
  def prepare_blueprint_generator() -> BlueprintGenerator:
  return BlueprintGenerator(storage_path=blueprints_path)
 
 
+ def validate_state_gcs_bucket(args):
+ bucket_validate_cmd = (
+ f'gcloud storage buckets describe gs://{args.cluster_state_gcs_bucket}'
+ )
+ err_code, _ = run_command_for_value(
+ bucket_validate_cmd,
+ 'Validate remote state bucket existence.',
+ global_args=args,
+ )
+ if err_code != 0:
+ xpk_exit(err_code)
+
+
  def generate_blueprint(
  blueprint_name, args, prefix=None
  ) -> BlueprintGeneratorOutput:
@@ -149,6 +227,9 @@ def generate_blueprint(
 
  bpg = prepare_blueprint_generator()
 
+ if args.cluster_state_gcs_bucket is not None:
+ validate_state_gcs_bucket(args)
+
  if args.device_type in supported_device_types:
  if args.device_type == a3mega_device_type:
  num_nodes = args.num_nodes if not args.num_nodes is None else 2
@@ -165,6 +246,7 @@
  capacity_type=capacity_type,
  system_node_pool_machine_type=args.default_pool_cpu_machine_type,
  system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
+ gcs_bucket=args.cluster_state_gcs_bucket,
  )
  if args.device_type == a3ultra_device_type:
  num_nodes = args.num_nodes if not args.num_nodes is None else 2
@@ -178,8 +260,10 @@
  auth_cidr=all_IPs_cidr,
  num_nodes=num_nodes,
  reservation=args.reservation if args.reservation else None,
+ enable_filestore_csi_driver=args.enable_gcpfilestore_csi_driver,
  capacity_type=capacity_type,
  system_node_pool_machine_type=args.default_pool_cpu_machine_type,
  system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
+ gcs_bucket=args.cluster_state_gcs_bucket,
  )
  return None
xpk/commands/common.py ADDED
@@ -0,0 +1,44 @@
+ """
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ from ..core.commands import run_command_with_updates_retry
+ from ..core.gcloud_context import zone_to_region
+ from ..utils.console import xpk_print
+
+
+ def set_cluster_command(args) -> int:
+ """Run cluster configuration command to set the kubectl config.
+
+ Args:
+ args: user provided arguments for running the command.
+
+ Returns:
+ 0 if successful and 1 otherwise.
+ """
+ command = (
+ 'gcloud container clusters get-credentials'
+ f' {args.cluster} --region={zone_to_region(args.zone)}'
+ f' --project={args.project} &&'
+ ' kubectl config view && kubectl config set-context --current'
+ ' --namespace=default'
+ )
+ task = f'get-credentials to cluster {args.cluster}'
+ return_code = run_command_with_updates_retry(
+ command, task, args, verbose=False
+ )
+ if return_code != 0:
+ xpk_print(f'{task} returned ERROR {return_code}')
+ return return_code
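
set_cluster_command is now a shared helper imported from xpk.commands.common by batch.py, cluster.py and others. A usage sketch mirroring those call sites; the Namespace fields are placeholders, and in practice xpk's argument parsers populate them (plus additional flags) for you:

from argparse import Namespace

from xpk.commands.common import set_cluster_command
from xpk.utils.console import xpk_exit

args = Namespace(cluster='my-cluster', zone='us-central1-b', project='my-project')
set_cluster_command_code = set_cluster_command(args)
if set_cluster_command_code != 0:
    xpk_exit(set_cluster_command_code)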
xpk/commands/config.py ADDED
@@ -0,0 +1,29 @@
+ """
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ from ..core.config import XpkConfig
+ from ..utils.console import xpk_print
+
+ xpk_cfg = XpkConfig()
+
+
+ def set_config(args):
+ xpk_cfg.set(args.set_config_args[0], args.set_config_args[1])
+
+
+ def get_config(args):
+ value = xpk_cfg.get(args.get_config_key[0])
+ xpk_print(value)
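
These handlers read their positional arguments straight off the parsed namespace (set_config_args and get_config_key, presumably wired up in the new xpk/parser/config.py). A minimal sketch of how they are driven; the key and value are placeholders, and normally the xpk CLI builds the Namespace for you:

from argparse import Namespace

from xpk.commands.config import get_config, set_config

set_config(Namespace(set_config_args=['default-zone', 'us-central1-b']))
get_config(Namespace(get_config_key=['default-zone']))  # prints the stored value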