xpk 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. xpk/api/__init__.py +15 -0
  2. xpk/api/storage_crd.yaml +52 -0
  3. xpk/commands/batch.py +27 -5
  4. xpk/commands/cluster.py +104 -80
  5. xpk/commands/cluster_gcluster.py +94 -10
  6. xpk/commands/common.py +44 -0
  7. xpk/commands/config.py +29 -0
  8. xpk/commands/info.py +8 -10
  9. xpk/commands/inspector.py +5 -11
  10. xpk/commands/job.py +9 -7
  11. xpk/commands/kind.py +34 -4
  12. xpk/commands/kjob_common.py +44 -0
  13. xpk/commands/run.py +128 -0
  14. xpk/commands/shell.py +27 -7
  15. xpk/commands/storage.py +280 -0
  16. xpk/commands/version.py +6 -18
  17. xpk/commands/workload.py +381 -184
  18. xpk/core/blueprint/blueprint_definitions.py +1 -0
  19. xpk/core/blueprint/blueprint_generator.py +132 -76
  20. xpk/core/capacity.py +185 -0
  21. xpk/core/cluster.py +564 -0
  22. xpk/core/cluster_private.py +6 -3
  23. xpk/core/commands.py +18 -14
  24. xpk/core/config.py +179 -0
  25. xpk/core/docker_container.py +225 -0
  26. xpk/core/docker_image.py +210 -0
  27. xpk/core/docker_resources.py +350 -0
  28. xpk/core/filestore.py +251 -0
  29. xpk/core/gcloud_context.py +196 -0
  30. xpk/core/gcluster_manager.py +20 -2
  31. xpk/core/gcsfuse.py +50 -0
  32. xpk/core/kjob.py +257 -18
  33. xpk/core/kueue.py +12 -6
  34. xpk/core/monitoring.py +134 -0
  35. xpk/core/nap.py +32 -20
  36. xpk/core/network.py +377 -0
  37. xpk/core/nodepool.py +581 -0
  38. xpk/core/pathways.py +124 -45
  39. xpk/core/remote_state/__init__.py +15 -0
  40. xpk/core/remote_state/fuse_remote_state.py +99 -0
  41. xpk/core/remote_state/remote_state_client.py +38 -0
  42. xpk/core/resources.py +238 -0
  43. xpk/core/scheduling.py +253 -0
  44. xpk/core/storage.py +581 -0
  45. xpk/core/system_characteristics.py +38 -1
  46. xpk/core/vertex.py +105 -0
  47. xpk/core/workload.py +209 -1
  48. xpk/core/workload_decorators/rdma_decorator.py +25 -5
  49. xpk/core/workload_decorators/storage_decorator.py +52 -0
  50. xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
  51. xpk/main.py +3 -1
  52. xpk/parser/batch.py +10 -151
  53. xpk/parser/cluster.py +49 -8
  54. xpk/parser/common.py +189 -1
  55. xpk/parser/config.py +49 -0
  56. xpk/parser/core.py +27 -1
  57. xpk/parser/info.py +2 -1
  58. xpk/parser/inspector.py +3 -3
  59. xpk/parser/job.py +25 -4
  60. xpk/parser/kind.py +3 -2
  61. xpk/parser/run.py +47 -0
  62. xpk/parser/shell.py +10 -1
  63. xpk/parser/storage.py +326 -0
  64. xpk/parser/validators.py +3 -3
  65. xpk/parser/workload.py +118 -76
  66. xpk/templates/__init__.py +15 -0
  67. xpk/templates/storage.yaml +13 -0
  68. xpk/utils/gcs_utils.py +125 -0
  69. xpk/utils/kubectl.py +57 -0
  70. xpk/utils/objects.py +8 -5
  71. xpk/utils/templates.py +28 -0
  72. xpk/utils/validation.py +80 -0
  73. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/METADATA +169 -15
  74. xpk-0.7.1.dist-info/RECORD +92 -0
  75. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/WHEEL +1 -1
  76. xpk/core/core.py +0 -2824
  77. xpk-0.6.0.dist-info/RECORD +0 -57
  78. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/entry_points.txt +0 -0
  79. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info/licenses}/LICENSE +0 -0
  80. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/top_level.txt +0 -0
xpk/core/pathways.py CHANGED
@@ -14,13 +14,13 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
+ from .cluster import XPK_SA
18
+ from ..core.docker_container import get_user_workload_container
19
+ from ..core.gcloud_context import zone_to_region
20
+ from ..core.nodepool import get_all_nodepools_programmatic
17
21
  from ..utils.console import xpk_exit, xpk_print
18
- from .core import (
19
- AcceleratorType,
20
- get_all_nodepools_programmatic,
21
- get_user_workload_container,
22
- zone_to_region,
23
- )
22
+ from .config import AcceleratorType
23
+ from .storage import Storage, get_storage_volumes_yaml, GCS_FUSE_ANNOTATION
24
24
  from .system_characteristics import SystemCharacteristics
25
25
 
26
26
  PathwaysExpectedInstancesMap = {
@@ -41,9 +41,11 @@ def get_pathways_worker_args(args) -> str:
41
41
  str: yaml containing arguments for the Pathways workers.
42
42
  """
43
43
  yaml = """- --server_port=29001
44
- - --resource_manager_address={rm_address}
45
- - --gcs_scratch_location={args.pathways_gcs_location}"""
44
+ - --resource_manager_address={rm_address}
45
+ - --gcs_scratch_location={args.pathways_gcs_location}"""
46
46
  if args.use_pathways:
47
+ if args.custom_pathways_worker_args:
48
+ yaml = append_custom_pathways_args(yaml, args.custom_pathways_worker_args)
47
49
  return yaml.format(args=args, rm_address=get_rm_address(args))
48
50
  else:
49
51
  return ''
@@ -58,15 +60,53 @@ def get_pathways_proxy_args(args) -> str:
58
60
  str: yaml containing arguments for the Pathways proxy.
59
61
  """
60
62
  yaml = """- --server_port=29000
61
- - --resource_manager_address={rm_address}
62
- - --gcs_scratch_location={args.pathways_gcs_location}"""
63
+ - --resource_manager_address={rm_address}
64
+ - --gcs_scratch_location={args.pathways_gcs_location}"""
63
65
 
64
66
  if args.use_pathways:
67
+ if args.custom_pathways_proxy_server_args:
68
+ yaml = append_custom_pathways_args(
69
+ yaml, args.custom_pathways_proxy_server_args
70
+ )
65
71
  return yaml.format(args=args, rm_address=get_rm_address(args))
66
72
  else:
67
73
  return ''
68
74
 
69
75
 
76
+ def get_pathways_sidecar_container(args) -> str:
77
+ """This is a sidecar container that runs the remote python server.
78
+
79
+ It is a special case of the initContainer (designated by restartPolicy:
80
+ Always)
81
+ See https://kubernetes.io/docs/concepts/workloads/pods/sidecar-containers/
82
+ for more details.
83
+ Args:
84
+ args: user provided arguments for running the command.
85
+
86
+ Returns:
87
+ str: yaml containing arguments for the Pathways sidecar container.
88
+ """
89
+ yaml = """initContainers:
90
+ - name: remote-python-sidecar
91
+ image: {args.remote_python_sidecar_image}
92
+ imagePullPolicy: Always
93
+ securityContext:
94
+ privileged: true
95
+ volumeMounts:
96
+ - mountPath: /tmp # Shared volume mount with the main container.
97
+ name: shared-tmp
98
+ restartPolicy: Always
99
+ ports:
100
+ - containerPort: 50051
101
+ env:
102
+ - name: GRPC_SERVER_ADDRESS
103
+ value: '0.0.0.0:50051'"""
104
+ if args.use_pathways and args.remote_python_sidecar_image is not None:
105
+ return yaml.format(args=args)
106
+ else:
107
+ return ''
108
+
109
+
70
110
  def add_pw_resource_flavors(args):
71
111
  """Add resource flavors required for Pathways enabled clusters."""
72
112
  resource_flavor_yaml = """apiVersion: kueue.x-k8s.io/v1beta1
@@ -105,9 +145,9 @@ def add_pw_resources_to_kueue(args):
105
145
  - name: cpu-rm
106
146
  resources:
107
147
  - name: "cpu"
108
- nominalQuota: 80
148
+ nominalQuota: 480
109
149
  - name: "memory"
110
- nominalQuota: 160G
150
+ nominalQuota: 2000G
111
151
  - name: cpu-proxy
112
152
  resources:
113
153
  - name: "cpu"
@@ -166,23 +206,17 @@ def ensure_pathways_workload_prerequisites(args, system) -> bool:
166
206
  # Set the job which determines the life of other Pathways jobs
167
207
  args.targetReplicatedJob = 'proxy' if args.headless else 'main'
168
208
 
169
- # Always report user code failures back to JobSet.
170
- args.restart_on_user_code_failure = True
171
-
172
209
  return True
173
210
 
174
211
 
175
212
  def get_pathways_unified_query_link(args) -> str:
176
213
  """Get the unified query link for the pathways workload."""
177
- pw_suffixes = ['main', 'rm', 'proxy']
178
- pw_pod_names = [f'"{args.workload}-{suffix}-0"' for suffix in pw_suffixes]
179
- pw_pod_names_query = '%20OR%20'.join(pw_pod_names + ['worker-0-0'])
180
214
  query_params = (
181
215
  'resource.type%3D"k8s_container"%0A'
182
216
  f'resource.labels.project_id%3D"{args.project}"%0A'
183
217
  f'resource.labels.location%3D"{zone_to_region(args.zone)}"%0A'
184
218
  f'resource.labels.cluster_name%3D"{args.cluster}"%0A'
185
- f'resource.labels.pod_name:{pw_pod_names_query}%0A'
219
+ f'resource.labels.pod_name:"{args.workload}-"%0A'
186
220
  'severity>%3DDEFAULT'
187
221
  )
188
222
 
@@ -198,11 +232,13 @@ def get_pathways_rm_args(args, system: SystemCharacteristics) -> str:
198
232
  str: yaml containing arguments for the Pathways resource manager.
199
233
  """
200
234
  yaml = """- --server_port=29001
201
- - --gcs_scratch_location={args.pathways_gcs_location}
202
- - --node_type=resource_manager
203
- - --instance_count={instance_count}
204
- - --instance_type={instance_type}"""
235
+ - --gcs_scratch_location={args.pathways_gcs_location}
236
+ - --node_type=resource_manager
237
+ - --instance_count={instance_count}
238
+ - --instance_type={instance_type}"""
205
239
  if args.use_pathways:
240
+ if args.custom_pathways_server_args:
241
+ yaml = append_custom_pathways_args(yaml, args.custom_pathways_server_args)
206
242
  return yaml.format(
207
243
  args=args,
208
244
  instance_count=args.num_slices,
@@ -212,7 +248,34 @@ def get_pathways_rm_args(args, system: SystemCharacteristics) -> str:
212
248
  return ''
213
249
 
214
250
 
215
- def get_user_workload_for_pathways(args, system: SystemCharacteristics) -> str:
251
+ def append_custom_pathways_args(yaml, custom_args) -> str:
252
+ """Append custom Pathways args to the YAML with proper indentation.
253
+
254
+ Args:
255
+ yaml (string): existing yaml containing args
256
+
257
+ Returns:
258
+ yaml (string): yaml with additional args appended.
259
+ """
260
+ second_line = yaml.split('\n')[1]
261
+ if (
262
+ not second_line
263
+ ): # to cover edge case if only one arg remains, we would have to look at the entire YAML in this case.
264
+ return yaml
265
+ # Calculate the indentation based on the second line of existing YAML.
266
+ indentation = ' ' * (len(second_line) - len(second_line.lstrip()))
267
+ custom_args = custom_args.split(' ')
268
+ for arg in custom_args:
269
+ yaml += '\n' + indentation + '- ' + arg
270
+ return yaml
271
+
272
+
273
+ def get_user_workload_for_pathways(
274
+ args,
275
+ system: SystemCharacteristics,
276
+ pod_failure_policy,
277
+ storages: list[Storage],
278
+ ) -> str:
216
279
  """
217
280
  Create a user workload container for Pathways.
218
281
  Don't create one for Pathways headless mode.
@@ -227,32 +290,48 @@ def get_user_workload_for_pathways(args, system: SystemCharacteristics) -> str:
227
290
  Pathways server port as a YAML string
228
291
  """
229
292
  user_workload_yaml = """- name: main
230
- replicas: 1
231
- template:
232
- metadata:
233
- labels:
234
- xpk.google.com/workload: {args.workload}
235
- spec:
236
- backoffLimit: 0
237
- completions: 1
238
- parallelism: 1
239
- template:
240
- spec:
241
- containers:
293
+ replicas: 1
294
+ template:
295
+ metadata:
296
+ labels:
297
+ xpk.google.com/workload: {args.workload}
298
+ spec:
299
+ backoffLimit: 0
300
+ completions: 1
301
+ parallelism: 1
302
+ {pod_failure_policy}
303
+ template:
304
+ metadata:
305
+ annotations:
306
+ {gcs_fuse_annotation}
307
+ spec:
308
+ containers:
242
309
  {container}
243
- nodeSelector:
244
- cloud.google.com/gke-nodepool: cpu-user-np
245
- restartPolicy: OnFailure
246
- volumes:
247
- - hostPath:
248
- path: /tmp
249
- type: DirectoryOrCreate
250
- name: shared-tmp"""
310
+ serviceAccountName: {service_account}
311
+ nodeSelector:
312
+ cloud.google.com/gke-nodepool: cpu-user-np
313
+ hostNetwork: true
314
+ dnsPolicy: ClusterFirstWithHostNet
315
+ restartPolicy: Never
316
+ volumes:
317
+ - hostPath:
318
+ path: /tmp
319
+ type: DirectoryOrCreate
320
+ name: shared-tmp
321
+ {storage_volumes}"""
251
322
  if args.headless:
252
323
  return ''
253
324
  else:
254
325
  container, _ = get_user_workload_container(args, system)
255
- return user_workload_yaml.format(args=args, container=container)
326
+ storage_volumes = get_storage_volumes_yaml(storages)
327
+ return user_workload_yaml.format(
328
+ args=args,
329
+ container=container,
330
+ storage_volumes=storage_volumes,
331
+ pod_failure_policy=pod_failure_policy,
332
+ service_account=XPK_SA,
333
+ gcs_fuse_annotation=GCS_FUSE_ANNOTATION,
334
+ )
256
335
 
257
336
 
258
337
  def get_rm_address(args) -> str:
@@ -0,0 +1,15 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
@@ -0,0 +1,99 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from .remote_state_client import RemoteStateClient
18
+ from ...utils.gcs_utils import upload_directory_to_gcs, check_file_exists, download_bucket_to_dir, upload_file_to_gcs
19
+ from ...utils.console import xpk_print
20
+ from google.cloud.storage import Client
21
+ import os
22
+
23
+
24
+ class FuseStateClient(RemoteStateClient):
25
+ """FuseStateClient is a class for managing remote xpk state stored in GCS Fuse."""
26
+
27
+ def __init__(
28
+ self,
29
+ bucket: str,
30
+ state_directory: str,
31
+ cluster: str,
32
+ deployment_name: str,
33
+ prefix: str,
34
+ ) -> None:
35
+ self.bucket = bucket
36
+ self.state_dir = state_directory
37
+ self.storage_client = Client()
38
+ self.cluster = cluster
39
+ self.prefix = prefix
40
+ self.deployment_name = deployment_name
41
+
42
+ def _get_bucket_path(self) -> str:
43
+ return (
44
+ f'xpk_terraform_state/{self.prefix}/blueprints/{self.deployment_name}/'
45
+ )
46
+
47
+ def _get_bucket_path_blueprint(self) -> str:
48
+ return f'xpk_terraform_state/{self.prefix}/blueprints/'
49
+
50
+ def _get_deployment_filename(self) -> str:
51
+ return f'{self.deployment_name}.yaml'
52
+
53
+ def _get_blueprint_path(self) -> str:
54
+ blueprint_dir = '/'.join(self.state_dir.split('/')[:-1])
55
+ return os.path.join(blueprint_dir, self.deployment_name) + '.yaml'
56
+
57
+ def upload_state(self) -> None:
58
+ xpk_print(
59
+ f'Uploading dependecies from directory {self.state_dir} to bucket:'
60
+ f' {self.bucket}. Path within bucket is: {self._get_bucket_path()}'
61
+ )
62
+ upload_directory_to_gcs(
63
+ storage_client=self.storage_client,
64
+ bucket_name=self.bucket,
65
+ bucket_path=self._get_bucket_path(),
66
+ source_directory=self.state_dir,
67
+ )
68
+ blueprint_bucket_path = (
69
+ self._get_bucket_path_blueprint() + self._get_deployment_filename()
70
+ )
71
+ xpk_print(
72
+ f'Uploading blueprint file: {self._get_blueprint_path()} to bucket'
73
+ f' {self.bucket}. Path within bucket is: {blueprint_bucket_path}'
74
+ )
75
+ upload_file_to_gcs(
76
+ storage_client=self.storage_client,
77
+ bucket_name=self.bucket,
78
+ bucket_path=blueprint_bucket_path,
79
+ file=self._get_blueprint_path(),
80
+ )
81
+
82
+ def download_state(self) -> None:
83
+ xpk_print(
84
+ f'Downloading from bucket: {self.bucket}, from path:'
85
+ f' {self._get_bucket_path()} to directory: {self.state_dir}'
86
+ )
87
+ download_bucket_to_dir(
88
+ self.storage_client,
89
+ self.bucket,
90
+ self._get_bucket_path(),
91
+ destination_directory=self.state_dir,
92
+ )
93
+
94
+ def check_remote_state_exists(self) -> bool:
95
+ return check_file_exists(
96
+ self.storage_client,
97
+ self.bucket,
98
+ self._get_bucket_path_blueprint() + self._get_deployment_filename(),
99
+ )
@@ -0,0 +1,38 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from abc import ABC, abstractmethod
18
+
19
+
20
+ class RemoteStateClient(ABC):
21
 + """This is a base class that defines the methods required for managing remote cluster state.
22
+ Args:
23
 + ABC: abstract base class marker; subclasses must implement the @abstractmethod members.
24
+ """
25
+
26
+ @abstractmethod
27
+ def upload_state(self) -> None:
28
+ """Upload state to remote storage"""
29
+ return None
30
+
31
+ @abstractmethod
32
+ def download_state(self) -> None:
33
+ """Download state from remote storage"""
34
+ return None
35
+
36
+ @abstractmethod
37
+ def check_remote_state_exists(self) -> bool:
38
+ return False
xpk/core/resources.py ADDED
@@ -0,0 +1,238 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from dataclasses import dataclass
18
+
19
+ from ..utils.console import xpk_print
20
+ from ..utils.file import write_tmp_file
21
+ from .capacity import (
22
+ AUTOPROVISIONING_CONFIG_MAXIMUM_KEY,
23
+ AUTOPROVISIONING_CONFIG_MINIMUM_KEY,
24
+ AUTOPROVISIONING_CONFIG_VALUE,
25
+ CAPACITY_TYPE_CONFIG_KEY,
26
+ RESERVATION_CONFIG_KEY,
27
+ CapacityType,
28
+ get_capacity_type,
29
+ )
30
+ from .commands import run_command_for_value, run_commands
31
+ from .config import XPK_CURRENT_VERSION
32
+ from .system_characteristics import AcceleratorType, get_system_characteristics_by_device_type, SystemCharacteristics
33
+
34
+ CLUSTER_RESOURCES_CONFIGMAP = 'resources-configmap'
35
+ CLUSTER_METADATA_CONFIGMAP = 'metadata-configmap'
36
+
37
+ CLUSTER_CONFIGMAP_YAML = """kind: ConfigMap
38
+ apiVersion: v1
39
+ metadata:
40
+ name: {name}
41
+ data:
42
+ {data}
43
+ """
44
+
45
+
46
+ @dataclass
47
+ class AutoprovisioningConfig:
48
+ config_filename: str
49
+ minimum_chips: int
50
+ maximum_chips: int
51
+
52
+
53
+ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
54
+ """Run the Get GKE Cluster ConfigMap request.
55
+
56
+ Args:
57
+ args: user provided arguments for running the command.
58
+ configmap_name: name of the configmap.
59
+
60
+ Returns:
61
+ key:value pairs stored in cluster ConfigMap.
62
+ """
63
+ command = (
64
+ 'kubectl get configmap'
65
+ f' {configmap_name} -o=custom-columns="ConfigData:data" --no-headers=true'
66
+ )
67
+
68
+ return_code, return_value = run_command_for_value(
69
+ command, 'GKE Cluster Get ConfigMap', args
70
+ )
71
+ if return_code != 0:
72
+ xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}')
73
+ return None
74
+
75
+ config_map = {}
76
+ return_value = return_value.strip()
77
+
78
+ if return_value:
79
+ # Format of ConfigMap: map[key1:value1 key2:value2]
80
+ return_value = return_value[return_value.index('map') :]
81
+ configs = return_value[4:-1].split(' ')
82
+
83
+ for config in configs:
84
+ key, value = config.strip().split(':')
85
+ config_map[key] = value
86
+ return config_map
87
+
88
+
89
+ def create_cluster_configmaps(
90
+ args,
91
+ system,
92
+ tensorboard_config: dict,
93
+ autoprovisioning_config: AutoprovisioningConfig | None,
94
+ ) -> int:
95
+ """Run the Create GKE Cluster ConfigMap request.
96
+
97
+ Args:
98
+ args: user provided arguments for running the command.
99
+ system: system characteristics.
100
+ tensorboard_config: map that contains Vertex Tensorboard name, id and location
101
+ autoprovisioning_config: Config used in autoprovisioning.
102
+ Returns:
103
+ 0 if successful and 1 otherwise.
104
+ """
105
+ configmap_yml = {}
106
+
107
+ # ConfigMap to store resources available in the cluster.
108
+ device_type = system.device_type
109
+ if system.accelerator_type == AcceleratorType['GPU']:
110
+ resources_data = f'{device_type}: "{int(args.num_nodes)}"'
111
+ elif (
112
+ not args.enable_pathways
113
+ and args.enable_autoprovisioning
114
+ and autoprovisioning_config
115
+ ):
116
+ # Currently autoprovisioning is not supported with Pathways.
117
+ # Auto provisioning will have variable topologies for a gke accelerator type.
118
+ resources_data = (
119
+ f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
120
+ )
121
+ resources_data += (
122
+ f'\n {AUTOPROVISIONING_CONFIG_MINIMUM_KEY}:'
123
+ f' "{autoprovisioning_config.minimum_chips}"'
124
+ )
125
+ resources_data += (
126
+ f'\n {AUTOPROVISIONING_CONFIG_MAXIMUM_KEY}:'
127
+ f' "{autoprovisioning_config.maximum_chips}"'
128
+ )
129
+ else:
130
+ resources_data = (
131
+ f'{device_type}: "{int(args.num_slices) * system.vms_per_slice}"'
132
+ )
133
+ resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
134
+ resources_yml = CLUSTER_CONFIGMAP_YAML.format(
135
+ args=args, name=resources_configmap_name, data=resources_data
136
+ )
137
+ configmap_yml[resources_configmap_name] = resources_yml
138
+
139
+ # ConfigMap to store cluster metadata.
140
+ # XPK Version.
141
+ metadata = f'xpk_version: {XPK_CURRENT_VERSION}'
142
+ # Vertex Tensorboard information
143
+ for key, value in tensorboard_config.items():
144
+ metadata += f'\n {key}: "{value}"'
145
+ # Capacity Type.
146
+ capacity_type, return_code = get_capacity_type(args)
147
+ if return_code != 0:
148
+ xpk_print('Unable to determine capacity type.')
149
+ return return_code
150
+ metadata += f'\n {CAPACITY_TYPE_CONFIG_KEY}: {capacity_type.name}'
151
+ # Reservation ID if applicable.
152
+ if capacity_type == CapacityType.RESERVATION:
153
+ metadata += f'\n {RESERVATION_CONFIG_KEY}: {args.reservation}'
154
+ metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
155
+ metadata_yml = CLUSTER_CONFIGMAP_YAML.format(
156
+ args=args, name=metadata_configmap_name, data=metadata
157
+ )
158
+ configmap_yml[metadata_configmap_name] = metadata_yml
159
+ return create_or_update_cluster_configmap(configmap_yml)
160
+
161
+
162
+ def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
163
+ """
164
+ Args:
165
+ configmap_yml: dict containing ConfigMap name and yml string.
166
+
167
+ Returns:
168
+ 0 if successful, 1 otherwise.
169
+ """
170
+ commands = []
171
+ task_names = []
172
+ for configmap_name, yml_string in configmap_yml.items():
173
+ tmp = write_tmp_file(yml_string)
174
+ command = f'kubectl apply -f {str(tmp.file.name)}'
175
+ commands.append(command)
176
+ task_name = f'ConfigMap CreateOrUpdate-{configmap_name}'
177
+ task_names.append(task_name)
178
+
179
+ return_code = run_commands(
180
+ commands, 'GKE Cluster CreateOrUpdate ConfigMap(s)', task_names
181
+ )
182
+ if return_code != 0:
183
+ xpk_print(
184
+ 'GKE Cluster Create/Update ConfigMap(s) request returned ERROR'
185
+ f' {return_code}'
186
+ )
187
+ return 1
188
+ return 0
189
+
190
+
191
+ def check_cluster_resources(args, system) -> tuple[bool, bool]:
192
+ """Check if cluster has resources of a specified device_type/gke_accelerator.
193
+ This check will be skipped if <args.cluster>-<_CLUSTER_RESOURCES_CONFIGMAP> ConfigMap doesn't exist for the cluster.
194
+
195
+ Args:
196
+ args: user provided arguments for running the command.
197
+ system: system characteristics.
198
+
199
+ Returns:
200
+ Tuple of bool, bool
201
+ True if resources in the cluster should be checked, False otherwise.
202
+ True if device_type/gke_accelerator exists in the cluster, False otherwise.
203
+ """
204
+ resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
205
+ resources_config_map = get_cluster_configmap(args, resources_configmap_name)
206
+ if resources_config_map is None:
207
+ xpk_print(
208
+ f'No ConfigMap exist for cluster with the name {resources_config_map}.'
209
+ ' Cluster resources check will be skipped.'
210
+ )
211
+ return False, False
212
+ if system.device_type in resources_config_map:
213
+ return True, True
214
+ elif system.gke_accelerator in resources_config_map:
215
+ return True, True
216
+ return True, False
217
+
218
+
219
+ def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
220
 + """Get SystemCharacteristics based on the cluster resources configMap
221
+ Args:
222
+ args: user provided arguments for running the command.
223
+
224
+ Returns:
225
+ returns system characteristics
226
+ """
227
+ resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
228
+ cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
229
+
230
+ if cluster_config_map is None:
231
+ return None
232
+
233
+ for key in cluster_config_map:
234
+ system, result_code = get_system_characteristics_by_device_type(key)
235
+ if result_code == 0:
236
+ return system
237
+
238
+ return None