xpk 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. xpk/api/__init__.py +15 -0
  2. xpk/api/storage_crd.yaml +52 -0
  3. xpk/commands/batch.py +27 -5
  4. xpk/commands/cluster.py +104 -80
  5. xpk/commands/cluster_gcluster.py +94 -10
  6. xpk/commands/common.py +44 -0
  7. xpk/commands/config.py +29 -0
  8. xpk/commands/info.py +8 -10
  9. xpk/commands/inspector.py +5 -11
  10. xpk/commands/job.py +9 -7
  11. xpk/commands/kind.py +34 -4
  12. xpk/commands/kjob_common.py +44 -0
  13. xpk/commands/run.py +128 -0
  14. xpk/commands/shell.py +27 -7
  15. xpk/commands/storage.py +267 -0
  16. xpk/commands/version.py +6 -18
  17. xpk/commands/workload.py +381 -184
  18. xpk/core/blueprint/blueprint_definitions.py +1 -0
  19. xpk/core/blueprint/blueprint_generator.py +132 -76
  20. xpk/core/capacity.py +185 -0
  21. xpk/core/cluster.py +564 -0
  22. xpk/core/cluster_private.py +6 -3
  23. xpk/core/commands.py +18 -14
  24. xpk/core/config.py +179 -0
  25. xpk/core/docker_container.py +225 -0
  26. xpk/core/docker_image.py +210 -0
  27. xpk/core/docker_resources.py +350 -0
  28. xpk/core/filestore.py +251 -0
  29. xpk/core/gcloud_context.py +196 -0
  30. xpk/core/gcluster_manager.py +20 -2
  31. xpk/core/gcsfuse.py +50 -0
  32. xpk/core/kjob.py +257 -18
  33. xpk/core/kueue.py +12 -6
  34. xpk/core/monitoring.py +134 -0
  35. xpk/core/nap.py +32 -20
  36. xpk/core/network.py +377 -0
  37. xpk/core/nodepool.py +581 -0
  38. xpk/core/pathways.py +124 -45
  39. xpk/core/remote_state/__init__.py +15 -0
  40. xpk/core/remote_state/fuse_remote_state.py +99 -0
  41. xpk/core/remote_state/remote_state_client.py +38 -0
  42. xpk/core/resources.py +238 -0
  43. xpk/core/scheduling.py +253 -0
  44. xpk/core/storage.py +581 -0
  45. xpk/core/system_characteristics.py +38 -1
  46. xpk/core/vertex.py +105 -0
  47. xpk/core/workload.py +209 -1
  48. xpk/core/workload_decorators/rdma_decorator.py +25 -5
  49. xpk/core/workload_decorators/storage_decorator.py +52 -0
  50. xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
  51. xpk/main.py +3 -1
  52. xpk/parser/batch.py +10 -151
  53. xpk/parser/cluster.py +49 -8
  54. xpk/parser/common.py +189 -1
  55. xpk/parser/config.py +49 -0
  56. xpk/parser/core.py +27 -1
  57. xpk/parser/info.py +2 -1
  58. xpk/parser/inspector.py +3 -3
  59. xpk/parser/job.py +25 -4
  60. xpk/parser/kind.py +3 -2
  61. xpk/parser/run.py +47 -0
  62. xpk/parser/shell.py +10 -1
  63. xpk/parser/storage.py +316 -0
  64. xpk/parser/validators.py +3 -3
  65. xpk/parser/workload.py +118 -76
  66. xpk/templates/__init__.py +15 -0
  67. xpk/templates/storage.yaml +13 -0
  68. xpk/utils/gcs_utils.py +125 -0
  69. xpk/utils/kubectl.py +57 -0
  70. xpk/utils/objects.py +8 -5
  71. xpk/utils/templates.py +28 -0
  72. xpk/utils/validation.py +80 -0
  73. {xpk-0.6.0.dist-info → xpk-0.7.0.dist-info}/METADATA +165 -14
  74. xpk-0.7.0.dist-info/RECORD +92 -0
  75. {xpk-0.6.0.dist-info → xpk-0.7.0.dist-info}/WHEEL +1 -1
  76. xpk/core/core.py +0 -2824
  77. xpk-0.6.0.dist-info/RECORD +0 -57
  78. {xpk-0.6.0.dist-info → xpk-0.7.0.dist-info}/LICENSE +0 -0
  79. {xpk-0.6.0.dist-info → xpk-0.7.0.dist-info}/entry_points.txt +0 -0
  80. {xpk-0.6.0.dist-info → xpk-0.7.0.dist-info}/top_level.txt +0 -0
@@ -55,6 +55,7 @@ class Blueprint:
55
55
  """A class to represent Cluster Toolkit blueprint"""
56
56
 
57
57
  deployment_groups: list[DeploymentGroup]
58
+ terraform_backend_defaults: Optional[dict]
58
59
  blueprint_name: Optional[str]
59
60
  toolkit_modules_url: str
60
61
  toolkit_modules_version: str
@@ -14,21 +14,22 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
+ import os
17
18
  import shutil
18
19
  from typing import Optional
20
+
19
21
  from ruamel import yaml
20
- import os
21
22
 
22
- from .blueprint_definitions import DeploymentGroup, DeploymentModule, Blueprint
23
- from ..system_characteristics import get_system_characteristics_by_device_type
24
- from ...utils.console import xpk_print, xpk_exit
23
+ from ...utils.console import xpk_exit, xpk_print
25
24
  from ...utils.file import ensure_directory_exists
26
- from ..core import CapacityType, h100_mega_device_type, h200_device_type
25
+ from ..capacity import H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE, CapacityType
26
+ from ..system_characteristics import get_system_characteristics_by_device_type
27
+ from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule
27
28
 
28
29
  yaml = yaml.YAML()
29
30
 
30
- a3mega_device_type = h100_mega_device_type
31
- a3ultra_device_type = h200_device_type
31
+ a3mega_device_type = H100_MEGA_DEVICE_TYPE
32
+ a3ultra_device_type = H200_DEVICE_TYPE
32
33
  supported_device_types = {a3mega_device_type, a3ultra_device_type}
33
34
  blueprint_dependencies_dir = {
34
35
  a3mega_device_type: "src/xpk/blueprints/a3mega",
@@ -39,6 +40,16 @@ cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
39
40
  cluster_toolkit_version = "v1.45.1"
40
41
 
41
42
 
43
def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
  """Build the list of GPU subnetwork names for an A3 Mega cluster.

  Args:
    cluster_name: cluster name used as the prefix of every subnetwork name.

  Returns:
    Eight names of the form `<cluster_name>-gpunet-<i>-subnet`, i = 0..7.
  """
  # NOTE(review): presumably these names must match the networks created by
  # the A3 Mega blueprint - confirm before changing the pattern or the count.
  subnetworks = []
  for index in range(8):
    subnetworks.append(f"{cluster_name}-gpunet-{index}-subnet")
  return subnetworks
45
+
46
+
47
def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
  """Build the list of subnetwork names for an A3 Ultra cluster.

  Args:
    cluster_name: cluster name used as the prefix of every subnetwork name.

  Returns:
    `<cluster_name>-sub-1` followed by eight RDMA subnetworks named
    `<cluster_name>-rdma-sub-<i>`, i = 0..7 (nine names in total).
  """
  # NOTE(review): presumably these names must match the networks created by
  # the A3 Ultra blueprint - confirm before changing the pattern or the count.
  subnetworks = [f"{cluster_name}-sub-1"]
  subnetworks.extend(
      f"{cluster_name}-rdma-sub-{index}" for index in range(8)
  )
  return subnetworks
51
+
52
+
42
53
  class BlueprintGeneratorOutput:
43
54
  """BlueprintGeneratorOutput is a class containing fields with output blueprint file path and path to blueprint dependencies.
44
55
  Attributes:
@@ -79,6 +90,7 @@ class BlueprintGenerator:
79
90
  group_placement_max_distance: int = 2,
80
91
  subnetwork_cidr_suffix: int = 24,
81
92
  reservation: str | None = None,
93
+ gcs_bucket: Optional[str | None] = None,
82
94
  capacity_type: CapacityType = CapacityType.ON_DEMAND,
83
95
  system_node_pool_min_node_count: int = 2,
84
96
  ) -> BlueprintGeneratorOutput:
@@ -132,6 +144,8 @@ class BlueprintGenerator:
132
144
  "prefix_with_deployment_name": False,
133
145
  "name_suffix": cluster_name,
134
146
  "enable_private_endpoint": False,
147
+ "enable_gcsfuse_csi": True,
148
+ "enable_filestore_csi": True,
135
149
  "master_authorized_networks": [{
136
150
  "cidr_block": (
137
151
  f"{auth_cidr}"
@@ -156,18 +170,6 @@ class BlueprintGenerator:
156
170
  },
157
171
  )
158
172
 
159
- reservation_affinity = (
160
- {
161
- "consume_reservation_type": "NO_RESERVATION",
162
- "specific_reservations": [],
163
- }
164
- if reservation is None
165
- else {
166
- "consume_reservation_type": "SPECIFIC_RESERVATION",
167
- "specific_reservations": [{"name": reservation}],
168
- }
169
- )
170
-
171
173
  a3_megagpu_pool_0 = DeploymentModule(
172
174
  id="a3_megagpu_pool_0",
173
175
  source="modules/compute/gke-node-pool",
@@ -178,7 +180,9 @@ class BlueprintGenerator:
178
180
  "static_node_count": num_nodes,
179
181
  "zones": [zone],
180
182
  "host_maintenance_interval": "PERIODIC",
181
- "reservation_affinity": reservation_affinity,
183
+ "reservation_affinity": self._getblock_reservation_affinity(
184
+ reservation
185
+ ),
182
186
  "run_workload_script": False,
183
187
  "spot": capacity_type == CapacityType.SPOT,
184
188
  "max_pods_per_node": 32,
@@ -199,6 +203,9 @@ class BlueprintGenerator:
199
203
  "config_template_vars": {"num_chips": f"{num_chips}"},
200
204
  },
201
205
  "jobset": {"install": True, "version": "v0.7.2"},
206
+ "apply_manifests": [{
207
+ "source": f'$(ghpc_stage("{blueprint_name}"))/storage_crd.yaml'
208
+ }],
202
209
  },
203
210
  )
204
211
 
@@ -235,7 +242,10 @@ class BlueprintGenerator:
235
242
  workload_configmap,
236
243
  ],
237
244
  )
238
- xpk_blueprint = Blueprint(
245
+ a3_mega_blueprint = Blueprint(
246
+ terraform_backend_defaults=self._getblock_terraform_backend(
247
+ gcs_bucket, prefix
248
+ ),
239
249
  blueprint_name=blueprint_name,
240
250
  toolkit_modules_url=cluster_toolkit_url,
241
251
  toolkit_modules_version=cluster_toolkit_version,
@@ -247,8 +257,9 @@ class BlueprintGenerator:
247
257
  "zone": zone,
248
258
  },
249
259
  )
260
+
250
261
  blueprint_file_path = self._save_blueprint_to_file(
251
- blueprint_name, xpk_blueprint, prefix
262
+ blueprint_name, a3_mega_blueprint, prefix
252
263
  )
253
264
  blueprint_dependencies = self._get_a3_mega_blueprint_dependencies(
254
265
  blueprint_name, prefix
@@ -271,6 +282,7 @@ class BlueprintGenerator:
271
282
  region: str,
272
283
  auth_cidr: str,
273
284
  prefix: str = "",
285
+ gcs_bucket: Optional[str | None] = None,
274
286
  ) -> BlueprintGeneratorOutput:
275
287
  """Create a simple gke cluster
276
288
 
@@ -318,6 +330,9 @@ class BlueprintGenerator:
318
330
  modules=[network1, gke_cluster],
319
331
  )
320
332
  ml_gke = Blueprint(
333
+ terraform_backend_defaults=self._getblock_terraform_backend(
334
+ gcs_bucket, prefix
335
+ ),
321
336
  blueprint_name=blueprint_name,
322
337
  toolkit_modules_url=cluster_toolkit_url,
323
338
  toolkit_modules_version=cluster_toolkit_version,
@@ -328,6 +343,7 @@ class BlueprintGenerator:
328
343
  "region": region,
329
344
  },
330
345
  )
346
+
331
347
  blueprint_file_path = self._save_blueprint_to_file(
332
348
  blueprint_name, ml_gke, prefix
333
349
  )
@@ -337,55 +353,6 @@ class BlueprintGenerator:
337
353
  blueprint_dependencies=blueprint_dependencies,
338
354
  )
339
355
 
340
- def _save_blueprint_to_file(
341
- self, blueprint_name: str, xpk_blueprint: Blueprint, prefix: str = ""
342
- ) -> str:
343
- blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
344
- with open(blueprint_path, "w+", encoding="utf-8") as blueprint_file:
345
- yaml.dump(xpk_blueprint, blueprint_file)
346
- return blueprint_path
347
-
348
- def _get_blueprint_path(self, blueprint_name, prefix: str = ""):
349
- blueprint_path = os.path.join(
350
- self._get_storage_path(prefix), f"{blueprint_name}.yaml"
351
- )
352
- return blueprint_path
353
-
354
- def _get_storage_path(self, prefix):
355
- storage_path_with_prefix = os.path.join(self.storage_path, prefix)
356
- ensure_directory_exists(storage_path_with_prefix)
357
- return storage_path_with_prefix
358
-
359
- def blueprint_exists(self, blueprint_name, prefix: str = ""):
360
- blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
361
- return os.path.exists(blueprint_path)
362
-
363
- def _get_a3_mega_blueprint_dependencies(
364
- self, blueprint_name: str, prefix: str = ""
365
- ) -> str:
366
- deployment_files_path = os.path.join(
367
- self._get_storage_path(prefix), blueprint_name
368
- )
369
- shutil.copytree(
370
- blueprint_dependencies_dir[a3mega_device_type],
371
- deployment_files_path,
372
- dirs_exist_ok=True,
373
- )
374
- return deployment_files_path
375
-
376
- def _get_a3_ultra_blueprint_dependencies(
377
- self, blueprint_name: str, prefix: str = ""
378
- ) -> str:
379
- deployment_files_path = os.path.join(
380
- self._get_storage_path(prefix), blueprint_name
381
- )
382
- shutil.copytree(
383
- blueprint_dependencies_dir[a3ultra_device_type],
384
- deployment_files_path,
385
- dirs_exist_ok=True,
386
- )
387
- return deployment_files_path
388
-
389
356
  def generate_a3_ultra_blueprint(
390
357
  self,
391
358
  project_id: str,
@@ -396,7 +363,9 @@ class BlueprintGenerator:
396
363
  auth_cidr: str,
397
364
  system_node_pool_machine_type: str,
398
365
  reservation: Optional[str | None] = None,
366
+ gcs_bucket: Optional[str | None] = None,
399
367
  num_nodes: int = 2,
368
+ enable_filestore_csi_driver=True,
400
369
  prefix: str = "",
401
370
  mtu_size: int = 8896,
402
371
  system_node_pool_min_node_count: int = 2,
@@ -501,6 +470,7 @@ class BlueprintGenerator:
501
470
  "system_node_pool_machine_type": system_node_pool_machine_type,
502
471
  "enable_dcgm_monitoring": True,
503
472
  "enable_gcsfuse_csi": True,
473
+ "enable_filestore_csi": enable_filestore_csi_driver,
504
474
  "enable_private_endpoint": False,
505
475
  "master_authorized_networks": [{
506
476
  "cidr_block": auth_cidr,
@@ -540,6 +510,9 @@ class BlueprintGenerator:
540
510
  "zones": [zone],
541
511
  "static_node_count": num_nodes,
542
512
  "spot": capacity_type == CapacityType.SPOT,
513
+ "reservation_affinity": self._getblock_reservation_affinity(
514
+ reservation
515
+ ),
543
516
  "max_pods_per_node": 32,
544
517
  "guest_accelerator": [{
545
518
  "type": "nvidia-h200-141gb",
@@ -561,11 +534,6 @@ class BlueprintGenerator:
561
534
  },
562
535
  outputs=["instructions"],
563
536
  )
564
- if reservation is not None:
565
- gpu_pool.settings["reservation_affinity"] = {
566
- "consume_reservation_type": "SPECIFIC_RESERVATION",
567
- "specific_reservations": [{"name": reservation}],
568
- }
569
537
 
570
538
  num_chips = num_nodes * system.chips_per_vm
571
539
  workload_manager_install_id = "workload-manager-install"
@@ -584,6 +552,11 @@ class BlueprintGenerator:
584
552
  "apply_manifests": [
585
553
  {"source": nccl_installer_path},
586
554
  {"source": mlgru_disable_path},
555
+ {
556
+ "source": (
557
+ f'$(ghpc_stage("{blueprint_name}"))/storage_crd.yaml'
558
+ )
559
+ },
587
560
  ],
588
561
  },
589
562
  )
@@ -623,6 +596,9 @@ class BlueprintGenerator:
623
596
  ],
624
597
  )
625
598
  a3_ultra_blueprint = Blueprint(
599
+ terraform_backend_defaults=self._getblock_terraform_backend(
600
+ gcs_bucket, prefix
601
+ ),
626
602
  blueprint_name=blueprint_name,
627
603
  toolkit_modules_url=cluster_toolkit_url,
628
604
  toolkit_modules_version=cluster_toolkit_version,
@@ -646,6 +622,86 @@ class BlueprintGenerator:
646
622
  blueprint_dependencies=blueprint_dependencies,
647
623
  )
648
624
 
625
+ def _getblock_reservation_affinity(
626
+ self, reservation: str | None = None
627
+ ) -> dict:
628
+ return (
629
+ {
630
+ "consume_reservation_type": "NO_RESERVATION",
631
+ "specific_reservations": [],
632
+ }
633
+ if reservation is None
634
+ else {
635
+ "consume_reservation_type": "SPECIFIC_RESERVATION",
636
+ "specific_reservations": [{"name": reservation}],
637
+ }
638
+ )
639
+
640
+ def _getblock_terraform_backend(
641
+ self, gcs_bucket: str, prefix: str = ""
642
+ ) -> dict | None:
643
+ if gcs_bucket is None:
644
+ return None
645
+ return {
646
+ "type": "gcs",
647
+ "configuration": {
648
+ "bucket": gcs_bucket,
649
+ "prefix": self._get_terraforrm_backend_full_prefix(prefix),
650
+ },
651
+ }
652
+
653
+ def _get_terraforrm_backend_full_prefix(self, prefix: str = "") -> str:
654
+ return f"xpk_terraform_state/{prefix}/tfstate/"
655
+
656
def _save_blueprint_to_file(
    self, blueprint_name: str, xpk_blueprint: Blueprint, prefix: str = ""
) -> str:
  """Write a blueprint to its YAML file and return the file path.

  Args:
    blueprint_name: name of the blueprint; determines the file name.
    xpk_blueprint: blueprint object to serialize.
    prefix: sub-directory of the storage path to write into.

  Returns:
    Path of the written file.
  """
  target_path = self._get_blueprint_path(blueprint_name, prefix)
  # `yaml` is the module-level ruamel YAML instance, not the yaml module.
  with open(target_path, "w+", encoding="utf-8") as output_file:
    yaml.dump(xpk_blueprint, output_file)
  return target_path
663
+
664
def _get_blueprint_path(self, blueprint_name, prefix: str = ""):
  """Return the path of the YAML file for the given blueprint.

  Args:
    blueprint_name: name of the blueprint (file name without `.yaml`).
    prefix: sub-directory of the storage path that holds the file.

  Returns:
    `<storage path>/<prefix>/<blueprint_name>.yaml`.
  """
  # Resolving the storage directory also makes sure it exists.
  storage_dir = self._get_storage_path(prefix)
  return os.path.join(storage_dir, f"{blueprint_name}.yaml")
669
+
670
def _get_storage_path(self, prefix):
  """Return the storage directory for `prefix`, ensuring it exists.

  Args:
    prefix: sub-directory name joined onto `self.storage_path`.

  Returns:
    The joined directory path.
  """
  prefixed_dir = os.path.join(self.storage_path, prefix)
  ensure_directory_exists(prefixed_dir)
  return prefixed_dir
674
+
675
def blueprint_exists(self, blueprint_name, prefix: str = ""):
  """Tell whether the YAML file for the given blueprint is already on disk.

  Args:
    blueprint_name: name of the blueprint to look for.
    prefix: sub-directory of the storage path that would hold the file.

  Returns:
    True if the blueprint file exists, False otherwise.
  """
  return os.path.exists(self._get_blueprint_path(blueprint_name, prefix))
678
+
679
def _get_a3_mega_blueprint_dependencies(
    self, blueprint_name: str, prefix: str = ""
) -> str:
  """Copy the A3 Mega blueprint dependency files into the storage directory.

  Args:
    blueprint_name: name of the blueprint; used as the target directory name.
    prefix: sub-directory of the storage path to copy into.

  Returns:
    Path of the directory the dependencies were copied to.
  """
  destination_dir = os.path.join(
      self._get_storage_path(prefix), blueprint_name
  )
  source_dir = blueprint_dependencies_dir[a3mega_device_type]
  # dirs_exist_ok lets repeated generation copy over an existing directory.
  shutil.copytree(source_dir, destination_dir, dirs_exist_ok=True)
  return destination_dir
691
+
692
def _get_a3_ultra_blueprint_dependencies(
    self, blueprint_name: str, prefix: str = ""
) -> str:
  """Copy the A3 Ultra blueprint dependency files into the storage directory.

  Args:
    blueprint_name: name of the blueprint; used as the target directory name.
    prefix: sub-directory of the storage path to copy into.

  Returns:
    Path of the directory the dependencies were copied to.
  """
  destination_dir = os.path.join(
      self._get_storage_path(prefix), blueprint_name
  )
  source_dir = blueprint_dependencies_dir[a3ultra_device_type]
  # dirs_exist_ok lets repeated generation copy over an existing directory.
  shutil.copytree(source_dir, destination_dir, dirs_exist_ok=True)
  return destination_dir
704
+
649
705
 
650
706
  yaml.register_class(Blueprint)
651
707
  yaml.register_class(DeploymentGroup)
xpk/core/capacity.py ADDED
@@ -0,0 +1,185 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import enum
18
+
19
+ from ..utils.console import xpk_print
20
+ from .commands import run_command_with_updates
21
+
22
# Configuration keys and values related to capacity.
# NOTE(review): presumably these are keys/values of the cluster resources
# config map - confirm against the callers.
CAPACITY_TYPE_CONFIG_KEY = 'capacity_type'
RESERVATION_CONFIG_KEY = 'reservation_id'
AUTOPROVISIONING_CONFIG_VALUE = 'AUTOPROVISION'
AUTOPROVISIONING_CONFIG_MINIMUM_KEY = 'minimum_chips'
AUTOPROVISIONING_CONFIG_MAXIMUM_KEY = 'maximum_chips'

# Device type identifiers for the supported GPU machine families.
H100_DEVICE_TYPE = 'h100-80gb-8'
H100_MEGA_DEVICE_TYPE = 'h100-mega-80gb-8'
H200_DEVICE_TYPE = 'h200-141gb-8'
31
+
32
+
33
class CapacityType(enum.Enum):
  """Kinds of capacity a user can request.

  UNKNOWN is the value used when no capacity flag was supplied.
  """

  ON_DEMAND = 'on_demand'
  RESERVATION = 'reservation'
  SPOT = 'spot'
  UNKNOWN = 'unknown'
38
+
39
+
40
def print_reservations(args) -> int:
  """List the reservations of the project via gcloud.

  Args:
    args: user provided arguments for running the command; `args.project`
      selects the project.

  Returns:
    0 if the gcloud command succeeded and 1 otherwise.
  """
  list_command = (
      f'gcloud beta compute reservations list --project={args.project}'
  )
  exit_status = run_command_with_updates(
      list_command, 'Get all reservations in the project', args
  )
  if exit_status == 0:
    return 0
  xpk_print(f'Get all reservations returned ERROR {exit_status}')
  return 1
57
+
58
+
59
def get_capacity_type(args) -> tuple[CapacityType, int]:
  """Determine the capacity type based on user arguments.

  Args:
    args: user provided arguments for running the command. Reads
      `args.on_demand`, `args.reservation` and `args.spot`.

  Returns:
    Tuple of the selected CapacityType and an int that is 0 if successful
    and nonzero otherwise. The type is CapacityType.UNKNOWN when none of the
    capacity arguments was given. On failure the returned type is whatever
    had been selected before the error and should not be relied upon.
  """
  capacity_type = CapacityType.UNKNOWN
  num_types = 0

  # Determine the capacity argument.
  if args.on_demand:
    capacity_type = CapacityType.ON_DEMAND
    num_types += 1
  if args.reservation:
    # A reservation that cannot be described aborts immediately.
    return_code = verify_reservation_exists(args)
    if return_code > 0:
      return capacity_type, return_code
    capacity_type = CapacityType.RESERVATION
    num_types += 1
  if args.spot:
    capacity_type = CapacityType.SPOT
    num_types += 1

  # At most one capacity argument may be given; zero leaves UNKNOWN in place.
  if num_types > 1:
    xpk_print(
        'ERROR: User specified more than one of the following arguments. Please'
        ' specify only one of `--reservation=$RESERVATION_NAME`, `--on-demand`'
        ' or `--spot`.'
    )
    return capacity_type, 1

  return capacity_type, 0
99
+
100
+
101
def verify_reservation_exists(args) -> int:
  """Check that the requested reservation can be described via gcloud.

  Args:
    args: user provided arguments for running the command; reads
      `args.reservation`, `args.project` and `args.zone`.

  Returns:
    0 if the reservation was described successfully and 1 otherwise.
  """
  describe_command = (
      f'gcloud beta compute reservations describe {args.reservation}'
      f' --project={args.project} --zone={args.zone}'
  )
  exit_status = run_command_with_updates(
      describe_command, 'Describe reservation', args
  )
  if exit_status == 0:
    return 0
  xpk_print(f'Describe reservation returned ERROR {exit_status}')
  xpk_print('Please confirm that your reservation name is correct.')
  return 1
120
+
121
+
122
def get_capacity_arguments_from_capacity_type(
    args, capacity_type: CapacityType
) -> tuple[str, int]:
  """Translate a capacity type into node pool creation arguments.

  Args:
    args: user provided arguments for running the command; `args.reservation`
      is read for reservation capacity.
    capacity_type: The type of capacity the user configured.

  Returns:
    Tuple of the capacity argument string (empty for on-demand capacity and
    on failure) and an int that is 0 if successful and 1 otherwise.
  """
  if capacity_type == CapacityType.ON_DEMAND:
    return '', 0
  if capacity_type == CapacityType.SPOT:
    return '--spot', 0
  if capacity_type == CapacityType.RESERVATION:
    return (
        f'--reservation-affinity=specific --reservation={args.reservation}',
        0,
    )

  # Anything else (including CapacityType.UNKNOWN) cannot be translated.
  xpk_print(
      f'Unknown capacity type: {capacity_type}. Unable to determine'
      ' capacity args.'
  )
  return '', 1
154
+
155
+
156
+ def get_capacity_node_selectors_from_capacity_type(
157
+ args, capacity_type: str
158
+ ) -> tuple[str, int]:
159
+ """Determine the node selectors for a workload to run on a specific capacity type.
160
+
161
+ Args:
162
+ args: user provided arguments for running the command.
163
+ capacity_type: The type of capacity the user configured.
164
+
165
+ Returns:
166
+ Tuple with string with the node selectors to use and
167
+ int of 0 if successful and 1 otherwise.
168
+ """
169
+ node_selector = ''
170
+ return_code = 0
171
+
172
+ match capacity_type:
173
+ case CapacityType.ON_DEMAND.name:
174
+ node_selector = ''
175
+ case CapacityType.SPOT.name:
176
+ node_selector = 'cloud.google.com/gke-spot="true"'
177
+ case CapacityType.RESERVATION.name:
178
+ node_selector = f'cloud.google.com/reservation-name: {args.reservation}'
179
+ case _:
180
+ xpk_print(
181
+ f'Unknown capacity type: {capacity_type}. Unable to determine the'
182
+ ' node selectors.'
183
+ )
184
+ return_code = 1
185
+ return node_selector, return_code