xpk 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/api/__init__.py +15 -0
- xpk/api/storage_crd.yaml +52 -0
- xpk/commands/batch.py +27 -5
- xpk/commands/cluster.py +104 -80
- xpk/commands/cluster_gcluster.py +94 -10
- xpk/commands/common.py +44 -0
- xpk/commands/config.py +29 -0
- xpk/commands/info.py +8 -10
- xpk/commands/inspector.py +5 -11
- xpk/commands/job.py +9 -7
- xpk/commands/kind.py +34 -4
- xpk/commands/kjob_common.py +44 -0
- xpk/commands/run.py +128 -0
- xpk/commands/shell.py +27 -7
- xpk/commands/storage.py +280 -0
- xpk/commands/version.py +6 -18
- xpk/commands/workload.py +381 -184
- xpk/core/blueprint/blueprint_definitions.py +1 -0
- xpk/core/blueprint/blueprint_generator.py +132 -76
- xpk/core/capacity.py +185 -0
- xpk/core/cluster.py +564 -0
- xpk/core/cluster_private.py +6 -3
- xpk/core/commands.py +18 -14
- xpk/core/config.py +179 -0
- xpk/core/docker_container.py +225 -0
- xpk/core/docker_image.py +210 -0
- xpk/core/docker_resources.py +350 -0
- xpk/core/filestore.py +251 -0
- xpk/core/gcloud_context.py +196 -0
- xpk/core/gcluster_manager.py +20 -2
- xpk/core/gcsfuse.py +50 -0
- xpk/core/kjob.py +257 -18
- xpk/core/kueue.py +12 -6
- xpk/core/monitoring.py +134 -0
- xpk/core/nap.py +32 -20
- xpk/core/network.py +377 -0
- xpk/core/nodepool.py +581 -0
- xpk/core/pathways.py +124 -45
- xpk/core/remote_state/__init__.py +15 -0
- xpk/core/remote_state/fuse_remote_state.py +99 -0
- xpk/core/remote_state/remote_state_client.py +38 -0
- xpk/core/resources.py +238 -0
- xpk/core/scheduling.py +253 -0
- xpk/core/storage.py +581 -0
- xpk/core/system_characteristics.py +38 -1
- xpk/core/vertex.py +105 -0
- xpk/core/workload.py +209 -1
- xpk/core/workload_decorators/rdma_decorator.py +25 -5
- xpk/core/workload_decorators/storage_decorator.py +52 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
- xpk/main.py +3 -1
- xpk/parser/batch.py +10 -151
- xpk/parser/cluster.py +49 -8
- xpk/parser/common.py +189 -1
- xpk/parser/config.py +49 -0
- xpk/parser/core.py +27 -1
- xpk/parser/info.py +2 -1
- xpk/parser/inspector.py +3 -3
- xpk/parser/job.py +25 -4
- xpk/parser/kind.py +3 -2
- xpk/parser/run.py +47 -0
- xpk/parser/shell.py +10 -1
- xpk/parser/storage.py +326 -0
- xpk/parser/validators.py +3 -3
- xpk/parser/workload.py +118 -76
- xpk/templates/__init__.py +15 -0
- xpk/templates/storage.yaml +13 -0
- xpk/utils/gcs_utils.py +125 -0
- xpk/utils/kubectl.py +57 -0
- xpk/utils/objects.py +8 -5
- xpk/utils/templates.py +28 -0
- xpk/utils/validation.py +80 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/METADATA +169 -15
- xpk-0.7.1.dist-info/RECORD +92 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/WHEEL +1 -1
- xpk/core/core.py +0 -2824
- xpk-0.6.0.dist-info/RECORD +0 -57
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info/licenses}/LICENSE +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/top_level.txt +0 -0
|
@@ -14,21 +14,22 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
import os
|
|
17
18
|
import shutil
|
|
18
19
|
from typing import Optional
|
|
20
|
+
|
|
19
21
|
from ruamel import yaml
|
|
20
|
-
import os
|
|
21
22
|
|
|
22
|
-
from .
|
|
23
|
-
from ..system_characteristics import get_system_characteristics_by_device_type
|
|
24
|
-
from ...utils.console import xpk_print, xpk_exit
|
|
23
|
+
from ...utils.console import xpk_exit, xpk_print
|
|
25
24
|
from ...utils.file import ensure_directory_exists
|
|
26
|
-
from ..
|
|
25
|
+
from ..capacity import H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE, CapacityType
|
|
26
|
+
from ..system_characteristics import get_system_characteristics_by_device_type
|
|
27
|
+
from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule
|
|
27
28
|
|
|
28
29
|
yaml = yaml.YAML()
|
|
29
30
|
|
|
30
|
-
a3mega_device_type =
|
|
31
|
-
a3ultra_device_type =
|
|
31
|
+
a3mega_device_type = H100_MEGA_DEVICE_TYPE
|
|
32
|
+
a3ultra_device_type = H200_DEVICE_TYPE
|
|
32
33
|
supported_device_types = {a3mega_device_type, a3ultra_device_type}
|
|
33
34
|
blueprint_dependencies_dir = {
|
|
34
35
|
a3mega_device_type: "src/xpk/blueprints/a3mega",
|
|
@@ -39,6 +40,16 @@ cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
|
|
|
39
40
|
cluster_toolkit_version = "v1.45.1"
|
|
40
41
|
|
|
41
42
|
|
|
43
|
+
def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
|
|
44
|
+
return [f"{cluster_name}-gpunet-{i}-subnet" for i in range(8)]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
|
|
48
|
+
return [f"{cluster_name}-sub-1"] + [
|
|
49
|
+
f"{cluster_name}-rdma-sub-{i}" for i in range(8)
|
|
50
|
+
]
|
|
51
|
+
|
|
52
|
+
|
|
42
53
|
class BlueprintGeneratorOutput:
|
|
43
54
|
"""BlueprintGeneratorOutput is a class containing fields with output blueprint file path and path to blueprint dependencies.
|
|
44
55
|
Atributes:
|
|
@@ -79,6 +90,7 @@ class BlueprintGenerator:
|
|
|
79
90
|
group_placement_max_distance: int = 2,
|
|
80
91
|
subnetwork_cidr_suffix: int = 24,
|
|
81
92
|
reservation: str | None = None,
|
|
93
|
+
gcs_bucket: Optional[str | None] = None,
|
|
82
94
|
capacity_type: CapacityType = CapacityType.ON_DEMAND,
|
|
83
95
|
system_node_pool_min_node_count: int = 2,
|
|
84
96
|
) -> BlueprintGeneratorOutput:
|
|
@@ -132,6 +144,8 @@ class BlueprintGenerator:
|
|
|
132
144
|
"prefix_with_deployment_name": False,
|
|
133
145
|
"name_suffix": cluster_name,
|
|
134
146
|
"enable_private_endpoint": False,
|
|
147
|
+
"enable_gcsfuse_csi": True,
|
|
148
|
+
"enable_filestore_csi": True,
|
|
135
149
|
"master_authorized_networks": [{
|
|
136
150
|
"cidr_block": (
|
|
137
151
|
f"{auth_cidr}"
|
|
@@ -156,18 +170,6 @@ class BlueprintGenerator:
|
|
|
156
170
|
},
|
|
157
171
|
)
|
|
158
172
|
|
|
159
|
-
reservation_affinity = (
|
|
160
|
-
{
|
|
161
|
-
"consume_reservation_type": "NO_RESERVATION",
|
|
162
|
-
"specific_reservations": [],
|
|
163
|
-
}
|
|
164
|
-
if reservation is None
|
|
165
|
-
else {
|
|
166
|
-
"consume_reservation_type": "SPECIFIC_RESERVATION",
|
|
167
|
-
"specific_reservations": [{"name": reservation}],
|
|
168
|
-
}
|
|
169
|
-
)
|
|
170
|
-
|
|
171
173
|
a3_megagpu_pool_0 = DeploymentModule(
|
|
172
174
|
id="a3_megagpu_pool_0",
|
|
173
175
|
source="modules/compute/gke-node-pool",
|
|
@@ -178,7 +180,9 @@ class BlueprintGenerator:
|
|
|
178
180
|
"static_node_count": num_nodes,
|
|
179
181
|
"zones": [zone],
|
|
180
182
|
"host_maintenance_interval": "PERIODIC",
|
|
181
|
-
"reservation_affinity":
|
|
183
|
+
"reservation_affinity": self._getblock_reservation_affinity(
|
|
184
|
+
reservation
|
|
185
|
+
),
|
|
182
186
|
"run_workload_script": False,
|
|
183
187
|
"spot": capacity_type == CapacityType.SPOT,
|
|
184
188
|
"max_pods_per_node": 32,
|
|
@@ -199,6 +203,9 @@ class BlueprintGenerator:
|
|
|
199
203
|
"config_template_vars": {"num_chips": f"{num_chips}"},
|
|
200
204
|
},
|
|
201
205
|
"jobset": {"install": True, "version": "v0.7.2"},
|
|
206
|
+
"apply_manifests": [{
|
|
207
|
+
"source": f'$(ghpc_stage("{blueprint_name}"))/storage_crd.yaml'
|
|
208
|
+
}],
|
|
202
209
|
},
|
|
203
210
|
)
|
|
204
211
|
|
|
@@ -235,7 +242,10 @@ class BlueprintGenerator:
|
|
|
235
242
|
workload_configmap,
|
|
236
243
|
],
|
|
237
244
|
)
|
|
238
|
-
|
|
245
|
+
a3_mega_blueprint = Blueprint(
|
|
246
|
+
terraform_backend_defaults=self._getblock_terraform_backend(
|
|
247
|
+
gcs_bucket, prefix
|
|
248
|
+
),
|
|
239
249
|
blueprint_name=blueprint_name,
|
|
240
250
|
toolkit_modules_url=cluster_toolkit_url,
|
|
241
251
|
toolkit_modules_version=cluster_toolkit_version,
|
|
@@ -247,8 +257,9 @@ class BlueprintGenerator:
|
|
|
247
257
|
"zone": zone,
|
|
248
258
|
},
|
|
249
259
|
)
|
|
260
|
+
|
|
250
261
|
blueprint_file_path = self._save_blueprint_to_file(
|
|
251
|
-
blueprint_name,
|
|
262
|
+
blueprint_name, a3_mega_blueprint, prefix
|
|
252
263
|
)
|
|
253
264
|
blueprint_dependencies = self._get_a3_mega_blueprint_dependencies(
|
|
254
265
|
blueprint_name, prefix
|
|
@@ -271,6 +282,7 @@ class BlueprintGenerator:
|
|
|
271
282
|
region: str,
|
|
272
283
|
auth_cidr: str,
|
|
273
284
|
prefix: str = "",
|
|
285
|
+
gcs_bucket: Optional[str | None] = None,
|
|
274
286
|
) -> BlueprintGeneratorOutput:
|
|
275
287
|
"""Create a simple gke cluster
|
|
276
288
|
|
|
@@ -318,6 +330,9 @@ class BlueprintGenerator:
|
|
|
318
330
|
modules=[network1, gke_cluster],
|
|
319
331
|
)
|
|
320
332
|
ml_gke = Blueprint(
|
|
333
|
+
terraform_backend_defaults=self._getblock_terraform_backend(
|
|
334
|
+
gcs_bucket, prefix
|
|
335
|
+
),
|
|
321
336
|
blueprint_name=blueprint_name,
|
|
322
337
|
toolkit_modules_url=cluster_toolkit_url,
|
|
323
338
|
toolkit_modules_version=cluster_toolkit_version,
|
|
@@ -328,6 +343,7 @@ class BlueprintGenerator:
|
|
|
328
343
|
"region": region,
|
|
329
344
|
},
|
|
330
345
|
)
|
|
346
|
+
|
|
331
347
|
blueprint_file_path = self._save_blueprint_to_file(
|
|
332
348
|
blueprint_name, ml_gke, prefix
|
|
333
349
|
)
|
|
@@ -337,55 +353,6 @@ class BlueprintGenerator:
|
|
|
337
353
|
blueprint_dependencies=blueprint_dependencies,
|
|
338
354
|
)
|
|
339
355
|
|
|
340
|
-
def _save_blueprint_to_file(
|
|
341
|
-
self, blueprint_name: str, xpk_blueprint: Blueprint, prefix: str = ""
|
|
342
|
-
) -> str:
|
|
343
|
-
blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
|
|
344
|
-
with open(blueprint_path, "w+", encoding="utf-8") as blueprint_file:
|
|
345
|
-
yaml.dump(xpk_blueprint, blueprint_file)
|
|
346
|
-
return blueprint_path
|
|
347
|
-
|
|
348
|
-
def _get_blueprint_path(self, blueprint_name, prefix: str = ""):
|
|
349
|
-
blueprint_path = os.path.join(
|
|
350
|
-
self._get_storage_path(prefix), f"{blueprint_name}.yaml"
|
|
351
|
-
)
|
|
352
|
-
return blueprint_path
|
|
353
|
-
|
|
354
|
-
def _get_storage_path(self, prefix):
|
|
355
|
-
storage_path_with_prefix = os.path.join(self.storage_path, prefix)
|
|
356
|
-
ensure_directory_exists(storage_path_with_prefix)
|
|
357
|
-
return storage_path_with_prefix
|
|
358
|
-
|
|
359
|
-
def blueprint_exists(self, blueprint_name, prefix: str = ""):
|
|
360
|
-
blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
|
|
361
|
-
return os.path.exists(blueprint_path)
|
|
362
|
-
|
|
363
|
-
def _get_a3_mega_blueprint_dependencies(
|
|
364
|
-
self, blueprint_name: str, prefix: str = ""
|
|
365
|
-
) -> str:
|
|
366
|
-
deployment_files_path = os.path.join(
|
|
367
|
-
self._get_storage_path(prefix), blueprint_name
|
|
368
|
-
)
|
|
369
|
-
shutil.copytree(
|
|
370
|
-
blueprint_dependencies_dir[a3mega_device_type],
|
|
371
|
-
deployment_files_path,
|
|
372
|
-
dirs_exist_ok=True,
|
|
373
|
-
)
|
|
374
|
-
return deployment_files_path
|
|
375
|
-
|
|
376
|
-
def _get_a3_ultra_blueprint_dependencies(
|
|
377
|
-
self, blueprint_name: str, prefix: str = ""
|
|
378
|
-
) -> str:
|
|
379
|
-
deployment_files_path = os.path.join(
|
|
380
|
-
self._get_storage_path(prefix), blueprint_name
|
|
381
|
-
)
|
|
382
|
-
shutil.copytree(
|
|
383
|
-
blueprint_dependencies_dir[a3ultra_device_type],
|
|
384
|
-
deployment_files_path,
|
|
385
|
-
dirs_exist_ok=True,
|
|
386
|
-
)
|
|
387
|
-
return deployment_files_path
|
|
388
|
-
|
|
389
356
|
def generate_a3_ultra_blueprint(
|
|
390
357
|
self,
|
|
391
358
|
project_id: str,
|
|
@@ -396,7 +363,9 @@ class BlueprintGenerator:
|
|
|
396
363
|
auth_cidr: str,
|
|
397
364
|
system_node_pool_machine_type: str,
|
|
398
365
|
reservation: Optional[str | None] = None,
|
|
366
|
+
gcs_bucket: Optional[str | None] = None,
|
|
399
367
|
num_nodes: int = 2,
|
|
368
|
+
enable_filestore_csi_driver=True,
|
|
400
369
|
prefix: str = "",
|
|
401
370
|
mtu_size: int = 8896,
|
|
402
371
|
system_node_pool_min_node_count: int = 2,
|
|
@@ -501,6 +470,7 @@ class BlueprintGenerator:
|
|
|
501
470
|
"system_node_pool_machine_type": system_node_pool_machine_type,
|
|
502
471
|
"enable_dcgm_monitoring": True,
|
|
503
472
|
"enable_gcsfuse_csi": True,
|
|
473
|
+
"enable_filestore_csi": enable_filestore_csi_driver,
|
|
504
474
|
"enable_private_endpoint": False,
|
|
505
475
|
"master_authorized_networks": [{
|
|
506
476
|
"cidr_block": auth_cidr,
|
|
@@ -540,6 +510,9 @@ class BlueprintGenerator:
|
|
|
540
510
|
"zones": [zone],
|
|
541
511
|
"static_node_count": num_nodes,
|
|
542
512
|
"spot": capacity_type == CapacityType.SPOT,
|
|
513
|
+
"reservation_affinity": self._getblock_reservation_affinity(
|
|
514
|
+
reservation
|
|
515
|
+
),
|
|
543
516
|
"max_pods_per_node": 32,
|
|
544
517
|
"guest_accelerator": [{
|
|
545
518
|
"type": "nvidia-h200-141gb",
|
|
@@ -561,11 +534,6 @@ class BlueprintGenerator:
|
|
|
561
534
|
},
|
|
562
535
|
outputs=["instructions"],
|
|
563
536
|
)
|
|
564
|
-
if reservation is not None:
|
|
565
|
-
gpu_pool.settings["reservation_affinity"] = {
|
|
566
|
-
"consume_reservation_type": "SPECIFIC_RESERVATION",
|
|
567
|
-
"specific_reservations": [{"name": reservation}],
|
|
568
|
-
}
|
|
569
537
|
|
|
570
538
|
num_chips = num_nodes * system.chips_per_vm
|
|
571
539
|
workload_manager_install_id = "workload-manager-install"
|
|
@@ -584,6 +552,11 @@ class BlueprintGenerator:
|
|
|
584
552
|
"apply_manifests": [
|
|
585
553
|
{"source": nccl_installer_path},
|
|
586
554
|
{"source": mlgru_disable_path},
|
|
555
|
+
{
|
|
556
|
+
"source": (
|
|
557
|
+
f'$(ghpc_stage("{blueprint_name}"))/storage_crd.yaml'
|
|
558
|
+
)
|
|
559
|
+
},
|
|
587
560
|
],
|
|
588
561
|
},
|
|
589
562
|
)
|
|
@@ -623,6 +596,9 @@ class BlueprintGenerator:
|
|
|
623
596
|
],
|
|
624
597
|
)
|
|
625
598
|
a3_ultra_blueprint = Blueprint(
|
|
599
|
+
terraform_backend_defaults=self._getblock_terraform_backend(
|
|
600
|
+
gcs_bucket, prefix
|
|
601
|
+
),
|
|
626
602
|
blueprint_name=blueprint_name,
|
|
627
603
|
toolkit_modules_url=cluster_toolkit_url,
|
|
628
604
|
toolkit_modules_version=cluster_toolkit_version,
|
|
@@ -646,6 +622,86 @@ class BlueprintGenerator:
|
|
|
646
622
|
blueprint_dependencies=blueprint_dependencies,
|
|
647
623
|
)
|
|
648
624
|
|
|
625
|
+
def _getblock_reservation_affinity(
|
|
626
|
+
self, reservation: str | None = None
|
|
627
|
+
) -> dict:
|
|
628
|
+
return (
|
|
629
|
+
{
|
|
630
|
+
"consume_reservation_type": "NO_RESERVATION",
|
|
631
|
+
"specific_reservations": [],
|
|
632
|
+
}
|
|
633
|
+
if reservation is None
|
|
634
|
+
else {
|
|
635
|
+
"consume_reservation_type": "SPECIFIC_RESERVATION",
|
|
636
|
+
"specific_reservations": [{"name": reservation}],
|
|
637
|
+
}
|
|
638
|
+
)
|
|
639
|
+
|
|
640
|
+
def _getblock_terraform_backend(
|
|
641
|
+
self, gcs_bucket: str, prefix: str = ""
|
|
642
|
+
) -> dict | None:
|
|
643
|
+
if gcs_bucket is None:
|
|
644
|
+
return None
|
|
645
|
+
return {
|
|
646
|
+
"type": "gcs",
|
|
647
|
+
"configuration": {
|
|
648
|
+
"bucket": gcs_bucket,
|
|
649
|
+
"prefix": self._get_terraforrm_backend_full_prefix(prefix),
|
|
650
|
+
},
|
|
651
|
+
}
|
|
652
|
+
|
|
653
|
+
def _get_terraforrm_backend_full_prefix(self, prefix: str = "") -> str:
|
|
654
|
+
return f"xpk_terraform_state/{prefix}/tfstate/"
|
|
655
|
+
|
|
656
|
+
def _save_blueprint_to_file(
|
|
657
|
+
self, blueprint_name: str, xpk_blueprint: Blueprint, prefix: str = ""
|
|
658
|
+
) -> str:
|
|
659
|
+
blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
|
|
660
|
+
with open(blueprint_path, "w+", encoding="utf-8") as blueprint_file:
|
|
661
|
+
yaml.dump(xpk_blueprint, blueprint_file)
|
|
662
|
+
return blueprint_path
|
|
663
|
+
|
|
664
|
+
def _get_blueprint_path(self, blueprint_name, prefix: str = ""):
|
|
665
|
+
blueprint_path = os.path.join(
|
|
666
|
+
self._get_storage_path(prefix), f"{blueprint_name}.yaml"
|
|
667
|
+
)
|
|
668
|
+
return blueprint_path
|
|
669
|
+
|
|
670
|
+
def _get_storage_path(self, prefix):
|
|
671
|
+
storage_path_with_prefix = os.path.join(self.storage_path, prefix)
|
|
672
|
+
ensure_directory_exists(storage_path_with_prefix)
|
|
673
|
+
return storage_path_with_prefix
|
|
674
|
+
|
|
675
|
+
def blueprint_exists(self, blueprint_name, prefix: str = ""):
|
|
676
|
+
blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
|
|
677
|
+
return os.path.exists(blueprint_path)
|
|
678
|
+
|
|
679
|
+
def _get_a3_mega_blueprint_dependencies(
|
|
680
|
+
self, blueprint_name: str, prefix: str = ""
|
|
681
|
+
) -> str:
|
|
682
|
+
deployment_files_path = os.path.join(
|
|
683
|
+
self._get_storage_path(prefix), blueprint_name
|
|
684
|
+
)
|
|
685
|
+
shutil.copytree(
|
|
686
|
+
blueprint_dependencies_dir[a3mega_device_type],
|
|
687
|
+
deployment_files_path,
|
|
688
|
+
dirs_exist_ok=True,
|
|
689
|
+
)
|
|
690
|
+
return deployment_files_path
|
|
691
|
+
|
|
692
|
+
def _get_a3_ultra_blueprint_dependencies(
|
|
693
|
+
self, blueprint_name: str, prefix: str = ""
|
|
694
|
+
) -> str:
|
|
695
|
+
deployment_files_path = os.path.join(
|
|
696
|
+
self._get_storage_path(prefix), blueprint_name
|
|
697
|
+
)
|
|
698
|
+
shutil.copytree(
|
|
699
|
+
blueprint_dependencies_dir[a3ultra_device_type],
|
|
700
|
+
deployment_files_path,
|
|
701
|
+
dirs_exist_ok=True,
|
|
702
|
+
)
|
|
703
|
+
return deployment_files_path
|
|
704
|
+
|
|
649
705
|
|
|
650
706
|
yaml.register_class(Blueprint)
|
|
651
707
|
yaml.register_class(DeploymentGroup)
|
xpk/core/capacity.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import enum
|
|
18
|
+
|
|
19
|
+
from ..utils.console import xpk_print
|
|
20
|
+
from .commands import run_command_with_updates
|
|
21
|
+
|
|
22
|
+
AUTOPROVISIONING_CONFIG_VALUE = 'AUTOPROVISION'
|
|
23
|
+
AUTOPROVISIONING_CONFIG_MINIMUM_KEY = 'minimum_chips'
|
|
24
|
+
AUTOPROVISIONING_CONFIG_MAXIMUM_KEY = 'maximum_chips'
|
|
25
|
+
CAPACITY_TYPE_CONFIG_KEY = 'capacity_type'
|
|
26
|
+
|
|
27
|
+
H100_DEVICE_TYPE = 'h100-80gb-8'
|
|
28
|
+
H100_MEGA_DEVICE_TYPE = 'h100-mega-80gb-8'
|
|
29
|
+
H200_DEVICE_TYPE = 'h200-141gb-8'
|
|
30
|
+
RESERVATION_CONFIG_KEY = 'reservation_id'
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class CapacityType(enum.Enum):
|
|
34
|
+
ON_DEMAND = 'on_demand'
|
|
35
|
+
RESERVATION = 'reservation'
|
|
36
|
+
SPOT = 'spot'
|
|
37
|
+
UNKNOWN = 'unknown'
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def print_reservations(args) -> int:
|
|
41
|
+
"""Print the reservations in the project.
|
|
42
|
+
|
|
43
|
+
Args:
|
|
44
|
+
args: user provided arguments for running the command.
|
|
45
|
+
|
|
46
|
+
Returns:
|
|
47
|
+
0 if successful and 1 otherwise.
|
|
48
|
+
"""
|
|
49
|
+
command = f'gcloud beta compute reservations list --project={args.project}'
|
|
50
|
+
return_code = run_command_with_updates(
|
|
51
|
+
command, 'Get all reservations in the project', args
|
|
52
|
+
)
|
|
53
|
+
if return_code != 0:
|
|
54
|
+
xpk_print(f'Get all reservations returned ERROR {return_code}')
|
|
55
|
+
return 1
|
|
56
|
+
return 0
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def get_capacity_type(args) -> tuple[CapacityType, int]:
|
|
60
|
+
"""Determine the capacity type based on user arguments.
|
|
61
|
+
|
|
62
|
+
Args:
|
|
63
|
+
args: user provided arguments for running the command.
|
|
64
|
+
|
|
65
|
+
Returns:
|
|
66
|
+
Tuple with string with the system characteristics and
|
|
67
|
+
int of 0 if successful and 1 otherwise.
|
|
68
|
+
"""
|
|
69
|
+
capacity_type = CapacityType.UNKNOWN
|
|
70
|
+
num_types = 0
|
|
71
|
+
return_code = 0
|
|
72
|
+
|
|
73
|
+
# Determine the capacity argument.
|
|
74
|
+
if args.on_demand:
|
|
75
|
+
capacity_type = CapacityType.ON_DEMAND
|
|
76
|
+
num_types += 1
|
|
77
|
+
if args.reservation:
|
|
78
|
+
return_code = verify_reservation_exists(args)
|
|
79
|
+
if return_code > 0:
|
|
80
|
+
return capacity_type, return_code
|
|
81
|
+
capacity_type = CapacityType.RESERVATION
|
|
82
|
+
num_types += 1
|
|
83
|
+
if args.spot:
|
|
84
|
+
capacity_type = CapacityType.SPOT
|
|
85
|
+
num_types += 1
|
|
86
|
+
|
|
87
|
+
# Check that the number of user arguments provided is valid.
|
|
88
|
+
if num_types == 0:
|
|
89
|
+
capacity_type = CapacityType.UNKNOWN
|
|
90
|
+
elif num_types != 1:
|
|
91
|
+
xpk_print(
|
|
92
|
+
'ERROR: User specified more than one of the following arguments. Please'
|
|
93
|
+
' specify only one of `--reservation=$RESERVATION_NAME`, `--on-demand`'
|
|
94
|
+
' or `--spot`.'
|
|
95
|
+
)
|
|
96
|
+
return_code = 1
|
|
97
|
+
|
|
98
|
+
return capacity_type, return_code
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def verify_reservation_exists(args) -> int:
|
|
102
|
+
"""Verify the reservation exists.
|
|
103
|
+
|
|
104
|
+
Args:
|
|
105
|
+
args: user provided arguments for running the command.
|
|
106
|
+
|
|
107
|
+
Returns:
|
|
108
|
+
0 if successful and 1 otherwise.
|
|
109
|
+
"""
|
|
110
|
+
command = (
|
|
111
|
+
f'gcloud beta compute reservations describe {args.reservation}'
|
|
112
|
+
f' --project={args.project} --zone={args.zone}'
|
|
113
|
+
)
|
|
114
|
+
return_code = run_command_with_updates(command, 'Describe reservation', args)
|
|
115
|
+
if return_code != 0:
|
|
116
|
+
xpk_print(f'Describe reservation returned ERROR {return_code}')
|
|
117
|
+
xpk_print('Please confirm that your reservation name is correct.')
|
|
118
|
+
return 1
|
|
119
|
+
return 0
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def get_capacity_arguments_from_capacity_type(
|
|
123
|
+
args, capacity_type: CapacityType
|
|
124
|
+
) -> tuple[str, int]:
|
|
125
|
+
"""Determine the TPU Nodepool creation capacity arguments needed.
|
|
126
|
+
|
|
127
|
+
Args:
|
|
128
|
+
args: user provided arguments for running the command.
|
|
129
|
+
capacity_type: The type of capacity the user configured.
|
|
130
|
+
|
|
131
|
+
Returns:
|
|
132
|
+
Tuple with string with the capacity argument to use and
|
|
133
|
+
int of 0 if successful and 1 otherwise.
|
|
134
|
+
"""
|
|
135
|
+
capacity_args = ''
|
|
136
|
+
return_code = 0
|
|
137
|
+
|
|
138
|
+
match capacity_type:
|
|
139
|
+
case CapacityType.ON_DEMAND:
|
|
140
|
+
capacity_args = ''
|
|
141
|
+
case CapacityType.SPOT:
|
|
142
|
+
capacity_args = '--spot'
|
|
143
|
+
case CapacityType.RESERVATION:
|
|
144
|
+
capacity_args = (
|
|
145
|
+
f'--reservation-affinity=specific --reservation={args.reservation}'
|
|
146
|
+
)
|
|
147
|
+
case _:
|
|
148
|
+
xpk_print(
|
|
149
|
+
f'Unknown capacity type: {capacity_type}. Unable to determine'
|
|
150
|
+
' capacity args.'
|
|
151
|
+
)
|
|
152
|
+
return_code = 1
|
|
153
|
+
return capacity_args, return_code
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def get_capacity_node_selectors_from_capacity_type(
|
|
157
|
+
args, capacity_type: str
|
|
158
|
+
) -> tuple[str, int]:
|
|
159
|
+
"""Determine the node selectors for a workload to run on a specific capacity type.
|
|
160
|
+
|
|
161
|
+
Args:
|
|
162
|
+
args: user provided arguments for running the command.
|
|
163
|
+
capacity_type: The type of capacity the user configured.
|
|
164
|
+
|
|
165
|
+
Returns:
|
|
166
|
+
Tuple with string with the node selectors to use and
|
|
167
|
+
int of 0 if successful and 1 otherwise.
|
|
168
|
+
"""
|
|
169
|
+
node_selector = ''
|
|
170
|
+
return_code = 0
|
|
171
|
+
|
|
172
|
+
match capacity_type:
|
|
173
|
+
case CapacityType.ON_DEMAND.name:
|
|
174
|
+
node_selector = ''
|
|
175
|
+
case CapacityType.SPOT.name:
|
|
176
|
+
node_selector = 'cloud.google.com/gke-spot="true"'
|
|
177
|
+
case CapacityType.RESERVATION.name:
|
|
178
|
+
node_selector = f'cloud.google.com/reservation-name: {args.reservation}'
|
|
179
|
+
case _:
|
|
180
|
+
xpk_print(
|
|
181
|
+
f'Unknown capacity type: {capacity_type}. Unable to determine the'
|
|
182
|
+
' node selectors.'
|
|
183
|
+
)
|
|
184
|
+
return_code = 1
|
|
185
|
+
return node_selector, return_code
|